def insert_get_countries(dataset):
    parsed_countries = []
    countries = []
    for row in dataset[1:]:
        country = row[1]
        if country not in parsed_countries:
            parsed_countries.append(country)
            iso3 = covid19_data_downloader.get_iso3_country(country)
            state = ''
            population = None
            countries.append((country, iso3, state, population))
    inserter = DataInserter()
    return inserter.insert_country(countries)
def insert_get_countries(global_conf, us_deaths):
    parsed_countries = []
    parsed_states = []
    countries = []
    for row in global_conf[1:]:
        country = row[1]
        if country not in parsed_countries:
            parsed_countries.append(country)
            iso3 = get_iso3_country(country)
            state = ''
            population = None
            countries.append((country, iso3, state, population))
    for row in us_deaths[1:]:
        state = row[6]
        if state not in parsed_states:
            parsed_states.append(state)
            state_code = us_states.state_to_code(state)
            country = row[7]
            iso3 = row[2]
            population = row[11]
            if state_code is None:
                state_code = state
            countries.append((country, iso3, state_code, population))
    data_inserter = DataInserter()
    return data_inserter.insert_country(countries)
        input_data = (country_id, date, 0, sum_deaths, 0)
        parsed_data.append(input_data)
    for d in parsed_data:
        date = d[1]
        parsed_date = parse_date(date)
        index = parsed_data.index(d)
        parsed_data[index] = (d[0], parsed_date, d[2], d[3], d[4])
    return parsed_data


if __name__ == "__main__":
    data_inserter = DataInserter()
    directory = '../data/Covid-19-data/csse_covid_19_data/csse_covid_19_time_series'
    path_global_conf_file = directory + '/' + 'time_series_covid19_confirmed_global.csv'
    path_global_deaths_file = directory + '/' + 'time_series_covid19_deaths_global.csv'
    path_us_conf_file = directory + '/' + 'time_series_covid19_confirmed_US.csv'
    path_us_deaths_file = directory + '/' + 'time_series_covid19_deaths_US.csv'
    path_global_recov_file = directory + '/' + 'time_series_covid19_recovered_global.csv'
    data = parse_csv_files(
        path_global_conf_file,
        path_global_deaths_file,
        path_us_conf_file,
        path_us_deaths_file,
        path_global_recov_file
    )
    # no starting date = all, data='no-zero-month/day/last-two-numbers-year'
    # print(data)
    data_inserter.insert_covid19_data(data)
            sum_deaths = sum(deaths)
            sum_recovs = sum(recovs)
            index = parsed_data.index(old_data[0])
            parsed_data[index] = (old_data[0][0], old_data[0][1], sum_confs, sum_deaths, sum_recovs)
        else:
            parsed_data.append((country_id, date, conf, death, recov))
    for row in parsed_data:
        date = row[1]
        parsed_date = parse_date(date)
        index = parsed_data.index(row)
        parsed_data[index] = (row[0], parsed_date, row[2], row[3], row[4])
    return parsed_data


if __name__ == "__main__":
    data_inserter = DataInserter()
    directory = '../data/SARS-03-data'
    file_path = directory + '/' + 'sars_2003_complete_dataset_clean.csv'
    data = parse_csv_files(file_path)
    # print(data)
    # data_inserter.insert_sars_data(data)
import queue
import threading

from db_utils.data_inserter import DataInserter
# Module paths below are assumed to mirror data_inserter; adjust to the repo layout.
from db_utils.data_selector import DataSelector
from db_utils.db_manager import DBManager


# Pipe coordinates one producer/consumer cycle: it selects unprocessed rows
# from the database, hands them in batches to worker threads, and inserts the
# finished results back, serialising all DB access through a shared lock.
class Pipe(threading.Thread):
    def __init__(self, select_fun, insert_fun, worker_class,
                 lock=threading.Lock(), db_m=DBManager()):
        threading.Thread.__init__(self)
        self.new_data = queue.Queue()
        self.done_data = queue.Queue()
        self.db_m = db_m
        self._db_access = lock
        self.db_insert = DataInserter(self.db_m)
        self.db_select = DataSelector(self.db_m)
        self.num_loc_threads = 2
        self.max_threads = 5  # 5 worked..
        self.batch_size = 10
        self.quota_exceeded = False
        self.select_fun = select_fun.__name__
        self.insert_fun = insert_fun.__name__
        self.worker_class = worker_class

    def get_new_data(self):
        # Take up to one batch of pending items off the input queue.
        batch = []
        it = 0
        while it < self.batch_size and not self.new_data.empty():
            batch.append(self.new_data.get())
            it += 1
        return batch

    def get_done_data(self):
        # Take up to one batch of processed items off the output queue.
        batch = []
        it = 0
        while it < self.batch_size and not self.done_data.empty():
            batch.append(self.done_data.get())
            it += 1
        return batch

    def put_new_data(self, data):
        for d in data:
            self.new_data.put(d)

    def put_done_data(self, data):
        for d in data:
            self.done_data.put(d)

    def stop(self):
        self.quota_exceeded = True

    def run(self):
        worker_threads = []
        epoch_count = 0
        select_scale = 5
        with self._db_access:
            new_data = self.db_select.__getattribute__(self.select_fun)(
                self.batch_size * select_scale)
        print('Data selected')
        self.put_new_data(new_data)
        thread_id = 0
        while not self.new_data.empty() and not self.quota_exceeded:
            print("----- Beginning " + str(epoch_count) + " epoch -----")
            worker_threads = [t for t in worker_threads if t.is_alive()]
            print("Active threads: " + str(len(worker_threads)))
            print("Data to process: " + str(self.new_data.qsize()))
            # Spawn a few local workers, each with its own batch.
            for i in range(self.num_loc_threads):
                thread = self.worker_class(thread_id, self.get_new_data(), self)
                thread.start()
                worker_threads.append(thread)
                thread_id += 1
            print("Processing started")
            if len(worker_threads) > self.max_threads:
                # Too many live workers: wait for the older ones to finish.
                print('Too many to process, waiting..')
                for t in worker_threads[:-self.max_threads // 2]:
                    t.join()
                print('Resuming...')
            print("Inserting started")
            print("Data to insert: " + str(self.done_data.qsize()))
            if not self.done_data.empty():
                with self._db_access:
                    while not self.done_data.empty():
                        self.db_insert.__getattribute__(self.insert_fun)(
                            self.get_done_data())
            with self._db_access:
                new_data = self.db_select.__getattribute__(self.select_fun)(
                    self.batch_size * select_scale)
            print('New data selected')
            self.put_new_data(new_data)
            epoch_count += 1
        print('--- No more data ---')
        print('Joining threads')
        for t in worker_threads:
            t.join()
        print("All threads finished, inserting last..")
        if not self.done_data.empty():
            with self._db_access:
                while not self.done_data.empty():
                    self.db_insert.__getattribute__(self.insert_fun)(
                        self.get_done_data())
        print('--- Everything added ---')

    def run_one(self):
        # Single-worker variant of run(): process one batch per epoch.
        epoch_count = 0
        with self._db_access:
            new_data = self.db_select.__getattribute__(self.select_fun)(
                self.batch_size)
        print('Data selected')
        self.put_new_data(new_data)
        while not self.new_data.empty() and not self.quota_exceeded:
            print("----- Beginning " + str(epoch_count) + " epoch -----")
            print(self.quota_exceeded)
            worker = self.worker_class(1, self.get_new_data(), self)
            worker.start()
            print("Processing started")
            worker.join()
            print("Data to insert: " + str(self.done_data.qsize()))
            with self._db_access:
                self.db_insert.__getattribute__(self.insert_fun)(
                    self.get_done_data())
                new_data = self.db_select.__getattribute__(self.select_fun)(
                    self.batch_size)
            print('New data selected')
            self.put_new_data(new_data)
            epoch_count += 1
        if not self.done_data.empty():
            with self._db_access:
                while not self.done_data.empty():
                    self.db_insert.__getattribute__(self.insert_fun)(
                        self.get_done_data())
        print('--- Everything added ---')
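# Illustrative usage sketch (not part of the original source). The worker class
# and the select/insert method names below are hypothetical placeholders; Pipe
# only requires that the worker is a Thread constructed as (thread_id, batch,
# pipe) which calls pipe.put_done_data(...) when finished, and that the passed
# functions exist on DataSelector / DataInserter, since Pipe resolves them by
# __name__ at runtime.
#
#     pipe = Pipe(DataSelector.select_unprocessed_items,   # hypothetical name
#                 DataInserter.insert_processed_items,     # hypothetical name
#                 ItemWorker)                               # hypothetical worker
#     pipe.start()   # runs Pipe.run() in its own thread
#     ...
#     pipe.stop()    # sets quota_exceeded so the loop drains and exits
#     pipe.join()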
import json
import os

from db_utils.data_inserter import DataInserter

tw_data_dir = '../../data/tweets/'
rtw_data_dir = '../../data/retweets/'
usr_data_dir = '../../data/users/'
data_dir = usr_data_dir

if __name__ == "__main__":
    db = DataInserter()
    filenames = []
    for filename in os.listdir(data_dir):
        if '.DS' not in filename and '_in' not in filename:
            filenames.append(filename)
    for filename in filenames:
        print(filename)
        with open(data_dir + filename) as f:
            data = json.load(f)
        if 'retweets' in data_dir:
            db.insert_retweets(data)
        elif 'users' in data_dir:
            db.fast_insert_users(data)
        else:
            db.fast_insert_tweets(data)