def load(date_string: str, dump_target: str):
    """Push case-sensitive and case-insensitive ngram tables for one Wikipedia dump.

    Builds a writer set per case variant, commits each on its own
    ``<date>/case-*`` branch, and pushes everything to the DoltHub remote.

    :param date_string: date identifying the Wikipedia dump
    :param dump_target: dump to fetch article data from
    :return: None
    """
    loaders = []
    article_count = fetch_data(dump_target)

    # Case-sensitive ngrams: commit, then mark the branch for this dump date.
    sensitive_writers = get_writers(date_string, article_count)
    loaders.append(get_dolt_loader(
        sensitive_writers, True,
        'Update case-sensitive ngrams for dump date {}'.format(date_string)))
    loaders.append(get_branch_creator('{}/case-sensitive'.format(date_string)))

    # Case-insensitive ngrams: the '_lower' suffix selects the lowercased tables.
    insensitive_writers = get_writers(date_string, article_count, lower='_lower')
    loaders.append(get_dolt_loader(
        insensitive_writers, True,
        'Update case-insensitive ngrams for dump date {}'.format(date_string),
        '{}/case-insensitive'.format(date_string)))

    load_to_dolthub(loaders, clone=True, push=True, remote_name='origin', remote_url=REPO_PATH)
def load(branch_date: str):
    """Load Wikipedia word frequencies to DoltHub for one XML dump.

    Writes the unfiltered frequencies to ``master``, creates a branch named
    after the dump date, then writes one filtered variant per entry in
    ``FILTER_NAMES`` to its own ``<date>/filter_<name>`` branch, and pushes
    all commits to the remote in one pass.

    :param branch_date: date string identifying the XML dump
    :return: None
    """
    loaders = []
    master_writer = get_df_table_writer('word_frequency',
                                        get_master_df_builder(),
                                        pk_cols=['word'],
                                        import_mode='replace')
    message = 'Update Wikipedia word frequencies for {} XML dump'.format(
        branch_date)
    loaders.append(get_dolt_loader([master_writer], True, message, 'master'))
    loaders.append(get_branch_creator(branch_date))

    for filter_name in FILTER_NAMES:
        filter_writer = get_df_table_writer('word_frequency',
                                            get_filter_df_builder(filter_name),
                                            pk_cols=['word'],
                                            import_mode='replace')
        branch_name = '{}/filter_{}'.format(branch_date, filter_name)
        # BUG FIX: format arguments were swapped. The template reads
        # "with {filter} filter for {date} XML dump", so filter_name must be
        # interpolated first and branch_date second.
        filter_message = 'Update Wikipedia word frequencies with {} filter for {} XML dump'.format(
            filter_name, branch_date)
        loaders.append(
            get_dolt_loader([filter_writer], True, filter_message, branch_name))

    load_to_dolthub(loaders, clone=True, push=True, remote_name='origin', remote_url=REPO_PATH)
def write_results_to_dolt(results_file: str, remote: str, branch: str):
    """Push a benchmark-results CSV to a Dolt remote on the given branch.

    :param results_file: path to the CSV file holding the benchmark results
    :param remote: URL of the Dolt remote to push to
    :param branch: branch to commit the results on
    :return: None
    """
    # Reading is deferred behind a callable so the writer controls when
    # the file is actually loaded.
    def _read_results():
        return pd.read_csv(results_file)

    table_writer = get_df_table_writer(RESULTS_TABLE,
                                       _read_results,
                                       RESULTS_TABLE_PKS,
                                       import_mode='update')
    loader = get_dolt_loader(table_writer, True, 'benchmark run', branch)
    load_to_dolthub(loader, clone=True, push=True, remote_name='origin', remote_url=remote)
def write_results_to_dolt(results_dir: str, remote: str, branch: str):
    """Concatenate every file in ``results_dir`` as CSV and push to a Dolt remote.

    :param results_dir: directory whose files are all read as benchmark CSVs
    :param remote: URL of the Dolt remote to push to
    :param branch: branch to commit the results on
    :return: None
    """
    frames = [pd.read_csv(os.path.join(results_dir, name))
              for name in os.listdir(results_dir)]

    # The writer expects a zero-arg callable producing the DataFrame.
    def _build_results():
        return pd.concat(frames)

    table_writer = get_df_table_writer(RESULTS_TABLE,
                                       _build_results,
                                       RESULTS_TABLE_PKS,
                                       import_mode='update')
    loader = get_dolt_loader(table_writer, True, 'benchmark run', branch)
    load_to_dolthub(loader, clone=True, push=True, remote_name='origin', remote_url=remote)
def load():
    """Build one writer per MTA dataset and push them all to DoltHub in a single commit.

    :return: None
    """
    table_writers = []
    for dataset in DATASETS:
        # Datasets without natural primary keys get a synthetic 'hash_id'
        # column injected by the insert_unique_key transformer.
        if dataset.pk_cols:
            transformers = []
            pk_cols = dataset.pk_cols
        else:
            transformers = [insert_unique_key]
            pk_cols = ['hash_id']
        table_writers.append(
            get_df_table_writer(dataset.table_name,
                                get_mta_data_as_df(get_mta_url(dataset.dataset_id)),
                                pk_cols,
                                transformers=transformers))

    loaders = [
        get_dolt_loader(table_writers, True,
                        'Update MTA data for date {}'.format(datetime.now()))
    ]
    load_to_dolthub(loaders, clone=True, push=True, remote_name='origin', remote_url=REPO_PATH)
def load_dataset(repo_path: str, datasets: List[FiveThirtyEightDataset], message: str):
    """Write each FiveThirtyEight dataset to its table and push one commit.

    :param repo_path: URL of the DoltHub remote to push to
    :param datasets: datasets to write, each supplying its own fetcher and keys
    :param message: commit message for the single combined commit
    :return: None
    """
    table_writers = []
    for ds in datasets:
        table_writers.append(
            get_df_table_writer(ds.name, ds.get_dataset_fetcher(), ds.primary_keys))
    loaders = [get_dolt_loader(table_writers, True, message)]
    load_to_dolthub(loaders, clone=True, push=True, remote_name='origin', remote_url=repo_path)
def load_fx_rates_running_averages():
    """Derive running-average EUR FX rates from the raw table and push to DoltHub.

    :return: None
    """
    commit_message = 'Updated averages for date {}'.format(datetime.now())
    # Transform the raw rates into per-currency averages keyed on 'currency'.
    table_writer = get_table_transformer(get_raw_fx_rates,
                                         'eur_fx_rate_averages',
                                         ['currency'],
                                         get_average_rates)
    loader = get_dolt_loader(table_writer, True, commit_message)
    load_to_dolthub(loader, clone=True, push=True, remote_url=FX_RATES_REPO)
def load_raw_fx_rates():
    """Fetch raw EUR FX rates and push them to DoltHub with a dated commit.

    :return: None
    """
    writer = get_df_table_writer('eur_fx_rates',
                                 get_raw_data,
                                 ['currency', 'timestamp'])
    loader = get_dolt_loader(
        writer,
        commit=True,
        message='Updated raw FX rates for date {}'.format(datetime.now()))
    load_to_dolthub(loader, clone=True, push=True, remote_url=FX_RATES_REPO)
def load(git_hash: str, github_actions_run_url: str):
    """Write end-of-day market data to DoltHub, tagging the commit with CI metadata.

    :param git_hash: commit hash of the code that produced this run
    :param github_actions_run_url: URL of the CI run, recorded in the commit message
    :return: None
    """
    # CONSISTENCY FIX: pass import_mode by keyword, matching every other
    # get_df_table_writer call site in this codebase. Positional passing of
    # 'update' risks silently binding to a different parameter (e.g.
    # transformers) if the helper's signature places import_mode later.
    table_writers = [get_df_table_writer('eod_data',
                                         get_data,
                                         ['date', 'ticker'],
                                         import_mode='update')]
    loaders = [get_dolt_loader(table_writers, True,
                               get_commit_message(git_hash, github_actions_run_url))]
    load_to_dolthub(loaders, clone=True, push=True, remote_name='origin', remote_url=REMOTE_DB)