def load(branch_date: str):
    """Load Wikipedia word frequencies into DoltHub for one XML dump.

    Builds a loader that writes the unfiltered frequencies to ``master``,
    creates a branch named after the dump date, then adds one loader per
    configured filter, each committing to ``<branch_date>/filter_<name>``.
    Finally clones the remote repo and pushes every loader's commit.

    :param branch_date: date string identifying the Wikipedia XML dump;
        also used as the branch-name prefix.
    :return: None — side effect is commits pushed to the DoltHub remote.
    """
    loaders = []
    master_writer = get_df_table_writer('word_frequency',
                                        get_master_df_builder(),
                                        pk_cols=['word'],
                                        import_mode='replace')
    message = 'Update Wikipedia word frequencies for {} XML dump'.format(
        branch_date)
    loaders.append(get_dolt_loader([master_writer], True, message, 'master'))
    loaders.append(get_branch_creator(branch_date))
    for filter_name in FILTER_NAMES:
        filter_writer = get_df_table_writer('word_frequency',
                                            get_filter_df_builder(filter_name),
                                            pk_cols=['word'],
                                            import_mode='replace')
        branch_name = '{}/filter_{}'.format(branch_date, filter_name)
        # Fixed: the format arguments were swapped, producing messages like
        # "with 2020-01-01 filter for stopwords XML dump". The filter name
        # belongs in the first placeholder, the dump date in the second.
        filter_message = ('Update Wikipedia word frequencies with {} filter '
                          'for {} XML dump'.format(filter_name, branch_date))
        loaders.append(
            get_dolt_loader([filter_writer], True, filter_message,
                            branch_name))
    load_to_dolthub(loaders,
                    clone=True,
                    push=True,
                    remote_name='origin',
                    remote_url=REPO_PATH)
def test_branch_creator(initial_test_data):
    """get_branch_creator should create the named branch and return its name."""
    dolt_repo = initial_test_data
    target = 'new-branch'
    # Precondition: the fixture repo starts with only the default branch.
    assert dolt_repo.get_branch_list() == ['master']
    created = get_branch_creator(target)(dolt_repo)
    assert created == target
    assert target in dolt_repo.get_branch_list()
def load(date_string: str, dump_target: str):
    """
    Gets case-sensitive and case-insensitive loader for each Wikipedia dump

    Each loader has a writer for each table

    :param date_string:
    :param dump_target:
    :return:
    """
    article_count = fetch_data(dump_target)
    loaders = []

    # Case-sensitive ngrams: commit to the default branch, then fork it.
    cs_writers = get_writers(date_string, article_count)
    cs_message = 'Update case-sensitive ngrams for dump date {}'.format(
        date_string)
    loaders.append(get_dolt_loader(cs_writers, True, cs_message))
    loaders.append(get_branch_creator('{}/case-sensitive'.format(date_string)))

    # Case-insensitive ngrams: commit to a dedicated branch.
    ci_writers = get_writers(date_string, article_count, lower='_lower')
    ci_message = 'Update case-insensitive ngrams for dump date {}'.format(
        date_string)
    ci_branch = '{}/case-insensitive'.format(date_string)
    loaders.append(get_dolt_loader(ci_writers, True, ci_message, ci_branch))

    load_to_dolthub(loaders,
                    clone=True,
                    push=True,
                    remote_name='origin',
                    remote_url=REPO_PATH)
def get_wikipedia_loaders(branch_date: str):
    """Build the full set of Dolt loaders for one Wikipedia XML dump.

    Produces, in order: a loader committing unfiltered word frequencies to
    ``master``, a branch creator for ``branch_date``, and one loader per
    configured filter committing to ``<branch_date>/filter_<name>``.

    :param branch_date: date string identifying the Wikipedia XML dump;
        also used as the branch-name prefix.
    :return: list of loader callables ready to be passed to a Dolt load step.
    """
    loaders = []
    master_writer = get_df_table_writer('word_frequency',
                                        get_master_df_builder(),
                                        pk_cols=['word'],
                                        import_mode='replace')
    message = 'Update Wikipedia word frequencies for {} XML dump'.format(
        branch_date)
    loaders.append(get_dolt_loader([master_writer], True, message, 'master'))
    loaders.append(get_branch_creator(branch_date))
    for filter_name in FILTER_NAMES:
        filter_writer = get_df_table_writer('word_frequency',
                                            get_filter_df_builder(filter_name),
                                            pk_cols=['word'],
                                            import_mode='replace')
        branch_name = '{}/filter_{}'.format(branch_date, filter_name)
        # Fixed: the format arguments were swapped, producing messages like
        # "with 2020-01-01 filter for stopwords XML dump". The filter name
        # belongs in the first placeholder, the dump date in the second.
        filter_message = ('Update Wikipedia word frequencies with {} filter '
                          'for {} XML dump'.format(filter_name, branch_date))
        loaders.append(
            get_dolt_loader([filter_writer], True, filter_message,
                            branch_name))
    return loaders
def test_branch_creator(initial_test_data):
    """get_branch_creator should add the requested branch and return its name."""
    dolt = initial_test_data
    wanted = 'new-branch'

    def branch_names():
        # dolt.branch() yields a pair; the second element lists branch
        # objects whose .name attribute we compare against.
        _, listing = dolt.branch()
        return [entry.name for entry in listing]

    # Precondition: the fixture repo starts with only the default branch.
    assert branch_names() == ['master']
    result = get_branch_creator(wanted)(dolt)
    assert result == wanted
    assert wanted in branch_names()