示例#1
0
def load(branch_date: str):
    """
    Build the Wikipedia word-frequency loaders and hand them to DoltHub.

    Loaders are queued in this order:

    1. a loader for the unfiltered ``word_frequency`` table on ``master``;
    2. a branch creator for ``branch_date``;
    3. one loader per entry of ``FILTER_NAMES``, each targeting the branch
       ``<branch_date>/filter_<filter_name>``.

    The queue is then passed to ``load_to_dolthub`` with ``clone=True`` and
    ``push=True`` against ``REPO_PATH``.

    :param branch_date: identifier of the XML dump; it is interpolated into
        commit messages and branch names (presumably a date string -- confirm
        with the caller)
    :return: None
    """
    loaders = []
    master_writer = get_df_table_writer('word_frequency',
                                        get_master_df_builder(),
                                        pk_cols=['word'],
                                        import_mode='replace')
    message = 'Update Wikipedia word frequencies for {} XML dump'.format(
        branch_date)
    # NOTE(review): the positional ``True`` is presumably the "commit" flag of
    # get_dolt_loader -- confirm against the doltpy signature.
    loaders.append(get_dolt_loader([master_writer], True, message, 'master'))

    loaders.append(get_branch_creator(branch_date))

    for filter_name in FILTER_NAMES:
        filter_writer = get_df_table_writer('word_frequency',
                                            get_filter_df_builder(filter_name),
                                            pk_cols=['word'],
                                            import_mode='replace')
        branch_name = '{}/filter_{}'.format(branch_date, filter_name)
        # The template reads "with <filter> filter for <dump> XML dump", so the
        # filter name must come first and the dump identifier second.
        filter_message = 'Update Wikipedia word frequencies with {} filter for {} XML dump'.format(
            filter_name, branch_date)
        loaders.append(
            get_dolt_loader([filter_writer], True, filter_message,
                            branch_name))

    load_to_dolthub(loaders,
                    clone=True,
                    push=True,
                    remote_name='origin',
                    remote_url=REPO_PATH)
示例#2
0
def test_branch_creator(initial_test_data):
    """
    Check that the callable returned by ``get_branch_creator`` adds a branch.

    The fixture repo is expected to start with only ``master``; after the
    creator runs, it must return the requested name and that name must show
    up in the repo's branch list.
    """
    target = 'new-branch'
    dolt_repo = initial_test_data

    # Precondition: the fixture starts with a single branch.
    assert dolt_repo.get_branch_list() == ['master']

    create_branch = get_branch_creator(target)
    created = create_branch(dolt_repo)

    assert created == target
    assert target in dolt_repo.get_branch_list()
示例#3
0
def load(date_string: str, dump_target: str):
    """
    Queue the case-sensitive and case-insensitive ngram loaders for one
    Wikipedia dump and send them to DoltHub.

    Three steps are queued, in order: a loader for the case-sensitive writers
    (no branch argument is passed, so ``get_dolt_loader``'s default applies),
    a branch creator for ``<date_string>/case-sensitive``, and a loader for
    the ``lower='_lower'`` writers on ``<date_string>/case-insensitive``.

    :param date_string: dump date used in commit messages and branch names
    :param dump_target: forwarded to ``fetch_data``
    :return: None
    """
    # NOTE(review): fetch_data's result is treated as an article count and
    # forwarded to get_writers -- confirm against fetch_data.
    article_count = fetch_data(dump_target)

    # Case-sensitive ngrams.
    sensitive_writers = get_writers(date_string, article_count)
    sensitive_message = 'Update case-sensitive ngrams for dump date {}'.format(
        date_string)
    sensitive_loader = get_dolt_loader(sensitive_writers, True,
                                       sensitive_message)
    sensitive_branch = get_branch_creator(
        '{}/case-sensitive'.format(date_string))

    # Case-insensitive ngrams.
    insensitive_message = 'Update case-insensitive ngrams for dump date {}'.format(
        date_string)
    insensitive_writers = get_writers(date_string,
                                      article_count,
                                      lower='_lower')
    insensitive_loader = get_dolt_loader(
        insensitive_writers, True, insensitive_message,
        '{}/case-insensitive'.format(date_string))

    pipeline = [sensitive_loader, sensitive_branch, insensitive_loader]
    load_to_dolthub(pipeline,
                    clone=True,
                    push=True,
                    remote_name='origin',
                    remote_url=REPO_PATH)
示例#4
0
def get_wikipedia_loaders(branch_date: str):
    """
    Build the list of loaders for the Wikipedia word-frequency tables.

    The returned list contains, in order:

    1. a loader for the unfiltered ``word_frequency`` table on ``master``;
    2. a branch creator for ``branch_date``;
    3. one loader per entry of ``FILTER_NAMES``, each targeting the branch
       ``<branch_date>/filter_<filter_name>``.

    :param branch_date: identifier of the XML dump; it is interpolated into
        commit messages and branch names (presumably a date string -- confirm
        with the caller)
    :return: list of loader callables, in execution order
    """
    loaders = []
    master_writer = get_df_table_writer('word_frequency',
                                        get_master_df_builder(),
                                        pk_cols=['word'],
                                        import_mode='replace')
    message = 'Update Wikipedia word frequencies for {} XML dump'.format(
        branch_date)
    # NOTE(review): the positional ``True`` is presumably the "commit" flag of
    # get_dolt_loader -- confirm against the doltpy signature.
    loaders.append(get_dolt_loader([master_writer], True, message, 'master'))

    loaders.append(get_branch_creator(branch_date))

    for filter_name in FILTER_NAMES:
        filter_writer = get_df_table_writer('word_frequency',
                                            get_filter_df_builder(filter_name),
                                            pk_cols=['word'],
                                            import_mode='replace')
        branch_name = '{}/filter_{}'.format(branch_date, filter_name)
        # The template reads "with <filter> filter for <dump> XML dump", so the
        # filter name must come first and the dump identifier second.
        filter_message = 'Update Wikipedia word frequencies with {} filter for {} XML dump'.format(
            filter_name, branch_date)
        loaders.append(
            get_dolt_loader([filter_writer], True, filter_message,
                            branch_name))

    return loaders
示例#5
0
def test_branch_creator(initial_test_data):
    """
    Check that the callable returned by ``get_branch_creator`` adds a branch.

    Branch names are read from the second element of ``repo.branch()``. The
    fixture is expected to start with only ``master``; afterwards the creator
    must return the requested name and that name must be listed.
    """
    target = 'new-branch'
    dolt_repo = initial_test_data

    # Precondition: the fixture starts with a single branch.
    _, before = dolt_repo.branch()
    names_before = [entry.name for entry in before]
    assert names_before == ['master']

    created = get_branch_creator(target)(dolt_repo)
    assert created == target

    _, after = dolt_repo.branch()
    names_after = [entry.name for entry in after]
    assert target in names_after