Example #1
def test_integrity():

    # Want to test database insert functionality to check that:
    # 1. A record can be written and its uid is kept out of the searchable text
    # 2. The unique_id_col setting persists when reconnecting to the file

    record_dicts = []
    record_dict = {"uid": 0, "value": "hello"}
    record_dicts.append(record_dict)

    # Test it works at the level of the record
    record = Record(record_dict, unique_id_col='uid')

    # Test it works at the level of the db
    db_filename = tempfile.NamedTemporaryFile().name
    db = SearchDatabase(db_filename)

    db.write_list_dicts_parallel(record_dicts,
                                 unique_id_col='uid',
                                 batch_size=5)

    # Check the record is written, with the uid stored in its own column and
    # excluded from the concatenated search text

    sql = 'select * from df'
    rec = db.conn.execute(sql).fetchall()[0]
    assert rec['unique_id'] == '0'
    assert '0' not in rec['concat_all']

    # Reconnect to file and check the unique_id_col is correct
    db2 = SearchDatabase(db_filename)

    assert db2.unique_id_col == 'uid'
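
The assertions above suggest the unique id is stored in its own column while the concatenated search text is built only from the remaining fields. A minimal sketch of that behaviour, with build_concat_all as a hypothetical stand-in for whatever SearchDatabase does internally:

def build_concat_all(record_dict, unique_id_col):
    # Concatenate every field except the unique id into the searchable text
    values = [
        str(v) for k, v in record_dict.items()
        if k != unique_id_col and v is not None
    ]
    return " ".join(values)

assert build_concat_all({"uid": 0, "value": "hello"}, "uid") == "hello"
assert "0" not in build_concat_all({"uid": 0, "value": "hello"}, "uid")
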
Example #2
def test_build_and_search():

    db_filename = tempfile.NamedTemporaryFile().name

    db = SearchDatabase(db_filename)
    rec1 = {"unique_id": 1, "first_name": "robin", "surname": "linacre"}
    rec2 = {"unique_id": 2, "first_name": "robyn", "surname": "linaker"}
    rec3 = {"unique_id": 3, "first_name": "robin", "surname": "linacre"}
    rec3 = {"unique_id": 4, "first_name": "david", "surname": "smith"}

    dicts = [rec1, rec2, rec3, rec4]
    db.write_list_dicts_parallel(dicts, unique_id_col="unique_id")

    db.build_or_replace_stats_tables()

    search_rec = {"unique_id": 4, "first_name": "robin", "surname": None}

    assert 1 in db.find_potental_matches(search_rec).keys()

    # With record caching, we want to make sure that if the search rec is changed but the unique id
    # is for some reason left the same, we get different search results

    search_rec = {"unique_id": 4, "first_name": "david", "surname": None}

    assert 4 in db.find_potental_matches(search_rec).keys()
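
The second search above reuses unique_id 4 with different field values, guarding against a cache keyed on the id alone. One way to get that property, sketched here with a hypothetical cached_search helper, is to key the cache on the record's full contents:

import hashlib
import json

_cache = {}

def cached_search(record, search_fn):
    # Key on the full record contents, not the unique id, so a reused id
    # with different fields can never be served a stale result
    key = hashlib.sha256(
        json.dumps(record, sort_keys=True, default=str).encode()
    ).hexdigest()
    if key not in _cache:
        _cache[key] = search_fn(record)
    return _cache[key]
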
Example #3
def test_json_problem():

    db_filename = tempfile.NamedTemporaryFile().name

    db = SearchDatabase(db_filename)
    rec1 = {"unique_id": 1, "first_name": "robin", "int_problem": 1}
    rec2 = {"unique_id": 2, "first_name": "robyn", "int_problem": 2}
    rec3 = {"unique_id": 3, "first_name": "robin", "int_problem": 3}
    rec3 = {"unique_id": 4, "first_name": "david", "int_problem": None}

    import pandas as pd

    dicts = [rec1, rec2, rec3, rec4]
    df = pd.DataFrame(dicts)
    df["int_problem"] = df["int_problem"].astype(pd.Int64Dtype())

    db.write_pandas_dataframe(df, unique_id_col="unique_id")
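
The test name hints at the serialisation issue pandas' nullable Int64 dtype causes: missing values become pd.NA, which the standard json encoder rejects. A short demonstration of that failure, independent of SearchDatabase:

import json

import pandas as pd

df = pd.DataFrame([{"int_problem": 1}, {"int_problem": None}])
df["int_problem"] = df["int_problem"].astype(pd.Int64Dtype())

records = df.to_dict(orient="records")
try:
    json.dumps(records)
except TypeError as e:
    print(e)  # Object of type NAType is not JSON serializable

# pandas' own encoder emits null for pd.NA
print(df.to_json(orient="records"))  # [{"int_problem":1},{"int_problem":null}]
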
Example #4
def test_record():

    db_filename = tempfile.NamedTemporaryFile().name

    db = SearchDatabase(db_filename)
    rec1 = {
        'unique_id': "rectest_1",
        'first_name': 'robin',
        'surname': 'linacre'
    }
    rec2 = {
        'unique_id': "rectest_2",
        'first_name': 'robyn',
        'surname': 'linaker'
    }
    rec3 = {
        'unique_id': "rectest_3",
        'first_name': 'robin',
        'surname': 'linacre'
    }

    dicts = [rec1, rec2, rec3]
    db.write_list_dicts_parallel(dicts, unique_id_col='unique_id')

    db.build_or_replace_stats_tables()

    # You have to be careful with caching here - deliberately give each search
    # record a fresh unique id so distinct cache entries are assigned
    search_rec = {
        'unique_id': 'search_rec_1',
        'first_name': 'robin',
        'surname': "smith"
    }

    r = Record(search_rec, 'unique_id', db.conn)

    assert 'ROBIN' in r.tokens_in_order_of_rarity
    assert 'SMITH' not in r.tokens_in_order_of_rarity

    search_rec = {
        'unique_id': 'search_rec_2',
        'first_name': 'dave',
        'surname': "linacre"
    }
    r = Record(search_rec, 'unique_id', db.conn)

    assert 'LINACRE' in r.tokens_in_order_of_rarity
    assert 'DAVE' not in r.tokens_in_order_of_rarity
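
The assertions suggest tokens_in_order_of_rarity keeps only tokens that actually occur in the database, ordered rarest first; tokens unseen in the corpus ('SMITH', 'DAVE') are dropped. A guess at those semantics in plain Python:

from collections import Counter

corpus = ["ROBIN", "LINACRE", "ROBYN", "LINAKER", "ROBIN", "LINACRE"]
counts = Counter(corpus)

def tokens_in_order_of_rarity(tokens, counts):
    # Discard tokens the corpus has never seen, then sort rarest first
    return sorted((t for t in tokens if t in counts), key=lambda t: counts[t])

print(tokens_in_order_of_rarity(["ROBIN", "SMITH"], counts))   # ['ROBIN']
print(tokens_in_order_of_rarity(["DAVE", "LINACRE"], counts))  # ['LINACRE']
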
Example #5
def test_integrity(db_con_string):

    # Want to test database insert functionality to check that:
    # 1. It's not possible to add the same unique_id twice
    # 2. Token counts are computed correctly when you try to add the same unique_id twice
    if db_con_string == "temp":
        db_filename = tempfile.NamedTemporaryFile().name
    else:
        db_filename = db_con_string

    db = SearchDatabase(db_filename)

    rec_tokens = []
    rec_tokens.extend(["a"] * 1)
    rec_tokens.extend(["b"] * 2)
    rec_tokens.extend(["c"] * 3)
    rec_tokens.extend(["d"] * 4)

    records = []
    for rec_num, char in enumerate(rec_tokens):
        record = {"unique_id": rec_num, "value": char}
        records.append(record)

    db.write_list_dicts_parallel(records,
                                 unique_id_col='unique_id',
                                 batch_size=5)

    db._update_token_stats_tables()

    # At this point 'a' should account for 1 of the 10 tokens and 'b' for 2 of 10

    sql_tkn_count = """
    select token_proportion

    from value_token_counts
    where token = 'A'
    """

    results = db.conn.execute(sql_tkn_count)
    results = results.fetchall()
    assert results[0]["token_proportion"] == 0.1

    sql_tkn_count = """
    select token_proportion

    from value_token_counts
    where token = 'B'
    """

    results = db.conn.execute(sql_tkn_count)
    results = results.fetchall()
    assert results[0]["token_proportion"] == 0.2

    # Add another 10 As.  Now there are 11 in 20
    records = []
    for i in range(10, 20):
        record = {"unique_id": i, "value": "a"}
        records.append(record)

    db.write_list_dicts_parallel(records,
                                 unique_id_col='unique_id',
                                 batch_size=5)

    db._update_token_stats_tables()

    sql_tkn_count = """
    select token_proportion

    from value_token_counts
    where token = 'A'
    """

    results = db.conn.execute(sql_tkn_count)
    results = results.fetchall()
    assert results[0]["token_proportion"] == 0.55

    # Add another 10 As, with repeated IDs, so they should be skipped
    records = []
    for i in range(10, 20):
        record = {"unique_id": i, "value": "a"}
        records.append(record)

    db.write_list_dicts_parallel(records,
                                 unique_id_col='unique_id',
                                 batch_size=5)

    db._update_token_stats_tables()

    sql_tkn_count = """
    select token_proportion

    from value_token_counts
    where token = 'A'
    """

    results = db.conn.execute(sql_tkn_count)
    results = results.fetchall()
    assert results[0]["token_proportion"] == 0.55

    # Token proportions should sum to 1
    sql_tkn_count = """
    select sum(token_proportion) as sum

    from value_token_counts

    """
    results = db.conn.execute(sql_tkn_count)
    results = results.fetchall()
    assert results[0]["sum"] == 1.00
Example #6
def test_integrity():

    # Want to test database insert functionality to check that:
    # 1. It's not possible to add the same unique_id twice
    # 2. Token counts are computed correctly when you try to add the same unique_id twice

    db_filename = tempfile.NamedTemporaryFile().name

    db = SearchDatabase(db_filename)

    rec_tokens = []
    rec_tokens.extend(["a"] * 1)
    rec_tokens.extend(["b"] * 2)
    rec_tokens.extend(["c"] * 3)
    rec_tokens.extend(["d"] * 4)

    records = []
    for rec_num, char in enumerate(rec_tokens):
        record = {"unique_id": rec_num, "value": char}
        records.append(record)

    db.write_list_dicts_parallel(records,
                                 unique_id_col='unique_id',
                                 batch_size=5,
                                 write_column_counters=False)

    # The column counters were skipped, so they should now be out of sync with the data

    # Add another 10 As.  Now there are 11 in 20
    records = []
    for i in range(10, 20):
        record = {"unique_id": i, "value": "a"}
        records.append(record)

    db.write_list_dicts_parallel(records,
                                 unique_id_col='unique_id',
                                 batch_size=5,
                                 write_column_counters=False)

    # The state table should report the counters as out of sync
    assert db.get_value_from_db_state_table('col_counters_in_sync') == 'false'

    db.write_all_col_counters_to_db()

    assert db.get_value_from_db_state_table('col_counters_in_sync') == 'true'

    db._update_token_stats_tables()

    sql_tkn_count = """
    select token_proportion

    from value_token_counts
    where token = 'A'
    """

    results = db.conn.execute(sql_tkn_count)
    results = results.fetchall()
    assert results[0]["token_proportion"] == 0.55

    # Add another 10 As, with repeated IDs, so they should be skipped
    records = []
    for i in range(10, 20):
        record = {"unique_id": i, "value": "a"}
        records.append(record)

    db.write_list_dicts_parallel(records,
                                 unique_id_col='unique_id',
                                 batch_size=5)

    db._update_token_stats_tables()

    sql_tkn_count = """
    select token_proportion

    from value_token_counts
    where token = 'A'
    """

    results = db.conn.execute(sql_tkn_count)
    results = results.fetchall()
    assert results[0]["token_proportion"] == 0.55

    # Token proportions should sum to 1
    sql_tkn_count = """
    select sum(token_proportion) as sum

    from value_token_counts

    """
    results = db.conn.execute(sql_tkn_count)
    results = results.fetchall()
    assert results[0]["sum"] == 1.00

    db2 = SearchDatabase(db_filename)

    assert db2.get_value_from_db_state_table('col_counters_in_sync') == 'true'

    db2.write_list_dicts_parallel(records,
                                  unique_id_col='unique_id',
                                  batch_size=5,
                                  write_column_counters=False)

    with pytest.warns(UserWarning):
        db3 = SearchDatabase(db_filename)

    assert db3.get_value_from_db_state_table('col_counters_in_sync') == 'false'
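
The col_counters_in_sync flag above reads like a row in a small key/value state table that is flipped to 'false' whenever a write skips the column counters. A sketch of that pattern, with db_state as an assumed table name:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("create table db_state (key text primary key, value text)")
conn.execute("insert into db_state values ('col_counters_in_sync', 'true')")

def mark_counters_stale(conn):
    # Any write that skips the counters flips the flag to 'false'
    conn.execute(
        "update db_state set value = 'false' where key = 'col_counters_in_sync'"
    )

mark_counters_stale(conn)
value = conn.execute(
    "select value from db_state where key = 'col_counters_in_sync'"
).fetchone()[0]
assert value == 'false'
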
Example #7
def test_integrity():

    # Want to test database insert functionality to check that:
    # 1. It's not possible to add the same unique_id twice
    # 2. Token counts are computed correctly when you try to add the same unique_id twice
    db_filename = tempfile.NamedTemporaryFile().name
    db = SearchDatabase(db_filename)
    records = []
    for char in list(string.ascii_lowercase):
        record = {"unique_id": char, "value": char}
        records.append(record)

    db.write_list_dicts_parallel(records,
                                 unique_id_col='unique_id',
                                 batch_size=5)

    sql_df_count = """
    select count(*) as count from df
    """

    results = db.conn.execute(sql_df_count)
    results = results.fetchall()
    assert results[0]["count"] == 26

    db2 = SearchDatabase(db_filename)
    db2.write_list_dicts_parallel(records,
                                  unique_id_col='unique_id',
                                  batch_size=10)

    results = db2.conn.execute(sql_df_count)
    results = results.fetchall()
    assert results[0]["count"] == 26

    # At the moment, all tokens should have a count of 1

    sql_tkn_count = """
    select
        max(token_count) as max,
        min(token_count) as min,
        count(*) as count
    from value_token_counts
    """

    results = db2.conn.execute(sql_tkn_count)
    results = results.fetchall()
    assert results[0]["max"] == 1
    assert results[0]["min"] == 1
    assert results[0]["count"] == 26

    # Note records now deliberately includes 29 items; we expect three new rows
    for char in ["a", "b", "c"]:
        record = {"unique_id": f"{char}_2", "value": char}
        records.append(record)

    db2.write_list_dicts_parallel(records,
                                  unique_id_col='unique_id',
                                  batch_size=10)

    results = db2.conn.execute(sql_df_count)
    results = results.fetchall()

    assert results[0]["count"] == 29

    results = db2.conn.execute(sql_tkn_count)
    results = results.fetchall()
    assert results[0]["max"] == 2
    assert results[0]["min"] == 1
    assert results[0]["count"] == 26

    sql_count_a = """
    select token_count
    from value_token_counts
    where token = 'A'
    """

    results = db2.conn.execute(sql_count_a)
    results = results.fetchall()
    assert results[0]["token_count"] == 2