예제 #1
0
def test_update_anahashes(dbsession, datafiles):
    """Check that update_anahashes creates anahashes and links them to wordforms.

    Three wordforms are stored without anahashes; after running
    ``update_anahashes`` with the alphabet file from ``datafiles`` there must
    be three anahashes, each referenced by the expected wordform.
    """
    alphabet_file = datafiles.listdir()[0]

    bulk_add_wordforms(
        dbsession, pd.DataFrame({'wordform': ['wf-a', 'wf-b', 'wf-c']})
    )

    # Before the update: no anahashes exist and no wordform points to one.
    stored_anahashes = (
        dbsession.query(Anahash).order_by(Anahash.anahash_id).all()
    )
    assert not stored_anahashes

    stored_wordforms = (
        dbsession.query(Wordform).order_by(Wordform.wordform_id).all()
    )
    assert all(wordform.anahash is None for wordform in stored_wordforms)

    update_anahashes(dbsession, alphabet_file)

    # Without this commit the updated anahashes are not visible to the
    # queries below.
    dbsession.commit()

    stored_wordforms = (
        dbsession.query(Wordform).order_by(Wordform.wordform_id).all()
    )
    stored_anahashes = (
        dbsession.query(Anahash).order_by(Anahash.anahash_id).all()
    )

    # Three anahashes were added.
    assert len(stored_anahashes) == 3

    # The anahashes are connected to the correct wordforms.
    expected_ids = (3, 2, 1)
    assert all(
        wordform.anahash_id == expected_id
        for wordform, expected_id in zip(stored_wordforms, expected_ids)
    )
예제 #2
0
def test_bulk_add_wordforms_not_unique(dbsession):
    """Check that duplicate wordforms in the input are stored only once."""
    duplicated = pd.DataFrame({'wordform': ['wf1', 'wf1', 'wf2']})

    print(dbsession)

    bulk_add_wordforms(dbsession, duplicated)

    stored = dbsession.query(Wordform).order_by(Wordform.wordform_id).all()

    # 'wf1' occurs twice in the input, so only two rows are expected.
    assert len(stored) == 2
예제 #3
0
def test_update_anahashes_empty_wf(dbsession, datafiles):
    """Check that a whitespace-only wordform does not break update_anahashes.

    The blank entry must be dropped when the wordforms are added, so that
    ``update_anahashes`` (which runs ticcl) never sees an empty wordform.
    """
    wfs = pd.DataFrame()
    wfs['wordform'] = ['wf-a', 'wf-b', 'wf-c', ' ']

    alphabet_file = datafiles.listdir()[0]

    bulk_add_wordforms(dbsession, wfs)

    # Verify explicitly that the blank wordform was not stored; previously
    # this was only stated in a comment and never asserted.
    stored = dbsession.query(Wordform).order_by(Wordform.wordform_id).all()
    assert len(stored) == 3
    assert all(wf.wordform.strip() for wf in stored)

    # make sure ticcl doesn't choke on the empty wordform (it must not be added
    # to the database)
    update_anahashes(dbsession, alphabet_file)
예제 #4
0
def test_bulk_add_wordforms_all_new(dbsession):
    """Check that all wordforms are stored, in order, when none exist yet."""
    wfs = pd.DataFrame({'wordform': ['wf1', 'wf2', 'wf3']})

    print(dbsession)

    bulk_add_wordforms(dbsession, wfs)

    stored = dbsession.query(Wordform).order_by(Wordform.wordform_id).all()
    stored_texts = [row.wordform for row in stored]

    # Compare against the input frame: same number of rows, same order.
    assert len(stored) == len(wfs['wordform'])
    assert stored_texts == list(wfs['wordform'])
예제 #5
0
def test_bulk_add_wordforms_drop_empty_and_nan(dbsession):
    """Check that empty-string and NaN wordforms are dropped before insert."""
    wfs = pd.DataFrame()
    # np.nan is the canonical spelling; the np.NaN alias was removed in
    # NumPy 2.0 and raises AttributeError there.
    wfs["wordform"] = ["wf1", "", "wf2", np.nan]

    bulk_add_wordforms(dbsession, wfs)

    wrdfrms = dbsession.query(Wordform).order_by(Wordform.wordform_id).all()

    # Only the two real wordforms remain, in their original order.
    assert len(wrdfrms) == 2
    assert wrdfrms[0].wordform == "wf1"
    assert wrdfrms[1].wordform == "wf2"
예제 #6
0
def test_bulk_add_wordforms_whitespace(dbsession):
    """Check whitespace handling when adding wordforms.

    Surrounding whitespace is expected to be stripped, and entries that
    consist of whitespace only are expected to be dropped.
    """
    raw = pd.DataFrame({'wordform': ['wf1 ', '  wf2', ' ', '    \t']})

    print(dbsession)

    bulk_add_wordforms(dbsession, raw)

    stored = dbsession.query(Wordform).order_by(Wordform.wordform_id).all()

    assert len(stored) == 2
    first, second = stored[0], stored[1]
    assert first.wordform == 'wf1'
    assert second.wordform == 'wf2'
예제 #7
0
def test_bulk_add_wordforms_replace_underscores(dbsession):
    """Check the character substitutions applied to stored wordforms.

    An underscore in the input is expected to be stored as ``*`` and a space
    as ``_``.
    """
    raw = pd.DataFrame({"wordform": ["wf_1", "wf 2"]})

    print(dbsession)

    bulk_add_wordforms(dbsession, raw)

    stored = dbsession.query(Wordform).order_by(Wordform.wordform_id).all()

    assert len(stored) == 2
    first, second = stored[0], stored[1]
    assert first.wordform == "wf*1"
    assert second.wordform == "wf_2"
예제 #8
0
def test_get_word_frequency_df(dbsession):
    """Check the frame returned by get_word_frequency_df for fresh wordforms.

    Each of the three stored wordforms is expected with a frequency of 1,
    indexed by wordform.
    """
    bulk_add_wordforms(
        dbsession, pd.DataFrame({'wordform': ['wf1', 'wf2', 'wf3']})
    )

    expected_columns = {
        'wordform': ['wf1', 'wf2', 'wf3'],
        'frequency': [1, 1, 1],
    }
    expected = pd.DataFrame(expected_columns).set_index('wordform')

    result = get_word_frequency_df(dbsession)

    assert result.equals(expected)
예제 #9
0
def test_bulk_add_anahashes(dbsession):
    """Check that bulk_add_anahashes stores every anahash value it is given."""
    wfs = pd.DataFrame()
    wfs['wordform'] = ['wf1', 'wf2', 'wf3']

    bulk_add_wordforms(dbsession, wfs)

    anahash_df = pd.DataFrame({
        'wordform': ['wf1', 'wf2', 'wf3'],
        'anahash': [1, 2, 3]
    }).set_index('wordform')

    bulk_add_anahashes(dbsession, anahash_df)

    stored = dbsession.query(Anahash).order_by(Anahash.anahash_id).all()

    # The loop variable has its own name so it cannot be confused with (or,
    # on older interpreters, overwrite) the DataFrame used on the right-hand
    # side of the comparison.
    assert [row.anahash for row in stored] == list(anahash_df['anahash'])
예제 #10
0
def test_bulk_add_wordforms_some_new(dbsession):
    """Check that only wordforms not yet in the database are added.

    After storing wf1..wf3, a second batch wf3..wf5 must add exactly two new
    rows (wf4 and wf5).
    """
    wfs = pd.DataFrame({'wordform': ['wf1', 'wf2', 'wf3']})

    print(dbsession)

    bulk_add_wordforms(dbsession, wfs)

    # Reuse the same frame for the second batch; 'wf3' already exists.
    wfs['wordform'] = ['wf3', 'wf4', 'wf5']
    # NOTE(review): the last lowercase value is 'wf4' while the wordform is
    # 'wf5' -- possibly a typo; confirm whether bulk_add_wordforms reads
    # this column at all.
    wfs['wordform_lowercase'] = ['wf3', 'wf4', 'wf4']

    added = bulk_add_wordforms(dbsession, wfs)

    assert added == 2

    stored = dbsession.query(Wordform).order_by(Wordform.wordform_id).all()
    stored_texts = [row.wordform for row in stored]

    assert len(stored) == 5
    assert stored_texts == ['wf1', 'wf2', 'wf3', 'wf4', 'wf5']
예제 #11
0
def test_update_anahashes_nothing_to_update(dbsession, datafiles):
    """Check that update_anahashes leaves already-connected wordforms intact.

    All wordforms get an anahash up front, so the subsequent call to
    ``update_anahashes`` has nothing to do and must not change the links.
    """
    wfs = pd.DataFrame()
    wfs['wordform'] = ['wf1', 'wf2', 'wf3']

    bulk_add_wordforms(dbsession, wfs)

    # Build the wordform -> wordform_id mapping from the database instead of
    # reusing the anahash values, which only matched the ids by coincidence
    # (both happened to be 1, 2, 3).
    wf_ids = get_word_frequency_df(dbsession, add_ids=True)
    wf_mapping = wf_ids['wordform_id'].to_dict()

    anahash_df = pd.DataFrame({
        'wordform': ['wf1', 'wf2', 'wf3'],
        'anahash': [1, 2, 3]
    }).set_index('wordform')

    bulk_add_anahashes(dbsession, anahash_df)

    connect_anahashes_to_wordforms(dbsession, anahash_df, wf_mapping)
    alphabet_file = datafiles.listdir()[0]
    update_anahashes(dbsession, alphabet_file)

    wrdfrms = dbsession.query(Wordform).order_by(Wordform.wordform_id).all()

    assert [wf.anahash.anahash for wf in wrdfrms] == list(anahash_df['anahash'])
예제 #12
0
def test_connect_anahashes_to_wordforms(dbsession):
    """Check that connect_anahashes_to_wordforms links each wordform to its anahash."""
    bulk_add_wordforms(
        dbsession, pd.DataFrame({'wordform': ['wf1', 'wf2', 'wf3']})
    )

    # Mapping taken from the 'wordform_id' column of the frequency frame.
    freq_with_ids = get_word_frequency_df(dbsession, add_ids=True)
    wf_mapping = freq_with_ids['wordform_id'].to_dict()

    anahash_columns = {
        'wordform': ['wf1', 'wf2', 'wf3'],
        'anahash': [1, 2, 3],
    }
    anahash_df = pd.DataFrame(anahash_columns).set_index('wordform')

    bulk_add_anahashes(dbsession, anahash_df)

    connect_anahashes_to_wordforms(dbsession, anahash_df, wf_mapping)

    stored = dbsession.query(Wordform).order_by(Wordform.wordform_id).all()
    linked_values = [row.anahash.anahash for row in stored]

    assert linked_values == list(anahash_df['anahash'])
예제 #13
0
def test_connect_anahashes_to_wordforms_empty(dbsession):
    """Check that connecting anahashes a second time is a harmless no-op.

    The second call has nothing left to update; it must not raise and must
    leave the existing links unchanged.
    """
    wfs = pd.DataFrame()
    wfs['wordform'] = ['wf1', 'wf2', 'wf3']

    bulk_add_wordforms(dbsession, wfs)

    # Build the wordform -> wordform_id mapping from the database instead of
    # reusing the anahash values, which only matched the ids by coincidence
    # (both happened to be 1, 2, 3).
    wf_ids = get_word_frequency_df(dbsession, add_ids=True)
    wf_mapping = wf_ids['wordform_id'].to_dict()

    anahash_df = pd.DataFrame({
        'wordform': ['wf1', 'wf2', 'wf3'],
        'anahash': [1, 2, 3]
    }).set_index('wordform')

    bulk_add_anahashes(dbsession, anahash_df)

    connect_anahashes_to_wordforms(dbsession, anahash_df, wf_mapping)

    # nothing was updated the second time around (the values didn't change)
    # (and there is no error when running this)
    connect_anahashes_to_wordforms(dbsession, anahash_df, wf_mapping)

    wrdfrms = dbsession.query(Wordform).order_by(Wordform.wordform_id).all()

    assert [wf.anahash.anahash for wf in wrdfrms] == list(anahash_df['anahash'])