def test_update_anahashes(dbsession, datafiles):
    """Check that update_anahashes adds anahashes and links them to wordforms.

    Three wordforms are inserted without anahashes. After update_anahashes
    has run with the first file found in ``datafiles`` as alphabet file,
    three Anahash rows must exist and every wordform must reference the
    expected anahash id.
    """
    wfs = pd.DataFrame()
    wfs['wordform'] = ['wf-a', 'wf-b', 'wf-c']

    alphabet_file = datafiles.listdir()[0]

    bulk_add_wordforms(dbsession, wfs)

    # Before the update there are no anahashes at all ...
    anahashes = dbsession.query(Anahash).order_by(Anahash.anahash_id).all()
    assert len(anahashes) == 0

    # ... and all wordforms are present but not yet linked. The explicit
    # length check keeps the loop below from passing vacuously on an empty
    # query result.
    wrdfrms = dbsession.query(Wordform).order_by(Wordform.wordform_id).all()
    assert len(wrdfrms) == 3
    for w in wrdfrms:
        assert w.anahash is None

    update_anahashes(dbsession, alphabet_file)

    # If we don't commit here, the anahashes won't be updated when we do the
    # tests.
    dbsession.commit()

    wrdfrms = dbsession.query(Wordform).order_by(Wordform.wordform_id).all()
    anahashes = dbsession.query(Anahash).order_by(Anahash.anahash_id).all()

    # Three anahashes were added
    assert len(anahashes) == 3

    # The anahashes are connected to the correct wordforms. Comparing whole
    # lists (rather than zipping) also fails when a wordform is missing,
    # because zip() would silently stop at the shorter sequence.
    assert [wf.anahash_id for wf in wrdfrms] == [3, 2, 1]
def test_bulk_add_wordforms_not_unique(dbsession):
    """Check that a duplicated wordform in the input is stored only once.

    The input holds 'wf1' twice and 'wf2' once; two Wordform rows are
    expected afterwards.
    """
    wfs = pd.DataFrame({'wordform': ['wf1', 'wf1', 'wf2']})

    bulk_add_wordforms(dbsession, wfs)

    stored = dbsession.query(Wordform).order_by(Wordform.wordform_id).all()
    assert len(stored) == 2
def test_update_anahashes_empty_wf(dbsession, datafiles):
    """Run update_anahashes on input that includes a whitespace-only wordform.

    The test contains no assertions: it passes as long as update_anahashes
    completes without raising.
    """
    alphabet_file = datafiles.listdir()[0]

    wfs = pd.DataFrame({'wordform': ['wf-a', 'wf-b', 'wf-c', ' ']})
    bulk_add_wordforms(dbsession, wfs)

    # make sure ticcl doesn't choke on the empty wordform (it must not be added
    # to the database)
    update_anahashes(dbsession, alphabet_file)
def test_bulk_add_wordforms_all_new(dbsession):
    """Check that three previously unknown wordforms are all stored.

    The stored wordforms, ordered by wordform_id, must match the 'wordform'
    column of the input frame in both number and order.
    """
    wfs = pd.DataFrame({'wordform': ['wf1', 'wf2', 'wf3']})

    bulk_add_wordforms(dbsession, wfs)

    wordforms = dbsession.query(Wordform).order_by(Wordform.wordform_id).all()

    assert len(wordforms) == len(wfs['wordform'])
    assert [wf.wordform for wf in wordforms] == list(wfs['wordform'])
def test_bulk_add_wordforms_drop_empty_and_nan(dbsession):
    """Check that empty-string and NaN entries are not stored as wordforms.

    Of the four input values only "wf1" and "wf2" are expected in the
    database, in that order.
    """
    wfs = pd.DataFrame()
    # np.nan instead of the np.NaN alias, which no longer exists in NumPy 2.0.
    wfs["wordform"] = ["wf1", "", "wf2", np.nan]

    bulk_add_wordforms(dbsession, wfs)

    wrdfrms = dbsession.query(Wordform).order_by(Wordform.wordform_id).all()

    assert len(wrdfrms) == 2
    assert wrdfrms[0].wordform == "wf1"
    assert wrdfrms[1].wordform == "wf2"
def test_bulk_add_wordforms_whitespace(dbsession):
    """Check how leading/trailing and whitespace-only input is stored.

    'wf1 ' and ' wf2' are expected to be stored as 'wf1' and 'wf2'; the two
    whitespace-only entries are expected not to be stored at all.
    """
    wfs = pd.DataFrame({'wordform': ['wf1 ', ' wf2', ' ', ' \t']})

    bulk_add_wordforms(dbsession, wfs)

    stored = dbsession.query(Wordform).order_by(Wordform.wordform_id).all()

    assert len(stored) == 2
    assert stored[0].wordform == 'wf1'
    assert stored[1].wordform == 'wf2'
def test_bulk_add_wordforms_replace_underscores(dbsession):
    """Check the stored form of wordforms containing '_' or a space.

    "wf_1" is expected to be stored as "wf*1" and "wf 2" as "wf_2".
    """
    wfs = pd.DataFrame({"wordform": ["wf_1", "wf 2"]})

    bulk_add_wordforms(dbsession, wfs)

    stored = dbsession.query(Wordform).order_by(Wordform.wordform_id).all()

    assert len(stored) == 2
    assert stored[0].wordform == "wf*1"
    assert stored[1].wordform == "wf_2"
def test_get_word_frequency_df(dbsession):
    """Check the frame returned by get_word_frequency_df for three wordforms.

    The result must equal a frame indexed by 'wordform' with a single
    'frequency' column holding 1 for each of the three wordforms.
    """
    wfs = pd.DataFrame({'wordform': ['wf1', 'wf2', 'wf3']})
    bulk_add_wordforms(dbsession, wfs)

    expected = pd.DataFrame({
        'wordform': ['wf1', 'wf2', 'wf3'],
        'frequency': [1, 1, 1],
    })
    expected = expected.set_index('wordform')

    freq_df = get_word_frequency_df(dbsession)

    assert freq_df.equals(expected)
def test_bulk_add_anahashes(dbsession):
    """Check that bulk_add_anahashes stores every anahash value it is given.

    A frame indexed by wordform with anahash values 1, 2 and 3 is passed in;
    the Anahash rows, ordered by anahash_id, must carry exactly those values.
    """
    wfs = pd.DataFrame({'wordform': ['wf1', 'wf2', 'wf3']})
    bulk_add_wordforms(dbsession, wfs)

    anahash_df = pd.DataFrame({
        'wordform': ['wf1', 'wf2', 'wf3'],
        'anahash': [1, 2, 3],
    }).set_index('wordform')

    bulk_add_anahashes(dbsession, anahash_df)

    ahs = dbsession.query(Anahash).order_by(Anahash.anahash_id).all()

    # The loop variable gets its own name so it cannot be confused with the
    # DataFrame on the right-hand side of the comparison.
    assert [ah.anahash for ah in ahs] == list(anahash_df['anahash'])
def test_bulk_add_wordforms_some_new(dbsession):
    """Check a second bulk add that overlaps with already stored wordforms.

    After storing 'wf1'..'wf3', a second call with 'wf3', 'wf4' and 'wf5' is
    expected to return 2 and to leave five wordforms in the database, in
    insertion order.
    """
    wfs = pd.DataFrame()
    wfs['wordform'] = ['wf1', 'wf2', 'wf3']

    bulk_add_wordforms(dbsession, wfs)

    # The same frame is reused for the second batch, as in the original test.
    wfs['wordform'] = ['wf3', 'wf4', 'wf5']
    # NOTE(review): the last value is 'wf4' although the wordform is 'wf5';
    # this looks like a typo but is kept as-is -- confirm whether
    # bulk_add_wordforms uses this column at all.
    wfs['wordform_lowercase'] = ['wf3', 'wf4', 'wf4']

    n = bulk_add_wordforms(dbsession, wfs)

    assert n == 2

    wrdfrms = dbsession.query(Wordform).order_by(Wordform.wordform_id).all()

    assert len(wrdfrms) == 5
    assert [w.wordform for w in wrdfrms] == ['wf1', 'wf2', 'wf3', 'wf4', 'wf5']
def test_update_anahashes_nothing_to_update(dbsession, datafiles):
    """Run update_anahashes when every wordform already has an anahash.

    Wordforms and anahashes are stored and connected up front; after
    update_anahashes the wordforms must still reference anahash values
    1, 2 and 3 in wordform_id order.
    """
    wfs = pd.DataFrame({'wordform': ['wf1', 'wf2', 'wf3']})
    bulk_add_wordforms(dbsession, wfs)

    a = pd.DataFrame({
        'wordform': ['wf1', 'wf2', 'wf3'],
        'anahash': [1, 2, 3],
    })
    a = a.set_index('wordform')
    bulk_add_anahashes(dbsession, a)

    # The wordform -> anahash dict is passed as the third argument, exactly
    # as the original test does.
    mapping = a['anahash'].to_dict()
    connect_anahashes_to_wordforms(dbsession, a, mapping)

    alphabet_file = datafiles.listdir()[0]
    update_anahashes(dbsession, alphabet_file)

    stored = dbsession.query(Wordform).order_by(Wordform.wordform_id).all()
    linked_values = [wf.anahash.anahash for wf in stored]

    assert linked_values == list(a['anahash'])
def test_connect_anahashes_to_wordforms(dbsession):
    """Check that connect_anahashes_to_wordforms links wordforms to anahashes.

    The wordform -> wordform_id mapping is taken from
    get_word_frequency_df(..., add_ids=True); afterwards each wordform must
    reference the anahash value given for it (1, 2, 3).
    """
    wordform_input = pd.DataFrame({'wordform': ['wf1', 'wf2', 'wf3']})
    bulk_add_wordforms(dbsession, wordform_input)

    freq_df = get_word_frequency_df(dbsession, add_ids=True)
    wf_mapping = freq_df['wordform_id'].to_dict()

    a = pd.DataFrame({
        'wordform': ['wf1', 'wf2', 'wf3'],
        'anahash': [1, 2, 3],
    })
    a = a.set_index('wordform')

    bulk_add_anahashes(dbsession, a)
    connect_anahashes_to_wordforms(dbsession, a, wf_mapping)

    stored = dbsession.query(Wordform).order_by(Wordform.wordform_id).all()
    linked_values = [wf.anahash.anahash for wf in stored]

    assert linked_values == list(a['anahash'])
def test_connect_anahashes_to_wordforms_empty(dbsession):
    """Check that connecting anahashes a second time is harmless.

    connect_anahashes_to_wordforms is called twice with the same arguments;
    the second call must not raise and the wordforms must still reference
    anahash values 1, 2 and 3.
    """
    wfs = pd.DataFrame({'wordform': ['wf1', 'wf2', 'wf3']})
    bulk_add_wordforms(dbsession, wfs)

    a = pd.DataFrame({
        'wordform': ['wf1', 'wf2', 'wf3'],
        'anahash': [1, 2, 3],
    })
    a = a.set_index('wordform')
    bulk_add_anahashes(dbsession, a)

    # First pass connects the wordforms; on the second pass nothing is
    # updated (the values didn't change) and no error may be raised.
    # A fresh dict is built for each call, as in the original.
    for _ in range(2):
        connect_anahashes_to_wordforms(dbsession, a, a['anahash'].to_dict())

    stored = dbsession.query(Wordform).order_by(Wordform.wordform_id).all()
    linked_values = [wf.anahash.anahash for wf in stored]

    assert linked_values == list(a['anahash'])