class Tests(WithTempDir):
    """Tests for the cognate-detection evaluation measures (bcubes, pairs, diff)."""

    def setUp(self):
        WithTempDir.setUp(self)
        self.lex = LexStat(test_data('KSL.qlc'))
        self.part = Partial(test_data('partial_cognates.tsv'), segments='segments')
        # Gold partial cognate ids are copied verbatim; the comparison column
        # is parsed from its space-separated string representation.
        self.part.add_entries('pid1', 'partial_cognate_sets', lambda x: x)
        self.part.add_entries(
            'pid2', 'partialids2', lambda x: [int(y) for y in x.split(' ')])

    def test_bcubes(self):
        res = bcubes(self.lex, test='cogid', pprint=False)
        # Comparing a column against itself must yield perfect scores.
        # NOTE: assertAlmostEqual short-circuits on equality, so tuples are fine.
        self.assertAlmostEqual(res, (1.0, 1.0, 1.0))
        # Smoke-test the printing and per-concept code paths.
        _ = bcubes(self.lex, 'cogid', 'cogid', pprint=True, per_concept=True)

    def test_partial_bcubes(self):
        res = partial_bcubes(self.part, 'pid1', 'pid2', pprint=False)
        assert [round(x, 2) for x in res] == [0.92, 0.98, 0.95]
        # Smoke-test the printing code path.
        _ = partial_bcubes(self.part, 'pid1', 'pid2', pprint=True)

    def test_pairs(self):
        res = pairs(self.lex, test='cogid', pprint=False)
        self.assertAlmostEqual(res, (1.0, 1.0, 1.0))

    def test_diff(self):
        res = diff(self.lex, test='cogid', tofile=False, pprint=False)
        self.assertAlmostEqual(res, ((1.0, 1.0, 1.0), (1.0, 1.0, 1.0)))
        # Derive a deliberately perturbed cognate column so scores drop below 1.
        self.lex.add_entries('cugid', 'cogid', lambda x: x + 1 if x % 2 else x * x)
        _ = diff(self.lex, gold='cogid', test='cogid',
                 filename='%s' % self.tmp_path('test_acd'), pprint=False)
        d2 = diff(self.lex, gold='cugid', test='cogid',
                  filename='%s' % self.tmp_path('test_acd'), pprint=False,
                  tofile=False)
        _ = diff(self.lex, gold='cugid', test='cogid',
                 filename='%s' % self.tmp_path('test_acd'), pprint=False,
                 tofile=True)
        assert d2[0] != 1

    def test_random_cognates(self):
        random_cognates(self.lex, ref='randomid')
        assert 'randomid' in self.lex.header

    def test_extreme_cognates(self):
        # 'lumper' puts everything together, 'splitter' keeps everything apart.
        extreme_cognates(self.lex, ref="lumperid", bias='lumper')
        assert self.lex[1, 'lumperid'] == self.lex[2, 'lumperid']
        extreme_cognates(self.lex, ref='splitterid', bias='splitter')
        assert self.lex[1, 'splitterid'] != self.lex[2, 'splitterid']
        assert_raises(ValueError, extreme_cognates, self.lex, bias='')
def setUp(self):
    """Load the LexStat and Partial fixtures used by the evaluation tests."""
    WithTempDir.setUp(self)
    self.lex = LexStat(test_data('KSL.qlc'))
    self.part = Partial(test_data('partial_cognates.tsv'), segments='segments')
    # Derive the two partial-cognate id columns: the gold column verbatim,
    # the test column parsed from its space-separated string form.
    for target, source, transform in [
        ('pid1', 'partial_cognate_sets', lambda value: value),
        ('pid2', 'partialids2', lambda value: [int(p) for p in value.split(' ')]),
    ]:
        self.part.add_entries(target, source, transform)
class Tests(WithTempDir):
    """Tests for the cognate-detection evaluation measures, importing each
    function lazily from lingpy.evaluate.acd inside the test that uses it."""

    def setUp(self):
        WithTempDir.setUp(self)
        self.lex = LexStat(test_data('KSL.qlc'))
        self.part = Partial(test_data('partial_cognates.tsv'), segments='segments')
        # Gold partial cognate ids are copied verbatim; the comparison column
        # is parsed from its space-separated string representation.
        self.part.add_entries('pid1', 'partial_cognate_sets', lambda x: x)
        self.part.add_entries(
            'pid2', 'partialids2', lambda x: [int(y) for y in x.split(' ')])

    def test_bcubes(self):
        from lingpy.evaluate.acd import bcubes
        res = bcubes(self.lex, test='cogid', pprint=False)
        # Comparing a column against itself must yield perfect scores.
        self.assertAlmostEqual(res, (1.0, 1.0, 1.0))
        # Smoke-test the printing and per-concept code paths.
        _ = bcubes(self.lex, 'cogid', 'cogid', pprint=True, per_concept=True)

    def test_partial_bcubes(self):
        from lingpy.evaluate.acd import partial_bcubes
        res = partial_bcubes(self.part, 'pid1', 'pid2', pprint=False)
        assert [round(x, 2) for x in res] == [0.92, 0.98, 0.95]
        # Smoke-test the printing code path.
        _ = partial_bcubes(self.part, 'pid1', 'pid2', pprint=True)

    def test_pairs(self):
        from lingpy.evaluate.acd import pairs
        res = pairs(self.lex, test='cogid', pprint=False)
        self.assertAlmostEqual(res, (1.0, 1.0, 1.0))

    def test_diff(self):
        from lingpy.evaluate.acd import diff
        res = diff(self.lex, test='cogid', tofile=False, pprint=False)
        self.assertAlmostEqual(res, ((1.0, 1.0, 1.0), (1.0, 1.0, 1.0)))
        # Derive a deliberately perturbed cognate column so scores drop below 1.
        self.lex.add_entries('cugid', 'cogid', lambda x: x + 1 if x % 2 else x * x)
        _ = diff(self.lex, gold='cogid', test='cogid',
                 filename='%s' % self.tmp_path('test_acd'), pprint=False)
        d2 = diff(self.lex, gold='cugid', test='cogid',
                  filename='%s' % self.tmp_path('test_acd'), pprint=False,
                  tofile=False)
        _ = diff(self.lex, gold='cugid', test='cogid',
                 filename='%s' % self.tmp_path('test_acd'), pprint=False,
                 tofile=True)
        assert d2[0] != 1
def writeToFile():
    """Export an annotated wordlist to a LexStat-readable TSV file, run
    LexStat clustering on it, and print B-cubed evaluation scores.

    Reads the IELex 2016 wordlist, drops the held-out validation concepts,
    writes the remainder to ``lexstat_wordlist.txt``, clusters with the
    lexstat method, and compares predicted against gold cognate classes.
    """
    print("LOAD TEST WORDLIST")
    pathToAnnotatedWordList = "Data/IELex/output/IELex-2016.tsv"
    print("LOAD WORDLIST")
    languages, words, global_ids, cognate_classes = \
        loadAnnotatedWordList(pathToAnnotatedWordList)
    print(len(set(global_ids)))
    # Global ids of the concepts reserved for validation; exclude them here.
    stoplist = {221, 646, 1333, 1224, 778, 1402, 1411, 1232, 1203, 1292}
    languages, words, global_ids, cognate_classes = getRidOfValidationSet(
        languages, words, global_ids, cognate_classes, stoplist)
    print(len(set(global_ids)))
    with codecs.open("lexstat_wordlist.txt", "w", encoding="UTF-8") as f:
        f.write("CONCEPT\tIPA\tDOCULECT\tCOGID\n")
        for l, w, gi, cog in zip(languages, words, global_ids, cognate_classes):
            # One row per word: concept id, form, language, gold cognate class.
            f.write("\t".join([str(gi), w, l, cog]) + "\n")
    wl = get_wordlist("lexstat_wordlist.txt", delimiter="\t")
    print(wl.get_dict(concept="730", entry="IPA"))
    print("initializing lexstat")
    lex = LexStat(wl)
    print("getting scorer")
    lex.get_scorer()
    print("clustering")
    lex.cluster(method="lexstat", threshold=0.6, ref="cognate_class_pred")
    print("output")
    lex.output('tsv', filename="lexstat_ielex")
    from lingpy.evaluate.acd import bcubes
    # First call prints its own report (pprint defaults on); the second score
    # tuple is printed explicitly.
    bcubes(lex, "cognate_class", "COGID")
    print(bcubes(lex, "cognate_class", "cognate_class_pred"))
def test_correctness(self):
    """LexStat clustering must group identical forms and separate unrelated ones."""
    lex = LexStat({
        0: ['ID', 'doculect', 'concept', 'IPA'],
        1: ['1', 'deu', 'hand', 'hand'],
        2: ['2', 'eng', 'hand', 'hand'],
        3: ['3', 'xyz', 'hand', 'xyz']})
    lex.get_scorer(**self.get_scorer_kw)
    lex.cluster(ref='cogid')
    # The two identical 'hand' forms share one cognate id; 'xyz' is on its own.
    self.assertEqual(lex.get_entries('cogid'), [[1, 1, 3]])
    lex = LexStat({
        0: ['ID', 'concept', 'ipa', 'doculect'],
        1: ['5424', 'Abend::N', 'swar', 'FRA'],
        2: ['5425', 'Abend::N', 'sware', 'FRA'],
        3: ['5426', 'Abend::N', 'sear3', 'RON'],
        4: ['5427', 'Abend::N', 'ivniN', 'ENG'],
        5: ['5428', 'Abend::N', 'noyt3', 'POR'],
        6: ['5429', 'Abend::N', 'tardi5a', 'POR'],
        7: ['5430', 'Abend::N', 'afd3n', 'DAN'],
    })
    lex.get_scorer()
    lex.cluster(method='lexstat', threshold=0.8, ref='cogid')
    self.assertEqual(
        lex.get_entries('cogid'), [[1, 2, 3, 4, 5], [0, 0, 3, 3, 0]])
def setUp(self):
    """Prepare the shared KSL fixture, a mock log, and fast scorer settings."""
    WithTempDir.setUp(self)
    self.lex = LexStat(test_data('KSL.qlc'))
    self.log = get_log()
    # Keep the permutation runs small so the test suite stays fast.
    self.get_scorer_kw = {'runs': 10, 'rands': 10, 'limit': 100}
class TestLexStat(WithTempDir):
    """End-to-end tests for the LexStat class: construction, scoring,
    clustering, distances, frequencies, and output."""

    def setUp(self):
        WithTempDir.setUp(self)
        self.lex = LexStat(test_data('KSL.qlc'))
        self.log = get_log()
        # Keep the permutation runs small so the test suite stays fast.
        self.get_scorer_kw = dict(runs=10, rands=10, limit=100)

    def test_init(self):
        LexStat({0: ['ID', 'doculect', 'concept', 'IPA']}, model='sca')
        ls = LexStat({0: ['ID', 'doculect', 'concept', 'IPA']})
        self.assertIn('lexstat', repr(ls))
        # Construction from another LexStat and from a tokens column.
        LexStat(ls)
        LexStat({0: ['ID', 'doculect', 'concept', 'tokens']})
        # Missing both 'ipa' and 'tokens' must be rejected.
        self.assertRaises(
            AssertionError, LexStat, {0: ['ID', 'doculect', 'concept']})
        LexStat(test_data('phybo.qlc'), check=True)
        with patch('lingpy.compare.lexstat.log', self.log):
            LexStat(test_data('KSL.qlc'), check=True)
            assert self.log.info.called
        error_log = self.tmp_path('errors')
        with patch('lingpy.util.confirm', Mock(return_value=True)):
            # Two of the three rows carry malformed segmentations and must
            # end up in the error log.
            lex = LexStat({
                0: ['ID', 'doculect', 'concept', 'IPA', 'tokens'],
                1: ['1', 'deu', 'hand', 'hand', ['']],
                2: ['2', 'eng', 'hand', 'hand', ['abc']],
                3: ['3', 'xyz', 'hand', 'hund', 'h u n d'],
            }, check=True, errors='%s' % error_log)
            assert error_log.exists()
            self.assertEqual(len(lex._meta['errors']), 2)

    def test_init2(self):
        # Sound-class character frequencies for one doculect.
        freqs = self.lex.freqs['Hawaiian']
        for char, n in {'5.W.C': 19, '5.I.V': 87,
                        '5.Y.V': 75, '5.U.V': 87}.items():
            self.assertEqual(freqs[char], n)
        self.assertEqual(len(self.lex.chars), 187)
        self.assertEqual(len(self.lex.rchars), 35)
        self.maxDiff = None
        # Compare derived matrices and pairs against stored reference data.
        for name in 'bscorer rscorer pairs'.split():
            obj = jsonload(test_data('KSL.%s.json' % name))
            if name != 'pairs':
                self.assertEqual(getattr(self.lex, name).matrix, obj)
            else:
                for key, values in self.lex.pairs.items():
                    values = set(values)
                    ovalues = set(tuple(v) for v in obj['---'.join(key)])
                    if 'TRAVIS' not in os.environ and sys.version_info < (3, 5):
                        # For some reason this assertion fails when run on
                        # travis-ci with python 3.3
                        self.assertEqual(values, ovalues)

    def test_getitem(self):
        # Unknown keys yield None rather than raising.
        self.assertIsNone(self.lex['xyz'])

    def test_get_scorer(self):
        self.lex.get_scorer(**self.get_scorer_kw)
        assert hasattr(self.lex, "cscorer")
        with patch('lingpy.compare.lexstat.log', self.log):
            # A second run over an existing scorer must emit a warning.
            self.lex.get_scorer(**self.get_scorer_kw)
            assert self.log.warn.called
        del self.lex.cscorer
        self.lex.get_scorer(**self.get_scorer_kw)
        self.lex.get_scorer(method='markov', **self.get_scorer_kw)

    def test_cluster(self):
        self.lex.get_scorer(**self.get_scorer_kw)
        self.lex.cluster(method="lexstat", threshold=0.7)
        self.lex.cluster(method="edit-dist", threshold=0.7)
        self.lex.cluster(method="turchin", threshold=0.7)
        self.assertRaises(ValueError, self.lex.cluster, method="fuzzy")
        with patch('lingpy.basic.parser.input', Mock(return_value='y')):
            self.lex.cluster(method="sca", guess_threshold=True, gt_mode='nulld')
        # Each clustering run must add its own id column to the header.
        assert 'scaid' in self.lex.header \
            and 'lexstatid' in self.lex.header \
            and 'editid' in self.lex.header \
            and 'turchinid' in self.lex.header

    def test_align_pairs(self):
        self.lex.align_pairs('English', 'German', method='sca')

    def test_get_subset(self):
        self.lex.get_subset([])
        self.assertEqual([v for v in self.lex.subsets.values() if v], [])
        pairs = jsonload(test_data('KSL.pairs.json'))
        self.assertEqual(
            sorted('---'.join(k) for k in self.lex.subsets.keys()),
            sorted(pairs.keys()))

    def test_get_distances(self):
        self.lex.get_scorer(**self.get_scorer_kw)
        self.lex.get_random_distances()
        self.lex.get_distances()
        self.lex.get_distances(method='turchin')
        self.lex.get_distances(aggregate=False)

    def test_get_frequencies(self):
        f = self.lex.get_frequencies('sounds')
        assert len(f) == self.lex.width
        f = self.lex.get_frequencies('sounds', aggregated=True)
        tokens = []
        for k in self.lex:
            for t in self.lex[k, 'tokens']:
                tokens += [t]
        # Aggregated sound frequencies cover every distinct token.
        assert len(f) == len(set(tokens))
        d = self.lex.get_frequencies('diversity', ref='cogid')
        assert isinstance(d, float)
        w = self.lex.get_frequencies('wordlength')
        assert len(w) == self.lex.width
        w = self.lex.get_frequencies('wordlength', aggregated=True)
        assert isinstance(w, float)

    def test_output(self):
        self.lex.output('csv', filename='%s' % self.tmp_path('test_lexstat'))
        self.lex.output('scorer', filename='%s' % self.tmp_path('test_lexstat'))

    def test_correctness(self):
        lex = LexStat({
            0: ['ID', 'doculect', 'concept', 'IPA'],
            1: ['1', 'deu', 'hand', 'hand'],
            2: ['2', 'eng', 'hand', 'hand'],
            3: ['3', 'xyz', 'hand', 'xyz']})
        lex.get_scorer(**self.get_scorer_kw)
        lex.cluster(ref='cogid')
        # Identical forms cluster together; the unrelated form stays apart.
        self.assertEqual(lex.get_entries('cogid'), [[1, 1, 3]])
        lex = LexStat({
            0: ['ID', 'concept', 'ipa', 'doculect'],
            1: ['5424', 'Abend::N', 'swar', 'FRA'],
            2: ['5425', 'Abend::N', 'sware', 'FRA'],
            3: ['5426', 'Abend::N', 'sear3', 'RON'],
            4: ['5427', 'Abend::N', 'ivniN', 'ENG'],
            5: ['5428', 'Abend::N', 'noyt3', 'POR'],
            6: ['5429', 'Abend::N', 'tardi5a', 'POR'],
            7: ['5430', 'Abend::N', 'afd3n', 'DAN'],
        })
        lex.get_scorer()
        lex.cluster(method='lexstat', threshold=0.8, ref='cogid')
        self.assertEqual(
            lex.get_entries('cogid'), [[1, 2, 3, 4, 5], [0, 0, 3, 3, 0]])
def make(*args, **kw):
    """Build a LexStat instance, routing errors into the temp dir by default."""
    if 'errors' not in kw:
        kw['errors'] = str(tmppath / 'errors.log')
    return LexStat(*args, **kw)
def setUp(self):
    """Create a fresh LexStat instance from the KSL test data for each test."""
    ksl_path = test_data('KSL.qlc')
    self.lex = LexStat(ksl_path)
class TestLexStat(unittest.TestCase):
    """Smoke tests for LexStat scoring, clustering, and pairwise alignment."""

    def setUp(self):
        self.lex = LexStat(test_data('KSL.qlc'))

    def test_get_scorer(self):
        self.lex.get_scorer()
        # Assert truthiness directly instead of comparing to True.
        assert hasattr(self.lex, "cscorer")

    def test_cluster(self):
        self.lex.get_scorer()
        self.lex.cluster(method="lexstat", threshold=0.7)
        self.lex.cluster(method="edit-dist", threshold=0.7)
        self.lex.cluster(method="turchin", threshold=0.7)
        # Each clustering run must add its own id column to the header;
        # separate asserts pinpoint which column is missing on failure.
        assert 'scaid' in self.lex.header
        assert 'lexstatid' in self.lex.header
        assert 'editid' in self.lex.header
        assert 'turchinid' in self.lex.header

    def test_align_pairs(self):
        self.lex.align_pairs('English', 'German', method='sca')
def _make_one(self, *args, **kw):
    """Construct a LexStat, defaulting the error log into the temp dir."""
    default_errors = self.tmp_path('errors.log').as_posix()
    kw.setdefault('errors', default_errors)
    return LexStat(*args, **kw)
def main():
    """Score iterative cognate-clustering runs with B-cubed measures and plot
    precision/recall trajectories and F-scores per data-set partition.

    Reads ``../{name}_updating_{i}_{n}.csv`` wordlists (with pre-computed
    'cogid' gold and 'newcogid' predicted columns), collects one record per
    (partition, iteration), and saves the two-panel figure to ``bbb.png``.
    """
    records = []
    f_names = [
        "SLV", "GER", "ROM", "OUG", "KSL",
        "BAI", "JAP", "PIE", "IEL", "PAN",
    ]
    for name in f_names:
        print(name)
        for n in range(1, 7):
            for i in [50]:
                lex = LexStat("../{}_updating_{}_{}.csv".format(name, i, n))
                # modify_ref folds negative (singleton) ids onto positives.
                precision, recall, f_score = bcubes(
                    lex,
                    "cogid",
                    "newcogid",
                    pprint=False,
                    modify_ref=lambda x: abs(int(x)),
                )
                records.append([name, n, precision, recall, f_score])
        print()

    f, axes = plt.subplots(1, 2, figsize=(20, 8))
    # Build the frame once and reuse it for both panels.
    df = pd.DataFrame.from_records(
        records,
        columns=["Partition", "Iteration", "Precision", "Recall", "F-score"])
    sns.lineplot(x="Recall", y="Precision", hue="Partition", data=df,
                 marker="o", ax=axes[0])
    # Label the first and last iteration on each partition's trajectory.
    for _, i, precision, recall, _ in records:
        if i == 1 or i == 6:
            axes[0].annotate(str(i), (recall, precision))
    sns.lineplot(x="Iteration", y="F-score", hue="Partition", data=df,
                 ax=axes[1])
    # Keep a single shared legend outside the right panel.
    axes[0].get_legend().remove()
    lgd = plt.legend(bbox_to_anchor=(1.02, 1.00), loc="upper left")
    plt.savefig("bbb.png", bbox_extra_artists=[lgd], bbox_inches="tight")
class TestLexStat(unittest.TestCase):
    """Smoke tests for LexStat scoring, clustering, and pairwise alignment."""

    def setUp(self):
        self.lex = LexStat(test_data('KSL.qlc'))

    def test_get_scorer(self):
        self.lex.get_scorer()
        # Assert truthiness directly instead of comparing to True.
        assert hasattr(self.lex, "cscorer")

    def test_cluster(self):
        self.lex.get_scorer()
        self.lex.cluster(method="lexstat", threshold=0.7)
        self.lex.cluster(method="edit-dist", threshold=0.7)
        self.lex.cluster(method="turchin", threshold=0.7)
        # Each clustering run must add its own id column to the header;
        # separate asserts pinpoint which column is missing on failure.
        assert 'scaid' in self.lex.header
        assert 'lexstatid' in self.lex.header
        assert 'editid' in self.lex.header
        assert 'turchinid' in self.lex.header

    def test_align_pairs(self):
        self.lex.align_pairs('English', 'German', method='sca')