Example #1
class Tests(WithTempDir):
    def setUp(self):
        WithTempDir.setUp(self)
        self.lex = LexStat(test_data('KSL.qlc'))
        self.part = Partial(test_data('partial_cognates.tsv'),
                            segments='segments')
        self.part.add_entries('pid1', 'partial_cognate_sets', lambda x: x)
        self.part.add_entries('pid2', 'partialids2',
                              lambda x: [int(y) for y in x.split(' ')])

    def test_bcubes(self):
        res = bcubes(self.lex, test='cogid', pprint=False)
        self.assertAlmostEquals(res, (1.0, 1.0, 1.0))

        _ = bcubes(self.lex, 'cogid', 'cogid', pprint=True, per_concept=True)

    def test_partial_bcubes(self):
        res = partial_bcubes(self.part, 'pid1', 'pid2', pprint=False)
        assert [round(x, 2) for x in res] == [0.92, 0.98, 0.95]

        _ = partial_bcubes(self.part, 'pid1', 'pid2', pprint=True)

    def test_pairs(self):
        res = pairs(self.lex, test='cogid', pprint=False)
        self.assertAlmostEquals(res, (1.0, 1.0, 1.0))

    def test_diff(self):
        res = diff(self.lex, test='cogid', tofile=False, pprint=False)
        self.assertAlmostEquals(res, ((1.0, 1.0, 1.0), (1.0, 1.0, 1.0)))
        self.lex.add_entries('cugid', 'cogid',
                             lambda x: x + 1 if x % 2 else x * x)

        _ = diff(self.lex, gold='cogid', test='cogid',
                 filename='%s' % self.tmp_path('test_acd'), pprint=False)
        d2 = diff(self.lex, gold='cugid', test='cogid',
                  filename='%s' % self.tmp_path('test_acd'),
                  pprint=False, tofile=False)
        _ = diff(self.lex, gold='cugid', test='cogid',
                 filename='%s' % self.tmp_path('test_acd'),
                 pprint=False, tofile=True)

        assert d2[0] != 1

    def test_random_cognates(self):
        random_cognates(self.lex, ref='randomid')
        assert 'randomid' in self.lex.header

    def test_extreme_cognates(self):
        extreme_cognates(self.lex, ref="lumperid", bias='lumper')
        assert self.lex[1, 'lumperid'] == self.lex[2, 'lumperid']
        extreme_cognates(self.lex, ref='splitterid', bias='splitter')
        assert self.lex[1, 'splitterid'] != self.lex[2, 'splitterid']
        assert_raises(ValueError, extreme_cognates, self.lex, bias='')
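The test class above checks the B-cubed and pairwise evaluation functions against a gold 'cogid' column. Outside the test harness, a minimal standalone scoring run might look like the sketch below; the wordlist path, the output filename 'ksl_diff' and the scorer settings are placeholders, and 'lexstatid' is the column that lexstat clustering fills (as the TestLexStat example further down checks).

from lingpy import LexStat
from lingpy.evaluate.acd import bcubes, diff

# Load a wordlist that already carries a gold-standard 'cogid' column
# (KSL.qlc is the dataset used throughout the tests on this page).
lex = LexStat('KSL.qlc')

# Detect cognates automatically; the result goes into 'lexstatid'.
lex.get_scorer(runs=100)
lex.cluster(method='lexstat', threshold=0.6, ref='lexstatid')

# bcubes returns (precision, recall, F-score) for the test column
# measured against the gold column.
precision, recall, fscore = bcubes(lex, 'cogid', 'lexstatid', pprint=False)
print(precision, recall, fscore)

# diff additionally writes a detailed comparison file based on `filename`
# when tofile=True.
diff(lex, gold='cogid', test='lexstatid', filename='ksl_diff',
     pprint=False, tofile=True)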
Example #2
def setUp(self):
    WithTempDir.setUp(self)
    self.lex = LexStat(test_data('KSL.qlc'))
    self.part = Partial(test_data('partial_cognates.tsv'),
                        segments='segments')
    self.part.add_entries('pid1', 'partial_cognate_sets', lambda x: x)
    self.part.add_entries('pid2', 'partialids2',
                          lambda x: [int(y) for y in x.split(' ')])
Example #3
class Tests(WithTempDir):
    def setUp(self):
        WithTempDir.setUp(self)
        self.lex = LexStat(test_data('KSL.qlc'))
        self.part = Partial(test_data('partial_cognates.tsv'),
                            segments='segments')
        self.part.add_entries('pid1', 'partial_cognate_sets', lambda x: x)
        self.part.add_entries('pid2', 'partialids2',
                              lambda x: [int(y) for y in x.split(' ')])

    def test_bcubes(self):
        from lingpy.evaluate.acd import bcubes

        res = bcubes(self.lex, test='cogid', pprint=False)
        self.assertAlmostEquals(res, (1.0, 1.0, 1.0))

        res = bcubes(self.lex, 'cogid', 'cogid', pprint=True,
                     per_concept=True)

    def test_partial_bcubes(self):
        from lingpy.evaluate.acd import partial_bcubes

        res = partial_bcubes(self.part, 'pid1', 'pid2', pprint=False)
        assert [round(x, 2) for x in res] == [0.92, 0.98, 0.95]

        res = partial_bcubes(self.part, 'pid1', 'pid2', pprint=True)

    def test_pairs(self):
        from lingpy.evaluate.acd import pairs

        res = pairs(self.lex, test='cogid', pprint=False)
        self.assertAlmostEquals(res, (1.0, 1.0, 1.0))

    def test_diff(self):
        from lingpy.evaluate.acd import diff

        res = diff(self.lex, test='cogid', tofile=False, pprint=False)
        self.assertAlmostEquals(res, ((1.0, 1.0, 1.0), (1.0, 1.0, 1.0)))
        self.lex.add_entries('cugid', 'cogid',
                             lambda x: x + 1 if x % 2 else x * x)
        d1 = diff(self.lex, gold='cogid', test='cogid',
                  filename='%s' % self.tmp_path('test_acd'), pprint=False)
        d2 = diff(self.lex, gold='cugid', test='cogid',
                  filename='%s' % self.tmp_path('test_acd'),
                  pprint=False, tofile=False)
        d3 = diff(self.lex, gold='cugid', test='cogid',
                  filename='%s' % self.tmp_path('test_acd'),
                  pprint=False, tofile=True)
        assert d2[0] != 1
Example #4
File: LextStat.py Project: marlonbetz/BA
import codecs

from lingpy import LexStat

# loadAnnotatedWordList, getRidOfValidationSet and get_wordlist are helpers
# defined or imported elsewhere in this project.


def writeToFile():
    print("LOAD TEST WORDLIST")
    pathToAnnotatedWordList = "Data/IELex/output/IELex-2016.tsv"

    print("LOAD WORDLIST")
    #pathToAnnotatedWordList = "Data/mattis_new/output/ObUgrian-110-21.tsv.asjp"
    languages, words, global_ids, cognate_classes = loadAnnotatedWordList(
        pathToAnnotatedWordList)
    print(len(set(global_ids)))
    stoplist = {221, 646, 1333, 1224, 778, 1402, 1411, 1232, 1203, 1292}

    languages, words, global_ids, cognate_classes = getRidOfValidationSet(
        languages, words, global_ids, cognate_classes, stoplist)
    print(len(set(global_ids)))
    with codecs.open("lexstat_wordlist.txt", "w", encoding="UTF-8") as f:
        f.write("CONCEPT\tIPA\tDOCULECT\tCOGID\n")
        for l, w, gi, cog in zip(languages, words, global_ids,
                                 cognate_classes):
            f.write(str(gi) + "\t" + w + "\t" + l + "\t" + cog + "\n")
    wl = get_wordlist("lexstat_wordlist.txt", delimiter="\t")
    print(wl.get_dict(concept="730", entry="IPA"))
    print("initializing lexstat")
    lex = LexStat(wl)
    print("getting scorer")
    lex.get_scorer()
    print("clustering")
    lex.cluster(method="lexstat", threshold=0.6, ref="cognate_class_pred")
    print("output")
    lex.output('tsv', filename="lexstat_ielex")

    from lingpy.evaluate.acd import bcubes, diff
    bcubes(lex, "cognate_class", "COGID")
    print(bcubes(lex, "cognate_class", "cognate_class_pred"))
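The round trip through lexstat_wordlist.txt above only exists to get the data into the CONCEPT/IPA/DOCULECT/COGID shape that LexStat expects. As the test examples further down show, the same structure can also be passed directly as an in-memory dict (row 0 holds the column names, the remaining keys are row IDs); the toy entries below are illustrative only, not data from this project.

from lingpy import LexStat

# Header row plus one entry per word; 'cogid' is a toy gold cognate class.
lex = LexStat({
    0: ['ID', 'doculect', 'concept', 'IPA', 'cogid'],
    1: ['1', 'deu', 'hand', 'hant', '1'],
    2: ['2', 'eng', 'hand', 'hænd', '1'],
    3: ['3', 'ita', 'hand', 'mano', '2'],
})

# From here the pipeline is the same as in writeToFile():
# lex.get_scorer(), lex.cluster(...), then bcubes(...) for evaluation.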
Example #5
    def test_correctness(self):
        lex = LexStat({
            0: ['ID', 'doculect', 'concept', 'IPA'],
            1: ['1', 'deu', 'hand', 'hand'],
            2: ['2', 'eng', 'hand', 'hand'],
            3: ['3', 'xyz', 'hand', 'xyz']})
        lex.get_scorer(**self.get_scorer_kw)
        lex.cluster(ref='cogid')
        self.assertEquals(lex.get_entries('cogid'), [[1, 1, 3]])

        lex = LexStat({
            0: ['ID', 'concept', 'ipa', 'doculect'],
            1: ['5424', 'Abend::N', 'swar', 'FRA'],
            2: ['5425', 'Abend::N', 'sware', 'FRA'],
            3: ['5426', 'Abend::N', 'sear3', 'RON'],
            4: ['5427', 'Abend::N', 'ivniN', 'ENG'],
            5: ['5428', 'Abend::N', 'noyt3', 'POR'],
            6: ['5429', 'Abend::N', 'tardi5a', 'POR'],
            7: ['5430', 'Abend::N', 'afd3n', 'DAN'],
        })
        lex.get_scorer()
        lex.cluster(method='lexstat', threshold=0.8, ref='cogid')
        self.assertEquals(lex.get_entries('cogid'), [[1, 2, 3, 4, 5], [0, 0, 3, 3, 0]])
Example #6
def setUp(self):
    WithTempDir.setUp(self)
    self.lex = LexStat(test_data('KSL.qlc'))
    self.log = get_log()
    self.get_scorer_kw = dict(runs=10, rands=10, limit=100)
Example #7
class TestLexStat(WithTempDir):
    def setUp(self):
        WithTempDir.setUp(self)
        self.lex = LexStat(test_data('KSL.qlc'))
        self.log = get_log()
        self.get_scorer_kw = dict(runs=10, rands=10, limit=100)

    def test_init(self):
        LexStat({0: ['ID', 'doculect', 'concept', 'IPA']}, model='sca')
        ls = LexStat({0: ['ID', 'doculect', 'concept', 'IPA']})
        self.assertIn('lexstat', repr(ls))
        LexStat(ls)
        LexStat({0: ['ID', 'doculect', 'concept', 'tokens']})
        self.assertRaises(AssertionError, LexStat, {0: ['ID', 'doculect', 'concept']})
        LexStat(test_data('phybo.qlc'), check=True)
        with patch('lingpy.compare.lexstat.log', self.log):
            LexStat(test_data('KSL.qlc'), check=True)
            assert self.log.info.called
        error_log = self.tmp_path('errors')
        with patch('lingpy.util.confirm', Mock(return_value=True)):
            lex = LexStat({
                0: ['ID', 'doculect', 'concept', 'IPA', 'tokens'],
                1: ['1', 'deu', 'hand', 'hand', ['']],
                2: ['2', 'eng', 'hand', 'hand', ['abc']],
                3: ['3', 'xyz', 'hand', 'hund', 'h u n d'],
            }, check=True, errors='%s' % error_log)
            assert error_log.exists()
            self.assertEquals(len(lex._meta['errors']), 2)

    def test_init2(self):
        freqs = self.lex.freqs['Hawaiian']
        for char, n in {'5.W.C': 19, '5.I.V': 87, '5.Y.V': 75, '5.U.V': 87}.items():
            self.assertEquals(freqs[char], n)
        self.assertEquals(len(self.lex.chars), 187)
        self.assertEquals(len(self.lex.rchars), 35)

        self.maxDiff = None

        for name in 'bscorer rscorer pairs'.split():
            obj = jsonload(test_data('KSL.%s.json' % name))
            if name != 'pairs':
                self.assertEquals(getattr(self.lex, name).matrix, obj)
            else:
                for key, values in self.lex.pairs.items():
                    values = set(values)
                    ovalues = set(tuple(v) for v in obj['---'.join(key)])
                    if 'TRAVIS' not in os.environ and sys.version_info < (3, 5):
                        # For some reason this assertion fails when run on travis-ci with
                        # python 3.3
                        self.assertEquals(values, ovalues)

    def test_getitem(self):
        self.assertIsNone(self.lex['xyz'])

    def test_get_scorer(self):
        self.lex.get_scorer(**self.get_scorer_kw)
        assert hasattr(self.lex, "cscorer")
        with patch('lingpy.compare.lexstat.log', self.log):
            self.lex.get_scorer(**self.get_scorer_kw)
            assert self.log.warn.called
        del self.lex.cscorer
        self.lex.get_scorer(**self.get_scorer_kw)
        self.lex.get_scorer(method='markov', **self.get_scorer_kw)

    def test_cluster(self):
        self.lex.get_scorer(**self.get_scorer_kw)
        self.lex.cluster(method="lexstat", threshold=0.7)
        self.lex.cluster(method="edit-dist", threshold=0.7)
        self.lex.cluster(method="turchin", threshold=0.7)
        self.assertRaises(ValueError, self.lex.cluster, method="fuzzy")
        with patch('lingpy.basic.parser.input', Mock(return_value='y')):
            self.lex.cluster(method="sca", guess_threshold=True, gt_mode='nulld')

        assert 'scaid' in self.lex.header \
            and 'lexstatid' in self.lex.header \
            and 'editid' in self.lex.header \
            and 'turchinid' in self.lex.header

    def test_align_pairs(self):
        self.lex.align_pairs('English', 'German', method='sca')

    def test_get_subset(self):
        self.lex.get_subset([])
        self.assertEquals([v for v in self.lex.subsets.values() if v], [])
        pairs = jsonload(test_data('KSL.pairs.json'))
        self.assertEquals(
            sorted('---'.join(k) for k in self.lex.subsets.keys()),
            sorted(pairs.keys()))

    def test_get_distances(self):
        self.lex.get_scorer(**self.get_scorer_kw)
        self.lex.get_random_distances()
        self.lex.get_distances()
        self.lex.get_distances(method='turchin')
        self.lex.get_distances(aggregate=False)

    def test_get_frequencies(self):
        f = self.lex.get_frequencies('sounds')
        assert len(f) == self.lex.width

        f = self.lex.get_frequencies('sounds', aggregated=True)
        tokens = []
        for k in self.lex:
            for t in self.lex[k, 'tokens']:
                tokens += [t]
        assert len(f) == len(set(tokens))

        d = self.lex.get_frequencies('diversity', ref='cogid')
        assert isinstance(d, float)

        w = self.lex.get_frequencies('wordlength')
        assert len(w) == self.lex.width

        w = self.lex.get_frequencies('wordlength', aggregated=True)
        assert isinstance(w, float)

    def test_output(self):
        self.lex.output('csv', filename='%s' % self.tmp_path('test_lexstat'))
        self.lex.output('scorer', filename='%s' % self.tmp_path('test_lexstat'))

    def test_correctness(self):
        lex = LexStat({
            0: ['ID', 'doculect', 'concept', 'IPA'],
            1: ['1', 'deu', 'hand', 'hand'],
            2: ['2', 'eng', 'hand', 'hand'],
            3: ['3', 'xyz', 'hand', 'xyz']})
        lex.get_scorer(**self.get_scorer_kw)
        lex.cluster(ref='cogid')
        self.assertEquals(lex.get_entries('cogid'), [[1, 1, 3]])

        lex = LexStat({
            0: ['ID', 'concept', 'ipa', 'doculect'],
            1: ['5424', 'Abend::N', 'swar', 'FRA'],
            2: ['5425', 'Abend::N', 'sware', 'FRA'],
            3: ['5426', 'Abend::N', 'sear3', 'RON'],
            4: ['5427', 'Abend::N', 'ivniN', 'ENG'],
            5: ['5428', 'Abend::N', 'noyt3', 'POR'],
            6: ['5429', 'Abend::N', 'tardi5a', 'POR'],
            7: ['5430', 'Abend::N', 'afd3n', 'DAN'],
        })
        lex.get_scorer()
        lex.cluster(method='lexstat', threshold=0.8, ref='cogid')
        self.assertEquals(lex.get_entries('cogid'), [[1, 2, 3, 4, 5], [0, 0, 3, 3, 0]])
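Taken together, the methods exercised by TestLexStat above correspond to a session roughly like the following sketch; the input path, output filename and scorer settings are placeholders rather than values prescribed by the tests.

from lingpy import LexStat

lex = LexStat('KSL.qlc')

# Language-specific scorer (the tests shrink this to runs=10, rands=10,
# limit=100 for speed; real analyses usually use larger values).
lex.get_scorer(runs=100)

# Each clustering method adds its own ID column, e.g. 'lexstatid',
# 'editid', 'turchinid'.
lex.cluster(method='lexstat', threshold=0.7)
lex.cluster(method='edit-dist', threshold=0.7)
lex.cluster(method='turchin', threshold=0.7)

# Aggregate distances between doculects, a pairwise alignment,
# and file output.
dists = lex.get_distances()
lex.align_pairs('English', 'German', method='sca')
lex.output('tsv', filename='ksl_lexstat')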
Example #8
def make(*args, **kw):
    kw.setdefault('errors', str(tmppath / 'errors.log'))
    return LexStat(*args, **kw)
Example #9
def setUp(self):
    self.lex = LexStat(test_data('KSL.qlc'))
Example #10
class TestLexStat(unittest.TestCase):

    def setUp(self):
        self.lex = LexStat(test_data('KSL.qlc'))

    def test_get_scorer(self):
        self.lex.get_scorer()
        assert hasattr(self.lex, "cscorer")

    def test_cluster(self):
        self.lex.get_scorer()
        self.lex.cluster(method="lexstat", threshold=0.7)
        self.lex.cluster(method="edit-dist", threshold=0.7)
        self.lex.cluster(method="turchin", threshold=0.7)

        assert ('scaid' in self.lex.header
                and 'lexstatid' in self.lex.header
                and 'editid' in self.lex.header
                and 'turchinid' in self.lex.header)

    def test_align_pairs(self):
        self.lex.align_pairs('English', 'German', method='sca')
Example #11
def _make_one(self, *args, **kw):
    kw.setdefault('errors', self.tmp_path('errors.log').as_posix())
    return LexStat(*args, **kw)
Example #12
class Tests(WithTempDir):
    def setUp(self):
        WithTempDir.setUp(self)
        self.lex = LexStat(test_data('KSL.qlc'))
        self.part = Partial(test_data('partial_cognates.tsv'),
                            segments='segments')
        self.part.add_entries('pid1', 'partial_cognate_sets', lambda x: x)
        self.part.add_entries('pid2', 'partialids2',
                              lambda x: [int(y) for y in x.split(' ')])

    def test_bcubes(self):
        res = bcubes(self.lex, test='cogid', pprint=False)
        self.assertAlmostEquals(res, (1.0, 1.0, 1.0))

        _ = bcubes(self.lex, 'cogid', 'cogid', pprint=True, per_concept=True)

    def test_partial_bcubes(self):
        res = partial_bcubes(self.part, 'pid1', 'pid2', pprint=False)
        assert [round(x, 2) for x in res] == [0.92, 0.98, 0.95]

        _ = partial_bcubes(self.part, 'pid1', 'pid2', pprint=True)

    def test_pairs(self):
        res = pairs(self.lex, test='cogid', pprint=False)
        self.assertAlmostEquals(res, (1.0, 1.0, 1.0))

    def test_diff(self):
        res = diff(self.lex, test='cogid', tofile=False, pprint=False)
        self.assertAlmostEquals(res, ((1.0, 1.0, 1.0), (1.0, 1.0, 1.0)))
        self.lex.add_entries('cugid', 'cogid', lambda x: x + 1
                             if x % 2 else x * x)

        _ = diff(self.lex,
                 gold='cogid',
                 test='cogid',
                 filename='%s' % self.tmp_path('test_acd'),
                 pprint=False)
        d2 = diff(self.lex,
                  gold='cugid',
                  test='cogid',
                  filename='%s' % self.tmp_path('test_acd'),
                  pprint=False,
                  tofile=False)
        _ = diff(self.lex,
                 gold='cugid',
                 test='cogid',
                 filename='%s' % self.tmp_path('test_acd'),
                 pprint=False,
                 tofile=True)

        assert d2[0] != 1

    def test_random_cognates(self):
        random_cognates(self.lex, ref='randomid')
        assert 'randomid' in self.lex.header

    def test_extreme_cognates(self):
        extreme_cognates(self.lex, ref="lumperid", bias='lumper')
        assert self.lex[1, 'lumperid'] == self.lex[2, 'lumperid']
        extreme_cognates(self.lex, ref='splitterid', bias='splitter')
        assert self.lex[1, 'splitterid'] != self.lex[2, 'splitterid']
        assert_raises(ValueError, extreme_cognates, self.lex, bias='')
Example #13
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from lingpy import LexStat
from lingpy.evaluate.acd import bcubes


def main():
    results = []
    """for i in range(20):
        lex = LexStat("../PIE_scored_{}_og.csv".format(i))
        # lex.get_scorer()
        # lex.cluster(method="lexstat", threshold=0.6, ref="cognates")
        print(".", end="", flush=True)
        results.append( (
                i,
                bcubes(
                    lex,
                    "cogid",
                    "newcogid",
                    pprint=False,
                    modify_ref=lambda x: abs(int(x)),
                ),
            )
        )
:
    print()
    print("OG")
    for r in results:
        print(r)
"""
    records = []
    f_names = [
        "SLV", "GER", "ROM", "OUG", "KSL", "BAI", "JAP", "PIE", "IEL", "PAN"
    ]
    for name in f_names:
        print(name)
        for n in range(1, 7):
            results = []
            for i in [50]:
                lex = LexStat("../{}_updating_{}_{}.csv".format(name, i, n))
                # lex.get_scorer()
                # lex.cluster(method="lexstat", threshold=0.6, ref="cognates")

                precision, recall, f_score = bcubes(
                    lex,
                    "cogid",
                    "newcogid",
                    pprint=False,
                    modify_ref=lambda x: abs(int(x)),
                )
                records.append([name, n, precision, recall, f_score])

            for r in results:
                print(r)
        print()

    f, axes = plt.subplots(1, 2, figsize=(20, 8))

    # markers = {i: "${}$".format(i) for i in range(1, 7)}
    df = pd.DataFrame.from_records(
        records,
        columns=["Partition", "Iteration", "Precision", "Recall", "F-score"])
    sns.lineplot(x="Recall",
                 y="Precision",
                 hue="Partition",
                 data=df,
                 marker="o",
                 ax=axes[0])
    # plt.subplots_adjust(right=0.7)
    # plt.legend(bbox_to_anchor=(1.02, 1.02), loc="upper left")

    for _, i, precision, recall, _ in records:
        if i == 1 or i == 6:
            axes[0].annotate(str(i), (recall, precision))

    # markers = {i: "${}$".format(i) for i in range(1, 7)}
    df = pd.DataFrame.from_records(
        records,
        columns=["Partition", "Iteration", "Precision", "Recall", "F-score"])
    sns.lineplot(x="Iteration",
                 y="F-score",
                 hue="Partition",
                 data=df,
                 ax=axes[1])
    # plt.subplots_adjust(right=0.7)
    axes[0].get_legend().remove()
    lgd = plt.legend(bbox_to_anchor=(1.02, 1.00), loc="upper left")

    plt.savefig("bbb.png", bbox_extra_artists=[lgd], bbox_inches="tight")
Example #14
def setUp(self):
    self.lex = LexStat(test_data('KSL.qlc'))
Example #15
class TestLexStat(unittest.TestCase):
    def setUp(self):
        self.lex = LexStat(test_data('KSL.qlc'))

    def test_get_scorer(self):
        self.lex.get_scorer()
        assert hasattr(self.lex, "cscorer")

    def test_cluster(self):
        self.lex.get_scorer()
        self.lex.cluster(method="lexstat", threshold=0.7)
        self.lex.cluster(method="edit-dist", threshold=0.7)
        self.lex.cluster(method="turchin", threshold=0.7)

        assert ('scaid' in self.lex.header
                and 'lexstatid' in self.lex.header
                and 'editid' in self.lex.header
                and 'turchinid' in self.lex.header)

    def test_align_pairs(self):
        self.lex.align_pairs('English', 'German', method='sca')