Example #1
import random

import pandas as pd
import lingpy as lp
from numpy import array, concatenate, mean, vstack
from sklearn.metrics import adjusted_rand_score
from lingpy.algorithm.extra import infomap_clustering

# `test` (the list of input file names) and `bcubed` (the B-cubed scoring
# helper) are assumed to be defined elsewhere in the original script.


def lexStatIM_test(th=.57):
    random.seed(12345)
    scores = []
    lp.rc(schema='ipa')
    for fn in test:
        db = fn.split('.')[0]
        print(db)
        lex = lp.LexStat('reformattedData/ipa/'+fn,
                         check=False)
        lex.get_scorer(preprocessing=False, runs=10000)
        lex.cluster(method='lexstat',threshold=th,
                    external_function=lambda x, y: infomap_clustering(y, x, revert=True),
                    ref="lexstat_infomap")
        taxa = array(lex.cols)
        partition = vstack([array([concatenate(lex.get_dict(col=l,entry=entry).values())
                                   for entry in ['concept','doculect','ipa',
                                                 'cogid',
                                                 'lexstat_infomap']]).T for l in taxa])
        partition = pd.DataFrame(partition,columns=['concept','doculect','counterpart',
                                                    'cogid','lpID'])
        partition.to_csv('lexstatCC_IM/'+db+'_lsCC.csv',encoding='utf-8',index=False)
        concepts = partition.concept.unique()
        scoreList = []
        for c in concepts:
            cPartition = partition[partition.concept==c]
            ari = adjusted_rand_score(cPartition.cogid,cPartition.lpID)
            scoreList.append(ari)
        dbAri = mean(scoreList)
        bc = bcubed(array([':'.join(x)
                           for x in partition[['concept','cogid']].values]),
                    array([':'.join(x)
                           for x in partition[['concept','lpID']].values]))
        scores.append((db,dbAri,bc))
        print(scores[-1])
    return scores
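A minimal usage sketch for the function above, assuming `test` and `bcubed` are defined as noted in the imports; the file names here are hypothetical:

test = ['abvd.csv', 'ielex.csv']  # hypothetical files under reformattedData/ipa/
for db, ari, bc in lexStatIM_test(th=0.57):
    print(db, ari, bc)  # per-database mean ARI and B-cubed score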
Example #2
def _cldf2lexstat(
        dataset,
        segments='segments',
        transcription='value',
        row='parameter_id',
        col='language_id'):
    """Read LexStat object from cldf dataset."""
    D = _cldf2wld(dataset)
    return lingpy.LexStat(D, segments=segments, transcription=transcription, row=row, col=col)
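A hedged usage sketch, assuming a pycldf dataset with standard CLDF column names and the companion `_cldf2wld` converter referenced above (the metadata path is hypothetical):

from pycldf import Dataset

ds = Dataset.from_metadata('cldf/cldf-metadata.json')  # hypothetical path
lex = _cldf2lexstat(ds)
lex.cluster(method='sca', threshold=0.45, ref='cogid')  # SCA clustering needs no scorer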
Example #3
def test_partial_alignments_with_lexstat():
    lex = lp.LexStat(test_data('test-partial-alignments.tsv'),
                     segments='tokens')
    alms = lp.Alignments(test_data('test-partial-alignments.tsv'),
                         fuzzy=True,
                         ref='cogids',
                         sonar=True,
                         segments='tokens')
    alms.align(scorer=lex.bscorer)
    assert '-' in alms.msa['cogids'][12]['alignment'][-1]
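The same partial-cognate flow on an arbitrary wordlist, as a hedged sketch (the file name is hypothetical; `fuzzy=True` makes Alignments read the `cogids` column as a list of partial cognate IDs per form):

lex = lp.LexStat('partial.tsv', segments='tokens')  # hypothetical file
alms = lp.Alignments('partial.tsv', fuzzy=True, ref='cogids',
                     segments='tokens')
alms.align(scorer=lex.bscorer)  # reuse LexStat's sound-class scorer
alms.output('tsv', filename='partial-aligned', ignore='all')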
Example #4
    def get_wordlist(
            self,
            doculect='base',
            profile=False,
            ref='crossid',
            lexstat=True,
            threshold=0.4):
        """
        Return a classical wordlist from the data.
        """
        if profile:
            profile = segments.Tokenizer(profile)
            tokenize = lambda x: profile('^' + x + '$', column='IPA').split()  # noqa: E731
        else:
            tokenize = lingpy.ipa2tokens

        D = {
            0: [
                'doculect',
                'concept',
                'concept_in_source',
                'concept_type',
                'form',
                'tokens',
                'occurrences',
                'word_forms',
                'gloss_forms',
                'phrase_example',
                'gloss_example',
                'references',
            ]
        }
        idx = 1
        for ctype in ['lexicon', 'grammar']:
            concepts = self.get_concepts(ctype=ctype)
            concordance = self._concordances[ctype]
            for concept, entries in concepts.items():
                for form, lid, cis, freq in entries:
                    # retrieve the concordance
                    pidx, sA, sB = concordance[form, concept, cis, lid][0]
                    txt = self[pidx].phrase
                    gls = self[pidx].gloss
                    word, fgls = self[pidx, sA]
                    tokens = tokenize(form)
                    references = ' '.join(
                        ['{0}:{1}:{2}'.format(a, b, c)
                         for a, b, c in concordance[form, concept, cis, lid]])
                    # check tokens
                    try:
                        lingpy.tokens2class(tokens, 'sca')
                        check = True
                    except:  # noqa: E722, # pragma: no cover
                        check = False
                    if concept.strip() and check:
                        D[idx] = [
                            doculect if self.monolingual else lid,
                            concept,
                            cis,
                            ctype,
                            form,
                            tokens,
                            freq,
                            word,
                            fgls,
                            txt,
                            gls,
                            references]
                        idx += 1
                    else:
                        print('[!] Problem with "{0}" / [{1}] [{2}] / {3} {4} {5}'.format(
                            concept,
                            form,
                            tokens,
                            pidx,
                            sA,
                            sB,
                        ))
        wl = lingpy.Wordlist(D)

        if lexstat:
            wl = lingpy.LexStat(D)
            wl.cluster(method='sca', threshold=threshold, ref=ref)
        else:
            wl.add_entries('cog', 'concept,form', lambda x, y: x[y[0]] + '-' + x[y[1]])
            wl.renumber('cog', ref)
        return wl
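A hedged sketch of calling the method above; `corpus` stands for a hypothetical instance of the surrounding class, and the profile path is made up:

wl = corpus.get_wordlist(profile='etc/orthography.tsv', threshold=0.45)
wl.output('tsv', filename='wordlist')  # clustered cognates land in the 'crossid' column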
Example #5
    def setUp(self):

        self.lex = lp.LexStat(os.path.join("data", "kessler.qlc"))

    parser.add_argument("--lingpy",
                        action="store_true",
                        default=False,
                        help="The data is in LingPy's format, not CLDF.")
    parser.add_argument("--verbose",
                        action="store_true",
                        default=False,
                        help="Output the forms which do not match.")
    parser.add_argument("--ssv",
                        default=False,
                        action="store_true",
                        help="Output one line, not many")
    args = parser.parse_args()

    if args.lingpy:
        import lingpy
        dataset = lingpy.LexStat(str(args.codings))
        forms = {
            row: {
                e: dataset[row][dataset.header[e]]
                for e in dataset.entries if e in dataset.header
            }
            for row in dataset
        }
        codings = {form: row["partial_ids"] for form, row in forms.items()}
        c_id = "reference"
        c_lect = "doculect"
        c_concept = "concept"
        c_segm = "tokens"
    else:
        dataset = get_dataset(args.codings)
        cognatesets = cognate_sets(dataset)
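For reference, a hedged sketch of the row-access pattern used in the LingPy branch above (row IDs are integers and `header` maps column names to positions; the file name is hypothetical):

dataset = lingpy.LexStat('codings.tsv')  # hypothetical file
for row in dataset:  # iterating a LexStat yields integer row IDs
    print(dataset[row][dataset.header['concept']])
    break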
Example #7
data['ASJP1'] = [toASJP(w) for w in data.TOKENS.values]

new_data = {}  # the data formatted as LexStat wants it
new_data[0] = ['doculect', 'concept', 'ipa', 'tokens', 'index']  # header
key = 1
for i in data.index:
    nl = list(data.loc[i, ['DOCULECT', 'CONCEPT', 'FORM']])
    nl.append(data.loc[i, 'TOKENS'].split())
    nl.append(i)
    new_data[key] = nl
    key += 1

wl = lp.Wordlist(new_data)

lex = lp.LexStat(wl)
lex.get_scorer(runs=10000, preprocessing=False)


def get_pairs(lang1, lang2, lex):
    """
    Returns all the lang1-lang2 pairs of words with the same Concepticon ID.
    Returns [] of LexStat ID tuples.
    
    Note that LexStat.pairs cannot be used here because the latter flattens
    transcription duplicates.
    """
    pairs = []
    for gloss1, indices1 in lex.get_dict(col=lang1).items():
        for gloss2, indices2 in lex.get_dict(col=lang2).items():
            if gloss1 == gloss2:
                # pair every lang1 index with every lang2 index
                for i in indices1:
                    for j in indices2:
                        pairs.append((i, j))
    return pairs
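A hedged usage sketch for get_pairs (the doculect names are hypothetical):

pairs = get_pairs('English', 'German', lex)
for idA, idB in pairs[:5]:
    print(lex[idA, 'ipa'], lex[idB, 'ipa'])  # the paired transcriptions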

    # reconstructed opening: args.codings is treated as a Path below
    parser.add_argument("codings", type=Path,
                        help="A CLDF dataset with cognate codes")
    parser.add_argument("--gold-lingpy", action="store_true",
                        default=False,
                        help="The ground-truth data is in LingPy's format, not CLDF.")
#     parser.add_argument("--lingpy", action="store_true",
#                         default=False,
#                         help="The data is in LingPy's format, not CLDF.")
    parser.add_argument("--ssv", default=False,
                        action="store_true",
                        help="Output one line, not many")
    args = parser.parse_args()

    if args.codings.suffix == '.tsv':
        # Assume LingPy
        import lingpy
        dataset = lingpy.LexStat(str(args.codings))
        forms = {
            row: {
                e: dataset[row][dataset.header[e]]
                for e in dataset.entries if e in dataset.header
            }
            for row in dataset
        }
        codings = {
            str(form): str(row["cogid"])
            for form, row in forms.items()}

        def iterate_concept_and_id():
            for i in dataset:
                yield dataset[i][dataset.header['concept']], str(i)
    else:
        dataset = get_dataset(args.codings)
        cognatesets = cognate_sets(dataset)