import random

import lingpy as lp
import pandas as pd
from lingpy.algorithm.extra import infomap_clustering
from numpy import array, concatenate, mean, vstack
from sklearn.metrics import adjusted_rand_score


def lexStatIM_test(th=.57):
    # `test` (the list of dataset file names) and `bcubed` (the B-cubed
    # F-score helper) are assumed to be defined elsewhere in the module.
    random.seed(12345)
    scores = []
    lp.rc(schema='ipa')
    for fn in test:
        db = fn.split('.')[0]
        print(db)
        lex = lp.LexStat('reformattedData/ipa/' + fn, check=False)
        lex.get_scorer(preprocessing=False, runs=10000)
        lex.cluster(method='lexstat', threshold=th,
                    external_function=lambda x, y: infomap_clustering(y, x, revert=True),
                    ref="lexstat_infomap")
        taxa = array(lex.cols)
        # one row per word form: concept, doculect, transcription, gold
        # cognate ID, and the LexStat/Infomap cluster ID
        partition = vstack([
            array([concatenate(list(lex.get_dict(col=l, entry=entry).values()))
                   for entry in ['concept', 'doculect', 'ipa',
                                 'cogid', 'lexstat_infomap']]).T
            for l in taxa])
        partition = pd.DataFrame(
            partition,
            columns=['concept', 'doculect', 'counterpart', 'cogid', 'lpID'])
        partition.to_csv('lexstatCC_IM/' + db + '_lsCC.csv',
                         encoding='utf-8', index=False)
        # adjusted Rand index, averaged over concepts
        concepts = partition.concept.unique()
        scoreList = []
        for c in concepts:
            cPartition = partition[partition.concept == c]
            ari = adjusted_rand_score(cPartition.cogid, cPartition.lpID)
            scoreList.append(ari)
        dbAri = mean(scoreList)
        # B-cubed score over concept-qualified cluster labels
        bc = bcubed(
            array([':'.join(x) for x in partition[['concept', 'cogid']].values]),
            array([':'.join(x) for x in partition[['concept', 'lpID']].values]))
        scores.append((db, dbAri, bc))
        print(scores[-1])
    return scores
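# A minimal, hypothetical driver for lexStatIM_test(): the file names below
# are placeholders, and `bcubed` must already be defined in the module; only
# the directory layout ('reformattedData/ipa/', 'lexstatCC_IM/') is taken
# from the function itself.
if __name__ == '__main__':
    test = ['exampleA.tsv', 'exampleB.tsv']  # placeholder dataset files
    all_scores = lexStatIM_test(th=.57)
    # persist the per-database (ARI, B-cubed) scores next to the partitions
    pd.DataFrame(all_scores, columns=['db', 'ari', 'bcubed']).to_csv(
        'lexstatCC_IM/scores.csv', index=False)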
def _cldf2lexstat(
        dataset,
        segments='segments',
        transcription='value',
        row='parameter_id',
        col='language_id'):
    """Read a LexStat object from a CLDF dataset."""
    D = _cldf2wld(dataset)
    return lingpy.LexStat(
        D, segments=segments, transcription=transcription, row=row, col=col)
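# A usage sketch for _cldf2lexstat(), assuming the companion _cldf2wld()
# helper is defined in the same module; pycldf's Dataset.from_metadata() is
# the standard way to load a CLDF dataset, and the metadata path here is a
# placeholder.
from pycldf import Dataset

ds = Dataset.from_metadata('cldf/cldf-metadata.json')  # placeholder path
lex = _cldf2lexstat(ds)
# the resulting LexStat object supports the usual clustering workflow
lex.cluster(method='sca', threshold=0.45, ref='scaid')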
def test_partial_alignments_with_lexstat():
    lex = lp.LexStat(test_data('test-partial-alignments.tsv'), segments='tokens')
    alms = lp.Alignments(
        test_data('test-partial-alignments.tsv'),
        fuzzy=True,
        ref='cogids',
        sonar=True,
        segments='tokens')
    alms.align(scorer=lex.bscorer)
    assert '-' in alms.msa['cogids'][12]['alignment'][-1]
def get_wordlist(
        self,
        doculect='base',
        profile=False,
        ref='crossid',
        lexstat=True,
        threshold=0.4):
    """
    Return a classical wordlist from the data.
    """
    if profile:
        profile = segments.Tokenizer(profile)
        tokenize = lambda x: profile('^' + x + '$', column='IPA').split()  # noqa: E731
    else:
        tokenize = lingpy.ipa2tokens
    D = {
        0: [
            'doculect',
            'concept',
            'concept_in_source',
            'concept_type',
            'form',
            'tokens',
            'occurrences',
            'word_forms',
            'gloss_forms',
            'phrase_example',
            'gloss_example',
            'references',
        ]
    }
    idx = 1
    for ctype in ['lexicon', 'grammar']:
        concepts = self.get_concepts(ctype=ctype)
        concordance = self._concordances[ctype]
        for concept, entries in concepts.items():
            for form, lid, cis, freq in entries:
                # retrieve the concordance
                pidx, sA, sB = concordance[form, concept, cis, lid][0]
                txt = self[pidx].phrase
                gls = self[pidx].gloss
                word, fgls = self[pidx, sA]
                tokens = tokenize(form)
                references = ' '.join(
                    ['{0}:{1}:{2}'.format(a, b, c)
                     for a, b, c in concordance[form, concept, cis, lid]])
                # check tokens
                try:
                    lingpy.tokens2class(tokens, 'sca')
                    check = True
                except:  # noqa: E722, # pragma: no cover
                    check = False
                if concept.strip() and check:
                    D[idx] = [
                        doculect if self.monolingual else lid,
                        concept, cis, ctype, form, tokens, freq, word, fgls,
                        txt, gls, references]
                    idx += 1
                else:
                    print('[!] Problem with "{0}" / [{1}] [{2}] / {3} {4} {5}'.format(
                        concept, form, tokens, pidx, sA, sB,
                    ))
    wl = lingpy.Wordlist(D)
    if lexstat:
        wl = lingpy.LexStat(D)
        wl.cluster(method='sca', threshold=threshold, ref=ref)
    else:
        wl.add_entries('cog', 'concept,form', lambda x, y: x[y[0]] + '-' + x[y[1]])
        wl.renumber('cog', ref)
    return wl
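# A hypothetical call to get_wordlist(); `corpus` stands in for an instance
# of the class this method belongs to, and the orthography-profile path is a
# placeholder.
wl = corpus.get_wordlist(
    doculect='base',
    profile='etc/orthography.tsv',  # placeholder profile
    ref='crossid',
    lexstat=True,
    threshold=0.4)
print(wl.height, 'concepts,', wl.width, 'doculects')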
def setUp(self):
    self.lex = lp.LexStat(os.path.join("data", "kessler.qlc"))
action="store_true", default=False, help="The data is in LingPy's format, not CLDF.") parser.add_argument("--verbose", action="store_true", default=False, help="Output the forms which do not match.") parser.add_argument("--ssv", default=False, action="store_true", help="Output one line, not many") args = parser.parse_args() if args.lingpy: import lingpy dataset = lingpy.LexStat(str(args.codings)) forms = { row: { e: dataset[row][dataset.header[e]] for e in dataset.entries if e in dataset.header } for row in dataset } codings = {form: row["partial_ids"] for form, row in forms.items()} c_id = "reference" c_lect = "doculect" c_concept = "concept" c_segm = "tokens" else: dataset = get_dataset(args.codings) cognatesets = cognate_sets(dataset)
data['ASJP1'] = [toASJP(w) for w in data.TOKENS.values]

new_data = {}  # the data formatted as LexStat wants it
new_data[0] = ['doculect', 'concept', 'ipa', 'tokens', 'index']  # header
key = 1
for i in data.index:
    nl = list(data.loc[i][['DOCULECT', 'CONCEPT', 'FORM']])
    nl.append(data.loc[i]['TOKENS'].split())
    nl.append(i)
    new_data[key] = nl
    key += 1

wl = lp.Wordlist(new_data)
lex = lp.LexStat(wl)
lex.get_scorer(runs=10000, preprocessing=False)


def get_pairs(lang1, lang2, lex):
    """
    Returns all the lang1-lang2 pairs of words with the same Concepticon ID.
    Returns [] of LexStat ID tuples.

    Note that LexStat.pairs cannot be used here because the latter flattens
    transcription duplicates.
    """
    pairs = []
    for gloss1, indices1 in lex.get_dict(col=lang1).items():
        for gloss2, indices2 in lex.get_dict(col=lang2).items():
            if gloss1 == gloss2:
                # plausible completion of the truncated body: pair every
                # index combination for the shared gloss
                pairs.extend((iA, iB) for iA in indices1 for iB in indices2)
    return pairs
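# A short illustration of get_pairs(); 'English' and 'German' are placeholder
# doculect names that would have to occur in the DOCULECT column of `data`.
for idxA, idxB in get_pairs('English', 'German', lex):
    # wordlists support (row, column) indexing, so this prints the two
    # transcriptions that share a gloss
    print(lex[idxA, 'ipa'], lex[idxB, 'ipa'])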
help="A CLDF dataset with cognate codes") parser.add_argument("--gold-lingpy", action="store_true", default=False, help="The ground-truth data is in LingPy's format, not CLDF.") # parser.add_argument("--lingpy", action="store_true", # default=False, # help="The data is in LingPy's format, not CLDF.") parser.add_argument("--ssv", default=False, action="store_true", help="Output one line, not many") args = parser.parse_args() if args.codings.suffix == '.tsv': # Assume LingPy import lingpy dataset = lingpy.LexStat(str(args.codings)) forms = {row: {e: dataset[row][dataset.header[e]] for e in dataset.entries if e in dataset.header} for row in dataset} codings = { str(form): str(row["cogid"]) for form, row in forms.items()} def iterate_concept_and_id(): for i in dataset: yield dataset[i][dataset.header['concept']], str(i) else: dataset = get_dataset(args.codings) cognatesets = cognate_sets(dataset)