# Partial cognate detection for the D_Chen subset. This is a
# cldfbench-style command; the imports (the dataset's Dataset class and
# lingpy's Partial) are assumed to be provided by the surrounding module.
def run(args):
    ds = Dataset()
    try:
        # re-use a previously computed scorer if one has been cached
        part = Partial(
            ds.dir.joinpath('workflow', 'D_Chen_partial.bin.tsv').as_posix())
    except (IOError, OSError):
        # otherwise compute the scorer from scratch and cache it
        part = Partial(
            ds.dir.joinpath('workflow', 'D_Chen_subset.tsv').as_posix(),
            segments='tokens')
        part.get_partial_scorer(runs=10000)
        part.output(
            'tsv',
            filename=ds.dir.joinpath('workflow', 'D_Chen_partial.bin').as_posix(),
            ignore=[], prettify=False)
        args.log.info('[i] saved the scorer')
    finally:
        part.partial_cluster(
            method='lexstat', threshold=0.55, ref='cogids', mode='global',
            gop=-2, cluster_method='infomap')
        part.output(
            'tsv',
            filename=ds.dir.joinpath('workflow', 'D_Chen_partial').as_posix(),
            prettify=False)
# Load the CLDF data into a wordlist, compute partial cognates, and
# align them. Again a cldfbench-style command; Dataset, Wordlist,
# Partial, and Alignments are assumed to be imported by the module.
def run(args):
    ds = Dataset(args)
    wl = Wordlist.from_cldf(
        str(ds.cldf_specs().dir.joinpath('cldf-metadata.json')))
    # keep only those entries that have segmented forms
    D = {0: [x for x in wl.columns]}
    for idx in wl:
        if wl[idx, 'tokens']:
            D[idx] = wl[idx]
    part = Partial(D, check=True)
    part.get_partial_scorer(runs=10000)
    part.partial_cluster(
        method='lexstat', threshold=0.45, ref='cogids',
        cluster_method='infomap')
    alms = Alignments(part, ref='cogids', fuzzy=True)
    alms.align()
    alms.output('tsv', filename='chin-aligned')
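    # Optional follow-up (a sketch, not part of the original command):
    # collapse the partial cognate sets into full cognate IDs with
    # Partial.add_cognate_ids; roughly, 'strict' requires the whole
    # sequence of morpheme IDs to match, while 'loose' also merges words
    # whose partial IDs merely overlap.
    part.add_cognate_ids('cogids', 'cogid_strict', idtype='strict')
    part.add_cognate_ids('cogids', 'cogid_loose', idtype='loose')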
from lingpy import *
from lingpy.compare.partial import Partial

try:
    # re-use the cached scorer from an earlier run
    part = Partial('hm-111-17.bin.tsv', segments='segments')
except (IOError, OSError):
    part = Partial('hm-111-17.tsv', segments='segments')
    part.get_scorer(runs=10000)
    part.output('tsv', filename='hm-111-17.bin')

# manually correct error in data
part.partial_cluster(
    method='lexstat', cluster_method='infomap', threshold=0.6,
    ref='cogids')
# add empty columns for manual annotation
part.add_entries('note', 'cogid', lambda x: '')
part.add_entries('morphemes', 'cogid', lambda x: '')
part.output('tsv', filename='hm-111-17-t06', ignore='all', prettify=False)
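# Quick inspection (a sketch, not part of the original script): print
# the partial cognate sets assigned to the first few entries.
for idx in list(part)[:5]:
    print(part[idx, 'doculect'], part[idx, 'concept'], part[idx, 'cogids'])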
from lingpy import *
from lingpy.compare.partial import Partial
from lingpy.evaluate.acd import partial_bcubes

try:
    # borrow the sound-correspondence scorer from a previously scored run
    lexx = LexStat('hm-jerry-scored.bin.tsv', segments='segments')
    lex = Partial('../hm-111-17_16feb.tsv', segments='segments')
    lex.cscorer = lexx.cscorer
except (IOError, OSError):
    lex = Partial('../hm-111-17_16feb.tsv', segments='segments')
    lex.get_scorer(runs=10000)
    lex.output('tsv', filename='hm-jerry-scored.bin')

# we test several thresholds
for i in range(2, 8):
    lex.partial_cluster(
        method='lexstat', cluster_method='infomap',
        threshold=i * 0.1, ref='t' + str(i))
    a, b, c = partial_bcubes(lex, 'cogids', 't' + str(i), pprint=False)
    print('{0:2} {1:.2f} {2:.2f} {3:.2f}'.format(i, a, b, c))
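# A small extension (not in the original script): keep the scores and
# report the threshold with the highest B-cubed F-score.
scores = []
for i in range(2, 8):
    p, r, fs = partial_bcubes(lex, 'cogids', 't' + str(i), pprint=False)
    scores.append((i * 0.1, fs))
best_t, best_fs = max(scores, key=lambda s: s[1])
print('best threshold: {0:.1f} (F-score: {1:.2f})'.format(best_t, best_fs))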
# Excerpt from the test suite; the surrounding test-module imports
# (Partial, _get_slices, lingpy.algorithm.extra, test_data, WithTempDir,
# text_type, assert_raises) are assumed.
class Tests(WithTempDir):

    def setUp(self):
        WithTempDir.setUp(self)
        self.part = Partial(
            test_data('partial_cognates.tsv'), segments='segments')
        self.part2 = Partial(
            test_data('partial_cognates-scored.tsv'), segments='segments')

    def test__get_slices(self):
        a = _get_slices(list('ba²te²'))
        b = _get_slices(list('ba²te²'), split_on_tones=False)
        assert a[0][1] == 3
        assert b[0][1] == 6

    def test_get_partial_matrices(self):
        for method in ['upgma', 'single', 'complete', 'ward', 'mcl']:
            matrix = list(self.part._get_partial_matrices(
                cluster_method=method, concept='bird'))[0]
            assert isinstance(matrix[0][0], (float, int))

        if lingpy.algorithm.extra.igraph:
            for concept, tracer, matrix in self.part._get_partial_matrices(
                    cluster_method='infomap'):
                assert isinstance(concept, text_type)
                assert [x[0] for x in tracer]

    def test_partial_cluster(self):
        assert_raises(
            ValueError, self.part.partial_cluster, cluster_method='upgmu')
        self.part.partial_cluster(
            method='sca', threshold=0.45,
            cluster_method='infomap' if lingpy.algorithm.extra.igraph
            else 'upgma',
            ref='parts1')
        self.part.partial_cluster(
            method='sca', threshold=0.45, cluster_method='mcl',
            ref='parts2')
        self.part.partial_cluster(
            method='sca', threshold=0.45, cluster_method='upgma',
            ref='parts3')
        self.part2.partial_cluster(
            method='lexstat', threshold=0.6, cluster_method='single',
            post_processing=True, imap_mode=False, ref='parts4')
        # high threshold to trigger post-processing movement
        self.part.partial_cluster(
            method='sca', threshold=0.9, cluster_method='single',
            post_processing=True, imap_mode=False, ref='parts5')

        assert self.part[9, 'parts3'][0] == self.part[10, 'parts3'][0]
        assert self.part2[8, 'parts4'][1] == self.part2[10, 'parts4'][1]

    def test_add_cognate_ids(self):
        self.part.partial_cluster(
            method='sca', threshold=0.45, cluster_method='upgma',
            ref='parts3')
        self.part.add_cognate_ids('parts3', 'cogs1', idtype='strict')
        self.part.add_cognate_ids('parts3', 'cogs2', idtype='loose')
        assert self.part[9, 'cogs1'] == self.part[10, 'cogs1']
        assert_raises(
            ValueError, self.part.add_cognate_ids, 'parts3', 'cogs1',
            idtype='dummy')
# Excerpt from a threshold-evaluation loop; `lex`, `f`, `methods`,
# `cluster_methods`, `ccubes`, and `pprint_result` are defined in the
# surrounding script.
if 'loose_cogid' not in lex.header:
    lex.add_cognate_ids('partialids', 'loose_cogid', 'loose')

for i in range(1, 20):
    print("Analyzing {0} with t={1}...".format(f, i))
    t = 0.05 * i
    ts = '{0:.2f}'.format(t).replace('0.', '')
    for m in methods:
        msf = 'f_' + m
        for cm in cluster_methods:
            ms = '{0}_{1}_{2}'.format('p', m, cm)
            msf = '{0}_{1}_{2}'.format('f', m, cm)
            msp = ms + '_' + ts
            lex.partial_cluster(
                method=m, cluster_method=cm, threshold=t, ref=msp)
            # get loose and strict cognate ids for this method
            lex.add_cognate_ids(msp, ms + '_strict' + '_' + ts, 'strict')
            lex.add_cognate_ids(msp, ms + '_loose' + '_' + ts, 'loose')
            # get the bcubes
            for mode in ['strict', 'loose']:
                msm = ms + '_' + mode + '_' + ts
                p, r, fs = bcubes(lex, mode + '_cogid', msm, pprint=False)
                pprint_result(f, msm, ts, p, r, fs)
                ccubes += [[msm, f, t, ts, p, r, fs]]
            p, r, fs = partial_bcubes(lex, 'partialids', msp, pprint=False)
            pprint_result(f, msp, ts, p, r, fs)
            ccubes += [[msp, f, t, ts, p, r, fs]]
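# For orientation: with m='lexstat', cm='infomap', and t=0.45 the loop
# above creates the columns 'p_lexstat_infomap_45' (partial cognates),
# 'p_lexstat_infomap_strict_45', and 'p_lexstat_infomap_loose_45'.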
from lingpy import *
from lingpy.compare.partial import Partial
from sys import argv

# analyze the full dataset when called with 'all', else the D-subset
if 'all' in argv:
    fname = 'A_Chen_'
else:
    fname = 'D_Chen_'

try:
    # re-use a previously computed scorer if one has been cached
    part = Partial(fname + 'partial.bin.tsv')
except (IOError, OSError):
    part = Partial(fname + 'subset.tsv', segments='tokens')
    print('[i] loaded the file')
    part.get_partial_scorer(runs=10000)
    part.output(
        'tsv', filename=fname + 'partial.bin', ignore=[], prettify=False)
    print('[i] saved the scorer')
finally:
    part.partial_cluster(
        method='lexstat', threshold=0.55, ref='cogids', mode='global',
        gop=-2, cluster_method='infomap')
    part.output('tsv', filename=fname + 'partial', prettify=False)
# Excerpt from a conversion script; `columns` (the CLDF columns to read)
# and the helper `cogids2cogid` are defined elsewhere in the script.
namespace = (
    ('concept_name', 'concept'),
    ('language_id', 'doculect'),
    ('segments', 'tokens'),
    ('language_glottocode', 'glottolog'),
    ('concept_concepticon_id', 'concepticon'),
    ('language_latitude', 'latitude'),
    ('language_longitude', 'longitude'),
    ('cognacy', 'cognacy'),
    ('cogid_cognateset_id', 'cognacy'))

wl = Wordlist.from_cldf(
    '../cldf/cldf-metadata.json', columns=columns, namespace=namespace)

# keep only those entries that have segmented forms
D = {0: wl.columns}
for idx in wl:
    if wl[idx, 'tokens']:
        D[idx] = wl[idx]

part = Partial(D)
part.partial_cluster(method='sca', threshold=0.45, ref='cogids')
alms = Alignments(part, ref='cogids')
alms.align()
alms.add_entries('note', 'form', lambda x: '')

# alternative, manual renumbering based on expert cognacy (disabled):
#part.add_entries('cog', 'cognacy', lambda x: x)
#for idx in wl:
#    if wl[idx, 'cog'].strip():
#        wl[idx, 'cog'] += '-' + wl[idx, 'concept']
#    else:
#        wl[idx, 'cog'] += str(idx)
#
#wl.renumber('cog')
cogids2cogid(alms)
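# If the cogids2cogid helper is unavailable, a comparable collapse of
# partial into full cognate IDs could be sketched with the built-in
# method used in the other snippets here:
#   part.add_cognate_ids('cogids', 'cogid', idtype='strict')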