Пример #1
0
class Tests(WithTempDir):
    """Exercise partial cognate detection via the ``Partial`` wordlist."""

    def setUp(self):
        WithTempDir.setUp(self)
        # Two fixtures: a plain wordlist and one shipped with a scorer.
        self.part = Partial(
            test_data('partial_cognates.tsv'), segments='segments')
        self.part2 = Partial(
            test_data('partial_cognates-scored.tsv'), segments='segments')

    def test__get_slices(self):
        with_tones = _get_slices(list('ba²te²'))
        without_tones = _get_slices(list('ba²te²'), split_on_tones=False)
        # Splitting on tones ends the first slice at the tone marker;
        # otherwise the whole string is a single slice.
        assert with_tones[0][1] == 3
        assert without_tones[0][1] == 6

    def test_get_partial_matrices(self):
        # Each flat cluster method must produce numeric distance matrices.
        for cluster_method in ['upgma', 'single', 'complete', 'ward', 'mcl']:
            first_matrix = list(self.part._get_partial_matrices(
                cluster_method=cluster_method, concept="bird"))[0]
            assert isinstance(first_matrix[0][0], (float, int))

        if lingpy.algorithm.extra.igraph:
            # infomap is only available with igraph; here the generator
            # yields (concept, tracer, matrix) triples.
            for concept, tracer, matrix in self.part._get_partial_matrices(
                    cluster_method='infomap'):
                assert isinstance(concept, text_type)
                assert [row[0] for row in tracer]

    def test_partial_cluster(self):
        # An unknown cluster method must raise a ValueError.
        assert_raises(ValueError, self.part.partial_cluster,
                      cluster_method='upgmu')

        # Prefer infomap when igraph is installed, else fall back to upgma.
        first_method = 'infomap' if lingpy.algorithm.extra.igraph else 'upgma'
        self.part.partial_cluster(method='sca', threshold=0.45,
                                  cluster_method=first_method, ref='parts1')
        self.part.partial_cluster(method='sca', threshold=0.45,
                                  cluster_method='mcl', ref='parts2')
        self.part.partial_cluster(method='sca', threshold=0.45,
                                  cluster_method='upgma', ref='parts3')

        self.part2.partial_cluster(method='lexstat', threshold=0.6,
                                   cluster_method='single',
                                   post_processing=True, imap_mode=False,
                                   ref='parts4')
        # high threshold to trigger post-processing movement
        self.part.partial_cluster(method='sca', threshold=0.9,
                                  cluster_method='single',
                                  post_processing=True, imap_mode=False,
                                  ref='parts5')

        assert self.part[9, 'parts3'][0] == self.part[10, 'parts3'][0]
        assert self.part2[8, 'parts4'][1] == self.part2[10, 'parts4'][1]

    def test_add_cognate_ids(self):
        self.part.partial_cluster(method='sca', threshold=0.45,
                                  cluster_method='upgma', ref='parts3')
        self.part.add_cognate_ids('parts3', 'cogs1', idtype='strict')
        self.part.add_cognate_ids('parts3', 'cogs2', idtype='loose')

        assert self.part[9, 'cogs1'] == self.part[10, 'cogs1']
        # Unsupported idtype must raise.
        assert_raises(ValueError, self.part.add_cognate_ids,
                      'parts3', 'cogs1', idtype='dummy')
Пример #2
0
class Tests(WithTempDir):
    """Tests for partial cognate detection on the bundled test data."""

    def setUp(self):
        WithTempDir.setUp(self)
        # One plain fixture, one fixture carrying a precomputed scorer.
        self.part = Partial(test_data('partial_cognates.tsv'),
                            segments='segments')
        self.part2 = Partial(test_data('partial_cognates-scored.tsv'),
                             segments='segments')

    def test__get_slices(self):
        tone_split = _get_slices(list('ba²te²'))
        plain_split = _get_slices(list('ba²te²'), split_on_tones=False)
        assert tone_split[0][1] == 3
        assert plain_split[0][1] == 6

    def test_get_partial_matrices(self):
        # Flat clustering methods all yield matrices of numbers.
        for cm in ['upgma', 'single', 'complete', 'ward', 'mcl']:
            head = list(self.part._get_partial_matrices(
                cluster_method=cm, concept="bird"))[0]
            assert isinstance(head[0][0], (float, int))

        if lingpy.algorithm.extra.igraph:
            # With igraph present, infomap yields (concept, tracer, matrix).
            triples = self.part._get_partial_matrices(
                cluster_method='infomap')
            for concept, tracer, matrix in triples:
                assert isinstance(concept, text_type)
                assert [entry[0] for entry in tracer]

    def test_partial_cluster(self):
        # Invalid cluster method.
        assert_raises(ValueError, self.part.partial_cluster,
                      cluster_method='upgmu')

        preferred = 'infomap' if lingpy.algorithm.extra.igraph else 'upgma'
        self.part.partial_cluster(method='sca', threshold=0.45,
                                  cluster_method=preferred, ref='parts1')
        # Same data, two more clusterings under different refs.
        for cm, ref in [('mcl', 'parts2'), ('upgma', 'parts3')]:
            self.part.partial_cluster(method='sca', threshold=0.45,
                                      cluster_method=cm, ref=ref)

        self.part2.partial_cluster(method='lexstat', threshold=0.6,
                                   cluster_method='single',
                                   post_processing=True, imap_mode=False,
                                   ref='parts4')
        # high threshold to trigger post-processing movement
        self.part.partial_cluster(method='sca', threshold=0.9,
                                  cluster_method='single',
                                  post_processing=True, imap_mode=False,
                                  ref='parts5')

        assert self.part[9, 'parts3'][0] == self.part[10, 'parts3'][0]
        assert self.part2[8, 'parts4'][1] == self.part2[10, 'parts4'][1]

    def test_add_cognate_ids(self):
        self.part.partial_cluster(method='sca', threshold=0.45,
                                  cluster_method='upgma', ref='parts3')
        for target, idtype in [('cogs1', 'strict'), ('cogs2', 'loose')]:
            self.part.add_cognate_ids('parts3', target, idtype=idtype)
        assert self.part[9, 'cogs1'] == self.part[10, 'cogs1']
        # Unknown idtype is rejected.
        assert_raises(ValueError, self.part.add_cognate_ids,
                      'parts3', 'cogs1', idtype='dummy')
Пример #3
0
# Evaluation measures for comparing cognate detection results.
measures = ['partial', 'strict', 'loose']

# For each input file, prefer a cached scored wordlist ('BIN_<f>.tsv');
# when it is missing, compute the scorer and write the cache to disk.
for f in infiles:
    try:
        lex = Partial(pcd_path('data', 'BIN_'+f+'.tsv'))
    except IOError:
        lex = Partial(pcd_path('data', f+'.tsv'))
        lex.get_scorer(
                preprocessing=False, 
                runs=10000,
                )
        # NOTE(review): the cache name strips the first two characters of f
        # ('BIN_'+f[2:]) — presumably f carries a two-character prefix here;
        # confirm against the definition of infiles.
        lex.output('tsv', filename=pcd_path('data', 'BIN_'+f[2:]))

    # create new reference ids for cognates from partial cognates
    if not 'strict_cogid' in lex.header:
        lex.add_cognate_ids('partialids', 'strict_cogid', 'strict')
    if not 'loose_cogid' in lex.header:
        lex.add_cognate_ids('partialids', 'loose_cogid', 'loose')

    # Sweep thresholds t = 0.05 .. 0.95 in steps of 0.05; ts is the
    # threshold rendered without the leading '0.' (e.g. '45').
    for i in range(1,20):
        print("Analyzing {0} with t={1}...".format(f, i))
        t = 0.05 * i
        ts = '{0:.2f}'.format(t).replace('0.','')

        for m in methods:
            msf = 'f_'+m
            for cm in cluster_methods:
                # Column-name templates: p_<method>_<cluster>,
                # f_<method>_<cluster>, and the threshold-specific variant.
                ms = '{0}_{1}_{2}'.format('p', m, cm)
                msf = '{0}_{1}_{2}'.format('f', m, cm)
                # NOTE(review): the visible span ends here with msp unused —
                # the loop body appears truncated in this excerpt.
                msp = ms +'_'+ts
Пример #4
0
def prepare(ds):
    """Convert raw BeijingDaxue-1964 character data into a CLDF-like dataset.

    Steps:
      * parse characters (numbers, zeros)
      * recreate partial cognate identifiers ('cogids')
      * derive strict cognate identifiers ('cogid')
      * align partial cognates and write finalized TSV and CSV output

    :param ds: dataset object providing ``raw()`` path lookup, ``sounds``,
        ``languages`` and ``write_wordlist()``.
    """
    con = Concepticon()
    beida = con.conceptlists['BeijingDaxue-1964-905']
    inv = ds.sounds
    words = Wordlist(ds.raw('chars-corrected-2017-06-18.tsv'))
    partialids, pidc = {}, {}
    pidx = 1  # running counter for fresh partial cognate ids
    concepts = {}
    for idx, chars, tks, doculect, glossid in iter_rows(
            words, 'benzi', 'segments', 'doculect', 'beida_id'):
        tokens = tokens2morphemes(tks)
        benzi = parse_chars(chars, doculect, tokens)
        if len(tokens) != len(benzi):
            # morpheme/character count mismatch: report and continue
            print(doculect, glossid, benzi, tokens)
        pids = []
        for char in benzi:
            if char == '囗':
                # placeholder character: always assign a fresh id
                pids += [str(pidx)]
                pidx += 1
            else:
                # identical characters share one partial cognate id
                if char not in partialids:
                    partialids[char] = str(pidx)
                    pidx += 1
                pids += [partialids[char]]
        words[idx, 'cogids'] = ' '.join(pids)
        words[idx, 'benzi'] = ' '.join(benzi)

        # retrieve correct concept
        bidx = 'BeijingDaxue-1964-905-' + glossid
        concept = beida.concepts[bidx]
        concepts[idx] = [
            concept.concepticon_id, concept.attributes['chinese'],
            concept.attributes['pageno'], concept.attributes['pinyin']
        ]
        words[idx, 'concept'] = concept.gloss + ' (' + concept.attributes[
            'pinyin'] + ' ' + concept.attributes['chinese'] + ')'
    for i, entry in enumerate(['concepticon_id', 'chinese', 'page', 'pinyin']):
        # bind i as a default argument: a bare `lambda x: x[i]` is a
        # late-binding closure and would read the *final* value of i if
        # add_entries ever evaluated the callback after the loop finished
        words.add_entries(entry, concepts, lambda x, i=i: x[i])
    words.add_entries('benzi_in_source', 'hanzi', lambda x: x)
    words.add_entries('source', 'ipa', lambda x: 'BeijingDaxue1964')
    words.add_entries('value', 'ipa', lambda x: x)
    words.add_entries('form', 'ipa', lambda x: x)
    words.add_entries('glottolog', 'doculect',
                      lambda x: ds.languages[x]['glottolog'])
    words.add_entries('iso', 'doculect', lambda x: ds.languages[x]['iso'])

    # determine order of entries: within each doculect, number consecutive
    # rows sharing the same concept 1, 2, 3, ...
    order = {}
    for d in words.cols:
        entries = words.get_list(col=d, flat=True)
        concept, oid = '', 1
        for idx in sorted(entries):
            new_concept = words[idx, 'concept']
            if new_concept == concept:
                oid += 1
            else:
                concept = new_concept
                oid = 1
            order[idx] = oid
    words.add_entries('order', order, lambda x: str(x))

    words.output('tsv', filename=ds.raw('tmp-2017-06-18'))
    print('first run on words')
    # second pass: derive strict cognate ids from the partial ones
    part = Partial(ds.raw('tmp-2017-06-18.tsv'), segments='segments')
    part.add_cognate_ids('cogids', 'cogid')
    part.output('tsv', filename=ds.raw('tmp-2017-06-18'))
    print('created cognate ids')
    # third pass: align the partial cognate sets
    alm = Alignments(ds.raw('tmp-2017-06-18.tsv'),
                     segments='segments',
                     ref='cogids',
                     alignment='alignments')
    alm.align()
    alm.output('tsv',
               filename=ds.raw('tmp-2017-06-18-finalized'),
               subset=True,
               cols=[
                   'doculect', 'glottolog', 'iso', 'concept', 'concepticon_id',
                   'chinese', 'pinyin', 'benzi', 'benzi_in_source', 'value',
                   'form', 'segments', 'cogid', 'cogids', 'note', 'source',
                   'beida_id', 'page', 'order', 'alignments'
               ])
    words = Wordlist(ds.raw('tmp-2017-06-18-finalized.tsv'))
    ds.write_wordlist(words)
    with open('cldf/beijingdaxue1964.csv', 'w') as f:
        f.write(','.join([
            'ID', 'Language_name', 'Language_ID', 'Language_iso',
            'Parameter_ID', 'Parameter_name', 'Source', 'Comment',
            'Parameter_Chinese', 'Parameter_Pinyin', 'Value', 'Form',
            'Segments', 'Cognate_Set', 'Cognate_Sets', 'Alignments', 'Order',
            'Beida_ID', 'Page', 'Benzi', 'Benzi_in_source'
        ]) + '\n')
        for idx in words:
            out = [str(idx)]
            for entry in [
                    'doculect', 'glottolog', 'iso', 'concepticon_id',
                    'concept', 'source', 'note', 'chinese', 'pinyin', 'value',
                    'form', 'segments', 'cogid', 'cogids', 'alignments',
                    'order', 'beida_id', 'page', 'benzi', 'benzi_in_source'
            ]:
                value = words[idx, entry]
                if isinstance(value, list):
                    value = ' '.join([str(x) for x in value])
                else:
                    value = str(value)
                # RFC-4180-style escaping: any field containing a comma,
                # a double quote, or a newline must be wrapped in quotes
                # with inner quotes doubled.  (The previous code only
                # quoted on commas, emitting malformed CSV whenever a
                # value contained a quote or newline but no comma.)
                if any(c in value for c in ',"\n'):
                    value = '"' + value.replace('"', '""') + '"'
                out += [value]
            f.write(','.join(out) + '\n')
Пример #5
0
from lingpy.compare.partial import Partial
from lingpy.convert.plot import plot_tree
from sys import argv
from clldutils.text import strip_brackets, split_text
from collections import defaultdict
from lingpy import basictypes

# Choose the output prefix from the command line: 'all' selects the
# 'A_' file set, anything else the 'D_' file set.
fname = '../output/A_Deepadung_' if 'all' in argv else '../output/D_Deepadung_'

part = Partial(fname + 'crossids.tsv')

# Strict cognate ids from the partial (cross) ids, then a concept-aware
# cognate label that is renumbered into integer ids under 'cogid'.
part.add_cognate_ids('crossids', 'crossid', idtype='strict')
part.add_entries('cog', 'crossid,concept',
                 lambda x, y: str(x[y[0]]) + x[y[1]])
part.renumber('cog')

# Distance matrix and neighbor-joining tree over the strict cognates.
part.calculate('distance', ref='cogid')
part.calculate('tree', tree_calc='neighbor')

part.output('dst', filename=fname + 'distance')
part.output('tre', filename=fname + 'tree')

# Optional tree plot when requested on the command line.
if 'plot' in argv:
    plot_tree(str(part.tree), degree=350, filename=fname + 'tree')


Пример #6
0
# Evaluation measures for comparing cognate detection results.
measures = ['partial', 'strict', 'loose']

# For each input file, prefer a cached scored wordlist ('BIN_<f>.tsv');
# when it is missing, compute the scorer and write the cache to disk.
for f in infiles:
    try:
        lex = Partial(pcd_path('data', 'BIN_' + f + '.tsv'))
    except IOError:
        lex = Partial(pcd_path('data', f + '.tsv'))
        lex.get_scorer(
            preprocessing=False,
            runs=10000,
        )
        # NOTE(review): the cache name strips the first two characters of f
        # ('BIN_' + f[2:]) — presumably f carries a two-character prefix;
        # confirm against the definition of infiles.
        lex.output('tsv', filename=pcd_path('data', 'BIN_' + f[2:]))

    # create new reference ids for cognates from partial cognates
    if not 'strict_cogid' in lex.header:
        lex.add_cognate_ids('partialids', 'strict_cogid', 'strict')
    if not 'loose_cogid' in lex.header:
        lex.add_cognate_ids('partialids', 'loose_cogid', 'loose')

    # Sweep thresholds t = 0.05 .. 0.95 in steps of 0.05; ts is the
    # threshold rendered without the leading '0.' (e.g. '45').
    for i in range(1, 20):
        print("Analyzing {0} with t={1}...".format(f, i))
        t = 0.05 * i
        ts = '{0:.2f}'.format(t).replace('0.', '')

        for m in methods:
            msf = 'f_' + m
            for cm in cluster_methods:
                # Column-name templates: p_<method>_<cluster>,
                # f_<method>_<cluster>, and the threshold-specific variant.
                ms = '{0}_{1}_{2}'.format('p', m, cm)
                msf = '{0}_{1}_{2}'.format('f', m, cm)
                # NOTE(review): the visible span ends here with msp unused —
                # the loop body appears truncated in this excerpt.
                msp = ms + '_' + ts
Пример #7
0
    7336: 31174,
    7133: 31074,
    7131: 31073,
}
# Merge the hand-curated explicit matches into the automatic matcher.
matcher.update(explicit)

# Blacklist every entry of ob2 that is neither a match target nor marked
# as deliberately unmatched.
blacklist = []
for idx in ob2:
    if idx not in matcher.values() and idx not in unmatched:
        blacklist += [idx]

# now that we have all relevant data, we need to compare the cognate sets
# print(max([int(stdb[idx, 'cogid']) for idx in stdb]))

# cogid range should be 7000+
part = Partial(wl)
# Derive strict cognate ids ('strictid') from the partial ids.
part.add_cognate_ids('cogids', 'strictid', idtype='strict')

# compute a matcher of cognate ids
burm2stdb = {}
ncid = 8000  # base id for cognate sets with no counterpart in stdb
for idx in part:
    nidx = matcher.get(idx)
    tid = part[idx, 'strictid']
    if nidx and nidx not in burm2stdb:
        # NOTE(review): the membership test checks nidx but the dict is
        # keyed by tid — confirm this asymmetry is intended.
        oid = stdb[nidx, 'cogid']
        burm2stdb[tid] = oid
    else:
        if tid in burm2stdb:
            pass
        else:
            # NOTE(review): ncid is never incremented within the visible
            # lines, so all fallback sets would share id 8000 — the
            # excerpt appears truncated here; verify in the full file.
            burm2stdb[tid] = str(ncid)