Exemplo n.º 1
0
def lexStatIM_test(th=.57):
    """
    Run LexStat + Infomap cognate clustering over every test database and
    score the detected partitions against the gold-standard cogids.

    Parameters
    ----------
    th : float
        Clustering threshold passed to ``LexStat.cluster``.

    Returns
    -------
    list of tuple
        One ``(db, dbAri, bc)`` triple per database: the database name,
        the mean adjusted Rand index over concepts, and the B-cubed score.

    NOTE(review): relies on module-level names defined elsewhere in this
    file (`test`, `lp`, `pd`, `array`, `vstack`, `concatenate`, `mean`,
    `bcubed`, `adjusted_rand_score`, `infomap_clustering`).
    """
    # Fixed seed so the Infomap clustering is reproducible across runs.
    random.seed(12345)
    scores = []
    lp.rc(schema='ipa')
    for fn in test:
        db = fn.split('.')[0]
        # Python 3 print call (original used the Python 2 print statement).
        print(db)
        lex = lp.LexStat('reformattedData/ipa/' + fn,
                         check=False)
        lex.get_scorer(preprocessing=False, run=10000)
        lex.cluster(method='lexstat', threshold=th,
                    external_function=lambda x, y: infomap_clustering(y, x, revert=True),
                    ref="lexstat_infomap")
        taxa = array(lex.cols)
        # One row per word: concept, doculect, counterpart, gold cogid,
        # and the cluster id assigned by LexStat/Infomap.
        partition = vstack([array([concatenate(lex.get_dict(col=l, entry=entry).values())
                                   for entry in ['concept', 'doculect', 'ipa',
                                                 'cogid',
                                                 'lexstat_infomap']]).T for l in taxa])
        partition = pd.DataFrame(partition, columns=['concept', 'doculect', 'counterpart',
                                                     'cogid', 'lpID'])
        partition.to_csv('lexstatCC_IM/' + db + '_lsCC.csv', encoding='utf-8', index=False)
        # ARI is computed per concept and then averaged.
        concepts = partition.concept.unique()
        scoreList = []
        for c in concepts:
            cPartition = partition[partition.concept == c]
            ari = adjusted_rand_score(cPartition.cogid, cPartition.lpID)
            scoreList.append(ari)
        dbAri = mean(scoreList)
        # B-cubed over concept-qualified labels so identical cogids from
        # different concepts are not conflated.
        bc = bcubed(array([':'.join(x)
                           for x in partition[['concept', 'cogid']].values]),
                    array([':'.join(x)
                           for x in partition[['concept', 'lpID']].values]))
        scores.append((db, dbAri, bc))
        print(scores[-1])
    return scores
Exemplo n.º 2
0
def main(*args):
    """
    LingPy command line interface.
    """
    parser = get_parser()
    parsed = parser.parse_args(args or None)
    # Apply the global lingpy configuration requested on the command line.
    lingpy.rc(schema=parsed.schema)
    lingpy.rc(model=lingpy.settings.rcParams[parsed.model])
    # Dispatch to the handler registered for the chosen subcommand.
    handler = _cmd_by_name(parsed.subcommand)
    return handler(parsed)
Exemplo n.º 3
0
def main(*args):
    """
    LingPy command line interface.
    """
    # Empty *args falls back to None so argparse reads sys.argv.
    namespace = get_parser().parse_args(args if args else None)
    lingpy.rc(schema=namespace.schema)
    lingpy.rc(model=lingpy.settings.rcParams[namespace.model])
    # Look up and invoke the subcommand implementation.
    return _cmd_by_name(namespace.subcommand)(namespace)
Exemplo n.º 4
0
    def create(self, table=None, dbase=None, ignore=None):
        """
        Upload triple-data to sqlite3-db. Thereby, delete the previous table
        if it is still in the database.

        Parameters
        ----------
        table : str, optional
            Target table name; defaults to the database file's stem.
        dbase : str or pathlib.Path, optional
            Path to the sqlite database; defaults to ``self.dbase``.
        ignore : list, optional
            Passed through to ``self.update``; defaults to an empty list.
        """
        dbase = pathlib.Path(dbase or self.dbase)
        table = table or dbase.stem
        ignore = ignore or []

        # write a log for the blacklist: one timestamped CSV next to the
        # database with the header columns sorted by their column index.
        with UnicodeWriter(
                dbase.parent.joinpath(
                    lingpy.rc('timestamp') + '-blacklist.log')) as w:
            w.writerow(['ID'] +
                       sorted(self.header, key=lambda x: self.header[x]))
            w.writerows([[str(k)] + [stringval(e) for e in self[k]]
                         for k in self.blacklist])

        with self.cursor(dbase) as cu:
            # Backup table keeps prior values with provenance metadata.
            cu.execute(
                "CREATE TABLE IF NOT EXISTS backup "
                "(file TEXT, id INT, col TEXT, val TEXT, date TEXT, user TEXT)"
            )
            # NOTE(review): the table name is interpolated into the SQL;
            # safe only as long as `table` never comes from untrusted input.
            cu.execute(
                "CREATE TABLE IF NOT EXISTS {0} (id INT, col TEXT, val TEXT)".
                format(table))
            # Drop any rows left over from a previous upload.
            cu.execute("DELETE FROM {0}".format(table))

        # Delegate the actual data upload to update().
        self.update(table=table, dbase=dbase, ignore=ignore)
Exemplo n.º 5
0
def test_token2class():
    """Dolgopolsky sound-class lookup for selected tokens."""
    tokens = 'tʰ ɔ x ˈth ə r A'.split(' ')

    # The model may be passed as an object or by name; unknown tokens
    # map to the '0' class.
    cases = [
        (tokens[0], rc('dolgo'), 'T'),
        (tokens[3], 'dolgo', 'T'),
        (tokens[-1], 'dolgo', '0'),
    ]
    for token, model, expected in cases:
        assert token2class(token, model) == expected
Exemplo n.º 6
0
def set_schema(schema):
    """
    Provides context within which the lingpy schema is set to one of (ASJP,
    IPA). The schema is reverted back to IPA afterwards.

    This is necessary because other modules expect the schema to be IPA.

    Parameters
    ----------
    schema : str
        Either 'asjp' or 'ipa' (case-insensitive).
    """
    assert schema.lower() in ('asjp', 'ipa')

    with disable_info_logs():
        rc(schema=schema.lower())

    try:
        yield
    finally:
        # Revert even when the managed block raises, so code running
        # afterwards never sees a stale non-IPA schema.
        with disable_info_logs():
            rc(schema='ipa')
Exemplo n.º 7
0
    def test_token2class(self):
        """Dolgopolsky class lookup, including the empty-token fallback."""
        tokens = 'tʰ ɔ x ˈth ə r A'.split(' ')

        dolgo_model = rc('dolgo')
        assert token2class(tokens[0], dolgo_model) == 'T'
        assert token2class(tokens[3], 'dolgo') == 'T'
        # Unknown and empty inputs both collapse to class '0'.
        assert token2class(tokens[-1], 'dolgo') == '0'
        assert token2class('', 'dolgo') == '0'
Exemplo n.º 8
0
def test_scorer2str():
    """
    Test conversion of scorers to strings.
    """
    # Dolgopolsky model: small character inventory, easy to compare.
    produced = scorer2str(lingpy.rc('dolgo').scorer)

    # Reference serialisation checked into the test data directory.
    expected = lingpy.util.read_text_file(
        os.path.join(test_data(), 'dolgo.scorer'))

    assert produced == expected
Exemplo n.º 9
0
def test_scorer2str():
    """
    Test conversion of scorers to strings.
    """
    # Compare the serialised dolgo scorer against the reference file
    # shipped with the test data.
    reference_path = os.path.join(test_data(), 'dolgo.scorer')
    reference = lingpy.util.read_text_file(reference_path)

    assert scorer2str(lingpy.rc('dolgo').scorer) == reference
Exemplo n.º 10
0
def test_syllabify():
    """Syllabification output modes, plus the invalid-mode error."""
    tone_seq = "t i a o ¹ b u ² d a o"
    plain_word = "jabloko"
    schwa_word = "jabəlko"
    gapped_seq = "j a b u - k o"

    # An unknown output mode must raise.
    assert_raises(ValueError, syllabify, tone_seq, output="test")

    # Tone markers delimit syllables: the flat output contains two
    # morpheme separators.
    flat = syllabify(tone_seq, output="flat")
    assert flat.count(rc('morpheme_separator')) == 2
    assert syllabify(plain_word, output="breakpoints")[0] == (0, 2)
    assert syllabify(schwa_word, output="nested")[1] == list("bəl")
    assert syllabify(gapped_seq, output="nested")[1] == list("bu-")
Exemplo n.º 11
0
    def test_correctness(self):
        """SCA-based clustering assigns the same cogid to cognate words."""
        lex = self._make_one({
            0: ['ID', 'doculect', 'concept', 'IPA'],
            1: ['1', 'deu', 'hand', 'hand'],
            2: ['2', 'eng', 'hand', 'hand'],
            3: ['3', 'xyz', 'hand', 'xyz']})
        lex.cluster(ref='cogid', method='sca', threshold=0.5)
        # assertEquals is a deprecated alias (removed in Python 3.12);
        # use assertEqual instead.
        self.assertEqual(lex[1, 'cogid'], lex[2, 'cogid'])

        # Repeat with ASJP transcriptions; revert the schema at the end.
        rc(schema='asjp')
        lex = self._make_one({
            0: ['ID', 'concept', 'ipa', 'doculect'],
            1: ['5424', 'Abend::N', 'swar', 'FRA'],
            2: ['5425', 'Abend::N', 'sware', 'FRA'],
            3: ['5426', 'Abend::N', 'sear3', 'RON'],
            4: ['5427', 'Abend::N', 'ivniN', 'ENG'],
            5: ['5428', 'Abend::N', 'noyt3', 'POR'],
            6: ['5429', 'Abend::N', 'tardi5a', 'POR'],
            7: ['5430', 'Abend::N', 'afd3n', 'DAN'],
        })
        lex.cluster(method='sca', threshold=0.5, ref='cogid')
        # NOTE(review): the third argument is interpreted by unittest as
        # the failure *message*, not a third value to compare — only rows
        # 1 and 2 are actually asserted equal. Kept as-is to preserve the
        # original behavior.
        self.assertEqual(lex[1, 'cogid'], lex[2, 'cogid'], lex[3, 'cogid'])
        rc(schema='ipa')
Exemplo n.º 12
0
    def test_correctness(self):
        # SCA-based clustering should assign the same cogid to cognate
        # words ('hand' in German/English) and a different one to the
        # unrelated 'xyz' entry.
        lex = self._make_one({
            0: ['ID', 'doculect', 'concept', 'IPA'],
            1: ['1', 'deu', 'hand', 'hand'],
            2: ['2', 'eng', 'hand', 'hand'],
            3: ['3', 'xyz', 'hand', 'xyz']})
        lex.cluster(ref='cogid', method='sca', threshold=0.5)
        self.assertEqual(lex[1, 'cogid'], lex[2, 'cogid'])

        # Same check with ASJP transcriptions; the schema is switched
        # globally and reverted to IPA at the end of the test.
        rc(schema='asjp')
        lex = self._make_one({
            0: ['ID', 'concept', 'ipa', 'doculect'],
            1: ['5424', 'Abend::N', 'swar', 'FRA'],
            2: ['5425', 'Abend::N', 'sware', 'FRA'],
            3: ['5426', 'Abend::N', 'sear3', 'RON'],
            4: ['5427', 'Abend::N', 'ivniN', 'ENG'],
            5: ['5428', 'Abend::N', 'noyt3', 'POR'],
            6: ['5429', 'Abend::N', 'tardi5a', 'POR'],
            7: ['5430', 'Abend::N', 'afd3n', 'DAN'],
        })
        lex.cluster(method='sca', threshold=0.5, ref='cogid')
        # NOTE(review): the third argument here is treated by unittest as
        # the failure message, not as another value to compare — only rows
        # 1 and 2 are actually asserted equal.
        self.assertEqual(lex[1, 'cogid'], lex[2, 'cogid'], lex[3, 'cogid'])
        rc(schema='ipa')
Exemplo n.º 13
0
    def test_syllabify(self):
        """Syllabification output modes, separators, and error handling."""
        with_tones = "t i a o ¹ b u ² d a o"
        no_spaces = "jabloko"
        with_schwa = "jabəlko"
        with_gap = "j a b u - k o"
        superscript_tone = "ma⁵io"

        # An unknown output mode must raise.
        assert_raises(ValueError, syllabify, with_tones, output="test")

        # Tone markers split the sequence into three syllables, i.e. two
        # separators in the flat output.
        flat = syllabify(with_tones, output="flat")
        assert flat.count(rc('morpheme_separator')) == 2
        assert syllabify(no_spaces, output="breakpoints")[0] == (0, 2)
        assert syllabify(with_schwa, output="nested")[1] == list("bəl")
        assert syllabify(with_gap, output="nested")[1] == list("bu-")
        # A custom separator should leave 'io' as the last chunk.
        joined = ''.join(syllabify(superscript_tone, sep="+"))
        assert joined.split('+')[-1] == 'io'
Exemplo n.º 14
0
def test_settings():
    """The phybo suffix default must stay ' -'."""
    expected_suffix = ' -'
    assert lingpy.rc('phybo_suffix') == expected_suffix
Exemplo n.º 15
0
def test_scorer2str(test_data):
    """Serialised dolgo scorer matches the checked-in reference file."""
    reference = read_text_file(str(test_data / 'dolgo.scorer'))
    assert scorer2str(rc('dolgo').scorer) == reference
Exemplo n.º 16
0
def test_get_score_dict():
    """SCA model scores are retrievable per character pair."""
    model = rc("sca")
    score_dict = get_score_dict(["1.A.-", "2.B.-"], model)
    # Score for the ('A', 'B') class pair under the SCA model.
    assert score_dict['A', 'B'] == -22.5
Exemplo n.º 17
0
# Load the PMI log-odds matrix; rows/columns follow the order of the
# module-level `sounds` list. Using `with` closes the file even on error.
with open('pmi-world.txt', 'r') as f:
    logOdds = array([x.strip().split() for x in f.readlines()], double)

# Score dictionary keyed by ordered sound pairs.
lodict = dict()
# `range` instead of Python-2-only `xrange`, consistent with the rest of
# the file which targets Python 3.
for i in range(len(sounds)):
    for j in range(len(sounds)):
        lodict[sounds[i], sounds[j]] = logOdds[i, j]

# Two gap penalties (presumably open/extend — TODO confirm) read from a
# whitespace-separated file.
with open('gapPenalties.txt') as f:
    gp1, gp2 = array([x.strip() for x in f.readlines()], double)

lp.rc(schema='ipa')


def nwBio(x, y, lodict=lodict, gp1=gp1, gp2=gp2):
    # Global pairwise alignment of x and y via Biopython's pairwise2,
    # using the PMI score dict and the two gap penalties; only the first
    # (best-scoring) alignment is used. The defaults bind the module-level
    # lodict/gp1/gp2 at definition time.
    al = pairwise2.align.globalds(x, y, lodict, gp1, gp2)[0]
    # Returns (alignment score, 2-row array of the aligned sequences).
    return al[2], array(al[:2])


# Load the word-list TSV for the current dataset; na_filter=False keeps
# empty cells as empty strings instead of converting them to NaN.
# NOTE(review): `dataset` must be defined elsewhere in this script.
data = pd.read_table('../data/' + dataset + '.tsv',
                     sep='\t',
                     encoding='utf-8',
                     na_filter=False)


def cleanASJP(word):
    word = word.replace('\t', '')
Exemplo n.º 18
0
 def test_scorer2str(self):
     """
     Test conversion of scorers to strings.
     """
     # Serialise the Dolgopolsky model's scorer and compare it against
     # the reference file shipped with the test data.
     self.assertEqual(scorer2str(lingpy.rc('dolgo').scorer),
                      read_text_file(test_data('dolgo.scorer')))
Exemplo n.º 19
0
from lingpy import rc
from pathlib import Path

from classes import Ipa2Asjp, Alphabet

# Conversion setup: map IPA transcriptions to ASJP sound classes,
# stripping the length mark "ː" during conversion.
ipa = Alphabet(Path("../data/alphabets/ipa.csv"))
sca = rc('asjp')  # NOTE(review): named `sca` but holds the ASJP model
converter = Ipa2Asjp(sca, ["ː"])

romance_ipa_path = Path("../data/romance_ciobanu_ipa.csv")
# NOTE(review): handle from .open() is never closed in the visible code.
romance_ipa = romance_ipa_path.open(encoding='utf-16').read()

out_path = Path("../data/romance_ciobanu_asjp.csv")
# NOTE(review): touch() is redundant — open('w') below creates the file;
# the write handle is also left open (used by code past this chunk), so
# a `with` block would be the safer pattern.
out_path.touch()
out_file = out_path.open('w', encoding='utf-16')

langs = ["latin", "italian", "spanish", "french", "portuguese", "romanian"]
# Expected CSV layout: id, concept, then one column per language.
col_names = ["id", "concept"] + langs

header = "id,concept,latin,italian,spanish,french,portuguese,romanian\n"
out_file.write(header)
print(header)

for line in romance_ipa.split("\n")[1:]:
    s = ""
    if line != "":
        row = line.split(",")
        assert len(row) == len(col_names), "Expected {} fields, found {}"\
            .format(len(col_names), row)
        # create row data dict
        row_data = {
Exemplo n.º 20
0
### ASJP Sound class model:
###     https://en.wikipedia.org/wiki/Automated_Similarity_Judgment_Program
### LingPy: Python library for historical linguists:
###     http://lingpy.org/
###     Module reference: https://lingpy.readthedocs.io/en/latest/reference/lingpy.html
### Pyclts:
###     https://pypi.org/project/pyclts/
###     neat tool if we want to generate transcriptions (fast) and don't want to use lingpy
###     for that. However, lingpy is preferred.
##################################################################################################

import copy
from pathlib import Path
from lingpy import rc

asjp = rc('asjp')  # ASJP sound-class model from lingpy's settings
print(asjp)

# columns: indices into the source word-list rows (schema not visible in
# this chunk — TODO confirm against the input files).
DESCRIPTION = 4
LOAN = 6
WORD = 8
CONCEPT1 = 9
CONCEPT2 = 10

working_dir = Path("./word_lists")

# One dict of word lists per language code.
word_lists = {}

# NOTE(review): the loop body may continue beyond this chunk.
for lang in ['fr', 'sp', 'it', 'lat']:
    word_lists[lang] = {}
Exemplo n.º 21
0
def test_settings():
    """Default rcParams must expose the expected phybo suffix."""
    suffix = lingpy.rc('phybo_suffix')
    assert suffix == ' -'
Exemplo n.º 22
0
 def test_scorer2str(self):
     """
     Test conversion of scorers to strings.
     """
     # Compare the serialised Dolgopolsky scorer with the reference file.
     self.assertEqual(scorer2str(rc('dolgo').scorer),
                      read_text_file(test_data('dolgo.scorer')))