def test_basic_corpus_nd(specified_test_corpus):
    """Neighborhood density on the transcription and spelling tiers (type counts)."""
    transcription_cases = [
        (dict(query=specified_test_corpus.find('mata'), max_distance=1), 1.0),
        (dict(query=specified_test_corpus.find('nata'), max_distance=2), 3.0),
        (dict(query=specified_test_corpus.find('mata'),
              algorithm='phono_edit_distance', max_distance=3), 1.0),
    ]
    with CanonicalVariantContext(specified_test_corpus, 'transcription',
                                 'type') as context:
        for params, expected in transcription_cases:
            density = neighborhood_density(context, **params)
            assert abs(density[0] - expected) < 0.0001

    with CanonicalVariantContext(specified_test_corpus, 'spelling',
                                 'type') as context:
        density = neighborhood_density(
            context, query=specified_test_corpus.find('mata'), max_distance=1)
        assert abs(density[0] - 1.0) < 0.0001
def test_basic_phonoprob(unspecified_test_corpus):
    """Vitevitch phonotactic probability, token-weighted, unigram and bigram.

    Bug fix: both asserts were dedented outside their ``for`` loops, so only
    the last word produced by dict iteration was actually checked; the
    assertions now run once per expected word.
    """
    expected = {
        'atema': 0.1011173499,
        'enuta': 0.1011173499,
        'ta': 0.4353651056,
        'mata': 0.1881114313,
        'nata': 0.1782478499
    }
    with CanonicalVariantContext(unspecified_test_corpus, 'transcription',
                                 'token') as c:
        for k, v in expected.items():
            res = phonotactic_probability_vitevitch(
                c, unspecified_test_corpus.find(k), 'unigram')
            assert (abs(v - res) < 0.0001)

    expected = {
        'atema': 0.0798992942,
        'enuta': 0.0798992942,
        'ta': 0.1507780332,
        'mata': 0.0626335622,
        'nata': 0.0494821204
    }
    with CanonicalVariantContext(unspecified_test_corpus, 'transcription',
                                 'token') as c:
        for k, v in expected.items():
            res = phonotactic_probability_vitevitch(
                c, unspecified_test_corpus.find(k), 'bigram')
            assert (abs(v - res) < 0.0001)
Exemplo n.º 3
0
    def __init__(self, corpus, parent=None):
        """Build the segment-inventory widget for *corpus*.

        Creates one reusable type context and one token context, plus an
        exclusive inventory button group whose clicks trigger
        ``summarizeSegment``.
        """
        QWidget.__init__(self, parent)

        self.corpus = corpus
        # One context per count weighting, both over the transcription tier.
        self.type_context = CanonicalVariantContext(self.corpus,
                                                    'transcription', 'type')
        self.token_context = CanonicalVariantContext(self.corpus,
                                                     'transcription', 'token')

        main_layout = QHBoxLayout()
        main_layout.setAlignment(Qt.AlignTop)

        self.segments = InventoryBox('Segments', self.corpus.inventory)
        # Only one segment may be selected at a time.
        self.segments.setExclusive(True)
        for button in self.segments.btnGroup.buttons():
            button.clicked.connect(self.summarizeSegment)
        main_layout.addWidget(self.segments)

        self.detailFrame = QFrame()
        main_layout.addWidget(self.detailFrame)

        self.setLayout(main_layout)
Exemplo n.º 4
0
def test_relative_minpair(unspecified_test_corpus):
    """relative_minpair_fl for s/n/o, with and without a frequency threshold."""
    unthresholded = [('s', True, 0.013888), ('s', False, 0.11111),
                     ('n', True, 0.0123457), ('n', False, 0.11111),
                     ('o', True, 0), ('o', False, 0)]
    with CanonicalVariantContext(unspecified_test_corpus, 'transcription',
                                 'type') as ctx:
        for seg, relative, expected in unthresholded:
            result = relative_minpair_fl(ctx, segment=seg,
                                         relative_count=relative)
            assert abs(result - expected) < 0.0001

    thresholded = [('s', True, 0.01587), ('s', False, 0.11111),
                   ('n', True, 0), ('n', False, 0),
                   ('o', True, 0), ('o', False, 0)]
    with CanonicalVariantContext(unspecified_test_corpus,
                                 'transcription',
                                 'type',
                                 frequency_threshold=3) as ctx:
        for seg, relative, expected in thresholded:
            result = relative_minpair_fl(ctx, segment=seg,
                                         relative_count=relative)
            assert abs(result - expected) < 0.0001
Exemplo n.º 5
0
def test_minpair(unspecified_test_corpus):
    """minpair_fl on segment pairs, without and with a frequency threshold."""
    cases = [([('s', 'ʃ')], True, 0.125),
             ([('s', 'ʃ')], False, 1),
             ([('m', 'n')], True, 0.11111),
             ([('m', 'n')], False, 1),
             ([('e', 'o')], True, 0),
             ([('e', 'o')], False, 0),
             ([('s', 'ʃ'), ('m', 'n'), ('e', 'o')], True, 0.14286),
             ([('s', 'ʃ'), ('m', 'n'), ('e', 'o')], False, 2)]
    with CanonicalVariantContext(unspecified_test_corpus, 'transcription',
                                 'type') as ctx:
        for pairs, relative, expected in cases:
            kwargs = {'segment_pairs': pairs, 'relative_count': relative}
            print(kwargs)
            assert abs(minpair_fl(ctx, **kwargs)[0] - expected) < 0.0001

    # Same pairs, but with low-frequency words excluded.
    cases = [([('s', 'ʃ')], True, 0.14286),
             ([('s', 'ʃ')], False, 1),
             ([('m', 'n')], True, 0),
             ([('m', 'n')], False, 0),
             ([('e', 'o')], True, 0),
             ([('e', 'o')], False, 0),
             ([('s', 'ʃ'), ('m', 'n'), ('e', 'o')], True, 0.09091),
             ([('s', 'ʃ'), ('m', 'n'), ('e', 'o')], False, 1)]
    with CanonicalVariantContext(unspecified_test_corpus, 'transcription',
                                 'type', frequency_threshold=3) as ctx:
        for pairs, relative, expected in cases:
            kwargs = {'segment_pairs': pairs, 'relative_count': relative}
            print(kwargs)
            assert abs(minpair_fl(ctx, **kwargs)[0] - expected) < 0.0001
Exemplo n.º 6
0
def test_freqalt(specified_test_corpus):
    """calc_freq_of_alt for s/ʃ under khorsi, edit- and phono-edit-distance."""
    type_cases = [
        ('khorsi', {'min_rel': -15, 'phono_align': True}, (8, 3, 0.375)),
        ('khorsi', {'min_rel': -6, 'phono_align': True}, (8, 0, 0)),
        ('khorsi', {'min_rel': -6, 'phono_align': False}, (8, 2, 0.25)),
        ('khorsi', {'min_rel': -15, 'phono_align': False}, (8, 7, 0.875)),
        ('edit_distance', {'max_rel': 2, 'phono_align': True}, (8, 2, 0.25)),
        ('edit_distance', {'max_rel': 2, 'phono_align': False}, (8, 2, 0.25)),
        ('phono_edit_distance', {'max_rel': 6, 'phono_align': True},
         (8, 2, 0.25)),
        ('phono_edit_distance', {'max_rel': 6, 'phono_align': False},
         (8, 2, 0.25)),
    ]
    with CanonicalVariantContext(specified_test_corpus, 'transcription',
                                 'type') as ctx:
        for algorithm, options, expected in type_cases:
            assert calc_freq_of_alt(ctx, 's', 'ʃ', algorithm,
                                    **options) == expected

    token_cases = [
        ('khorsi', {'min_rel': -15, 'phono_align': True}, (8, 3, 0.375)),
        ('khorsi', {'min_rel': -6, 'phono_align': True}, (8, 2, 0.25)),
        ('khorsi', {'min_rel': -15, 'phono_align': False}, (8, 7, 0.875)),
        ('khorsi', {'min_rel': -6, 'phono_align': False}, (8, 3, 0.375)),
        ('edit_distance', {'max_rel': 4, 'phono_align': True}, (8, 3, 0.375)),
        ('edit_distance', {'max_rel': 4, 'phono_align': False}, (8, 6, 0.75)),
        ('phono_edit_distance', {'max_rel': 20, 'phono_align': True},
         (8, 3, 0.375)),
        ('phono_edit_distance', {'max_rel': 20, 'phono_align': False},
         (8, 6, 0.75)),
    ]
    with CanonicalVariantContext(specified_test_corpus, 'transcription',
                                 'token') as ctx:
        for algorithm, options, expected in token_cases:
            assert calc_freq_of_alt(ctx, 's', 'ʃ', algorithm,
                                    **options) == expected
Exemplo n.º 7
0
def test_prod_type(specified_test_corpus):
    """Predictability of distribution for s/ʃ in feature-defined environments.

    Bug fix: the final comparison lacked ``abs``, so any computed value far
    *above* the expectation (negative difference) passed silently; the check
    is now symmetric around the expected value.
    """
    expected = {
        "-voc": 0.0,
        "+voc,+high": 0.863120568566631,
        "+voc,-high": 0.9852281360342515,
        "#": 0.0
    }
    env_list = []
    expected_envs = {}
    for k, v in expected.items():
        # '#' is a literal word boundary, not a feature specification.
        if k != '#':
            segs = specified_test_corpus.features_to_segments(k)
        else:
            segs = k
        env = EnvironmentFilter(['s', 'ʃ'], None, [segs])
        env_list.append(env)
        expected_envs[env] = v
    expected_envs["AVG"] = 0.9241743523004413
    with CanonicalVariantContext(specified_test_corpus, 'transcription',
                                 'type') as c:
        result = calc_prod(c, env_list, all_info=False)
    for k, v in result.items():
        assert abs(expected_envs[k] - v) < 0.001
Exemplo n.º 8
0
def test_mass_fl(unspecified_test_corpus):
    """All-pairwise minimal-pair functional load, token-weighted.

    Each entry in ``calls`` pairs the kwargs for ``all_pairwise_fls`` with
    the full expected ranking of segment pairs and their FL values; only the
    numeric values (index 1 of each tuple) are compared, within 1e-4.
    """
    #This needs to be updated so that it deterministically passes
    calls = [({'algorithm':'minpair',
                    'relative_count':True},

                        ([(('s', 'ʃ'), 0.125), (('m', 'n'), 0.1111111111111111), (('i', 't'), 0.0), (('t', 'u'), 0.0),
                         (('m', 't'), 0.0), (('i', 'u'), 0.0), (('e', 'o'), 0.0), (('n', 'o'), 0.0), (('i', 'ʃ'), 0.0),
                         (('u', 'ɑ'), 0.0), (('m', 'ʃ'), 0.0), (('m', 'ɑ'), 0.0), (('t', 'ʃ'), 0.0), (('e', 'n'), 0.0),
                         (('o', 't'), 0.0), (('e', 'ɑ'), 0.0), (('n', 'u'), 0.0), (('n', 't'), 0.0), (('o', 'ʃ'), 0.0),
                         (('e', 'u'), 0.0), (('s', 't'), 0.0), (('ɑ', 'ʃ'), 0.0), (('n', 's'), 0.0), (('e', 's'), 0.0),
                         (('i', 's'), 0.0), (('m', 'u'), 0.0), (('e', 'i'), 0.0), (('i', 'n'), 0.0), (('i', 'o'), 0.0),
                         (('i', 'm'), 0.0), (('n', 'ɑ'), 0.0), (('t', 'ɑ'), 0.0), (('s', 'ɑ'), 0.0), (('s', 'u'), 0.0),
                         (('i', 'ɑ'), 0.0), (('o', 's'), 0.0), (('e', 'ʃ'), 0.0), (('u', 'ʃ'), 0.0), (('m', 'o'), 0.0),
                         (('e', 'm'), 0.0), (('o', 'u'), 0.0), (('n', 'ʃ'), 0.0), (('e', 't'), 0.0), (('o', 'ɑ'), 0.0),
                         (('m', 's'), 0.0)])),
            ({'algorithm':'minpair',
                    'relative_count':False},

                        ([(('s', 'ʃ'), 1.0), (('m', 'n'), 1.0), (('i', 't'), 0.0), (('t', 'u'), 0.0),
                         (('m', 't'), 0.0), (('i', 'u'), 0.0), (('e', 'o'), 0.0), (('n', 'o'), 0.0), (('i', 'ʃ'), 0.0),
                         (('u', 'ɑ'), 0.0), (('m', 'ʃ'), 0.0), (('m', 'ɑ'), 0.0), (('t', 'ʃ'), 0.0), (('e', 'n'), 0.0),
                         (('o', 't'), 0.0), (('e', 'ɑ'), 0.0), (('n', 'u'), 0.0), (('n', 't'), 0.0), (('o', 'ʃ'), 0.0),
                         (('e', 'u'), 0.0), (('s', 't'), 0.0), (('ɑ', 'ʃ'), 0.0), (('n', 's'), 0.0), (('e', 's'), 0.0),
                         (('i', 's'), 0.0), (('m', 'u'), 0.0), (('e', 'i'), 0.0), (('i', 'n'), 0.0), (('i', 'o'), 0.0),
                         (('i', 'm'), 0.0), (('n', 'ɑ'), 0.0), (('t', 'ɑ'), 0.0), (('s', 'ɑ'), 0.0), (('s', 'u'), 0.0),
                         (('i', 'ɑ'), 0.0), (('o', 's'), 0.0), (('e', 'ʃ'), 0.0), (('u', 'ʃ'), 0.0), (('m', 'o'), 0.0),
                         (('e', 'm'), 0.0), (('o', 'u'), 0.0), (('n', 'ʃ'), 0.0), (('e', 't'), 0.0), (('o', 'ɑ'), 0.0),
                         (('m', 's'), 0.0)]))]


    with CanonicalVariantContext(unspecified_test_corpus, 'transcription', 'token') as c:
        for kwargs, v in calls:
            # Compare positionally against the expected ranking; only the
            # FL value (index 1) of each pair is asserted.
            for result,prediction in zip(all_pairwise_fls(c, **kwargs), v):
                assert(abs(result[1]-prediction[1]) < 0.0001)
Exemplo n.º 9
0
def test_identical(specified_test_corpus):
    """Test 1: KL divergence of a segment with itself is zero."""
    with CanonicalVariantContext(specified_test_corpus, 'transcription',
                                 'type') as ctx:
        entropy_a, entropy_b, divergence, ur, is_spurious = KL(
            ctx, 's', 's', 'b')
    assert divergence == 0.0
    assert entropy_a == entropy_b
Exemplo n.º 10
0
def test_deltah(unspecified_test_corpus):
    """deltah_fl across type/token weighting, normalization, and thresholds."""
    type_cases = [([('s', 'ʃ')], 0.02547695),
                  ([('m', 'n')], 0.02547695),
                  ([('e', 'o')], 0),
                  ([('s', 'ʃ'), ('m', 'n'), ('e', 'o')], 0.05284)]
    with CanonicalVariantContext(unspecified_test_corpus, 'transcription',
                                 'type') as ctx:
        for pairs, expected in type_cases:
            assert abs(deltah_fl(ctx, segment_pairs=pairs) - expected) < 0.0001

    unnormalized_cases = [([('s', 'ʃ')], 0.09953567),
                          ([('m', 'n')], 0.09953567),
                          ([('e', 'o')], 0),
                          ([('s', 'ʃ'), ('m', 'n'), ('e', 'o')], 0.206450877)]
    with CanonicalVariantContext(unspecified_test_corpus, 'transcription',
                                 'type') as ctx:
        for pairs, expected in unnormalized_cases:
            result = deltah_fl(ctx, segment_pairs=pairs,
                               prevent_normalization=True)
            assert abs(result - expected) < 0.0001

    threshold_cases = [([('s', 'ʃ')], 0.035015954),
                       ([('m', 'n')], 0),
                       ([('e', 'o')], 0),
                       ([('s', 'ʃ'), ('m', 'n'), ('e', 'o')], 0.035015954)]
    with CanonicalVariantContext(unspecified_test_corpus, 'transcription',
                                 'type', frequency_threshold=3) as ctx:
        for pairs, expected in threshold_cases:
            assert abs(deltah_fl(ctx, segment_pairs=pairs) - expected) < 0.0001

    token_cases = [([('s', 'ʃ')], 0.08305),
                   ([('m', 'n')], 0.002314),
                   ([('e', 'o')], 0.0),
                   ([('s', 'ʃ'), ('m', 'n'), ('e', 'o')], 0.0853641)]
    with CanonicalVariantContext(unspecified_test_corpus, 'transcription',
                                 'token') as ctx:
        for pairs, expected in token_cases:
            assert abs(deltah_fl(ctx, segment_pairs=pairs) - expected) < 0.0001
def test_basic_corpus_mutation_minpairs(specified_test_corpus):
    """find_mutation_minpairs locates the two minimal pairs of [s ɑ t ɑ]."""
    target = Word(**{'transcription': ['s', 'ɑ', 't', 'ɑ']})
    with CanonicalVariantContext(specified_test_corpus, 'transcription',
                                 'type') as ctx:
        result = find_mutation_minpairs(ctx, query=target)
        assert result[0] == 2
        assert sorted(result[1]) == sorted(['n.ɑ.t.ɑ', 'm.ɑ.t.ɑ'])
Exemplo n.º 12
0
def test_allophones(specified_test_corpus):
    """Test 2: s and ʃ are supposed to be allophones."""
    with CanonicalVariantContext(specified_test_corpus, 'transcription',
                                 'type') as ctx:
        seg1_entropy, seg2_entropy, distance, ur, is_spurious = KL(
            ctx, 's', 'ʃ', 'b')
    checks = [(distance, 0.15113518339295337),
              (seg1_entropy, 0.035000140096702444),
              (seg2_entropy, 0.06074393445793598)]
    for actual, expected in checks:
        assert abs(actual - expected) < 0.001
Exemplo n.º 13
0
def test_pseudo_allophones(specified_test_corpus):
    """Test 3: s and ɑ look like allophones only by coincidence."""
    with CanonicalVariantContext(specified_test_corpus, 'transcription',
                                 'type') as ctx:
        seg1_entropy, seg2_entropy, distance, ur, is_spurious = KL(
            ctx, 's', 'ɑ', 'b')
    checks = [(distance, 0.23231302100802534),
              (seg1_entropy, 0.03500014009670246),
              (seg2_entropy, 0.07314589775440267)]
    for actual, expected in checks:
        assert abs(actual - expected) < 0.001
Exemplo n.º 14
0
def test_default(specified_test_corpus):
    """Test 4: s and m have no assumed relationship."""
    with CanonicalVariantContext(specified_test_corpus, 'transcription',
                                 'type') as ctx:
        seg1_entropy, seg2_entropy, distance, ur, is_spurious = KL(
            ctx, 's', 'm', 'b')
    checks = [(distance, 0.14186314747884132),
              (seg1_entropy, 0.035000140096702444),
              (seg2_entropy, 0.06074393445793598)]
    for actual, expected in checks:
        assert abs(actual - expected) < 0.001
Exemplo n.º 15
0
def test_spelling(unspecified_test_corpus):
    """Edit distance over spellings, measured from two reference words."""
    corpus = unspecified_test_corpus
    # Distances from each base word to every word in the corpus.
    reference = {
        'atema': [('atema', 0), ('enuta', 4), ('mashomisi', 7), ('mata', 3),
                  ('nata', 3), ('sasi', 5), ('shashi', 6), ('shisata', 6),
                  ('shushoma', 6), ('ta', 3), ('tatomi', 3),
                  ('tishenishu', 9), ('toni', 4), ('tusa', 3), ('ʃi', 5)],
        'sasi': [('atema', 5), ('enuta', 5), ('mashomisi', 6), ('mata', 3),
                 ('nata', 3), ('sasi', 0), ('shashi', 2), ('shisata', 5),
                 ('shushoma', 6), ('ta', 3), ('tatomi', 4),
                 ('tishenishu', 8), ('toni', 3), ('tusa', 3), ('ʃi', 3)],
    }
    for base, pairs in reference.items():
        expected = [(corpus.find(base), corpus.find(other), dist)
                    for other, dist in pairs]
        expected.sort(key=lambda t: t[1])
        with CanonicalVariantContext(corpus, 'spelling', 'type') as ctx:
            calced = string_similarity(ctx, corpus.find(base),
                                       'edit_distance')
        calced.sort(key=lambda t: t[1])
        for i, v in enumerate(expected):
            assert calced[i] == v
Exemplo n.º 16
0
def test_relative_deltah(unspecified_test_corpus):
    """relative_deltah_fl for s/n/o: type, thresholded type, and token counts."""
    with CanonicalVariantContext(unspecified_test_corpus, 'transcription',
                                 'type') as ctx:
        for seg, expected in [('s', 0.00283), ('n', 0.00283), ('o', 0)]:
            result = relative_deltah_fl(ctx, segment=seg)
            assert abs(result[0] - expected) < 0.0001

    with CanonicalVariantContext(unspecified_test_corpus, 'transcription',
                                 'type', frequency_threshold=3) as ctx:
        for seg, expected in [('s', 0.00389), ('n', 0), ('o', 0)]:
            result = relative_deltah_fl(ctx, segment=seg)
            assert abs(result[0] - expected) < 0.0001

    with CanonicalVariantContext(unspecified_test_corpus, 'transcription',
                                 'token') as ctx:
        for seg, expected in [('s', 0.009227777), ('n', 0.0002571111),
                              ('o', 0)]:
            result = relative_deltah_fl(ctx, segment=seg)
            assert abs(result[0] - expected) < 0.0001
Exemplo n.º 17
0
def test_freq_base_transcription_type(unspecified_test_corpus):
    """Type-weighted segment frequency counts on the transcription tier."""
    expected = {
        '#': 30,
        'ɑ': 16,
        'e': 3,
        'i': 10,
        'm': 6,
        'n': 4,
        'o': 4,
        's': 5,
        'ʃ': 9,
        't': 11,
        'u': 4,
        'total': 102
    }
    with CanonicalVariantContext(unspecified_test_corpus, 'transcription',
                                 'type') as ctx:
        assert ctx.get_frequency_base() == expected
Exemplo n.º 18
0
def test_freq_base_transcription_token(unspecified_test_corpus):
    """Token-weighted segment frequency counts on the transcription tier."""
    expected = {
        '#': 1158,
        'ɑ': 466,
        'e': 118,
        'i': 429,
        'm': 156,
        'n': 142,
        'o': 171,
        's': 318,
        'ʃ': 540,
        't': 271,
        'u': 265,
        'total': 4034
    }
    with CanonicalVariantContext(unspecified_test_corpus, 'transcription',
                                 'token') as ctx:
        assert ctx.get_frequency_base() == expected
Exemplo n.º 19
0
def test_freq_base_spelling_type(unspecified_test_corpus):
    """Type-weighted letter frequency counts on the spelling tier."""
    expected = {
        '#': 30,
        'a': 16,
        'e': 3,
        'h': 8,
        'i': 10,
        'm': 6,
        'n': 4,
        'o': 4,
        's': 13,
        'ʃ': 1,
        't': 11,
        'u': 4,
        'total': 110
    }
    with CanonicalVariantContext(unspecified_test_corpus, 'spelling',
                                 'type') as ctx:
        assert ctx.get_frequency_base() == expected
Exemplo n.º 20
0
def test_pointwise_mi(unspecified_test_corpus):
    """Pointwise mutual information for bigram queries (type-weighted).

    Fix: the loop variable previously reused the name ``c``, shadowing the
    context-manager variable and making the final iterations fragile to
    edit; the loop now uses a distinct name with identical behavior.
    """
    with CanonicalVariantContext(unspecified_test_corpus, 'transcription',
                                 'type') as c:
        calls = [
            ({'corpus_context': c, 'query': ('e', 'm')}, 2.7319821866519507),
            ({'corpus_context': c, 'query': ('t', 'n'), 'in_word': True},
             0.5849625007211564),
            ({'corpus_context': c, 'query': ('e', 'm'), 'halve_edges': True},
             2.7319821866519507),
        ]
        for kwargs, expected in calls:
            result = pointwise_mi(**kwargs)
            assert abs(result - expected) < 0.0001
Exemplo n.º 21
0
def test_freq_base_spelling_token(unspecified_test_corpus):
    """Token-weighted letter frequency counts on the spelling tier."""
    expected = {
        '#': 1158,
        'a': 466,
        'e': 118,
        'h': 538,
        'i': 429,
        'm': 156,
        'n': 142,
        'o': 171,
        's': 856,
        'ʃ': 2,
        't': 271,
        'u': 265,
        'total': 4572
    }
    with CanonicalVariantContext(unspecified_test_corpus, 'spelling',
                                 'token') as ctx:
        assert ctx.get_frequency_base() == expected
Exemplo n.º 22
0
def test_relative_deltah(unspecified_test_corpus):
    """relative_deltah_fl for s/n/o: type and token, with and without threshold."""
    with CanonicalVariantContext(unspecified_test_corpus, 'transcription',
                                 'type') as ctx:
        for seg, expected in [('s', 0.014814), ('n', 0.014814), ('o', 0)]:
            result = relative_deltah_fl(ctx, segment=seg)
            assert abs(result - expected) < 0.0001

    with CanonicalVariantContext(unspecified_test_corpus,
                                 'transcription',
                                 'type',
                                 frequency_threshold=3) as ctx:
        for seg, expected in [('s', 0.0185185), ('n', 0), ('o', 0)]:
            result = relative_deltah_fl(ctx, segment=seg)
            assert abs(result - expected) < 0.0001

    with CanonicalVariantContext(unspecified_test_corpus, 'transcription',
                                 'token') as ctx:
        for seg, expected in [('s', 0.0275487), ('n', 0.000767), ('o', 0)]:
            result = relative_deltah_fl(ctx, segment=seg)
            assert abs(result - expected) < 0.0001

    with CanonicalVariantContext(unspecified_test_corpus,
                                 'transcription',
                                 'token',
                                 frequency_threshold=3) as ctx:
        for seg, expected in [('s', 0.027837), ('n', 0), ('o', 0)]:
            result = relative_deltah_fl(ctx, segment=seg)
            assert abs(result - expected) < 0.0001
Exemplo n.º 23
0
def test_error(specified_test_corpus):
    """Test 5: querying a segment absent from the corpus raises ValueError."""
    with pytest.raises(ValueError):
        with CanonicalVariantContext(specified_test_corpus, 'transcription',
                                     'type') as ctx:
            KL(ctx, 's', '!', '')
Exemplo n.º 24
0
def test_mass_relate_spelling_type(unspecified_test_corpus):
    """Khorsi similarity over spellings, type-weighted, from two base words."""
    corpus = unspecified_test_corpus
    # Khorsi scores from each base word to every word in the corpus.
    reference = {
        'atema': [('atema', 11.0766887), ('enuta', -14.09489383),
                  ('mashomisi', -18.35890071), ('mata', -6.270847817),
                  ('nata', -8.494720336), ('sasi', -13.57140897),
                  ('shashi', -18.17657916), ('shisata', -13.51516925),
                  ('shushoma', -16.90806783), ('ta', -8.717863887),
                  ('tatomi', -13.53912249), ('tishenishu', -28.78151269),
                  ('toni', -15.17933206), ('tusa', -13.53067344),
                  ('ʃi', -17.53815687)],
        'sasi': [('atema', -13.57140897), ('enuta', -15.36316844),
                 ('mashomisi', -16.92481569), ('mata', -10.28799462),
                 ('nata', -10.69345973), ('sasi', 7.323034009),
                 ('shashi', -8.971692634), ('shisata', -10.26267682),
                 ('shushoma', -20.30229654), ('ta', -6.088289546),
                 ('tatomi', -15.73786189), ('tishenishu', -25.52902026),
                 ('toni', -11.13974683), ('tusa', -5.449867265),
                 ('ʃi', -7.54617756)],
    }
    for base, pairs in reference.items():
        expected = [(corpus.find(base), corpus.find(other), score)
                    for other, score in pairs]
        # Highest similarity first, matching string_similarity's ordering
        # (all scores within a list are distinct, so this is equivalent to
        # the ascending sort + reverse of the original).
        expected.sort(key=lambda t: t[2], reverse=True)
        with CanonicalVariantContext(corpus, 'spelling', 'type') as ctx:
            calced = string_similarity(ctx, corpus.find(base), 'khorsi')
        for i, v in enumerate(expected):
            assert abs(calced[i][2] - v[2]) < 0.0001
Exemplo n.º 25
0
def test_mass_relate_spelling_token(unspecified_test_corpus):
    """Khorsi string similarity over spelling, token frequency.

    ``string_similarity`` returns (query, target, score) triples ordered by
    score, highest first.  Only the scores are asserted, so the reference
    data is kept as target -> score mappings and ranked the same way.
    """

    def check(query_key, reference):
        # Compute similarities for the query word against the whole corpus.
        query = unspecified_test_corpus.find(query_key)
        with CanonicalVariantContext(unspecified_test_corpus, 'spelling',
                                     'token') as c:
            calced = string_similarity(c, query, 'khorsi')
        # Rank reference scores high-to-low to line up with calced's order.
        for i, want in enumerate(sorted(reference.values(), reverse=True)):
            assert abs(calced[i][2] - want) < 0.0001

    check(
        'atema', {
            'atema': 12.9671688,
            'enuta': -16.49795651,
            'mashomisi': -17.65533907,
            'mata': -7.337667817,
            'nata': -9.088485208,
            'sasi': -13.8251823,
            'shashi': -17.52074498,
            'shisata': -12.59737574,
            'shushoma': -14.82488063,
            'ta': -9.8915809,
            'tatomi': -14.6046824,
            'tishenishu': -27.61147254,
            'toni': -16.14809881,
            'tusa': -13.8308605,
            'ʃi': -22.4838445
        })
    check(
        'sasi', {
            'atema': -13.8251823,
            'enuta': -14.48366705,
            'mashomisi': -16.62778969,
            'mata': -10.46022702,
            'nata': -10.55425597,
            'sasi': 6.832376308,
            'shashi': -7.235843913,
            'shisata': -9.913037922,
            'shushoma': -19.77169406,
            'ta': -5.382988852,
            'tatomi': -16.07045316,
            'tishenishu': -24.92713472,
            'toni': -11.39132061,
            'tusa': -5.172159875,
            'ʃi': -10.12650306
        })
# Exemplo n.º 26
# 0
def test_mass_relate_transcription_token(unspecified_test_corpus):
    """Khorsi string similarity over transcription, token frequency.

    ``string_similarity`` returns (query, target, score) triples ordered by
    score, highest first.  Only the scores are asserted, so the reference
    data is kept as target -> score mappings and ranked the same way.
    """

    def check(query_key, reference):
        # Compute similarities for the query word against the whole corpus.
        query = unspecified_test_corpus.find(query_key)
        with CanonicalVariantContext(unspecified_test_corpus, 'transcription',
                                     'token') as c:
            calced = string_similarity(c, query, 'khorsi')
        # Rank reference scores high-to-low to line up with calced's order.
        for i, want in enumerate(sorted(reference.values(), reverse=True)):
            assert abs(calced[i][2] - want) < 0.0001

    check(
        'atema', {
            'atema': 12.10974787,
            'enuta': -15.29756722,
            'mashomisi': -16.05808867,
            'mata': -8.574032654,
            'nata': -6.823215263,
            'sasi': -14.77671518,
            'shashi': -13.71767966,
            'shisata': -11.34309371,
            'shushoma': -11.19329949,
            'ta': -9.205644162,
            'tatomi': -13.74726148,
            'tishenishu': -23.12247048,
            'toni': -15.1191937,
            'tusa': -13.79217439,
            'ʃi': -15.68503325
        })
    check(
        'sasi', {
            'atema': -14.77671518,
            'enuta': -15.43519993,
            'mashomisi': -13.96361833,
            'mata': -11.58324408,
            'nata': -11.67727303,
            'sasi': 8.126877557,
            'shashi': -9.734809346,
            'shisata': -7.840021077,
            'shushoma': -15.95332831,
            'ta': -6.848974285,
            'tatomi': -16.85050186,
            'tishenishu': -20.51761446,
            'toni': -12.51433768,
            'tusa': -4.829191506,
            'ʃi': -5.994066536
        })
def test_basic_corpus_probs(unspecified_test_corpus):
    """Positional phone counts and probabilities from ``get_phone_probs``.

    Covers four configurations: raw unigram counts, type-frequency unigram
    probabilities, and token-frequency log-weighted unigram and bigram
    probabilities.
    """
    with CanonicalVariantContext(unspecified_test_corpus, 'transcription',
                                 'type') as ctx:
        counts = ctx.get_phone_probs(1, log_count=False, probability=False)

    expected = {
        (('i', ), 1): 3,
        (('s', ), 2): 3,
        (('ʃ', ), 6): 1,
        (('t', ), 3): 1,
        (('m', ), 3): 1,
        (('s', ), 0): 1,
        (('o', ), 1): 1,
        (('u', ), 1): 2,
        (('u', ), 2): 1,
        (('n', ), 4): 1,
        (('o', ), 3): 3,
        (('ʃ', ), 2): 4,
        (('m', ), 4): 3,
        (('n', ), 0): 1,
        (('t', ), 0): 5,
        (('ʃ', ), 0): 4,
        (('e', ), 3): 1,
        (('ɑ', ), 5): 2,
        (('m', ), 0): 2,
        (('t', ), 1): 1,
        (('u', ), 7): 1,
        (('t', ), 4): 1,
        (('ɑ', ), 1): 7,
        (('i', ), 7): 1,
        (('t', ), 2): 3,
        (('s', ), 6): 1,
        (('ɑ', ), 3): 4,
        (('i', ), 5): 3,
        (('e', ), 0): 1,
        (('i', ), 3): 3,
        (('n', ), 1): 1,
        (('n', ), 2): 1,
        (('ɑ', ), 0): 1,
        (('ɑ', ), 4): 2,
        (('e', ), 2): 1,
        'total': {
            0: 15,
            1: 15,
            2: 13,
            3: 13,
            4: 7,
            5: 5,
            6: 2,
            7: 2
        }
    }
    for key, want in expected.items():
        if key == 'total':
            # Per-position totals live in a nested mapping.
            for pos, total in want.items():
                assert counts[key][pos] == total
        else:
            assert counts[key] == want

    # Same counts normalised by the per-position totals.
    with CanonicalVariantContext(unspecified_test_corpus, 'transcription',
                                 'type') as ctx:
        probs = ctx.get_phone_probs(1, log_count=False, probability=True)
    for key, want in expected.items():
        if key != 'total':
            assert probs[key] == want / expected['total'][key[1]]

    # Token frequency with log-weighted counts: unigrams.
    with CanonicalVariantContext(unspecified_test_corpus, 'transcription',
                                 'token') as ctx:
        probs = ctx.get_phone_probs(1, log_count=True, probability=True)
    expected = {
        (('ɑ', ), 0): 0.0587828456,
        (('t', ), 1): 0.0587828456,  # atema
        (('e', ), 2): 0.0668038019,
        (('m', ), 3): 0.0668038019,
        (('ɑ', ), 4): 0.2544134544,
        (('e', ), 0): 0.0587828456,
        (('n', ), 1): 0.0587828456,  # enuta
        (('u', ), 2): 0.0668038019,
        (('t', ), 3): 0.0668038019,
        (('t', ), 0): 0.4333449434,
        (('ɑ', ), 1): 0.4373852679,  # ta
        (('m', ), 0): 0.0564463785,
        (('t', ), 2): 0.0928330493,  # mata
        (('ɑ', ), 3): 0.1657810293,
        (('n', ), 0): 0.0169920531  # nata
    }
    for key, want in expected.items():
        assert abs(probs[key] - want) < 0.0001

    # Token frequency with log-weighted counts: bigrams.
    with CanonicalVariantContext(unspecified_test_corpus, 'transcription',
                                 'token') as ctx:
        probs = ctx.get_phone_probs(2, log_count=True, probability=True)
    expected = {
        (('ɑ', 't'), 0): 0.0587828456,
        (('t', 'e'), 1): 0.0668038019,  # atema
        (('e', 'm'), 2): 0.0668038019,
        (('m', 'ɑ'), 3): 0.1272067272,
        (('e', 'n'), 0): 0.0587828456,
        (('n', 'u'), 1): 0.0668038019,  # enuta
        (('u', 't'), 2): 0.0668038019,
        (('t', 'ɑ'), 3): 0.1272067272,
        (('t', 'ɑ'), 0): 0.1507780332,  # ta
        (('m', 'ɑ'), 0): 0.0564463785,
        (('ɑ', 't'), 1): 0.0928330493,  # mata
        (('t', 'ɑ'), 2): 0.0386212588,
        (('n', 'ɑ'), 0): 0.0169920531  # nata
    }
    for key, want in expected.items():
        assert abs(probs[key] - want) < 0.0001
# Exemplo n.º 28
# 0
def test_mass_relate_transcription_type(unspecified_test_corpus):
    """Khorsi string similarity over transcription, type frequency.

    ``string_similarity`` returns (query, target, score) triples ordered by
    score, highest first.  Only the scores are asserted, so the reference
    data is kept as target -> score mappings and ranked the same way.
    """

    def check(query_key, reference):
        # Compute similarities for the query word against the whole corpus.
        query = unspecified_test_corpus.find(query_key)
        with CanonicalVariantContext(unspecified_test_corpus, 'transcription',
                                     'type') as c:
            calced = string_similarity(c, query, 'khorsi')
        # Rank reference scores high-to-low to line up with calced's order.
        for i, want in enumerate(sorted(reference.values(), reverse=True)):
            assert abs(calced[i][2] - want) < 0.0001

    check(
        'atema', {
            'atema': 10.54988612,
            'enuta': -13.35737022,
            'mashomisi': -16.64202823,
            'mata': -5.95476627,
            'nata': -8.178638789,
            'sasi': -14.85026877,
            'shashi': -13.67469544,
            'shisata': -12.0090178,
            'shushoma': -12.51154463,
            'ta': -8.296421824,
            'tatomi': -13.01231991,
            'tishenishu': -23.85818691,
            'toni': -14.54716897,
            'tusa': -13.85402179,
            'ʃi': -14.60340869
        })
    check(
        'sasi', {
            'atema': -14.85026877,
            'enuta': -16.64202823,
            'mashomisi': -12.94778139,
            'mata': -11.67221494,
            'nata': -12.07768004,
            'sasi': 8.812614836,
            'shashi': -11.93742415,
            'shisata': -7.90637444,
            'shushoma': -18.22899329,
            'ta': -7.683230889,
            'tatomi': -16.91136117,
            'tishenishu': -21.83498509,
            'toni': -12.52396715,
            'tusa': -5.239146233,
            'ʃi': -6.943894326
        })
# Exemplo n.º 29
# 0
def test_deltah(unspecified_test_corpus):
    """Functional load (entropy difference) of segment mergers.

    Runs ``deltah_fl`` under four context configurations — type and token
    frequency, each with and without a frequency threshold of 3 — and
    checks the result for each set of merged segment pairs.
    """
    # (frequency measure, frequency_threshold or None, [(pairs, expected)])
    configurations = [
        ('type', None, [
            ([('s', 'ʃ')], 0.13333),
            ([('m', 'n')], 0.13333),
            ([('e', 'o')], 0),
            ([('s', 'ʃ'), ('m', 'n'), ('e', 'o')], 0.26667),
        ]),
        ('type', 3, [
            ([('s', 'ʃ')], 0.16667),
            ([('m', 'n')], 0),
            ([('e', 'o')], 0),
            ([('s', 'ʃ'), ('m', 'n'), ('e', 'o')], 0.16667),
        ]),
        ('token', None, [
            ([('s', 'ʃ')], 0.24794),
            ([('m', 'n')], 0.00691),
            ([('e', 'o')], 0),
            ([('s', 'ʃ'), ('m', 'n'), ('e', 'o')], 0.25485),
        ]),
        ('token', 3, [
            ([('s', 'ʃ')], 0.25053),
            ([('m', 'n')], 0),
            ([('e', 'o')], 0),
            ([('s', 'ʃ'), ('m', 'n'), ('e', 'o')], 0.25053),
        ]),
    ]

    for freq_measure, threshold, checks in configurations:
        # Only pass frequency_threshold when one is set, so the unthresholded
        # runs use the context's default behaviour.
        extra = {} if threshold is None else {'frequency_threshold': threshold}
        with CanonicalVariantContext(unspecified_test_corpus, 'transcription',
                                     freq_measure, **extra) as ctx:
            for pairs, want in checks:
                assert abs(deltah_fl(ctx, segment_pairs=pairs) - want) < 0.0001