Exemplo n.º 1
0
    def test_system_german_2(self):
        """System test: ATP on the German 'quant' splits with 60 training items.

        For each of 25 seeds an ``ATP`` (``apply_phonology=False``) is trained
        on ``train60_<seed>.txt`` and must reach a training accuracy of
        exactly 1.0.  The matching ``test_<seed>.txt`` file is then scored
        twice through ``self._evaluate``: once with ``no_feats=False`` and
        once with ``no_feats=True``.  Both mean test accuracies must lie
        strictly between 0.54 and 0.57.
        """
        size = 60
        lower, upper = 0.54, 0.57
        test_accs = []
        no_gen_test_accs = []

        for seed in range(25):
            fname = f'../data/german/quant/train{size}_{seed}.txt'
            # The test file shares the seed; only the 'train<size>' tag differs.
            test_path = fname.replace(f'train{size}', 'test')
            pairs, feature_space = load_pairs(fname, 'german')
            tp = ATP(apply_phonology=False, feature_space=feature_space)
            tp.train(pairs)

            # compute accuracies
            train_acc = self._evaluate('german', fname, tp, no_feats=False)
            assert train_acc == 1.0, (
                f'seed {seed}: train accuracy {train_acc} != 1.0')
            test_accs.append(
                self._evaluate('german', test_path, tp, no_feats=False))
            no_gen_test_accs.append(
                self._evaluate('german', test_path, tp, no_feats=True))

        # Compute each mean once and report it in the failure message, so the
        # diagnostic can never disagree with the condition actually asserted.
        mean_test_acc = np.mean(test_accs)
        mean_no_gen_test_acc = np.mean(no_gen_test_accs)
        assert lower < mean_test_acc < upper, (
            f'mean test accuracy {mean_test_acc} not in ({lower}, {upper})')
        assert lower < mean_no_gen_test_acc < upper, (
            f'mean no_feats test accuracy {mean_no_gen_test_acc} '
            f'not in ({lower}, {upper})')
Exemplo n.º 2
0
    def test_system_german_4(self):
        """Train ATP on the German CHILDES pairs and check the learned leaves.

        Every rule string listed below must appear among the names of the
        leaves returned by ``get_leaves()`` after training.
        """
        from utils import load_german_CHILDES

        childes_pairs, childes_features = load_german_CHILDES()
        model = ATP(feature_space=childes_features)
        model.train(childes_pairs)
        observed = {node.name for node in model.get_leaves()}

        # Expected leaf names, in the same order the checks were originally
        # written (the first missing one fails the test).
        expected_rules = (
            '[e|del|hel]# => inflected = lemma + n',
            '¬[e|del|hel]#,[n|er|gel|kel|sel]# => inflected = lemma + ',
            '¬[e|del|hel]#,¬[n|er|gel|kel|sel]#,M,el# => inflected = lemma + ',
            '¬[e|del|hel]#,¬[n|er|gel|kel|sel]#,M,¬el# => inflected = lemma + e',
            '¬[e|del|hel]#,¬[n|er|gel|kel|sel]#,¬M,[g|r|t]# => inflected = lemma + en',
            '¬[e|del|hel]#,¬[n|er|gel|kel|sel]#,¬M,¬[g|r|t]#,[d|h|i]# => inflected = lemma + er',
            '¬[e|del|hel]#,¬[n|er|gel|kel|sel]#,¬M,¬[g|r|t]#,¬[d|h|i]#,el# => inflected = lemma + n',
            '¬[e|del|hel]#,¬[n|er|gel|kel|sel]#,¬M,¬[g|r|t]#,¬[d|h|i]#,¬el#,[l|s]# => inflected = lemma + e',
            '¬[e|del|hel]#,¬[n|er|gel|kel|sel]#,¬M,¬[g|r|t]#,¬[d|h|i]#,¬el#,¬[l|s]# => inflected = lemma + s',
        )
        for rule in expected_rules:
            assert rule in observed
Exemplo n.º 3
0
    def test_maximize_productivity_1(self):
        """Check which feature ``maximize_productivity`` chooses to split on.

        Six (lemma, inflected, features) items are given, two tagged ``PRS``
        and four tagged ``PST``.  The chosen split feature must be ``PRS`` or
        ``PST``, there must be exactly two splits, and the two splits must
        separate the ``PRS`` items from the ``PST`` items (in either
        orientation, depending on which feature was chosen).
        """
        tp = ATP(feature_space={'PRS', 'PST', '1', '2', '3'})

        walked = ('walk', 'walked', 'PST,3')
        sprinted = ('sprint', 'sprinted', 'PST,1')
        ran = ('run', 'ran', 'PST,1')
        walk = ('walk', 'walk', 'PRS,2')
        thinks = ('think', 'thinks', 'PRS,3')
        quacked = ('quack', 'quacked', 'PST,2')
        items = [walked, sprinted, ran, walk, thinks, quacked]
        # One label per item, aligned by position with `items`.
        labels = ['-ed', '-ed', 'u -> a', '-null', '-s', '-ed']

        split_feature, vals_splits, _labels = tp.maximize_productivity(
            items, labels, tp.feature_space)

        assert split_feature.name in {'PRS', 'PST'}
        assert len(vals_splits) == 2

        present_key = split_feature.name
        absent_key = f'¬{present_key}'
        prs_group = sorted([walk, thinks])
        pst_group = sorted([walked, sprinted, ran, quacked])

        def splits_are(expected_present, expected_absent):
            # The negated key is only looked up when the first half matches.
            return (sorted(vals_splits[present_key]) == expected_present
                    and sorted(vals_splits[absent_key]) == expected_absent)

        feature_holds_prs = splits_are(prs_group, pst_group)
        feature_holds_pst = splits_are(pst_group, prs_group)
        assert feature_holds_prs or feature_holds_pst
Exemplo n.º 4
0
    def test_system_english_3(self):
        """Growth test on the English per-child vocabulary files.

        For each of 4 children, the ``*.txt`` files under
        ``../data/english/growth/child-<child>/`` are processed in ascending
        order of their numeric file name.  An ``ATP`` (``apply_phonology=True``)
        is trained on each file after mapping lemma and inflected form through
        ``load_word_to_ipa()``.  For every file index the test counts, across
        children, how many trained models have a productive leaf whose default
        case ends in ``ɪŋ``, in ``s``/``z``, or in ``d``/``t``, and compares
        those counts against fixed expectations.
        """
        # Local import: only the sort key below needs it.
        import os

        num_children = 4

        ing = [0] * 20
        s = [0] * 20
        d = [0] * 20

        word_to_ipa = load_word_to_ipa()

        def numeric_stem(path):
            # basename/splitext handle both '/' and the OS-native separator
            # that glob may return for the matched file name.
            return int(os.path.splitext(os.path.basename(path))[0])

        for child in range(num_children):
            vocabs = sorted(
                glob.glob(f'../data/english/growth/child-{child}/*.txt'),
                key=numeric_stem)
            for i, vocab in enumerate(vocabs):
                pairs = []
                feature_space = set()
                with open(vocab, 'r') as f:
                    for line in f:
                        lemma, inflected, feats = line.strip().split()
                        feat_values = feats.split(';')
                        feature_space.update(feat_values)
                        pairs.append((word_to_ipa[lemma],
                                      word_to_ipa[inflected],
                                      tuple(feat_values)))
                tp = ATP(apply_phonology=True, feature_space=feature_space)
                tp.train(pairs)

                # Suffix = whatever follows 'lemma' in the default case name,
                # with the ' + ' joiner removed.
                suffixes = set()
                for leaf in tp.get_leaves():
                    if leaf.switch_statement.productive:
                        suffix = leaf.switch_statement.default_case.name.split(
                            'lemma')[-1].replace(' + ', '')
                        suffixes.add(suffix)

                # When both members of a pair are present, keep only one so
                # the pair is counted once per model.
                # NOTE(review): presumably s/z and d/t are treated as variants
                # of the same suffix - confirm.
                if 'z' in suffixes and 's' in suffixes:
                    suffixes.discard('z')
                if 't' in suffixes and 'd' in suffixes:
                    suffixes.discard('t')

                for suffix in suffixes:
                    if suffix == 'ɪŋ':
                        ing[i] += 1
                    if suffix in {'d', 't'}:
                        d[i] += 1
                    if suffix in {'s', 'z'}:
                        s[i] += 1

        assert (ing == s ==
                [4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4])
        assert (d == [
            0, 0, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
        ])
Exemplo n.º 5
0
    def test_system_german_3(self):
        """Growth test on the German training files of increasing size.

        For each training size in ``[50, 100, 200, 300, 400]`` and each of 100
        seeds, an ``ATP`` (``apply_phonology=False``) is trained on
        ``../data/german/growth/train<size>_<seed>.txt``.  For every size the
        test counts how many of the 100 models have a productive leaf whose
        default case ends in each tracked suffix, then compares the counts
        against fixed values or ranges.
        """
        num_seeds = 100
        train_sizes = [50, 100, 200, 300, 400]

        # One counter list per tracked suffix, indexed like `train_sizes`.
        # '' is the null suffix; 'er' is tallied but not asserted on below.
        tallies = {
            suffix: [0] * len(train_sizes)
            for suffix in ('en', 'n', 'e', '', 'er', 's')
        }

        for i, size in enumerate(train_sizes):
            for seed in range(num_seeds):
                fname = f'../data/german/growth/train{size}_{seed}.txt'
                pairs, feature_space = load_pairs(fname, 'german')
                tp = ATP(apply_phonology=False, feature_space=feature_space)
                tp.train(pairs)

                # Suffix = whatever follows 'lemma' in the default case name,
                # with the ' + ' joiner removed.  A set is used so each suffix
                # counts at most once per trained model.
                suffixes = set()
                for leaf in tp.get_leaves():
                    if leaf.switch_statement.productive:
                        suffix = leaf.switch_statement.default_case.name.split(
                            'lemma')[-1].replace(' + ', '')
                        suffixes.add(suffix)

                for suffix in suffixes:
                    if suffix in tallies:
                        tallies[suffix][i] += 1

        assert tallies['n'] == [100, 100, 100, 100, 100]
        assert tallies['en'] == [0, 0, 100, 100, 100]
        assert tallies['e'][1:] == [100, 100, 100, 100]
        assert 25 <= tallies['e'][0] <= 27
        assert tallies[''][1:] == [100, 100, 100, 100]
        assert 74 <= tallies[''][0] <= 76
        assert tallies['s'][:-2] == [0, 0, 0]
        assert 0 <= tallies['s'][-2] <= 2
        assert 14 <= tallies['s'][-1] <= 16
Exemplo n.º 6
0
    def test_system_english_1(self):
        """System test: ATP on the English 'quant' splits with 1000 items.

        For each of 10 seeds an ``ATP`` (``apply_phonology=False``) is trained
        on ``unimorph_celex0_train1000_<seed>.txt``.  Training accuracy must
        be within [0.999, 1.0]; the mean accuracy over the matching test files
        must lie strictly between 0.9 and 0.92.
        """
        size = 1000
        held_out_scores = []

        for seed in range(10):
            fname = f'../data/english/quant/unimorph_celex0_train{size}_{seed}.txt'
            # The second-to-last '_'-separated piece of the path is the
            # 'train<size>' tag; swapping it for 'test' names the test file.
            train_tag = fname.split('_')[-2]
            test_path = fname.replace(train_tag, 'test')

            pairs, feature_space = load_pairs(fname, 'english-quant')
            tp = ATP(apply_phonology=False, feature_space=feature_space)
            tp.train(pairs)

            # Score the training file first, then the held-out file.
            train_acc = self._evaluate('english', fname, tp, no_feats=False)
            assert 0.999 <= train_acc <= 1.0
            held_out_scores.append(
                self._evaluate('english', test_path, tp, no_feats=False))

        assert 0.9 < np.mean(held_out_scores) < 0.92
Exemplo n.º 7
0
    def test_inflect_1(self):
        """Train on a small plural / past-tense set and check ``inflect``.

        An ``ATP`` (``apply_phonology=False``) is trained on the pairs below.
        ``tp.inflect(lemma, feats)`` must then reproduce the inflected form
        for every training pair and for every pair in a separate test list.
        """
        train_pairs = [
            ('finger', 'fingerz', 'PL'),
            ('block', 'blocks', 'PL'),
            ('car', 'carz', 'PL'),
            ('toddler', 'toddlerz', 'PL'),
            ('foot', 'feet', 'PL'),
            ('snack', 'snacks', 'PL'),
            ('catcher', 'catcherz', 'PL'),
            ('carrot', 'carrots', 'PL'),
            ('ask', 'askt', 'PST'),
            ('run', 'ran', 'PST'),
            ('walk', 'walkt', 'PST'),
            ('pay', 'payd', 'PST'),
            ('link', 'linkt', 'PST'),
            ('smack', 'smackt', 'PST'),
            ('smell', 'smelld', 'PST'),
            ('think', 'thought', 'PST'),
            ('fight', 'fought', 'PST'),
            ('lay', 'layd', 'PST'),
            ('catch', 'caught', 'PST'),
            ('call', 'calld', 'PST'),
            ('play', 'playd', 'PST'),
            ('look', 'lookd', 'PST'),
        ]
        tp = ATP(apply_phonology=False, feature_space={'PL', 'PST'})
        tp.train(train_pairs)

        # Walk the leaves and pick a case for each one.
        # NOTE(review): the picked case is not read by any assertion below.
        for leaf in tp.get_leaves():
            switch = leaf.switch_statement
            if switch.productive:
                picked_case = switch.default_case
            elif switch.get_closest_to_productive():
                picked_case = switch.get_closest_to_productive()

        def exact_match_rate(pairs):
            # Fraction of pairs whose predicted form equals the gold form.
            hits = sum(1 for lemma, inflected, feats in pairs
                       if tp.inflect(lemma, feats) == inflected)
            return hits / len(pairs)

        assert exact_match_rate(train_pairs) == 1.0

        test_pairs = [
            ('walker', 'walkerz', 'PL'),
            ('feeler', 'feelerz', 'PL'),
            ('plank', 'planks', 'PL'),
            ('peanut', 'peanuts', 'PL'),
            ('talk', 'talkt', 'PST'),
            ('hurl', 'hurld', 'PST'),
            ('glean', 'gleand', 'PST'),
        ]

        assert exact_match_rate(test_pairs) == 1.0
Exemplo n.º 8
0
 def test_init(self):
     tp = ATP(feature_space={'PST'})