Example #1
0
 def test_pqa_english_noamb_diled_no_generalization(self):
     """POC-English-NoAmb, DILEd pipeline (discrete word space, no dim
     reduction, 'group' clustering), generalization disabled:
     parse-ability * recall must exceed 0.99."""
     input_parses = module_path + '/tests/data/POC-English-NoAmb/MST-fixed-manually/'
     batch_dir = module_path + '/output/test_grammar_learner_' + str(UTC())[:10]
     prj_dir = batch_dir + '/noamb_pqa_diled_no_generalization/'
     if check_dir(prj_dir, create=True, verbose='max'):
         outpath = prj_dir
     # Test corpus path (cp) and gold-standard reference parses path (rp):
     cp = module_path + '/data/POC-English-NoAmb/poc_english_noamb_corpus.txt'
     rp = input_parses + '/poc-english_noAmb-parses-gold.txt'
     kwargs = {
         'input_parses': input_parses,
         'output_grammar': outpath,
         'left_wall': '',
         'period': False,
         'context': 2,
         'word_space': 'discrete',
         'dim_reduction': 'none',
         'clustering': 'group',
         'grammar_rules': 2,
         'categories_generalization': 'off',
         'rules_generalization': 'off',
         'tmpath': module_path + '/tmp/',
         'linkage_limit': 1000,
         'verbose': 'min'
     }
     response = learn_grammar(**kwargs)
     pa, f1, precision, recall = pqa_meter(
         response['grammar_file'], outpath, cp, rp, **kwargs)
     self.assertTrue(pa * recall > 0.99,
                     str(pa) + " * " + str(recall) + " > 0.99")
Example #2
0
 def test_pqa_turtle_ddrkd_no_generalization(self):
     """POC-Turtle, dDRKd pipeline (vector word space, SVD dim reduction,
     k-means clustering), generalization disabled:
     parse-ability * recall must exceed 0.99."""
     input_parses = module_path + '/tests/data/POC-Turtle/MST-fixed-manually/'
     batch_dir = module_path + '/output/test_grammar_learner_' + str(UTC())[:10]
     prj_dir = batch_dir + '/turtle_pqa_ddrkd_no_generalization/'
     if check_dir(prj_dir, create=True, verbose='max'):
         outpath = prj_dir
     # Test corpus path (cp) and gold-standard reference parses path (rp):
     cp = module_path + '/tests/data/POC-Turtle/poc-turtle-corpus.txt'
     rp = input_parses + '/poc-turtle-parses-gold.txt'
     kwargs = {
         'input_parses': input_parses,
         'output_grammar': outpath,
         'left_wall': '',
         'period': False,
         'context': 2,
         'word_space': 'vectors',
         'dim_reduction': 'svd',
         'clustering': ('kmeans', 'kmeans++', 18),
         'cluster_range': (20, 2, 9),
         'grammar_rules': 2,
         'categories_generalization': 'off',
         'rules_generalization': 'off',
         'tmpath': module_path + '/tmp/',
         'linkage_limit': 1000,
         'verbose': 'min'
     }
     response = learn_grammar(**kwargs)
     pa, f1, precision, recall = pqa_meter(
         response['grammar_file'], outpath, cp, rp, **kwargs)
     self.assertTrue(pa * recall > 0.99,
                     str(pa) + " * " + str(recall) + " > 0.99")
Example #3
0
 def test_pqa_turtle_diled_no_generalization(self):
     """POC-Turtle, DILEd pipeline (discrete word space, no dim reduction,
     'group' clustering), generalization disabled:
     parse-ability * recall must exceed 0.99.

     Fixes: removed the dead commented-out code explicitly flagged
     '# FIXME: DEL comments'; renamed local 're' (shadowed the stdlib
     regex module); normalized kwargs formatting to match sibling tests.
     """
     input_parses = module_path + '/tests/data/POC-Turtle/MST-fixed-manually'
     batch_dir = module_path + '/output/test_grammar_learner_' + str(UTC())[:10]
     prj_dir = batch_dir + '/turtle_pqa_diled_no_generalization/'
     if check_dir(prj_dir, create=True, verbose='max'):
         outpath = prj_dir
     # cp,rp :: (test) corpus_path and reference_path:
     cp = module_path + '/tests/data/POC-Turtle/poc-turtle-corpus.txt'
     rp = input_parses + '/poc-turtle-parses-gold.txt'
     kwargs = {
         'input_parses': input_parses,
         'output_grammar': outpath,
         'left_wall': '',
         'period': False,
         'context': 2,
         'word_space': 'discrete',
         'dim_reduction': 'none',
         'clustering': 'group',
         'grammar_rules': 2,
         'categories_generalization': 'off',
         'rules_generalization': 'off',
         'tmpath': module_path + '/tmp/',
         'linkage_limit': 1000,
         'verbose': 'min'
     }
     response = learn_grammar(**kwargs)
     # pa, f1, precision, recall: <float> 0.0 - 1.0
     pa, f1, precision, recall = pqa_meter(response['grammar_file'],
                                           outpath, cp, rp, **kwargs)
     self.assertTrue(pa * recall > 0.99,
                     str(pa) + " * " + str(recall) + " > 0.99")
Example #4
0
 def test_turtle_generalize_rules(self):
     """POC-Turtle with LEFT-WALL and period, 'jaccard' rule
     generalization: the learned grammar rules must match the stored
     baseline dictionary file line for line.

     Fix: the original terminal if/else re-asserted a length equality
     already known to be false in the else branch — an obfuscated way of
     asserting both length and content equality. Replaced with two
     sequential asserts with identical pass/fail behavior.
     """
     base = module_path + '/tests/data/POC-Turtle/' + \
         'generalized_rules/dict_6C_2018-10-03_0006.4.0.dict'
     input_parses = module_path + '/tests/data/POC-Turtle/MST-fixed-manually/'
     batch_dir = module_path + '/output/test_grammar_learner_' + str(UTC())[:10]
     prj_dir = batch_dir + '/turtle_lw_&_dot_generalized_rules/'
     if check_dir(prj_dir, create=True, verbose='max'):
         outpath = prj_dir
     kwargs = {
         'input_parses': input_parses,
         'output_grammar': outpath,
         'left_wall': 'LEFT-WALL',
         'period': True,
         'context': 2,
         'word_space': 'discrete',
         'dim_reduction': 'none',
         'clustering': 'group',
         'grammar_rules': 2,
         'categories_generalization': 'off',
         'rules_generalization': 'jaccard',
         'tmpath': module_path + '/tmp/',
         'verbose': 'none'
     }
     response = learn_grammar(**kwargs)
     with open(response['grammar_file'], 'r') as f:
         rules = f.read().splitlines()
     # Keep only rule lines: lines starting with '"' or '(' in an LG dict.
     rule_list = [line for line in rules if line[0:1] in ['"', '(']]
     with open(base, 'r') as f:
         lst = f.read().splitlines()
     base_list = [line for line in lst if line[0:1] in ['"', '(']]
     # Length mismatch reported first, then line-by-line content mismatch.
     assert len(rule_list) == len(base_list)
     assert rule_list == base_list
Example #5
0
 def test_pqa_english_noamb_ddrkd_no_generalization(self):
     """POC-English-NoAmb, dDRKd pipeline (vector word space, SVD dim
     reduction, k-means clustering), generalization disabled:
     parse-ability * recall must exceed 0.99."""
     input_parses = module_path + '/tests/data/POC-English-NoAmb/MST-fixed-manually/'
     batch_dir = module_path + '/output/test_grammar_learner_' + str(UTC())[:10]
     prj_dir = batch_dir + '/noamb_pqa_ddrkd_no_generalization/'
     if check_dir(prj_dir, create=True, verbose='max'):
         outpath = prj_dir
     # Test corpus path (cp) and gold-standard reference parses path (rp):
     cp = module_path + '/data/POC-English-NoAmb/poc_english_noamb_corpus.txt'
     rp = input_parses + '/poc-english_noAmb-parses-gold.txt'
     kwargs = {
         'input_parses': input_parses,
         'output_grammar': outpath,
         'left_wall': '',
         'period': False,
         'context': 2,
         'word_space': 'vectors',
         'dim_reduction': 'svd',
         'clustering': ('kmeans', 'kmeans++', 18),
         'cluster_range': (12, 12, 5),
         'grammar_rules': 2,
         'categories_generalization': 'off',
         'rules_generalization': 'off',
         'tmpath': module_path + '/tmp/',
         'linkage_limit': 1000,
         'verbose': 'min'
     }
     # Sometimes pqa_meter(with test_grammar updated 2018-10-19) returns pa,recall = 0,0
     # FIXME: check with further test_grammar updates and delete.
     # Retry up to 25 times until the product pa*recall clears 0.1.
     for attempt in range(25):
         response = learn_grammar(**kwargs)
         pa, f1, precision, recall = pqa_meter(response['grammar_file'],
                                               outpath, cp, rp, **kwargs)
         print(f'\nnoAmb dDRKd: pa {round(pa,3)}, f1 {round(f1,3)}, '
               f'precision {round(precision,3)}, recall {round(recall,3)} \n')
         if pa * recall >= 0.1:
             break
     self.assertTrue(pa * recall > 0.99,
                     str(pa) + " * " + str(recall) + " > 0.99")
Example #6
0
            1.0,  # level = 0, 1, 0.-0.99..: 0 - max number of clusters
            'categories_generalization':
            'off',  # 'off' / 'cosine' - cosine similarity, 'jaccard'
            'categories_merge':
            0.8,  # merge categories with similarity > this 'merge' criteria
            'categories_aggregation':
            0.2,  # aggregate categories with similarity > this criteria
            'grammar_rules':
            2,  # 1: 'connectors' / 2 - 'disjuncts' / 0 - 'words' (TODO?)
            'rules_generalization':
            'off',  # 'off' / 'cosine' - cosine similarity, 'jaccard'
            'rules_merge':
            0.8,  # merge rules with similarity > this 'merge' criteria
            'rules_aggregation':
            0.2,  # aggregate rules similarity > this criteria
            'tmpath': module_path + '/tmp/',  # legacy, default if not temp_dir
            'verbose':
            'max',  # display intermediate results: 'none', 'min', 'mid', 'max'
            'linkage_limit': 10000,  # Link Grammar parameter for tests'
            'add_disjunct_costs':
            True,  # add disjunct costs when saving grammar rules to LG dictionary file
            'disjunct_cost_function': 'reverse_count'  # 1/disjunct_count
        }
        #print(kwargs)
        response = learn_grammar(**kwargs)
        with open(response['grammar_file'], 'r') as f:
            rules = f.read().splitlines()
        rule_list = [line for line in rules if line[0:1] in ['"', '(']]
    except:
        print("--ERR--", v, "has no links")