def test_pqa_english_noamb_diled_no_generalization(self):
    input_parses = module_path + '/tests/data/POC-English-NoAmb/MST-fixed-manually/'
    batch_dir = module_path + '/output/test_grammar_learner_' + str(UTC())[:10]
    prj_dir = batch_dir + '/noamb_pqa_diled_no_generalization/'
    if check_dir(prj_dir, create=True, verbose='max'):
        outpath = prj_dir
    # cp,rp :: (test) corpus_path and reference_path:
    cp = module_path + '/data/POC-English-NoAmb/poc_english_noamb_corpus.txt'
    rp = input_parses + 'poc-english_noAmb-parses-gold.txt'
    kwargs = {
        'input_parses': input_parses,
        'output_grammar': outpath,
        'left_wall': '',
        'period': False,
        'context': 2,
        'word_space': 'discrete',
        'dim_reduction': 'none',
        'clustering': 'group',
        'grammar_rules': 2,
        'categories_generalization': 'off',
        'rules_generalization': 'off',
        'tmpath': module_path + '/tmp/',
        'linkage_limit': 1000,
        'verbose': 'min'
    }
    re = learn_grammar(**kwargs)
    pa, f1, precision, recall = pqa_meter(re['grammar_file'], outpath, cp, rp, **kwargs)
    self.assertTrue(pa * recall > 0.99, str(pa) + " * " + str(recall) + " > 0.99")
def test_pqa_turtle_ddrkd_no_generalization(self):
    input_parses = module_path + '/tests/data/POC-Turtle/MST-fixed-manually/'
    batch_dir = module_path + '/output/test_grammar_learner_' + str(UTC())[:10]
    prj_dir = batch_dir + '/turtle_pqa_ddrkd_no_generalization/'
    if check_dir(prj_dir, create=True, verbose='max'):
        outpath = prj_dir
    # cp,rp :: (test) corpus_path and reference_path:
    cp = module_path + '/tests/data/POC-Turtle/poc-turtle-corpus.txt'
    rp = input_parses + 'poc-turtle-parses-gold.txt'
    kwargs = {
        'input_parses': input_parses,
        'output_grammar': outpath,
        'left_wall': '',
        'period': False,
        'context': 2,
        'word_space': 'vectors',
        'dim_reduction': 'svd',
        'clustering': ('kmeans', 'kmeans++', 18),
        # 'cluster_range': (2, 50, 9),
        'cluster_range': (20, 2, 9),
        'grammar_rules': 2,
        'categories_generalization': 'off',
        'rules_generalization': 'off',
        'tmpath': module_path + '/tmp/',
        'linkage_limit': 1000,
        'verbose': 'min'
    }
    re = learn_grammar(**kwargs)
    pa, f1, precision, recall = pqa_meter(re['grammar_file'], outpath, cp, rp, **kwargs)
    self.assertTrue(pa * recall > 0.99, str(pa) + " * " + str(recall) + " > 0.99")
def test_pqa_turtle_diled_no_generalization(self):
    input_parses = module_path + '/tests/data/POC-Turtle/MST-fixed-manually'
    batch_dir = module_path + '/output/test_grammar_learner_' + str(UTC())[:10]
    prj_dir = batch_dir + '/turtle_pqa_diled_no_generalization/'
    if check_dir(prj_dir, create=True, verbose='max'):
        outpath = prj_dir
    # cp,rp :: (test) corpus_path and reference_path:
    cp = module_path + '/tests/data/POC-Turtle/poc-turtle-corpus.txt'
    rp = input_parses + '/poc-turtle-parses-gold.txt'
    kwargs = {
        'input_parses': input_parses,
        'output_grammar': outpath,
        'left_wall': '',
        'period': False,
        'context': 2,
        'word_space': 'discrete',
        'dim_reduction': 'none',
        'clustering': 'group',
        'grammar_rules': 2,
        'categories_generalization': 'off',
        'rules_generalization': 'off',
        'tmpath': module_path + '/tmp/',
        'linkage_limit': 1000,
        'verbose': 'min'
    }
    re = learn_grammar(**kwargs)
    # pa, f1, precision, recall: <float> 0.0 - 1.0
    pa, f1, precision, recall = pqa_meter(re['grammar_file'], outpath, cp, rp, **kwargs)
    self.assertTrue(pa * recall > 0.99, str(pa) + " * " + str(recall) + " > 0.99")
def test_turtle_generalize_rules(self):
    base = module_path + '/tests/data/POC-Turtle/' + \
        'generalized_rules/dict_6C_2018-10-03_0006.4.0.dict'
    input_parses = module_path + '/tests/data/POC-Turtle/MST-fixed-manually/'
    batch_dir = module_path + '/output/test_grammar_learner_' + str(UTC())[:10]
    prj_dir = batch_dir + '/turtle_lw_&_dot_generalized_rules/'
    if check_dir(prj_dir, create=True, verbose='max'):
        outpath = prj_dir
    kwargs = {
        'input_parses': input_parses,
        'output_grammar': outpath,
        'left_wall': 'LEFT-WALL',
        'period': True,
        'context': 2,
        'word_space': 'discrete',
        'dim_reduction': 'none',
        'clustering': 'group',
        'grammar_rules': 2,
        'categories_generalization': 'off',
        'rules_generalization': 'jaccard',
        'tmpath': module_path + '/tmp/',
        'verbose': 'none'
    }
    response = learn_grammar(**kwargs)
    with open(response['grammar_file'], 'r') as f:
        rules = f.read().splitlines()
    rule_list = [line for line in rules if line[0:1] in ['"', '(']]
    with open(base, 'r') as f:
        lst = f.read().splitlines()
    base_list = [line for line in lst if line[0:1] in ['"', '(']]
    # The learned rules must match the stored baseline exactly:
    self.assertEqual(len(rule_list), len(base_list),
                     'rule count mismatch: ' + str(len(rule_list)) +
                     ' != ' + str(len(base_list)))
    self.assertEqual(rule_list, base_list)
def test_pqa_english_noamb_ddrkd_no_generalization(self):
    input_parses = module_path + '/tests/data/POC-English-NoAmb/MST-fixed-manually/'
    batch_dir = module_path + '/output/test_grammar_learner_' + str(UTC())[:10]
    prj_dir = batch_dir + '/noamb_pqa_ddrkd_no_generalization/'
    if check_dir(prj_dir, create=True, verbose='max'):
        outpath = prj_dir
    # cp,rp :: (test) corpus_path and reference_path:
    cp = module_path + '/data/POC-English-NoAmb/poc_english_noamb_corpus.txt'
    rp = input_parses + 'poc-english_noAmb-parses-gold.txt'
    kwargs = {
        'input_parses': input_parses,
        'output_grammar': outpath,
        'left_wall': '',
        'period': False,
        'context': 2,
        'word_space': 'vectors',
        'dim_reduction': 'svd',
        'clustering': ('kmeans', 'kmeans++', 18),
        'cluster_range': (12, 12, 5),
        'grammar_rules': 2,
        'categories_generalization': 'off',
        'rules_generalization': 'off',
        'tmpath': module_path + '/tmp/',
        'linkage_limit': 1000,
        'verbose': 'min'
    }
    # Sometimes pqa_meter (with test_grammar updated 2018-10-19) returns pa, recall = 0, 0.
    # FIXME: check with further test_grammar updates and delete this retry loop.
    x = 0.
    n = 0
    while x < 0.1:
        re = learn_grammar(**kwargs)
        pa, f1, precision, recall = pqa_meter(re['grammar_file'], outpath, cp, rp, **kwargs)
        print(f'\nnoAmb dDRKd: pa {round(pa, 3)}, f1 {round(f1, 3)}, '
              f'precision {round(precision, 3)}, recall {round(recall, 3)}\n')
        x = pa * recall
        n += 1
        if n > 24:
            break
    self.assertTrue(pa * recall > 0.99, str(pa) + " * " + str(recall) + " > 0.99")
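
# The retry loop above guards against pqa_meter intermittently returning
# pa, recall = 0, 0. A minimal sketch of that workaround factored into a
# reusable helper; the name and signature are hypothetical, assuming
# learn_grammar and pqa_meter keep the signatures used in these tests:
def learn_until(threshold, max_tries, outpath, cp, rp, **kwargs):
    """Re-run learn_grammar until pa * recall clears threshold or tries run out."""
    for _ in range(max_tries):
        re = learn_grammar(**kwargs)
        pa, f1, precision, recall = pqa_meter(re['grammar_file'],
                                              outpath, cp, rp, **kwargs)
        if pa * recall >= threshold:
            break
    return pa, f1, precision, recall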
        1.0,  # level = 0, 1, 0.0-0.99: 0 - max number of clusters
        'categories_generalization': 'off',  # 'off' / 'cosine' - cosine similarity / 'jaccard'
        'categories_merge': 0.8,  # merge categories with similarity > this 'merge' criterion
        'categories_aggregation': 0.2,  # aggregate categories with similarity > this criterion
        'grammar_rules': 2,  # 1 - 'connectors' / 2 - 'disjuncts' / 0 - 'words' (TODO?)
        'rules_generalization': 'off',  # 'off' / 'cosine' - cosine similarity / 'jaccard'
        'rules_merge': 0.8,  # merge rules with similarity > this 'merge' criterion
        'rules_aggregation': 0.2,  # aggregate rules with similarity > this criterion
        'tmpath': module_path + '/tmp/',  # legacy; default if not temp_dir
        'verbose': 'max',  # display intermediate results: 'none', 'min', 'mid', 'max'
        'linkage_limit': 10000,  # Link Grammar parameter for tests
        'add_disjunct_costs': True,  # add disjunct costs when saving grammar rules to the LG dictionary file
        'disjunct_cost_function': 'reverse_count'  # 1 / disjunct_count
    }
    # print(kwargs)
    response = learn_grammar(**kwargs)
    with open(response['grammar_file'], 'r') as f:
        rules = f.read().splitlines()
    rule_list = [line for line in rules if line[0:1] in ['"', '(']]
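
# The filter above (keeping lines that open with '"' or '(') extracts category
# headers and rule bodies from a Link Grammar .dict file; the same pattern
# appears in test_turtle_generalize_rules as well. A minimal sketch of a shared
# helper, assuming that line convention holds for all generated dictionaries
# (the name read_rule_lines is hypothetical, not part of the existing API):
def read_rule_lines(dict_path):
    """Return the rule lines of a Link Grammar dictionary file."""
    with open(dict_path, 'r') as f:
        return [line for line in f.read().splitlines()
                if line[0:1] in ['"', '(']]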