def test_turtle_generalize_both(self):
    base = module_path + '/tests/data/POC-Turtle/' + \
        'generalized_categories_and_rules/dict_6C_2018-07-06_0005.4.0.dict'
    #'generalized_categories_and_rules/poc-turtle_6C_2018-06-08_0004.4.0.dict'
    input_parses = module_path + '/tests/data/POC-Turtle/MST_fixed_manually/'
    batch_dir = module_path + '/output/Test_Grammar_Learner_' + str(UTC())[:10] + '/'
    prj_dir = batch_dir + 'generalized_categories_and_rules/'
    if check_dir(prj_dir, create=True, verbose='max'):
        outpath = prj_dir
    kwargs = {
        'left_wall': 'LEFT-WALL',
        'period': True,
        'context': 2,
        'word_space': 'discrete',
        'dim_reduction': 'none',
        'clustering': 'group',
        'grammar_rules': 2,
        'categories_generalization': 'jaccard',
        'rules_generalization': 'jaccard',
        'tmpath': module_path + '/tmp/',
        'verbose': 'none'
    }
    response = learn_grammar(input_parses, outpath, outpath, **kwargs)
    with open(response['grammar_file'], 'r') as f:
        rules = f.read().splitlines()
    rule_list = [line for line in rules if line[0:1] in ['"', '(']]
    with open(base, 'r') as f:
        lst = f.read().splitlines()
    base_list = [line for line in lst if line[0:1] in ['"', '(']]
    if len(rule_list) == len(base_list):
        assert rule_list == base_list
    else:
        # lengths differ: fail on the length mismatch for a clearer message
        assert len(rule_list) == len(base_list)
def learn_lexical_entries(input_dir, cat_path, dict_path, verbose='none',
                          parse_mode='given'):
    from IPython.display import display
    from src.utl.turtle import html_table
    from src.utl.read_files import check_dir, check_dir_files, check_corpus
    from src.utl.write_files import list2file
    from src.link_grammar.turtle import \
        files2disjuncts, lexical_entries, entries2clusters, entries2categories, \
        disjuncts2clusters, entries2rules, save_link_grammar

    if check_dir(input_dir, create=False, verbose=verbose):
        files = check_dir_files(input_dir, verbose=verbose)
        if len(files) > 0:
            if verbose == 'max':
                print(files)
            for i, file in enumerate(files):
                if check_corpus(file, verbose=verbose):
                    if verbose == 'max':
                        print('File #' + str(i), file, 'checked')
                else:
                    print('File #' + str(i), file, 'check failed')
        else:
            print('Input directory', input_dir, 'is empty')
    else:
        print('No input directory', input_dir)

    log = {'project': 'Grammar Learner -- Lexical entries'}  # use OR DEL?
    disjuncts = files2disjuncts(files, 'LEFT-WALL', True, verbose)  #TODO: parse_mode?
    entries = lexical_entries(disjuncts)
    category_list = entries2categories(entries)
    if verbose == 'max':
        display(html_table(
            [['Parent', 'Category', 'Quality', 'Words', 'Relevance']]
            + category_list))
    if cat_path[-1] != '/':
        cat_path += '/'
    cat_file = cat_path + 'categories.txt'
    categories = list2file(category_list, cat_file)
    if verbose == 'max':
        for line in categories.splitlines()[:3]:
            print(line)
        print('<...>\nTotal', len(categories.splitlines()),
              'lines, saved to', cat_file)
    lg_rule_list = entries2rules(disjuncts2clusters(entries2clusters(entries)))
    if verbose == 'max':
        display(html_table([['Cluster', 'Germs', 'L', 'R', 'Disjuncts']]
                           + lg_rule_list))
    lg_rules_str = save_link_grammar(lg_rule_list, dict_path)
    if verbose == 'max':
        for line in lg_rules_str.splitlines():
            print(line)
    #-return categories, lg_rules_dict  #TODO: return paths to categories and dict?
    s = lg_rules_str.splitlines()[-1]
    lg_file = s[s.find(': ') + 2:]
    response = {'categories_file': cat_file, 'grammar_file': lg_file}
    return response
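# Usage sketch (hypothetical, not part of the original module): assuming
# `module_path` points at the repo root and the Turtle parses live under
# tests/data/POC-Turtle/MST_fixed_manually/, the function could be driven
# roughly like this:
#
#   input_dir = module_path + '/tests/data/POC-Turtle/MST_fixed_manually/'
#   out_dir = module_path + '/output/Turtle_ILE/'
#   response = learn_lexical_entries(input_dir, out_dir, out_dir, verbose='max')
#   print(response['categories_file'], response['grammar_file'])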
def params(corpus, dataset, module_path, out_dir, **kwargs):
    from src.utl.read_files import check_dir
    input_parses = module_path + '/data/' + corpus + '/' + dataset
    if check_dir(input_parses, create=False, verbose='min'):
        batch_dir = out_dir + '/' + corpus
        spaces = ['connectors', 'disjuncts']
        if kwargs['word_space'] == 'vectors':
            wtf = 'DRK'
        else:
            wtf = 'ILE'
        if kwargs['left_wall'] in ['', 'none']:
            left_wall = 'no-LEFT-WALL'
        else:
            left_wall = 'LEFT-WALL'
        if kwargs['period']:
            period = 'period'
        else:
            period = 'no-period'
        generalization = ['no_generalization', 'generalized_categories',
                          'generalized_rules', 'generalized_categories_and_rules']
        gen = 0
        if 'categories_generalization' in kwargs:
            if kwargs['categories_generalization'] not in ['', 'off']:
                gen += 1
        if 'rules_generalization' in kwargs:
            if kwargs['rules_generalization'] not in ['', 'off']:
                gen += 2
        prj_dir = batch_dir + '/' + dataset + '/' \
            + spaces[kwargs['context']-1] + '-' + wtf + '-' + spaces[kwargs['grammar_rules']-1] \
            + '/' + left_wall + '_' + period + '/' + generalization[gen]
        #-print('params - kwargs[rules_generalization]:', kwargs['rules_generalization'])
        #-print('params - kwargs[categories_generalization]:', kwargs['categories_generalization'])
        #-print('params - generalization['+str(gen)+'] =', generalization[gen])
        #-print('params - prj_dir:', prj_dir)
        if check_dir(prj_dir, create=True, verbose='none'):
            output_categories = prj_dir  # no file name ⇒ auto file name
            output_grammar = prj_dir     # no file name ⇒ auto file name
            return input_parses, output_categories, output_grammar
        else:
            return input_parses, out_dir, out_dir
    else:
        raise FileNotFoundError('File not found', input_parses)
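# Usage sketch (hypothetical): `params` only builds the directory layout for a
# run; the returned paths are meant to be passed straight to learn_grammar.
# The corpus/dataset names are the ones used in the tests in this file, and the
# kwargs keys are the ones `params` actually inspects:
#
#   kwargs = {'left_wall': 'LEFT-WALL', 'period': True, 'context': 2,
#             'word_space': 'discrete', 'grammar_rules': 2,
#             'categories_generalization': 'off', 'rules_generalization': 'off'}
#   input_parses, output_categories, output_grammar = \
#       params('POC-Turtle', 'MST_fixed_manually', module_path, out_dir, **kwargs)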
def test_turtle_diled(self):
    corpus = 'POC-Turtle'
    dataset = 'MST_fixed_manually'
    input_parses = module_path + '/tests/data/POC-Turtle/MST_fixed_manually/'
    base = module_path + '/tests/data/POC-Turtle/' + \
        '2018-04-25/turtle_dILEd_LW+dot+_2018-04-25_0008.4.0.dict'
    batch_dir = module_path + '/output/Test_Grammar_Learner_' + str(UTC())[:10] + '/'
    prj_dir = batch_dir + 'Turtle_dILEd_LW_and_period/'
    if check_dir(prj_dir, create=True, verbose='max'):
        output_categories = prj_dir
        output_grammar = prj_dir
    kwargs = {
        'left_wall': 'LEFT-WALL',
        'period': True,
        'context': 2,
        'word_space': 'discrete',
        'dim_reduction': 'none',
        'clustering': 'group',
        'grammar_rules': 2,
        'categories_generalization': 'off',
        'rules_generalization': 'off',
        'tmpath': module_path + '/tmp/',
        'verbose': 'min'
    }
    response = learn_grammar(input_parses, output_categories, output_grammar, **kwargs)
    with open(response['grammar_file'], 'r') as f:
        rules = f.read().splitlines()
    rule_list = [line for line in rules if line[0:1] in ['"', '(']]
    with open(base, 'r') as f:
        lst = f.read().splitlines()
    base_list = [line for line in lst if line[0:1] in ['"', '(']]
    if len(rule_list) == len(base_list):
        if kwargs['verbose'] == 'debug':
            print('\nTest results vs baseline:')
            for i, rule in enumerate(base_list):
                print(rule_list[i])
                print(rule)
        assert rule_list == base_list
    else:
        # lengths differ: fail on the length mismatch for a clearer message
        assert len(rule_list) == len(base_list)
def learn_grammar(input_parses, output_categories, output_grammar, **kwargs):
    # input_parses - dir with .txt files
    # output_categories - path/file.ext / dir ⇒ auto file name
    # output_grammar - path/file.ext / dir ⇒ auto file name
    def kwa(v, k):
        return kwargs[k] if k in kwargs else v
    tmpath = kwa('', 'tmpath')
    parse_mode = kwa('given', 'parse_mode')
    left_wall = kwa('', 'left_wall')
    period = kwa(False, 'period')
    context = kwa(1, 'context')
    window = kwa('mst', 'window')
    weighting = kwa('ppmi', 'weighting')
    #? distance = kwa(??, 'distance')
    group = kwa(True, 'group')
    word_space = kwa('vectors', 'word_space')
    dim_max = kwa(100, 'dim_max')
    sv_min = kwa(0.1, 'sv_min')
    dim_reduction = kwa('svm', 'dim_reduction')
    clustering = kwa('kmeans', 'clustering')
    #-cluster_range = kwa((2, 48, 1), 'cluster_range')
    #-cluster_criteria = kwa('silhouette', 'cluster_criteria')
    #-cluster_level = kwa(0.9, 'cluster_level')
    cats_gen = kwa('off', 'categories_generalization')
    #-cats_merge = kwa(0.8, 'categories_merge')
    #-cats_aggr = kwa(0.2, 'categories_aggregation')
    grammar_rules = kwa(1, 'grammar_rules')
    rules_gen = kwa('off', 'rules_generalization')  # 'off', 'cosine', 'jaccard'
    #-rules_merge = kwa(0.8, 'rules_merge')  # merge rules with similarity > this 'merge' criteria
    #-rules_aggr = kwa(0.3, 'rules_aggregation')  # aggregate rules with similarity > this criteria
    verbose = kwa('none', 'verbose')

    #80509 kwargs tests ~OK
    #-print('poc04 learn_grammar kwargs:')
    #-for k, v in kwargs.items(): print(('- ' + k + ': ')[:20], v)
    #-response = print_kwargs(**kwargs)
    #-return response
    #80509 TODO: renamed parameters ⇒ update code
    kwargs['input_parses'] = input_parses
    kwargs['output_categories'] = output_categories
    kwargs['output_grammar'] = output_grammar
    #TODO: if parameter != file: auto file name
    input_dir = input_parses
    #-cat_path = output_categories
    #-dict_path = output_grammar

    import os  #, collections
    import pandas as pd
    from shutil import copy2 as copy
    from IPython.display import display  # used below when verbose not in ['min', 'none']
    from src.utl.utl import UTC
    from src.utl.read_files import check_dir, check_mst_files
    from src.space.poc04 import files2links
    #+from src.link_grammar.poc04 import category_learner
    from src.clustering.poc04 import clusters2dict
    #+from src.link_grammar.poc04 import grammar_learner
    #-from src.link_grammar.poc import save_link_grammar
    from src.utl.write_files import list2file, save_link_grammar
    from src.utl.widgets import html_table, plot2d
    from collections import OrderedDict

    log = OrderedDict({'datime': str(UTC()), 'learn_grammar': '80511'})
    #-log.update({'datime': str(UTC()), 'learn_grammar': '80510'})
    files, re01 = check_mst_files(input_parses, verbose)
    log.update(re01)
    #-for file in files: copy(file, output_categories)
    #TODO: output_categories file ⇒ dir
    if os.path.isdir(output_categories):
        parse_dir = output_categories + '/parses/'
    else:
        parse_dir = os.path.dirname(output_categories) + '/parses/'
    if check_dir(parse_dir, True, verbose):
        for file in files:
            copy(file, os.path.dirname(parse_dir))
    else:
        raise FileNotFoundError('File not found', input_parses)
    # group = True  #? always? False option for context = 0 (words)?
    kwargs['input_files'] = files
    links, re02 = files2links(**kwargs)
    log.update(re02)
    if verbose == 'debug':
        print('\nfiles2links returns links', type(links), ':\n')
        with pd.option_context('display.max_rows', 6):
            print(links, '\n')
        print('learn_grammar: word_space:', word_space, '/ clustering:', clustering)

    category_list, re03 = category_learner(links, **kwargs)
    log.update(re03)
    word_clusters = clusters2dict(category_list)

    # Save 1st cats_file - to control 2-step generalization  #FIXME:DEL
    cats_file = output_categories
    if '.' not in cats_file:  #80508 auto file name
        if cats_file[-1] != '/':
            cats_file += '/'
        cats_file += (str(len(set([x[0] for x in category_list])))
                      + '_categories.txt')
    #TODO: comment saving cats_file and run tests 80523
    #+categories = list2file(category_list, cats_file)
    log.update({'categories_file': cats_file})
    #...TODO... hierarchical categories 80523 snooze
    #...display(html_table([['Code', 'Parent', 'Id', 'Quality', 'Words', 'Relevance']] \
    #...    + category_list))

    if grammar_rules != context:
        #-links, res4 = files2links(files, parse_mode, grammar_rules, group, left_wall, period, verbose)
        context = kwargs['context']
        kwargs['context'] = kwargs['grammar_rules']
        links, re04 = files2links(**kwargs)
        kwargs['context'] = context

    rule_list, re05 = grammar_learner(word_clusters, links, **kwargs)
    log.update(re05)
    #...display(html_table([['Rules', '', '', '', '', '']] + rule_list))

    if 'rules_generalization' in kwargs:
        if kwargs['rules_generalization'] not in ['', 'off']:
            #-word_clusters, re06 = generalize_rules(rule_list, **kwargs)
            cats_list, re06 = generalise_rules(rule_list, **kwargs)
            log.update(re06)
            if len(set([x[0] for x in cats_list])) < len(set([x[0] for x in category_list])):
                category_list = cats_list
                # Save 2nd cats_file - overwrite in case of equal
                cats_file = output_categories
                if '.' not in cats_file:  #80508 auto file name
                    if cats_file[-1] != '/':
                        cats_file += '/'
                    cats_file += (str(len(set([x[0] for x in category_list])))
                                  + '_categories.txt')
                #TODO: comment saving cats_file and run tests 80523
                #+categories = list2file(category_list, cats_file)
                log.update({'categories_file': cats_file})
                word_clusters = clusters2dict(category_list)
                rule_list, re07 = grammar_learner(word_clusters, links, **kwargs)
                #...display(html_table([['Rules', '', '', '', '', '']] + rule_list))
                log.update(re07)
            if verbose == 'debug':
                print('\nrules_generalisation ⇒ category_list:', category_list)

    if verbose not in ['min', 'none']:
        display(html_table([['Code', 'Parent', 'Id', 'Quality', 'Words', 'Relevance']]
                           + category_list))

    # Save cat_tree.txt file
    from src.utl.write_files import save_category_tree
    tree_file = cats_file[:cats_file.rindex('_')] + '_cat_tree.txt'
    re08 = save_category_tree(category_list, tree_file, verbose)  #FIXME: verbose?
    log.update(re08)
    # Save Link Grammar .dict
    re09 = save_link_grammar(rule_list, output_grammar)
    log.update(re09)
    return log
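# Usage sketch (hypothetical): an end-to-end call mirroring the tests above.
# The returned OrderedDict `log` collects the sub-step reports (re01..re09),
# including the 'categories_file' and 'grammar_file' paths the tests read back:
#
#   kwargs = {'left_wall': 'LEFT-WALL', 'period': True, 'context': 2,
#             'word_space': 'discrete', 'dim_reduction': 'none',
#             'clustering': 'group', 'grammar_rules': 2,
#             'categories_generalization': 'off', 'rules_generalization': 'off',
#             'tmpath': module_path + '/tmp/', 'verbose': 'min'}
#   log = learn_grammar(input_parses, output_categories, output_grammar, **kwargs)
#   print(log['grammar_file'])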
def category_learner(links, **kwargs):  #80619 POC.0.5
    # links - DataFrame ['word', 'link', 'count']
    def kwa(v, k):
        return kwargs[k] if k in kwargs else v
    #-links = kwargs['links']  # links - check?
    cats_file = kwa('/output', 'output_categories')  # to define tmpath
    #-dict_path = kwa('/output', 'output_grammar')  # not used here
    tmpath = kwa('', 'tmpath')
    parse_mode = kwa('given', 'parse_mode')
    left_wall = kwa('', 'left_wall')
    period = kwa(False, 'period')
    context = kwa(1, 'context')
    window = kwa('mst', 'window')
    weighting = kwa('ppmi', 'weighting')
    #? distance = kwa(??, 'distance')
    group = kwa(True, 'group')
    word_space = kwa('vectors', 'word_space')
    dim_max = kwa(100, 'dim_max')
    sv_min = kwa(0.1, 'sv_min')
    dim_reduction = kwa('svm', 'dim_reduction')
    clustering = kwa('kmeans', 'clustering')
    cluster_range = kwa((2, 48, 1), 'cluster_range')
    cluster_criteria = kwa('silhouette', 'cluster_criteria')
    cluster_level = kwa(0.9, 'cluster_level')
    generalization = kwa('off', 'categories_generalization')
    merge = kwa(0.8, 'categories_merge')
    aggregate = kwa(0.2, 'categories_aggregation')
    grammar_rules = kwa(1, 'grammar_rules')
    verbose = kwa('none', 'verbose')

    from src.utl.utl import UTC, round1, round2  #, round3, round4, round5
    from src.space.hyperwords import vector_space_dim, pmisvd
    from src.clustering.kmeans import cluster_words_kmeans
    from src.clustering.poc05 import number_of_clusters, clusters2list
    from src.utl.widgets import html_table, plot2d
    from src.utl.read_files import check_dir  #, check_mst_files
    from src.utl.write_files import list2file, save_link_grammar
    #-from src.grammar_learner.poc05 import group_links, \
    #-    aggregate_cosine, aggregate_jaccard, aggregate_word_categories
    from collections import OrderedDict

    log = OrderedDict()
    log.update({'category_learner': '80619'})

    if tmpath == '' or tmpath == 'auto':  # temporary files path
        if '.' not in cats_file:
            tmpath = cats_file
        else:
            tmpath = cats_file[:cats_file.rindex('/')]
        if tmpath[-1] != '/':
            tmpath += '/'
        tmpath += 'tmp/'
        print('tmpath:', tmpath)
    if check_dir(tmpath, True, verbose):
        log.update({'tmpath': tmpath})
    #TODO:ERROR

    if verbose == 'debug':
        print('category_learner: word_space:', word_space, '/ clustering:', clustering)
    #-if word_space == 'vectors':  #80619 Category-Tree-2018-06-19.ipynb
    if context == 1 or word_space[0] in ['v', 'e'] or clustering == 'kmeans':
        # word_space options: v,e: 'vectors' = 'embeddings'; d,w: 'discrete' = 'word_vectors'
        print('DRK: context =', str(context) + ', word_space: ' + word_space
              + ', clustering:', clustering)
        #-dim = vector_space_dim(links, dict_path, tmpath, dim_max, sv_min, verbose)
        #-80420 dict_path ⇒ tmpath :: dir to save vectors.txt
        dim = vector_space_dim(links, tmpath, tmpath, dim_max, sv_min, verbose)
        log.update({'vector_space_dim': dim})
        if verbose in ['mid', 'max', 'debug']:
            print('Optimal vector space dimensionality:', dim)
        #-vdf, sv, res3 = pmisvd(links, dict_path, tmpath, dim)
        vdf, sv, re01 = pmisvd(links, tmpath, tmpath, dim)
        log.update(re01)
        #-if clustering == 'kmeans':
        if verbose in ['max', 'debug']:
            print(UTC(), ':: category_learner ⇒ number_of_clusters')
        n_clusters = number_of_clusters(vdf, cluster_range, clustering,
                                        criteria=cluster_criteria,
                                        level=cluster_level, verbose=verbose)
        log.update({'n_clusters': n_clusters})
        if verbose in ['max', 'debug']:
            print(UTC(), ':: category_learner ⇒ cluster_words_kmeans:',
                  n_clusters, 'clusters')
        clusters, silhouette, inertia = cluster_words_kmeans(vdf, n_clusters)
        log.update({'silhouette': silhouette, 'inertia': inertia})
    #-elif clustering[:5] in ['group', 'ident']:
    else:
        if verbose in ['max', 'debug']:
            print(UTC(), ':: category_learner ⇒ iLE group_links: context =',
                  str(context) + ', word_space: ' + str(word_space)
                  + ', clustering:', clustering)
        #TODO: from src.clustering.grouping import group_links
        clusters = group_links(links, verbose)
        log.update({'n_clusters': len(clusters)})
        if verbose not in ['min', 'none']:
            print('Total', len(clusters),
                  'clusters of identical lexical entries', type(clusters))

    # Convert clusters DataFrame ⇒ cats {}  #80619 0.5
    #TODO?: if clusters == pd.dataframe:
    if verbose in ['max', 'debug']:
        print(UTC(), ':: category_learner: convert clusters ⇒ cats {}')
    cats = {}  #80609 dict instead of DataFrame
    cats['cluster'] = ['C0'] + clusters['cluster'].tolist()
    cats['parent'] = [0 for x in cats['cluster']]
    cats['words'] = [[]] + [set(x) for x in clusters['cluster_words'].tolist()]
    if 'disjuncts' in clusters:
        cats['disjuncts'] = [[]] + clusters['disjuncts'].tolist()
        djset = set()
        [[djset.add(y) for y in x] for x in cats['disjuncts']]
        djlist = sorted(djset)
        cats['djs'] = [set([djlist.index(x) for x in y if x in djlist])
                       for y in cats['disjuncts']]
    if 'counts' in clusters:
        cats['counts'] = [0] + clusters['counts'].tolist()
    if word_space == 'vectors' or clustering == 'kmeans':
        cats['quality'] = [0 for x in cats['words']]
        cats['similarities'] = [[0 for y in x] for x in cats['words']]
    else:
        cats['quality'] = [1 for x in cats['words']]
        cats['quality'][0] = 0
        cats['similarities'] = [[1 for y in x] for x in cats['words']]
        cats['similarities'][0] = [0]
    cats['children'] = [0 for x in cats['words']]
    return cats, log
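# Shape of the returned `cats` dict, inferred from the conversion code above
# and shown only as an illustrative sketch (example words are hypothetical):
# parallel lists indexed by cluster, with a dummy root entry 'C0' at index 0.
#
#   cats = {'cluster': ['C0', 'C01', 'C02', ...],
#           'parent':  [0, 0, 0, ...],
#           'words':   [[], {'tuna', 'herring'}, {'parrot', 'eagle'}, ...],
#           'quality': [0, 1, 1, ...],
#           'similarities': [[0], [1, 1], [1, 1], ...],
#           'children': [0, 0, 0, ...]}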