def test_turtle_generalize_both(self):
    base = module_path + '/tests/data/POC-Turtle/' + \
        'generalized_categories_and_rules/dict_6C_2018-07-06_0005.4.0.dict'
    #'generalized_categories_and_rules/poc-turtle_6C_2018-06-08_0004.4.0.dict'
    input_parses = module_path + '/tests/data/POC-Turtle/MST_fixed_manually/'
    batch_dir = module_path + '/output/Test_Grammar_Learner_' + str(
        UTC())[:10] + '/'
    prj_dir = batch_dir + 'generalized_categories_and_rules/'
    if check_dir(prj_dir, create=True, verbose='max'):
        outpath = prj_dir
    kwargs = {
        'left_wall': 'LEFT-WALL',
        'period': True,
        'context': 2,
        'word_space': 'discrete',
        'dim_reduction': 'none',
        'clustering': 'group',
        'grammar_rules': 2,
        'categories_generalization': 'jaccard',
        'rules_generalization': 'jaccard',
        'tmpath': module_path + '/tmp/',
        'verbose': 'none'
    }
    response = learn_grammar(input_parses, outpath, outpath, **kwargs)
    with open(response['grammar_file'], 'r') as f:
        rules = f.read().splitlines()
    # Keep only the .dict lines that carry rules: quoted word lists ('"')
    # and disjunct expressions ('(').
    rule_list = [line for line in rules if line[0:1] in ['"', '(']]
    with open(base, 'r') as f:
        lst = f.read().splitlines()
    base_list = [line for line in lst if line[0:1] in ['"', '(']]
    # On a length mismatch, assert on the lengths so the failure message
    # reports two counts instead of two long lists.
    if len(rule_list) == len(base_list):
        assert rule_list == base_list
    else:
        assert len(rule_list) == len(base_list)
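The comparisons above key on the first character of each .dict line; a hypothetical fragment (not from the test data) illustrating what the filter keeps:

# Hypothetical Link Grammar .dict lines; only the quoted word list and the
# disjunct expression survive the line[0:1] in ['"', '('] filter.
sample = ['% POC-Turtle, 6 clusters',      # comment line: dropped
          '"herring tuna":',               # word list: kept
          '(C02C01-) or (C01C05+);',       # disjuncts: kept
          '']                              # blank line: dropped
kept = [line for line in sample if line[0:1] in ['"', '(']]
assert kept == ['"herring tuna":', '(C02C01-) or (C01C05+);']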
Example #2
def learn_lexical_entries(input_dir, cat_path, dict_path, verbose='none',
                          parse_mode='given'):
    from IPython.display import display
    from src.utl.turtle import html_table
    from src.utl.read_files import check_dir, check_dir_files, check_corpus
    from src.utl.write_files import list2file
    from src.link_grammar.turtle import \
        files2disjuncts, lexical_entries, entries2clusters, entries2categories, \
        disjuncts2clusters, entries2rules, save_link_grammar

    if check_dir(input_dir, create=False, verbose=verbose):
        files = check_dir_files(input_dir, verbose=verbose)
        if len(files) > 0:
            if verbose == 'max': print(files)
            for i, file in enumerate(files):
                if check_corpus(file, verbose=verbose):
                    if verbose == 'max':
                        print('File #' + str(i), file, 'checked')
                else:
                    print('File #' + str(i), file, 'check failed')
        else:
            # files2disjuncts below needs a non-empty file list, so fail fast
            # rather than crash later on an unbound name.
            raise FileNotFoundError('Input directory is empty', input_dir)
    else:
        raise FileNotFoundError('No input directory', input_dir)

    log = {'project': 'Grammar Learner -- Lexical entries'}  # use OR DEL?

    disjuncts = files2disjuncts(files, 'LEFT-WALL', True, verbose)
    #TODO: parse_mode?
    entries = lexical_entries(disjuncts)
    category_list = entries2categories(entries)
    if verbose == 'max':
        display(
            html_table(
                [['Parent', 'Category', 'Quality', 'Words', 'Relevance']] +
                category_list))
    if cat_path[-1] != '/': cat_path += '/'
    cat_file = cat_path + 'categories.txt'
    categories = list2file(category_list, cat_file)
    if verbose == 'max':
        for line in categories.splitlines()[:3]:
            print(line)
        print('<...>\nTotal', len(categories.splitlines()), 'lines, saved to',
              cat_file)

    lg_rule_list = entries2rules(disjuncts2clusters(entries2clusters(entries)))
    if verbose == 'max':
        display(
            html_table([['Cluster', 'Germs', 'L', 'R', 'Disjuncts']] +
                       lg_rule_list))
    lg_rules_str = save_link_grammar(lg_rule_list, dict_path)
    if verbose == 'max':
        for line in lg_rules_str.splitlines():
            print(line)
    #-return categories, lg_rules_dict
    #TODO: return paths to categories and dict?
    # save_link_grammar() reports the saved file in its last summary line,
    # formatted '...: <path>'; recover the path from that line.
    s = lg_rules_str.splitlines()[-1]
    lg_file = s[s.find(': ') + 2:]
    response = {'categories_file': cat_file, 'grammar_file': lg_file}
    return response
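A minimal usage sketch, not part of the original snippet: it assumes the src.* modules above are importable and that the input directory (a hypothetical path) holds checked parse files:

response = learn_lexical_entries('data/POC-Turtle/MST_fixed_manually/',
                                 'output/', 'output/', verbose='min')
print(response['categories_file'])  # e.g. 'output/categories.txt'
print(response['grammar_file'])     # path reported by save_link_grammar()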
Example #3
def params(corpus, dataset, module_path, out_dir, **kwargs):
    from src.utl.read_files import check_dir
    input_parses = module_path + '/data/' + corpus + '/' + dataset
    if check_dir(input_parses, create=False, verbose='min'):
        batch_dir = out_dir + '/' + corpus
        spaces = ['connectors', 'disjuncts']
        # Tag for the project dir: 'DRK' for the vector-space path,
        # 'ILE' for identical lexical entries (see category_learner).
        if kwargs['word_space'] == 'vectors': wtf = 'DRK'
        else: wtf = 'ILE'
        if kwargs['left_wall'] in ['', 'none']:
            left_wall = 'no-LEFT-WALL'
        else:
            left_wall = 'LEFT-WALL'
        if kwargs['period']:
            period = 'period'
        else:
            period = 'no-period'
        generalization = ['no_generalization', 'generalized_categories',
                          'generalized_rules', 'generalized_categories_and_rules']
        # Two-bit index into the list above: +1 if categories are
        # generalized, +2 if rules are.
        gen = 0
        if 'categories_generalization' in kwargs:
            if kwargs['categories_generalization'] not in ['', 'off']: gen += 1
        if 'rules_generalization' in kwargs:
            if kwargs['rules_generalization'] not in ['', 'off']: gen += 2
        prj_dir = batch_dir + '/' + dataset + '/' + \
            spaces[kwargs['context'] - 1] + '-' + wtf + '-' + \
            spaces[kwargs['grammar_rules'] - 1] + \
            '/' + left_wall + '_' + period + '/' + generalization[gen]

        #-print('params - kwargs[rules_generalization]:', kwargs['rules_generalization'])
        #-print('params - kwargs[categories_generalization]:', kwargs['categories_generalization'])
        #-print('params - generalization['+str(gen)+'] =', generalization[gen])
        #-print('params - prj_dir:', prj_dir)

        if check_dir(prj_dir, create=True, verbose='none'):
            output_categories = prj_dir  # no file name ⇒ auto file name
            output_grammar = prj_dir  # no file name ⇒ auto file name
            return input_parses, output_categories, output_grammar
        else:
            return input_parses, out_dir, out_dir
    else:
        raise FileNotFoundError('File not found', input_parses)
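A hedged sketch of wiring params() into learn_grammar() (Example #5); module_path and the corpus layout under module_path/data/ are assumptions carried over from the tests above:

kwargs = {'left_wall': 'LEFT-WALL', 'period': True, 'context': 2,
          'word_space': 'discrete', 'dim_reduction': 'none',
          'clustering': 'group', 'grammar_rules': 2,
          'categories_generalization': 'off', 'rules_generalization': 'off',
          'tmpath': module_path + '/tmp/', 'verbose': 'none'}
# With these kwargs the project dir resolves to .../POC-Turtle/
# MST_fixed_manually/disjuncts-ILE-disjuncts/LEFT-WALL_period/no_generalization
input_parses, output_categories, output_grammar = params(
    'POC-Turtle', 'MST_fixed_manually', module_path,
    module_path + '/output', **kwargs)
log = learn_grammar(input_parses, output_categories, output_grammar, **kwargs)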
Example #4
def test_turtle_diled(self):
    corpus = 'POC-Turtle'
    dataset = 'MST_fixed_manually'
    input_parses = module_path + '/tests/data/POC-Turtle/MST_fixed_manually/'
    base = module_path + '/tests/data/POC-Turtle/' + \
        '2018-04-25/turtle_dILEd_LW+dot+_2018-04-25_0008.4.0.dict'
    batch_dir = module_path + '/output/Test_Grammar_Learner_' + str(
        UTC())[:10] + '/'
    prj_dir = batch_dir + 'Turtle_dILEd_LW_and_period/'
    if check_dir(prj_dir, create=True, verbose='max'):
        output_categories = prj_dir
        output_grammar = prj_dir
    kwargs = {
        'left_wall': 'LEFT-WALL',
        'period': True,
        'context': 2,
        'word_space': 'discrete',
        'dim_reduction': 'none',
        'clustering': 'group',
        'grammar_rules': 2,
        'categories_generalization': 'off',
        'rules_generalization': 'off',
        'tmpath': module_path + '/tmp/',
        'verbose': 'min'
    }
    response = learn_grammar(input_parses, output_categories,
                             output_grammar, **kwargs)
    with open(response['grammar_file'], 'r') as f:
        rules = f.read().splitlines()
    rule_list = [line for line in rules if line[0:1] in ['"', '(']]
    with open(base, 'r') as f:
        lst = f.read().splitlines()
    base_list = [line for line in lst if line[0:1] in ['"', '(']]
    if len(rule_list) == len(base_list):
        if kwargs['verbose'] == 'debug':
            print('\nTest results vs baseline:')
            for i, rule in enumerate(base_list):
                print(rule_list[i])
                print(rule)
        assert rule_list == base_list
    else:
        assert len(rule_list) == len(base_list)
Example #5
def learn_grammar(input_parses, output_categories, output_grammar, **kwargs):
    # input_parses - dir with .txt files
    # output_categories - path/file.ext / dir ⇒ auto file name
    # output_grammar    - path/file.ext / dir ⇒ auto file name
    def kwa(v, k):
        return kwargs[k] if k in kwargs else v

    tmpath = kwa('', 'tmpath')
    parse_mode = kwa('given', 'parse_mode')
    left_wall = kwa('', 'left_wall')
    period = kwa(False, 'period')
    context = kwa(1, 'context')
    window = kwa('mst', 'window')
    weighting = kwa('ppmi', 'weighting')
    #? distance       = kwa(??,   'distance')
    group = kwa(True, 'group')
    word_space = kwa('vectors', 'word_space')
    dim_max = kwa(100, 'dim_max')
    sv_min = kwa(0.1, 'sv_min')
    dim_reduction = kwa('svm', 'dim_reduction')
    clustering = kwa('kmeans', 'clustering')
    #-cluster_range   = kwa((2,48,1), 'cluster_range')
    #-cluster_criteria = kwa('silhouette', 'cluster_criteria')
    #-cluster_level   = kwa(0.9,      'cluster_level')
    cats_gen = kwa('off', 'categories_generalization')
    #-cats_merge      = kwa(0.8,      'categories_merge')
    #-cats_aggr       = kwa(0.2,      'categories_aggregation')
    grammar_rules = kwa(1, 'grammar_rules')
    rules_gen = kwa('off',
                    'rules_generalization')  # 'off', 'cosine', 'jaccard'
    #-rules_merge     = kwa(0.8,      'rules_merge'),   # merge rules with similarity > this 'merge' criteria
    #-rules_aggr      = kwa(0.3,      'rules_aggregation'),   # aggregate rules with similarity > this criteria
    verbose = kwa('none', 'verbose')

    #80509 kwargs tests ~OK
    #-print('poc04 learn_grammar kwargs:')
    #-for k,v in kwargs.items(): print(('- '+k+':                ')[:20], v)
    #-response = print_kwargs(**kwargs)
    #-return response
    #80509 TODO: renamed parameters ⇒ update code
    kwargs['input_parses'] = input_parses
    kwargs['output_categories'] = output_categories
    kwargs['output_grammar'] = output_grammar
    #TODO: if parameter != file: auto file name
    input_dir = input_parses
    #cat_path = output_categories
    #-dict_path = output_grammar

    import os  #, collections
    import pandas as pd
    from shutil import copy2 as copy
    from src.utl.utl import UTC
    from src.utl.read_files import check_dir, check_mst_files
    from src.space.poc04 import files2links
    #+from src.link_grammar.poc04 import category_learner
    from src.clustering.poc04 import clusters2dict
    #+from src.link_grammar.poc04 import grammar_learner
    #-from src.link_grammar.poc import save_link_grammar
    from src.utl.write_files import list2file, save_link_grammar
    from src.utl.widgets import html_table, plot2d
    from IPython.display import display  # used below when verbose shows tables

    from collections import OrderedDict
    log = OrderedDict({'datime': str(UTC()), 'learn_grammar': '80511'})
    #log.update({'datime': str(UTC()), 'learn_grammar': '80510'})
    files, re01 = check_mst_files(input_parses, verbose)
    log.update(re01)
    #for file in files: copy(file, output_categories)
    #TODO: output_categories file ⇒ dir
    if os.path.isdir(output_categories):
        parse_dir = output_categories + '/parses/'
    else:
        parse_dir = os.path.dirname(output_categories) + '/parses/'
    if check_dir(parse_dir, True, verbose):
        for file in files:
            copy(file, os.path.dirname(parse_dir))
    else:
        raise FileNotFoundError('File not found', input_parses)
    # group = True    #? always? False option for context = 0 (words)?
    kwargs['input_files'] = files
    links, re02 = files2links(**kwargs)
    log.update(re02)
    if verbose == 'debug':
        print('\nfiles2links returns links', type(links), ':\n')
        with pd.option_context('display.max_rows', 6):
            print(links, '\n')
        print('learn_grammar: word_space:', word_space, '/ clustering:',
              clustering)

    category_list, re03 = category_learner(links, **kwargs)
    log.update(re03)
    word_clusters = clusters2dict(category_list)
    # Save 1st cats_file = to control 2-step generalization #FIXME:DEL
    cats_file = output_categories
    if '.' not in cats_file:  #80508 auto file name
        if cats_file[-1] != '/': cats_file += '/'
        cats_file += (str(len(set([x[0] for x in category_list]))) +
                      '_categories.txt')
    #TODO: comment saving cats_file and run tests 80523
    #+categories = list2file(category_list, cats_file)
    log.update({'categories_file': cats_file})
    #...TODO... hierarchical categories  80523 snooze
    #...display(html_table([['Code','Parent','Id','Quality','Words','Relevance']] \
    #...        + category_list))

    if grammar_rules != context:
        # Rebuild links with grammar_rules standing in for context,
        # then restore the original context value.
        #-links, res4 = files2links(files, parse_mode, grammar_rules, group, left_wall, period, verbose)
        context = kwargs['context']
        kwargs['context'] = kwargs['grammar_rules']
        links, re04 = files2links(**kwargs)
        kwargs['context'] = context

    rule_list, re05 = grammar_learner(word_clusters, links, **kwargs)
    log.update(re05)
    #...display(html_table([['Rules','','','','','']] + rule_list))

    if 'rules_generalization' in kwargs:
        if kwargs['rules_generalization'] not in ['', 'off']:
            #-word_clusters, re06 = generalize_rules(rule_list, **kwargs)
            cats_list, re06 = generalise_rules(rule_list, **kwargs)
            #TODO: = generalise_rules(rule_list, **kwargs)
            log.update(re06)
            # Adopt the generalised categories only if generalisation
            # reduced the number of distinct clusters.
            if len(set([x[0] for x in cats_list])) < len(
                    set([x[0] for x in category_list])):
                category_list = cats_list
                # Save 2nd cats_file - overwrite in case of equal
                cats_file = output_categories
                if '.' not in cats_file:  #80508 auto file name
                    if cats_file[-1] != '/': cats_file += '/'
                    cats_file += (str(len(set([x[0]
                                               for x in category_list]))) +
                                  '_categories.txt')
                #TODO: comment saving cats_file and run tests 80523
                #+categories = list2file(category_list, cats_file)
                log.update({'categories_file': cats_file})
                word_clusters = clusters2dict(category_list)
                rule_list, re07 = grammar_learner(word_clusters, links,
                                                  **kwargs)
                #...display(html_table([['Rules','','','','','']] + rule_list))
                log.update(re07)
                if verbose == 'debug':
                    print('\nrules_generalisation ⇒ category_list:',
                          category_list)
    if verbose not in ['min', 'none']:
        display(html_table([['Code','Parent','Id','Quality','Words','Relevance']] \
            + category_list))

    # Save cat_tree.txt file
    from src.utl.write_files import save_category_tree
    tree_file = cats_file[:cats_file.rindex('_')] + '_cat_tree.txt'
    re08 = save_category_tree(category_list, tree_file,
                              verbose)  #FIXME: verbose?
    log.update(re08)
    # Save Link Grammar .dict
    re09 = save_link_grammar(rule_list, output_grammar)
    log.update(re09)

    return log
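A short consumption sketch, following the tests above: the returned log is an OrderedDict whose keys include at least 'categories_file' and 'grammar_file' (paths prepared as in the tests):

log = learn_grammar(input_parses, output_categories, output_grammar, **kwargs)
with open(log['grammar_file']) as f:
    # Rule lines in the saved .dict begin with '"' (word lists) or '('.
    rules = [line for line in f.read().splitlines()
             if line[0:1] in ['"', '(']]
print(len(rules), 'rule lines; categories saved to', log['categories_file'])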
Example #6
def category_learner(links, **kwargs):  #80619 POC.0.5
    # links - DataFrame ['word', 'link', 'count']
    def kwa(v, k):
        return kwargs[k] if k in kwargs else v

    #-links = kwargs['links']   # links - check?
    cats_file = kwa('/output', 'output_categories')  # to define tmpath
    #-dict_path       = kwa('/output', 'output_grammar')   # not used here
    tmpath = kwa('', 'tmpath')
    parse_mode = kwa('given', 'parse_mode')
    left_wall = kwa('', 'left_wall')
    period = kwa(False, 'period')
    context = kwa(1, 'context')
    window = kwa('mst', 'window')
    weighting = kwa('ppmi', 'weighting')
    #? distance       = kwa(??,   'distance')
    group = kwa(True, 'group')
    word_space = kwa('vectors', 'word_space')
    dim_max = kwa(100, 'dim_max')
    sv_min = kwa(0.1, 'sv_min')
    dim_reduction = kwa('svm', 'dim_reduction')
    clustering = kwa('kmeans', 'clustering')
    cluster_range = kwa((2, 48, 1), 'cluster_range')
    cluster_criteria = kwa('silhouette', 'cluster_criteria')
    cluster_level = kwa(0.9, 'cluster_level')
    generalization = kwa('off', 'categories_generalization')
    merge = kwa(0.8, 'categories_merge')
    aggregate = kwa(0.2, 'categories_aggregation')
    grammar_rules = kwa(1, 'grammar_rules')
    verbose = kwa('none', 'verbose')

    from src.utl.utl import UTC, round1, round2  #, round3, round4, round5
    from src.space.hyperwords import vector_space_dim, pmisvd
    from src.clustering.kmeans import cluster_words_kmeans
    from src.clustering.poc05 import number_of_clusters, clusters2list
    from src.utl.widgets import html_table, plot2d
    from src.utl.read_files import check_dir  #, check_mst_files
    from src.utl.write_files import list2file, save_link_grammar
    #-from src.grammar_learner.poc05 import group_links, \
    #-    aggregate_cosine, aggregate_jaccard, aggregate_word_categories

    from collections import OrderedDict
    log = OrderedDict()
    log.update({'category_learner': '80619'})

    if tmpath == '' or tmpath == 'auto':  # temporary files path
        if '.' not in cats_file: tmpath = cats_file
        else: tmpath = cats_file[:cats_file.rindex('/')]
        if tmpath[-1] != '/': tmpath += '/'
        tmpath += 'tmp/'
        print('tmpath:', tmpath)
    if check_dir(tmpath, True, verbose):
        log.update({'tmpath': tmpath})
    #TODO:ERROR

    if verbose == 'debug':
        print('category_learner: word_space:', word_space, '/ clustering:',
              clustering)

    #-if word_space == 'vectors':    #80619 Category-Tree-2018-06-19.ipynb
    if context == 1 or word_space[0] in ['v', 'e'] or clustering == 'kmeans':
        #word_space options: v,e: 'vectors'='embeddings', d,w: 'discrete'='word_vectors'
        print('DRK: context =',
              str(context) + ', word_space: ' + word_space + ', clustering:',
              clustering)
        #-dim = vector_space_dim(links, dict_path, tmpath, dim_max, sv_min, verbose)
        #-80420 dict_path ⇒ tmpath :: dir to save vectors.txt
        dim = vector_space_dim(links, tmpath, tmpath, dim_max, sv_min, verbose)
        log.update({'vector_space_dim': dim})
        if verbose in ['mid', 'max', 'debug']:
            print('Optimal vector space dimensionality:', dim)
        #-vdf, sv, res3 = pmisvd(links, dict_path, tmpath, dim)
        vdf, sv, re01 = pmisvd(links, tmpath, tmpath, dim)
        log.update(re01)
        #-if clustering == 'kmeans':
        if verbose in ['max', 'debug']:
            print(UTC(), ':: category_learner ⇒ number_of_clusters')
        n_clusters = number_of_clusters(vdf, cluster_range, clustering,  \
            criteria=cluster_criteria, level=cluster_level, verbose=verbose)
        log.update({'n_clusters': n_clusters})
        if verbose in ['max', 'debug']:
            print(UTC(), ':: category_learner ⇒ cluster_words_kmeans:',
                  n_clusters, 'clusters')
        clusters, silhouette, inertia = cluster_words_kmeans(vdf, n_clusters)
        log.update({'silhouette': silhouette, 'inertia': inertia})
    #-elif clustering[:5] in ['group','ident']:
    else:
        if verbose in ['max', 'debug']:
            print(UTC(),':: category_learner ⇒ iLE group_links: context =', \
                str(context)+', word_space: '+str(word_space)+', clustering:', clustering)
        #TODO: from src.clustering.grouping import group_links
        clusters = group_links(links, verbose)
        log.update({'n_clusters': len(clusters)})
        if verbose not in ['min', 'none']:
            print('Total', len(clusters), \
                'clusters of identical lexical entries', type(clusters))

    # Convert clusters DataFrame ⇒ cats {}   #80619 0.5
    #TODO?: if clusters == pd.dataframe:
    if verbose in ['max', 'debug']:
        print(UTC(), ':: category_learner: convert clusters ⇒ cats {}')
    cats = {}  #80609 dict instead of DataFrame
    # Slot 0 is a dummy root 'C0'; the real clusters follow in order.
    cats['cluster'] = ['C0'] + clusters['cluster'].tolist()
    cats['parent'] = [0 for x in cats['cluster']]
    cats['words'] = [[]] + [set(x) for x in clusters['cluster_words'].tolist()]
    if 'disjuncts' in clusters:
        cats['disjuncts'] = [[]] + clusters['disjuncts'].tolist()
        # Index every unique disjunct, then store per-cluster index sets.
        djset = {y for x in cats['disjuncts'] for y in x}
        djlist = sorted(djset)
        cats['djs'] = [set([djlist.index(x) for x in y if x in djlist])
                       for y in cats['disjuncts']]
    if 'counts' in clusters:
        cats['counts'] = [0] + clusters['counts'].tolist()
    if word_space == 'vectors' or clustering == 'kmeans':
        # Vector-space clustering: zero placeholders for quality/similarity.
        cats['quality'] = [0 for x in cats['words']]
        cats['similarities'] = [[0 for y in x] for x in cats['words']]
    else:
        # Identical lexical entries: exact clusters, so quality and
        # similarities are 1 everywhere except the dummy C0 slot.
        cats['quality'] = [1 for x in cats['words']]
        cats['quality'][0] = 0
        cats['similarities'] = [[1 for y in x] for x in cats['words']]
        cats['similarities'][0] = [0]
    cats['children'] = [0 for x in cats['words']]

    return cats, log
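For orientation, a sketch of the cats dictionary as built above, shown for one real cluster plus the dummy C0 slot (member words are hypothetical; 'disjuncts', 'djs' and 'counts' appear only when present in clusters):

cats = {
    'cluster': ['C0', 'C01'],            # dummy root slot + cluster ids
    'parent': [0, 0],
    'words': [[], {'herring', 'tuna'}],  # hypothetical member words
    'quality': [0, 1],                   # ILE branch: 1 per real cluster
    'similarities': [[0], [1, 1]],       # one score per member word
    'children': [0, 0],
}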