Exemplo n.º 1
0
def split_train_test_highest(fname, codeprefixlen, mincodeoccurences, filtered_by):
    """Build training and testing record generators over label-filtered records.

    Every "generator" handled here is a zero-argument callable that produces a
    fresh iterable each time it is called, so the same data can be traversed
    more than once.

    Args:
        fname: forwarded unchanged to gen_record (presumably the path of the
            records file -- confirm against gen_record).
        codeprefixlen: forwarded unchanged to gen_record_prefixed.
        mincodeoccurences: forwarded unchanged to get_labels_min_occurence.
        filtered_by: forwarded unchanged to gen_record.

    Returns:
        The 5-tuple (train_generator, test_generator, elements_count, labels,
        elements_count). The count is deliberately present twice: that is the
        shape this function has always returned, and it is kept so existing
        callers that unpack five values continue to work.
    """
    # Prepare the chain of re-iterable record sources.
    rec_generator = lambda: gen_record(fname, filtered_by)
    prefixed_rec_generator = lambda: gen_record_prefixed(rec_generator, codeprefixlen)
    code_generator = lambda: gen_lmc(prefixed_rec_generator)

    # Generate labels.
    PRINTER('generating labels...')
    labels = get_labels_min_occurence(code_generator, mincodeoccurences)
    PRINTER('labels generated:')
    PRINTER(str(labels))

    # Records restricted to the generated labels. A separate name is used
    # (rather than rebinding the code generator) so the closures below
    # unambiguously refer to the filtered source.
    labelsset = set(labels)
    filtered_rec_generator = lambda: gen_record_filteredbylabels(prefixed_rec_generator, labelsset)
    PRINTER('counting elements...')
    # Count by streaming: building a list of every record only to take its
    # length would hold the whole data set in memory at once.
    elements_count = sum(1 for _ in filtered_rec_generator())
    PRINTER('number of elements' + str(elements_count))

    # Split into training and testing samples; the second argument handed to
    # randomly_divide is a tenth of the element count, rounded down.
    PRINTER('splitting into training and testing...')
    train_inds, test_inds = randomly_divide(elements_count, elements_count // 10)
    train_generator = lambda: gen_record_fromshifts(filtered_rec_generator, train_inds)
    test_generator = lambda: gen_record_fromshifts(filtered_rec_generator, test_inds)
    PRINTER('splitted.')

    # The count computed above is reused; re-counting here would cost another
    # full pass over the data for the same number.
    return train_generator, test_generator, elements_count, labels, elements_count
def load_labels_codegen_elemcnt(fname, codeprefixlen, mincodeoccurences, filtered_by):
    """Generate labels and a re-iterable source of the records matching them.

    Every "generator" handled here is a zero-argument callable that produces a
    fresh iterable each time it is called.

    Args:
        fname: forwarded unchanged to gen_record (presumably the path of the
            records file -- confirm against gen_record).
        codeprefixlen: forwarded unchanged to gen_record_prefixed.
        mincodeoccurences: forwarded unchanged to get_labels_min_occurence.
        filtered_by: forwarded unchanged to gen_record.

    Returns:
        The tuple (labels, labelsset, prefix_code_generator, elements_count):
        labels is whatever get_labels_min_occurence returned, labelsset is
        set(labels), prefix_code_generator is a zero-argument callable
        returning gen_record_filteredbylabels(...) over the prefixed records,
        and elements_count is the number of items one such call yields.
    """
    # Prepare the chain of re-iterable record sources.
    rec_generator = lambda: gen_record(fname, filtered_by)
    prefixed_rec_generator = lambda: gen_record_prefixed(rec_generator, codeprefixlen)
    code_generator = lambda: gen_lmc(prefixed_rec_generator)

    # Generate labels.
    PRINTER('generating labels...')
    labels = get_labels_min_occurence(code_generator, mincodeoccurences)
    PRINTER('labels generated:')
    PRINTER(str(labels))

    # Records restricted to the generated labels.
    labelsset = set(labels)
    prefix_code_generator = lambda: gen_record_filteredbylabels(prefixed_rec_generator, labelsset)
    PRINTER('counting elements...')
    # Count by streaming: building a list of every record only to take its
    # length would hold the whole data set in memory at once.
    elements_count = sum(1 for _ in prefix_code_generator())
    PRINTER('number of elements' + str(elements_count))

    return labels, labelsset, prefix_code_generator, elements_count
Exemplo n.º 3
0
def load_labels_codegen_elemcnt(fname, codeprefixlen, mincodeoccurences,
                                filtered_by):
    """Produce the labels plus a restartable stream of records carrying them.

    The record sources built here are zero-argument callables; calling one
    starts a new traversal, which lets the data be walked several times.

    Args:
        fname: passed straight through to gen_record (likely the input file
            path -- verify against gen_record).
        codeprefixlen: passed straight through to gen_record_prefixed.
        mincodeoccurences: passed straight through to
            get_labels_min_occurence.
        filtered_by: passed straight through to gen_record.

    Returns:
        (labels, labelsset, prefix_code_generator, elements_count), where
        labels is the result of get_labels_min_occurence, labelsset is
        set(labels), prefix_code_generator is a zero-argument callable
        wrapping gen_record_filteredbylabels, and elements_count is how many
        items a single traversal of that callable yields.
    """

    def rec_generator():
        return gen_record(fname, filtered_by)

    def prefixed_rec_generator():
        return gen_record_prefixed(rec_generator, codeprefixlen)

    def code_generator():
        return gen_lmc(prefixed_rec_generator)

    # generate labels
    PRINTER('generating labels...')
    labels = get_labels_min_occurence(code_generator, mincodeoccurences)
    PRINTER('labels generated:')
    PRINTER(str(labels))

    # gen filtered records
    labelsset = set(labels)

    def prefix_code_generator():
        return gen_record_filteredbylabels(prefixed_rec_generator, labelsset)

    PRINTER('counting elements...')
    # Tally items one at a time instead of collecting them in a list first,
    # so memory use stays flat however many records there are.
    elements_count = 0
    for _ in prefix_code_generator():
        elements_count += 1
    PRINTER('number of elements' + str(elements_count))

    return labels, labelsset, prefix_code_generator, elements_count
    print "mincodeoccurences", mincodeoccurences
    print "k:", k
    print "smoothingparam:", smoothingparam
    print "distancetrainingsteps:", distancetrainingsteps
    print "filtered_by:", filtered_by
    print "save_hierarchical_path:", save_hierarchical_path
    print "save_train_generator_path:", save_train_generator_path
    print "save_lenlabels_path:", save_lenlabels_path
    

    log_level = logging.INFO
    logging.basicConfig(level=log_level)
    
    #prepare generators
    rec_generator = lambda: gen_record(fname, filtered_by)
    prefixed_rec_generator = lambda: gen_record_prefixed(rec_generator, codeprefixlen)
    prefix_code_generator = lambda: gen_lmc(prefixed_rec_generator)
    
    #generate labels
    print "generating labels..."
    labels = get_labels_min_occurence(prefix_code_generator, mincodeoccurences)
    labelsset = set(labels)
    print "labels generated."
    print labels
    
    #gen filtered records:
    prefix_code_generator = lambda: gen_record_filteredbylabels(prefixed_rec_generator, labelsset)
    print "counting elements..."
    elements_count = len(list(prefix_code_generator()))
    print "number of elements:", elements_count
    
Exemplo n.º 5
0
     sys.exit(1)
 try:
     filtered_by = sys.argv[7:]
 except:
     print '7th argument: list of the fields to exist in considered records.'
     sys.exit(1)
 
 #prepare generators
 rec_generator_first = lambda: gen_record(fname, filtered_by)
 #choosing shuffling_cnt elements in random:
 PRINTER("shuffling in random")
 import random
 chosen_records = random.sample(list(rec_generator_first()), shuffling_cnt)
 rec_generator = lambda: chosen_records
 
 prefixed_rec_generator = lambda: gen_record_prefixed(rec_generator, codeprefixlen)
 prefix_code_generator = lambda: gen_lmc(prefixed_rec_generator)
 
 #generate labels
 PRINTER("generating labels...")
 labels_counts = get_labels_counts(prefix_code_generator, mincodeoccurences)
 #PRINTER("labels generated."
 #PRINTER(sorted(labels_counts, key = lambda x: x[1], reverse = True)
 biggest_labels = map(lambda x: x[0], sorted(labels_counts, key = lambda x: x[1], 
                                             reverse = True))[:biggest_labels_cnt]
 labelsset = set(biggest_labels)
 PRINTER(biggest_labels)
 
 #gen filtered records:
 prefix_code_generator = lambda: gen_record_filteredbylabels(prefixed_rec_generator, labelsset)
 PRINTER("counting elements...")