Exemplo n.º 1
0
def split_train_test_highest(fname, codeprefixlen, mincodeoccurences, filtered_by):
    """Split records read from *fname* into train/test generators (~90/10).

    Records are prefixed to *codeprefixlen* characters and only those
    carrying at least one label that occurs *mincodeoccurences* or more
    times are kept.

    Returns a 5-tuple ``(train_generator, test_generator, elements_count,
    labels, elements_count)`` — the count is deliberately duplicated to
    preserve the historical tuple shape callers expect.
    """
    # prepare lazy, re-iterable record generators (each call restarts iteration)
    rec_generator = lambda: gen_record(fname, filtered_by)
    prefixed_rec_generator = lambda: gen_record_prefixed(rec_generator, codeprefixlen)
    prefix_code_generator = lambda: gen_lmc(prefixed_rec_generator)

    # generate the set of sufficiently frequent labels
    PRINTER('generating labels...')
    labels = get_labels_min_occurence(prefix_code_generator, mincodeoccurences)
    PRINTER('labels generated:')
    PRINTER(str(labels))

    # keep only records that carry at least one accepted label
    labelsset = set(labels)
    prefix_code_generator = lambda: gen_record_filteredbylabels(prefixed_rec_generator, labelsset)
    PRINTER('counting elements...')
    elements_count = len(list(prefix_code_generator()))
    PRINTER('number of elements' + str(elements_count))

    # split indices into training and testing samples (~10% test)
    PRINTER('splitting into training and testing...')
    train_inds, test_inds = randomly_divide(elements_count, int(elements_count / 10))
    train_generator = lambda: gen_record_fromshifts(prefix_code_generator, train_inds)
    test_generator = lambda: gen_record_fromshifts(prefix_code_generator, test_inds)
    PRINTER('splitted.')

    # NOTE(review): the original recomputed elements_count here with a second
    # full pass over the filtered records; nothing mutates the data between
    # the two passes, so the redundant recount was removed.  The duplicated
    # count in the return tuple is kept for backward compatibility.
    return train_generator, test_generator, elements_count, labels, elements_count
def gen_train_test_kfold(labelsset, prefix_code_generator, elements_count, kfold):
    """Yield (train_records, test_records) once per fold of k-fold CV.

    Element index ``i`` is assigned round-robin to fold ``i % kfold``;
    that fold's indices form the test set and all remaining indices
    (in ascending order) form the training set.
    """
    for fold in xrange(kfold):
        # indices whose round-robin bucket is the current fold → test set
        test_inds = [i for i in xrange(elements_count) if i % kfold == fold]
        # all remaining indices, naturally in ascending order → train set
        train_inds = [i for i in xrange(elements_count) if i % kfold != fold]

        train_records = list(gen_record_fromshifts(prefix_code_generator, train_inds))
        test_records = list(gen_record_fromshifts(prefix_code_generator, test_inds))

        yield train_records, test_records
Exemplo n.º 3
0
def gen_train_test_kfold(labelsset, prefix_code_generator, elements_count,
                         kfold):
    """Yield (train_records, test_records) lists for k-fold cross-validation."""
    # bucket j holds indices j, j+kfold, j+2*kfold, ... (round-robin split)
    buckets = [list(xrange(j, elements_count, kfold)) for j in xrange(kfold)]

    for test_bucket in xrange(kfold):
        test_inds = buckets[test_bucket]
        # merge every other bucket and restore ascending index order
        remaining = buckets[:test_bucket] + buckets[test_bucket + 1:]
        train_inds = sorted(sum(remaining, []))

        yield (list(gen_record_fromshifts(prefix_code_generator, train_inds)),
               list(gen_record_fromshifts(prefix_code_generator, test_inds)))
 # --- label generation: keep only labels occurring often enough ---
 print "generating labels..."
 labels = get_labels_min_occurence(prefix_code_generator, mincodeoccurences)
 labelsset = set(labels)
 print "labels generated."
 print labels
 
 #gen filtered records: drop records carrying none of the accepted labels
 prefix_code_generator = lambda: gen_record_filteredbylabels(prefixed_rec_generator, labelsset)
 print "counting elements..."
 # one full pass over the generator just to count the surviving records
 elements_count = len(list(prefix_code_generator()))
 print "number of elements:", elements_count
 
 #split into training and testing samples (~1/10 of elements held out)
 print "splitting into training and testing..."
 train_inds, test_inds = randomly_divide(elements_count, int(elements_count/10))
 train_generator = lambda: gen_record_fromshifts(prefix_code_generator, train_inds)
 test_generator = lambda: gen_record_fromshifts(prefix_code_generator, test_inds)
 print "splitted."
 
 #train mlknn:
 print "training distance..."
 # distance trained on the training portion; second arg is train-set size
 zbldistance = jaccard_distance.JaccardDistance(train_generator, elements_count-int(elements_count/10), distancetrainingsteps)
 
 print "training hierarchical mlknn..."
 # factory producing an MlKnn classifier for a given training generator
 mlknn_callable = lambda train_gen: mlknn.MlKnn(train_gen, zbldistance, find_closest_points.find_closest_points, 
                      k, smoothingparam)
 
 
 # three hierarchy levels: first-2 elements of the code, first-3, full code
 label_mappings = (lambda x: x[:2], lambda x: x[:3], lambda x: x)
 record_mappings = (lambda x: gen_1record_prefixed(x, 2), lambda x: gen_1record_prefixed(x, 3), lambda x: x)
 # NOTE(review): "hierarhical" is a typo, but the name may be referenced later
 hierarhical_mlknn = ml_hierarchical.MlHierarchical(train_generator, mlknn_callable, label_mappings, record_mappings)
Exemplo n.º 5
0
    # build the accepted-label set from the labels computed above
    labelsset = set(labels)
    print "labels generated."
    print labels

    #gen filtered records: keep only records carrying an accepted label
    prefix_code_generator = lambda: gen_record_filteredbylabels(
        prefixed_rec_generator, labelsset)
    print "counting elements..."
    # one full pass over the generator just to count the surviving records
    elements_count = len(list(prefix_code_generator()))
    print "number of elements:", elements_count

    #split into training and testing samples (~1/10 held out for testing)
    print "splitting into training and testing..."
    train_inds, test_inds = randomly_divide(elements_count,
                                            int(elements_count / 10))
    train_generator = lambda: gen_record_fromshifts(prefix_code_generator,
                                                    train_inds)
    test_generator = lambda: gen_record_fromshifts(prefix_code_generator,
                                                   test_inds)
    print "splitted."

    #train mlknn:
    print "training distance..."
    # distance trained on the training portion;
    # second argument is the training-set size (total minus test size)
    zbldistance = jaccard_distance.JaccardDistance(
        train_generator, elements_count - int(elements_count / 10),
        distancetrainingsteps)

    print "training hierarchical mlknn..."
    # factory producing an MlKnn classifier for a given training generator
    mlknn_callable = lambda train_gen: mlknn.MlKnn(
        train_gen, zbldistance, find_closest_points.find_closest_points, k,
        smoothingparam)
Exemplo n.º 6
0
     distancetype = sys.argv[8]
 except:
     # NOTE(review): bare except also swallows unrelated errors (even
     # SystemExit/KeyboardInterrupt); `except IndexError:` would be safer.
     print '8th argument expected: type of distance. Available: jac, g0, g1, g2'
     sys.exit(1)
     
 
 PRINTER('Loading training list...')
 from tools.pickle_tools import read_pickle
 all_train_generator_list = read_pickle(load_train_generator_path)
 
 # hold out ~1/5 of the loaded training list as a validation set
 PRINTER('Dividing the train_generator_list into training set and validation set...')
 from tools.randomly_divide import randomly_divide
 from data_io.zbl_record_generators import gen_record_fromshifts
 elements_count = len(all_train_generator_list)
 train_inds, validate_inds = randomly_divide(elements_count, int(elements_count / 5))
 train_generator_list = list(gen_record_fromshifts(lambda: all_train_generator_list, train_inds))
 validate_generator_list = list(gen_record_fromshifts(lambda: all_train_generator_list, validate_inds))
 
 PRINTER('Loading labels path and elements count...')
 lenlabels = len(read_pickle(load_labels_path)) 
 # NOTE(review): this overwrites the elements_count computed above from
 # the in-memory list with the pickled value — confirm this is intended.
 elements_count = read_pickle(load_elements_count_path) 
 
 PRINTER("training distance...")
 # wrap the materialised list so downstream code can re-iterate it lazily
 train_generator = lambda: train_generator_list
 if distancetype=='jac':
     from mlknn.jaccard_distance import JaccardDistance
     zbldistance = JaccardDistance(train_generator, elements_count-int(elements_count/10), distancetrainingsteps)
 else:
     # any non-'jac' value is treated as a cosine-distance variant name
     from mlknn.txt_cosine_distance import TxtCosineDistance 
     zbldistance = TxtCosineDistance(distancetype)