def split_train_test_highest(fname, codeprefixlen, mincodeoccurences, filtered_by):
    """Split the label-filtered records of `fname` into train and test sources.

    The labels are whatever `get_labels_min_occurence` returns for the
    prefixed codes; the records are then restricted to those labels, counted,
    and divided by `randomly_divide`, with one tenth of the count passed as
    its second argument.

    Args:
        fname: passed to `gen_record` as the record source.
        codeprefixlen: passed to `gen_record_prefixed`.
        mincodeoccurences: passed to `get_labels_min_occurence`.
        filtered_by: passed to `gen_record` together with `fname`.

    Returns:
        The 5-tuple `(train_generator, test_generator, elements_count,
        labels, elements_count)`. Both generators are zero-argument callables.
        `elements_count` appears twice; that shape is kept as-is so existing
        callers that unpack five values keep working.
    """
    # Every source below is a zero-argument callable that builds a fresh
    # iterator, because the data is traversed more than once.
    def rec_generator():
        return gen_record(fname, filtered_by)

    def prefixed_rec_generator():
        return gen_record_prefixed(rec_generator, codeprefixlen)

    def code_generator():
        return gen_lmc(prefixed_rec_generator)

    # Generate labels.
    PRINTER('generating labels...')
    labels = get_labels_min_occurence(code_generator, mincodeoccurences)
    PRINTER('labels generated:')
    PRINTER(str(labels))

    # Generate filtered records.
    labelsset = set(labels)

    def prefix_code_generator():
        return gen_record_filteredbylabels(prefixed_rec_generator, labelsset)

    PRINTER('counting elements...')
    # Count by streaming instead of materialising every record in a list.
    elements_count = sum(1 for _ in prefix_code_generator())
    PRINTER('number of elements' + str(elements_count))

    # Split into training and testing samples.
    PRINTER('splitting into training and testing...')
    train_inds, test_inds = randomly_divide(elements_count, elements_count // 10)

    def train_generator():
        return gen_record_fromshifts(prefix_code_generator, train_inds)

    def test_generator():
        return gen_record_fromshifts(prefix_code_generator, test_inds)

    PRINTER('splitted.')

    # The count computed above is reused: nothing has changed the filtered
    # source since, so a second full counting pass would give the same value.
    return train_generator, test_generator, elements_count, labels, elements_count
def load_labels_codegen_elemcnt(fname, codeprefixlen, mincodeoccurences, filtered_by):
    """Compute the labels, the label-filtered record source and its size.

    NOTE(review): a function with this same name appears to be defined again
    further down in this file; if so, the later definition is the one bound
    at import time. Confirm and remove one of them.

    Args:
        fname: passed to `gen_record` as the record source.
        codeprefixlen: passed to `gen_record_prefixed`.
        mincodeoccurences: passed to `get_labels_min_occurence`.
        filtered_by: passed to `gen_record` together with `fname`.

    Returns:
        The 4-tuple `(labels, labelsset, prefix_code_generator,
        elements_count)`: the labels as returned by
        `get_labels_min_occurence`, the same labels as a set, a zero-argument
        callable producing the records filtered by that set, and the number
        of items that callable produces.
    """
    # Every source below is a zero-argument callable that builds a fresh
    # iterator, because the data is traversed more than once.
    def rec_generator():
        return gen_record(fname, filtered_by)

    def prefixed_rec_generator():
        return gen_record_prefixed(rec_generator, codeprefixlen)

    def code_generator():
        return gen_lmc(prefixed_rec_generator)

    # Generate labels.
    PRINTER('generating labels...')
    labels = get_labels_min_occurence(code_generator, mincodeoccurences)
    PRINTER('labels generated:')
    PRINTER(str(labels))

    # Generate filtered records.
    labelsset = set(labels)

    def prefix_code_generator():
        return gen_record_filteredbylabels(prefixed_rec_generator, labelsset)

    PRINTER('counting elements...')
    # Count by streaming instead of materialising every record in a list.
    elements_count = sum(1 for _ in prefix_code_generator())
    PRINTER('number of elements' + str(elements_count))

    return labels, labelsset, prefix_code_generator, elements_count
def load_labels_codegen_elemcnt(fname, codeprefixlen, mincodeoccurences, filtered_by):
    """Return labels, their set, the filtered record source and its length.

    NOTE(review): a function with this same name appears to be defined
    earlier in this file as well; if so, this later definition replaces it at
    import time. Confirm and remove one of them.

    Args:
        fname: forwarded to `gen_record`.
        codeprefixlen: forwarded to `gen_record_prefixed`.
        mincodeoccurences: forwarded to `get_labels_min_occurence`.
        filtered_by: forwarded to `gen_record` along with `fname`.

    Returns:
        `(labels, labelsset, prefix_code_generator, elements_count)` where
        `labels` comes from `get_labels_min_occurence`, `labelsset` is
        `set(labels)`, `prefix_code_generator` is a zero-argument callable
        wrapping `gen_record_filteredbylabels`, and `elements_count` is the
        number of items one call of it produces.
    """
    # Restartable sources: each call creates a new iterator.
    def rec_generator():
        return gen_record(fname, filtered_by)

    def prefixed_rec_generator():
        return gen_record_prefixed(rec_generator, codeprefixlen)

    def unfiltered_code_generator():
        return gen_lmc(prefixed_rec_generator)

    # Generate labels.
    PRINTER('generating labels...')
    labels = get_labels_min_occurence(unfiltered_code_generator,
                                      mincodeoccurences)
    PRINTER('labels generated:')
    PRINTER(str(labels))

    # Generate filtered records.
    labelsset = set(labels)

    def prefix_code_generator():
        return gen_record_filteredbylabels(prefixed_rec_generator, labelsset)

    PRINTER('counting elements...')
    # One streaming pass; no intermediate list holding every record.
    elements_count = sum(1 for _ in prefix_code_generator())
    PRINTER('number of elements' + str(elements_count))

    return labels, labelsset, prefix_code_generator, elements_count
logging.basicConfig(level=log_level)

# Prepare generators. Each name is a zero-argument callable that builds a
# fresh iterator, because the data is traversed more than once below.
rec_generator = lambda: gen_record(fname, filtered_by)
prefixed_rec_generator = lambda: gen_record_prefixed(rec_generator, codeprefixlen)
prefix_code_generator = lambda: gen_lmc(prefixed_rec_generator)

# Generate labels.
# The print calls use a single parenthesised argument so that the same text
# is emitted whether this runs as a print statement or a print function.
print("generating labels...")
labels = get_labels_min_occurence(prefix_code_generator, mincodeoccurences)
labelsset = set(labels)
print("labels generated.")
print(labels)

# Generate filtered records; prefix_code_generator is rebound to the
# label-filtered source from here on.
prefix_code_generator = lambda: gen_record_filteredbylabels(prefixed_rec_generator, labelsset)
print("counting elements...")
# Count by streaming instead of materialising every record in a list.
elements_count = sum(1 for _ in prefix_code_generator())
print("number of elements: %s" % elements_count)

# Split into training and testing samples.
print("splitting into training and testing...")
# One tenth of the elements is handed to randomly_divide; the remainder is
# the size passed to the distance training below. Computed once, used twice.
tenth_count = elements_count // 10
train_inds, test_inds = randomly_divide(elements_count, tenth_count)
train_generator = lambda: gen_record_fromshifts(prefix_code_generator, train_inds)
test_generator = lambda: gen_record_fromshifts(prefix_code_generator, test_inds)
print("splitted.")

# Train mlknn:
print("training distance...")
zbldistance = jaccard_distance.JaccardDistance(train_generator, elements_count - tenth_count, distancetrainingsteps)
# Choose shuffling_cnt elements at random.
PRINTER("shuffling in random")
import random
chosen_records = random.sample(list(rec_generator_first()), shuffling_cnt)

# Each source is a zero-argument callable; rec_generator hands back the
# already-sampled list so the sample can be traversed repeatedly.
rec_generator = lambda: chosen_records
prefixed_rec_generator = lambda: gen_record_prefixed(rec_generator, codeprefixlen)
prefix_code_generator = lambda: gen_lmc(prefixed_rec_generator)

# Generate labels.
PRINTER("generating labels...")
labels_counts = get_labels_counts(prefix_code_generator, mincodeoccurences)
# Order the entries by their second field, largest first, and keep the first
# field of the top biggest_labels_cnt entries. list() makes the slice valid
# even where map() returns an iterator rather than a list.
biggest_labels = list(map(
    lambda x: x[0],
    sorted(labels_counts, key=lambda x: x[1], reverse=True)))[:biggest_labels_cnt]
labelsset = set(biggest_labels)
PRINTER(biggest_labels)

# Generate filtered records; prefix_code_generator is rebound to the
# label-filtered source from here on.
prefix_code_generator = lambda: gen_record_filteredbylabels(prefixed_rec_generator, labelsset)
PRINTER("counting elements...")
# Count by streaming instead of building a throwaway list.
elements_count = sum(1 for _ in prefix_code_generator())
PRINTER("number of elements:"+str(elements_count))

codes_generator = lambda: gen_lmc(prefix_code_generator)
PRINTER("labels per document statistics:"+str(get_labelperdocuments_counts(codes_generator)))

PRINTER("saving...")
# Close the output file deterministically so buffered data is flushed even if
# writing raises.
with open(savefname, 'w') as savefile:
    write_zbl_records(savefile, prefix_code_generator())