def setup(self, argv):
    """Apply command-line settings and load the data set into ``self.data``.

    Parameters
    ----------
    argv : sequence of str
        ``argv[0]`` is the path of the input file; ``argv[1:]`` is forwarded
        unchanged to ``self.set_parameters``.

    Side effects
    ------------
    Prints ``self.set1`` and ``self.set2``, the items of the instance
    ``__dict__``, and finally the keys of ``self.data`` together with the
    length of each of its values.  ``self.data`` ends up holding the result
    of ``make_d.prepare_data``.
    """
    self.set_parameters(argv[1:])
    # Single pre-formatted argument: prints the same text as the former
    # ``print a, b`` statement and also parses on Python 3.
    print('%s %s' % (self.set1, self.set2))
    print(self.__dict__.items())

    # Close the input file deterministically instead of leaking the handle.
    # NOTE(review): assumes make_d.read_data consumes the file completely
    # before it returns -- confirm.
    with open(argv[0]) as handle:
        self.data = make_d.read_data(handle)
    self.data = make_d.assign_classes(self.data)
    self.data = make_d.prepare_data(self.data)

    print('%s %s' % (self.data.keys(),
                     [len(v) for v in self.data.values()]))
def _read_prepared(fn, **prepare_kwargs):
    """Read ``fn`` with ``make_d.read_data`` and return the prepared data.

    Extra keyword arguments are passed straight to ``make_d.prepare_data``.
    The file is closed as soon as it has been read.
    """
    # NOTE(review): assumes make_d.read_data consumes the file completely
    # before it returns -- confirm.
    with open(fn) as handle:
        raw = make_d.read_data(handle)
    return make_d.prepare_data(raw, **prepare_kwargs)


def _flatten_by_class(data):
    """Turn a ``{label: members}`` mapping into two parallel lists.

    Returns ``(xdata, classes)`` where ``xdata`` holds every member and
    ``classes`` repeats the owning label once per member, in the iteration
    order of ``data``.
    """
    xdata = []
    classes = []
    for label in data:
        members = list(data[label])
        xdata.extend(members)
        classes.extend(label for _ in members)
    return xdata, classes


def main(argv):
    """Compare MIC values of the real labelling with a shuffled baseline.

    ``argv[0]`` names the input file.  ``compute_mic_pos_class`` is evaluated
    once on the data as prepared by ``make_d`` and then 10000 times on data
    prepared with ``randomize='class-shuffle'``; the element-wise mean of the
    shuffled results serves as the baseline.

    Both result vectors are printed and written pair by pair to ``mic.csv``
    (header ``MIC_computed,MIC_random``).  Returns ``None``.
    """
    fn = argv[0]
    n_shuffles = 10000

    xdata, classes = _flatten_by_class(_read_prepared(fn))
    cmp_mic = compute_mic_pos_class(xdata, classes)
    print(cmp_mic)

    # Running element-wise sum over all shuffled evaluations.
    rnd_mic = [0.0 for _ in cmp_mic]
    for _ in range(n_shuffles):
        shuffled = _read_prepared(fn, randomize='class-shuffle')
        xdata, classes = _flatten_by_class(shuffled)
        mic = compute_mic_pos_class(xdata, classes)
        rnd_mic = [total + value for total, value in zip(rnd_mic, mic)]
    rnd_mic = [total / n_shuffles for total in rnd_mic]

    print(cmp_mic)
    print(rnd_mic)

    # The context manager guarantees the CSV is flushed and closed even if
    # formatting one of the rows fails.
    with open('mic.csv', 'w') as fo:
        fo.write('MIC_computed,MIC_random\n')
        for xy in zip(cmp_mic, rnd_mic):
            print(xy)
            fo.write('%f,%f\n' % xy)
    return None
def main(argv):
    """Prepare an SVM experiment and start drawing train/test sets.

    ``argv[0]`` names the input file and ``argv[1:]`` is handed to ``init``.
    The file is read, class-assigned and prepared through ``make_d``; an
    ``svm.svm_parameter`` object gets its kernel type from ``KERNEL_TYPE``
    (the linear case also rebinds the global ``GAMMA_RANGE``); and a CSV log
    file named from ``TIMESTAMP``, ``KERNEL_TYPE``, ``RANDOMIZE_DATA`` and
    the input name (or the SET1/SET2 members) is opened for writing.

    Every loop pass prints the run counter, calls ``make_d.make_set`` with
    ``training_fraction=0.75`` (on the two merged groups when both ``SET1``
    and ``SET2`` are given, otherwise on all prepared data), shuffles the
    label lists when ``RANDOMIZE_DATA`` is true and prints the length of
    each of the four returned lists.

    NOTE(review): nothing visible here advances ``i``, writes to or closes
    ``logfile``, or uses ``param``, ``param_grid``, ``results``, ``sum_acc``,
    ``cvfunc`` or ``n_cv`` -- the remainder of the routine appears to be
    missing from this view; confirm against the complete source.
    """
    global C_RANGE, GAMMA_RANGE, SET1, SET2

    i = 0
    param_grid = {}
    results = []
    sum_acc = 0

    init(argv[1:])
    # One pre-formatted argument prints the same text as ``print a, b``.
    print('%s %s' % (SET1, SET2))

    fn = argv[0]
    labelled = make_d.assign_classes(make_d.read_data(open(fn)))
    data = make_d.prepare_data(labelled)
    print('%s %s' % (data.keys(), [len(v) for v in data.values()]))

    param = svm.svm_parameter('-b 1')
    if KERNEL_TYPE != 'LINEAR':
        param.kernel_type = svm.RBF
    else:
        param.kernel_type = svm.LINEAR
        GAMMA_RANGE = 1, 0, -2

    cvfunc = leave_one_out
    n_cv = None
    use_sets = not (SET1 is None or SET2 is None)

    # Log name: input base name without FASTA suffix, or "<SET1>vs<SET2>".
    outfile = os.path.basename(fn).replace('.fasta', '').replace('.fas', '')
    if use_sets:
        first = ''.join([str(int(member)) for member in SET1])
        second = ''.join([str(int(member)) for member in SET2])
        outfile = first + 'vs' + second
    log_name = '%s-%s-%i-%s.csv' % (TIMESTAMP, KERNEL_TYPE,
                                    int(RANDOMIZE_DATA), outfile)
    logfile = open(log_name, 'w')

    while i < N_RUNS:
        sys.stdout.write('%i ' % i)
        sys.stdout.flush()

        if not use_sets:
            sets = make_d.make_set(data, training_fraction=0.75)
        else:
            # TODO: If set sizes are reasonably large, do not use the
            # complete smallest set.
            set1 = dict((label, members) for label, members in data.items()
                        if label in SET1)
            set2 = dict((label, members) for label, members in data.items()
                        if label in SET2)
            set1 = make_d.make_set(set1, training_fraction=1.0)
            set2 = make_d.make_set(set2, training_fraction=1.0)
            # Index 1 is the slot unpacked as ``train_x`` below.
            new_sets = {1.0: set1[1], -1.0: set2[1]}
            sets = make_d.make_set(new_sets, training_fraction=0.75)

        train_y, train_x, test_y, test_x = sets
        if RANDOMIZE_DATA:
            random.shuffle(train_y)
            random.shuffle(test_y)
        print([len(part) for part in sets])
def main(argv):
    """Prepare an SVM experiment and start drawing train/test sets.

    ``argv[0]`` names the input file and ``argv[1:]`` is handed to ``init``.
    The file is read, class-assigned and prepared through ``make_d``; an
    ``svm.svm_parameter`` object gets its kernel type from ``KERNEL_TYPE``
    (the linear case also rebinds the global ``GAMMA_RANGE``); and a CSV log
    file named from ``TIMESTAMP``, ``KERNEL_TYPE``, ``RANDOMIZE_DATA`` and
    the input name (or the SET1/SET2 members) is opened for writing.

    Every loop pass prints the run counter, builds the four-part result of
    ``make_d.make_set`` with ``training_fraction=0.75`` -- from
    ``make_d.merge_multiclasses(data, SET1, SET2)`` when both sets are
    given, otherwise from all prepared data -- shuffles the label lists when
    ``RANDOMIZE_DATA`` is true and prints the length of each part.

    NOTE(review): nothing visible here advances ``i``, writes to or closes
    ``logfile``, or uses ``param``, ``param_grid``, ``results``, ``sum_acc``,
    ``cvfunc`` or ``n_cv`` -- the remainder of the routine appears to be
    missing from this view; confirm against the complete source.
    """
    global C_RANGE, GAMMA_RANGE, SET1, SET2

    i = 0
    param_grid = {}
    results = []
    sum_acc = 0

    init(argv[1:])
    # One pre-formatted argument prints the same text as ``print a, b``.
    print('%s %s' % (SET1, SET2))

    fn = argv[0]
    labelled = make_d.assign_classes(make_d.read_data(open(fn)))
    data = make_d.prepare_data(labelled)
    print('%s %s' % (data.keys(), [len(v) for v in data.values()]))

    param = svm.svm_parameter('-b 1')
    if KERNEL_TYPE != 'LINEAR':
        param.kernel_type = svm.RBF
    else:
        param.kernel_type = svm.LINEAR
        GAMMA_RANGE = 1, 0, -2

    cvfunc = svmfun.leave_one_out
    n_cv = None
    limit_sets = not (SET1 is None or SET2 is None)

    # Log name: input base name without FASTA suffix, or "<SET1>vs<SET2>".
    outfile = os.path.basename(fn).replace('.fasta', '').replace('.fas', '')
    if limit_sets:
        first = ''.join([str(int(member)) for member in SET1])
        second = ''.join([str(int(member)) for member in SET2])
        outfile = first + 'vs' + second
    log_name = '%s-%s-%i-%s.csv' % (TIMESTAMP, KERNEL_TYPE,
                                    int(RANDOMIZE_DATA), outfile)
    logfile = open(log_name, 'w')

    while i < N_RUNS:
        sys.stdout.write('%i ' % i)
        sys.stdout.flush()

        source = data
        if limit_sets:
            source = make_d.merge_multiclasses(data, SET1, SET2)
        sets = make_d.make_set(source, training_fraction=0.75)

        train_y, train_x, test_y, test_x = sets
        if RANDOMIZE_DATA:
            random.shuffle(train_y)
            random.shuffle(test_y)
        print([len(part) for part in sets])
def main(argv):
    """Prepare training data and a separate test set, then start the runs.

    ``argv[0]`` is the training file, ``argv[1]`` the test file and
    ``argv[2:]`` goes to ``init``.  Both files are read and class-assigned
    through ``make_d``; the first element of every record's second field is
    dropped before ``make_d.prepare_data`` is applied.  Only the classes
    ``1.0`` and ``0.0`` of the training data are kept.

    ``precursor`` maps each test value (minus its first element) to the list
    of integers found after the last ``'_'`` of the matching record names;
    it is printed.  In the test set class ``0.0`` is re-keyed as ``-1.0``
    before ``make_d.make_set`` is called with ``training_fraction=1.0``.

    Every loop pass prints the run counter, builds a set from the ``1.0``
    and ``0.0`` training classes (re-keyed ``1.0`` / ``-1.0``) and prints
    the length of each of its four parts.

    NOTE(review): nothing visible here advances ``i``, writes to or closes
    ``logfile``, or uses ``param``, ``encoded_x``, ``row``, ``test_y``,
    ``cvfunc`` and the other bookkeeping locals -- the remainder of the
    routine appears to be missing from this view; confirm against the
    complete source.
    """
    global C_RANGE, GAMMA_RANGE, SET1, SET2

    i = 0
    param_grid = {}
    results = []
    sum_acc = 0

    init(argv[2:])
    # One pre-formatted argument prints the same text as ``print a, b``.
    print('%s %s' % (SET1, SET2))

    param = svm.svm_parameter("-b 1")
    if KERNEL_TYPE != "LINEAR":
        param.kernel_type = svm.RBF
    else:
        param.kernel_type = svm.LINEAR
        GAMMA_RANGE = 1, 0, -2
    cvfunc = leave_one_out
    n_cv = None
    use_sets = not (SET1 is None or SET2 is None)

    fn = argv[0]
    dataset = make_d.read_data(open(fn))
    labelled = make_d.assign_classes(dataset)
    data = make_d.prepare_data([(rec[0], rec[1][1:]) for rec in labelled])
    # Next line is just for testing.
    data = {1.0: data[1.0], 0.0: data[0.0]}
    print('%s %s' % (data.keys(), [len(v) for v in data.values()]))

    testdata = make_d.read_data(open(argv[1]))
    labelled_test = make_d.assign_classes(testdata)
    testset = make_d.prepare_data(
        [(rec[0], rec[1][1:]) for rec in labelled_test])

    precursor = {}
    for name, value in testdata.items():
        trimmed = value[1:]
        seen = precursor.get(trimmed, [])
        precursor[trimmed] = seen + [int(name.split("_")[-1])]
    print(precursor)

    # Log name: input base name without FASTA suffix, or "<SET1>vs<SET2>".
    outfile = os.path.basename(fn).replace(".fasta", "").replace(".fas", "")
    if use_sets:
        first = "".join([str(int(member)) for member in SET1])
        second = "".join([str(int(member)) for member in SET2])
        outfile = first + "vs" + second
    log_name = "%s-%s-%i-%s.csv" % (TIMESTAMP, KERNEL_TYPE,
                                    int(RANDOMIZE_DATA), outfile)
    logfile = open(log_name, "w")

    # Prepare test set (precursor fragments).
    testset[-1.0] = copy.deepcopy(testset[0.0])
    del testset[0.0]
    testset = make_d.make_set(testset, balanced_set=False,
                              training_fraction=1.0)
    # 'Training' and 'Test' sets flipped
    test_y, test_x = testset[:2]
    encoded_x = [make_d.encode(item, make_d.encode_dic) for item in test_x]

    # Train and predict
    row = [0.0 for _ in test_x]
    while i < N_RUNS:
        sys.stdout.write("%i " % i)
        sys.stdout.flush()

        set1 = dict((label, members) for label, members in data.items()
                    if label == 1.0)
        set2 = dict((label, members) for label, members in data.items()
                    if label == 0.0)
        set1 = make_d.make_set(set1, training_fraction=1.0)
        set2 = make_d.make_set(set2, training_fraction=1.0)
        # Index 1 is the slot unpacked as ``train_x`` below.
        new_sets = {1.0: set1[1], -1.0: set2[1]}
        sets = make_d.make_set(new_sets, training_fraction=1.0)
        train_y, train_x, dummy_y, dummy_x = sets
        print([len(part) for part in sets])
def _load_labelled_records(path):
    """Read ``path`` via ``make_d`` and return the prepared, class-keyed data.

    Each record's class label is the float parsed from its name: the text
    before the first ``'_'`` with its first three characters removed.  The
    file is closed as soon as it has been read.
    """
    with open(path) as handle:
        records = make_d.read_data(handle)
    labelled = [(float(name.split('_')[0][3:]), value)
                for name, value in records.items()]
    return make_d.prepare_data(labelled)


def main(argv):
    """Grid-search SVM parameters on one file and predict a second one.

    Parameters
    ----------
    argv : sequence of str
        ``argv[0]`` is the training file, ``argv[1]`` the test file and
        ``argv[2:]`` is passed to ``init``.

    The training data is handed to ``grid_search`` together with
    ``leave_one_out``, ``C_RANGE`` and ``GAMMA_RANGE``.  The key with the
    highest recognition rate supplies the exponents for ``param.C`` and
    ``param.gamma`` (each set to ``2 ** exponent``).  A model is trained on
    the full training set, the test set is predicted and the prediction
    result is printed.

    A CSV log file named from ``TIMESTAMP``, ``KERNEL_TYPE``,
    ``RANDOMIZE_DATA`` and the training file name is created (and left
    empty, because the logging code is disabled).  Returns ``None``.

    Raises
    ------
    ValueError
        If ``grid_search`` returns no parameter combinations.
    """
    global C_RANGE
    global GAMMA_RANGE

    init(argv[2:])
    fn = argv[0]
    data = _load_labelled_records(fn)
    # One pre-formatted argument prints the same text as ``print a, b``.
    print('%s %s' % (data.keys(), [len(v) for v in data.values()]))

    param = svm.svm_parameter('-b 1')
    if KERNEL_TYPE == 'LINEAR':
        param.kernel_type = svm.LINEAR
        GAMMA_RANGE = 1, 0, -2
    else:
        param.kernel_type = svm.RBF

    testdata = _load_labelled_records(argv[1])

    cvfunc = leave_one_out
    n_cv = None

    outfile = os.path.basename(fn).replace('.fasta', '').replace('.fas', '')
    log_name = '%s-%s-%i-%s.csv' % (TIMESTAMP, KERNEL_TYPE,
                                    int(RANDOMIZE_DATA), outfile)

    # The context manager closes the log even when training or prediction
    # raises; previously the handle leaked on any error.
    with open(log_name, 'w') as logfile:
        sets = make_d.make_set(data, balanced_set=False,
                               training_fraction=1.0)
        train_y, train_x, _, _ = sets
        train_x = [make_d.encode(x, make_d.encode_dic) for x in train_x]

        testsets = make_d.make_set(testdata, balanced_set=False,
                                   training_fraction=0.0)
        _, _, test_y, test_x = testsets
        test_x = [make_d.encode(x, make_d.encode_dic) for x in test_x]

        param_grid = grid_search(train_y, train_x, param, {}, cvfunc, n_cv,
                                 C_RANGE, GAMMA_RANGE)

        # Rank every parameter combination by its recognition rate.
        # NOTE(review): entry[0][0] vs entry[3] is presumably predicted vs.
        # true label -- confirm against grid_search.
        ranking = []
        for exponents, outcomes in param_grid.items():
            recognized = [entry[0][0] == entry[3] for entry in outcomes]
            hits = sum(int(flag) for flag in recognized)
            ranking.append((hits / float(len(recognized)), exponents))
        if not ranking:
            raise ValueError('grid search returned no parameter combinations')

        # max() selects the same tuple that sorting and taking the last
        # element did (ties on the rate are broken by the key).
        best_exponents = max(ranking)[1]
        param.C, param.gamma = [2 ** exponent for exponent in best_exponents]

        problem = svm.svm_problem(train_y, train_x)
        model = svmutil.svm_train(problem, param, '-q')
        result = svmutil.svm_predict(test_y, test_x, model, '-b 1')
        print(result)

        # Accuracy logging, disabled in the original (it was wrapped in a
        # bare string literal).  Re-enabling it requires the counters
        # ``i = 0``, ``results = []`` and ``sum_acc = 0``:
        #
        # cur_result = zip(result[0], test_y)
        # cur_acc = compute_accuracy(cur_result)
        # results.extend(cur_result)
        # total_acc = compute_accuracy(results)
        # sum_acc += cur_acc
        # mean_acc = sum_acc/(i+1)
        # # print cur_acc, mean_acc, total_acc
        # logfile.write('%f,%f,%f\n' % (cur_acc, mean_acc, total_acc))
        # print 'ACC', compute_accuracy(results)
    return None
def _read_class_records(path):
    """Return ``(class_label, value)`` pairs for every record in ``path``.

    The label is the float of the single character at index 3 of the text
    before the first ``'_'`` in the record name.  The file is closed as soon
    as it has been read.
    """
    with open(path) as handle:
        records = make_d.read_data(handle)
    # NOTE(review): only one character is used, so a multi-digit class
    # number would be cut short -- confirm that labels are single digits.
    return [(float(name.split('_')[0][3]), value)
            for name, value in records.items()]


def _report_line(*fields):
    """Print ``fields`` separated by single spaces on one line."""
    print(' '.join(str(field) for field in fields))


def _as_two_class(positives, negatives):
    """Relabel ``positives`` as ``1.0`` and ``negatives`` as ``-1.0``."""
    return ([(1.0, record[1]) for record in positives] +
            [(-1.0, record[1]) for record in negatives])


def _write_set_file(records, path):
    """Write ``records`` to ``path`` with ``write_set`` and close the file."""
    with open(path, 'w') as handle:
        write_set(records, handle)


def main(argv, do_write=True):
    """Split the input into classes and emit every pairwise contrast set.

    Parameters
    ----------
    argv : sequence of str
        ``argv[0]`` holds the records of classes 1, 2 and 4; ``argv[1]``
        holds the records used as class 0.
    do_write : bool, optional
        When true (the default, matching the former hard-coded value) every
        set is also written to its ``*.fas`` file; when false only the sizes
        are printed.

    For each contrast ``<pos>vs<neg>`` the records of the positive classes
    are relabelled ``1.0`` and those of the negative class ``-1.0``; the
    combined set ``1vs2vs4vs0.fas`` keeps the parsed labels.  The size of
    every set is printed (for classes 1, 2 and 4 also the number of distinct
    records).  Returns ``None``.
    """
    dataset = _read_class_records(argv[0])
    negset = _read_class_records(argv[1])

    pools = {}
    for digit, label in (('1', 1.0), ('2', 2.0), ('4', 4.0)):
        members = [record for record in dataset if record[0] == label]
        _report_line('Set%s:' % digit, len(members), len(set(members)))
        pools[digit] = members
    pools['0'] = list(negset)
    _report_line('Set0:', len(pools['0']))

    def emit(title, records, path):
        _report_line(title, len(records))
        if do_write:
            # Each output file is closed right after it has been written.
            _write_set_file(records, path)

    def emit_contrast(positive_classes, negative_class):
        positives = [record for digit in positive_classes
                     for record in pools[digit]]
        emit('Set%s_%s:' % (positive_classes, negative_class),
             _as_two_class(positives, pools[negative_class]),
             '%svs%s.fas' % (positive_classes, negative_class))

    # Order matters: it reproduces the original sequence of output lines.
    for positive_classes, negative_class in (('1', '2'), ('1', '4'),
                                             ('2', '4'), ('12', '4'),
                                             ('14', '2'), ('24', '1')):
        emit_contrast(positive_classes, negative_class)

    emit('Set_all:',
         pools['1'] + pools['2'] + pools['4'] + pools['0'],
         '1vs2vs4vs0.fas')

    for positive_classes in ('124', '1', '2', '4', '12', '14', '24'):
        emit_contrast(positive_classes, '0')
    return None