def run(cross=True, verbose=False, xml="../xml/RTE2_dev.xml", pre_processec_xml="../xml/RTE2_dev.preprocessed.xml"): learning_data="learningdata.tab" # the data features. extracted from an earlier run of features. filename = "results_part3.txt" clean_file(filename) if cross: features.run(xml, pre_processec_xml) # extracts the features data = orange.ExampleTable(learning_data) l = orange.BayesLearner(data) if cross: if verbose: print "result: ", validation(data) for item in data: if item.getclass() != l(item): print '\033[1;41m' print item, l(item), print '\033[1;m' print else: print item, l(item) else: print "result: ", validation(data) else: file = open(filename, "a") file.write("ranked: no\n") if file: for item in data: s = str(item['id']) +" "+ str(l(item)) file.write(s+"\n") else: print "Error opening file" file.close() print "finished writing to results_part3" #run(True, False, "../xml/blind-test-data.xml") # runs the learning #run(False) # runs the writing to results file.
# NOTE(review): fragment of a larger cfg-driven pipeline dispatcher, collapsed
# onto one line by extraction and truncated (the final "else:" body is missing
# from this view) -- left byte-identical.  Each optional analysis stage
# (roccurves, features, checksignal, crossvalidation) appears to be imported
# lazily and run with cfg.name + str(cfg.maxdepth), forwarding quick=True when
# cfg.quick is set; the "== True" comparisons could be plain truth tests.
# TODO confirm against the full original file.
if cfg.quick == True: g.run(cfg.name, quick=True) else: g.run(cfg.name) if cfg.roc == True: import roccurves as rc if cfg.quick == True: rc.run(cfg.name + str(cfg.maxdepth), quick=True) else: rc.run(cfg.name + str(cfg.maxdepth)) if cfg.features == True: import features as f if cfg.quick == True: f.run(cfg.name + str(cfg.maxdepth), quick=True) else: f.run(cfg.name + str(cfg.maxdepth)) if cfg.checksignal == True: import checksignal as cs if cfg.quick == True: cs.run(cfg.name + str(cfg.maxdepth), quick=True) else: cs.run(cfg.name + str(cfg.maxdepth)) if cfg.crossvalidation == True: import crossvalidation as cv if cfg.quick == True: cv.run(cfg.name + str(cfg.maxdepth), quick=True) else:
def single_run(data, split_rate, threshold=0, first_N=300, window_size=50, with_pca=False, n_components=20):
    """Train and evaluate a linear SVM once on a random train/test split.

    Samples are drawn from data['ds'] (label 1) and data['us'] (label 0);
    for each sample the feature extractor is run on fields [1] and [3].

    Parameters:
        data         -- dict with 'ds' and 'us' lists; each entry is indexable
                        and fields [1]/[3] are sequences (assumed; structure
                        not visible here -- confirm against the loader).
        split_rate   -- fraction of collected samples used for training.
        threshold    -- minimum length of fields [1] and [3] for inclusion.
        first_N, window_size -- forwarded to features.run.
        with_pca     -- reduce dimensionality with PCA before the SVM.
        n_components -- number of PCA components when with_pca is set.

    Returns:
        float accuracy on the held-out split.  (The original printed the
        accuracy and returned None; callers that ignore the return value
        are unaffected.)
    """
    # Shuffle both classes in place so the split is random.
    ds = data['ds']
    us = data['us']
    random.shuffle(ds)
    random.shuffle(us)
    data['ds'] = ds
    data['us'] = us

    total_set = []
    total_label = []
    ds_exhausted = False
    us_exhausted = False
    for i in range(1000):
        # Best-effort collection: running past the end of a class's list
        # raises and marks it exhausted.  'except Exception' replaces the
        # original bare 'except:' so KeyboardInterrupt / SystemExit are no
        # longer swallowed; the exhaustion control flow is preserved.
        try:
            if len(data['ds'][i][1]) > threshold and len(data['ds'][i][3]) > threshold:
                total_set.append([
                    features.run("all", data['ds'][i][1], window_size, first_N),
                    features.run("all", data['ds'][i][3], window_size, first_N),
                ])
                total_label.append(1)
        except Exception:
            ds_exhausted = True
        try:
            if len(data['us'][i][1]) > threshold and len(data['us'][i][3]) > threshold:
                total_set.append([
                    features.run("all", data['us'][i][1], window_size, first_N),
                    features.run("all", data['us'][i][3], window_size, first_N),
                ])
                total_label.append(0)
        except Exception:
            us_exhausted = True
        if ds_exhausted and us_exhausted:
            break

    # Train and test split.
    cut = int(split_rate * len(total_set))
    train = total_set[:cut]
    train_label = total_label[:cut]
    test = total_set[cut:]
    test_label = total_label[cut:]

    # Flatten each sample to a single feature vector per data point.
    # (-1 lets numpy derive the width; unlike the old explicit product of
    # len(train[0])... it does not crash when a split is empty.)
    train = np.reshape(np.asarray(train), (len(train), -1))
    test = np.reshape(np.asarray(test), (len(test), -1))

    if with_pca:
        pca = PCA(n_components=n_components, svd_solver="arpack")
        pca.fit(train)
        train = pca.transform(train)
        test = pca.transform(test)

    print("SVM training started...")
    svm = SVC(kernel='linear')
    svm.fit(np.asarray(train), np.asarray(train_label))
    print("SVM prediction started...")
    predictions = svm.predict(np.asarray(test))
    accuracy = accuracy_score(test_label, predictions)
    print(accuracy)
    print("Train split size: " + str(len(train)))
    print("Test split size: " + str(len(test)))
    return accuracy
def k_fold(data, K, threshold=0, first_N=300, window_size=50, with_pca=False, n_components=20):
    """Run K-fold cross-validation of a linear SVM over the collected samples.

    Samples are drawn from data['ds'] (label 1) and data['us'] (label 0);
    for each sample the feature extractor is run on fields [1] and [3].

    Parameters:
        data         -- dict with 'ds' and 'us' lists; each entry is indexable
                        and fields [1]/[3] are sequences (assumed; structure
                        not visible here -- confirm against the loader).
        K            -- number of folds (K >= 1).
        threshold    -- minimum length of fields [1] and [3] for inclusion.
        first_N, window_size -- forwarded to features.run.
        with_pca     -- reduce dimensionality with PCA before the SVM.
        n_components -- number of PCA components when with_pca is set.

    Returns:
        list of K per-fold accuracies.
    """
    test_split_rate = 1 / K  # true division (this file targets Python 3)
    accuracies = []

    # Shuffle both classes in place so fold membership is random.
    ds = data['ds']
    us = data['us']
    random.shuffle(ds)
    random.shuffle(us)
    data['ds'] = ds
    data['us'] = us

    total_set = []
    total_label = []
    ds_exhausted = False
    us_exhausted = False
    # Tries to add up to 1000 items per class; once both classes have raised
    # (index past the end), all available data has been collected.
    for i in range(1000):
        # 'except Exception' replaces the original bare 'except:' so
        # KeyboardInterrupt / SystemExit are no longer swallowed; the
        # best-effort exhaustion control flow is preserved.
        try:
            if len(data['ds'][i][1]) > threshold and len(data['ds'][i][3]) > threshold:
                total_set.append([
                    features.run("all", data['ds'][i][1], window_size, first_N),
                    features.run("all", data['ds'][i][3], window_size, first_N),
                ])
                total_label.append(1)
        except Exception:
            ds_exhausted = True
        try:
            if len(data['us'][i][1]) > threshold and len(data['us'][i][3]) > threshold:
                total_set.append([
                    features.run("all", data['us'][i][1], window_size, first_N),
                    features.run("all", data['us'][i][3], window_size, first_N),
                ])
                total_label.append(0)
        except Exception:
            us_exhausted = True
        if ds_exhausted and us_exhausted:
            break

    n = len(total_set)
    for i in range(K):
        # Fold boundaries, computed once per fold instead of inline in every
        # slice expression as the original did.
        lo = int(i * test_split_rate * n)
        hi = int((i + 1) * test_split_rate * n)
        if i + 1 == K:
            # Last fold: the test split runs to the end of the list, so the
            # training data needs no concatenation of two pieces.
            train = total_set[:lo]
            train_label = total_label[:lo]
            test = total_set[lo:]
            test_label = total_label[lo:]
        else:
            # Test split is taken from the middle; training data is the
            # material on both sides of it.
            train = total_set[:lo] + total_set[hi:]
            train_label = total_label[:lo] + total_label[hi:]
            test = total_set[lo:hi]
            test_label = total_label[lo:hi]

        # Flatten each sample to a single feature vector per data point
        # (-1 lets numpy derive the width and survives an empty split).
        train = np.reshape(np.asarray(train), (len(train), -1))
        test = np.reshape(np.asarray(test), (len(test), -1))

        if with_pca:
            pca = PCA(n_components=n_components, svd_solver="arpack")
            pca.fit(train)
            train = pca.transform(train)
            test = pca.transform(test)

        print("SVM training started...")
        svm = SVC(kernel='linear')
        svm.fit(np.asarray(train), np.asarray(train_label))
        print("SVM prediction started...")
        predictions = svm.predict(np.asarray(test))
        accuracy = accuracy_score(test_label, predictions)
        accuracies.append(accuracy)

    print("Accuracies: ", end='')
    print(accuracies)
    print("Average accuracy: ", end='')
    print(sum(accuracies) / len(accuracies))
    print("Train split size: " + str(len(train)))
    print("Test split size: " + str(len(test)))
    return accuracies
# NOTE(review): near-duplicate of the dispatcher fragment earlier in this file
# (only keyword-argument spacing differs), collapsed onto one line and
# truncated (the final "else:" body is missing from this view) -- left
# byte-identical.  Each optional stage (roccurves, features, checksignal,
# crossvalidation) appears to be imported lazily and run with
# cfg.name + str(cfg.maxdepth), forwarding quick=True when cfg.quick is set.
# TODO confirm against the full original file and deduplicate.
if cfg.quick == True: g.run(cfg.name, quick = True) else: g.run(cfg.name) if cfg.roc == True: import roccurves as rc if cfg.quick == True: rc.run(cfg.name + str(cfg.maxdepth), quick = True) else: rc.run(cfg.name + str(cfg.maxdepth)) if cfg.features == True: import features as f if cfg.quick == True: f.run(cfg.name + str(cfg.maxdepth), quick = True) else: f.run(cfg.name + str(cfg.maxdepth)) if cfg.checksignal == True: import checksignal as cs if cfg.quick == True: cs.run(cfg.name + str(cfg.maxdepth), quick = True) else: cs.run(cfg.name + str(cfg.maxdepth)) if cfg.crossvalidation == True: import crossvalidation as cv if cfg.quick == True: cv.run(cfg.name + str(cfg.maxdepth), quick = True) else: