def run_knn_classifier(knn_ctor, k: int, m=0, v=-1, normlize_data=False, features_subset=None) -> KNNResults:
    train_dataset_path = "train.csv"
    test_dataset_path = "test.csv"
    train_samples, train_labels, _ = dl.extract_dataset(train_dataset_path)
    test_samples, test_labels, _ = dl.extract_dataset(test_dataset_path)

    if normlize_data:
        minmaxNormalization(train_samples, test_samples)  # data normalization

    # build the classifier, passing only the arguments that were actually supplied
    if m == 0:
        knn_classifier = knn_ctor(k) if v == -1 else knn_ctor(k, 0, v)
    else:
        knn_classifier = knn_ctor(k, m) if v == -1 else knn_ctor(k, m, v)

    knn_classifier.fit(train_samples, train_labels, features=features_subset)
    knn_predict = knn_classifier.predict(test_samples, features=features_subset)

    _accuracy = metrics.accuracy_score(test_labels, knn_predict)
    _c_m = metrics.confusion_matrix(test_labels, knn_predict)
    _error_w = Error_w(_c_m)
    return KNNResults(accuracy=_accuracy, confusion_matrix=_c_m, error_w=_error_w)
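# Illustrative call of run_knn_classifier (a hedged sketch only: KNNClassifier is a
# hypothetical constructor matching the knn_ctor signature above, and train.csv/test.csv
# are assumed to sit next to the script):
#
#     results = run_knn_classifier(KNNClassifier, k=5, normlize_data=True)
#     print(results.accuracy)
#     print(results.confusion_matrix)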
def main():
    # training parameter
    result_path = 'results/housingLiR_1.mse'
    model_name = 'housing_shiftAndScale'
    # normalization = Preprocess.zero_mean_unit_var
    normalization = Preprocess.shift_and_scale
    # cols_not_norm = (0,7,12)
    cols_not_norm = []

    # load and preprocess training data
    training_data = loader.load_dataset('data/housing_train.txt')
    testing_data = loader.load_dataset('data/housing_test.txt')
    Preprocess.normalize_features_all(normalization, training_data[0], testing_data[0], cols_not_norm)

    # start training
    model = rm.LinearRegression()
    model.build(training_data[0], training_data[1])
    training_mse = model.test(training_data[0], training_data[1], util.mse)
    testing_mse = model.test(testing_data[0], testing_data[1], util.mse)
    print 'Error for training data is:'
    print training_mse
    print 'Error for testing data is:'
    print testing_mse

    result = {}
    result['TrainingMSE'] = str(training_mse)
    result['TestingMSE'] = str(testing_mse)
    result['Theta'] = str(model.theta)

    # log the training result to file
    util.write_result_to_file(result_path, model_name, result)
def get_data(): file_paths = DataLoader.get_all_files('Data') X_list, y_list = [], [] for file_path in file_paths: data_frame = pd.read_csv(file_path) abstract_text, abstract_labels = DataLoader.extract_abstract_and_labels(data_frame) mesh_terms, title = DataLoader.extract_mesh_and_title(data_frame) X = [] y = [] for i in range(abstract_text.shape[0]): abstract_str = abstract_text[i] mesh_str = mesh_terms[i] title_str = title[i] label = abstract_labels[i] text = "".join([abstract_str, " ", mesh_str, " ", title_str]) X.append(text) y.append(label) X_list.append(X) y_list.append(y) return X_list, y_list
def get_term_structure_df(current_date:str, start_date:str, end_date:str, root:str, num_contracts:int): prices = {} syms, relative_map = dl.get_recent_symbols(current_date, root, num_contracts) data = {} for sym in syms: data[sym] = dl.read_data(sym, 'D', None, None) tds = pd.date_range(start_date, end_date) skipped_days = set({}) for td in tds: td = str(td.date()) price_day = {} for sym in syms: relative_sym = sym.split('.')[1][:-4] + '!' + str(relative_map[sym]) try: price_day[relative_sym] = (data[sym].loc[td][sym + '.close']) except Exception as inst: #print(inst) if inst not in skipped_days: skipped_days.add(inst) else: continue if price_day: prices[td] = price_day print(skipped_days) _df = pd.DataFrame(prices) df = _df.transpose() return df
def main():
    target = 'v2'

    # training parameter
    k = 10  # fold
    layer_thresh = 2
    T = 50
    threshes_path = 'data/spambase.threshes'

    # load and preprocess training data
    training_data = loader.load_dataset('data/spambase.data')

    # load thresholds
    threshes = loader.load_pickle_file(threshes_path)

    # start training
    k_folds = Preprocess.prepare_k_folds(training_data, k)
    tr_data, te_data = Preprocess.get_i_fold(k_folds, 0)
    f_cur = [x[0] for x in tr_data[0]]

    t = dt.DecisionTree()
    if target == 'v1':
        for i in range(100):
            h_y = t.compute_entropy(tr_data[1])
            thresh = threshes[0][30]
            ig = t.compute_ig(f_cur, tr_data[1], thresh, h_y)
    else:
        h_y = t.compute_entropy_v2(tr_data[1])
        thresh = threshes[0][0]
        ig = t.compute_ig_v2(f_cur, tr_data[1], thresh, h_y)
def topNPerClass(self, datasetGenerator, numImages, nList=[1, 3, 5], savePath="./topN.csv"): batchGenerator = DataLoader.oneHotWrapper( DataLoader.batchLoader(datasetGenerator, batchSize=16)) numBatches = int(ceil(numImages / 16.0)) successCount = {} totalCount = {} percentCount = {} for i in range(numBatches): imgMat, labMat = next(batchGenerator) predMat = self.predict(imgMat, True) for j in range(16): truth = np.argmax(labMat[j]) for n in nList: thisNSuccess = successCount.get(n, {}) topN = np.argpartition(-predMat[j], n)[:n] if truth in topN: thisNSuccess[truth] = thisNSuccess.get(truth, 0) + 1 successCount[n] = thisNSuccess totalCount[truth] = totalCount.get(truth, 0) + 1 #find the percentages for key in totalCount: for n in nList: thisNPercent = percentCount.get(n, {}) thisNPercent[key] = float((successCount.get(n, {})).get( key, 0.0)) / totalCount.get(key, 0.0) percentCount[n] = thisNPercent df = pd.DataFrame.from_dict(percentCount) df.to_csv(savePath) return df
def __getDataName(numInputErrors):
    """
    Function asks the user for the Data Set on which learning has to be done
    :return: Returns the DataLoader Object of the specific Data set
    """
    while True:
        if numInputErrors > MAX_INPUT_ERROR_ALLOWED:
            raise TooManyInputException()
        try:
            dataChoice = input(Constants.inputDataChoice)
        except SyntaxError:
            continue
        if dataChoice == 1:
            data = DataLoader.MnistDataLoader()
            break
        elif dataChoice == 2:
            data = DataLoader.MnistRotated()
            break
        elif dataChoice == 3:
            data = DataLoader.MnistBackground()
            break
        elif dataChoice == 4:
            data = DataLoader.MnistRandomBackground()
            break
        elif dataChoice == 5:
            data = DataLoader.Cifar10DataLoader()
            break
        else:
            numInputErrors += 1
            print Constants.inputDataChoiceError
    return data
def main():
    kernel = c.COSINE

    # training parameter
    result_path = 'results/PB2_spam.acc'
    model_name = 'digits_' + kernel
    tr_data_path = 'data\\digits\\tr_f_l_10.pickle'
    te_data_path = 'data\\digits\\te_f_l_10.pickle'

    # load and preprocess training data
    tr_data = loader.load_pickle_file(tr_data_path)
    te_data = loader.load_pickle_file(te_data_path)

    # transpose label
    tr_data[1] = np.transpose(tr_data[1])[0]
    te_data[1] = np.transpose(te_data[1])[0]

    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, tr_data[0])
    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, te_data[0])

    # start training
    st = time.time()
    print('{:.2f} Start training.'.format(time.time() - st))
    for r in (0.15, 0.1):
        clf = kNN.kNN(kernel=kernel, dataset=c.DS_DIGITS)
        clf.fit(tr_data[0], tr_data[1])
        tr_pred = clf.predict(tr_data[0], r=r)
        te_pred = clf.predict(te_data[0], r=r)

        tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
        te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]

        print('{} Final results with kernel {} and r={}. Train acc: {}, Test acc: {}'.format(time.time() - st, kernel, r, tr_acc, te_acc))
def test(self, datasetGenerator, numImages):
    batchGenerator = DataLoader.oneHotWrapper(
        DataLoader.batchLoader(datasetGenerator, batchSize=1))
    results = self.model.evaluate_generator(batchGenerator, numImages)
    print("testing complete")
    for i in range(len(results)):
        print("\t{}: {}".format(self.model.metrics_names[i], results[i]))
def construct_dataloader_disk(): # Construct DataLoader opt_data_train = { #'data_h5': 'miniplaces_128_train.h5', 'data_root': '../../data/images/', # MODIFY PATH ACCORDINGLY 'data_list': '../../data/train.txt', # MODIFY PATH ACCORDINGLY 'load_size': load_size, 'fine_size': fine_size, 'data_mean': data_mean, 'randomize': True } opt_data_val = { #'data_h5': 'miniplaces_128_val.h5', 'data_root': '../../data/images/', # MODIFY PATH ACCORDINGLY 'data_list': '../../data/val.txt', # MODIFY PATH ACCORDINGLY 'load_size': load_size, 'fine_size': fine_size, 'data_mean': data_mean, 'randomize': False } loader_train = DataLoader.DataLoaderDisk(**opt_data_train) loader_val = DataLoader.DataLoaderDisk(**opt_data_val) return (loader_train, loader_val)
def __init__(self, class_num, batch_size, iters, learning_rate, param): self.ClassNum = class_num self.BatchSize = batch_size self.Iters = iters self.LearningRate = learning_rate self.target_loss_param = param[0] self.domain_loss_param = param[1] self.adver_loss_param = param[2] Data = DataLoader("office31", source="Amazon", target="Webcam") self.SourceData, self.SourceLabel = Data.LoadSource() self.TargetData, self.TestData, self.TestLabel = Data.LoadTarget() ####################################################################################### self.source_image = tf.placeholder(tf.float32, shape=[self.BatchSize, 227, 227, 3], name="source_image") self.source_label = tf.placeholder( tf.float32, shape=[self.BatchSize, self.ClassNum], name="source_label") self.target_image = tf.placeholder(tf.float32, shape=[self.BatchSize, 227, 227, 3], name="target_image") self.Training_flag = tf.placeholder(tf.bool, shape=None, name="Training_flag") self.KeepProb = tf.placeholder(tf.float32, name='keep_prob')
def run(): n = 100000 df_android = DataLoader.load_data(r'Data\df_Ready_Data2.csv') print('finish read data.') # df_android = DataPreProcessor.drop_columns(df_android, ['user_isp_new']) df_android = df_android.dropna(how='any') col_label_encode = ['user_state', 'user_isp', 'app_cat', 'app_domain'] df_android = DataEncoder.label_encoder(df_android, col_label_encode) col_one_hot_encode = [ 'device_maker', 'geo_location', 'day_of_week', 'part_of_day' ] # df_android = DataEncoder.encode_one_hot(df_android, list(df_android.columns[1:])) df_android = DataEncoder.encode_one_hot(df_android, col_one_hot_encode) print('finish encode data.') x_train, x_test, y_train, y_test = DataLoader.split_data( df_android, 'click') print('finish split to train and test') x_train, y_train = DataLoader.under_sampling_majority( x_train, y_train, 'click') print('finish sample data') print('start training classifiers...') for classifier_name, classifier in eval_classifiers.items(): trained_model = ModelProcessor.train_model(classifier_name, classifier, x_train, y_train) trained_models[classifier_name] = trained_model print('Finish train classifiers.') print('start evaluating classifiers...') for model_name, model in trained_models.items(): predictions = ModelProcessor.predict_samples(model_name, model, x_test) score = Evaluator.evaluate_performance_metric('auc', predictions, y_test) print(f'metric auc- score for [{model_name}]: [{score}]')
def main(): st = time.time() # training parameter result_path = 'results/PB2_A_spam_polluted_NB_Gaussian.acc' model_name = 'spam_' train_data_path = 'data/spam_polluted/train/data.pickle' test_data_path = 'data/spam_polluted/test/data.pickle' tr_data = loader.load_pickle_file(train_data_path) te_data = loader.load_pickle_file(test_data_path) print('{:.2f} Data loaded!'.format(time.time() - st)) # start training print('{:.2f} Building model...'.format(time.time() - st)) model = m.NBGaussian() model.build(tr_data[0], tr_data[1]) print('{:.2f} Predicting...'.format(time.time() - st)) tr_pred = model.predict(tr_data[0]) te_pred = model.predict(te_data[0]) print('{:.2f} Calculating results...'.format(time.time() - st)) tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0] te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0] print('{} Final results. Train acc: {}, Test acc: {}'.format(time.time() - st, tr_acc, te_acc)) result = {} result['TrainingAcc'] = tr_acc result['TestingAcc'] = te_acc # log the training result to file util.write_result_to_file(result_path, model_name, result, True)
def hillClimb(domain, costf): tmp = DataLoader.Result("") tmp.red = random.sample(RedBalls, 6) tmp.blue = random.sample(BlueBalls, 1)[0] while 1: neighbors = [] for j in range(len(domain)): if tmp.blue > domain[j].blue: n = DataLoader.Result("") n.blue = tmp.blue - 1 for i in range(len(tmp.red)): n.red[i] = tmp.red[i] - 1 neighbors.append(n) if tmp.blue < domain[j].blue: n = DataLoader.Result("") n.blue = tmp.blue + 1 for i in range(len(tmp.red)): n.red[i] = tmp.red[i] + 1 neighbors.append(n) current = costf(tmp, domain) best = current for j in range(len(neighbors)): cost = costf(neighbors[j], domain) if cost > best: best = cost tmp = neighbors[j] if best == current: break print best print tmp return tmp
def __set_stopwords(self):
    data_loader = DataLoader('./vietnamese-stopwords.txt')
    content = data_loader.read_file()
    stop_words_set = set(line.strip() for line in content.split("\n"))
    stop_words_set = set(stop_word.strip().replace(" ", "_") for stop_word in stop_words_set)
    self.stop_words = stop_words_set
def run_pipeline_with_pretrained_doc2vec(): all_documents = DataLoader.load_all_documents() adjacency_matrix_references_all_documents = DataLoader.load_adjacency_matrix_all_documents( ) # Load model model = DataLoader.load_model() # Runs HDBSCAN, returns a list of labels (a label for each documents. -1 == outlier) labels = Clustering.run_hdbscan(model=model, min_cluster_size=4, min_samples=4) # Extracts the documents which have been clustered such that we have no outliers # Mask denotes the ones to include and exclude. Labels of the clustered documents and the clustered documents mask, labels_subset, clustered_documents = Clustering.extract_clustered_documents( all_documents, labels) # Creates the adjacency matrix for references between clusters cluster_references_adjacency = Clustering.create_adjacency_matrix_for_clusters( mask=mask, labels=labels_subset, adjacency_references_all_documents= adjacency_matrix_references_all_documents) # k-nearest undirected adjacency cluster_references_adjacency = Clustering.make_adjacency_matrix_undirected( cluster_references_adjacency, k=3) DataLoader.save_data(cluster_references_adjacency, clustered_documents, labels_subset) # Creates the graph and sets up the interactive webpage showing the graph visualization.doc_to_vec_visualize(documents=clustered_documents, adj_matrix=cluster_references_adjacency, labels=labels_subset)
def loadData(dir,t,o,l): # Loading Twitter training features global tw_train_all global tw_train_all_recent global tw_train_base global tw_train_base_recent tw_train_all = dl.load_tw_train_all(dir,t,o,l) tw_train_all_recent = dl.load_tw_train_all_recent(dir,t,o,l) tw_train_base = dl.load_tw_train_base(dir,t,o,l) tw_train_base_recent = dl.load_tw_train_base_recent(dir,t,o,l) # Loading Youtube training features global yt_train_all global yt_train_all_recent global yt_train_base global yt_train_base_recent yt_train_all = dl.load_yt_train_all(dir,t,o,l) yt_train_all_recent = dl.load_yt_train_all_recent(dir,t,o,l) yt_train_base = dl.load_yt_train_base(dir,t,o,l) yt_train_base_recent = dl.load_yt_train_base_recent(dir,t,o,l) # Loading Labeling data global popular_train global popular_train_recent global viral_train global viral_train_recent labeled = dl.load_labeling(dir,t,o,l) popular_train = np.array(labeled[0]) viral_train = np.array(labeled[1]) # Create viral and popular list global vap_train vap_train=[] for i in range(popular_train.size): if ((popular_train[i]==1) and (viral_train[i]==1)): vap_train.append(1) else: vap_train.append(0) vap_train = np.array(vap_train) labeled_recent = dl.load_labeling_recent(dir,t,o,l) popular_train_recent = np.array(labeled_recent[0]) viral_train_recent = np.array(labeled_recent[1]) # Create viral and popular list global vap_train_recent vap_train_recent = [] for i in range(popular_train_recent.size): if ((popular_train_recent[i]==1) and (viral_train_recent[i]==1)): vap_train_recent.append(1) else: vap_train_recent.append(0) vap_train_recent = np.array(vap_train_recent)
def build_model(training_data, config): ''' Build model from the config and training data ''' m_type = config[c.CLSFR_TYPE] if m_type == c.DT_WITH_IG: # for decision tree # load thresholds threshs = loader.load_arrays(config[c.THRESHS]) tree = Tree.Tree() tree.build(utils.split_on_ig, training_data[0], training_data[1], threshs, config[c.TERM_CON], int(config[c.TERM_THRESH])) return tree elif m_type == c.REGRESSION_TREE: # for regression tree # load thresholds threshs = loader.load_arrays(config[c.THRESHS]) tree = Tree.Tree() tree.build(utils.split_on_mse, training_data[0], training_data[1], threshs, config[c.TERM_CON], float(config[c.TERM_THRESH])) return tree elif m_type == c.REGRESSION: # for linear regression reg_model = rmodel.Regression() reg_model.build(training_data[0], training_data[1]) return reg_model
def test_update(self):
    print("This program was created by Phuong Pham to test updating a dataset record")
    record = DataLoader.readRecord(
        self.data, 1)  # read the first record from the dataset
    DataLoader.updateRecord(
        record, "Value", 100)  # set the field 'Value' to the new value 100
    self.assertEqual(record['Value'], 100)  # the field 'Value' should now hold the updated value
def __init__(self, windowSize, paramIndex, threshold):
    self.windowSize = windowSize  # 1 = 5 min
    self.maxEpoch = 0
    self.paramIndex = paramIndex
    self.learningRate = 0
    self.threshold = threshold
    self.normalData = DataLoader.NormalDataLoader(self.paramIndex, 'train')
    self.unstableData = DataLoader.UnstableDataLoader(self.paramIndex, 'test')
def test(): # laod and preprocess training data # tr_data = loader.load_pickle_file(tr_data_path) te_data= loader.load_pickle_file(te_data_path) model = loader.load_pickle_file(model_path) # te_pred_dict = loader.load_pickle_file(te_pred_dict_path) test_pred_dict = {} for i in range(9): test_pred_dict[i] = {} for j in range(i + 1, 10): if i == j: continue # get training data for this class clf = model[i][j] te_pred = clf.predict(te_data[0]) test_pred_dict[i][j] = te_pred te_n = len(te_data[1]) te_pred = np.zeros((1, te_n))[0] for i in range(te_n): votes = np.zeros((10,), dtype=np.int) for j in range(9): for k in range(j): votes[j] += 1 if test_pred_dict[k][j][i] == -1 else 0 for kk in test_pred_dict[j]: votes[j] += 1 if test_pred_dict[j][kk][i] == 1 else 0 count = np.bincount(votes) if count[-1] == 1: te_pred[i] = votes.argmax() else: te_pred[i] = votes.argmax() tie_ind = [votes.argmax()] cc = 0 for ind_v, v in enumerate(votes): if v == votes.max(): if cc == 1: tie_ind.append(ind_v) break else: cc += 1 te_pred[i] = tie_ind[0] if test_pred_dict[tie_ind[0]][tie_ind[1]][i] == 1 else tie_ind[1] print('{} Tie! {} wins.'.format(count[-1], te_pred[i])) acc = 0 acc_n = 0 for ind_l, l in enumerate(te_data[1]): acc += 1 if l == te_pred[ind_l] else 0 acc /= te_n # acc = (te_data[1] == te_pred).sum() / te_n print('Acc: {}'.format(acc))
def __init__(self, Sfilename, model='Net1_FFN_v7', verbose=False): self.index = 0 self.settingfuncs = [self.setting1, self.setting2, self.setting3, self.setting4, self.setting5, self.setting6, self.setting7, self.setting8] if isinstance(model, str): self.modelname = model self.model = FFN(model) self.model.Load(verbose=verbose) elif isinstance(model, FFN): self.model = model self.modelname = self.model.name mask1, pmask1 = self.model.apply_mask(Sfilename) rgb, self.TitleStr = Vis.FalseColour(Sfilename, False) scn = DL.scene_loader(Sfilename) scn.load(['bayes_in', 'probability_cloud_single_in']) bmask = DL.upscale_repeat(scn['bayes_in'].values).astype('int') bmask = 1 - ((bmask & 2) / 2) bpmask = DL.upscale_repeat(scn['probability_cloud_single_in'].values) self.im1 = plt.imshow(rgb) plt.title('False colour image\n' + self.TitleStr) self.im2 = plt.imshow(mask1, cmap='Blues') self.im2.set_visible(False) bmask = DL.extract_mask(Sfilename, 'bayes_in', 2) self.im3 = plt.imshow(bmask, cmap='Reds') self.im3.set_visible(False) mask1 = mask1.astype('bool') temp = np.copy(rgb) temp[~mask1, :] = 254 / 255, 253 / 255, 185 / 255 self.im4 = plt.imshow(temp) self.im4.set_visible(False) rgb[mask1, :] = 74 / 255, 117 / 255, 50 / 255 self.im5 = plt.imshow(rgb) self.im5.set_visible(False) self.im6 = plt.imshow(1 - pmask1, cmap='Oranges') self.im6.set_visible(False) self.im7 = plt.imshow(1 - bpmask, cmap='Reds') self.im7.set_visible(False) maskdiff = bmask - mask1 self.im8 = plt.imshow(maskdiff, cmap='bwr') self.im8.set_visible(False) self.cbset = False self.cb = None
def get_cached_by_extent(self, fn, extent, buffer): geom = DataLoader.extent_to_transformed_geom(extent, "epsg:2794") geom = shapely.geometry.shape(geom) new_fn = None for i, boundary_shape in enumerate(highres_boundary_shapes): if boundary_shape.contains(geom): new_fn = highres_fns[i] break if new_fn is None: print("No intersections") new_fn = fn new_fn = new_fn.replace("esri-naip/", "full-usa-output/1_3_2019/")[:-4] + "_prob.tif" f = rasterio.open(new_fn, "r") geom = DataLoader.extent_to_transformed_geom(extent, f.crs["init"]) pad_rad = 15 # TODO: this might need to be changed for much larger inputs buffed_geom = shapely.geometry.shape(geom).buffer(pad_rad) minx, miny, maxx, maxy = buffed_geom.bounds geom = shapely.geometry.mapping(shapely.geometry.box(minx, miny, maxx, maxy, ccw=True)) out_image, out_transform = rasterio.mask.mask(f, [geom], crop=True, nodata=-1) src_crs = f.crs.copy() f.close() dst_crs = {"init": "EPSG:%s" % (extent["spatialReference"]["latestWkid"])} dst_transform, width, height = rasterio.warp.calculate_default_transform( src_crs, dst_crs, width=out_image.shape[2], height=out_image.shape[1], left=buffed_geom.bounds[0], bottom=buffed_geom.bounds[1], right=buffed_geom.bounds[2], top=buffed_geom.bounds[3], resolution=1 ) dst_image = np.zeros((out_image.shape[0], height, width), np.uint8) rasterio.warp.reproject( source=out_image, destination=dst_image, src_transform=out_transform, src_crs=src_crs, dst_transform=dst_transform, dst_crs=dst_crs, resampling=rasterio.warp.Resampling.nearest ) # Calculate the correct padding w = extent["xmax"] - extent["xmin"] padding = int(np.round((dst_image.shape[1] - w) / 2)) dst_image = np.rollaxis(dst_image, 0, 3) dst_image = dst_image[padding:-padding, padding:-padding, :] return dst_image / 255.0, "highres_prob_predictions_quantized_compressed_5_11_2018"
def __init__(self, windowSize, maxEpoch, paramIndex, learningRate, threshold):
    self.windowSize = windowSize
    self.maxEpoch = maxEpoch
    self.paramIndex = paramIndex
    self.learningRate = learningRate
    self.threshold = threshold
    self.embeddingDim = 128
    self.normalData = DataLoader.NormalDataLoader(self.paramIndex, 'train')
    self.unstableData = DataLoader.UnstableDataLoader(self.paramIndex, 'test')
    self.wantToShuffle = False
    self.statistics = {}
def main():
    parser = GooeyParser(prog="example_progress_bar_1")
    parser.add_argument('FolderChooser', help="name of the file to process", widget='DirChooser')
    parser.add_argument('FolderDist', help="name of the file to process", widget='DirChooser')
    args = parser.parse_args(sys.argv[1:])

    # Loading directory data
    dl.start(args.FolderChooser, args.FolderDist)
def setUp(self):
    if not hasattr(self, 'students'):
        self.students = DataLoader.load_students_from_file(
            'resources/students.json')
        self.classes = DataLoader.load_classes_from_file(
            'resources/classes.json')
        self.students2 = DataLoader.load_students_from_file(
            'resources/students2.json')
        self.classes2 = DataLoader.load_classes_from_file(
            'resources/classes2.json')
        self.students3 = DataLoader.load_students_from_file(
            'resources/students3.json')
def get_cs(data_path, cs_path):
    # dp compute cheat sheet: load from cache if present, otherwise compute and save
    cs = None
    if os.path.isfile(cs_path):
        cs = loader.load_pickle_file(cs_path)
        print('CS loaded.')
    else:
        print('Start compute cs.')
        data = loader.load_pickle_file(data_path)
        cs = dp_compute_cs(data[0])
        loader.save(cs_path, cs)
        print('CS saved.')
    return cs
def start(port=5000): ql.initialize_static_questions(TREE) app.run(port=port) request_endpoint = '{0}/me/messenger_profile'.format(bot.graph_url) response = requests.post( request_endpoint, params=bot.auth_args, data=json.dumps({"get_started": {"payload": "first"}}), headers={'Content-Type': "application/json"} ) result = response.json() Bot.send_raw(response)
def LoadData():
    Data = DL.GetData(transform=False, includeDraw=False)
    TData = []
    bits = 7
    for entry in Data:
        reshaped = TL.FENtoBits(entry[0], bits, True)
        reshaped = np.reshape(reshaped, (8, 8, bits))
        TData.append([reshaped, entry[1]])
    TData = DL.StartifiedData(TData)
    return TData
def pred_patch(): ''' Method called for POST `/predPatch`''' bottle.response.content_type = 'application/json' data = Dict(bottle.request.json) data["remote_address"] = bottle.request.client_ip SESSION_HANDLER.get_session(bottle.request.session.id).add_entry( data) # record this interaction # Inputs extent = data.extent dataset = data.dataset name_list = [item.name for item in dataset.class_list] # Load the input data sources for the given tile if dataset.metadata.id not in DATASETS: raise ValueError( "Dataset doesn't seem to be valid, do the datasets in js/tile_layers.js correspond to those in TileLayers.py" ) loaded_query = DATASETS[ dataset.metadata.id]["data_loader"].get_data_from_extent(extent) SESSION_HANDLER.get_session( bottle.request.session.id).current_transform = ( loaded_query["src_crs"], loaded_query["src_transform"]) # Run a model on the input data model = SESSION_HANDLER.get_session(bottle.request.session.id).model output = model.run(loaded_query["src_img"], extent, False) loaded_query["src_img"] = None # save memory assert len( output.shape ) == 3, "The model function should return an image shaped as (height, width, num_classes)" assert ( output.shape[2] < output.shape[0] and output.shape[2] < output.shape[1] ), "The model function should return an image shaped as (height, width, num_classes)" # assume that num channels is less than img dimensions # Warp output to EPSG:3857 output, output_bounds = DL.warp_data_to_3857(output, loaded_query["src_crs"], loaded_query["src_transform"], loaded_query["src_bounds"]) # ------------------------------------------------------ # Step 5 # Convert images to base64 and return # ------------------------------------------------------ img_soft = np.round(utils.class_prediction_to_img(output)).astype(np.uint8) data["output_soft"] = DL.encode_rgb(img_soft) bottle.response.status = 200 return json.dumps(data)
def random_select_data(tr_save_path, sel_tr_save_path, percent):
    all_tr = loader.load_pickle_file(tr_save_path)
    tr_l_ind_dict = {}
    selected_tr_data = [[], []]
    for i in range(10):
        tr_l_ind_dict[i] = [l_ind for l_ind, l in enumerate(all_tr[1]) if l == i]
    for i in range(10):
        i_n = len(tr_l_ind_dict[i])
        pick_n = int(percent * i_n)
        cur_pick_ind = np.random.choice(tr_l_ind_dict[i], pick_n, replace=False).tolist()
        selected_tr_data[0].extend([x for x_ind, x in enumerate(all_tr[0]) if x_ind in cur_pick_ind])
        selected_tr_data[1].extend([y for y_ind, y in enumerate(all_tr[1]) if y_ind in cur_pick_ind])
    loader.save(sel_tr_save_path, selected_tr_data)
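# Self-contained illustration of the sampling primitive used above: with
# replace=False, np.random.choice draws distinct indices, so each class
# contributes at most `percent` of its samples (values below are demo data only):
import numpy as np

demo_indices = [4, 8, 15, 16, 23, 42]
picked = np.random.choice(demo_indices, 3, replace=False).tolist()
print(picked)  # e.g. [16, 4, 42] -- three distinct entries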
def load_dataset(dataset, architecture, batch_size, device, path):
    if dataset == "imdb":
        if architecture == "cnn":
            data = DataLoader.IMDB_CNN_CUSTOM(batch_size, device, path)
        elif architecture == "lstm":
            data = DataLoader.IMDB_LSTM(batch_size, device, path)
    elif dataset == "agnews":
        data = DataLoader.AGNEWS(batch_size, device, path)
    else:
        raise ValueError(dataset + " is not supported")
    return data
def evaluate_data(self): for ii, batch in enumerate(self.batch_data_dict): print("[Batch %s]: %d / %d" % (batch, ii + 1, self.total_batches)) data_dict = self.batch_data_dict[batch] for job_id, job in enumerate(data_dict): self.agents = {} path, format = data_dict[job]['path'], data_dict[job][ 'data_format'] if "log" in path.split("/")[-1]: continue data_loader = DataLoader(path, format) data_scale = format_dict[format]['data_type_scale'] capture_freq = format_dict[format]['capture_freq'] prev_time = 0 curr_time = 0 time_frame_done = False normalized_time_step = 0 curr_agents = {} for _ in tqdm(range(data_loader.total_time_steps + 1)): if data_loader.done: for evaluation_metric in self.evaluation_metrics[ batch][job_id]: evaluation_metric.evaluate(self.agents, curr_agents, prev_time) break data = data_loader.step_data() time_step, agent_id, x, y = self.parse_into_state( data, format) curr_time = time_step if ((curr_time != prev_time and _ != 0) or _ == data_loader.total_time_steps): if normalized_time_step > ignored_time_steps: for evaluation_metric in self.evaluation_metrics[ batch][job_id]: evaluation_metric.evaluate( self.agents, curr_agents, prev_time) curr_agents = {} normalized_time_step += 1 if agent_id not in self.agents: self.agents[agent_id] = Agent(agent_id, [x, y], curr_time, capture_freq) else: self.agents[agent_id].update_state([x, y], curr_time) curr_agents[agent_id] = self.agents[agent_id] prev_time = curr_time self.write_stats(job_id, job, batch) self.write_batch_stats(batch) self.df_dict = {}
def main(): is_sklearn = False # kernel = c.COSINE # kernel = c.GAUSSIAN kernel = c.POLY # training parameter result_path = 'results/PB2_spam.acc' model_name = 'digits_' + kernel model_path = 'data/PB1_B_digits_sk_Gaussian_1.model' # tr_data_path = 'data\\digits\\tr_f_l.pickle' # te_data_path = 'data\\digits\\te_f_l.pickle' tr_data_path = 'data\\digits\\tr_f_l_10.pickle' te_data_path = 'data\\digits\\te_f_l_10.pickle' # laod and preprocess training data tr_data = loader.load_pickle_file(tr_data_path) te_data = loader.load_pickle_file(te_data_path) # transpose label tr_data[1] = np.transpose(tr_data[1])[0] te_data[1] = np.transpose(te_data[1])[0] Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, tr_data[0]) Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, te_data[0]) # start training models = [] st = time.time() # start training print('{:.2f} Start training.'.format(time.time() - st)) for k in (1, 3, 7): if not is_sklearn: clf = kNN.kNN(kernel=kernel) clf.fit(tr_data[0], tr_data[1]) tr_pred = clf.predict(tr_data[0], k=k) te_pred = clf.predict(te_data[0], k=k) else: clf = KNeighborsClassifier(n_neighbors=k, metric=cosine_distances) clf.fit(tr_data[0], tr_data[1]) tr_pred = clf.predict(tr_data[0]) te_pred = clf.predict(te_data[0]) tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0] te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0] models.append(clf) print('{} Final results with kernel {} and k={}. Train acc: {}, Test acc: {}'.format(time.time() - st, kernel, k, tr_acc, te_acc))
def __init__(self, run=True):
    if run == True:
        self.loader = DataLoader()
        while True:
            t = datetime.datetime.now().strftime("%H:%M:%S")
            if not ((t > "09:30:00" and t < "11:30:00") or (t > "13:00:00" and t < "15:00:00")):
                print "Not Trading Time, Resting ......"
            else:
                self.loader.load()
                m = self.monitor(self.loader.option_rt,
                                 self.loader.index_future_plus_dvd(self.loader.index_future_rt),
                                 self.loader.etf_index_rt)
                self.display(m, "Option Complex Monitor")
            time.sleep(10)
def update_price_value(exchanges, symbol, freq): if symbol is None: raise PreventUpdate start_dt = None end_dt = None df = DataLoader.read_data(symbol, freq, start_dt, end_dt) df_summary = dashlib.generate_summary_table(df, symbol) data = df_summary.to_dict('records') columns = [{'name': k.capitalize(), 'id': k} for k in data[0].keys()] fig = go.Figure(data=[ go.Candlestick(x=df.index, open=df['open'], high=df['high'], low=df['low'], close=df['close'], increasing_line_color='red', decreasing_line_color='green') ]) fig.update_layout(title=symbol + ' ' + 'klines', xaxis_rangeslider_visible=True, height=600) return fig, columns, data
def app_main(): try: logger = lgr.Logger() print("--------application starting--------------") logger.info("--------application starting--------------") print("--------loading data----------------------") logger.info("--------loading data----------------------") dl = dataLoader.DataLoader() train_X, test_X = train_test_split(dl.DataFrame.copy(), test_size=0.3, random_state=42) # label = train_X["median_house_value"].copy() # housing = train_X.drop("median_house_value", axis=1) housing = test_X.drop("median_house_value", axis=1) #label_test = test_X["median_house_value"].copy() numeric_cols = list(housing.columns.values) category_cols = ["ocean_proximity"] numeric_cols.remove("ocean_proximity") dp = preproc.DataPreProcess(numeric_cols, category_cols) print("--------processing data-------------------") dataProcessed = dp.getProcessedData(housing) print(dataProcessed.shape) configMagt = cmfmagt.ConfigManager() engine = mlEngine.ModelEngine() print("------------loading model------------------") bestModel = engine.loadML(configMagt.config["APPSETTING"]["ml_path"]) result = bestModel.predict(dataProcessed) print(result.shape) except Exception as e: print("Error : ", str(e))
def getinputsCNN(Sreference, indices): row = (indices / 3000).astype(int) col = (indices % 3000).astype(int) if type(Sreference) == str: scn = DL.scene_loader(Sreference) else: scn = Sreference scn.load(['S1_an']) S1 = np.nan_to_num(scn['S1_an'].values) data = [] for i in range(len(row)): coords = get_coords(row[i], col[i], 50, True) star = [] for arm in coords: if len(arm) > 0: arm = np.array(arm) arm_row = arm[:, 0] arm_col = arm[:, 1] arm_data = S1[arm_row, arm_col] star.append(arm_data) else: star.append([]) data.append(star) return data
def get_distance(directory):
    '''returns the distance in m'''
    file = directory + '/info.dat'
    info = dl.load(file)[0]
    distancestr = info['Distance']
    distance = 0.01 * float(distancestr.replace('cm', ''))
    return distance
def predict(): """ An example of how to load a trained model and use it to predict labels. """ # load the saved model classifier = pickle.load(open("best_model.p", "rb")) # compile a predictor function predict_model = theano.function( inputs=[classifier.input], outputs=classifier.y_pred) # We can test it on some examples from test test dataset = 'mnist_train.csv' datasets = DataLoader.load_kaggle_mnist(dataset) test_set_x, test_set_y = datasets[2] print(type(test_set_x)) print(type(test_set_y)) test_set_x = test_set_x.get_value() test_set_y = test_set_y.eval() predicted_values = predict_model(test_set_x[20:30]) print("Sample Neural Prediction") print ("Predicted values for the first 20 examples in test set:") print(predicted_values) print ("The actual values are") print(test_set_y[20:30])
def update_volume_value(symbol): if symbol is None: raise PreventUpdate start_dt = None end_dt = None df = DataLoader.read_data(symbol, 'D', start_dt, end_dt) volume_fig_dict = dict({ 'data': [{ 'x': df.index, 'y': df['volume'], 'type': 'bar', 'name': symbol }], 'layout': { 'title': symbol + ' volume' } }) oi_fig_dict = dict({ 'data': [{ 'x': df.index, 'y': df['close_oi'] - df['open_oi'], 'type': 'bar', 'name': symbol }], 'layout': { 'title': symbol + ' net_oi' } }) volume_fig = go.Figure(volume_fig_dict) oi_fig_fig = go.Figure(oi_fig_dict) return volume_fig, oi_fig_fig
def get_balanced_dataset(in_memory=False, TMP_WHOLE_UNBALANCED=False): from ActiveLearning.LargeDatasetHandler_AL import LargeDatasetHandler_AL import Settings # init structures import mock args = mock.Mock() args.name = "test" settings = Settings.Settings(args) WholeDataset = LargeDatasetHandler_AL(settings) # load paths of our favourite dataset! import DataLoader, DataPreprocesser, Debugger import DatasetInstance_OurAerial dataLoader = DataLoader.DataLoader(settings) debugger = Debugger.Debugger(settings) #h5_file = settings.large_file_folder + "datasets/OurAerial_preloadedImgs_subBAL3.0_1.0_sel2144_res256x256.h5" h5_file = settings.large_file_folder + "datasets/OurAerial_preloadedImgs_subBAL3.0_1.0_sel2144_res256x256_SMALLER.h5" datasetInstance = DatasetInstance_OurAerial.DatasetInstance_OurAerial( settings, dataLoader, "256_cleanManual") if not TMP_WHOLE_UNBALANCED: # ! this one automatically balances the data + deletes misfits in the resolution data, paths = datasetInstance.load_dataset() lefts_paths, rights_paths, labels_paths = paths print("Paths: L,R,Y ", len(lefts_paths), len(rights_paths), len(labels_paths)) else: # ! this one loads them all (CHECK: would some be deleted?) paths = datasetInstance.load_dataset_ONLY_PATHS_UPDATE_FROM_THE_OTHER_ONE_IF_NEEDED( ) lefts_paths, rights_paths, labels_paths = paths print("Paths: L,R,Y ", len(lefts_paths), len(rights_paths), len(labels_paths)) WholeDataset.initialize_from_just_paths(paths) if in_memory: assert not TMP_WHOLE_UNBALANCED #WholeDataset.keep_it_all_in_memory() WholeDataset.keep_it_all_in_memory(h5_file) npy_path = settings.large_file_folder + "datasets/OurAerial_preloadedImgs_BALCLASS.npy" I_WANT_TO_RECOMPUTE_THE_LABELS = False if I_WANT_TO_RECOMPUTE_THE_LABELS: assert False # don't want to mistakenly recompute these ... WholeDataset.compute_per_tile_class_in_batches() WholeDataset.save_per_tile_class(npy_path) WholeDataset.load_per_tile_class(npy_path) WholeDataset.report() return WholeDataset
def compute_feature_mean(features, save_path):
    n, d = np.shape(features)
    means = []
    for i in range(d):
        cur_f = features[:, i]
        means.append(np.nanmean(cur_f))
        # cur_mean = 0
        # for f in features:
        #     if not np.isnan(f[i]):
        #         cur_mean += f[i]
        # means.append(cur_mean / n)
    means = np.array(means)
    loader.save(save_path, means)
    return means
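# Quick, self-contained check of the np.nanmean behaviour relied on above:
# missing values are skipped per column rather than propagated as NaN.
import numpy as np

demo = np.array([[1.0, np.nan],
                 [3.0, 4.0]])
print(np.nanmean(demo[:, 0]))  # 2.0
print(np.nanmean(demo[:, 1]))  # 4.0 -- the NaN entry is ignored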
def train(self, X, y, model, batch_generator, n_epochs=50, optim_algo='adam', criterion='categorical_crossentropy', save_model=True, verbose=2, plot=True, batch_size=64,): if optim_algo == 'adam': optim_algo = Adam() elif optim_algo == 'sgd': optim_algo = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True) elif optim_algo == 'adagrad': optim_algo = Adagrad() self.model.compile(optimizer=optim_algo, loss=criterion) loss_train_history = [] loss_val_history = [] batch_history = {'f1': [], 'recall': [], 'precision': []} for epoch in range(1, n_epochs + 1): batch_f1_history = [] batch_precision_history = [] batch_recall_history = [] for X, y in batch_generator.next_batch(): history = self.model.fit(X, y, nb_epoch=1, batch_size=batch_size, validation_split=0.2, verbose=0) val_loss, loss = history.history['val_loss'][0], history.history['loss'][0] loss_train_history.append(loss) loss_val_history.append(val_loss) truth = self.model.validation_data[3] truth = dl.onehot2list(truth) batch_prediction = self.predict_classes(self.model.validation_data[0:3]) batch_f1 = metrics.f1_score(truth, batch_prediction) batch_recall = metrics.recall_score(truth, batch_prediction) batch_precision = metrics.precision_score(truth, batch_prediction) batch_f1_history.append(batch_f1) batch_recall_history.append(batch_recall) batch_precision_history.append(batch_precision) batch_history['f1'].append(batch_f1_history) batch_history['recall'].append(batch_recall_history) batch_history['precision'].append(batch_precision_history) print('Epoch: {} | Train loss: {} | Valid loss: {}'.format(epoch, loss, val_loss)) print("Epoch Metrics | F1: {} | Recall {} | Precision: {}".format(np.mean(batch_history['f1'][epoch - 1]), np.mean(batch_history['recall'][epoch - 1]), np.mean(batch_history['precision'][epoch - 1]))) a_max = np.argmax(batch_history['f1'][epoch - 1]) print("Best F1 at Epoch {} Minibatch {}: {}\n".format(epoch, a_max, batch_history['f1'][epoch-1][a_max])) if save_model: self.model.save_weights(self.model_name + '.h5', overwrite=True)
def main():
    st = time.time()

    # training parameter
    result_path = 'results/PB4_spam_polluted_missing_NB_Bern.acc'
    model_name = 'spam_'
    mean_path = 'data/spam_polluted_missing/train/f_mean.pickle'
    train_data_path = 'data/spam_polluted_missing/train/data.pickle'
    test_data_path = 'data/spam_polluted_missing/test/data.pickle'

    # load and preprocess training data
    tr_data = loader.load_pickle_file(train_data_path)
    te_data = loader.load_pickle_file(test_data_path)
    print('{:.2f} Data loaded!'.format(time.time() - st))

    # load means
    means = loader.load_pickle_file(mean_path)
    print('{:.2f} Means loaded!'.format(time.time() - st))

    # start training
    roc = []
    auc = 0.0
    tr_n, f_d = np.shape(tr_data[0])
    te_n, = np.shape(te_data[1])
    te_auc = 2.
    round = 0
    model = m.NBBernoulli(means)
    model.build(tr_data[0], tr_data[1])

    training_acc = model.test(tr_data[0], tr_data[1], util.acc)
    # training_cms.append(training_test_res[1])
    testing_acc = model.test(te_data[0], te_data[1], util.acc)
    # testing_cms.append(testing_test_res[1])
    print('Final results. Train acc: {}, Test acc: {}'.format(training_acc, testing_acc))

    result = {}
    result['TrainingAcc'] = training_acc
    result['TestingAcc'] = testing_acc

    # log the training result to file
    util.write_result_to_file(result_path, model_name, result, True)
def ecoc_test():
    svms = loader.load_pickle_file(model_path)
    te_data = loader.load_pickle_file(te_data_path)
    pred = []
    for f in te_data[0]:
        min_hamming_dist = 1.
        match_label = 0
        code = []
        for s in svms:
            c_pred = s.predict([f])[0]
            code.append(1 if c_pred == 1 else 0)  # replace -1 with 0
        for ind, c in enumerate(ecoc):
            cur_hd = hamming(c, code)
            if cur_hd < min_hamming_dist:
                min_hamming_dist = cur_hd
                match_label = ind
        pred.append(match_label)
    return (pred == te_data[1]).sum() / len(te_data[1])
def abstract_features(data_path, cs_path, rects_path, res_path):
    # get cs
    cs = get_cs(data_path, cs_path)
    rects = loader.load_pickle_file(rects_path)

    # 2 features for each rectangle
    features = []
    for i, ccs in enumerate(cs):
        f = []
        for rect in rects:
            f.extend(compute_feature_with_cs(rect, ccs))
        features.append(f)
        print('{} rects finished.'.format(i))

    # combine with labels
    label = loader.load_pickle_file(data_path)[1]
    f_l = [np.array(features), label]
    loader.save(res_path, f_l)
    return f_l
def create_filename(directory, num=1):
    """
    creates basic name for a file containing condition and distance information.
    Use num to assign numbers if more than one file per distance will be created
    """
    file = directory + '/info.dat'
    info = dl.load(file)[0]
    distancestr = info['Distance']
    conditionstr = info['Condition']
    speciesName = 'Pholidoptera_littoralis_'
    return speciesName + conditionstr + '_' + distancestr + '_' + str(num)
def get_ecoc(ecoc_path, num_ecoc, class_num):
    if path.isfile(ecoc_path):
        print('Loading the ecoc...')
        best_ecoc = loader.load_pickle_file(ecoc_path)
    else:
        print('Creating the ecoc...')
        best_ecoc = [0, [], []]  # distance, ecoc for training, ecoc for predicting
        for _ in range(100):
            n = int(math.pow(2, num_ecoc))
            codes = choice(n, class_num)
            ecoc_func_codes = []
            for i in range(num_ecoc):
                ecoc_func_codes.append([])
            c_ecoc = []
            for c in codes:
                # zero-padded binary string of width num_ecoc
                bin_s = ('{0:0' + str(num_ecoc) + 'b}').format(c)
                bin_s = [int(ss) for ss in bin_s]
                c_ecoc.append(bin_s)
                for i in range(num_ecoc):
                    ecoc_func_codes[i].append(bin_s[i])
            c_hamming_dist = 0
            has_same_code = False
            for j in range(len(c_ecoc)):
                for k in range(len(c_ecoc)):
                    if j != k:
                        c_hd = hamming(c_ecoc[j], c_ecoc[k])
                        if c_hd == 0:
                            has_same_code = True
                        c_hamming_dist += c_hd
            if has_same_code:
                continue
            if c_hamming_dist > best_ecoc[0]:
                best_ecoc[0] = c_hamming_dist
                best_ecoc[1] = ecoc_func_codes
                best_ecoc[2] = c_ecoc
        # serialize the best ecoc
        loader.save(ecoc_path, best_ecoc)
    return best_ecoc
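# Sanity check of the hamming() helper assumed above (if it is
# scipy.spatial.distance.hamming, it returns the fraction of positions that
# differ, so candidate code words are compared on a 0..1 scale):
from scipy.spatial.distance import hamming

print(hamming([1, 0, 1, 1], [1, 1, 1, 0]))  # 0.5 -> 2 of the 4 bits differ
print(hamming([1, 0, 1, 1], [1, 0, 1, 1]))  # 0.0 -> identical code words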
def random_select_rectangle(h, w, n, pl, ph, save_path=None):
    '''
    :param h: height of the image in pixels
    :param w: width of the image in pixels
    :param n: number of rectangles
    :param pl: minimum area (in pixels) of each rectangle
    :param ph: maximum area (in pixels) of each rectangle
    :return: list of rectangles as ((top, left), (bottom, right)) corner pairs
    '''
    sel_rects = []
    for i in range(n):
        a = -1
        while a < pl or a > ph:
            p1 = (random.randint(0, h - 1), random.randint(0, w - 1))
            p2 = (random.randint(0, h - 1), random.randint(0, w - 1))
            a = rect_area(p1, p2)
        sel_rects.append(((min(p1[0], p2[0]), min(p1[1], p2[1])), (max(p1[0], p2[0]), max(p1[1], p2[1]))))
    if save_path is not None:
        loader.save(save_path, sel_rects)
    return sel_rects
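# Hedged usage sketch for random_select_rectangle (rect_area(), loader and the
# output path are assumed from the surrounding module; numbers are illustrative):
#
#     rects = random_select_rectangle(28, 28, 100, 130, 170,
#                                     save_path='data/rects.pickle')
#     # -> 100 axis-aligned rectangles, each covering 130-170 pixels of a 28x28 image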
def predict_main(classifier_pickle):
    data = DataLoader.load_kaggle_mnist("mnist_train.csv", neural=False)
    X = numpy.array(data[2][0])
    X = X / 255.0 * 2 - 1
    Y = numpy.array(data[2][1])
    predictor = MLutil.Predictor(classifier_pickle, 'SVM')
    predicted_values = predictor.make_prediction(X)
    predAnalysis = MLutil.PredictionAccuracies(predicted_values, Y)
    print(predAnalysis.get_misclass_rate())
    print(predAnalysis.get_indicies_misclassifications())
    pickle.dump(predAnalysis.get_indicies_misclassifications(), open("svm_indicies.p", "wb"))
    return predAnalysis.get_indicies_misclassifications()
def ecoc():
    # training parameter
    c = 0.001
    tol = 0.01
    epsilon = 0.001
    # kernel = 'rbf'
    kernel = 'linear'

    # load and preprocess training data
    print('Loading data...')
    tr_data = loader.load_pickle_file(tr_data_path)
    te_data = loader.load_pickle_file(te_data_path)

    # randomly generate ECOC of 50 functions
    num_ecoc = 10
    class_num = 10
    best_ecoc = util.get_ecoc(ecoc_path, num_ecoc, class_num)

    # train 10 svm
    print('Begin training...')
    svms = []  # list of svm classifiers
    function_tr_err = []

    sst = time.time()
    for ind, c_ecoc in enumerate(best_ecoc[1]):
        st = time.time()
        # prepare label
        c_label = [-1 if c_ecoc[l] == 0 else 1 for l in tr_data[1]]
        clf = svm.SVM(C=c, tol=tol, epsilon=epsilon, kernel=kernel)
        clf.fit(tr_data[0], c_label)
        tr_pred = clf.predict(tr_data)
        tr_acc = (c_label == tr_pred).sum() / tr_data[0].shape[0]
        print('{} Function {} done. Final results. Train acc: {}'.format(time.time() - st, ind, tr_acc))
        svms.append(clf)
    print('{} Training finished.'.format(time.time() - sst))
    loader.save(model_path, svms)
def main():
    # training parameter
    result_path = 'results/PB1_B_digits.acc'
    model_name = 'digits_'
    threshes_path = 'data/spambase.threshes'
    tr_data_path = 'data\\digits\\tr_f_l_10r.pickle'
    te_data_path = 'data\\digits\\te_f_l_10r.pickle'

    # load and preprocess training data
    tr_data = loader.load_pickle_file(tr_data_path)
    te_data = loader.load_pickle_file(te_data_path)

    # transpose label
    tr_data[1] = np.transpose(tr_data[1])[0]
    te_data[1] = np.transpose(te_data[1])[0]

    # start training
    # kernel = 'poly'
    kernel = 'linear'
    tol = 0.01
    c = 0.01

    st = time.time()
    print('{} Start training. Kernel: {}'.format(time.time() - st, kernel))

    # clf = svm.SVC(kernel='poly')
    clf = svm.SVC(C=c, kernel=kernel, tol=tol)
    # clf = svm.NuSVC(kernel=kernel)
    clf.fit(tr_data[0], tr_data[1])
    tr_pred = clf.predict(tr_data[0])
    te_pred = clf.predict(te_data[0])

    tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
    te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]

    print('{} Final results. Train acc: {}, Test acc: {}'.format(time.time() - st, tr_acc, te_acc))
def main(): try: opts, args = getopt.getopt(sys.argv[1:], '', ['max_words_abstract=', 'max_words_title=', 'max_words_mesh=', 'path=', 'w2v_path=', 'w2v_length=', '=filter_small_data']) except getopt.GetoptError as error: print(error) sys.exit(2) max_words = {'text': 270, 'mesh': 50, 'title': 17} path = 'Data/' w2v_path = '/Users/ericrincon/PycharmProjects/Deep-PICO/wikipedia-pubmed-and-PMC-w2v.bin' word_vector_size = 200 filter_small_data = False for opt, arg in opts: if opt == '--max_words_abstract': max_words['text'] = int(arg) elif opt == '--max_words_title': max_words['mesh'] = int(arg) elif opt == '--max_words_mesh': max_words['mesh'] = int(arg) elif opt == '--path': path = arg elif opt == '--w2v_path': w2v_path = arg elif opt == '--filter_small_data': if int(arg) == 1: filter_small_data = True elif int(arg): filter_small_data = False print('Loading word2vec...') w2v = Word2Vec.load_word2vec_format(w2v_path, binary=True) print('Loaded word2vec...') X_list, y_list, data_names = DataLoader.get_data_separately(max_words, word_vector_size, w2v, use_abstract_cnn=True, preprocess_text=False, filter_small_data=filter_small_data) for X, y, name in zip(X_list, y_list, data_names): X_abstract, X_title, X_mesh = X f = h5py.File("DataProcessed/" + name + ".hdf5", "w") f.create_dataset('X_abstract', data=X_abstract, shape=X_abstract.shape) f.create_dataset('X_title', data=X_title, shape=X_title.shape) f.create_dataset('X_mesh', data=X_mesh, shape=X_mesh.shape) f.create_dataset('y', data=y, shape=y.shape)
def predict_main(classifier_pickle):
    print("This function is being called")
    datasets = DataLoader.load_kaggle_mnist("mnist_train.csv")
    test_set_x, test_set_y = datasets[2]
    test_set_x = test_set_x.get_value()
    test_set_y = test_set_y.eval()
    predictor = MLutil.Predictor(classifier_pickle, 'DNN')
    predicted_values = predictor.make_prediction(test_set_x)
    predAnalysis = MLutil.PredictionAccuracies(predicted_values, test_set_y)
    print(predAnalysis.get_misclass_rate())
    print(predAnalysis.get_indicies_misclassifications())
    pickle.dump(predAnalysis.get_indicies_misclassifications(), open("neural_indicies.p", "wb"))
    return predAnalysis.get_indicies_misclassifications()
def svm_main(dataset, pickle_model):
    data = DataLoader.load_kaggle_mnist(dataset, neural=False)
    classifier = SVM()
    start = time.time()
    print("Fitting the svm")
    X = numpy.array(data[0][0])
    X = X / 255.0 * 2 - 1
    print(X)
    Y = numpy.array(data[0][1])
    print(len(X))
    print(len(Y))
    del data
    classifier.fit_multi(X, Y)
    fin = time.time() - start
    print("Awesome, the SVM has been fit, only took {0} seconds".format(fin))
    pickle.dump(classifier, open(pickle_model, "wb"))
def train(self, X, y, n_epochs, optim_algo='adam', criterion='categorical_crossentropy', save_model=True, verbose=2, plot=True, batch_size=64, fold_idxs=None): if optim_algo == 'adam': optim_algo = Adam() elif optim_algo == 'sgd': optim_algo = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True) elif optim_algo == 'adagrad': optim_algo = Adagrad() self.model.compile(optimizer=optim_algo, loss=criterion) if fold_idxs is not None: batch_generator = StanderedBG(X, y, batch_size=batch_size, fold_indices=fold_idxs) else: batch_generator = StanderedBG(X, y, batch_size=batch_size) loss_train_history = [] loss_val_history = [] batch_history = {'accuracy': []} for epoch in range(1, n_epochs + 1): batch_accuracy_history = [] for X, y in batch_generator.next_batch(): history = self.model.fit(X, y, nb_epoch=1, batch_size=batch_size, validation_split=0.2, verbose=0) val_loss, loss = history.history['val_loss'][0], history.history['loss'][0] loss_train_history.append(loss) loss_val_history.append(val_loss) truth = self.model.validation_data[1] truth = dl.onehot2list(truth) batch_prediction = self.predict_classes(self.model.validation_data[0]) accuracy = metrics.accuracy_score(truth, batch_prediction) batch_accuracy_history.append(accuracy) batch_history['accuracy'].append(batch_accuracy_history) print('Epoch: {} | Train loss: {} | Valid loss: {}'.format(epoch, loss, val_loss)) print("Epoch Metrics | Accuracy: {}".format(np.mean(batch_history['accuracy'][epoch-1]))) if save_model: self.model.save_weights(self.model_name + '.h5', overwrite=True)
def main(): # training parameter is_sklearn = True k = 10 # fold result_path = 'results/PB2_spam.acc' model_name = 'spam_' + str(k) + 'fold' data_path = 'data/spam/data.pickle' # laod and preprocess training data training_data = loader.load_pickle_file(data_path) # TODO convert labels from {0, 1} to {-1, 1} # util.replace_zero_label_with_neg_one(training_data) # Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, training_data[0]) # training_data[0] = preprocessing.scale(training_data[0]) # start training training_errs = [] testing_errs = [] print('Preparing k fold data.') k_folds = Preprocess.prepare_k_folds(training_data, k) for i in (0,): st = time.time() tr_data, te_data = Preprocess.get_i_fold(k_folds, i) # start training print('{:.2f} Start training.'.format(time.time() - st)) kernel = c.EUCLIDEAN # kernel = c.GAUSSIAN f_select = True best_features_num = 5 clf = kNN.kNN(kernel=kernel) clf.fit(tr_data[0], tr_data[1], f_select=f_select, best_f=best_features_num) print("Best features: {}".format(clf.best_f_indices)) for kk in (1, 2, 3, 7): tr_pred = clf.predict(tr_data[0], k=kk) te_pred = clf.predict(te_data[0], k=kk) tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0] te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0] print('{} Final results with kernel {}, k={}. Train acc: {}, Test acc: {}'.format(time.time() - st, kernel, kk, tr_acc, te_acc))
def main():
    # training parameter
    k = 8  # fold
    result_path = 'results/PB2_spam.acc'
    model_name = 'spam_' + str(k) + 'fold'
    data_path = 'data/spam/data.pickle'

    # load and preprocess training data
    training_data = loader.load_pickle_file(data_path)

    # TODO convert labels from {0, 1} to {-1, 1}
    # util.replace_zero_label_with_neg_one(training_data)

    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, training_data[0])
    # Preprocess.normalize_features_all(Preprocess.shift_and_scale, training_data[0])

    # start training
    training_accs = []
    testing_accs = []
    print('Preparing k fold data.')
    k_folds = Preprocess.prepare_k_folds(training_data, k)
    kernel = c.EUCLIDEAN
    sst = time.time()
    for i in (1,):
        st = time.time()
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)

        # start training
        print('{:.2f} Start training.'.format(time.time() - st))
        for r in (2.5, 2.7):
            clf = kNN.kNN(kernel=kernel)
            # clf.fit(training_data[0], training_data[1])
            clf.fit(tr_data[0], tr_data[1])
            # tr_pred = clf.predict(training_data[0], r=r)
            tr_pred = clf.predict(tr_data[0], r=r)
            te_pred = clf.predict(te_data[0], r=r)

            # tr_acc = (training_data[1] == tr_pred).sum() / training_data[0].shape[0]
            tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
            te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]
            testing_accs.append(te_acc)
            print('{} {}-fold results with kernel {}, r={}. Train acc: {}, Test acc: {}'.format(time.time() - st, i, kernel, r, tr_acc, te_acc))
def main():
    # training parameter
    k = 10  # fold
    result_path = "results/PB1_A_spam.acc"
    model_name = "spam_" + str(k) + "fold"
    threshes_path = "data/spambase.threshes"
    data_path = "data/spam/data.pickle"
    # kernel = 'poly'
    kernel = "linear"
    # kernel = 'rbf'
    verbose = False
    tol = 0.01
    c = 0.1

    # load and preprocess training data
    training_data = loader.load_pickle_file(data_path)

    # convert labels from {0, 1} to {-1, 1}
    util.replace_zero_label_with_neg_one(training_data)

    # normalize
    Preprocess.normalize_features_all(Preprocess.zero_mean_unit_var, training_data[0])

    print("Preparing k fold data.")
    k_folds = Preprocess.prepare_k_folds(training_data, k)

    for i in range(1):
        st = time.time()
        tr_data, te_data = Preprocess.get_i_fold(k_folds, i)

        # start training
        print("{:.3f} Start training. Kernel: {}".format(time.time() - st, kernel))
        clf = svm.SVC(C=c, kernel=kernel, tol=tol, verbose=verbose)
        # clf = svm.NuSVC(kernel=kernel, tol=tol, verbose=verbose)
        clf.fit(tr_data[0], tr_data[1])
        tr_pred = clf.predict(tr_data[0])
        te_pred = clf.predict(te_data[0])

        tr_acc = (tr_data[1] == tr_pred).sum() / tr_data[0].shape[0]
        te_acc = (te_data[1] == te_pred).sum() / te_data[0].shape[0]

        print("{:.3f} Final results. Train acc: {}, Test acc: {}".format(time.time() - st, tr_acc, te_acc))