def Train(self, mini_batches, epoch, best_f_score, options):
    print 'Start time', time.ctime()
    start = time.time()
    errs, loss, iters, sen_num = [], 0, 0, 0
    dev_path = options.conll_dev
    part_size = len(mini_batches) / 5
    part = 0
    best_part = 0

    for b, mini_batch in enumerate(mini_batches):
        e = self.buildGraph(mini_batch, True)
        errs += e
        sum_errs = esum(errs) / len(errs)
        loss += sum_errs.scalar_value()
        sum_errs.backward()
        self.trainer.update()
        renew_cg()
        self.x_le.init_row(self.NO_LEMMA, [0] * self.d_l)
        renew_cg()
        print 'loss:', loss / (b + 1), 'time:', time.time() - start, \
            'progress', round(100 * float(b + 1) / len(mini_batches), 2), '%'
        loss, start = 0, time.time()
        errs, sen_num = [], 0
        iters += 1

        if (b + 1) % part_size == 0:
            part += 1
            if dev_path != '':
                start = time.time()
                out_base = os.path.join(options.outdir, options.model) + str(epoch + 1) + '_' + str(part)
                write_conll(out_base + '.txt', self.Predict(dev_path))
                os.system('perl src/utils/eval.pl -g ' + dev_path + ' -s ' +
                          out_base + '.txt' + ' > ' + out_base + '.eval')
                print 'Finished predicting dev on part ' + str(part) + '; time:', time.time() - start
                labeled_f, unlabeled_f = get_scores(out_base + '.eval')
                print 'epoch: ' + str(epoch) + ' part: ' + str(part) + \
                    ' -- labeled F1: ' + str(labeled_f) + ' unlabeled F1: ' + str(unlabeled_f)
                if float(labeled_f) > best_f_score:
                    self.Save(os.path.join(options.outdir, options.model))
                    best_f_score = float(labeled_f)
                    best_part = part

    print 'best part on this epoch: ' + str(best_part)
    return best_f_score
def run_game(self, num_players=2, score_limit=15):
    # run a game between num_players players
    assert num_players > 0
    players = dict([(str(r + 1), 0) for r in range(num_players)])
    player_index = 0
    print "Starting a game between", num_players, "players"
    print "Enter moves as a row and column separated by a space, i.e.: R C"
    raw_input("Ready?")
    # should add a check to see if there are no moves left
    while all([score < score_limit for score in players.values()]):
        p = players.keys()[player_index]
        player_index = (player_index + 1) % num_players
        good_choice = False
        print self.board
        while not good_choice:
            choice = raw_input("Player " + p + ", place a piece: ").strip()
            try:
                r, c = map(int, choice.split(" "))
            except Exception:
                print "Please format as two integers: R C"
                continue
            if 0 <= r < len(self.board) and 0 <= c < len(self.board[0]):
                if self.board[r][c] == self.board.default:
                    self.board[r][c] = p
                    squares = get_squares(self.board, (p, Point(r, c)))
                    score = get_scores(squares)
                    players[p] += score.get(p, 0)
                    print "Player", p, "score:", players[p]
                    good_choice = True
                else:
                    print "That square is occupied."
            else:
                print "Choice out of range."
    print "Final scores:"
    pprint.pprint(players)
def test(model, test_loader, topk):
    model.eval()
    test_steps = (len(test_loader.dataset) // test_loader.batch_size) + 1
    scores = []
    with torch.no_grad():
        with trange(test_steps) as t:
            for i, data in zip(t, test_loader):
                t.set_description('test')
                users = data[:, 0]
                items = data[:, 1]
                labels = data[:, 2].float()
                if use_cuda:
                    users, items, labels = users.cuda(), items.cuda(), labels.cuda()
                preds = model(users, items)
                items_cpu = items.cpu().numpy()
                preds_cpu = preds.squeeze(1).detach().cpu().numpy()
                litems = np.split(items_cpu, test_loader.batch_size // 100)
                lpreds = np.split(preds_cpu, test_loader.batch_size // 100)
                scores += [get_scores(it, pr, topk) for it, pr in zip(litems, lpreds)]
    hits = [s[0] for s in scores]
    ndcgs = [s[1] for s in scores]
    return (np.array(hits).mean(), np.array(ndcgs).mean())
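# A minimal sketch (an assumption, not the project's actual helper) of the
# get_scores(items, preds, topk) used above. The batch is split into chunks of
# 100 items, which matches the usual leave-one-out evaluation with one held-out
# positive plus 99 sampled negatives per user; the positive is assumed here to
# be the chunk's first item. Returns (hit@k, ndcg@k) for one chunk.
import numpy as np

def get_scores(items, preds, topk):
    pos_item = items[0]                      # assumed held-out positive
    ranked = items[np.argsort(-preds)]       # items sorted by descending score
    topk_items = ranked[:topk]
    if pos_item in topk_items:
        rank = int(np.where(topk_items == pos_item)[0][0])
        return 1.0, 1.0 / np.log2(rank + 2)  # hit@k = 1, ndcg@k from rank
    return 0.0, 0.0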
def check_true(question, verbose=True, attribution=False):
    keys = []
    with open('key.txt', 'r') as f:
        keys = f.readlines()
    query = {
        'key': keys[0].rstrip(),  # generate API key first
        'cx': keys[1].rstrip(),   # create custom search using Google Custom Search API
        'q': question
    }
    response = requests.get(api_url, params=query,
                            headers={'Content-Type': 'application/json'})
    if response.status_code != 200:
        # Something went wrong; bail out without a verdict.
        print('DID NOT WORK!')
        return None

    resp_json = response.json()
    urls = []
    snippets = []
    for item in resp_json['items']:
        sentence = sanitizer2(item['snippet'])
        snippets.append(sentence)
        urls.append(item['link'])
    if verbose:
        print('\n'.join(snippets))

    res_scores = get_scores(snippets, question, verbose=verbose,
                            attribution=attribution, urls=urls)
    if verbose:
        print('RES SCORES')
        print(res_scores)

    top_score = res_scores[0][1]
    if top_score >= threshold_high:
        res = 1
        if verbose:
            print("TRUE")
    elif top_score < threshold_low:
        res = 0
        if verbose:
            print("FALSE")
    else:
        res = 0.5
        if verbose:
            print("WE ARE UNSURE")

    if not attribution:
        return res
    return res, res_scores
def evaluate_on_holdout(model) -> float:
    data = pd.read_csv('./data/holdout.csv')
    x_cols = [x for x in data.columns if x not in ['ID_code', 'target']]
    X = data[x_cols]
    y = data['target']
    y_pred = model.predict(X)
    y_true = np.array(y)
    score = utils.get_scores(y_true, y_pred)["f1"]
    return score
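# A minimal sketch, assuming utils.get_scores wraps standard sklearn
# classification metrics in a dict keyed by metric name (only the "f1" key is
# used above); the real helper may compute more or different metrics.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

def get_scores(y_true, y_pred):
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred),
    }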
def predict(self, dim_reduced_vecs, outlier_labels, scores, contamination, **kwargs):
    print(f"Outlier detection using pyod's {self.pyod_model}")
    od = self.pyod_model(**kwargs)
    od.fit(dim_reduced_vecs)
    # pyod labels outliers as 1 and inliers as 0; remap to -1 (outlier) / 1 (inlier)
    out_pred = od.labels_
    out_pred[out_pred == 1] = -1
    out_pred[out_pred == 0] = 1
    scores = get_scores(scores, outlier_labels, out_pred)
    scores.update(**kwargs)
    out_f1 = scores["out_f1"]
    print(f"{kwargs}\nOut_f1: {out_f1}\n\n")
    return scores, out_pred
def predict(self, dim_reduced_vecs, outlier_labels, scores, contamination, **kwargs):
    od = self.dem_red_outlier_model(**kwargs)
    if self.as_numpy:
        dim_reduced_vecs = np.array(dim_reduced_vecs)
    preds = od.fit_transform(dim_reduced_vecs)
    preds = preds.astype(float)
    # flag values outside the inter-quantile range as outliers (-1), rest as inliers (1)
    preds = self.reject_outliers(preds, iq_range=1.0 - contamination)
    preds = [-1 if x else 1 for x in preds]
    scores = get_scores(scores, outlier_labels, preds)
    scores.update(**kwargs)
    out_f1 = scores["out_f1"]
    print(f"{kwargs}\nOut_f1: {out_f1}\n\n")
    return scores, preds
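# A sketch of the get_scores variant shared by the two detectors above,
# inferred from its call sites (an assumption, not the repository's code): it
# folds outlier-detection metrics into the running `scores` dict. Later
# snippets pass outlabel=0, so the outlier label value is parameterized here,
# defaulting to the -1 convention used above.
from sklearn.metrics import f1_score

def get_scores(scores, outlier_labels, preds, outlabel=-1):
    y_true = [1 if y == outlabel else 0 for y in outlier_labels]
    y_pred = [1 if p == outlabel else 0 for p in preds]
    scores["out_f1"] = f1_score(y_true, y_pred)
    return scores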
def move(self, r, c, player_name):
    # check we should make move
    if not self.is_game_started():
        return False, "Move not made - game not yet started."
    if player_name not in self.players:
        return False, "Move not made - player not part of this game."
    # attempt to make a move
    if not (0 <= r < len(self.board) and 0 <= c < len(self.board[0])):
        return False, "Move not made - out of range."
    if self.board[r][c] != self.board.default:
        return False, "Move not made - that space is occupied."
    # otherwise, make move
    self.board[r][c] = player_name
    # and update scores
    squares = get_squares(self.board, (player_name, Point(r, c)))
    scores = get_scores(squares)
    self.players[player_name] += scores.get(player_name, 0)
    return True, "Move successful."
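# Hypothetical usage of move(); `game` stands in for an instance of the
# surrounding (unshown) game class with a started game and registered players.
ok, message = game.move(2, 3, "alice")
if not ok:
    print(message)  # e.g. "Move not made - out of range."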
def run_vcl(hidden_size, no_epochs, data_gen, coreset_method, coreset_size=0,
            batch_size=None, single_head=True):
    in_dim, out_dim = data_gen.get_dims()
    x_coresets, y_coresets = [], []
    x_testsets, y_testsets = [], []
    all_acc = np.array([])

    for task_id in range(data_gen.max_iter):
        x_train, y_train, x_test, y_test = data_gen.next_task()
        x_testsets.append(x_test)
        y_testsets.append(y_test)

        # Set the readout head to train
        head = 0 if single_head else task_id
        bsize = x_train.shape[0] if (batch_size is None) else batch_size

        # Train network with maximum likelihood to initialize first model
        if task_id == 0:
            ml_model = Vanilla_NN(in_dim, hidden_size, out_dim, x_train.shape[0])
            ml_model.train(x_train, y_train, task_id, no_epochs, bsize)
            mf_weights = ml_model.get_weights()
            mf_variances = None
            ml_model.close_session()

        # Train on non-coreset data
        mf_model = MFVI_NN(in_dim, hidden_size, out_dim, x_train.shape[0],
                           prev_means=mf_weights, prev_log_variances=mf_variances)
        mf_model.train(x_train, y_train, head, no_epochs, bsize)
        mf_weights, mf_variances = mf_model.get_weights()

        # Select coreset if needed
        if coreset_size > 0:
            if isinstance(coreset_method, str) and coreset_method == "uncertainty_based":
                x_coresets, y_coresets, x_train, y_train = uncertainty_based(
                    mf_model, task_id, x_coresets, y_coresets, x_train, y_train, coreset_size)
            else:
                x_coresets, y_coresets, x_train, y_train = coreset_method(
                    x_coresets, y_coresets, x_train, y_train, coreset_size)

        # Incorporate coreset data and make prediction
        acc = utils.get_scores(mf_model, x_testsets, y_testsets, x_coresets, y_coresets,
                               hidden_size, no_epochs, single_head, batch_size)
        all_acc = utils.concatenate_results(acc, all_acc)

        mf_model.close_session()

    return all_acc
def evaluate(model, test_loader, use_cuda, topk):
    model.eval()
    scores = []
    with torch.no_grad():
        for data in test_loader:
            users = data[:, 0]
            items = data[:, 1]
            labels = data[:, 2].float()
            if use_cuda:
                users, items, labels = users.cuda(), items.cuda(), labels.cuda()
            preds = model(users, items)
            items_cpu = items.cpu().numpy()
            preds_cpu = preds.squeeze(1).detach().cpu().numpy()
            litems = np.split(items_cpu, test_loader.batch_size // 100)
            lpreds = np.split(preds_cpu, test_loader.batch_size // 100)
            scores += [get_scores(it, pr, topk) for it, pr in zip(litems, lpreds)]
    hits = [s[0] for s in scores]
    ndcgs = [s[1] for s in scores]
    return (np.array(hits).mean(), np.array(ndcgs).mean())
def predict(self, dim_reduced_vecs, outlier_labels, scores, contamination,
            min_cluster_size, allow_noise):
    print("Clustering ...")
    clusterer = HDBSCAN(min_cluster_size=min_cluster_size, prediction_data=True,
                        metric="euclidean").fit(dim_reduced_vecs)
    print("Get prediction data ...")
    clusterer.generate_prediction_data()
    try:
        cluster_pred = clusterer.labels_ if allow_noise else np.argmax(
            all_points_membership_vectors(clusterer)[:, 1:], axis=1)
    except IndexError:
        print("Got IndexError and will not enforce cluster membership (allow noise) ...")
        print(all_points_membership_vectors(clusterer))
        cluster_pred = clusterer.labels_

    # scoring
    print("Get scores ...")
    # GLOSH: flag the top 10% of outlier scores as outliers (-1), the rest as inliers (1)
    threshold = pd.Series(clusterer.outlier_scores_).quantile(0.9)
    outlier_pred = np.where(clusterer.outlier_scores_ > threshold, -1, 1)

    scores["cluster_n"] = len(np.unique(clusterer.labels_))
    scores["homogeneity"] = homogeneity_score(outlier_labels, cluster_pred)
    scores["completeness"] = completeness_score(outlier_labels, cluster_pred)
    scores["v_measure"] = v_measure_score(outlier_labels, cluster_pred)
    scores = get_scores(scores, outlier_labels, outlier_pred)

    print(f"Homogeneity - {scores['homogeneity'] * 100:.1f} cluster_n - {scores['cluster_n']}")
    return scores, clusterer.outlier_scores_
def run_vcl(hidden_size, no_epochs, data_gen, coreset_method, coreset_size=0,
            batch_size=None, single_head=True):
    in_dim, out_dim = data_gen.get_dims()
    x_coresets, y_coresets = [], []
    x_testsets, y_testsets = [], []
    all_acc = np.array([])

    for task_id in range(data_gen.max_iter):
        x_train, y_train, x_test, y_test = data_gen.next_task()
        x_testsets.append(x_test)
        y_testsets.append(y_test)

        head = 0 if single_head else task_id
        bsize = x_train.shape[0] if (batch_size is None) else batch_size

        if task_id == 0:
            ml_model = VCL(in_dim, hidden_size, output_size=10)
            ml_model.train(x_train, y_train, bsize, no_epochs, task_id)
            torch.save(ml_model.state_dict(), 'my_model.pth')
        else:
            ml_model = VCL(in_dim, hidden_size, output_size=10)
            ml_model.load_state_dict(torch.load('my_model.pth'))
            ml_model.bfc3 = vcl_model.BayesLinear(hidden_size, 10)
            ml_model.train(x_train, y_train, bsize, no_epochs, task_id)
            torch.save(ml_model.state_dict(), 'my_model.pth')

        if coreset_size > 0:
            x_coresets, y_coresets, x_train, y_train = coreset_method(
                x_coresets, y_coresets, x_train, y_train, coreset_size)

        acc = utils.get_scores(ml_model, x_testsets, y_testsets, x_coresets, y_coresets,
                               hidden_size, no_epochs, single_head, batch_size)
        all_acc = utils.concatenate_results(acc, all_acc)

    return all_acc
def upload_model():
    '''
    Receives a base64 encoded joblib file to be saved as a new model.
        INPUT:
            - input: { model_name : string, model: base64 encoded joblib file }
        OUTPUT:
            - output: message status (successful or not)
    '''
    input_asDict = request.get_json()
    logger.info("Received [Upload Model Request] (0/3) ... ")

    # See if the model name is available
    try:
        # Check if a model name is present, else fall back to the default
        if "model_name" in input_asDict.keys() and input_asDict["model_name"]:
            model_name = input_asDict["model_name"]
        else:
            model_name = default_model_name
        filepath = "./models/" + model_name + ".joblib.dat"
        if file_exists(filepath):
            message = "Model with given name already exists. Choose another name."
            logger.error(message)
            return custom_response_http(message, 400)
    except Exception:
        message = "Internal Server Error"
        logger.info(message)
        return custom_response_http(message, 500)
    logger.info("[Upload Model Request] Model Name Set: [" + model_name + "] (1/3) ... ")

    # Decode the model
    try:
        if "model" in input_asDict.keys():
            model_as64 = input_asDict["model"]
            logger.info(str(model_as64)[:10])
            decoded_model = base64.b64decode(model_as64)
            with open(filepath, 'wb') as fh:
                fh.write(decoded_model)
            with open(filepath, 'rb') as model_file:
                # Load model
                model = joblib.load(model_file)
        else:
            message = "Model does not exist in request."
            logger.error(message)
            return custom_response_http(message, 400)
    except Exception:
        message = "Internal Server Error"
        logger.error(message)
        return custom_response_http(message, 500)
    logger.info("[Upload Model Request] Model Decoded (2/3) ... ")

    # Validate the model
    try:
        # Validate with holdout set
        data = pd.read_csv('./data/holdout.csv')
        x_cols = [x for x in data.columns if x not in ['ID_code', 'target']]
        X = data[x_cols]
        y = data['target']
        y_pred = model.predict(X)
        y_true = np.array(y)
        score = get_scores(y_true, y_pred)["f1"]
    except Exception:
        message = "Could not validate model."
        # remove(filepath)
        logger.error(message)
        return custom_response_http(message, 500)
    logger.info("[Upload Model Request] Model Validated (3/3) .")

    message = "Model Uploaded! Score: " + str(score)
    logger.info(str(custom_response_http(message, 200)))
    return custom_response_http(message, 200)
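# A hypothetical client call for the endpoint above: base64-encode a joblib
# dump and POST it as JSON. The host and route are assumptions; the field names
# ("model_name", "model") follow the handler's code.
import base64
import requests

with open("model.joblib.dat", "rb") as f:
    payload = {
        "model_name": "my_model",
        "model": base64.b64encode(f.read()).decode("ascii"),
    }
response = requests.post("http://localhost:5000/upload_model", json=payload)
print(response.status_code, response.text)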
mongo.close()

mongo_ids = [post.pop('_id', None) for post in posts]  # exclude mongo generated ids
posts = d_to_df(posts)
posts['created_time'] = pd.to_datetime(posts['created_time'],
                                       format="%Y-%m-%dT%H:%M:%S+0000")
posts.set_index('created_time', inplace=True)

# Calculating post title and message sentiment
posts['article_title'].fillna('', inplace=True)
posts['article_title_sentiment'] = posts.article_title.apply(paragraph_sentiment)
posts['message_sentiment'] = posts.message.apply(paragraph_sentiment)

# Calculating sentiment with several lexicons
bing_scores = get_scores(docs['message'], bing)
afinn_scores = get_scores(docs['message'], afinn)
syuzhet_scores = get_scores(docs['message'], syuzhet)
nrc_scores = get_scores(docs['message'], nrc)  # used version 2 of the nrc lexicon
vader_scores = docs.message.apply(paragraph_sentiment)

all_methods = pd.DataFrame(
    {
        'bing': bing_scores,
        'afinn': afinn_scores,
        'syuzhet': syuzhet_scores,
        'nrc': nrc_scores
    },
    index=docs.index).div(docs.n_sents, axis='index')
all_methods = all_methods.apply(lambda x: map(normalize, x))
all_methods['vader'] = vader_scores
def main():
    standard_split = [([0, 1, 2, 11],
                       [3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15])]
    pairwise_split = list(permutations([[x] for x in range(0, 16)], 2))

    # %%
    param_combinations = product_dict(**dict(
        seed=[42, 43, 44],
        test_size=[0.2],
        labeled_data=[0.1, 0.3, 0.5, 0.8, 1.0],
        fixed_cont=[0.05, 0.1],
        n_oe=[0],
        use_nn=[True],
        pair=standard_split))

    # how many samples per class are used for all tests
    n_class = 3000

    # split the outlier, inlier tuple pairs and print all parameters for run
    for d in param_combinations:
        d["inliers"], d["outliers"] = d["pair"]
        d.pop('pair', None)

    # data_path = "/home/philipp/projects/dad4td/data/processed/20_news_imdb_vec.pkl"
    data_path = "/home/philipp/projects/dad4td/data/raw/QS-OCR-Large/rvl_cdip.pkl"
    oe_path = "/home/philipp/projects/dad4td/data/processed/oe_data.pkl"
    res_path = next_path(
        "/home/philipp/projects/dad4td/reports/semisupervised/semisup_rvl_pw_%04d.tsv")

    doc2vec_model = Doc2VecModel("apnews", "apnews", 1.0, 100, 1,
                                 "/home/philipp/projects/dad4td/models/apnews_dbow/doc2vec.bin")

    # load data and get the doc2vec vectors for all of the data used
    df_full = pd.read_pickle(data_path)

    # sample only a portion of the data
    df_full = df_full.groupby('target', group_keys=False).apply(
        lambda df: df.sample(n=n_class, random_state=42))

    # %%
    df_full["vecs"] = doc2vec_model.vectorize(df_full["text"])
    df_full["vecs"] = df_full["vecs"].apply(tuple)

    # %%
    result_df = pd.DataFrame()
    for i, params in enumerate(param_combinations):
        print(f"\n\n---------------------\n\nRun {i+1} out of "
              f"{len(param_combinations)}\n\n{params}")
        df, df_test = prepare_data(df_full, **params)

        # UMAP Train
        docvecs, umap_model = umap_reduce(df["vecs"].to_list(), df["label"], None, **params)

        # Ivis
        docvecs, ivis_model = ivis_reduce(docvecs, df["label"], None, **params)

        # remove OE data, so it's not scored as well
        df["decision_scores"] = docvecs
        df = df.where(df.scorable == 1).dropna()

        # find outliers in 1D scores
        preds, iqr_model = score_out_preds(
            df["decision_scores"], None,
            contamination=df.outlier_label.value_counts(normalize=True)[-1])

        # score the predictions for outliers
        scores = get_scores(dict(), df["outlier_label"], preds)

        # %%
        # write the scores to df and save
        scores.update(params)
        scores["data"] = "train"
        result_df = result_df.append(scores, ignore_index=True)
        result_df.to_csv(res_path, sep="\t")
        print(f"\nTraining scores:\n{pd.DataFrame([scores], index=[0])}")

        # %%
        # test UMAP and ivis
        docvecs_test, _ = umap_reduce(df_test["vecs"].to_list(), None, umap_model, **params)
        docvecs_test, _ = ivis_reduce(docvecs_test, None, ivis_model, **params)

        # remove OE data, so it's not scored as well
        df_test["decision_scores"] = docvecs_test
        df_test = df_test.where(df_test.scorable == 1).dropna()

        # find outliers in 1D scores
        preds = iqr_model.transform(df_test["decision_scores"], thresh_factor=1)

        # score the predictions for outliers
        scores = get_scores(dict(), df_test["outlier_label"], preds)

        # write the scores to df and save
        scores.update(params)
        scores["data"] = "test"
        result_df = result_df.append(scores, ignore_index=True)
        result_df.to_csv(res_path, sep="\t")
        print(f"\nTest scores:\n{pd.DataFrame([scores], index=[0])}")
def run_vcl(hidden_size, num_epochs, data_generator, coreset_method,
            coreset_size=0, batch_size=None, single_head=True):
    """Runs the variational continual learning algorithm presented in
    "Variational Continual Learning" (2018) by Cuong V. Nguyen et al.

    :param hidden_size: sizes of the hidden layers
    :param num_epochs: number of training epochs per task
    :param data_generator: yields each task's train/test split in turn
    :param coreset_method: function selecting coreset points from the training data
    :param coreset_size: number of points kept per task in the coreset (0 disables it)
    :param batch_size: mini-batch size; defaults to the full training set
    :param single_head: whether all tasks share a single readout head
    :return: accuracies on all test sets seen so far, one row per task
    """
    in_dim, out_dim = data_generator.get_dims()

    # The coresets hold small subsets of past tasks' *training* data that are
    # revisited before prediction; the testsets accumulate each task's test split.
    x_coresets, y_coresets = [], []
    x_testsets, y_testsets = [], []

    all_acc = np.array([])

    # max_iter corresponds to the number of tasks
    for task_id in range(data_generator.max_iter):
        x_train, y_train, x_test, y_test = data_generator.next_task()
        x_testsets.append(x_test)
        y_testsets.append(y_test)

        # Set the readout head to train
        head = 0 if single_head else task_id
        bsize = x_train.shape[0] if (batch_size is None) else batch_size

        # Train network with maximum likelihood to initialize first model
        if task_id == 0:
            ml_model = VanillaNN(in_dim, hidden_size, out_dim, x_train.shape[0])
            ml_model.train(x_train, y_train, task_id, num_epochs, bsize)
            mf_weights = ml_model.get_weights()
            mf_variances = None
            ml_model.close_session()

        # Select coreset if needed
        if coreset_size > 0:
            x_coresets, y_coresets, x_train, y_train = coreset_method(
                x_coresets, y_coresets, x_train, y_train, coreset_size)

        # Train on non-coreset data
        mf_model = MeanFieldVINN(in_dim, hidden_size, out_dim, x_train.shape[0],
                                 prev_means=mf_weights, prev_log_variances=mf_variances)
        mf_model.train(x_train, y_train, head, num_epochs, bsize)
        mf_weights, mf_variances = mf_model.get_weights()

        # Incorporate coreset data and make prediction
        acc = utils.get_scores(mf_model, x_testsets, y_testsets, x_coresets, y_coresets,
                               hidden_size, num_epochs, single_head, batch_size)
        all_acc = utils.concatenate_results(acc, all_acc)

        mf_model.close_session()

    return all_acc
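# For reference, a sketch of utils.concatenate_results as used above (an
# assumption based on common VCL implementations, not necessarily this repo's
# code): it grows a ragged task-by-task accuracy matrix, padding earlier rows
# with NaN so that row t holds the accuracies on tasks 0..t after task t.
import numpy as np

def concatenate_results(score, all_score):
    if all_score.size == 0:
        return np.reshape(score, (1, -1))
    new_arr = np.empty((all_score.shape[0], all_score.shape[1] + 1))
    new_arr[:] = np.nan
    new_arr[:, :-1] = all_score
    return np.vstack((new_arr, score))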
def train_test(result_df, df_t, df_r, df_test):
    # data
    x_target = np.array(df_t.vecs.to_list())
    x_ref = np.array(df_r.vecs.to_list())
    y_ref = np.array(df_r.target.to_list())
    y_ref = to_categorical(y_ref)
    test_vecs = np.array(df_test.vecs.to_list())

    n_sup = 10000
    n_per_targ = 1000
    df_r_temp = df_r.groupby('target', group_keys=False).apply(
        lambda df: df.sample(n=min(df.shape[0], n_per_targ), random_state=42))
    x_tr = np.array(df_t.head(n_sup).append(df_r_temp).vecs.to_list())
    y_tr = np.array(df_t.head(n_sup).append(df_r_temp).label.to_list())
    # y_tr = to_categorical(y_tr)

    res_path = "/home/philipp/projects/dad4td/reports/one_class/all.tsv"
    classes = df_r.target.unique().shape[0]
    print(f"classes: {classes}")
    batchsize = 128
    epoch_num = 15
    epoch_report = 5
    feature_out = 64
    pred_mode = "nn"

    # get the loss for compactness
    original_loss = create_loss(classes, batchsize)

    # model creation
    model = create_model(loss="binary_crossentropy", n_in=x_target[0].shape[0])
    model_t = Model(inputs=model.input, outputs=model.output)
    model_r = Network(inputs=model_t.input, outputs=model_t.output, name="shared_layer")
    prediction = Dense(classes, activation='softmax')(model_t.output)
    model_r = Model(inputs=model_r.input, outputs=prediction)
    prediction_t = Dense(feature_out, activation='softmax')(model_t.output)
    model_t = Model(inputs=model_t.input, outputs=prediction_t)

    # optimizer = SGD(lr=5e-5, decay=0.00005)
    optimizer = Adam(learning_rate=5e-5)
    model_r.compile(optimizer=optimizer, loss="categorical_crossentropy")
    model_t.compile(optimizer=optimizer, loss=original_loss)
    model_t.summary()
    model_r.summary()

    ref_samples = np.arange(x_ref.shape[0])
    loss, loss_c = [], []
    epochs = []
    best_acc = 0
    print("training...")
    for epochnumber in range(epoch_num):
        x_r, y_r, lc, ld = [], [], [], []
        np.random.shuffle(x_target)
        np.random.shuffle(ref_samples)
        for i in range(len(x_ref)):
            x_r.append(x_ref[ref_samples[i]])
            y_r.append(y_ref[ref_samples[i]])
        x_r = np.array(x_r)
        y_r = np.array(y_r)

        for i in range(int(len(x_target) / batchsize)):
            batch_target = x_target[i * batchsize:i * batchsize + batchsize]
            batch_ref = x_r[i * batchsize:i * batchsize + batchsize]
            batch_y = y_r[i * batchsize:i * batchsize + batchsize]
            # target data
            lc.append(model_t.train_on_batch(batch_target,
                                             np.zeros((batchsize, feature_out))))
            # reference data
            ld.append(model_r.train_on_batch(batch_ref, batch_y))
        loss.append(np.mean(ld))
        loss_c.append(np.mean(lc))
        epochs.append(epochnumber)

        if epochnumber % epoch_report == 0 or epochnumber == epoch_num - 1:
            print(f"-----\n\nepoch: {epochnumber + 1}, "
                  f"Descriptive loss: {loss[-1]}, Compact loss: {loss_c[-1]}")
            model_t.save_weights(
                '/home/philipp/projects/dad4td/models/one_class/model_t_smd_{}.h5'
                .format(epochnumber))
            model_r.save_weights(
                '/home/philipp/projects/dad4td/models/one_class/model_r_smd_{}.h5'
                .format(epochnumber))

            labels = df_test["label"].astype(int).values
            if pred_mode == "svm":
                x_tr_pred = model_t.predict(x_tr)
                clf = SVC()
                clf.fit(x_tr_pred, y_tr)
                preds = model_t.predict(test_vecs)
                preds = clf.predict(preds)
            elif pred_mode == "nn":
                y_tr = y_tr.astype(int)
                print(y_tr)
                x_tr_pred = model_t.predict(x_tr)
                clf = create_sup_model(n_in=feature_out)
                clf.summary()
                clf.fit(x_tr_pred, y=y_tr, epochs=15, batch_size=64, verbose=True)
                decision_scores = model_t.predict(test_vecs)
                decision_scores = clf.predict(decision_scores)
                preds = decision_scores.astype(float)
                _ = plt.hist(preds, bins=10)
                plt.show()
            else:
                raise Exception(f"{pred_mode} must be one of svm, nn")

            scores = get_scores(dict(), labels, preds, outlabel=0)
            print(f"\n\nTest scores:\n{pd.DataFrame([scores], index=[0])}")
            if scores["accuracy"] > best_acc:
                best_acc = scores["accuracy"]
                print(f"best_acc updated to: {best_acc}")
            normalize = "true"
            print(f"{confusion_matrix(labels, preds, normalize=normalize)}")

    result_df = result_df.append(dict(cclass=list(df_test.target.unique()),
                                      accuracy=best_acc), ignore_index=True)
    result_df.to_csv(res_path, sep="\t")
    return result_df
if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option("-i", dest="in_dir", type="string", default="data/gifs/test",
                      help="Load images from this directory")
    parser.add_option("-o", dest="out_file", type="string", default="test.csv",
                      help="Save the emotion/pixel rows to this CSV file")
    (options, args) = parser.parse_args()

    face_cascade = cv2.CascadeClassifier('haar_data/haarcascade_frontalface_default.xml')

    scores = get_scores('data/json/', 'multi_hot')
    scores = vectorize_scores(scores)

    f = open(options.out_file, 'w')
    f.write('emotion,pixels\n')

    gifs = os.listdir(options.in_dir)
    for gif in gifs:
        frames = os.listdir(options.in_dir + '/' + gif)
        score = scores[gif]

        # Sample 10 equally-spaced frames if we have >10 frames
        lf = len(frames)
        if lf > 10:
def run_vcl(hidden_size, no_epochs, data_gen, coreset_method, coreset_size=0,
            batch_size=None, single_head=True, sd=0, lr=0.001):
    print("seed ", sd)
    in_dim, out_dim = data_gen.get_dims()
    print('in dim , out ', in_dim, out_dim)
    x_coresets, y_coresets = [], []
    x_testsets, y_testsets = [], []
    path_folder_result = create_path_file_result(lr, sd)
    all_acc = np.array([])
    print("max iter ", data_gen.max_iter)

    for task_id in range(data_gen.max_iter):
        x_train, y_train, x_test, y_test = data_gen.next_task()
        x_testsets.append(x_test)
        y_testsets.append(y_test)

        # Set the readout head to train
        head = 0 if single_head else task_id
        bsize = x_train.shape[0] if (batch_size is None) else batch_size

        # Train network with maximum likelihood to initialize first model
        if task_id == 0:
            ml_model = Vanilla_NN(in_dim, hidden_size, out_dim, x_train.shape[0])
            ml_model.train(x_train, y_train, task_id, no_epochs, bsize)
            mf_weights = ml_model.get_weights()
            mf_variances = None
            ml_model.close_session()

        # Select coreset if needed
        if coreset_size > 0:
            x_coresets, y_coresets, x_train, y_train = coreset_method(
                x_coresets, y_coresets, x_train, y_train, coreset_size)

        # Train on non-coreset data
        s_time = time.time()
        print("batch size ", bsize)
        mf_model = MFVI_NN(in_dim, hidden_size, out_dim, x_train.shape[0],
                           prev_means=mf_weights, prev_log_variances=mf_variances,
                           learning_rate=lr)
        mf_model.train(x_train, y_train, head, no_epochs, bsize)
        e_time = time.time()
        print("time train ", e_time - s_time)
        mf_weights, mf_variances = mf_model.get_weights()

        # Incorporate coreset data and make prediction
        acc = utils.get_scores(mf_model, x_testsets, y_testsets, x_coresets, y_coresets,
                               hidden_size, no_epochs, single_head, batch_size)
        all_acc = utils.concatenate_results(acc, all_acc)
        print(all_acc)
        write_data_to_file(all_acc,
                           path_folder_result + "/result_vcl_split_seed" + str(sd) + ".csv")

        mf_model.close_session()

    return all_acc
#####
# test UMAP and neural net
docvecs_test, _ = umap_reduce(df_test["vecs"].to_list(), None, umap_model, **params)
docvecs_test, _ = neuralnet(docvecs_test, None, nnet, n_out=1,
                            loss="binary_crossentropy", **params)

# get prediction scores
threshold = 0.5
scores = get_scores(dict(), df_test["label"].astype(int).values,
                    np.where(docvecs_test > threshold, 1, 0), outlabel=0)

####
# write scores
####

# write the scores to df and save
scores.update(params)
scores["n_class"] = n_class
scores["data"] = "test"
scores["threshold"] = threshold
scores["doc2vec_model"] = doc2vec_model.model_name
result_df = result_df.append(scores, ignore_index=True)
result_df.to_csv(res_path, sep="\t")
print(f"\nTest scores:\n{pd.DataFrame([scores], index=[0])}")
    targets = captions[:, 1:]  # removing the start token
    targets = pack_padded_sequence(targets, [len(tar) - 1 for tar in targets],
                                   batch_first=True)[0]
    packed_preds = pack_padded_sequence(preds, [len(pred) - 1 for pred in preds],
                                        batch_first=True)[0]

    # doubly stochastic attention regularization: encourages the attention
    # weights at each spatial location to sum to roughly 1 across time steps
    att_regularization = args.alpha_c * ((1 - alphas.sum(1)) ** 2).mean()

    loss = cross_entropy_loss(packed_preds, targets)
    loss += att_regularization
    loss.backward()
    optimizer.step()

    total_caption_length = calculate_caption_lengths(word_dict, captions)
    losses.update(loss.item(), total_caption_length)

    if batch_idx % args.log_interval == 0:
        print('Train Batch: [{0}/{1}]\t'
              'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                  batch_idx, len(train_loader), loss=losses))
    if debug:
        break

# end of batch loop: evaluate, then checkpoint this epoch
# x = get_scores(model, train_loader)
y = get_scores(model, val_loader, word_dict, idx_dict, device, debug)
z = get_scores(model, test_loader, word_dict, idx_dict, device, debug)
torch.save(model.state_dict(), Path(args.result_dir) / f"{epoch}.pth")
print(f"epoch = {epoch} Val : {y} Test : {z}")
ground_truth_targets_val = np.load(join(input_path, 'targets_validation.npy'))
ground_truth_targets_tst = np.load(join(input_path, 'targets_test.npy'))

# Get model variables.
models_env = os.environ['models_exp']
seed_ensemble_env = os.environ['seed_ensemble_exp']
models = np.array(models_env.split(':')).astype(int)
seed_ensemble = np.array(seed_ensemble_env.split(':')).astype(int)

# Load the results of the ensemble in the validation set.
predictions_mean_val, _ = load_results(models, seed_ensemble, model_path, data='val')

# Find the best epoch in the validation set.
ensemble_mean_val = get_ensemble_mean(predictions_mean_val)
r2_val = get_scores(ensemble_mean_val, ground_truth_targets_val, data='val')
best_epoch = early_stopping(r2_val)

# Load the results of the ensemble in the test set and compute mean and variance.
predictions_mean_tst, predictions_variance_tst = load_results(
    models, seed_ensemble, model_path, data='tst', n_epoch=best_epoch)
ensemble_mean_tst = get_ensemble_mean(predictions_mean_tst)
ensemble_variance_tst = get_ensemble_variance(predictions_mean_tst,
                                              predictions_variance_tst)
r2_tst, rmse_tst, mae_tst, xfold2_tst = get_scores(ensemble_mean_tst,
                                                   ground_truth_targets_tst,
                                                   data='tst')

# Save table containing performance metrics in the test set.
with open(join(output_path, 'prediction_performance.txt'), 'w') as f_out:
    f_out.write('data\tr2\trmse\tmae\t%-within-2-fold\n')
    f_out.write('test set\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\n'.format(
        r2_tst, rmse_tst, mae_tst, xfold2_tst))
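# Sketches of the two ensemble helpers above, inferred from their names and
# call sites (assumptions, not the original code). The mean averages the
# members' predictive means; the variance follows the law of total variance
# for a uniform mixture: total = mean of member variances (aleatoric) +
# variance of member means (epistemic).
import numpy as np

def get_ensemble_mean(predictions_mean):
    # predictions_mean: shape (n_members, n_samples)
    return np.mean(predictions_mean, axis=0)

def get_ensemble_variance(predictions_mean, predictions_variance):
    aleatoric = np.mean(predictions_variance, axis=0)
    epistemic = np.var(predictions_mean, axis=0)
    return aleatoric + epistemic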
def run_vcl(hidden_size, no_epochs, data_gen, coreset_method, coreset_size=0,
            batch_size=None, single_head=True):
    in_dim, out_dim = data_gen.get_dims()
    x_coresets, y_coresets = [], []
    x_testsets, y_testsets = [], []
    all_acc = np.array([])

    for task_id in list(range(data_gen.max_iter)):
        x_train, y_train, x_test, y_test = data_gen.next_task()
        x_testsets.append(x_test)
        y_testsets.append(y_test)

        # Set the readout head to train
        head = 0 if single_head else task_id
        bsize = x_train.shape[0] if (batch_size is None) else batch_size

        # No maximum-likelihood initialization; start from an empty prior
        if task_id == 0:
            mf_variances = None
            mf_weights = None

        # Select coreset if needed
        if coreset_size > 0:
            x_coresets, y_coresets, x_train, y_train = coreset_method(
                x_coresets, y_coresets, x_train, y_train, coreset_size)

        # Train on non-coreset data
        mf_model = CVI_NN(in_dim, hidden_size, out_dim, x_train.shape[0],
                          prev_means=mf_weights, prev_log_variances=mf_variances)
        no_epochs = 0 if task_id == 1 else 10
        mf_model.train(x_train, y_train, head, no_epochs, bsize)
        mf_weights, mf_variances = mf_model.create_weights()
        prev_mf_weights, prev_mf_variances = mf_weights, mf_variances

        # sess = mf_model.sess
        # with sess.as_default():
        #     if not (mf_weights and mf_variances):
        #         mf_weights = sess.run(mf_weights)
        #         mf_variances = sess.run(mf_variances)

        # Incorporate coreset data and make prediction
        acc = utils.get_scores(mf_model, x_testsets, y_testsets, x_coresets, y_coresets,
                               hidden_size, no_epochs, single_head, batch_size)
        all_acc = utils.concatenate_results(acc, all_acc)
        print(acc)

        mf_model.close_session()

    return all_acc
def run_vcl(hidden_size, no_epochs, data_gen, coreset_method, coreset_size=0,
            batch_size=None, single_head=True, train_info=None):
    in_dim, out_dim = data_gen.get_dims()
    x_coresets, y_coresets = [], []
    x_testsets, y_testsets = [], []
    all_acc = np.array([])
    all_acc_for_save = np.zeros((data_gen.max_iter, data_gen.max_iter), dtype=np.float32)

    for task_id in range(data_gen.max_iter):
        x_train, y_train, x_test, y_test = data_gen.next_task()
        x_testsets.append(x_test)
        y_testsets.append(y_test)

        # Set the readout head to train
        head = 0 if single_head else task_id
        bsize = x_train.shape[0] if (batch_size is None) else batch_size

        # Train network with maximum likelihood to initialize first model
        if task_id == 0:
            print('Vanilla NN train for task 0!')
            ml_model = Vanilla_NN(in_dim, hidden_size, out_dim, x_train.shape[0])
            ml_model.train(x_train, y_train, task_id, no_epochs, bsize)
            mf_weights = ml_model.get_weights()
            mf_variances = None
            ml_model.close_session()

        # Select coreset if needed
        if coreset_size > 0:
            x_coresets, y_coresets, x_train, y_train = coreset_method(
                x_coresets, y_coresets, x_train, y_train, coreset_size)

        print('Current task : {}'.format(task_id))

        # Train on non-coreset data
        mf_model = MFVI_NN(in_dim, hidden_size, out_dim, x_train.shape[0],
                           prev_means=mf_weights, prev_log_variances=mf_variances)
        mf_model.train(x_train, y_train, head, no_epochs, bsize)
        mf_weights, mf_variances = mf_model.get_weights()

        # Incorporate coreset data and make prediction
        acc = utils.get_scores(mf_model, x_testsets, y_testsets, x_coresets, y_coresets,
                               hidden_size, no_epochs, single_head, batch_size)
        all_acc = utils.concatenate_results(acc, all_acc)

        for u in range(task_id + 1):
            print('>>> Test on task {:2d} : acc={:5.1f}% <<<'.format(u, 100 * acc[u]))
            all_acc_for_save[task_id, u] = acc[u]

        # Save
        log_name = '{}_{}_{}_{}epochs_batch{}_{}_{}coreset_{}'.format(
            train_info['date'], train_info['experiment'], train_info['tasknum'],
            no_epochs, train_info['batch'], train_info['coreset_method'],
            coreset_size, train_info['trial'])
        if single_head:
            log_name += '_single'
        save_path = './results/' + log_name + '.txt'
        print('Save at ' + save_path)
        np.savetxt(save_path, all_acc_for_save, '%.4f')

        mf_model.close_session()

    return all_acc
def get_scores():
    return utils.get_scores(mongo.db)