def test(netFile, dataSet):
    """Evaluate a saved RNN checkpoint on a dataset and print the metrics.

    The checkpoint is read as three consecutive records: the pickled option
    object, a second pickled object that is discarded, and the parameters
    consumed by ``rnn.fromFile``.

    Args:
        netFile: Path of the checkpoint file to load.
        dataSet: Dataset identifier forwarded to ``tr.loadTrees``.

    Raises:
        AssertionError: If ``netFile`` is None.
    """
    # Check the argument first so a missing model path fails before the
    # trees are loaded.
    assert netFile is not None, "Must give model to test"
    trees = tr.loadTrees(dataSet)

    # Pickle streams are bytes: binary mode is required on Python 3 and
    # avoids newline translation elsewhere.
    # NOTE(review): unpickling can execute arbitrary code; only load
    # checkpoints that come from a trusted source.
    with open(netFile, 'rb') as fid:
        opts = pickle.load(fid)
        _ = pickle.load(fid)  # second record is not needed for evaluation
        rnn = nnet.RNN(opts.wvecDim, opts.outputDim, opts.numWords,
                       opts.minibatch)
        rnn.initParams()
        rnn.fromFile(fid)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Testing...")
    cost, correct, total = rnn.costAndGrad(trees, test=True)
    # Report NaN instead of dividing by zero when nothing was scored.
    accuracy = correct / float(total) if total else float('nan')
    print("Cost %f, Correct %d/%d, Acc %f" % (cost, correct, total, accuracy))
def test(model_dir, dataSet):
    """Evaluate the checkpoint stored in ``model_dir`` and save the results.

    Reads ``<model_dir>/checkpoint.bin``, scores the requested dataset and
    writes two CSV files into ``model_dir``: ``test_preds.csv`` (the frame
    returned by ``rnn.costAndGrad``) and ``test_performance.csv`` (one row
    with cost, correct count, total count and accuracy).

    Args:
        model_dir: Directory that contains ``checkpoint.bin``; also receives
            the CSV outputs.
        dataSet: Dataset identifier forwarded to ``tr.loadTrees``.

    Raises:
        AssertionError: If ``model_dir`` is None.
    """
    # Check the argument first so a missing model path fails before the
    # trees are loaded.
    assert model_dir is not None, "Must give model to test"
    trees = tr.loadTrees(dataSet)

    # Pickle streams are bytes: binary mode is required on Python 3 and
    # avoids newline translation elsewhere.
    # NOTE(review): unpickling can execute arbitrary code; only load
    # checkpoints that come from a trusted source.
    with open(model_dir + "/checkpoint.bin", 'rb') as fid:
        opts = pickle.load(fid)
        _ = pickle.load(fid)  # second record is not needed for evaluation
        rnn = nnet.RNN(opts.wvecDim, opts.outputDim, opts.numWords,
                       opts.optimizer_settings['minibatch'])
        rnn.initParams()
        rnn.fromFile(fid)

    # NOTE(review): `log` is not bound inside this function, so it has to
    # exist at module level -- confirm, because `run` only assigns a local
    # variable of that name.
    log.info("Testing...")
    cost, correct, total, df = rnn.costAndGrad(trees, test=True)
    # Report NaN instead of dividing by zero when nothing was scored.
    accuracy = correct / float(total) if total else float('nan')

    # DataFrame.append was removed in pandas 2.0; it was a thin wrapper
    # around pd.concat, which is called directly here.
    total_df = pd.concat([pd.DataFrame(), df], ignore_index=True)
    total_df.to_csv(model_dir + "/test_preds.csv", header=True, index=False)

    row = {"Cost": cost, "Correct": correct, "Total": total,
           "Accuracy": accuracy}
    # Alphabetical column order matches what appending a dict to an empty
    # frame used to produce.
    test_performance = pd.DataFrame(
        [row], columns=["Accuracy", "Correct", "Cost", "Total"])
    test_performance.to_csv(model_dir + "/test_performance.csv",
                            header=True, index=False)

    log.info("Cost %f, Correct %d/%d, Acc %f" % (cost, correct, total, accuracy))
def _best_class(weights, bias, features):
    """Return ``[probability, index]`` of the most likely softmax class.

    The scores ``weights . features + bias`` are shifted by their maximum
    before exponentiation so the softmax cannot overflow.
    """
    scores = np.dot(weights, features) + bias
    scores -= np.max(scores)
    scores = np.exp(scores)
    scores = scores / np.sum(scores)
    winner = np.argmax(scores)
    return [scores[winner], winner]


def perform_evaluation(iterations=1000, model_dir=None, texts=None):
    """Label the phrases of raw sentences by randomised greedy tree building.

    For every sentence the function repeatedly (``iterations`` times) merges
    neighbouring nodes bottom-up with the loaded network, choosing at each
    level at random among the most confident candidate merges. The run with
    the largest summed confidence is kept and its (token, label) pairs are
    written to ``predictions.csv``. The frame threaded through
    ``load_vocab_index`` is written to ``unknown_words.csv``.

    Args:
        iterations: Number of randomised runs per sentence (at least 1).
        model_dir: Directory containing ``checkpoint.bin``. Defaults to the
            results directory that was previously hard-coded here.
        texts: Sentences to label. Defaults to the single example sentence
            that was previously hard-coded here.

    Raises:
        ValueError: If ``iterations`` is smaller than 1.
    """
    if iterations < 1:
        # np.argmax over an empty list of runs would raise the same error
        # type, but only after the sentence had already been processed.
        raise ValueError("iterations must be at least 1")
    if model_dir is None:
        model_dir = './results/new_results/results_account3/pos_tags/2020_07_23---15_44_766248'
    if texts is None:
        texts = [
            "If the content is not available in the detected system language, English is selected by default."
        ]

    top_k = 5               # number of highest-probability merges considered
    min_probability = 0.01  # candidates at or below this are ignored

    df_unknown = pd.DataFrame()
    prediction_frames = []

    # Pickle streams are bytes: binary mode is required on Python 3.
    # NOTE(review): unpickling can execute arbitrary code; only load
    # checkpoints that come from a trusted source.
    with open(model_dir + "/checkpoint.bin", 'rb') as fid:
        opts = pickle.load(fid)
        _ = pickle.load(fid)  # second record is not needed here
        rnn = nnet.RNN(opts.wvecDim, opts.outputDim, opts.numWords,
                       opts.optimizer_settings['minibatch'])
        rnn.initParams()
        rnn.fromFile(fid)
    rnn.L, rnn.V, rnn.W, rnn.b, rnn.Ws, rnn.bs = rnn.stack

    for text in texts:
        vocab_index, tokenized_sentence_original, df_unknown = load_vocab_index(
            text, df_unknown)

        # One embedding column per token.
        word_embeddings = np.zeros((opts.wvecDim, len(vocab_index)))
        for column, word_id in enumerate(vocab_index):
            word_embeddings[:, column] = rnn.L[:, word_id]

        # Per token: probability and index of its most likely class.
        probabilities = np.zeros((2, word_embeddings.shape[1]))
        for column in range(word_embeddings.shape[1]):
            probabilities[:, column] = _best_class(
                rnn.Ws, rnn.bs, word_embeddings[:, column])

        # Row layout of `start`, top to bottom:
        #   wvecDim embedding rows, best-class probability, best-class index,
        #   position of the token in the tokenised sentence.
        start = np.concatenate([word_embeddings, probabilities])
        token_index = np.arange(0, len(tokenized_sentence_original))
        token_index = token_index[:, None].T
        start = np.concatenate([start, token_index])

        num_tokens = start.shape[1]
        # A binary tree over n leaves has at most n - 1 merge levels.
        num_levels = num_tokens - 1
        num_runs = iterations

        # The cache has two extra rows that store the column indices of the
        # two children of every candidate merge.
        cache = np.zeros((start.shape[0] + 2, num_tokens))

        tokenized_sentences_array = []
        certainties_runs = []
        # Final working tensor of every run, stacked along the last axis.
        total_certainties = np.zeros(start.shape + (num_levels, num_runs))

        for h in range(0, num_runs):
            certainty = 0.0
            tokenized_sentence = list(tokenized_sentence_original)
            # One copy of the start matrix per tree level (third axis).
            working_tensor = np.repeat(start[:, :, np.newaxis], num_levels,
                                       axis=2)

            # i walks over the tree levels.
            for i in range(0, num_levels):
                cache = np.zeros_like(cache)

                # j is the left node of a candidate merge on this level.
                for j in range(0, num_levels):
                    if np.isnan(working_tensor[0, j, i]):
                        continue  # node was already merged away
                    # The right partner k is the next column that has not
                    # been merged away (NaN marks consumed columns).
                    k = j + 1
                    while np.isnan(
                            working_tensor[0, k, i]) and k < num_tokens - 1:
                        k = k + 1

                    lr = np.hstack(
                        [working_tensor[:-3, j, i], working_tensor[:-3, k, i]])
                    # Composition: affine part plus tensor part, then tanh.
                    hidden = np.dot(rnn.W, lr) + rnn.b
                    hidden += np.tensordot(rnn.V, np.outer(lr, lr),
                                           axes=([1, 2], [0, 1]))
                    cache[:-5, k] = np.tanh(hidden)

                    # Probability and label of the merged node.
                    cache[-5:-3, k] = _best_class(rnn.Ws, rnn.bs,
                                                  cache[:-5, k])
                    cache[-3, k], tokenized_sentence = create_new_token(
                        working_tensor[-1, j, i].astype(int),
                        working_tensor[-1, k, i].astype(int),
                        tokenized_sentence_in_method=tokenized_sentence)
                    # Remember which two columns this candidate merges.
                    cache[-2, k] = j
                    cache[-1, k] = k

                # Candidates: the top_k most probable merges of this level.
                candidate_columns = cache[-5, :].argsort()[-top_k:]
                several_candidates = np.count_nonzero(cache[-5, :]) > 1
                filtered_candidates = []
                for index in candidate_columns:
                    prediction = cache[-5, index]
                    # Skip a merge into the last column while other merges
                    # are still available.
                    if index == cache.shape[1] - 1 and several_candidates:
                        continue
                    if prediction > min_probability:
                        filtered_candidates.append(index)

                if filtered_candidates:
                    # Pick one of the remaining candidates at random.
                    random_index = np.random.choice(len(filtered_candidates),
                                                    1, replace=False)
                    best_cache_index = filtered_candidates[random_index[0]]
                    best_cache = cache[:, best_cache_index]
                    certainty += best_cache[-5]

                    # The merged node replaces the second child on every
                    # higher level ...
                    second_child_of_cache = best_cache[-1].astype(int)
                    best_cache_all_dims = np.repeat(
                        best_cache[:-2, np.newaxis],
                        working_tensor.shape[2] - i - 1, axis=1)
                    working_tensor[:, second_child_of_cache,
                                   i + 1:] = best_cache_all_dims
                    # ... and the first child is marked as consumed. Its
                    # column comes from the cache because NaN columns may
                    # lie between the two children.
                    first_child_of_cache = best_cache[-2].astype(int)
                    working_tensor[:, first_child_of_cache, i + 1:] = np.nan

            tokenized_sentences_array.append(tokenized_sentence)
            total_certainties[:, :, :, h] = working_tensor
            certainties_runs.append(certainty)

        # Keep the run whose chosen merges were most confident overall.
        best_run = np.argmax(certainties_runs)
        best_working_tensor = total_certainties[:, :, :, best_run]
        # Last two rows: label index and token position of every node.
        tensor_slice = best_working_tensor[-2:, :, :]
        label_and_token = np.reshape(tensor_slice, (2, -1))

        predictions = []
        seen_pairs = []
        for col in label_and_token.T:
            label = col[0]
            if np.isnan(col[1]):
                continue
            token = tokenized_sentences_array[best_run][col[1].astype(int)]
            pair = [token, label_names[label.astype(int)]]
            # The same node is repeated on every higher level; keep each
            # (token, label) pair once. The check used to compare the pair
            # against stored 3-element rows and therefore never matched.
            if pair not in seen_pairs:
                seen_pairs.append(pair)
                predictions.append(pair + [text])
        predictions.sort(key=lambda entry: len(entry[0]), reverse=False)
        prediction_frames.append(
            pd.DataFrame.from_records(predictions,
                                      columns=['token', 'label', 'sentence']))

    # DataFrame.append was removed in pandas 2.0; pd.concat is what it
    # called internally. The leading empty frame keeps the result defined
    # when `texts` is empty.
    df_preds = pd.concat([pd.DataFrame()] + prediction_frames)
    df_preds.to_csv("predictions.csv", header=True, index=False)
    df_unknown.to_csv("unknown_words.csv", header=True, index=False)
def run(args=None):
    """Command-line entry point: train a model or test a saved one.

    Training (default) creates a fresh timestamped directory below
    ``./results``, copies the parameter file into it, trains for up to
    ``params.num_epochs`` epochs with validation after every epoch, saves a
    checkpoint whenever the validation cost improves, stops early after
    ``params.num_epochs_early_stop`` epochs without improvement and finally
    writes ``train_val_costs.csv``. The best epoch is repeated as the last
    row of that file.

    With ``--test`` the model in ``--model_directory`` is evaluated on the
    dataset named by ``--data`` and nothing is trained.

    Args:
        args: Argument list for the option parser; ``None`` lets optparse
            read ``sys.argv[1:]``.
    """
    usage = "usage : %prog [options]"
    parser = optparse.OptionParser(usage=usage)

    parser.add_option("--test", action="store_true", dest="test",
                      default=False)
    # The params file holds the hyperparameters for training.
    parser.add_option('--params_file', dest="params_file",
                      default='./params/exp_params.json',
                      help="Path to the file containing the training settings")
    # NOTE(review): --data_dir is parsed but not used in this function.
    parser.add_option('--data_dir', dest="data_dir", default='./trees',
                      help="Directory containing the trees")
    # Directory containing the model to test.
    parser.add_option("--model_directory", dest="test_dir", type="string")
    parser.add_option("--data", dest="data", type="string", default="train")

    (opts, args) = parser.parse_args(args)

    if opts.test and opts.test_dir is None:
        # Without this check the failure surfaced later as an obscure error
        # inside os.path.join(None, ...).
        parser.error("--test requires --model_directory")

    results_dir = "./results"
    results_dir_current_job = None
    if not opts.test:
        results_dir_current_job = os.path.join(results_dir,
                                               utils.now_as_str_f())
        while os.path.isdir(results_dir_current_job):
            # Generate a new timestamp if the current one already exists.
            results_dir_current_job = os.path.join(results_dir,
                                                   utils.now_as_str_f())
        os.makedirs(results_dir_current_job)

    # Load training settings (e.g. hyperparameters).
    params = utils.Params(opts.params_file)

    if not opts.test:
        # Keep a copy of the settings next to the results.
        copyfile(opts.params_file,
                 os.path.join(results_dir_current_job,
                              os.path.basename(opts.params_file)))

    # Get the logger.
    if opts.test:
        log_path = os.path.join(opts.test_dir, 'testing.log')
    else:
        log_path = os.path.join(results_dir_current_job, 'training.log')
    log_level = params.log_level if hasattr(params, 'log_level') else logging.DEBUG
    log = utils.get_logger(log_path, log_level)

    # Testing
    if opts.test:
        log.info("Testing directory: " + opts.test_dir)
        log.info("Dataset used for testing: " + opts.data)
        test(opts.test_dir, opts.data)
        return

    log.info("Results directory: " + results_dir_current_job)
    log.info("Minibatch: " + str(params.optimizer_settings['minibatch']))
    log.info("Optimizer: " + params.optimizer)
    log.info("Epsilon: " + str(params.optimizer_settings['epsilon']))
    log.info("Alpha: " + str(params.optimizer_settings['alpha']))
    log.info("Number of samples used: " + str(params.sample_size))

    log.info("Loading data...")
    # Load training data.
    trees = tr.loadTrees(sample_size=params.sample_size)
    params.numWords = len(tr.loadWordMap())

    rnn = nnet.RNN(params.wvecDim, params.outputDim, params.numWords,
                   params.optimizer_settings['minibatch'])
    rnn.initParams()

    sgd = optimizer.SGD(rnn,
                        alpha=params.optimizer_settings['alpha'],
                        minibatch=params.optimizer_settings['minibatch'],
                        optimizer=params.optimizer,
                        epsilon=params.optimizer_settings['epsilon'])

    best_val_cost = float('inf')
    best_epoch = 0
    # Stays None when no epoch ever improves (e.g. zero epochs or NaN
    # costs); previously that case ended in a NameError after training.
    best_epoch_row = None
    # Rows are collected in a list and turned into a frame once, because
    # DataFrame.append was removed in pandas 2.0.
    epoch_rows = []

    for e in range(params.num_epochs):
        start = time.time()
        log.info("Running epoch %d" % e)
        df, updated_model, train_cost, train_acc = sgd.run(trees)
        end = time.time()
        log.info("Time per epoch : %f" % (end - start))
        log.info("Training accuracy : %f" % train_acc)

        # VALIDATION
        val_df, val_cost, val_acc = validate(updated_model,
                                             results_dir_current_job)
        row = {"epoch": e, "train_cost": train_cost, "val_cost": val_cost,
               "train_acc": train_acc, "val_acc": val_acc}

        if val_cost < best_val_cost:
            # Best validation cost we have seen so far.
            log.info("Validation score improved, saving model")
            best_val_cost = val_cost
            best_epoch = e
            best_epoch_row = dict(row)
            # Pickle streams are bytes: binary mode is required on Python 3.
            with open(results_dir_current_job + "/checkpoint.bin", 'wb') as fid:
                pickle.dump(params, fid)
                pickle.dump(sgd.costt, fid)
                rnn.toFile(fid)

        # NOTE(review): the validation file name contains a space after
        # "epoch_"; it is left unchanged in case other tooling expects it.
        val_df.to_csv(results_dir_current_job + "/validation_preds_epoch_ " + str(e) + ".csv",
                      header=True, index=False)
        df.to_csv(results_dir_current_job + "/training_preds_epoch_" + str(e) + ".csv",
                  header=True, index=False)

        epoch_rows.append(row)

        # Break if there was no validation improvement in the last epochs.
        if (e - best_epoch) >= params.num_epochs_early_stop:
            # This was `log.tinfo`, which is not a logger method and raised
            # AttributeError exactly when early stopping triggered.
            log.info("No improvement in the last {num_epochs_early_stop} epochs, stop training.".format(num_epochs_early_stop=params.num_epochs_early_stop))
            break

    if best_epoch_row is not None:
        epoch_rows.append(best_epoch_row)
    else:
        log.warning("Validation cost never improved; no checkpoint was saved.")

    # Alphabetical column order matches what appending dict rows to an
    # empty frame used to produce.
    overall_performance = pd.DataFrame(
        epoch_rows,
        columns=["epoch", "train_acc", "train_cost", "val_acc", "val_cost"])
    overall_performance.to_csv(results_dir_current_job + "/train_val_costs.csv",
                               header=True, index=False)
    log.info("Experiment end")
def run(args=None):
    """Command-line entry point: train the RNN or test a saved model.

    Without ``--test`` the model is trained for ``--epochs`` epochs and the
    checkpoint at ``--outFile`` is rewritten after every epoch. The
    checkpoint holds three records: the pickled options, ``sgd.costt`` and
    the parameters written by ``rnn.toFile``. With ``--test`` the model at
    ``--inFile`` is evaluated on the dataset named by ``--data``.

    Args:
        args: Argument list for the option parser; ``None`` lets optparse
            read ``sys.argv[1:]``.
    """
    usage = "usage : %prog [options]"
    parser = optparse.OptionParser(usage=usage)

    parser.add_option("--test", action="store_true", dest="test",
                      default=False)

    # Optimizer
    parser.add_option("--minibatch", dest="minibatch", type="int", default=30)
    parser.add_option("--optimizer", dest="optimizer", type="string",
                      default="adagrad")
    parser.add_option("--epochs", dest="epochs", type="int", default=50)
    parser.add_option("--step", dest="step", type="float", default=1e-2)

    parser.add_option("--outputDim", dest="outputDim", type="int", default=5)
    parser.add_option("--wvecDim", dest="wvecDim", type="int", default=30)
    parser.add_option("--outFile", dest="outFile", type="string",
                      default="models/test.bin")
    parser.add_option("--inFile", dest="inFile", type="string",
                      default="models/test.bin")
    parser.add_option("--data", dest="data", type="string", default="train")

    (opts, args) = parser.parse_args(args)

    # Testing
    if opts.test:
        test(opts.inFile, opts.data)
        return

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Loading data...")
    # Load training data.
    trees = tr.loadTrees()
    opts.numWords = len(tr.loadWordMap())

    rnn = nnet.RNN(opts.wvecDim, opts.outputDim, opts.numWords, opts.minibatch)
    rnn.initParams()

    sgd = optimizer.SGD(rnn, alpha=opts.step, minibatch=opts.minibatch,
                        optimizer=opts.optimizer)

    for e in range(opts.epochs):
        start = time.time()
        print("Running epoch %d" % e)
        sgd.run(trees)
        end = time.time()
        print("Time per epoch : %f" % (end - start))

        # Pickle streams are bytes: binary mode is required on Python 3
        # and avoids newline translation elsewhere.
        with open(opts.outFile, 'wb') as fid:
            pickle.dump(opts, fid)
            pickle.dump(sgd.costt, fid)
            rnn.toFile(fid)