def run(experiment_name, model_name, load_previous, learning_rate, batch_size,
        samples_per_epoch, epochs, print_after_batches, save_after_steps,
        learning_rate_decay, L1_reg, L2_reg, n_factors_emb, n_hidden,
        using_dropout, dropout_rate, loss_weights):
    print('Meta parameters:')
    print('experiment_name: ', experiment_name)
    print('learning_rate: ', learning_rate)
    print('batch_size: ', batch_size)
    print('samples_per_epoch: ', samples_per_epoch)
    print('epochs: ', epochs)
    print('n_factors_emb: ', n_factors_emb)
    print('n_hidden: ', n_hidden)
    print('using_dropout: ', using_dropout)
    print('dropout_rate: ', dropout_rate)
    print('loss_weights: ', loss_weights)
    print()

    start_time = time.process_time()

    experiment_name_prefix = "%s_" % experiment_name
    tmp_exp_name = experiment_name_prefix + "temp"
    final_exp_name = experiment_name_prefix + "final"
    e27_exp_name = experiment_name_prefix + "e27"

    with open(DATA_PATH + "description", 'rb') as data_file:
        description = pickle.load(data_file)
    print(description.keys())

    word_vocabulary = description['word_vocabulary']
    role_vocabulary = description['role_vocabulary']
    unk_word_id = description['NN_unk_word_id']
    unk_role_id = description['unk_role_id']
    missing_word_id = description['NN_missing_word_id']

    print(unk_word_id, unk_role_id, missing_word_id)
    print('... building the model')

    rng = np.random

    word_vocabulary['<NULL>'] = missing_word_id
    word_vocabulary['<UNKNOWN>'] = unk_word_id
    role_vocabulary['<UNKNOWN>'] = unk_role_id

    n_word_vocab = len(word_vocabulary)
    n_role_vocab = len(role_vocabulary)

    adagrad = Adagrad(lr=learning_rate, epsilon=1e-08, decay=0.0)

    if re.search('NNRF', model_name):
        model = NNRF(n_word_vocab, n_role_vocab, n_factors_emb, 512, n_hidden,
                     word_vocabulary, role_vocabulary,
                     unk_word_id, unk_role_id, missing_word_id,
                     using_dropout, dropout_rate,
                     optimizer=adagrad,
                     loss='sparse_categorical_crossentropy',
                     metrics=['accuracy'])
    else:
        model = eval(model_name)(n_word_vocab, n_role_vocab, n_factors_emb,
                                 n_hidden, word_vocabulary, role_vocabulary,
                                 unk_word_id, unk_role_id, missing_word_id,
                                 using_dropout, dropout_rate,
                                 optimizer=adagrad,
                                 loss='sparse_categorical_crossentropy',
                                 metrics=['accuracy'],
                                 loss_weights=loss_weights)
    # else:
    #     sys.exit('No such model!!!')

    model.summary()
    print(model.model.metrics_names)

    epoch = 0
    max_output_length = 0
    validation_cost_history = []
    best_validation_cost = np.inf
    best_epoch = 0

    valid_sample_size = config.OCT_VALID_SIZE
    test_sample_size = config.OCT_TEST_SIZE

    # Integer division: any trailing partial batch is dropped.
    train_steps = samples_per_epoch // batch_size
    valid_steps = valid_sample_size // batch_size
    test_steps = test_sample_size // batch_size

    # # DEBUG
    # valid_steps = 10
    # test_steps = 10

    save_after_steps = save_after_steps + 1

    training_verbose = 2
    max_q_size = 10
    workers = 2
    pickle_safe = True

    def thematic_fit_evaluation(model_name, experiment_name, model, print_result):
        result = dict()

        # Pado, McRae A0/A1/A2
        tempdict = dict()
        tempdict['pado'], _, _ = eval_pado_mcrae(model_name, experiment_name, 'pado', model, print_result)
        tempdict['mcrae'], _, _ = eval_pado_mcrae(model_name, experiment_name, 'mcrae', model, print_result)
        # tempdict['pado_fixed'] = eval_pado_mcrae(model_name, experiment_name, 'pado_fixed', model=model, print_result=False)
        # tempdict['mcrae_fixed'] = eval_pado_mcrae(model_name, experiment_name, 'mcrae_fixed', model=model, print_result=False)
        for k, v in tempdict.items():
            for sk, sv in v.items():
                result[k + '-' + sk] = sv

        r, _, _, _, _ = eval_MNR_LOC(model_name, experiment_name, 'AM-MNR', model, print_result, skip_header=True)
        result['mcrae-MNR'] = round(r, 4)
        r2, _, _, _, _ = eval_MNR_LOC(model_name, experiment_name, 'AM-LOC', model, print_result)
        result['mcrae-LOC'] = round(r2, 4)

        rho_obj, _, _, rho_fil, _, _, rho_gre, _, _ = eval_greenberg_all(
            model_name, experiment_name, model, print_result)
        result['GObject'] = round(rho_obj, 4)
        result['GFiller'] = round(rho_fil, 4)
        result['greenberg'] = round(rho_gre, 4)

        correct, _, acc = eval_bicknell_switch(
            model_name, experiment_name, 'bicknell', model, print_result, switch_test=False)
        result['bicknell'] = (acc, correct)

        correlation = eval_GS(model_name, experiment_name, 'GS2013data.txt', model, print_result)
        result['GS'] = round(correlation, 4)

        return result

    class CallbackContainer(Callback):
        """Callback that records events into a `History` object."""

        def on_train_begin(self, logs=None):
            self.epoch = []
            self.history = {}
            self.best_validation_cost = -1
            self.best_epoch = -1

        def on_epoch_begin(self, epoch, logs=None):
            self.epoch_start = time.process_time()

        def on_batch_end(self, batch, logs=None):
            batch_n = batch + 1
            epoch_n = len(self.epoch)
            if batch_n % print_after_batches == 0:
                elapsed_time = time.process_time() - self.epoch_start
                output = "batch %d; %d samples; %.1f sps; " % (
                    batch_n, batch_n * batch_size,
                    batch_n * batch_size / (elapsed_time + 1e-32))
                print(output)
            if batch_n % save_after_steps == 0:
                model.save(MODEL_PATH, tmp_exp_name, model_name, learning_rate,
                           self.history, self.best_validation_cost,
                           self.best_epoch, epoch_n)
                print("Temp model saved!")

        def on_epoch_end(self, epoch, logs=None):
            epoch_n = epoch + 1
            logs = logs or {}
            self.epoch.append(epoch_n)

            print('Validating...')
            valid_result = model.model.evaluate_generator(
                generator=generator(DATA_PATH + "NN_dev", model_name,
                                    unk_word_id, unk_role_id, missing_word_id,
                                    role_vocabulary, random=False,
                                    batch_size=batch_size),
                steps=valid_steps,
                max_q_size=1, workers=1, pickle_safe=False)
            print('validate_result', valid_result)
            for i, m in enumerate(model.model.metrics_names):
                logs['valid_' + m] = valid_result[i]

            # print(model.model.get_layer("softmax_word_output").get_weights()[1])
            result = thematic_fit_evaluation(model_name, experiment_name, model, False)
            for k, v in result.items():
                logs[k] = v
            # print(model.model.get_layer("softmax_word_output").get_weights()[1])

            for k, v in logs.items():
                self.history.setdefault(k, []).append(v)

            if epoch_n > 1 and self.history['valid_loss'][-1] < self.history['valid_loss'][-2]:
                print("Best model saved!")
                self.best_validation_cost = np.min(np.array(self.history['valid_loss']))
                self.best_epoch = np.argmin(np.array(self.history['valid_loss'])) + 1
                model.save(MODEL_PATH, final_exp_name, model_name, learning_rate,
                           self.history, self.best_validation_cost,
                           self.best_epoch, epoch_n)
                print('best_validation_cost, best_epoch, epoch_n',
                      self.best_validation_cost, self.best_epoch, epoch_n)

            for k, v in self.history.items():
                print(k, v)

            if epoch_n == 27:
                model.save(MODEL_PATH, e27_exp_name, model_name, learning_rate,
                           self.history, self.best_validation_cost,
                           self.best_epoch, epoch_n)

            model.save(MODEL_PATH, experiment_name, model_name, learning_rate,
                       self.history, self.best_validation_cost,
                       self.best_epoch, epoch_n)
            print("Current model saved!")

    callback_container = CallbackContainer()

    # Saves the backup model weights after each epoch if the validation loss decreased.
    # backup_checkpointer = ModelCheckpoint(filepath='backup_' + experiment_name + '.hdf5',
    #                                       verbose=1, save_best_only=True)
    stopper = EarlyStopping(monitor='valid_loss', min_delta=1e-3, patience=5, verbose=1)
    naNChecker = TerminateOnNaN()
    reduce_lr = ReduceLROnPlateau(monitor='valid_loss', factor=0.1, patience=3,
                                  min_lr=0.001)

    print('Training...')
    train_start = time.process_time()

    model.model.fit_generator(
        generator=generator(DATA_PATH + "NN_train", model_name, unk_word_id,
                            unk_role_id, missing_word_id, role_vocabulary,
                            random=True, rng=rng, batch_size=batch_size),
        steps_per_epoch=train_steps,
        epochs=epochs,
        verbose=training_verbose,
        workers=workers,
        max_q_size=max_q_size,
        pickle_safe=pickle_safe,
        callbacks=[callback_container, stopper, naNChecker, reduce_lr])

    print(callback_container.epoch)
    for k, v in callback_container.history.items():
        print(k, v)

    train_end = time.process_time()
    print('train and validate time: %f, sps: %f' %
          (train_end - train_start,
           train_steps * batch_size / (train_end - train_start)))

    print('Testing...')
    test_start = time.process_time()

    description_best = model_builder.load_description(MODEL_PATH, experiment_name)
    model.load(MODEL_PATH, experiment_name, description_best)

    test_result = model.model.evaluate_generator(
        generator=generator(DATA_PATH + "NN_test", model_name, unk_word_id,
                            unk_role_id, missing_word_id, role_vocabulary,
                            random=False, batch_size=batch_size),
        steps=test_steps,
        max_q_size=1, workers=1, pickle_safe=False)
    print('test_result', test_result)

    test_end = time.process_time()
    print('test time: %f, sps: %f' %
          (test_end - test_start,
           test_steps * batch_size / (test_end - test_start)))

    end_time = time.process_time()
    print("Total running time %.2fh" % ((end_time - start_time) / 3600.))
    print('Optimization complete. Best validation cost of %f obtained at epoch %i' %
          (callback_container.best_validation_cost, callback_container.best_epoch))
def evaluate(model_name, experiment_name, batch_size):
    MODEL_NAME = experiment_name
    repr_file = os.path.join(MODEL_PATH, 'confusionM_' + MODEL_NAME)

    description = model_builder.load_description(MODEL_PATH, MODEL_NAME)
    net = model_builder.build_model(model_name, description)
    net.load(MODEL_PATH, MODEL_NAME, description)

    n_roles = len(net.role_vocabulary)
    print(net.role_vocabulary)
    print("unk_word_id", net.unk_word_id)
    print("missing_word_id", net.missing_word_id)

    net.model.summary()
    print(net.model.metrics_names)

    reverse_role_vocabulary = utils.get_reverse_map(net.role_vocabulary)

    test_sample_size = config.OCT_TEST_SIZE
    test_steps = test_sample_size // batch_size

    # # DEBUG
    # test_steps = 10

    print('Testing...')
    test_start = time.process_time()

    # Always use a generator in Keras.
    if re.search('NAME_WHICH_YOU_NEED_OLD_BATCHER', experiment_name):
        test_gen = get_minibatch(DATA_PATH + "NN_test", net.unk_word_id,
                                 net.unk_role_id, net.missing_word_id,
                                 n_roles, random=False, batch_size=batch_size)
    else:
        test_gen = generator(DATA_PATH + "NN_test", model_name,
                             net.unk_word_id, net.unk_role_id,
                             net.missing_word_id, n_roles, random=False,
                             batch_size=batch_size)

    # Test the model.
    test_result = net.model.evaluate_generator(generator=test_gen,
                                               steps=test_steps,
                                               max_q_size=1,
                                               workers=1,
                                               pickle_safe=False)
    print('test_result', test_result)

    # Compute the confusion matrix.
    metrics_names = net.model.metrics_names
    result_dict = {x: 0 for x in metrics_names}
    batch_n = 0
    confusionM = np.zeros((n_roles, n_roles), dtype='int32')
    ppl_role_list = dict()
    ppl_role = dict()
    result_list = []

    for ([i_w, i_r, t_w, t_r], _) in generator(DATA_PATH + "NN_test", model_name,
                                               net.unk_word_id, net.unk_role_id,
                                               net.missing_word_id, n_roles,
                                               random=False, batch_size=batch_size):
        result_role = net.predict_role(i_w, i_r, t_w, t_r, batch_size)
        result_word_likelihood = net.predict(i_w, i_r, t_w, t_r, batch_size)[0]
        neg_log_likelihoods = -np.log(result_word_likelihood)

        # Collect the negative log-likelihood of each target word, per role.
        for i, row in enumerate(neg_log_likelihoods):
            target_word = t_w[i][0]
            target_role = t_r[i][0]
            neg_log_likelihood = row[target_word]
            ppl_role_list.setdefault(target_role, []).append(neg_log_likelihood)

        for i, true_r in enumerate(t_r):
            confusionM[true_r, result_role[i]] += 1
            if true_r == result_role[i]:
                result_list.append(1)

        batch_n += 1
        print(batch_n)
        if batch_n >= test_steps:
            break

    # Perplexity per role: exp of the mean negative log-likelihood.
    for k, v in ppl_role_list.items():
        neg_log_likelihood_role = np.mean(np.array(v))
        ppl_role[k] = np.exp(neg_log_likelihood_role)

    print("Confusion Matrix: ")
    print("  A0,  A1,  LOC, TMP, MNR, V, <UNKNOWN>")
    print(confusionM)
    np.savetxt('confusionM_' + experiment_name + '.csv', confusionM, delimiter=',')
    np.savetxt('result_list_' + experiment_name + '.csv', result_list, delimiter=',')

    stats(net, confusionM)

    print("Loss (neg_log_likelihood) by role: ")
    for r in ppl_role.keys():
        print(reverse_role_vocabulary[r], np.log(ppl_role[r]))

    print("PPL by role: ")
    for r in ppl_role.keys():
        print(reverse_role_vocabulary[r], ppl_role[r])

    with open(repr_file, 'w') as f_out:
        f_out.write('[')
        for i in range(n_roles):
            f_out.write('[')
            for j in range(n_roles):
                f_out.write(str(confusionM[i][j]) + ", ")
            f_out.write('] \n')
        f_out.write(']')

    test_end = time.process_time()
    print('test time: %f, sps: %f' %
          (test_end - test_start,
           test_steps * batch_size / (test_end - test_start)))
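
# A minimal, self-contained sketch of the per-role perplexity computed above:
# perplexity is exp of the mean negative log-likelihood of the target words.
# The numbers here are invented for illustration.
def _perplexity_by_role_sketch():
    import numpy as np
    # role id -> negative log-likelihoods of its target words
    ppl_role_list = {0: [2.1, 1.7, 2.5], 1: [0.9, 1.1]}
    # e.g. exp(mean([2.1, 1.7, 2.5])) = exp(2.1) for role 0
    return {r: np.exp(np.mean(np.array(v))) for r, v in ppl_role_list.items()}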
def eval_GS(model_name, experiment_name, eval_file_name, model=None,
            print_result=True, verb_baseline=False):
    MODEL_NAME = experiment_name
    eval_file = os.path.join(EVAL_PATH, eval_file_name)
    result_file = os.path.join(MODEL_PATH, MODEL_NAME + '_' + eval_file_name)

    if model:
        net = model
    else:
        description = model_builder.load_description(MODEL_PATH, MODEL_NAME)
        net = model_builder.build_model(model_name, description)
        net.load(MODEL_PATH, MODEL_NAME, description)

    sent_layer = 'context_embedding'
    sent_model = Model(inputs=net.model.input,
                       outputs=net.model.get_layer(sent_layer).output)
    # if print_result:
    #     sent_model.summary()

    n_input_length = len(net.role_vocabulary) - 1
    print(net.role_vocabulary)

    scores = []
    similarities = []
    original_sim_f = []
    similarities_f = []
    lo_similarities = []
    hi_similarities = []
    records = []

    print("Embedding: " + experiment_name)
    print("=" * 60)
    print("\n")
    print("sentence1\tsentence2\taverage_score\tembedding_cosine")
    print("-" * 60)

    with open(eval_file, 'r') as f, \
            open(result_file, 'w') as f_out:
        first = True
        for line in f:
            # Skip the header.
            if first:
                first = False
                continue

            s = line.split()
            sentence = " ".join(s[1:5])
            score = float(s[5])
            hilo = s[6].upper()

            # verb subject object landmark
            # A1 - object; A0 - subject
            V1, A0, A1, V2 = sentence.split()
            V1 = wnl.lemmatize(V1, wn.VERB)
            A0 = wnl.lemmatize(A0, wn.NOUN)
            A1 = wnl.lemmatize(A1, wn.NOUN)
            V2 = wnl.lemmatize(V2, wn.VERB)

            V1_i = net.word_vocabulary.get(V1, net.unk_word_id)
            A0_i = net.word_vocabulary.get(A0, net.unk_word_id)
            A1_i = net.word_vocabulary.get(A1, net.unk_word_id)
            V2_i = net.word_vocabulary.get(V2, net.unk_word_id)

            # if np.array([V1_i, A0_i, A1_i, V2_i]).any() == net.unk_word_id:
            #     print('OOV: ', A0, A1, V1, V2)

            V_ri = net.role_vocabulary['V']
            A0_ri = net.role_vocabulary['A0']
            A1_ri = net.role_vocabulary['A1']

            # Fill every role with the missing word, then drop the unknown role slot.
            sent1_x = dict((r, net.missing_word_id) for r in net.role_vocabulary.values())
            sent2_x = dict((r, net.missing_word_id) for r in net.role_vocabulary.values())
            sent1_x.pop(n_input_length)
            sent2_x.pop(n_input_length)

            sent1_x[V_ri] = V1_i
            sent2_x[V_ri] = V2_i

            if not verb_baseline:
                sent1_x[A0_ri] = A0_i
                sent1_x[A1_ri] = A1_i
                sent2_x[A0_ri] = A0_i
                sent2_x[A1_ri] = A1_i

            zeroA = np.array([0])
            s1_w = np.array(list(sent1_x.values())).reshape((1, n_input_length))
            s1_r = np.array(list(sent1_x.keys())).reshape((1, n_input_length))
            s2_w = np.array(list(sent2_x.values())).reshape((1, n_input_length))
            s2_r = np.array(list(sent2_x.keys())).reshape((1, n_input_length))

            if re.search('NNRF', model_name):
                sent1_emb = sent_model.predict([s1_w, s1_r, zeroA])
                sent2_emb = sent_model.predict([s2_w, s2_r, zeroA])
            else:
                sent1_emb = sent_model.predict([s1_w, s1_r, zeroA, zeroA])
                sent2_emb = sent_model.predict([s2_w, s2_r, zeroA, zeroA])

            # Baseline
            # sent1_emb = V1_i
            # sent2_emb = V2_i

            # Compositional
            # sent1_emb = V1_i + A0_i + A1_i
            # sent2_emb = V2_i + A0_i + A1_i
            # sent1_emb = V1_i * A0_i * A1_i
            # sent2_emb = V2_i * A0_i * A1_i

            # Convert cosine distance to cosine similarity.
            similarity = 1.0 - cosine(sent1_emb, sent2_emb)

            if hilo == "HIGH":
                hi_similarities.append(similarity)
            elif hilo == "LOW":
                lo_similarities.append(similarity)
            else:
                raise Exception("Unknown hilo value %s" % hilo)

            if (V1, A0, A1, V2) not in records:
                records.append((V1, A0, A1, V2))
                # print("\"%s %s %s\"\t\"%s %s %s\"\t%.2f\t%.2f \n" % (A0, V1, A1, A0, V2, A1, score, similarity))
                scores.append(score)
                similarities.append(similarity)
                f_out.write("\"%s %s %s\"\t\"%s %s %s\"\t %.2f \t %.2f \n" %
                            (A0, V1, A1, A0, V2, A1, score, similarity))

    print("-" * 60)

    correlation, pvalue = spearmanr(scores, similarities)

    if print_result:
        print("Total number of samples: %d" % len(scores))
        print("Spearman correlation: %.4f; 2-tailed p-value: %.10f" % (correlation, pvalue))
        print("High: %.2f; Low: %.2f" % (np.mean(hi_similarities), np.mean(lo_similarities)))

    # import pylab
    # pylab.scatter(scores, similarities)
    # pylab.show()

    return correlation
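
# A minimal, self-contained sketch of the scoring used in eval_GS: cosine
# similarity between two sentence embeddings (1 - cosine distance), then a
# Spearman correlation against human ratings. The vectors and ratings below
# are invented for illustration.
def _gs_scoring_sketch():
    import numpy as np
    from scipy.spatial.distance import cosine
    from scipy.stats import spearmanr
    emb_a = np.array([0.2, 0.7, 0.1])
    emb_b = np.array([0.3, 0.6, 0.2])
    similarity = 1.0 - cosine(emb_a, emb_b)  # distance -> similarity
    human_scores = [6.2, 1.3, 4.8]
    model_similarities = [0.91, 0.12, 0.55]
    rho, pvalue = spearmanr(human_scores, model_similarities)
    return similarity, rho, pvalue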
def eval_bicknell_switch(model_name, experiment_name, evaluation, model=None,
                         print_result=True, switch_test=False):
    MODEL_NAME = experiment_name

    if model:
        net = model
    else:
        description = model_builder.load_description(MODEL_PATH, MODEL_NAME)
        net = model_builder.build_model(model_name, description)
        net.load(MODEL_PATH, MODEL_NAME, description)

    bias = net.set_0_bias()

    if print_result:
        print(net.role_vocabulary)

    eval_data_file = os.path.join(RF_EVAL_PATH, evaluation + '.txt')
    result_file = os.path.join(MODEL_PATH, MODEL_NAME + '_' + evaluation + '.txt')

    probs = []
    baseline = []
    oov_count = 0

    if print_result:
        print(eval_data_file)
        print("=" * 60)

    dataset = numpy.genfromtxt(eval_data_file, dtype=str, delimiter='\t',
                               usecols=[0, 1, 2, 3, 4])

    samples = []

    # Rows come in pairs: one plausible ('yes') and one implausible ('no')
    # A1 filler for the same A0/V context.
    i = 0
    while True:
        d = dataset[i]
        d2 = dataset[i + 1]

        A0 = d[0][:-2]
        V = d[1][:-2]

        assert d2[0][:-2] == A0
        assert d2[1][:-2] == V

        if d[3] == 'yes':
            assert d2[3] == 'no'
            A1_correct = d[2][:-2]
            A1_incorrect = d2[2][:-2]
            b_correct = d[4]
            b_incorrect = d2[4]
        else:
            assert d[3] == 'no'
            A1_correct = d2[2][:-2]
            A1_incorrect = d[2][:-2]
            b_correct = d2[4]
            b_incorrect = d[4]

        if A1_correct not in net.word_vocabulary or A1_incorrect not in net.word_vocabulary:
            if A1_correct not in net.word_vocabulary and print_result:
                print("%s MISSING FROM VOCABULARY. SKIPPING..." % A1_correct)
            if A1_incorrect not in net.word_vocabulary and print_result:
                print("%s MISSING FROM VOCABULARY. SKIPPING..." % A1_incorrect)
        else:
            roles = list(net.role_vocabulary.values())
            roles.remove(net.unk_role_id)

            input_roles_words = dict((r, net.missing_word_id) for r in roles)
            input_roles_words[net.role_vocabulary["A0"]] = utils.input_word_index(
                net.word_vocabulary, A0, net.unk_word_id, warn_unk=True)
            input_roles_words[net.role_vocabulary["V"]] = utils.input_word_index(
                net.word_vocabulary, V, net.unk_word_id, warn_unk=True)

            sample = (
                numpy.asarray([list(input_roles_words.values()),
                               list(input_roles_words.values())],
                              dtype=numpy.int64),  # x_w_i
                numpy.asarray([list(input_roles_words.keys()),
                               list(input_roles_words.keys())],
                              dtype=numpy.int64),  # x_r_i
                numpy.asarray([net.word_vocabulary[A1_correct],
                               net.word_vocabulary[A1_incorrect]],
                              dtype=numpy.int64),  # y_i (1st is correct, 2nd is incorrect)
                numpy.asarray([net.role_vocabulary["A1"],
                               net.role_vocabulary["A1"]],
                              dtype=numpy.int64),  # y_r_i
                [b_correct, b_incorrect],  # Bicknell scores
                "\"" + A0 + " " + V + "\"",  # context
                [A1_correct, A1_incorrect],
            )
            samples.append(sample)

        i += 2
        if i > len(dataset) - 2:
            break

    num_samples = len(samples)
    num_correct = 0
    num_total = 0

    if print_result:
        print("context", "correct", "incorrect", "P(correct)", "P(incorrect)",
              "bicknell_correct", "bicknell_incorrect")

    result_list = []

    for x_w_i, x_r_i, y_w_i, y_r_i, bicknell, context, a1 in samples:
        p = net.p_words(x_w_i, x_r_i, y_w_i, y_r_i)
        p_correct = p[0]
        p_incorrect = p[1]

        if print_result:
            print(context, a1[0], a1[1], p_correct, p_incorrect,
                  bicknell[0], bicknell[1])

        if p_correct > p_incorrect:
            result_list.append(1)
        else:
            result_list.append(0)

        num_correct += p_correct > p_incorrect
        num_total += 1

    assert num_total == num_samples

    accuracy = float(num_correct) / float(num_samples)

    if print_result:
        print("Number of lines %d" % num_samples)
        print("Baseline Lenci11 is 43/64=0.671875")
        print("Final score of theano model is %d/%d=%.6f" %
              (num_correct, num_samples, accuracy))
        print(result_list)

    if switch_test and print_result:
        print("\nSwitch A0/A1 TEST")

        input_words = []
        input_roles = []
        for i in range(1):
            roles = list(net.role_vocabulary.values())
            print(net.unk_role_id)
            roles.remove(net.unk_role_id)

            input_role_word_pairs = dict((r, net.missing_word_id) for r in roles)
            input_role_word_pairs[net.role_vocabulary["V"]] = utils.input_word_index(
                net.word_vocabulary, "buy", net.unk_word_id, warn_unk=True)

            input_words.append(list(input_role_word_pairs.values()))
            input_roles.append(list(input_role_word_pairs.keys()))

        man = utils.input_word_index(net.word_vocabulary, "man",
                                     net.unk_word_id, warn_unk=True)
        car = utils.input_word_index(net.word_vocabulary, "car",
                                     net.unk_word_id, warn_unk=True)

        a1 = net.role_vocabulary["A1"]
        a0 = net.role_vocabulary["A0"]

        a0_test = (
            numpy.asarray(input_words, dtype=numpy.int64),
            numpy.asarray(input_roles, dtype=numpy.int64),
            numpy.asarray([man, car], dtype=numpy.int64),
            numpy.asarray([a0], dtype=numpy.int64),
        )
        x_w_i, x_r_i, y_w_i, y_r_i = a0_test
        p0 = net.p_words(x_w_i, x_r_i, y_w_i, y_r_i)
        print(p0)

        a1_test = (
            numpy.asarray(input_words, dtype=numpy.int64),
            numpy.asarray(input_roles, dtype=numpy.int64),
            numpy.asarray([man, car], dtype=numpy.int64),
            numpy.asarray([a1], dtype=numpy.int64),
        )
        x_w_i, x_r_i, y_w_i, y_r_i = a1_test
        p1 = net.p_words(x_w_i, x_r_i, y_w_i, y_r_i)
        print(p1)

        print("man buy", p0[0])
        print("buy man", p1[0])
        print("car buy", p0[1])
        print("buy car", p1[1])

    net.set_bias(bias)

    return num_correct, num_samples, accuracy
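
# A minimal, self-contained sketch of the Bicknell scoring rule used above:
# each item pairs a plausible and an implausible A1 filler, and the model is
# credited when it assigns the plausible filler the higher probability.
# The probabilities below are invented for illustration.
def _pairwise_accuracy_sketch():
    pairs = [(0.031, 0.004), (0.012, 0.019), (0.270, 0.090)]  # (p_correct, p_incorrect)
    num_correct = sum(1 for p_correct, p_incorrect in pairs if p_correct > p_incorrect)
    return float(num_correct) / len(pairs)  # 2/3 here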
def evaluate(model_name, experiment_name, test_name, batch_size,
             VR_SP_SRL=True, bootstrapping=False, majority_baseline=False):
    MODEL_NAME = experiment_name
    # repr_file = os.path.join(MODEL_PATH, 'confusionM_' + MODEL_NAME)

    description = model_builder.load_description(MODEL_PATH, MODEL_NAME)
    net = model_builder.build_model(model_name, description)
    net.load(MODEL_PATH, MODEL_NAME, description)

    n_roles = len(net.role_vocabulary)
    reverse_word_vocabulary = utils.get_reverse_map(net.word_vocabulary)
    reverse_role_vocabulary = utils.get_reverse_map(net.role_vocabulary)

    # net.set_0_bias()

    print(net.role_vocabulary)
    print("unk_word_id", net.unk_word_id)
    print("missing_word_id", net.missing_word_id)

    net.model.summary()
    # print(net.model.metrics_names)

    # Count the evaluation samples.
    test_sample_size = 0
    with open(EVAL_PATH + test_name, 'r') as lines:
        for l in lines:
            test_sample_size += 1
    print(test_sample_size)

    # Float division keeps the final partial batch in the loop below.
    test_steps = test_sample_size / float(batch_size)
    # test_steps = test_sample_size

    # # DEBUG
    # test_steps = 10

    print('Testing ' + test_name + ' ...')
    print('VR_SP_SRL: ' + str(VR_SP_SRL))
    test_start = time.process_time()

    # if re.search('NNRF_1e8', experiment_name) or re.search('MTRF_dev', experiment_name):
    #     test_gen = get_minibatch(DATA_PATH + "NN_test", net.unk_word_id, net.unk_role_id,
    #                              net.missing_word_id, n_roles, random=False, batch_size=batch_size)
    # else:
    #     test_gen = generator(DATA_PATH + "NN_test", model_name, net.unk_word_id, net.unk_role_id,
    #                          net.missing_word_id, n_roles, random=False, batch_size=batch_size)
    # # Test the model
    # test_result = net.model.evaluate_generator(
    #     generator=test_gen,
    #     steps=test_steps,
    #     max_q_size=1,
    #     workers=1,
    #     pickle_safe=False
    # )
    # print('test_result', test_result)

    # Compute the confusion matrix.
    metrics_names = net.model.metrics_names
    result_dict = {x: 0 for x in metrics_names}
    batch_n = 0
    confusionM = np.zeros((n_roles, n_roles), dtype='int32')
    ppl_role_list = dict()
    result_list = []
    output_list = []

    for ([i_w, i_r, t_w, t_r], _) in data_gen(EVAL_PATH + test_name, model_name,
                                              net, batch_size, VR_SP_SRL=VR_SP_SRL):
        # zeros = np.zeros(t_r.shape)
        result_role = net.predict_role(i_w, i_r, t_w, t_r, batch_size)

        # word_emb, avg_emb, event_emb = net.avg_emb.predict([i_w, i_r, t_w, t_r], batch_size)
        # print(word_emb.shape, avg_emb.shape, event_emb.shape)
        # assert np.multiply(word_emb[0][0], avg_emb[0])[0] == event_emb[0][0][0]
        # assert np.multiply(word_emb[0][0], avg_emb[0])[1] == event_emb[0][0][1]

        # test role prediction of MTRF_dev; result: role prediction is useless
        # print(i_r)
        # print(t_r.reshape(-1))
        # print(result_role)

        # result_word_likelihood = net.predict(i_w, i_r, t_w, t_r, batch_size)[0]
        # neg_log_likelihoods = -np.log(result_word_likelihood)
        # for i, row in enumerate(neg_log_likelihoods, start=0):
        #     target_word = t_w[i][0]
        #     target_role = t_r[i][0]
        #     neg_log_likelihood = row[target_word]
        #     ppl_role_list.setdefault(target_role, []).append(neg_log_likelihood)

        # print(i_w, i_r, t_w, t_r)
        for i, true_r in enumerate(t_r):
            # if reverse_role_vocabulary.get(t_r[0][0], '<unknown>') == 'AM-LOC':
            #     print("input words", [reverse_word_vocabulary.get(w, '<unknown>') for w in i_w[0]])
            #     print("input roles", [reverse_role_vocabulary.get(r, '<unknown>') for r in i_r[0]])
            #     print("target word", [reverse_word_vocabulary.get(w, '<unknown>') for w in t_w[0]])
            #     print("target role", [reverse_role_vocabulary.get(r, '<unknown>') for r in t_r[0]])
            #     print("predicted role", [reverse_role_vocabulary.get(result_role[i], '<unknown>') for r in t_r[0]])
            confusionM[true_r, result_role[i]] += 1
            if true_r == result_role[i]:
                result_list.append(1)
            output_list.append((true_r, result_role[i]))

        batch_n += 1
        if batch_n % 100 == 0:
            print(batch_n)
        if batch_n >= test_steps:
            break

    # ppl_role = dict()
    # for k, v in ppl_role_list.items():
    #     neg_log_likelihood_role = np.mean(np.array(v))
    #     ppl_role[k] = np.exp(neg_log_likelihood_role)

    # Obtain the ZeroR (majority-class) baseline: predict role 1 for everything.
    print(confusionM)
    majority = 1
    if majority_baseline:
        for i in range(7):
            confusionM[i][majority] = confusionM[i][:].sum()
            confusionM[i][majority - 1] = 0
            confusionM[i][majority + 1:] = 0
        print(confusionM)

    dir_P, dir_R, dir_F1, precision, recall, F1 = stats(net, confusionM)
    print("Dir: %.2f \t %.2f \t %.2f" % (dir_P, dir_R, dir_F1))

    # np.savetxt('confusionM_' + experiment_name + '.' + test_name.strip('.dat') + '.csv',
    #            confusionM, delimiter=',')
    # np.savetxt('output_' + experiment_name + '.' + test_name.strip('.dat') + '.csv',
    #            output_list, delimiter=',')

    # with open(repr_file, 'w') as f_out:
    #     f_out.write('[')
    #     for i in range(n_roles):
    #         f_out.write('[')
    #         for j in range(n_roles):
    #             f_out.write(str(confusionM[i][j]) + ", ")
    #         f_out.write('] \n')
    #     f_out.write(']')

    # print("Loss(neg_log_likelihood) by role: ")
    # for r in ppl_role.keys():
    #     print(reverse_role_vocabulary[r], np.log(ppl_role[r]))

    print("Result by role: ")
    for r in range(len(precision)):
        print('%s: \t %.2f \t %.2f \t %.2f' %
              (reverse_role_vocabulary[r], precision[r], recall[r], F1[r]))

    test_end = time.process_time()
    print('test time: %f, sps: %f' %
          (test_end - test_start,
           test_steps * batch_size / (test_end - test_start)))

    if bootstrapping:
        P_mean, P_std, R_mean, R_std, F1_mean, F1_std = bootstrap(
            experiment_name, test_name, net, n_roles, output_list=output_list)
        return P_mean, P_std, R_mean, R_std, F1_mean, F1_std
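
# A minimal, self-contained sketch of per-role precision/recall/F1 from a
# confusion matrix with true roles as rows and predicted roles as columns
# (presumably what stats() reports; this standalone version is for
# illustration only).
def _confusion_stats_sketch(confusionM):
    import numpy as np
    confusionM = np.asarray(confusionM, dtype=float)
    tp = np.diag(confusionM)
    precision = tp / np.maximum(confusionM.sum(axis=0), 1e-12)  # per predicted role
    recall = tp / np.maximum(confusionM.sum(axis=1), 1e-12)     # per true role
    F1 = 2 * precision * recall / np.maximum(precision + recall, 1e-12)
    return precision, recall, F1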
def query(model_name, experiment_name, inputs, target):
    MODEL_NAME = experiment_name

    description = model_builder.load_description(MODEL_PATH, MODEL_NAME)
    net = model_builder.build_model(model_name, description)
    net.load(MODEL_PATH, MODEL_NAME, description)

    # net.model.summary()
    # print(net.model.get_layer(name="embedding_2").get_weights()[0])

    print(net.role_vocabulary)
    print("unk_word_id", net.unk_word_id)
    print("missing_word_id", net.missing_word_id)

    # net.set_0_bias()
    net.model.summary()

    propbank_map = {
        "subj": "A0",
        "obj": "A1",
        "ARG0": "A0",
        "ARG1": "A1",
        "ARG2": "A2",
    }
    # tr_map = {
    #     "A0": numpy.asarray([[net.role_vocabulary["A0"]]], dtype=numpy.int64),
    #     "A1": numpy.asarray([[net.role_vocabulary["A1"]]], dtype=numpy.int64),
    #     "A2": numpy.asarray([[net.role_vocabulary["<UNKNOWN>"]]], dtype=numpy.int64)
    # }
    # net.word_vocabulary["<NOTHING>"] = net.missing_word_id
    # net.role_vocabulary["<UNKNOWN>"] = net.unk_role_id

    reverse_vocabulary = utils.get_reverse_map(net.word_vocabulary)
    reverse_role_vocabulary = utils.get_reverse_map(net.role_vocabulary)

    print(reverse_role_vocabulary)

    # Initialise every role with the missing word, then overwrite with the inputs.
    raw_words = dict((reverse_role_vocabulary[r],
                      reverse_vocabulary[net.missing_word_id])
                     for r in net.role_vocabulary.values())
    raw_words.update(inputs)
    assert len(raw_words) == len(net.role_vocabulary)

    t_r = [net.role_vocabulary.get(r, net.unk_role_id) for r in target.keys()]
    t_w = [net.word_vocabulary.get(w, net.unk_word_id) for w in target.values()]

    input_roles_words = {}
    for r, w in raw_words.items():
        input_roles_words[net.role_vocabulary[r]] = utils.input_word_index(
            net.word_vocabulary, w, net.unk_word_id, warn_unk=True)

    print(input_roles_words, t_r)

    # The target role itself is not an input.
    input_roles_words.pop(t_r[0])

    # default_roles_words = dict((r, net.missing_word_id) for r in net.role_vocabulary.values())
    # default_roles_words.update(input_roles_words)
    # input_roles_words = default_roles_words

    x_w_i = numpy.asarray([list(input_roles_words.values())], dtype=numpy.int64)
    x_r_i = numpy.asarray([list(input_roles_words.keys())], dtype=numpy.int64)
    y_w_i = numpy.asarray(t_w, dtype=numpy.int64)
    y_r_i = numpy.asarray(t_r, dtype=numpy.int64)

    topN = 20
    predicted_word_indices = net.top_words(x_w_i, x_r_i, y_w_i, y_r_i, topN)

    print(x_w_i, x_r_i, y_w_i, y_r_i)

    p_w = net.p_words(x_w_i, x_r_i, y_w_i, y_r_i, batch_size=1, verbose=0)[0]
    print('p_t_w: ', p_w)

    resultlist = predicted_word_indices
    for i, t_w_i in enumerate(resultlist):
        y_w_i = numpy.asarray([t_w_i], dtype=numpy.int64)
        p = net.p_words(x_w_i, x_r_i, y_w_i, y_r_i, batch_size=1, verbose=0)[0]
        # Render the probability as a text bar: one full block per 1%,
        # plus a half block for the remaining 0.5%.
        n = numpy.round(p / 0.005)
        fb = numpy.floor(n / 2)
        hb = n % 2
        print(u"{:<5} {:7.6f} {:<20} ".format(i + 1, float(p),
                                              reverse_vocabulary[int(t_w_i)])
              + u"\u2588" * int(fb) + u"\u258C" * int(hb))
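
# A hypothetical invocation of query(), kept commented out so the module has
# no import-time side effects. The model and experiment names are placeholders,
# and the PropBank-style keys must exist in the loaded role vocabulary; inputs
# and target map roles to filler words, as the function body above expects.
# query('MTRF', 'MTRF_example_experiment',
#       inputs={'A0': 'man', 'V': 'eat'},
#       target={'A1': 'apple'})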
def eval_pado_mcrae(model_name, experiment_name, evaluation, model=None,
                    print_result=True):
    MODEL_NAME = experiment_name

    if model:
        net = model
    else:
        description = model_builder.load_description(MODEL_PATH, MODEL_NAME)
        net = model_builder.build_model(model_name, description)
        net.load(MODEL_PATH, MODEL_NAME, description)

    bias = net.set_0_bias()

    # net.model.summary()
    # print(net.model.get_layer(name="embedding_2").get_weights()[0])

    # If there is no <UNKNOWN> role in the role vocabulary, add it.
    if net.role_vocabulary.get("<UNKNOWN>", -1) == -1:
        net.role_vocabulary["<UNKNOWN>"] = len(net.role_vocabulary) - 1

    if print_result:
        print(net.role_vocabulary)
        print("unk_word_id", net.unk_word_id)
        print("missing_word_id", net.missing_word_id)

    propbank_map = {
        "subj": "A0",
        "obj": "A1",
        "ARG0": "A0",
        "ARG1": "A1",
    }
    tr_map = {
        "A0": numpy.asarray([net.role_vocabulary["A0"]], dtype=numpy.int64),
        "A1": numpy.asarray([net.role_vocabulary["A1"]], dtype=numpy.int64),
        "<UNKNOWN>": numpy.asarray([net.role_vocabulary["<UNKNOWN>"]], dtype=numpy.int64),
    }
    if "A2" not in net.role_vocabulary.keys():
        propbank_map["ARG2"] = "<UNKNOWN>"
        tr_map["A2"] = numpy.asarray([net.role_vocabulary["<UNKNOWN>"]], dtype=numpy.int64)
    else:
        propbank_map["ARG2"] = "A2"
        tr_map["A2"] = numpy.asarray([net.role_vocabulary["A2"]], dtype=numpy.int64)

    fixed = False
    if evaluation == "pado":
        eval_data_file = os.path.join(EVAL_PATH, 'pado_plausibility_pb.txt')
    elif evaluation == 'mcrae':
        eval_data_file = os.path.join(EVAL_PATH, 'mcrae_agent_patient_more.txt')
    else:
        fixed = True
        if evaluation == 'pado_fixed':
            eval_data_file = os.path.join(RV_EVAL_PATH, 'Pado-AsadFixes.txt')
        elif evaluation == 'mcrae_fixed':
            eval_data_file = os.path.join(RV_EVAL_PATH, 'McRaeNN-fixed.txt')
        else:
            eval_data_file = os.path.join(COMP_EVAL_PATH, 'compare-pado.txt')

    result_file = os.path.join(MODEL_PATH, MODEL_NAME + '_' + evaluation + '.txt')

    r_i = net.role_vocabulary["V"]

    probs = {}
    baseline = {}
    oov_count = {}
    blist = []
    plist = []

    if print_result:
        print(eval_data_file)
        print("=" * 60)

    with open(eval_data_file, 'r') as f, \
            open(result_file, 'w') as f_out:
        for i, line in enumerate(f):
            line = line.strip()
            if line == "":
                continue

            w, tw, tr = line.split()[:3]  # input word, target word, role
            w = w[:-2] if fixed else w
            tw = tw[:-2] if fixed else tw
            w = wnl.lemmatize(w, wn.VERB)
            tw = wnl.lemmatize(tw, wn.NOUN)

            w_i = net.word_vocabulary.get(w, net.unk_word_id)
            tw_i = net.word_vocabulary.get(tw, net.unk_word_id)
            tr_i = net.role_vocabulary.get(propbank_map[tr], net.unk_role_id)

            if tw_i == net.unk_word_id:
                print(w, tr, tw)
                oov_count[tr] = oov_count.get(tr, 0) + 1
                f_out.write(line + "\tnan\n")
                continue

            b = float(line.split()[3])
            baseline.setdefault(tr, []).append(b)
            blist.append(b)

            sample = dict((r, net.missing_word_id)
                          for r in list(net.role_vocabulary.values()) + [net.unk_role_id])
            sample[r_i] = w_i
            sample.pop(net.role_vocabulary[propbank_map[tr]], None)

            x_w_i = numpy.asarray([list(sample.values())], dtype=numpy.int64)
            x_r_i = numpy.asarray([list(sample.keys())], dtype=numpy.int64)
            y_w_i = numpy.asarray([tw_i])
            y_r_i = tr_map[propbank_map[tr]]

            s = net.p_words(x_w_i, x_r_i, y_w_i, y_r_i)
            # pr = net.p_roles(x_w_i, x_r_i, y_w_i, y_r_i)

            plist.append(s)
            probs.setdefault(tr, []).append(s)

            f_out.write(line + "\t%s\n" % s)

    result = dict()
    for r, b in baseline.items():
        p = probs[r]
        rho, p_value = spearmanr(b, p)
        rating = len(p)
        oov = oov_count.get(r, 0)
        result[r] = round(rho, 4)
        if print_result:
            print("=" * 60)
            print("ROLE: %s" % r)
            print("-" * 60)
            print("Spearman correlation: %f; 2-tailed p-value: %f" % (rho, p_value))
            print("Num ratings: %d (%d out of vocabulary)" % (rating, oov))

    rho, p_value = spearmanr(blist, plist)
    result['all'] = round(rho, 4)
    if print_result:
        print("Spearman correlation of %s: %f; 2-tailed p-value: %f" %
              (evaluation, rho, p_value))

    net.set_bias(bias)

    return result, plist, blist
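
# A minimal, self-contained sketch of the lemmatization step shared by these
# evaluations, assuming `wnl` is an nltk WordNetLemmatizer and `wn` is
# nltk.corpus.wordnet, and that the WordNet data has been downloaded
# (nltk.download('wordnet')).
def _lemmatize_sketch():
    from nltk.stem import WordNetLemmatizer
    from nltk.corpus import wordnet as wn
    wnl = WordNetLemmatizer()
    verb = wnl.lemmatize('eating', wn.VERB)  # -> 'eat'
    noun = wnl.lemmatize('apples', wn.NOUN)  # -> 'apple'
    return verb, noun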
def eval_MNR_LOC(model_name, experiment_name, evaluation, model=None,
                 print_result=True, skip_header=False):
    MODEL_NAME = experiment_name

    if model:
        net = model
    else:
        description = model_builder.load_description(MODEL_PATH, MODEL_NAME)
        net = model_builder.build_model(model_name, description)
        net.load(MODEL_PATH, MODEL_NAME, description)

    bias = net.set_0_bias()

    if print_result:
        print(net.role_vocabulary)

    tr_map = {
        "ARG2": "A2",
        "ARG3": "A3",
        "ARGM-MNR": "AM-MNR",
        "ARGM-LOC": "AM-LOC",
    }

    if evaluation == "AM-MNR":
        eval_data_file = os.path.join(RV_EVAL_PATH, 'McRaeInstr-fixed.txt')
        remove_suffix = False
    elif evaluation == 'AM-LOC':
        eval_data_file = os.path.join(RV_EVAL_PATH, 'McRaeLoc-fixed.txt')
        remove_suffix = True
    else:
        sys.exit('No such evaluation!!!')

    result_file = os.path.join(MODEL_PATH, MODEL_NAME + '_' + evaluation + '.txt')

    probs = []
    baseline = []
    oov_count = 0

    r_i = net.role_vocabulary["V"]

    if print_result:
        print(eval_data_file, evaluation)
        print("=" * 60)

    with open(eval_data_file, 'r') as f, \
            open(result_file, 'w') as f_out:
        for i, line in enumerate(f):
            if i == 0 and skip_header:
                # print(line.strip() + "\tP(instrument|verb)")
                continue  # skip the header

            line = line.strip()

            # input word, target word, other columns
            w, tw, temp1, temp2 = line.split()[:4]
            w = w[:-2] if remove_suffix else w
            tw = tw[:-2] if remove_suffix else tw
            w = wnl.lemmatize(w.lower(), wn.VERB)
            tw = wnl.lemmatize(tw.lower(), wn.NOUN)

            w_i = net.word_vocabulary.get(w, net.unk_word_id)
            tw_i = net.word_vocabulary.get(tw, net.unk_word_id)

            if evaluation == "AM-MNR":
                r = temp2
            else:
                r = temp1

            # tr_i = net.role_vocabulary.get(evaluation, net.unk_role_id)
            tr_i = net.role_vocabulary.get(tr_map[r], net.unk_role_id)
            y_r_i = numpy.asarray([tr_i], dtype=numpy.int64)

            if tw_i == net.unk_word_id:
                oov_count += 1
                print(w, tw)
                f_out.write(line + "\tnan\n")
                continue

            b = float(line.split()[-1 if remove_suffix else -2])
            baseline.append(b)

            input_roles_words = dict(
                (r, net.missing_word_id)
                for r in list(net.role_vocabulary.values()) + [net.unk_role_id])
            input_roles_words[r_i] = w_i
            input_roles_words.pop(tr_i, None)

            x_w_i = numpy.asarray([list(input_roles_words.values())], dtype=numpy.int64)
            x_r_i = numpy.asarray([list(input_roles_words.keys())], dtype=numpy.int64)
            y_w_i = numpy.asarray([tw_i], dtype=numpy.int64)

            p = net.p_words(x_w_i, x_r_i, y_w_i, y_r_i)
            # pr = net.p_roles(x_w_i, x_r_i, y_w_i, y_r_i)

            probs.append(p)
            f_out.write(line + "\t%s\n" % p)

    rho, p_value = spearmanr(baseline, probs)
    rating = len(probs)

    if print_result:
        print("Spearman correlation: %f; 2-tailed p-value: %f" % (rho, p_value))
        print("Num ratings: %d (%d out of vocabulary)" % (rating, oov_count))

    net.set_bias(bias)

    return rho, p_value, oov_count, probs, baseline
def pd_themfit(model_name, experiment_name, df, predict_role='V',
               input_roles='all_available_args', function='filler_prob',
               n=5, debug=False):
    """Add a column with a role-filler probability to a pandas DataFrame.

    For each row, calculate the probability that a particular role filler
    fills a particular role, given a set of input roles and fillers taken
    from that row.

    Keyword arguments:
    model_name -- the name of the model
    experiment_name -- the name of the model plus the name of the
        experiment, separated by '_'
    df -- the pandas DataFrame; must include columns for all PropBank
        labels in predict_role and input_roles
    predict_role -- the target role (as a PropBank label) for which the
        filler will be predicted (default: 'V')
    input_roles -- the set of roles (as PropBank labels) to use as inputs
        (default: 'all_available_args')
    """
    possible_roles = set(['A0', 'A1', 'AM-LOC', 'AM-TMP', 'AM-MNR', '<UNKNOWN>', 'V'])
    try:
        assert predict_role in df.columns
        assert predict_role in possible_roles
        if input_roles != 'all_available_args':
            for r in input_roles:
                assert r in df.columns
                assert r in possible_roles
    except AssertionError:
        print("NOT ALL ROLES ARE AVAILABLE AS DF COLUMNS")

    MODEL_NAME = experiment_name
    description = model_builder.load_description(MODEL_PATH, MODEL_NAME)
    net = model_builder.build_model(model_name, description)
    net.load(MODEL_PATH, MODEL_NAME, description)
    bias = net.set_0_bias()

    # net.model.summary()
    # print(net.model.get_layer(name="embedding_2").get_weights()[0])

    # If there is no <UNKNOWN> role in the role vocabulary, add it.
    if net.role_vocabulary.get("<UNKNOWN>", -1) == -1:
        net.role_vocabulary["<UNKNOWN>"] = len(net.role_vocabulary) - 1

    print("Role vocabulary", net.role_vocabulary)
    print("unk_word_id", net.unk_word_id)
    print("missing_word_id", net.missing_word_id)

    reverse_vocabulary = utils.get_reverse_map(net.word_vocabulary)
    reverse_role_vocabulary = utils.get_reverse_map(net.role_vocabulary)

    print("Reverse role vocabulary", reverse_role_vocabulary)

    # Initialise every role with the missing word.
    raw_words = dict((reverse_role_vocabulary[r],
                      reverse_vocabulary[net.missing_word_id])
                     for r in net.role_vocabulary.values())

    if input_roles == 'all_available_args':
        possible_roles.remove(predict_role)
        input_roles = possible_roles.intersection(set(df.columns))

    all_roles = set(input_roles)
    all_roles.add(predict_role)

    df = df.apply(
        lambda x: process_row(predict_role=predict_role,
                              role_fillers={i: x[i] for i in all_roles},
                              model=net,
                              raw_word_list=raw_words,
                              function=function,
                              n=n,
                              debug=debug),
        axis=1)
    return df
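
# A minimal, self-contained sketch of the DataFrame pattern used by
# pd_themfit: a scoring function applied row-wise with df.apply(..., axis=1).
# The scoring function here is a stand-in for process_row, which is defined
# elsewhere in this codebase; the data is invented for illustration.
def _pd_apply_sketch():
    import pandas as pd
    df = pd.DataFrame({'V': ['eat', 'drive'],
                       'A0': ['man', 'woman'],
                       'A1': ['apple', 'car']})

    def fake_score(row):
        return len(row['V'])  # placeholder for a real filler probability

    df['score'] = df.apply(fake_score, axis=1)
    return df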
def eval_greenberg(model_name, experiment_name, evaluation, model=None,
                   print_result=True):
    MODEL_NAME = experiment_name

    if model:
        net = model
    else:
        description = model_builder.load_description(MODEL_PATH, MODEL_NAME)
        net = model_builder.build_model(model_name, description)
        net.load(MODEL_PATH, MODEL_NAME, description)

    bias = net.set_0_bias()

    if print_result:
        print(net.role_vocabulary)

    eval_data_file = os.path.join(RV_EVAL_PATH, evaluation + '.txt')
    result_file = os.path.join(MODEL_PATH, MODEL_NAME + '_' + evaluation + '.txt')

    probs = []
    baseline = []
    oov_count = 0

    tr = "A1"
    r_i = net.role_vocabulary["V"]
    tr_i = net.role_vocabulary["A1"]
    y_r_i = numpy.asarray([tr_i], dtype=numpy.int64)

    if print_result:
        print(eval_data_file)
        print("=" * 60)

    with open(eval_data_file, 'r') as f, \
            open(result_file, 'w') as f_out:
        for line in f:
            line = line.strip().lower()

            w, tw = line.split()[:2]  # input word, target word
            w = w[:-2].strip()
            tw = tw[:-2].strip()
            w = wnl.lemmatize(w, wn.VERB)
            tw = wnl.lemmatize(tw, wn.NOUN)

            # a hack to fix some words
            # tw = word_fix.get(tw, tw)

            w_i = net.word_vocabulary.get(w, net.unk_word_id)
            tw_i = net.word_vocabulary.get(tw, net.unk_word_id)

            if tw_i == net.unk_word_id:
                oov_count += 1
                print(w, tr, tw)
                f_out.write(line + "\tnan\n")
                continue

            b = float(line.split()[-1])

            sample = dict(
                (r, net.missing_word_id)
                for r in list(net.role_vocabulary.values()) + [net.unk_role_id])
            sample[r_i] = w_i
            sample.pop(tr_i, None)

            x_w_i = numpy.asarray([list(sample.values())], dtype=numpy.int64)
            x_r_i = numpy.asarray([list(sample.keys())], dtype=numpy.int64)
            y_w_i = numpy.asarray([tw_i], dtype=numpy.int64)

            p = net.p_words(x_w_i, x_r_i, y_w_i, y_r_i)
            # pr = net.p_roles(x_w_i, x_r_i, y_w_i, y_r_i)

            if tw_i == net.unk_word_id:
                print('OOV: %s' % w, b, p)

            baseline.append(b)
            probs.append(p)
            f_out.write(line + "\t%s\n" % p)

    rho, p_value = spearmanr(baseline, probs)

    if print_result:
        print(f"Spearman correlation of {evaluation}: {rho}; 2-tailed p-value: {p_value}")
        print(f"Num ratings: {len(probs)} ({oov_count} out of vocabulary)")

    net.set_bias(bias)

    return rho, p_value, oov_count, probs, baseline
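
# The evaluations above zero the output bias before scoring and restore it
# afterwards (net.set_0_bias() / net.set_bias(bias)), presumably so that
# plausibility comparisons are not dominated by the word frequency encoded in
# the output-layer bias. A minimal, self-contained sketch of the same
# save-and-restore pattern on a plain numpy bias vector (illustrative only,
# not the project's own API):
def _zero_bias_sketch(bias_vector):
    import numpy as np
    b = np.asarray(bias_vector, dtype=float)
    saved = b.copy()  # save the original bias
    b[:] = 0.0        # score with a zeroed bias
    # ... run the evaluation here ...
    b[:] = saved      # restore the bias afterwards
    return b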