def polish_pickle(feature_params, test_accuracy):
    """Re-save a trained-model pickle with ``feature_params`` stored in it.

    The pickle path is derived from ``feature_params`` and ``test_accuracy``
    via ``tr.make_pickle_filename``. The file is loaded, updated, written
    back, and then read again so the stored value can be printed.
    """
    pickle_path = tr.make_pickle_filename(
        'trained_models', feature_params, test_accuracy)

    payload = tr.load_data(pickle_path)
    payload['feature_params'] = feature_params
    tr.save_data(payload, pickle_path)

    # Round-trip through disk to check feature_params were saved correctly.
    reloaded = tr.load_data(pickle_path)
    print('feature_params:', reloaded['feature_params'].str())
def best_feature(temporal_rel):
    """Look at the scores for all features in isolation.

    A 100-tree random forest is trained once per entry of the feature list
    and once more with the default feature set (labelled "all"). The F1
    scores are passed to ``plot``; recall and precision are printed as
    lists of single-key ``{feature: score}`` dicts.

    Args:
        temporal_rel: Forwarded to ``load_data``. ``None`` selects the
            "weighted" output filename; any other value is embedded in
            the filename.
    """
    features = [
        "pos", "stem", "aspect", "tense", "distance", "similarity",
        "polarity", "modality"
    ]

    def _scores(**load_kwargs):
        # Train on a fresh split and return (f1, recall, precision).
        X, y = load_data(True, temporal_rel, **load_kwargs)
        X_train, X_test, y_train, y_test = split(X, y)
        rf = RandomForestClassifier(n_jobs=2, n_estimators=100)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_test)
        return (f1_score(y_test, y_pred),
                recall_score(y_test, y_pred),
                precision_score(y_test, y_pred))

    f1_values = []
    recall = []
    precision = []
    for feature in features:
        f1, rec, prec = _scores(features=[feature])
        f1_values.append(f1)
        recall.append({feature: rec})
        precision.append({feature: prec})
        # Single-argument print() behaves the same on Python 2 and 3.
        print("Done with feature")

    # Add all features
    f1, rec, prec = _scores()
    f1_values.append(f1)
    recall.append({"all": rec})
    precision.append({"all": prec})
    features.append("all")

    # Identity test: None is a singleton.
    if temporal_rel is None:
        filename = "best_feature_weighted.jpg"
    else:
        filename = "best_feature_" + str(temporal_rel) + ".jpg"
    plot(filename, "feature", "f1_score", f1_values, features)

    print(recall)
    print(precision)
def get_distance_data(data, temporal_rel):
    """Append one outcome record per test sample, keyed by distance.

    A 100-tree random forest is fitted on the training split and evaluated
    on the test split. For every test sample whose labels are 0/1 a
    single-key dict
    ``{distance: [true_pos, true_neg, false_pos, false_neg]}`` is appended
    to ``data`` with exactly one flag set to True.

    Args:
        data: List that is extended in place.
        temporal_rel: Forwarded to ``load_data``.

    Returns:
        The same ``data`` list. The original docstring promised a return
        value but the function returned ``None``.
    """
    X, y, distance = load_data(True, temporal_rel, distance=True)
    X_train, X_test, y_train, y_test, distance_train, distance_test = split(
        X, y, distance)

    rf = RandomForestClassifier(n_jobs=2, n_estimators=100)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    # Positional indexing is kept because the container types returned by
    # split() are not visible from here.
    for i in range(len(X_test)):
        truth, guess = y_test[i], y_pred[i]
        if truth == guess:
            if truth == 1:
                flags = [True, False, False, False]  # True positive
            elif truth == 0:
                flags = [False, True, False, False]  # True negative
            else:
                continue
        elif guess == 1:
            flags = [False, False, True, False]  # False positive
        elif guess == 0:
            flags = [False, False, False, True]  # False negative
        else:
            continue
        data.append({distance_test[i]: flags})

    return data
def main():
    """Argument parser for making G2P predictions.

    Every argument is an optional positional with a default. The parsed
    namespace is forwarded as keyword arguments to ``load_data`` and
    ``load_model``, and a prediction is printed for each requested word.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('pron_path',
                        default='./data/prondict_ice.txt',
                        nargs='?')
    parser.add_argument(
        'words',
        default=['adolfsdóttir', 'skynsemi', 'uppvaxtarskilyrði'],
        nargs='?')
    parser.add_argument('exp_name', default='g2p_ice', nargs='?')
    # type=int: values typed on the command line arrive as strings, while
    # non-string defaults are used as-is by argparse.
    parser.add_argument('emb_dim', default=500, nargs='?', type=int)
    parser.add_argument('hidden_dim', default=500, nargs='?', type=int)
    # NOTE(review): any value supplied for `cuda` or `data_splits` is a
    # plain string (always truthy / not a tuple) — confirm whether these
    # are ever passed explicitly.
    parser.add_argument('cuda', default=True, nargs='?')
    parser.add_argument('seed', default=1337, nargs='?', type=int)
    parser.add_argument('result_dir', default='./results', nargs='?')
    parser.add_argument('data_splits', default=(0.9, 0.05, 0.05), nargs='?')
    args = parser.parse_args()

    # nargs='?' yields a bare string when a word is given explicitly; wrap
    # it so the loop below iterates over words rather than characters.
    if isinstance(args.words, str):
        args.words = [args.words]

    exp_dir = os.path.join(args.result_dir, args.exp_name)
    ckp_path = os.path.join(exp_dir, 'mdl.ckpt')

    # NOTE(review): pron_path/data_splits are passed both positionally and
    # through **vars(args); verify load_data's signature tolerates that.
    full_ds, _ = load_data(args.pron_path, args.data_splits, **vars(args))
    model = load_model(full_ds.num_graphemes, full_ds.num_phonemes, ckp_path,
                       **vars(args))

    for word in args.words:
        print(word)
        predict(model, word, full_ds)
def get_data(dataset, data_dir, batch_size, test_batch_size):
    '''Build the imagenet train/validation loaders and the loss criterion.

    The incoming ``dataset`` value is not read. Datasets and samplers come
    from ``load_data`` on ``<data_dir>/train`` and ``<data_dir>/validation``.

    Returns (train_loader, val_loader, criterion).
    '''
    num_workers = 1
    pin_memory = True  # for cuda device

    train_root = os.path.join(data_dir, 'train')
    val_root = os.path.join(data_dir, 'validation')
    print('train_dir is ', train_root)

    train_set, val_set, train_sampler, val_sampler = load_data(
        train_root, val_root, False, get_world_size() > 1)

    def make_loader(data, size, sampler):
        # Both loaders share worker count and pinning; only size/sampler vary.
        return torch.utils.data.DataLoader(data,
                                           batch_size=size,
                                           sampler=sampler,
                                           num_workers=num_workers,
                                           pin_memory=pin_memory)

    train_loader = make_loader(train_set, batch_size, train_sampler)
    val_loader = make_loader(val_set, test_batch_size, val_sampler)

    return train_loader, val_loader, torch.nn.CrossEntropyLoss()
def eval_mode_batch(output_tags, confidences, cities):
    """Evaluate tagged articles against gold entities and print P/R/F1.

    Each tagged article is turned into a prediction line by
    ``predict_mode`` and handed to ``evaluateArticle`` together with the
    gold entities parsed from its identifier. Afterwards precision, recall
    and F1 are printed for every tag in ``GOLD`` using the module-level
    ``CORRECT`` / ``PRED`` / ``GOLD`` counters.

    Args:
        output_tags: Path forwarded to ``load_data``.
        confidences: One confidence sequence per tagged article.
        cities: Forwarded to ``predict_mode``.
    """
    tagged_data, identifier = load_data(output_tags)
    num_tags = len(int2tags) - 1
    assert len(tagged_data) == len(confidences)

    for i, entry in enumerate(tagged_data):
        sentence, tags = entry[0], entry[1]
        gold_ents = identifier[i].split(',')[:num_tags]  # Throw away title
        output_pred_line, _, _ = predict_mode(sentence, tags, confidences[i],
                                              cities)
        predictions = output_pred_line.split(" ### ")
        # Evaluate the predictions.
        evaluateArticle(predictions, gold_ents)

    print("------------\nEvaluation Stats: (Precision, Recall, F1):")
    for tag in GOLD:
        n_correct, n_pred, n_gold = CORRECT[tag], PRED[tag], GOLD[tag]
        # float() forces true division on Python 2; a zero denominator
        # scores 0.0 instead of raising ZeroDivisionError.
        prec = float(n_correct) / n_pred if n_pred else 0.0
        rec = float(n_correct) / n_gold if n_gold else 0.0
        f1 = (2 * prec * rec) / (prec + rec) if (prec + rec) else 0.0
        print(" ".join(
            str(v) for v in (tag, prec, rec, f1, "########", n_correct,
                             n_pred, n_gold)))
def main(_):
    """Evaluate a popularity-based ranking on the dev and test splits.

    Items are ordered by ``sort_items_by_popularity``; the most popular item
    receives the highest score. ``random_eval`` then ranks each split and
    the resulting metrics are printed.
    """
    _, dev, test, samples, _, _, data = load_data(FLAGS.pop_prep_dir,
                                                  FLAGS.pop_dataset,
                                                  FLAGS.pop_prep_name,
                                                  FLAGS.pop_debug)

    sorted_items = sort_items_by_popularity(samples)
    print_most_popular_items(data, sorted_items)

    n_items = len(sorted_items)
    pop_ranking = np.ones((1, n_items))
    for position, (item_idx, _) in enumerate(sorted_items):
        # Earlier position in the popularity order -> larger score.
        pop_ranking[0, item_idx] = n_items - position

    random_items = 100
    for title, split in (("DEV", dev), ("TEST", test)):
        rank_all, rank_random = random_eval(pop_ranking,
                                            split,
                                            samples,
                                            num_rand=random_items)
        metric_all = rank_to_metric_dict(rank_all)
        metric_random = rank_to_metric_dict(rank_random)

        random_summary = " ".join(
            (f"{k}: {v:.2f}" for k, v in metric_random.items()))
        all_summary = " ".join(
            (f"{k}: {v:.2f}" for k, v in metric_all.items()))

        print(f"Result for {title.upper()}")
        print(f"Random items {random_items}: " + random_summary)
        print("All items: " + all_summary)
def check_model():
    """Plot conv weights and layer activations for one training image.

    Restores the graph from the checkpoint, saves weight plots for both conv
    layers, then runs training sample 1000 through each conv layer and saves
    the activation plots, named after the sample's object class.
    """
    # Creating the session
    (session, img_length, img_height, y_pred_cls, x, weights1, weights2,
     conv1, conv2) = load_graph(True, "../model/two_d_cnn_proj.ckpt")

    # object names; class indices 0..28 with 22 left out
    object_names = object_names_func()
    list_of_objects = [idx for idx in range(29) if idx != 22]

    # plotting the weights
    for weights, target in ((weights1, '../model/weights1.png'),
                            (weights2, '../model/weights2.png')):
        plot_conv_weights(session.run(weights), 0, target)

    # Select some images after reading in the data
    (train_input_encode, train_out_encode, test_input_encode,
     test_out_encode) = load_data("../data")
    sample_idx = 1000
    image = train_input_encode[sample_idx]
    class_idx = np.argmax(train_out_encode[sample_idx])
    test_object = object_names[list_of_objects[class_idx]]

    feed_dict = {x: [image]}

    # Calculate and retrieve the output values of layer 1, then layer 2
    for layer, pattern in ((conv1, '../model/{}_1.png'),
                           (conv2, '../model/{}_2.png')):
        values = session.run(layer, feed_dict=feed_dict)
        plot_conv_layer(values, pattern.format(test_object))

    print("Object = {}".format(test_object))
def eval_mode_batch(output_tags, confidences, cities):
    """Score every tagged article and print per-tag precision/recall/F1.

    For each article the prediction line produced by ``predict_mode`` is
    split on " ### " and passed to ``evaluateArticle`` with the gold
    entities taken from the article identifier. The final table is
    computed from the module-level ``CORRECT`` / ``PRED`` / ``GOLD``
    counters.

    Args:
        output_tags: Path forwarded to ``load_data``.
        confidences: One confidence sequence per tagged article.
        cities: Forwarded to ``predict_mode``.
    """

    def _ratio(numerator, denominator):
        # True division on Python 2 and 3; 0.0 when the denominator is 0
        # instead of a ZeroDivisionError.
        return float(numerator) / denominator if denominator else 0.0

    tagged_data, identifier = load_data(output_tags)
    num_tags = len(int2tags) - 1
    assert len(tagged_data) == len(confidences)

    for i in range(len(tagged_data)):
        sentence = tagged_data[i][0]
        tags = tagged_data[i][1]
        tag_confs = confidences[i]
        gold_ents = identifier[i].split(',')[:num_tags]  # Throw away title
        output_pred_line, _, _ = predict_mode(sentence, tags, tag_confs,
                                              cities)
        # Evaluate the predictions.
        evaluateArticle(output_pred_line.split(" ### "), gold_ents)

    print("------------\nEvaluation Stats: (Precision, Recall, F1):")
    for tag in GOLD:
        prec = _ratio(CORRECT[tag], PRED[tag])
        rec = _ratio(CORRECT[tag], GOLD[tag])
        f1 = _ratio(2 * prec * rec, prec + rec)
        row = (tag, prec, rec, f1, "########", CORRECT[tag], PRED[tag],
               GOLD[tag])
        print(" ".join(str(value) for value in row))
def run(data_name, num_train, num_test, Phi, depth, widths, lc_w_range,
        shift_w_range, optim_name, optim_args, num_epochs, batch_size,
        chkpt_freq):
    """Run one copula experiment, recomputing the reference log-likelihood.

    The log-likelihoods returned by ``load_log_ll`` are discarded; reference
    values are recomputed from a ``ClaytonPhi(5.)`` copula and printed
    before the network built from ``Phi`` is passed to ``expt``.
    """
    run_id = get_info()
    identifier_id = '%s%s' % (identifier, run_id)

    train_data, test_data = load_data(data_name, num_train, num_test)
    # Still loaded (and unpacked) as before, but superseded just below.
    _stored_train_ll, _stored_test_ll = load_log_ll(data_name, num_train,
                                                    num_test)

    print('Computing ground truth manually because tagged log likelihood is wrong')
    from phi_listing import ClaytonPhi
    reference = Copula(ClaytonPhi(torch.tensor(5.)))
    train_ll = -torch.log(reference(train_data, mode='pdf'))
    test_ll = -torch.log(reference(test_data, mode='pdf'))

    mean_train_ll = torch.mean(train_ll)
    mean_test_ll = torch.mean(test_ll)
    print('train_ll', mean_train_ll)
    print('test_ll', mean_test_ll)
    print('Train ideal ll:', mean_train_ll)
    print('Test ideal ll:', mean_test_ll)

    net = Copula(Phi(depth, widths, lc_w_range, shift_w_range))
    expt(train_data, test_data, net, optim_name, optim_args, identifier_id,
         num_epochs, batch_size, chkpt_freq)
def dummy():
    """Walk over every training batch and report short sequence masks.

    Loads the training data, derives ``input_dim`` / ``output_dim`` from the
    first sample, builds an ``RNNModel`` in training mode and iterates over
    all batches. Nothing is returned; the only visible output is the
    'found it' print for masks that sum to less than 35.

    NOTE(review): the source line was collapsed, so the nesting of the
    padded-data / MSE statements (placed at batch level here) should be
    confirmed against the original file.
    """
    config = train_config
    data_train = load_data(config, 'train')
    # Feature sizes are read from the last axis of the first sample.
    config['input_dim'] = data_train.input_[0].shape[-1]
    config['output_dim'] = data_train.target[0].shape[-1]
    data_train.reshuffle()
    # NOTE(review): rnn_model_class is never used below; RNNModel is
    # instantiated directly — confirm this is intended.
    rnn_model_class, placeholders = get_model_and_placeholders(config)
    rnn_model = RNNModel(config, placeholders, mode='training')
    # loop through all training batches
    for i, batch in enumerate(data_train.all_batches()):
        # get the feed dict for the current batch
        feed_dict = rnn_model.get_feed_dict(batch)
        for sequence_mask in batch.mask:
            if np.sum(sequence_mask) < 35:
                print('found it {0}'.format(np.sum(sequence_mask)))
        input_padded, target_padded = batch.get_padded_data()
        # Computed for the first two batch entries only; results are unused.
        mse = mean_squared_error(input_padded[0], target_padded[0])
        mse2 = mean_squared_error(input_padded[1], target_padded[1])
def different_number_of_trees(temporal_rel,
                              start=5,
                              end=800,
                              steps=20,
                              rerunning=5):
    """How do the scores change for different amounts of trees.

    For every tree count in ``range(start, end, steps)`` a random forest is
    fitted and scored on a fixed train/test split. The sweep is repeated
    ``rerunning`` times and the scores are averaged per tree count, because
    results for small forests vary a lot between runs. The mean F1 scores
    are passed to ``plot``; mean recall and precision are printed.

    Args:
        temporal_rel: Forwarded to ``load_data``. ``None`` selects the
            "weighted" output filename.
        start, end, steps: Define the tree counts that are tried.
        rerunning: Number of repetitions that are averaged.
    """
    X, y = load_data(True, temporal_rel)
    X_train, X_test, y_train, y_test = split(X, y)

    # xticks
    xticks = range(start, end, steps)

    many_accuracies = []
    many_recall = []
    many_precision = []
    for _ in range(rerunning):
        accuracies = []
        recall = []
        precision = []
        for n_trees in xticks:
            rf = RandomForestClassifier(n_jobs=2, n_estimators=n_trees)
            rf.fit(X_train, y_train)
            y_pred = rf.predict(X_test)
            accuracies.append(f1_score(y_test, y_pred))
            recall.append(recall_score(y_test, y_pred))
            precision.append(precision_score(y_test, y_pred))
        many_accuracies.append(accuracies)
        many_recall.append(recall)
        many_precision.append(precision)

    # Calculate the mean per tree count. zip(*runs) regroups the per-run
    # lists by tree count and yields nothing when rerunning == 0, where
    # indexing runs[0] used to raise IndexError.
    final_accuracies = [np.mean(col) for col in zip(*many_accuracies)]
    final_recall = [np.mean(col) for col in zip(*many_recall)]
    final_precision = [np.mean(col) for col in zip(*many_precision)]

    # Identity test: None is a singleton.
    if temporal_rel is None:
        filename = "different_number_of_trees_weighted.jpg"
    else:
        filename = "different_number_of_trees_" + str(temporal_rel) + ".jpg"
    plot(filename, "number_of_trees", "f1_score", final_accuracies, xticks)

    # Single-argument print() behaves the same on Python 2 and 3.
    print(final_recall)
    print(final_precision)
def main(repo_path):
    """Score the saved model on the prepared test set and store the accuracy.

    Reads ``data/prepared/test.csv`` and ``model/model.joblib`` below
    ``repo_path`` and writes ``{"accuracy": ...}`` as JSON to
    ``metrics/accuracy.json``.
    """
    features, labels = load_data(repo_path / "data/prepared/test.csv")
    model = load(repo_path / "model/model.joblib")

    accuracy = accuracy_score(labels, model.predict(features))
    report = json.dumps({"accuracy": accuracy})

    (repo_path / "metrics/accuracy.json").write_text(report)
def learning_rate(temporal_rel, k=20, new=False):
    """Plot the F1 score as a function of the amount of training data.

    The training split is cut into ``k`` equally sized pieces. For
    i = 1..k a 1000-tree random forest is fitted on the first ``i`` pieces
    and scored on the test split. The F1 scores are passed to ``plot``
    against the number of training samples used; recall and precision are
    printed.

    Args:
        temporal_rel: Forwarded to ``load_data``. ``None`` selects the
            "weighted" output filename.
        k: Number of pieces the training split is divided into.
        new: Forwarded to ``load_data``.

    Raises:
        ValueError: If the training split has fewer than ``k`` samples.
    """
    X, y = load_data(new, temporal_rel)
    X_train, X_test, y_train, y_test = split(X, y)

    # Floor division keeps the piece length an int on Python 3 as well
    # (slice bounds must be integers).
    len_piece = len(X_train) // k
    if len_piece == 0:
        raise ValueError("training split is too small for k=%d pieces" % k)
    data_count = [(i + 1) * len_piece for i in range(k)]

    # Calculate the scores for each partial sum
    rf = RandomForestClassifier(n_jobs=2, n_estimators=1000)
    accuracies = []
    recall = []
    precision = []
    for count in data_count:
        # Fixes: (1) pieces came from the full X/y rather than the
        # training split; (2) the old series builder duplicated piece 0
        # and never included piece i, so the i-th model did not train on
        # `count` distinct samples.
        rf.fit(X_train[:count], y_train[:count])
        y_pred = rf.predict(X_test)
        accuracies.append(f1_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        precision.append(precision_score(y_test, y_pred))

    # Identity test: None is a singleton.
    if temporal_rel is None:
        filename = "learning_rate_weighted.jpg"
    else:
        filename = "learning_rate_" + str(temporal_rel) + ".jpg"
    plot(filename, "data_count", "f1_score", accuracies, data_count)

    # Single-argument print() behaves the same on Python 2 and 3.
    print(recall)
    print(precision)
def main():
    """Load and standardise the data, then print the network output width.

    The output width is the second dimension of the audio data as loaded,
    taken before standardisation.
    """
    weight_path = get_args().weight_path

    if not os.path.exists(RESPATH):
        os.makedirs(RESPATH)

    video, audio = train.load_data(DATAPATH)
    net_out = audio.shape[1]
    video, audio_norm, audio_means, audio_stds = standardize_data(
        video, audio)

    print(net_out)
def union_vs_intersected_relations():
    """Compare F1 when training on union vs. intersected annotations.

    "union" uses all relations; "intersected" uses only the events the
    annotators have in common. For each variant a 100-tree random forest is
    fitted on its own train split and scored on its own test split, and
    both F1 scores are printed.
    """
    X_union, y_union = load_data(new=True, annotations="union")
    X_intersected, y_intersected = load_data(new=True,
                                             annotations="intersected")

    X_u_train, X_u_test, y_u_train, y_u_test = split(X_union, y_union)
    X_i_train, X_i_test, y_i_train, y_i_test = split(X_intersected,
                                                     y_intersected)

    rf_u = RandomForestClassifier(n_jobs=2, n_estimators=100)
    rf_u.fit(X_u_train, y_u_train)
    rf_i = RandomForestClassifier(n_jobs=2, n_estimators=100)
    rf_i.fit(X_i_train, y_i_train)

    y_u_pred = rf_u.predict(X_u_test)
    y_i_pred = rf_i.predict(X_i_test)

    # Single-argument print() behaves the same on Python 2 and 3.
    print("Union: " + str(f1_score(y_u_test, y_u_pred)))
    # Fix: the intersected score compared y_i_test with itself, which is
    # always a perfect score; it must be scored against the predictions.
    print("Intersected: " + str(f1_score(y_i_test, y_i_pred)))
def best_feature(temporal_rel):
    """Look at the scores for all features in isolation.

    Trains a 100-tree random forest for every single feature and finally
    for the default feature set (labelled "all"). F1 scores go to ``plot``;
    recall and precision are printed as lists of ``{label: score}`` dicts.

    Args:
        temporal_rel: Forwarded to ``load_data``. ``None`` selects the
            "weighted" output filename; any other value is embedded in
            the filename.
    """
    features = ["pos", "stem", "aspect", "tense", "distance", "similarity",
                "polarity", "modality"]

    # One run per single feature, then one run with load_data's defaults.
    runs = [(name, {"features": [name]}) for name in features]
    runs.append(("all", {}))

    data = []
    recall = []
    precision = []
    for label, load_kwargs in runs:
        X, y = load_data(True, temporal_rel, **load_kwargs)
        X_train, X_test, y_train, y_test = split(X, y)
        rf = RandomForestClassifier(n_jobs=2, n_estimators=100)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_test)

        # The F1 values are collected directly; the former
        # `x.values()[0]` lookup fails on Python 3 dict views.
        data.append(f1_score(y_test, y_pred))
        recall.append({label: recall_score(y_test, y_pred)})
        precision.append({label: precision_score(y_test, y_pred)})
        if load_kwargs:
            # Progress message for single-feature runs only, as before.
            print("Done with feature")

    features.append("all")

    # Identity test: None is a singleton.
    if temporal_rel is None:
        plot("best_feature_weighted.jpg", "feature", "f1_score", data,
             features)
    else:
        plot("best_feature_" + str(temporal_rel) + ".jpg", "feature",
             "f1_score", data, features)

    # Single-argument print() behaves the same on Python 2 and 3.
    print(recall)
    print(precision)
def mean_std(train_dir, val_dir):
    """Print the mean/std pair that ``process`` reports for each loader."""
    train_loader, val_loader = load_data(train_dir, val_dir)

    mean_train, std_train = process(train_loader)
    mean_val, std_val = process(val_loader)

    report = (
        ('mean_train: ', mean_train),
        ('std_train: ', std_train),
        ('mean_val: ', mean_val),
        ('std_val: ', std_val),
    )
    for label, value in report:
        print(label, value)
def learning_rate(temporal_rel, k=20, new=False):
    """Measure how the scores grow with the amount of training data.

    The training split is divided into ``k`` pieces of equal length. A
    1000-tree random forest is refitted on the first 1, 2, ..., k pieces
    and scored on the test split each time. F1 is plotted against the
    number of training samples; recall and precision are printed.

    Args:
        temporal_rel: Forwarded to ``load_data``. ``None`` selects the
            "weighted" output filename.
        k: Number of pieces the training split is divided into.
        new: Forwarded to ``load_data``.

    Raises:
        ValueError: If the training split has fewer than ``k`` samples.
    """
    X, y = load_data(new, temporal_rel)
    X_train, X_test, y_train, y_test = split(X, y)

    # Splitting the training set up into k pieces. `//` keeps the length
    # an integer on Python 3, where `/` would give a float slice bound.
    len_piece = len(X_train) // k
    if len_piece == 0:
        raise ValueError("training split is too small for k=%d pieces" % k)

    rf = RandomForestClassifier(n_jobs=2, n_estimators=1000)
    data_count = []
    accuracies = []
    recall = []
    precision = []
    for i in range(k):
        n_samples = (i + 1) * len_piece
        data_count.append(n_samples)

        # Cumulative prefix of the TRAINING split. Previously the pieces
        # were cut from the complete X/y, and the concatenation loop
        # repeated piece 0 while leaving out piece i.
        partial_X = X_train[:n_samples]
        partial_y = y_train[:n_samples]

        rf.fit(partial_X, partial_y)
        y_pred = rf.predict(X_test)
        accuracies.append(f1_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        precision.append(precision_score(y_test, y_pred))

    # Identity test: None is a singleton.
    if temporal_rel is None:
        plot("learning_rate_weighted.jpg", "data_count", "f1_score",
             accuracies, data_count)
    else:
        plot("learning_rate_" + str(temporal_rel) + ".jpg", "data_count",
             "f1_score", accuracies, data_count)

    # Single-argument print() behaves the same on Python 2 and 3.
    print(recall)
    print(precision)
def different_number_of_trees(temporal_rel, start=5, end=800, steps=20,
                              rerunning=5):
    """How do the scores change for different amounts of trees.

    Sweeps the forest size over ``range(start, end, steps)`` on one fixed
    train/test split, repeats the sweep ``rerunning`` times and averages
    the scores per forest size (small forests vary a lot between runs).
    The mean F1 scores are plotted; mean recall and precision are printed.

    Args:
        temporal_rel: Forwarded to ``load_data``. ``None`` selects the
            "weighted" output filename.
        start, end, steps: Define the tree counts that are tried.
        rerunning: Number of repetitions that are averaged.
    """
    X, y = load_data(True, temporal_rel)
    X_train, X_test, y_train, y_test = split(X, y)

    def _sweep():
        # One pass over all tree counts; returns (f1s, recalls, precisions).
        f1s, recalls, precisions = [], [], []
        for n_trees in range(start, end, steps):
            rf = RandomForestClassifier(n_jobs=2, n_estimators=n_trees)
            rf.fit(X_train, y_train)
            y_pred = rf.predict(X_test)
            f1s.append(f1_score(y_test, y_pred))
            recalls.append(recall_score(y_test, y_pred))
            precisions.append(precision_score(y_test, y_pred))
        return f1s, recalls, precisions

    many_accuracies = []
    many_recall = []
    many_precision = []
    for _ in range(rerunning):
        accuracies, recall, precision = _sweep()
        many_accuracies.append(accuracies)
        many_recall.append(recall)
        many_precision.append(precision)

    def _column_means(runs):
        # zip(*runs) groups the values of all runs per tree count; it is
        # simply empty for rerunning == 0, where runs[0] raised IndexError.
        return [np.mean(column) for column in zip(*runs)]

    final_accuracies = _column_means(many_accuracies)
    final_recall = _column_means(many_recall)
    final_precision = _column_means(many_precision)

    # xticks
    xticks = range(start, end, steps)
    # Identity test: None is a singleton.
    if temporal_rel is None:
        plot("different_number_of_trees_weighted.jpg", "number_of_trees",
             "f1_score", final_accuracies, xticks)
    else:
        plot("different_number_of_trees_" + str(temporal_rel) + ".jpg",
             "number_of_trees", "f1_score", final_accuracies, xticks)

    # Single-argument print() behaves the same on Python 2 and 3.
    print(final_recall)
    print(final_precision)
def main(trained_model,
         test_file,
         viterbi,
         output_tags="output.tag",
         output_predictions="output.pred"):
    """Tag the test file, write the tags to disk and evaluate them.

    The output file alternates identifier lines with tagged-sentence lines
    (``token_TAG`` pairs joined by spaces). Afterwards the tagged file is
    evaluated with ``eval_mode_batch``.

    Args:
        trained_model: A value containing "crf" (handled by
            ``crf.predict``), a path to a pickled 5-tuple
            ``(clf, previous_n, next_n, word_vocab, other_features)``, or
            that 5-item list itself.
        test_file: Path forwarded to ``load_data``.
        viterbi: Forwarded to ``predict_tags_n``.
        output_tags: Path of the tag file that is written.
        output_predictions: Path used by the prediction-only mode.
    """
    # Fix: this used to read the undefined name `testing_file`.
    test_data, identifier = load_data(test_file)
    evaluate = True

    use_crf = "crf" in trained_model
    # extract features
    if not use_crf:
        if not isinstance(trained_model, list):
            with open(trained_model, 'rb') as frb:
                clf, previous_n, next_n, word_vocab, other_features = \
                    pickle.load(frb)
        else:
            clf, previous_n, next_n, word_vocab, other_features = \
                trained_model

    # time.clock() was removed in Python 3.8; time.time() is available on
    # every version (it measures wall-clock rather than CPU time).
    tic = time.time()
    confidences = []
    with open(output_tags, 'w') as fw:
        for i in range(len(test_data) + len(identifier)):
            # Floor division: `/` yields a float index on Python 3.
            idx = i // 2
            if i % 2 == 1:
                tokens = test_data[idx][0]
                if use_crf:
                    y, tmp_conf = crf.predict(tokens, trained_model)
                    tagged = [
                        tokens[j] + "_" + y[j] for j in range(len(tokens))
                    ]
                else:
                    y, tmp_conf = predict_tags_n(viterbi, previous_n, next_n,
                                                 clf, tokens, word_vocab,
                                                 other_features)
                    tagged = [
                        tokens[j] + "_" + int2tags[int(y[j])]
                        for j in range(len(tokens))
                    ]
                fw.write(" ".join(tagged))
                assert (len(y) == len(tmp_conf))
                confidences.append(tmp_conf)
                fw.write("\n")
            else:
                fw.write(identifier[idx])
                fw.write("\n")
    print(time.time() - tic)

    if evaluate:
        eval_mode_batch(output_tags, confidences, helper.cities)
    else:
        # Fix: attribute was misspelled `helper.cityies`.
        predict_mode_batch(output_tags, output_predictions, helper.cities)
def predict_mode_batch(output_tags, output_predictions, cities):
    """Write identifier lines alternating with prediction lines to a file.

    Args:
        output_tags: Path forwarded to ``load_data``.
        output_predictions: Path of the file that is written.
        cities: Forwarded to ``predict_mode``.
    """
    tagged_data, identifier = load_data(output_tags)
    # The context manager closes the file; it was previously left open.
    with open(output_predictions, 'w') as f:
        for i in range(len(tagged_data) + len(identifier)):
            # Floor division: `/` yields a float index on Python 3.
            idx = i // 2
            if i % 2 == 1:
                # NOTE(review): eval_mode_batch in this file calls
                # predict_mode with an extra confidences argument and
                # unpacks three return values — confirm this
                # three-argument call is still valid.
                f.write(predict_mode(tagged_data[idx][0],
                                     tagged_data[idx][1], cities))
                f.write("\n")
            else:
                f.write(identifier[idx])
                f.write("\n")
    return
def main():
    """Restore trained weights, predict on both splits and save the results.

    The network output width is the second dimension of the training
    targets. Predictions are produced by ``train.predict`` using the target
    means/stds returned by ``train.standardize_data``.
    """
    weight_path = get_args().weight_path

    if not os.path.exists(RESPATH):
        os.makedirs(RESPATH)

    train_split, test_split = train.load_data(DATAPATH)
    Xtr, Ytr = train_split
    Xte, Yte = test_split
    net_out = Ytr.shape[1]

    standardized = train.standardize_data(Xtr, Ytr, Xte, Yte)
    Xtr, Ytr_norm, Xte, Yte_norm, Y_means, Y_stds = standardized

    model = train.build_model(net_out)
    model.compile(loss='mse', optimizer='adam')
    model.load_weights(weight_path)

    Ytr_pred, Yte_pred = train.predict(model, Xtr, Xte, Y_means, Y_stds)
    train.savedata(Ytr, Ytr_pred, Yte, Yte_pred, respath=RESPATH)
def predict_mode_batch(output_tags, output_predictions, cities):
    """Write one identifier line and one prediction line per tagged entry.

    Even output positions carry the identifier, odd positions the string
    returned by ``predict_mode`` for the matching tagged entry.

    Args:
        output_tags: Path forwarded to ``load_data``.
        output_predictions: Path of the file that is written.
        cities: Forwarded to ``predict_mode``.
    """
    tagged_data, identifier = load_data(output_tags)
    total_lines = len(tagged_data) + len(identifier)
    with open(output_predictions, 'w') as f:
        for i in range(total_lines):
            # Floor division keeps the index an int on Python 3, where
            # `i / 2` is a float and cannot index a sequence.
            entry_idx = i // 2
            if i % 2 == 1:
                entry = tagged_data[entry_idx]
                f.write(predict_mode(entry[0], entry[1], cities))
            else:
                f.write(identifier[entry_idx])
            f.write('\n')
    return
def test_apply_model(self):
    """Tests if we can apply a model to a small test dataset.

    No assertion is made; predictions and targets are only printed.
    """
    here = os.path.dirname(__file__)
    checkpoint_path = os.path.join(here, 'va0.1', 't0f50.ckpt')
    file_pattern = os.path.join(here, 'testdata', 'Con100_η0.1N300L5_100')

    predictions = train.apply_model(checkpoint_path=checkpoint_path,
                                    file_pattern=file_pattern,
                                    time_index=0)
    targets = train.load_data(file_pattern, 0)[0].targets

    # correlation_value = np.corrcoef(predictions[0], targets)[0, 1]
    print('predictions:', predictions)
    print('targets:', targets)
def main():
    """Restore trained weights and save the predicted audio features.

    The prediction from ``train.predict`` is written to ``aud_pred.npy``
    inside ``RESPATH``.
    """
    weight_path = get_args().weight_path

    if not os.path.exists(RESPATH):
        os.makedirs(RESPATH)

    video, audio = train.load_data(DATAPATH)
    # Output width is taken from the audio data before standardisation.
    net_out = audio.shape[1]
    video, audio_norm, audio_means, audio_stds = standardize_data(
        video, audio)

    model = train.build_model(net_out)
    model.compile(loss='mse', optimizer='adam')
    model.load_weights(weight_path)

    aud_pred = train.predict(model, video, audio_means, audio_stds)
    np.save(join(RESPATH, 'aud_pred.npy'), aud_pred)
def run(data_name, num_train, num_test, Phi, depth, widths, lc_w_range,
        shift_w_range, optim_name, optim_args, num_epochs, batch_size,
        chkpt_freq):
    """Run one copula experiment and print the stored reference scores.

    Prints the mean of the log-likelihoods returned by ``load_log_ll`` for
    both splits, then hands a ``Copula`` built from ``Phi`` to ``expt``.
    """
    run_id = get_info()
    identifier_id = '%s%s' % (identifier, run_id)

    train_data, test_data = load_data(data_name, num_train, num_test)
    train_ll, test_ll = load_log_ll(data_name, num_train, num_test)

    for label, values in (('Train ideal ll:', train_ll),
                          ('Test ideal ll:', test_ll)):
        print(label, torch.mean(values))

    net = Copula(Phi(depth, widths, lc_w_range, shift_w_range))
    expt(train_data, test_data, net, optim_name, optim_args, identifier_id,
         num_epochs, batch_size, chkpt_freq)
def main():
    """Sample 2 * 13 characters from the couplet model, printing each one.

    At every step the model sees a window of at most 13 previously
    generated indices, the next index is drawn from the renormalised
    output, and the matching entry of ``words`` is printed.
    """
    data, words = load_data()
    model = models.load_model('model.couplets.h5')

    window = 13
    total = 2 * window
    generated = np.zeros((1, total), dtype='uint32')

    for step in range(total):
        first = max(step - window, 0)
        context = generated[:, first:first + window]

        distribution = model.predict(context, verbose=0).astype('float64')
        # Renormalise in float64 before drawing from the distribution.
        distribution /= distribution.sum()
        draw = np.random.multinomial(1, distribution[0], 1)

        index = np.argmax(draw)
        generated[0, step] = index
        print(words[index])
def eval_mode_batch(output_tags, confidences, cities): tagged_data, identifier = load_data(output_tags) num_tags = len(int2tags) - 1 correct = [0]* num_tags guessed = [0] * num_tags gold_correct = [0] * num_tags assert len(tagged_data) == len(confidences) for i in range(len(tagged_data)): sentence = tagged_data[i][0] tags = tagged_data[i][1] tag_confs = confidences[i] ident = identifier[i] gold_ents = ident.split(',')[:num_tags] #Throw away title output_pred_line, entity_confidences, entity_cnts = predict_mode(sentence, tags, tag_confs, cities) predictions = output_pred_line.split(" ### ") if not len(gold_ents) == len(predictions): print 'ident', ident print 'gold_ents', gold_ents raw_input() continue for index in range(len(gold_ents)): match = evaluatePrediction(predictions[index], gold_ents[index]) debugCity = False if index == 3 and debugCity: print 'predictions[index]', predictions[index] print 'gold_ents[index]', gold_ents[index] print 'match', match raw_input() if match == 'skip': continue else: gold_correct[index] += 1 if match == "no_predict": continue if match == 1: correct[index] += 1 guessed[index] += 1 helper.printScores(correct, guessed, gold_correct, False)
def yield_chars():
    """Yield characters sampled from the latest model, capitalising sentences.

    Sampling starts at a random offset into the loaded text. After a '.' has
    been yielded, the next character that has a distinct upper-case form is
    yielded upper-cased.
    """
    text, char_indices, indices_char = load_data()
    model = build_model(indices_char)
    load_latest_model(model)

    start_index = random.randint(0, len(text) - MAXLEN - 1)
    sampled = sample_from_model(text, start_index, char_indices, model,
                                indices_char)

    capitalise_next = False
    for ch in sampled:
        if capitalise_next and ch.upper() != ch:
            capitalise_next = False
            yield ch.upper()
            continue
        yield ch
        if ch == '.':
            capitalise_next = True
def eval_mode_batch(output_tags, confidences, cities):
    """Count per-slot correct / guessed / gold entities and print the scores.

    Each tagged article is converted to a prediction line by
    ``predict_mode``; the line is split on " ### " and compared slot by
    slot with the gold entities from the article identifier. The three
    counters are finally handed to ``helper.printScores``.

    NOTE(review): the source line was collapsed; the nesting below follows
    the statement order and should be confirmed against the original file.
    """
    tagged_data, identifier = load_data(output_tags)
    num_tags = len(int2tags) - 1
    # One counter per entity slot.
    correct = [0] * num_tags
    guessed = [0] * num_tags
    gold_correct = [0] * num_tags
    assert len(tagged_data) == len(confidences)
    for i in range(len(tagged_data)):
        sentence = tagged_data[i][0]
        tags = tagged_data[i][1]
        tag_confs = confidences[i]
        ident = identifier[i]
        gold_ents = ident.split(',')[:num_tags]  #Throw away title
        output_pred_line, entity_confidences, entity_cnts = predict_mode(
            sentence, tags, tag_confs, cities)
        predictions = output_pred_line.split(" ### ")
        # Slot-count mismatch: report, wait for a key press, skip the article.
        if not len(gold_ents) == len(predictions):
            print 'ident', ident
            print 'gold_ents', gold_ents
            raw_input()
            continue
        for index in range(len(gold_ents)):
            match = evaluatePrediction(predictions[index], gold_ents[index])
            # Debug switch; while it is False the block below never runs.
            debugCity = False
            if index == 3 and debugCity:
                print 'predictions[index]', predictions[index]
                print 'gold_ents[index]', gold_ents[index]
                print 'match', match
                raw_input()
            # 'skip' results do not count towards any counter.
            if match == 'skip':
                continue
            else:
                gold_correct[index] += 1
            # 'no_predict' counts as gold only, not as a guess.
            if match == "no_predict":
                continue
            if match == 1:
                correct[index] += 1
            guessed[index] += 1
    helper.printScores(correct, guessed, gold_correct, False)
def evaluate_model(load_model):
    """Print the average validation loss of a saved abstractive summariser.

    ``load_model`` is the path of the state dict to restore. The visible
    CUDA device comes from ``X_SGE_CUDA_DEVICE`` when that variable is set,
    otherwise device '0' is selected.
    """
    args = {
        'max_pos_embed': 512,
        'max_num_sentences': 32,
        'max_summary_length': 96,
        'model_data_dir':
        "/home/alta/summary/pm574/summariser0/lib/model_data/",
    }
    val_batch_size = 200  # okay for K80 & RTX 2080 ti

    cuda_device = os.environ.get('X_SGE_CUDA_DEVICE')
    if cuda_device is None:
        print('running locally...')
        # choose the device (GPU) here
        os.environ["CUDA_VISIBLE_DEVICES"] = '0'
    else:
        # to run on CUED stack machine
        print('running on the stack...')
        print('X_SGE_CUDA_DEVICE is set to {}'.format(cuda_device))
        os.environ['CUDA_VISIBLE_DEVICES'] = cuda_device

    device = 'cuda'
    val_data = load_data(args, 'val')
    val_summary = load_summary(args, 'val')

    abs_sum = AbstractiveSummariser(args, device=device)
    abs_sum.load_state_dict(torch.load(load_model))
    abs_sum.eval()  # switch to evaluation mode
    print("evaluate model: {}".format(load_model))

    vocab_size = abs_sum.decoder.linear_decoder.out_features
    with torch.no_grad():
        avg_val_loss = evaluate2(abs_sum, val_data, val_summary,
                                 val_batch_size, vocab_size, args, device)

    rule = "============================================================================================"
    print(rule)
    print("MODEL = {}".format(load_model))
    print("VLOSS = {}".format(avg_val_loss))
    print(rule)
def predict(dataset, model_file):
    """
    Loads a model file and predicts on the test set.

    Prints the predicted values for the first 10 test examples and the
    error on the whole test set in percent.

    Args:
        dataset: Forwarded to ``load_data``; the third entry of the result
            is used as the ``(x, y)`` test set.
        model_file: Path of the pickled classifier.
    """
    # NOTE(security): unpickling can execute arbitrary code; only load
    # model files from a trusted source.
    # Binary mode plus a context manager: pickle data is bytes, and the
    # handle was previously never closed.
    with open(model_file, 'rb') as model_fh:
        classifier = cPickle.load(model_fh)

    predict_model = theano.function(inputs=[classifier.input],
                                    outputs=classifier.y_pred)

    datasets = load_data(dataset)
    test_set_x, test_set_y = datasets[2]
    test_set_x = test_set_x.get_value()

    predicted_values = predict_model(test_set_x)
    print("Predicted values for the first 10 examples in the test test:")
    print(predicted_values[:10])

    test_model = theano.function(inputs=[classifier.input],
                                 outputs=classifier.errors(test_set_y))
    test_error = test_model(test_set_x)
    print("Test error is %f %%" % (test_error * 100))
def load_initial_train(setlist,
                       version,
                       fraction,
                       save=False,
                       hys_dir=HYS_DIR,
                       train_set_dir=TRAIN_SET_DIR,
                       grid_dir=GRID_DIR):
    """Load the initial training grids and pair each result with its row index.

    Returns ``(grids, indexed_results)`` where ``indexed_results`` has two
    columns: the row index and the result. With ``save=True`` both arrays
    are also written as CSV files into ``train_set_dir``.
    """
    if VERBOSE:
        print('Loading initial training data')

    grids, results = train.load_data(setlist,
                                     hys_dir,
                                     version,
                                     fraction,
                                     grid_dir=grid_dir)

    n_grids = grids.shape[0]
    indexed_results = np.zeros((n_grids, 2))
    indexed_results[:, 0] = np.arange(n_grids)
    indexed_results[:, 1] = results

    if save:
        grids_path = os.path.join(train_set_dir, 'train_grids00.csv')
        np.savetxt(grids_path, grids, fmt='%i', delimiter=',')
        results_path = os.path.join(train_set_dir, 'train_results00.csv')
        np.savetxt(results_path, indexed_results, delimiter=',')

    return (grids, indexed_results)
def main(trained_model, testing_file, viterbi, output_tags="output.tag",
         output_predictions="output.pred"):
    """Tag the test file, write the tags to disk and evaluate them.

    The output file alternates identifier lines with tagged-sentence lines
    (``token_TAG`` pairs joined by spaces). Afterwards the tagged file is
    evaluated with ``eval_mode_batch``.

    Args:
        trained_model: A value containing "crf" (handled by
            ``crf.predict``), a path to a pickled 5-tuple
            ``(clf, previous_n, next_n, word_vocab, other_features)``, or
            that 5-item list itself.
        testing_file: Path forwarded to ``load_data``.
        viterbi: Forwarded to ``predict_tags_n``.
        output_tags: Path of the tag file that is written.
        output_predictions: Path used by the prediction-only mode.
    """
    test_data, identifier = load_data(testing_file)
    evaluate = True

    use_crf = "crf" in trained_model
    ## extract features
    if not use_crf:
        if not isinstance(trained_model, list):
            # Context manager closes the pickle file (it used to leak).
            with open(trained_model, "rb") as model_fh:
                clf, previous_n, next_n, word_vocab, other_features = \
                    pickle.load(model_fh)
        else:
            clf, previous_n, next_n, word_vocab, other_features = \
                trained_model

    # The unused `tic = time.clock()` timer was dropped: its print was
    # already commented out and time.clock no longer exists on Python 3.8+.
    confidences = []
    with open(output_tags, 'w') as f:
        for i in range(len(test_data) + len(identifier)):
            # Floor division: `/` yields a float index on Python 3.
            idx = i // 2
            if i % 2 == 1:
                tokens = test_data[idx][0]
                if use_crf:
                    y, tmp_conf = crf.predict(tokens, trained_model)
                    f.write(" ".join([tokens[j] + "_" + y[j]
                                      for j in range(len(tokens))]))
                else:
                    y, tmp_conf = predict_tags_n(viterbi, previous_n, next_n,
                                                 clf, tokens, word_vocab,
                                                 other_features)
                    f.write(" ".join([tokens[j] + "_" + int2tags[int(y[j])]
                                      for j in range(len(tokens))]))
                assert(len(y) == len(tmp_conf))
                confidences.append(tmp_conf)
                f.write("\n")
            else:
                f.write(identifier[idx])
                f.write("\n")

    if evaluate:
        eval_mode_batch(output_tags, confidences, helper.cities)
    else:
        predict_mode_batch(output_tags, output_predictions, helper.cities)
    return
def get_distance_data(data, temporal_rel):
    """Record, per test sample, its distance and classification outcome.

    Fits a 100-tree random forest on the training split, predicts the test
    split and appends ``{distance: [TruePositive?, TrueNegative?,
    FalsePositive?, FalseNegative?]}`` to ``data`` for every test sample
    whose labels are 0/1.

    Args:
        data: List that is extended in place.
        temporal_rel: Forwarded to ``load_data``.

    Returns:
        The extended ``data`` list. The docstring always announced a
        return value, but nothing was returned before.
    """

    def _outcome_flags(truth, guess):
        # Map one (truth, prediction) pair to its one-hot outcome list, or
        # None when the labels are not the 0/1 values handled here.
        if truth == guess:
            if truth == 1:
                return [True, False, False, False]   # True positive
            if truth == 0:
                return [False, True, False, False]   # True negative
            return None
        if guess == 1:
            return [False, False, True, False]       # False positive
        if guess == 0:
            return [False, False, False, True]       # False negative
        return None

    X, y, distance = load_data(True, temporal_rel, distance=True)
    X_train, X_test, y_train, y_test, distance_train, distance_test = split(X, y, distance)

    rf = RandomForestClassifier(n_jobs=2, n_estimators=100)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    # Positional indexing is kept because the container types returned by
    # split() are not visible from here.
    for i in range(len(X_test)):
        flags = _outcome_flags(y_test[i], y_pred[i])
        if flags is not None:
            data.append({distance_test[i]: flags})

    return data
def exp_raw(dtype):
    """Build and train a CNN that regresses 8 values per 256x256 RGB patch.

    The network is a stack of 3x3 convolutions, 2x2 max-pooling and batch
    normalisation followed by three 1024-unit dense layers and an 8-unit
    output layer. The loss is the root of the mean squared error against
    ``psp`` and is minimised with ``adam``. Training runs for 10 epochs with
    mini-batches of 32 over the data found under a hard-coded path.

    Args:
        dtype: Value assigned to ``theano.config.floatX`` while the dense
            layers are constructed; it is reset to ``'float32'`` afterwards.

    Returns:
        The compiled Theano training function. It takes ``(images, targets)``
        and returns ``[loss, prediction]``.
    """
    layers = lasagne.layers
    relu = lasagne.nonlinearities.rectify

    shp = (None, 3, 256, 256)
    input_var = T.tensor4('input_var', dtype='float32')
    psp = T.dmatrix("psp")

    network = OrderedDict()
    network['input'] = layers.InputLayer(shape=shp, input_var=input_var)
    # network = make_vgg16(network, 'model/vgg16_weights_from_caffe.h5')

    # First conv and segmentation part
    network['conv1_1'] = layers.Conv2DLayer(network['input'], num_filters=64, filter_size=(3, 3),
                                            nonlinearity=relu, W=lasagne.init.GlorotUniform())
    network['conv1_2'] = layers.Conv2DLayer(network['conv1_1'], num_filters=64, filter_size=(3, 3),
                                            nonlinearity=relu)
    network['pool1_1'] = layers.MaxPool2DLayer(network['conv1_2'], pool_size=(2, 2))
    network['norm1_1'] = layers.BatchNormLayer(network['pool1_1'])
    network['conv1_3'] = layers.Conv2DLayer(network['norm1_1'], num_filters=128, filter_size=(3, 3),
                                            nonlinearity=relu)
    network['conv1_4'] = layers.Conv2DLayer(network['conv1_3'], num_filters=128, filter_size=(3, 3),
                                            nonlinearity=relu)
    network['pool1_2'] = layers.MaxPool2DLayer(network['conv1_4'], pool_size=(2, 2))
    network['norm1_2'] = layers.BatchNormLayer(network['pool1_2'])
    network['conv1_5'] = layers.Conv2DLayer(network['norm1_2'], num_filters=256, filter_size=(3, 3),
                                            nonlinearity=relu)
    network['pool1_3'] = layers.MaxPool2DLayer(network['conv1_5'], pool_size=(2, 2))
    network['conv1_6'] = layers.Conv2DLayer(network['pool1_3'], num_filters=256, filter_size=(3, 3),
                                            nonlinearity=relu)
    network['pool1_4'] = layers.MaxPool2DLayer(network['conv1_6'], pool_size=(2, 2))

    # Perspective Transform
    network['norm2'] = layers.BatchNormLayer(network['pool1_4'])
    # network['cast'] = CastingLayer(network['norm2'], dtype)

    # floatX is switched only while the dense head is built; the finally
    # clause makes sure the global setting is reset even if construction fails.
    theano.config.floatX = dtype
    try:
        network['pfc2_1'] = layers.DenseLayer(
            layers.dropout(network['norm2'], p=0.05),
            num_units=1024, nonlinearity=relu)
        network['pfc2_2'] = layers.DenseLayer(
            layers.dropout(network['pfc2_1'], p=0.05),
            num_units=1024, nonlinearity=relu)
        network['pfc2_3'] = layers.DenseLayer(
            layers.dropout(network['pfc2_2'], p=0.05),
            num_units=1024, nonlinearity=relu)
        # loss target 2
        network['pfc_out'] = layers.DenseLayer(
            layers.dropout(network['pfc2_3'], p=0.05),
            num_units=8, nonlinearity=relu)
    finally:
        theano.config.floatX = 'float32'

    predict = layers.get_output(network['pfc_out'])
    loss = T.sqrt(lasagne.objectives.squared_error(predict, psp).mean())
    paras = layers.get_all_params(network['pfc_out'], trainable=True)
    # One shared learning rate per parameter, as expected by this file's adam.
    updates = adam(loss, paras, [theano.shared(np.float32(0.0001)) for _ in range(len(paras))])
    ftrain = theano.function([input_var, psp], [loss, predict], updates=updates)

    def get_inputs(meta, batch, path):
        """Load the image patches and 8-value targets for the ids in ``batch``."""
        images = np.array([read_image(path + 'patch/' + idx + '.jpg', shape=(256, 256))
                           for idx in batch]).astype(np.float32)
        records = [meta[key] for key in batch]
        Ps = np.array([np.array(record[0]).flatten()[0:8] for record in records])
        # The last two target entries are shifted and rescaled in place
        # (presumably to bring them to a range comparable with the other
        # six; confirm against the data definition).
        for P in Ps:
            P[6:8] = (P[6:8] + 1e-3) * 1e4
        return images, Ps

    path = '/home/yancz/text_generator/data/real/'
    dat, meta = load_data(path, 10000, False)

    for epoch in range(10):
        epoch_loss = 0
        batches = 0
        for batch in iterate_minibatch(dat['train'], 32, len(dat['train'])):
            inputs = get_inputs(meta, batch, path)
            l, valp = ftrain(*inputs)
            log(l)
            print(valp)
            epoch_loss += l
            batches += 1
        # Report the epoch average (not the last batch) and tolerate an
        # epoch that produced no batches.
        mean_loss = epoch_loss / max(batches, 1)
        log('loss ' + str(epoch) + ' ' + str(mean_loss))

    return ftrain
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from matlab_port.utils import partition_data, shuffle_data
from train import load_data

# Load the data set, shuffle it, and hold out part of it for testing
# (partition_data is called with split=.8).
X, y = load_data('data/ex4data1_conv.mat', 'numpy')
X, y = shuffle_data(X, y)
X, y, X_test, y_test = partition_data(X, y, split=.8)

# Fit a logistic-regression classifier with C=0.1 on the training part.
classifier = LogisticRegression(C=.1)
classifier.fit(X, y)

# Score the held-out part: fraction of predictions equal to the labels.
z = classifier.predict(X_test)
hits = sum(y_test == z)
accuracy = hits / y_test.size
print("Test accuracy: {acc:.1%}".format(acc=accuracy))
import joblib
from os import path
from train import load_data
from sklearn.metrics import accuracy_score

# Locations of the persisted artefacts of the experiment being evaluated.
DIR_NAME = path.dirname(__file__)
MODELS_FOLDER = path.join('.', 'models')
EXPERIMENT_NAME = path.join(MODELS_FOLDER, 'exp_01_default')
TRANFORMER_NAME = 'tf_std_default_v0.1.pkl'
MODEL_NAME = 'model_mlp_default_v0.1.pkl'

X, y = load_data()

# Load the fitted transformer and model from the experiment folder.
transformer_path = path.join(EXPERIMENT_NAME, TRANFORMER_NAME)
model_path = path.join(EXPERIMENT_NAME, MODEL_NAME)
tf = joblib.load(transformer_path)
model = joblib.load(model_path)

# Apply the stored transformation, predict, and report the accuracy.
X_tf = tf.transform(X)
y_hat = model.predict(X_tf)
score = accuracy_score(y, y_hat)
print('accuracy score {}'.format(score))
# EXTRA_QUERY='(adulterated | scandal | countries | fake)'
# Shooter queries
# EXTRA_QUERY='( injured | wounded | victim )'
# EXTRA_QUERY='( suspect | shooter | identified | arrested | charged )'
if __name__ == "__main__":
    # Command line: <train file> <save file> <extra query>
    trainFile = sys.argv[1]
    saveFile = sys.argv[2]
    extra_query = sys.argv[3]

    # load data and process identifiers
    articles, identifiers = load_data(trainFile)
    identifiers_tmp = []
    titles = []
    for raw_identifier in identifiers:
        fields = raw_identifier.split(",")
        for i in range(NUM_ENTITIES):
            # Best effort: spell out fields that are plain integers and leave
            # everything else (non-numeric or missing fields) untouched.
            # ``Exception`` rather than a bare ``except`` so Ctrl-C and
            # SystemExit are no longer swallowed.
            try:
                fields[i] = int(fields[i])
                fields[i] = inflect_engine.number_to_words(fields[i])
            except Exception:
                pass
        # The first NUM_ENTITIES fields are the entity values; whatever
        # follows is re-joined and kept as the title.
        identifiers_tmp.append(fields[:NUM_ENTITIES])
        titles.append(",".join(fields[NUM_ENTITIES:]))
    identifiers = identifiers_tmp

    # download related files
# Training hyper-parameters; all five are passed to generate_basename below.
batch_size = 16
nb_epoch_mask = 5
nb_epoch_mask_exist = 4
dropout_mask = .2
dropout_exist_mask = .15
basename = generate_basename(batch_size, nb_epoch_mask, nb_epoch_mask_exist, dropout_mask, dropout_exist_mask)

# File-name placeholders, initialised to empty strings.
weight_load = ''
mask_filename = ''
mask_exist_filename = ''
weight_mask_filename = ''
weight_mask_exist_filename = ''

imgs_train, imgs_mask_train, imgs_patient_train, imgs_test = load_data(data_path)


def multiply_mask(mask_filename, mask_exist_filename):
    """Zero out stored masks wherever the stored existence flag is 0.

    Both arguments are paths read with ``np.load``. For every index ``n``
    whose entry in the existence array equals 0, ``imgs_mask_test[n, 0]`` is
    set to 0. The (modified) mask array is returned; nothing is written back
    to disk.
    """
    imgs_mask_test = np.load(mask_filename)
    imgs_mask_exist_test = np.load(mask_exist_filename)
    # NOTE(review): assumes each existence entry compares to a single truth
    # value (e.g. a scalar per sample) - confirm against the saved arrays.
    for n in range(len(imgs_mask_exist_test)):
        if (imgs_mask_exist_test[n] == 0):
            imgs_mask_test[n,0] = 0
    return imgs_mask_test


# Score accumulators for the mask and mask-existence models.
score_mask_train = []
score_mask_val = []
score_exist_mask_train = []
score_exist_mask_val = []

# Ten iterations; the loop body is not part of this excerpt.
for n in range(10):
def test(num_features, model_name):
    """Run ``test_with_data`` on the data returned by ``train.load_data()``.

    ``num_features`` and ``model_name`` are forwarded unchanged. Nothing is
    returned.
    """
    loaded = train.load_data()
    test_with_data(num_features, model_name, loaded)
# ``reload`` is a builtin only on Python 2; fall back to importlib elsewhere.
try:
    reload
except NameError:
    from importlib import reload

# Parenthesised single-argument prints produce the same output on Python 2
# and are valid on Python 3.
print("reload helper")
reload(helper)
helper.load_constants()
print("end load helper")

retrain = True
if retrain:
    num_blocks = 1
    ## num_blocks = 5
    training_file = "../data/tagged_data/EMA/train.tag"
    dev_file = "../data/tagged_data/EMA/dev.tag"
    test_file = "../data/tagged_data/EMA/test.tag"
    trained_model = "trained_model_crf.EMA.p"

    print("load files")
    train_data, train_identifier = train.load_data(training_file)
    # The dev split (not test_file) is what gets loaded as the "test" data.
    test_data, test_identifier = train.load_data(dev_file)
    print("End load files")

    # Context window sizes passed to featureExtract.
    prev_n = 2
    next_n = 2

    print("Start Feature extract on train set")
    trainX, trainY = featureExtract(train_data, train_identifier, prev_n, next_n)
    print("Done Feature extract on train set")
    #trainX, trainY = featureExtract(dev_data, prev_n, next_n)

    print("Start Feature extract on test set")
    testX, testY = featureExtract(test_data, test_identifier, prev_n, next_n)
    print("Done Feature extract on test set")
    #testX, testY = featureExtract(train_data[split_index:], prev_n, next_n)

    trainer = trainModel(1)