def runTest(file1, version, model, mode='words'):
    """Run ``model`` on a SemEval-2016 stance-detection file and print its scores.

    The file is read with ``readfile``, cleaned with ``preprocesstweets`` and
    turned into features with ``getTfidfRepresentation``.  The stance label is
    taken from index 7 of every preprocessed record and integer-encoded with a
    ``LabelEncoder`` before being one-hot encoded into 3 classes.

    Parameters
    ----------
    file1 :
        Dataset handle forwarded unchanged to ``readfile``.
    version : int
        0: training dataset, 1: test dataset, 2: other-domain dataset.
        With 0 the data is split 80/20, ``model`` is fitted for 10 epochs on
        the larger part and scored on the held-out part.  With 1 or 2 the
        already fitted ``model`` is only evaluated on the whole file.
    model :
        Model object exposing ``summary``, ``fit``, ``evaluate`` and
        ``predict``; ``evaluate`` must return a ``(loss, accuracy)`` pair.
    mode : str
        Either ``'words'`` or ``'hashtags'``; forwarded to
        ``getTfidfRepresentation``.
    """
    raw_rows = readfile(file1, version)
    tweets = preprocesstweets(raw_rows, ignoreNONE=False, version=version, lowerCase=True)
    features = getTfidfRepresentation(tweets, version, mode)

    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform([row[7] for row in tweets])
    print(label_encoder.classes_)

    if version == 0:
        # Fit on a random 80% of the file and score on the remaining 20%.
        x_train, x_test, y_train, y_test = train_test_split(features, y, test_size=0.2)
        y_test = np_utils.to_categorical(y_test, num_classes=3)
        y_train = np_utils.to_categorical(y_train, num_classes=3)
        print(x_train.shape[1])
        print(model.summary())
        model.fit(x_train, y_train, epochs=10, verbose=2,
                  validation_data=(x_test, y_test))
        _, held_out_acc = model.evaluate(x_test, y_test, verbose=0)
        held_out_pred = model.predict(x_test)
        print('Training Accuracy: %f' % (held_out_acc * 100))
        print('Training F-Score: ', f1(y_test, held_out_pred) * 100)

    if version in (1, 2):
        # Evaluation only: the model is used as passed in.
        y = np_utils.to_categorical(y, num_classes=3)
        _, test_acc = model.evaluate(features, y)
        test_pred = model.predict(features)
        otherdomain = '(other domain)' if version == 2 else ''
        print('TEST Accuracy ' + otherdomain + ': %f' % ((test_acc * 100)))
        print('TEST F-Score ' + otherdomain + ': ', (f1(y, test_pred) * 100))
def link(self, m1, m2, hypothetical=False, beta=1):
    """Merge the clusters holding mentions ``m1`` and ``m2``, or score that merge.

    Args:
        m1: Antecedent mention id; ``-1`` means "no link".
        m2: Anaphor mention id.
        hypothetical: When true, nothing is committed: the running counters
            are restored after scoring and the resulting F-score is returned.
        beta: Forwarded to the F-score computation.

    Returns:
        The F-score (from ``self.get_f1`` or ``evaluation.f1``) when
        ``hypothetical`` is true, otherwise ``None``.

    Raises:
        AssertionError: If both mentions already share a cluster.
    """
    if m1 == -1:
        if hypothetical:
            return self.get_f1(beta=beta)
        return None

    first, second = self.mention_to_cluster[m1], self.mention_to_cluster[m2]
    assert first != second
    merged = first + second

    # Snapshot the counters so a hypothetical merge can be rolled back.
    snapshot = (self.p_num, self.r_num, self.p_den, self.r_den)
    for cluster in (first, second):
        if len(cluster) == 1:
            self.p_den += 1
    self.update_b3(merged, hypothetical=hypothetical)

    if hypothetical:
        score = evaluation.f1(self.p_num, self.p_den, self.r_num, self.r_den, beta=beta)
        self.p_num, self.r_num, self.p_den, self.r_den = snapshot
        return score

    # Commit the merge: record the link and replace both clusters by their union.
    self.ana_to_ant[m2] = m1
    self.ant_to_anas[m1].append(m2)
    self.clusters.remove(first)
    self.clusters.remove(second)
    self.clusters.append(merged)
    for mention in merged:
        self.mention_to_cluster[mention] = merged
    return None
def link(self, m1, m2, hypothetical=False, beta=1):
    """Merge the clusters holding mentions ``m1`` and ``m2``, or score that merge.

    The whole call is timed under the ``"link"`` timer key.  The timer is
    stopped in a ``finally`` clause so that every exit path (the ``m1 == -1``
    early return, a failed assertion, or a normal return) balances the
    ``timer.start`` call.

    Args:
        m1: Antecedent mention id; ``-1`` means "no link".
        m2: Anaphor mention id.
        hypothetical: When true, nothing is committed: the running counters
            are restored after scoring and the resulting F-score is returned.
        beta: Forwarded to the F-score computation.

    Returns:
        The F-score when ``hypothetical`` is true, otherwise ``None``.

    Raises:
        AssertionError: If both mentions already share a cluster.
    """
    timer.start("link")
    try:
        if m1 == -1:
            return self.get_f1(beta=beta) if hypothetical else None

        c1, c2 = self.mention_to_cluster[m1], self.mention_to_cluster[m2]
        assert c1 != c2
        new_c = c1 + c2

        # Snapshot the counters so a hypothetical merge can be rolled back.
        p_num, r_num, p_den, r_den = self.p_num, self.r_num, self.p_den, self.r_den

        if len(c1) == 1:
            self.p_den += 1
        if len(c2) == 1:
            self.p_den += 1
        self.update_b3(new_c, hypothetical=hypothetical)

        if hypothetical:
            f1 = evaluation.f1(self.p_num, self.p_den, self.r_num, self.r_den, beta=beta)
            self.p_num, self.r_num, self.p_den, self.r_den = p_num, r_num, p_den, r_den
            return f1

        # Commit the merge: record the link and replace both clusters by their union.
        self.ana_to_ant[m2] = m1
        self.ant_to_anas[m1].append(m2)
        self.clusters.remove(c1)
        self.clusters.remove(c2)
        self.clusters.append(new_c)
        for m in new_c:
            self.mention_to_cluster[m] = new_c
        return None
    finally:
        timer.stop("link")
def main():
    """Run repeated 5-fold cross-validation of a ``Predictor`` and print averaged metrics.

    Command-line options are parsed with ``argparse``.  The 5 folds are run
    ``manytimes_n`` (8) times; every fold trains a fresh ``Predictor`` with
    RMSprop and ``BCEWithLogitsLoss`` for at most ``n_epoch`` (800) epochs.
    Every 20 epochs the model is scored on the fold's test split, the best
    scores seen so far are tracked, and training may stop early.  After each
    fold the train/test loss curves are written to ``figure/`` and up to 10
    models that pass the accuracy/AUC thresholds are saved under
    ``<data_dir>/model``.  The averaged fold scores are printed at the end.

    NOTE(review): this block arrived with its line breaks and indentation
    collapsed; the nesting below (especially of the early-stop logic) is
    reconstructed and should be confirmed against the original file.
    """
    # reading in
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", default='data/sampling', help='determine the base dir of the dataset document')
    parser.add_argument("--sample_n", default=1000, type=int, help='starting image index of preprocessing')
    parser.add_argument("--evidence_n", default=20, type=int, help='how many top/bottom tiles to pick from')
    parser.add_argument("--repl_n", default=3, type=int, help='how many resampled replications')
    parser.add_argument("--image_split", action='store_true', help='if use image_split')
    parser.add_argument("--batch_size", default=50, type=int, help="batch size")
    parser.add_argument("--stage_two", action='store_true', help='if only use stage two patients')
    parser.add_argument("--changhai", action='store_true', help='if use additional data')
    args = parser.parse_args()
    feature_size = 32
    #gpu = "cuda:0"
    gpu = None
    # 5-folds cross validation
    dataloader = CVDataLoader(args, gpu, feature_size)
    n_epoch = 800
    lr = 0.0005
    # The stage-two-only run uses a larger weight decay.
    if args.stage_two:
        weight_decay = 0.008
    else:
        weight_decay = 0.005
    manytimes_n = 8  # number of times the whole 5-fold procedure is repeated
    # Output directories for loss curves and saved models.
    if not os.path.isdir('figure'):
        os.mkdir('figure')
    if not os.path.isdir(os.path.join(args.data_dir, 'model')):
        os.mkdir(os.path.join(args.data_dir, 'model'))
    # Best test-split scores of every fold, accumulated over all repetitions.
    acc_folds = []
    auc_folds = []
    c_index_folds = []
    f1_folds = []
    f1_folds_pos = []
    total_round = 0
    model_count = 0  # number of models saved so far (capped at 10 below)
    loss_function = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(0.8))
    for _ in range(manytimes_n):  # averaging
        for i in range(5):
            train_history = []
            test_history = []
            # Per-fold "best so far" state used by the early-stop logic.
            minimum_loss = None
            auc_fold = None
            acc_fold = None
            early_stop_count = 0
            model = Predictor(evidence_size=args.evidence_n, layers=(100, 50, 1), feature_size=feature_size)
            # model.apply(weight_init)
            if gpu:
                model = model.to(gpu)
            optimizer = torch.optim.RMSprop(model.parameters(), lr=lr, weight_decay=weight_decay)
            # optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
            dataloader.set_fold(i)
            X_test, Y_test, df_test = dataloader.get_test()
            # X_train, Y_train, df_train = dataloader.get_train()
            print('starting fold %d' % i)
            for epoch in range(n_epoch):
                #result = model(X_train)
                #loss = nn.functional.binary_cross_entropy(result, Y_train) + nn.functional.mse_loss(result, Y_train)
                # loss = nn.functional.mse_loss(result, Y_train)
                #loss.backward()
                #optimizer.step()
                #optimizer.zero_grad()
                # batch input
                for X_train_batch, Y_train_batch, df_train_batch in dataloader:
                    # print(X_train_batch.shape)
                    result = model(X_train_batch)
                    loss = loss_function(result, Y_train_batch)
                    loss.backward()
                    optimizer.step()
                    optimizer.zero_grad()
                # The "train" metrics below are computed on the LAST batch of
                # the epoch only (``result`` / ``loss`` come from that batch).
                X_train, Y_train, df_train = X_train_batch, Y_train_batch, df_train_batch
                if epoch % 20 == 0:
                    result_test = model(X_test)
                    loss_test = loss_function(result_test, Y_test)
                    #loss_test = nn.functional.mse_loss(result_test, Y_test)
                    acc_train, acc_test = accuracy(result, Y_train), accuracy(
                        result_test, Y_test)
                    auc_train, auc_test = auc(result, Y_train), auc(
                        result_test, Y_test)
                    # c-index is reported as 0 when --changhai is set.
                    if args.changhai:
                        c_index_train, c_index_test = 0, 0
                    else:
                        c_index_train, c_index_test = c_index(
                            result, df_train), c_index(result_test, df_test)
                    recall_train, recall_test = recall(result, Y_train), recall(
                        result_test, Y_test)
                    precision_train, precision_test = precision(
                        result, Y_train), precision(result_test, Y_test)
                    f1_train_pos, f1_test_pos = f1(result, Y_train), f1(
                        result_test, Y_test)
                    f1_train, f1_test = f1(result, Y_train, negative=True), f1(result_test, Y_test, negative=True)
                    train_history.append(
                        (epoch, loss, acc_train, auc_train, c_index_train))
                    test_history.append(
                        (epoch, loss_test, acc_test, auc_test, c_index_test))
                    if epoch % 40 == 0:
                        print(
                            "%s epoch:%d loss:%.3f/%.3f acc:%.3f/%.3f auc:%.3f/%.3f c_index:%.3f/%.3f recall:%.3f/%.3f prec:%.3f/%.3f f1:%.3f/%.3f f1(neg):%.3f/%.3f"
                            % (time.strftime(
                                '%m.%d %H:%M:%S', time.localtime(
                                    time.time())), epoch, loss, loss_test,
                               acc_train, acc_test, auc_train, auc_test,
                               c_index_train, c_index_test, recall_train,
                               recall_test, precision_train, precision_test,
                               f1_train_pos, f1_test_pos, f1_train, f1_test))
                    # early stop
                    # A new best is recorded when the test loss improves by
                    # more than 0.5% over the previous best ...
                    if minimum_loss is None or minimum_loss * 0.995 > loss_test:
                        # if minimum_loss is None or minimum_loss > loss_test:
                        # ... unless the negative-class train F1 is still 0,
                        # in which case the rest of this epoch is skipped.
                        if f1_train == 0:
                            continue
                        minimum_loss = loss_test
                        auc_fold = auc_test
                        acc_fold = acc_test
                        c_index_fold = c_index_test
                        f1_fold_pos = f1_test_pos
                        f1_fold = f1_test
                        early_stop_count = 0
                    # ... or when test AUC improves (and is above 0.5) without
                    # test accuracy getting worse.
                    elif auc_test > auc_fold and auc_test > 0.5 and acc_test >= acc_fold:
                        minimum_loss = loss_test
                        auc_fold = auc_test
                        acc_fold = acc_test
                        c_index_fold = c_index_test
                        f1_fold_pos = f1_test_pos
                        f1_fold = f1_test
                        early_stop_count = 0
                    else:
                        early_stop_count += 1
                    if early_stop_count > 2 and epoch > 100:
                        if args.stage_two:
                            if auc_fold > 0.55:
                                print('early stop at epoch %d' % epoch)
                                break
                        elif early_stop_count > 3:
                            print('early stop at epoch %d' % epoch)
                            break
                # After epoch 500 a new optimizer with a smaller learning rate
                # and larger weight decay is created on every epoch.
                if epoch > 500:
                    optimizer = torch.optim.RMSprop(
                        model.parameters(),
                        lr * 0.6,
                        weight_decay=weight_decay * 1.2)
            train_history = np.array(train_history)
            test_history = np.array(test_history)
            # NOTE(review): c_index_fold / f1_fold / f1_fold_pos are never
            # reset per fold; if no "best" was recorded in this fold they hold
            # the previous fold's values (or are undefined in the first fold).
            acc_folds.append(acc_fold)
            auc_folds.append(auc_fold)
            f1_folds.append(f1_fold)
            f1_folds_pos.append(f1_fold_pos)
            c_index_folds.append(c_index_fold)
            # Column 0 of the history is the epoch, column 1 the loss.
            plt.plot(train_history[:, 0], train_history[:, 1], label='train')
            plt.plot(test_history[:, 0], test_history[:, 1], label='test')
            plt.legend()
            plt.savefig('figure/sample_%d_fold%d.png' % (args.sample_n, i))
            plt.cla()
            # Keep at most 10 models that pass both thresholds.
            if acc_fold > 0.7 and auc_fold > 0.6 and model_count < 10:
                model.save(args.data_dir + "/model/model_%d" % model_count)
                model_count += 1
            print("acc:%.3f\tauc:%.3f\tc_index:%.3f\tf1:%.3f" %
                  (acc_fold, auc_fold, c_index_fold, f1_fold))
            total_round += 1
            # Drop references to per-fold tensors before the next fold when
            # running on a GPU.
            if gpu:
                del dataloader.X_train, dataloader.Y_train, dataloader.X_test, dataloader.Y_test
                del X_test, Y_test, X_train, Y_train, model, optimizer
                torch.cuda.empty_cache()
    # Average over all 5 * manytimes_n folds.
    print('CV-acc:%.3f CV-auc:%.3f CV-c-index:%.3f f1:%.3f f1(neg):%.3f' %
          (sum(acc_folds) / 5 / manytimes_n, sum(auc_folds) / 5 / manytimes_n,
           sum(c_index_folds) / 5 / manytimes_n,
           sum(f1_folds_pos) / 5 / manytimes_n,
           sum(f1_folds) / 5 / manytimes_n))
def get_f1(self, beta=1):
    """Return ``evaluation.f1`` of the current running counters.

    Args:
        beta: Forwarded to ``evaluation.f1`` as its ``beta`` keyword.

    Returns:
        Whatever ``evaluation.f1`` returns for
        ``(p_num, p_den, r_num, r_den)`` read from ``self``.
    """
    p_counts = (self.p_num, self.p_den)
    r_counts = (self.r_num, self.r_den)
    return evaluation.f1(*p_counts, *r_counts, beta=beta)
def main():
    """Main function for training and testing.

    Builds the data loaders, model, criterion and trainer from the parsed
    options, optionally resuming from a checkpoint, and then runs one of
    three modes:

    * ``opt.valOnly``  - validate the checkpointed epoch and exit.
    * ``opt.testOnly`` - test the checkpointed epoch, print NCE / PR / F-1 /
      ROC statistics overall and binned by input posterior, by sequence
      length and by predicted confidence (calibration, including ECE), write
      the PR plot and ``result.npz`` into ``opt.resume``, and exit.
    * otherwise        - train from ``start_epoch`` to ``opt.nEpochs``,
      validating and checkpointing after each epoch unless ``opt.debug`` is
      set, then run a final test and plot the logs.

    Raises:
        AssertionError: In ``valOnly`` / ``testOnly`` mode when no epoch has
            been trained yet.
        SystemExit: Always raised at the end of ``valOnly`` / ``testOnly``.
    """
    # Parse command line arguments and cache
    opt = opts.Opts().args
    utils.savecmd(opt.resume, sys.argv)

    utils.print_color_msg("==> Setting up data loader")
    train_loader, val_loader, test_loader = dataloader.create(opt)

    # Load checkpoint if specified, None otherwise
    utils.print_color_msg("==> Checking checkpoints")
    checkpoint = checkpoints.load(opt)

    utils.print_color_msg("==> Setting up model and criterion")
    model, optim_state = init.setup(opt, checkpoint)
    loss_fn = criterion.setup(opt, checkpoint)

    utils.print_color_msg("==> Loading trainer")
    trainer = train.create_trainer(model, loss_fn, opt, optim_state)

    best_loss = float('Inf')
    val_loss = float('Inf')
    start_epoch = max([1, opt.epochNum])
    if checkpoint is not None:
        start_epoch = checkpoint['epoch'] + 1
        best_loss = checkpoint['loss']
        print("".ljust(4) + "Previous best loss: "
              + utils.color_msg('%.5f' % best_loss))

    if opt.valOnly:
        assert start_epoch > 1, "There must be at least one epoch"
        utils.print_color_msg("==> Validation:")
        print("".ljust(4) + "=> Epoch %i" % (start_epoch - 1))
        trainer.val(val_loader, start_epoch - 1)
        sys.exit()

    if opt.testOnly:
        assert start_epoch > 1, "There must be at least one epoch"
        utils.print_color_msg("==> Testing:")
        print("".ljust(4) + "=> Epoch %i" % (start_epoch - 1))
        _, prediction, reference, post, seq_length = trainer.test(
            test_loader, start_epoch - 1)

        # The trainer returns raw scores; squash them into probabilities.
        prediction = F.sigmoid(torch.Tensor(prediction)).numpy()
        nce = evaluation.nce(reference, prediction)
        precision, recall, area, threshold = evaluation.pr(
            reference, prediction)
        # "_bl" = baseline: the input posterior used directly as confidence.
        precision_bl, recall_bl, area_bl, _ = evaluation.pr(reference, post)
        f1, f1_precision, f1_recall, f1_threshold = evaluation.f1(
            precision, recall, threshold)
        _, _, roc_area = evaluation.roc(reference, prediction)

        # Calculate stats for sequences binned by the posterior
        limits = np.linspace(0, 1, 11).tolist()
        utils.print_color_msg('\n\nEffect of Input Posterior on Performance')
        for lower, upper in zip(limits[:-1], limits[1:]):
            ref, pred, p = evaluation.bin_results(
                reference, prediction, post, measure=post,
                lower_limit=lower, upper_limit=upper)
            if ref.size:
                nce_post = evaluation.nce(ref, pred)
                nce_post_bl = evaluation.nce(ref, p)
                precision_post, recall_post, area_post, threshold_post = evaluation.pr(
                    ref, pred)
                precision_post_bl, recall_post_bl, area_post_bl, threshold_post_bl = evaluation.pr(
                    ref, p)
                f1_post, _, _, _ = evaluation.f1(
                    precision_post, recall_post, threshold_post)
                f1_post_bl, _, _, _ = evaluation.f1(
                    precision_post_bl, recall_post_bl, threshold_post_bl)
                _, _, roc_area_post = evaluation.roc(ref, pred)
                print('%.1f. - %.1f. %d Results (model/bl) NCE: %.4f. , %.4f. AUC(PR): %.4f. , %.4f. F-1: %.4f. , %.4f. AUC(ROC): %.4f.'
                      % (lower, upper, int(ref.size), nce_post, nce_post_bl,
                         area_post, area_post_bl, f1_post, f1_post_bl,
                         roc_area_post))
            else:
                print('%.1f. - %.1f. Empty' % (lower, upper))

        # Calculate stats for sequences binned by sequence length
        limits = [0, 2, 3, 6, 10, 20, 40]
        utils.print_color_msg('\n\nEffect of Sequence Length on Performance')
        for lower, upper in zip(limits[:-1], limits[1:]):
            ref, pred, p = evaluation.bin_results(
                reference, prediction, post, measure=seq_length,
                lower_limit=lower, upper_limit=upper)
            if ref.size:
                nce_len = evaluation.nce(ref, pred)
                nce_len_bl = evaluation.nce(ref, p)
                precision_len, recall_len, area_len, threshold_len = evaluation.pr(
                    ref, pred)
                precision_len_bl, recall_len_bl, area_len_bl, threshold_len_bl = evaluation.pr(
                    ref, p)
                f1_len, _, _, _ = evaluation.f1(
                    precision_len, recall_len, threshold_len)
                f1_len_bl, _, _, _ = evaluation.f1(
                    precision_len_bl, recall_len_bl, threshold_len_bl)
                _, _, roc_area_len = evaluation.roc(ref, pred)
                print('%d - %d %d Results (model/bl) NCE: %.4f. , %.4f. AUC: %.4f. , %.4f. F-1: %.4f. , %.4f. AUC(ROC): %.4f.'
                      % (lower, upper, int(ref.size), nce_len, nce_len_bl,
                         area_len, area_len_bl, f1_len, f1_len_bl,
                         roc_area_len))
            else:
                print('%d - %d Empty' % (lower, upper))

        # Calculate calibration stats
        limits = np.linspace(0, 1, 11).tolist()
        print('\n\nCalibration Stats')
        ece = 0
        for lower, upper in zip(limits[:-1], limits[1:]):
            ref, pred, p = evaluation.bin_results(
                reference, prediction, post, measure=prediction,
                lower_limit=lower, upper_limit=upper)
            if ref.size:
                accuracy_bin = np.mean(ref)
                confidence_bin = np.mean(pred)
                posterior_bin = np.mean(p)
                # ECE: |accuracy - confidence| of each bin, weighted by the
                # fraction of samples that fall into the bin.
                ece += abs(accuracy_bin - confidence_bin) * len(ref) / len(reference)
                print('%.1f. - %.1f. %d Reference: %.4f. , Prediction: %.4f. , Posterior: %.4f.'
                      % (lower, upper, int(ref.size), accuracy_bin,
                         confidence_bin, posterior_bin))
            else:
                print('%.1f. - %.1f. Empty' % (lower, upper))

        # Print Test Stats
        # The last slot is the ECE accumulated above (it previously printed
        # the NCE a second time).
        print('\n\nTest Stats')
        print("".ljust(7) + "\nNCE: %.4f. \nAUC(PR): %.4f. \nF-1: %.4f. p: %.4f. r: %.4f. t: %.4f. \nAUC(ROC): %.4f. \nECE: %.4f. "
              % (nce, area, f1, f1_precision, f1_recall, f1_threshold,
                 roc_area, ece))

        trainer.logger['test'].write('NCE: %f\nAUC(PR): %f\n' % (nce, area))
        evaluation.plot_pr([precision, precision_bl], [recall, recall_bl],
                           [area, area_bl], ['BiLatticeRNN', 'posterior'],
                           opt.resume)
        np.savez(os.path.join(opt.resume, 'result.npz'),
                 prediction=prediction, reference=reference, posteriors=post)
        sys.exit()

    utils.print_color_msg("==> Training:")
    for epoch in range(start_epoch, opt.nEpochs + 1):
        print("".ljust(4) + "=> Epoch %i" % epoch)
        best_model = False
        _ = trainer.train(train_loader, epoch, val_loss)

        # Debug runs skip validation and checkpointing.
        if not opt.debug:
            val_loss = trainer.val(val_loader, epoch)
            if val_loss < best_loss:
                best_model = True
                print("".ljust(4) + "** Best model: "
                      + utils.color_msg('%.4f' % val_loss))
                best_loss = val_loss
            checkpoints.save(epoch, trainer.model, loss_fn,
                             trainer.optim_state, best_model, val_loss, opt)

    if not opt.debug:
        utils.print_color_msg("==> Testing:")
        _, prediction, reference, _, _ = trainer.test(test_loader, opt.nEpochs)
        prediction = F.sigmoid(torch.Tensor(prediction)).numpy()
        nce = evaluation.nce(reference, prediction)
        precision, recall, area, _ = evaluation.pr(reference, prediction)
        utils.print_color_msg("".ljust(7) + "NCE: %.4f. AUC(PR): %.4f" % (nce, area))
        trainer.logger['test'].write('NCE: %f\nAUC(PR): %f\n' % (nce, area))
        evaluation.plot_pr([precision], [recall], [area], ['BiLatticeRNN'],
                           opt.resume)

    # Flush write out and reset pointer
    for open_file in trainer.logger.values():
        open_file.flush()
        open_file.seek(0)
    plot.plot(opt.resume, opt.onebest)
def main():
    """Run repeated 5-fold cross-validation of a ``GNN`` model on a graph dataset.

    Command-line options are parsed with ``argparse``.  The graph dataset is
    rebuilt with ``construct_graph_dataset`` on every run (the cache-loading
    branch is disabled with ``if False``) and pickled under
    ``<data_dir>/graph``.  ``CrossValidationSplitter`` then yields
    ``5 * n_manytimes`` train/test splits; each one trains a fresh ``GNN``
    with RMSprop and ``BCEWithLogitsLoss`` for at most ``n_epoch`` (80)
    epochs, tracks the best test-split scores, may stop early, and may save
    the model under ``<data_dir>/model``.  Averaged scores are printed last.

    NOTE(review): this block arrived with its line breaks and indentation
    collapsed; the nesting below (especially of the early-stop logic) is
    reconstructed and should be confirmed against the original file.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", default='data/sampling', help='determine the base dir of the dataset document')
    parser.add_argument("--sample_n", default=2000, type=int, help='starting image index of preprocessing')
    parser.add_argument("--evidence_n", default=500, type=int, help='how many top/bottom tiles to pick from')
    parser.add_argument("--repl_n", default=3, type=int, help='how many resampled replications')
    parser.add_argument("--image_split", action='store_true', help='if use image_split')
    parser.add_argument("--batch_size", default=200, type=int, help="batch size")
    parser.add_argument("--stage_two", action='store_true', help='if only use stage two patients')
    parser.add_argument("--threshold", default=25, type=float, help='threshold')
    parser.add_argument("--changhai", action='store_true', help='if use additional data')
    parser.add_argument("--TH", action='store_true')
    args = parser.parse_args()
    gpu = "cuda:0"
    n_epoch = 80
    # Best test-split scores of every fold, accumulated over all repetitions.
    acc_folds = []
    auc_folds = []
    c_index_folds = []
    f1_folds = []
    f1_folds_pos = []
    unsuccessful_count = 0
    model_count = 0  # suffix of the saved model file name
    n_manytimes = 8  # number of times the whole 5-fold procedure is repeated
    # caching
    # The cache-loading branch is switched off, so the dataset is always
    # rebuilt and the pickles are always rewritten.
    if False:
        # if os.path.exists(os.path.join(args.data_dir, 'graph', 'graph_dataset.pkl')) and os.path.exists(os.path.join(args.data_dir, 'graph', 'graph_df.pkl')):
        print("loading cached graph data")
        with open(os.path.join(args.data_dir, 'graph', 'graph_dataset.pkl'), 'rb') as file:
            dataset = pickle.load(file)
        with open(os.path.join(args.data_dir, 'graph', 'graph_df.pkl'), 'rb') as file:
            df = pickle.load(file)
    else:
        if not os.path.exists(os.path.join(args.data_dir, 'graph')):
            os.mkdir(os.path.join(args.data_dir, 'graph'))
        dataset, df = construct_graph_dataset(args, gpu)
        with open(os.path.join(args.data_dir, 'graph', 'graph_dataset.pkl'), 'wb') as file:
            pickle.dump(dataset, file)
        with open(os.path.join(args.data_dir, 'graph', 'graph_df.pkl'), 'wb') as file:
            pickle.dump(df, file)
    splitter = CrossValidationSplitter(dataset, df, n=5, n_manytimes=n_manytimes)
    # criterion = torch.nn.CrossEntropyLoss()
    criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(0.4))
    fold_num = 0
    if not os.path.isdir(os.path.join(args.data_dir, 'model')):
        os.mkdir(os.path.join(args.data_dir, 'model'))
    for train_dataset, test_dataset, train_df, test_df in splitter:
        # Printed as "<repetition>-<fold within repetition>".
        print("starting fold %d-%d" % (fold_num // 5, fold_num % 5))
        train_loader = DataLoader(train_dataset, batch_size=args.batch_size)
        test_loader = DataLoader(test_dataset, batch_size=args.batch_size)
        train_history = []
        test_history = []
        # Per-fold "best so far" state used by the early-stop logic.
        minimum_loss = None
        auc_fold = None
        acc_fold = None
        early_stop_count = 0
        model = GNN(32).to(gpu)
        optimizer = torch.optim.RMSprop(model.parameters(), lr=0.0004, weight_decay=0.001)
        for epoch in range(n_epoch):
            model.train()
            for data in train_loader:  # Iterate in batches over the training dataset.
                y_pred = model(data.x, data.edge_index, data.batch.to(gpu)).view(
                    -1)  # Perform a single forward pass.
                loss = criterion(y_pred, data.y)  # Compute the loss.
                loss.backward()  # Derive gradients.
                optimizer.step()  # Update parameters based on gradients.
                optimizer.zero_grad()  # Clear gradients.
            # ``epoch % 1 == 0`` is always true: evaluation runs every epoch.
            if epoch % 1 == 0:
                model.eval()
                y_pred_train, y_train = concat_result(train_loader, model, gpu)
                y_pred_test, y_test = concat_result(test_loader, model, gpu)
                loss_train, loss_test = criterion(y_pred_train, y_train), criterion(
                    y_pred_test, y_test)
                #loss_test = nn.functional.mse_loss(result_test, Y_test)
                acc_train, acc_test = accuracy(y_pred_train, y_train), accuracy(
                    y_pred_test, y_test)
                auc_train, auc_test = auc(y_pred_train, y_train), auc(y_pred_test, y_test)
                # The zero-c-index branch is disabled; c-index is always computed.
                if False:
                    c_index_train, c_index_test = 0, 0
                else:
                    c_index_train, c_index_test = c_index(
                        y_pred_train, train_df), c_index(y_pred_test, test_df)
                f1_train, f1_test = f1(y_pred_train, y_train, negative=True), f1(y_pred_test, y_test, negative=True)
                if epoch % 5 == 0:
                    print(
                        f'Epoch:{epoch:03d} Loss:{loss_train:.3f}/{loss_test:.3f} ACC:{acc_train:.3f}/{acc_test:.3f} AUC:{auc_train:.3f}/{auc_test:.3f} CI:{c_index_train:.3f}/{c_index_test:.3f} f1(neg):{f1_train:.3f}/{f1_test:.3f}'
                    )
                # early stop
                # A new best is recorded when the test loss improves by more
                # than 0.3% over the previous best ...
                if minimum_loss is None or minimum_loss * 0.997 > loss_test:
                    # if minimum_loss is None or minimum_loss > loss_test:
                    # ... unless the negative-class train F1 is still 0, in
                    # which case the rest of this epoch is skipped.
                    if f1_train == 0:
                        continue
                    minimum_loss = loss_test
                    auc_fold = auc_test
                    acc_fold = acc_test
                    c_index_fold = c_index_test
                    f1_fold = f1_test
                    early_stop_count = 0
                    if acc_fold > 0.75 and auc_fold > 0.75:
                        model.save(args.data_dir + "/model/graph_%d" % model_count)
                #elif auc_test > auc_fold and auc_test>0.5 and acc_test >= acc_fold:
                #    minimum_loss = loss_test
                #    auc_fold = auc_test
                #    acc_fold = acc_test
                #    c_index_fold = c_index_test
                #    f1_fold = f1_test
                #    early_stop_count = 0\
                # NOTE(review): ``c_index_fold`` appears on BOTH sides of this
                # comparison, so it cancels out and only auc + acc are
                # compared; the right-hand side may have been meant to use
                # ``c_index_test`` - confirm with the author.
                elif auc_fold + acc_fold + c_index_fold < auc_test + acc_test + c_index_fold:
                    minimum_loss = loss_test
                    auc_fold = auc_test
                    acc_fold = acc_test
                    c_index_fold = c_index_test
                    f1_fold = f1_test
                    early_stop_count = 0
                    if acc_fold > 0.75 and auc_fold > 0.75:
                        model.save(args.data_dir + "/model/graph_%d" % model_count)
                else:
                    early_stop_count += 1
                # Leftover debugging hook for a (near-)perfect AUC; does nothing.
                if abs(auc_fold - 1) < 0.0001:
                    pass
                    #print('wtf')
                if early_stop_count > 3 and epoch > 25:
                    if args.stage_two:
                        if auc_fold > 0.55 and acc_fold > 0.55:
                            print('early stop at epoch %d' % epoch)
                            # Reload the best saved weights and move on to the
                            # next model file index.
                            if acc_fold > 0.75 and auc_fold > 0.75:
                                model.load(args.data_dir + "/model/graph_%d" % model_count)
                                model_count += 1
                            break
                    elif early_stop_count > 3:
                        print('early stop at epoch %d' % epoch)
                        break
        # NOTE(review): c_index_fold / f1_fold are never reset per fold; if no
        # "best" was recorded in this fold they hold the previous fold's
        # values (or are undefined in the first fold).
        acc_folds.append(acc_fold)
        auc_folds.append(auc_fold)
        f1_folds.append(f1_fold)
        c_index_folds.append(c_index_fold)
        fold_num += 1
        print("acc:%.3f\tauc:%.3f\tc_index:%.3f\tf1:%.3f" %
              (acc_fold, auc_fold, c_index_fold, f1_fold))
    # Average over all 5 * n_manytimes folds.
    total_count = 5 * n_manytimes
    print('CV-acc:%.3f CV-auc:%.3f CV-c-index:%.3f f1(neg):%.3f' %
          (sum(acc_folds) / total_count, sum(auc_folds) / total_count,
           sum(c_index_folds) / total_count, sum(f1_folds) / total_count))
def convModel(tweets, stances, tweets_test, stances_test):
    """Train a convolutional stance classifier on ``tweets`` and print its scores.

    The tweets are tokenised with a Keras ``Tokenizer``, padded to the
    longest tweet, and fed through a frozen pre-trained embedding layer into
    the model returned by ``createModelC``.  80% of ``tweets`` is used for
    fitting and the remaining 20% for validation and the printed scores.

    Parameters
    ----------
    tweets : list of str
        Training tweets (whitespace-separated text).
    stances : sequence of int
        Class index (0..2) for every entry of ``tweets``.
    tweets_test : list of str
        Held-out tweets.  Currently unused: the code that tokenised them is
        commented out.
    stances_test : sequence of int
        Class index (0..2) for every held-out tweet; only one-hot encoded
        and counted here.

    Notes
    -----
    The tokenizer is fitted with ``fit_on_texts``.  The previous
    ``fit_on_sequences`` call expects lists of integer indices and does not
    build ``word_index``, which left the vocabulary empty for text input.
    """
    # General parameters
    embeding_dim = 200
    batch_size = 64
    num_epochs = 20

    print('Fitting tokenizer')
    tokenizer = Tokenizer()
    # NOTE(review): ``tweets2`` is not a parameter of this function and must
    # exist at module level; it may have been meant to be ``tweets_test`` -
    # confirm with the author.
    tokenizer.fit_on_texts(tweets + tweets2)
    max_length = max(len(s.split()) for s in tweets + tweets2)
    print('max_length', max_length)
    # +1 because Keras word indices start at 1 (0 is reserved for padding).
    vocab_size = len(tokenizer.word_index) + 1

    # Train and test split
    print('Train and test split')
    x_train, x_test, y_train, y_test = train_test_split(tweets, stances, test_size=0.2)
    print('x_train: ', len(x_train), 'x_test', len(x_test))

    # Training data
    #traindata = np.array(x_train)
    #testdata = np.array(x_test)
    trainTokens = tokenizer.texts_to_sequences(x_train)
    Xtrain = pad_sequences(trainTokens, maxlen=max_length, padding='post')
    XtestTokens = tokenizer.texts_to_sequences(x_test)
    Xtest = pad_sequences(XtestTokens, maxlen=max_length, padding='post')

    #============ TEST DATA =============================================
    #testgroup = np.array(tweets_test)
    #testGroupTokens = tokenizer.texts_to_sequences(tweets_test)
    #XtestGroup = pad_sequences(testGroupTokens, maxlen=max_length, padding='post')
    #print('Xtrain padding: ', len(Xtrain), 'Xtest padding: ', len(Xtest), 'XtestGroup padding: ', len(XtestGroup))

    # Convert stances to categorical output
    y_test = np_utils.to_categorical(y_test, num_classes=3)
    y_train = np_utils.to_categorical(y_train, num_classes=3)
    y_testGroup = np_utils.to_categorical(stances_test, num_classes=3)
    print('y_test: ', len(y_test), 'y_train: ', len(y_train), 'y_testGroup: ', len(stances_test))

    print('Loading embeddings..')
    # Load the pre-trained vectors and create a frozen embedding layer
    wv_from_bin = KeyedVectors.load_word2vec_format(datapath('E:/glove/glove.twitter.27B.200dGINSIM.txt'), binary=False)
    embedding_vectors = get_weight_matrix2(wv_from_bin, tokenizer.word_index.items())
    embedding_layer = Embedding(vocab_size, embeding_dim, weights=[embedding_vectors],
                                input_length=max_length, trainable=False)

    # Create the model
    print('Create and compile the model..')
    model = createModelC(max_length, embedding_layer)
    model.compile(loss="categorical_hinge", optimizer="adam", metrics=[f1])
    model.summary(85)

    print('Fitting the model..')
    history = model.fit(Xtrain, y_train, batch_size=batch_size, epochs=num_epochs,
                        validation_data=(Xtest, y_test), verbose=2)
    print('History', history.history)

    # evaluate
    print('Predicting (training)..')
    ypred = model.predict(Xtest)
    # NOTE(review): the model is compiled with metrics=[f1], so index 0 of
    # ``evaluate`` is the loss, not an accuracy, despite the label below.
    print('Accuracy (TRAIN): %f' % (model.evaluate(Xtest, y_test)[0]*100))
    print('FScore (TRAIN): %f' % (f1(y_test, ypred)*100))
    print('Predicting (testing)..')