from sklearn.ensemble import RandomForestClassifier

# extract_data, extract_labels and semeval_f1_taskA are repo-local helpers.


def main():
    semeval_dir = 'semeval'
    train2013 = "task-B-train.20140221.tsv"
    dev2013 = "task-B-dev.20140225.tsv"
    test2013_sms = "task-B-test2013-sms.tsv"
    test2013_twitter = "task-B-test2013-twitter.tsv"
    test2014_twitter = "task-B-test2014-twitter.tsv"
    test2014_livejournal = "task-B-test2014-livejournal.tsv"
    test2014_sarcasm = "test_2014_sarcasm.tsv"
    test15 = "task-B-test2015-twitter.tsv"
    train16 = "task-A-train-2016.tsv"
    dev2016 = "task-A-dev-2016.tsv"
    devtest2016 = "task-A-devtest-2016.tsv"
    test2016 = "SemEval2016-task4-test.subtask-A.tsv"

    data_dir = 'ACL/ep15params'
    models = ['L1A', 'L2A', 'L2B', 'L3A', 'L3B', 'L3C', 'L3E', 'L3F', 'L3G', 'L3H']
    pred_prob = 'predictions_probs'
    pred_pred = 'predictions_pred'

    files_training = [train2013, dev2013, train16, dev2016, devtest2016]
    sentpos_train = [2, 2, 1, 1, 1]  # label column positions per file, as consumed by extract_labels
    files_test = [test2016, test15, test2014_twitter, test2013_twitter,
                  test2014_livejournal, test2014_sarcasm]
    sentpos_test = [1, 2, 2, 2, 2, 2]

    X_train, _ = extract_data(data_dir, files_training, models)
    y_train = extract_labels(semeval_dir, files_training, sentpos_train)

    testSets = {}
    goldSets = {}
    predModel = {}
    for file, spos in zip(files_test, sentpos_test):
        X_test, y_pred_model = extract_data(data_dir, [file], models)
        y_test = extract_labels(semeval_dir, [file], [spos])
        testSets[file] = X_test
        goldSets[file] = y_test
        predModel[file] = y_pred_model

    print X_train.shape
    print y_train.shape

    # Random forest stacked on top of the per-model predictions
    print 'Fit model'
    model = RandomForestClassifier(n_estimators=300, max_depth=3,
                                   max_features=15, bootstrap=True, n_jobs=4)
    model.fit(X_train, y_train)

    print 'Compute score'
    for file in files_test:
        X_test = testSets[file]
        y_test = goldSets[file]
        y_pred = model.predict(X_test)
        print 'Set:\t{}\tRF\tScore:\t\t{}'.format(file, semeval_f1_taskA(y_test, y_pred))
        for m in models:
            y_pred = predModel[file][m]
            print 'Set:\t{}\t{}\tScore:\t\t{}'.format(file, m, semeval_f1_taskA(y_test, y_pred))
        print '\n'
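# `semeval_f1_taskA` is not shown in this file. A minimal sketch of the official
# SemEval Task A measure, assuming the 0=negative, 1=neutral, 2=positive encoding
# used by the prediction-writing code further below: the score is the average of
# the F1 of the positive and the negative class; neutral F1 is left out.
from sklearn.metrics import f1_score


def semeval_f1_taskA_sketch(y_true, y_pred):
    # per-class F1 in label order [negative, neutral, positive]
    f1s = f1_score(y_true, y_pred, average=None, labels=[0, 1, 2])
    return (f1s[0] + f1s[2]) / 2.0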
import getopt
import os
import sys
import time
import cPickle

import numpy
import theano
import theano.tensor as T
from theano import function
from tqdm import tqdm

# repo-local modules: layer classes, minibatch/update helpers and the scorer
import nn_layers
import sgd_trainer


def main(argv):
    ##########
    # LAYERS #
    ##########
    HOME_DIR = "semeval_parsed"
    input_fname = '200M'
    data_dir = HOME_DIR + '_' + input_fname

    update_type = 'adadelta'
    batch_size = 1000
    test_type = ''
    use_reg = False
    compute_paramdist = False
    max_norm = 0
    reg = None
    rho = 0.95
    eps = 1e-6

    argv = map(lambda x: x.replace('\r', ''), argv)
    try:
        opts, args = getopt.getopt(argv, "t:u:r:pb:m:e:",
                                   ["test_type=", "update=", "rho=",
                                    "batch_size=", "max_norm=", "eps="])
    except getopt.GetoptError as e:
        print e
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-u", "--update"):
            update_type = arg
        elif opt in ("-b", "--batch_size"):
            batch_size = int(arg)
        elif opt in ("-t", "--test_type"):
            test_type = arg
        elif opt in ("-m", "--max_norm"):
            max_norm = int(arg)
        elif opt in ("-r", "--rho"):
            rho = float(arg)
        elif opt in ("-e", "--eps"):
            eps = float(arg)
        elif opt == "-p":
            compute_paramdist = True

    print update_type
    print batch_size
    print max_norm
    print rho
    print eps

    # eps encoded as its positive decimal exponent, e.g. 1e-6 -> 6
    printeps = abs(int(numpy.floor(numpy.log10(numpy.abs(eps)))))
    # announce the results file this run will write (same format as the dump below)
    print data_dir + '/supervised_results_{}{}{}rho{}eps{}.p'.format(
        test_type, update_type, max_norm, int(rho * 100), printeps)

    numpy_rng = numpy.random.RandomState(123)

    print "Load Parameters"
    parameter_map = cPickle.load(
        open(data_dir + '/parameters_distant_{}.p'.format(test_type), 'rb'))
    input_shape = parameter_map['inputShape']
    filter_width = parameter_map['filterWidth']
    n_in = parameter_map['n_in']
    st = parameter_map['st']
    batch_size = input_shape[0]  # batch size is fixed by the pre-trained input shape

    def relu(x):
        return x * (x > 0)

    activation = relu

    tweets = T.imatrix('tweets_train')
    y = T.lvector('y')
    batch_tweets = T.imatrix('batch_x_q')
    batch_y = T.lvector('batch_y')

    lookup_table_words = nn_layers.LookupTableFast(
        W=parameter_map['LookupTableFastStaticW'].get_value(),
        pad=filter_width - 1
    )
    parameter_map['LookupTableFastStaticW'] = lookup_table_words.W

    filter_shape = parameter_map['FilterShape']

    conv_layers = []

    conv = nn_layers.Conv2dLayer(
        W=parameter_map['Conv2dLayerW'],
        rng=numpy_rng,
        filter_shape=filter_shape,
        input_shape=input_shape
    )
    parameter_map['Conv2dLayerW'] = conv.W

    non_linearity = nn_layers.NonLinearityLayer(
        b=parameter_map['NonLinearityLayerB'],
        b_size=filter_shape[0],
        activation=activation
    )
    parameter_map['NonLinearityLayerB'] = non_linearity.b

    shape1 = parameter_map['PoolingShape1']
    pooling = nn_layers.KMaxPoolLayerNative(shape=shape1, ignore_border=True, st=st)

    input_shape2 = parameter_map['input_shape2']
    filter_shape2 = parameter_map['FilterShape2']

    con2 = nn_layers.Conv2dLayer(
        W=parameter_map['Conv2dLayerW2'],
        rng=numpy_rng,
        input_shape=input_shape2,
        filter_shape=filter_shape2
    )
    parameter_map['Conv2dLayerW2'] = con2.W

    non_linearity2 = nn_layers.NonLinearityLayer(
        b=parameter_map['NonLinearityLayerB2'],
        b_size=filter_shape2[0],
        activation=activation
    )
    parameter_map['NonLinearityLayerB2'] = non_linearity2.b

    shape2 = parameter_map['PoolingShape2']
    st2 = parameter_map['st2']
    pooling2 = nn_layers.KMaxPoolLayerNative(shape=shape2, st=st2, ignore_border=True)

    input_shape3 = parameter_map['input_shape3']
    filter_shape3 = parameter_map['FilterShape3']

    con3 = nn_layers.Conv2dLayer(
        W=parameter_map['Conv2dLayerW3'],
        rng=numpy_rng,
        input_shape=input_shape3,
        filter_shape=filter_shape3
    )
    parameter_map['Conv2dLayerW3'] = con3.W

    non_linearity3 = nn_layers.NonLinearityLayer(
        b=parameter_map['NonLinearityLayerB3'],
        b_size=filter_shape3[0],
        activation=activation
    )
    parameter_map['NonLinearityLayerB3'] = non_linearity3.b

    shape3 = parameter_map['PoolingShape3']
    pooling3 = nn_layers.KMaxPoolLayerNative(shape=shape3, ignore_border=True)

    conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(layers=[
        conv, non_linearity, pooling,
        con2, non_linearity2, pooling2,
        con3, non_linearity3, pooling3
    ])
    conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    flatten_layer = nn_layers.FlattenLayer()

    hidden_layer = nn_layers.LinearLayer(
        W=parameter_map['LinearLayerW'],
        b=parameter_map['LinearLayerB'],
        rng=numpy_rng,
        n_in=n_in,
        n_out=n_in,
        activation=activation
    )
    parameter_map['LinearLayerW'] = hidden_layer.W
    parameter_map['LinearLayerB'] = hidden_layer.b

    n_outs = 3
    classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs)

    nnet_tweets = nn_layers.FeedForwardNet(layers=[
        lookup_table_words,
        join_layer,
        flatten_layer,
        hidden_layer,
        classifier
    ])

    inputs_train = [batch_tweets, batch_y]
    givens_train = {tweets: batch_tweets, y: batch_y}
    inputs_pred = [batch_tweets]
    givens_pred = {tweets: batch_tweets}

    nnet_tweets.set_input(tweets)
    print nnet_tweets

    params = nnet_tweets.params
    cost = nnet_tweets.layers[-1].training_cost(y)
    predictions = nnet_tweets.layers[-1].y_pred

    # optional L2 regularization
    l2 = 0
    for param in params:
        l2 += (param ** 2).sum()
    if reg:
        cost += reg * l2

    if update_type == 'adadelta':
        updates = sgd_trainer.get_adadelta_updates(
            cost, params, rho=rho, eps=eps, max_norm=max_norm, word_vec_name='None')
    elif update_type == 'grad':
        updates = sgd_trainer.get_sgd_updates(cost, params, max_norm=max_norm)

    train_fn = theano.function(
        inputs=inputs_train,
        outputs=cost,
        updates=updates,
        givens=givens_train,
    )

    pred_fn = theano.function(inputs=inputs_pred,
                              outputs=predictions,
                              givens=givens_pred)

    def predict_batch(batch_iterator):
        # the iterator pads the final batch, so trim predictions to n_samples
        preds = numpy.hstack([pred_fn(batch_x_q[0]) for batch_x_q in batch_iterator])
        return preds[:batch_iterator.n_samples]

    #########
    # Names #
    #########
    test_2016n = 'Test 2016'
    test_2015n = 'Test 2015'
    test_2014n = 'Test 2014'
    test_2013n = 'Test 2013'
    test_2014ljn = 'Test 2014 LiveJournal'
    test_2014srcn = 'Test 2014 Sarcasm'
    test_2013_smsn = 'Test 2013 SMS'
    train_fulln = 'Training Score'
    pdist_n = 'parameter distance'

    ep_pred = {}
    ep_pred[test_2016n] = []
    ep_pred[test_2015n] = []
    ep_pred[test_2014n] = []
    ep_pred[test_2013n] = []
    ep_pred[test_2014ljn] = []
    ep_pred[test_2014srcn] = []
    ep_pred[test_2013_smsn] = []
    ep_pred[train_fulln] = []
    ep_pred[pdist_n] = []

    #######################
    # Supervised Learning #
    #######################
    training2013_tids = numpy.load(os.path.join(data_dir, 'task-B-train.20140221.tids.npy'))
    training2013_tweets = numpy.load(os.path.join(data_dir, 'task-B-train.20140221.tweets.npy'))
    training2013_sentiments = numpy.load(os.path.join(data_dir, 'task-B-train.20140221.sentiments.npy'))

    dev_2013_tids = numpy.load(os.path.join(data_dir, 'task-B-dev.20140225.tids.npy'))
    dev_2013_tweets = numpy.load(os.path.join(data_dir, 'task-B-dev.20140225.tweets.npy'))
    dev_2013_sentiments = numpy.load(os.path.join(data_dir, 'task-B-dev.20140225.sentiments.npy'))

    trainingA_2016_tids = numpy.load(os.path.join(data_dir, 'task-A-train-2016.tids.npy'))
    trainingA_2016_tweets = numpy.load(os.path.join(data_dir, 'task-A-train-2016.tweets.npy'))
    trainingA_2016_sentiments = numpy.load(os.path.join(data_dir, 'task-A-train-2016.sentiments.npy'))

    devA_2016_tids = numpy.load(os.path.join(data_dir, 'task-A-dev-2016.tids.npy'))
    devA_2016_tweets = numpy.load(os.path.join(data_dir, 'task-A-dev-2016.tweets.npy'))
    devA_2016_sentiments = numpy.load(os.path.join(data_dir, 'task-A-dev-2016.sentiments.npy'))

    devtestA_2016_tids = numpy.load(os.path.join(data_dir, 'task-A-devtest-2016.tids.npy'))
    devtestA_2016_tweets = numpy.load(os.path.join(data_dir, 'task-A-devtest-2016.tweets.npy'))
    devtestA_2016_sentiments = numpy.load(os.path.join(data_dir, 'task-A-devtest-2016.sentiments.npy'))

    test_2016_tids = numpy.load(os.path.join(data_dir, 'task-A-test2016.tids.npy'))
    test_2016_tweets = numpy.load(os.path.join(data_dir, 'task-A-test2016.tweets.npy'))
    test_2016_sentiments = numpy.load(os.path.join(data_dir, 'task-A-test2016.sentiments.npy'))

    test_2013_tids = numpy.load(os.path.join(data_dir, 'task-B-test2013-twitter.tids.npy'))
    test_2013_tweets = numpy.load(os.path.join(data_dir, 'task-B-test2013-twitter.tweets.npy'))
    test_2013_sentiments = numpy.load(os.path.join(data_dir, 'task-B-test2013-twitter.sentiments.npy'))

    test_2014_tids = numpy.load(os.path.join(data_dir, 'task-B-test2014-twitter.tids.npy'))
    test_2014_tweets = numpy.load(os.path.join(data_dir, 'task-B-test2014-twitter.tweets.npy'))
    test_2014_sentiments = numpy.load(os.path.join(data_dir, 'task-B-test2014-twitter.sentiments.npy'))

    test_2015_tids = numpy.load(os.path.join(data_dir, 'task-B-test2015-twitter.tids.npy'))
    test_2015_tweets = numpy.load(os.path.join(data_dir, 'task-B-test2015-twitter.tweets.npy'))
    test_2015_sentiments = numpy.load(os.path.join(data_dir, 'task-B-test2015-twitter.sentiments.npy'))

    test_2013_sms_tids = numpy.load(os.path.join(data_dir, 'task-B-test2013-sms.tids.npy'))
    test_2013_sms_tweets = numpy.load(os.path.join(data_dir, 'task-B-test2013-sms.tweets.npy'))
    test_2013_sms_sentiments = numpy.load(os.path.join(data_dir, 'task-B-test2013-sms.sentiments.npy'))

    test_2014_livejournal_tids = numpy.load(os.path.join(data_dir, 'task-B-test2014-livejournal.tids.npy'))
    test_2014_livejournal_tweets = numpy.load(os.path.join(data_dir, 'task-B-test2014-livejournal.tweets.npy'))
    test_2014_livejournal_sentiments = numpy.load(os.path.join(data_dir, 'task-B-test2014-livejournal.sentiments.npy'))

    test_2014_sarcasm_tids = numpy.load(os.path.join(data_dir, 'task-B-test2014-twittersarcasm.tids.npy'))
    test_2014_sarcasm_tweets = numpy.load(os.path.join(data_dir, 'task-B-test2014-twittersarcasm.tweets.npy'))
    test_2014_sarcasm_sentiments = numpy.load(os.path.join(data_dir, 'task-B-test2014-twittersarcasm.sentiments.npy'))

    rand_tweets_tids = numpy.load(os.path.join(data_dir, 'random_tweet.tids.npy'))
    rand_tweets_tweets = numpy.load(os.path.join(data_dir, 'random_tweet.tweets.npy'))
    rand_tweets_sentiments = numpy.load(os.path.join(data_dir, 'random_tweet.sentiments.npy'))

    rand_tweets_neg_tids = numpy.load(os.path.join(data_dir, 'random_tweet_neg.tids.npy'))
    rand_tweets_neg_tweets = numpy.load(os.path.join(data_dir, 'random_tweet_neg.tweets.npy'))
    rand_tweets_neg_sentiments = numpy.load(os.path.join(data_dir, 'random_tweet_neg.sentiments.npy'))

    rand_tweets_neut_tids = numpy.load(os.path.join(data_dir, 'random_tweet_neut.tids.npy'))
    rand_tweets_neut_tweets = numpy.load(os.path.join(data_dir, 'random_tweet_neut.tweets.npy'))
    rand_tweets_neut_sentiments = numpy.load(os.path.join(data_dir, 'random_tweet_neut.sentiments.npy'))

    # full training set = train2013 + dev2013 + train2016 + dev2016 + devtest2016
    training_full_id = numpy.concatenate((training2013_tids, dev_2013_tids), axis=0)
    training_full_id = numpy.concatenate((training_full_id, trainingA_2016_tids), axis=0)
    training_full_id = numpy.concatenate((training_full_id, devA_2016_tids), axis=0)
    training_full_id = numpy.concatenate((training_full_id, devtestA_2016_tids), axis=0)

    training_full_tweets = numpy.concatenate((training2013_tweets, dev_2013_tweets), axis=0)
    training_full_tweets = numpy.concatenate((training_full_tweets, trainingA_2016_tweets), axis=0)
    training_full_tweets = numpy.concatenate((training_full_tweets, devA_2016_tweets), axis=0)
    training_full_tweets = numpy.concatenate((training_full_tweets, devtestA_2016_tweets), axis=0)

    training_full_sentiments = numpy.concatenate((training2013_sentiments, dev_2013_sentiments), axis=0)
    training_full_sentiments = numpy.concatenate((training_full_sentiments, trainingA_2016_sentiments), axis=0)
    training_full_sentiments = numpy.concatenate((training_full_sentiments, devA_2016_sentiments), axis=0)
    training_full_sentiments = numpy.concatenate((training_full_sentiments, devtestA_2016_sentiments), axis=0)

    train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [training_full_tweets, training_full_sentiments],
        batch_size=batch_size, randomize=True)
    train_err_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [training_full_tweets], batch_size=batch_size, randomize=False)
    test_2015_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2015_tweets], batch_size=batch_size, randomize=False)
    dev2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [devA_2016_tweets], batch_size=batch_size, randomize=False)
    test_2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2016_tweets], batch_size=batch_size, randomize=False)
    train2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [trainingA_2016_tweets], batch_size=batch_size, randomize=False)
    test2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2016_tweets], batch_size=batch_size, randomize=False)
    test2013_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2013_tweets], batch_size=batch_size, randomize=False)
    test_2014_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2014_tweets], batch_size=batch_size, randomize=False)
    test_2014_sarcasm_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2014_sarcasm_tweets], batch_size=batch_size, randomize=False)
    train2013_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [training2013_tweets], batch_size=batch_size, randomize=False)
    dev_2013_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [dev_2013_tweets], batch_size=batch_size, randomize=False)
    test_2013_sms_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2013_sms_tweets], batch_size=batch_size, randomize=False)
    test_2014_livejournal_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2014_livejournal_tweets], batch_size=batch_size, randomize=False)

    W_emb_list = [w for w in params if w.name == 'W_emb']
    zerout_dummy_word = theano.function(
        [], updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list])

    epoch = 0
    n_epochs = 0  # training disabled in this script; best parameters are loaded below
    early_stop = 250
    check_freq = 4
    timer_train = time.time()
    no_best_dev_update = 0
    best_dev_acc = -numpy.inf
    num_train_batches = len(train_set_iterator)
    best_params = [numpy.copy(p.get_value(borrow=True)) for p in params]
    old_params = [numpy.copy(p.get_value(borrow=True)) for p in params]

    while epoch < n_epochs:
        timer = time.time()
        for i, (tweet, y_label) in enumerate(tqdm(train_set_iterator, ascii=True), 1):
            train_fn(tweet, y_label)
            if i % check_freq == 0 or i == num_train_batches:
                y_pred_dev_2015 = predict_batch(test_2015_iterator)
                y_pred_test_2014 = predict_batch(test_2014_iterator)
                y_pred_test_2013 = predict_batch(test2013_iterator)
                y_pred_test_sms_2013 = predict_batch(test_2013_sms_iterator)
                y_pred_test_livejournal_2014 = predict_batch(test_2014_livejournal_iterator)
                y_pred_test_sarcasm_2014 = predict_batch(test_2014_sarcasm_iterator)
                y_pred_test_2016 = predict_batch(test_2016_iterator)
                y_train_score = predict_batch(train_err_iterator)

                dev_acc_2015 = semeval_f1_taskA(test_2015_sentiments, y_pred_dev_2015)
                dev_acc_2014 = semeval_f1_taskA(test_2014_sentiments, y_pred_test_2014)
                dev_acc_2014_lj = semeval_f1_taskA(test_2014_livejournal_sentiments, y_pred_test_livejournal_2014)
                dev_acc_2014_srcs = semeval_f1_taskA(test_2014_sarcasm_sentiments, y_pred_test_sarcasm_2014)
                dev_acc_2013 = semeval_f1_taskA(test_2013_sentiments, y_pred_test_2013)
                dev_acc_2013_sms = semeval_f1_taskA(test_2013_sms_sentiments, y_pred_test_sms_2013)
                dev_acc_2016_test = semeval_f1_taskA(test_2016_sentiments, y_pred_test_2016)
                dev_acc_train_err = semeval_f1_taskA(training_full_sentiments, y_train_score)

                ep_pred[test_2016n].append(dev_acc_2016_test)
                ep_pred[test_2015n].append(dev_acc_2015)
                ep_pred[test_2014n].append(dev_acc_2014)
                ep_pred[test_2013n].append(dev_acc_2013)
                ep_pred[test_2014ljn].append(dev_acc_2014_lj)
                ep_pred[test_2014srcn].append(dev_acc_2014_srcs)
                ep_pred[test_2013_smsn].append(dev_acc_2013_sms)
                ep_pred[train_fulln].append(dev_acc_train_err)

                if dev_acc_2016_test > best_dev_acc:
                    best_dev_acc = dev_acc_2016_test
                    best_params = [numpy.copy(p.get_value(borrow=True)) for p in params]
                    no_best_dev_update = 0
                    print('2016 epoch: {} chunk: {} best_chunk_auc: {:.4f};'.format(epoch, i, dev_acc_2016_test))
                    print('2015 epoch: {} chunk: {} best_chunk_auc: {:.4f};'.format(epoch, i, dev_acc_2015))
                    print('2014 epoch: {} chunk: {} best_chunk_auc: {:.4f};'.format(epoch, i, dev_acc_2014))
                    print('2013 epoch: {} chunk: {} best_chunk_auc: {:.4f};'.format(epoch, i, dev_acc_2013))
                    print('2014lj epoch: {} chunk: {} best_chunk_auc: {:.4f};'.format(epoch, i, dev_acc_2014_lj))
                    print('2014src epoch: {} chunk: {} best_chunk_auc: {:.4f};'.format(epoch, i, dev_acc_2014_srcs))
                    print('2013sms epoch: {} chunk: {} best_chunk_auc: {:.4f};'.format(epoch, i, dev_acc_2013_sms))
                    print('Train epoch: {} chunk: {} best_chunk_auc: {:.4f};'.format(epoch, i, dev_acc_train_err))
            zerout_dummy_word()  # keep the padding word's embedding at zero

        print('epoch {} took {:.4f} seconds'.format(epoch, time.time() - timer))
        epoch += 1
        no_best_dev_update += 1
        if no_best_dev_update >= early_stop:
            print "Quitting after {} epochs without improving the best dev score".format(no_best_dev_update)
            break

    print('Training took: {:.4f} seconds'.format(time.time() - timer_train))
    cPickle.dump(ep_pred, open(data_dir + '/supervised_results_{}{}{}rho{}eps{}.p'.format(
        test_type, update_type, max_norm, int(rho * 100), printeps), 'wb'))
    if n_epochs > 0:
        cPickle.dump(best_params,
                     open(data_dir + '/best_param_{}_{}.p'.format('supervised', test_type), 'wb'))

    ########################
    # Get Sentence Vectors #
    ########################
    inputs_senvec = [batch_tweets]
    givens_senvec = {tweets: batch_tweets}

    sets = [
        (rand_tweets_tids, rand_tweets_tweets, 'random_tweets'),
        (rand_tweets_neg_tids, rand_tweets_neg_tweets, 'random_tweets_neg'),
        (rand_tweets_neut_tids, rand_tweets_neut_tweets, 'random_tweets_neut'),
        (test_2016_tids, test_2016_tweets, 'SemEval2016-task4-test.subtask-A'),
        (test_2014_tids, test_2014_tweets, 'task-B-test2014-twitter'),
        (test_2015_tids, test_2015_tweets, 'task-B-test2015-twitter'),
        (test_2013_tids, test_2013_tweets, 'task-B-test2013-twitter'),
        (test_2014_livejournal_tids, test_2014_livejournal_tweets, 'task-B-test2014-livejournal'),
        (test_2014_sarcasm_tids, test_2014_sarcasm_tweets, 'test_2014_sarcasm'),
        (test_2013_sms_tids, test_2013_sms_tweets, 'task-B-test2013-sms'),
        (training2013_tids, training2013_tweets, 'task-B-train.20140221'),
        (devA_2016_tids, devA_2016_tweets, 'task-A-dev-2016'),
        (trainingA_2016_tids, trainingA_2016_tweets, 'task-A-train-2016'),
        (devtestA_2016_tids, devtestA_2016_tweets, 'task-A-devtest-2016'),
        (dev_2013_tids, dev_2013_tweets, 'task-B-dev.20140225'),
        (training_full_id, training_full_tweets, 'training_full_set')
    ]

    get_senvec = False
    if get_senvec:
        batch_size = input_shape[0]
        output = nnet_tweets.layers[-2].output
        output_fn = function(inputs=inputs_senvec, outputs=output, givens=givens_senvec)
        for (fids, fset, name) in sets:
            test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
                numpy_rng, [fset], batch_size=batch_size, randomize=False)
            counter = 0
            fname_prob = open(os.path.join(data_dir, 'sentence-vecs/{}.txt'.format(name)), 'w+')
            for i, tweet in enumerate(tqdm(test_set_iterator), 1):
                o = output_fn(tweet[0])
                for vec in o:
                    fname_prob.write(fids[counter])
                    for el in numpy.nditer(vec):
                        fname_prob.write(" %f" % el)
                    fname_prob.write("\n")
                    counter += 1
                    if counter == test_set_iterator.n_samples:
                        break

    ################################
    # Get Prediction Probabilities #
    ################################
    print 'Store Predictions'
    best_params = cPickle.load(
        open(data_dir + '/best_param_supervised_{}.p'.format(test_type), 'rb'))
    for i, param in enumerate(best_params):
        params[i].set_value(param, borrow=True)

    y_pred_test_2016 = predict_batch(test_2016_iterator)
    dev_acc_2016_test = semeval_f1_taskA(test_2016_sentiments, y_pred_test_2016)
    print dev_acc_2016_test

    batch_size = input_shape[0]
    output = nnet_tweets.layers[-1].p_y_given_x
    output_fn = function(inputs=inputs_senvec, outputs=output, givens=givens_senvec)

    for line in open('semeval/phrases'):
        phrase = line.replace(' ', '_').replace('\r', '').replace('\n', '')
        fids = numpy.load(os.path.join(data_dir, 'random_tweet_{}.tids.npy'.format(phrase)))
        fset = numpy.load(os.path.join(data_dir, 'random_tweet_{}.tweets.npy'.format(phrase)))
        name = 'random_tweet_{}'.format(phrase)

        test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
            numpy_rng, [fset], batch_size=batch_size, randomize=False)

        opath_prob = os.path.join(data_dir, '{}/{}/predictions_probs'.format('err', test_type))
        if not os.path.exists(opath_prob):
            os.makedirs(opath_prob)
            print 'Created Path', opath_prob
        opath_pred = os.path.join(data_dir, '{}/{}/predictions_pred'.format('err', test_type))
        if not os.path.exists(opath_pred):
            os.makedirs(opath_pred)
            print 'Created Path', opath_pred

        counter = 0
        fname_prob = open(os.path.join(opath_prob, '{}.txt'.format(name)), 'w+')
        fname_pred = open(os.path.join(opath_pred, '{}.txt'.format(name)), 'w+')
        for i, tweet in enumerate(tqdm(test_set_iterator), 1):
            o = output_fn(tweet[0])
            for vec in o:
                # save prediction probabilities
                for el in numpy.nditer(vec):
                    fname_prob.write("%f\t" % el)
                fname_prob.write("\n")
                # save the predicted label
                pred = numpy.argmax(vec)
                sentiments = {0: 'negative', 1: 'neutral', 2: 'positive'}
                fname_pred.write('{}\n'.format(sentiments[pred]))
                counter += 1
                if counter == test_set_iterator.n_samples:
                    break
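# Example invocation of the script above (file name hypothetical); the flags map
# onto the getopt spec at the top of main():
#
#   python train_supervised.py -t run1 -u adadelta -b 1000 -m 0 -r 0.95 -e 1e-6
#   python train_supervised.py --test_type=run1 --update=grad --max_norm=9
#
# main() expects sys.argv without the program name, so a typical entry point is:
#
#   if __name__ == '__main__':
#       main(sys.argv[1:])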
# (imports as in the previous script; getopt is not needed here)
def main():
    ##########
    # LAYERS #
    ##########
    HOME_DIR = "semeval_parsed"
    input_fname = '200M'

    test_type = ''
    if len(sys.argv) > 1:
        test_type = sys.argv[1]

    data_dir = HOME_DIR + '_' + input_fname
    numpy_rng = numpy.random.RandomState(123)

    print "Load Parameters"
    parameter_map = cPickle.load(
        open(data_dir + '/parameters_distant_{}.p'.format(test_type), 'rb'))
    input_shape = parameter_map['inputShape']
    filter_width = parameter_map['filterWidth']
    n_in = parameter_map['n_in']
    st = parameter_map['st']

    def relu(x):
        return x * (x > 0)

    activation = relu

    tweets = T.imatrix('tweets_train')
    y = T.lvector('y')
    batch_tweets = T.imatrix('batch_x_q')
    batch_y = T.lvector('batch_y')

    lookup_table_words = nn_layers.LookupTableFast(
        W=parameter_map['LookupTableFastStaticW'].get_value(),
        pad=filter_width - 1)

    filter_shape = parameter_map['FilterShape']

    conv_layers = []
    conv = nn_layers.Conv2dLayer(W=parameter_map['Conv2dLayerW'],
                                 rng=numpy_rng,
                                 filter_shape=filter_shape,
                                 input_shape=input_shape)
    non_linearity = nn_layers.NonLinearityLayer(
        b=parameter_map['NonLinearityLayerB'],
        b_size=filter_shape[0],
        activation=activation)
    shape1 = parameter_map['PoolingShape1']
    pooling = nn_layers.KMaxPoolLayerNative(shape=shape1,
                                            ignore_border=True,
                                            st=st)

    input_shape2 = parameter_map['input_shape2']
    filter_shape2 = parameter_map['FilterShape2']
    con2 = nn_layers.Conv2dLayer(W=parameter_map['Conv2dLayerW2'],
                                 rng=numpy_rng,
                                 input_shape=input_shape2,
                                 filter_shape=filter_shape2)
    non_linearity2 = nn_layers.NonLinearityLayer(
        b=parameter_map['NonLinearityLayerB2'],
        b_size=filter_shape2[0],
        activation=activation)
    shape2 = parameter_map['PoolingShape2']
    st2 = parameter_map['st2']
    pooling2 = nn_layers.KMaxPoolLayerNative(shape=shape2,
                                             st=st2,
                                             ignore_border=True)

    input_shape3 = parameter_map['input_shape3']
    filter_shape3 = parameter_map['FilterShape3']
    con3 = nn_layers.Conv2dLayer(W=parameter_map['Conv2dLayerW3'],
                                 rng=numpy_rng,
                                 input_shape=input_shape3,
                                 filter_shape=filter_shape3)
    non_linearity3 = nn_layers.NonLinearityLayer(
        b=parameter_map['NonLinearityLayerB3'],
        b_size=filter_shape3[0],
        activation=activation)
    shape3 = parameter_map['PoolingShape3']
    pooling3 = nn_layers.KMaxPoolLayerNative(shape=shape3, ignore_border=True)

    conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(layers=[
        conv, non_linearity, pooling, con2, non_linearity2, pooling2, con3,
        non_linearity3, pooling3
    ])
    conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    flatten_layer = nn_layers.FlattenLayer()

    hidden_layer = nn_layers.LinearLayer(W=parameter_map['LinearLayerW'],
                                         b=parameter_map['LinearLayerB'],
                                         rng=numpy_rng,
                                         n_in=n_in,
                                         n_out=n_in,
                                         activation=activation)

    n_outs = 3
    classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs)

    nnet_tweets = nn_layers.FeedForwardNet(layers=[
        lookup_table_words, join_layer, flatten_layer, hidden_layer, classifier
    ])

    inputs_train = [batch_tweets, batch_y]
    givens_train = {tweets: batch_tweets, y: batch_y}
    inputs_pred = [batch_tweets]
    givens_pred = {tweets: batch_tweets}

    nnet_tweets.set_input(tweets)
    print nnet_tweets

    params = nnet_tweets.params
    cost = nnet_tweets.layers[-1].training_cost(y)
    predictions = nnet_tweets.layers[-1].y_pred

    updates = sgd_trainer.get_adadelta_updates(cost,
                                               params,
                                               rho=0.95,
                                               eps=1e-6,
                                               max_norm=0,
                                               word_vec_name='None')

    train_fn = theano.function(
        inputs=inputs_train,
        outputs=cost,
        updates=updates,
        givens=givens_train,
    )

    pred_fn = theano.function(inputs=inputs_pred,
                              outputs=predictions,
                              givens=givens_pred)

    def predict_batch(batch_iterator):
        # the iterator pads the final batch, so trim predictions to n_samples
        preds = numpy.hstack(
            [pred_fn(batch_x_q[0]) for batch_x_q in batch_iterator])
        return preds[:batch_iterator.n_samples]

    #########
    # Names #
    #########
    test_2016n = 'Test 2016'
    test_2015n = 'Test 2015'
    test_2014n = 'Test 2014'
    test_2013n = 'Test 2013'
    test_2014ljn = 'Test 2014 LiveJournal'
    test_2014srcn = 'Test 2014 Sarcasm'
    test_2013_smsn = 'Test 2013 SMS'

    ep_pred = {}
    ep_pred[test_2016n] = []
    ep_pred[test_2015n] = []
    ep_pred[test_2014n] = []
    ep_pred[test_2013n] = []
    ep_pred[test_2014ljn] = []
    ep_pred[test_2014srcn] = []
    ep_pred[test_2013_smsn] = []

    #######################
    # Supervised Learning #
    #######################
    batch_size = 1000

    training2013_tids = numpy.load(
        os.path.join(data_dir, 'task-B-train.20140221.tids.npy'))
    training2013_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-train.20140221.tweets.npy'))
    training2013_sentiments = numpy.load(
        os.path.join(data_dir, 'task-B-train.20140221.sentiments.npy'))

    dev_2013_tids = numpy.load(
        os.path.join(data_dir, 'task-B-dev.20140225.tids.npy'))
    dev_2013_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-dev.20140225.tweets.npy'))
    dev_2013_sentiments = numpy.load(
        os.path.join(data_dir, 'task-B-dev.20140225.sentiments.npy'))

    trainingA_2016_tids = numpy.load(
        os.path.join(data_dir, 'task-A-train-2016.tids.npy'))
    trainingA_2016_tweets = numpy.load(
        os.path.join(data_dir, 'task-A-train-2016.tweets.npy'))
    trainingA_2016_sentiments = numpy.load(
        os.path.join(data_dir, 'task-A-train-2016.sentiments.npy'))

    devA_2016_tids = numpy.load(
        os.path.join(data_dir, 'task-A-dev-2016.tids.npy'))
    devA_2016_tweets = numpy.load(
        os.path.join(data_dir, 'task-A-dev-2016.tweets.npy'))
    devA_2016_sentiments = numpy.load(
        os.path.join(data_dir, 'task-A-dev-2016.sentiments.npy'))

    devtestA_2016_tids = numpy.load(
        os.path.join(data_dir, 'task-A-devtest-2016.tids.npy'))
    devtestA_2016_tweets = numpy.load(
        os.path.join(data_dir, 'task-A-devtest-2016.tweets.npy'))
    devtestA_2016_sentiments = numpy.load(
        os.path.join(data_dir, 'task-A-devtest-2016.sentiments.npy'))

    test_2016_tids = numpy.load(
        os.path.join(data_dir, 'task-A-test2016.tids.npy'))
    test_2016_tweets = numpy.load(
        os.path.join(data_dir, 'task-A-test2016.tweets.npy'))
    test_2016_sentiments = numpy.load(
        os.path.join(data_dir, 'task-A-test2016.sentiments.npy'))

    test_2013_tids = numpy.load(
        os.path.join(data_dir, 'task-B-test2013-twitter.tids.npy'))
    test_2013_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-test2013-twitter.tweets.npy'))
    test_2013_sentiments = numpy.load(
        os.path.join(data_dir, 'task-B-test2013-twitter.sentiments.npy'))

    test_2014_tids = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-twitter.tids.npy'))
    test_2014_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-twitter.tweets.npy'))
    test_2014_sentiments = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-twitter.sentiments.npy'))

    test_2015_tids = numpy.load(
        os.path.join(data_dir, 'task-B-test2015-twitter.tids.npy'))
    test_2015_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-test2015-twitter.tweets.npy'))
    test_2015_sentiments = numpy.load(
        os.path.join(data_dir, 'task-B-test2015-twitter.sentiments.npy'))

    test_2013_sms_tids = numpy.load(
        os.path.join(data_dir, 'task-B-test2013-sms.tids.npy'))
    test_2013_sms_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-test2013-sms.tweets.npy'))
    test_2013_sms_sentiments = numpy.load(
        os.path.join(data_dir, 'task-B-test2013-sms.sentiments.npy'))

    test_2014_livejournal_tids = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-livejournal.tids.npy'))
    test_2014_livejournal_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-livejournal.tweets.npy'))
    test_2014_livejournal_sentiments = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-livejournal.sentiments.npy'))

    test_2014_sarcasm_tids = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-twittersarcasm.tids.npy'))
    test_2014_sarcasm_tweets = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-twittersarcasm.tweets.npy'))
    test_2014_sarcasm_sentiments = numpy.load(
        os.path.join(data_dir, 'task-B-test2014-twittersarcasm.sentiments.npy'))

    training_full_tweets = numpy.concatenate(
        (training2013_tweets, dev_2013_tweets), axis=0)
    training_full_tweets = numpy.concatenate(
        (training_full_tweets, trainingA_2016_tweets), axis=0)
    training_full_tweets = numpy.concatenate(
        (training_full_tweets, devA_2016_tweets), axis=0)
    training_full_tweets = numpy.concatenate(
        (training_full_tweets, devtestA_2016_tweets), axis=0)

    training_full_sentiments = numpy.concatenate(
        (training2013_sentiments, dev_2013_sentiments), axis=0)
    training_full_sentiments = numpy.concatenate(
        (training_full_sentiments, trainingA_2016_sentiments), axis=0)
    training_full_sentiments = numpy.concatenate(
        (training_full_sentiments, devA_2016_sentiments), axis=0)
    training_full_sentiments = numpy.concatenate(
        (training_full_sentiments, devtestA_2016_sentiments), axis=0)

    train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [training_full_tweets, training_full_sentiments],
        batch_size=batch_size, randomize=True)
    test_2015_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2015_tweets], batch_size=batch_size, randomize=False)
    dev2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [devA_2016_tweets], batch_size=batch_size, randomize=False)
    test_2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2016_tweets], batch_size=batch_size, randomize=False)
    train2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [trainingA_2016_tweets], batch_size=batch_size, randomize=False)
    test2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2016_tweets], batch_size=batch_size, randomize=False)
    test2013_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2013_tweets], batch_size=batch_size, randomize=False)
    test_2014_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2014_tweets], batch_size=batch_size, randomize=False)
    test_2014_sarcasm_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2014_sarcasm_tweets], batch_size=batch_size, randomize=False)
    train2013_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [training2013_tweets], batch_size=batch_size, randomize=False)
    dev_2013_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [dev_2013_tweets], batch_size=batch_size, randomize=False)
    test_2013_sms_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2013_sms_tweets], batch_size=batch_size, randomize=False)
    test_2014_livejournal_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2014_livejournal_tweets], batch_size=batch_size, randomize=False)

    W_emb_list = [w for w in params if w.name == 'W_emb']
    zerout_dummy_word = theano.function(
        [], updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list])

    epoch = 0
    n_epochs = 50
    early_stop = 50
    check_freq = 4
    timer_train = time.time()
    no_best_dev_update = 0
    best_dev_acc = -numpy.inf
    num_train_batches = len(train_set_iterator)
    while epoch < n_epochs:
        timer = time.time()
        for i, (tweet, y_label) in enumerate(tqdm(train_set_iterator, ascii=True), 1):
            train_fn(tweet, y_label)
            if i % check_freq == 0 or i == num_train_batches:
                y_pred_dev_2015 = predict_batch(test_2015_iterator)
                y_pred_test_2014 = predict_batch(test_2014_iterator)
                y_pred_test_2013 = predict_batch(test2013_iterator)
                y_pred_test_sms_2013 = predict_batch(test_2013_sms_iterator)
                y_pred_test_livejournal_2014 = predict_batch(
                    test_2014_livejournal_iterator)
                y_pred_test_sarcasm_2014 = predict_batch(
                    test_2014_sarcasm_iterator)
                y_pred_test_2016 = predict_batch(test_2016_iterator)

                dev_acc_2015 = semeval_f1_taskA(test_2015_sentiments,
                                                y_pred_dev_2015)
                dev_acc_2014 = semeval_f1_taskA(test_2014_sentiments,
                                                y_pred_test_2014)
                dev_acc_2014_lj = semeval_f1_taskA(
                    test_2014_livejournal_sentiments,
                    y_pred_test_livejournal_2014)
                dev_acc_2014_srcs = semeval_f1_taskA(
                    test_2014_sarcasm_sentiments, y_pred_test_sarcasm_2014)
                dev_acc_2013 = semeval_f1_taskA(test_2013_sentiments,
                                                y_pred_test_2013)
                dev_acc_2013_sms = semeval_f1_taskA(test_2013_sms_sentiments,
                                                    y_pred_test_sms_2013)
                dev_acc_2016_test = semeval_f1_taskA(test_2016_sentiments,
                                                     y_pred_test_2016)

                ep_pred[test_2016n].append(dev_acc_2016_test)
                ep_pred[test_2015n].append(dev_acc_2015)
                ep_pred[test_2014n].append(dev_acc_2014)
                ep_pred[test_2013n].append(dev_acc_2013)
                ep_pred[test_2014ljn].append(dev_acc_2014_lj)
                ep_pred[test_2014srcn].append(dev_acc_2014_srcs)
                ep_pred[test_2013_smsn].append(dev_acc_2013_sms)

                if dev_acc_2016_test > best_dev_acc:
                    best_dev_acc = dev_acc_2016_test
                    best_params = [
                        numpy.copy(p.get_value(borrow=True)) for p in params
                    ]
                    no_best_dev_update = 0
                    print('2016 epoch: {} chunk: {} best_chunk_auc: {:.4f};'.format(epoch, i, dev_acc_2016_test))
                    print('2015 epoch: {} chunk: {} best_chunk_auc: {:.4f};'.format(epoch, i, dev_acc_2015))
                    print('2014 epoch: {} chunk: {} best_chunk_auc: {:.4f};'.format(epoch, i, dev_acc_2014))
                    print('2013 epoch: {} chunk: {} best_chunk_auc: {:.4f};'.format(epoch, i, dev_acc_2013))
                    print('2014lj epoch: {} chunk: {} best_chunk_auc: {:.4f};'.format(epoch, i, dev_acc_2014_lj))
                    print('2014src epoch: {} chunk: {} best_chunk_auc: {:.4f};'.format(epoch, i, dev_acc_2014_srcs))
                    print('2013sms epoch: {} chunk: {} best_chunk_auc: {:.4f};'.format(epoch, i, dev_acc_2013_sms))
            zerout_dummy_word()  # keep the padding word's embedding at zero

        print('epoch {} took {:.4f} seconds'.format(epoch, time.time() - timer))
        epoch += 1
        no_best_dev_update += 1
        if no_best_dev_update >= early_stop:
            print "Quitting after {} epochs without improving the best dev score".format(no_best_dev_update)
            break

    print('Training took: {:.4f} seconds'.format(time.time() - timer_train))

    # restore the parameters that scored best on Test 2016
    for i, param in enumerate(best_params):
        params[i].set_value(param, borrow=True)

    cPickle.dump(
        ep_pred,
        open(data_dir + '/supervised_results_{}.p'.format(test_type), 'wb'))
    return

    # NOTE: the early return above skips the two export passes below.
    ########################
    # Get Sentence Vectors #
    ########################
    batch_size = input_shape[0]
    inputs_senvec = [batch_tweets]
    givens_senvec = {tweets: batch_tweets}

    output = nnet_tweets.layers[-2].output
    output_fn = function(inputs=inputs_senvec,
                         outputs=output,
                         givens=givens_senvec)

    sets = [(test_2014_tids, test_2014_tweets, 'task-B-test2014-twitter'),
            (test_2015_tids, test_2015_tweets, 'task-B-test2015-twitter'),
            (training2013_tids, training2013_tweets, 'task-BD-train-2013'),
            (test_2013_sms_tids, test_2013_sms_tweets, 'task-B-test2013-sms'),
            (devA_2016_tids, devA_2016_tweets, 'task-A-dev-2016'),
            (trainingA_2016_tids, trainingA_2016_tweets, 'task-A-train-2016'),
            (devtestA_2016_tids, devtestA_2016_tweets, 'task-A-devtest-2016'),
            (test_2016_tids, test_2016_tweets, 'SemEval2016-task4-test.subtask-A'),
            (test_2014_sarcasm_tids, test_2014_sarcasm_tweets, 'test_2014_sarcasm'),
            (test_2014_livejournal_tids, test_2014_livejournal_tweets,
             'task-B-test2014-livejournal'),
            (test_2013_tids, test_2013_tweets, 'task-BD-test-2013'),
            (dev_2013_tids, dev_2013_tweets, 'task-BD-dev-2013')]

    for (fids, fset, name) in sets:
        test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
            numpy_rng, [fset], batch_size=batch_size, randomize=False)
        counter = 0
        fname = open(
            os.path.join(data_dir, 'sentence-vecs/{}.txt'.format(name)), 'w+')
        for i, tweet in enumerate(tqdm(test_set_iterator), 1):
            o = output_fn(tweet[0])
            for vec in o:
                fname.write(fids[counter])
                for el in numpy.nditer(vec):
                    fname.write(" %f" % el)
                fname.write("\n")
                counter += 1
                if counter == test_set_iterator.n_samples:
                    break

    ################################
    # Get Prediction Probabilities #
    ################################
    batch_size = input_shape[0]
    output = nnet_tweets.layers[-1].p_y_given_x
    output_fn = function(inputs=inputs_senvec,
                         outputs=output,
                         givens=givens_senvec)

    for (fids, fset, name) in sets:
        test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
            numpy_rng, [fset], batch_size=batch_size, randomize=False)
        counter = 0
        fname = open(
            os.path.join(data_dir, 'prob_predictions/{}.txt'.format(name)), 'w+')
        for i, tweet in enumerate(tqdm(test_set_iterator), 1):
            o = output_fn(tweet[0])
            for vec in o:
                for el in numpy.nditer(vec):
                    fname.write(" %f" % el)
                fname.write("\n")
                counter += 1
                if counter == test_set_iterator.n_samples:
                    break
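# `predict_batch` slices its result back to `n_samples` because the constant-batch
# iterator pads the final chunk: every call into the compiled Theano function must
# carry the full batch shape. A minimal sketch of that contract, as a hypothetical
# stand-in for sgd_trainer.MiniBatchIteratorConstantBatchSize (the real class also
# takes an rng and a randomize flag):
import numpy


class ConstantBatchIterator(object):
    def __init__(self, datasets, batch_size):
        self.datasets = datasets
        self.batch_size = batch_size
        self.n_samples = len(datasets[0])
        self.n_batches = (self.n_samples + batch_size - 1) // batch_size

    def __len__(self):
        return self.n_batches

    def __iter__(self):
        for b in range(self.n_batches):
            # wrap around so the last batch is padded up to batch_size;
            # callers trim predictions back to n_samples afterwards
            idx = numpy.arange(b * self.batch_size,
                               (b + 1) * self.batch_size) % self.n_samples
            yield [d[idx] for d in self.datasets]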
# (imports as in the previous scripts)
def main():
    ##########
    # LAYERS #
    ##########
    HOME_DIR = "semeval_parsed"
    input_fname = '200M'

    test_type = ''
    if len(sys.argv) > 1:
        test_type = sys.argv[1]

    data_dir = HOME_DIR + '_' + input_fname
    numpy_rng = numpy.random.RandomState(123)

    print "Load Parameters"
    parameter_map = cPickle.load(
        open(data_dir + '/parameters_distant_{}.p'.format(test_type), 'rb'))
    input_shape = parameter_map['inputShape']
    filter_width = parameter_map['filterWidth']
    n_in = parameter_map['qLogisticIn']
    k_max = parameter_map['kmax']

    def relu(x):
        return x * (x > 0)

    activation = relu

    tweets = T.imatrix('tweets_train')
    y = T.lvector('y')
    batch_tweets = T.imatrix('batch_x_q')
    batch_y = T.lvector('batch_y')

    lookup_table_words = nn_layers.LookupTableFast(
        W=parameter_map['LookupTableFastStaticW'].get_value(),
        pad=filter_width - 1
    )

    filter_shape = parameter_map['FilterShape' + str(filter_width)]

    conv_layers = []
    conv = nn_layers.Conv2dLayer(
        W=parameter_map['Conv2dLayerW' + str(filter_width)],
        rng=numpy_rng,
        filter_shape=filter_shape,
        input_shape=input_shape
    )
    non_linearity = nn_layers.NonLinearityLayer(
        b=parameter_map['NonLinearityLayerB' + str(filter_width)],
        b_size=filter_shape[0],
        activation=activation
    )
    pooling = nn_layers.KMaxPoolLayer(k_max=k_max)

    conv2dNonLinearMaxPool = nn_layers.FeedForwardNet(layers=[
        conv, non_linearity, pooling
    ])
    conv_layers.append(conv2dNonLinearMaxPool)

    join_layer = nn_layers.ParallelLayer(layers=conv_layers)
    flatten_layer = nn_layers.FlattenLayer()

    hidden_layer = nn_layers.LinearLayer(
        W=parameter_map['LinearLayerW'],
        b=parameter_map['LinearLayerB'],
        rng=numpy_rng,
        n_in=n_in,
        n_out=n_in,
        activation=activation
    )

    n_outs = 3
    classifier = nn_layers.LogisticRegression(n_in=n_in, n_out=n_outs)

    nnet_tweets = nn_layers.FeedForwardNet(layers=[
        lookup_table_words,
        join_layer,
        flatten_layer,
        hidden_layer,
        classifier
    ])

    inputs_train = [batch_tweets, batch_y]
    givens_train = {tweets: batch_tweets, y: batch_y}
    inputs_pred = [batch_tweets]
    givens_pred = {tweets: batch_tweets}

    nnet_tweets.set_input(tweets)
    print nnet_tweets

    params = nnet_tweets.params
    cost = nnet_tweets.layers[-1].training_cost(y)
    predictions = nnet_tweets.layers[-1].y_pred

    updates = sgd_trainer.get_adadelta_updates(
        cost, params, rho=0.95, eps=1e-6, max_norm=0, word_vec_name='None'
    )

    train_fn = theano.function(
        inputs=inputs_train,
        outputs=cost,
        updates=updates,
        givens=givens_train,
    )

    pred_fn = theano.function(inputs=inputs_pred,
                              outputs=predictions,
                              givens=givens_pred)

    def predict_batch(batch_iterator):
        # the iterator pads the final batch, so trim predictions to n_samples
        preds = numpy.hstack([pred_fn(batch_x_q[0]) for batch_x_q in batch_iterator])
        return preds[:batch_iterator.n_samples]

    #########
    # Names #
    #########
    test_2016n = 'Test 2016'
    test_2015n = 'Test 2015'
    test_2014n = 'Test 2014'
    test_2013n = 'Test 2013'
    test_2014ljn = 'Test 2014 LiveJournal'
    test_2014srcn = 'Test 2014 Sarcasm'
    test_2013_smsn = 'Test 2013 SMS'
    train_fulln = 'Training Score'

    ep_pred = {}
    ep_pred[test_2016n] = []
    ep_pred[test_2015n] = []
    ep_pred[test_2014n] = []
    ep_pred[test_2013n] = []
    ep_pred[test_2014ljn] = []
    ep_pred[test_2014srcn] = []
    ep_pred[test_2013_smsn] = []
    ep_pred[train_fulln] = []

    #######################
    # Supervised Learning #
    #######################
    batch_size = 1000

    training2013_tids = numpy.load(os.path.join(data_dir, 'task-B-train.20140221.tids.npy'))
    training2013_tweets = numpy.load(os.path.join(data_dir, 'task-B-train.20140221.tweets.npy'))
    training2013_sentiments = numpy.load(os.path.join(data_dir, 'task-B-train.20140221.sentiments.npy'))

    dev_2013_tids = numpy.load(os.path.join(data_dir, 'task-B-dev.20140225.tids.npy'))
    dev_2013_tweets = numpy.load(os.path.join(data_dir, 'task-B-dev.20140225.tweets.npy'))
    dev_2013_sentiments = numpy.load(os.path.join(data_dir, 'task-B-dev.20140225.sentiments.npy'))

    trainingA_2016_tids = numpy.load(os.path.join(data_dir, 'task-A-train-2016.tids.npy'))
    trainingA_2016_tweets = numpy.load(os.path.join(data_dir, 'task-A-train-2016.tweets.npy'))
    trainingA_2016_sentiments = numpy.load(os.path.join(data_dir, 'task-A-train-2016.sentiments.npy'))

    devA_2016_tids = numpy.load(os.path.join(data_dir, 'task-A-dev-2016.tids.npy'))
    devA_2016_tweets = numpy.load(os.path.join(data_dir, 'task-A-dev-2016.tweets.npy'))
    devA_2016_sentiments = numpy.load(os.path.join(data_dir, 'task-A-dev-2016.sentiments.npy'))

    devtestA_2016_tids = numpy.load(os.path.join(data_dir, 'task-A-devtest-2016.tids.npy'))
    devtestA_2016_tweets = numpy.load(os.path.join(data_dir, 'task-A-devtest-2016.tweets.npy'))
    devtestA_2016_sentiments = numpy.load(os.path.join(data_dir, 'task-A-devtest-2016.sentiments.npy'))

    test_2016_tids = numpy.load(os.path.join(data_dir, 'task-A-test2016.tids.npy'))
    test_2016_tweets = numpy.load(os.path.join(data_dir, 'task-A-test2016.tweets.npy'))
    test_2016_sentiments = numpy.load(os.path.join(data_dir, 'task-A-test2016.sentiments.npy'))

    test_2013_tids = numpy.load(os.path.join(data_dir, 'task-B-test2013-twitter.tids.npy'))
    test_2013_tweets = numpy.load(os.path.join(data_dir, 'task-B-test2013-twitter.tweets.npy'))
    test_2013_sentiments = numpy.load(os.path.join(data_dir, 'task-B-test2013-twitter.sentiments.npy'))

    test_2014_tids = numpy.load(os.path.join(data_dir, 'task-B-test2014-twitter.tids.npy'))
    test_2014_tweets = numpy.load(os.path.join(data_dir, 'task-B-test2014-twitter.tweets.npy'))
    test_2014_sentiments = numpy.load(os.path.join(data_dir, 'task-B-test2014-twitter.sentiments.npy'))

    test_2015_tids = numpy.load(os.path.join(data_dir, 'task-B-test2015-twitter.tids.npy'))
    test_2015_tweets = numpy.load(os.path.join(data_dir, 'task-B-test2015-twitter.tweets.npy'))
    test_2015_sentiments = numpy.load(os.path.join(data_dir, 'task-B-test2015-twitter.sentiments.npy'))

    test_2013_sms_tids = numpy.load(os.path.join(data_dir, 'task-B-test2013-sms.tids.npy'))
    test_2013_sms_tweets = numpy.load(os.path.join(data_dir, 'task-B-test2013-sms.tweets.npy'))
    test_2013_sms_sentiments = numpy.load(os.path.join(data_dir, 'task-B-test2013-sms.sentiments.npy'))

    test_2014_livejournal_tids = numpy.load(os.path.join(data_dir, 'task-B-test2014-livejournal.tids.npy'))
    test_2014_livejournal_tweets = numpy.load(os.path.join(data_dir, 'task-B-test2014-livejournal.tweets.npy'))
    test_2014_livejournal_sentiments = numpy.load(os.path.join(data_dir, 'task-B-test2014-livejournal.sentiments.npy'))

    test_2014_sarcasm_tids = numpy.load(os.path.join(data_dir, 'task-B-test2014-twittersarcasm.tids.npy'))
    test_2014_sarcasm_tweets = numpy.load(os.path.join(data_dir, 'task-B-test2014-twittersarcasm.tweets.npy'))
    test_2014_sarcasm_sentiments = numpy.load(os.path.join(data_dir, 'task-B-test2014-twittersarcasm.sentiments.npy'))

    rand_tweets_tids = numpy.load(os.path.join(data_dir, 'random_tweet.tids.npy'))
    rand_tweets_tweets = numpy.load(os.path.join(data_dir, 'random_tweet.tweets.npy'))
    rand_tweets_sentiments = numpy.load(os.path.join(data_dir, 'random_tweet.sentiments.npy'))

    # full training set = train2013 + dev2013 + train2016 + dev2016 + devtest2016
    training_full_id = numpy.concatenate((training2013_tids, dev_2013_tids), axis=0)
    training_full_id = numpy.concatenate((training_full_id, trainingA_2016_tids), axis=0)
    training_full_id = numpy.concatenate((training_full_id, devA_2016_tids), axis=0)
    training_full_id = numpy.concatenate((training_full_id, devtestA_2016_tids), axis=0)

    training_full_tweets = numpy.concatenate((training2013_tweets, dev_2013_tweets), axis=0)
    training_full_tweets = numpy.concatenate((training_full_tweets, trainingA_2016_tweets), axis=0)
    training_full_tweets = numpy.concatenate((training_full_tweets, devA_2016_tweets), axis=0)
    training_full_tweets = numpy.concatenate((training_full_tweets, devtestA_2016_tweets), axis=0)

    training_full_sentiments = numpy.concatenate((training2013_sentiments, dev_2013_sentiments), axis=0)
    training_full_sentiments = numpy.concatenate((training_full_sentiments, trainingA_2016_sentiments), axis=0)
    training_full_sentiments = numpy.concatenate((training_full_sentiments, devA_2016_sentiments), axis=0)
    training_full_sentiments = numpy.concatenate((training_full_sentiments, devtestA_2016_sentiments), axis=0)

    train_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [training_full_tweets, training_full_sentiments],
        batch_size=batch_size, randomize=True)
    train_err_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [training_full_tweets], batch_size=batch_size, randomize=False)
    test_2015_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2015_tweets], batch_size=batch_size, randomize=False)
    dev2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [devA_2016_tweets], batch_size=batch_size, randomize=False)
    devtestA2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [devtestA_2016_tweets], batch_size=batch_size, randomize=False)
    train2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [trainingA_2016_tweets], batch_size=batch_size, randomize=False)
    test2016_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2016_tweets], batch_size=batch_size, randomize=False)
    test2013_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2013_tweets], batch_size=batch_size, randomize=False)
    test_2014_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2014_tweets], batch_size=batch_size, randomize=False)
    test_2014_sarcasm_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2014_sarcasm_tweets], batch_size=batch_size, randomize=False)
    train2013_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [training2013_tweets], batch_size=batch_size, randomize=False)
    dev_2013_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [dev_2013_tweets], batch_size=batch_size, randomize=False)
    test_2013_sms_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2013_sms_tweets], batch_size=batch_size, randomize=False)
    test_2014_livejournal_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
        numpy_rng, [test_2014_livejournal_tweets], batch_size=batch_size, randomize=False)

    W_emb_list = [w for w in params if w.name == 'W_emb']
    zerout_dummy_word = theano.function(
        [], updates=[(W, T.set_subtensor(W[-1:], 0.)) for W in W_emb_list])

    epoch = 0
    n_epochs = 50
    early_stop = 50
    check_freq = 4
    timer_train = time.time()
    no_best_dev_update = 0
    best_dev_acc = -numpy.inf
    num_train_batches = len(train_set_iterator)
    saved15 = False
    while epoch < n_epochs:
        timer = time.time()
        for i, (tweet, y_label) in enumerate(tqdm(train_set_iterator, ascii=True), 1):
            train_fn(tweet, y_label)
            if i % check_freq == 0 or i == num_train_batches:
                y_pred_dev_2015 = predict_batch(test_2015_iterator)
                # y_pred_train_2013 = predict_batch(train2013_iterator)
                # y_pred_train_2016 = predict_batch(train2016_iterator)
                # y_pred_dev2016 = predict_batch(dev2016_iterator)
                # y_pred_dev2013 = predict_batch(dev_2013_iterator)
                y_pred_test_2016 = predict_batch(test2016_iterator)
                y_pred_test_2014 = predict_batch(test_2014_iterator)
                y_pred_test_2013 = predict_batch(test2013_iterator)
                y_pred_test_sms_2013 = predict_batch(test_2013_sms_iterator)
                y_pred_test_livejournal_2014 = predict_batch(test_2014_livejournal_iterator)
                y_pred_test_sarcasm_2014 = predict_batch(test_2014_sarcasm_iterator)
                # y_pred_devtest_2016 = predict_batch(devtestA2016_iterator)
                y_train_score = predict_batch(train_err_iterator)

                dev_acc_2015 = semeval_f1_taskA(test_2015_sentiments, y_pred_dev_2015)
                dev_acc_2014 = semeval_f1_taskA(test_2014_sentiments, y_pred_test_2014)
                dev_acc_2014_lj = semeval_f1_taskA(test_2014_livejournal_sentiments, y_pred_test_livejournal_2014)
                dev_acc_2014_srcs = semeval_f1_taskA(test_2014_sarcasm_sentiments, y_pred_test_sarcasm_2014)
                dev_acc_2013 = semeval_f1_taskA(test_2013_sentiments, y_pred_test_2013)
                dev_acc_2013_sms = semeval_f1_taskA(test_2013_sms_sentiments, y_pred_test_sms_2013)
                dev_acc_2016 = semeval_f1_taskA(test_2016_sentiments, y_pred_test_2016)
                dev_acc_train_err = semeval_f1_taskA(training_full_sentiments, y_train_score)

                ep_pred[test_2016n].append(dev_acc_2016)
                ep_pred[test_2015n].append(dev_acc_2015)
                ep_pred[test_2014n].append(dev_acc_2014)
                ep_pred[test_2013n].append(dev_acc_2013)
                ep_pred[test_2014ljn].append(dev_acc_2014_lj)
                ep_pred[test_2014srcn].append(dev_acc_2014_srcs)
                ep_pred[test_2013_smsn].append(dev_acc_2013_sms)
                ep_pred[train_fulln].append(dev_acc_train_err)

                if dev_acc_2016 > best_dev_acc:
                    best_dev_acc = dev_acc_2016
                    best_params = [numpy.copy(p.get_value(borrow=True)) for p in params]
                    no_best_dev_update = 0
                    print('2016 epoch: {} chunk: {} best_chunk_auc: {:.4f};'.format(epoch, i, dev_acc_2016))
                    print('2015 epoch: {} chunk: {} best_chunk_auc: {:.4f};'.format(epoch, i, dev_acc_2015))
                    print('2014 epoch: {} chunk: {} best_chunk_auc: {:.4f};'.format(epoch, i, dev_acc_2014))
                    print('2013 epoch: {} chunk: {} best_chunk_auc: {:.4f};'.format(epoch, i, dev_acc_2013))
                    print('2014lj epoch: {} chunk: {} best_chunk_auc: {:.4f};'.format(epoch, i, dev_acc_2014_lj))
                    print('2014src epoch: {} chunk: {} best_chunk_auc: {:.4f};'.format(epoch, i, dev_acc_2014_srcs))
                    print('2013sms epoch: {} chunk: {} best_chunk_auc: {:.4f};'.format(epoch, i, dev_acc_2013_sms))
                    print('Train epoch: {} chunk: {} best_chunk_auc: {:.4f};'.format(epoch, i, dev_acc_train_err))

                # snapshot the parameters at epoch 15; exported as 'ep15params' below
                if epoch == 15 and not saved15:
                    saved15 = True
                    ep_15_params = [numpy.copy(p.get_value(borrow=True)) for p in params]
            zerout_dummy_word()  # keep the padding word's embedding at zero

        print('epoch {} took {:.4f} seconds'.format(epoch, time.time() - timer))
        epoch += 1
        no_best_dev_update += 1
        if no_best_dev_update >= early_stop:
            print "Quitting after {} epochs without improving the best dev score".format(no_best_dev_update)
            break

    print('Training took: {:.4f} seconds'.format(time.time() - timer_train))
    cPickle.dump(ep_pred,
                 open(data_dir + '/supervised_results_{}.p'.format(test_type), 'wb'))

    ########################
    # Get Sentence Vectors #
    ########################
    inputs_senvec = [batch_tweets]
    givens_senvec = {tweets: batch_tweets}

    sets = [
        (rand_tweets_tids, rand_tweets_tweets, 'random_tweets'),
        (test_2016_tids, test_2016_tweets, 'SemEval2016-task4-test.subtask-A'),
        (test_2014_tids, test_2014_tweets, 'task-B-test2014-twitter'),
        (test_2015_tids, test_2015_tweets, 'task-B-test2015-twitter'),
        (test_2013_tids, test_2013_tweets, 'task-B-test2013-twitter'),
        (test_2014_livejournal_tids, test_2014_livejournal_tweets, 'task-B-test2014-livejournal'),
        (test_2014_sarcasm_tids, test_2014_sarcasm_tweets, 'test_2014_sarcasm'),
        (test_2013_sms_tids, test_2013_sms_tweets, 'task-B-test2013-sms'),
        (training2013_tids, training2013_tweets, 'task-B-train.20140221'),
        (devA_2016_tids, devA_2016_tweets, 'task-A-dev-2016'),
        (trainingA_2016_tids, trainingA_2016_tweets, 'task-A-train-2016'),
        (devtestA_2016_tids, devtestA_2016_tweets, 'task-A-devtest-2016'),
        (dev_2013_tids, dev_2013_tweets, 'task-B-dev.20140225'),
        (training_full_id, training_full_tweets, 'training_full_set')
    ]

    get_senvec = False
    if get_senvec:
        batch_size = input_shape[0]
        output = nnet_tweets.layers[-2].output
        output_fn = function(inputs=inputs_senvec, outputs=output, givens=givens_senvec)
        for (fids, fset, name) in sets:
            test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
                numpy_rng, [fset], batch_size=batch_size, randomize=False)
            counter = 0
            fname = open(os.path.join(data_dir, 'sentence-vecs/{}.txt'.format(name)), 'w+')
            for i, tweet in enumerate(tqdm(test_set_iterator), 1):
                o = output_fn(tweet[0])
                for vec in o:
                    fname.write(fids[counter])
                    for el in numpy.nditer(vec):
                        fname.write(" %f" % el)
                    fname.write("\n")
                    counter += 1
                    if counter == test_set_iterator.n_samples:
                        break

    ################################
    # Get Prediction Probabilities #
    ################################
    print 'Store Predictions'
    get_params = [
        ('best_params', best_params),
        ('ep15params', ep_15_params)
    ]
    for pname, gparams in get_params:
        for i, param in enumerate(gparams):
            params[i].set_value(param, borrow=True)

        batch_size = input_shape[0]
        output = nnet_tweets.layers[-1].p_y_given_x
        output_fn = function(inputs=inputs_senvec, outputs=output, givens=givens_senvec)

        for (fids, fset, name) in sets:
            test_set_iterator = sgd_trainer.MiniBatchIteratorConstantBatchSize(
                numpy_rng, [fset], batch_size=batch_size, randomize=False)

            opath_prob = os.path.join(data_dir, '{}/{}/predictions_probs'.format(pname, test_type))
            if not os.path.exists(opath_prob):
                os.makedirs(opath_prob)
                print 'Created Path', opath_prob
            opath_pred = os.path.join(data_dir, '{}/{}/predictions_pred'.format(pname, test_type))
            if not os.path.exists(opath_pred):
                os.makedirs(opath_pred)
                print 'Created Path', opath_pred

            counter = 0
            fname_prob = open(os.path.join(opath_prob, '{}.txt'.format(name)), 'w+')
            fname_pred = open(os.path.join(opath_pred, '{}.txt'.format(name)), 'w+')
            for i, tweet in enumerate(tqdm(test_set_iterator), 1):
                o = output_fn(tweet[0])
                for vec in o:
                    # save prediction probabilities
                    for el in numpy.nditer(vec):
                        fname_prob.write("%f\t" % el)
                    fname_prob.write("\n")
                    # save the predicted label
                    pred = numpy.argmax(vec)
                    sentiments = {0: 'negative', 1: 'neutral', 2: 'positive'}
                    fname_pred.write('{}\n'.format(sentiments[pred]))
                    counter += 1
                    if counter == test_set_iterator.n_samples:
                        break
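# The ep_pred dict pickled above maps split names ('Test 2016', 'Test 2015', ...)
# to one score per evaluation checkpoint. A small reader sketch for inspecting the
# learning curves afterwards (the path format matches the cPickle.dump above):
import cPickle


def load_curves(data_dir, test_type=''):
    curves = cPickle.load(
        open(data_dir + '/supervised_results_{}.p'.format(test_type), 'rb'))
    for name in sorted(curves):
        scores = curves[name]
        if scores:
            print '{}\tbest: {:.4f}\tlast: {:.4f}'.format(name, max(scores), scores[-1])
    return curves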
# Fragment from a Keras-based evaluation script: model, super_weight_path,
# test_sets and res_dir are defined earlier in that script.
print('Load Supervised Model')
model.load_weights(super_weight_path)

output = ''  # accumulated score line; appended to results_log.tsv below
for tids, X_test, y_test, name in test_sets:
    raw_data = open(os.path.join('semeval', '{}.tsv'.format(name)), 'r').readlines()
    raw_data = map(lambda x: x.replace('\n', '').split('\t'), raw_data)
    raw_tweets = map(lambda x: (x[0], x[-1]), raw_data)
    raw_labels = map(lambda x: (x[0], x[-2]), raw_data)
    raw_data_dict = dict(raw_tweets)
    raw_labels_dict = dict(raw_labels)

    ofile = open(os.path.join(res_dir, name), 'w')

    y_pred = model.predict(X_test)
    y_pred = probas_to_classes(y_pred)
    score = semeval_f1_taskA(y_test, y_pred)
    scores = f1_score(y_test, y_pred, average=None)
    output += '{}: {}\t'.format(name, score)
    output += 'neg_f1: {}\t neut_f1: {}\t pos_f1: {}'.format(scores[0], scores[1], scores[2])

    # write tweet, gold label and predicted label for error analysis
    for tid, label in zip(tids, y_pred):
        tweet = raw_data_dict[tid].replace('\n', '')
        truth = raw_labels_dict[tid]
        l = {0: 'negative', 1: 'neutral', 2: 'positive'}.get(label)
        outline = '{}\t{}\t{}\n'.format(tweet, truth, l)
        ofile.write(outline)

open(os.path.join('results', 'results_log.tsv'), 'a').write(output + '\n')
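# `probas_to_classes` came from keras.utils.np_utils in Keras 1 and was removed in
# Keras 2. If it is not available, an equivalent helper (assuming model.predict
# returns per-class softmax probabilities) would be:
import numpy


def probas_to_classes(proba):
    # multiclass: pick the most probable class; binary: threshold at 0.5
    if proba.shape[-1] > 1:
        return numpy.argmax(proba, axis=-1)
    return (proba > 0.5).astype('int32').ravel()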