import os
import time

from Data_Preprocessing import text_files_preprocessing as tfp

BASE_DIR = os.path.dirname(__file__)
start_time = time.time()

#DL_file = open("Data/DL_reslts.txt", "w")
#DA_file = open("Data/DA_reslts.txt", "w")
GT_actual = open("results/GT_actual.txt", "w")
probabilities = open("results/probabilities.txt", "w")
predict = open("results/predict.txt", "w")
aucs = open("results/aucs.txt", "w")
test_set = open("results/test_set_vocab.txt", "w")

# Process source domain dataset (PV dataset)
PV_DS_x_text, PV_DS_y = tfp.load_data_and_labels(
    os.path.join(BASE_DIR, 'Data/PV_DS/PV_Dann_Data/pos_vio_news.txt'),
    os.path.join(BASE_DIR, 'Data/PV_DS/PV_Dann_Data/neg_vio_news.txt'))
print("labeled (source) dataset size", len(PV_DS_x_text))
PV_DS_AVG_document_length = sum(
    [len(x.split(" ")) for x in PV_DS_x_text]) // len(PV_DS_x_text)

# Load target domain training dataset: Turkish protest tweets
tweets_x_text, tweets_y = tfp.load_data_and_labels(
    os.path.join(BASE_DIR, 'Data/Turkish_protests_tweets/pos_twts.txt'),
    os.path.join(BASE_DIR, 'Data/Turkish_protests_tweets/neg_twts.txt'))
print("unlabeled (target) dataset size", len(tweets_x_text))
tweets_AVG_document_length = sum(
    [len(x.split(" ")) for x in tweets_x_text]) // len(tweets_x_text)

# Load target domain test dataset: CF-labeled Turkish tweets
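# Illustrative sketch (assumption): load_data_and_labels lives in
# Data_Preprocessing/text_files_preprocessing.py, which is not shown in this
# excerpt. Based on how its outputs are used (one example per line, one-hot
# labels counted as [0, 1] for positive and [1, 0] for negative), it plausibly
# looks like the hypothetical loader below:
import numpy as np

def load_data_and_labels_sketch(positive_file, negative_file):
    """Hypothetical loader: one example per line, one-hot two-class labels."""
    with open(positive_file, encoding='utf-8') as f:
        pos = [line.strip() for line in f]
    with open(negative_file, encoding='utf-8') as f:
        neg = [line.strip() for line in f]
    x_text = pos + neg
    y = np.array([[0, 1]] * len(pos) + [[1, 0]] * len(neg))
    return x_text, y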
from sklearn.manifold import TSNE
import os
from flip_gradient import flip_gradient
from util_pv import *
from text_CNN_feature_extraction import TextCNN
from tensorflow.contrib import learn
from Data_Preprocessing import text_files_preprocessing as tfp
import time

BASE_DIR = os.path.dirname(__file__)
start_time = time.time()

# Process source domain datasets (Amazon reviews)
# Load source 1: Amazon book reviews
Amz_books_x_text, Amz_books_y = tfp.load_data_and_labels(
    os.path.join(BASE_DIR, 'Data/Amazon_reviews/books/pos_books.txt'),
    os.path.join(BASE_DIR, 'Data/Amazon_reviews/books/neg_books.txt'))
Amz_books_x_text = Amz_books_x_text[:-1]  # trim the last (presumably empty) entry
Amz_books_y = Amz_books_y[:-1]
print(len(Amz_books_x_text))
Amz_books_AVG_document_length = sum(
    [len(x.split(" ")) for x in Amz_books_x_text]) // len(Amz_books_x_text)

# Load source 2: Amazon movie reviews
Amz_movies_x_text, Amz_movies_y = tfp.load_data_and_labels(
    os.path.join(BASE_DIR, 'Data/Amazon_reviews/movies/pos_movies.txt'),
    os.path.join(BASE_DIR, 'Data/Amazon_reviews/movies/neg_movies.txt'))
Amz_movies_x_text = Amz_movies_x_text[:-1]
Amz_movies_y = Amz_movies_y[:-1]
print(len(Amz_movies_x_text))
Amz_movies_AVG_document_length = sum(
    [len(x.split(" ")) for x in Amz_movies_x_text]) // len(Amz_movies_x_text)
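# flip_gradient.py itself is not part of this excerpt. For context, the import
# above likely refers to the standard TF1 gradient-reversal helper used in
# DANN implementations: identity on the forward pass, gradient scaled by -l on
# the backward pass. A minimal sketch of that helper (names suffixed _sketch
# to avoid clashing with the real module):
import tensorflow as tf
from tensorflow.python.framework import ops

class FlipGradientBuilder(object):
    def __init__(self):
        self.num_calls = 0

    def __call__(self, x, l=1.0):
        grad_name = "FlipGradient%d" % self.num_calls

        @ops.RegisterGradient(grad_name)
        def _flip_gradients(op, grad):
            # Reverse (and scale) the gradient flowing back into the
            # feature extractor.
            return [tf.negative(grad) * l]

        g = tf.get_default_graph()
        with g.gradient_override_map({"Identity": grad_name}):
            y = tf.identity(x)  # forward pass is a no-op
        self.num_calls += 1
        return y

flip_gradient_sketch = FlipGradientBuilder()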
predict = open("results/predict.txt", "w")
aucs = open("results/aucs.txt", "w")
test_set = open("results/test_set_vocab.txt", "w")
test_labels = open("results/test_set_labels.txt", "w")

# Process source domain dataset (PV dataset)
PV_DS_x_text, PV_DS_y = tfp.load_5000_data_and_labels(
    os.path.join(BASE_DIR, 'Data/PV_DS/PV_Dann_Data/pos_vio_news.txt'),
    os.path.join(BASE_DIR, 'Data/PV_DS/PV_Dann_Data/neg_vio_news.txt'))
print("labeled (source) dataset size", len(PV_DS_x_text))
PV_DS_AVG_document_length = sum(
    [len(x.split(" ")) for x in PV_DS_x_text]) // len(PV_DS_x_text)

# Load target domain training dataset: Turkish protest tweets
tweets_x_text, tweets_y = tfp.load_data_and_labels(
    os.path.join(BASE_DIR, 'Data/Turkish_protests_tweets/pos_twts.txt'),
    os.path.join(BASE_DIR, 'Data/Turkish_protests_tweets/neg_twts.txt'))
print("unlabeled (target) dataset size", len(tweets_x_text))
tweets_AVG_document_length = sum(
    [len(x.split(" ")) for x in tweets_x_text]) // len(tweets_x_text)

# Load target domain test dataset: CF-labeled Turkish tweets
tweets_x_text_test, tweets_y_test = tfp.load_data_and_labels(
    os.path.join(
        BASE_DIR,
        'Data/Turkish_protests_tweets/turkish_protest_test_pos_prccd2.txt'),
    os.path.join(
        BASE_DIR,
        'Data/Turkish_protests_tweets/turkish_protest_test_neg_prccd2.txt'))
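# load_5000_data_and_labels is also defined outside this excerpt. Reading the
# name alone (an assumption, not confirmed by the source), it presumably
# mirrors load_data_and_labels while capping each class at 5000 examples:
def load_5000_data_and_labels_sketch(positive_file, negative_file, cap=5000):
    """Hypothetical capped loader: first `cap` examples per class."""
    with open(positive_file, encoding='utf-8') as f:
        pos = [line.strip() for line in f][:cap]
    with open(negative_file, encoding='utf-8') as f:
        neg = [line.strip() for line in f][:cap]
    y = np.array([[0, 1]] * len(pos) + [[1, 0]] * len(neg))
    return pos + neg, y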
    # Tail of the model function: build the Adam training op and return
    # (predictions, loss, train_op).
    train_op = tf.contrib.layers.optimize_loss(
        loss,
        tf.contrib.framework.get_global_step(),
        optimizer='Adam',
        learning_rate=0.01)
    return ({
        'class': tf.argmax(logits, 1),
        'prob': tf.nn.softmax(logits)
    }, loss, train_op)

# Load target domain test dataset: CF-labeled Turkish tweets
tweets_text, tweets_y = tfp.load_data_and_labels(
    os.path.join(
        BASE_DIR,
        'Data/Turkish_protests_tweets/Turkish_tweets_CF_results_09_05_2018_prccd_pos.txt'),
    os.path.join(
        BASE_DIR,
        'Data/Turkish_protests_tweets/Turkish_tweets_CF_results_09_05_2018_prccd_neg.txt'))
print("PV positive samples", (tweets_y.tolist()).count([0, 1]))
print("PV negative samples", (tweets_y.tolist()).count([1, 0]))
tweets_y = tweets_y[:, 1]  # keep the positive-class indicator as a 0/1 label

# Set up the vocabulary processor
vocab_processor = learn.preprocessing.VocabularyProcessor(
    sentence_size, min_frequency=min_word_freq)
# Fit on the corpus first so the vocabulary is actually learned; the number of
# unique words (not the number of documents) then sizes the embedding table.
vocab_processor.fit(tweets_text)
embedding_size = len(vocab_processor.vocabulary_)
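# For clarity, a minimal self-contained example of the contrib
# VocabularyProcessor semantics used above: fit() learns the word-to-id map,
# transform() yields id arrays padded/truncated to max_document_length, and
# vocabulary_ exposes the learned vocabulary size.
import numpy as np
from tensorflow.contrib import learn as tf_learn

vp = tf_learn.preprocessing.VocabularyProcessor(max_document_length=5)
vp.fit(["a small example", "another small example"])
ids = np.array(list(vp.transform(["a small example"])))
print(ids.shape)             # (1, 5): one document, padded to length 5
print(len(vp.vocabulary_))   # number of unique words (plus the reserved <UNK> slot)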
GT_actual = open(
    os.path.join(BASE_DIR, "results/CF_trained_GT_actual.txt"), "w")
probabilities = open(
    os.path.join(BASE_DIR, "results/CF_trained_probabilities.txt"), "w")
predict = open(os.path.join(BASE_DIR, "results/CF_trained_predict.txt"), "w")
#loss = open(os.path.join(BASE_DIR, "results/CF_trained_loss.txt"), "w")
test_set = open(
    os.path.join(BASE_DIR, "results/CF_trained_test_set_vocab.txt"), "w")
test_set_label = open(
    os.path.join(BASE_DIR, "results/CF_trained_test_set_label.txt"), "w")

# Load target domain test dataset: CF-labeled Turkish tweets (note that the
# variables reuse the PV_DS_* names even though this is the tweet test set)
PV_DS_x_text, PV_DS_y = tfp.load_data_and_labels(
    os.path.join(
        BASE_DIR,
        'Data/Turkish_protests_tweets/turkish_protest_test_pos_prccd2.txt'),
    os.path.join(
        BASE_DIR,
        'Data/Turkish_protests_tweets/turkish_protest_test_neg_prccd2.txt'))
print("PV positive samples", (PV_DS_y.tolist()).count([0, 1]))
print("PV negative samples", (PV_DS_y.tolist()).count([1, 0]))
tweets_test_AVG_document_length = sum(
    [len(x.split(" ")) for x in PV_DS_x_text]) // len(PV_DS_x_text)
seq_length = tweets_test_AVG_document_length
print("seq_length", seq_length)

# Build the vocabulary and map each tweet to a padded sequence of word ids
vocab_processor_PV = learn.preprocessing.VocabularyProcessor(seq_length)
PV_DS_x = np.array(list(vocab_processor_PV.fit_transform(PV_DS_x_text)))
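# Illustrative sketch (assumption, not from this file): given ground truth and
# the model's positive-class probabilities, the result files opened above are
# presumably filled along these lines after evaluation. roc_auc_score is the
# real sklearn API; write_results and its arguments are hypothetical.
from sklearn.metrics import roc_auc_score

def write_results(y_true, y_prob):
    for t, p in zip(y_true, y_prob):
        GT_actual.write("%s\n" % t)
        probabilities.write("%s\n" % p)
        predict.write("%s\n" % int(p >= 0.5))
    return roc_auc_score(y_true, y_prob)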
import numpy as np
from sklearn.manifold import TSNE
import os
from flip_gradient import flip_gradient
from util_pv import *
from text_CNN_feature_extraction import TextCNN
from tensorflow.contrib import learn
from Data_Preprocessing import text_files_preprocessing as tfp
import time

BASE_DIR = os.path.dirname(__file__)
start_time = time.time()

# Process source domain dataset: Amazon movie reviews
Amz_movies_x_text, Amz_movies_y = tfp.load_data_and_labels(
    os.path.join(BASE_DIR, 'Data/Amazon_reviews/movies/pos_movies.txt'),
    os.path.join(BASE_DIR, 'Data/Amazon_reviews/movies/neg_movies.txt'))
Amz_movies_x_text = Amz_movies_x_text[:-1]  # trim the last (presumably empty) entry
Amz_movies_y = Amz_movies_y[:-1]
print(len(Amz_movies_x_text))
Amz_movies_AVG_document_length = sum(
    [len(x.split(" ")) for x in Amz_movies_x_text]) // len(Amz_movies_x_text)

# Load target domain dataset: sentiment tweets (an IMDB alternative is kept
# commented out below)
tweets_x_text, tweets_y = tfp.load_data_and_labels(
    os.path.join(BASE_DIR, 'Data/sentiment_tweets/pos_tweets_prccd.txt'),
    os.path.join(BASE_DIR, 'Data/sentiment_tweets/neg_tweets_prccd.txt'))
#tweets_x_text, tweets_y = tfp.load_data_and_labels(os.path.join(BASE_DIR, 'Data/IMDB/pos_movies.txt'), os.path.join(BASE_DIR, 'Data/IMDB/neg_movies.txt'))
print(len(tweets_x_text))
tweets_AVG_document_length = sum(
    [len(x.split(" ")) for x in tweets_x_text]) // len(tweets_x_text)
print(tweets_AVG_document_length)
print(Amz_movies_AVG_document_length)  # already computed above
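# Sketch of the standard DANN batching scheme these scripts build toward (the
# repo's util_pv helpers are not shown; the function below is illustrative):
# each training batch is half source, half target, and every row gets a
# one-hot domain label for the domain classifier behind flip_gradient.
def make_dann_batch(source_x, target_x, half):
    x = np.vstack([source_x[:half], target_x[:half]])
    domain_labels = np.vstack([np.tile([1., 0.], [half, 1]),   # source rows
                               np.tile([0., 1.], [half, 1])])  # target rows
    return x, domain_labels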
from sklearn.manifold import TSNE
import os
from flip_gradient import flip_gradient
from util_pv import *
from text_CNN_feature_extraction import TextCNN
from tensorflow.contrib import learn
from Data_Preprocessing import text_files_preprocessing as tfp
import time

BASE_DIR = os.path.dirname(__file__)
start_time = time.time()

# Process source domain dataset: PV dataset
PV_DS_x_text, PV_DS_y = tfp.load_data_and_labels(
    os.path.join(BASE_DIR, 'Data/PV_DS/PV_Dann_Data/pos_vio_news.txt'),
    os.path.join(BASE_DIR, 'Data/PV_DS/PV_Dann_Data/neg_vio_news.txt'))
PV_DS_x_text = PV_DS_x_text[:-1]  # trim the last (presumably empty) entry
PV_DS_y = PV_DS_y[:-1]
print(len(PV_DS_x_text))
PV_DS_AVG_document_length = sum(
    [len(x.split(" ")) for x in PV_DS_x_text]) // len(PV_DS_x_text)

# Load target domain dataset: sentiment tweets (an IMDB alternative is kept
# commented out below)
tweets_x_text, tweets_y = tfp.load_data_and_labels(
    os.path.join(BASE_DIR, 'Data/sentiment_tweets/pos_tweets_prccd.txt'),
    os.path.join(BASE_DIR, 'Data/sentiment_tweets/neg_tweets_prccd.txt'))
#tweets_x_text, tweets_y = tfp.load_data_and_labels(os.path.join(BASE_DIR, 'Data/IMDB/pos_movies.txt'), os.path.join(BASE_DIR, 'Data/IMDB/neg_movies.txt'))
print(len(tweets_x_text))
tweets_AVG_document_length = sum(
    [len(x.split(" ")) for x in tweets_x_text]) // len(tweets_x_text)
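# For reference (conventional DANN usage, not code from this file): the
# strength l passed to flip_gradient is usually annealed with the schedule
# from Ganin & Lempitsky (2015), where p is training progress in [0, 1]:
def adaptation_factor(p):
    import numpy as np
    return 2. / (1. + np.exp(-10. * p)) - 1.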