import numpy as np

# Assumed import: the original snippet calls preprocessing.sequence.pad_sequences
# without showing where `preprocessing` comes from; tensorflow.keras is one
# common source of that module.
from tensorflow.keras import preprocessing

from preprocess import get_train_data


def review_to_tensor(review_list, word2index, max_sentence_length, max_review_length):
    """
    As a result, each review is composed of max_review_length sentences: if
    the original review is longer than that, we truncate it, and if it is
    shorter, we append empty sentences. Likewise, each sentence is composed
    of max_sentence_length words: if the original sentence is longer, we
    truncate it, and if it is shorter, we pad it with the 'UNK' word. We
    also keep track of the actual number of sentences each review contains.

    :param review_list: list of raw reviews
    :param word2index: vocabulary mapping words to integer indices
    :param max_sentence_length: maximum number of words per sentence
    :param max_review_length: maximum number of sentences per review
    :return: array of shape [batch_size, max_review_length, max_sentence_length]
             and an array with the real sentence count of each review
    """
    batch_size = len(review_list)
    review_tensor_list = np.zeros(
        (batch_size, max_review_length, max_sentence_length), dtype=np.int32)
    review_lens = []
    for index, review in enumerate(review_list):
        # Encode the review as a list of index sequences, one per sentence.
        review_tensor = get_train_data(review, word2index)
        # Pad/truncate every sentence to max_sentence_length words.
        # value=0 assumes word2index maps 'UNK' to index 0, as the
        # docstring implies.
        review_tensor = preprocessing.sequence.pad_sequences(
            review_tensor, maxlen=max_sentence_length,
            padding="post", truncating="post", value=0)
        # Record the real sentence count, capped at max_review_length.
        # (The original hard-coded 15 here, which silently disagreed with
        # the max_review_length argument.)
        review_lens.append(min(review_tensor.shape[0], max_review_length))
        # Pad/truncate the review itself to max_review_length sentences,
        # filling with all-zero (empty) sentences.
        review_tensor = preprocessing.sequence.pad_sequences(
            [review_tensor], maxlen=max_review_length,
            padding="post", truncating="post",
            value=np.zeros(max_sentence_length))[0]
        review_tensor_list[index] = review_tensor
    return review_tensor_list, np.array(review_lens)
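# A minimal, hedged demonstration of the sentence-level padding step above,
# runnable without the project's preprocess module; the index sequences
# below are made up for illustration:
if __name__ == "__main__":
    sentences = [[4, 8, 15], [16, 23]]  # one review encoded as two sentences
    padded = preprocessing.sequence.pad_sequences(
        sentences, maxlen=5, padding="post", truncating="post", value=0)
    print(padded)
    # [[ 4  8 15  0  0]
    #  [16 23  0  0  0]]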
import torch
import torch.nn as nn
import numpy as np
import torch.utils.data as Data

from preprocess import get_train_data

# Fall back to the CPU when CUDA is unavailable (the original hard-coded 'cuda').
device = 'cuda' if torch.cuda.is_available() else 'cpu'
BATCH_SIZE = 20000

print('-- getting training data --')
print('device: ', device)
npx, npy = get_train_data()
print('size: ', npx.shape, npy.shape)

x = torch.from_numpy(npx).float().to(device)
y = torch.from_numpy(npy).float().to(device)

torch_dataset = Data.TensorDataset(x, y)
loader = Data.DataLoader(
    dataset=torch_dataset,   # torch TensorDataset format
    batch_size=BATCH_SIZE,   # mini-batch size
    shuffle=True,            # random shuffle for training
    num_workers=0,           # subprocesses for loading data
)
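# Hedged sketch of how the loader would typically be consumed downstream.
# The one-layer model, loss, and optimizer below are placeholders (they
# assume npx is 2-D and npy holds binary labels), not the project's actual
# network:
model = nn.Linear(npx.shape[1], 1).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
loss_fn = nn.BCEWithLogitsLoss()
for epoch in range(2):
    for batch_x, batch_y in loader:
        pred = model(batch_x).squeeze(1)
        loss = loss_fn(pred, batch_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()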
import preprocess
import train
import predict

if __name__ == "__main__":
    print("preprocess Start")
    preprocess.get_train_data()
    preprocess.get_test_data()
    print("train Start")
    train.full_train()
    print("predict Start")
    predict.past_predict()
import codecs
import os  # added: os.listdir is used below but os was not imported
import string
import numpy as np
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from nltk.tokenize import TweetTokenizer

sys.path.append("../")
from preprocess import get_train_data, clean_host_texts

data = "../data/"
train_file = data + "train_noduplicates.csv"
train_hosts, y_train = get_train_data(train_file)

# Loading the textual content of a set of web pages for each host into the
# dictionary "text". The encoding parameter is required since the majority
# of our text is French.
file_names = os.listdir("../text/text")

splitting_text = "__________________________________________________________________"
file_name_format = "#.txt"


def new_f_out(input_file, file_num):
    """
    Create the output file for each subtext of the original, bigger text.

    Input:
        - input_file: file name (str) of the text we separate
        - file_num: index of the subtext
    """
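    # Hedged completion: the original body is missing from this snippet.
    # This is a minimal sketch consistent with the names above, where "#"
    # in file_name_format is replaced by the subtext index; the latin-1
    # encoding is an assumption based on the French-text comment earlier.
    out_name = input_file + "_" + file_name_format.replace("#", str(file_num))
    return codecs.open(out_name, "w", encoding="latin-1")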
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from preprocess import get_train_data, get_test_data
from dtree import DecisionTree

x_train, y_train = get_train_data()
x_test, y_test = get_test_data()
n_attr = np.size(x_train, axis=1)


def learn_depths():
    # Train a decision tree for each maximum depth from 1 to n_attr.
    # (The original looped over range(n_attr), i.e. depths 0..n_attr-1,
    # while labelling the DataFrame rows 1..n_attr; the loop below fixes
    # that off-by-one mismatch.)
    train_acc = np.zeros(n_attr)
    test_acc = np.zeros(n_attr)
    for depth in range(1, n_attr + 1):
        dtree = DecisionTree(x_train, y_train, max_depth=depth)
        dtree.fit()
        train_acc[depth - 1] = dtree.accuracy(x_train, y_train)
        test_acc[depth - 1] = dtree.accuracy(x_test, y_test)
    df = pd.DataFrame({
        'depth': range(1, n_attr + 1),
        'Train accuracy': train_acc,
        'Test accuracy': test_acc
    })
    # df.to_csv('res/acc.csv')
    return df


def plot_acc(df):
    plt.plot('depth', 'Train accuracy', data=df)
    plt.plot('depth', 'Test accuracy', data=df)
    # Completion (the original snippet ends mid-function here): label the
    # axes, add a legend, and display the figure.
    plt.xlabel('depth')
    plt.ylabel('accuracy')
    plt.legend()
    plt.show()
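# Usage sketch: run the depth sweep, then plot both accuracy curves.
if __name__ == "__main__":
    acc_df = learn_depths()
    plot_acc(acc_df)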
# Assumed imports: the original snippet starts mid-file, so the modules it
# relies on (heapq, deepcopy aliased to dc, sklearn's Perceptron, and the
# project's preprocess module as pp) are inferred here; mon4..mon7 are
# assumed to be defined earlier in the original file.
import heapq
from copy import deepcopy as dc

from sklearn.linear_model import Perceptron

import preprocess as pp

# training set: train_d aka x, (mon4, mon5); train_t aka y, (mon6)
# testing set: test_d aka x, (mon4, mon5, mon6); test_t aka y, (mon7)
train_d = dc(mon4)
train_d.extend(mon5)
test_d = dc(train_d)
test_d.extend(mon6)
train_t = dc(mon6)
test_t = dc(mon7)

# processing the data
train_d = pp.process_activity(train_d)
train_t = pp.process_activity(mon6)
train_d = pp.normalization(train_d)
train_x, train_y = pp.get_train_data(train_d, train_t)
test_d = pp.process_activity(test_d)
test_d = pp.normalization(test_d)
test_t = pp.process_activity(test_t)

# using a perceptron to train the model
pcpt = Perceptron()
pcpt.fit(train_x, train_y)

# keeping the 2000 highest-scoring predictions (the original comment said
# 3000, but the code asks heapq for 2000)
result = heapq.nlargest(2000, test_d,
                        lambda x: pcpt.decision_function(test_d[x]))

# calculating the quality of the result
precision, recall, f1 = pp.get_comments(result, test_t)
# Converted from the original Python 2 print statement.
print("Precision rate: %f\nRecall rate: %f\nF1: %f\n" % (precision, recall, f1))
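# For reference, a hedged sketch of the precision/recall/F1 arithmetic that
# pp.get_comments is assumed to perform (its real implementation is not
# shown in this snippet); predicted and actual are treated as collections
# of positive-instance ids:
def prf1_sketch(predicted, actual):
    tp = len(set(predicted) & set(actual))  # true positives
    precision = tp / len(predicted) if predicted else 0.0
    recall = tp / len(actual) if actual else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if precision + recall else 0.0)
    return precision, recall, f1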