def test_LinearSVM():
    # test SVM with tfidf-vectorized data
    from thesis.Data import Data_loader
    import thesis.Vectorizer as vec

    data = Data_loader().get_data()
    vec = vec.get_Vectorizer(vectorizer='tfidf')
    # vec = vec.get_Vectorizer(vectorizer='word2vec')
    clf = LinearSVM()

    vectorized_data = vec.vectorize(data=data)

    clf.classify(vectorized_data)
    clf.predict(vectorized_data)
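# Note: LinearSVM is the thesis' own classifier wrapper (referenced elsewhere
# in this module as cls.LinearSVM); its implementation is not part of this
# file. The class below is only a minimal sketch of the assumed
# classify/predict interface -- classify() fits on the training split of the
# vectorized-data dict, predict() evaluates on the test split. The name
# LinearSVM_sketch is a placeholder, not the real API.
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score


class LinearSVM_sketch(object):
    def classify(self, vectorized_data):
        # fit a linear SVM on the training vectors
        self.model = LinearSVC()
        self.model.fit(vectorized_data['x_train_v'], vectorized_data['y_train'])

    def predict(self, vectorized_data):
        # evaluate on the held-out test vectors and return the accuracy
        pred = self.model.predict(vectorized_data['x_test_v'])
        return accuracy_score(y_true=vectorized_data['y_test'], y_pred=pred)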
def plot_each_review_dimension(vectorized_data, bias=0.1):
    logging.info('negative vectors in vectorized[train_neg_v]: ' + str(len(vectorized_data['train_neg_v'])))
    logging.info('positive vectors in vectorized[train_pos_v]: ' + str(len(vectorized_data['train_pos_v'])))

    ############# plot each dimension to find the significant dimensions #########
    avg = []
    avg_v_neg = vec.avg_vectors(vectorized_data['train_neg_v'])
    avg_v_pos = vec.avg_vectors(vectorized_data['train_pos_v'])

    # calculate a difference vector for all averaged neg and pos vectors
    diff_v = vec.diff(avg_v_neg, avg_v_pos, bias=bias)
    # diff_v = normalize(diff_v)

    avg.append(avg_v_neg)
    avg.append(avg_v_pos)

    vis.plot_each_dim(neg_v=vectorized_data['train_neg_v'],
                      pos_v=vectorized_data['train_pos_v'],
                      avgs=avg,
                      used_bias=bias,
                      diff=diff_v,
                      filename='feats')
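# vec.avg_vectors and vec.diff come from thesis.Vectorizer and are not defined
# in this file. The two helpers below are only a sketch of the assumed
# semantics (per-dimension mean, and a bias-thresholded gap between the
# averaged negative and positive vectors); the _sketch names are placeholders,
# not the real thesis API.
import numpy as np


def avg_vectors_sketch(vectors):
    # element-wise mean over all vectors: one averaged value per dimension
    return np.mean(np.asarray(vectors), axis=0)


def diff_sketch(avg_v_neg, avg_v_pos, bias=0.1):
    # per-dimension difference of the averaged vectors; dimensions whose
    # absolute gap stays below the bias are zeroed out as "not significant"
    gap = np.asarray(avg_v_pos) - np.asarray(avg_v_neg)
    gap[np.abs(gap) < bias] = 0.0
    return gap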
def test_NaiveBayes_sklearn():
    from thesis.Data import Data_loader
    import thesis.Vectorizer as vec

    # load data
    data = Data_loader().get_data()
    # create a vectorizer
    tfidf_vec = vec.get_Vectorizer(vectorizer='tfidf')
    # create a classifier
    clf = NaiveBayes_sklearn()
    # vectorize the data
    vectorized_data = tfidf_vec.vectorize(data=data)
    # train the classifier
    clf.classify(vectorized_data)
    # inference with the classifier
    clf.predict(vectorized_data)
def run(self):
    self.vectorizer = v.get_Vectorizer(vectorizer=self.vectorizer,
                                       num_of_samples=self.num_of_samples,
                                       reduction_methode=self.red_method,
                                       w2v_dimension=self.w2v_dim)

    # dependency injection for the provided data
    data_vectorized = self.vectorizer.vectorize(self.data_loader.get_data())

    # reduce the dimensionality of the training and testing data with t-SNE
    # no improvement, acc 50 - 60 %
    # data_vectorized['x_train_v'] = v.reduce_with_TSNE_single(unreduced_data=data_vectorized['x_train_v'])
    # data_vectorized['x_test_v'] = v.reduce_with_TSNE_single(unreduced_data=data_vectorized['x_test_v'])

    self.classifier.classify(data_vectorized)
    self.classifier.predict(data_vectorized)
def use_word2vec_with_movie_reviews():
    clf = cls.LinearSVM()

    # samples per sentiment for cluster plotting
    samples = 10000

    # t-SNE related params
    perplexity = 80
    # learning_rates = np.logspace(2, 3, 5)
    learning_rates = [1000]

    # how to reduce the dimensionality of the word vectors / document vectors
    reduction_methode = 'tsne'
    # filter the most significant dimensions
    extract_dim = True
    normalize = True
    truncate_by_svd = True

    # bias for the difference of all averaged document vectors:
    # how big should the difference between negative and positive feats be?
    # biases = np.array([0.1, 0.09, 0.08, 0.07, 0.06, 0.05, 0.04, 0.03, 0.02, 0.01, 0.009, 0.008, 0.007, 0.006])
    biases = np.array([0.09])
    accuracies = np.zeros(len(biases))
    extracted_dim = np.zeros(len(biases))

    logging.info(biases)
    logging.info(extracted_dim)
    logging.info(accuracies)

    # cache the vectorized features for faster parameter research
    import thesis.IO_Organizer as saver
    feature_filename = 'w2v_google'
    try:
        logging.info('Try to load vectorized features')
        vectorized_data_full = saver.load_features('dict_' + feature_filename)
        logging.info('Features loaded from file')
    except Exception:
        logging.info('Feature file not found, vectorize reviews')
        data = Data_loader().get_data()
        word2vec = vec.get_Vectorizer(vectorizer='word2vec')
        vectorized_data_full = word2vec.vectorize(data=data)
        saver.save_features(vectorized_data_full, feature_filename)

    for learning_rate in learning_rates:
        for i, bias in enumerate(biases):
            logging.info(bias)
            # create a working copy
            vectorized_data = dict(vectorized_data_full)

            ############## plot most informative dimensions ##############
            # plot_sentiment_distribution(vectorized_data['train_neg_v'], vectorized_data['train_pos_v'], source='feats')

            # reduce the dim of our document vectors
            # vectorized_data = vec.transform_data(vectorized_data, bias=bias)

            # plotting
            plot_each_review_dimension(vectorized_data=vectorized_data, bias=bias)

            # extract the most significant dim of our document vectors
            if extract_dim:
                vectorized_data = vec.transform_data(vectorized_data, bias=bias)

            #### testing purpose: shrink the whole amount of data to 2d ####
            # we need to do it batch-wise to avoid a memory overflow
            batchsize = 4000
            reduced_to_2d = []
            for x in batch(vectorized_data['x_train_v'], batchsize):
                reduced_to_2d.extend(shrink_dim_to_2d(x))
            vectorized_data['x_train_v'] = reduced_to_2d

            reduced_to_2d = []
            for x in batch(vectorized_data['x_test_v'], batchsize):
                reduced_to_2d.extend(shrink_dim_to_2d(x))
            vectorized_data['x_test_v'] = reduced_to_2d

            reduced_to_2d = []
            for x in batch(vectorized_data['train_neg_v'], batchsize):
                reduced_to_2d.extend(shrink_dim_to_2d(x))
            vectorized_data['train_neg_v'] = reduced_to_2d

            reduced_to_2d = []
            for x in batch(vectorized_data['train_pos_v'], batchsize):
                reduced_to_2d.extend(shrink_dim_to_2d(x))
            vectorized_data['train_pos_v'] = reduced_to_2d
            reduced_to_2d = []
            ####

            shrink_dim_and_plot_2d_clusters(neg_v=vectorized_data['train_neg_v'],
                                            pos_v=vectorized_data['train_pos_v'],
                                            reduction_methode=reduction_methode,
                                            bias=bias,
                                            perplexity=perplexity,
                                            learning_rate=learning_rate,
                                            normalize=normalize,
                                            extract_dim=extract_dim,
                                            truncate_by_svd=truncate_by_svd,
                                            source='feat')

            # select num_of_samples randomly
            # we need to define samples, or we get a memory error
            # neg_samples_v = random.sample(vectorized_data['train_neg_v'], k=samples)
            # pos_samples_v = random.sample(vectorized_data['train_pos_v'], k=samples)
            # shrink_dim_and_plot_2d_clusters(neg_v=neg_samples_v,
            #                                 pos_v=pos_samples_v,
            #                                 reduction_methode=reduction_methode,
            #                                 bias=bias,
            #                                 perplexity=perplexity,
            #                                 learning_rate=learning_rate,
            #                                 normalize=normalize,
            #                                 extract_dim=extract_dim,
            #                                 truncate_by_svd=truncate_by_svd,
            #                                 source='feat')

            extr_dim = len(vectorized_data['x_train_v'][0])
            extracted_dim[i] = extr_dim

            # vectorized_data = vec.delete_relevant_dimensions(vectorized_data)

            ######## linear svm ################
            cl = cls.LinearSVM()
            cl.classify(vectorized_data)
            cl.predict(vectorized_data)

            cl = LinearSVC()
            cl.fit(vectorized_data['x_train_v'], vectorized_data['y_train'])
            pred = cl.predict(vectorized_data['x_test_v'])
            acc = accuracy_score(y_true=vectorized_data['y_test'], y_pred=pred)
            logging.info('acc: ' + str(acc))
            accuracies[i] = acc

            del vectorized_data

            # vis.plot_hyperplane(clf=cl, X=vectorized_data['x_train_v'], Y=vectorized_data['y_train'])

            ######### RandomForestClassifier #########
            # target_names = ['negative', 'positive']
            # clf = RandomForestClassifier(n_jobs=2)
            # clf.fit(vectorized_data['x_train_v'], vectorized_data['y_train'])
            # prediction = clf.predict(vectorized_data['x_test_v'])
            # logging.info(classification_report(vectorized_data['y_test'], prediction,
            #                                    target_names=target_names))

            ######## Logistic regression #############
            # from sklearn.linear_model import LogisticRegression
            # import pandas as pd
            # lr = LogisticRegression()
            # lr.fit(vectorized_data['x_train_v'], vectorized_data['y_train'])
            # prediction = lr.predict_proba(vectorized_data['x_test_v'])
            # logging.info('LR acc: ' + str(lr.score(vectorized_data['x_test_v'], vectorized_data['y_test'])))
            # metrics.accuracy_score(vectorized_data['y_test'], prediction)

    logging.info(biases)
    logging.info(extracted_dim)
    logging.info(accuracies)
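# batch() and shrink_dim_to_2d() are used in the loop above but are not
# defined in this excerpt. The helpers below are only a plausible sketch under
# that assumption: batch() yields fixed-size slices of a vector list, and
# shrink_dim_to_2d() reduces one slice to two dimensions (PCA is used here as
# a stand-in; the actual reducer in the thesis code may differ, e.g. t-SNE).
# The _sketch names are placeholders, not the real helpers.
from sklearn.decomposition import PCA


def batch_sketch(iterable, batchsize):
    # yield consecutive slices of at most `batchsize` items
    for start in range(0, len(iterable), batchsize):
        yield iterable[start:start + batchsize]


def shrink_dim_to_2d_sketch(vectors):
    # reduce one batch of high-dimensional vectors to 2 dimensions
    return PCA(n_components=2).fit_transform(vectors).tolist()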
def shrink_dim_and_plot_2d_clusters(neg_v,
                                    pos_v,
                                    reduction_methode,
                                    bias=None,
                                    perplexity=None,
                                    learning_rate=None,
                                    normalize=True,
                                    extract_dim=None,
                                    truncate_by_svd=True,
                                    source='word or feat'):
    # take the first n feats; they are randomized, so we can take the first 2000 and avoid a memory error
    input_dimension = len(neg_v[0])
    logging.info('input dimensions before reduction: ' + str(input_dimension))

    if input_dimension == 2:
        calc_acc(neg_v, pos_v)

        # plot 2d
        vis.plot_2d_clusters(
            v_neg_reduced=neg_v,
            v_pos_reduced=pos_v,
            filename=source + '_' + reduction_methode + '_' +
            'b_' + str(bias) + '_' +
            'len_' + str(len(neg_v) + len(pos_v)) + '_' +
            'perpl_' + str(perplexity) + '_' +
            'learn_' + str(learning_rate) + '_' +
            'filter_' + str(extract_dim) + '_' +
            'norm_' + str(normalize))
    else:
        # first reduce the dimensions to 50, then perform t-SNE or PCA
        if truncate_by_svd:
            try:
                start_time = time.time()
                truncated = TruncatedSVD(n_components=50, random_state=0).fit_transform(neg_v + pos_v)
                # split the truncated data back into negative and positive halves
                neg_v = truncated[0:int(len(truncated) / 2)]
                pos_v = truncated[int(len(truncated) / 2):]
                logging.info("dimension truncated with SVD - %6.2f seconds" % (time.time() - start_time))
            except Exception:
                logging.info('truncating not possible, dimension < 50')

        # reduce dimension with t-SNE or PCA
        if reduction_methode == 'tsne':
            # data is mixed before the dimensionality reduction
            neg_v, pos_v = vec.reduce_with_TSNE_mixed(neg_v=neg_v,
                                                      pos_v=pos_v,
                                                      goal_dimensions=2,
                                                      perplexity=perplexity,
                                                      learning_rate=learning_rate)
            # negative and positive reduced separately
            # neg_v_reduced, pos_v_reduced = reduce_with_TSNE(neg_v=neg_v, pos_v=pos_v, goal_dimensions=2)
        elif reduction_methode == 'pca':
            neg_v, pos_v = vec.reduce_with_PCA_mixed(neg_v=neg_v, pos_v=pos_v, goal_dimensions=2)

        # normalize the data
        if normalize:
            scaler = preprocessing.StandardScaler().fit(neg_v + pos_v)
            neg_v = scaler.transform(neg_v)
            pos_v = scaler.transform(pos_v)

        calc_acc(neg_v, pos_v)

        # plot 2d
        vis.plot_2d_clusters(
            v_neg_reduced=neg_v,
            v_pos_reduced=pos_v,
            filename=source + '_' + reduction_methode + '_' +
            'b_' + str(bias) + '_' +
            'len_' + str(len(neg_v) + len(pos_v)) + '_' +
            'perpl_' + str(perplexity) + '_' +
            'learn_' + str(learning_rate) + '_' +
            'filter_' + str(extract_dim) + '_' +
            'norm_' + str(normalize))
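# calc_acc() is called on the reduced 2d vectors before plotting but is not
# defined in this excerpt. A minimal sketch of the assumed behaviour, reusing
# the same conventions as the rest of this module (c.NEGATIVE / c.POSITIVE
# labels, LinearSVC, logged accuracy); calc_acc_sketch is a placeholder name.
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score


def calc_acc_sketch(neg_v, pos_v):
    # label the negative and positive vectors, split, fit a linear SVM and
    # log the held-out accuracy of the 2d representation
    labels = [c.NEGATIVE] * len(neg_v) + [c.POSITIVE] * len(pos_v)
    x_train, x_test, y_train, y_test = train_test_split(
        list(neg_v) + list(pos_v), labels, test_size=0.25, random_state=42)
    clf = LinearSVC()
    clf.fit(x_train, y_train)
    acc = accuracy_score(y_true=y_test, y_pred=clf.predict(x_test))
    logging.info('acc on reduced 2d vectors: ' + str(acc))
    return acc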
def use_word2vec_with_wordlists():
    # define general testing parameters for word2vec plotting
    words_to_load = 2000
    # define the min difference between the neg and pos averaged word vectors
    bias = 0.4
    # t-SNE related params
    perplexity = 150
    learning_rate = 1000
    # reduce by t-SNE or PCA
    reduction_methode = 'pca'
    # filter the most significant dimensions
    extract_dim = True
    normalize = True
    truncate_by_svd = True

    neg_v = []
    pos_v = []
    extracted_neg_wordvectors = []
    extracted_pos_wordvectors = []

    model = Word2Vec.load('./w2v_model/300_dimensions/word_tokenized/own.d2v')
    mod = model.wv
    del model
    # mod = gensim.models.KeyedVectors.load_word2vec_format('./w2v_model/GoogleNews-vectors-negative300.bin', binary=True)

    test_words = {}
    test_words['neg'], test_words['pos'] = data.load_neg_pos_wordlist(num_of_words=words_to_load)

    # look up the word vectors, skipping words that are not in the vocabulary
    for word in test_words['neg']:
        try:
            word_vector = mod[word]
            neg_v.append(word_vector)
        except KeyError:
            continue

    for word in test_words['pos']:
        try:
            word_vector = mod[word]
            pos_v.append(word_vector)
        except KeyError:
            continue

    # average all neg and pos words for each dimension
    avg_neg = vec.avg_vectors(neg_v)
    avg_pos = vec.avg_vectors(pos_v)
    avgs = []
    avgs.append(avg_neg)
    avgs.append(avg_pos)

    difference = vec.diff(avg_neg, avg_pos, bias=bias)

    # plot each dimension of our words, the averages and the difference
    vis.plot_each_dim(neg_v=neg_v,
                      pos_v=pos_v,
                      avgs=avgs,
                      used_bias=bias,
                      diff=difference,
                      filename='words')

    ############## plot most informative dimensions ##############
    # plot_sentiment_distribution(neg_v=neg_v, pos_v=pos_v, source='words')

    # extract the significant dimensions of our word vectors according to the defined bias
    if extract_dim:
        relevant_indexes = vec.extraxt_rel_indexes(difference)
        extracted_neg_wordvectors = [vec.extract_rel_dim_vec(v, relevant_indexes) for v in neg_v]
        extracted_pos_wordvectors = [vec.extract_rel_dim_vec(v, relevant_indexes) for v in pos_v]
    else:
        extracted_neg_wordvectors = neg_v
        extracted_pos_wordvectors = pos_v

    # try to classify the words,
    # first with all dimensions, later with only the most significant dimensions
    neg_labels = []
    pos_labels = []
    for _ in neg_v:
        neg_labels.append(c.NEGATIVE)
    for _ in pos_v:
        pos_labels.append(c.POSITIVE)

    # split the data into training and testing sets + shuffle
    x_train, x_test, y_train, y_test = train_test_split(neg_v + pos_v,
                                                        neg_labels + pos_labels,
                                                        test_size=0.25,
                                                        random_state=42)
    cl = LinearSVC()
    cl.fit(x_train, y_train)
    pred = cl.predict(x_test)
    acc = accuracy_score(y_true=y_test, y_pred=pred)
    logging.info('acc with all dimensions: ' + str(acc))

    # split the data into training and testing sets + shuffle
    x_train, x_test, y_train, y_test = train_test_split(
        extracted_neg_wordvectors + extracted_pos_wordvectors,
        neg_labels + pos_labels,
        test_size=0.25,
        random_state=42)
    cl = LinearSVC()
    cl.fit(x_train, y_train)
    pred = cl.predict(x_test)
    acc = accuracy_score(y_true=y_test, y_pred=pred)
    logging.info('acc with extracted dimensions: ' + str(acc))

    shrink_dim_and_plot_2d_clusters(neg_v=extracted_neg_wordvectors,
                                    pos_v=extracted_pos_wordvectors,
                                    reduction_methode=reduction_methode,
                                    bias=bias,
                                    perplexity=perplexity,
                                    learning_rate=learning_rate,
                                    normalize=normalize,
                                    extract_dim=extract_dim,
                                    truncate_by_svd=truncate_by_svd,
                                    source='word')
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
import numpy as np

import thesis.Data as d
import thesis.Vectorizer as vec
import thesis.my_logger
import thesis.Visualization as plotter

# tfidf
# data = d.Data_loader().get_data()
# tfidf_vec = vec.get_Vectorizer('tfidf')
# vectorized_data = tfidf_vec.vectorize(data=data)

# word2vec
data = d.Data_loader().get_data()
word2vec_vec = vec.get_Vectorizer('word2vec')
vectorized_data = word2vec_vec.vectorize(data=data)

X = vectorized_data['x_train_v']
y = vectorized_data['y_train']

print('grid')
C_range = np.logspace(-2, 2, 5)
# the same log-spaced range is reused for the tol parameter of LinearSVC
gamma_range = np.logspace(-4, 2, 5)
param_grid = dict(tol=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=42)
grid = GridSearchCV(LinearSVC(), param_grid=param_grid, cv=cv, verbose=1)
grid.fit(X, y)

print("The best parameters are %s with a score of %0.2f" %
      (grid.best_params_, grid.best_score_))
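# Optional follow-up, not part of the original script: the full grid of
# cross-validation results is available through the standard scikit-learn
# attribute GridSearchCV.cv_results_, which is convenient to inspect as a
# pandas DataFrame.
import pandas as pd

results = pd.DataFrame(grid.cv_results_)
print(results[['param_C', 'param_tol', 'mean_test_score']]
      .sort_values('mean_test_score', ascending=False))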