def plot_bias():
    # Values gained by raising the bias threshold (earlier run, kept for reference):
    # biases = np.array([0.1, 0.09, 0.08, 0.07, 0.06, 0.05, 0.04, 0.03, 0.02,
    #                    0.01, 0.009, 0.008, 0.007, 0.006, 0.005, 0.004, 0.003, 0.002])
    # extracted_dim = np.array([2., 3., 5., 10., 22., 40., 69., 106., 165., 242.,
    #                           250., 255., 257., 268., 273., 278., 286., 291.])
    # acc = np.array([0.6745, 0.69683333, 0.69625, 0.73083333, 0.77433333,
    #                 0.79225, 0.79941667, 0.81966667, 0.83083333, 0.84666667,
    #                 0.84733333, 0.84858333, 0.8475, 0.85083333, 0.8515,
    #                 0.8523333, 0.85308333, 0.85591667])
    biases = np.array([
        0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01,
        0.011, 0.012, 0.013
    ])
    extracted_dim = np.array(
        [255., 205., 167., 131., 108., 80., 60., 45., 28., 21., 12., 9., 6.])
    acc = np.array([
        0.8435, 0.83783333, 0.835, 0.82833333, 0.82116667, 0.81658333,
        0.80641667, 0.79791667, 0.78483333, 0.769, 0.73633333, 0.72758333,
        0.6945
    ])
    # convert accuracies to percent for plotting
    acc = [elem * 100 for elem in acc]
    vis.plot_acc_for_bias(biases=biases, dimensions=extracted_dim, accs=acc)
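
# A minimal sketch of what vis.plot_acc_for_bias might render, assuming a
# plain-matplotlib dual-axis plot (accuracy on the left axis, number of kept
# dimensions on the right). The helper name and figure layout here are
# assumptions; the real implementation lives in the Visualization module.
def plot_acc_for_bias_sketch(biases, dimensions, accs):
    import matplotlib.pyplot as plt
    fig, ax_acc = plt.subplots()
    ax_acc.plot(biases, accs, 'b-o')
    ax_acc.set_xlabel('bias threshold')
    ax_acc.set_ylabel('accuracy [%]', color='b')
    ax_dim = ax_acc.twinx()  # second y-axis sharing the same x-axis
    ax_dim.plot(biases, dimensions, 'r-s')
    ax_dim.set_ylabel('extracted dimensions', color='r')
    fig.savefig('acc_for_bias_sketch.png')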
def predict(self, data_vectorized):
    target_names = ['negative', 'positive']
    # x_test_v_scaled = self.scaler.fit_transform(data_vectorized['x_test_v'])
    x_test_v_scaled = data_vectorized['x_test_v']

    start_time = time.time()
    self.prediction_liblinear = self.Classifier_liblinear.predict(
        x_test_v_scaled)
    self.time_prediction = (time.time() - start_time)
    logging.info("prediction finished - %6.2f seconds " % self.time_prediction)

    # cross validation
    # logging.info("cross validation ... ")
    # start_time = time.time()
    # scores = cross_val_score(self.Classifier_liblinear,
    #                          data_vectorized['x_train_v'],
    #                          data_vectorized['y_train'],
    #                          cv=3, n_jobs=-1)
    # logging.info("Cross-Validation Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    # logging.info("Cross-Validation finished - %6.2f seconds " % (time.time() - start_time))

    # print results in a nice table for LinearSVC
    logging.info("Results for LinearSVC()")
    logging.info("Training time: %fs; Prediction time: %fs" %
                 (self.time_training, self.time_prediction))
    logging.info(
        classification_report(data_vectorized['y_test'],
                              self.prediction_liblinear,
                              target_names=target_names))

    # plot top features - only possible for a linear classifier with tf-idf features
    try:
        plotter.plot_coefficients(
            self.Classifier_liblinear,
            data_vectorized['vectorizer'].get_feature_names(),
            fname=self.name)
    except Exception:
        logging.info('feature-plotting not possible')

    io.save_classifier(self.Classifier_liblinear)
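
# For reference, the `data_vectorized` dict consumed above is expected to
# carry at least the following keys (inferred from the accesses in this
# class; the actual construction happens elsewhere in the pipeline):
#
#   data_vectorized = {
#       'x_train_v': ...,   # vectorized training documents
#       'y_train':   ...,   # training labels
#       'x_test_v':  ...,   # vectorized test documents
#       'y_test':    ...,   # test labels
#       'vectorizer': ...,  # fitted vectorizer (used for feature names)
#   }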
def plot_each_review_dimension(vectorized_data, bias=0.1):
    logging.info('negative vectors in vectorized[train_neg_v]: ' +
                 str(len(vectorized_data['train_neg_v'])))
    logging.info('positive vectors in vectorized[train_pos_v]: ' +
                 str(len(vectorized_data['train_pos_v'])))

    # plot each dimension to find the significant dimensions
    avg = []
    avg_v_neg = vec.avg_vectors(vectorized_data['train_neg_v'])
    avg_v_pos = vec.avg_vectors(vectorized_data['train_pos_v'])

    # calculate a difference vector between the averaged neg and pos vectors
    diff_v = vec.diff(avg_v_neg, avg_v_pos, bias=bias)
    # diff_v = normalize(diff_v)

    avg.append(avg_v_neg)
    avg.append(avg_v_pos)

    vis.plot_each_dim(neg_v=vectorized_data['train_neg_v'],
                      pos_v=vectorized_data['train_pos_v'],
                      avgs=avg,
                      used_bias=bias,
                      diff=diff_v,
                      filename='feats')
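
# vec.avg_vectors and vec.diff live in the vector module; as a rough sketch
# of the semantics assumed above (element-wise class mean, and a difference
# vector zeroed wherever the class means differ by less than `bias`), they
# could look like this -- an assumption, not the project's implementation:
import numpy as np

def avg_vectors_sketch(vectors):
    # element-wise mean over all vectors of one class
    return np.mean(np.asarray(vectors), axis=0)

def diff_sketch(avg_neg, avg_pos, bias=0.1):
    # keep only dimensions whose class means differ by at least `bias`
    d = np.asarray(avg_neg) - np.asarray(avg_pos)
    d[np.abs(d) < bias] = 0.0
    return d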
def plot_sentiment_distribution(neg_v, pos_v, source=None):
    pos_index_21 = []
    pos_index_119 = []
    neg_index_21 = []
    neg_index_119 = []

    # collect the two most informative dimensions (21 and 119) of every vector;
    # the loop variables are renamed so they no longer shadow the parameters
    for n_v, p_v in zip(neg_v, pos_v):
        pos_index_21.append(p_v[21])
        pos_index_119.append(p_v[119])
        neg_index_21.append(n_v[21])
        neg_index_119.append(n_v[119])

    # 2-dimensional vectors built from the two selected dimensions
    # (currently unused, kept for later experiments)
    negative_reduced = [[v21, v119]
                        for v21, v119 in zip(neg_index_21, neg_index_119)]
    positive_reduced = [[v21, v119]
                        for v21, v119 in zip(pos_index_21, pos_index_119)]

    vis.plot_relevant_indexes(neg_index_21, neg_index_119, pos_index_21,
                              pos_index_119, source)
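
# vis.plot_relevant_indexes is defined in the Visualization module; a minimal
# stand-in, assuming it scatters the two classes against dimensions 21 and
# 119 (the name, colors, and output path below are assumptions):
def plot_relevant_indexes_sketch(neg_21, neg_119, pos_21, pos_119,
                                 source=None):
    import matplotlib.pyplot as plt
    plt.scatter(neg_21, neg_119, c='red', marker='x', label='negative')
    plt.scatter(pos_21, pos_119, c='green', marker='o', label='positive')
    plt.xlabel('dimension 21')
    plt.ylabel('dimension 119')
    plt.legend()
    plt.savefig('relevant_indexes_%s.png' % source)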
def shrink_dim_and_plot_2d_clusters(neg_v,
                                    pos_v,
                                    reduction_methode,
                                    bias=None,
                                    perplexity=None,
                                    learning_rate=None,
                                    normalize=True,
                                    extract_dim=None,
                                    truncate_by_svd=True,
                                    source='word or feat'):
    # take the first n feats; they are shuffled, so taking the first 2000
    # avoids a memory error
    input_dimension = len(neg_v[0])
    logging.info('input dimensions before reduction: ' + str(input_dimension))

    # the filename encodes all parameters of this run
    filename = (source + '_' + reduction_methode + '_' + 'b_' + str(bias) +
                '_' + 'len_' + str(len(neg_v) + len(pos_v)) + '_' + 'perpl_' +
                str(perplexity) + '_' + 'learn_' + str(learning_rate) + '_' +
                'filter_' + str(extract_dim) + '_' + 'norm_' + str(normalize))

    if input_dimension == 2:
        # already 2-dimensional, plot directly
        calc_acc(neg_v, pos_v)
        vis.plot_2d_clusters(v_neg_reduced=neg_v,
                             v_pos_reduced=pos_v,
                             filename=filename)
    else:
        # first reduce the dimensions to 50, then perform t-SNE or PCA
        if truncate_by_svd:
            try:
                start_time = time.time()
                truncated = TruncatedSVD(
                    n_components=50,
                    random_state=0).fit_transform(neg_v + pos_v)
                # split the truncated matrix back into its neg and pos halves
                neg_v = truncated[0:int(len(truncated) / 2)]
                pos_v = truncated[int(len(truncated) / 2):]
                logging.info("dimension truncated with SVD - %6.2f seconds " %
                             (time.time() - start_time))
            except Exception:
                logging.info('truncating not possible, dimension < 50')

        # reduce dimension with t-SNE or PCA
        if reduction_methode == 'tsne':
            # data mixed before dimension reduction
            neg_v, pos_v = vec.reduce_with_TSNE_mixed(
                neg_v=neg_v,
                pos_v=pos_v,
                goal_dimensions=2,
                perplexity=perplexity,
                learning_rate=learning_rate)
            # negative and positive reduced separately:
            # neg_v_reduced, pos_v_reduced = reduce_with_TSNE(neg_v=neg_v, pos_v=pos_v, goal_dimensions=2)
        elif reduction_methode == 'pca':
            neg_v, pos_v = vec.reduce_with_PCA_mixed(neg_v=neg_v,
                                                     pos_v=pos_v,
                                                     goal_dimensions=2)

        # normalize the data; np.concatenate is used because the reducers may
        # return ndarrays, where `neg_v + pos_v` would add element-wise
        # instead of concatenating
        if normalize:
            scaler = preprocessing.StandardScaler().fit(
                np.concatenate((neg_v, pos_v)))
            neg_v = scaler.transform(neg_v)
            pos_v = scaler.transform(pos_v)

        calc_acc(neg_v, pos_v)
        vis.plot_2d_clusters(v_neg_reduced=neg_v,
                             v_pos_reduced=pos_v,
                             filename=filename)
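
# Illustrative invocation with the parameter values used elsewhere in this
# module (the input lists are placeholders, not real data):
# shrink_dim_and_plot_2d_clusters(neg_v=neg_vectors,
#                                 pos_v=pos_vectors,
#                                 reduction_methode='tsne',
#                                 bias=0.4,
#                                 perplexity=150,
#                                 learning_rate=1000,
#                                 normalize=True,
#                                 extract_dim=True,
#                                 truncate_by_svd=True,
#                                 source='word')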
def use_word2vec_with_wordlists():
    # define general testing parameters for word2vec plotting
    words_to_load = 2000
    # define the minimum difference between the averaged neg and pos word vectors
    bias = 0.4
    # t-SNE related params
    perplexity = 150
    learning_rate = 1000
    # reduce by t-SNE or PCA
    reduction_methode = 'pca'
    # filter the most significant dimensions
    extract_dim = True
    normalize = True
    truncate_by_svd = True

    neg_v = []
    pos_v = []
    extracted_neg_wordvectors = []
    extracted_pos_wordvectors = []

    model = Word2Vec.load('./w2v_model/300_dimensions/word_tokenized/own.d2v')
    mod = model.wv
    del model
    # mod = gensim.models.KeyedVectors.load_word2vec_format('./w2v_model/GoogleNews-vectors-negative300.bin', binary=True)

    test_words = {}
    test_words['neg'], test_words['pos'] = data.load_neg_pos_wordlist(
        num_of_words=words_to_load)

    # look up every word; skip words missing from the vocabulary
    for word in test_words['neg']:
        try:
            neg_v.append(mod[word])
        except KeyError:
            continue
    for word in test_words['pos']:
        try:
            pos_v.append(mod[word])
        except KeyError:
            continue

    # average all neg and pos word vectors per dimension
    avg_neg = vec.avg_vectors(neg_v)
    avg_pos = vec.avg_vectors(pos_v)
    avgs = [avg_neg, avg_pos]
    difference = vec.diff(avg_neg, avg_pos, bias=bias)

    # plot each dimension of our words, the averages and the difference
    vis.plot_each_dim(neg_v=neg_v,
                      pos_v=pos_v,
                      avgs=avgs,
                      used_bias=bias,
                      diff=difference,
                      filename='words')

    # plot most informative dimensions
    # plot_sentiment_distribution(neg_v=neg_v, pos_v=pos_v, source='words')

    # extract the significant dimensions of our word vectors according to the bias
    if extract_dim:
        relevant_indexes = vec.extraxt_rel_indexes(difference)
        extracted_neg_wordvectors = [
            vec.extract_rel_dim_vec(v, relevant_indexes) for v in neg_v
        ]
        extracted_pos_wordvectors = [
            vec.extract_rel_dim_vec(v, relevant_indexes) for v in pos_v
        ]
    else:
        extracted_neg_wordvectors = neg_v
        extracted_pos_wordvectors = pos_v

    # try to classify the words:
    # first with all dimensions, then with only the most significant ones
    neg_labels = [c.NEGATIVE for _ in neg_v]
    pos_labels = [c.POSITIVE for _ in pos_v]

    # split data into training and testing set + shuffle
    x_train, x_test, y_train, y_test = train_test_split(
        neg_v + pos_v,
        neg_labels + pos_labels,
        test_size=0.25,
        random_state=42)
    cl = LinearSVC()
    cl.fit(x_train, y_train)
    pred = cl.predict(x_test)
    acc = accuracy_score(y_true=y_test, y_pred=pred)
    logging.info('acc with all dimensions: ' + str(acc))

    # split data into training and testing set + shuffle
    x_train, x_test, y_train, y_test = train_test_split(
        extracted_neg_wordvectors + extracted_pos_wordvectors,
        neg_labels + pos_labels,
        test_size=0.25,
        random_state=42)
    cl = LinearSVC()
    cl.fit(x_train, y_train)
    pred = cl.predict(x_test)
    acc = accuracy_score(y_true=y_test, y_pred=pred)
    logging.info('acc with extracted dimensions: ' + str(acc))

    shrink_dim_and_plot_2d_clusters(neg_v=extracted_neg_wordvectors,
                                    pos_v=extracted_pos_wordvectors,
                                    reduction_methode=reduction_methode,
                                    bias=bias,
                                    perplexity=perplexity,
                                    learning_rate=learning_rate,
                                    normalize=normalize,
                                    extract_dim=extract_dim,
                                    truncate_by_svd=truncate_by_svd,
                                    source='word')
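
# vec.extraxt_rel_indexes / vec.extract_rel_dim_vec come from the vector
# module (the misspelled name is the module's actual API). A minimal sketch
# of the behaviour assumed above -- collect the indexes of the non-zero
# entries of the difference vector, then project each word vector onto
# exactly those dimensions. This is an assumption, not the project's code:
def extraxt_rel_indexes_sketch(difference):
    return [i for i, d in enumerate(difference) if d != 0]

def extract_rel_dim_vec_sketch(vector, relevant_indexes):
    return [vector[i] for i in relevant_indexes]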
# for ind, i in enumerate(Cs):
#     plt.plot(Tol, scores[ind], label='C: ' + str(i))
# plt.legend()
# plt.xlabel('Tol')
# plt.ylabel('Mean score')
# plt.show()

import thesis.Visualization as plotter

# plot the most informative features of the best pipeline
features = grid_search.best_estimator_.named_steps['vect'].get_feature_names()
logging.info(features[0])
logging.info(len(features))
clf = grid_search.best_estimator_.named_steps['clf']
plotter.plot_coefficients(clf, features, fname='test')

# show the best accuracy from the 4-fold cross validation on the validation data
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

# print classification_report on the unseen testing data
clf = grid_search.best_estimator_
prediction = clf.predict(data['x_test'])
target_names = ['negative', 'positive']
print(classification_report(data['y_test'], prediction,
                            target_names=target_names))
y = vectorized_data['y_train']

print('grid')
C_range = np.logspace(-2, 2, 5)
# note: the second hyperparameter searched here is LinearSVC's tolerance
# (`tol`), not an RBF gamma; the variable keeps the name gamma_range only
# because plotter.plot_heatmap expects that keyword
gamma_range = np.logspace(-4, 2, 5)
param_grid = dict(tol=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=42)
grid = GridSearchCV(LinearSVC(), param_grid=param_grid, cv=cv, verbose=1)
grid.fit(X, y)

print("The best parameters are %s with a score of %0.2f" %
      (grid.best_params_, grid.best_score_))
print('grid done')

scores = grid.cv_results_['mean_test_score'].reshape(len(C_range),
                                                     len(gamma_range))

# Draw a heatmap of the validation accuracy as a function of tol and C.
#
# The scores are encoded as colors with the hot colormap, which varies from
# dark red to bright yellow. As the most interesting scores are all located
# in the 0.82 to 0.85 range, we use a custom normalizer to set the mid-point
# to 0.82, making it easier to visualize the small variations of score values
# in the interesting range without brutally collapsing all the low scores
# into the same color.
plotter.plot_heatmap(scores=scores,
                     gamma_range=gamma_range,
                     C_range=C_range,
                     filename='linear_SVM')
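
# The "custom normalizer" referred to above is presumably implemented inside
# plotter.plot_heatmap; a minimal sketch in the spirit of scikit-learn's
# RBF-SVM parameter example (the class below is an illustration, not this
# project's code):
from matplotlib.colors import Normalize

class MidpointNormalize(Normalize):
    """Shift the colormap so that `midpoint` maps to its centre."""

    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        self.midpoint = midpoint
        Normalize.__init__(self, vmin, vmax, clip)

    def __call__(self, value, clip=None):
        # piecewise-linear mapping: vmin -> 0, midpoint -> 0.5, vmax -> 1
        x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
        return np.ma.masked_array(np.interp(value, x, y))

# usage sketch:
# plt.imshow(scores, norm=MidpointNormalize(vmin=0.2, midpoint=0.82),
#            cmap=plt.cm.hot)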