def test_LinearSVM():
    # test the SVM with tfidf-vectorized data
    from thesis.Data import Data_loader
    import thesis.Vectorizer as vec

    data = Data_loader().get_data()
    # use a separate name so the thesis.Vectorizer module alias 'vec' is not shadowed
    tfidf_vec = vec.get_Vectorizer(vectorizer='tfidf')
    # tfidf_vec = vec.get_Vectorizer(vectorizer='word2vec')
    clf = LinearSVM()

    vectorized_data = tfidf_vec.vectorize(data=data)
    clf.classify(vectorized_data)
    clf.predict(vectorized_data)
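# vectorize() is assumed to return a dict with the keys used throughout these
# snippets ('x_train_v', 'y_train', 'x_test_v', 'y_test', 'train_neg_v',
# 'train_pos_v'). A minimal sketch of how such a dict could be built with
# sklearn's TfidfVectorizer (hypothetical helper, not the thesis.Vectorizer code):
def _tfidf_vectorize_sketch(train_texts, train_labels, test_texts, test_labels):
    from sklearn.feature_extraction.text import TfidfVectorizer

    tfidf = TfidfVectorizer()
    # fit on the training texts only, then reuse the vocabulary for the test set
    x_train_v = tfidf.fit_transform(train_texts)
    x_test_v = tfidf.transform(test_texts)
    return {
        'x_train_v': x_train_v,
        'y_train': train_labels,
        'x_test_v': x_test_v,
        'y_test': test_labels,
    }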
def plot_each_review_dimension(vectorized_data, bias=0.1):
    logging.info('negative vectors in vectorized[train_neg_v] : ' +
                 str(len(vectorized_data['train_neg_v'])))
    logging.info('positive vectors in vectorized[train_pos_v] : ' +
                 str(len(vectorized_data['train_pos_v'])))

    ############# plot each dimension to find the significant dimensions #########
    avg = []
    avg_v_neg = vec.avg_vectors(vectorized_data['train_neg_v'])
    avg_v_pos = vec.avg_vectors(vectorized_data['train_pos_v'])

    # calculate a difference vector for all averaged neg and pos vectors
    # (a hedged sketch of avg_vectors and diff follows after this function)
    diff_v = vec.diff(avg_v_neg, avg_v_pos, bias=bias)

    # diff_v = normalize(diff_v)
    avg.append(avg_v_neg)
    avg.append(avg_v_pos)
    vis.plot_each_dim(neg_v=vectorized_data['train_neg_v'],
                      pos_v=vectorized_data['train_pos_v'],
                      avgs=avg,
                      used_bias=bias,
                      diff=diff_v,
                      filename='feats')
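# The helpers vec.avg_vectors and vec.diff live in thesis.Vectorizer and are not
# shown in this snippet. A minimal sketch of the assumed behavior (hypothetical
# re-implementation, not the thesis code): avg_vectors averages per dimension,
# and diff keeps only the dimensions whose averaged negative and positive values
# differ by at least the bias, zeroing out the rest.
def _avg_vectors_sketch(vectors):
    # element-wise mean over all vectors -> one vector of per-dimension averages
    dims = len(vectors[0])
    return [sum(v[i] for v in vectors) / len(vectors) for i in range(dims)]


def _diff_sketch(avg_neg, avg_pos, bias=0.1):
    # per-dimension difference; dimensions whose absolute difference is below
    # the bias are treated as uninformative and zeroed out
    return [(n - p) if abs(n - p) >= bias else 0.0
            for n, p in zip(avg_neg, avg_pos)]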
def test_NaiveBayes_sklearn():
    from thesis.Data import Data_loader
    import thesis.Vectorizer as vec

    # load data
    data = Data_loader().get_data()
    # create a vectorizer
    tfidf_vec = vec.get_Vectorizer(vectorizer='tfidf')
    # create a classifier
    clf = NaiveBayes_sklearn()
    # vectorize the data
    vectorized_data = tfidf_vec.vectorize(data=data)
    # train classifier
    clf.classify(vectorized_data)
    # inference with the classifier
    clf.predict(vectorized_data)
    def run(self):
        self.vectorizer = v.get_Vectorizer(vectorizer=self.vectorizer,
                                           num_of_samples=self.num_of_samples,
                                           reduction_methode=self.red_method,
                                           w2v_dimension=self.w2v_dim)

        # dependency injection for the provided data
        data_vectorized = self.vectorizer.vectorize(
            self.data_loader.get_data())

        # reduce the dimensionality of the training and testing data with tsne
        # no improvement, accuracy only 50-60 %
        # data_vectorized['x_train_v'] = v.reduce_with_TSNE_single(unreduced_data=data_vectorized['x_train_v'])
        # data_vectorized['x_test_v'] = v.reduce_with_TSNE_single(unreduced_data=data_vectorized['x_test_v'])

        self.classifier.classify(data_vectorized)

        self.classifier.predict(data_vectorized)
def use_word2vec_with_movie_reviews():
    clf = cls.LinearSVM()

    # samples per sentiment for cluster plotting
    samples = 10000

    # tsne related params
    perplexity = 80
    # learning_rates = np.logspace(2, 3, 5)
    learning_rates = [1000]
    # how to reduce the dimensionality of the word vectors / document vectors
    reduction_methode = 'tsne'

    # filter the most significant dimensions
    extract_dim = True
    normalize = True
    truncate_by_svd = True

    # bias for the difference of all averaged document vectors
    # how big should the difference between negative and positive feats be?
    # biases = np.array([0.1,0.09,0.08,0.07,0.06,0.05,0.04,0.03,0.02, 0.01, 0.009, 0.008, 0.007,0.006])
    biases = np.array([0.09])
    accuracies = np.zeros(len(biases))
    extracted_dim = np.zeros(len(biases))

    logging.info(biases)
    logging.info(extracted_dim)
    logging.info(accuracies)

    # cache the vectorized features for faster parameter exploration
    import thesis.IO_Organizer as saver
    feature_filename = 'w2v_google'
    try:
        logging.info('Try to load vectorized features')
        vectorized_data_full = saver.load_features('dict_' + feature_filename)
        logging.info('Features loaded from files')
    except Exception:
        logging.info('Feature-file not found, vectorize reviews')
        data = Data_loader().get_data()
        word2vec = vec.get_Vectorizer(vectorizer='word2vec')
        vectorized_data_full = word2vec.vectorize(data=data)
        saver.save_features(vectorized_data_full, feature_filename)

    for learning_rate in learning_rates:
        for i, bias in enumerate(biases):
            logging.info(bias)
            # create a working copy
            vectorized_data = dict(vectorized_data_full)

            ############## plot most informative dimensions ##############
            #plot_sentiment_distribution(vectorized_data['train_neg_v'], vectorized_data['train_pos_v'], source='feats')

            # reduce the dim of our document vectors
            #vectorized_data = vec.transform_data(vectorized_data, bias=bias)

            # plotting
            plot_each_review_dimension(vectorized_data=vectorized_data,
                                       bias=bias)

            # extract the most significant dimensions of our document vectors
            if extract_dim:
                vectorized_data = vec.transform_data(vectorized_data,
                                                     bias=bias)

            #### for testing purposes, reduce the whole data set to 2D
            # this has to be done in batches to avoid a memory overflow
            # (a hedged sketch of the batch helper follows after this function)
            batchsize = 4000
            reduced_to_2d = []
            for x in batch(vectorized_data['x_train_v'], batchsize):
                reduced_to_2d.extend(shrink_dim_to_2d(x))
            vectorized_data['x_train_v'] = reduced_to_2d
            reduced_to_2d = []

            for x in batch(vectorized_data['x_test_v'], batchsize):
                reduced_to_2d.extend(shrink_dim_to_2d(x))
            vectorized_data['x_test_v'] = reduced_to_2d
            reduced_to_2d = []

            for x in batch(vectorized_data['train_neg_v'], batchsize):
                reduced_to_2d.extend(shrink_dim_to_2d(x))
            vectorized_data['train_neg_v'] = reduced_to_2d
            reduced_to_2d = []

            for x in batch(vectorized_data['train_pos_v'], batchsize):
                reduced_to_2d.extend(shrink_dim_to_2d(x))
            vectorized_data['train_pos_v'] = reduced_to_2d
            reduced_to_2d = []

            ####

            shrink_dim_and_plot_2d_clusters(
                neg_v=vectorized_data['train_neg_v'],
                pos_v=vectorized_data['train_pos_v'],
                reduction_methode=reduction_methode,
                bias=bias,
                perplexity=perplexity,
                learning_rate=learning_rate,
                normalize=normalize,
                extract_dim=extract_dim,
                truncate_by_svd=truncate_by_svd,
                source='feat')

            # select num_of_samples randomly
            # we need to limit the number of samples, or we get a memory error
            # neg_samples_v = random.sample(vectorized_data['train_neg_v'], k=samples)
            # pos_samples_v = random.sample(vectorized_data['train_pos_v'], k=samples)

            # shrink_dim_and_plot_2d_clusters(neg_v= neg_samples_v,
            #                                            pos_v= pos_samples_v,
            #                                            reduction_methode= reduction_methode,
            #                                            bias= bias,
            #                                            perplexity= perplexity,
            #                                            learning_rate= learning_rate,
            #                                            normalize= normalize,
            #                                            extract_dim= extract_dim,
            #                                            truncate_by_svd= truncate_by_svd,
            #                                            source= 'feat')

            extr_dim = len(vectorized_data['x_train_v'][0])
            extracted_dim[i] = extr_dim

            #vectorized_data = vec.delete_relevant_dimensions(vectorized_data)

            ######## linear svm ################
            cl = cls.LinearSVM()
            cl.classify(vectorized_data)
            cl.predict(vectorized_data)

            cl = LinearSVC()
            cl.fit(vectorized_data['x_train_v'], vectorized_data['y_train'])
            pred = cl.predict(vectorized_data['x_test_v'])
            acc = accuracy_score(y_true=vectorized_data['y_test'], y_pred=pred)
            logging.info('acc: ' + str(acc))
            accuracies[i] = acc
            del vectorized_data
            #
            #vis.plot_hyperplane(clf=cl, X=vectorized_data['x_train_v'], Y=vectorized_data['y_train'])

    #         ######### RandomForestClassifier #########
    #         target_names = ['negative', 'positive']
    #
    #         clf = RandomForestClassifier(n_jobs=2)
    #         clf.fit(vectorized_data['x_train_v'], vectorized_data['y_train'])
    #         prediction = clf.predict(vectorized_data['x_test_v'])
    #         logging.info(classification_report(vectorized_data['y_test'], prediction,
    #                                            target_names=target_names))
    #         ######## Logisticregression #############
    #         from sklearn.linear_model import LogisticRegression
    #         import pandas as pd
    #
    #         lr = LogisticRegression()
    #         lr.fit(vectorized_data['x_train_v'], vectorized_data['y_train'])
    #         prediction = lr.predict_proba(vectorized_data['x_test_v'])
    #
    #         logging.info('LR acc: ' + str(lr.score(vectorized_data['x_test_v'], vectorized_data['y_test'])))
    #
    #         metrics.accuracy_score(vectorized_data['y_test'], prediction)
    #
    logging.info(biases)
    logging.info(extracted_dim)
    logging.info(accuracies)
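# The batch() and shrink_dim_to_2d() helpers used above are not defined in this
# snippet. A minimal sketch of the assumed behavior (hypothetical names and
# details, the thesis code may differ): batch() yields consecutive slices of at
# most batchsize items, and shrink_dim_to_2d() projects one batch down to 2D.
def _batch_sketch(sequence, batchsize=4000):
    # yield consecutive slices of at most batchsize items
    for start in range(0, len(sequence), batchsize):
        yield sequence[start:start + batchsize]


def _shrink_dim_to_2d_sketch(vectors):
    # project a batch of vectors to two dimensions, e.g. with t-SNE
    from sklearn.manifold import TSNE
    return TSNE(n_components=2).fit_transform(list(vectors))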
def shrink_dim_and_plot_2d_clusters(neg_v,
                                    pos_v,
                                    reduction_methode,
                                    bias=None,
                                    perplexity=None,
                                    learning_rate=None,
                                    normalize=True,
                                    extract_dim=None,
                                    truncate_by_svd=True,
                                    source='word or feat'):

    # the feature vectors are already shuffled, so taking only the first ~2000 avoids a memory error

    input_dimension = len(neg_v[0])
    logging.info('input dimensions before reduction: ' + str(input_dimension))
    if input_dimension == 2:
        calc_acc(neg_v, pos_v)
        # print 2d
        vis.plot_2d_clusters(
            v_neg_reduced=neg_v,
            v_pos_reduced=pos_v,
            filename=source + '_' + reduction_methode + '_' + 'b_' +
            str(bias) + '_' + 'len_' + str(len(neg_v) + len(pos_v)) + '_' +
            'perpl_' + str(perplexity) + '_' + 'learn_' + str(learning_rate) +
            '_' + 'filter_' + str(extract_dim) + '_' + 'norm_' +
            str(normalize))

    else:

        # first reduce the dimensions to 50, then perform t-SNE or PCA
        if truncate_by_svd:
            try:
                start_time = time.time()

                truncated = TruncatedSVD(n_components=50,
                                         random_state=0).fit_transform(neg_v +
                                                                       pos_v)
                # split the truncated matrix back into negative and positive halves
                neg_v = truncated[0:int(len(truncated) / 2)]
                pos_v = truncated[int(len(truncated) / 2):]

                logging.info("dimension truncated with SVD - %6.2f seconds " %
                             (time.time() - start_time))
            except Exception:
                logging.info('truncation not possible, dimension < 50')

        # reduce the dimensionality to 2D with t-SNE or PCA
        # (a hedged sketch of the mixed reduction follows after this function)
        if reduction_methode == 'tsne':
            # data mixed before dimension reduction
            neg_v, pos_v = vec.reduce_with_TSNE_mixed(
                neg_v=neg_v,
                pos_v=pos_v,
                goal_dimensions=2,
                perplexity=perplexity,
                learning_rate=learning_rate)

            # negative and positive vectors reduced separately
            # neg_v_reduced, pos_v_reduced = reduce_with_TSNE(neg_v=neg_v, pos_v=pos_v, goal_dimensions=2)
        elif reduction_methode == 'pca':
            neg_v, pos_v = vec.reduce_with_PCA_mixed(neg_v=neg_v,
                                                     pos_v=pos_v,
                                                     goal_dimensions=2)

        # normalize the data
        if normalize:
            scaler = preprocessing.StandardScaler().fit(neg_v + pos_v)
            neg_v = scaler.transform(neg_v)
            pos_v = scaler.transform(pos_v)

        calc_acc(neg_v, pos_v)

        # print 2d
        vis.plot_2d_clusters(
            v_neg_reduced=neg_v,
            v_pos_reduced=pos_v,
            filename=source + '_' + reduction_methode + '_' + 'b_' +
            str(bias) + '_' + 'len_' + str(len(neg_v) + len(pos_v)) + '_' +
            'perpl_' + str(perplexity) + '_' + 'learn_' + str(learning_rate) +
            '_' + 'filter_' + str(extract_dim) + '_' + 'norm_' +
            str(normalize))
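# vec.reduce_with_TSNE_mixed and vec.reduce_with_PCA_mixed are assumed to
# reduce the concatenated negative and positive vectors together and then split
# the result back, mirroring the TruncatedSVD block above. A hedged sketch
# (hypothetical re-implementation, not the thesis.Vectorizer code):
def _reduce_mixed_sketch(neg_v, pos_v, goal_dimensions=2, perplexity=80,
                         learning_rate=1000.0, method='tsne'):
    from sklearn.decomposition import PCA
    from sklearn.manifold import TSNE

    # reduce negative and positive vectors together so they share one embedding
    mixed = list(neg_v) + list(pos_v)
    if method == 'tsne':
        reducer = TSNE(n_components=goal_dimensions, perplexity=perplexity,
                       learning_rate=learning_rate)
    else:
        reducer = PCA(n_components=goal_dimensions)
    reduced = reducer.fit_transform(mixed)
    # split back into the negative and positive halves
    return reduced[:len(neg_v)], reduced[len(neg_v):]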
def use_word2vec_with_wordlists():
    # define general testing parameters for word2vec plotting
    words_to_load = 2000
    # define the min difference between the neg and pos averaged wordvectors
    bias = 0.4
    # tsne related params
    perplexity = 150
    learning_rate = 1000
    # reduce by tsne or pca
    reduction_methode = 'pca'
    # filter the most significant dimensions

    extract_dim = True
    normalize = True
    truncate_by_svd = True

    neg_v = []
    pos_v = []
    extracted_neg_wordvectors = []
    extracted_pos_wordvectors = []

    model = Word2Vec.load('./w2v_model/300_dimensions/word_tokenized/own.d2v')
    mod = model.wv
    del model

    #mod = gensim.models.KeyedVectors.load_word2vec_format('./w2v_model/GoogleNews-vectors-negative300.bin',binary=True )

    test_words = {}
    test_words['neg'], test_words['pos'] = data.load_neg_pos_wordlist(
        num_of_words=words_to_load)

    for word in test_words['neg']:
        try:
            word_vector = mod[word]
            neg_v.append(word_vector)
        except KeyError:
            # word not in the model's vocabulary
            continue

    for word in test_words['pos']:
        try:
            word_vector = mod[word]
            pos_v.append(word_vector)
        except KeyError:
            # word not in the model's vocabulary
            continue

    # avg all neg and pos words for each dimension
    avg_neg = vec.avg_vectors(neg_v)
    avg_pos = vec.avg_vectors(pos_v)
    avgs = []
    avgs.append(avg_neg)
    avgs.append(avg_pos)
    difference = vec.diff(avg_neg, avg_pos, bias=bias)

    # plot each dimension of our words, the averages and the difference
    vis.plot_each_dim(neg_v=neg_v,
                      pos_v=pos_v,
                      avgs=avgs,
                      used_bias=bias,
                      diff=difference,
                      filename='words')

    ############## plot most informative dimensions ##############
    #plot_sentiment_distribution(neg_v=neg_v, pos_v=pos_v, source='words')

    # extract the significant dimensions of our word vectors according to a defined bias
    # (a hedged sketch of these extraction helpers follows after this function)
    if extract_dim:
        relevant_indexes = vec.extraxt_rel_indexes(difference)
        extracted_neg_wordvectors = [
            vec.extract_rel_dim_vec(v, relevant_indexes) for v in neg_v
        ]
        extracted_pos_wordvectors = [
            vec.extract_rel_dim_vec(v, relevant_indexes) for v in pos_v
        ]
    else:
        extracted_neg_wordvectors = neg_v
        extracted_pos_wordvectors = pos_v

    # try to classify the words,
    # first with all dimensions, later with only the most significant dimensions
    neg_labels = []
    pos_labels = []
    for _ in neg_v:
        neg_labels.append(c.NEGATIVE)
    for _ in pos_v:
        pos_labels.append(c.POSITIVE)

    # split data into testing and training set + shuffle
    x_train, x_test, y_train, y_test = train_test_split(neg_v + pos_v,
                                                        neg_labels +
                                                        pos_labels,
                                                        test_size=0.25,
                                                        random_state=42)

    cl = LinearSVC()
    cl.fit(x_train, y_train)
    pred = cl.predict(x_test)
    acc = accuracy_score(y_true=y_test, y_pred=pred)
    logging.info('acc with all dimensions: ' + str(acc))

    # split data into testing and training set + shuffle
    x_train, x_test, y_train, y_test = train_test_split(
        extracted_neg_wordvectors + extracted_pos_wordvectors,
        neg_labels + pos_labels,
        test_size=0.25,
        random_state=42)

    cl = LinearSVC()
    cl.fit(x_train, y_train)
    pred = cl.predict(x_test)
    acc = accuracy_score(y_true=y_test, y_pred=pred)
    logging.info('acc with extracted dimensions: ' + str(acc))

    shrink_dim_and_plot_2d_clusters(neg_v=extracted_neg_wordvectors,
                                    pos_v=extracted_pos_wordvectors,
                                    reduction_methode=reduction_methode,
                                    bias=bias,
                                    perplexity=perplexity,
                                    learning_rate=learning_rate,
                                    normalize=normalize,
                                    extract_dim=extract_dim,
                                    truncate_by_svd=truncate_by_svd,
                                    source='word')
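# vec.extraxt_rel_indexes and vec.extract_rel_dim_vec are not shown in this
# snippet. A minimal sketch of the assumed behavior (hypothetical): keep the
# indexes of the dimensions that survived the biased difference (the non-zero
# entries of the difference vector) and slice every word vector down to them.
def _extract_rel_indexes_sketch(difference):
    # indexes of the dimensions considered significant
    return [i for i, value in enumerate(difference) if value != 0.0]


def _extract_rel_dim_vec_sketch(vector, relevant_indexes):
    # keep only the significant dimensions of a single vector
    return [vector[i] for i in relevant_indexes]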
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV

import thesis.Data as d
import thesis.Vectorizer as vec
import thesis.my_logger
import thesis.Visualization as plotter

# tfidf
# data = d.Data_loader().get_data()
# tfidf_vec = vec.get_Vectorizer('tfidf')
# vectorized_data = tfidf_vec.vectorize(data=data)

# word2vec
data = d.Data_loader().get_data()
word2vec_vec = vec.get_Vectorizer('word2vec')
vectorized_data = word2vec_vec.vectorize(data=data)

X = vectorized_data['x_train_v']
y = vectorized_data['y_train']

print('running grid search over C and tol for LinearSVC')
C_range = np.logspace(-2, 2, 5)
tol_range = np.logspace(-4, 2, 5)

param_grid = dict(tol=tol_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=42)
grid = GridSearchCV(LinearSVC(), param_grid=param_grid, cv=cv, verbose=1)
grid.fit(X, y)

print("The best parameters are %s with a score of %0.2f" %