def test_NaiveBayes_NLTK():
    from thesis.Data import Data_loader
    # load data
    data = Data_loader().get_data()

    clf_nltk = NaiveBayes_nltk()
    # train the NLTK Naive Bayes classifier and run inference on the test data
    clf_nltk.classify(data_vectorized=data)
    clf_nltk.predict(data_vectorized=data)
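
# NaiveBayes_nltk itself is not shown in this snippet. A minimal sketch of such a
# wrapper (the 'x_train'/'y_train'/'x_test' keys are assumptions, not the actual
# Data_loader layout) could look like this:
#
# import nltk
#
# class NaiveBayes_nltk(object):
#
#     def classify(self, data_vectorized):
#         # NLTK trains on dict featuresets, e.g. bag-of-words presence features
#         train_set = [({w: True for w in doc.split()}, label)
#                      for doc, label in zip(data_vectorized['x_train'],
#                                            data_vectorized['y_train'])]
#         self.model = nltk.NaiveBayesClassifier.train(train_set)
#
#     def predict(self, data_vectorized):
#         return [self.model.classify({w: True for w in doc.split()})
#                 for doc in data_vectorized['x_test']]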
Example #2
class Hypothese(object):

    fitness = 0
    name = 'Hypothese'
    classifier = None
    vectorizer = None
    data_cleaner = None
    data_loader = None

    # default parameters follow convention over configuration
    def __init__(self,
                 data_loader=None,
                 samples=1000,
                 red_method='tsne',
                 vectorizer='word2vec',
                 w2v_dim=300):

        if data_loader is None:
            self.data_loader = Data_loader()
        else:
            self.data_loader = data_loader

        self.num_of_samples = samples
        self.red_method = red_method
        self.w2v_dim = w2v_dim
        self.vectorizer = vectorizer

        # initial variant
        self.classifier = LinearSVM()

    def run(self):
        self.vectorizer = v.get_Vectorizer(vectorizer=self.vectorizer,
                                           num_of_samples=self.num_of_samples,
                                           reduction_methode=self.red_method,
                                           w2v_dimension=self.w2v_dim)

        # dependency injection for the provided data
        data_vectorized = self.vectorizer.vectorize(
            self.data_loader.get_data())

        # reduce the dimensionality of the training and testing data with tsne
        # not worth the effort, accuracy only 50-60 %
        # data_vectorized['x_train_v'] = v.reduce_with_TSNE_single(unreduced_data=data_vectorized['x_train_v'])
        # data_vectorized['x_test_v'] = v.reduce_with_TSNE_single(unreduced_data=data_vectorized['x_test_v'])

        self.classifier.classify(data_vectorized)

        self.classifier.predict(data_vectorized)

    def calc_fitness(self):
        pass

    def mutate(self):
        pass

    def compare_to(self):
        pass
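
    # The three hooks above are left as stubs. A plausible calc_fitness, assuming
    # the classifier's predict() returns labels for the test split (an assumption
    # this snippet does not confirm), would score accuracy on the held-out data:
    #
    # def calc_fitness(self):
    #     from sklearn.metrics import accuracy_score
    #     data_vectorized = self.vectorizer.vectorize(self.data_loader.get_data())
    #     predictions = self.classifier.predict(data_vectorized)
    #     self.fitness = accuracy_score(data_vectorized['y_test'], predictions)
    #     return self.fitness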
Example #3
def test_LinearSVM():
    # test SVM with tfidf-vectorized data
    from thesis.Data import Data_loader
    import thesis.Vectorizer as vec

    data = Data_loader().get_data()
    vectorizer = vec.get_Vectorizer(vectorizer='tfidf')
    #vectorizer = vec.get_Vectorizer(vectorizer='word2vec')
    clf = LinearSVM()

    vectorized_data = vectorizer.vectorize(data=data)
    clf.classify(vectorized_data)
    clf.predict(vectorized_data)
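
# The vectorize() calls above return a plain dict; judging from how that dict is
# indexed further down in this file, the relevant keys are:
#
#   'x_train_v', 'y_train'   vectorized training documents and their labels
#   'x_test_v',  'y_test'    vectorized test documents and their labels
#   'train_neg_v'            vectors of the negative training reviews (for plotting)
#   'train_pos_v'            vectors of the positive training reviews (for plotting)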
Example #5
    def __init__(self):

        # mark the start of a new run in the shared log
        logging.info(' start ------------------------------------------------------------------')

        # possible variants
        # self.vectorizer = ['word2vec', 'tfidf']
        # self.samples = [1000] # n examples possible
        # self.methods = ['pca', 'tsne']
        # self.dim = [50, 100, 300]

        self.vectorizer = ['word2vec']
        self.num_of_samples_to_print = [500]
        self.methods = ['tsne']
        self.dim = [300]
        self.preprocessing = ['stopwords']

        self.Data_loader = Data_loader()
        #self.Data_cleaner = Data_cleaner()

        # initialize the population with hypotheses to train/evaluate on
        self.population = []
        # how many samples should we plot
        for s in self.num_of_samples_to_print:
            # run with each vectorizer defined
            for vec in self.vectorizer:
                for m in self.methods:
                    # only word2vec is tried with different vector dimensions
                    if vec == "word2vec":
                        for d in self.dim:

                            logging.info('initialize w2v hyp')
                            self.population.append(Hypothese(data_loader=self.Data_loader,
                                                             samples=s, red_method=m,
                                                             w2v_dim=d,
                                                             vectorizer=vec))
                    else:
                        logging.info('initialize tfidf hyp')
                        self.population.append(
                            Hypothese(data_loader=self.Data_loader,
                                      samples=s,
                                      red_method=m,
                                      vectorizer=vec))
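
        # A plausible next step (not shown in this snippet) would be to run each
        # hypothesis and collect its fitness, roughly along these lines:
        #
        # for hyp in self.population:
        #     hyp.run()
        #     hyp.calc_fitness()
        #     logging.info('%s fitness: %s' % (hyp.name, hyp.fitness))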
def test_NaiveBayes_sklearn():
    from thesis.Data import Data_loader
    import thesis.Vectorizer as vec

    # load data
    data = Data_loader().get_data()
    # create a vectorizer
    tfidf_vec = vec.get_Vectorizer(vectorizer='tfidf')
    # create a classifier
    clf = NaiveBayes_sklearn()
    # vectorize the data
    vectorized_data = tfidf_vec.vectorize(data=data)
    # train classifier
    clf.classify(vectorized_data)
    # inference for the classifier
    clf.predict(vectorized_data)
def use_word2vec_with_movie_reviews():
    clf = cls.LinearSVM()

    # samples per sentiment for cluster plotting
    samples = 10000

    # tsne related params
    perplexity = 80
    # optionally filter the most significant dimensions (see extract_dim below)

    #learning_rates = np.logspace(2, 3, 5)
    learning_rates = [1000]
    # how to reduce the dimensionality of the wordvectors / document vectors
    reduction_methode = 'tsne'

    extract_dim = True
    normalize = True
    truncate_by_svd = True

    # bias for the difference of all averaged document vectors
    # how big should the difference between negative and positive feats be?
    # biases = np.array([0.1,0.09,0.08,0.07,0.06,0.05,0.04,0.03,0.02, 0.01, 0.009, 0.008, 0.007,0.006])
    biases = np.array([0.09])
    accuracies = np.zeros(len(biases))
    extracted_dim = np.zeros(len(biases))

    logging.info(biases)
    logging.info(extracted_dim)
    logging.info(accuracies)

    # cache the vectorized features for faster parameter exploration
    import thesis.IO_Organizer as saver
    feature_filename = 'w2v_google'
    try:
        logging.info('Try to load vectorized features')
        vectorized_data_full = saver.load_features('dict_' + feature_filename)
        logging.info('Features loaded from files')
    except Exception:
        logging.info('Feature-file not found, vectorize reviews')
        data = Data_loader().get_data()
        word2vec = vec.get_Vectorizer(vectorizer='word2vec')
        vectorized_data_full = word2vec.vectorize(data=data)
        saver.save_features(vectorized_data_full, feature_filename)
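
    # thesis.IO_Organizer is not shown in this snippet; a minimal pickle-based
    # sketch of the two helpers used above (file layout is an assumption) could be:
    #
    # import pickle
    #
    # def save_features(features, name):
    #     with open('features/dict_' + name + '.pkl', 'wb') as f:
    #         pickle.dump(features, f)
    #
    # def load_features(name):
    #     with open('features/' + name + '.pkl', 'rb') as f:
    #         return pickle.load(f)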

    # note: re-vectorizing here would bypass the feature cache built above
    # data = Data_loader().get_data()
    # word2vec = vec.get_Vectorizer(vectorizer='word2vec')
    # vectorized_data_full = word2vec.vectorize(data=data)

    for learning_rate in learning_rates:
        for i, bias in enumerate(biases):
            logging.info(bias)
            # create a working copy
            vectorized_data = dict(vectorized_data_full)

            ############## plot most informative dimensions ##############
            #plot_sentiment_distribution(vectorized_data['train_neg_v'], vectorized_data['train_pos_v'], source='feats')

            # reduce the dim of our document vectors
            #vectorized_data = vec.transform_data(vectorized_data, bias=bias)

            # plotting
            plot_each_review_dimension(vectorized_data=vectorized_data,
                                       bias=bias)

            # extract the most significant dim of our document vectors
            if extract_dim:
                vectorized_data = vec.transform_data(vectorized_data,
                                                     bias=bias)

            #### for testing purposes: shrink the whole data set to 2d
            # we do it batch-wise to avoid running out of memory
            batchsize = 4000
            for key in ('x_train_v', 'x_test_v', 'train_neg_v', 'train_pos_v'):
                reduced_to_2d = []
                for x in batch(vectorized_data[key], batchsize):
                    reduced_to_2d.extend(shrink_dim_to_2d(x))
                vectorized_data[key] = reduced_to_2d

            ####
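            # batch() and shrink_dim_to_2d() are defined elsewhere in the module;
            # batch() is presumably a simple chunking generator along these lines
            # (a sketch, not the original helper):
            #
            # def batch(iterable, n=1):
            #     for i in range(0, len(iterable), n):
            #         yield iterable[i:i + n]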

            shrink_dim_and_plot_2d_clusters(
                neg_v=vectorized_data['train_neg_v'],
                pos_v=vectorized_data['train_pos_v'],
                reduction_methode=reduction_methode,
                bias=bias,
                perplexity=perplexity,
                learning_rate=learning_rate,
                normalize=normalize,
                extract_dim=extract_dim,
                truncate_by_svd=truncate_by_svd,
                source='feat')

            # select num_of_samples randomly
            # we need to limit the number of samples, or we get a memory error
            # neg_samples_v = random.sample(vectorized_data['train_neg_v'], k=samples)
            # pos_samples_v = random.sample(vectorized_data['train_pos_v'], k=samples)

            # shrink_dim_and_plot_2d_clusters(neg_v= neg_samples_v,
            #                                            pos_v= pos_samples_v,
            #                                            reduction_methode= reduction_methode,
            #                                            bias= bias,
            #                                            perplexity= perplexity,
            #                                            learning_rate= learning_rate,
            #                                            normalize= normalize,
            #                                            extract_dim= extract_dim,
            #                                            truncate_by_svd= truncate_by_svd,
            #                                            source= 'feat')

            extr_dim = len(vectorized_data['x_train_v'][0])
            extracted_dim[i] = extr_dim

            #vectorized_data = vec.delete_relevant_dimensions(vectorized_data)

            ######## linear svm ################
            cl = cls.LinearSVM()
            cl.classify(vectorized_data)
            cl.predict(vectorized_data)

            cl = LinearSVC()
            cl.fit(vectorized_data['x_train_v'], vectorized_data['y_train'])
            pred = cl.predict(vectorized_data['x_test_v'])
            acc = accuracy_score(y_true=vectorized_data['y_test'], y_pred=pred)
            logging.info('acc: ' + str(acc))
            accuracies[i] = acc
            del vectorized_data
            #
            #vis.plot_hyperplane(clf=cl, X=vectorized_data['x_train_v'], Y=vectorized_data['y_train'])

    #         ######### RandomForestClassifier #########
    #         target_names = ['negative', 'positive']
    #
    #         clf = RandomForestClassifier(n_jobs=2)
    #         clf.fit(vectorized_data['x_train_v'], vectorized_data['y_train'])
    #         prediction = clf.predict(vectorized_data['x_test_v'])
    #         logging.info(classification_report(vectorized_data['y_test'], prediction,
    #                                            target_names=target_names))
    #         ######## Logisticregression #############
    #         from sklearn.linear_model import LogisticRegression
    #         import pandas as pd
    #
    #         lr = LogisticRegression()
    #         lr.fit(vectorized_data['x_train_v'], vectorized_data['y_train'])
    #         prediction = lr.predict_proba(vectorized_data['x_test_v'])
    #
    #         logging.info('LR acc: ' + str(lr.score(vectorized_data['x_test_v'], vectorized_data['y_test'])))
    #
    #         metrics.accuracy_score(vectorized_data['y_test'], prediction)
    #
    logging.info(biases)
    logging.info(extracted_dim)
    logging.info(accuracies)
# define the logfile and the log level in one place so it can be reused in all modules
# we log both to stdout and to ./logs/Gridsearch.log
import logging

logger = logging.getLogger()
hdlr = logging.FileHandler('./logs/' + 'Gridsearch' + '.log')
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
# console output
hdlr_console = logging.StreamHandler()
hdlr_console.setFormatter(formatter)
logger.addHandler(hdlr_console)
logger.setLevel(logging.INFO)
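# with the root logger configured above, any module logs through the same handlers
# by using the standard library API, e.g. (illustrative usage):
logging.info('this message goes to stdout and to ./logs/Gridsearch.log')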

import nltk

from thesis.Data import Data_loader
data = Data_loader().get_data()
#tokenizer = RegexpTokenizer(r'\w+')
tokenizer = nltk.TweetTokenizer()
# create the tokenizer and stemmer once instead of once per token
stemmer = nltk.PorterStemmer()

def tokenize(text):
    #tokens = nltk.word_tokenize(text)
    tokens = tokenizer.tokenize(text)
    stems = []
    for item in tokens:
        stems.append(stemmer.stem(item))
        #stems.append(item)
    return stems
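
# A custom tokenizer like this one is typically handed to a tfidf vectorizer.
# Illustrative use with scikit-learn (not part of the original snippet):
from sklearn.feature_extraction.text import TfidfVectorizer

example_docs = ["This movie was great!", "This movie was terrible..."]
tfidf = TfidfVectorizer(tokenizer=tokenize)
doc_term_matrix = tfidf.fit_transform(example_docs)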