예제 #1
0
    def Define_vectorizer(self):
        """Build the text vectorizer selected by ``self.vect``.

        Supported values of ``self.vect``:

        * ``'tfidf'``     -- ``TfidfVectorizer()``
        * ``'count'``     -- ``CountVectorizer()``
        * ``'w2v_count'`` -- ``EmbeddingCountVectorizer(self.model, 'mean')``
        * ``'w2v_tfidf'`` -- ``EmbeddingTfidfVectorizer(self.model, 'mean')``

        Returns:
            A newly constructed vectorizer instance.

        Raises:
            ValueError: If ``self.vect`` is not one of the supported names.
        """
        logging.info("this is length of the dataframe: {}".format(len(
            self.df)))

        supported = ('tfidf', 'count', 'w2v_count', 'w2v_tfidf')
        if self.vect not in supported:
            # Previously an unknown name fell through every branch and the
            # final ``return`` failed with an opaque UnboundLocalError.
            raise ValueError(
                "unknown vectorizer {!r}; expected one of {}".format(
                    self.vect, ", ".join(supported)))

        logging.info("the vectorizers is: {}".format(self.vect))

        # Factories are lazy so that only the requested vectorizer is built
        # and ``self.model`` is only read for the embedding variants.
        factories = {
            'tfidf': lambda: TfidfVectorizer(),
            'count': lambda: CountVectorizer(),
            'w2v_count': lambda: embeddingvectorizer.EmbeddingCountVectorizer(
                self.model, 'mean'),
            'w2v_tfidf': lambda: embeddingvectorizer.EmbeddingTfidfVectorizer(
                self.model, 'mean'),
        }
        return factories[self.vect]()
 def get_vectorizer(self, vectorizer, model):
     """Yield one dict describing the requested embedding vectorizer.

     This is a generator: nothing runs until it is iterated.

     Args:
         vectorizer: Either ``'w2v_count'`` or ``'w2v_tfidf'``.
         model: Mapping whose ``'gensimmodel'`` entry is handed to the
             embedding vectorizer constructor.

     Yields:
         A dict with keys ``'filename'`` (the ``vectorizer`` name) and
         ``'vectorizer'`` (the constructed vectorizer, using ``'mean'``
         pooling).

     Raises:
         ValueError: On iteration, if ``vectorizer`` is not supported.
     """
     logging.info("the vectorizer is: {}".format(vectorizer))

     if vectorizer == 'w2v_count':
         embedder = embeddingvectorizer.EmbeddingCountVectorizer(
             model['gensimmodel'], 'mean')
     elif vectorizer == 'w2v_tfidf':
         embedder = embeddingvectorizer.EmbeddingTfidfVectorizer(
             model['gensimmodel'], 'mean')
     else:
         # Previously any other name left the local unbound and failed with
         # an UnboundLocalError when the result dict was assembled.
         raise ValueError(
             "unknown vectorizer {!r}; expected 'w2v_count' or "
             "'w2v_tfidf'".format(vectorizer))

     yield {'filename': vectorizer, 'vectorizer': embedder}
예제 #3
0
    def Define_pipelines(self):
        """Build every classifier/vectorizer pipeline combination.

        Four classifiers (SGD, SVC, Passive-Aggressive, Extra-Trees), each
        wrapped in ``OneVsRestClassifier``, are combined with eight
        vectorizers: plain tf-idf, plain count, and the count/tf-idf
        embedding vectorizers built on ``self.model`` with ``'mean'``,
        ``'sum'`` and ``'max'`` pooling.

        Returns:
            A list of 32 ``(label, Pipeline)`` tuples, grouped by classifier
            in the order SGD, SVC, PA, ET. Labels follow the pattern
            ``"<classifier> <vectorizer>"``, e.g. ``"SGD tfidf"`` or
            ``"ET tfidf embedding max"``. Every pipeline's final step is
            named ``'clf'``; the first step is named ``'tfidf'``,
            ``'count'`` or ``'Embedding'``.
        """
        logging.info('Start defining pipelines...\n\n')

        classifiers = [
            ("SGD", SGDClassifier),
            ("SVC", SVC),
            ("PA", PassiveAggressiveClassifier),
            ("ET", ExtraTreesClassifier),
        ]

        # Each entry: (label suffix, pipeline step name, zero-arg factory).
        # Factories are used so that every pipeline gets its own fresh
        # vectorizer instance rather than sharing one.
        vectorizers = [
            ("tfidf", "tfidf", TfidfVectorizer),
            ("count", "count", CountVectorizer),
        ]
        embedders = [
            ("count", embeddingvectorizer.EmbeddingCountVectorizer),
            ("tfidf", embeddingvectorizer.EmbeddingTfidfVectorizer),
        ]
        # 'mean' pooling carries no suffix in the label; the others do.
        for pooling, suffix in (('mean', ''), ('sum', ' sum'),
                                ('max', ' max')):
            for kind, embedder_cls in embedders:
                vectorizers.append((
                    "{} embedding{}".format(kind, suffix),
                    "Embedding",
                    # Bind loop variables as defaults to avoid late binding.
                    lambda cls=embedder_cls, how=pooling: cls(self.model,
                                                              how),
                ))

        all_models = []
        for clf_label, clf_cls in classifiers:
            for vec_label, step_name, make_vectorizer in vectorizers:
                pipeline = Pipeline([
                    (step_name, make_vectorizer()),
                    ('clf', OneVsRestClassifier(clf_cls())),
                ])
                all_models.append(
                    ("{} {}".format(clf_label, vec_label), pipeline))

        return all_models
def gridsearch_with_classifiers(sample, vect):
    """Grid-search four classifiers on one sample using one vectorizer.

    The data returned by ``get_data()`` is optionally filtered by ``sample``,
    split 80/20 (``random_state=0``) on ``text_clean`` /
    ``main_topic_label``, and each classifier is tuned with a 5-fold
    ``GridSearchCV`` over its own parameter grid.

    Args:
        sample: ``'newspaper_sample_only'``, ``'pq_sample_only'`` or
            ``'RPA_sample'`` to restrict the rows; any other value keeps the
            full dataframe.
        vect: One of ``'tfidf'``, ``'count'``, ``'w2v_count'``,
            ``'w2v_tfidf'``.

    Returns:
        ``(class_report, results)``: two lists with one entry per
        classifier. ``class_report`` holds the ``classification_report``
        dicts extended with the keys ``'classifier:'`` and
        ``'best estimators:'``; ``results`` holds dicts with keys
        ``'predicted'``, ``'actual'`` and ``'classifier'``.

    Raises:
        ValueError: If ``vect`` is not a supported vectorizer name.
    """
    supported_vectorizers = ('tfidf', 'count', 'w2v_count', 'w2v_tfidf')
    if vect not in supported_vectorizers:
        # Fail fast, before the data is loaded; previously an unknown name
        # only surfaced later as an UnboundLocalError.
        raise ValueError(
            "unknown vectorizer {!r}; expected one of {}".format(
                vect, ", ".join(supported_vectorizers)))

    df = get_data()
    print("this is length of the dataframe: {}".format(len(df)))
    logging.info('getting the data. keeping sample: {}'.format(sample))

    if sample == 'newspaper_sample_only':
        df = df[df['type'] == 'newspaper']
    elif sample == 'pq_sample_only':
        df = df[df['type'] == 'parlementary question']
    elif sample == 'RPA_sample':
        df = df[df['origin'] == 'RPA']

    logging.info("the vectorizers is: {}".format(vect))
    if vect == 'tfidf':
        VECT = TfidfVectorizer()
    elif vect == 'count':
        VECT = CountVectorizer()
    else:
        # Both embedding variants need the same word -> vector mapping.
        PE = '/home/anne/tmpanne/RPA/w2v_models/w2v_300d2000-01-01_2018-12-31'
        mod = gensim.models.Word2Vec.load(PE)
        MDL = dict(zip(mod.wv.index2word, mod.wv.syn0))
        if vect == 'w2v_count':
            VECT = embeddingvectorizer.EmbeddingCountVectorizer(MDL, 'mean')
        else:
            VECT = embeddingvectorizer.EmbeddingTfidfVectorizer(MDL, 'mean')

    logging.info('total size df: {}'.format(len(df)))

    X_train, X_test, y_train, y_test = train_test_split(df['text_clean'],
                                                        df['main_topic_label'],
                                                        test_size=0.2,
                                                        random_state=0)

    # (display name, estimator, parameter grid) for each search.
    searches = [
        ("Passive Agressive", PassiveAggressiveClassifier(), {
            'clf__loss': ('hinge', 'squared_hinge'),
            'clf__C': (0.01, 0.5, 1.0),
            'clf__fit_intercept': (True, False),
            'clf__max_iter': (5, 10, 15)
        }),
        ("SGDClassifier", SGDClassifier(), {
            'clf__max_iter': (20, 30),
            'clf__alpha': (1e-2, 1e-3, 1e-5),
            'clf__penalty': ('l2', 'elasticnet')
        }),
        ("SVM", SVC(), {
            'clf__C': [1, 10, 100, 1000],
            'clf__gamma': [0.001, 0.0001],
            'clf__kernel': ['rbf', 'linear']
        }),
        ("ET", ExtraTreesClassifier(), {
            "clf__max_features": ['auto', 'sqrt', 'log2']
        }),
    ]

    class_report = []
    results = []

    for name, classifier, params in searches:
        print(name)
        print(classifier)
        print(params)
        clf_pipe = Pipeline([
            ('vect', VECT),
            ('clf', classifier),
        ])

        gs_clf = GridSearchCV(clf_pipe, param_grid=params, cv=5)
        logging.info("Starting gridsearch....")
        clf = gs_clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)

        print("{} score: {}".format(name, score))
        print("{} are the best estimators".format(clf.best_estimator_))

        # Predict once; the fitted search delegates to its best estimator.
        y_hats = clf.predict(X_test)

        # classification_report expects (y_true, y_pred); passing them the
        # other way round swaps per-class precision and recall.
        results_to_dict = classification_report(
            y_test, y_hats, output_dict=True)

        results_to_dict['classifier:'] = name
        results_to_dict['best estimators:'] = clf.best_params_

        print("Created dictionary with classification report: \n\n{}".format(
            results_to_dict))

        results.append({
            "predicted": y_hats,
            "actual": y_test.values,
            "classifier": name
        })
        class_report.append(results_to_dict)

    return class_report, results
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(SGDClassifier())),
])

# Module-level pipelines: each one chains a vectorizer step with a
# one-vs-rest wrapped classifier stored under the step name 'clf'.

# Count features -> one-vs-rest SGD classifier.
SGD_count_pipeline = Pipeline([
    ('count', CountVectorizer()),
    ('clf', OneVsRestClassifier(SGDClassifier())),
])

# Count-based embedding features ('mean' mode) -> one-vs-rest SGD classifier.
# NOTE(review): MDL is defined outside this excerpt -- presumably the
# word -> vector mapping; confirm.
SGD_count_embedding_pipeline = Pipeline([
    ("Embedding", embeddingvectorizer.EmbeddingCountVectorizer(MDL, 'mean')),
    ('clf', OneVsRestClassifier(SGDClassifier())),
])

# Tf-idf-based embedding features ('mean' mode) -> one-vs-rest SGD classifier.
SGD_tfidf_embedding_pipeline = Pipeline([
    ("Embedding", embeddingvectorizer.EmbeddingTfidfVectorizer(MDL, 'mean')),
    ('clf', OneVsRestClassifier(SGDClassifier())),
])

# Tf-idf features -> one-vs-rest SVC.
SVC_tfidf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(SVC())),
])

# Count features -> one-vs-rest SVC.
SVC_count_pipeline = Pipeline([
    ('count', CountVectorizer()),
    ('clf', OneVsRestClassifier(SVC())),
])

SVC_count_embedding_pipeline = Pipeline([
    ("Embedding", embeddingvectorizer.EmbeddingCountVectorizer(MDL, 'mean')),