def Define_vectorizer(self):
    """Build the vectorizer selected by ``self.vect``.

    Reads ``self.df`` (only its length, for logging), ``self.vect`` and,
    for the two embedding variants, ``self.model``.

    Returns:
        A new vectorizer instance:

        * ``'tfidf'``     -> ``TfidfVectorizer()``
        * ``'count'``     -> ``CountVectorizer()``
        * ``'w2v_count'`` -> ``EmbeddingCountVectorizer(self.model, 'mean')``
        * ``'w2v_tfidf'`` -> ``EmbeddingTfidfVectorizer(self.model, 'mean')``

    Raises:
        ValueError: if ``self.vect`` is not one of the four names above.
            Previously this case fell through every branch and failed
            with an ``UnboundLocalError`` on the ``return``.
    """
    logging.info("this is length of the dataframe: {}".format(len(self.df)))

    supported = ('tfidf', 'count', 'w2v_count', 'w2v_tfidf')
    if self.vect not in supported:
        raise ValueError(
            "unknown vectorizer {!r}; expected one of {}".format(
                self.vect, ", ".join(supported)))

    # Logged once here instead of once per branch; the message is unchanged.
    logging.info("the vectorizers is: {}".format(self.vect))

    if self.vect == 'tfidf':
        return TfidfVectorizer()
    if self.vect == 'count':
        return CountVectorizer()
    if self.vect == 'w2v_count':
        return embeddingvectorizer.EmbeddingCountVectorizer(
            self.model, 'mean')
    return embeddingvectorizer.EmbeddingTfidfVectorizer(self.model, 'mean')
def get_vectorizer(self, vectorizer, model):
    """Yield one dict describing the requested embedding vectorizer.

    This is a generator: nothing below runs (including the validation)
    until the caller starts iterating.

    Args:
        vectorizer: ``'w2v_count'`` or ``'w2v_tfidf'``.
        model: mapping whose ``'gensimmodel'`` entry is handed to the
            embedding vectorizer's constructor.

    Yields:
        A single dict with two keys: ``'filename'`` (the ``vectorizer``
        name as given) and ``'vectorizer'`` (the constructed instance,
        built with the ``'mean'`` operator).

    Raises:
        ValueError: on first iteration, if ``vectorizer`` is not one of
            the two supported names. Previously this surfaced as an
            ``UnboundLocalError``.
    """
    logging.info("the vectorizer is: {}".format(vectorizer))

    if vectorizer == 'w2v_count':
        embedder = embeddingvectorizer.EmbeddingCountVectorizer(
            model['gensimmodel'], 'mean')
    elif vectorizer == 'w2v_tfidf':
        embedder = embeddingvectorizer.EmbeddingTfidfVectorizer(
            model['gensimmodel'], 'mean')
    else:
        raise ValueError(
            "unknown vectorizer {!r}; expected 'w2v_count' or "
            "'w2v_tfidf'".format(vectorizer))

    yield {'filename': vectorizer, 'vectorizer': embedder}
def Define_pipelines(self):
    """Build every classifier x vectorizer pipeline to be compared.

    For each of the four classifiers (SGD, SVC, PassiveAggressive,
    ExtraTrees), each wrapped in ``OneVsRestClassifier``, eight pipelines
    are created, in this order:

    1. ``TfidfVectorizer``            (step name ``'tfidf'``)
    2. ``CountVectorizer``            (step name ``'count'``)
    3. count / tfidf embedding vectorizers with ``'mean'``
    4. count / tfidf embedding vectorizers with ``'sum'``
    5. count / tfidf embedding vectorizers with ``'max'``

    The embedding vectorizers use step name ``"Embedding"`` and receive
    ``self.model`` as their first argument. Every pipeline gets its own
    fresh vectorizer and classifier instance.

    Returns:
        list of ``(label, Pipeline)`` tuples, 32 in total, with labels
        such as ``"SGD tfidf"``, ``"SVC count embedding sum"``.

    Note:
        The ExtraTrees embedding labels used to be spelled
        ``"ET tifdf embedding..."``; they are now ``"ET tfidf
        embedding..."`` so that all four classifiers follow the same
        naming scheme.
    """
    logging.info('Start defining pipelines...\n\n')

    # (label prefix, classifier class) in the order results are reported.
    classifier_specs = (
        ("SGD", SGDClassifier),
        ("SVC", SVC),
        ("PA", PassiveAggressiveClassifier),
        ("ET", ExtraTreesClassifier),
    )
    # (label word, embedding vectorizer class); count comes before tfidf.
    embedding_specs = (
        ("count", embeddingvectorizer.EmbeddingCountVectorizer),
        ("tfidf", embeddingvectorizer.EmbeddingTfidfVectorizer),
    )
    # (label suffix, operator); 'mean' is the unlabelled default.
    pooling_specs = (("", 'mean'), (" sum", 'sum'), (" max", 'max'))

    all_models = []
    for clf_label, clf_class in classifier_specs:
        # (label, pipeline step name, vectorizer instance)
        variants = [
            ("tfidf", 'tfidf', TfidfVectorizer()),
            ("count", 'count', CountVectorizer()),
        ]
        for suffix, pooling in pooling_specs:
            for kind, embedding_class in embedding_specs:
                variants.append((
                    "{} embedding{}".format(kind, suffix),
                    "Embedding",
                    embedding_class(self.model, pooling),
                ))

        for label, step_name, vectorizer in variants:
            pipeline = Pipeline([
                (step_name, vectorizer),
                ('clf', OneVsRestClassifier(clf_class())),
            ])
            all_models.append(
                ("{} {}".format(clf_label, label), pipeline))

    return all_models
def gridsearch_with_classifiers(
        sample, vect,
        w2v_path='/home/anne/tmpanne/RPA/w2v_models/'
                 'w2v_300d2000-01-01_2018-12-31'):
    """Grid-search four classifiers on one vectorizer and report results.

    Loads the data via ``get_data()``, optionally restricts it to a
    sub-sample, splits it 80/20 (``random_state=0``) on the
    ``'text_clean'`` / ``'main_topic_label'`` columns, and runs a 5-fold
    ``GridSearchCV`` for PassiveAggressive, SGD, SVC and ExtraTrees, each
    in a ``vect -> clf`` pipeline.

    Args:
        sample: ``'newspaper_sample_only'``, ``'pq_sample_only'`` or
            ``'RPA_sample'`` to filter the data; any other value keeps
            the full dataframe.
        vect: ``'tfidf'``, ``'count'``, ``'w2v_count'`` or
            ``'w2v_tfidf'``.
        w2v_path: location of the saved gensim Word2Vec model; only read
            for the two ``w2v_*`` vectorizers. Defaults to the path that
            was previously hard-coded.

    Returns:
        ``(class_report, results)``: two lists with one entry per
        classifier. ``class_report`` holds the ``classification_report``
        dicts, extended with ``'classifier:'`` and ``'best estimators:'``
        keys. ``results`` holds dicts with ``"predicted"``, ``"actual"``
        and ``"classifier"``.

    Raises:
        ValueError: if ``vect`` is not a supported vectorizer name
            (previously an ``UnboundLocalError`` after the data had
            already been loaded and split).
    """
    supported = ('tfidf', 'count', 'w2v_count', 'w2v_tfidf')
    if vect not in supported:
        raise ValueError(
            "unknown vectorizer {!r}; expected one of {}".format(
                vect, ", ".join(supported)))

    df = get_data()
    print("this is length of the dataframe: {}".format(len(df)))
    logging.info('getting the data. keeping sample: {}'.format(sample))

    if sample == 'newspaper_sample_only':
        df = df[df['type'] == 'newspaper']
    elif sample == 'pq_sample_only':
        df = df[df['type'] == 'parlementary question']
    elif sample == 'RPA_sample':
        df = df[df['origin'] == 'RPA']

    logging.info("the vectorizers is: {}".format(vect))
    if vect == 'tfidf':
        vectorizer = TfidfVectorizer()
    elif vect == 'count':
        vectorizer = CountVectorizer()
    else:
        # Both embedding variants need the same word -> vector lookup.
        w2v = gensim.models.Word2Vec.load(w2v_path)
        embeddings = dict(zip(w2v.wv.index2word, w2v.wv.syn0))
        if vect == 'w2v_count':
            vectorizer = embeddingvectorizer.EmbeddingCountVectorizer(
                embeddings, 'mean')
        else:
            vectorizer = embeddingvectorizer.EmbeddingTfidfVectorizer(
                embeddings, 'mean')

    logging.info('total size df: {}'.format(len(df)))

    X_train, X_test, y_train, y_test = train_test_split(
        df['text_clean'], df['main_topic_label'],
        test_size=0.2, random_state=0)

    names = ["Passive Agressive", "SGDClassifier", "SVM", "ET"]
    classifiers = [
        PassiveAggressiveClassifier(),
        SGDClassifier(),
        SVC(),
        ExtraTreesClassifier(),
    ]
    # One parameter grid per classifier, in the same order as above.
    parameters = [
        {
            'clf__loss': ('hinge', 'squared_hinge'),
            'clf__C': (0.01, 0.5, 1.0),
            'clf__fit_intercept': (True, False),
            'clf__max_iter': (5, 10, 15),
        },
        {
            'clf__max_iter': (20, 30),
            'clf__alpha': (1e-2, 1e-3, 1e-5),
            'clf__penalty': ('l2', 'elasticnet'),
        },
        {
            'clf__C': [1, 10, 100, 1000],
            'clf__gamma': [0.001, 0.0001],
            'clf__kernel': ['rbf', 'linear'],
        },
        {
            "clf__max_features": ['auto', 'sqrt', 'log2'],
        },
    ]

    class_report = []
    results = []
    for name, classifier, params in zip(names, classifiers, parameters):
        print(name)
        print(classifier)
        print(params)

        clf_pipe = Pipeline([
            ('vect', vectorizer),
            ('clf', classifier),
        ])
        gs_clf = GridSearchCV(clf_pipe, param_grid=params, cv=5)

        logging.info("Starting gridsearch....")
        clf = gs_clf.fit(X_train, y_train)

        score = clf.score(X_test, y_test)
        print("{} score: {}".format(name, score))
        print("{} are the best estimators".format(clf.best_estimator_))

        # Predict once; the fitted search delegates to its best estimator.
        y_hats = clf.predict(X_test)

        # classification_report expects (y_true, y_pred). The previous
        # call passed them the other way round, which swaps precision
        # and recall and reports support for the predictions.
        results_to_dict = classification_report(
            y_test, y_hats, output_dict=True)
        results_to_dict['classifier:'] = name
        results_to_dict['best estimators:'] = clf.best_params_
        print("Created dictionary with classification report: \n\n{}".format(
            results_to_dict))

        results.append({
            "predicted": y_hats,
            "actual": y_test.values,
            "classifier": name,
        })
        class_report.append(results_to_dict)

    return class_report, results
('tfidf', TfidfVectorizer()), ('clf', OneVsRestClassifier(SGDClassifier())), ]) SGD_count_pipeline = Pipeline([ ('count', CountVectorizer()), ('clf', OneVsRestClassifier(SGDClassifier())), ]) SGD_count_embedding_pipeline = Pipeline([ ("Embedding", embeddingvectorizer.EmbeddingCountVectorizer(MDL, 'mean')), ('clf', OneVsRestClassifier(SGDClassifier())), ]) SGD_tfidf_embedding_pipeline = Pipeline([ ("Embedding", embeddingvectorizer.EmbeddingTfidfVectorizer(MDL, 'mean')), ('clf', OneVsRestClassifier(SGDClassifier())), ]) SVC_tfidf_pipeline = Pipeline([ ('tfidf', TfidfVectorizer()), ('clf', OneVsRestClassifier(SVC())), ]) SVC_count_pipeline = Pipeline([ ('count', CountVectorizer()), ('clf', OneVsRestClassifier(SVC())), ]) SVC_count_embedding_pipeline = Pipeline([ ("Embedding", embeddingvectorizer.EmbeddingCountVectorizer(MDL, 'mean')),