def build_model():
    """
    Create a pipeline including feature extraction, classification and grid search.

    Returns:
        A sklearn model pipeline which provides `fit()` and `predict()`.
    """
    pipeline = Pipeline([
        ("ct", ColumnTransformer([
            ("tfidf", Pipeline([
                ("countVectorizer", CountVectorizer(tokenizer=tokenize)),
                ("tfidfTransformer", TfidfTransformer()),
            ]), "message"),
            ("msg2Vec", Pipeline([
                ("tokenizer", FunctionTransformer(func=vector_tokenize, validate=False)),
                ("d2v", D2VTransformer(min_count=1, seed=1)),
            ]), "message"),
            ("centroidDistance", Pipeline([
                ("tokenizer", FunctionTransformer(func=vector_tokenize, validate=False)),
                ("tcd", TopicCentroidDistance()),
            ]), "message"),
            ("genre_onehot", OneHotEncoder(dtype="int"), ["genre"]),
        ])),
        ("clf", MultiOutputClassifier(RandomForestClassifier())),
    ])

    parameters = dict(clf__estimator__n_estimators=[10, 50],
                      ct__centroidDistance__tcd__vector_size=[10, 50, 75],
                      ct__msg2Vec__d2v__size=[10, 50, 75])

    return GridSearchCV(pipeline, parameters, scoring=f1_scoring, n_jobs=3)
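# A minimal usage sketch (assumptions: `df` is a DataFrame with "message" and
# "genre" columns, `Y` holds the multi-output labels, and the project's helpers
# `tokenize`, `vector_tokenize`, `TopicCentroidDistance` and `f1_scoring` are
# importable; every name other than build_model is illustrative only):
#
#     model = build_model()
#     model.fit(df[["message", "genre"]], Y)
#     predictions = model.predict(df[["message", "genre"]])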
def run_test1(normas):
    models = [
        [('cv', CountVectorizer(min_df=20, max_df=0.5))],
        [('tfidf', TfidfVectorizer(min_df=20, max_df=0.5))],
        [('tokenize', Tokenizador()),
         ('d2v', D2VTransformer(dm=0, min_count=100, size=200, workers=6))],
    ]
    clfs = [
        {
            'clf': ('dt', DecisionTreeClassifier()),
            'params': {
                'dt__min_samples_split': [0.005, 0.010, 2],
                'dt__max_depth': [16, 32, None]
            }
        },
        {
            'clf': ('rf', RandomForestClassifier()),
            'params': {
                'rf__n_estimators': [100, 110, 120],
                'rf__min_samples_split': [0.005, 0.010, 2],
                'rf__min_samples_leaf': [5, 3, 1]
            }
        },
        {
            'clf': ('mlknn', MLkNN()),
            'params': {
                'mlknn__k': [6, 8, 10, 12],
                'mlknn__s': [0.5, 1.0, 1.5, 2.0]
            }
        },
        {
            'clf': ('mlp', MLPClassifier()),
            'params': {
                'mlp__hidden_layer_sizes': [(150,), (100, 100), (50, 50, 50)],
                'mlp__activation': ['tanh', 'relu'],
                'mlp__solver': ['sgd', 'adam']
            }
        }
    ]
    run(normas, models, clfs)
def __init__(self, vector_size=10):
    """
    Initialize transformer with an internal D2VTransformer.

    Args:
        vector_size: Size of vectors produced by Doc2Vec.
    """
    self.vector_size = vector_size
    self.d2v = D2VTransformer(size=vector_size, min_count=1, seed=1)
def __init__(self, directory_path: str, model_store_path: str):
    self.directory_path = directory_path
    self.model_store_path = model_store_path
    self.classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
    self.vectorizer = D2VTransformer(min_count=1, size=5, dm=1, dm_concat=1)
    self.logger = logging.getLogger(__name__)
def get_vk_group_prediction(self, payload: dict):
    """
    Determine the type of a VK group.

    :param payload: request body
    """
    username = payload["username"]
    link = payload["link"]
    data = {"username": username, "link": link, "link_type": "VK_GROUP"}
    link_topic = app.config.kafka.postgres_topics[0]
    kafka_producer.send(topic=link_topic, value=data)
    kafka_producer.flush()

    model_path = os.path.join(
        base_path,
        f"{app.config.management.influx.model_store_path}",
        "group_model_data.dump")
    classifier: RandomForestClassifier = pickle.load(open(model_path, 'rb'))
    vectorizer = D2VTransformer(min_count=1, size=5, dm=1, dm_concat=1)

    link = link.replace("https://vk.com/public", "-")
    self.logger.info(f"LINK: {link}")
    vk_resp = vk_api.wall.get(owner_id=int(link), count=1)
    self.logger.info(f"Received response from VK: {vk_resp}")

    item = vk_resp["items"][0]
    text_to_classify = vectorizer.fit_transform(
        [TextPrettifier.get_prepared_text(item["text"])])
    result_index = classifier.predict(text_to_classify)[0]

    current_time = datetime.datetime.now()
    dict_to_send = {
        "group_id": link,
        "creation_time": current_time.strftime("%d.%m.%Y %H:%M:%S"),
        "post_text": ClassificationUtils.clear_post_text(item["text"]),
        "post_type": self.categories[result_index],
        "post_id": str(item["id"]),
        "username": username
    }
    kafka_producer.send(topic=self.classification_topic, value=dict_to_send)
    kafka_producer.flush()
    return self.categories[result_index]
def get_pipeline(classifier, method):
    pipe = []
    if method == 'D2V':
        pipe.append(('d2v', D2VTransformer(window=1, workers=4)))
    elif method == 'BoW':
        pipe.append(('bow', CountVectorizer(stop_words='english')))
    elif method == 'SVD':
        pipe.append(('bow', CountVectorizer(stop_words='english')))
        pipe.append(('svd', TruncatedSVD(n_components=150)))
    else:
        pipe.append(
            ('tfidf', TfidfVectorizer(stop_words='english', max_features=50000)))

    if classifier == "SVM":
        pipe.append(('svm', svm.LinearSVC(max_iter=1000)))
    elif classifier == "Random Forest":
        pipe.append(('random_forest',
                     RandomForestClassifier(n_estimators=200, n_jobs=-1)))
    else:
        pipe.append(('naive_bayes', MultinomialNB()))

    return Pipeline(pipe)
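# A minimal usage sketch (assumption: `X_train`/`y_train`/`X_test` exist in the
# caller's scope; note that the 'D2V' branch expects tokenised documents, i.e.
# lists of tokens, while the CountVectorizer/TfidfVectorizer branches expect raw
# strings):
#
#     pipe = get_pipeline("SVM", "BoW")
#     pipe.fit(X_train, y_train)
#     predicted = pipe.predict(X_test)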
# ------------------------------------------------ PREPARING THE 10-FOLD VALIDATION ------------------------------------------------ #
kfold = KFold(n_splits=10, shuffle=True, random_state=42)  # random_state only takes effect with shuffle=True
scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(precision_score),
           'recall': make_scorer(recall_score),
           'f1_score': make_scorer(f1_score)}

test_df = pd.read_csv("./../data/test_set.csv", sep="\t")
df = pd.read_csv("./../data/train_set.csv", sep="\t")
df['Content'] = df.Content.map(
    lambda x: x.lower().translate(str.maketrans('', '', string.punctuation)))

X_train, X_test, y_train, y_test = train_test_split(df['Content'], df['Category'], random_state=1)

# ------------------------------------------------ FEATURES ------------------------------------------------ #
count_vector = CountVectorizer(stop_words='english')
# svd = TruncatedSVD(n_components=90, algorithm='arpack')
doc2vec = D2VTransformer(min_count=1, size=5)

# ------------------------------------------------ CLASSIFIERS ------------------------------------------------ #
randomForestClassifier = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)
svclassifier = SVC(kernel='linear')

# BAG OF WORDS
training_data_bow = count_vector.fit_transform(X_train)
testing_data_bow = count_vector.transform(X_test)  # transform only: reuse the vocabulary fitted on the training split

## SVD
# training_data_svd = svd(X_train, full_matrices=False)
# testing_data_svd = svd(X_test, full_matrices=False)

## DOC 2 VECTOR
# training_data_d2v = doc2vec.fit_transform(X_train)
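# A hedged completion of the Doc2Vec branch above (assumption: D2VTransformer
# expects tokenised documents, so the raw strings are split first, and the
# fitted transformer is reused on the test split rather than re-fitted):
#
#     training_data_d2v = doc2vec.fit_transform([doc.split() for doc in X_train])
#     testing_data_d2v = doc2vec.transform([doc.split() for doc in X_test])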
def TestWorksWithIterableNotHavingElementWithZeroIndex(self):
    a = IterableWithoutZeroElement(common_texts)
    transformer = D2VTransformer(min_count=1, size=5)
    transformer.fit(a)
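# For context, a hypothetical stand-in for such an iterable (not necessarily the
# project's actual IterableWithoutZeroElement): an object that supports iteration
# but not positional indexing, e.g. a thin wrapper around a generator, so that
# `a[0]` raises while `for doc in a` still works:
#
#     class GeneratorBackedTexts:
#         def __init__(self, texts):
#             self._texts = texts
#
#         def __iter__(self):
#             return iter(self._texts)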
data_path = "/home/philipp/projects/dad4td/data/processed/20_news_imdb.pkl"
d = dict(data_frac=0.1, contamination=0.1, seed=42)

# prepare data
df = pd.read_pickle(data_path)
df = sample_data(df, **d)
df["outlier_label"].value_counts().to_string().replace("\n", "\t")

# %%
import pandas as pd
from gensim.models.doc2vec import TaggedDocument
from gensim.sklearn_api import D2VTransformer

data_path = "/home/philipp/projects/dad4td/data/processed/20_news_imdb.pkl"
d = dict(data_frac=0.1, contamination=0.1, seed=42)

# prepare data
df = pd.read_pickle(data_path)
df = sample_data(df, **d)

# tags must be a list; a bare string would be iterated character by character
X_tagged = [TaggedDocument(doc, [str(i)]) for i, doc in df["text"].items()]

doc_vecs = D2VTransformer(seed=d["seed"], min_count=25).fit_transform(X_tagged)

print(doc_vecs)
print(doc_vecs.shape)
# %%
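# A hedged follow-on sketch (assumption only: the document vectors feed an
# outlier detector, as suggested by the contamination / outlier_label fields;
# IsolationForest is an illustrative choice, not necessarily the project's model):
#
#     from sklearn.ensemble import IsolationForest
#     detector = IsolationForest(contamination=d["contamination"], random_state=d["seed"])
#     pred = detector.fit_predict(doc_vecs)  # -1 marks predicted outliers, 1 inliers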
# Transform the dataset from Pandas series to Python list of lists,
# which is the necessary input for the Doc2Vec fit_transform method
X_train = [document.split() for document in X_train]

params = {
    'd2v__dm': [0],
    'd2v__size': [50, 100, 300],
    'd2v__window': [2, 5, 8],
    'd2v__iter': [1, 5, 15, 30],
    'd2v__seed': [42],
    'd2v__workers': [16],
    'clf__n_jobs': [-1],
    'clf__solver': ['liblinear'],
    'clf__multi_class': ['ovr'],
    'clf__C': [0.001, 1, 4],
    'clf__random_state': [42]
}

pipe = Pipeline([('d2v', D2VTransformer()),
                 ('clf', LogisticRegression())])

grid = GridSearchCV(pipe, params, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

print(grid.best_score_)
print(grid.cv_results_)

results = pd.DataFrame(grid.cv_results_)
results.to_json('./d2v_outfile_05.json')
encoded_y_test = label_encoder.transform(y_test)
print(encoded_y_train[0])


# In[6]:

print(len(label_encoder.classes_))
for i, element in enumerate(label_encoder.classes_):
    print(i, element)


# Classification of the data with gensim

# In[40]:

vectorizer = D2VTransformer(dm=0, window=10, iter=20, size=100, min_count=4, sample=0)


# In[41]:

text_clf = Pipeline([('vect', vectorizer),
                     ('clf', MLPClassifier(hidden_layer_sizes=(4096, 1024),
                                           validation_fraction=0.1,
                                           early_stopping=True,
                                           verbose=True,
                                           random_state=1))])


# In[42]:
                                    ngram_range=(4, 4),
                                    lowercase=False, min_df=1, max_df=0.8)
bigram_vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 2),
                                    lowercase=False, min_df=4, max_df=0.8)
tfidf_ngram = TfidfVectorizer(ngram_range=(1, 1), sublinear_tf=True,
                              min_df=0.1, max_df=0.6)
tfidf_transformer = TfidfTransformer(sublinear_tf=True)
tsvd = TruncatedSVD(random_state=2016, n_components=200, n_iter=5)
doc2vec = D2VTransformer()

all_features = [
    ('ARI', digit_feature(key='ari')),
    ('GFI', digit_feature(key='gfi')),
    ('smog', digit_feature(key='smog')),
    ('Num. unique words', digit_feature(key='unique_words')),
    ('eGeMAPS ADR', digit_features(key='eGeMAPS')),
    ('MFCC ADR', digit_features(key='MFCC12')),
    ('MFCC', digit_features(key='mfcc')),
    ('duration', digit_features(key='duration')),
    ('Unigram', pipeline.Pipeline([('s1', text_col(key='!PAR')),
                                   ('tfidf_unigram', tfidf_unigram)])),
    ('Bigram', pipeline.Pipeline([('s2', text_col(key='!PAR')),
                                                    test_size=0.2, stratify=y)

# Transform the dataset from Pandas series to Python list of lists,
# which is the necessary input for the Doc2Vec fit_transform method
X_train = [document.split() for document in X_train]

params = {
    'd2v__dm': [1],
    'd2v__size': [50, 100, 300],
    'd2v__window': [2, 5, 8],
    'd2v__iter': [1, 5, 15, 30],
    'd2v__seed': [42],
    'd2v__workers': [16],
    'clf__solver': ['saga'],
    'clf__multi_class': ['multinomial'],
    'clf__C': [0.001, 1, 4],
    'clf__random_state': [42]
}

pipe = Pipeline([('d2v', D2VTransformer()),
                 ('clf', LogisticRegression())])

grid = GridSearchCV(pipe, params, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

print(grid.best_score_)
print(grid.cv_results_)

results = pd.DataFrame(grid.cv_results_)
results.to_json('./d2v_outfile_05.json')
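# The best hyper-parameter combination can also be inspected directly on the
# fitted search object (standard GridSearchCV attributes):
#
#     print(grid.best_params_)
#     best_pipe = grid.best_estimator_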