Example #1
def build_model():
    """
    Create a pipeline combining feature extraction, classification, and grid search.

    Returns:
        A scikit-learn estimator (a GridSearchCV over the pipeline) that provides
        `fit()` and `predict()`.
    """
    pipeline = Pipeline([
        ("ct",
         ColumnTransformer([
             ("tfidf",
              Pipeline([
                  ("countVectorizer", CountVectorizer(tokenizer=tokenize)),
                  ("tfidfTransformer", TfidfTransformer()),
              ]), "message"),
             ("msg2Vec",
              Pipeline([
                  ("tokenizer",
                   FunctionTransformer(func=vector_tokenize, validate=False)),
                  ("d2v", D2VTransformer(min_count=1, seed=1)),
              ]), "message"),
             ("centroidDistance",
              Pipeline([
                  ("tokenizer",
                   FunctionTransformer(func=vector_tokenize, validate=False)),
                  ("tcd", TopicCentroidDistance()),
              ]), "message"),
             ("genre_onehot", OneHotEncoder(dtype="int"), ["genre"])
         ])), ("clf", MultiOutputClassifier(RandomForestClassifier()))
    ])
    parameters = dict(clf__estimator__n_estimators=[10, 50],
                      ct__centroidDistance__tcd__vector_size=[10, 50, 75],
                      ct__msg2Vec__d2v__size=[10, 50, 75])

    return GridSearchCV(pipeline, parameters, scoring=f1_scoring, n_jobs=3)
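For context, a minimal usage sketch of the returned estimator; `X_train`, `Y_train`, and `X_test` are assumed placeholders, with DataFrame columns (`message`, `genre`) matching the ColumnTransformer above:

# Hypothetical usage: X_* are DataFrames with "message" and "genre" columns,
# Y_train is a multi-label indicator array (one column per category)
model = build_model()
model.fit(X_train, Y_train)     # runs the grid search over all parameter combinations
Y_pred = model.predict(X_test)  # predicts with the best refitted estimator
print(model.best_params_)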
Example #2
def run_test1(normas):
    models = [[('cv', CountVectorizer(min_df=20, max_df=0.5))],
              [('tfidf', TfidfVectorizer(min_df=20, max_df=0.5))],
              [('tokenize', Tokenizador()),
               ('d2v', D2VTransformer(dm=0, min_count=100, size=200,
                                      workers=6))]]

    clfs = [{
        'clf': ('dt', DecisionTreeClassifier()),
        'params': {
            'dt__min_samples_split': [0.005, 0.010, 2],
            'dt__max_depth': [16, 32, None]
        }
    }, {
        'clf': ('rf', RandomForestClassifier()),
        'params': {
            'rf__n_estimators': [100, 110, 120],
            'rf__min_samples_split': [0.005, 0.010, 2],
            'rf__min_samples_leaf': [5, 3, 1]
        }
    }, {
        'clf': ('mlknn', MLkNN()),
        'params': {
            'mlknn__k': [6, 8, 10, 12],
            'mlknn__s': [0.5, 1.0, 1.5, 2.0]
        }
    }, {
        'clf': ('mlp', MLPClassifier()),
        'params': {
            'mlp__hidden_layer_sizes': [(150,), (100, 100), (50, 50, 50)],  # a one-layer size must still be a tuple
            'mlp__activation': ['tanh', 'relu'],
            'mlp__solver': ['sgd', 'adam']
        }
    }]
    run(normas, models, clfs)
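`run` itself is not shown in this example; a plausible reconstruction, under the assumption that it pairs every feature pipeline with every classifier and grid-searches the classifier parameters (`normas` exposing `.X` and `.y` is likewise an assumption):

def run(normas, models, clfs):
    # Hypothetical sketch: normas.X holds the documents, normas.y the labels
    for steps in models:
        for entry in clfs:
            pipe = Pipeline(steps + [entry['clf']])
            grid = GridSearchCV(pipe, entry['params'], cv=5, n_jobs=-1)
            grid.fit(normas.X, normas.y)
            print(steps[-1][0], entry['clf'][0], grid.best_score_)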
Example #3
    def __init__(self, vector_size=10):
        """
        Initialize transformer with an internal D2VTransformer.

        Args:
            vector_size: Size of vectors produced by Doc2Vec.
        """
        self.vector_size = vector_size
        self.d2v = D2VTransformer(size=vector_size, min_count=1, seed=1)
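A caveat with this pattern: because the inner D2VTransformer is created in `__init__`, `set_params(vector_size=...)` will not rebuild it, which matters when the transformer sits inside a grid search (as with `tcd__vector_size` in Example #1). A sketch of `fit`/`transform` that defers construction to fit time; these two methods are an assumption, not the original code:

    def fit(self, X, y=None):
        # Rebuild the Doc2Vec wrapper here so a grid-searched vector_size takes effect
        self.d2v = D2VTransformer(size=self.vector_size, min_count=1, seed=1)
        self.d2v.fit(X)
        return self

    def transform(self, X):
        return self.d2v.transform(X)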
Example #4
    def __init__(self, directory_path: str, model_store_path: str):
        self.directory_path = directory_path
        self.model_store_path = model_store_path
        self.classifier = RandomForestClassifier(n_estimators=1000,
                                                 random_state=0)
        self.vectorizer = D2VTransformer(min_count=1,
                                         size=5,
                                         dm=1,
                                         dm_concat=1)
        self.logger = logging.getLogger(__name__)
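A sketch of how these attributes might be used for training and persistence; the `train` method, its token-list input format, and the pickle-based storage are assumptions, not the original code (`import pickle` is needed):

    def train(self, documents, labels):
        # documents: list of token lists; labels: parallel list of class labels (assumed)
        vectors = self.vectorizer.fit_transform(documents)
        self.classifier.fit(vectors, labels)
        with open(self.model_store_path, 'wb') as f:
            pickle.dump(self.classifier, f)
        self.logger.info("Model stored at %s", self.model_store_path)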
Example #5
    def get_vk_group_prediction(self, payload: dict):
        """
        Метод для опредления типа группы ВК
        :param payload: тело запроса
        """

        username = payload["username"]
        link = payload["link"]

        data = {"username": username, "link": link, "link_type": "VK_GROUP"}

        link_topic = app.config.kafka.postgres_topics[0]

        kafka_producer.send(topic=link_topic, value=data)
        kafka_producer.flush()

        model_path = os.path.join(
            base_path, f"{app.config.management.influx.model_store_path}",
            "group_model_data.dump")

        with open(model_path, 'rb') as f:  # close the file handle after loading
            classifier: RandomForestClassifier = pickle.load(f)
        vectorizer = D2VTransformer(min_count=1, size=5, dm=1, dm_concat=1)

        link = link.replace("https://vk.com/public", "-")
        self.logger.info(f"LINK: {link}")

        vk_resp = vk_api.wall.get(owner_id=int(link), count=1)

        self.logger.info(f"Получен ответ от ВК: {vk_resp}")

        item = vk_resp["items"][0]

        # Note: fit_transform re-trains Doc2Vec on this single document, so the
        # resulting vector space will not match the one the pickled classifier saw
        text_to_classify = vectorizer.fit_transform(
            [TextPrettifier.get_prepared_text(item["text"])])

        result_index = classifier.predict(text_to_classify)[0]

        current_time = datetime.datetime.now()

        dict_to_send = {
            "group_id": link,
            "creation_time": current_time.strftime("%d.%m.%Y %H:%M:%S"),
            "post_text": ClassificationUtils.clear_post_text(item["text"]),
            "post_type": self.categories[result_index],
            "post_id": str(item["id"]),
            "username": username
        }

        kafka_producer.send(topic=self.classification_topic,
                            value=dict_to_send)
        kafka_producer.flush()

        return self.categories[result_index]
Example #6
def get_pipeline(classifier, method):
    pipe = []
    if method == 'D2V':
        pipe.append(('d2v', D2VTransformer(window=1, workers=4)))
    elif method == 'BoW':
        pipe.append(('bow', CountVectorizer(stop_words='english')))
    elif method == 'SVD':
        pipe.append(('bow', CountVectorizer(stop_words='english')))
        pipe.append(('svd', TruncatedSVD(n_components=150)))
    else:
        pipe.append(
            ('tfidf', TfidfVectorizer(stop_words='english',
                                      max_features=50000)))

    if classifier == "SVM":
        pipe.append(('svm', svm.LinearSVC(max_iter=1000)))
    elif classifier == "Random Forest":
        pipe.append(('random_forest',
                     RandomForestClassifier(n_estimators=200, n_jobs=-1)))
    else:
        pipe.append(('naive_bayes', MultinomialNB()))
    return Pipeline(pipe)
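A usage sketch; note that the 'D2V' branch expects tokenized documents (lists of words), while the CountVectorizer/TfidfVectorizer branches expect raw strings, so the caller has to match the input format to `method` (`docs_raw` and `labels` are assumed placeholders):

pipe = get_pipeline("SVM", "SVD")
pipe.fit(docs_raw, labels)

# The Doc2Vec variant needs pre-tokenized documents
pipe_d2v = get_pipeline("Random Forest", "D2V")
pipe_d2v.fit([doc.split() for doc in docs_raw], labels)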
Example #7
# ------------------------------------------------ PREPARING THE 10-FOLD VALIDATION ------------------------------------------------ #
kfold = KFold(n_splits=10, shuffle=True, random_state=42)  # random_state only takes effect with shuffle=True
scoring = {'accuracy': make_scorer(accuracy_score),
           'precision': make_scorer(precision_score, average='macro'),  # multiclass-safe averaging
           'recall': make_scorer(recall_score, average='macro'),
           'f1_score': make_scorer(f1_score, average='macro')}

test_df = pd.read_csv("./../data/test_set.csv", sep = "\t")
df = pd.read_csv("./../data/train_set.csv", sep = "\t")
df['Content'] = df.Content.map(
    lambda x: x.lower().translate(str.maketrans('', '', string.punctuation)))
X_train, X_test, y_train, y_test = train_test_split(df['Content'], df['Category'],
                                                    random_state=1)

# ------------------------------------------------ FEATURES ------------------------------------------------ #
count_vector = CountVectorizer(stop_words = 'english')
# svd = TruncatedSVD(n_components = 90, algorithm = 'arpack')
doc2vec = D2VTransformer(min_count=1, size=5)

# ------------------------------------------------ CLASSIFIERS ------------------------------------------------ #
randomForestClassifier = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=0)
svclassifier = SVC(kernel='linear')

# BAG OF WORDS
training_data_bow = count_vector.fit_transform(X_train)
testing_data_bow = count_vector.transform(X_test)  # reuse the vocabulary fitted on the training data

## SVD
# training_data_svd = svd(X_train, full_matrices=False)
# testing_data_svd = svd(X_test, full_matrices=False)

## DOC 2 VECTOR
# training_data_d2v = doc2vec.fit_transform(X_train)
Example #8
    def TestWorksWithIterableNotHavingElementWithZeroIndex(self):
        a = IterableWithoutZeroElement(common_texts)
        transformer = D2VTransformer(min_count=1, size=5)
        transformer.fit(a)
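`IterableWithoutZeroElement` is not shown here; presumably it emulates an iterable (e.g. a pandas Series) whose index does not start at 0, so the transformer cannot peek at element 0 and must iterate instead. A minimal sketch of such a fixture (an assumption, not the original class):

class IterableWithoutZeroElement(list):
    def __getitem__(self, key):
        # Emulate the absence of an item with key 0
        if key == 0:
            raise KeyError("no element with key 0")
        return super().__getitem__(key)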
Example #9
data_path = "/home/philipp/projects/dad4td/data/processed/20_news_imdb.pkl"

d = dict(data_frac=0.1, contamination=0.1, seed=42)

# prepare data
df = pd.read_pickle(data_path)
df = sample_data(df, **d)

df["outlier_label"].value_counts().to_string().replace("\n", "\t")
# %%
import pandas as pd
from gensim.models.doc2vec import TaggedDocument
from gensim.sklearn_api import D2VTransformer

data_path = "/home/philipp/projects/dad4td/data/processed/20_news_imdb.pkl"

d = dict(data_frac=0.1, contamination=0.1, seed=42)

# prepare data
df = pd.read_pickle(data_path)
df = sample_data(df, **d)

X_tagged = [TaggedDocument(doc, [str(i)]) for i, doc in df["text"].items()]  # tags must be a list

doc_vecs = D2VTransformer(seed=d["seed"], min_count=25).fit_transform(X_tagged)

print(doc_vecs)
print(doc_vecs.shape)

# %%
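Given the `contamination` setting and the `outlier_label` column, a natural next step is to score these document vectors with an off-the-shelf detector; the detector choice and the evaluation below are assumptions, not part of the original:

from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score

iso = IsolationForest(contamination=d["contamination"], random_state=d["seed"])
scores = -iso.fit(doc_vecs).score_samples(doc_vecs)  # higher score = more anomalous
# assumes outlier_label is binary with 1 marking outliers
print(roc_auc_score(df["outlier_label"], scores))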
Example #10
# Transform the dataset from a pandas Series to a Python list of lists,
# which is the input format the Doc2Vec fit_transform method expects
X_train = [document.split() for document in X_train]

params = {'d2v__dm': [0],
          'd2v__size': [50, 100, 300],
          'd2v__window': [2, 5, 8],
          'd2v__iter': [1, 5, 15, 30],
          'd2v__seed': [42],
          'd2v__workers': [16],
          'clf__n_jobs': [-1],
          'clf__solver': ['liblinear'],
          'clf__multi_class': ['ovr'],
          'clf__C': [0.001, 1, 4],
          'clf__random_state': [42]}

pipe = Pipeline([('d2v', D2VTransformer()),
                 ('clf', LogisticRegression())
                ])

grid = GridSearchCV(pipe, params, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

print(grid.best_score_)
print(grid.cv_results_)

results = pd.DataFrame(grid.cv_results_)
results.to_json('./d2v_outfile_05.json')
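To reuse the winning configuration later, the refitted best pipeline can be persisted as well; a short sketch (the file name is an assumption):

import joblib

# GridSearchCV refits the best pipeline on the full training data by default
joblib.dump(grid.best_estimator_, './d2v_best_pipeline.joblib')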

encoded_y_test = label_encoder.transform(y_test)
print(encoded_y_train[0])

# In[6]:

print(len(label_encoder.classes_))
for i, element in enumerate(label_encoder.classes_):
    print(i, element)

# Classification of the data with gensim

# In[40]:

vectorizer = D2VTransformer(dm=0,
                            window=10,
                            iter=20,
                            size=100,
                            min_count=4,
                            sample=0)

# In[41]:

text_clf = Pipeline([('vect', vectorizer),
                     ('clf',
                      MLPClassifier(hidden_layer_sizes=(4096, 1024),
                                    validation_fraction=0.1,
                                    early_stopping=True,
                                    verbose=True,
                                    random_state=1))])
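The next cell (In[42]) is truncated in this example; presumably it trains the pipeline. A minimal sketch, assuming `X_train` is a list of token lists and `encoded_y_train`/`encoded_y_test` come from the label encoder above:

text_clf.fit(X_train, encoded_y_train)
print(text_clf.score(X_test, encoded_y_test))  # assumes X_test is tokenized the same way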

# In[42]:
Example #11
                                           ngram_range=(4, 4),
                                           lowercase=False,
                                           min_df=1,
                                           max_df=0.8)
    bigram_vectorizer = CountVectorizer(analyzer='char',
                                        ngram_range=(1, 2),
                                        lowercase=False,
                                        min_df=4,
                                        max_df=0.8)
    tfidf_ngram = TfidfVectorizer(ngram_range=(1, 1),
                                  sublinear_tf=True,
                                  min_df=0.1,
                                  max_df=0.6)
    tfidf_transformer = TfidfTransformer(sublinear_tf=True)
    tsvd = TruncatedSVD(random_state=2016, n_components=200, n_iter=5)
    doc2vec = D2VTransformer()

    all_features = [
        ('ARI', digit_feature(key='ari')),
        ('GFI', digit_feature(key='gfi')),
        ('smog', digit_feature(key='smog')),
        ('Num. unique words', digit_feature(key='unique_words')),
        ('eGeMAPS ADR', digit_features(key='eGeMAPS')),
        ('MFCC ADR', digit_features(key='MFCC12')),
        ('MFCC', digit_features(key='mfcc')),
        ('duration', digit_features(key='duration')),
        ('Unigram',
         pipeline.Pipeline([('s1', text_col(key='!PAR')),
                            ('tfidf_unigram', tfidf_unigram)])),
        ('Bigram',
         pipeline.Pipeline([('s2', text_col(key='!PAR')),
Example #13
                                                    test_size=0.2,
                                                    stratify=y)

# Transform the dataset from a pandas Series to a Python list of lists,
# which is the input format the Doc2Vec fit_transform method expects
X_train = [document.split() for document in X_train]

params = {
    'd2v__dm': [1],
    'd2v__size': [50, 100, 300],
    'd2v__window': [2, 5, 8],
    'd2v__iter': [1, 5, 15, 30],
    'd2v__seed': [42],
    'd2v__workers': [16],
    'clf__solver': ['saga'],
    'clf__multi_class': ['multinomial'],
    'clf__C': [0.001, 1, 4],
    'clf__random_state': [42]
}

pipe = Pipeline([('d2v', D2VTransformer()), ('clf', LogisticRegression())])

grid = GridSearchCV(pipe, params, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

print(grid.best_score_)
print(grid.cv_results_)

results = pd.DataFrame(grid.cv_results_)
results.to_json('./d2v_outfile_05.json')
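As in Example #10, the search results end up in a JSON file; to actually classify held-out text with the winner, something like the following would follow (tokenizing `X_test` the same way as the training side is an assumption):

print(grid.best_params_)
X_test_tokens = [document.split() for document in X_test]
y_pred = grid.best_estimator_.predict(X_test_tokens)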