import pkg_resources
from pickle import Pickler, Unpickler

import arff
import nltk
from pandas import DataFrame
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (classification_report, make_scorer,
                             precision_recall_fscore_support as score)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

# MODEL_LIST is expected to be defined elsewhere in this module as a list of
# dicts with 'model' (an estimator) and 'param' (its grid-search parameters).


class HateCl:
    def __init__(self):
        print(pkg_resources.resource_filename('hate_cl', 'randomforest.sav'))
        classifier_file = open(
            pkg_resources.resource_filename('hate_cl', 'randomforest.sav'),
            'rb')
        self.classifier = Unpickler(classifier_file).load()
        self.df = self.load_df()
        self.X = self.df['sentence'].tolist()
        self.y = self.df['hate'].tolist()
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.33, random_state=42)

    def load_df(self):
        # Load the OffComBR3 ARFF dataset and map the 'hate' label to 0/1.
        off_com_file = open(
            pkg_resources.resource_filename('hate_cl', 'OffComBR3.arff'))
        data = arff.load(off_com_file)
        df = DataFrame(data['data'])
        df.columns = ['hate', 'sentence']
        df['hate'] = df['hate'].apply(lambda x: 1 if x == 'yes' else 0)
        return df

    def predict(self, text):
        # Return the probability that `text` belongs to the positive (hate) class.
        classified_text = self.classifier.predict_proba([text])[:, 1]
        return classified_text[0]

    def get_samples(self):
        # Load the pickled samples DataFrame shipped with the package.
        df = Unpickler(
            open(
                pkg_resources.resource_filename('hate_cl', 'hate_cl/data.sav'),
                'rb')).load()
        return df

    def refit(self, samples):
        # Append the new labelled samples to the stored DataFrame, retrain the
        # random forest pipeline and persist both the classifier and the data.
        df = Unpickler(
            open(
                pkg_resources.resource_filename('hate_cl', 'hate_cl/data.sav'),
                'rb')).load()
        aux_df = DataFrame(samples, columns=['hate', 'sentence'])
        df = df.append(aux_df, ignore_index=True)
        print(df)
        X = df['sentence'].tolist()
        y = df['hate'].tolist()
        cl = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(1, 4))),
                       ('clf', RandomForestClassifier(n_estimators=100,
                                                      max_depth=None,
                                                      min_samples_leaf=1,
                                                      min_samples_split=2,
                                                      min_weight_fraction_leaf=0))])
        cl.fit(X, y)
        self.classifier = cl
        cl_filename = pkg_resources.resource_filename(
            'hate_cl', 'hate_cl/randomforest.sav')
        df_filename = pkg_resources.resource_filename('hate_cl',
                                                      'hate_cl/data.sav')
        f = open(cl_filename, 'wb')
        Pickler(f).dump(cl)
        f.close()
        f = open(df_filename, 'wb')
        Pickler(f).dump(df)
        f.close()

    # method that uses fit on every model with the defined dataset
    def fit_all(self, dataset):
        pass

    def get_recal_1(self, y_true, y_pred):
        # Score predictions by the recall of the positive (hate) class.
        precision, recall, fscore, support = score(y_true, y_pred)
        print(classification_report(y_true, y_pred))
        return recall[1]

    def create_best_cl(self, model, params):
        # Grid-search `params` over `model`, optimising recall on the hate class.
        gs_clf = GridSearchCV(model,
                              params,
                              cv=5,
                              iid=False,
                              n_jobs=-1,
                              scoring=make_scorer(self.get_recal_1))
        gs_clf.fit(self.X_train, self.y_train)
        print(gs_clf.best_score_)
        print(gs_clf.best_params_)
        return gs_clf

    def grid_search_all(self):
        # Run the grid search for every candidate model and pickle each pipeline.
        nltk.download('stopwords')
        stopwords = nltk.corpus.stopwords.words('portuguese')
        for model in MODEL_LIST:
            cl = Pipeline([('tfidf',
                            TfidfVectorizer(strip_accents='ascii',
                                            lowercase=True,
                                            stop_words=stopwords)),
                           ('clf', model['model'])])
            gs_clf = self.create_best_cl(cl, model['param'])
            pred = gs_clf.best_estimator_.predict(self.X_test)
            print(classification_report(self.y_test, pred))
            # saving cl in file
            cl_filename = pkg_resources.resource_filename(
                'hate_cl', model['model'].__class__.__name__ + '.sav')
            f = open(cl_filename, 'wb')
            Pickler(f).dump(cl)
            f.close()

    def create_committee(self):
        pass

    def import_commitee(self):
        pass
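

# Usage sketch (an assumption, not part of the original module): instantiating
# HateCl loads the pickled random forest and the OffComBR3 data bundled with
# the installed hate_cl package, so those package resources must be present.
# The example sentences and labels below are illustrative only; refit() takes
# (hate, sentence) pairs matching the DataFrame columns used above.
if __name__ == '__main__':
    cl = HateCl()
    # Probability of the positive (hate) class for a single sentence.
    print(cl.predict('frase de exemplo para classificar'))
    # Retrain with two hypothetical labelled samples and persist the result.
    cl.refit([(1, 'exemplo de frase ofensiva'),
              (0, 'exemplo de frase neutra')])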