Example #1

import arff
import nltk
import pkg_resources
from pickle import Pickler, Unpickler

from pandas import DataFrame, concat
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, make_scorer
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
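MODEL_LIST is referenced by grid_search_all below but is not defined in this excerpt. A hypothetical sketch of its expected shape, inferred from how it is indexed ('model' and 'param' keys); the estimators and grids shown here are placeholders:

from sklearn.linear_model import LogisticRegression  # for the sketch below

# Hypothetical: each entry pairs an estimator with a GridSearchCV grid;
# the 'clf__' prefix targets the pipeline step named 'clf' in grid_search_all.
MODEL_LIST = [
    {'model': RandomForestClassifier(),
     'param': {'clf__n_estimators': [100, 200]}},
    {'model': LogisticRegression(max_iter=1000),
     'param': {'clf__C': [0.1, 1.0, 10.0]}},
]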
class HateCl:
    def __init__(self):
        # Load the pre-trained classifier bundled with the package.
        classifier_path = pkg_resources.resource_filename(
            'hate_cl', 'randomforest.sav')
        with open(classifier_path, 'rb') as classifier_file:
            self.classifier = Unpickler(classifier_file).load()
        self.df = self.load_df()
        self.X = self.df['sentence'].tolist()
        self.y = self.df['hate'].tolist()
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.33, random_state=42)

    def load_df(self):
        # Read the OffComBR3 ARFF corpus and binarise the 'hate' label.
        with open(pkg_resources.resource_filename('hate_cl',
                                                  'OffComBR3.arff')) as f:
            data = arff.load(f)
        df = DataFrame(data['data'])
        df.columns = ['hate', 'sentence']
        df['hate'] = df['hate'].apply(lambda x: 1 if x == 'yes' else 0)
        return df

    def predict(self, text):
        # Probability that `text` belongs to the hate class (label 1).
        classified_text = self.classifier.predict_proba([text])[:, 1]
        return classified_text[0]

    def get_samples(self):
        # Return the stored training samples shipped with the package.
        with open(pkg_resources.resource_filename('hate_cl',
                                                  'hate_cl/data.sav'),
                  'rb') as f:
            df = Unpickler(f).load()
        return df

    def refit(self, samples):
        # Merge the new labelled samples into the stored dataset.
        with open(pkg_resources.resource_filename('hate_cl',
                                                  'hate_cl/data.sav'),
                  'rb') as f:
            df = Unpickler(f).load()
        aux_df = DataFrame(samples, columns=['hate', 'sentence'])
        df = concat([df, aux_df], ignore_index=True)

        # Retrain the TF-IDF + random forest pipeline from scratch.
        X = df['sentence'].tolist()
        y = df['hate'].tolist()
        cl = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(1, 4))),
                       ('clf',
                        RandomForestClassifier(n_estimators=100,
                                               max_depth=None,
                                               min_samples_leaf=1,
                                               min_samples_split=2,
                                               min_weight_fraction_leaf=0))])
        cl.fit(X, y)
        self.classifier = cl

        # Persist the refitted model under the same resource name that
        # __init__ loads, so the update survives a restart.
        cl_filename = pkg_resources.resource_filename('hate_cl',
                                                      'randomforest.sav')
        df_filename = pkg_resources.resource_filename('hate_cl',
                                                      'hate_cl/data.sav')

        with open(cl_filename, 'wb') as f:
            Pickler(f).dump(cl)

        with open(df_filename, 'wb') as f:
            Pickler(f).dump(df)

    # Fit every model in MODEL_LIST on the given dataset (not implemented yet).
    def fit_all(self, dataset):
        pass

    def get_recall_1(self, y_true, y_pred):
        # Recall on the hate class (label 1), used as the grid-search metric.
        _, recall, _, _ = score(y_true, y_pred)
        print(classification_report(y_true, y_pred))
        return recall[1]

    def create_best_cl(self, model, params):
        # Grid search over `params`, scored by recall on the hate class.
        gs_clf = GridSearchCV(model,
                              params,
                              cv=5,
                              n_jobs=-1,
                              scoring=make_scorer(self.get_recall_1))
        gs_clf.fit(self.X_train, self.y_train)
        print(gs_clf.best_score_)
        print(gs_clf.best_params_)
        return gs_clf

    def grid_search_all(self):
        nltk.download('stopwords')
        stopwords = nltk.corpus.stopwords.words('portuguese')

        for model in MODEL_LIST:
            cl = Pipeline([('tfidf',
                            TfidfVectorizer(strip_accents='ascii',
                                            lowercase=True,
                                            stop_words=stopwords)),
                           ('clf', model['model'])])
            gs_clf = self.create_best_cl(cl, model['param'])
            pred = gs_clf.best_estimator_.predict(self.X_test)
            print(classification_report(self.y_test, pred))

            # Save the fitted best estimator for this model.
            cl_filename = pkg_resources.resource_filename(
                'hate_cl', model['model'].__class__.__name__ + '.sav')
            with open(cl_filename, 'wb') as f:
                Pickler(f).dump(gs_clf.best_estimator_)

    def create_committee(self):
        # Build a voting committee from the saved models (not implemented yet).
        pass

    def import_committee(self):
        # Load a previously saved committee (not implemented yet).
        pass
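
A minimal usage sketch, assuming the hate_cl package and its bundled randomforest.sav model are installed; the sample sentences are placeholders:

if __name__ == '__main__':
    cl = HateCl()
    # Probability that a sentence is hate speech.
    print(cl.predict('frase de exemplo'))
    # Add two freshly labelled samples as (hate, sentence) pairs and retrain.
    cl.refit([(1, 'frase ofensiva'), (0, 'frase neutra')])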