示例#1
0
    def run_knn(self, k2, outputfile):
        """Build a k-nearest-neighbours model, print its metrics and plot them.

        Args:
            k2: Value forwarded to the classifier as its ``k`` argument.
            outputfile: Prefix used for every file name derived in this
                method (scores file, PDF plots and ids file).

        The method reads ``self.x_train``, ``self.y_train``, ``self.x_test``
        and ``self.y_test``. It returns nothing; results are printed and
        handed to the plotting / id-export helpers.
        """
        scores = '_knn_scores.txt'
        # One prefix for every artefact of this run. The plotting calls used
        # to build their paths from ``prunning``/``balanced``/``imputer``,
        # names that only exist in ``run_rf``; that raised NameError here and
        # did not match the scores path given to the classifier.
        prefix = outputfile + "_" + str(k2)
        scores_path = prefix + scores

        print('Running model...')
        model = KNeighborsClassifier(train=self.x_train,
                                     test=self.x_test,
                                     k=k2,
                                     scores=scores_path,
                                     Y_train=self.y_train)
        model.preprocess_input_vectors()
        pred, predprob = model.build_knn_model()
        f1, prec, rec, acc = model.model_evaluation2(pred, predprob,
                                                     self.y_test)
        print("f1_binary: %f" % f1)
        print("Accuracy: %f" % acc)
        print("Precision: %f" % prec)
        print("Recall: %f" % rec)

        # The graphics helpers receive the same scores path that was passed
        # to the classifier (presumably the file it writes - confirm).
        PrecisionGraphics(scores_path, prefix + "_precisionvsrecall.pdf")
        NDCGGraphics(scores_path, prefix + "_NDCG.pdf")
        ids = GetIds("files3_claro_mig_model")
        ids.gen_features(prefix + "_ids.txt")
    def run_rf(self, prunning=5, balanced=None, imputer=False):
        """Build a random-forest model, print its metrics and plot them.

        Args:
            prunning: Forwarded to the classifier as ``prunning``; also part
                of every output file name.
            balanced: Forwarded to the classifier as ``balanced``; also part
                of every output file name.
            imputer: When truthy, ``self.x_train`` and ``self.x_test`` are
                replaced in place by median-imputed versions (values equal
                to -1 are treated as missing) before the model is built.

        NOTE(review): a second ``run_rf`` is defined further down in this
        source; if both live in the same class the later one replaces this
        one - confirm which is intended.
        """
        scores = '_rf_scores.txt'

        if imputer:
            fitted = Imputer(missing_values=-1,
                             strategy='median').fit(self.x_train)
            self.x_train = fitted.transform(self.x_train)
            self.x_test = fitted.transform(self.x_test)

        print('Gerando modelo...')

        # Every artefact of this run shares the "<prunning>_<balanced>_<imputer>"
        # prefix, so build it once.
        tag = '_'.join([str(prunning), str(balanced), str(imputer)])
        scores_path = tag + scores

        model = RandomForestClassifier(train=self.x_train,
                                       test=self.x_test,
                                       prunning=prunning,
                                       n_estimators=100,
                                       balanced=balanced,
                                       scores=scores_path,
                                       Y_train=self.y_train)
        model.preprocess_input_vectors()
        pred, predprob = model.build_rf_model()
        f1, prec, rec, acc = model.model_evaluation2(pred, predprob,
                                                     self.y_test)

        report = (
            ("f1_binary: %f", f1),
            ("Accuracy: %f", acc),
            ("Precision: %f", prec),
            ("Recall: %f", rec),
        )
        for template, value in report:
            print(template % value)

        PrecisionGraphics(scores_path, tag + "_precisionvsrecall.pdf")
        NDCGGraphics(scores_path, tag + "_NDCG.pdf")

        id_exporter = GetIds("files3_claro_mig_model")
        id_exporter.gen_features(tag + "_ids.txt")
    def run_rf(self, prunning=5, balanced='balanced', imputer=False):
        """Evaluate an MLP baseline and a random-forest model, then plot.

        Steps, in order:

        1. Optionally median-impute ``self.x_train`` / ``self.x_test`` in
           place (values equal to -1 are treated as missing).
        2. Load a cached ``MLPClassifier`` from ``clf_model_tT4.sav``; if
           loading fails, fit a new one on the training data and save it.
           Only its classification report is printed.
        3. Build the random-forest model, print f1 / accuracy / precision /
           recall, and show a histogram of ``predprob``.
        4. Produce the precision-vs-recall and NDCG plots and export ids.

        Args:
            prunning: Forwarded to the classifier as ``prunning``; also part
                of every output file name.
            balanced: Forwarded to the classifier as ``balanced``; also part
                of every output file name.
            imputer: When truthy, enables the imputation in step 1.

        NOTE(review): an earlier ``run_rf`` (default ``balanced=None``) is
        defined above in this source; if both live in the same class this
        definition is the one that takes effect - confirm.
        """
        scores = '_rf_scores.txt'
        if imputer:
            imp = Imputer(missing_values=-1,
                          strategy='median').fit(self.x_train)
            self.x_train = imp.transform(self.x_train)
            self.x_test = imp.transform(self.x_test)
        print('Running model...')

        # Every artefact of this run shares the same prefix; build it once.
        prefix = str(prunning) + '_' + str(balanced) + '_' + str(imputer)
        scores_path = prefix + scores

        forest = RandomForestClassifier(train=self.x_train,
                                        test=self.x_test,
                                        prunning=prunning,
                                        n_estimators=100,
                                        balanced=balanced,
                                        scores=scores_path,
                                        Y_train=self.y_train)

        # Regroup the training set as all label-0 samples followed by all
        # label-1 samples. No subsampling happens here: every sample with
        # label 0 or 1 is kept.
        index_1 = np.where(self.y_train == 1)
        index_0 = np.where(self.y_train == 0)
        data_y_balanced_1 = self.y_train[index_1]
        data_y_balanced_0 = self.y_train[index_0[0]]
        # hstack/vstack already allocate a fresh array; asarray avoids the
        # second full copy that np.array() would make.
        data_y_balanced = np.asarray(
            np.hstack([data_y_balanced_0, data_y_balanced_1]))

        data_x_balanced_1 = self.x_train[index_1]
        data_x_balanced_0 = self.x_train[index_0[0]]
        data_x_balanced = np.asarray(
            np.vstack([data_x_balanced_0, data_x_balanced_1]))
        # Drop the reference to the large intermediate before training.
        del data_x_balanced_0

        clf = MLPClassifier(solver='sgd',
                            alpha=1e-5,
                            hidden_layer_sizes=(50, 100, 50),
                            max_iter=50,
                            random_state=1,
                            verbose=1)

        filename = 'clf_model_tT4.sav'
        # NOTE: joblib files are pickle-based; only load files you trust.
        try:
            clf = joblib.load(filename)
        except Exception:
            # Missing or unreadable cache: train from scratch and save it.
            # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt
            # and SystemExit propagate instead of triggering a training run.
            clf.fit(np.nan_to_num(data_x_balanced), data_y_balanced)
            # save the model to disk
            joblib.dump(clf, filename)
        pred = clf.predict(np.nan_to_num(self.x_test))
        print(classification_report(self.y_test[:len(pred)], pred))

        # From here on ``pred``/``predprob`` come from the random forest;
        # the MLP predictions above are only used for the printed report.
        forest.preprocess_input_vectors()
        pred, predprob = forest.build_rf_model()
        f1, prec, rec, acc = forest.model_evaluation2(pred, predprob,
                                                      self.y_test)
        print("f1_binary: %f" % f1)
        print("Accuracy: %f" % acc)
        print("Precision: %f" % prec)
        print("Recall: %f" % rec)

        plt.hist(predprob)
        plt.title("Histogram")
        plt.xlabel("Value")
        plt.ylabel("Frequency")
        plt.show()

        PrecisionGraphics(scores_path, prefix + "_precisionvsrecall.pdf")
        NDCGGraphics(scores_path, prefix + "_NDCG.pdf")
        ids = GetIds("files3_claro_mig_model_2")
        ids.gen_features(prefix + "_ids.txt")