Example #1
import time

from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from skmultilearn.problem_transform import BinaryRelevance


class BinaryRelevancesSimple:
    def __init__(self, model):
        # LightGBM is the default base learner; pass 'RF' for a random forest
        self.model = BinaryRelevance(LGBMClassifier())
        if model == 'RF':
            self.model = BinaryRelevance(
                RandomForestClassifier(n_estimators=200, max_depth=12))

    def fit(self, X_train, y_train):
        print('### start training...')
        start = time.time()
        self.model.fit(X_train, y_train)
        print('### training time:', time.time() - start)

    def predict_proba(self, X_test):
        # BinaryRelevance returns a sparse matrix; .A converts it to a dense array
        return self.model.predict_proba(X_test).A
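A minimal usage sketch for the wrapper above (not from the original source; the data here is synthetic):

import numpy as np

np.random.seed(0)
X = np.random.rand(100, 20)               # 100 samples, 20 features
Y = np.random.randint(0, 2, (100, 5))     # 5 binary labels per sample

clf = BinaryRelevancesSimple(model='RF')  # random-forest base learner
clf.fit(X, Y)
probs = clf.predict_proba(X)              # dense (100, 5) array of per-label probabilities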
Example #2
    def RecommendByBinaryRelevance(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
        """使用多标签问题的 二值相关 """
        classifier = BinaryRelevance(RandomForestClassifier(oob_score=True, max_depth=10, min_samples_split=20))
        classifier.fit(train_data, train_data_y)

        predictions = classifier.predict_proba(test_data)
        predictions = predictions.todense().getA()

        recommendList = DataProcessUtils.getListFromProbable(predictions, range(1, train_data_y.shape[1] + 1),
                                                             recommendNum)
        answerList = test_data_y
        print(predictions)
        print(test_data_y)
        print(recommendList)
        print(answerList)
        return [recommendList, answerList]
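Example #3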
from sklearn.svm import SVC
from skmultilearn.problem_transform import BinaryRelevance


class MyBinaryRelevanceFeatureSelect:
    def fit(self, X, y):

        # SVC base classifier with probability estimates enabled
        self.BinaryRelevanceObject = BinaryRelevance(
            classifier=SVC(gamma='auto', probability=True),
            require_dense=[True, True])
        #self.BinaryRelevanceObject = BinaryRelevance()

        # fit one binary classifier per label
        self.BinaryRelevanceObject.fit(X, y)

        # the fitted per-label classifiers
        self.classifiers = self.BinaryRelevanceObject.classifiers_

        return self.BinaryRelevanceObject

#     def partition(self):
#         return self.BinaryRelevanceObject.partition_#BinaryRelevanceObject

#     def model_count(self):
#         return self.BinaryRelevanceObject.model_count_

    def predict(self, X, y=None):
        return self.BinaryRelevanceObject.predict(X)

    def predict_proba(self, X):
        return self.BinaryRelevanceObject.predict_proba(X)


#    def feature_select(self, X, y, transformer):
#        transformer.fit(X, y)
#        selected_attributes_indices = transformer.get_support(indices = True)
#
#        return selected_attributes_indices
#
#    def sets_of_selected_features(self, X, predictions, classifier, transformer ): #X is the df with the predictions
#        selected_features_array = []
#
#        for i in predictions:
#            indices_features_selected = classifier.feature_select(X, predictions[i], transformer)
#            selected_features_array.append(indices_features_selected)
#
#        return selected_features_array
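A minimal usage sketch for the class above (not from the original source; the data is synthetic):

import numpy as np

np.random.seed(0)
X = np.random.rand(60, 10)
y = np.random.randint(0, 2, (60, 3))

br = MyBinaryRelevanceFeatureSelect()
br.fit(X, y)
print(len(br.classifiers))   # one fitted SVC per label -> 3
predictions = br.predict(X)  # sparse (60, 3) binary label matrix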
Example #4
import scipy as sc
import scipy.sparse  # ensure the sc.sparse namespace is loaded
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import scale
from skmultilearn.problem_transform import (BinaryRelevance, ClassifierChain,
                                            LabelPowerset)

X_test = sc.sparse.csr_matrix(X_test.drop('user', axis=1).values)
t_test = sc.sparse.csr_matrix(t_test.values)

# scaling does not work well for many of these methods, since it distorts the similarity structure
X_train_scale = scale(X_train.toarray())
X_test_scale = scale(X_test.toarray())

X_sparse = sc.sparse.csr_matrix(X.drop('user', axis=1).values)
t_sparse = sc.sparse.csr_matrix(t.values)

# First test the problem transformations with a simple naive Bayes classifier;
# Binary Relevance roughly turns out to suit best. Intuitively the hotels should
# not be correlated through userID, given its randomness.
classifier = BinaryRelevance(GaussianNB())
classifier.fit(X_train, t_train)
predictions = classifier.predict(X_test)
probabilities = classifier.predict_proba(X_test)
accuracy_score(t_test, predictions)  # 0
mean_squared_error(t_test.toarray(),
                   probabilities.toarray())  # 0.063299324514418692

classifier = ClassifierChain(GaussianNB())
classifier.fit(X_train, t_train)
predictions = classifier.predict(X_test)
probabilities = classifier.predict_proba(X_test)
accuracy_score(t_test, predictions)  # 0
mean_squared_error(t_test.toarray(),
                   probabilities.toarray())  # 0.084135897849476421

classifier = LabelPowerset(GaussianNB())
classifier.fit(X_train, t_train)
predictions = classifier.predict(X_test)
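Example #5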
# # Binary Relevance
# * If there are n different labels, it creates n datasets, trains one classifier per label, and returns the union of all predicted labels.
# * The correlation between the labels is not taken into account.
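# A toy sketch (not from the original notebook) of what Binary Relevance does under
# the hood: the label matrix is split column-wise and one independent binary
# classifier is trained per label, which is why label correlation is ignored.
import numpy as np
from sklearn.linear_model import LogisticRegression

np.random.seed(0)
X_demo = np.random.rand(50, 4)
Y_demo = np.random.randint(0, 2, (50, 3))            # 3 labels -> 3 separate models
per_label_models = [LogisticRegression().fit(X_demo, Y_demo[:, j])
                    for j in range(Y_demo.shape[1])]
Y_pred = np.column_stack([m.predict(X_demo) for m in per_label_models])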

# In[65]:

classifier = BinaryRelevance(LogisticRegression())

# In[66]:

classifier.fit(x_train, y_train)
print('Accuracy_score using BinaryRelevance is ',
      round(accuracy_score(y_test, classifier.predict(x_test)) * 100, 1), '%')
print('-------------------------------------------------')
print('roc_auc_score using BinaryRelevance is ',
      roc_auc_score(y_test,
                    classifier.predict_proba(x_test).toarray()))

# # Label Powerset
# * Label Powerset creates a unique class for every label combination present in the training set; this way it makes use of label correlation.
# * The only problem with this method is that the computational complexity grows with the number of distinct label combinations.
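# A toy sketch (not from the original notebook) of the Label Powerset idea: every
# distinct row of the label matrix becomes one class of a single multi-class
# problem, so up to 2**n_labels classes can appear.
import numpy as np

Y_demo = np.array([[0, 1, 0],
                   [1, 1, 0],
                   [0, 1, 0],
                   [1, 0, 1]])
combos, y_multiclass = np.unique(Y_demo, axis=0, return_inverse=True)
print(y_multiclass)   # [0 2 0 1]: three distinct label combinations -> three classes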

# In[67]:

log_classifier = LabelPowerset(LogisticRegression())

# In[68]:

log_classifier.fit(x_train, y_train)
print('Accuracy_score using LabelPowerset is ',
      round(accuracy_score(y_test, log_classifier.predict(x_test)) * 100, 1),
      '%')
Example #6
import re

import pandas as pd
from flask import render_template, request
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn import naive_bayes
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from skmultilearn.problem_transform import BinaryRelevance


def predict():
    # load data
    train = pd.read_csv("train.csv")
    test = pd.read_csv("test.csv")

    # cleaning comments
    def clean_text(text):
        text = text.lower()
        pat = re.compile(r"[^A-Za-z\s']")
        text = pat.sub(" ", text)
        text = text.rstrip()
        newLines = re.compile(r"[\n\r\t]")
        text = newLines.sub(" ", text)
        extraspace = re.compile(r'\s{2,}')
        text = extraspace.sub(" ", text)
        return text


    eng_stopwords = set(stopwords.words('english'))
    def preprocess_text(text): 
        text = " ".join([word for word in text.split() if len(word) >2])
        text = " ".join([word for word in text.split() if word not in eng_stopwords])
        text = " ".join([word for word in text.split() if word not in ["i'm", "can't"]])
        text = " ".join([WordNetLemmatizer().lemmatize(word) for word in text.split()])
        return text

    train["cleaned_comments"] = train["comment_text"].map(clean_text)
    train["final_cleaned_comments"] = train["cleaned_comments"].map(preprocess_text)

    test["cleaned_comments"] = test["comment_text"].map(clean_text)
    test["final_cleaned_comments"] = test["cleaned_comments"].map(preprocess_text)

    # TF-IDF over the full corpus; note it is unused below, where a vectorizer
    # fitted only on the training split is applied instead
    tfidfVect = TfidfVectorizer(min_df=100, strip_accents="unicode", stop_words="english", smooth_idf=True)
    tfidfVect = tfidfVect.fit_transform(train["final_cleaned_comments"])

    train_target = train[["toxic", "severe_toxic","obscene", "threat", "insult", "identity_hate"]]
    X_train_w, X_test_w, y_train_w, y_test_w = train_test_split(train["final_cleaned_comments"], train_target, test_size = 0.3, random_state = 0)
    tfidfVectClean = TfidfVectorizer(min_df = 100, strip_accents = "unicode", stop_words = "english", smooth_idf = True).fit(X_train_w)
    nbModel_w = BinaryRelevance(naive_bayes.MultinomialNB())
    X_train_w_dtm_tfidf = tfidfVectClean.transform(X_train_w)
    nbModel_w.fit(X_train_w_dtm_tfidf, y_train_w)
    predictions = nbModel_w.predict(tfidfVectClean.transform(X_test_w))
    probs = nbModel_w.predict_proba(tfidfVectClean.transform(X_test_w))

    output = ""
    if request.method == "POST":
        message = request.form["message"]
        data = [message]
        vect = tfidfVectClean.transform(data).toarray()
        class_prediction = nbModel_w.predict(vect)
        #toxic_prediction = class_prediction.toarray()[:,0]
        keys = ["toxic", "severe_toxic","obscene", "threat", "insult", "identity_hate"]
        values = class_prediction.toarray()
        t_pred = dict(zip(keys, values[0]))
        tox_labels = [k for k,v in t_pred.items() if v == 1]
        if len(tox_labels) == 0:
            output = "Thank you for keeping your comment respectful to all"
        else:
            output = "Your comment has been flagged as: " + ', '.join(tox_labels) + "."
        
    #return render_template("result.html", prediction = toxic_prediction)
    return render_template("result.html", output = output)
Example #7
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from skmultilearn.problem_transform import BinaryRelevance

binary_rel_clf = BinaryRelevance(MultinomialNB())
binary_rel_clf.fit(X_train, y_train)

# Predictions
X_test_n1 = np.array([X_test[0]])  # a single-sample batch (not used below)

print("X_test -----------------")
print(X_test)
print("y_test -----------------")
print(y_test)
print("------------------------")
br_prediction = binary_rel_clf.predict(X_test)
print(br_prediction)
print("Pred Prob ------------------------")
br_prediction_prob = binary_rel_clf.predict_proba(X_test)
print(br_prediction_prob)

# Accuracy
print("Accuracy Score: " + str(accuracy_score(y_test, br_prediction) * 100) +
      " %")

# save trained model
import joblib

# binary_rel_clf_file = open("beep_boop_stonks.pkl","wb")
joblib.dump(binary_rel_clf, 'beep_boop_stonks.joblib')

# load trained model
model = joblib.load('beep_boop_stonks.joblib')
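# A hedged follow-up sketch: verify that the reloaded model matches the in-memory
# one (reuses X_test, y_test and accuracy_score from the snippet above).
reloaded_prediction = model.predict(X_test)
print("Reloaded model accuracy: " +
      str(accuracy_score(y_test, reloaded_prediction) * 100) + " %")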
import pickle
from random import randint

import numpy as np
from sklearn.metrics import (accuracy_score, f1_score, hamming_loss,
                             jaccard_similarity_score, label_ranking_loss)
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from skmultilearn.problem_transform import BinaryRelevance

# QuickDataFrame and Progresser are helpers local to the original project.


def evaluate_model_svm(x, y, learn_path, k=10, thresh=0.5):
    print(len(y), len(y[0]))
    # find a k-fold split in which every label column has both classes in each training fold
    count = 0
    while True:
        count += 1
        # print(count, 'Finding a proper KF...')
        kf = list(
            KFold(n_splits=k, shuffle=True,
                  random_state=randint(0, 100000)).split(x))
        good_folds = True
        for train_index, test_index in kf:
            for i in range(len(y[0])):
                # a training fold where label i has only one class cannot be fitted
                if len(np.unique(y[train_index, i])) < 2:
                    print(i)
                    good_folds = False
                    break
            if not good_folds:
                break
        if good_folds:
            break
    print('Found a good KF after', count, 'tries!')

    with open(learn_path + 'topic_classifier-folds.pkl', 'wb') as out_file:
        pickle.dump(kf, out_file)
    fold_num = 0

    stats = QuickDataFrame([
        'Jaccard (normalised)', 'Accuracy (normalised)', 'Accuracy',
        'F1_score (micro averaged)', 'F1_score (macro averaged by labels)',
        'F1_score (averaged by samples)', 'Hamming loss', 'Label Ranking loss'
    ])

    prog = Progresser(k)
    for train_index, test_index in kf:
        # print(train_index, test_index)
        print('___________________________________________________')
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # cls = SVC(kernel='linear')
        # cls = SVC(kernel='poly', probability=True, tol=1e-5)
        cls = SVC(kernel='linear', probability=True, tol=1e-5)
        # cls = GaussianNB()
        # cls = RandomForestClassifier(max_features='auto', random_state=1)

        topic_classifier = BinaryRelevance(classifier=cls,
                                           require_dense=[True, True])

        try:
            topic_classifier.fit(x_train, y_train)
        except Exception as e:
            print('\nfit error!:', e)
            continue

        # with open(learn_path + 'topic_classifier-SVC' + str(fold_num) + '.pkl', 'wb') as out_file:
        #     pickle.dump(topic_classifier, out_file)

        try:
            # predictions = topic_classifier.predict(x_test)
            predictions = np.zeros((len(x_test), y.shape[1]))
            preds = topic_classifier.predict_proba(x_test)
            for i in range(len(x_test)):
                for j in range(y.shape[1]):
                    predictions[i, j] = 1.0 if preds[i, j] > thresh else 0.0
            s = [
                jaccard_similarity_score(y_test, predictions, normalize=True),
                accuracy_score(y_test, predictions, normalize=True),
                accuracy_score(y_test, predictions, normalize=False),
                f1_score(y_test, predictions, average='micro'),
                f1_score(y_test, predictions, average='macro'),
                f1_score(y_test, predictions, average='samples'),
                hamming_loss(y_test, predictions),
                label_ranking_loss(y_test, predictions)
            ]

            stats.append(s)
            print(stats[stats.length - 1])
        except Exception as e:
            print('Eval error!:', e)

        fold_num += 1
        prog.count()

    for col in stats.cols:
        print(col, np.mean(stats[col]))