Example #1
def main():
    bibtex = sci.loadmat(r'D:\课程作业\机器学习\机器学习课程设计\dataset\bibtex.mat')
    medical = sci.loadmat(r'D:\课程作业\机器学习\机器学习课程设计\dataset\medical.mat')
    bib_X = bibtex['data']  # shape (7395, 1836)
    bib_y = bibtex['target']  # shape (159, 7395)
    med_X = medical['data']  # shape (978, 1449)
    med_y = medical['target']  # shape (45, 978)
    scaler = MinMaxScaler()
    bib_X = scaler.fit_transform(bib_X)
    scaler = MinMaxScaler()
    med_X = scaler.fit_transform(med_X)

    # Compare l1/l2 penalties for logistic (log) and hinge (LinearSVC) losses.
    f1_scores = []
    penalties = ['l1', 'l2']
    for penalty in penalties:
        clf = BinaryRelevance(
            LogisticRegression(penalty=penalty, solver='liblinear', dual=False))
        clf.fit(med_X, med_y.T)
        pre = clf.predict(med_X)
        f1_scores.append(metrics.f1_score(med_y.T, pre, average='samples'))
    for penalty in penalties:
        clf = BinaryRelevance(LinearSVC(penalty=penalty, dual=False))
        clf.fit(med_X, med_y.T)
        pre = clf.predict(med_X)
        f1_scores.append(metrics.f1_score(med_y.T, pre, average='samples'))
    table = PrettyTable(["", "log", "hinge"])
    table.padding_width = 1
    table.add_row(["l1", f1_scores[0], f1_scores[2]])
    table.add_row(["l2", f1_scores[1], f1_scores[3]])

    # CSFS on the fully labeled data.
    csfs = CSFS(u=0.1)
    W, b = csfs.fit(med_X.T, med_y.T, u=0.1)
    pred_csfs = csfs.predict(med_X.T, W, b)

    # Keep labels for only the first 70% of the samples to simulate
    # partially observed labels, then fit SMILE and a second CSFS on them.
    new_y = np.zeros(med_y.shape)
    size = int(med_y.shape[1] * 0.7)
    new_y[:, :size] = med_y[:, :size]
    smile = SMILE(alpha=0.1)
    smile.fit(med_X.T, new_y)
    pred_smile = smile.predict(med_X.T)
    csfs2 = CSFS(u=0.1)
    W, b = csfs2.fit(med_X.T, new_y.T, u=0.1)
    pred_csfs2 = csfs2.predict(med_X.T, W, b)
    print('CSFS_score:',
          metrics.f1_score(med_y.T, pred_csfs, average='samples'))
    print('SMILE_score:',
          metrics.f1_score(med_y.T, pred_smile, average='samples'))
    print('CSFS_partial_score:',
          metrics.f1_score(med_y.T, pred_csfs2, average='samples'))
    print(table)
Example #2
def binary(X_train, X_test, y_train, y_test):

    print("Binary Relevance")
    model = BinaryRelevance(classifier=SVC(),
                            require_dense=[True, True]).fit(X_train, y_train)
    y_pred = model.predict(X_test)

    hamming = hamming_loss(y_test, y_pred)
    subset_accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average='micro')
    precision = precision_score(y_test, y_pred, average='micro')
    f1 = f1_score(y_test, y_pred, average='micro')
    # Note: the ranking-based metrics below normally expect continuous
    # scores; here the dense binary predictions are used directly.
    coverage = coverage_error(y_test, y_pred.toarray())
    aps = label_ranking_average_precision_score(y_test, y_pred.toarray())
    rankingloss = label_ranking_loss(y_test, y_pred.toarray())
    print("Hamming: " + str(hamming))
    print("Subset Accuracy: " + str(subset_accuracy))
    print("Recall: " + str(recall))
    print("Precision: " + str(precision))
    print("F1: " + str(f1))
    print("Coverage error: " + str(coverage))
    print("Average Precision Score: " + str(aps))
    print("Ranking Loss: " + str(rankingloss))
    print("\n")

    return hamming, subset_accuracy, recall, precision, f1, coverage, aps, rankingloss
Example #3
def train(X, y):
    classifier = BinaryRelevance(classifier=SVC(), require_dense=[False, True])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    print("before train")
    classifier.fit(X_train, y_train)
    print("train over, begin predict")
    predictions = classifier.predict(X_test)
    # hamming_loss accepts the sparse prediction matrix directly
    print("validation accuracy (1 - Hamming loss): {}".format(
        1 - hamming_loss(y_test, predictions)))
Example #4
    def train(self):
        classifier = BinaryRelevance(GaussianNB())
        classifier.fit(self.x_data, self.y_data)
        predictions = classifier.predict(self.x_test)

        return {
            'accuracy': accuracy_score(self.y_test, predictions),
            'f1_score': f1_score(self.y_test, predictions, average='micro')
        }
Example #5
def BN_fit(clfs, X_train, y_train, X_test, y_test, evaluate):
    metrics_lb = {}
    for key, clf in clfs.items():
        print('Fitting BinaryRelevance with Classifier : %s' % key)
        clf = BinaryRelevance(clf)
        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)
        for m in evaluate:
            metrics_lb[key + ' ' + m] = scores(m, y_test, preds)
    return metrics_lb
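# A usage sketch with hypothetical names: `scores` is assumed, as in the
# original module, to dispatch on the metric-name strings in `evaluate`;
# the base classifiers and metric names below are only illustrative.
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC

base_clfs = {'GaussianNB': GaussianNB(), 'LinearSVC': LinearSVC()}
results = BN_fit(base_clfs, X_train, y_train, X_test, y_test,
                 evaluate=['hamming_loss', 'f1_micro'])
print(results)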
def binRel(X_train, X_test, y_train, y_test):
    # initialize binary relevance multi-label classifier
    # with a gaussian naive bayes base classifier
    classifier = BinaryRelevance(GaussianNB())
    # train
    classifier.fit(X_train, y_train)
    # predict
    predictions = classifier.predict(X_test)
    print('Hamming loss: {0}'.format(
        sklearn.metrics.hamming_loss(y_test, predictions)))
def formDataMultiLabel(sampData, df):
    
    ####### Binarize the response categories (label_num) #####################
    manActData = df.label_num.unique()
    lb = preprocessing.LabelBinarizer()    
    lb.fit(manActData)                       
    
    tfdLabelNum = lb.transform(df.label_num)
    
    ####### Binarize the next-action categories (sec_label_num) ##############
    nxtActData = df.sec_label_num.unique()
    lb = preprocessing.LabelBinarizer()    
    lb.fit(nxtActData)
    
    tfdSecLabelNum = lb.transform(df.sec_label_num)
       
    ####### Binarize the input RPA data (inp_Data) ###########################
    inpRPAData     = (df.inp_Data).astype(str)
    inpRPAData     = inpRPAData.apply(lambda x: x.split()[0])
    lab, lev = pd.factorize(inpRPAData)
    
    lb = preprocessing.LabelBinarizer()    
    lb.fit(np.unique(lab))
    
    tfdInpRPAData = lb.transform(lab)
    #print (np.unique(tfdInpRPAData))
    
    #This concatenation is the actual process
    #conCatData    = np.concatenate((tfdLabelNum, tfdSecLabelNum, tfdInpRPAData), axis=1)
    
    ####### Build Multi-Label Prediction Model  ###############################
    respTrain, respTest, labTrain, labTest = train_test_split(sampData, tfdSecLabelNum, random_state=1)

    TR  = tree.DecisionTreeClassifier(criterion="gini", max_depth=100, min_samples_leaf=2)
    GNB = GaussianNB()
    RF  = RandomForestClassifier(n_estimators=100)
    
    classifier = BinaryRelevance(GNB)
    #classifier = ClassifierChain(TR)
    #classifier = LabelPowerset(RF)
    
    vect = TfidfVectorizer(min_df=1, max_df=1.0, stop_words='english')
    respTrainVec = vect.fit_transform(respTrain)
    
    respTestVec = vect.transform(respTest)
    
    classifier.fit(respTrainVec, labTrain)
    predictions = classifier.predict(respTestVec)
    acc = metrics.accuracy_score(labTest, predictions)
    print(acc)
    
    return lab
def problemTransformation(data):

    # Binary Relevance
    # Classifier Chains
    # Label Powerset

    # collect the scores instead of overwriting a single variable
    scores = {}

    # initialize each multi-label classifier
    # with a gaussian naive bayes base classifier
    classifier = BinaryRelevance(GaussianNB())
    classifier.fit(X_train, y_train)  # train
    predictions = classifier.predict(X_test)  # predict
    scores['binary_relevance'] = accuracy_score(y_test, predictions)

    classifier = ClassifierChain(GaussianNB())
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    scores['classifier_chain'] = accuracy_score(y_test, predictions)

    classifier = LabelPowerset(GaussianNB())
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    scores['label_powerset'] = accuracy_score(y_test, predictions)
    return scores
Example #9
def classifiers(X_train, Y_train, X_test):

    classifier1 = BinaryRelevance(GaussianNB())
    classifier2 = ClassifierChain(GaussianNB())
    classifier3 = LabelPowerset(GaussianNB())

    classifier1.fit(X_train, Y_train)
    classifier2.fit(X_train, Y_train)
    classifier3.fit(X_train, Y_train)

    predictions1 = classifier1.predict(X_test)
    predictions2 = classifier2.predict(X_test)
    predictions3 = classifier3.predict(X_test)

    return predictions1, predictions2, predictions3
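# A usage sketch (hypothetical): compare the three transformations on the
# same split; Y_test is assumed to come from the same split as X_test.
from sklearn.metrics import accuracy_score

p1, p2, p3 = classifiers(X_train, Y_train, X_test)
for name, p in [('BinaryRelevance', p1), ('ClassifierChain', p2),
                ('LabelPowerset', p3)]:
    print(name, accuracy_score(Y_test, p))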
def multiLabel_SKLearn_GaussianNBayes(rData, lData, sData):
    
    xData = rData.values
    # stack the two label columns side by side -> shape (n_samples, 2)
    yData = np.column_stack((lData.values, sData.values))
        
    respTrain, respTest, labTrain, labTest = train_test_split(xData, yData, random_state=1)    
    
    classifier = BinaryRelevance(GaussianNB())
    #classifier = ClassifierChain(GaussianNB())
    #classifier = LabelPowerset(GaussianNB())
    
    classifier.fit(respTrain, labTrain)
    predictions = classifier.predict(respTest)
    acc = accuracy_score(labTest, predictions)
    
    return acc
Example #11
    def binary_relevance(self):
        '''Name: Binary Relevance
           Main idea: decompose the multi-label problem into one binary classifier per label
           Evaluation metric: accuracy_score (subset accuracy)
        '''
        print(self.X_train)
        print(self.y_train)
        classifier = BinaryRelevance(GaussianNB())
        classifier.fit(self.X_train, self.y_train)

        predictions = classifier.predict(self.X_test)
        print(predictions)
        #print(y_test)
        #print("predictions:\n",predictions)

        result = accuracy_score(self.y_test, predictions)

        print(result)
class MyBinaryRelevanceFeatureSelect():
    def fit(self, X, y):

        # using an SVC base classifier with probability estimates enabled
        self.BinaryRelevanceObject = BinaryRelevance(
            classifier=SVC(gamma='auto', probability=True),
            require_dense=[True, True])
        #self.BinaryRelevanceObject = BinaryRelevance()

        # fit one classifier per label
        self.BinaryRelevanceObject.fit(X, y)

        # the fitted per-label classifiers
        self.classifiers = self.BinaryRelevanceObject.classifiers_

        return self.BinaryRelevanceObject

#     def partition(self):
#         return self.BinaryRelevanceObject.partition_#BinaryRelevanceObject

#     def model_count(self):
#         return self.BinaryRelevanceObject.model_count_

    def predict(self, X, y=None):
        return self.BinaryRelevanceObject.predict(X)

    def predict_proba(self, X):
        return self.BinaryRelevanceObject.predict_proba(X)


#    def feature_select(self, X, y, transformer):
#        transformer.fit(X, y)
#        selected_attributes_indices = transformer.get_support(indices = True)
#
#        return selected_attributes_indices
#
#    def sets_of_selected_features(self, X, predictions, classifier, transformer ): #X is the df with the predictions
#        selected_features_array = []
#
#        for i in predictions:
#            indices_features_selected = classifier.feature_select(X, predictions[i], transformer)
#            selected_features_array.append(indices_features_selected)
#
#        return selected_features_array
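    # A runnable variant of the commented-out sketch above (an addition, not
    # part of the original class): fit a sklearn selector such as SelectKBest
    # once per label and collect the selected column indices.
    def feature_select(self, X, y_single_label, transformer):
        transformer.fit(X, y_single_label)
        return transformer.get_support(indices=True)

    def sets_of_selected_features(self, X, Y, transformer):
        # Y is a dense (n_samples, n_labels) 0/1 array; returns one index
        # array per label, mirroring Binary Relevance's per-label view.
        return [self.feature_select(X, Y[:, j], transformer)
                for j in range(Y.shape[1])]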
    def get_train_test_lda(self, topic):

        # get training set
        dataset = arff.load(open(os.path.join(dir, "medical-train.arff")), encode_nominal=True)
        dataset = np.array(dataset.get("data"))

        X_train = dataset[:, :-num_label]
        y_train = dataset[:, -num_label:]

        # get test set
        dataset = arff.load(open(os.path.join(dir, "medical-test.arff")), encode_nominal=True)
        dataset = np.array(dataset.get("data"))

        X_test = dataset[:, :-num_label]
        y_test = dataset[:, -num_label:]

        for k in topic:
            X_iter = X_train.astype(np.int64)

            # get training_data feature topics
            model = lda.LDA(n_topics=k, n_iter=1000)
            model.fit(X_iter)
            doc_topic_x = model.doc_topic_

            # get training data label topics
            model_label = lda.LDA(n_topics=k, n_iter=1000)
            model_label.fit(y_train)
            doc_topic_y = model_label.doc_topic_

            # concat feature-topic and label topic
            x = np.hstack((doc_topic_x, doc_topic_y))

            # discretize the topics
            x = self.discretization_doc_topic(x)
            X_train = np.hstack((X_train, x))

            # multi-label learning: predict the test set's discretized
            # feature-topic and label-topic columns from the raw features
            classifier = BinaryRelevance(RandomForestClassifier())
            classifier.fit(X_iter, x)
            x = classifier.predict(X_test).toarray()

            X_test = np.hstack((X_test, x))

        return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)
Example #14
def gaussianNaiveBayesBinary():
    print("Gaussian naive bayes binary")

    start = time.time()
    classifier = BinaryRelevance(GaussianNB())

    filename = "gaussianNaiveBayes"

    classifier.fit(train_x, train_y)

    # save the trained model
    with open(filename, 'wb') as f:
        pickle.dump(classifier, f)

    # load the model back from disk
    with open(filename, 'rb') as f:
        classifier = pickle.load(f)

    print('training time taken: ', round(time.time() - start, 0), 'seconds')

    predictions_new = classifier.predict(test_x)

    accuracy(test_y, predictions_new)
Example #15
def knnBinary(m):
    print("knn binary")

    start = time.time()
    classifier = BinaryRelevance(KNeighborsClassifier(n_neighbors=m))

    filename = "knnBinary"

    classifier.fit(train_x, train_y)

    # save the trained model
    with open(filename, 'wb') as f:
        pickle.dump(classifier, f)

    # load the model back from disk
    with open(filename, 'rb') as f:
        classifier = pickle.load(f)

    print('training time taken: ', round(time.time() - start, 0), 'seconds')

    predictions_new = classifier.predict(test_x)

    accuracy(test_y, predictions_new)
Example #16
def supportVectorMachine():
    print("Support vector machine")

    start = time.time()
    classifier = BinaryRelevance(classifier=svm.SVC(),
                                 require_dense=[False, True])
    filename = "SupportVectorMachine"

    classifier.fit(train_x, train_y)

    # save the trained model
    with open(filename, 'wb') as f:
        pickle.dump(classifier, f)

    # load the model back from disk
    with open(filename, 'rb') as f:
        classifier = pickle.load(f)

    print('training time taken: ', round(time.time() - start, 0), 'seconds')

    predictions_new = classifier.predict(test_x)

    accuracy(test_y, predictions_new)
Example #17
def randomForest():
    print("Random forest classifier")

    start = time.time()
    classifier = BinaryRelevance(classifier=RandomForestClassifier(),
                                 require_dense=[False, True])
    filename = "randomForest"

    classifier.fit(train_x, train_y)

    # save the trained model
    with open(filename, 'wb') as f:
        pickle.dump(classifier, f)

    # load the model back from disk
    with open(filename, 'rb') as f:
        classifier = pickle.load(f)

    print('training time taken: ', round(time.time() - start, 0), 'seconds')

    predictions_new = classifier.predict(test_x)

    accuracy(test_y, predictions_new)
Example #18
def binaryRelevance():

    # Train-Test Split =======================================================
    print("splitting the data into train and test sets...")
    from sklearn.model_selection import train_test_split
    train, test = train_test_split(df, test_size=0.33, shuffle=True)
    
    train_text = train['Book_Text']
    test_text = test['Book_Text']
    
    # TF-IDF ==================================================================
    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1, 3), norm='l2')
    # fit the vocabulary on the training text only; the second fit on the
    # test text in the original overwrote it and leaked test data
    vectorizer.fit(train_text)
    
    x_train = vectorizer.transform(train_text)
    y_train = train.drop(labels=['Book_Text'], axis=1)
    
    x_test = vectorizer.transform(test_text)
    y_test = test.drop(labels=['Book_Text'], axis=1)
    
    # using binary relevance
    from skmultilearn.problem_transform import BinaryRelevance
    from sklearn.naive_bayes import GaussianNB
    
    # initialize binary relevance multi-label classifier
    # with a gaussian naive bayes base classifier
    classifier = BinaryRelevance(GaussianNB())
    
    # train
    classifier.fit(x_train, y_train)
    
    # predict
    predictions = classifier.predict(x_test)
    
    # accuracy
    print("Accuracy = ",accuracy_score(y_test,predictions))
    print("\n")
Example #19
def multinomialLogisticRegressionChain():
    # Train a multi-label model with a multinomial logistic regression base
    # classifier (despite the name, this uses BinaryRelevance, not a chain)
    print("Logistic regression binary relevance")

    start = time.time()
    classifier = BinaryRelevance(classifier=linear_model.LogisticRegression(
        multi_class='multinomial', solver='newton-cg'),
                                 require_dense=[False, True])

    filename = "logistickaRegresija.sav"
    classifier.fit(train_x, train_y)

    # save the trained model
    with open(filename, 'wb') as f:
        pickle.dump(classifier, f)

    # load the model back from disk
    with open(filename, 'rb') as f:
        classifier = pickle.load(f)

    print('training time taken: ', round(time.time() - start, 0), 'seconds')

    predictions_new = classifier.predict(test_x)

    accuracy(test_y, predictions_new)
def multil_labels_binary_relevance(train_data, test_data):
    """
    可以正常运行
    使用二元关联。多分类多标签问题。
    :param train_data:
    :param test_data:
    :return:
    """
    from skmultilearn.problem_transform import BinaryRelevance
    from skmultilearn.problem_transform import ClassifierChain
    from sklearn.naive_bayes import GaussianNB
    from xgboost import XGBClassifier
    from sklearn.preprocessing import MultiLabelBinarizer
    xgt_param = {'max_depth': 6, 'eta': 0.5, 'eval_metric': 'merror', 'silent': 1,
                 'objective': 'multi:softmax', 'num_class': 20}
    # initialize a binary relevance multi-label classifier
    # with a Gaussian naive Bayes base classifier
    classifier = BinaryRelevance(GaussianNB())
    # classifier = OneVsRestClassifier(XGBClassifier(n_jobs=-1, max_depth=4))
    X_train, y_train = train_data.iloc[:, [0]], train_data.iloc[:, list(range(1, 21))]
    X_test, y_test = test_data.iloc[:, [0]], test_data.iloc[:, list(range(1, 21))]
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
    # training
    temp = X_train.values.tolist()
    X = []
    for i in range(len(temp)):
        X.append(temp[i][0])
    x = tfidf.transform(X)
    y = y_train.values.tolist()
    # Y = [0]*20  # a vector of length 20
    # for j in range(len(y)):
    #     if "1" in y[j]:
    #         indexs = y[j].index("1")
    #         Y.append(indexs+1)
    #     else:
    #         # print("0")
    #         Y.append(0)  # really 21 classes, since the empty label occurs
    Y = np.array(y)
    Y = Y.astype(np.int32)
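    # Guard against the TypeError recorded below: scipy reports
    # "no supported conversion for types: (dtype('O'),)" when a sparse
    # matrix carries object dtype, so make sure the TF-IDF features are
    # numeric before fitting (an assumed cause, not confirmed):
    x = x.astype(np.float64)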

    # Can Y not hold multiple values??
    classifier.fit(x, Y)  # predict the numbers directly?
    """
    Error seen here:
        raise TypeError('no supported conversion for types: %r' % (args,))
        TypeError: no supported conversion for types: (dtype('O'),)
    Could that be it??
    """

    # prediction
    temp = X_test.values.tolist()
    X_ts = []
    for i in range(len(temp)):
        X_ts.append(temp[i][0])
    x_test = tfidf.transform(X_ts)

    y_test = y_test.values.tolist()
    # Y_test = []
    # for j in range(len(y_test)):
    #     if "1" in y_test[j]:
    #         indexs = y_test[j].index("1")
    #         Y_test.append(indexs + 1)
    #     else:
    #         # print("0")
    #         Y_test.append(0)  # really 21 classes, since the empty label occurs
    Y_test = np.array(y_test)  # build a matrix
    Y_test = Y_test.astype(np.int32)
    unique_test, counts_test = np.unique(Y_test, return_counts=True)
    print("truth=", dict(zip(unique_test, counts_test)))

    predictions = classifier.predict(x_test)  # a csr_matrix at this point
    predictions = predictions.toarray()
    # does it contain any zeros??
    unique, counts = np.unique(predictions, return_counts=True)
    print("preditions=", dict(zip(unique, counts)))
    from sklearn.metrics import accuracy_score
    score = accuracy_score(Y_test, predictions)
    print(score)
print(y_train.shape)
print(y_test.shape)

'''

print("\n\nTraining data with Binary Relevance using Gaussian Naive Bayes")

#initialize binary relevance multi-label classifier
#with a gaussian naive bayes base classifier
classifier_binary = BinaryRelevance(GaussianNB())

# train for Binary Relevance
classifier_binary.fit(X_train, y_train)

# predict for Binary Relevance
predictions_binary = classifier_binary.predict(X_test)

#Hamming Loss for Binary Relevance
hamm_loss_binary = hamming_loss(y_test, predictions_binary)

print("Hamming Loss:", hamm_loss_binary)

print("\n\n\nTraining data with Classifier Chains using Gaussian Naive Bayes")

#initialize Classifier Chains multi-label classifier
#with a gaussian naive bayes base classifier
classifier_cc = ClassifierChain(GaussianNB())

# train for Classifier Chains
classifier_cc.fit(X_train, y_train)
'''

# In[6]:

# 1. using binary relevance
# The multi-label problem is broken into a set of independent single-label binary classification problems
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB

# initialize binary relevance multi-label classifier
# with a gaussian naive bayes base classifier
classifier = BinaryRelevance(GaussianNB())

# train
classifier.fit(X_train, y_train)

# predict
predictions = classifier.predict(X_test)

# In[7]:

from sklearn.metrics import accuracy_score

accuracy_score(y_test, predictions)

# In[8]:

# 2. using classifier chains
# The problem is transformed into a series of single-label problems.
# Unlike the previous method, the classifiers are chained so that label correlations are preserved.
from skmultilearn.problem_transform import ClassifierChain
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
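
# The cell that actually applies the chain is cut off in this excerpt; a
# minimal sketch consistent with the Binary Relevance cell above, assuming
# the same X_train, X_test, y_train, y_test:
classifier = ClassifierChain(GaussianNB())
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
accuracy_score(y_test, predictions)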
Example #23
    	decision_function_shape='ovr', degree=1, gamma=100, kernel='linear',
    	max_iter=-1, probability=False, random_state=6, shrinking=True,
    	tol=0.001, verbose=False),
		require_dense = [False, True])

		j = 0  # count of exactly matched (subset-accurate) LOOCV predictions
		for i in range(0, 47):
			X_copy = X_orig[i:i + 1]  # slice the ith sample from the numpy array
			y_copy = y_orig[i:i + 1]
			# train on everything except sample i (leave-one-out CV)
			X_model = np.delete(X_orig, i, axis=0)
			y_model = np.delete(y_orig, i, axis=0)
			classifier.fit(X_model, y_model)
			prediction = classifier.predict(X_copy)
			equal = prediction.toarray()
			print(equal, y_copy)
			if np.array_equal(y_copy, equal):
				j = j + 1
		# the original printed j/48 although the loop makes 47 passes; the
		# denominator should equal the number of held-out samples
		print(j / 47)





Example #24
fold_accuracy = []  # initialized here alongside fold_auc; both are appended per fold below
fold_hamming = []
fold_auc = []
for s in dataset:
    fp = open(timeStamp('./datasets/' + s + '/' + s), 'w')
    for i in range(0, nfolds):

        X_train, y_train = readDataFromFile('./datasets/' + s + '/' + s +
                                            str(i) + '.train')
        print('Reading: ./datasets/' + s + '/' + s + str(i) + '.train')
        X_test, y_test = readDataFromFile('./datasets/' + s + '/' + s +
                                          str(i) + '.test')
        print('Reading: ./datasets/' + s + '/' + s + str(i) + '.test')
        classif = BinaryRelevance(
            classifier=RandomForestClassifier(n_estimators=10),
            require_dense=[False, True])
        classif.fit(X_train, y_train)
        y_score = classif.predict(X_test)

        #y_prob = classif.predict_proba(X_test)

        #-----------------------------------------#
        # Metrics: sklearn.metrics.<metric>(true, predicted, ...)
        acc = sklearn.metrics.accuracy_score(y_test, y_score)
        fold_accuracy.append(acc)
        #-----------------------------------------#
        hl = sklearn.metrics.hamming_loss(y_test, y_score)
        fold_hamming.append(hl)
        #-----------------------------------------#
        #Mean average precision
        m = sklearn.metrics.average_precision_score(y_test,
                                                    y_score.toarray(),
                                                    average='macro',
Example #25
t_train = sc.sparse.csr_matrix(t_train.values)
X_test = sc.sparse.csr_matrix(X_test.drop('user', axis=1).values)
t_test = sc.sparse.csr_matrix(t_test.values)

# scaling does not work well for many of these methods, as it offsets the
# similarity structure
X_train_scale = scale(X_train.toarray())
X_test_scale = scale(X_test.toarray())

X_sparse = sc.sparse.csr_matrix(X.drop('user', axis=1).values)
t_sparse = sc.sparse.csr_matrix(t.values)

# First test the problem transformations with a simple naive Bayes classifier;
# the rough conclusion is that BR suits this data best. Intuitively, the hotels
# should not be correlated through userID, given its randomness.
classifier = BinaryRelevance(GaussianNB())
classifier.fit(X_train, t_train)
predictions = classifier.predict(X_test)
probabilities = classifier.predict_proba(X_test)
accuracy_score(t_test, predictions)  # 0
mean_squared_error(t_test.toarray(),
                   probabilities.toarray())  # 0.063299324514418692

classifier = ClassifierChain(GaussianNB())
classifier.fit(X_train, t_train)
predictions = classifier.predict(X_test)
probabilities = classifier.predict_proba(X_test)
accuracy_score(t_test, predictions)  # 0
mean_squared_error(t_test.toarray(),
                   probabilities.toarray())  # 0.084135897849476421

classifier = LabelPowerset(GaussianNB())
classifier.fit(X_train, t_train)
Example #26
# y_pred = cross_val_predict(clf, X_train, y_train, cv=args.kfolds, n_jobs=-1, verbose=1)
# cross-validation
rs = KFold(n_splits=args.kfolds,
           shuffle=True,
           random_state=args.randomseed)
# generate the k-fold train/test set indices
cv_index_set = rs.split(X_train)
k_fold_step = 1  # initialize the fold counter
# cache the selected test-set indices and the corresponding predictions
test_idx_cache = np.array([], dtype=int)
pred_cache = np.empty((1, args.nclass))
# iterate over the k folds
for train_index, test_index in cv_index_set:
    clf.fit(X_train[train_index], y_train[train_index])
    # evaluate on the held-out fold
    y_pred = clf.predict(X_train[test_index])
    # cache this fold's test indices and predictions
    test_idx_cache = np.concatenate((test_idx_cache, test_index))
    pred_cache = np.concatenate((pred_cache, y_pred.toarray()), axis=0)
    # advance the fold counter after each fold
    k_fold_step += 1
# drop the blank first row
pred_cache = np.delete(pred_cache, 0, axis=0)
# write out the cross-validated predictions
df_pred = pd.DataFrame(index=test_idx_cache,
                       data=pred_cache,
                       columns=df.columns.values[0:args.nclass])
df_pred.to_csv('{0}-pred.csv'.format(args.output), index_label='Sample ID')
print('\nThe prediction saved to:`{0}-pred.csv`'.format(args.output))
end_time = time.time()  # end-of-run timestamp
print("\n[Finished in: {0:.6f} mins = {1:.6f} seconds]".format(
Example #27
import pandas as pd
import numpy as np
import pm_telemetry as pmt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from skmultilearn.problem_transform import BinaryRelevance
pd.set_option("display.max_columns", None)
# prepare data
tele = pmt.PmTelemetry()
tele.merge_failures()
tele.merge_errors()
tele.merge_maint()
tele.merge_machines()
tele.gen_past_hr_rolling_telemetry(24, "_24", np.mean)
print("Make datetime ordinal")
tele.data["dt_ordinal"] = tele.data["datetime"].apply(lambda x: x.toordinal())
print("Label encode model")
label_encoder = LabelEncoder()
tele.data["model"] = label_encoder.fit_transform(tele.data["model"])
print(tele.data.info())
print(tele.data.head())
# start modelling
print("Start modeling")
features = [
    "dt_ordinal", "machineID", "volt", "pressure", "vibration",
    "errorID_error1", "errorID_error2", "errorID_error3", "errorID_error4",
    "errorID_error5", "comp_comp1", "comp_comp2", "comp_comp3", "comp_comp4",
    "model", "age", "volt_24", "rotate_24", "vibration_24", "pressure_24"
      roc_auc_score(y_test, pd.DataFrame(pred)))

# # BinaryRelevance
# * This is an ensemble of single-label (yes/no) binary classifiers.
# * If there are n different labels, it creates n datasets, trains one classifier per label, and returns the union of all predicted labels.
# * Correlation between the labels is not taken into account (a manual sketch of this decomposition follows the cell below).

# In[65]:

classifier = BinaryRelevance(LogisticRegression())

# In[66]:

classifier.fit(x_train, y_train)
print('Accuracy_score using BinaryRelevance is ',
      round(accuracy_score(y_test, classifier.predict(x_test)) * 100, 1), '%')
print('-------------------------------------------------')
print('roc_auc_score using BinaryRelevance is ',
      roc_auc_score(y_test,
                    classifier.predict_proba(x_test).toarray()))
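
# A manual sketch of the decomposition BinaryRelevance performs internally
# (an illustration under assumptions, not the library's code): y_train and
# y_test are taken to be dense 0/1 arrays of shape (n_samples, n_labels).
import numpy as np

per_label_preds = []
for j in range(y_train.shape[1]):
    clf_j = LogisticRegression()       # one independent model per label
    clf_j.fit(x_train, y_train[:, j])  # dataset j: features vs. label j only
    per_label_preds.append(clf_j.predict(x_test))
manual_predictions = np.column_stack(per_label_preds)  # union of the labels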

# # Label Powerset
# * Label Powerset creates a unique class for every label combination present in the training set, which lets it exploit label correlation.
# * The only problem with this method is that its computational complexity grows with the number of distinct label combinations.

# In[67]:

log_classifier = LabelPowerset(LogisticRegression())

# In[68]:
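
# The body of cell In[68] is cut off in this excerpt; a minimal sketch,
# assuming it mirrors the BinaryRelevance cell above:
log_classifier.fit(x_train, y_train)
print('Accuracy_score using LabelPowerset is ',
      round(accuracy_score(y_test, log_classifier.predict(x_test)) * 100, 1), '%')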
Example #29
    train_data = tokenization_pos_tag(train_data["comment_text"])
    test_data = tokenization_pos_tag(test_data["comment_text"])

    X_train = dataset_creation(train_data)
    X_test = dataset_creation(test_data)

    X_train_tfidf, X_test_tfidf = function_tfidf(X_train, X_test)

    from skmultilearn.problem_transform import BinaryRelevance
    from sklearn.svm import LinearSVC

    classifier = BinaryRelevance(LinearSVC())
    classifier.fit(X_train_tfidf, train_labels)

    # predict
    predictions = classifier.predict(X_test_tfidf)

    # accuracy
    from sklearn.metrics import accuracy_score

    print("Accuracy = ", accuracy_score(test_labels, predictions))

    # confusion matrix
    pred = predictions.toarray()

    import sklearn.metrics as skm

    cm = skm.multilabel_confusion_matrix(test_labels, pred)
    print(cm)
    print(skm.classification_report(test_labels, pred))
Example #30
from sklearn.model_selection import cross_val_predict

data = pd.read_csv("your_csv_file.csv", dtype={'sentence': str})

y = data[['isA', 'isB', 'isC', 'isD', 'isE']]
to_drop = ['id', 'isA', 'isB', 'isC', 'isD', 'isE']
X = data.drop(to_drop, axis=1)

count_vectorizer = CountVectorizer()
counts = count_vectorizer.fit_transform(X['sentence'].values)

X_train, X_test, y_train, y_test = train_test_split(counts, y, test_size=0.33)

clf = BinaryRelevance(RandomForestClassifier())

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Print individual label-wise F1, precision and recall scores
target_names = ['A', 'B', 'C', 'D', 'E']
print(classification_report(y_test, y_pred, target_names=target_names))

# Extract the values of label A alone (column 0); repeat for the other labels
extractedData1 = y_pred[:, [0]]
z = scipy.sparse.csr_matrix(y_test)
extractedData2 = z[:, [0]]

# Flatten both arrays and print the accuracy score for label A
predicted = extractedData1.toarray().flatten()
actual = extractedData2.toarray().flatten()
print(accuracy_score(actual, predicted))
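
# A compact version of the per-label step above for all five labels (a
# sketch using the same target_names):
y_true_dense = scipy.sparse.csr_matrix(y_test).toarray()
y_pred_dense = y_pred.toarray()
for idx, name in enumerate(target_names):
    print(name, accuracy_score(y_true_dense[:, idx], y_pred_dense[:, idx]))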