def train(self, data: DataFrame, X_column: str, y_columns: List[str] = None):
    """Train a LabelPowerset(LinearSVC) multi-label model on TF-IDF features.

    Parameters
    ----------
    data : DataFrame
        Frame holding the text column and the label columns.
    X_column : str
        Name of the text (feature) column.
    y_columns : List[str], optional
        Label column names; defaults to every column except ``X_column``.

    Returns
    -------
    tuple
        (fitted LabelPowerset classifier, fitted TfidfVectorizer).
        Both are also pickled to ``powersetsvc.pickle`` / ``vec.pickle``.
    """
    if y_columns is None:
        # Every non-text column is a label.  Preserve the frame's column
        # order; the original ``set`` difference made the order (and hence
        # the binarized label layout) nondeterministic across runs.
        y_columns = [c for c in data.columns.to_list() if c != X_column]
    X = data[X_column]
    y: DataFrame = data.drop(X_column, axis=1)
    xtrain, xtest, ytrain, ytest = train_test_split(
        X, y, random_state=42, test_size=0.2)
    mlb = MultiLabelBinarizer()
    train_labels = mlb.fit_transform(ytrain[y_columns].values)
    # Only the training half is used here; the test split is kept for a
    # separate evaluation step.
    train_cleaned = xtrain.copy(deep=True).apply(
        nlp_preprocess.Preprocess().clean_text)
    vectorizer = TfidfVectorizer()
    vectorised_train_documents = vectorizer.fit_transform(train_cleaned)
    powersetsvc = LabelPowerset(LinearSVC())
    powersetsvc.fit(vectorised_train_documents, train_labels)
    # Context managers so both pickle handles are always closed (the
    # original left "powersetsvc.pickle" open).
    with open("powersetsvc.pickle", "wb") as f0:
        dump(powersetsvc, f0)
    with open('vec.pickle', 'wb') as f1:
        dump(vectorizer, f1)
    return powersetsvc, vectorizer
class LP():
    '''
    Label Powerset Method
    '''
    # Class-level placeholder; instances always overwrite it in __init__.
    h = None

    def __init__(self, h=None):
        # A default of ``h=LogisticRegression()`` would be evaluated once at
        # definition time and shared (mutably) across every LP instance, so
        # a fresh base estimator is created per instance instead.
        if h is None:
            h = LogisticRegression()
        self.h = LabelPowerset(h)

    def fit(self, X, Y):
        ''' Train the model on training data X,Y '''
        return self.h.fit(X, Y)

    def predict(self, X):
        ''' Return predictions Y, given X '''
        return self.h.predict(X)

    def predict_proba(self, X):
        '''
        Return matrix P, where P[i,j] = P(Y[i,j] = 1 | X[i])
        (where i-th row/example, and j-th label)
        '''
        return self.h.predict_proba(X)
class MyLabelPowerSetFeatureSelect():
    """Label-Powerset-based feature selection.

    Collapses the multi-label target into a multi-class one via
    LabelPowerset(GaussianNB), then keeps the ``k`` best features by
    chi-squared score.
    """

    def __init__(self, k=2):
        # Number of features kept by SelectKBest (previously hard-coded
        # to 2; the default preserves the old behavior).
        self.k = k

    def fit(self, X, y):
        # Gaussian naive Bayes as the base classifier.
        self.LabelPowerSetObject = LabelPowerset(GaussianNB())
        # Fit on the data.
        self.LabelPowerSetObject.fit(X, y)
        # Multi-label -> multi-class transformed target.
        y_transformed = self.LabelPowerSetObject.transform(y)
        # Instantiate the SelectKBest object and run the feature selection.
        self.X_new = SelectKBest(chi2, k=self.k)
        self.X_transformed = self.X_new.fit_transform(X, y_transformed)
        # Save indices of the selected attributes.
        self.selected_attributes_indices = self.X_new.get_support(indices=True)
        return self

    def transform(self, X):
        # Keep only the columns chosen during fit.
        return X[:, self.selected_attributes_indices]

    def predict(self, X):
        return self.LabelPowerSetObject.predict(X)

    def predict_proba(self, X):
        return self.LabelPowerSetObject.predict_proba(X)
def multi_classTo_multi_multi(Y, model):
    """Invert a multi-class encoding back into a multi-label matrix."""
    # A two-(or more)-dimensional target is already multi-label.
    if Y.ndim >= 2:
        print("This is already a multi-label problem!!!!!!")
        return Y
    transformer = LabelPowerset(classifier=model, require_dense=[False, True])
    return transformer.inverse_transform(Y)
def filter_rare_classes(self, feature_matrix, target_matrix):
    '''
    In order to perform a stratified split between train and test, each
    class must occur at least twice in the data. Hence, filter out label
    combinations that occur only once in the entire dataset.
    Input :
        feature_matrix : matrix of features
        target_matrix  : matrix containing the target labels
    Output :
        feature_matrix : filtered
        target_matrix  : filtered
    '''
    lp = LabelPowerset()
    # Collapse the multi-label target into one multi-class label per row.
    multi_class_target_labels = lp.transform(target_matrix)
    classes_vc = np.asarray(
        np.unique(multi_class_target_labels, return_counts=True)).T
    # Keep only the label combinations seen more than once.
    class_to_keep = classes_vc[np.where(classes_vc[:, 1] > 1)][:, 0]
    # Vectorised membership test; the original looped in Python with the
    # redundant ``True if ... else False`` form, an O(rows * classes) scan.
    mask = np.isin(multi_class_target_labels, class_to_keep)
    feature_matrix = feature_matrix[mask]
    target_matrix = target_matrix[mask]
    return feature_matrix, target_matrix
def reduce_dimension(data1, label1, dimension_num, estimators=100):
    """Rank features by Random-Forest importance under a LabelPowerset model.

    Parameters
    ----------
    data1 : array-like, shape (n_samples, n_features)
        Feature matrix.
    label1 : array-like, shape (n_samples, n_labels)
        Binary label matrix.
    dimension_num : int
        Number of top-ranked features to keep.
    estimators : int
        Number of trees in the random forest.

    Returns
    -------
    tuple
        (indices of the kept features, indices of the dropped features,
         importances sorted in descending order)
    """
    # Sparse working copies sized from the inputs; the shapes were
    # previously hard-coded to 85 labels / 4189 features, which broke the
    # function for any other dataset.
    y_train = sparse.lil_matrix((label1.shape[0], label1.shape[1]))
    y_train[:, :] = label1
    X_train = sparse.lil_matrix((data1.shape[0], data1.shape[1]))
    X_train[:, :] = data1
    forest = RandomForestClassifier(n_estimators=estimators, random_state=1)
    classifier1 = LabelPowerset(classifier=forest, require_dense=[False, True])
    classifier1.fit(X_train, y_train)
    # Importances come from the fitted base forest, not the wrapper.
    importances = forest.feature_importances_
    # Feature indices ordered from most to least important.
    indices = np.argsort(importances)[::-1]
    features_importances = importances[indices]
    return (indices[:dimension_num], indices[dimension_num:],
            features_importances)
def train_test_split(self, feature_matrix, target_matrix, test_size=0.2):
    '''
    Stratified shuffle split is used to split train and test, so both
    sets have equal class proportions.
    Input:
        feature_matrix : feature matrix with rare classes filtered out
        target_matrix  : target matrix with rare classes filtered out
        test_size      : fraction used for the test set (default 20%)
    Output:
        train_x, train_y, test_x, test_y
    '''
    lp = LabelPowerset()
    # BUG FIX: ``test_size`` was previously ignored and 0.2 hard-coded.
    sss_level_1 = StratifiedShuffleSplit(lp.transform(target_matrix),
                                         n_iter=1,
                                         test_size=test_size,
                                         random_state=123)
    for train_ix, test_ix in sss_level_1:
        train_x = feature_matrix.iloc[train_ix, :]
        train_y = target_matrix.iloc[train_ix, :]
        test_x = feature_matrix.iloc[test_ix, :]
        test_y = target_matrix.iloc[test_ix, :]
    return train_x, train_y, test_x, test_y
def __init__(self, C=1.0, use_idf=False, filename=None, **kwargs):
    """Build a LabelPowerset(MultinomialNB) text classifier wrapper.

    NOTE(review): the ``C`` parameter is accepted but never read here —
    confirm whether downstream code uses it via ``**kwargs`` conventions.
    """
    self.output_dim = 0
    # Label Powerset over multinomial naive Bayes as the base model.
    self.lm = LabelPowerset(MultinomialNB())
    # Unigram term counts; IDF weighting is optional, no normalisation.
    self.vect1 = TfidfVectorizer(norm=None,
                                 use_idf=use_idf,
                                 min_df=0.0,
                                 ngram_range=(1, 1))
    # Feature selector kept at "all" until configured otherwise.
    self.selector = sklearn.feature_selection.SelectKBest(k='all')
    # Restore a previously saved model when a filename is given.
    if filename is not None:
        self.load(filename)
def multi_labelTo_multi_class(Y, model):
    """Collapse a multi-label target Y into a single multi-class target."""
    if Y.ndim == 1:
        # One-dimensional target: nothing to collapse.
        print("This is not a multi-label problem!!!!!!")
        return Y
    # LabelPowerset is used here because its transform() performs the
    # actual multi-label to multi-class conversion.
    transformer = LabelPowerset(classifier=model, require_dense=[False, True])
    return [transformer, transformer.transform(Y)]
def labelSet(self):
    """Fit a LabelPowerset(GaussianNB) model and print its test accuracy."""
    model = LabelPowerset(GaussianNB())
    model.fit(self.X_train, self.y_train)
    # Score the held-out split.
    predicted = model.predict(self.X_test)
    score = accuracy_score(self.y_test, predicted)
    print(score)
def logistic_regression_classifier(train_x, train_y):
    """Fit a Label Powerset classifier over L1-penalised logistic regression.

    NOTE(review): with scikit-learn >= 0.22 the default solver (lbfgs) does
    not support ``penalty='l1'``; a solver such as 'liblinear' or 'saga'
    must then be passed explicitly — confirm the pinned sklearn version.
    """
    # Only the names actually used are imported; the original also pulled
    # in BinaryRelevance, ClassifierChain and GaussianNB without using them.
    from sklearn.linear_model import LogisticRegression
    from skmultilearn.problem_transform import LabelPowerset

    model = LabelPowerset(LogisticRegression(penalty='l1'))
    model.fit(train_x, train_y)
    return model
def generat_model():
    """Train multi-label aspect models and persist the label binarizer.

    Fits a naive-Bayes pipeline and an SGD (hinge-loss SVM) pipeline on the
    annotated reviews, saving the fitted ``MultiLabelBinarizer`` to
    ``./pkl/mlb.pkl``.

    Returns
    -------
    tuple
        (SVM predictions on the test split, test labels, full X, full y)

    Raises
    ------
    Exception
        Re-raised after logging.  The original bare ``except: pass`` fell
        through to the return statement and crashed with ``NameError`` on
        the undefined locals, hiding the real error.
    """
    try:
        logging.info("Generating mlb.pkl model file in 'pkl' folder")
        mlb = read_input()
        # Load the processed data pickle.
        annotated_reviews_df = pd.read_pickle("./pkl/annotated_reviews_df.pkl")
        # Convert the multi-labels into binary indicator arrays.
        y = mlb.fit_transform(annotated_reviews_df.aspects)
        X = annotated_reviews_df.text
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.25,
                                                            random_state=0)
        # Persist the fitted binarizer: it records how the multi-label set
        # was binarized and is needed later to undo the transformation for
        # the correct labels.  The with-block closes the handle, which the
        # original left open.
        filename = "./pkl/mlb.pkl"
        with open(filename, 'wb') as fh:
            pickle.dump(mlb, fh)
        logging.info(
            "Successfully generated and saved mlb.pkl model file in 'pkl' folder"
        )
        # Naive-Bayes baseline pipeline.
        text_clf = Pipeline([
            ('vect', CountVectorizer(stop_words="english", ngram_range=(1, 1))),
            ('tfidf', TfidfTransformer(use_idf=False)),
            ('clf', LabelPowerset(MultinomialNB(alpha=1e-1))),
        ])
        text_clf = text_clf.fit(X_train, y_train)
        predicted = text_clf.predict(X_test)
        # Baseline accuracy (value intentionally not stored).
        np.mean(predicted == y_test)
        # Test whether a linear SVM performs better.
        text_clf_svm = Pipeline([('vect', CountVectorizer()),
                                 ('tfidf', TfidfTransformer()),
                                 ('clf-svm', LabelPowerset(
                                     SGDClassifier(loss='hinge',
                                                   penalty='l2',
                                                   alpha=1e-3,
                                                   max_iter=6,
                                                   random_state=42)))])
        _ = text_clf_svm.fit(X_train, y_train)
        predicted_svm = text_clf_svm.predict(X_test)
    except Exception:
        logging.error("Error in Generating mlb.pkl model file in 'pkl' folder")
        raise
    return predicted_svm, y_test, X, y
def naive_bayes_classifier(train_x, train_y):
    """Fit a Label Powerset classifier with a Gaussian NB base estimator."""
    # Only the names actually used are imported; the original also pulled
    # in BinaryRelevance and ClassifierChain (and kept commented-out
    # alternatives) without using them.
    from skmultilearn.problem_transform import LabelPowerset
    from sklearn.naive_bayes import GaussianNB

    classifier = LabelPowerset(GaussianNB())
    classifier.fit(train_x, train_y)
    return classifier
def buildLBClassifier(xTrain, yTrain):
    """Train a Label Powerset multi-label classifier.

    Uses a Gaussian naive Bayes base classifier.
    """
    model = LabelPowerset(GaussianNB())
    # Contiguous arrays avoid extra copies inside the underlying estimator.
    model.fit(np.ascontiguousarray(xTrain), np.ascontiguousarray(yTrain))
    return model
def powerset(self):
    """Fit LabelPowerset(LogisticRegression) and score the test split.

    Returns a dict with 'accuracy' and micro-averaged 'f1_score'.
    """
    model = LabelPowerset(LogisticRegression())
    model.fit(self.x_data, self.y_data)
    predicted = model.predict(self.x_test)
    scores = {
        'accuracy': accuracy_score(self.y_test, predicted),
        'f1_score': f1_score(self.y_test, predicted, average='micro'),
    }
    return scores
def getClassifier(classifierType):
    """Map a classifier-type constant to a configured estimator.

    Returns a OneVsRestClassifier or LabelPowerset wrapper around the
    matching base estimator.

    Raises
    ------
    ValueError
        For an unknown ``classifierType``; the original silently returned
        ``None``, deferring the failure to the caller.
    """
    # Tuple membership replaces the original or-chains (same == semantics).
    if classifierType in (OVR_SVC_UNIGRAM_3, OVR_SVC_BiGRAM_3):
        return OneVsRestClassifier(LinearSVC())
    if classifierType in (OVR_MNB_UNIGRAM, OVR_MNB_BiGRAM):
        return OneVsRestClassifier(MultinomialNB(alpha=0.7))
    if classifierType in (OVR_SGD_UNIGRAM, OVR_SGD_BiGRAM):
        return OneVsRestClassifier(linear_model.SGDClassifier())
    if classifierType in (LP_SVC_UNIGRAM, LP_SVC_BIGRAM):
        return LabelPowerset(LinearSVC())
    if classifierType in (LP_MNB_UNIGRAM, LP_MNB_BIGRAM):
        return LabelPowerset(MultinomialNB(alpha=0.7))
    if classifierType in (LP_SGD_UNIGRAM, LP_SGD_BiGRAM):
        return LabelPowerset(linear_model.SGDClassifier())
    raise ValueError("Unknown classifier type: %r" % (classifierType,))
def resampling_data(self, X, y):
    """Random-oversample a multi-label dataset.

    The multi-label target is first collapsed to multi-class via
    LabelPowerset, oversampled, then expanded back to multi-label.
    """
    lp = LabelPowerset()
    ros = RandomOverSampler(random_state=42)
    # Multi-label (ML) -> multi-class (MC) transformation.
    yt = lp.transform(y)
    X_resampled, y_resampled = ros.fit_sample(X, yt)
    # Invert the ML-MC transformation to recreate the ML set.
    return X_resampled, lp.inverse_transform(y_resampled)
def chooseClassifier(classifier, X_train, y_train):
    """Fit a Label Powerset model with the requested base estimator.

    Parameters
    ----------
    classifier : str
        One of "XGBoost", "RandomForest", "SVM", "LogisticRegression".
    X_train, y_train
        Training features and multi-label targets.

    Raises
    ------
    ValueError
        For an unknown name; the original instead crashed with an opaque
        ``UnboundLocalError`` on the return statement.
    """
    if classifier == "XGBoost":
        base = XGBClassifier(random_state=0)
    elif classifier == "RandomForest":
        base = RandomForestClassifier(n_estimators=1000,
                                      criterion='entropy',
                                      random_state=0)
    elif classifier == "SVM":
        base = LinearSVC(random_state=0)
    elif classifier == "LogisticRegression":
        base = LogisticRegression(random_state=0)
    else:
        raise ValueError("Unknown classifier: %r" % (classifier,))
    return LabelPowerset(base).fit(X_train, y_train)
def classifiers(X_train, Y_train, X_test):
    """Fit three problem-transformation strategies (GaussianNB base) and
    return each one's predictions on ``X_test``."""
    models = [
        BinaryRelevance(GaussianNB()),
        ClassifierChain(GaussianNB()),
        LabelPowerset(GaussianNB()),
    ]
    # Fit all three first, then predict, matching the original ordering.
    for model in models:
        model.fit(X_train, Y_train)
    predictions1, predictions2, predictions3 = (
        model.predict(X_test) for model in models)
    return predictions1, predictions2, predictions3
def rakeld_ensemble(vec, label):
    """Train a RAkELd ensemble over LabelPowerset(LinearSVC) base models."""
    base = LabelPowerset(classifier=LinearSVC(), require_dense=[False, True])
    # Disjoint label-space partitions of size 5.
    ensemble = RakelD(classifier=base, labelset_size=5)
    ensemble.fit(vec, label)
    return ensemble
def make_use_w2v_fix(self):
    """Cross-validate candidate LabelPowerset base estimators on the fixed
    word2vec features and report each result."""
    x_all = self.__vectors_provider.get_w2v_vectors_fix()
    y_all = self.__data_source.get_y_multi_label()

    # TODO here grid search
    base_estimators = [
        LogisticRegression(C=1.0, solver='sag', n_jobs=-1),
        # LogisticRegression(n_jobs=-1),
        # LinearSVC(),
        # MLPClassifier()
    ]
    model_params = [
        "LogisticRegression(C=1.0, solver='sag')",
        # "LogisticRegression()",
        # "LinearSVC()",
        # "MLPClassifier()"
    ]

    # ``enumerate`` replaces the hand-maintained ``i`` counter.
    for i, base_estimator in enumerate(base_estimators):
        logging.warning(str(datetime.now()) + 'Start ' + model_params[i])
        try:
            model = LabelPowerset(base_estimator)
            cross_val_f1 = Evaluator.evaluate_only_cross_val(
                model, x_all, y_all)
            self.__visualizer.show_results_briefly(self.__CLASSIFIER_NAME,
                                                   model_params[i],
                                                   "Word2Vec_fix",
                                                   cross_val_f1)
        except Exception:
            # Narrowed from a bare ``except`` (which also swallowed
            # KeyboardInterrupt/SystemExit); log the traceback too.
            logging.exception('Error on ' + model_params[i])
        logging.warning(str(datetime.now()) + 'End ' + model_params[i])
def train(classifier, X_train, X_test, y_train, y_test, strategy):
    """Computes a multi-label classification.

    This approach is used by `one-vs-the-rest`, `classifier-chains`, and
    `label-powerset` strategies. For each classifier, the classes are fitted
    at the same time or in sequence. Since all the classes are represented
    by one and only one classifier, it is possible to gain knowledge about
    the classes by inspecting this unique classifier.

    Args:
        classifier: An instance of a scikit-learn classifier.
        X_train: A matrix containing features for training.
        X_test: A matrix containing features for evaluation.
        y_train: A dataframe containing labels for training.
        y_test: A dataframe containing labels for evaluation.
        strategy: A string defining which of the three strategies is used.

    Returns:
        A classification model and its performance report

    Raises:
        ValueError: If ``strategy`` is not one of the three known names
            (the original fell through to an ``UnboundLocalError``).
    """
    if strategy == 'one-vs-the-rest':
        model = OneVsRestClassifier(classifier)
    elif strategy == 'classifier-chains':
        model = ClassifierChain(classifier)
    elif strategy == 'label-powerset':
        model = LabelPowerset(classifier)
    else:
        raise ValueError("Unknown strategy: %r" % (strategy,))

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test,
                                   y_pred,
                                   output_dict=True,
                                   target_names=y_train.columns)
    return model, report
def calculate_accuracy():
    """Train the final naive-Bayes pipeline on the full dataset and save it.

    Calls ``generat_model()`` for the data, fits a CountVectorizer ->
    TfidfTransformer -> LabelPowerset(MultinomialNB) pipeline on all of
    X/y, and pickles it to ``./pkl/naive_model1.pkl``.  Best-effort:
    failures are logged, not raised.
    """
    try:
        logging.info("Generating naive_model1.pkl model file in 'pkl' folder")
        predicted_svm, y_test, X, y = generat_model()
        # SVM accuracy on the held-out split (value intentionally unused).
        np.mean(predicted_svm == y_test)
        # Train naive bayes on the full dataset and save the model.
        text_clf = Pipeline([
            ('vect', CountVectorizer(stop_words="english", ngram_range=(1, 1))),
            ('tfidf', TfidfTransformer(use_idf=False)),
            ('clf', LabelPowerset(MultinomialNB(alpha=1e-1))),
        ])
        text_clf = text_clf.fit(X, y)
        # Persist the fitted pipeline; the with-block closes the handle,
        # which the original left open.
        filename = './pkl/naive_model1.pkl'
        with open(filename, 'wb') as fh:
            pickle.dump(text_clf, fh)
        logging.info(
            "Successfully Generated naive_model1.pkl model file in 'pkl' folder"
        )
    except Exception:
        # Narrowed from a bare ``except``; record the traceback instead of
        # only a fixed message, but keep the best-effort semantics.
        logging.exception(
            "Error in Generating naive_model1.pkl model file in 'pkl' folder")
def fit(self, X, y):
    """Fit classifier to multi-label data

    Parameters
    ----------
    X : numpy.ndarray or scipy.sparse
        input features, can be a dense or sparse matrix of size
        :code:`(n_samples, n_features)`
    y : numpy.ndarray or scipy.sparse
        {0,1} binary indicator matrix with label assignments, shape
        :code:`(n_samples, n_labels)`

    Returns
    -------
    fitted instance of self
    """
    self._label_count = y.shape[1]
    # Enough member models to cover all labels in clusters of
    # ``labelset_size``.
    self.model_count_ = int(np.ceil(self._label_count / self.labelset_size))
    member = LabelPowerset(
        classifier=self.base_classifier,
        require_dense=self.base_classifier_require_dense)
    clusterer = GreedyLabelSpaceClusterer(
        cluster_size=self.labelset_size,
        cluster_count=self.model_count_,
        allow_overlap=False)
    self.classifier_ = LabelSpacePartitioningClassifier(
        classifier=member,
        clusterer=clusterer,
        require_dense=[False, False])
    # NOTE(review): this returns the inner classifier's fit result, not
    # ``self`` as the docstring suggests — preserved as-is.
    return self.classifier_.fit(X, y)
def __init__(
        self,
        rdm_state=84,
        params=None,
        niterations=5):
    """Label-Powerset logistic-regression wrapper.

    Parameters
    ----------
    rdm_state : int
        Random seed passed to the base LogisticRegression.
    params : dict, optional
        Hyper-parameter grid; defaults to a sweep over ``classifier__C``.
        A dict-literal default would be created once at definition time and
        shared (mutably) across instances, so ``None`` is the sentinel.
    niterations : int
        Number of search iterations.
    """
    if params is None:
        params = {
            "classifier__C": [0.1, 1.0, 10.0, 100.0, 1000.0, 10000.0]
        }
    self.model = LabelPowerset(LogisticRegression(random_state=rdm_state))
    self.params = params
    self.niterations = niterations
def evaluate_verse(embedding, labels, number_shuffles=10, train_perc=0.1):
    """Evaluate node embeddings with repeated stratified splits.

    Fits LabelPowerset(LogisticRegression) on each split and returns the
    per-split micro and macro F1 scores as two lists.
    """
    from skmultilearn.problem_transform import LabelPowerset

    micro, macro = [], []
    splitter = StratifiedShuffleSplit(
        n_splits=number_shuffles, test_size=1 - train_perc)
    for train_index, test_index in splitter.split(embedding, labels):
        X_train, y_train = embedding[train_index], labels[train_index]
        X_test, y_test = embedding[test_index], labels[test_index]
        model = LabelPowerset(LogisticRegression())
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)
        micro.append(f1_score(y_test, predicted, average='micro'))
        macro.append(f1_score(y_test, predicted, average='macro'))
    return (micro, macro)
def LabelPowerset_method(X_train, y_train, samples_leaf, samples_split):
    """Problem transformation --> Label Powerset method.

    :param X_train: input data
    :param y_train: corresponding label data
    :param samples_leaf: minimum samples per leaf for the decision tree
    :param samples_split: minimum samples required to split a node
    :return: fitted classifier, or None on failure
    """
    try:
        tree = DecisionTreeClassifier(min_samples_leaf=int(samples_leaf),
                                      min_samples_split=int(samples_split))
        model = LabelPowerset(tree)
        model.fit(X_train, y_train)
        return model
    except Exception as e:
        # Best-effort: report the failure and signal it with None.
        print("warning----标签Powerset|LabelPowerset_method----" + str(e))
        return None
def make_use_tfidf_with_results(self):
    """Run the multi-label predict-proba evaluation on TF-IDF features."""
    features = self.__vectors_provider.get_tfidf_vectors()
    targets = self.__data_source.get_y_multi_label()

    model1 = LabelPowerset(
        LogisticRegression(C=1.0, solver='sag', n_jobs=-1))

    Evaluator.multi_label_predict_proba_tfidf(
        model1, features, targets, data_source=self.__data_source)
def runSet(model, x, y):
    """K-fold cross-validate LabelPowerset(model).

    Returns the mean accuracy and mean MSE over ``splitNo`` folds.
    """
    acc_scores = []
    mse_scores = []
    folds = KFold(n_splits=splitNo)
    for train_idx, test_idx in folds.split(x):
        clf = LabelPowerset(model)
        clf.fit(x[train_idx], y[train_idx])
        predicted = clf.predict(x[test_idx])
        acc_scores.append(accuracy_score(y[test_idx], predicted))
        # predictions come back sparse; densify for the MSE computation.
        mse_scores.append(mean_squared_error(y[test_idx],
                                             predicted.toarray()))
    accuracy = np.mean(np.array(acc_scores))
    mse = np.mean(np.array(mse_scores))
    return accuracy, mse
def get_train_test_lda(topic):
    # Build LDA-topic-augmented feature sets on top of VGG16 image features.
    # ``topic`` is an iterable of topic counts ``k``: for each k an LDA model
    # is fitted on the training labels, its (discretised) doc-topic matrix is
    # appended to the train features, and a LabelPowerset(RandomForest)
    # classifier trained on the pre-append features predicts the same topic
    # columns for the test features.
    # NOTE(review): the final ``-28:`` slice keeps only the appended topic
    # columns and assumes the values in ``topic`` sum to 28 — confirm with
    # callers.
    model = VGG16(include_top=False, pooling='avg')
    x_train, y_train, x_test, y_test = load()
    # Scale pixel values to [0, 1]; labels cast for the downstream models.
    x_train = x_train.astype('float32')
    x_train /= 255
    y_train = y_train.astype('int64')
    x_test = x_test.astype('float32')
    x_test /= 255
    y_test = y_test.astype('float32')
    # Pooled VGG16 features for both splits.
    X_train = model.predict(x_train)
    print(X_train.shape)
    X_test = model.predict(x_test)
    # X_train = model.predict(x_train)
    # X_test = model.predict(x_test)
    for k in topic:
        # Snapshot of the train features BEFORE this iteration's append;
        # width matches the current X_test, so fit/predict shapes agree.
        X_iter = X_train
        model_label = lda.LDA(n_topics=k, n_iter=1000)
        model_label.fit(y_train)
        doc_topic = model_label.doc_topic_
        x2 = doc_topic
        x = x2
        # Discretise the doc-topic weights before appending them.
        x = discretization_doc_topic(x)
        X_train = np.hstack((X_train, x))
        # multi-label learning to get x2
        classifier = LabelPowerset(RandomForestClassifier())
        classifier.fit(X_iter, x)
        # Predict the topic columns for the test set (densified).
        x = np.array(sp.csr_matrix(classifier.predict(X_test)).toarray())
        # print(x)
        # x = alpha * x1 + (1-alpha) * x2
        # x = self.discretization_doc_topic(x)
        X_test = np.hstack((X_test, x))
    return np.array(X_train)[:, -28:], np.array(y_train), np.array(
        X_test)[:, -28:], np.array(y_test)