class MyLabelPowerSetFeatureSelect():
    """Label-powerset classifier paired with chi-squared feature selection.

    ``fit`` does two independent things:

    * trains ``LabelPowerset(GaussianNB())`` on the full feature matrix, and
    * fits ``SelectKBest(chi2, k=self.k)`` against the label-powerset
      transformed targets and records which columns were kept.

    ``transform`` projects a matrix onto the recorded columns, while
    ``predict`` / ``predict_proba`` delegate to the classifier that was
    trained on the full (unselected) feature matrix.
    """

    # Class-level default so instances created without running __init__
    # (e.g. subclasses that override it) keep the historical value.
    k = 2

    def __init__(self, k=2):
        """Create the selector.

        Args:
            k: number of features ``SelectKBest`` keeps (previously
                hard-coded to 2, which remains the default).
        """
        self.k = k

    def fit(self, X, y):
        """Fit the classifier and the feature selector; return ``self``."""
        # Gaussian naive Bayes is the base classifier of the label powerset.
        self.LabelPowerSetObject = LabelPowerset(GaussianNB())
        self.LabelPowerSetObject.fit(X, y)

        # Label-powerset encoding of y, used as the target for chi2 scoring.
        y_transformed = self.LabelPowerSetObject.transform(y)

        # Despite its name, X_new holds the SelectKBest selector object.
        self.X_new = SelectKBest(chi2, k=self.k)
        self.X_transformed = self.X_new.fit_transform(X, y_transformed)

        # Column indices of the features the selector kept.
        self.selected_attributes_indices = self.X_new.get_support(indices=True)
        return self

    def transform(self, X):
        """Return the columns of ``X`` chosen during ``fit``."""
        return X[:, self.selected_attributes_indices]

    def predict(self, X):
        """Return the fitted label-powerset classifier's predictions for ``X``."""
        return self.LabelPowerSetObject.predict(X)

    def predict_proba(self, X):
        """Return the fitted label-powerset classifier's probabilities for ``X``."""
        return self.LabelPowerSetObject.predict_proba(X)
class LP():
    '''
        Label Powerset Method

        Thin wrapper that stores a LabelPowerset built around a base
        classifier in ``self.h`` and forwards fit / predict /
        predict_proba to it.
    '''

    h = None

    def __init__(self, h=None):
        '''
            h: base classifier to wrap in a LabelPowerset; a new
               LogisticRegression() is created when omitted.
        '''
        # Build the default estimator per instance. An estimator written as
        # a default argument value is created once, at definition time, and
        # would be shared by every LP() constructed without arguments.
        if h is None:
            h = LogisticRegression()
        self.h = LabelPowerset(h)

    def fit(self, X, Y):
        '''
            Train the model on training data X,Y.
            Returns whatever the wrapped LabelPowerset.fit returns.
        '''
        return self.h.fit(X, Y)

    def predict(self, X):
        '''
            Return predictions Y, given X
        '''
        return self.h.predict(X)

    def predict_proba(self, X):
        '''
            Return matrix P, where P[i,j] = P(Y[i,j] = 1 | X[i])
            (where i-th row/example, and j-th label)
        '''
        return self.h.predict_proba(X)
# * The only problem with this method is that its computational complexity
#   grows as the number of classes increases.

# In[67]:

log_classifier = LabelPowerset(LogisticRegression())

# In[68]:

log_classifier.fit(x_train, y_train)

# Accuracy of the label-powerset predictions, reported as a percentage.
lp_predictions = log_classifier.predict(x_test)
lp_accuracy_pct = round(accuracy_score(y_test, lp_predictions) * 100, 1)
print('Accuracy_score using LabelPowerset is ', lp_accuracy_pct, '%')
print('-------------------------------------------------')

# predict_proba output is converted with .toarray() before scoring.
lp_probabilities = log_classifier.predict_proba(x_test).toarray()
lp_roc_auc = roc_auc_score(y_test, lp_probabilities)
print('roc_auc_score using LabelPowerset is ', lp_roc_auc)

# # ClassifierChain
# * This method uses a chain of binary classifiers
# * Each new classifier uses the predictions of all previous classifiers
# * This way the correlation between labels is taken into account

# In[69]:

chain = ClassifierChain(LogisticRegression())

# In[70]:

chain.fit(x_train, y_train)

chain_predictions = chain.predict(x_test)
chain_accuracy_pct = round(accuracy_score(y_test, chain_predictions) * 100, 1)
print('Accuracy_score using ClassifierChain is ', chain_accuracy_pct, '%')
class LMWrapper(Model):
    """Baseline text model: token counts -> SelectKBest -> LabelPowerset(MultinomialNB).

    Inputs ``x`` are iterables of instances, each instance an iterable of
    fields, each field an iterable of tokens; tokens equal to 0 are dropped
    when the text representation is built.
    """

    def __init__(self, C=1.0, use_idf=False, filename=None, **kwargs):
        """Build the pipeline pieces, optionally restoring them from ``filename``.

        Args:
            C: accepted for interface compatibility; unused in this class.
            use_idf: forwarded to ``TfidfVectorizer``.
            filename: if not None, state is loaded from this pickle file.
            **kwargs: accepted and ignored.
        """
        # NOTE(review): the base-class __init__ is not called here - confirm
        # that is intended for ``Model``.
        self.lm = LabelPowerset(MultinomialNB())
        self.vect1 = TfidfVectorizer(norm=None,
                                     use_idf=use_idf,
                                     min_df=0.0,
                                     ngram_range=(1, 1))
        self.selector = sklearn.feature_selection.SelectKBest(k='all')
        self.output_dim = 0
        if filename is not None:
            self.load(filename)

    def build_representation(self, x, y=None, fit=False):
        """Turn token sequences into a dense feature matrix.

        Args:
            x: iterable of instances (see class docstring).
            y: targets; required when ``fit`` is True, where each row is
                reduced to its argmax for fitting the selector.
            fit: when True, fit the vectorizer and the selector on this
                data before transforming it.

        Returns:
            The result of ``.todense()`` on the selected feature matrix.
        """
        # Each instance becomes one string: tokens are prefixed with "w_",
        # joined by spaces inside a field, and fields are joined by " \n ".
        auxX = [
            ' \n '.join([
                ' '.join(['w_' + str(token) for token in field if token != 0])
                for field in instance
            ])
            for instance in x
        ]
        if fit:
            self.vect1.fit(auxX)
        auxX = self.vect1.transform(auxX)
        if fit:
            self.selector.fit(auxX, np.array([np.argmax(i) for i in y]))
        auxX = self.selector.transform(auxX)
        return auxX.todense()

    def fit(self, x, y, validation_data=None):
        """Fit vectorizer, selector and classifier; always returns None.

        Args:
            x: training instances.
            y: training targets; ``y.shape[1]`` is stored as ``output_dim``.
            validation_data: optional ``(x_val, y_val)`` pair; when given,
                its accuracy is printed after fitting.
        """
        auxY = y
        print('Build representation...')
        auxX = self.build_representation(x, auxY, fit=True)
        print('auxX shape:', auxX.shape)
        print('Fit model...')
        self.lm.fit(auxX, auxY)
        self.output_dim = auxY.shape[1]
        if validation_data is None:
            return None
        res = self.evaluate(validation_data[0], validation_data[1])
        print("Accuracy in validation data =", res)
        return None

    def _pad_to_output_dim(self, auxY):
        """Zero-pad columns on the right so ``auxY`` has ``output_dim`` columns."""
        if auxY.shape[1] < self.output_dim:
            npad = ((0, 0), (0, self.output_dim - auxY.shape[1]))
            auxY = np.pad(auxY, pad_width=npad, mode='constant', constant_values=0)
        return auxY

    def predict(self, x):
        """Return ``[predictions, [], []]`` for ``x``, padded to ``output_dim`` columns."""
        auxX = self.build_representation(x, fit=False)
        print('Predicting baseline...')
        auxY = self.lm.predict(auxX)
        return [self._pad_to_output_dim(auxY), [], []]

    def predict_prob(self, x):
        """Return ``[probabilities, [], []]`` for ``x``, padded to ``output_dim`` columns."""
        auxX = self.build_representation(x, fit=False)
        print('Predicting baseline...')
        auxY = self.lm.predict_proba(auxX)
        return [self._pad_to_output_dim(auxY), [], []]

    def evaluate(self, x, y):
        """Return ``accuracy_score`` of the model's predictions on ``(x, y)``.

        NOTE(review): ``y`` is reduced to per-row argmax indices while the
        output of ``self.lm.predict`` is passed through unchanged - confirm
        the two formats are compatible for ``accuracy_score``.
        """
        auxX = self.build_representation(x, fit=False)
        auxY = np.array([np.argmax(i) for i in y])
        return sklearn.metrics.accuracy_score(y_true=auxY,
                                              y_pred=self.lm.predict(auxX))

    def save(self, filename):
        """Pickle classifier, vectorizer, selector and output_dim to ``filename``."""
        # ``with`` guarantees the handle is closed even if a dump fails.
        with open(filename, "wb") as f:
            pickle.dump(self.lm, f, protocol=4)
            pickle.dump(self.vect1, f, protocol=4)
            pickle.dump(self.selector, f, protocol=4)
            pickle.dump(self.output_dim, f, protocol=4)

    def load(self, filename):
        """Restore state written by ``save`` (objects are read in the same order).

        WARNING: unpickling can execute arbitrary code; only load files
        from a trusted source.
        """
        with open(filename, "rb") as f:
            self.lm = pickle.load(f)
            self.vect1 = pickle.load(f)
            self.selector = pickle.load(f)
            self.output_dim = pickle.load(f)