def test_ovr_partial_fit_exceptions():
    """partial_fit must reject mini-batch labels outside the declared classes."""
    clf = OneVsRestClassifier(MultinomialNB())
    features = np.abs(np.random.randn(14, 2))
    labels = [1, 1, 1, 1, 2, 3, 3, 0, 0, 2, 3, 1, 2, 3]
    clf.partial_fit(features[:7], labels[:7], np.unique(labels))
    # Prepend a label (5) that was never declared in the first call of
    # partial_fit -- the second call must raise ValueError.
    bad_labels = [5] + labels[7:-1]
    expected = (r"Mini-batch contains \[.+\] while classes must be "
                r"subset of \[.+\]")
    with pytest.raises(ValueError, match=expected):
        clf.partial_fit(X=features[7:], y=bad_labels)
def test_ovr_partial_fit_exceptions():
    """partial_fit must raise ValueError on labels missing from ``classes``."""
    ovr = OneVsRestClassifier(MultinomialNB())
    X = np.abs(np.random.randn(14, 2))
    y = [1, 1, 1, 1, 2, 3, 3, 0, 0, 2, 3, 1, 2, 3]
    ovr.partial_fit(X[:7], y[:7], np.unique(y))
    # A new class value which was not in the first call of partial_fit
    # should raise ValueError.
    # FIX: assert_raises_regexp is a deprecated nose-era helper (removed from
    # sklearn.utils.testing); use the pytest.raises context manager instead.
    y1 = [5] + y[7:-1]
    msg = r"Mini-batch contains \[.+\] while classes must be subset of \[.+\]"
    with pytest.raises(ValueError, match=msg):
        ovr.partial_fit(X=X[7:], y=y1)
def test_ovr_partial_fit():
    # Test if partial_fit is working as intended (typo "intented" fixed).
    X, y = shuffle(iris.data, iris.target, random_state=0)
    ovr = OneVsRestClassifier(MultinomialNB())
    ovr.partial_fit(X[:100], y[:100], np.unique(y))
    ovr.partial_fit(X[100:], y[100:])
    pred = ovr.predict(X)
    # Two partial_fit calls covering all data must match a single fit.
    ovr2 = OneVsRestClassifier(MultinomialNB())
    pred2 = ovr2.fit(X, y).predict(X)
    assert_almost_equal(pred, pred2)
    # FIX: assert_equal / assert_greater / assert_false are deprecated
    # nose-era helpers (removed from sklearn.utils.testing); use bare asserts.
    assert len(ovr.estimators_) == len(np.unique(y))
    assert np.mean(y == pred) > 0.65

    # Test when mini-batches don't have all classes, with SGDClassifier.
    X = np.abs(np.random.randn(14, 2))
    y = [1, 1, 1, 1, 2, 3, 3, 0, 0, 2, 3, 1, 2, 3]
    ovr = OneVsRestClassifier(
        SGDClassifier(max_iter=1, tol=None, shuffle=False, random_state=0))
    ovr.partial_fit(X[:7], y[:7], np.unique(y))
    ovr.partial_fit(X[7:], y[7:])
    pred = ovr.predict(X)
    ovr1 = OneVsRestClassifier(
        SGDClassifier(max_iter=1, tol=None, shuffle=False, random_state=0))
    pred1 = ovr1.fit(X, y).predict(X)
    assert np.mean(pred == y) == np.mean(pred1 == y)

    # partial_fit is only exposed when the base estimator supports it.
    ovr = OneVsRestClassifier(SVC())
    assert not hasattr(ovr, "partial_fit")
def test_ovr_partial_fit():
    # Test if partial_fit is working as intended (typo "intented" fixed).
    X, y = shuffle(iris.data, iris.target, random_state=0)
    ovr = OneVsRestClassifier(MultinomialNB())
    ovr.partial_fit(X[:100], y[:100], np.unique(y))
    ovr.partial_fit(X[100:], y[100:])
    pred = ovr.predict(X)
    ovr2 = OneVsRestClassifier(MultinomialNB())
    pred2 = ovr2.fit(X, y).predict(X)
    assert_almost_equal(pred, pred2)
    # FIX: assert_equal / assert_greater are deprecated nose-era helpers
    # (removed from sklearn.utils.testing); use bare asserts instead.
    assert len(ovr.estimators_) == len(np.unique(y))
    assert np.mean(y == pred) > 0.65

    # Test when mini-batches don't have all classes: the first 60 iris rows
    # contain only a subset of the three classes.
    ovr = OneVsRestClassifier(MultinomialNB())
    ovr.partial_fit(iris.data[:60], iris.target[:60], np.unique(iris.target))
    ovr.partial_fit(iris.data[60:], iris.target[60:])
    pred = ovr.predict(iris.data)
    ovr2 = OneVsRestClassifier(MultinomialNB())
    pred2 = ovr2.fit(iris.data, iris.target).predict(iris.data)
    assert_almost_equal(pred, pred2)
    assert len(ovr.estimators_) == len(np.unique(iris.target))
    assert np.mean(iris.target == pred) > 0.65
def test_ovr_partial_fit():
    # Test if partial_fit is working as intended
    X, y = shuffle(iris.data, iris.target, random_state=0)
    ovr = OneVsRestClassifier(MultinomialNB())
    ovr.partial_fit(X[:100], y[:100], np.unique(y))
    ovr.partial_fit(X[100:], y[100:])
    pred = ovr.predict(X)
    ovr2 = OneVsRestClassifier(MultinomialNB())
    pred2 = ovr2.fit(X, y).predict(X)
    assert_almost_equal(pred, pred2)
    # FIX: assert_equal / assert_greater / assert_false are deprecated
    # nose-era helpers (removed from sklearn.utils.testing); use bare asserts.
    assert len(ovr.estimators_) == len(np.unique(y))
    assert np.mean(y == pred) > 0.65

    # Test when mini-batches don't have all classes, with SGDClassifier.
    X = np.abs(np.random.randn(14, 2))
    y = [1, 1, 1, 1, 2, 3, 3, 0, 0, 2, 3, 1, 2, 3]
    ovr = OneVsRestClassifier(SGDClassifier(max_iter=1, tol=None,
                                            shuffle=False, random_state=0))
    ovr.partial_fit(X[:7], y[:7], np.unique(y))
    ovr.partial_fit(X[7:], y[7:])
    pred = ovr.predict(X)
    ovr1 = OneVsRestClassifier(SGDClassifier(max_iter=1, tol=None,
                                             shuffle=False, random_state=0))
    pred1 = ovr1.fit(X, y).predict(X)
    assert np.mean(pred == y) == np.mean(pred1 == y)

    # partial_fit is only exposed when the base estimator supports it.
    ovr = OneVsRestClassifier(SVC())
    assert not hasattr(ovr, "partial_fit")
# Incrementally train a one-vs-rest SGD classifier from batch generators,
# scoring each step against a validation-batch generator.
G = myGeneratorTrain()
G1 = myGeneratorVal()
clf = OneVsRestClassifier(
    SGDClassifier(loss='log', penalty='l1', max_iter=1000, n_jobs=-1),
    n_jobs=-2)
for i in range(5):  # passes over the generator
    print("i=", i)
    for j in range(int(num_batches * splits)):
        # FIX: generator.next() is Python 2 only and raises AttributeError on
        # Python 3; the next() builtin works on both.
        K = next(G)
        yt = K[1]
        # Stack the first 8 feature groups of the batch side by side.
        xt = np.concatenate([K[0][m] for m in range(8)], axis=1)
        # Labels are presumably 1-based, hence the -1 shift on the class
        # list -- TODO confirm against the generator's label convention.
        clf.partial_fit(xt, yt, classes=np.unique(Y) - 1)
        K1 = next(G1)
        yv = K1[1]
        xv = np.concatenate([K1[0][m] for m in range(8)], axis=1)
        print(clf.score(xv, yv))

# NOTE(review): sklearn.externals.joblib is deprecated and removed in recent
# scikit-learn releases; switch to `import joblib` once the standalone
# dependency is declared.
from sklearn.externals import joblib
joblib.dump(clf, 'sgdClassifier.pkl')
np.savez('nnIndices2.npz', trainIdx=trainIdx, testIdx=testIdx)
# model.evaluate([X_test[:, 0:1000], X_test[:, 1000:2000], X_test[:, 2000:3000]],
#                np_utils.to_categorical(Y[testIdx] - 1, 1000))
class MultilabelTraining:
    """Trains a one-vs-rest multilabel classifier over theme-labelled text,
    either in one shot (``train``) or incrementally from a CSV in chunks
    (``incremental_train``)."""

    X_COLUMN_NAME = "page_text_extract"
    DEFAULT_TARGET_THEMES = [
        5, 6, 26, 33, 139, 163, 232, 313, 339, 350, 406, 409, 555, 589,
        597, 634, 660, 695, 729, 766, 773, 793, 800, 810, 852, 895, 951, 975,
    ]
    # Bucket value assigned to every theme outside DEFAULT_TARGET_THEMES.
    OTHER_THEMES_VALUE = 4242

    def __init__(
        self,
        df=None,
        x_column_name=X_COLUMN_NAME,
        group_processes=True,
        classifier=None,
        vectorizer=None,
        target_themes=DEFAULT_TARGET_THEMES,
        other_themes_value=OTHER_THEMES_VALUE,
        remove_processes_without_theme=True,
        is_incremental_training=False,
        vocab_path="",
    ):
        # FIX: the old defaults (df=pd.DataFrame(), classifier=XGBClassifier(...),
        # vectorizer=HashingVectorizer(...)) were evaluated once at definition
        # time and shared across every instance -- a classic mutable-default
        # bug. Build fresh objects per instance instead; passing explicit
        # arguments behaves exactly as before.
        if df is None:
            df = pd.DataFrame()
        if classifier is None:
            classifier = XGBClassifier(max_depth=15, random_state=42, n_jobs=-1)
        if vectorizer is None:
            vectorizer = HashingVectorizer(n_features=2 ** 14)
        self.is_incremental_training = is_incremental_training
        self.vocab_path = vocab_path
        self.remove_processes_without_theme = remove_processes_without_theme
        self.mo_classifier = OneVsRestClassifier(classifier)
        self.classifier = classifier
        self.vectorizer = vectorizer
        self.target_themes = target_themes
        self.other_themes_value = other_themes_value
        self.group_processes = group_processes
        self.x_column_name = x_column_name
        self._initialize_dataframe(df)

    def _initialize_dataframe(self, df):
        """Preprocess a non-empty dataframe; an empty one is stored as-is."""
        if not df.empty:
            self.dp = DataframePreprocessing(
                df.copy(),
                group_processes=self.group_processes,
                x_column_name=self.x_column_name,
                target_themes=self.target_themes,
                other_themes_value=self.other_themes_value,
                is_incremental_training=self.is_incremental_training,
                remove_processes_without_theme=self.remove_processes_without_theme,
                vocab_path=self.vocab_path,
            )
            self.y_columns_names = self.dp.distinct_themes
            self.df = self.dp.processed_df
        else:
            self.df = df

    def _split(self, X, y):
        """Stratified 80/20 train/test split stored on the instance."""
        print("Splitting dataset...")
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, stratify=y, test_size=0.2, random_state=42
        )

    def _vectorize(self, X_train):
        """Vectorize raw text (HashingVectorizer is stateless, so
        fit_transform on each call is safe)."""
        print("Vectorizing...")
        return self.vectorizer.fit_transform(X_train)

    def train(self, split_df=False):
        """Fit the multilabel classifier on the preprocessed dataframe.

        When ``split_df`` is True, hold out a test set and return its
        multilabel metrics; otherwise return None.
        """
        print("Training...")
        self.X_train, self.y_train = (
            self.df[self.x_column_name],
            self.df[self.y_columns_names],
        )
        if split_df:
            self._split(self.X_train, self.y_train)
        vector = self._vectorize(self.X_train)
        self.mo_classifier.fit(vector, self.y_train)
        if split_df:
            vector_test = self._vectorize(self.X_test)
            self.y_pred = self.mo_classifier.predict(vector_test)
            metrics = get_multilabel_metrics(self.y_test, self.y_pred)
            return metrics
        return None

    def _update_dataframe(
        self, df, is_incremental_training=True, is_parquet=False, labels_freq=None
    ):
        """Re-run preprocessing on a fresh chunk of data.

        FIX: labels_freq defaulted to a shared mutable ``{}``; use a None
        sentinel instead (same effective default).
        """
        if labels_freq is None:
            labels_freq = {}
        self.dp = DataframePreprocessing(
            df.copy(),
            x_column_name=self.x_column_name,
            group_processes=self.group_processes,
            target_themes=self.target_themes,
            other_themes_value=self.other_themes_value,
            is_incremental_training=is_incremental_training,
            remove_processes_without_theme=self.remove_processes_without_theme,
            is_parquet=is_parquet,
            vocab_path=self.vocab_path,
            labels_freq=labels_freq,
        )
        self.df = self.dp.processed_df

    def incremental_train(self, df_path, nrows=5000):
        """Stream the CSV at ``df_path`` in chunks of ``nrows`` rows and
        partial_fit the classifier on each chunk."""
        print("Training incrementally...")
        columns_names = pd.read_csv(df_path, nrows=1).columns.tolist()
        skiprows = 1  # skip the header row on the first chunk
        classes, labels_freq = DataframePreprocessing(
            target_themes=self.target_themes
        ).get_unique_binarized_labels(df_path, "tema")
        while True:
            df = pd.read_csv(
                df_path,
                nrows=nrows,
                skiprows=skiprows,
                header=None,
                names=columns_names,
            )
            if df.empty:
                break
            self._update_dataframe(df, labels_freq=labels_freq)
            X_train, y_train = (
                self.df[self.x_column_name],
                self.df[self.target_themes + [self.other_themes_value]],
            )
            vector = self._vectorize(X_train)
            self.mo_classifier.partial_fit(vector, y_train, classes=classes)
            skiprows += nrows
            print("{} rows already trained\n".format(skiprows - 1))

    def predict(self):
        """Predict labels for the stored test split."""
        return self.mo_classifier.predict(self._vectorize(self.X_test).todense())

    def set_X_test(self, X):
        self.X_test = X

    def set_y_test(self, y):
        self.y_test = y

    def get_pickle(self):
        """Serialized classifier bytes (trusted-consumer use only)."""
        return pickle.dumps(self.mo_classifier)
class OneVsRestSGDClassifier(LabelClassifier):
    """Multilabel text classifier: FastText sentence embeddings fed into a
    one-vs-rest SGD (modified_huber) classifier. Supports offline training,
    online updates, and model persistence."""

    def __init__(self, f_dim=100, ft_iters=20, update_iters=100,
                 label_dict_path='data/labels.txt'):
        LabelClassifier.__init__(self, label_dict_path)
        self.f_dim = f_dim  # dimension of word feature vector
        self.ft_iters = ft_iters  # FastText training epochs
        self.update_iters = update_iters  # partial_fit repetitions per online update
        self.ft_model = FastText(min_count=1, size=self.f_dim)
        # Per-class weighting 0.4/0.6 biases each binary sub-problem toward
        # the positive class.
        self.clf = OneVsRestClassifier(
            SGDClassifier(loss='modified_huber',
                          class_weight={0: 0.4, 1: 0.6},
                          penalty='l2', warm_start=False, random_state=1))

    def init_fasttext(self, model_path=None, train_data=None):
        """
        If train_data is provided, train a new fasttext model; otherwise,
        load it from the given path.
        --------
        Parameter:
        model_path: fasttext model prefix
        train_data: a list of tokenized sentences. if not provided, will try
                    to load existing model from model_path
        Raises:
        TrainDataException: when neither train_data nor a loadable model_path
                            is given.
        """
        if not train_data and model_path and os.path.isfile(model_path):
            # === load existing model ===
            print('loading fasttext model from', model_path)
            self.ft_model = FastText.load(model_path)
        elif train_data:
            # === train fasttext model ===
            # If an entry is not already a list, split the sentence into a
            # list of words on commas/spaces.
            print('training fasttext model from scratch...')
            train_data = [re.split(',| ', r) if (not isinstance(r, list)) else r
                          for r in train_data]
            self.ft_model.build_vocab(train_data)
            self.ft_model.train(train_data, total_examples=len(train_data),
                                epochs=self.ft_iters)
            if model_path:
                self.ft_model.save(model_path, separately=[])
        else:
            # === no train data and no model path provided ===
            raise TrainDataException(
                'Error building fasttext model. No data/model provided.')

    def div_norm(self, x):
        """Return x scaled to unit L2 norm; zero vectors pass through unchanged."""
        norm_value = np.sqrt(np.sum(x**2))  # l2norm
        if norm_value > 0:
            return x * (1.0 / norm_value)
        else:
            return x

    def sentence_to_vec(self, words):
        """
        Generate a sentence embedding by averaging L2-normalized word
        embeddings.
        --------
        Parameter:
        words: a list of words or a string representation of a sentence
               (separated by space or ',')
        Return:
        sentence embedding vector of size f_dim (mean over the per-word
        embedding matrix)
        """
        if not isinstance(words, list):
            words = re.split(',| ', words)
        vecs = np.zeros((len(words), self.f_dim))
        for i, word in enumerate(words):
            v = self.ft_model.wv.get_vector(word)
            vecs[i] = self.div_norm(v)
        return np.mean(vecs, axis=0)

    def to_vec(self, data):
        """Batch computation of sentence embeddings; returns len(data) x f_dim."""
        vec = np.zeros((len(data), self.f_dim))
        for i, sentence in enumerate(data):
            vec[i] = self.sentence_to_vec(sentence)
        return vec

    def train(self, train_data, train_label):
        """
        Offline training of the SGD classifier.
        --------
        Parameters:
        train_data: a list of tokenized sentences. Each sentence is either a
                    string delimited by comma or space, or a list of words.
        train_label: a list of labels. Each label is a string delimited by
                     comma or space.
        Return:
        X: sentence embedding matrix (rows may exceed len(train_data): one
           dummy row is appended per class with no samples)
        Y: binary label matrix with the same number of rows as X
        NOTE: mutates the caller's train_data list in place by appending the
        dummy samples.
        """
        print('training multilabel classifier on %d samples...'
              % len(train_data))
        Y = np.zeros((len(train_label), len(self.labeldict)))
        for i, labels in enumerate(train_label):
            label_list = re.split(',| ', labels)
            for l in label_list:
                if l:
                    Y[i, self.labeldictR[l]] = 1
        # add dummy sample to classes that do not have samples, so every
        # one-vs-rest sub-classifier sees at least one positive example
        indices = np.where(np.sum(Y, axis=0) == 0)[0]
        Y_new = np.zeros((len(indices), Y.shape[1]))
        for i, id in enumerate(indices):
            train_data.append([self.labeldict[id]])
            Y_new[i, id] = 1
        Y = np.vstack((Y, Y_new))
        X = self.to_vec(train_data)
        self.clf.fit(X, Y)
        return X, Y

    def train_update(self, train_data, train_label):
        """
        Online training of the SGD classifier: repeats partial_fit on the
        same batch update_iters times.
        --------
        Parameters: see train()
        """
        Y = np.zeros((len(train_label), len(self.labeldict)))
        X = self.to_vec(train_data)
        for i, labels in enumerate(train_label):
            label_list = re.split(',| ', labels)
            for l in label_list:
                if l:
                    Y[i, self.labeldictR[l]] = 1
        for i in range(self.update_iters):
            self.clf.partial_fit(X, Y)
        return X, Y

    def classify(self, string):
        """
        Predict the labels of a tokenized sentence.
        --------
        Parameters:
        string: string delimited by comma or space, or a list of words
        Return:
        labels: a list of predicted labels
        """
        X = self.to_vec([string])
        Y = self.clf.predict(X)
        # print('class probability',self.clf.predict_proba(X) )
        labels = [self.labeldict[id] for id in np.nonzero(Y[0])[0]]
        return labels

    def save_clf(self, filename):
        """Pickle the fitted classifier to filename."""
        print('writing classification model to', filename, '...')
        with open(filename, 'wb') as f:
            pickle.dump(self.clf, f)

    def load_clf(self, filename):
        """Load a previously pickled classifier from filename."""
        print('loading classification model from', filename, '...')
        with open(filename, 'rb') as f:
            self.clf = pickle.load(f)
# Compare logistic-loss SGD classifiers: L1 vs L2 penalty, a one-vs-rest
# wrapper, and degree-5 polynomial features. Each block trains on the
# training split, predicts the test split, and prints its accuracy.

# L1-penalized logistic regression via SGD
classifier = SGDClassifier(loss='log', penalty='l1', alpha=0.001)
classifier.partial_fit(samples_train, target_train, classes=target)
target_pred = classifier.predict(samples_test)
accuracy = accuracy_score(target_test, target_pred)
print('the accuracy score is', accuracy, '\n')

# Same model with an L2 penalty
classifier_l2 = SGDClassifier(loss='log', penalty='l2', alpha=0.001)
classifier_l2.partial_fit(samples_train, target_train, classes=target)
target_pred_l2 = classifier_l2.predict(samples_test)
accuracy_l2 = accuracy_score(target_test, target_pred_l2)
print('the accuracy score is', accuracy_l2, '\n')

from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier

# L1 model wrapped in an explicit one-vs-rest scheme
classifier_l1_onevsrest = OneVsRestClassifier(
    SGDClassifier(loss='log', penalty='l1', alpha=0.001))
classifier_l1_onevsrest.partial_fit(samples_train, target_train, classes=target)
target_pred_l1_onevsrest = classifier_l1_onevsrest.predict(samples_test)
accuracy_l1_onevsrest = accuracy_score(target_test, target_pred_l1_onevsrest)
print('the accuracy score is', accuracy_l1_onevsrest, '\n')

from sklearn.preprocessing import PolynomialFeatures

# Expand the raw samples with degree-5 polynomial features and re-split
samples_poly = PolynomialFeatures(5)
poly = samples_poly.fit_transform(samples)
samples_train, samples_test, target_train, target_test = train_test_split(
    poly, target, test_size=0.2, random_state=0)
classifier_poly = SGDClassifier(loss='log', penalty='l1', alpha=0.001)
classifier_poly.partial_fit(samples_train, target_train, classes=target)
target_pred_poly = classifier_poly.predict(samples_test)
accuracy_poly = accuracy_score(target_test, target_pred_poly)
print('the accuracy score is', accuracy_poly, '\n')