def training(features, target, cycle):
    try:
        if cycle != 1:
            # Load the previously stored model and scaler
            model = repository.get(PassiveAggressiveClassifier.__name__, DIR_PATH)
            scaler = repository.get(StandardScaler.__name__, DIR_PATH)
        else:
            model = PassiveAggressiveClassifier(loss='squared_hinge')
            scaler = StandardScaler()
        # Add the new data to the existing scaler, because the scaling depends on the data
        scaler.partial_fit(features)
        # Scale the new features
        features = scaler.transform(features)
        # Perform online learning with the new features
        model.partial_fit(features, target, classes=np.array([0, 1]))
        # Store the model and scaler to their respective files for later use
        repository.create(model, PassiveAggressiveClassifier.__name__, DIR_PATH)
        repository.create(scaler, StandardScaler.__name__, DIR_PATH)
        return {'message': 'training successful'}
    except Exception as e:
        traceback.print_tb(e.__traceback__)
        return {'message': 'training failed'}
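# NOTE: `repository` above is an application-specific persistence helper, not a
# library API. A minimal sketch of an equivalent based on joblib (function and
# file naming here are assumptions, not from the original):
import os
from joblib import dump, load

def repository_create(obj, name, dir_path):
    # Persist a fitted estimator under <dir_path>/<name>.joblib
    dump(obj, os.path.join(dir_path, name + '.joblib'))

def repository_get(name, dir_path):
    # Reload a previously persisted estimator
    return load(os.path.join(dir_path, name + '.joblib'))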
def test_classifier_partial_fit():
    classes = np.unique(y)
    for data in (X, X_csr):
        clf = PassiveAggressiveClassifier(C=1.0, fit_intercept=True,
                                          random_state=0)
        for t in range(30):
            clf.partial_fit(data, y, classes)
        score = clf.score(data, y)
        assert_greater(score, 0.79)
def train_online_model(xtr, ytr, model=None):
    # Train classifier: full fit on the first call, incremental updates after
    t0 = time.time()
    if model is None:
        model = PassiveAggressiveClassifier()
        model.fit(xtr, ytr)
    else:
        model.partial_fit(xtr, ytr)
    print("Training took %.2f seconds" % (time.time() - t0))
    return model
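# Usage sketch for train_online_model (the batch names are hypothetical).
# After the initial fit, later calls may omit `classes` because partial_fit
# reuses the label set learned by fit:
model = train_online_model(X_batch1, y_batch1)           # first batch: full fit
model = train_online_model(X_batch2, y_batch2, model)    # later batches: incremental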
def trainOnline(train_sents, tags, batch_size=500):
    minibatch_iterators = iter_minibatches(train_sents, batch_size)
    hasher = FeatureHasher(n_features=5000)
    clf = PassiveAggressiveClassifier()
    for i, (trainFeatures, trainLabels) in enumerate(minibatch_iterators):
        # Hash the raw features into a fixed-width sparse matrix
        trainFeatures = hasher.transform(trainFeatures)
        clf.partial_fit(trainFeatures, trainLabels, tags)
        # Yield a pipeline snapshot after every minibatch
        yield Pipeline([('hasher', hasher), ('classifier', clf)])
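# `iter_minibatches` is not shown in the original. A minimal sketch of a
# compatible generator, assuming a hypothetical `extract_features` helper that
# maps one sentence to (feature_dict, label):
def iter_minibatches(sentences, batch_size):
    # Yield (features, labels) lists of at most batch_size items each
    features, labels = [], []
    for sent in sentences:
        feats, label = extract_features(sent)  # hypothetical helper
        features.append(feats)
        labels.append(label)
        if len(features) == batch_size:
            yield features, labels
            features, labels = [], []
    if features:
        # Flush the final, possibly smaller batch
        yield features, labels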
def test_classifier_partial_fit():
    classes = np.unique(y)
    for data in (X, X_csr):
        for average in (False, True):
            clf = PassiveAggressiveClassifier(random_state=0,
                                              average=average, max_iter=5)
            for t in range(30):
                clf.partial_fit(data, y, classes)
            score = clf.score(data, y)
            assert score > 0.79
            if average:
                assert hasattr(clf, 'average_coef_')
                assert hasattr(clf, 'average_intercept_')
                assert hasattr(clf, 'standard_intercept_')
                assert hasattr(clf, 'standard_coef_')
def test_classifier_partial_fit():
    classes = np.unique(y)
    for data in (X, X_csr):
        for average in (False, True):
            clf = PassiveAggressiveClassifier(
                C=1.0, fit_intercept=True, random_state=0,
                average=average, max_iter=5)
            for t in range(30):
                clf.partial_fit(data, y, classes)
            score = clf.score(data, y)
            assert_greater(score, 0.79)
            if average:
                assert hasattr(clf, 'average_coef_')
                assert hasattr(clf, 'average_intercept_')
                assert hasattr(clf, 'standard_intercept_')
                assert hasattr(clf, 'standard_coef_')
def test_classifier_partial_fit():
    classes = np.unique(y)
    for data in (X, X_csr):
        for average in (False, True):
            clf = PassiveAggressiveClassifier(C=1.0, fit_intercept=True,
                                              random_state=0, average=average)
            for t in range(30):
                # classes must be supplied on the first call to partial_fit
                clf.partial_fit(data, y, classes)
            score = clf.score(data, y)
            assert_greater(score, 0.79)
            if average:
                assert_true(hasattr(clf, 'average_coef_'))
                assert_true(hasattr(clf, 'average_intercept_'))
                assert_true(hasattr(clf, 'standard_intercept_'))
                assert_true(hasattr(clf, 'standard_coef_'))
def elongate(object_list):
    # Zero-pad a feature list to a fixed length of 10000
    return object_list + [0.0] * (10000 - len(object_list))

net = PassiveAggressiveClassifier(C=0.001)
# All 31 non-trivial combinations of five boolean flags, as comma-joined strings
classes = [
    ','.join([
        str(bool(i // 16)),
        str(bool(i // 8 % 2)),
        str(bool(i // 4 % 2)),
        str(bool(i // 2 % 2)),
        str(bool(i % 2))
    ]) for i in range(1, 32)
]
for i in range(number):
    print(f'{i} out of {number}')
    with open(f'./bank/{i}', 'rb') as f:
        data = pickle.load(f)
    # Build one padded feature vector and one joined label string per sample
    X = [
        elongate(flatten2list([j[k][0:3] + [0.0, 0.0, 0.0] for j in data['X'][0:2]]))
        for k in range(len(data['X'][0]))
    ]
    y = [
        ','.join([str(j[k]) for j in data['y']])
        for k in range(len(data['y'][0]))
    ]
    # Keep only samples whose label is not all-False
    mask = [label != 'False,False,False,False,False' for label in y]
    net.partial_fit([x for x, keep in zip(X, mask) if keep],
                    [label for label, keep in zip(y, mask) if keep],
                    classes=classes)
with open('./net.dump', 'wb') as f:
    pickle.dump(net, f)
def runLearner(printStages=True, useSelector=False, discreteHelpfulness=True,
               useRST=True, useFew=False):
    learner = PassiveAggressiveClassifier() if discreteHelpfulness else PassiveAggressiveRegressor()
    #bestwords = getBestWords(instances, num=1000)
    tfidvec = TfidfVectorizer(sublinear_tf=True, stop_words='english',
                              ngram_range=(1, 3), decode_error='replace')
    selector = SelectKBest(chi2, k=50000) if useSelector else None
    encoder = LabelEncoder() if discreteHelpfulness else None
    if discreteHelpfulness:
        classlabels = encoder.fit_transform(labels)
    newData = False
    count = 0
    if useRST:
        print('Getting RST data')
        nums, texts, ilabels = getPickledRSTSciKitDataLists(True) if newData else getRSTSciKitDataLists(True)
        random = RandomFeatureExtractor()
        lengthBaseline = LenFeatureExtractor()
        fullRST = FullPickledRSTFeatureExtractor(nums) if newData else FullTextRSTFeatureExtractor(nums)
        limitedRST = LimitedPickledRSTFeatureExtractor(nums) if newData else LimitedTextRSTFeatureExtractor(nums)
        vectorizer = FeatureUnion([('extra', limitedRST), ('tfid', tfidvec)])
        print('Fitting random features baseline')
        random.fit(texts)
        print('Fitting text length baseline')
        lengthBaseline.fit(texts)
        print('Fitting full RST features')
        fullRST.fit(texts)
        print('Fitting limited RST features')
        limitedRST.fit(texts)
        print('Fitting limited RST with tfidvec features')
        vectorizer.fit(texts)
        print('Fitting tfidvec features')
        tfidvec.fit(texts)

        # 80/20 train/test split
        split = int(0.8 * len(ilabels))
        trainData = (texts[:split], ilabels[:split])
        testData = (texts[split:], ilabels[split:])

        X, y = getAsSciKit(trainData[0], trainData[1], random, encoder, selector)
        learner.fit(X, y)
        X, y = getAsSciKit(testData[0], testData[1], random, encoder, selector)
        print('random features baseline trained on %d instances has accuracy %f' % (len(trainData[0]), learner.score(X, y)))

        dummy = DummyClassifier()
        X, y = getAsSciKit(trainData[0], trainData[1], random, encoder, selector)
        dummy.fit(X, y)
        X, y = getAsSciKit(testData[0], testData[1], random, encoder, selector)
        print('Dummy label distribution baseline trained on %d instances has accuracy %f' % (len(trainData[0]), dummy.score(X, y)))

        X, y = getAsSciKit(trainData[0], trainData[1], lengthBaseline, encoder, selector)
        learner.fit(X, y)
        X, y = getAsSciKit(testData[0], testData[1], lengthBaseline, encoder, selector)
        print('text length baseline trained on %d instances has accuracy %f' % (len(trainData[0]), learner.score(X, y)))

        X, y = getAsSciKit(trainData[0], trainData[1], fullRST, encoder, selector)
        learner.fit(X, y)
        X, y = getAsSciKit(testData[0], testData[1], fullRST, encoder, selector)
        print('Full RST learner trained on %d instances has accuracy %f' % (len(trainData[0]), learner.score(X, y)))

        X, y = getAsSciKit(trainData[0], trainData[1], limitedRST, encoder, selector)
        learner.fit(X, y)
        X, y = getAsSciKit(testData[0], testData[1], limitedRST, encoder, selector)
        print('Limited RST learner trained on %d instances has accuracy %f' % (len(trainData[0]), learner.score(X, y)))

        X, y = getAsSciKit(trainData[0], trainData[1], vectorizer, encoder, selector)
        learner.fit(X, y)
        X, y = getAsSciKit(testData[0], testData[1], vectorizer, encoder, selector)
        print('Limited RST with ngram learner trained on %d instances has accuracy %f' % (len(trainData[0]), learner.score(X, y)))

        X, y = getAsSciKit(trainData[0], trainData[1], tfidvec, encoder, selector)
        learner = learner.fit(X, y)
        X, y = getAsSciKit(testData[0], testData[1], tfidvec, encoder, selector)
        print('ngram learner trained on %d instances has accuracy %f' % (len(trainData[0]), learner.score(X, y)))
    else:
        vectorizer = tfidvec
        testData = None
        vocabGotten = False
        instances = ([], [])
        numVocab = 50000
        numTest = 50000
        numTrain = 100000
        maxTrainStages = 20
        # Stream the data: first fit the vocabulary, then hold out a test set,
        # then train incrementally in stages of numTrain instances each
        for text, label in getSciKitData(stateProgress=False,
                                         discreteLabels=discreteHelpfulness):
            if label != 'few' or useFew:
                instances[0].append(text)
                instances[1].append(label)
                if not vocabGotten and len(instances[0]) == numVocab:
                    if printStages:
                        print('Fitting vocabulary with %d instances' % numVocab)
                    vectorizer.fit(instances[0], None)
                    if selector is not None:
                        X, y = getSciKitInstance(instances[0], instances[1], vectorizer, encoder, None)
                        selector.fit(X, y)
                    vocabGotten = True
                    instances = ([], [])
                elif vocabGotten and testData is None and len(instances[0]) == numTest:
                    if printStages:
                        print('Getting test data with %d instances' % numTest)
                    testData = getSciKitInstance(instances[0], instances[1], vectorizer, encoder, selector)
                    instances = ([], [])
                elif vocabGotten and testData is not None and len(instances[0]) == numTrain:
                    X, y = getSciKitInstance(instances[0], instances[1], vectorizer, encoder, selector)
                    if discreteHelpfulness:
                        learner = learner.partial_fit(X, y, classes=classlabels)
                    else:
                        learner = learner.partial_fit(X, y)
                    instances = ([], [])
                    count = count + 1
                    if printStages:
                        print('Baseline trained on %d instances has accuracy %f' % (count * numTrain, learner.score(testData[0], testData[1])))
                elif count == maxTrainStages:
                    break
        print('Final learner trained on %d instances has accuracy %f' % (maxTrainStages * numTrain, learner.score(testData[0], testData[1])))
    print(newsgroups_data.target[:30])
    return features, labels


if __name__ == '__main__':
    features, labels = preProcess()
    train_x, test_x, train_y, test_y = train_test_split(
        features, labels, test_size=0.2, shuffle=False, random_state=42)
    clf = PassiveAggressiveClassifier(random_state=seed)
    # Feed the training set one sample at a time; all 20 class labels must be
    # declared up front because early samples will not cover every class
    for i in range(len(train_y)):
        X, y = train_x[i:i + 1], train_y[i:i + 1]
        clf.partial_fit(X, y, classes=np.arange(20))
    print("--------------------------PassiveAggressive--------------------------------")
    predpa = clf.predict(test_x)
    print("Accuracy score")
    print(accuracy_score(test_y, predpa))
    print("F1 score")
    print(f1_score(test_y, predpa, average='macro'))
    print("Recall")
    print(recall_score(test_y, predpa, average='macro'))
    print("Precision")
    print(precision_score(test_y, predpa, average='macro'))
def main():
    # Hashing vectorizer with 2^19 buckets.
    chunkSize = 300000
    n_buckets = 2**19
    vectorizer = HashingVectorizer(decode_error='ignore',
                                   n_features=n_buckets,
                                   non_negative=True)
    classifier = PassiveAggressiveClassifier()

    #JSONGenerator = readChunk("data/dataSampleFile", chunkSize)
    #JSONGenerator = readChunk("data/RC_2007-10", chunkSize)
    #JSONGenerator = readChunk("data/RC_2008-01", chunkSize)
    JSONGenerator = readChunk("data/RC_2008-12", chunkSize)
    #JSONGenerator = readChunk("data/RC_2009-12", chunkSize)
    #JSONGenerator = readChunk("data/RC_2012-01", chunkSize)

    # The first chunk is held out as the test set
    JSONArrayTestSet = next(JSONGenerator)
    X_test_text = []
    Y_test = []
    for JSONString in JSONArrayTestSet:
        JSONObject = json.loads(JSONString)
        # Don't care about deleted content.
        if JSONObject["body"] == "[deleted]":
            continue
        X_test_text.append(JSONObject["body"])
        Y_test.append(rangifyScore(int(JSONObject["score"])))
    X_test = vectorizer.transform(X_test_text)
    log("Start till MainLoop timer: " + str(time.time() - startTick))

    generatorTimeTick = time.time()
    # For loop for generators. Smart!
    for i, JSONArray in enumerate(JSONGenerator):
        log("readChunkTimer: " + str(time.time() - generatorTimeTick))
        X_train_text = []
        Y_train = []
        extractFeatureTimeTick = time.time()
        for JSONString in JSONArray:
            JSONObject = json.loads(JSONString)
            # Don't care about deleted content.
            if JSONObject["body"] == "[deleted]":
                continue
            X_train_text.append(JSONObject["body"])
            Y_train.append(rangifyScore(int(JSONObject["score"])))
        log("Feature Extract timer: " + str(time.time() - extractFeatureTimeTick))

        tick = time.time()
        X_train = vectorizer.transform(X_train_text)
        log("Vectorize timer:" + str(time.time() - tick))

        tick = time.time()
        classifier.partial_fit(X_train, Y_train, classes=list(range(41)))
        log("Partial fit timer:" + str(time.time() - tick))
        generatorTimeTick = time.time()

    log("Total Time: " + str(time.time() - startTick))
    print(classifier.score(X_test, Y_test))
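# `rangifyScore` is not shown in the original. Given classes=list(range(41)),
# it presumably buckets raw comment scores into 41 bins; a hypothetical version
# that clips scores to [-20, 20] and shifts them into [0, 40]:
def rangifyScore(score):
    # Clip to [-20, 20], then shift into the class range [0, 40]
    return max(-20, min(20, score)) + 20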
def test_partial_fit_weight_class_balanced():
    # partial_fit with class_weight='balanced' not supported
    clf = PassiveAggressiveClassifier(class_weight="balanced", max_iter=100)
    with pytest.raises(ValueError):
        clf.partial_fit(X, y, classes=np.unique(y))
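# Only the string 'balanced' is rejected by partial_fit (it would need the full
# label distribution up front, which a stream cannot provide); an explicit
# per-class weight dict still works. The weights below are made-up values:
clf = PassiveAggressiveClassifier(class_weight={0: 1.0, 1: 2.5}, max_iter=100)
clf.partial_fit(X, y, classes=np.unique(y))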
w_u = []
p_u = []
beta1 = 0.9
beta2 = 0.95
# num_errors counts prediction errors of the combined multi-source model;
# num_errors_pa counts errors of the classifier built incrementally on the
# target domain alone
num_errors = 0
num_errors_pa = 0
for s in range(num_source):
    tmp = 1 / (2 * num_source)
    w_u.append(tmp)
    p_u.append(tmp)

# Warm-start the PA classifier on the first 100 target-domain samples
pa.partial_fit(target_train_vector[0:100], target_train.target[0:100],
               list(classes))
for i in range(100, train_size - batch_size, batch_size):
    target_inst_vector = vectorizer.transform(target_train)
    print("===================round " + str(i) + " error rate=====================")
    if i + batch_size >= train_size:
        break
    train_inst_tmp = []
    train_inst_vector = []
    class_tmp = []
    # Normalize the source and target weights into probabilities
    sum_us = sum(w_u)
    p_v = w_t / (sum_us + w_t)
    for k in range(num_source):
        p_u[k] = w_u[k] / (sum_us + w_t)
fp = 0
fn = 0
tn = 0
bintargets = [1, -1]
print("prediction", pred, score)
#if abs(score) < 1.0 or pred != g_it.target[i]:
if True:
    # Reweight the two classes from the labels seen so far
    ClassWeight = compute_class_weight('auto', np.array([1, -1]), bintargets)
    print("class weights", {1: ClassWeight[0], -1: ClassWeight[1]})
    PassiveAggressive.class_weight = {
        1: ClassWeight[0],
        -1: ClassWeight[1]
    }
    # The third parameter (classes) is compulsory just for the first call
    PassiveAggressive.partial_fit(exampleESN, np.array([g_it.target[i]]),
                                  np.unique(g_it.target))
# Compute statistics
print("BER AVG", sum(BERtotal) / float(len(BERtotal)))
print("BER AVG " + str(sum(BERtotal) / float(len(BERtotal))), file=f)
f.close()
class PassiveAggressive:
    __classes = []
    __savefile_pas = ""
    __pas = None

    def __init__(self, save_folder, classes):
        """
        :param save_folder: directory in which the model file is stored
        :param classes: full list of class labels, required by partial_fit
        """
        self.__classes = classes
        self.__savefile_pas = save_folder + "pas.joblib"
        self.__pas = PassiveAggressiveClassifier()

    def train(self, X, y):
        """
        :param X: feature matrix
        :param y: target labels
        :return:
        """
        self.__pas.partial_fit(X, y, classes=self.__classes)
        self.__save_model()

    def predict(self, X):
        """
        :param X: feature matrix
        :return: predicted labels
        """
        if self.__pas is None:
            self.__load_model()
        return self.__pas.predict(X)

    def update(self, X, y):
        """
        :param X: feature matrix
        :param y: target labels
        :return:
        """
        self.train(X, y)
        # If no longer using train(), add save_model()

    def __save_model(self):
        """Persist the classifier to disk."""
        dump(self.__pas, self.__savefile_pas)

    def __load_model(self):
        """Load a previously saved classifier, if one exists."""
        if os.path.exists(self.__savefile_pas):
            self.__pas = load(self.__savefile_pas)
        else:
            print("ERROR: PassiveAggressive not initialized. Run train() first.")
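# Hypothetical usage of the wrapper above (the folder, labels and data are
# assumptions, not from the original; the save folder must already exist):
import numpy as np

model = PassiveAggressive(save_folder="./models/", classes=[0, 1])
model.train(np.array([[0.0, 1.0], [1.0, 0.0]]), np.array([0, 1]))  # first batch
model.update(np.array([[0.5, 0.5]]), np.array([1]))                # incremental update
print(model.predict(np.array([[0.9, 0.1]])))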
if __name__ == "__main__":
    MD = PassiveAggressiveClassifier(max_iter=1000, loss='squared_hinge',
                                     average=10, n_jobs=-1)
    X, y, all_classes = get_data('traindata500k.csv')
    # word = dict(X)
    # doc = transfer(X)
    # tfidf_compute(word, doc)
    # Train/test proportions are drawn at random and may sum to less than 1,
    # but each split keeps the same per-class proportions
    sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1)
    sss.get_n_splits(X, y)
    i = 0
    for train_index, test_index in sss.split(X, y):
        print("{} time".format(i))
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        MD.partial_fit(sp.get_hv(X_train), y_train, classes=all_classes)
        result = MD.predict(sp.get_hv(X_test))
        print("Accuracy: %.4g" % metrics.accuracy_score(y_test, result),
              "Recall: %.4g" % metrics.recall_score(y_test, result, average='macro'),
              "F1: %.4g" % metrics.f1_score(y_test, result, average='weighted'))
        i += 1
    #joblib.dump(MD, "D:\\PycharmProjects\\Tax\\321_2**15.pkl.gz", compress=('gzip', 3))
# Create the model
pac = PassiveAggressiveClassifier(C=0.05, loss='squared_hinge',
                                  max_iter=2000, random_state=1000)

# Train with the start-up samples
nb_initial_samples = int(X_train.shape[0] / 1.5)
pac.fit(X_train[0:nb_initial_samples], Y_train[0:nb_initial_samples])

# Continue with the incremental samples, scoring after each update
validation_accuracies = []
for (x, y) in zip(X_train[nb_initial_samples:], Y_train[nb_initial_samples:]):
    pac.partial_fit(x.reshape(1, -1), y.ravel(),
                    classes=np.unique(iris['target']))
    validation_accuracies.append(pac.score(X_test, Y_test))

# Show the validation plot
fig, ax = plt.subplots(figsize=(18, 8))
ax.plot(validation_accuracies)
ax.set_xlabel('Online sample')
ax.set_ylabel('Validation accuracy')
ax.grid()
plt.show()
class benchmark:
    def train(self, training_set, dataset_name, mode):
        # Uses self.training_dataset
        self.data = training_set
        if mode == 'ASGD':
            self.classifier = SGDClassifier(average=True, max_iter=1,
                                            penalty='elasticnet', l1_ratio=0.5)
            self.classifier2 = SGDClassifier(average=False, max_iter=1,
                                             penalty='elasticnet', l1_ratio=0.5)
        elif mode == 'Perceptron':
            self.classifier = Perceptron(max_iter=1)
            self.classifier2 = self.classifier
        elif mode == 'PA1':
            self.classifier = PassiveAggressiveClassifier(loss='hinge',
                                                          C=1.0, max_iter=1)
            self.classifier2 = self.classifier
        elif mode == 'PA2':
            self.classifier = PassiveAggressiveClassifier(loss='squared_hinge',
                                                          C=1.0, max_iter=1)
            self.classifier2 = self.classifier

        init = np.zeros(len(self.data[0]) - 1).reshape(1, -1)
        for i in range(0, parameters.rounds):
            train_error_vector = []
            iterations = 0
            train_error = 0
            copydata = copy.deepcopy(self.data)
            random.shuffle(copydata)
            self.data_preprocessor(
                preprocess2.removeDataTrapezoidal(copydata))  # or trapezoidal
            self.classifier = clone(self.classifier)
            self.classifier2 = clone(self.classifier2)
            # Seed both classifiers with a dummy example so coef_ exists
            self.classifier.partial_fit(init, [-self.y[0]], np.unique(self.y))
            self.classifier2.partial_fit(init, [-self.y[0]], np.unique(self.y))
            total_error_vector = np.zeros(len(self.y))
            #c = list(zip(self.X, self.y))
            #random.shuffle(c)
            #self.X, self.y = zip(*c)
            self.variance_vector = [np.ones(len(training_set[0]) - 1)]
            self.average_vector = [np.zeros(len(training_set[0]) - 1)]
            for i in range(0, len(self.y)):
                #self.classifier.densify()
                row = [self.X[i]]
                label = self.y[i]
                iterations = i + 1
                # Predict with metadata-adjusted weights, then restore them
                old = self.classifier.coef_
                self.classifier.coef_ = self.update_metadata(iterations)
                result = self.classifier.predict(row)
                self.classifier.coef_ = old
                if result[0] != label:
                    train_error += 1
                self.classifier.partial_fit(row, [self.y[i]], np.unique(self.y))
                self.classifier2.partial_fit(row, [self.y[i]], np.unique(self.y))
                #self.classifier.sparsify()
                train_error_vector.append(train_error / iterations)
            total_error_vector = np.add(train_error_vector, total_error_vector)
        total_error_vector = np.divide(total_error_vector, parameters.rounds)
        misc.plotError(train_error_vector[0::50], dataset_name)
        return train_error_vector

    def update_p(self):
        return 0.5

    def update_metadata(self, i):
        classifier = self.classifier2.coef_
        average = self.average_vector
        # Running mean of the weight vector
        self.average_vector = np.divide(
            np.add(np.multiply(self.average_vector, i), classifier), i + 1)
        difference = np.subtract(classifier, average)
        current_variance = np.absolute(difference)
        # Running mean absolute deviation, used as a variance estimate
        self.variance_vector = np.divide(
            np.add(np.multiply(i, self.variance_vector), current_variance),
            i + 1)
        return np.divide(self.classifier.coef_,
                         30 * preprocessing.normalize(self.variance_vector))

    def data_preprocessor(self, dataset):
        # Find all keys, fill missing ones with 0
        all_keys = set().union(*(d.keys() for d in dataset))
        X = []
        y = []
        for row in dataset:
            for key in all_keys:
                if key not in row.keys():
                    row[key] = 0
            y.append(row['class_label'])
            del row['class_label']
        for row in dataset:
            X_row = []
            for i in range(0, len(row)):
                X_row.append(row[i])
            X.append(X_row)
        self.X = X
        self.y = y
sum_u = 0
for s in range(num_source):
    tmp = pTrue * pSource_res[s]
    sign_tmp = 0
    if tmp < 0:
        sign_tmp = 1
    # Down-weight a source domain whenever it disagrees with the true label
    w_u[s] = w_u[s] * beta2**sign_tmp
    sum_u = sum_u + w_u[s]
for s in range(num_source):
    w_u[s] = w_u[s] * w_s / sum_u

# loss = max(0, 1 - y * f(x))
loss = 0
if pLabel * pTrue < 0:
    loss = 2
    # loss = hinge_loss(pTrue, pLabel)
if loss > 0:
    pa.partial_fit(target_train_vector[i:i + batch_size], class_tmp,
                   list(classes))

# Linear combination of the source predictions and the target prediction
f_final = 0
for s in range(num_source):
    f_final = f_final + p_u[s] * pSource_res[s]
f_final = f_final + p_v * pLabel

# i = i + batch_size
if i + batch_size >= train_size:
    break