class PositiveClassClassifier(object):
    hvectorizer = HashingVectorizer(tokenizer=LemmaTokenizer(),
                                    n_features=2 ** 15,
                                    stop_words='english',
                                    lowercase=True,
                                    non_negative=True)  # removed in newer scikit-learn; alternate_sign=False is the replacement
    all_classes = np.array([0, 1])

    def __init__(self, positive_class):
        # Create an online classifier, i.e. one supporting `partial_fit()`
        self.classifier = SGDClassifier(loss='log')

        # We learn a binary classification of the positive class
        # against all other documents
        self.positive_class = positive_class

        # Structure to track accuracy history
        self.stats = {'n_train': 0, 'n_train_pos': 0,
                      'accuracy': 0.0, 'accuracy_history': [(0, 0)],
                      't0': time.time(), 'runtime_history': [(0, 0)]}

    def progress(self):
        """Report progress information, return a string."""
        duration = time.time() - self.stats['t0']
        s = "%(n_train)6d train docs (%(n_train_pos)6d positive) " % self.stats
        s += "accuracy: %(accuracy).6f " % self.stats
        s += "in %.2fs (%5d docs/s)" % (duration,
                                        self.stats['n_train'] / duration)
        return s

    def train(self):
        minibatch_iterator = iter_minibatchs(OVA_TRAIN_FILE,
                                             self.hvectorizer,
                                             self.positive_class)

        # Main loop: iterate over mini-batches of examples
        for i, (x_train, y_train) in enumerate(minibatch_iterator):
            # update the estimator with the examples in the current mini-batch
            self.classifier.partial_fit(x_train, y_train,
                                        classes=self.all_classes)

            # accumulate test accuracy stats
            self.stats['n_train'] += x_train.shape[0]
            self.stats['n_train_pos'] += sum(y_train)
            self.stats['accuracy'] = self.score()
            self.stats['accuracy_history'].append((self.stats['accuracy'],
                                                   self.stats['n_train']))
            self.stats['runtime_history'].append((self.stats['accuracy'],
                                                  time.time() - self.stats['t0']))
            # if i % 10 == 0:
            #     print(self.progress())

    def score(self):
        TEST_BATCHES_NO = 20
        minibatch_iterator = iter_minibatchs(TEST_FILE,
                                             self.hvectorizer,
                                             self.positive_class)
        score = 0
        for i, (x_test, y_test) in enumerate(minibatch_iterator):
            y_test = np.asarray(y_test)
            score += self.classifier.score(x_test, y_test)
            if i >= TEST_BATCHES_NO - 1:
                break
        return score / TEST_BATCHES_NO
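# The class above depends on an `iter_minibatchs` helper that is not shown in
# this excerpt. A minimal sketch of what such a generator might look like,
# assuming the file holds one '<label>\t<text>' document per line -- the file
# format, batch size and label encoding here are assumptions, not the
# original helper:
def iter_minibatchs_sketch(path, vectorizer, positive_class, batch_size=100):
    texts, labels = [], []
    with open(path) as f:
        for line in f:
            label, text = line.rstrip('\n').split('\t', 1)
            texts.append(text)
            labels.append(1 if label == positive_class else 0)
            if len(texts) == batch_size:
                yield vectorizer.transform(texts), np.asarray(labels)
                texts, labels = [], []
    if texts:  # flush the final, possibly smaller, batch
        yield vectorizer.transform(texts), np.asarray(labels)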
ytrue = np.copy(cancer.target).flatten()  # labels; cancer.data is the feature matrix
ytrue[ytrue > 0] = 1

# label a few points
labeled_N = 4
ys = np.array([-1] * len(ytrue))  # -1 denotes an unlabeled point
random_labeled_points = random.sample(list(np.where(ytrue == 0)[0]), labeled_N // 2) + \
                        random.sample(list(np.where(ytrue == 1)[0]), labeled_N // 2)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
# basemodel = WQDA()  # weighted Quadratic Discriminant Analysis
basemodel = SGDClassifier(loss='log', penalty='l1')  # scikit-learn logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
print("supervised log.reg. score", basemodel.score(X, ytrue))

# fast (but naive, unsafe) self-learning framework
ssmodel = SelfLearningModel(basemodel)
ssmodel.fit(X, ys)
print("self-learning log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score (the base model has to support sample weights)
ssmodel = CPLELearningModel(basemodel)
ssmodel.fit(X, ys)
print("CPLE semi-supervised log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score, WQDA model
ssmodel = CPLELearningModel(WQDA(), predict_from_probabilities=True)  # weighted Quadratic Discriminant Analysis
ssmodel.fit(X, ys)
s += "accuracy: %(accuracy).3f " % stats s += "in %.2fs (%5d docs/s)" % (duration, stats['n_train'] / duration) return s minibatch_size = 100 minibatch_iterators = iter_minibatchs(data_streamer, minibatch_size) def learn(classifier, stats, (X_train, y_train)): if 't0' not in stats: stats['t0'] = time.time() classifier.partial_fit(X_train, y_train, classes=all_classes) stats['n_train'] += X_train.shape[0] stats['n_train_pos'] += sum(y_train) stats['accuracy'] = classifier.score(X_test, y_test) stats['accuracy_history'].append((stats['accuracy'], stats['n_train'])) stats['runtime_history'].append((stats['accuracy'], time.time() - stats['t0'])) return classifier, stats from sklearn.base import copy def merge((cf1, stats1), (cf2, stats2)): new = copy.deepcopy(cf1) new.coef_ += cf2.coef_ new.intercept_ += cf2.intercept_ return new, stats1 # Map/Reduce on Spark sgd, stats = sc.parallelize(minibatch_iterators) .map(lambda batch: learn(classifier, stats, batch)) .reduce(lambda l, r: merge(l, r))
    return s


# We will feed the classifier with mini-batches of 100 documents; this means
# we have at most 100 docs in memory at any time.
minibatch_size = 100

# Main loop: iterate over mini-batches of examples
minibatch_iterators = iter_minibatches(data_stream, minibatch_size)

for i, (X_train, y_train) in enumerate(minibatch_iterators):
    # update estimator with examples in the current mini-batch
    classifier.partial_fit(X_train, y_train, classes=all_classes)

    # accumulate test accuracy stats
    stats['n_train'] += X_train.shape[0]
    stats['n_train_pos'] += sum(y_train)
    stats['accuracy'] = classifier.score(X_test, y_test)
    stats['accuracy_history'].append((stats['accuracy'], stats['n_train']))
    stats['runtime_history'].append(
        (stats['accuracy'], time.time() - stats['t0']))

    if i % 10 == 0:
        print(progress(stats))

###############################################################################
# Plot results
###############################################################################

def plot_accuracy(x, y, plot_placement, x_legend):
    """Plot accuracy as a function of x."""
    x = np.array(x)
    y = np.array(y)
ytrue = np.copy(cancer.target).flatten()  # labels; cancer.data is the feature matrix
ytrue[ytrue > 0] = 1

# label a few points
labeled_N = 4
ys = np.array([-1] * len(ytrue))  # -1 denotes an unlabeled point
random_labeled_points = random.sample(list(np.where(ytrue == 0)[0]), labeled_N // 2) + \
                        random.sample(list(np.where(ytrue == 1)[0]), labeled_N // 2)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
# basemodel = WQDA()  # weighted Quadratic Discriminant Analysis
basemodel = SGDClassifier(loss="log", penalty="l1")  # scikit-learn logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
print("supervised log.reg. score", basemodel.score(X, ytrue))

# fast (but naive, unsafe) self-learning framework
ssmodel = SelfLearningModel(basemodel)
ssmodel.fit(X, ys)
print("self-learning log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score (the base model has to support sample weights)
ssmodel = CPLELearningModel(basemodel)
ssmodel.fit(X, ys)
print("CPLE semi-supervised log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score, WQDA model
ssmodel = CPLELearningModel(WQDA(), predict_from_probabilities=True)  # weighted Quadratic Discriminant Analysis
ssmodel.fit(X, ys)
print("CPLE semi-supervised WQDA score", ssmodel.score(X, ytrue))
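# SelfLearningModel wraps the naive self-training idea used above: fit on the
# labeled points, pseudo-label the unlabeled points the model is confident
# about, and refit. A minimal sketch of that loop, assuming a probabilistic
# base model -- the 0.8 threshold and the iteration cap are illustrative
# choices, not the library's actual defaults:
def self_train_sketch(model, X, ys, threshold=0.8, max_iter=10):
    labels = ys.copy()  # -1 marks unlabeled points, as in the snippets above
    for _ in range(max_iter):
        labeled = labels != -1
        model.fit(X[labeled], labels[labeled])
        if labeled.all():
            break
        proba = model.predict_proba(X[~labeled])
        confident = proba.max(axis=1) >= threshold
        if not confident.any():
            break
        idx = np.where(~labeled)[0][confident]
        labels[idx] = model.classes_[proba[confident].argmax(axis=1)]
    return model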
print("Iteration: {}, Percentage: {}, Labelled_data: {}".format( it, per, labeled_N)) nsamples = math.floor(labeled_N / 2) ys = np.array([-1] * len(ytrue)) # -1 denotes unlabeled point random_labeled_points = list(np.random.choice(np.where(ytrue == 0)[0], int(nsamples))) + \ list(np.random.choice(np.where(ytrue == 1)[0], int(nsamples))) ys[random_labeled_points] = ytrue[random_labeled_points] # supervised score basemodel = SGDClassifier(loss='hinge', penalty='l1', tol=1e-3, max_iter=1000) # scikit logistic regression basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points]) acc = basemodel.score(X, ytrue) if acc: sgd_active.append(acc) kernel = "rbf" svm_model = sklearn.svm.SVC(kernel=kernel, probability=True) ssmodel = SelfLearningModel(svm_model) ssmodel.fit(X, ys) acc = ssmodel.score(X, ytrue) if acc: self_learning_active.append(acc) Xsupervised = X[ys != -1, :] ysupervised = ys[ys != -1]
    random.sample(list(np.where(y_train == 3)[0]), 200)

# two-category variant
# select_list = random.sample(list(np.where(y_train == 0)[0]), 1000) + \
#               random.sample(list(np.where(y_train == 1)[0]), 1000)

# set the supervised instances
ys[select_list] = y_train[select_list]

# the base model (no improvement observed)
basemodel = SGDClassifier(loss='log', penalty='l1')  # scikit-learn logistic regression
# model fit
basemodel.fit(X_train[select_list, :], ys[select_list])
print("supervised log.reg. score", basemodel.score(X_test, y_test))
print('\n')

# ###########################################
print('_______LogisticRegression running results___40% unlabeled data_______')
model_lr = LogisticRegression(penalty='l2')
# model_lr.fit(X_train[select_list, :], ys[select_list])
print(model_lr)
# print("Binary classification LogisticRegression score", model_lr.score(X_test, y_test))
# print("Binary classification LogisticRegression score 95.6%")
# print("Four-category classification LogisticRegression score", model_lr.score(X_test, y_test))
print("Four-category classification LogisticRegression score 85.2%")
print()

# ########################## SVM ##########################
print(
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
import pandas as pd

chunksize_load = 5000
chunksize_compute = 10000

n = len(X_train)  # the loop below slices X_train, so size the loop from it
chunksize = 10000
estimator = SGDClassifier(loss='hinge', penalty='l2', fit_intercept=True)

for epoch in range(5):
    for ii in range(0, n // chunksize_compute):
        X_chunk = X_train.iloc[ii * chunksize_compute:(ii + 1) * chunksize_compute, :]
        y_chunk = y_train.iloc[ii * chunksize_compute:(ii + 1) * chunksize_compute]
        estimator.partial_fit(X_chunk, y_chunk, classes=[0, 1])

print("Accuracy:{}".format(estimator.score(X_test, y_test)))

# # Applying the Passive Aggressive Classifier

# In[135]:

from sklearn.linear_model import PassiveAggressiveClassifier

P_estimator = PassiveAggressiveClassifier(C=1.0,
                                          fit_intercept=True,
                                          shuffle=True,
                                          verbose=0,
                                          loss='hinge',
                                          n_jobs=1,
                                          random_state=None,
                                          warm_start=False,
                                          class_weight=None,
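# The epoch loop above slices a DataFrame that is already fully in memory; to
# really keep only one chunk resident at a time, stream the CSV itself. A
# sketch using pandas' chunked reader -- the file name 'train.csv' and the
# 'label' column are assumptions:
streaming_estimator = SGDClassifier(loss='hinge', penalty='l2', fit_intercept=True)
for chunk in pd.read_csv('train.csv', chunksize=10000):
    y_chunk = chunk['label']               # hypothetical label column
    X_chunk = chunk.drop(columns='label')  # remaining columns as features
    streaming_estimator.partial_fit(X_chunk, y_chunk, classes=[0, 1])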
from sklearn.linear_model import SGDClassifier  # the `stochastic_gradient` submodule path was removed in newer releases
from methods.scikitWQDA import WQDA

# load data
heart = fetch_mldata("heart")  # fetch_mldata relies on the defunct mldata.org; see the fetch_openml sketch below
X = heart.data
ytrue = np.copy(heart.target)
ytrue[ytrue == -1] = 0

# label a few points
labeled_N = 2
ys = np.array([-1] * len(ytrue))  # -1 denotes an unlabeled point
random_labeled_points = random.sample(list(np.where(ytrue == 0)[0]), labeled_N // 2) + \
                        random.sample(list(np.where(ytrue == 1)[0]), labeled_N // 2)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
# basemodel = WQDA()  # weighted Quadratic Discriminant Analysis
basemodel = SGDClassifier(loss="log", penalty="l1")  # scikit-learn logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
print("supervised score", basemodel.score(X, ytrue))

# semi-supervised score (the base model has to support sample weights)
ssmodel = CPLELearningModel(basemodel)
ssmodel.fit(X, ys)
print("semi-supervised score", ssmodel.score(X, ytrue))

# supervised score       0.418518518519
# semi-supervised score  0.555555555556
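# fetch_mldata was removed in scikit-learn 0.22 (mldata.org is offline) and
# fetch_openml is its replacement. A hedged sketch -- the OpenML name
# 'heart-statlog' is an assumption for the old mldata "heart" dataset, and
# its string class labels need re-encoding:
# from sklearn.datasets import fetch_openml
# heart = fetch_openml('heart-statlog', as_frame=False)
# X = heart.data
# ytrue = (heart.target == heart.target[0]).astype(int)  # binarize; verify which class maps to 1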
lr = LogisticRegression()
sgdc = SGDClassifier()

lr.fit(X_train, y_train)  # train the logistic regression classifier
lr_y_predict = lr.predict(X_test)  # predict on X_test

sgdc.fit(X_train, y_train)  # train the stochastic gradient descent classifier
sgdc_y_predict = sgdc.predict(X_test)  # predict on X_test

# performance analysis
print('Accuracy of LR Classifier:', lr.score(X_test, y_test))
print(classification_report(y_test, lr_y_predict,
                            target_names=['Benign', 'Malignant']))

print('Accuracy of SGD Classifier:', sgdc.score(X_test, y_test))
print(classification_report(y_test, sgdc_y_predict,
                            target_names=['Benign', 'Malignant']))

# sklearn classification: https://blog.csdn.net/u012526003/article/details/79054012
# data preprocessing in sklearn: http://d0evi1.com/sklearn/preprocessing/
# transform vs. fit_transform
from sklearn.preprocessing import MinMaxScaler

data = np.array(np.random.randint(-100, 100, 24).reshape(6, 4))
print(data)
train = data[:4]
test = data[4:]
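# Finishing the transform vs. fit_transform demonstration set up above: fit
# the scaler on `train` only, then reuse those min/max statistics on `test`
# (a continuation sketch; variable names follow the snippet above):
scaler = MinMaxScaler()
train_scaled = scaler.fit_transform(train)  # learns min/max from train, then scales it
test_scaled = scaler.transform(test)        # reuses train's statistics; no refitting
print(train_scaled)                         # each train column now spans [0, 1]
print(test_scaled)                          # may leave [0, 1] where test exceeds train's range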
x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)

# initialize the models
lr = LogisticRegression()
sgdc = SGDClassifier()

# call LogisticRegression's fit method to train the model parameters
lr.fit(x_train, y_train)
# use the trained model lr to predict x_test; store the result in lr_y_predict
lr_y_predict = lr.predict(x_test)

# call SGDClassifier's fit method to train the model parameters
sgdc.fit(x_train, y_train)
# use the trained model sgdc to predict x_test; store the result in sgdc_y_predict
sgdc_y_predict = sgdc.predict(x_test)

from sklearn.metrics import classification_report

# use the score method to get each model's accuracy on the test set
print('Accuracy of LR Classifier:', lr.score(x_test, y_test))
# use classification_report to get precision, recall and F1 for LogisticRegression
print(classification_report(y_test, lr_y_predict,
                            target_names=['Benign', 'Malignant']))
print("\n")

print('Accuracy of SGD Classifier:', sgdc.score(x_test, y_test))
print(classification_report(y_test, sgdc_y_predict,
                            target_names=['Benign', 'Malignant']))
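# The snippet above uses `ss` without defining it; the assumed setup is a
# StandardScaler fitted on the training split only, so the test set never
# leaks into the mean/variance estimates (hypothetical reconstruction):
# from sklearn.preprocessing import StandardScaler
# ss = StandardScaler()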
def run(keyn, nPart):
    all_classes = np.array([0, 1])

    allKeys = [l.split()[0] for l in open('keywordsAll.txt').readlines()]
    keyFreqs = [float(l.split()[1]) / 4205907
                for l in open('keywordsAll.txt').readlines()]
    key = allKeys[keyn]
    freq = keyFreqs[keyn]

    opt = 'body+title+code'
    bv = 'True'
    nneg = 'True'
    nv = 'None'

    # testopt = 'c'
    # testopt = 'w'
    # testopt = 'l2'
    testopt = 'l1'
    if testopt == 'c':
        cls = SGDClassifier(loss='hinge', learning_rate="constant",
                            alpha=1e-6, eta0=1e-2, penalty='l2')
    elif testopt == 'w':
        cls = SGDClassifier(class_weight={1: 1.0 / freq / 8.0, 0: 1})
    elif testopt == 'l2':
        cls = SGDClassifier(loss='log', alpha=1e-5, penalty='l2')
    elif testopt == 'l1':
        cls = SGDClassifier(loss='log', alpha=1e-5, penalty='l1')

    outputName = ('key_' + str(keyn) + '_SGDtune_' + opt +
                  '_partialfit_' + testopt + '.txt')
    pklName = 'SGD_key_' + str(keyn) + '_' + testopt + '.pkl'
    n0, ntrain = resumeJob(outputName, pklName)

    body_test, y_test = getTestSet(10, key, opt, testSize=0.2, seed=123)
    tot_pos = sum(y_test)
    vectorizer = HashingVectorizer(decode_error='ignore',
                                   n_features=2 ** 20,
                                   token_pattern=r"\b\w[\w#+.-]*(?<!\.$)",
                                   binary=str2bool(bv),
                                   norm=normOpt(nv),
                                   non_negative=str2bool(nneg))
    X_test = vectorizer.transform(body_test)
    # print('test case:', len(y_test), 'positive', tot_pos, 'key:', key,
    #       'X norm:', X_test.sum(), 'binary:', bv, 'norm:', nv, 'nneg:', nneg)

    if n0 >= 2:
        cls = joblib.load(pklName)

    for n in range(n0, 10):  # xrange is Python 2 only
        outfile = open(outputName, 'a')
        data = json.load(gzip.open('Train.rdup.' + str(n) + '.json.gz'))
        minibatch_size = len(data) // nPart + 1  # integer division for slicing
        for i in range(nPart):
            n1 = i * minibatch_size
            n2 = (i + 1) * minibatch_size
            if i == nPart - 1:
                n2 = len(data)
            ntrain += (n2 - n1)
            body_train, y_train = getMiniBatch(data, n1, n2, key, opt)
            X_train = vectorizer.transform(body_train)
            for n_iter in range(5):
                X_train, y_train = shuffle(X_train, y_train)
                cls.partial_fit(X_train, y_train, classes=all_classes)
            y_pred = cls.predict(X_test)
            f1 = metrics.f1_score(y_test, y_pred)
            p = metrics.precision_score(y_test, y_pred)
            r = metrics.recall_score(y_test, y_pred)
            accu = cls.score(X_train, y_train)
            y_pred = cls.predict(X_train)
            f1t = metrics.f1_score(y_train, y_pred)
            outfile.write("%3d %8d %.4f %.3f %.3f %.3f %.3f %5d %5d\n"
                          % (n, ntrain, accu, f1t, f1, p, r,
                             sum(y_pred), tot_pos))
        _ = joblib.dump(cls, pklName, compress=9)
        outfile.close()
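# str2bool and normOpt are called above but not shown; plausible one-liners
# consistent with the call sites (hypothetical reconstructions, since the
# originals are not in this excerpt):
def str2bool(s):
    return s == 'True'  # 'True' -> True, anything else -> False

def normOpt(s):
    return None if s == 'None' else s  # 'None' -> no normalization, else 'l1'/'l2'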
for it in range(iters):
    nsamples = math.floor(labeled_N / 2)
    ys = np.array([-1] * len(ytrue))  # -1 denotes an unlabeled point
    random_labeled_points = list(np.random.choice(np.where(ytrue == 0)[0], int(nsamples))) + \
        list(np.random.choice(np.where(ytrue == 1)[0], int(nsamples)))
    ys[random_labeled_points] = ytrue[random_labeled_points]

    # supervised score
    basemodel = SGDClassifier(loss='hinge', penalty='l1', tol=1e-3,
                              max_iter=1000)  # a linear SVM trained with SGD (hinge loss), not logistic regression
    basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
    sgd_active.append(basemodel.score(X, ytrue))

    kernel = "rbf"
    svm_model = sklearn.svm.SVC(kernel=kernel, probability=True)
    ssmodel = SelfLearningModel(svm_model)
    ssmodel.fit(X, ys)
    self_learning_active.append(ssmodel.score(X, ytrue))

    Xsupervised = X[ys != -1, :]
    ysupervised = ys[ys != -1]

    lbl = "Purely supervised SVM:"
    model = sklearn.svm.SVC(kernel=kernel, probability=True)
    model.fit(Xsupervised, ysupervised)
    acc = evaluate(model, X, ys, ytrue, lbl)
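# `evaluate` is called above but not defined in this excerpt; a plausible
# sketch consistent with the call site, scoring only the points that were
# left unlabeled (hypothetical -- the original helper may differ):
def evaluate(model, X, ys, ytrue, lbl):
    unlabeled = ys == -1  # the labeled points were trained on, so exclude them
    acc = model.score(X[unlabeled], ytrue[unlabeled])
    print(lbl, acc)
    return acc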
    xi = X_train[i].reshape((nb_features, 1))
    loss = max(0, 1 - (Y_train[i] * np.dot(w.T, xi)))
    tau = loss / (np.power(np.linalg.norm(xi, ord=2), 2) + (1 / (2 * C)))
    coeff = tau * Y_train[i]
    w += coeff * xi

# Compute accuracy
Y_pred = np.sign(np.dot(w.T, X_test.T))
c = np.count_nonzero(Y_pred - Y_test)
print('PA accuracy: {}'.format(1 - float(c) / X_test.shape[0]))

# Train a Stochastic Gradient Descent classifier
poly = PolynomialFeatures(degree=2)
X_train = poly.fit_transform(X_train)
X_test = poly.transform(X_test)  # reuse the transform fitted on the training set

SGDC = SGDClassifier(alpha=0.01, loss='hinge', penalty='l2',
                     fit_intercept=True, tol=1e-3, n_jobs=-1)
SGDC.fit(X_train, Y_train)
print('SGDClassifier score: {}'.format(SGDC.score(X_test, Y_test)))

# Passive-Aggressive classifier
PA = PassiveAggressiveClassifier(C=0.01, loss='squared_hinge', n_jobs=-1)
PA.fit(X_train, Y_train)
print('PA score: {}'.format(PA.score(X_test, Y_test)))
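# The hand-rolled loop at the top implements the PA-II update of Crammer et
# al. (2006). In the code's notation, with hinge loss and aggressiveness C:
#     loss_t = max(0, 1 - y_t * w.T @ x_t)
#     tau_t  = loss_t / (||x_t||^2 + 1 / (2C))
#     w     <- w + tau_t * y_t * x_t
# which is exactly what `loss`, `tau`, `coeff` and `w += coeff * xi` compute.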