def apply_minibatch_sgd(datasets, minibatch, epoch=5, cores=1, seed=1):
    '''
    Applies the logistic regression SGD method

    :type datasets: list
    :param datasets: List containing training/testing data
    :type minibatch: int
    :param minibatch: Mini-batch size
    :type epoch: int
    :param epoch: Number of passes over the training data
    :type cores: int
    :param cores: Number of cores
    :type seed: int
    :param seed: Random seed
    '''
    print 'Applying mini-batch SGD with mini-batch size of ', minibatch
    training_X, training_y = datasets[0]
    testing_X, testing_y = datasets[1]
    print 'Shuffling training data'
    training_X, training_y = shuffle(training_X, training_y, random_state=seed)
    clf = SGDClassifier(loss="log", random_state=seed, n_iter=epoch,
                        verbose=0, n_jobs=cores)
    classes = numpy.unique([-1, 1])
    samples = training_X.shape[0]
    # Ceiling division, so the final (possibly smaller) batch is included
    # without producing an empty slice when samples % minibatch == 0.
    minibatches = (samples + minibatch - 1) // minibatch
    for i in xrange(epoch):
        print "Epoch ", i + 1
        for j in xrange(minibatches):
            clf.partial_fit(training_X[j * minibatch:min(samples, (j + 1) * minibatch)],
                            training_y[j * minibatch:min(samples, (j + 1) * minibatch)],
                            classes=classes)
    print "Accuracy on testing data:", clf.score(testing_X, testing_y)
def run_online_classifier():
    vect = HashingVectorizer(
        decode_error='ignore',
        n_features=2**21,
        preprocessor=None,
        tokenizer=tokenizer_streaming,
    )
    clf = SGDClassifier(loss='log', random_state=1, n_iter=1)

    csv_filename = os.path.join('datasets', 'movie_data.csv')
    doc_stream = stream_docs(path=csv_filename)

    classes = np.array([0, 1])
    for _ in range(45):
        X_train, y_train = get_minibatch(doc_stream, size=1000)
        if X_train is None:
            break
        else:
            X_train = vect.transform(X_train)
            clf.partial_fit(X_train, y_train, classes=classes)

    X_test, y_test = get_minibatch(doc_stream, size=5000)
    X_test = vect.transform(X_test)
    print("Test accuracy: %.3f" % clf.score(X_test, y_test))
    clf = clf.partial_fit(X_test, y_test)
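# Several of the streaming snippets in this collection (run_online_classifier
# above, and the train / dump_classifier / out_of_core variants below) rely on
# stream_docs and get_minibatch helpers that are not shown. A minimal sketch of
# what they typically look like follows; the exact CSV layout (review text,
# then a trailing sentiment digit) is an assumption borrowed from the common
# out-of-core movie-review recipe, not something this file defines.
def stream_docs(path):
    """Yield (text, label) pairs one CSV row at a time."""
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv)  # skip the header line
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

def get_minibatch(doc_stream, size):
    """Pull up to `size` documents from the stream; (None, None) when exhausted."""
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y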
def train(self):
    fp_lr_model = "./data/lr_model"
    lr_model = SGDClassifier(loss='log')  # log loss for LR; early_stopping=False with partial_fit
    self.scores = []
    for i, chunk in enumerate(self.df_train):
        print('starting {} chunk...'.format(i + 1))
        df_train = self.oh_enc.transform(chunk)  # convert to one-hot encoding

        # Object features not covered by the one-hot encoding must also be dropped:
        # other_feat = ['device_model', 'device_ip', 'device_id', 'app_domain', 'hour']
        # ['id', 'click'].extend(other_feat)
        feat_train = df_train.columns.drop([
            'id', 'click', 'device_model', 'device_ip', 'device_id',
            'app_domain', 'hour', 'C17', 'C18', 'C19', 'C20', 'C21'
        ])
        train_x = df_train[feat_train]
        train_y = df_train['click'].astype('int')

        lr_model.partial_fit(train_x, train_y, classes=[0, 1])

        y_pred = lr_model.predict_proba(train_x)[:, 1]
        score = log_loss(train_y, y_pred)
        self.scores.append(score)

    print('saving model...')
    pickle.dump(lr_model, open(fp_lr_model, 'wb'))
def train():
    vect = HashingVectorizer(decode_error='ignore',
                             n_features=2**21,
                             preprocessor=None,
                             ngram_range=(1, 3),
                             tokenizer=tokenizer)
    clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
    stream_path = os.path.join(work_path, 'movie_data.csv')
    doc_stream = stream_docs(path=stream_path)

    pbar = pyprind.ProgBar(45)
    classes = np.array([0, 1])
    for _ in range(45):
        X_train, y_train = get_minibatch(doc_stream, size=1000)
        if not X_train:
            break
        X_train = vect.transform(X_train)
        clf.partial_fit(X_train, y_train, classes=classes)
        pbar.update()

    X_test, y_test = get_minibatch(doc_stream, size=5000)
    X_test = vect.transform(X_test)
    print('Accuracy: %.3f' % clf.score(X_test, y_test))
    clf = clf.partial_fit(X_test, y_test)
    return clf
def dump_classifier(clf_path):
    vect = HashingVectorizer(decode_error='ignore',
                             n_features=2**21,
                             preprocessor=None,
                             tokenizer=tokenizer)
    if Version(sklearn_version) < '0.18':
        clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
    else:
        clf = SGDClassifier(loss='log', random_state=1, max_iter=1)
    cur_dir = os.path.dirname(__file__)
    doc_stream = stream_docs(path=os.path.join(cur_dir, 'movie_data.csv'))

    # pbar = pyprind.ProgBar(45)
    classes = np.array([0, 1])
    for _ in range(45):
        X_train, y_train = get_minibatch(doc_stream, size=1000)
        if not X_train:
            break
        X_train = vect.transform(X_train)
        clf.partial_fit(X_train, y_train, classes=classes)
        # pbar.update()

    X_test, y_test = get_minibatch(doc_stream, size=5000)
    X_test = vect.transform(X_test)
    print('Accuracy: %.3f' % clf.score(X_test, y_test))
    clf = clf.partial_fit(X_test, y_test)
    pickle.dump(clf, open(clf_path, 'wb'), protocol=4)
def train2(self):
    # df = pd.DataFrame()
    # df = pd.read_csv(self._bow.csv_path)
    # train = df.loc[:25000, 'review'].values
    # label = df.loc[:25000, 'sentiment'].values
    # test_train = df.loc[25000:, 'review'].values
    # test_label = df.loc[25000:, 'sentiment'].values
    classes = np.array([0, 1])
    # tokenized = self._bow.tokenizer_without_stop_word('I have a pen')
    # x_train, y_label = self._bow.get_minibatch(self._bow.stream_docs(), size=2)
    vect = HashingVectorizer(
        decode_error='ignore',
        n_features=2**21,
        preprocessor=None,
        tokenizer=self._bow.tokenizer_without_stop_word)
    clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
    doc_stream = self._bow.stream_docs()
    for _ in range(45):
        x_train, train_label = self._bow.get_minibatch(doc_stream, size=1000)
        if not x_train:
            break
        x_train = vect.transform(x_train)
        clf.partial_fit(x_train, train_label, classes=classes)
    x_test_train, test_label = self._bow.get_minibatch(doc_stream, size=5000)
    x_test_train = vect.transform(x_test_train)
    print('accuracy %.3f' % clf.score(x_test_train, test_label))
    return clf
def check_classifier(vect: HashingVectorizer) -> None:
    if not clf_path.is_file():
        print('Classifier was not found, creating...')
        clf = SGDClassifier(loss='log', random_state=1)
        ds = DocStream('./movie_data.csv')
        pbar = ProgBar(45)
        classes = np.array([0, 1])
        for _ in range(45):
            x_train, y_train = ds.get_minibatch(1000)
            if not x_train:
                break
            x_train = vect.transform(x_train)
            clf.partial_fit(x_train, y_train, classes)
            pbar.update()
        print('Training completed...')
        x_test, y_test = ds.get_minibatch(5000)
        x_test = vect.transform(x_test)
        print(f'Score: {clf.score(x_test, y_test)}')
        clf = clf.partial_fit(x_test, y_test)
        dump(clf, clf_path, protocol=4)
def SG_classify(X, Y, class_0_weight, class_1_weight, sgc=None):
    # Stochastic gradient descent classifier
    if sgc:
        if np.bincount(Y)[0] > 0 and len(np.bincount(Y)) > 1:
            sgc.partial_fit(X, Y)
        # The classifier was fitted previously, so its coefficients exist even
        # when this batch is skipped (the original left coef/intercept
        # undefined on this path, raising a NameError at return).
        coef = sgc.coef_
        intercept = sgc.intercept_
    else:
        param_SGC = {
            'loss': 'hinge',
            'penalty': 'elasticnet',
            'n_iter': 1,
            'shuffle': True,
            'class_weight': {0: class_0_weight, 1: class_1_weight},
            'warm_start': True,
            'alpha': 0.001
        }
        sgc = SGDClassifier(**param_SGC)
        if np.bincount(Y)[0] > 0 and len(np.bincount(Y)) > 1:
            sgc.partial_fit(X, Y, np.unique(Y))
            coef = sgc.coef_
            intercept = sgc.intercept_
        else:
            sgc = None
            coef = None
            intercept = None
    return sgc, coef, intercept
def mine():
    print("Starting")
    clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
    print('Create/Load Classifier')
    doc_stream = stream_docs(path='./movie_data.csv')
    print('Fitting data')
    classes = np.array([0, 1])
    for _ in range(45):
        X_train, y_train = get_minibatch(doc_stream, size=1000)
        if not X_train:
            break
        X_train = vect.transform(X_train)
        clf.partial_fit(X_train, y_train, classes=classes)
    print('Finished Fitting')
    X_test, y_test = get_minibatch(doc_stream, size=5000)
    X_test = vect.transform(X_test)
    print('Accuracy: %.3f' % clf.score(X_test, y_test))
    print('create pickle objects')
    dest = os.path.join('', 'pkl_objects')
    if not os.path.exists(dest):
        os.makedirs(dest)
    pickle.dump(stop, open(os.path.join(dest, 'stopwords.pkl'), 'wb'), protocol=4)
    pickle.dump(clf, open(os.path.join(dest, 'classifier.pkl'), 'wb'), protocol=4)
def test_transformer(transformer, data_set, configuration):
    clf = SGDClassifier(alpha=0.005)
    samples = []
    labels = range(10)
    for epoch in range(configuration.hyper_parameters.epochs):
        for index, sample in enumerate(transformer.compute_outputs(
                data_set.trainset[0], data_set.trainset[1], 1)):
            samples.append(sample.reshape((1, sample.shape[0])))
            if index % 10 == 9:
                # Stack the ten (1, d) rows into a single (10, d) matrix;
                # a bare list of 2D arrays would be rejected as 3D input.
                clf.partial_fit(np.vstack(samples), labels, labels)
                samples = []
                gc.collect()
    error = 0
    count = 0
    test_predictions = []
    for index, sample in enumerate(transformer.compute_outputs(
            data_set.testset[0], data_set.testset[1], 1)):
        prediction = clf.predict(sample)
        if not prediction == index % 10:
            error += 1
        count += 1
        test_predictions.append(prediction)
    OutputLog().write('test predictions weight: {0}'.format(test_predictions))
    OutputLog().write('\nerror: %f%%\n' % error)
def train_and_pickle_classifier():
    import numpy as np
    from sklearn.linear_model import SGDClassifier

    clf = SGDClassifier(loss='log', random_state=1, n_iter=1)

    csv_filename = os.path.join('datasets', 'movie_data.csv')
    doc_stream = stream_docs(path=csv_filename)

    classes = np.array([0, 1])
    for _ in range(45):
        X_train, y_train = get_minibatch(doc_stream, size=1000)
        if X_train is None:
            break
        else:
            X_train = vect.transform(X_train)
            clf.partial_fit(X_train, y_train, classes=classes)

    X_test, y_test = get_minibatch(doc_stream, size=5000)
    X_test = vect.transform(X_test)
    print("Test accuracy: %.3f" % clf.score(X_test, y_test))
    clf = clf.partial_fit(X_test, y_test)
    pickle.dump(clf, open(CLF_FILENAME, 'wb'), protocol=4)
class NOGDClassifier(object):
    def __init__(self, n_components=100, n_iter=1):
        self.nys = Nystroem(n_components=n_components)
        self.clf = SGDClassifier(loss='hinge', penalty='l2',
                                 shuffle=True, n_iter=n_iter)
        self.count = 0

    def fit(self, X, y):
        if self.count == 0:
            X_tran = self.nys.fit_transform(X)
        else:
            X_tran = self.nys.transform(X)
        self.count += 1
        self.clf.fit(X_tran, y)

    def partial_fit(self, X, y, classes=None):
        # classes must be forwarded on the first call; SGDClassifier's
        # partial_fit raises otherwise.
        if self.count == 0:
            X_tran = self.nys.fit_transform(X)
        else:
            X_tran = self.nys.transform(X)
        self.count += 1
        self.clf.partial_fit(X_tran, y, classes=classes)

    def predict(self, X):
        X_tran = self.nys.transform(X)
        y_pred = self.clf.predict(X_tran)
        return y_pred
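# A minimal sketch of driving NOGDClassifier above on synthetic data; the
# dataset and batch sizes are illustrative assumptions, not from the source.
# Like several snippets in this file, the class forwards n_iter to
# SGDClassifier, which targets older scikit-learn releases.
import numpy as np
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=2000, n_features=20, random_state=0)
nogd = NOGDClassifier(n_components=100, n_iter=1)
# The first call fits the Nystroem kernel map and must see the full label
# set; later calls only transform with the already-fitted map.
nogd.partial_fit(X[:1000], y[:1000], classes=np.unique(y))
nogd.partial_fit(X[1000:1500], y[1000:1500])
print((nogd.predict(X[1500:]) == y[1500:]).mean())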
def test_partial_fit_doesnt_mutate_inputs():
    n, d = 100, 20
    X, y = make_classification(n_samples=n, n_features=d,
                               random_state=42, chunks=(n, d))
    X = X.compute()
    y = y.compute()
    meta = {
        "iterations": 0,
        "mean_copy_time": 0,
        "mean_fit_time": 0,
        "partial_fit_calls": 0,
    }
    model = SGDClassifier(tol=1e-3)
    model.partial_fit(X[:n // 2], y[:n // 2], classes=np.unique(y))
    new_model, new_meta = _partial_fit(
        (model, meta), X[n // 2:], y[n // 2:],
        fit_params={"classes": np.unique(y)})
    assert meta != new_meta
    assert new_meta["partial_fit_calls"] == 1
    assert not np.allclose(model.coef_, new_model.coef_)
    assert model.t_ < new_model.t_
    assert new_meta["partial_fit_time"] >= 0
    new_meta2 = _score((model, new_meta), X[n // 2:], y[n // 2:], None)
    assert new_meta2["score_time"] >= 0
    assert new_meta2 != new_meta
def train_by_partial_SGB():
    x_train, x_test, y_train, y_test = Load_Traindata_Testdata_with_Tfidf()
    # X_train, X_val, Y_train, Y_val = train_test_split(x_train, y_train, test_size=0.3)
    # model = RandomForestClassifier(n_jobs=-1, n_estimators=100)
    # model = XGBClassifier()
    model = SGDClassifier(n_jobs=-1, max_iter=20, alpha=0.01)
    now = datetime.datetime.now()
    print("Training begins:", now)
    batch_size = 50000
    for i in range(100):
        last = datetime.datetime.now()
        start = (i * batch_size) % len(y_train)
        end = min(start + batch_size, len(y_train))
        # classes must be the set of unique labels, not the full label vector
        model.partial_fit(x_train[start:end], y_train[start:end],
                          classes=np.unique(y_train))
        y_pre = model.predict(x_test)
        acc = accuracy_score(y_test, y_pre)
        score = model.score(x_test, y_test)
        cost_time = datetime.datetime.now() - last
        print("%d times, %f score, %f acc" % (i, score, acc), cost_time, " time(s)")
    # model.fit(X_train, Y_train)
    # y_pre = model.predict(X_val)
    # print(model.score(X_val, Y_val))
    # print(accuracy_score(Y_val, y_pre))
    training_time = datetime.datetime.now() - now
    print("Training time(s):", training_time)
def train_by_partial_SGD(filename):
    x_train, x_test, y_train, y_test = Load_Traindata_Testdata_with_Tfidf(filename)
    model = SGDClassifier(n_jobs=4, loss='hinge', alpha=0.09, tol=0.001)
    now = datetime.datetime.now()
    print("Training begins by SGD:", now)
    batch_size = 50000
    for i in range(1000):
        last = datetime.datetime.now()
        start = (i * batch_size) % len(y_train)
        end = min(start + batch_size, len(y_train))
        # classes must be the set of unique labels, not the full label vector
        model.partial_fit(x_train[start:end], y_train[start:end],
                          classes=np.unique(y_train))
        y_pre = model.predict(x_test)
        acc = accuracy_score(y_test, y_pre)
        score = model.score(x_test, y_test)
        cost_time = datetime.datetime.now() - last
        print("%d times, %f score, %f acc" % (i, score, acc), cost_time, " time(s)")
    # model.fit(X_train, Y_train)
    # y_pre = model.predict(X_val)
    # print(model.score(X_val, Y_val))
    # print(accuracy_score(Y_val, y_pre))
    training_time = datetime.datetime.now() - now
    print("Training time(s):", training_time)
class SVMClassifier(Model):
    def __init__(self):
        Model.__init__(self)
        # self.model = MLPClassifier(hidden_layer_sizes=(50,), max_iter=50, alpha=1e-4,
        #                            solver='sgd', tol=1e-4, random_state=1,
        #                            learning_rate_init=.001)
        self.model = SGDClassifier()

    def update(self, x, y, learning_rate):
        if len(y) > 0:
            self.model.partial_fit(x, y)

    def batch_update(self, x, y, learning_rate):
        indices = np.arange(len(y))
        np.random.shuffle(indices)
        # print('new training iteration using {} items'.format(len(y)))
        self.model.partial_fit(x[indices, :], y[indices], np.unique(y))

    def loss(self, x, y):
        pass

    def calculate_loss(self, x, y):
        # p = self.model.predict(x)
        return 1 - self.model.score(x, y)

    def update_params(self, params):
        pass

    def restart(self):
        pass
def test_multi_output_classification_partial_fit():
    # test if multi_target initializes correctly with base estimator and fit
    # assert predictions work as expected for predict
    sgd_linear_clf = SGDClassifier(loss='log', random_state=1)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)

    # train the multi_target_linear and also get the predictions.
    half_index = X.shape[0] // 2
    multi_target_linear.partial_fit(
        X[:half_index], y[:half_index], classes=classes)

    first_predictions = multi_target_linear.predict(X)
    assert_equal((n_samples, n_outputs), first_predictions.shape)

    multi_target_linear.partial_fit(X[half_index:], y[half_index:])
    second_predictions = multi_target_linear.predict(X)
    assert_equal((n_samples, n_outputs), second_predictions.shape)

    # train the linear classification with each column and assert that
    # predictions are equal after first partial_fit and second partial_fit
    for i in range(3):
        # create a clone with the same state
        sgd_linear_clf = clone(sgd_linear_clf)
        sgd_linear_clf.partial_fit(
            X[:half_index], y[:half_index, i], classes=classes[i])
        assert_array_equal(sgd_linear_clf.predict(X), first_predictions[:, i])
        sgd_linear_clf.partial_fit(X[half_index:], y[half_index:, i])
        assert_array_equal(sgd_linear_clf.predict(X), second_predictions[:, i])
def out_of_core():
    vect = HashingVectorizer(decode_error='ignore',
                             n_features=2**21,
                             preprocessor=None,
                             tokenizer=tokenizer_new)
    clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
    doc_stream = stream_docs(path='./movie_data.csv')

    pbar = pyprind.ProgBar(45)
    classes = np.array([0, 1])
    for _ in range(45):
        # import pdb; pdb.set_trace()
        X_train, y_train = get_minibatch(doc_stream, size=1000)
        if not X_train:
            break
        X_train = vect.transform(X_train)
        clf.partial_fit(X_train, y_train, classes=classes)
        pbar.update()

    X_test, y_test = get_minibatch(doc_stream, size=5000)
    X_test = vect.transform(X_test)
    print('\nAccuracy: %.3f' % clf.score(X_test, y_test))
    clf = clf.partial_fit(X_test, y_test)

    dest = os.path.join('movieclassifier', 'pkl_objects')
    if not os.path.exists(dest):
        os.makedirs(dest)
    pickle.dump(stop, open(os.path.join(dest, 'stopwords.pkl'), 'wb'), protocol=4)
    pickle.dump(clf, open(os.path.join(dest, 'classifier.pkl'), 'wb'), protocol=4)
def _initialise_objective_function(self, x):
    x = np.atleast_2d(x)
    fs = np.zeros((x.shape[0], 1))
    for i in range(x.shape[0]):
        fs[i] = 0
        gamma = np.exp(x[i, 0])    # learning rate, log scale
        alpha = np.exp(x[i, 1])    # l2 regulariser, log scale
        n_iter = int(x[i, 2])      # num epochs
        batch_size = int(x[i, 3])  # mini batch size
        clf = SGDClassifier(loss='log', penalty='l2', alpha=alpha,
                            learning_rate='constant', eta0=gamma, n_iter=1)
        for j in range(n_iter):
            for (X_batch, y_batch) in self._next_batch(self.X_train,
                                                       self.y_train, batch_size):
                clf.partial_fit(X_batch, y_batch, classes=self.classes)
        score = clf.score(self.X_test, self.y_test)
        fs[i] = 1 - score          # classification error
    return fs
def evaluate_svm(alpha):
    # Note: n_iter gets switched to 1 by sklearn whenever you call partial_fit().
    # This initial setting is for the pretesting of eta0.
    basic_svm = SGDClassifier(loss="hinge", penalty="l2", l1_ratio=0.0,
                              random_state=31337, n_jobs=5, n_iter=5, alpha=alpha)
    learning_rate_grid = [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7]
    pretest_svm = GridSearchCV(basic_svm,
                               {"learning_rate": ["constant"],
                                "eta0": learning_rate_grid}).fit(X_pretest, y_pretest)
    bottou_gamma0 = pretest_svm.best_params_["eta0"]
    basic_svm.eta0 = bottou_gamma0
    basic_svm.learning_rate = "constant"
    basic_svm = basic_svm.partial_fit(X_pretest, y_pretest,
                                      classes=np.unique(y_train))
    progressive_val = []
    train_score = []
    for dp in range(0, X_train.shape[0], batch_size):
        t = dp + n_pretest
        basic_svm.eta0 = bottou_gamma0 / (1 + bottou_gamma0 * alpha * t)
        X_batch = X_train[dp:dp + batch_size]
        y_batch = y_train[dp:dp + batch_size]
        progressive_val.append(basic_svm.score(X_batch, y_batch))
        basic_svm = basic_svm.partial_fit(X_batch, y_batch)
        train_score.append(basic_svm.score(X_batch, y_batch))
    scores = progressive_val[-batches_for_cv_performance:]
    return np.mean(scores), np.std(scores), basic_svm
class PartialSGDEstimator(BaseEstimator):
    def fit(self, documents, labels=None, mini_batch_size=500):
        self.model = SGDClassifier()
        batchDocs = []
        batchLabels = []
        count = 0
        for doc in documents:
            batchDocs.append(doc)
            batchLabels.append(labels[count])
            count += 1
            # Fit once a full mini-batch has accumulated. (The original
            # checked `count % mini_batch_size == 0` before incrementing,
            # which fired on the very first document.)
            if count % mini_batch_size == 0:
                print("batch")
                self.model.partial_fit(batchDocs, batchLabels,
                                       classes=np.unique(labels))
                batchDocs = []
                batchLabels = []
                gc.collect()
        # Fit whatever is left over after the last full mini-batch, which
        # the original silently dropped.
        if batchDocs:
            self.model.partial_fit(batchDocs, batchLabels,
                                   classes=np.unique(labels))
        return self

    def predict(self, X, mini_batch_size=500):
        yhat = []
        for doc in X:
            preds = self.model.predict(doc)
            yhat.extend(preds)
        return yhat
def direcrtoryProcessing(train_path):
    training_names = os.listdir(train_path)

    # Get all the paths to the images and save them in the list image_paths,
    # with the corresponding label in image_classes
    image_paths = []
    image_classes = []
    class_id = 0
    for training_name in training_names:
        # Three directories, each holding the images for one class
        dir = os.path.join(train_path, training_name)
        class_path = imlist(dir)
        image_paths += class_path
        image_classes += [class_id] * len(class_path)
        class_id += 1

    X, Y = trainTestSet(image_paths, image_classes)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2,
                                                        random_state=0)
    clf = SGDClassifier()
    batcherator = iter_minibatches(10, X_train, y_train)
    for X_chunk, y_chunk in batcherator:
        clf.partial_fit(X_chunk, y_chunk, classes=np.unique(Y))

    y_predicted = clf.predict(X_test)
    print(classification_report(y_test, y_predicted,
                                target_names=training_names))
def train(model: SGDClassifier, train_data, train_labels, test_data, test_labels,
          total_epochs=1000):
    # using partial_fit instead of fit in order to gather information on
    # accuracy after every pass
    labels = []
    scores = []
    for i in range(total_epochs):
        if i == 0:
            model.partial_fit(train_data, train_labels,
                              classes=np.unique(train_labels))
        else:
            model.partial_fit(train_data, train_labels)
        if (i + 1) % (total_epochs // 20) == 0:
            pred_labels = model.predict(test_data)
            scores.append(accuracy_score(test_labels, pred_labels))
            labels.append(i + 1)
            print("Epoch {0} score: {1}".format(i + 1, scores[-1]))
    return labels, scores
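# A hedged sketch of calling train() above; the digits dataset and the
# 100-epoch budget are illustrative assumptions, not from the source, and
# accuracy_score / np are assumed to be in scope as the snippet requires.
import numpy as np
from sklearn.datasets import load_digits
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split

X, y = load_digits(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
epochs, accs = train(SGDClassifier(random_state=0), X_tr, y_tr, X_te, y_te,
                     total_epochs=100)
print(list(zip(epochs, accs))[-3:])  # accuracy at the last few checkpoints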
def init(self, n_obs, n_act):
    self.models = []
    for i in xrange(n_act):
        model = SGDClassifier(**self.model_kwargs)
        # Prime each model with one dummy observation so that predict()
        # works before any real data has arrived.
        model.partial_fit(np.random.rand(1, n_obs), [0], classes=[0, 1])
        self.models.append(model)
def bigram(documents, prediction_documents):
    vectorizer = HashingVectorizer(decode_error='ignore', ngram_range=(1, 2),
                                   preprocessor=None, tokenizer=tokenizer,
                                   analyzer='word')
    classifier = SGDClassifier(loss='hinge', penalty='l1')
    chunk = stream(path=documents, scope='training')
    pbar = pyprind.ProgBar(10)
    classes = np.array([0, 1])
    for _ in range(10):
        reviews, labels = batch(chunk, size=2500, scope='training')
        reviews = vectorizer.transform(reviews)
        classifier.partial_fit(reviews, labels, classes=classes)
        pbar.update()
    prediction_size = prediction_file_size(prediction_documents) - 1
    test_chunk = stream(path=prediction_documents, scope='predicting')
    test_reviews = batch(test_chunk, size=prediction_size, scope='predicting')
    test_reviews = vectorizer.transform(test_reviews)
    predictions = classifier.predict(test_reviews)
    save('bigram.output.txt', predictions)
def main(args):
    with open(args.train, 'r') as f:
        train_data, train_labels = matify(json.load(f))
    with open(args.test, 'r') as f:
        test_data, test_labels = matify(json.load(f))

    train_data = np.array(train_data, dtype=np.float32)
    train_data /= 256.
    test_data = np.array(test_data, dtype=np.float32)
    test_data /= 256.
    train_labels = np.array(train_labels, dtype=np.float32)
    test_labels = np.array(test_labels, dtype=np.float32)

    clf = SGDClassifier(loss='hinge', penalty='l2')
    for i in tqdm(xrange(args.training_steps)):
        data, labels = get_batch(train_data, train_labels, BATCH_SIZE)
        clf.partial_fit(data, labels, classes=[c for c in xrange(CLASSES)])
        if ((i + 1) % 200 == 0):
            tqdm.write('step %d, training accuracy %g' % (i + 1, clf.score(data, labels)))

    print('Validating...')
    data, labels = get_batch(test_data, test_labels, BATCH_SIZE)
    print('VAL ACCURACY: %f' % clf.score(data, labels))

    if args.param_dir:
        vars_to_save = {
            'fc1_w.summary': clf.coef_,
            'fc1_b.summary': clf.intercept_,
        }
        for var in vars_to_save:
            path = os.path.join(args.param_dir, var)
            with open(path, 'w+') as f:
                pickle.dump(vars_to_save[var], f)
def SGD_normal(x_train, y_train, x_test, y_test):
    # X = [[0., 0.], [1., 1.]]
    # y = [0, 1]
    import numpy as np
    print(x_train.shape, y_train.shape, type(x_train))
    clf = SGDClassifier(loss='hinge', penalty='l2')
    clf.partial_fit(x_train, y_train, classes=np.unique(y_train))
    '''
    SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
                  eta0=0.0, fit_intercept=True, l1_ratio=0.15,
                  learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
                  n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
                  shuffle=True, tol=None, verbose=0, warm_start=False)
    '''
    # Parameter notes:
    '''
    loss="hinge": (soft-margin) linear Support Vector Machine,
    loss="modified_huber": smoothed hinge loss,
    loss="log": logistic regression, plus all the regression losses below.
    The default is penalty="l2". An L1 penalty leads to sparse solutions,
    driving most coefficients to zero. Elastic Net fixes some deficiencies of
    the L1 penalty when features are highly correlated. The l1_ratio parameter
    controls the convex combination of the L1 and L2 penalties.
    '''
    note_prediction = list(clf.predict(x_test))
    from sklearn.metrics import classification_report, confusion_matrix
    print(confusion_matrix(y_test, note_prediction))
    print(classification_report(y_test, note_prediction))
    return clf
def test_multi_output_classification_partial_fit():
    # test if multi_target initializes correctly with base estimator and fit
    # assert predictions work as expected for predict
    sgd_linear_clf = SGDClassifier(loss='log', random_state=1, max_iter=5)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)

    # train the multi_target_linear and also get the predictions.
    half_index = X.shape[0] // 2
    multi_target_linear.partial_fit(
        X[:half_index], y[:half_index], classes=classes)

    first_predictions = multi_target_linear.predict(X)
    assert_equal((n_samples, n_outputs), first_predictions.shape)

    multi_target_linear.partial_fit(X[half_index:], y[half_index:])
    second_predictions = multi_target_linear.predict(X)
    assert_equal((n_samples, n_outputs), second_predictions.shape)

    # train the linear classification with each column and assert that
    # predictions are equal after first partial_fit and second partial_fit
    for i in range(3):
        # create a clone with the same state
        sgd_linear_clf = clone(sgd_linear_clf)
        sgd_linear_clf.partial_fit(
            X[:half_index], y[:half_index, i], classes=classes[i])
        assert_array_equal(sgd_linear_clf.predict(X), first_predictions[:, i])
        sgd_linear_clf.partial_fit(X[half_index:], y[half_index:, i])
        assert_array_equal(sgd_linear_clf.predict(X), second_predictions[:, i])
def train_tagger(X, y, type='sgdo'):
    if type == 'sgd':
        clf = SGDClassifier()
        clf.fit(X, y)
    elif type == 'sgdo':
        clf = SGDClassifier()
        classes = np.unique(y)
        for i in range(len(X)):
            sys.stdout.write('%.3f%% Complete\r' % ((float(i) / float(len(X))) * 100))
            A = X[i]
            b = y[i]
            clf.partial_fit([A], [b], classes)
    elif type == 'nn':
        clf = Perceptron()
        clf.fit(X, y)
    elif type == 'nno':
        clf = Perceptron()
        classes = np.unique(y)
        for i in range(len(X)):
            sys.stdout.write('%.3f%% Complete\r' % ((float(i) / float(len(X))) * 100))
            A = X[i]
            b = y[i]
            clf.partial_fit([A], [b], classes)
    elif type == 'svm':
        clf = svm.LinearSVC()
        clf.fit(X, y)
    else:
        clf = svm.LinearSVC()
        clf.fit(X, y)
    return clf
def train_test_bow(ngram_order, batch_size=128, n_epoch=3):
    label_sets = ['full', 'function', '3way', 'in_out', 'man_nat']
    for label_set in label_sets:
        # need to drop unk for full/function
        if label_set in ['full', 'function']:
            df = sentences_df(labels=label_set, drop_unk=True)
        else:
            df = sentences_df(SENTENCES_CSV, labels=label_set, drop_unk=False)
        X, y, word2idx, l_enc = load_dataset(df, ngram_order=ngram_order)
        print "X shape: %s" % (X.shape,)
        print "y shape: %s" % (y.shape,)
        skf = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=0)
        scores = []
        for (train, test) in skf:
            clf = None
            clf = SGDClassifier(loss='log', alpha=0.001, l1_ratio=0,
                                random_state=0)
            for epoch in range(n_epoch):
                X_train, y_train, X_test, y_test = X[train], y[train], X[test], y[test]
                n_batches = X_train.shape[0] // batch_size
                for minibatch_idx in range(n_batches):
                    clf.partial_fit(
                        X_train[minibatch_idx * batch_size:(minibatch_idx + 1) * batch_size],
                        y_train[minibatch_idx * batch_size:(minibatch_idx + 1) * batch_size],
                        classes=np.unique(y))
                print "Epoch: %d/%d Train acc: %.4f" \
                    % (epoch + 1, n_epoch, clf.score(X_train, y_train))
            fold_score = clf.score(X_test, y_test)
            print "Fold acc: %.4f" % fold_score
            scores.append(fold_score)
        print '%s label mean cv accuracy: %.4f\n' % (label_set, np.mean(scores))
class SGDRanker(BaseEstimator):
    """
    Ranking predictor using stochastic gradient descent

    TODO:
    - allow configurable parameters for classifier
    - seed random state
    """
    def __init__(self, seconds=10):
        self.clf = SGDClassifier(loss='hinge')
        self.clf.fit_intercept = False
        self.clf.classes_ = np.array([-1, 1])
        self.seconds = seconds

    def fit(self, X, y):
        rows = X.shape[0]
        start_time = time.time()
        for i in itertools.count():
            if time.time() - start_time > self.seconds:
                return self
            idx1 = random.randint(0, rows - 1)
            idx2 = random.randint(0, rows - 1)
            y1, y2 = y[idx1], y[idx2]
            if y1 == y2:
                continue
            # Train on the pairwise difference; reshape to the 2D
            # single-sample form partial_fit expects.
            self.clf.partial_fit((X[idx1] - X[idx2]).reshape(1, -1),
                                 [np.sign(y1 - y2)])

    def predict(self, X):
        return np.dot(X, self.clf.coef_.T)
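# A hedged sketch of using SGDRanker above; the synthetic relevance scores
# are an assumption. The pairwise trick: hinge loss on feature differences
# learns a linear scoring function usable for ranking. Note the class seeds
# clf.classes_ by hand, a shortcut that targets older scikit-learn releases.
import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(200, 5)
y = X.dot(rng.randn(5))               # latent linear relevance
ranker = SGDRanker(seconds=1).fit(X, y)
scores = ranker.predict(X).ravel()
print(np.corrcoef(scores, y)[0, 1])   # should be strongly positive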
class LightModel:
    def __init__(self, learningRate, numEpochs, ppenalty="l1", mustShuffle=True):
        # Init scikit models
        self.Classifier = SGDClassifier(penalty=ppenalty, loss='log',
                                        alpha=learningRate, n_iter=numEpochs,
                                        shuffle=mustShuffle)

    def train(self, gen, v=False):
        i = 0
        for x, y in gen:  # For each batch
            self.Classifier.partial_fit(x, y, [0, 1])
            i += len(x)
            if v:
                print(str(datetime.now())[:-7], "example:", i)

    def test(self, gen, v=False):
        # init target and prediction arrays
        ytot = np.array([])
        ptot = np.array([])
        # Get prediction for each batch
        i = 0
        for x, y in gen:
            p = self.Classifier.predict_proba(x)
            p = p.T[1].T  # Keep column corresponding to probability of class 1
            # Stack target and prediction for later analysis
            ytot = np.hstack((ytot, y))
            ptot = np.hstack((ptot, p))
            i += y.shape[0]
            if v:
                print(str(datetime.now())[:-7], "example:", i)
        if v:
            print("Score:", self.score(ytot, ptot))
        return (ytot, ptot)

    def score(self, target, prediction):
        return llfun(target, prediction)
class LanguageProcessing():
    def __init__(self, model=None):
        # Machine learning models.
        self.nlp = spacy_model.load()
        if model:
            self.model = model
        else:
            self.model = SGDClassifier()

        # Preprocessing the corpus.
        # self.corpus: Map<String, List<String>>
        self.corpus = load(open('pickles/speech_corpus', 'rb'))
        self.categories = self.corpus.keys()
        self.classifications_by_cat = {self.categories[i]: i
                                       for i in range(len(self.categories))}
        self.classifications_by_num = {i: self.categories[i]
                                       for i in range(len(self.categories))}

        # Training the model.
        training_x = []
        training_y = []
        for k in self.corpus:
            sentences = self.corpus[k]
            training_x += [self.return_nlp(s).vector for s in sentences]
            training_y += [self.classifications_by_cat[k]
                           for i in range(len(sentences))]
        self.model.fit(array(training_x), array(training_y))

    def return_nlp(self, text):
        """
        Wraps given text in unicode and returns its spaCy wrapper.
        """
        return self.nlp(unicode(text))

    def string_similarity(self, s1, s2):
        """
        Using spaCy, computes the similarity between two strings based on the
        GloVe vectors provided.
        """
        return self.return_nlp(s1).similarity(self.return_nlp(s2))

    def train_with_query(self, query):
        query_vectorized = [self.return_nlp(query).vector]
        pred = self.model.predict(array(query_vectorized))[0]

        # Inform the user of the prediction and
        # ask for confirmation of training. To be removed later.
        print '\n' * 100
        print 'QUERY: "' + query + '"'
        print 'CLASSIFIED AS: "' + self.classifications_by_num[pred] + '"'
        decision = raw_input('Is this what you expected? 0 for N, 1 for Y.\n> ')
        print '\n' * 2

        if eval(decision) == 1:
            # Update the corpus and train the model.
            self.corpus[self.classifications_by_num[pred]].append(query)
            self.model.partial_fit(array(query_vectorized), array([pred]))
        return query, self.classifications_by_num[pred]
def incremental_SGD(X, Y, loss):
    sgd = SGDClassifier(loss=loss, penalty="l2")
    labels = np.unique(Y)
    for i in range(X.shape[0]):
        point_x = X[i]
        point_y = Y[i]
        # Feed one sample at a time; wrap in lists to keep the 2D/1D
        # shapes partial_fit expects.
        sgd.partial_fit([point_x], [point_y], classes=labels)
    return sgd
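# A minimal sketch exercising incremental_SGD above on toy data; the
# two-class arrays are illustrative assumptions, not from the source.
import numpy as np

X_toy = np.array([[0., 0.], [1., 1.], [2., 2.], [3., 3.]])
Y_toy = np.array([0, 0, 1, 1])
sgd = incremental_SGD(X_toy, Y_toy, loss='hinge')
print(sgd.predict([[2.5, 2.5]]))  # expected: [1]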
def train():
    model = SGDClassifier()
    for batch_no, batch in enumerate(db.mini_batches(100)):
        X, y = vectorize_batch(batch)
        if batch_no == 0:
            # partial_fit requires the full label set on the first call;
            # this assumes the first mini-batch contains every class.
            model.partial_fit(X, y, classes=np.unique(y))
        else:
            model.partial_fit(X, y)
        if sampling and batch_no == 10:
            break
    return model
def train(train_ids, class_id, polys, gs, patch_size):
    print('TRAINING...')

    # First calculate standard scaler parameters
    scaler = StandardScaler()
    for img_id in train_ids:
        print('Calculating scaler for ' + str(img_id) + ' for class ' +
              str(class_id) + ' at ' + str(datetime.now(timezone('EST'))))
        im_rgb = tiff.imread('input/three_band/{}.tif'.format(img_id)).transpose([1, 2, 0])
        patches = extract_patches_2d(im_rgb, patch_size)
        patches = np.reshape(patches, (len(patches), -1))
        # xs = im_rgb.reshape(-1, 3).astype(np.float32)
        xs = patches.astype(np.float32)
        scaler.partial_fit(xs)

    # Next build the logistic model
    model = SGDClassifier(loss='log')
    for img_id in train_ids:
        print('Training on ' + str(img_id) + ' for class ' + str(class_id) +
              ' at ' + str(datetime.now(timezone('EST'))))

        # Load grid size for current image polygon coordinates
        x_max, y_min = gs[gs['ImageId'] == img_id].iloc[0, 1:].astype(float)

        # Read current image with tiff
        im_rgb = tiff.imread('input/three_band/{}.tif'.format(img_id)).transpose([1, 2, 0])
        im_size = im_rgb.shape[:2]
        patches = extract_patches_2d(im_rgb, patch_size)
        print(len(patches))
        patches = np.reshape(patches, (len(patches), -1))

        # Read in polygons for current image
        cur_polygons = polys[(polys['ImageId'] == img_id) &
                             (polys['ClassType'] == class_id)].iloc[0]['MultipolygonWKT']
        train_polygons = shapely.wkt.loads(cur_polygons)
        x_scaler, y_scaler = get_scalers(im_size, x_max, y_min)
        train_polygons_scaled = shapely.affinity.scale(
            train_polygons, xfact=x_scaler, yfact=y_scaler, origin=(0, 0, 0))
        train_mask = get_polygon_mask(train_polygons_scaled, im_size)

        # Load xs from image and ys from polygon mask
        # xs = im_rgb.reshape(-1, 3).astype(np.float32)
        xs = patches.astype(np.float32)

        # Drop beginning & end rows and columns to account for patch size:
        # integer division deletes 0 pixels for patch_size 1, 1 for 3, 2 for 5, etc.
        # (Guard against edges_to_delete == 0, where a [0:-0] slice would be empty.)
        edges_to_delete = (patch_size[1] - 1) // 2
        if edges_to_delete > 0:
            train_mask = train_mask[edges_to_delete:-edges_to_delete,
                                    edges_to_delete:-edges_to_delete]
        ys = train_mask.reshape(-1)

        # Scale x values with trained scaler
        # print(xs.mean(axis=0))
        xs = scaler.transform(xs)
        print(im_rgb.shape)
        print(xs.shape)
        print(ys.shape)
        # print(xs.mean(axis=0))
        print('training partial fit...')
        model.partial_fit(xs, ys, classes=(0, 1))

    return scaler, model
class modle(object):
    def __init__(self, modle_n=0):
        # Build the model
        if modle_n == 0:
            # Support vector machine
            self.clf = svm.SVC()
        elif modle_n == 1:
            self.clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6)
        elif modle_n == 2:
            # Stochastic gradient descent; see the sklearn docs for
            # SGDClassifier's parameter settings
            self.clf = SGDClassifier()
        '''
        clf = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                      decision_function_shape=None, degree=3, gamma='auto',
                      kernel='rbf', max_iter=-1, probability=False,
                      random_state=None, shrinking=True, tol=0.001,
                      verbose=False)
        '''

    def train(self, X, y):
        self.clf.fit(X, y)
        self.out_clf()

    def train_batch(self, X, y, m=2, n=0):
        if m == 0:
            # Support vector machine
            pass
        elif m == 1:
            pass
        elif m == 2:
            # Stochastic gradient descent: use partial_fit and pass
            # `classes` on the first call
            self.clf.partial_fit(X, y, classes=np.array([0, 1]))
            print("train...{0}".format(n))  # current iteration count
        self.out_clf()

    def in_clf(self):
        # Load the model
        self.clf = joblib.load('TalkingDataAdTracking/data/sgd.pkl')
        print('in_clf...ok')

    def out_clf(self):
        # Save the model
        joblib.dump(self.clf, 'TalkingDataAdTracking/data/sgd.pkl')
        print('out_clf...ok')

    def evaluate(self, X, y):
        e = self.clf.score(X, y)
        print(e)
        return e

    def evaluate2(self, X, y):
        # Quick cross-validated evaluation
        e = cross_validation.cross_val_score(self.clf, X, y, cv=5)
        print(e)
        return e

    def predict(self, X):
        # Predict
        return self.clf.predict(X)
def create_classifier(self):
    DB.db.connect()
    clf = SGDClassifier(loss="modified_huber")
    labs_map = NameToIndex()
    with DB.db.transaction():
        offset = 0
        words_count = self.get_words_count()
        classes = numpy.arange(0, words_count)
        x_all = []
        y_all = []
        while True:
            print ' %d partial_fit %d' % (time(), offset)
            query = DB.Vocabulary\
                .select(DB.Vocabulary.lv1, DB.Vocabulary.lv2)\
                .join(DB.PcaModel, on=(DB.Vocabulary.feature == DB.PcaModel.feature))\
                .order_by(DB.Vocabulary.feature).offset(offset).limit(1000)\
                .tuples().iterator()
            features = numpy.array(map(lambda x: [x[0]] + list(x[1]), query))
            offset += len(features)
            if len(features) == 0:
                break
            Y = features[:, 0]
            X = features[:, 1:]
            labs = []
            for lab in Y:
                labs.append(labs_map.map(lab))
            if len(x_all) < 10000:
                x_all = x_all + X.tolist()
                y_all = y_all + labs
            labs = numpy.array(labs)
            # clf = LinearSVC()
            # clf = OneVsRestClassifier(SVC(probability=True, kernel='linear'))
            # clf.fit(X, labs)
            clf.partial_fit(X, labs, classes)
            print clf.score(x_all, y_all)

        DB.TrainingResult.delete().where(
            DB.TrainingResult.name == self.__class__.__name__ + "_clf").execute()
        DB.TrainingResult.delete().where(
            DB.TrainingResult.name == self.__class__.__name__ + "_labs_map").execute()

        tr = DB.TrainingResult()
        tr.name = self.__class__.__name__ + "_clf"
        tr.data = clf
        tr.save()

        tr = DB.TrainingResult()
        tr.name = self.__class__.__name__ + "_labs_map"
        tr.data = labs_map
        tr.save()
def loadModel():
    """Load labelled data (label in column 0) from dataset.csv and fit a
    binary logistic SGD model."""
    train = np.genfromtxt('dataset.csv', delimiter=',')
    x_train = train[:, 1:]
    y_train = np.uint8(train[:, 0])
    clf = SGDClassifier(loss="log")
    clf.partial_fit(x_train, y_train, classes=[0, 1])
    return x_train, y_train, clf
def objective(trial):
    alpha = trial.suggest_uniform("alpha", 0.0, 1.0)
    clf = SGDClassifier(alpha=alpha)
    n_train_iter = 100

    for step in range(n_train_iter):
        clf.partial_fit(X_train, y_train, classes=classes)
        # Report the running validation score so Optuna's pruners can stop
        # unpromising trials early (the original snippet was cut off here).
        intermediate_value = clf.score(X_valid, y_valid)
        trial.report(intermediate_value, step)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return clf.score(X_valid, y_valid)
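# A hedged sketch of running the objective above; it assumes optuna is
# installed and that X_train / y_train / X_valid / y_valid / classes are
# defined at module scope, as the objective requires.
import optuna

study = optuna.create_study(direction="maximize",
                            pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=20)
print(study.best_params)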
def chi_feature_select(train_file, test_file):
    lines = read_text_src(train_file)
    lines = [x for x in lines if len(x) > 1]
    X_train = [line[1] for line in lines]
    y_train = [line[0] for line in lines]

    lines = read_text_src(test_file)
    lines = [x for x in lines if len(x) > 1]
    X_test = [line[1] for line in lines]
    y_test = [line[0] for line in lines]

    vectorizer = TfidfVectorizer(tokenizer=zh_tokenize)  # ngram_range=(1, 2)
    X_train = vectorizer.fit_transform(X_train)
    print X_train.shape
    X_test = vectorizer.transform(X_test)

    # word = vectorizer.get_feature_names()
    # N = X_train.shape[1]
    # ch2 = SelectKBest(chi2, k=int(N * 0.2))  # .fit_transform(X, y)
    # X_train = ch2.fit_transform(X_train, y_train)
    # X_test = ch2.transform(X_test)
    # feature_names = [word[i] for i in ch2.get_support(indices=True)]
    # for i in feature_names:
    #     print i.encode('utf-8')
    # feature_names = np.asarray(feature_names)
    # print feature_names

    # clf = LinearSVC(penalty="l1", dual=False, tol=1e-3)
    # clf.fit(X_train, y_train)
    clf = SGDClassifier(loss="log", penalty='l1')
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    prob = clf.predict_proba(X_test[0])
    print prob

    # Incrementally update the already-fitted model with two new labelled examples
    X = ["市场经济复苏,互联网公司蓬勃发展", "世纪大战终于开启,勇士引得第73胜"]
    Y = ['1', '0']
    X = vectorizer.transform(X)
    clf.partial_fit(X, Y, classes=['0', '1'])

    tmpx = ['暴风科技股价大跌', "世纪大战终于开启,勇士引得第73胜"]
    tmpX = vectorizer.transform(tmpx)
    pred = clf.predict(tmpX)
    print pred
def main():
    # Get training and model filenames
    with open('model_metadata.json') as f:
        config = json.load(f)
    CLASSES = [float(x) for x in config['classes']]
    model_filename = config['modelFilename']
    NUM_BITS_FOR_HASHING = config['numBitsForHashing']
    train_filename = config['trainFilename']
    sklearn_version_expected = config['sklearnVersion']

    # If sklearn version is wrong, exit without training.
    # (Note: float() assumes a MAJOR.MINOR version string such as '0.18'.)
    if float(sklearn.__version__) != float(sklearn_version_expected):
        print "Wrong sklearn version"
        sys.exit(0)

    with open(train_filename) as f:
        lines = (tuple(line.rstrip('\n').split('\t')) for line in f)
        parsed_lines = ((line[1:], float(line[0])) for line in lines)

        # Parse header and get feature names for namespacing.
        # The generators above are lazy, so consuming the first line here
        # means parsed_lines starts at the first data row.
        header = next(lines)
        FEATURE_NAMES = tuple(header[1:])

        # Build pipeline
        pre_processing_pipeline = make_pre_processing_pipeline(
            feature_names=FEATURE_NAMES,
            num_bits_for_hashing=NUM_BITS_FOR_HASHING
        )

        # Instantiate classifier
        # (a logistic regression model with Stochastic Gradient Descent)
        clf = SGDClassifier(loss='log')

        # Train model in mini-batches
        batch_size = 8000
        for rows, labels in batched_lines(batch_size, parsed_lines):
            processed_rows = pre_processing_pipeline.fit_transform(rows)
            clf.partial_fit(processed_rows, labels, classes=CLASSES)
        print clf

    # Save model
    joblib.dump(clf, model_filename)

    # Reload just to make sure it serializes and de- properly
    joblib.load(model_filename)
class Model:
    def __init__(self, numFeatures, learningRate, numEpochs, ppenalty="l1",
                 mustShuffle=True):
        # Init scikit models
        self.FH = FeatureHasher(n_features=numFeatures, input_type='string')
        self.Classifier = SGDClassifier(penalty=ppenalty, loss='log',
                                        alpha=learningRate, n_iter=numEpochs,
                                        shuffle=mustShuffle)

    def train(self, gen, v=False):
        i = 0
        for x, y in gen:  # For each batch
            xHash = self.FH.transform(x)  # hash trick
            y = np.array(y)
            ## for epoch in range(numEpochs):
            self.Classifier.partial_fit(xHash, y, [0, 1])
            i += len(x)
            if v:
                print(str(datetime.now())[:-7], "example:", i)

    def test(self, gen, v=False):
        # init target and prediction arrays
        ytot = np.array([])
        ptot = np.array([])
        # Get prediction for each batch
        i = 0
        for x, y in gen:
            xHash = self.FH.transform(x)  # hash trick
            p = self.Classifier.predict_proba(xHash)
            p = p.T[1].T  # Keep column corresponding to probability of class 1
            # Stack target and prediction for later analysis
            ytot = np.hstack((ytot, y))
            ptot = np.hstack((ptot, p))
            i += y.shape[0]
            if v:
                print(str(datetime.now())[:-7], "example:", i)
        if v:
            print("Score:", self.score(ytot, ptot))
        return (ytot, ptot)

    def predictBatch(self, batch):
        hashedBatch = self.FH.transform(batch)
        prediction = self.Classifier.predict_proba(hashedBatch)
        return prediction

    def generatePrediction(self, generator):
        for xBatch, idBatch in generator:
            prediction = self.predictBatch(xBatch)
            yield prediction, idBatch

    def score(self, target, prediction):
        return llfun(target, prediction)
def train(input_filename, num_train_examples, num_test_examples, block_size):
    # Load initial training data and test data
    X_train, y_train, X_test, y_test, scaler = loaddata(input_filename,
                                                        num_test_examples,
                                                        block_size)

    # Feature generation using random forests
    forest = RandomForestClassifier(n_estimators=150, n_jobs=-1)
    forest.fit(X_train, y_train)
    encoder = OneHotEncoder()
    encoder.fit(forest.apply(X_train))
    X_test = encoder.transform(forest.apply(X_test))

    # Make sure that classes are weighted inversely to their frequencies
    weights = float(y_train.shape[0]) / (2 * numpy.bincount(y_train))
    class_weights = {0: weights[0], 1: weights[1]}

    learner = SGDClassifier(
        loss="hinge",
        penalty="l2",
        learning_rate="invscaling",
        alpha=0.0001,
        average=10 ** 4,
        eta0=1.0,
        class_weight=class_weights,
    )

    num_passes = 3
    aucs = []
    for j in range(num_passes):
        for i in range(0, num_train_examples, block_size):
            df = pandas.read_csv(input_filename, header=None,
                                 skiprows=i, nrows=block_size)
            X_train = df.values[:, 1:]
            X_train = scaler.transform(X_train)
            X_train = encoder.transform(forest.apply(X_train))
            y_train = numpy.array(df.values[:, 0], numpy.int)
            del df

            learner.partial_fit(X_train, y_train, classes=numpy.array([0, 1]))
            y_pred_prob = learner.decision_function(X_test)
            auc = roc_auc_score(y_test, y_pred_prob)
            aucs.append([i + num_train_examples * j, auc])
            print(aucs[-1])

    df = pandas.DataFrame(aucs, columns=["Iterations", "AUC"])
    df = df.set_index("Iterations")
    return df
def batchPredict(X, y):
    est = SGDClassifier(loss='log', penalty='l1', alpha=0.01)
    progressive_validation_score = []
    train_score = []
    l = len(X)
    step = 500
    for datapoint in range(0, l, step):
        X_batch = X[datapoint:datapoint + step]
        y_batch = y[datapoint:datapoint + step]
        # Score before fitting this batch: a progressive (out-of-sample)
        # validation score.
        if datapoint > 0:
            progressive_validation_score.append(est.score(X_batch, y_batch))
        est.partial_fit(X_batch, y_batch, classes=range(10))
        if datapoint > 0:
            train_score.append(est.score(X_batch, y_batch))
    plt.plot(train_score, label='train score')
    plt.plot(progressive_validation_score, label='progressive validation score')
    plt.xlabel('Mini-batch')
    plt.ylabel('Score')
    plt.legend(loc='best')
    plt.show()
class Model:
    def __init__(self, numFeatures, learningRate, mustShuffle=True):
        # Init scikit models
        self.FH = FeatureHasher(n_features=numFeatures, input_type='pair')
        self.Classifier = SGDClassifier(loss='log', alpha=learningRate,
                                        shuffle=mustShuffle)

    def train(self, gen, numEpochs, v=False):
        i = 0
        for x, y in gen:  # For each batch
            xHash = self.FH.transform(x)  # hash trick
            y = np.array(y)
            for epoch in range(numEpochs):
                self.Classifier.partial_fit(xHash, y, [0, 1])
            if v and (i % (numBatches / 60)) == 0:
                print(datetime.now(), "example:", i * sizeBatch)
            i += 1

    def test(self, gen, v=False):
        # init target and prediction arrays
        ytot = np.array([])
        ptot = np.array([])
        # Get prediction for each batch
        for batch in gen:
            data = list(batch)  # store batch in memory for prediction
            x, y = data[0], np.array(data[1])
            x = self.FH.transform(x)
            p = self.Classifier.predict_proba(x)
            p = p.T[1].T  # Keep column corresponding to probability of class 1
            # Stack target and prediction for later analysis
            ytot = np.hstack((ytot, y))
            ptot = np.hstack((ptot, p))
        if v:
            print("Score:", self.score(ytot, ptot))
        return (ytot, ptot)

    def score(self, target, prediction):
        return llfun(target, prediction)
def partial_fit(self, X, y, *args, **kw):
    X = sp.csr_matrix(X)
    return SGDClassifier.partial_fit(self, X, y, *args, **kw)
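# Context sketch for the override above: it reads like a method of an
# SGDClassifier subclass that forces every incremental update onto CSR input
# (scikit-learn's own test suite uses this pattern to exercise the sparse
# code path). The class name and imports below are assumptions.
import scipy.sparse as sp
from sklearn.linear_model import SGDClassifier

class SparseSGDClassifier(SGDClassifier):
    def partial_fit(self, X, y, *args, **kw):
        X = sp.csr_matrix(X)  # always train on sparse input
        return SGDClassifier.partial_fit(self, X, y, *args, **kw)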
class SGDLearner:
    def __init__(self, X_train, y_train, X_test, y_test, random_state, eta0, alpha):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.rng = random_state  # work out how to use random state (ie save)
        self.sgd = SGDClassifier(random_state=self.rng, fit_intercept=True)
        self.sgd.loss = "hinge"
        self.sgd.alpha = alpha
        self.sgd.learning_rate = "constant"
        self.sgd.eta0 = eta0
        # how to make these controlled properties (from chaco)
        # E_part = np.NaN((self.T // self.update_period, ))
        self.ntrain, self.ncoef = self.X_train.shape
        self.wts = np.zeros((self.ntrain + 1, self.ncoef + 1))
        self.grad = np.zeros((self.ntrain + 1, self.ncoef + 1))
        self.reset()

    def reset(self):
        self.scores = []  # turn into dataframe?
        self.mn_grad = []
        self.st_grad = []
        self.sgd.warm_start = False  # reset learning; does this reset the learning-rate time counter?
        self._iT = 0

    def learn(self, learn_for, probe_every):
        """Train for learn_for steps and store results every probe_every."""
        ind = self.rng.randint(0, self.ntrain, learn_for)
        for time in range(0, learn_for, probe_every):
            self.sgd.partial_fit(
                self.X_train[ind[time:time + probe_every], :],
                self.y_train[ind[time:time + probe_every]],
                [0, 1]
            )
            self._iT += probe_every
            self.sgd.warm_start = False  # (not necessary? unless we use fit rather than partial)
            self.calc_grad()
            mn = self.grad.mean(axis=0) * 1e-6
            st = self.grad.std(axis=0) * 1e-6
            self.mn_grad.append(mn)
            self.st_grad.append(st)
            ind_part = self.rng.randint(0, self.ntrain, int(self.ntrain * 0.1))
            scores = {"timestep": self._iT}
            scores["part"] = self.sgd.score(self.X_train[ind_part, :],
                                            self.y_train[ind_part])
            scores["train"] = self.sgd.score(self.X_train, self.y_train)
            scores["test"] = self.sgd.score(self.X_test, self.y_test)
            self.scores.append(scores)

    def calc_grad(self):
        # now calculate the current gradient variance by using a very low
        # learning rate and recording the per-sample weight updates
        eta0 = self.sgd.eta0
        self.sgd.eta0 = 1e-6 * eta0
        self.wts[0, :-1] = self.sgd.coef_
        self.wts[0, -1] = self.sgd.intercept_
        for im in range(self.ntrain):
            self.sgd.partial_fit(self.X_train[im, :].reshape((1, -1)),
                                 self.y_train[im].reshape((1,)), [0, 1])
            # row im + 1 holds the weights after sample im, so np.diff below
            # yields one gradient estimate per sample
            self.wts[im + 1, :-1] = self.sgd.coef_
            self.wts[im + 1, -1] = self.sgd.intercept_
        self.grad = np.diff(self.wts, axis=0)
        self.sgd.eta0 = eta0  # restore original learning rate
lr_model = SGDClassifier(loss='log')  # using log-loss for LogisticRegression

scores = []
k = 1  # k and i control the training scale (training samples used = all_samples / k)
i = 1
for chunk in df_train_f:
    if i < k:
        i += 1
        continue
    i = 1
    df_train = oh_enc.transform(chunk)

    # ----- training LR ----- #
    feature_train = df_train.columns.drop(['id', 'click'])
    train_X = df_train[feature_train]
    train_y = df_train['click'].astype('int')

    lr_model.partial_fit(train_X, train_y, classes=[0, 1])  # fitting

    # the score of training
    y_pred = lr_model.predict_proba(train_X)[:, 1]
    score = log_loss(train_y, y_pred)
    scores.append(score)

## store the pre-trained lr_model
pickle.dump(lr_model, open(fp_lr_model, 'wb'))

## show the training curve
f1 = plt.figure(1)
plt.plot(scores)
plt.xlabel('iterations')
plt.ylabel('log_loss')
plt.title('log_loss of training')
class JointModel:
    # creating an empty model
    def __init__(self):
        # known words and their classifiers
        self.knownWords = {}
        self.minimumGuessScore = JM_GUESS_SCORE_THRESHOLD
        self.clfColor = SGDClassifier(loss="log", penalty="l2")
        self.clfShape = SGDClassifier(loss="log", penalty="l2")
        self.classColors = []
        self.classShapes = []

    # add a word-example pair to the SGD classifier
    # word: string
    # example: image
    def add_sgd_class(self, word, example):
        self.clfColor = SGDClassifier(loss="log", penalty="l2")
        self.clfShape = SGDClassifier(loss="log", penalty="l2")
        X_Color = [example['Color']]
        y_Color = [word]
        X_Shape = [example['Shape']]
        y_Shape = [word]
        for word in self.knownWords.keys():
            for classifier in self.knownWords[word]:
                if "Synonym" not in str(type(classifier)):
                    examples = classifier.positiveExamples
                    for ex in examples:
                        if "Color" in classifier._type_:
                            X_Color.append(ex['Color'])
                            y_Color.append(word)
                        if "Shape" in classifier._type_:
                            X_Shape.append(ex['Shape'])
                            y_Shape.append(word)
        classes = np.unique(y_Color)
        self.clfColor.partial_fit(X_Color, y_Color, classes=classes)
        self.classColors = classes
        classes = np.unique(y_Shape)
        self.clfShape.partial_fit(X_Shape, y_Shape, classes=classes)
        self.classShapes = classes

    # add a word-example pair to the existing SGD classifier
    # word: string
    # example: image
    def partial_fit_classifer(self, word, example):
        self.clfColor.partial_fit([example['Color']], [word])
        self.clfShape.partial_fit([example['Shape']], [word])

    # add a word-example pair to the model
    # word: string
    # example: image
    # examplePolarity: global definition (constant)
    def add_word_example_pair(self, word, example, examplePolarity):
        currentKnownWords = self.knownWords.keys()
        # check if it is a new word
        if word not in self.knownWords.keys():
            # new word: add possibly associated classifiers
            # (limited to initialization)
            self.knownWords[word] = []
            self.knownWords[word].append(ObjColor(word, example, examplePolarity))
            self.knownWords[word].append(ObjShape(word, example, examplePolarity))
            self.add_sgd_class(word, example)
            # add possibilities of being a synonym; this will not contain
            # redundant information like (a b), (a c) and (b c), because
            # synonyms are added in order
            for knownWord in currentKnownWords:
                # word may be a synonym of knownWord; when classifying,
                # synonyms are checked for all classifier types, e.g. color, shape
                self.knownWords[word].append(
                    ObjSynonymColor(word, knownWord, example, examplePolarity))
                self.knownWords[word].append(
                    ObjSynonymShape(word, knownWord, example, examplePolarity))
        else:
            self.partial_fit_classifer(word, example)
            # known word: just add the example
            # add in all word objects (where adding an example is possible)
            for classifier in self.knownWords[word]:
                # assume all types qualify for example addition
                classifier.add_example(example, examplePolarity)

    '''
    experiment: trained attributes
    '''
    # classify a word with corresponding example and get positive or negative
    # confirmation; if no classifier is confident, then we don't know
    # e.g. "is this green?"
    # word: string
    # example: image
    # classificationScores: dictionary of classification scores per classifier
    def classify_word_example(self, word, example, checkSynonyms=True):
        probColor = 0.0
        probShape = 0.0
        if word in self.classColors:
            # look up the class index (classColors is an array, not callable)
            index = list(self.classColors).index(word)
            colorPredict = self.clfColor.predict([example['Color']])
            colorProbs = self.clfColor.predict_proba([example['Color']])
            probColor = colorProbs[0][index]
        if word in self.classShapes:
            index = list(self.classShapes).index(word)
            shapePredict = self.clfShape.predict([example['Shape']])
            shapeProbs = self.clfShape.predict_proba([example['Shape']])
            probShape = shapeProbs[0][index]

        probabilityScores = {}
        pExampleGivenWordValues = {}
        # check all classifiers related to this word
        for classifier in self.knownWords[word]:
            # print(word, str(classifier))
            if "Synonym" not in str(type(classifier)):
                # use non-synonym classifiers directly
                [probabilityScore, pExampleGivenWord] = \
                    classifier.calculate_probability_score(example)
            elif checkSynonyms == True:
                # use synonym classifiers indirectly: add positive and
                # negative examples known for the word but not the synonym.
                # We do not care about the return values for recursive calls;
                # we only want to populate probabilityScores in each recursion
                if "Color" in str(type(classifier)):
                    searchType = "ObjColor"
                elif "Shape" in str(type(classifier)):
                    searchType = "ObjShape"
                else:
                    # should never come here for the given initialization
                    pass
                for synonymClassifier in self.knownWords[classifier.synonym]:
                    if searchType in str(type(synonymClassifier)):
                        # will only enter this once;
                        # break is efficient but not necessary
                        synonymClassifierObj = synonymClassifier
                        break
                [probabilityScore, pExampleGivenWord] = \
                    synonymClassifierObj.calculate_probability_score(
                        example, classifier.positiveExamples,
                        classifier.negativeExamples)
            else:
                # skip synonym classifiers entirely when checkSynonyms is
                # False (avoids recording a stale score for them)
                continue
            # add score to classification scores
            probabilityScores[classifier] = probabilityScore
            pExampleGivenWordValues[classifier] = pExampleGivenWord

        # now we have accumulated all the scores;
        # check if any of them exceed the threshold
        # (initially assume inconsistency)
        isWordExampleConsistent = False
        # compare for positive
        # more convoluted but faster this way because no if condition
        for classifier in probabilityScores.keys():
            isWordExampleConsistent = isWordExampleConsistent or \
                (probabilityScores[classifier] >= classifier.get_classification_threshold())

        # return the consistency decision and probability scores
        return [isWordExampleConsistent, probabilityScores, pExampleGivenWordValues]

    '''
    experiment: novel scene
    '''
    # classify a new example and get the corresponding word; if no classifier
    # is confident, it is a new category of example (not handled right now)
    # e.g. "what is this?"
    # e.g. of Bayes' rule: p(cube|example) = p(example|cube) * p(cube) / p(example)
    # p(example) is constant across all word classifications and can be
    # ignored when comparing them
    # p(example|cube): the fraction of examples in "cube" which matched the
    # current example
    # p(cube): the fraction of examples under "cube" relative to examples over
    # all known words, i.e. p(cube) = total examples of cube / total examples
    # of all words; the denominator is constant for all word scores, so ignore
    # it and use the non-normalized version of p(cube) to calculate the score
    # example: image
    def classify_example(self, example):
        colorPredict = self.clfColor.predict([example['Color']])
        colorProbs = self.clfColor.predict_proba([example['Color']])
        shapePredict = self.clfShape.predict([example['Shape']])
        shapeProbs = self.clfShape.predict_proba([example['Shape']])

        # check against each known word
        # maximum probability score data corresponding to a word
        wordMaxProabilityScores = {}
        # all probability score data corresponding to a word
        wordProbabilityScores = {}
        # maintain best guess
        bestGuessWord = ""
        bestGuessObj = ""
        bestGuessMaxScore = 0

        # calculate word probability scores; check all associated classifiers
        for word in self.knownWords.keys():
            [isWordExampleConsistent, probabilityScores, pExampleGivenWordValues] = \
                self.classify_word_example(word, example)
            # select maximum score corresponding to best interpretation
            maxScore = max(probabilityScores.values())
            maxScoreObj = "none"
            for classifier in probabilityScores:
                if probabilityScores[classifier] == maxScore:
                    maxScoreObj = classifier
            # add to probability scores
            # totalObjExamples = float(len(maxScoreObj.positiveExamples) + len(maxScoreObj.negativeExamples))
            # wordMaxProabilityScores[word] = [maxScore, maxScoreObj, maxScore/totalObjExamples]
            wordMaxProabilityScores[word] = [maxScore, maxScoreObj, pExampleGivenWordValues]
            wordProbabilityScores[word] = [isWordExampleConsistent, probabilityScores]
            # update best guess if possible
            if maxScore > bestGuessMaxScore:
                bestGuessWord = word
                bestGuessObj = maxScoreObj
                bestGuessMaxScore = maxScore

        # guess confidence (initial assumption)
        isConfidentGuess = False
        if bestGuessMaxScore >= self.minimumGuessScore:
            isConfidentGuess = True

        # return everything known to man
        return [isConfidentGuess, bestGuessWord, bestGuessObj, bestGuessMaxScore,
                wordMaxProabilityScores, wordProbabilityScores]

    '''
    experiment: novel english
    '''
    # get a sentence and an image and compute a score which represents the
    # association of words with that image: classify the image, get words
    # associated with it in descending order, get the ranks of the words
    # mentioned by the user, and score as the sum of 1/rank for each word
    # e.g. "this is a blue cube"
    def associate_words_example(self, listOfPositiveWords, listOfNegativeWords, example):
        # classify this image and get associated words
        [isConfidentGuess, bestGuessWord, bestGuessObj, bestGuessMaxScore,
         wordMaxProabilityScores, wordProbabilityScores] = self.classify_example(example)

        # form a dictionary of score: word
        wordScoreDictionary = {}
        for word in wordMaxProabilityScores:
            score = wordMaxProabilityScores[word][2][wordMaxProabilityScores[word][1]]
            if score not in wordScoreDictionary.keys():
                wordScoreDictionary[score] = [word]
            else:
                wordScoreDictionary[score].append(word)

        # now rank these in descending order
        rank = 0
        wordRanks = {}
        for wordScore in sorted(wordScoreDictionary.keys(), reverse=True):
            rank += 1
            for word in wordScoreDictionary[wordScore]:
                wordRanks[word] = rank

        # compute the total score based on the ranks of the words in each list
        totalScore = 0
        for word in listOfPositiveWords:
            if word in wordRanks:
                rank = wordRanks[word]
                # use float division for a float result
                totalScore += 1.0 / rank
        for word in listOfNegativeWords:
            if word in wordRanks:
                rank = wordRanks[word]
                totalScore -= 1.0 / rank

        return [totalScore, wordRanks, wordScoreDictionary]
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier


class SGDLearner:
    def __init__(self, X_train, y_train, X_test, y_test, random_state, eta0, alpha):
        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.rng = random_state  # work out how to use the random state (i.e. save it)
        self.sgd = SGDClassifier(random_state=self.rng, fit_intercept=True)
        # how to make these controlled properties (from chaco)?
        self.sgd.loss = "hinge"
        self.sgd.alpha = alpha
        self.sgd.learning_rate = "constant"
        self.sgd.eta0 = eta0
        # E_part = np.NaN((self.T // self.update_period, ))
        self.ntrain, self.ncoef = self.X_train.shape
        self.wts = np.zeros((self.ntrain + 1, self.ncoef + 1))
        self.grad = np.zeros((self.ntrain + 1, self.ncoef + 1))
        self.reset()

    def reset(self):
        self.scores = []  # turn into a dataframe?
        self.mn_grad = []
        self.st_grad = []
        self.sgd.warm_start = False  # reset learning; does this reset the learning-rate time counter?
        self._iT = 0
        d = pd.Index([], name="timestep")
        self.data = pd.DataFrame(index=d, columns=["part", "train", "test"])

    def learn(self, learn_for, probe_every):
        """Train for `learn_for` steps and store results every `probe_every` steps."""
        ind = self.rng.randint(0, self.ntrain, learn_for)
        for time in range(0, learn_for, probe_every):
            self.sgd.partial_fit(
                self.X_train[ind[time:time + probe_every], :],
                self.y_train[ind[time:time + probe_every]],
                [0, 1],
            )
            self._iT += probe_every
            self.sgd.warm_start = False  # not necessary unless we use fit rather than partial_fit
            self.calc_grad()
            mn = self.grad.mean(axis=0) * 1e-6
            st = self.grad.std(axis=0) * 1e-6
            self.mn_grad.append(mn)
            self.st_grad.append(st)  # was appended to mn_grad by mistake
            ind_part = self.rng.randint(0, self.ntrain, int(self.ntrain * 0.1))
            self.data.loc[self._iT, "part"] = self.sgd.score(self.X_train[ind_part, :], self.y_train[ind_part])
            self.data.loc[self._iT, "train"] = self.sgd.score(self.X_train, self.y_train)
            self.data.loc[self._iT, "test"] = self.sgd.score(self.X_test, self.y_test)

    def calc_grad(self):
        # estimate the current gradient variance by taking single-sample steps
        # with a very low learning rate and differencing the weight snapshots
        eta0 = self.sgd.eta0
        self.sgd.eta0 = 1e-6 * eta0
        self.wts[0, :-1] = self.sgd.coef_
        self.wts[0, -1] = self.sgd.intercept_
        for im in range(self.ntrain):
            self.sgd.partial_fit(self.X_train[im, :].reshape((1, -1)), self.y_train[im].reshape((1,)), [0, 1])
            # store in row im + 1 so np.diff below pairs consecutive steps
            # (row 0 held the pre-step weights and was previously overwritten)
            self.wts[im + 1, :-1] = self.sgd.coef_
            self.wts[im + 1, -1] = self.sgd.intercept_
        self.grad = np.diff(self.wts, axis=0)
        self.sgd.eta0 = eta0  # restore the original learning rate

    def plot(self, ax_graph):
        if not self.data.empty:
            for name, line in self.lines.items():
                line.set_data(self.data.index.values, self.data[name])
            ax_graph.set_xlim(0, self.data.index.values[-1])
            plt.draw()  # update the plot
        else:
            ax_graph.cla()
            self.lines = {}
            ax_graph.set_xlabel("iter")
            ax_graph.set_ylabel("class")
            ax_graph.set_ylim(0, 1)
            colours = {"train": "blue", "part": "green", "test": "red"}
            for name, col in colours.items():
                self.lines[name] = plt.Line2D([], [], color=col, label=name)
                ax_graph.add_line(self.lines[name])
            ax_graph.legend(loc="lower left")
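A standalone sketch of the calc_grad idea: with learning_rate="constant", each single-sample partial_fit step moves the weights by roughly -eta0 times the (regularized) per-sample gradient, so differencing consecutive weight snapshots recovers scaled gradient estimates. The data here is synthetic.

import numpy as np
from sklearn.linear_model import SGDClassifier

rng = np.random.RandomState(0)
X = rng.randn(50, 3)
y = (X[:, 0] > 0).astype(int)

sgd = SGDClassifier(loss="hinge", learning_rate="constant", eta0=1e-6, random_state=0)
sgd.partial_fit(X, y, classes=[0, 1])

wts = np.zeros((len(X) + 1, X.shape[1]))
wts[0] = sgd.coef_
for i in range(len(X)):
    sgd.partial_fit(X[i:i + 1], y[i:i + 1])  # one tiny step per sample
    wts[i + 1] = sgd.coef_

grads = np.diff(wts, axis=0) / -1e-6  # per-sample gradient estimates
print(grads.mean(axis=0), grads.std(axis=0))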
# ddir = 'E:/workspace/data/cdiscount/'
# wdir = 'C:/Users/ngaude/Documents/GitHub/kaggle/cdiscount/'
ddir = '/home/ngaude/workspace/data/cdiscount/'
wdir = '/home/ngaude/workspace/github/kaggle/cdiscount/'

f_itocat = ddir + 'joblib/itocat'
(itocat1, cat1toi, itocat2, cat2toi, itocat3, cat3toi) = joblib.load(f_itocat)
(X, Y) = joblib.load(ddir + 'joblib/XYneighbor')
Y = Y[:, 2]

classes = np.unique(Y)
classifier = SGDClassifier(loss='hinge', n_jobs=3, penalty='l2')
classifier.partial_fit(X, Y, classes=classes)
classifier.sparsify()

# nrows = 1000
# trainrows = Xtrain.shape[0]
# epochs = 5 * trainrows / nrows
# for i in range(epochs):
#     a = np.random.randint(trainrows, size=nrows)
#     Xi = Xtrain[a, :]
#     Yi = Ytrain[a]
#     print 'partial_fit', i, '/', epochs
#     classifier.partial_fit(Xi, Yi, classes=cat3toi.keys())
#
# print 'train', classifier.score(Xtrain[:30000], Ytrain[:30000])
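The commented-out loop above sketches epoch-style training on random minibatches. A runnable version of the same pattern, with synthetic stand-ins for Xtrain and Ytrain:

import numpy as np
from sklearn.linear_model import SGDClassifier

rng = np.random.RandomState(1)
Xtrain = rng.randn(10000, 20)
Ytrain = rng.randint(0, 3, 10000)

clf = SGDClassifier(loss='hinge', penalty='l2')
classes = np.unique(Ytrain)

nrows = 1000
epochs = 5 * Xtrain.shape[0] // nrows  # integer division, unlike the Python 2 original
for i in range(epochs):
    a = rng.randint(Xtrain.shape[0], size=nrows)
    clf.partial_fit(Xtrain[a, :], Ytrain[a], classes=classes)

print('train', clf.score(Xtrain[:3000], Ytrain[:3000]))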
time1 = time.time()
lda, train_bow = gensim_lda(train_array, num_topics, 6, 40)
time2 = time.time()
print 'lda time {}'.format(time2 - time1)

""" SGD with Fisher score: training """
classes = np.array([[c] for c in shuffled_train_classes])
clf = SGDClassifier(shuffle=True)
unique_classes = np.unique(classes)
time1 = time.time()
for i, score in enumerate(fisher_score(train_array, lda, num_topics, V, D_train)):
    clf.partial_fit(score, classes[i], unique_classes)
time2 = time.time()
print 'sgd time {}'.format(time2 - time1)

""" SGD with Fisher score: evaluation """
classes = np.array([[c] for c in test_classes])
time1 = time.time()
went_fine = 0
for i, score in enumerate(fisher_score(test_array, lda, num_topics, V, D_test)):
    c = clf.predict(score)
    if c[0] == test_classes[i]:
        went_fine += 1  # the original was truncated here; counting correct predictions
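The training loop above feeds the classifier one document at a time from a feature generator. A self-contained sketch of that pattern, with a dummy generator standing in for fisher_score:

import numpy as np
from sklearn.linear_model import SGDClassifier

def feature_stream(n_docs, n_features, rng):
    # stands in for fisher_score(...): yields one (1, n_features) row per document
    for _ in range(n_docs):
        yield rng.randn(1, n_features)

rng = np.random.RandomState(0)
labels = rng.randint(0, 2, 100)
clf = SGDClassifier(shuffle=True)
for i, score in enumerate(feature_stream(100, 8, rng)):
    clf.partial_fit(score, labels[i:i + 1], classes=np.unique(labels))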
# alpha tuning notes (setting --> accuracy):
# alpha = a/10 (0.00001)   --> 0.813194
# alpha = a/20 (0.0001/20) --> 0.811584
# alpha = a/15 (0.0001/15) --> 0.813287 !!
# alpha = a/12 (0.0001/12) --> 0.807602
# alpha = a/17 (0.0001/17) --> 0.811538
# 0.814882 <-- L1 penalty, l1_ratio = 0.5
# 0.819020 <-- L1 penalty, l1_ratio = 0.5, average = True, alpha = a/15 !! (best)
# 0.817806 <-- L2 penalty, l1_ratio = 0.15, average = True, alpha = a/15
# 0.818444 <-- L2 penalty, l1_ratio = 0.15, average = True, alpha = a/100
# 0.818413 <-- L2 penalty, l1_ratio = 0.15, average = True, alpha = a/50

count = 0
for line in sys.stdin:
    # if count >= 5000:
    #     break
    line = line.strip()
    label, x_string = line.split(" ", 1)
    label = int(label)
    x_original = np.fromstring(x_string, sep=' ')
    x = transform(x_original)  # apply the kernel function
    clf.partial_fit(x, [label], CLASSES)
    count += 1

# emit the learned weights for the reducer
for x in clf.coef_[0]:
    print x,
# print

# cat training_set.txt | python mapper.py | python reducer.py > r_weights.txt
# python evaluate.py r_weights.txt test_data.txt test_label.txt /Users/Charles/Desktop/
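The notes above single out averaged SGD with an L1 penalty and alpha = a/15 as the best run. A minimal configuration sketch of that setting (the value of a follows the notes; l1_ratio only takes effect with penalty='elasticnet', so it is omitted here):

from sklearn.linear_model import SGDClassifier

a = 0.0001
clf = SGDClassifier(penalty='l1', average=True, alpha=a / 15)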
import numpy as np
import pyprind
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

vect = HashingVectorizer(decode_error='ignore',
                         n_features=2**21,
                         preprocessor=None,
                         tokenizer=tokenizer)
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
doc_stream = stream_docs(path='/home/caofa/movie_data.csv')

pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
clf = clf.partial_fit(X_test, y_test)
sw = 1 + 4 * chunk.is_booking
chunk.drop(['cnt', 'hotel_cluster', 'is_booking'], axis=1, inplace=True)
XN = csr_matrix(chunk[num_col].values)
X = csr_matrix((chunk.shape[0], n_features))
rows = np.arange(chunk.shape[0])
for col in cat_col_all:
    # hash each categorical column into the shared n_features-wide space
    dat = np.ones(chunk.shape[0])
    cols = chunk[col] % n_features
    X += csr_matrix((dat, (rows, cols)), shape=(chunk.shape[0], n_features))
X = hstack((XN, X))

# evaluate on the booking rows only (sample weight > 1)
book_indices = sw[sw > 1].index.tolist()
X_test = csr_matrix(X)[book_indices]
y_test = y[book_indices]
clf.partial_fit(X, y, classes=np.arange(100), sample_weight=sw)
# len([i for i in clf.coef_[1] if i != 0])
# len([i for i in clf.coef_[1] if i > 0])
# jb = [col for h in np.argsort(abs(clf.coef_[5])) for col in chunk.columns if (hash(col) % n_features) == h]
# preds += np.vstack(tuple([clf.predict_proba(test.loc[i*chunksize:min((i+1)*chunksize, test.shape[0]), :]) for i in range(int(test.shape[0]/100000))]))
# preds += clf.predict_proba(test)
count = count + chunksize
map5 = map5eval(clf.predict_proba(X_test), y_test)
print('%d rows completed. MAP@5: %f' % (count, map5))
if count / chunksize == 200:
    break
# the enclosing for/try block sits outside this excerpt
except Exception as e:
    # e = sys.exc_info()[0]
    print('Error: %s' % str(e))
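A self-contained sketch of the hashing trick used in the loop above: each integer-coded categorical value is mapped to a column index modulo n_features, and the resulting one-hot rows are accumulated into one sparse matrix. The frame and sizes here are synthetic.

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

n_features = 16
df = pd.DataFrame({'site_id': [101, 7, 101], 'device': [3, 3, 42]})
rows = np.arange(df.shape[0])
X = csr_matrix((df.shape[0], n_features))
for col in df.columns:
    dat = np.ones(df.shape[0])
    cols = df[col].values % n_features
    X += csr_matrix((dat, (rows, cols)), shape=(df.shape[0], n_features))
print(X.toarray())  # collisions simply add up in the shared columns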
def main():
    # opfc = 0
    print "###############################At Process Train Data by File########################################"
    # get the train-data header from the mega file
    fhead = open('/home/robbie/Hacking/Kaggle/ClickThroughRatePrediction-Avazu/Data/Raw/train.csv')
    fhead.seek(0)
    headers = fhead.readline().rstrip().split(",")
    # if printflag == True: print headers

    # go over all the train files
    alltrainfiles = os.listdir(trainpath)
    # testpred = np.zeros(40428967)
    testpred = np.zeros(4577464)
    # testpred = np.zeros(89999)
    loopcounter = 1
    for atrainfile in alltrainfiles:
        trainY = []
        clickdata = []
        for line in open(trainpath + atrainfile, 'r'):
            random_line = line.rstrip().split(",")
            tdict = dict(zip(headers, random_line))
            tdict.pop('C14')
            tdict.pop('C17')
            tdict.pop('C20')
            trainY.append(int(tdict.pop('click')))  # int labels to match classes=[0, 1]
            tdict.pop('id')
            clickdata.append(tdict)
            # if printflag == True: print tdict
        print "######################At Dict Vectorizer for file:::", loopcounter, "#####################"
        # note: the vectorizer is refit for every file, so each file gets its own
        # feature space; the per-file models below are summed as an implicit ensemble
        vec = DictVectorizer()
        vcd = vec.fit_transform(clickdata).toarray()
        clickdata = []
        # print "###############################At Tree Classifer########################################"
        # clf = RandomForestClassifier(n_estimators=5, n_jobs=-1)
        # clf.fit(vcd, trainY)
        print "############################At SGD Classifier###################################"
        # a fresh classifier per file, so this partial_fit behaves like fit;
        # loss='log' is required for predict_proba below (the default hinge loss has none)
        clf = SGDClassifier(loss='log', penalty='l1', n_jobs=-1)
        clf.partial_fit(vcd, trainY, [0, 1])
        loopcounter = loopcounter + 1

        print "###############################Import Test Data########################################"
        testheaders = [headers[0]] + headers[2:]
        alltestfiles = os.listdir(testpath)
        temptestpred = np.array([])
        idlist = []
        innercounter = 1
        for atestfile in alltestfiles:
            testclickdata = []
            for line in open(testpath + atestfile, 'r'):
                random_line = line.rstrip().split(",")
                tdict = dict(zip(testheaders, random_line))
                tdict.pop('C14')
                tdict.pop('C17')
                tdict.pop('C20')
                idlist.append(tdict.pop('id'))
                testclickdata.append(tdict)
                # if printflag == True: print tdict
            print "------------------Dict Vectorize Test Data--------------------------------"
            tstvcd = vec.transform(testclickdata).toarray()
            print "-------------------Predict Test Data::", innercounter, "-----------------------------"
            temp = clf.predict_proba(tstvcd)
            temptestpred = np.concatenate((temptestpred, temp[:, 1]))
            innercounter = innercounter + 1
        testpred = testpred + temptestpred
        # if loopcounter == 300:
        #     break
    # testpred = testpred / len(alltrainfiles)
    # testpred = testpred / loopcounter
    fop = open(outputpath + 'submission.csv', 'w+')
    fop.writelines('id,click\n')
    for i in range(0, len(idlist)):
        fop.writelines(str(idlist[i]) + ',' + str(round(testpred[i], 4)) + '\n')
    fop.close()
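The per-file refit above discards both the vectorizer's feature space and the model between files. A hedged sketch of an alternative that keeps one model across all files: the stateless FeatureHasher gives every chunk the same feature space, so a single SGDClassifier can be partial_fit over all of them. The load_chunk helper is hypothetical.

from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import SGDClassifier

hasher = FeatureHasher(n_features=2**20, input_type='dict')
clf = SGDClassifier(loss='log', penalty='l1')
for atrainfile in alltrainfiles:  # assumed iterable of chunk file names
    clickdata, trainY = load_chunk(atrainfile)  # hypothetical: (list of dicts, list of int labels)
    X = hasher.transform(clickdata)
    clf.partial_fit(X, trainY, classes=[0, 1])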