from sklearn.linear_model import SGDClassifier as SKLModel


class SGDClassifierImpl:
    def __init__(self, loss='hinge', penalty='l2', alpha=0.0001,
                 l1_ratio=0.15, fit_intercept=True, max_iter=None, tol=None,
                 shuffle=True, verbose=0, epsilon=0.1, n_jobs=None,
                 random_state=None, learning_rate='optimal', eta0=0.0,
                 power_t=0.5, early_stopping=False, validation_fraction=0.1,
                 n_iter_no_change=5, class_weight='balanced',
                 warm_start=False, average=False):
        self._hyperparams = {
            'loss': loss,
            'penalty': penalty,
            'alpha': alpha,
            'l1_ratio': l1_ratio,
            'fit_intercept': fit_intercept,
            'max_iter': max_iter,
            'tol': tol,
            'shuffle': shuffle,
            'verbose': verbose,
            'epsilon': epsilon,
            'n_jobs': n_jobs,
            'random_state': random_state,
            'learning_rate': learning_rate,
            'eta0': eta0,
            'power_t': power_t,
            'early_stopping': early_stopping,
            'validation_fraction': validation_fraction,
            'n_iter_no_change': n_iter_no_change,
            'class_weight': class_weight,
            'warm_start': warm_start,
            'average': average}
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)

    def partial_fit(self, X, y=None, classes=None):
        # Lazily re-create the wrapped estimator if it is missing
        # (e.g. after unpickling a bare wrapper).
        if not hasattr(self, "_wrapped_model"):
            self._wrapped_model = SKLModel(**self._hyperparams)
        self._wrapped_model.partial_fit(X, y, classes=classes)
        return self
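# A minimal usage sketch for the wrapper above (not part of the original
# code): the toy data, the explicit class_weight=None (sklearn rejects
# class_weight='balanced' with partial_fit), and the max_iter/tol values are
# assumptions for recent scikit-learn versions.
import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(200, 10)
y = (X[:, 0] > 0).astype(int)

clf = SGDClassifierImpl(class_weight=None, max_iter=1000, tol=1e-3,
                        random_state=0)
for start in range(0, 200, 50):
    # `classes` must be given so the first partial_fit call sees all labels.
    clf.partial_fit(X[start:start + 50], y[start:start + 50],
                    classes=np.array([0, 1]))
print(clf.predict(X[:5]))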
import time

import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier


class PositiveClassClassifier(object):
    hvectorizer = HashingVectorizer(tokenizer=LemmaTokenizer(),
                                    n_features=2 ** 15,
                                    stop_words='english',
                                    lowercase=True,
                                    non_negative=True)
    all_classes = np.array([0, 1])

    def __init__(self, positive_class):
        # Create an online classifier, i.e. one supporting `partial_fit()`.
        self.classifier = SGDClassifier(loss='log')
        # Learn a binary classification of the positive class against all
        # other documents.
        self.positive_class = positive_class
        # Structure to track accuracy history.
        self.stats = {'n_train': 0, 'n_train_pos': 0, 'accuracy': 0.0,
                      'accuracy_history': [(0, 0)], 't0': time.time(),
                      'runtime_history': [(0, 0)]}

    def progress(self):
        """Report progress information, return a string."""
        duration = time.time() - self.stats['t0']
        s = "%(n_train)6d train docs (%(n_train_pos)6d positive) " % self.stats
        s += "accuracy: %(accuracy).6f " % self.stats
        s += "in %.2fs (%5d docs/s)" % (duration,
                                        self.stats['n_train'] / duration)
        return s

    def train(self):
        minibatch_iterator = iter_minibatchs(OVA_TRAIN_FILE, self.hvectorizer,
                                             self.positive_class)
        # Main loop: iterate over mini-batches of examples.
        for i, (x_train, y_train) in enumerate(minibatch_iterator):
            # Update the estimator with the examples of the current mini-batch.
            self.classifier.partial_fit(x_train, y_train,
                                        classes=self.all_classes)
            # Accumulate test accuracy stats.
            self.stats['n_train'] += x_train.shape[0]
            self.stats['n_train_pos'] += sum(y_train)
            self.stats['accuracy'] = self.score()
            self.stats['accuracy_history'].append((self.stats['accuracy'],
                                                   self.stats['n_train']))
            self.stats['runtime_history'].append(
                (self.stats['accuracy'], time.time() - self.stats['t0']))
            # if i % 10 == 0:
            #     print(self.progress())

    def score(self):
        TEST_BATCHES_NO = 20
        minibatch_iterator = iter_minibatchs(TEST_FILE, self.hvectorizer,
                                             self.positive_class)
        score = 0
        for i, (x_test, y_test) in enumerate(minibatch_iterator):
            y_test = np.asarray(y_test)
            score += self.classifier.score(x_test, y_test)
            if i >= TEST_BATCHES_NO - 1:
                break
        return score / TEST_BATCHES_NO
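# The helpers above (`iter_minibatchs`, `LemmaTokenizer`, OVA_TRAIN_FILE,
# TEST_FILE) are not shown in the snippet. A hypothetical sketch of what
# `iter_minibatchs` could look like, assuming a "label<TAB>text" line format
# and a batch size of 100; the real implementation may differ.
def iter_minibatchs(path, vectorizer, positive_class, batch_size=100):
    texts, labels = [], []
    with open(path) as f:
        for line in f:
            label, text = line.rstrip('\n').split('\t', 1)
            texts.append(text)
            # Binarize: 1 for the positive class, 0 for everything else.
            labels.append(1 if label == positive_class else 0)
            if len(texts) == batch_size:
                yield vectorizer.transform(texts), np.asarray(labels)
                texts, labels = [], []
    if texts:
        yield vectorizer.transform(texts), np.asarray(labels)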
iter_csv = pd.read_csv(info['path'], nrows=online_train_set_size,
                       chunksize=batchsize, skiprows=1, names=columns_names,
                       sep='\t')
for batch_no, batch in enumerate(iter_csv):
    X_batch, y_batch = preprocess(batch, label_encoder)
    X_batch_kernel_approx, y_batch_onehot = encode(
        X_batch, y_batch, one_hot_encoder, column_transformer, rbf_sampler)

    # Make one pass of stochastic gradient descent over the batch.
    sgd_classifier.partial_fit(X_batch_kernel_approx, y_batch, classes=[0, 1])

    # Print train/test accuracy metrics every 5 batches.
    if (batch_no % 5) == 0:
        message = "batch {:>4} ".format(batch_no)
        for origin, X, y_true_onehot in zip(
                ('train', 'val'),
                (X_batch_kernel_approx, X_test_kernel_approx),
                (y_batch_onehot, y_true_test_onehot)):
            y_pred = sgd_classifier.predict(X)
            # Preprocess the labels and predictions so they match what
            # average_precision_score expects.
            y_pred_onehot = one_hot_encoder.transform(y_pred.reshape(-1, 1))
            score = average_precision_score(y_true_onehot, y_pred_onehot)
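# The `rbf_sampler` passed to encode() above is not defined in the snippet.
# A minimal sketch of the kernel-approximation step it implies, using
# scikit-learn's RBFSampler (random Fourier features for an RBF kernel);
# the toy data, gamma and n_components are illustrative guesses.
import numpy as np
from sklearn.kernel_approximation import RBFSampler

X_toy = np.random.RandomState(0).rand(10, 4)
rbf_sampler = RBFSampler(gamma=0.5, n_components=100, random_state=42)
rbf_sampler.fit(X_toy)  # fit once, which fixes the random features
X_features = rbf_sampler.transform(X_toy)  # reuse transform() on every batch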
def progress(stats):
    duration = time.time() - stats['t0']
    s = "%(n_train)6d train docs (%(n_train_pos)6d positive) " % stats
    s += "%(n_test)6d test docs (%(n_test_pos)6d positive) " % stats
    s += "accuracy: %(accuracy).3f " % stats
    s += "in %.2fs (%5d docs/s)" % (duration, stats['n_train'] / duration)
    return s


minibatch_size = 100
minibatch_iterators = iter_minibatchs(data_streamer, minibatch_size)


def learn(classifier, stats, batch):
    # Tuple parameters in `def` are Python 2 only; unpack explicitly instead.
    X_train, y_train = batch
    if 't0' not in stats:
        stats['t0'] = time.time()
    classifier.partial_fit(X_train, y_train, classes=all_classes)
    stats['n_train'] += X_train.shape[0]
    stats['n_train_pos'] += sum(y_train)
    stats['accuracy'] = classifier.score(X_test, y_test)
    stats['accuracy_history'].append((stats['accuracy'], stats['n_train']))
    stats['runtime_history'].append((stats['accuracy'],
                                     time.time() - stats['t0']))
    return classifier, stats


import copy  # `copy` is a standard-library module, not part of sklearn.base


def merge(left, right):
    (cf1, stats1), (cf2, stats2) = left, right
    new = copy.deepcopy(cf1)
    new.coef_ += cf2.coef_
    new.intercept_ += cf2.intercept_
    return new, stats1


# Map/Reduce on Spark
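# The snippet stops at the "Map/Reduce on Spark" marker. A hypothetical
# sketch of the wiring it suggests, assuming an existing SparkContext `sc`
# and that each RDD element is one (X_train, y_train) mini-batch; the real
# pipeline may differ.
from sklearn.base import clone


def train_on_batch(batch):
    # Each task trains a fresh copy of the classifier on its own mini-batch.
    cls = clone(classifier)
    fresh_stats = {'n_train': 0, 'n_train_pos': 0, 'accuracy': 0.0,
                   'accuracy_history': [(0, 0)], 'runtime_history': [(0, 0)]}
    return learn(cls, fresh_stats, batch)


batches = sc.parallelize(list(minibatch_iterators))
# merge() sums coefficients pairwise; dividing by the number of batches
# afterwards would turn the summed model into an averaged one.
final_classifier, final_stats = batches.map(train_on_batch).reduce(merge)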
import numpy as np
import joblib
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import SGDClassifier


def chunker(seq, size):
    for pos in range(0, len(seq), size):
        yield seq[pos:pos + size]


categories = [
    'alt.atheism',
    'comp.graphics',
    'comp.sys.ibm.pc.hardware',
    'misc.forsale',
    'rec.autos',
    'sci.space',
    'talk.religion.misc',
]

dataset = fetch_20newsgroups(subset='train', categories=categories)
# list() so the zipped pairs can be sliced by chunker() under Python 3.
classif_data = list(zip(dataset.data, dataset.target))
classes = np.array(list(set(dataset.target)))

hasher = FeatureHasher()
classifier = SGDClassifier()

for i, chunk in enumerate(chunker(classif_data, 100)):
    messages, topics = zip(*chunk)
    X = hasher.transform(token_freqs(msg) for msg in messages)
    y = np.array(topics)
    classifier.partial_fit(X, y, classes=classes)
    if i % 100 == 0:
        # Dump the model to be able to monitor quality and later
        # analyse convergence externally.
        joblib.dump(classifier, 'model_%04d.pkl' % i)
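# `token_freqs` is used above but not defined in the snippet. A hypothetical
# implementation matching FeatureHasher's default input_type='dict': map each
# token of a message to its frequency.
import re
from collections import defaultdict


def token_freqs(doc):
    freq = defaultdict(int)
    for tok in re.findall(r'\w+', doc.lower()):
        freq[tok] += 1
    return freq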
def progress(stats):
    duration = time.time() - stats['t0']
    s = "%(n_train)6d train docs (%(n_train_pos)6d positive) " % stats
    s += "accuracy: %(accuracy).3f " % stats
    s += "in %.2fs (%5d docs/s)" % (duration, stats['n_train'] / duration)
    return s


# We will feed the classifier with mini-batches of 100 documents; this means
# we have at most 100 docs in memory at any time.
minibatch_size = 100

# Main loop: iterate over mini-batches of examples.
minibatch_iterators = iter_minibatches(data_stream, minibatch_size)
for i, (X_train, y_train) in enumerate(minibatch_iterators):
    # Update the estimator with the examples of the current mini-batch.
    classifier.partial_fit(X_train, y_train, classes=all_classes)

    # Accumulate test accuracy stats.
    stats['n_train'] += X_train.shape[0]
    stats['n_train_pos'] += sum(y_train)
    stats['accuracy'] = classifier.score(X_test, y_test)
    stats['accuracy_history'].append((stats['accuracy'], stats['n_train']))
    stats['runtime_history'].append(
        (stats['accuracy'], time.time() - stats['t0']))

    if i % 10 == 0:
        print(progress(stats))

###############################################################################
# Plot results
###############################################################################
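# The snippet ends at the "Plot results" header. A minimal plotting sketch of
# the accuracy history accumulated above, assuming matplotlib is available.
import matplotlib.pyplot as plt

accuracies, n_examples = zip(*stats['accuracy_history'])
plt.plot(n_examples, accuracies)
plt.xlabel('training examples (#)')
plt.ylabel('test accuracy')
plt.title('Classification accuracy as a function of #examples seen')
plt.show()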
# The constructor is truncated in the original; an SGDClassifier is assumed
# here, given the `classes` argument passed to partial_fit below.
estimator = SGDClassifier(average=False, n_iter=10)

trainloss = []
testloss = []
for i, chunk in enumerate(
        pd.read_csv("cancer2.csv", chunksize=chunksize, header=None,
                    iterator=True)):
    X = chunk.iloc[:, :-1]
    y = chunk.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)
    # Fit on the train split only; fitting on all of X would leak the
    # held-out rows into training.
    estimator.partial_fit(X_train, y_train, classes=np.unique(y))
    # These are mean squared errors, not R^2 scores.
    train_mse = mean_squared_error(y_train, estimator.predict(X_train))
    test_mse = mean_squared_error(y_test, estimator.predict(X_test))
    trainloss.append(train_mse)
    testloss.append(test_mse)
    print("trainloss:{:.4f},testloss:{:.4f} ".format(trainloss[-1],
                                                     testloss[-1]))
    if i > 3:
        break

# In[134]:

import matplotlib.pyplot as plt

plt.plot(trainloss)
plt.plot(testloss)
plt.legend(('train', 'test'))
import numpy as np
from ConfigSpace.conditions import EqualsCondition
from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (CategoricalHyperparameter,
                                         UniformFloatHyperparameter,
                                         UniformIntegerHyperparameter,
                                         UnParametrizedHyperparameter)

# AutoSklearnClassificationAlgorithm, softmax and the DENSE / SPARSE /
# UNSIGNED_DATA / PREDICTIONS constants come from auto-sklearn's pipeline
# package.


class SGD(AutoSklearnClassificationAlgorithm):
    def __init__(self, loss, penalty, alpha, fit_intercept, n_iter,
                 learning_rate, l1_ratio=0.15, epsilon=0.1, eta0=0.01,
                 power_t=0.5, average=False, random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None

    def fit(self, X, y, sample_weight=None):
        self.iterative_fit(X, y, n_iter=1, sample_weight=sample_weight,
                           refit=True)
        while not self.configuration_fully_fitted():
            self.iterative_fit(X, y, n_iter=1, sample_weight=sample_weight)
        return self

    def iterative_fit(self, X, y, n_iter=1, refit=False, sample_weight=None):
        from sklearn.linear_model.stochastic_gradient import SGDClassifier

        if refit:
            self.estimator = None

        if self.estimator is None:
            self.alpha = float(self.alpha)
            self.fit_intercept = self.fit_intercept == 'True'
            self.n_iter = int(self.n_iter)
            self.l1_ratio = float(
                self.l1_ratio) if self.l1_ratio is not None else 0.15
            self.epsilon = float(
                self.epsilon) if self.epsilon is not None else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(
                self.power_t) if self.power_t is not None else 0.25
            self.average = self.average == 'True'
            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           n_iter=n_iter,
                                           learning_rate=self.learning_rate,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state)
        else:
            self.estimator.n_iter += n_iter

        self.estimator.partial_fit(X, y, classes=np.unique(y),
                                   sample_weight=sample_weight)

        if self.estimator.n_iter >= self.n_iter:
            self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'SGD Classifier',
                'name': 'Stochastic Gradient Descent Classifier',
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': False,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()

        loss = CategoricalHyperparameter(
            "loss",
            ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
            default="log")
        penalty = CategoricalHyperparameter(
            "penalty", ["l1", "l2", "elasticnet"], default="l2")
        alpha = UniformFloatHyperparameter(
            "alpha", 10e-7, 1e-1, log=True, default=0.0001)
        l1_ratio = UniformFloatHyperparameter(
            "l1_ratio", 1e-9, 1, log=True, default=0.15)
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        n_iter = UniformIntegerHyperparameter(
            "n_iter", 5, 1000, log=True, default=20)
        epsilon = UniformFloatHyperparameter(
            "epsilon", 1e-5, 1e-1, default=1e-4, log=True)
        learning_rate = CategoricalHyperparameter(
            "learning_rate", ["optimal", "invscaling", "constant"],
            default="optimal")
        eta0 = UniformFloatHyperparameter("eta0", 10**-7, 0.1, default=0.01)
        power_t = UniformFloatHyperparameter("power_t", 1e-5, 1, default=0.25)
        average = CategoricalHyperparameter("average", ["False", "True"],
                                            default="False")

        cs.add_hyperparameters([loss, penalty, alpha, l1_ratio, fit_intercept,
                                n_iter, epsilon, learning_rate, eta0, power_t,
                                average])

        # TODO: add passive/aggressive here, although it is not properly
        # documented?
        elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
        epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")
        # eta0 seems to be always active according to the source code; when
        # learning_rate is set to optimal, eta0 is the starting value:
        # https://github.com/scikit-learn/scikit-learn/blob/0.15.X/sklearn/linear_model/sgd_fast.pyx
        # eta0_and_inv = EqualsCondition(eta0, learning_rate, "invscaling")
        # eta0_and_constant = EqualsCondition(eta0, learning_rate, "constant")
        # eta0_condition = OrConjunction(eta0_and_inv, eta0_and_constant)
        power_t_condition = EqualsCondition(power_t, learning_rate,
                                            "invscaling")

        cs.add_conditions([elasticnet, epsilon_condition, power_t_condition])

        return cs
def run(keyn, nPart):
    all_classes = np.array([0, 1])
    allKeys = [l.split()[0] for l in open('keywordsAll.txt').readlines()]
    keyFreqs = [float(l.split()[1]) / 4205907
                for l in open('keywordsAll.txt').readlines()]
    key = allKeys[keyn]
    freq = keyFreqs[keyn]

    opt = 'body+title+code'
    bv = 'True'
    nneg = 'True'
    nv = 'None'

    # testopt = 'c'
    # testopt = 'w'
    # testopt = 'l2'
    testopt = 'l1'
    if testopt == 'c':
        cls = SGDClassifier(loss='hinge', learning_rate="constant",
                            alpha=1e-6, eta0=1e-2, penalty='l2')
    elif testopt == 'w':
        cls = SGDClassifier(class_weight={1: 1.0 / freq / 8.0, 0: 1})
    elif testopt == 'l2':
        cls = SGDClassifier(loss='log', alpha=1e-5, penalty='l2')
    elif testopt == 'l1':
        cls = SGDClassifier(loss='log', alpha=1e-5, penalty='l1')

    outputName = ('key_' + str(keyn) + '_SGDtune_' + opt + '_partialfit_' +
                  testopt + '.txt')
    pklName = 'SGD_key_' + str(keyn) + '_' + testopt + '.pkl'
    n0, ntrain = resumeJob(outputName, pklName)

    body_test, y_test = getTestSet(10, key, opt, testSize=0.2, seed=123)
    tot_pos = sum(y_test)
    vectorizer = HashingVectorizer(decode_error='ignore',
                                   n_features=2 ** 20,
                                   token_pattern=r"\b\w[\w#+.-]*(?<!\.$)",
                                   binary=str2bool(bv),
                                   norm=normOpt(nv),
                                   non_negative=str2bool(nneg))
    X_test = vectorizer.transform(body_test)
    # print('test case:', len(y_test), 'positive', tot_pos, 'key:', key,
    #       'X norm:', X_test.sum(), 'binary:', bv, 'norm:', nv, 'nneg:', nneg)

    if n0 >= 2:
        cls = joblib.load(pklName)

    for n in range(n0, 10):
        outfile = open(outputName, 'a')
        data = json.load(gzip.open('Train.rdup.' + str(n) + '.json.gz'))
        # Integer division so the mini-batch size stays an int on Python 3.
        minibatch_size = len(data) // nPart + 1
        for i in range(nPart):
            n1 = i * minibatch_size
            n2 = (i + 1) * minibatch_size
            if i == nPart - 1:
                n2 = len(data)
            ntrain += (n2 - n1)
            body_train, y_train = getMiniBatch(data, n1, n2, key, opt)
            X_train = vectorizer.transform(body_train)
            # Make 5 shuffled passes of SGD over the mini-batch.
            for n_iter in range(5):
                X_train, y_train = shuffle(X_train, y_train)
                cls.partial_fit(X_train, y_train, classes=all_classes)
            y_pred = cls.predict(X_test)
            f1 = metrics.f1_score(y_test, y_pred)
            p = metrics.precision_score(y_test, y_pred)
            r = metrics.recall_score(y_test, y_pred)
            accu = cls.score(X_train, y_train)
            y_pred = cls.predict(X_train)
            f1t = metrics.f1_score(y_train, y_pred)
            outfile.write("%3d %8d %.4f %.3f %.3f %.3f %.3f %5d %5d\n" %
                          (n, ntrain, accu, f1t, f1, p, r, sum(y_pred),
                           tot_pos))
        _ = joblib.dump(cls, pklName, compress=9)
        outfile.close()
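# `str2bool` and `normOpt` are used above but not defined in the snippet.
# Hypothetical one-liners consistent with how they are called (the
# 'True'/'False' and 'None'/'l1'/'l2' strings look like command-line style
# option values):
def str2bool(s):
    return s == 'True'


def normOpt(s):
    return None if s == 'None' else s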