Example #1
# Assumed import: the wrapped model is scikit-learn's SGDClassifier.
from sklearn.linear_model import SGDClassifier as SKLModel


class SGDClassifierImpl:

    def __init__(self, loss='hinge', penalty='l2', alpha=0.0001,
                 l1_ratio=0.15, fit_intercept=True, max_iter=None, tol=None,
                 shuffle=True, verbose=0, epsilon=0.1, n_jobs=None,
                 random_state=None, learning_rate='optimal', eta0=0.0,
                 power_t=0.5, early_stopping=False, validation_fraction=0.1,
                 n_iter_no_change=5, class_weight='balanced',
                 warm_start=False, average=False):
        self._hyperparams = {
            'loss': loss,
            'penalty': penalty,
            'alpha': alpha,
            'l1_ratio': l1_ratio,
            'fit_intercept': fit_intercept,
            'max_iter': max_iter,
            'tol': tol,
            'shuffle': shuffle,
            'verbose': verbose,
            'epsilon': epsilon,
            'n_jobs': n_jobs,
            'random_state': random_state,
            'learning_rate': learning_rate,
            'eta0': eta0,
            'power_t': power_t,
            'early_stopping': early_stopping,
            'validation_fraction': validation_fraction,
            'n_iter_no_change': n_iter_no_change,
            'class_weight': class_weight,
            'warm_start': warm_start,
            'average': average}
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)

    def partial_fit(self, X, y=None, classes=None):
        if not hasattr(self, "_wrapped_model"):
            self._wrapped_model = SKLModel(**self._hyperparams)
        self._wrapped_model.partial_fit(X, y, classes=classes)
        return self
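
# --- Usage sketch (not from the original source): the wrapper lazily
# re-creates the model inside partial_fit, so it can be driven batch by
# batch. class_weight=None overrides the wrapper's default, because
# scikit-learn rejects class_weight='balanced' in partial_fit.
import numpy as np

rng = np.random.RandomState(0)
clf = SGDClassifierImpl(class_weight=None, random_state=0)
for _ in range(10):
    X_batch = rng.randn(32, 5)
    y_batch = rng.randint(0, 2, size=32)
    clf.partial_fit(X_batch, y_batch, classes=np.array([0, 1]))

print(clf.predict(rng.randn(3, 5)))
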
Example #2

class PositiveClassClassifier(object):
    # `non_negative=True` was removed in scikit-learn 0.21; newer
    # versions spell this `alternate_sign=False`.
    hvectorizer = HashingVectorizer(tokenizer=LemmaTokenizer(),
                                    n_features=2 ** 15,
                                    stop_words='english',
                                    lowercase=True,
                                    non_negative=True)
 
    all_classes = np.array([0, 1])
    
    def __init__(self, positive_class):
        # Create an online classifier, i.e. one supporting `partial_fit()`.
        # (`loss='log'` became `loss='log_loss'` in scikit-learn 1.1.)
        self.classifier = SGDClassifier(loss='log')

        # Learn a binary classifier separating the positive class
        # from all other documents.
        self.positive_class = positive_class

        # structure to track accuracy history
        self.stats = {'n_train': 0, 'n_train_pos': 0, 'accuracy': 0.0, 
            'accuracy_history': [(0, 0)], 't0': time.time(), 
            'runtime_history': [(0, 0)]}

    def progress(self):
        """Report progress information, return a string."""
        duration = time.time() - self.stats['t0']
        s = "%(n_train)6d train docs (%(n_train_pos)6d positive) " % self.stats
        s += "accuracy: %(accuracy).6f " % self.stats
        s += "in %.2fs (%5d docs/s)" % (duration, self.stats['n_train'] / duration)
        return s

    def train(self):
        minibatch_iterator = iter_minibatchs(OVA_TRAIN_FILE, self.hvectorizer, self.positive_class)
 
        # Main loop: iterate over mini-batches of examples
        for i, (x_train, y_train) in enumerate(minibatch_iterator):
            # update estimator with examples in the current mini-batch
            self.classifier.partial_fit(x_train, y_train, classes=self.all_classes)

            # accumulate test accuracy stats
            self.stats['n_train'] += x_train.shape[0]
            self.stats['n_train_pos'] += sum(y_train)
            self.stats['accuracy'] = self.score()
            self.stats['accuracy_history'].append((self.stats['accuracy'], 
                                                   self.stats['n_train']))
            self.stats['runtime_history'].append((self.stats['accuracy'],
                                                  time.time() - self.stats['t0']))
            #if i % 10 == 0:
            #    print(self.progress())

    def score(self): 
        TEST_BATCHES_NO = 20
        minibatch_iterator = iter_minibatchs(TEST_FILE, self.hvectorizer, self.positive_class)
        score = 0
        
        for i, (x_test, y_test) in enumerate(minibatch_iterator):
            y_test = np.asarray(y_test)
            score += self.classifier.score(x_test, y_test)

            if i >= TEST_BATCHES_NO - 1:
                break

        return score / TEST_BATCHES_NO
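
# --- Sketch (not from the original source): PositiveClassClassifier
# binds one positive class per instance, so a full one-vs-all setup
# trains one instance per label. The label set below is an assumption.
positive_classes = range(10)  # assumed labels appearing in OVA_TRAIN_FILE

ova_classifiers = []
for label in positive_classes:
    pcc = PositiveClassClassifier(positive_class=label)
    pcc.train()
    print(pcc.progress())
    ova_classifiers.append(pcc)
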
Example #3

iter_csv = pd.read_csv(info['path'],
                       nrows=online_train_set_size,
                       chunksize=batchsize,
                       skiprows=1,
                       names=columns_names,
                       sep='\t')

for batch_no, batch in enumerate(iter_csv):
    X_batch, y_batch = preprocess(batch, label_encoder)
    X_batch_kernel_approx, y_batch_onehot = encode(X_batch, y_batch,
                                                   one_hot_encoder,
                                                   column_transformer,
                                                   rbf_sampler)

    # make one pass of stochastic gradient descent over the batch.
    sgd_classifier.partial_fit(X_batch_kernel_approx, y_batch, classes=[0, 1])

    # print train/test accuracy metrics every 5 batches
    if batch_no % 5 == 0:
        message = "batch {:>4} ".format(batch_no)
        for origin, X, y_true_onehot in zip(
            ('train', 'val'), (X_batch_kernel_approx, X_test_kernel_approx),
            (y_batch_onehot, y_true_test_onehot)):

            y_pred = sgd_classifier.predict(X)

            # preprocess labels and predictions to match
            # average_precision_score's expectations
            y_pred_onehot = one_hot_encoder.transform(y_pred.reshape(-1, 1))

            score = average_precision_score(y_true_onehot, y_pred_onehot)
            message += "{} precision: {:.4f}  ".format(origin, score)

        print(message)
Example #4

def progress(stats):
    """Report progress information, return a string."""
    duration = time.time() - stats['t0']
    s = "%(n_train)6d train docs (%(n_train_pos)6d positive) " % stats
    s += "%(n_test)6d test docs (%(n_test_pos)6d positive) " % stats
    s += "accuracy: %(accuracy).3f " % stats
    s += "in %.2fs (%5d docs/s)" % (duration, stats['n_train'] / duration)
    return s


minibatch_size = 100

minibatch_iterators = iter_minibatchs(data_streamer, minibatch_size)
def learn(classifier, stats, batch):
    # `batch` is an (X_train, y_train) pair; X_test, y_test and
    # all_classes are assumed to exist in the surrounding scope.
    X_train, y_train = batch
    if 't0' not in stats:
        stats['t0'] = time.time()

    classifier.partial_fit(X_train, y_train, classes=all_classes)
    stats['n_train'] += X_train.shape[0]
    stats['n_train_pos'] += sum(y_train)
    stats['accuracy'] = classifier.score(X_test, y_test)
    stats['accuracy_history'].append((stats['accuracy'], stats['n_train']))
    stats['runtime_history'].append((stats['accuracy'], time.time() - stats['t0']))
    return classifier, stats

import copy


def merge(a, b):
    # Combine two partially fitted classifiers by summing their weights;
    # the first argument's stats are kept.
    (cf1, stats1), (cf2, stats2) = a, b
    new = copy.deepcopy(cf1)
    new.coef_ += cf2.coef_
    new.intercept_ += cf2.intercept_
    return new, stats1

# Map/Reduce on Spark
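# A sketch of the driver that would tie learn() and merge() together;
# `sc` (a live SparkContext), `minibatches` (a list of (X, y) pairs) and
# the factories make_classifier()/fresh_stats() are assumptions, not
# part of the original snippet.
def train_partition(batches):
    classifier, stats = make_classifier(), fresh_stats()
    for batch in batches:
        classifier, stats = learn(classifier, stats, batch)
    yield classifier, stats

model, stats = (sc.parallelize(minibatches, numSlices=8)
                  .mapPartitions(train_partition)
                  .reduce(merge))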
Example #5
File: snippet.py  Project: szabo92/gistable
# Assumed imports for this snippet:
import numpy as np
import joblib
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import SGDClassifier


def chunker(seq, size):
    """Yield successive `size`-sized slices of `seq`."""
    for pos in range(0, len(seq), size):
        yield seq[pos:pos + size]


categories = [
    'alt.atheism',
    'comp.graphics',
    'comp.sys.ibm.pc.hardware',
    'misc.forsale',
    'rec.autos',
    'sci.space',
    'talk.religion.misc',
]

dataset = fetch_20newsgroups(subset='train', categories=categories)
classif_data = list(zip(dataset.data, dataset.target))
classes = np.unique(dataset.target)

hasher = FeatureHasher()
classifier = SGDClassifier()

for i, chunk in enumerate(chunker(classif_data, 100)):
    messages, topics = zip(*chunk)
    # `token_freqs` (not defined in the snippet) is assumed to map a
    # message to a {token: count} dict, which FeatureHasher accepts.
    X = hasher.transform(token_freqs(msg) for msg in messages)
    y = np.array(topics)
    classifier.partial_fit(X, y, classes=classes)
    if i % 100 == 0:
        # dump model to be able to monitor quality and later
        # analyse convergence externally
        joblib.dump(classifier, 'model_%04d.pkl' % i)
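
# --- Sketch (not from the original source): the periodic dumps leave a
# trail of checkpoints that can be scored offline to watch convergence.
# X_val/y_val are assumed to be held-out data vectorized with the same
# hasher.
import glob

for path in sorted(glob.glob('model_*.pkl')):
    model = joblib.load(path)
    print(path, model.score(X_val, y_val))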
Example #6
def progress(stats):
    """Report progress information, return a string."""
    duration = time.time() - stats['t0']
    s = "%(n_train)6d train docs (%(n_train_pos)6d positive) " % stats
    s += "accuracy: %(accuracy).3f " % stats
    s += "in %.2fs (%5d docs/s)" % (duration, stats['n_train'] / duration)
    return s


# We will feed the classifier with mini-batches of 100 documents; this means
# we have at most 100 docs in memory at any time.
minibatch_size = 100

# Main loop: iterate over mini-batches of examples
minibatch_iterators = iter_minibatches(data_stream, minibatch_size)
for i, (X_train, y_train) in enumerate(minibatch_iterators):
    # update estimator with examples in the current mini-batch
    classifier.partial_fit(X_train, y_train, classes=all_classes)
    # accumulate test accuracy stats
    stats['n_train'] += X_train.shape[0]
    stats['n_train_pos'] += sum(y_train)
    stats['accuracy'] = classifier.score(X_test, y_test)
    stats['accuracy_history'].append((stats['accuracy'], stats['n_train']))
    stats['runtime_history'].append(
        (stats['accuracy'], time.time() - stats['t0']))
    if i % 10 == 0:
        print(progress(stats))

###############################################################################
# Plot results
###############################################################################
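
# Minimal plotting sketch (not from the original source): the `stats`
# dict accumulated above already holds the accuracy curve.
import matplotlib.pyplot as plt

accuracies, n_examples = zip(*stats['accuracy_history'])
plt.plot(n_examples, accuracies)
plt.xlabel('training examples')
plt.ylabel('test accuracy')
plt.show()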

Example #7
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

chunksize = 100  # assumed; the snippet does not define it

# The snippet begins mid-call; the estimator is assumed to be an
# SGDClassifier (`n_iter` exists only in scikit-learn < 0.21; newer
# versions use `max_iter`):
estimator = SGDClassifier(average=False,
                          n_iter=10)
trainloss = []
testloss = []
for i, chunk in enumerate(
        pd.read_csv("cancer2.csv",
                    chunksize=chunksize,
                    header=None,
                    iterator=True)):
    X = chunk.iloc[:, :-1]
    y = chunk.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=42)
    # Note: partial_fit trains on the whole chunk, so the "test" split
    # below has already been seen during training; the split only
    # separates the metrics.
    estimator.partial_fit(X, y, classes=np.unique(y))
    train_mse = mean_squared_error(y_train, estimator.predict(X_train))
    test_mse = mean_squared_error(y_test, estimator.predict(X_test))
    trainloss.append(train_mse)
    testloss.append(test_mse)
    print("trainloss:{:.4f}, testloss:{:.4f}".format(trainloss[-1],
                                                     testloss[-1]))
    if i > 3:
        break


import matplotlib.pyplot as plt

plt.plot(trainloss)
plt.plot(testloss)
plt.legend(('train', 'test'))
plt.xlabel('chunk')
plt.ylabel('MSE')
plt.show()
Example #8
class SGD(AutoSklearnClassificationAlgorithm):
    def __init__(self,
                 loss,
                 penalty,
                 alpha,
                 fit_intercept,
                 n_iter,
                 learning_rate,
                 l1_ratio=0.15,
                 epsilon=0.1,
                 eta0=0.01,
                 power_t=0.5,
                 average=False,
                 random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None

    def fit(self, X, y, sample_weight=None):
        self.iterative_fit(X,
                           y,
                           n_iter=1,
                           sample_weight=sample_weight,
                           refit=True)
        while not self.configuration_fully_fitted():
            self.iterative_fit(X, y, n_iter=1, sample_weight=sample_weight)

        return self

    def iterative_fit(self, X, y, n_iter=1, refit=False, sample_weight=None):
        # Public import path; note `n_iter` below exists only in
        # scikit-learn < 0.21 (newer versions renamed it `max_iter`).
        from sklearn.linear_model import SGDClassifier

        if refit:
            self.estimator = None

        if self.estimator is None:

            self.alpha = float(self.alpha)
            self.fit_intercept = self.fit_intercept == 'True'
            self.n_iter = int(self.n_iter)
            self.l1_ratio = float(
                self.l1_ratio) if self.l1_ratio is not None else 0.15
            self.epsilon = float(
                self.epsilon) if self.epsilon is not None else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(
                self.power_t) if self.power_t is not None else 0.25
            self.average = self.average == 'True'

            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           n_iter=n_iter,
                                           learning_rate=self.learning_rate,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state)
        else:
            self.estimator.n_iter += n_iter

        self.estimator.partial_fit(X,
                                   y,
                                   classes=np.unique(y),
                                   sample_weight=sample_weight)

        if self.estimator.n_iter >= self.n_iter:
            self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            'shortname': 'SGD Classifier',
            'name': 'Stochastic Gradient Descent Classifier',
            'handles_regression': False,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': False,
            'is_deterministic': True,
            'input': (DENSE, SPARSE, UNSIGNED_DATA),
            'output': (PREDICTIONS, )
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()

        loss = CategoricalHyperparameter(
            "loss",
            ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
            default="log")
        penalty = CategoricalHyperparameter("penalty",
                                            ["l1", "l2", "elasticnet"],
                                            default="l2")
        alpha = UniformFloatHyperparameter("alpha",
                                           10e-7,
                                           1e-1,
                                           log=True,
                                           default=0.0001)
        l1_ratio = UniformFloatHyperparameter("l1_ratio",
                                              1e-9,
                                              1,
                                              log=True,
                                              default=0.15)
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        n_iter = UniformIntegerHyperparameter("n_iter",
                                              5,
                                              1000,
                                              log=True,
                                              default=20)
        epsilon = UniformFloatHyperparameter("epsilon",
                                             1e-5,
                                             1e-1,
                                             default=1e-4,
                                             log=True)
        learning_rate = CategoricalHyperparameter(
            "learning_rate", ["optimal", "invscaling", "constant"],
            default="optimal")
        eta0 = UniformFloatHyperparameter("eta0", 10**-7, 0.1, default=0.01)
        power_t = UniformFloatHyperparameter("power_t", 1e-5, 1, default=0.25)
        average = CategoricalHyperparameter("average", ["False", "True"],
                                            default="False")
        cs.add_hyperparameters([
            loss, penalty, alpha, l1_ratio, fit_intercept, n_iter, epsilon,
            learning_rate, eta0, power_t, average
        ])

        # TODO add passive/aggressive here, although not properly documented?
        elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
        epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")
        # eta0 seems to be always active according to the source code; when
        # learning_rate is set to optimal, eta0 is the starting value:
        # https://github.com/scikit-learn/scikit-learn/blob/0.15.X/sklearn/linear_model/sgd_fast.pyx
        #eta0_and_inv = EqualsCondition(eta0, learning_rate, "invscaling")
        #eta0_and_constant = EqualsCondition(eta0, learning_rate, "constant")
        #eta0_condition = OrConjunction(eta0_and_inv, eta0_and_constant)
        power_t_condition = EqualsCondition(power_t, learning_rate,
                                            "invscaling")

        cs.add_conditions([elasticnet, epsilon_condition, power_t_condition])

        return cs
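
# --- Sketch (not from the original source): round-trip the search space
# back into the constructor as a sanity check. Assumes the ConfigSpace
# package and training arrays X, y.
cs = SGD.get_hyperparameter_search_space()
config = cs.sample_configuration()

# Inactive conditional parameters (e.g. l1_ratio when penalty is not
# "elasticnet") are absent from the sample and fall back to the
# constructor defaults.
model = SGD(random_state=0, **config.get_dictionary())
model.fit(X, y)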
Example #9
def run(keyn, nPart):
    all_classes = np.array([0, 1])
    lines = [l.split() for l in open('keywordsAll.txt')]
    allKeys = [l[0] for l in lines]
    keyFreqs = [float(l[1]) / 4205907 for l in lines]
    key = allKeys[keyn]
    freq = keyFreqs[keyn]

    opt = 'body+title+code'
    bv = 'True'
    nneg = 'True'
    nv = 'None'
    #testopt = 'c'
    #testopt = 'w'
    #testopt = 'l2'
    testopt = 'l1'

    if testopt == 'c':
        cls = SGDClassifier(loss='hinge',
                            learning_rate="constant",
                            alpha=1e-6,
                            eta0=1e-2,
                            penalty='l2')
    elif testopt == 'w':
        cls = SGDClassifier(class_weight={1: 1.0 / freq / 8.0, 0: 1})
    elif testopt == 'l2':
        cls = SGDClassifier(loss='log', alpha=1e-5, penalty='l2')
    elif testopt == 'l1':
        cls = SGDClassifier(loss='log', alpha=1e-5, penalty='l1')

    outputName = 'key_' + str(
        keyn) + '_SGDtune_' + opt + '_partialfit_' + testopt + '.txt'
    pklName = 'SGD_key_' + str(keyn) + '_' + testopt + '.pkl'
    n0, ntrain = resumeJob(outputName, pklName)

    body_test, y_test = getTestSet(10, key, opt, testSize=0.2, seed=123)
    tot_pos = sum(y_test)
    # `non_negative` was removed in scikit-learn 0.21; newer versions
    # spell this `alternate_sign=False`.
    vectorizer = HashingVectorizer(decode_error='ignore',
                                   n_features=2**20,
                                   token_pattern=r"\b\w[\w#+.-]*(?<!\.$)",
                                   binary=str2bool(bv),
                                   norm=normOpt(nv),
                                   non_negative=str2bool(nneg))

    X_test = vectorizer.transform(body_test)
    #print 'test case:', len(y_test), 'positive', tot_pos, 'key:', key, 'X norm:', X_test.sum(), 'binary:', bv, 'norm:', nv, 'nneg:', nneg
    if n0 >= 2:
        cls = joblib.load(pklName)
    for n in range(n0, 10):
        outfile = open(outputName, 'a')
        data = json.load(gzip.open('Train.rdup.' + str(n) + '.json.gz'))
        minibatch_size = len(data) // nPart + 1
        for i in range(nPart):
            n1 = i * minibatch_size
            n2 = (i + 1) * minibatch_size
            if i == nPart - 1:
                n2 = len(data)
            ntrain += (n2 - n1)
            body_train, y_train = getMiniBatch(data, n1, n2, key, opt)
            X_train = vectorizer.transform(body_train)
            # make several SGD passes over this mini-batch, reshuffling
            # before each pass
            for n_iter in range(5):
                X_train, y_train = shuffle(X_train, y_train)
                cls.partial_fit(X_train, y_train, classes=all_classes)
            y_pred = cls.predict(X_test)
            f1 = metrics.f1_score(y_test, y_pred)
            p = metrics.precision_score(y_test, y_pred)
            r = metrics.recall_score(y_test, y_pred)
            accu = cls.score(X_train, y_train)
            y_pred = cls.predict(X_train)
            f1t = metrics.f1_score(y_train, y_pred)
            outfile.write(
                "%3d %8d %.4f %.3f %.3f %.3f %.3f %5d  %5d\n" %
                (n, ntrain, accu, f1t, f1, p, r, sum(y_pred), tot_pos))
        _ = joblib.dump(cls, pklName, compress=9)
        outfile.close()
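
# --- Assumed helpers (the snippet never defines them); minimal versions
# inferred from how they are called above (bv = 'True', nv = 'None').
def str2bool(s):
    return s == 'True'


def normOpt(s):
    return None if s == 'None' else s  # 'l1' / 'l2' pass through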