Example No. 1
def apply_minibatch_sgd(datasets, minibatch, epoch=5, cores=1, seed=1):
    ''' Applies mini-batch SGD to a logistic-regression model

    :type datasets: list
    :param datasets: List containing the training/testing data

    :type minibatch: int
    :param minibatch: Mini-batch size

    :type epoch: int
    :param epoch: Number of training epochs

    :type cores: int
    :param cores: Number of cores
    
    :type seed: int
    :param seed: Random seed
    '''
    print('Applying mini-batch SGD with mini-batch size of', minibatch)
    training_X, training_y = datasets[0]
    testing_X, testing_y = datasets[1]
    print('Shuffling training data')
    training_X, training_y = shuffle(training_X, training_y, random_state=seed)
    clf = SGDClassifier(loss="log", random_state=seed, n_iter=epoch, verbose=0, n_jobs=cores)
    classes = numpy.array([-1, 1])
    samples = training_X.shape[0]
    minibatches = (samples + minibatch - 1) // minibatch  # ceiling division: the last batch may be smaller
    for i in range(epoch):
        print("Epoch", i + 1)
        for j in range(minibatches):
            start = j * minibatch
            end = min(samples, (j + 1) * minibatch)
            clf.partial_fit(training_X[start:end], training_y[start:end], classes=classes)
        print("Accuracy on testing data:", clf.score(testing_X, testing_y))
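# A minimal usage sketch for apply_minibatch_sgd above (the data setup is an assumption,
# not part of the original example, and it keeps the older scikit-learn API used above,
# where SGDClassifier still accepts n_iter and loss="log"). datasets is a
# [(train_X, train_y), (test_X, test_y)] pair with labels in {-1, 1}, matching the classes
# hard-coded inside the function.
import numpy
from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier
from sklearn.utils import shuffle

X, y = make_classification(n_samples=2000, n_features=20, random_state=1)
y = numpy.where(y == 0, -1, 1)  # map the {0, 1} labels to {-1, 1}
datasets = [(X[:1500], y[:1500]), (X[1500:], y[1500:])]
apply_minibatch_sgd(datasets, minibatch=128, epoch=5)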
def run_online_classifier():
    vect = HashingVectorizer(
        decode_error='ignore',
        n_features=2**21,
        preprocessor=None,
        tokenizer=tokenizer_streaming,
    )
    clf = SGDClassifier(loss='log', random_state=1, n_iter=1)

    csv_filename = os.path.join('datasets', 'movie_data.csv')
    doc_stream = stream_docs(path=csv_filename)

    classes = np.array([0, 1])
    for _ in range(45):
        X_train, y_train = get_minibatch(doc_stream, size=1000)
        if X_train is None:
            break
        else:
            X_train = vect.transform(X_train)
            clf.partial_fit(X_train, y_train, classes=classes)

    X_test, y_test = get_minibatch(doc_stream, size=5000)
    X_test = vect.transform(X_test)
    print("Test accuracy: %.3f" % clf.score(X_test, y_test))

    clf = clf.partial_fit(X_test, y_test)
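# Several of the out-of-core examples in this collection call stream_docs(), get_minibatch()
# and a tokenizer that are defined elsewhere in their source files. A minimal sketch of the
# conventional implementations they appear to assume (a CSV of reviews whose rows end in
# ',label'); treat the exact parsing as an assumption, not the original code:
def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv_file:
        next(csv_file)  # skip the header line
        for line in csv_file:
            text, label = line[:-3], int(line[-2])
            yield text, label

def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y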
Example No. 3
    def train(self):
        fp_lr_model = "./data/lr_model"

        lr_model = SGDClassifier(
            loss='log'
        )  # using log-loss for lr  early_stopping=False with partial_fit
        self.scores = []

        for i, chunk in enumerate(self.df_train):
            print('starting {} chunk...'.format(i + 1))
            df_train = self.oh_enc.transform(chunk)  # convert to one-hot encoding
            # other_feat = ['device_model', 'device_ip', 'device_id', 'app_domain', 'hour']  # object features not covered by the one-hot encoding must also be dropped
            # ['id', 'click'].extend(other_feat)
            feat_train = df_train.columns.drop([
                'id', 'click', 'device_model', 'device_ip', 'device_id',
                'app_domain', 'hour', 'C17', 'C18', 'C19', 'C20', 'C21'
            ])
            train_x = df_train[feat_train]
            train_y = df_train['click'].astype('int')
            lr_model.partial_fit(train_x, train_y, classes=[0, 1])

            y_pred = lr_model.predict_proba(train_x)[:, 1]
            score = log_loss(train_y, y_pred)
            self.scores.append(score)
        print('saving model...')
        pickle.dump(lr_model, open(fp_lr_model, 'wb'))
def train():
    vect = HashingVectorizer(decode_error='ignore',
                             n_features=2**21,
                             preprocessor=None,
                             ngram_range=(1, 3),
                             tokenizer=tokenizer)
    clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
    stream_path = os.path.join(work_path, 'movie_data.csv')
    doc_stream = stream_docs(path=stream_path)

    pbar = pyprind.ProgBar(45)
    classes = np.array([0, 1])
    for _ in range(45):
        X_train, y_train = get_minibatch(doc_stream, size=1000)
        if not X_train:
            break
        X_train = vect.transform(X_train)
        clf.partial_fit(X_train, y_train, classes=classes)
        pbar.update()

    X_test, y_test = get_minibatch(doc_stream, size=5000)
    X_test = vect.transform(X_test)
    print('Accuracy: %.3f' % clf.score(X_test, y_test))

    clf = clf.partial_fit(X_test, y_test)

    return clf
Example No. 5
def dump_classifier(clf_path):
    vect = HashingVectorizer(decode_error='ignore',
                             n_features=2**21,
                             preprocessor=None,
                             tokenizer=tokenizer)

    if Version(sklearn_version) < '0.18':
        clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
    else:
        clf = SGDClassifier(loss='log', random_state=1, max_iter=1)

    cur_dir = os.path.dirname(__file__)
    doc_stream = stream_docs(path=os.path.join(cur_dir, 'movie_data.csv'))

    #   pbar = pyprind.ProgBar(45)

    classes = np.array([0, 1])
    for _ in range(45):
        X_train, y_train = get_minibatch(doc_stream, size=1000)
        if not X_train:
            break
        X_train = vect.transform(X_train)
        clf.partial_fit(X_train, y_train, classes=classes)
#        pbar.update()

    X_test, y_test = get_minibatch(doc_stream, size=5000)
    X_test = vect.transform(X_test)
    print('Accuracy: %.3f' % clf.score(X_test, y_test))

    clf = clf.partial_fit(X_test, y_test)

    pickle.dump(clf, open(clf_path, 'wb'), protocol=4)
Example No. 6
    def train2(self):
        # df = pd.DataFrame()
        # df = pd.read_csv(self._bow.csv_path)
        # train = df.loc[:25000, 'review'].values
        # label = df.loc[:25000, 'sentiment'].values
        # test_train = df.loc[25000:, 'review'].values
        # test_label = df.loc[25000:, 'sentiment'].values
        classes = np.array([0, 1])

        #tokenized = self._bow.tokenizer_without_stop_word('I hava a pen')

        # x_train, y_label = self._bow.get_minibatch(self._bow.stream_docs(), size=2)
        vect = HashingVectorizer(
            decode_error='ignore',
            n_features=2**21,
            preprocessor=None,
            tokenizer=self._bow.tokenizer_without_stop_word)

        clf = SGDClassifier(loss='log', random_state=1, n_iter=1)

        doc_stream = self._bow.stream_docs()

        for _ in range(45):
            x_train, train_label = self._bow.get_minibatch(doc_stream,
                                                           size=1000)
            if not x_train:
                break
            x_train = vect.transform(x_train)
            clf.partial_fit(x_train, train_label, classes=classes)

        x_test_train, test_label = self._bow.get_minibatch(doc_stream,
                                                           size=5000)
        x_test_train = vect.transform(x_test_train)
        print('accuracy %.3f' % clf.score(x_test_train, test_label))
        return clf
Example No. 7
def check_classifier(vect: HashingVectorizer) -> None:
    if not clf_path.is_file():
        print('Classifier was not found, creating...')

        clf = SGDClassifier(loss='log', random_state=1)
        ds = DocStream('./movie_data.csv')
        pbar = ProgBar(45)

        classes = np.array([0, 1])

        for _ in range(45):
            x_train, y_train = ds.get_minibatch(1000)

            if not x_train:
                break

            x_train = vect.transform(x_train)
            clf.partial_fit(x_train, y_train, classes)
            pbar.update()

        print('Training completed...')

        x_test, y_test = ds.get_minibatch(5000)
        x_test = vect.transform(x_test)

        print(f'Score: {clf.score(x_test, y_test)}')

        clf = clf.partial_fit(x_test, y_test)

        dump(clf, clf_path, protocol=4)
Example No. 8
def SG_classify(X, Y, class_0_weight, class_1_weight, sgc=None):
    #stochastic gradient descent classifier
    if sgc:
        if np.bincount(Y)[0] > 0 and len(np.bincount(Y)) > 1:
            sgc.partial_fit(X, Y)
        # keep the returned values defined on this branch as well
        coef = sgc.coef_
        intercept = sgc.intercept_
    else:
        param_SGC = {
            'loss': 'hinge',
            'penalty': 'elasticnet',
            'n_iter': 1,
            'shuffle': True,
            'class_weight': {
                0: class_0_weight,
                1: class_1_weight
            },
            'warm_start': True,
            'alpha': 0.001
        }
        sgc = SGDClassifier(**param_SGC)
        if np.bincount(Y)[0] > 0 and len(np.bincount(Y)) > 1:
            sgc.partial_fit(X, Y, np.unique(Y))
            coef = sgc.coef_
            intercept = sgc.intercept_
        else:
            sgc = None
            coef = None
            intercept = None

    return sgc, coef, intercept
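# A minimal usage sketch for SG_classify (synthetic data; an assumption, not from the
# original source, and it keeps the older scikit-learn API above where SGDClassifier still
# accepts n_iter). The first call builds the classifier; later calls pass it back in so
# partial_fit keeps updating the same model.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier

X, Y = make_classification(n_samples=400, n_features=10, random_state=0)
sgc, coef, intercept = SG_classify(X[:200], Y[:200], class_0_weight=1.0, class_1_weight=1.0)
sgc, coef, intercept = SG_classify(X[200:], Y[200:], 1.0, 1.0, sgc=sgc)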
Example No. 9
def mine():
    print("Starting")
    clf = SGDClassifier(loss='log',random_state=1,n_iter=1)
    print('Create/Load Classifier')
    doc_stream = stream_docs(path='./movie_data.csv')
    print('Fitting data')
    classes = np.array([0,1])
    for _ in range(45):
        X_train, y_train = get_minibatch(doc_stream, size=1000)
        if not X_train:
            break
        X_train = vect.transform(X_train)
        clf.partial_fit(X_train, y_train, classes=classes)
    print('Finished Fitting')

    X_test, y_test = get_minibatch(doc_stream, size=5000)
    X_test = vect.transform(X_test)
    print('Accuracy: %.3f' % clf.score(X_test,y_test))

    print('create pickle objects')
    dest = os.path.join('','pkl_objects')
    if not os.path.exists(dest):
        os.makedirs(dest)

    pickle.dump(stop, open(os.path.join(dest,'stopwords.pkl'),'wb'), protocol=4)
    pickle.dump(clf, open(os.path.join(dest,'classifier.pkl'),'wb'), protocol=4)
Example No. 10
def test_transformer(transformer, data_set, configuration):

    clf = SGDClassifier(alpha=0.005)
    samples = []
    labels = range(10)
    for epoch in range(configuration.hyper_parameters.epochs):
        for index, sample in enumerate(transformer.compute_outputs(data_set.trainset[0], data_set.trainset[1], 1)):

            samples.append(sample.reshape(-1))  # keep samples 1-D so the list stacks into a 2-D batch
            if index % 10 == 9:
                clf.partial_fit(samples, labels, classes=labels)
                samples = []
                gc.collect()

    error = 0
    count = 0
    test_predictions = []
    for index, sample in enumerate(transformer.compute_outputs(data_set.testset[0], data_set.testset[1], 1)):
        prediction = clf.predict(sample)
        if not prediction == index % 10:
            error += 1

        count += 1
        test_predictions.append(prediction)

    OutputLog().write('test predictions weight: {0}'.format(test_predictions))

    OutputLog().write('\nerror: %f%%\n' % (100.0 * error / count))
def train_and_pickle_classifier():
    import numpy as np
    from sklearn.linear_model import SGDClassifier

    clf = SGDClassifier(loss='log', random_state=1, n_iter=1)

    csv_filename = os.path.join('datasets', 'movie_data.csv')
    doc_stream = stream_docs(path=csv_filename)

    classes = np.array([0, 1])
    for _ in range(45):
        X_train, y_train = get_minibatch(doc_stream, size=1000)
        if X_train is None:
            break
        else:
            X_train = vect.transform(X_train)
            clf.partial_fit(X_train, y_train, classes=classes)

    X_test, y_test = get_minibatch(doc_stream, size=5000)
    X_test = vect.transform(X_test)
    print("Test accuracy: %.3f" % clf.score(X_test, y_test))

    clf = clf.partial_fit(X_test, y_test)

    pickle.dump(clf, open(CLF_FILENAME, 'wb'), protocol=4)
Example No. 12
class NOGDClassifier(object):
    def __init__(self, n_components=100, n_iter=1):
        self.nys = Nystroem(n_components=n_components)
        self.clf = SGDClassifier(loss='hinge',
                                 penalty='l2',
                                 shuffle=True,
                                 n_iter=n_iter)
        self.count = 0

    def fit(self, X, y):
        if self.count == 0:
            X_tran = self.nys.fit_transform(X)
        else:
            X_tran = self.nys.transform(X)
        self.count += 1
        self.clf.fit(X_tran, y)

    def partial_fit(self, X, y):
        if self.count == 0:
            X_tran = self.nys.fit_transform(X)
        else:
            X_tran = self.nys.transform(X)
        self.count += 1
        self.clf.partial_fit(X_tran, y)

    def predict(self, X):
        X_tran = self.nys.transform(X)
        y_pred = self.clf.predict(X_tran)
        return y_pred
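# A minimal usage sketch for NOGDClassifier (synthetic data; an assumption, not from the
# original repo, and it keeps the older n_iter-style SGDClassifier API used above). The
# first batch goes through fit() so the Nystroem map gets fitted; later batches use
# partial_fit(), which works without classes= because fit() has already set clf.classes_.
import numpy as np
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_features=20, random_state=0)
nogd = NOGDClassifier(n_components=50)
nogd.fit(X[:200], y[:200])
for start in range(200, 800, 200):
    nogd.partial_fit(X[start:start + 200], y[start:start + 200])
print((nogd.predict(X[800:]) == y[800:]).mean())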
Example No. 13
def test_partial_fit_doesnt_mutate_inputs():
    n, d = 100, 20
    X, y = make_classification(n_samples=n,
                               n_features=d,
                               random_state=42,
                               chunks=(n, d))
    X = X.compute()
    y = y.compute()
    meta = {
        "iterations": 0,
        "mean_copy_time": 0,
        "mean_fit_time": 0,
        "partial_fit_calls": 0,
    }
    model = SGDClassifier(tol=1e-3)
    model.partial_fit(X[:n // 2], y[:n // 2], classes=np.unique(y))
    new_model, new_meta = _partial_fit((model, meta),
                                       X[n // 2:],
                                       y[n // 2:],
                                       fit_params={"classes": np.unique(y)})
    assert meta != new_meta
    assert new_meta["partial_fit_calls"] == 1
    assert not np.allclose(model.coef_, new_model.coef_)
    assert model.t_ < new_model.t_
    assert new_meta["partial_fit_time"] >= 0
    new_meta2 = _score((model, new_meta), X[n // 2:], y[n // 2:], None)
    assert new_meta2["score_time"] >= 0
    assert new_meta2 != new_meta
Example No. 14
def train_by_partial_SGB():
    x_train, x_test, y_train, y_test = Load_Traindata_Testdata_with_Tfidf()
    # X_train, X_val, Y_train, Y_val = train_test_split(x_train, y_train, test_size=0.3)
    # model = RandomForestClassifier(n_jobs=-1, n_estimators=100)
    # model=XGBClassifier()
    model = SGDClassifier(n_jobs=-1, max_iter=20, alpha=0.01)
    now = datetime.datetime.now()
    print("Training begin:", now)
    batch_size = 50000
    for i in range(100):
        last = datetime.datetime.now()
        start = (i * batch_size) % len(y_train)
        end = min(start + batch_size, len(y_train))
        model.partial_fit(x_train[start:end],
                          y_train[start:end],
                          classes=y_train)
        y_pre = model.predict(x_test)
        acc = accuracy_score(y_test, y_pre)
        score = model.score(x_test, y_test)
        cost_time = datetime.datetime.now() - last
        print("%d times,  %f score,  %f acc" % (i, score, acc), cost_time,
              " time(s)")
    # model.fit(X_train, Y_train)
    # y_pre = model.predict(X_val)
    # print(model.score(X_val, Y_val))
    # print(accuracy_score(Y_val, y_pre))
    training_time = datetime.datetime.now() - now
    print("Training time(s):", training_time)
def train_by_partial_SGD(filename):

    x_train, x_test, y_train, y_test = Load_Traindata_Testdata_with_Tfidf(
        filename)
    model = SGDClassifier(n_jobs=4, loss='hinge', alpha=0.09, tol=0.001)
    now = datetime.datetime.now()
    print("Training begins with SGD:", now)
    batch_size = 50000
    for i in range(1000):
        last = datetime.datetime.now()
        start = (i * batch_size) % len(y_train)
        end = min(start + batch_size, len(y_train))
        model.partial_fit(x_train[start:end],
                          y_train[start:end],
                          classes=y_train)
        y_pre = model.predict(x_test)
        acc = accuracy_score(y_test, y_pre)
        score = model.score(x_test, y_test)
        cost_time = datetime.datetime.now() - last
        print("%d times,  %f score,  %f acc" % (i, score, acc), cost_time,
              " time(s)")
    # model.fit(X_train, Y_train)
    # y_pre = model.predict(X_val)
    # print(model.score(X_val, Y_val))
    # print(accuracy_score(Y_val, y_pre))
    training_time = datetime.datetime.now() - now
    print("Training time(s):", training_time)
Example No. 16
class SVMClassifier(Model):
    def __init__(self):
        Model.__init__(self)
        # self.model = MLPClassifier(hidden_layer_sizes=(50,), max_iter=50, alpha=1e-4,
        #                            solver='sgd', tol=1e-4, random_state=1,
        #                            learning_rate_init=.001)

        self.model = SGDClassifier()

    def update(self, x, y, learning_rate):
        if len(y) > 0:
            self.model.partial_fit(x, y)

    def batch_update(self, x, y, learning_rate):
        indices = np.arange(len(y))
        np.random.shuffle(indices)
        # print('new training iteration using {} items'.format(len(y)))
        self.model.partial_fit(x[indices, :], y[indices], np.unique(y))

    def loss(self, x, y):
        pass

    def calculate_loss(self, x, y):
        #p = self.model.predict(x)
        return 1-self.model.score(x,y)

    def update_params(self, params):
        pass

    def restart(self):
        pass
Example No. 17
def test_multi_output_classification_partial_fit():
    # test if multi_target initializes correctly with base estimator and fit
    # assert predictions work as expected for predict

    sgd_linear_clf = SGDClassifier(loss='log', random_state=1)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)

    # train the multi_target_linear and also get the predictions.
    half_index = X.shape[0] // 2
    multi_target_linear.partial_fit(
        X[:half_index], y[:half_index], classes=classes)

    first_predictions = multi_target_linear.predict(X)
    assert_equal((n_samples, n_outputs), first_predictions.shape)

    multi_target_linear.partial_fit(X[half_index:], y[half_index:])
    second_predictions = multi_target_linear.predict(X)
    assert_equal((n_samples, n_outputs), second_predictions.shape)

    # train the linear classification with each column and assert that
    # predictions are equal after first partial_fit and second partial_fit
    for i in range(3):
        # create a clone with the same state
        sgd_linear_clf = clone(sgd_linear_clf)
        sgd_linear_clf.partial_fit(
            X[:half_index], y[:half_index, i], classes=classes[i])
        assert_array_equal(sgd_linear_clf.predict(X), first_predictions[:, i])
        sgd_linear_clf.partial_fit(X[half_index:], y[half_index:, i])
        assert_array_equal(sgd_linear_clf.predict(X), second_predictions[:, i])
Example No. 18
def out_of_core():
    vect = HashingVectorizer(decode_error='ignore',
                             n_features=2**21,
                             preprocessor=None,
                             tokenizer=tokenizer_new)
    clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
    doc_stream = stream_docs(path='./movie_data.csv')
    pbar = pyprind.ProgBar(45)
    classes = np.array([0, 1])
    for _ in range(45):
        # import pdb; pdb.set_trace()
        X_train, y_train = get_minibatch(doc_stream, size=1000)
        if not X_train:
            break
        X_train = vect.transform(X_train)
        clf.partial_fit(X_train, y_train, classes=classes)
        pbar.update()

    X_test, y_test = get_minibatch(doc_stream, size=5000)
    X_test = vect.transform(X_test)
    print('\nAccuracy: %.3f' % clf.score(X_test, y_test))
    clf = clf.partial_fit(X_test, y_test)
    dest = os.path.join('movieclassifier', 'pkl_objects')
    if not os.path.exists(dest):
        os.makedirs(dest)
    pickle.dump(stop,
                open(os.path.join(dest, 'stopwords.pkl'), 'wb'),
                protocol=4)
    pickle.dump(clf,
                open(os.path.join(dest, 'classifier.pkl'), 'wb'),
                protocol=4)
Example No. 19
    def _initialise_objective_function(self, x):

        x = np.atleast_2d(x)
        fs = np.zeros((x.shape[0], 1))
        for i in range(x.shape[0]):
            fs[i] = 0
            gamma = np.exp(x[i, 0])  # learning rate, log scale
            alpha = np.exp(x[i, 1])  # l2 regulariser, log scale
            n_iter = int(x[i, 2])  # num epochs
            batch_size = int(x[i, 3])  # mini batch size
            clf = SGDClassifier(loss='log',
                                penalty='l2',
                                alpha=alpha,
                                learning_rate='constant',
                                eta0=gamma,
                                n_iter=1)

            for j in range(n_iter):
                for (X_batch,
                     y_batch) in self._next_batch(self.X_train, self.y_train,
                                                  batch_size):
                    clf.partial_fit(X_batch, y_batch, classes=self.classes)

            score = clf.score(self.X_test, self.y_test)
            fs[i] = 1 - score  # classification error
        return fs
Example No. 20
def evaluate_svm(alpha):
    # Note: n_iter gets switched to 1 by sklearn whenever you call partial_fit(). This initial
    # setting is for the pretesting of eta0.
    basic_svm = SGDClassifier(loss="hinge", penalty="l2", l1_ratio=0.0, random_state=31337, n_jobs=5,
                              n_iter=5, alpha=alpha)

    learning_rate_grid = [ 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7 ]
    pretest_svm = GridSearchCV(basic_svm,
                               {"learning_rate": ["constant"],
                                "eta0": learning_rate_grid}).fit(X_pretest, y_pretest)
    bottou_gamma0 = pretest_svm.best_params_["eta0"]
    basic_svm.eta0 = bottou_gamma0
    basic_svm.learning_rate = "constant"

    basic_svm = basic_svm.partial_fit(X_pretest, y_pretest, classes = np.unique(y_train))

    progressive_val = []
    train_score = []
    for dp in range(0, X_train.shape[0], batch_size):
        t = dp + n_pretest
        basic_svm.eta0 = bottou_gamma0/(1 + bottou_gamma0*alpha*t)
        X_batch = X_train[dp:dp+batch_size]
        y_batch = y_train[dp:dp+batch_size]
        progressive_val.append(basic_svm.score(X_batch, y_batch))
        basic_svm = basic_svm.partial_fit(X_batch, y_batch)
        train_score.append(basic_svm.score(X_batch, y_batch))

    scores = progressive_val[-batches_for_cv_performance:]
    return np.mean(scores), np.std(scores), basic_svm
Example No. 21
class PartialSGDEstimator(BaseEstimator):
    def fit(self, documents, labels=None, mini_batch_size=500):
        self.model = SGDClassifier()
        classes = np.unique(labels)
        batchDocs = []
        batchLabels = []
        for count, doc in enumerate(documents):
            batchDocs.append(doc)
            batchLabels.append(labels[count])
            if len(batchDocs) == mini_batch_size:
                print("batch")
                self.model.partial_fit(batchDocs,
                                       batchLabels,
                                       classes=classes)
                batchDocs = []
                batchLabels = []
                gc.collect()
        if batchDocs:
            # flush the final, partially filled batch
            self.model.partial_fit(batchDocs, batchLabels, classes=classes)
        return self

    def predict(self, X, mini_batch_size=500):
        yhat = []
        for doc in X:
            preds = self.model.predict(doc)
            yhat.extend(preds)
        return yhat
Example No. 22
def direcrtoryProcessing(train_path):
    training_names = os.listdir(train_path)
    # Get all the paths to the images and save them in the list
    # image_paths, with the corresponding label in image_classes
    image_paths = []
    image_classes = []
    class_id = 0
    for training_name in training_names:  # three class directories, each containing images
        dir = os.path.join(train_path, training_name)
        class_path = imlist(dir)
        image_paths += class_path
        image_classes += [class_id] * len(class_path)
        class_id += 1
    X, Y = trainTestSet(image_paths, image_classes)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.2,
                                                        random_state=0)
    clf = SGDClassifier()
    batcherator = iter_minibatches(10, X_train, y_train)
    for X_chunk, y_chunk in batcherator:
        clf.partial_fit(X_chunk, y_chunk, classes=np.unique(Y))
        y_predicted = clf.predict(X_test)
        print(
            classification_report(y_test,
                                  y_predicted,
                                  target_names=training_names))
Example No. 23
def train(model: SGDClassifier,
          train_data,
          train_labels,
          test_data,
          test_labels,
          total_epochs=1000):
    # using partial fit instead of fit in order to gather information on accuracy after every pass

    labels = []
    scores = []

    for i in range(total_epochs):
        if i == 0:
            model.partial_fit(train_data,
                              train_labels,
                              classes=np.unique(train_labels))
        else:
            model.partial_fit(train_data, train_labels)
        if (i + 1) % (total_epochs // 20) == 0:
            pred_labels = model.predict(test_data)
            scores.append(accuracy_score(test_labels, pred_labels))
            labels.append(i + 1)
            print("Epoch {0} score: {1}".format(i + 1, scores[-1]))

    return labels, scores
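# A minimal usage sketch for train() above (synthetic data; an assumption, not from the
# original source). Repeated partial_fit calls over the same training set act as manual
# epochs, so held-out accuracy can be probed every few passes.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier

X, y = make_classification(n_samples=1000, n_features=20, random_state=0)
model = SGDClassifier(random_state=0)
epochs_logged, scores = train(model, X[:800], y[:800], X[800:], y[800:], total_epochs=100)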
Example No. 24
 def init(self, n_obs, n_act): 
     self.models = []
     for i in xrange(n_act):
         model = SGDClassifier(**self.model_kwargs)
         
         model.partial_fit(np.random.rand(1, n_obs), [0], classes=[0, 1])
         self.models.append(model)
def bigram(documents, prediction_documents):
    vectorizer = HashingVectorizer(decode_error='ignore',
                                   ngram_range=(1, 2),
                                   preprocessor=None,
                                   tokenizer=tokenizer,
                                   analyzer='word')
    classifier = SGDClassifier(loss='hinge', penalty='l1')

    chunk = stream(path=documents, scope='training')

    pbar = pyprind.ProgBar(10)
    classes = np.array([0, 1])
    for _ in range(10):
        reviews, labels = batch(chunk, size=2500, scope='training')
        reviews = vectorizer.transform(reviews)
        classifier.partial_fit(reviews, labels, classes=classes)
        pbar.update()

    prediction_size = prediction_file_size(prediction_documents) - 1
    test_chunk = stream(path=prediction_documents, scope='predicting')
    test_reviews = batch(test_chunk, size=prediction_size, scope='predicting')
    test_reviews = vectorizer.transform(test_reviews)

    predictions = classifier.predict(test_reviews)

    save('bigram.output.txt', predictions)
Example No. 26
def main(args):
    with open(args.train, 'r') as f:
        train_data, train_labels = matify(json.load(f))
    with open(args.test, 'r') as f:
        test_data, test_labels = matify(json.load(f))
    train_data = np.array(train_data, dtype=np.float32)
    train_data /= 256.
    test_data = np.array(test_data, dtype=np.float32)
    test_data /= 256.
    train_labels = np.array(train_labels, dtype=np.float32)
    test_labels = np.array(test_labels, dtype=np.float32)

    clf = SGDClassifier(loss='hinge', penalty='l2')
    for i in tqdm(range(args.training_steps)):
        data, labels = get_batch(train_data, train_labels, BATCH_SIZE)
        clf.partial_fit(data, labels, classes=list(range(CLASSES)))
        if ((i + 1) % 200 == 0):
            tqdm.write('step %d, training accuracy %g' %
                       (i + 1, clf.score(data, labels)))

    print('Validating...')
    data, labels = get_batch(test_data, test_labels, BATCH_SIZE)
    print('VAL ACCURACY: %f' % clf.score(data, labels))

    if args.param_dir:
        vars_to_save = {
            'fc1_w.summary': clf.coef_,
            'fc1_b.summary': clf.intercept_,
        }
        for var in vars_to_save:
            path = os.path.join(args.param_dir, var)
            with open(path, 'wb') as f:  # pickle needs a binary-mode file
                pickle.dump(vars_to_save[var], f)
Example No. 27
def SGD_normal(x_train,y_train,x_test,y_test):
    #X = [[0., 0.], [1., 1.]]
    #y = [0, 1]

    import numpy as np
    print(x_train.shape, y_train.shape, type(x_train))

    clf=SGDClassifier(loss='hinge',penalty='l2')
    clf.partial_fit(x_train,y_train,classes=np.unique(y_train))

    '''
    SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)
    '''
    # Parameter notes
    '''
    loss="hinge": (soft-margin) linear Support Vector Machine,
    loss="modified_huber": smoothed hinge loss,
    loss="log": logistic regression,
    and all of the regression losses below.

    The default is penalty="l2". The L1 penalty leads to sparse solutions, driving most
    coefficients to zero. Elastic Net addresses some shortcomings of the L1 penalty when
    features are highly correlated. The l1_ratio parameter controls the convex combination
    of the L1 and L2 penalties.
    '''
    note_prediction = list(clf.predict(x_test))
    from sklearn.metrics import classification_report, confusion_matrix
    print(confusion_matrix(y_test, note_prediction))
    print(classification_report(y_test, note_prediction))
    return clf
Example No. 28
def test_multi_output_classification_partial_fit():
    # test if multi_target initializes correctly with base estimator and fit
    # assert predictions work as expected for predict

    sgd_linear_clf = SGDClassifier(loss='log', random_state=1, max_iter=5)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)

    # train the multi_target_linear and also get the predictions.
    half_index = X.shape[0] // 2
    multi_target_linear.partial_fit(
        X[:half_index], y[:half_index], classes=classes)

    first_predictions = multi_target_linear.predict(X)
    assert_equal((n_samples, n_outputs), first_predictions.shape)

    multi_target_linear.partial_fit(X[half_index:], y[half_index:])
    second_predictions = multi_target_linear.predict(X)
    assert_equal((n_samples, n_outputs), second_predictions.shape)

    # train the linear classification with each column and assert that
    # predictions are equal after first partial_fit and second partial_fit
    for i in range(3):
        # create a clone with the same state
        sgd_linear_clf = clone(sgd_linear_clf)
        sgd_linear_clf.partial_fit(
            X[:half_index], y[:half_index, i], classes=classes[i])
        assert_array_equal(sgd_linear_clf.predict(X), first_predictions[:, i])
        sgd_linear_clf.partial_fit(X[half_index:], y[half_index:, i])
        assert_array_equal(sgd_linear_clf.predict(X), second_predictions[:, i])
Example No. 29
def train_tagger(X, y, type='sgdo'):
    if type == 'sgd':
        clf = SGDClassifier()
        clf.fit(X, y)
    elif type == 'sgdo':
        clf = SGDClassifier()
        classes = np.unique(y)
        for i in range(len(X)):
            sys.stdout.write('%.3f%% Complete\r' %
                             ((float(i) / float(len(X))) * 100))
            A = X[i]
            b = y[i]
            clf.partial_fit([A], [b], classes)
    elif type == 'nn':
        clf = Perceptron()
        clf.fit(X, y)
    elif type == 'nno':
        clf = Perceptron()
        classes = np.unique(y)
        for i in range(len(X)):
            sys.stdout.write('%.3f%% Complete\r' %
                             ((float(i) / float(len(X))) * 100))
            A = X[i]
            b = y[i]
            clf.partial_fit([A], [b], classes)
    elif type == 'svm':
        clf = svm.LinearSVC()
        clf.fit(X, y)
    else:
        clf = svm.LinearSVC()
        clf.fit(X, y)

    return clf
Example No. 30
def train_test_bow(ngram_order, batch_size=128, n_epoch=3):
    label_sets = ['full', 'function', '3way', 'in_out', 'man_nat']
    for label_set in label_sets:
        # need to drop unk for full/function
        if label_set in ['full', 'function']:
            df = sentences_df(labels=label_set, drop_unk=True)
        else:
            df = sentences_df(SENTENCES_CSV, labels=label_set, drop_unk=False)
        X, y, word2idx, l_enc = load_dataset(df, ngram_order=ngram_order)
        print("X shape: %s" % (X.shape,))
        print("y shape: %s" % (y.shape,))
        skf = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=0)
        scores = []
        for (train, test) in skf:
            clf = None
            clf = SGDClassifier(loss='log',
                                alpha=0.001,
                                l1_ratio=0,
                                random_state=0)
            for epoch in range(n_epoch):
                X_train, y_train, X_test, y_test = X[train], y[train], X[test], y[test]
                n_batches = X_train.shape[0] // batch_size
                for minibatch_idx in range(n_batches):
                    clf.partial_fit(
                        X_train[minibatch_idx * batch_size : (minibatch_idx+1) * batch_size],
                        y_train[minibatch_idx * batch_size : (minibatch_idx+1) * batch_size],
                        classes=np.unique(y))
                print("Epoch: %d/%d Train acc: %.4f"
                      % (epoch+1, n_epoch, clf.score(X_train, y_train)))
            fold_score = clf.score(X_test, y_test)
            print("Fold acc: %.4f" % fold_score)
            scores.append(fold_score)
        print('%s label mean cv accuracy: %.4f\n' % (label_set, np.mean(scores)))
Example No. 31
class SGDRanker(BaseEstimator):

    """ Ranking predictor using stochastic gradient descent

    TODO:
    -allow configurable parameters for classifier
    -seed random state
    """

    def __init__(self, seconds=10):
        self.clf = SGDClassifier(loss='hinge')
        self.clf.fit_intercept = False
        self.clf.classes_ = np.array([-1, 1])
        self.seconds = seconds

    def fit(self, X, y):
        rows = X.shape[0]
        start_time = time.time()
        for i in itertools.count():
            if time.time() - start_time > self.seconds:
                return self
            idx1 = random.randint(0, rows - 1)
            idx2 = random.randint(0, rows - 1)
            y1, y2 = y[idx1], y[idx2]
            if y1 == y2:
                continue
            self.clf.partial_fit(X[idx1] - X[idx2], np.sign(y1 - y2))

    def predict(self, X):
        return np.dot(X, self.clf.coef_.T)
Example No. 32
class LightModel:
    def __init__(self,learningRate, numEpochs, ppenalty="l1", mustShuffle=True):
        #Init scikit models
        self.Classifier = SGDClassifier(penalty=ppenalty, loss='log', alpha=learningRate, n_iter = numEpochs, shuffle=mustShuffle)
    def train(self, gen,  v=False):
        i = 0
        for x, y in gen: #For each batch
            self.Classifier.partial_fit(x, y, [0,1])
            i += len(x)
            if v : print(str(datetime.now())[:-7] , "example:", i)
            
    def test(self, gen,  v=False):

        #init target and prediction arrays
        ytot = np.array([])
        ptot = np.array([])
        #Get prediction for each batch
        i = 0
        for x,y in gen:
            p = self.Classifier.predict_proba(x)
            p = p.T[1].T #Keep column corresponding to probability of class 1
            #Stack target and prediction for later analysis
            ytot = np.hstack((ytot, y)) 
            ptot = np.hstack((ptot, p))
            i += y.shape[0]
            if v : print(str(datetime.now())[:-7] , "example:", i)
        if v: print("Score:", self.score(ytot, ptot))
        
        return (ytot, ptot)
    def score(self, target, prediction):
        return llfun(target, prediction)
Example No. 33
class LanguageProcessing():

    def __init__(self, model=None):
        # Machine learning models.
        self.nlp = spacy_model.load()
        if model:
            self.model = model
        else:
            self.model = SGDClassifier()
            # Preprocessing the corpus.
            # self.corpus: Map<String, List<String>>
            self.corpus = load(open('pickles/speech_corpus', 'rb'))
            self.categories = self.corpus.keys()
            self.classifications_by_cat = {self.categories[i]: i
                                           for i in range(len(self.categories))}
            self.classifications_by_num = {i: self.categories[i]
                                           for i in range(len(self.categories))}
            # Training the model.
            training_x = []
            training_y = []
            for k in self.corpus:
                sentences = self.corpus[k]
                training_x += [self.return_nlp(s).vector for s in sentences]
                training_y += [self.classifications_by_cat[k]
                               for i in range(len(sentences))]
            self.model.fit(array(training_x), array(training_y))

    def return_nlp(self, text):
        """
        Wraps given text in unicode and
        returns its spaCy wrapper.
        """
        return self.nlp(unicode(text))

    def string_similarity(self, s1, s2):
        """
        Using spaCy, computes the similarity
        between two strings based on the
        GloVe vectors provided.
        """
        return self.return_nlp(s1).similarity(self.return_nlp(s2))

    def train_with_query(self, query):
        query_vectorized = [self.return_nlp(query).vector]
        pred = self.model.predict(
            array(query_vectorized))[0]
        # Inform the user of the prediction and
        # ask for confirmation of training. To be removed later.
        print '\n' * 100
        print 'QUERY: " ' + query + '"'
        print 'CLASSIFIED AS: "' + self.classifications_by_num[pred] + '"'
        decision = raw_input(
            'Is this what you expected? 0 for N, 1 for Y.\n> ')
        print '\n' * 2
        if eval(decision) == 1:
            # Update the corpus and train the model.
            self.corpus[self.classifications_by_num[pred]].append(query)
            self.model.partial_fit(array(query_vectorized), array([pred]))
        return query, self.classifications_by_num[pred]
Example No. 34
def incremental_SGD(X, Y, loss):
    sgd = SGDClassifier(loss=loss, penalty="l2")
    labels = np.unique(Y)
    for i in range(X.shape[0]):
        point_x = X[i]
        point_y = Y[i]
        sgd.partial_fit([point_x], [point_y], classes=labels)
    return sgd
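# A minimal usage sketch for incremental_SGD (synthetic data; an assumption, not from the
# original source). Each sample is passed to partial_fit on its own, i.e. plain online
# learning with a batch size of one.
from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier

X, y = make_classification(n_samples=300, n_features=10, random_state=0)
sgd = incremental_SGD(X, y, loss="hinge")
print(sgd.score(X, y))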
Example No. 35
def train():
    model = SGDClassifier()
    for batch_no, batch in enumerate(db.mini_batches(100)):
        X, y = vectorize_batch(batch)
        if batch_no == 0:
            # partial_fit needs the full class list on its first call; this assumes
            # every class appears in the first mini-batch
            model.partial_fit(X, y, classes=sorted(set(y)))
        else:
            model.partial_fit(X, y)
        if sampling and batch_no == 10:
            break
    return model
Example No. 36
def train(train_ids, class_id, polys, gs, patch_size):
    print('TRAINING...')

    # First calculate standard scaler parameters
    scaler = StandardScaler()
    for img_id in train_ids:
        print('Calculating scaler for ' + str(img_id) + ' for class ' + str(class_id) + ' at ' + str(datetime.now(timezone('EST'))))
        im_rgb = tiff.imread('input/three_band/{}.tif'.format(img_id)).transpose([1, 2, 0])
        patches = extract_patches_2d(im_rgb, patch_size)
        patches = np.reshape(patches, (len(patches), -1))
        #xs = im_rgb.reshape(-1, 3).astype(np.float32)
        xs = patches.astype(np.float32)
        scaler.partial_fit(xs)

    # Next build the logistic model
    model = SGDClassifier(loss='log')
    for img_id in train_ids:
        print('Training on ' + str(img_id) + ' for class ' + str(class_id) + ' at ' + str(datetime.now(timezone('EST'))))

        # Load grid size for current image polygon coordinates
        x_max, y_min = gs[gs['ImageId'] == img_id].iloc[0,1:].astype(float)

        # Read current image with tiff
        im_rgb = tiff.imread('input/three_band/{}.tif'.format(img_id)).transpose([1, 2, 0])
        im_size = im_rgb.shape[:2]
        patches = extract_patches_2d(im_rgb, patch_size)
        print(len(patches))
        patches = np.reshape(patches, (len(patches), -1))


        # Read in polygons for current image
        cur_polygons = polys[(polys['ImageId'] == img_id) & (polys['ClassType'] == class_id)].iloc[0]['MultipolygonWKT']
        train_polygons = shapely.wkt.loads(cur_polygons)

        
        x_scaler, y_scaler = get_scalers(im_size, x_max, y_min)
        train_polygons_scaled = shapely.affinity.scale(train_polygons, xfact=x_scaler, yfact=y_scaler, origin=(0, 0, 0))
        train_mask = get_polygon_mask(train_polygons_scaled, im_size)

        # Load xs from image and ys from polygon mask
        #xs = im_rgb.reshape(-1, 3).astype(np.float32)
        xs = patches.astype(np.float32)
        edges_to_delete = (patch_size[1] - 1) // 2 # Delete 0 for patch_size 1, 1 for patch_size 3, 2 for patch_size 5, etc
        ys = train_mask[edges_to_delete:-edges_to_delete, edges_to_delete:-edges_to_delete].reshape(-1) # Drop beginning & end rows and columns to account for patch size
        #ys = train_mask.reshape(-1)

        # Scale x values with trained scaler
        #print(xs.mean(axis=0))
        xs = scaler.transform(xs)
        print(im_rgb.shape)
        print(xs.shape)
        print(ys.shape)
        #print(xs.mean(axis=0))

        print('training partial fit...')
        model.partial_fit(xs, ys, classes = (0, 1))

    return scaler, model
Example No. 37
class modle(object):
    def __init__(self, modle_n=0):
        # create the model
        if modle_n == 0:  # support vector machine
            self.clf = svm.SVC()
        elif modle_n == 1:
            self.clf = linear_model.LogisticRegression(C=1.0,
                                                       penalty='l1',
                                                       tol=1e-6)
        elif modle_n == 2:  # stochastic gradient descent
            self.clf = SGDClassifier()  # see the sklearn docs for SGDClassifier parameter options
        '''
        clf = svm.SVC(C=1.0, cache_size=200, class_weight=None,
            coef0=0.0, decision_function_shape=None,
            degree=3, gamma='auto', kernel='rbf',
            max_iter=-1, probability=False,
            random_state=None, shrinking=True,
            tol=0.001, verbose=False)
        '''

    def train(self, X, y):
        self.clf.fit(X, y)
        self.out_clf()

    def train_batch(self, X, y, m=2, n=0):
        if m == 0:  # support vector machine
            pass
        elif m == 1:
            pass
        elif m == 2:  # stochastic gradient descent
            # use partial_fit, passing classes on the first call to partial_fit
            self.clf.partial_fit(X, y, classes=np.array([0, 1]))
        print("train...{0}".format(n))  # current batch index
        self.out_clf()

    def in_clf(self):
        # load the saved model
        self.clf = joblib.load('TalkingDataAdTracking/data/sgd.pkl')
        print('in_clf...ok')

    def out_clf(self):  # save the model
        joblib.dump(self.clf, 'TalkingDataAdTracking/data/sgd.pkl')
        print('out_clf...ok')

    def evaluate(self, X, y):
        e = self.clf.score(X, y)
        print(e)
        return e

    def evaluate2(self, X, y):
        # quick evaluation with cross-validation
        e = cross_validation.cross_val_score(self.clf, X, y, cv=5)
        print(e)
        return e

    def predict(self, X):
        # predict
        return self.clf.predict(X)
Example No. 38
    def create_classifier(self):
        DB.db.connect()
        clf = SGDClassifier(loss="modified_huber")
        labs_map = NameToIndex()

        with DB.db.transaction():
            offset = 0
            words_count = self.get_words_count()
            classes = numpy.arange(0, words_count)
            x_all = []
            y_all = []
            while True:
                print(' %d partial_fit %d' % (time(), offset))
                query = DB.Vocabulary\
                    .select(DB.Vocabulary.lv1, DB.Vocabulary.lv2)\
                    .join(DB.PcaModel, on=(DB.Vocabulary.feature == DB.PcaModel.feature)).order_by( DB.Vocabulary.feature).offset(offset).limit(1000)\
                    .tuples().iterator()
                features = numpy.array(
                    [[x[0]] + list(x[1]) for x in query])
                offset += len(features)
                if len(features) == 0:
                    break

                Y = features[:, 0]
                X = features[:, 1:]

                labs = []
                for lab in Y:
                    labs.append(labs_map.map(lab))

                if (len(x_all) < 10000):
                    x_all = x_all + X.tolist()
                    y_all = y_all + labs
                labs = numpy.array(labs)

                #clf = LinearSVC()
                #clf = OneVsRestClassifier(SVC(probability=True, kernel='linear'))
                #clf.fit(X,labs)
                clf.partial_fit(X, labs, classes)
                print(clf.score(x_all, y_all))

            DB.TrainingResult.delete().where(
                DB.TrainingResult.name == self.__class__.__name__ +
                "_clf").execute()
            DB.TrainingResult.delete().where(
                DB.TrainingResult.name == self.__class__.__name__ +
                "_labs_map").execute()

            tr = DB.TrainingResult()
            tr.name = self.__class__.__name__ + "_clf"
            tr.data = clf
            tr.save()

            tr = DB.TrainingResult()
            tr.name = self.__class__.__name__ + "_labs_map"
            tr.data = labs_map
            tr.save()
Example No. 39
def loadModel():
    """Load the training data from dataset.csv and fit a log-loss SGDClassifier on it."""
    train = np.genfromtxt('dataset.csv', delimiter=',')
    x_train = train[:, 1:]
    y_train = np.uint8(train[:, 0])
    clf = SGDClassifier(loss="log")
    clf.partial_fit(x_train, y_train, classes=[0, 1])
    return x_train, y_train, clf
Example No. 40
def objective(trial):
    alpha = trial.suggest_uniform("alpha", 0.0, 1.0)
    clf = SGDClassifier(alpha=alpha)
    n_train_iter = 100

    for step in range(n_train_iter):
        clf.partial_fit(X_train, y_train, classes=classes)

        intermediate_value = clf.score(X_valid, y_valid)
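        # The snippet ends here; in Optuna's usual pruning pattern the loop would continue
        # roughly as below (an assumption, not part of the original code; it relies on the
        # standard optuna API and assumes optuna is imported):
        trial.report(intermediate_value, step)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return intermediate_value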
Example No. 42
def chi_feature_select(train_file, test_file):

    lines = read_text_src(train_file)
    lines = [x for x in lines if len(x)>1]
    X_train = [line[1] for line in lines]
    y_train = [line[0] for line in lines]

    lines = read_text_src(test_file)
    lines = [x for x in lines if len(x) > 1]
    X_test = [line[1] for line in lines]
    y_test = [line[0] for line in lines]

    vectorizer = TfidfVectorizer(tokenizer=zh_tokenize)#ngram_range=(1,2)
    X_train = vectorizer.fit_transform(X_train)
    print(X_train.shape)

    X_test = vectorizer.transform(X_test)

    # word = vectorizer.get_feature_names()


    # N = X_train.shape[1]
    # ch2 = SelectKBest(chi2, k=int(N*0.2)) #.fit_transform(X, y)
    #
    #
    # X_train = ch2.fit_transform(X_train, y_train)
    # X_test = ch2.transform(X_test)

    # feature_names = [word[i] for i
    #                  in ch2.get_support(indices=True)]
    #

    # for i in feature_names:
    #     print i.encode('utf-8')
    # feature_names = np.asarray(feature_names)
    # print feature_names
    # clf = LinearSVC(penalty="l1", dual=False, tol=1e-3)

    # clf.fit(X_train, y_train)
    clf = SGDClassifier(loss="log", penalty='l1')
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    prob = clf.predict_proba(X_test[0])
    print(prob)
    X=["市场经济复苏,互联网公司蓬勃发展","世纪大战终于开启,勇士引得第73胜"]
    Y=['1','0']
    X=vectorizer.transform(X)
    clf.partial_fit(X,Y, classes=['0','1'])
    tmpx=['暴风科技股价大跌',"世纪大战终于开启,勇士引得第73胜"]
    tmpX=vectorizer.transform(tmpx)
    pred = clf.predict(tmpX)
    print(pred)
Example No. 43
def main():
    # Get training and model filenames
    with open('model_metadata.json') as f:
        config = json.load(f)

    CLASSES = [float(x) for x in config['classes']]
    model_filename = config['modelFilename']
    NUM_BITS_FOR_HASHING = config['numBitsForHashing']
    train_filename = config['trainFilename']
    sklearn_version_expected = config['sklearnVersion']

    # If sklearn version is wrong, exit without training
    if float(sklearn.__version__) != float(sklearn_version_expected):
        print("Wrong sklearn version")
        sys.exit(0)

    with open(train_filename) as f:
        lines = (tuple(line.rstrip('\n').split('\t')) for line in f)
        parsed_lines = ((line[1:], float(line[0])) for line in lines)

        # Parse header and get feature names for namespacing
        header = next(lines)
        FEATURE_NAMES = tuple(header[1:])

        # Build pipeline
        pre_processing_pipeline = make_pre_processing_pipeline(
            feature_names=FEATURE_NAMES,
            num_bits_for_hashing=NUM_BITS_FOR_HASHING
        )

        # Instantiate classifier
        # (a logistic regression model with Stochastic Gradient Descent)
        clf = SGDClassifier(loss='log')

        # Train model in mini-batches
        batch_size = 8000

        for rows, labels in batched_lines(batch_size, parsed_lines):
            processed_rows = pre_processing_pipeline.fit_transform(rows)
            clf.partial_fit(processed_rows, labels, classes=CLASSES)

        print(clf)

        # Save model
        joblib.dump(clf, model_filename)

        # Reload just to make sure it serializes and de- properly
        joblib.load(model_filename)
Example No. 44
class Model:
    def __init__(self,numFeatures, learningRate, numEpochs, ppenalty="l1", mustShuffle=True):
        #Init scikit models
        self.FH = FeatureHasher(n_features=numFeatures, input_type='string')
        self.Classifier = SGDClassifier(penalty=ppenalty, loss='log', alpha=learningRate, n_iter = numEpochs, shuffle=mustShuffle)
    def train(self, gen,  v=False):

        i = 0
        for x, y in gen: #For each batch
            xHash = self.FH.transform(x) #hash trick
            y = np.array(y)            
##            for epoch in range(numEpochs):
            self.Classifier.partial_fit(xHash, y, [0,1])
            i += len(x)
            if v : print(str(datetime.now())[:-7] , "example:", i)
            
    def test(self, gen,  v=False):

        #init target and prediction arrays
        ytot = np.array([])
        ptot = np.array([])
        #Get prediction for each batch
        i = 0
        for x,y in gen:
            xHash = self.FH.transform(x) #hash trick
            p = self.Classifier.predict_proba(xHash)
            p = p.T[1].T #Keep column corresponding to probability of class 1
            #Stack target and prediction for later analysis
            ytot = np.hstack((ytot, y)) 
            ptot = np.hstack((ptot, p))
            i += y.shape[0]
            if v : print(str(datetime.now())[:-7] , "example:", i)
        if v: print("Score:", self.score(ytot, ptot))
        
        return (ytot, ptot)
    def predictBatch(self, batch):
        hashedBatch = self.FH.transform(batch)
        prediction = self.Classifier.predict_proba(hashedBatch)
        return prediction
    def generatePrediction(self, generator):
        for xBatch, idBatch in generator:
            prediction = self.predictBatch(xBatch)
            yield prediction, idBatch
    def score(self, target, prediction):
        return llfun(target, prediction)
Example No. 45
def train(input_filename, num_train_examples, num_test_examples, block_size):
    # Load initial training data and test data
    X_train, y_train, X_test, y_test, scaler = loaddata(input_filename, num_test_examples, block_size)

    # Feature generation using random forests
    forest = RandomForestClassifier(n_estimators=150, n_jobs=-1)
    forest.fit(X_train, y_train)
    encoder = OneHotEncoder()
    encoder.fit(forest.apply(X_train))
    X_test = encoder.transform(forest.apply(X_test))
    # Make sure that classes are weighted inversely to their frequencies
    weights = float(y_train.shape[0]) / (2 * numpy.bincount(y_train))
    class_weights = {0: weights[0], 1: weights[1]}
    learner = SGDClassifier(
        loss="hinge",
        penalty="l2",
        learning_rate="invscaling",
        alpha=0.0001,
        average=10 ** 4,
        eta0=1.0,
        class_weight=class_weights,
    )

    num_passes = 3
    aucs = []

    for j in range(num_passes):
        for i in range(0, num_train_examples, block_size):
            df = pandas.read_csv(input_filename, header=None, skiprows=i, nrows=block_size)
            X_train = df.values[:, 1:]
            X_train = scaler.transform(X_train)
            X_train = encoder.transform(forest.apply(X_train))
            y_train = numpy.array(df.values[:, 0], dtype=int)
            del df

            learner.partial_fit(X_train, y_train, classes=numpy.array([0, 1]))
            y_pred_prob = learner.decision_function(X_test)
            auc = roc_auc_score(y_test, y_pred_prob)
            aucs.append([i + num_train_examples * j, auc])
            print(aucs[-1])

    df = pandas.DataFrame(aucs, columns=["Iterations", "AUC"])
    df = df.set_index("Iterations")
    return df
Example No. 46
def batchPredict(X, y):
    est = SGDClassifier(loss='log', penalty='l1', alpha=0.01)
    progressive_validation_score = []
    train_score = []
    l = len(X)
    step = 500
    for datapoint in range(0, l, step):
        X_batch = X[datapoint:datapoint+step]
        y_batch = y[datapoint:datapoint+step]
        if datapoint > 0:
            progressive_validation_score.append(est.score(X_batch, y_batch))
        est.partial_fit(X_batch, y_batch, classes=range(10))
        if datapoint > 0:
            train_score.append(est.score(X_batch, y_batch))
    
    plt.plot(train_score, label='train score')
    plt.plot(progressive_validation_score, label='progressive validation score')
    plt.xlabel('Mini-batch')
    plt.ylabel('Score')
    plt.legend(loc='best')  
    plt.show()       
Example No. 47
class Model:
    def __init__(self,numFeatures, learningRate, mustShuffle=True):
        #Init scikit models
        self.FH = FeatureHasher(n_features=numFeatures, input_type='pair')
        self.Classifier = SGDClassifier(loss='log', alpha=learningRate, shuffle=mustShuffle)
    def train(self, gen, numEpochs,  v=False):
        i = 0
        for x, y in gen: #For each batch
            xHash = self.FH.transform(x) #hash trick
            y = np.array(y)            
            for epoch in range(numEpochs):
                self.Classifier.partial_fit(xHash, y, [0,1])
                
            if v and (i % (numBatches/60)) == 0: print(datetime.now(), "example:", i*sizeBatch)
            i+=1
    def test(self, gen,  v=False):
        
        #init target and prediction arrays
        ytot = np.array([])
        ptot = np.array([])
        #Get prediction for each batch
        for batch in gen:
            data = list(batch) #store batch in memory for prediction
            x, y = data[0], np.array(data[1])
            x = self.FH.transform(x)
            p = self.Classifier.predict_proba(x)
            p = p.T[1].T #Keep column corresponding to probability of class 1
            #Stack target and prediction for later analysis
            ytot = np.hstack((ytot, y)) 
            ptot = np.hstack((ptot, p))

        if v: print("Score:", self.score(ytot, ptot))
        
        return (ytot, ptot)
    def score(self, target, prediction):
        return llfun(target, prediction)
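A hypothetical usage sketch for the class above (the batch generator and every value in it are invented for illustration): FeatureHasher(input_type='pair') expects each sample as an iterable of (feature_name, value) pairs, so a generator feeding Model.train/test could look like this, assuming the snippet's imports (numpy as np, FeatureHasher, SGDClassifier) are in scope:

def toy_batches():
    # each yielded batch is (list of samples, list of labels);
    # each sample is a list of (feature_name, value) pairs
    yield ([[("age", 23.0), ("clicks", 1.0)], [("age", 41.0), ("clicks", 0.0)]], [1, 0])
    yield ([[("age", 35.0), ("clicks", 2.0)], [("age", 29.0), ("clicks", 0.0)]], [1, 0])

model = Model(numFeatures=2 ** 10, learningRate=0.0001)
model.train(toy_batches(), numEpochs=1)      # v left False, so the undefined numBatches/sizeBatch are never touched
targets, probs = model.test(toy_batches())   # stacked true labels and class-1 probabilities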
Exemplo n.º 48
0
    def partial_fit(self, X, y, *args, **kw):
        X = sp.csr_matrix(X)
        return SGDClassifier.partial_fit(self, X, y, *args, **kw)
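The excerpt above is a single method lifted out of a subclass; a minimal sketch of the class it implies could look like the following (the class name SparseSGDClassifier and the imports are assumptions, not from the source):

import scipy.sparse as sp
from sklearn.linear_model import SGDClassifier

class SparseSGDClassifier(SGDClassifier):
    def partial_fit(self, X, y, *args, **kw):
        # convert the incoming batch to CSR so the parent always sees a sparse matrix
        X = sp.csr_matrix(X)
        return SGDClassifier.partial_fit(self, X, y, *args, **kw)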
Exemplo n.º 49
0
class SGDLearner:
    def __init__(self, X_train, y_train, X_test, y_test, random_state, eta0, alpha):

        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.rng = random_state
        # work out how to use random state ( ie save)
        self.sgd = SGDClassifier(random_state=self.rng, fit_intercept=True)

        self.sgd.loss = "hinge"
        self.sgd.alpha = alpha
        self.sgd.learning_rate = "constant"
        self.sgd.eta0 = eta0

        # how to make these  controlled properties (from chaco)

        # E_part = np.NaN((self.T // self.update_period, ))

        self.ntrain, self.ncoef = self.X_train.shape

        self.wts = np.zeros((self.ntrain + 1, self.ncoef + 1))
        self.grad = np.zeros((self.ntrain + 1, self.ncoef + 1))
        self.reset()

    def reset(self):

        self.scores = []

        # turn into dataframe?
        self.mn_grad = []
        self.st_grad = []

        self.sgd.warm_start = False  # reset learning ?does this reset learning rate time counter?
        self._iT = 0

    def learn(self, learn_for, probe_every):
        """Train for learn_for and and store results probe_every"""

        ind = self.rng.randint(0, self.ntrain, learn_for)
        for time in range(0, learn_for, probe_every):
            self.sgd.partial_fit(
                self.X_train[ind[time : time + probe_every], :], self.y_train[ind[time : time + probe_every]], [0, 1]
            )

            self._iT += probe_every
            self.sgd.warm_start = False  # (not necessary? unless we use fit rather than partial)
            self.calc_grad()
            mn = self.grad.mean(axis=0) * 1e-6
            st = self.grad.std(axis=0) * 1e-6

            self.mn_grad.append(mn)
            self.st_grad.append(st)

            ind_part = self.rng.randint(0, self.ntrain, int(self.ntrain * 0.1))
            scores = {"timestep": self._iT}
            scores["part"] = self.sgd.score(self.X_train[ind_part, :], self.y_train[ind_part])
            scores["train"] = self.sgd.score(self.X_train, self.y_train)
            scores["test"] = self.sgd.score(self.X_test, self.y_test)
            self.scores.append(scores)

    def calc_grad(self):
        # now calculate current gradient variance
        # by using very low learning rate and calculate gradients
        eta0 = self.sgd.eta0
        self.sgd.eta0 = 1e-6 * eta0
        self.wts[0, :-1] = self.sgd.coef_
        self.wts[0, -1] = self.sgd.intercept_
        for im in range(self.ntrain):

            self.sgd.partial_fit(self.X_train[im, :].reshape((1, -1)), self.y_train[im].reshape((1,)), [0, 1])
            self.wts[im, :-1] = self.sgd.coef_
            self.wts[im, -1] = self.sgd.intercept_

        self.grad = np.diff(self.wts, axis=0)
        self.sgd.eta0 = eta0
Exemplo n.º 50
0
lr_model = SGDClassifier(loss='log')  # using log-loss for LogisticRegression
scores = []

k = 1  # use k and i to control the training scale (training samples used = all_samples / k)
i = 1
for chunk in df_train_f:
    if i < k: 
        i += 1
        continue
    i = 1
    df_train = oh_enc.transform(chunk)
    #----- training LR -----#
    feature_train = df_train.columns.drop(['id', 'click'])
    train_X = df_train[feature_train]
    train_y = df_train['click'].astype('int')
    lr_model.partial_fit(train_X, train_y, classes = [0,1])  # fitting
    
    # the score of training
    y_pred = lr_model.predict_proba(train_X)[:, 1]
    score = log_loss(train_y, y_pred)
    scores.append(score)

## store the pre-trained lr_model
pickle.dump(lr_model, open(fp_lr_model, 'wb'))

## show the training curve
f1 = plt.figure(1)
plt.plot(scores)
plt.xlabel('iterations')
plt.ylabel('log_loss')
plt.title('log_loss of training')
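For context (not shown in the original snippet): df_train_f is expected to be an iterator of DataFrame chunks, oh_enc a previously fitted one-hot encoder whose transform returns a DataFrame still containing the 'id' and 'click' columns, and fp_lr_model a path for the pickled model. A placeholder setup for the chunked reader and the output path might be:

import pandas as pd

# placeholder names and values: stream the training CSV in chunks so the full
# set never has to fit in memory, and pick a path for the saved model
df_train_f = pd.read_csv('train.csv', chunksize=100000)
fp_lr_model = './lr_model'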
Exemplo n.º 51
0
class JointModel:

	# creating an empty model
	def __init__(self):
	
		# known words and their classifiers
		self.knownWords = {}
		self.minimumGuessScore = JM_GUESS_SCORE_THRESHOLD
                self.clfColor = SGDClassifier(loss="log", penalty="l2")
                self.clfShape = SGDClassifier(loss="log", penalty="l2")
                self.classColors = []
                self.classShapes = []

        # add a word-example pair to the SGD classifier
        # word: string
        # example: image
        def add_sgd_class(self, word, example):
           self.clfColor = SGDClassifier(loss="log", penalty="l2")
           self.clfShape = SGDClassifier(loss="log", penalty="l2")
           X_Color = [example['Color']]
           y_Color = [word]
           X_Shape = [example['Shape']]
           y_Shape = [word]
           for word in self.knownWords.keys():
              for classifier in self.knownWords[word]:
                 if("Synonym" not in str(type(classifier))):
                    examples = classifier.positiveExamples
                    for ex in examples : 
                       if("Color" in classifier._type_):
                          X_Color.append(ex['Color'])
                          y_Color.append(word)
                       if("Shape" in classifier._type_):
                          X_Shape.append(ex['Shape'])
                          y_Shape.append(word)
           
           classes = np.unique(y_Color)
           self.clfColor.partial_fit(X_Color, y_Color,classes=classes)
           self.classColors = classes
           classes = np.unique(y_Shape)
           self.clfShape.partial_fit(X_Shape, y_Shape,classes=classes)
           self.classShapes = classes

        # add a word-example pair to an existing SGD classifier
        # word: string
        # example: image
        def partial_fit_classifer(self,word,example):
           self.clfColor.partial_fit([example['Color']],[word])
           self.clfShape.partial_fit([example['Shape']],[word])



	# add a word-example pair to the model
	# word: string
	# example: image
	# example polarity: global definition (constant)
	def add_word_example_pair(self, word, example, examplePolarity):
	
		currentKnownWords = self.knownWords.keys()
                
                # check if it is a new word
		if(word not in self.knownWords.keys()):
			# new word. add possibly associated classifiers
			# limited to initialization
			self.knownWords[word] = []
			self.knownWords[word].append(ObjColor(word, example, examplePolarity))
			self.knownWords[word].append(ObjShape(word, example, examplePolarity))

                        self.add_sgd_class(word,example)
                                         
	                # add possibilities of being a synonym
			# this will not contain redundant information like (a b), (a c) and (b c)
			# this is because synonyms are added in order
			for knownWord in currentKnownWords:
				# word may be a synonym of knownWord
				# when classifying, synonyms are checked for all classifier types
				# e.g. color, shape
				self.knownWords[word].append(ObjSynonymColor(word, knownWord, example, examplePolarity))
				self.knownWords[word].append(ObjSynonymShape(word, knownWord, example, examplePolarity))
		else:
                        self.partial_fit_classifer(word,example)
			# known word. just add the example
			# add in all word objects (where adding an example is possible)			
			for classifier in self.knownWords[word]:
				# assume all types to qualify for example addition
				classifier.add_example(example, examplePolarity)

	'''
	experiment: trained attributes
	'''
	# classify a word with corresponding example and get positive or negative confirmation
	# if the classifier is confident, then we don't know
	# e.g. "is this green?"
	# word: string
	# example: image
	# classificationScores: dictionary of classification scores per classifier
	def classify_word_example(self, word, example, checkSynonyms=True):
                probColor = 0.0
                probShape = 0.0
                if(word in self.classColors) :
                   index = list(self.classColors).index(word)  # look up the word's position among the known color classes
                   colorPredict = self.clfColor.predict([example['Color']])
                   colorProbs   = self.clfColor.predict_proba([example['Color']])
                   probColor = colorProbs[0][index]

                if(word in self.classShapes) :
                   index = list(self.classShapes).index(word)  # look up the word's position among the known shape classes
                   shapePredict = self.clfShape.predict([example['Shape']])
                   shapeProbs   = self.clfShape.predict_proba([example['Shape']])
                   probShape = shapeProbs[0][index]
               

		probabilityScores = {}
		pExampleGivenWordValues = {}

		# check all classifiers related to this word
		for classifier in self.knownWords[word]:
			#print(word, str(classifier))
			if("Synonym" not in str(type(classifier))):
				# use non-synonym classifiers directly
				[probabilityScore, pExampleGivenWord] = classifier.calculate_probability_score(example)
			elif(checkSynonyms == True):
				# use synonym classifiers indirectly
				# add positive and negative examples known for the word but not the synonym
				# we do not care about the return values for recursive calls
				# we only want to populate probabilityScores in each recursion
	
				if("Color" in str(type(classifier))):
					searchType = "ObjColor"					
				elif("Shape" in str(type(classifier))):
					searchType = "ObjShape"					
				else:
					# should never come here for given initialization
					pass

				for synonymClassifier in self.knownWords[classifier.synonym]:
					if(searchType in str(type(synonymClassifier))):
						# will only enter this once
						# break is efficient but not necessary
						synonymClassifierObj = synonymClassifier
						break;

				[probabilityScore, pExampleGivenWord] = synonymClassifierObj.calculate_probability_score(example, classifier.positiveExamples, classifier.negativeExamples)
			
			else:
				# do nothing about the synonyms
				pass

			# add score to classification scores
			probabilityScores[classifier] = probabilityScore
			pExampleGivenWordValues[classifier] = pExampleGivenWord

		# now we have accumulated all the scores
		# check if any of the scores exceed the threshold
		# initially assume inconsistency
		isWordExampleConsistent = False

		# compare for positive
		# more convoluted but faster this way because no if condition
		for classifier in probabilityScores.keys():
			isWordExampleConsistent = isWordExampleConsistent or (probabilityScores[classifier] >= classifier.get_classification_threshold())

		# return the consistency decision and probability scores
		return [isWordExampleConsistent, probabilityScores, pExampleGivenWordValues]

	'''
	experiment: novel scene
	'''	
	# classify a new example and get corresponding word
	# if no classifier is confident, then it is a new category of example. we do not handle this right now
	# e.g. "what is this?"
	# e.g. of bayes' rule: p(cube|example) = p(example|cube) * p(cube) / p(example)
	# p(example) is constant across all word classifications and can be ignored when comparing them
	# p(example|cube): the fraction of examples in "cube" which matched the current example
	# p(cube): the fraction of examples under "cube" relative to examples over all known words
	# p(cube) = totalExamples of cube / total examples of all words
	# the denominator is constant for all word scores. ignore it
	# consider non-normalized version of p(cube) to calculate score
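	# worked example with made-up counts: if "cube" stores 8 examples and 6 match the current one,
	# while "ball" stores 12 examples and 2 match, then score(cube) = (6/8)*8 = 6 and
	# score(ball) = (2/12)*12 = 2, so "cube" is the stronger (non-normalized) guess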
	# example: image
	def classify_example(self, example):


                colorPredict = self.clfColor.predict([example['Color']])
                colorProbs   = self.clfColor.predict_proba([example['Color']])

                shapePredict = self.clfShape.predict([example['Shape']])
                shapeProbs   = self.clfShape.predict_proba([example['Shape']])
		# check against each known word
		# maximum probability score data corresponding to a word		
		wordMaxProabilityScores = {}
		# all probability score data corresponding to a word
		wordProbabilityScores = {}

		# maintain best guess
		bestGuessWord = ""
		bestGuessObj = ""
		bestGuessMaxScore = 0
	
		# calculate word probability scores
		# check all associated classifiers
		for word in self.knownWords.keys():
			[isWordExampleConsistent, probabilityScores, pExampleGivenWordValues] = self.classify_word_example(word, example)

			# select maximum score corresponding to best interpretation			
			maxScore = max(probabilityScores.values())			

			maxScoreObj = "none"
			for classifier in probabilityScores:
				if(probabilityScores[classifier] == maxScore):
					maxScoreObj = classifier

			# add to probability scores
			#totalObjExamples = float(len(maxScoreObj.positiveExamples) + len(maxScoreObj.negativeExamples))
			#wordMaxProabilityScores[word] = [maxScore, maxScoreObj, maxScore/totalObjExamples]
			wordMaxProabilityScores[word] = [maxScore, maxScoreObj, pExampleGivenWordValues]
			wordProbabilityScores[word] = [isWordExampleConsistent, probabilityScores]

			# update best guess if possible
			if(maxScore > bestGuessMaxScore):
				bestGuessWord = word
				bestGuessObj = maxScoreObj
				bestGuessMaxScore = maxScore

		# guess confidence
		# initial assumption
		isConfidentGuess = False

		if(bestGuessMaxScore >= self.minimumGuessScore):
			isConfidentGuess = True
		
		# return everything known to man
		return [isConfidentGuess, bestGuessWord, bestGuessObj, bestGuessMaxScore, wordMaxProabilityScores, wordProbabilityScores]
		
	'''
	experiment: novel english
	'''	
	# get a sentence and an image
	# compute a score which represents association of words with that image
	# classify that image and get words associated with it in descending order
	# get ranks of word mentioned by user
	# score is sum of 1/rank for each word
	# e.g. "this is a blue cube"
	def associate_words_example(self, listOfPositiveWords, listOfNegativeWords, example):
		
		# classify this image and get associated words
		[isConfidentGuess, bestGuessWord, bestGuessObj, bestGuessMaxScore, wordMaxProabilityScores, wordProbabilityScores] = self.classify_example(example)

		# form a dictionary of score: word
		wordScoreDictionary = {}
		for word in wordMaxProabilityScores:
			if(wordMaxProabilityScores[word][2][wordMaxProabilityScores[word][1]] not in wordScoreDictionary.keys()):
			    wordScoreDictionary[wordMaxProabilityScores[word][2][wordMaxProabilityScores[word][1]]] = [word]
			else:
			    wordScoreDictionary[wordMaxProabilityScores[word][2][wordMaxProabilityScores[word][1]]].append(word)

		# now rank these in descending order
		rank = 0
		wordRanks = {}
		for wordScore in sorted(wordScoreDictionary.keys(),reverse=True):
			rank += 1
			for word in wordScoreDictionary[wordScore]:
			    wordRanks[word] = rank

		# compute total score based on ranks of words in list
		totalScore = 0
		for word in listOfPositiveWords:
			if(word in wordRanks):
				rank = wordRanks[word]
				# use float (true) division so the score keeps fractional credit
				totalScore += 1.0/rank

		for word in listOfNegativeWords:
			if(word in wordRanks):
				rank = wordRanks[word]
				# use float (true) division so the score keeps fractional credit
				totalScore -= 1.0/rank

		return [totalScore, wordRanks, wordScoreDictionary]
Exemplo n.º 52
0
class SGDLearner:
    def __init__(self, X_train, y_train, X_test, y_test, random_state, eta0, alpha):

        self.X_train = X_train
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.rng = random_state
        # work out how to use random state ( ie save)
        self.sgd = SGDClassifier(random_state=self.rng, fit_intercept=True)

        self.sgd.loss = "hinge"
        self.sgd.alpha = alpha
        self.sgd.learning_rate = "constant"
        self.sgd.eta0 = eta0

        # how to make these  controlled properties (from chaco)

        # E_part = np.NaN((self.T // self.update_period, ))

        self.ntrain, self.ncoef = self.X_train.shape

        self.wts = np.zeros((self.ntrain + 1, self.ncoef + 1))
        self.grad = np.zeros((self.ntrain + 1, self.ncoef + 1))
        self.reset()

    def reset(self):

        self.scores = []

        # turn into dataframe?
        self.mn_grad = []
        self.st_grad = []

        self.sgd.warm_start = False
        # reset learning ?does this reset learning rate time counter?
        self._iT = 0

        d = pd.Index([], name="timestep")
        self.data = pd.DataFrame(index=d, columns=["part", "train", "test"])

    def learn(self, learn_for, probe_every):
        """Train for learn_for and and store results probe_every"""

        ind = self.rng.randint(0, self.ntrain, learn_for)
        for time in range(0, learn_for, probe_every):
            self.sgd.partial_fit(
                self.X_train[ind[time : time + probe_every], :], self.y_train[ind[time : time + probe_every]], [0, 1]
            )

            self._iT += probe_every
            self.sgd.warm_start = False  # (not necessary? unless we use fit rather than partial)
            self.calc_grad()
            mn = self.grad.mean(axis=0) * 1e-6
            st = self.grad.std(axis=0) * 1e-6

            self.mn_grad.append(mn)
            self.st_grad.append(st)

            ind_part = self.rng.randint(0, self.ntrain, int(self.ntrain * 0.1))

            self.data.loc[self._iT, "part"] = self.sgd.score(self.X_train[ind_part, :], self.y_train[ind_part])
            self.data.loc[self._iT, "train"] = self.sgd.score(self.X_train, self.y_train)
            self.data.loc[self._iT, "test"] = self.sgd.score(self.X_test, self.y_test)

    def calc_grad(self):
        # now calculate current gradient variance
        # by using very low learning rate and calculate gradients
        eta0 = self.sgd.eta0
        self.sgd.eta0 = 1e-6 * eta0
        self.wts[0, :-1] = self.sgd.coef_
        self.wts[0, -1] = self.sgd.intercept_
        for im in range(self.ntrain):

            self.sgd.partial_fit(self.X_train[im, :].reshape((1, -1)), self.y_train[im].reshape((1,)), [0, 1])
            self.wts[im, :-1] = self.sgd.coef_
            self.wts[im, -1] = self.sgd.intercept_

        self.grad = np.diff(self.wts, axis=0)
        self.sgd.eta0 = eta0
        # restore the original learning rate

    def plot(self, ax_graph):
        if not (self.data.empty):
            for name, line in self.lines.iteritems():
                line.set_data(self.data.index.values, self.data[name])
            ax_graph.set_xlim(0, self.data.index.values[-1])
            plt.draw()  # update plot?
        else:
            ax_graph.cla()
            self.lines = {}
            ax_graph.set_xlabel("iter")
            ax_graph.set_ylabel("class")
            ax_graph.set_ylim(0, 1)
            colours = {"train": "blue", "part": "green", "test": "red"}
            for name, col in colours.iteritems():
                self.lines[name] = plt.Line2D([], [], color=col, label=name)
                ax_graph.add_line(self.lines[name])
            ax_graph.legend(loc="lower left")
Exemplo n.º 53
0
#ddir = 'E:/workspace/data/cdiscount/'
#wdir = 'C:/Users/ngaude/Documents/GitHub/kaggle/cdiscount/'
ddir = '/home/ngaude/workspace/data/cdiscount/'
wdir = '/home/ngaude/workspace/github/kaggle/cdiscount/'

f_itocat = ddir+'joblib/itocat'
(itocat1,cat1toi,itocat2,cat2toi,itocat3,cat3toi) = joblib.load(f_itocat)

(X,Y) = joblib.load(ddir+'joblib/XYneighbor')
Y=Y[:,2]
classes = np.unique(Y)

classifier = SGDClassifier(loss = 'hinge',n_jobs = 3,penalty='l2')

classifier.partial_fit(X,Y,classes = classes)

classifier.sparsify()

#
#nrows = 1000
#trainrows = Xtrain.shape[0]
#epochs = 5 * trainrows / nrows
#for i in range(epochs):
#    a = np.random.randint(trainrows,size=nrows)
#    Xi = Xtrain[a,:]
#    Yi = Ytrain[a]
#    print 'partial_fit',i,'/',epochs
#    classifier.partial_fit(Xi,Yi,classes = cat3toi.keys())
#
#print 'train',classifier.score(Xtrain[:30000],Ytrain[:30000])
		time1 = time.time()
		lda,train_bow = gensim_lda(train_array,num_topics,6,40)
		time2 = time.time()
		print 'lda time {}'.format(time2 - time1)

		"""
			SGD with Fisher score 
		"""
		classes = np.array([[c] for c in shuffled_train_classes])
		clf = SGDClassifier(shuffle=True)

		unique_classes = np.unique(classes)

		time1 = time.time()
		for i,score in enumerate(fisher_score(train_array,lda,num_topics,V,D_train)):
			clf.partial_fit(score, classes[i], unique_classes) 

		time2 = time.time()
		print 'sgd time {}'.format(time2 - time1)

		"""
			SGD with Fisher score 
		"""
		classes = np.array([[c] for c in test_classes])
		time1 = time.time()

		went_fine = 0

		for i,score in enumerate(fisher_score(test_array,lda,num_topics,V,D_test)):
			c = clf.predict(score) 
			if c[0] == test_classes[i]:
Exemplo n.º 55
0
    #0.00001 --> 0.813194   a/10
    #0.0001/20 --> 0.811584 a/20
    #0.0001/15 --> 0.813287 a/15 !!
    #0.0001/12-->0.807602 a/12
    #0.0001/17 --> 0.811538

    #0.814882 <-- l1_ratio 0.5 L1
    #0.819020 <-- l1_ratio 0.5 L1 average = true alpha = a/15 !!
    #0.817806 <-- l2 ratio 0.15 average = true alpha = a/15
    #0.818444 <-- l2 ratio 0.15 average = true alpha = a/100
    #0.818413 <-- l2 ratio 0.15 average = true alpha = a/50 0.819020
    count = 0
    for line in sys.stdin:
        #if count >= 5000:
        #    break
        line = line.strip()
        label, x_string = line.split(" ", 1)
        label = int(label)
        x_original = np.fromstring(x_string, sep=' ')
        x = transform(x_original) #using the kernel function 
        clf.partial_fit(x, [label], CLASSES)
        count += 1

    for x in clf.coef_[0]:
        print x,
#    print

#cat training_set.txt | python mapper.py | python reducer.py > r_weights.txt
#python evaluate.py r_weights.txt test_data.txt  test_label.txt /Users/Charles/Desktop/

Exemplo n.º 56
0
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
vect=HashingVectorizer(decode_error='ignore',
                       n_features=2**21,
                       preprocessor=None,
                       tokenizer=tokenizer)
clf=SGDClassifier(loss='log',random_state=1,n_iter=1)
doc_stream=stream_docs(path='/home/caofa/movie_data.csv')
import pyprind
pbar=pyprind.ProgBar(45)
classes=np.array([0,1])
for _ in range(45):
    X_train,y_train=get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train=vect.transform(X_train)
    clf.partial_fit(X_train,y_train,classes=classes)
    pbar.update()

X_test,y_test=get_minibatch(doc_stream, size=5000)
X_test=vect.transform(X_test)
clf=clf.partial_fit(X_test,y_test)

     sw = 1 + 4*chunk.is_booking
     chunk.drop(['cnt', 'hotel_cluster', 'is_booking'], axis=1, inplace=True)
     
     XN = csr_matrix(chunk[num_col].values)
     X = csr_matrix((chunk.shape[0], n_features))
     rows = np.arange(chunk.shape[0])
     for col in cat_col_all:
         dat = np.ones(chunk.shape[0])
         cols = chunk[col] % n_features
         X += csr_matrix((dat, (rows, cols)), shape=(chunk.shape[0], n_features))
     X = hstack((XN, X))
     book_indices = sw[sw > 1].index.tolist()
     X_test = csr_matrix(X)[book_indices]
     y_test = y[book_indices]
     
     clf.partial_fit(X, y, classes=np.arange(100), sample_weight=sw)
     #len([i for i in clf.coef_[1] if i != 0])
     #len([i for i in clf.coef_[1] if i > 0])
     #jb = [col for h in np.argsort(abs(clf.coef_[5])) for col in chunk.columns if (hash(col) % n_features) == h]
     
     #preds += np.vstack(tuple([clf.predict_proba(test.loc[i*chunksize:min((i+1)*chunksize,test.shape[0]),:]) for i in range(int(test.shape[0]/100000))]))
     #preds += clf.predict_proba(test)
     
     count = count + chunksize
     map5 = map5eval(clf.predict_proba(X_test), y_test)
     print('%d rows completed. MAP@5: %f' % (count, map5))
     if(count/chunksize == 200):
         break
 except Exception as e:
     #e = sys.exc_info()[0]
     print('Error: %s' % str(e))
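The fragment above relies on a map5eval helper that is not shown; a plausible sketch of what such a MAP@5 scorer computes (an assumption for illustration, not the author's code) is:

import numpy as np

def map5eval(pred_proba, y_true):
    """Mean average precision at 5 when each row has exactly one relevant class."""
    top5 = np.argsort(-pred_proba, axis=1)[:, :5]           # the 5 highest-probability classes per row
    hits = top5 == np.asarray(y_true).reshape(-1, 1)        # True where the true class appears in the top 5
    return (hits / np.arange(1.0, 6.0)).sum(axis=1).mean()  # credit 1/rank for the hit, then average over rows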
Exemplo n.º 58
0
def main(): 


#    opfc = 0
    
    print "###############################At Process Train Data by File########################################"
    # Get Train Data Header from Mega File
    fhead = open('/home/robbie/Hacking/Kaggle/ClickThroughRatePrediction-Avazu/Data/Raw/train.csv')
    fhead.seek(0) 
    headers = fhead.readline().rstrip().split(",")
#    if printflag == True: print headers
        
    # Go after all the Train Files
    alltrainfiles = os.listdir(trainpath)
#    testpred = np.zeros(40428967)                
    testpred = np.zeros(4577464)   
#    testpred = np.zeros(89999)
    loopcounter = 1                
    
    for atrainfile in alltrainfiles:

        trainY = []
        clickdata = []
        
        for line in open(trainpath + atrainfile, 'r'):
            
            random_line = line.rstrip().split(",")
            tdict = dict(zip(headers, random_line))
            tdict.pop('C14')
            tdict.pop('C17')
            tdict.pop('C20')
            trainY.append(int(tdict.pop('click')))  # cast to int so labels match classes=[0, 1] below
            tdict.pop('id')
            clickdata.append(tdict)
     
#                    if printflag == True: print tdict
            
        print "######################At Dict Vectorizer for file:::" , loopcounter, "#####################"
        
#                if printflag == True:  print clickdata
        vec = DictVectorizer()
        vcd = vec.fit_transform(clickdata).toarray()
        clickdata = []
        
#                print "###############################At Tree Classifer########################################"      
#                clf = RandomForestClassifier(n_estimators=5,n_jobs=-1)
#                clf.fit(vcd, trainY)
##                if printflag == True: print clf.predict(vcd) == trainY
##                print sum(int(x) for x in trainY)
##                print sum(clf.predict(vcd) == trainY)
##                print clf.oob_score_
        
        print "############################At PassiveAggressive Classifer###################################"      
        clf = SGDClassifier(penalty = 'l1', n_jobs=-1)
        clf.partial_fit(vcd, trainY,[0,1])
#                if printflag == True: print clf.predict(vcd) == trainY
#                print sum(int(x) for x in trainY)
#                print sum(clf.predict(vcd) == trainY)
#                print clf.oob_score_
        loopcounter = loopcounter + 1
                
    print "###############################Import Test Data########################################"      
    
    testheaders = [headers[0]] + headers[2:]
    
    alltestfiles = os.listdir(testpath)
    
    temptestpred = np.array([])
    idlist = []
    innercounter = 1 
    for atestfile in alltestfiles:
                    
        testclickdata = []
        
        for line in open(testpath + atestfile, 'r'):
            
            random_line = line.rstrip().split(",")
            tdict = dict(zip(testheaders, random_line))
            tdict.pop('C14')
            tdict.pop('C17')
            tdict.pop('C20')
            idlist.append(tdict.pop('id'))
            testclickdata.append(tdict)
            
#                                    if printflag == True: print tdict
        
        print "------------------Dict Vectorize Test Data--------------------------------"      
        tstvcd = vec.transform(testclickdata).toarray()
    #    testdata = importdata(projpath,testfilename)
#                                print tstvcd
        
        print "-------------------Predict Test Data::", innercounter, "-----------------------------" 
        temp = clf.predict_proba(tstvcd)
#                                print "Type is::" , type(temp) , "     Shape is:::" , temp.shape
#                                print temp100
        temptestpred = np.concatenate((temptestpred,temp[:,1]))
#                                print clf.classes_
#                                print temptestpred
#                                print temptestpred[:,1]
        
#                                fop = open(outputpath + 'output_' + atestfile + str(opfc), 'w+')
#                                fop.write(testpred)
#                                opfc = opfc + 1
        
        innercounter = innercounter + 1
    testpred = testpred + temptestpred
    
#    if loopcounter == 300:
#            break
    

#    testpred = testpred / len(alltrainfiles)
#    testpred = testpred / loopcounter
    
    fop = open(outputpath + 'submission.csv', 'w+')
    fop.writelines('id,click\n')
    for i in range(0,len(idlist)):
        fop.writelines(str(idlist[i]) + ',' + str(round(testpred[i],4)) + '\n')
    
    fop.close()