Example #1
def SGD_c_fit(X, y):
    clf = SGDClassifier(loss='log',
                        penalty='l2',
                        alpha=1e-3,
                        n_iter=5,
                        shuffle=True)
    return clf.fit(X, y)
Example #2
def classifyTestSamples(trainingFeatures, trainingCategories, testFeatures):
    clf = SGDClassifier()

    clf.fit(trainingFeatures, trainingCategories)
    predictedCategories = clf.predict(testFeatures)

    return predictedCategories
Example #3
    def fit(self, X, Y):
        # TODO: maybe scale training data so that its norm becomes 1?
        # http://scikit-learn.org/stable/modules/sgd.html#id1
        self.alpha = float(self.alpha)
        self.fit_intercept = bool(self.fit_intercept)
        self.n_iter = int(self.n_iter)
        if self.class_weight == "None":
            self.class_weight = None
        self.l1_ratio = float(self.l1_ratio)
        self.epsilon = float(self.epsilon)
        self.eta0 = float(self.eta0)
        self.power_t = float(self.power_t)

        self.estimator = SGDClassifier(loss=self.loss,
                                       penalty=self.penalty,
                                       alpha=self.alpha,
                                       fit_intercept=self.fit_intercept,
                                       n_iter=self.n_iter,
                                       learning_rate=self.learning_rate,
                                       class_weight=self.class_weight,
                                       l1_ratio=self.l1_ratio,
                                       epsilon=self.epsilon,
                                       eta0=self.eta0,
                                       power_t=self.power_t,
                                       shuffle=True,
                                       random_state=self.random_state)
        self.estimator.fit(X, Y)
        return self
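A hedged aside on the TODO above: the linked scikit-learn docs recommend scaling inputs before SGD. A minimal sketch of acting on it (the name fit_scaled_sgd is illustrative, and loss="log_loss" assumes a recent scikit-learn; older releases spell it loss="log"):

from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

def fit_scaled_sgd(X, y):
    # StandardScaler gives each feature zero mean and unit variance,
    # which SGD's learning-rate schedule is sensitive to.
    pipe = make_pipeline(StandardScaler(), SGDClassifier(loss="log_loss"))
    return pipe.fit(X, y)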
Example #4
    def iterative_fit(self, X, y, n_iter=1, refit=False):
        if refit:
            self.estimator = None

        if self.estimator is None:
            self.alpha = float(self.alpha)
            self.fit_intercept = self.fit_intercept == 'True'
            self.n_iter = int(self.n_iter)
            if self.class_weight == "None":
                self.class_weight = None
            self.l1_ratio = float(self.l1_ratio) if self.l1_ratio is not None else 0.15
            self.epsilon = float(self.epsilon) if self.epsilon is not None else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(self.power_t) if self.power_t is not None else 0.25
            self.average = self.average == 'True'
            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           n_iter=self.n_iter,
                                           learning_rate=self.learning_rate,
                                           class_weight=self.class_weight,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state)

        self.estimator.n_iter += n_iter
        self.estimator.fit(X, y)
        return self
Example #5
def training(processed_train_csv_file):
    processed_train_samples = pd.read_csv(processed_train_csv_file)
    processed_train_samples = processed_train_samples.replace([np.inf, -np.inf], np.nan)
    processed_train_samples = processed_train_samples.fillna(value=0)
    processed_train_samples_index_lst = processed_train_samples.index.tolist()
    random.shuffle(processed_train_samples_index_lst)
    shuffled_train_samples = processed_train_samples.loc[processed_train_samples_index_lst]
    col_names = shuffled_train_samples.columns.tolist()
    col_names.remove("booking_bool")
    features = shuffled_train_samples[col_names].values
    labels = shuffled_train_samples["booking_bool"].values

    print "Training Random Forest Classifier"
    rf_classifier = RandomForestClassifier(n_estimators=150, verbose=2, learning_rate=0.1, min_samples_split=10)
    rf_classifier.fit(features, labels)
    print "Saving the Random Forest Classifier"
    data_io.save_model(rf_classifier, model_name="rf_classifier.pkl")

    print "Training Gradient Boosting Classifier"
    gb_classifier = GradientBoostingClassifier(n_estimators=150, verbose=2, learning_rate=0.1, min_samples_split=10)
    gb_classifier.fit(features, labels)
    print "Saving the Gradient Boosting Classifier"
    data_io.save_model(gb_classifier, model_name="gb_classifier.pkl")

    print "Training SGD Classifier"
    sgd_classifier = SGDClassifier(loss="modifier_huber", verbose=2, n_jobs=-1)
    sgd_classifier.fit(features, labels)
    print "Saving the SGD Classifier"
    data_io.save_model(sgd_classifier, model_name="sgd_classifier.pkl")
Example #6
    def SGD(self, train_features, test_features):
        print("in SGD")
        self.train_features = train_features
        self.test_features = test_features
        scores = []
        submission = pd.DataFrame.from_dict({'id': test['Id']})
        SGD_file = 'SGD.pckl'
        SGD_model_pkl = open(SGD_file, 'wb')
        for class_name in class_names:
            train_target = train[class_name]
            classifier = SGDClassifier(loss='modified_huber',
                                       penalty='l2',
                                       alpha=0.001,
                                       random_state=42,
                                       max_iter=200,
                                       tol=0.20,
                                       learning_rate='optimal')

            cv_score = np.mean(
                cross_val_score(classifier,
                                train_features,
                                train_target,
                                cv=3,
                                scoring='roc_auc'))
            scores.append(cv_score)
            print('CV score for class {} is {}'.format(class_name, cv_score))

            classifier.fit(train_features, train_target)
            pickle.dump(classifier, SGD_model_pkl)
            submission[class_name] = classifier.predict_proba(test_features)[:, 1]

        print('Total CV score is {}'.format(np.mean(scores)))
        SGD_model_pkl.close()
        submission.to_csv('SGD.csv', index=False)
Example #7
    def fit(self, data, args):
        self.model = SGDClassifier(loss="log")

        with Timer() as t:
            self.model.fit(data.X_train, data.y_train)

        return t.interval
Example #8
 def ada_sd(self, n_iter, eta, random_state):
     self.ada = SGDClassifier(n_iter=n_iter,
                          eta0=eta,
                              random_state=random_state)
     self.ada.fit(
         self.X_train_std_uri.reshape(len(self.X_train_std_uri), 1),
         self.Y_train_category)
     self.y_pred = self.ada.predict(
         self.X_test_std_uri.reshape(len(self.X_test_std_uri), 1))
     return (self.Y_test_category != self.y_pred).sum(), accuracy_score(
         self.Y_test_category, self.y_pred), self.y_pred
Example #9
def test_label_binarizer_iris():
    lb = LabelBinarizer()
    Y = lb.fit_transform(iris.target)
    clfs = [SGDClassifier().fit(iris.data, Y[:, k])
            for k in range(len(lb.classes_))]
    Y_pred = np.array([clf.decision_function(iris.data) for clf in clfs]).T
    y_pred = lb.inverse_transform(Y_pred)
    accuracy = np.mean(iris.target == y_pred)
    y_pred2 = SGDClassifier().fit(iris.data, iris.target).predict(iris.data)
    accuracy2 = np.mean(iris.target == y_pred2)
    assert_almost_equal(accuracy, accuracy2)
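The test above hand-rolls one-vs-rest with LabelBinarizer. For comparison, a sketch of the same scheme through sklearn's OneVsRestClassifier (an assumption for illustration, not taken from the test itself):

from sklearn.datasets import load_iris
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier

iris = load_iris()
# One SGDClassifier is fitted per class, mirroring the per-column loop above.
ovr = OneVsRestClassifier(SGDClassifier(random_state=0))
y_pred = ovr.fit(iris.data, iris.target).predict(iris.data)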
Example #10
class MachineLearning:
    def __init__(self, Master_DF):
        self.Data_Frame = Master_DF

    def Encoder(self, df):
        encoder = LabelEncoder()
        print("Fitting")
        encoder.fit(df)
        return encoder.transform(df)

    def Perceptron_PreProcessing(self, x, y):
        X = self.Encoder(self.Data_Frame[x].factorize()[0])
        Y = self.Encoder(self.Data_Frame[y].factorize()[0])
        X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                            Y,
                                                            test_size=0.3,
                                                            random_state=0)
        sc = StandardScaler()
        sc.fit(X_train)
        X_train_std = sc.transform(X_train)
        X_test_std = sc.transform(X_test)
        return X_train_std, Y_train, X_test_std, Y_test

    def ppn_model(self, n_iter, eta0, random_state):
        self.X_train_std_uri, self.Y_train_category, self.X_test_std_uri, self.Y_test_category = self.Perceptron_PreProcessing(
            'uri', 'category')
        ppn = Perceptron(n_iter=n_iter, eta0=eta0, random_state=random_state)
        ppn.fit(self.X_train_std_uri.reshape(len(self.X_train_std_uri), 1),
                self.Y_train_category)
        y_pred = ppn.predict(
            self.X_test_std_uri.reshape(len(self.X_test_std_uri), 1))
        return (self.Y_test_category != y_pred).sum(), accuracy_score(
            self.Y_test_category, y_pred), y_pred

    def lr_model(self, c, random_state):
        lr = LogisticRegression(C=c, random_state=random_state)
        lr.fit(self.X_train_std_uri.reshape(len(self.X_train_std_uri), 1),
               self.Y_train_category)
        y_pred = lr.predict(
            self.X_test_std_uri.reshape(len(self.X_test_std_uri), 1))
        return (self.Y_test_category != y_pred).sum(), accuracy_score(
            self.Y_test_category, y_pred), y_pred

    def ada_sd(self, n_iter, eta, random_state):
        self.ada = SGDClassifier(n_iter=n_iter,
                                 eta0=eta,
                                 random_state=random_state)
        self.ada.fit(
            self.X_train_std_uri.reshape(len(self.X_train_std_uri), 1),
            self.Y_train_category)
        self.y_pred = self.ada.predict(
            self.X_test_std_uri.reshape(len(self.X_test_std_uri), 1))
        return (self.Y_test_category != self.y_pred).sum(), accuracy_score(
            self.Y_test_category, self.y_pred), self.y_pred
Example #11
def do_training(processed_train_csv_file):
    ## Processed train samples reading
    # read saved processed train samples from the given csv file
    processed_train_samples = pd.read_csv(processed_train_csv_file)

    # inf to nan
    processed_train_samples = processed_train_samples.replace([np.inf, -np.inf], np.nan)
    # nan to 0
    processed_train_samples = processed_train_samples.fillna(value=0)

    processed_train_samples_index_lst = processed_train_samples.index.tolist()
    # The samples were sorted earlier; shuffling them here gives better results
    random.shuffle(processed_train_samples_index_lst)

    # organize new train samples and targets
    shuffled_train_samples = processed_train_samples.loc[processed_train_samples_index_lst]
    col_names = shuffled_train_samples.columns.tolist()
    col_names.remove("booking_bool")
    features = shuffled_train_samples[col_names].values
    labels = shuffled_train_samples['booking_bool'].values

    # Model training
    # 1 Random Forest Classifier

    print("Training Random Forest Classifier")
    rf_classifier = RandomForestClassifier(n_estimators=150,
                                           verbose=2,
                                           n_jobs=-1,
                                           min_samples_split=10)
    rf_classifier.fit(features, labels)

    print("Saving the Random Forest Classifier")
    data_io.save_model(rf_classifier, model_name='rf_classifier.pkl')

    # 2 Gradient Boosting Classifier
    print("Gradient Boosting  Classifier")
    gb_classifier = GradientBoostingClassifier(n_estimators=150,
                                               verbose=2,
                                               learning_rate=0.1,
                                               min_samples_split=10)
    gb_classifier.fit(features, labels)
    print("Saving the Gradient Boosting  Classifier")
    data_io.save_model(gb_classifier, model_name='gb_classifier.pkl')

    # 3 SGD Classifier
    print("SGD Classifier")
    sgd_classifier = SGDClassifier(loss="modified_huber", verbose=2,
                                   n_jobs=-1)
    sgd_classifier.fit(features, labels)

    print("saved the SGD Classifier")
    data_io.save_model(sgd_classifier, model_name='sgd_classifier.pkl')
Example #12
def train_model(texts, points, num_classes, model_dir, text_encoding='utf-8'):
	""" Given an iterable of (text, lat, lon) items, cluster the points into #num_classes and use
	them as labels, then extract unigram features, train a classifier and save it in models/model_name
	for future use. 

	Args:
	texts -- an iterable (e.g. a list) of texts e.g. ['this is the first text', 'this is the second text'].
	points -- an iterable (e.g. a list) of tuples in the form of (lat, lon) where coordinates are of type float e.g. [(1.2343, -10.239834),(5.634534, -12.47563)]
	num_classes -- the number of desired clusters/labels/classes of the model.
	model_dir -- the path of the directory within models/ where the model will be saved.
	"""
	
	if os.path.exists(model_dir):
		logging.error("Model directory " + model_dir + " already exists, please try another address.")
		sys.exit(-1)
	else:
		os.mkdir(model_dir)
	
	from sklearn.cluster import KMeans
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.linear_model.stochastic_gradient import SGDClassifier
	
	kmeans = KMeans(n_clusters=num_classes, random_state=0)
	points_arr = numpy.array(points)
	kmeans.fit_transform(points_arr)
	cluster_centers = kmeans.cluster_centers_
	sample_clusters = kmeans.labels_
	label_coordinate = {}
	for i in range(cluster_centers.shape[0]):
		lat, lon = cluster_centers[i, 0], cluster_centers[i, 1]
		label_coordinate[i] = (lat, lon)
	
	logging.info('extracting features from text...')
	vectorizer = TfidfVectorizer(encoding=text_encoding, stop_words='english', ngram_range=(1,1), max_df=0.5, min_df=0, binary=True, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True)
	X_train = vectorizer.fit_transform(texts)
	Y_train = sample_clusters
	vectorizer.stop_words_ = None
	logging.info('the number of samples is %d and the number of features is %d' % (X_train.shape[0], X_train.shape[1]))
	
	logging.info('training the classifier...')
	logging.warn('Note that alpha (regularisation strength) should be tuned based on the performance on validation data.')
	clf = SGDClassifier(loss='log', penalty='elasticnet', alpha=5e-5, l1_ratio=0.9, fit_intercept=True, n_iter=5, n_jobs=2, random_state=0, learning_rate="optimal")
	clf.fit(X_train, Y_train)
	clf.coef_ = csr_matrix(clf.coef_)
	
	logging.info('retrieving address of the given points using geopy (requires internet access).')
	coordinate_address = retrieve_location_from_coordinates(label_coordinate.values())

	logging.info('dumping the the vectorizer, clf (trained model), label_coordinates and coordinate_locations into pickle files in ' + model_dir)
	dump_model(clf, vectorizer, coordinate_address, label_coordinate, model_dir)
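A hypothetical call to train_model above (the texts and coordinates are invented for illustration; the run creates models/demo, needs internet access for the geopy lookup, and relies on dump_model from the surrounding module):

texts = ['first sample text', 'second sample text', 'third sample text']
points = [(40.71, -74.00), (51.50, -0.12), (40.73, -73.99)]
train_model(texts, points, 2, 'models/demo')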
Example #13
def main():
    raw_data = np.loadtxt(sys.stdin)
    samples = raw_data.shape[0]
    X = np.empty((samples, 401))
    Y = raw_data[:,0]
    for i in range(samples):
        X[i] = transform(raw_data[i, 1:])
    clf = SGDClassifier(loss=_LOSS, penalty=_PENALTY,
                        fit_intercept=False, shuffle=True,
                        alpha=_REGULARIZATION)
    clf.fit(X, Y)
    sys.stdout.write('%s\t' % _KEY)
    for coeff in clf.coef_.flatten():
        sys.stdout.write("%f " % coeff)
    sys.stdout.write("\n")
Example #14
def train():
    X = df_train.drop(['cust_id', 'y', 'cust_group'], axis=1, inplace=False)
    y = df_train['y']
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)
    print(X_train.shape, X_test.shape)

    names = [
        "KNeighborsClassifier", "SGDClassifier", "LogisticRegression",
        "RandomForestClassifier", "GradientBoostingClassifier",
        "AdaBoostClassifier", "DecisionTreeClassifier"
    ]
    classifiers = [
        KNeighborsClassifier(),
        SGDClassifier(loss='log'),
        LogisticRegression(C=4.0),
        RandomForestClassifier(oob_score=True),
        GradientBoostingClassifier(),
        AdaBoostClassifier(),
        DecisionTreeClassifier(max_depth=3)
    ]
    for name, clf in zip(names, classifiers):
        print("====" * 20)
        print("traing..." + name)
        clf.fit(X_train, y_train)

        prob = clf.predict_proba(X_test).astype(float)
        # pred = np.argmax(prob, axis=1)
        print("mean_squared_error:", mean_squared_error(y_test, prob[:, 1]))
        print("log_loss:", log_loss(y_test.astype(int), prob[:, 1]))
        print("roc_auc_score:", roc_auc_score(y_test, prob[:, 1]))
Example #15
def stochastic_descent(xtrain, ytrain, xtest):
    clf = SGDClassifier(loss="hinge",
                        penalty="l2",
                        max_iter=10,
                        random_state=42,
                        alpha=1e-3,
                        tol=None)
    print("SGD Fitting")
    clf.fit(xtrain, ytrain)
    # Saving the model with pickle
    with open(base_dir + "Model", 'wb') as f:
        pickle.dump(clf, f)
    print("SGD Predicting")
    ytest = clf.predict(xtest)

    return ytest
Example #16
 def fit(self, X, y=None):
     self._sklearn_model = SKLModel(**self._hyperparams)
     if (y is not None):
         self._sklearn_model.fit(X, y)
     else:
         self._sklearn_model.fit(X)
     return self
Example #17
def begin_test(train_x, test_x, train_y, test_y):
    x = train_x + test_x
    y = train_y + test_y

    clf1 = LinearRegression()
    clf2 = LogisticRegression()
    clf3 = SGDClassifier()
    clf4 = SVC()
    clf5 = KNeighborsClassifier()
    clf6 = MLPClassifier()
    clf7 = DecisionTreeClassifier()
    clf8 = MultinomialNB()
    # clf1.fit(train_x, train_y)
    # y_pred = clf1.predict(test_x)
    # print("LinearRegressionAccuracy   ", accuracy_score(test_y, y_pred.round()))
    eclf = VotingClassifier(
        estimators=[('logr', clf2), ('sgd', clf3), ('svm', clf4), ('kn', clf5), ('nn', clf6), ('dt', clf7)],
        voting='hard')

    for label, clf in zip(
            ['LogisticRegressionClassifier', 'SGDClassifierClassifier', 'SVCClassifier',
             'NearestNeighbourClassifier', 'NeuralNetworkClassifier', 'DecisionTreeClassifier',
             'MultinomialNB', 'EnsembleClassifier'],
            [clf2, clf3, clf4, clf5, clf6, clf7, clf8, eclf]):
        scores = cross_val_score(clf, x, y, cv=10, scoring='accuracy')
        f_measure = cross_val_score(clf, x, y, cv=10, scoring='f1')
        # print(scores)
        print(label, "Accuracy:  ", scores.mean(), "+/- ", scores.std())
        print(label, "F-measure:  ", f_measure.mean())
Example #18
    def initialize_algorithm(self, hyperparameters):
        from sklearn.linear_model.stochastic_gradient import SGDClassifier
        hyperparameters = self.initialize_hyperparameters(hyperparameters)
        sgd = SGDClassifier(
                            loss=hyperparameters["loss"],
                            penalty=hyperparameters["penalty"],
                            alpha=float(hyperparameters["alpha"]),
                            fit_intercept=bool(hyperparameters["fit_intercept"]),
                            n_iter=int(hyperparameters["n_iter"]),
                            learning_rate=hyperparameters["learning_rate"],
                            l1_ratio=float(hyperparameters["l1_ratio"]),
                            epsilon=float(hyperparameters["epsilon"]),
                            eta0=float(hyperparameters["eta0"]),
                            power_t=float(hyperparameters["power_t"]),
                            shuffle=True,
                            average=bool(hyperparameters["average"]),
                            random_state=None)
        # sgd = self.algorithm(
        #                     loss=hyperparameters["loss"],
        #                     penalty=hyperparameters["penalty"],
        #                     alpha=hyperparameters["alpha"],
        #                     fit_intercept=hyperparameters["fit_intercept"],
        #                     n_iter=hyperparameters["n_iter"],
        #                     learning_rate=hyperparameters["learning_rate"],
        #                     l1_ratio=hyperparameters["l1_ratio"],
        #                     epsilon=hyperparameters["epsilon"],
        #                     eta0=hyperparameters["eta0"],
        #                     power_t=hyperparameters["power_t"],
        #                     average=hyperparameters["average"],
        #                     random_state=None)

        return (self.get_full_name(), sgd)
Example #19
    def iterative_fit(self, X, y, n_iter=1, refit=False, sample_weight=None):
        from sklearn.linear_model.stochastic_gradient import SGDClassifier

        if refit:
            self.estimator = None

        if self.estimator is None:
            self._iterations = 0

            self.alpha = float(self.alpha)
            self.fit_intercept = self.fit_intercept == 'True'
            self.n_iter = int(self.n_iter)
            self.l1_ratio = float(
                self.l1_ratio) if self.l1_ratio is not None else 0.15
            self.epsilon = float(
                self.epsilon) if self.epsilon is not None else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(
                self.power_t) if self.power_t is not None else 0.25
            self.average = self.average == 'True'

            self.estimator = SGDClassifier(
                loss=self.loss,
                penalty=self.penalty,
                alpha=self.alpha,
                fit_intercept=self.fit_intercept,
                n_iter=1,
                learning_rate=self.learning_rate,
                l1_ratio=self.l1_ratio,
                epsilon=self.epsilon,
                eta0=self.eta0,
                power_t=self.power_t,
                shuffle=True,
                average=self.average,
                random_state=self.random_state,
            )

        self.estimator.n_iter = n_iter
        self.estimator.partial_fit(X,
                                   y,
                                   classes=np.unique(y),
                                   sample_weight=sample_weight)

        if self._iterations >= self.n_iter:
            self.fully_fit_ = True
        self._iterations += n_iter
        return self
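Example #19 drives SGD one pass at a time through partial_fit. A standalone sketch of that pattern on synthetic data (not from the original project; note that classes must be supplied on the first call):

import numpy as np
from sklearn.linear_model import SGDClassifier

rng = np.random.RandomState(0)
X = rng.randn(100, 5)
y = (X[:, 0] > 0).astype(int)

clf = SGDClassifier(random_state=0)
for _ in range(5):  # one pass over the data per call
    clf.partial_fit(X, y, classes=np.unique(y))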
Example #20
 def CPLELearningWrapper(X_train, y_train, X_test):
     from frameworks.CPLELearning import CPLELearningModel
     #clf = RandomForestClassifier()
     from sklearn.linear_model.stochastic_gradient import SGDClassifier
     clf = SGDClassifier(loss='log', penalty='l1')
     ssmodel = CPLELearningModel(clf)
     newlabels = np.concatenate((np.array(y_train), -np.ones(len(X_test))))
     ssmodel.fit(np.concatenate((X_train, X_test)), newlabels)
     return ssmodel.predict(X_test)
Example #21
def demo(instances=2000):
    """ _test_comparison_prequential
    
    This demo will test a prequential evaluation when more than one learner is 
    passed, which makes it a comparison task.
    
    Parameters
    ----------
    instances: int
        The evaluation's maximum number of instances.
     
    """
    # Stream setup
    stream = FileStream("../datasets/covtype.csv", -1, 1)
    # stream = SEAGenerator(classification_function=2, sample_seed=53432, balance_classes=False)
    stream.prepare_for_use()
    # Setup the classifier
    clf = SGDClassifier()
    # classifier = KNNAdwin(k=8, max_window_size=2000,leaf_size=40, categorical_list=None)
    # classifier = OzaBaggingAdwin(h=KNN(k=8, max_window_size=2000, leaf_size=30, categorical_list=None))
    clf_one = KNNAdwin(k=8, max_window_size=1000, leaf_size=30)
    # clf_two = KNN(k=8, max_window_size=1000, leaf_size=30)
    # clf_two = LeverageBagging(h=KNN(), ensemble_length=2)

    t_one = OneHotToCategorical([[10, 11, 12, 13],
                                 [
                                     14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                                     24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
                                     34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
                                     44, 45, 46, 47, 48, 49, 50, 51, 52, 53
                                 ]])
    # t_two = OneHotToCategorical([[10, 11, 12, 13],
    #                        [14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
    #                        36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53]])

    pipe_one = Pipeline([('one_hot_to_categorical', t_one), ('KNN', clf_one)])
    # pipe_two = Pipeline([('one_hot_to_categorical', t_two), ('KNN', clf_two)])

    classifier = [clf, pipe_one]
    # classifier = SGDRegressor()
    # classifier = PerceptronMask()

    # Setup the pipeline
    # pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    evaluator = EvaluatePrequential(pretrain_size=2000,
                                    output_file='teste.csv',
                                    max_samples=instances,
                                    batch_size=1,
                                    n_wait=200,
                                    max_time=1000,
                                    show_plot=True,
                                    metrics=['performance', 'kappa_t'])

    # Evaluate
    evaluator.evaluate(stream=stream, model=classifier)
Example #22
class SGDClassifierImpl():

    def __init__(self, loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=None, tol=None, shuffle=True, verbose=0, epsilon=0.1, n_jobs=None, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, class_weight='balanced', warm_start=False, average=False):
        self._hyperparams = {
            'loss': loss,
            'penalty': penalty,
            'alpha': alpha,
            'l1_ratio': l1_ratio,
            'fit_intercept': fit_intercept,
            'max_iter': max_iter,
            'tol': tol,
            'shuffle': shuffle,
            'verbose': verbose,
            'epsilon': epsilon,
            'n_jobs': n_jobs,
            'random_state': random_state,
            'learning_rate': learning_rate,
            'eta0': eta0,
            'power_t': power_t,
            'early_stopping': early_stopping,
            'validation_fraction': validation_fraction,
            'n_iter_no_change': n_iter_no_change,
            'class_weight': class_weight,
            'warm_start': warm_start,
            'average': average}
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if (y is not None):
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)

    def partial_fit(self, X, y=None, classes=None):
        if not hasattr(self, "_wrapped_model"):
            self._wrapped_model = SKLModel(**self._hyperparams)
        self._wrapped_model.partial_fit(X, y, classes=classes)
        return self
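One detail the wrapper above glosses over: SGDClassifier only exposes predict_proba for probabilistic losses, so calling it on the default loss='hinge' raises. A small sketch with toy data:

import numpy as np
from sklearn.linear_model import SGDClassifier

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0, 0, 1, 1])

clf = SGDClassifier(loss="modified_huber").fit(X, y)
proba = clf.predict_proba(X)  # only valid for log / modified_huber losses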
Example #23
def tune_params(X, y):
    # param_test2 = {'alpha': [0.0001, 0.00001, 0.00002]}  # 10
    param_test3 = {'n_iter': range(10, 100, 10)}  # 10
    gsearch1 = GridSearchCV(estimator=SGDClassifier(random_state=10,
                                                    n_iter=50),
                            param_grid=param_test3,
                            scoring='accuracy',
                            cv=5)
    gsearch1.fit(X, y)
    print(gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_)
Example #24
def get_models():
    models = dict()
    models['NN'] = MLPClassifier(solver='lbfgs',
                                 alpha=1e-5,
                                 hidden_layer_sizes=(40, 10),
                                 random_state=1)
    models['sgdc'] = SGDClassifier(loss='hinge',
                                   penalty='l2',
                                   alpha=1e-3,
                                   n_iter=5,
                                   random_state=42)
    return models
Example #25
    def __init__(self, positive_class):
        # Create an online classifier i.e. supporting `partial_fit()`
        self.classifier = SGDClassifier(loss = 'log')

        # Here we propose to learn a binary classification of the positive class
        # and all other documents
        self.positive_class = positive_class

        # structure to track accuracy history
        self.stats = {'n_train': 0, 'n_train_pos': 0, 'accuracy': 0.0, 
            'accuracy_history': [(0, 0)], 't0': time.time(), 
            'runtime_history': [(0, 0)]}
Example #26
def online_model():
    vectorizer = HashingVectorizer(preprocessor=preprocessor,
                                   tokenizer=lemmatizer,
                                   alternate_sign=False,
                                   ngram_range=(1, 2),
                                   stop_words=STOP_WORD_lemma)

    clf = SGDClassifier(loss='log', max_iter=5)

    pipe = OnlinePipeline([('vectorizer', vectorizer), ('classifier', clf)])

    return pipe
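The online_model pipeline above pairs HashingVectorizer with SGD because the hasher is stateless, so each minibatch can be vectorized independently. A minimal sketch of that property (toy batches; loss="log_loss" assumes a recent scikit-learn, older releases use "log"):

from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

vec = HashingVectorizer(alternate_sign=False)
clf = SGDClassifier(loss="log_loss")

for batch_texts, batch_y in [(["good movie"], [1]), (["bad movie"], [0])]:
    X = vec.transform(batch_texts)  # no fit step: hashing needs no vocabulary
    clf.partial_fit(X, batch_y, classes=[0, 1])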
Example #27
    def train(self, dataset):
        """Train the model with a dataset

        Args:
            dataset (list): List of training files
        """
        # Get the original training set
        training_set = self.model["algo"].training_set

        # Append the new data to it
        for text in dataset:
            self.logger.debug("Processing " + text.filename + "...")
            unigrams = Unigrams(
                join(
                    self.config["root"],
                    self.config["dirs"]["models_root"],
                    self.config["dirs"]["models"]["inline"],
                    self.config["models"]["inline"]["unigrams"],
                ))

            for p in text.text:
                for line in p:
                    if line.grade % 5 != 0:  # Unclassified lines are useless for the training
                        continue

                    f = MachineLearningFeatures()
                    features = f.extract_features(line, unigrams.ngrams,
                                                  text.stats)
                    result = int(line.grade / 5)

                    training_set["features"].append(features)
                    training_set["results"].append(result)

        self.logger.debug("Saving training set...")
        save(
            training_set,
            join(self.config["dirs"]["models_root"],
                 self.config["dirs"]["models"]["learning"],
                 self.config["models"]["learning"]["training_set"]))

        self.logger.debug("Training model...")
        ml_classifier = SGDClassifier(loss="log", class_weight="auto")
        self.model["algo"].set_classifier(ml_classifier)
        self.model["algo"].set_training_set(training_set["features"],
                                            training_set["results"])
        self.model["algo"].train()

        save(
            self.model["algo"].classifier,
            join(self.config["dirs"]["models_root"],
                 self.config["dirs"]["models"]["learning"],
                 self.config["models"]["learning"]["classifier"]))
Example #28
def demo():

    # The classifier we will use (other options: SAMKNN, LeverageBagging, SGD)
    h = [HoeffdingTree(), SAMKNN(), LeverageBagging(), SGDClassifier()]

    # Demo 1 -- plot should not fail
    demo_parameterized(h)

    # Demo 2 -- csv output should look nice
    demo_parameterized(h, "sea_stream.csv", False)

    # Demo 3 -- should not give "'NoneType' object is not iterable" error
    demo_parameterized(h, "covtype.csv", False)
Example #29
def withoutPipeline(x_train, x_test, y_train, y_test):
    scaler = StandardScaler().fit(x_train) # for each x, (x - mean(all x))/std. dev. of x
                                           # this step computes the mean and std. dev.
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)
    clfer = SGDClassifier()
    clfer.fit(x_train, y_train) # this will try to separate the three classes based
                                # on the two features we gave it. Hence, we will get
                                # back three lines. I.e., three sets of coefficients
                                # and three intercepts
    if(DEBUG):
        #print clfer.coef_
        #print clfer.intercept_
        #print clfer.predict(scaler.transform([[4.7, 3.1]]))
        #print clfer.decision_function(scaler.transform([[4.7, 3.1]])) # the algorithm evaluates distance from all three
                                                                      # lines and picks the largest one (in this case [0])
        pass

    # validate results
    y_predict_train = clfer.predict(x_train)
    print "% Correct results on training set:"
    print metrics.accuracy_score(y_train, y_predict_train)
    y_predict_test = clfer.predict(x_test)
    print "\n% Correct results on testing set:"
    print metrics.accuracy_score(y_test, y_predict_test)
    # Understanding the classification report:
    # Precision: TP/(TP + FP) - ideal 1 - all instances reported as x were x. In other words,
    #                                     there were no instances reported as x that were NOT x
    # Recall:    TP/(TP + FN) - ideal 1 - all instances OF x were reported as x
    # Although, accuracy does not appear in the report, it is important to know what it means:
    # Accuracy: (TP + TN) / (TP + TN + FP + FN)
    print "\nClassification Report:"
    print metrics.classification_report(y_test, y_predict_test)
    # Understanding the confusion matrix
    # how many of class i were predicted as j
    # ideal. an Identity matrix
    print "Confusion Matrix:"
    print metrics.confusion_matrix(y_test, y_predict_test)
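To make the precision/recall comments in Example #29 concrete, a tiny worked example with made-up labels:

from sklearn import metrics

y_true = [0, 0, 1, 1, 1]
y_pred = [0, 1, 1, 1, 0]
print(metrics.accuracy_score(y_true, y_pred))    # (TP + TN) / total = 3/5
print(metrics.classification_report(y_true, y_pred))
print(metrics.confusion_matrix(y_true, y_pred))  # row i: true class i, column j: predicted as j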
Example #30
 def __init__(self,
              loss='hinge',
              penalty='l2',
              alpha=0.0001,
              l1_ratio=0.15,
              fit_intercept=True,
              max_iter=None,
              tol=None,
              shuffle=True,
              verbose=0,
              epsilon=0.1,
              n_jobs=None,
              random_state=None,
              learning_rate='optimal',
              eta0=0.0,
              power_t=0.5,
              early_stopping=False,
              validation_fraction=0.1,
              n_iter_no_change=5,
              class_weight='balanced',
              warm_start=False,
              average=False,
              n_iter=None):
     self._hyperparams = {
         'loss': loss,
         'penalty': penalty,
         'alpha': alpha,
         'l1_ratio': l1_ratio,
         'fit_intercept': fit_intercept,
         'max_iter': max_iter,
         'tol': tol,
         'shuffle': shuffle,
         'verbose': verbose,
         'epsilon': epsilon,
         'n_jobs': n_jobs,
         'random_state': random_state,
         'learning_rate': learning_rate,
         'eta0': eta0,
         'power_t': power_t,
         'early_stopping': early_stopping,
         'validation_fraction': validation_fraction,
         'n_iter_no_change': n_iter_no_change,
         'class_weight': class_weight,
         'warm_start': warm_start,
         'average': average,
         'n_iter': n_iter
     }
     self._wrapped_model = SKLModel(**self._hyperparams)
Example #31
def demo(output_file=None, instances=40000):
    """ _test_holdout
    
    This demo runs a holdout evaluation task with one learner. The default
    stream is a WaveformGenerator. The default learner is an SGDClassifier,
    which is inserted into a Pipeline structure. All the default values can
    be changed by uncommenting/commenting the code below.
    
    Parameters
    ----------
    output_file: string
        The name of the csv output file
    
    instances: int
        The evaluation's max number of instances
         
    """
    # Setup the File Stream
    #opt = FileOption("FILE", "OPT_NAME", "../datasets/covtype.csv", "CSV", False)
    #stream = FileStream(opt, -1, 1)
    stream = WaveformGenerator()
    stream.prepare_for_use()

    # Setup the classifier
    classifier = SGDClassifier()
    #classifier = PassiveAggressiveClassifier()
    #classifier = SGDRegressor()
    #classifier = PerceptronMask()

    # Setup the pipeline
    pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    eval = EvaluateHoldout(pretrain_size=10000,
                           test_size=2000,
                           dynamic_test_set=True,
                           max_instances=instances,
                           batch_size=1,
                           n_wait=15000,
                           max_time=1000,
                           output_file=output_file,
                           task_type='classification',
                           show_plot=True,
                           plot_options=['kappa', 'kappa_t', 'performance'])

    # Evaluate
    eval.eval(stream=stream, classifier=pipe)
Example #32
def demo(output_file=None, instances=40000):
    """ _test_comparison_holdout
    
    This demo will test a holdout evaluation task when more than one learner is 
    evaluated, which makes it a comparison task. 
    
    Parameters
    ----------
    output_file: string, optional
        If passed this parameter indicates the output file name. If left blank, 
        no output file will be generated.
    
    instances: int (Default: 40000)
        The evaluation's maximum number of instances.
    
    """
    # Setup the File Stream
    # opt = FileOption("FILE", "OPT_NAME", "../datasets/covtype.csv", "CSV", False)
    # stream = FileStream(opt, -1, 1)
    stream = WaveformGenerator()
    stream.prepare_for_use()

    # Setup the classifier
    clf_one = SGDClassifier()
    clf_two = KNNAdwin(k=8, max_window_size=2000)
    # classifier = PassiveAggressiveClassifier()
    # classifier = SGDRegressor()
    # classifier = PerceptronMask()

    # Setup the pipeline
    classifier = [clf_one, clf_two]

    # Setup the evaluator
    evaluator = EvaluateHoldout(pretrain_size=2000,
                                test_size=2000,
                                dynamic_test_set=True,
                                max_instances=instances,
                                batch_size=1,
                                n_wait=5000,
                                max_time=1000,
                                output_file=output_file,
                                task_type='classification',
                                show_plot=True,
                                plot_options=['kappa'])

    # Evaluate
    evaluator.eval(stream=stream, classifier=classifier)
Example #33
def demo(output_file=None, instances=40000):
    """ _test_prequential_mol

    This demo shows the evaluation process of a MOL classifier, initialized 
    with sklearn's SGDClassifier.

    Parameters
    ----------
    output_file: string
        The name of the csv output file

    instances: int
        The evaluation's max number of instances

    """
    # Setup the File Stream
    #opt = FileOption("FILE", "OPT_NAME", "../datasets/music.csv", "CSV", False)
    #stream = FileStream(opt, 0, 6)
    stream = MultilabelGenerator(n_samples=instances)
    #stream = WaveformGenerator()
    stream.prepare_for_use()

    # Setup the classifier
    classifier = MultiOutputLearner(SGDClassifier(n_iter=100))
    #classifier = SGDClassifier()
    #classifier = PassiveAggressiveClassifier()
    #classifier = SGDRegressor()
    #classifier = PerceptronMask()

    # Setup the pipeline
    pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    eval = EvaluatePrequential(
        pretrain_size=5000,
        max_instances=instances - 10000,
        batch_size=1,
        n_wait=200,
        max_time=1000,
        output_file=output_file,
        task_type='multi_output',
        show_plot=True,
        plot_options=['hamming_score', 'j_index', 'exact_match'])

    # Evaluate
    eval.eval(stream=stream, classifier=pipe)
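For readers without scikit-multiflow, a rough standalone analogue of the MultiOutputLearner wrapping above, using sklearn's MultiOutputClassifier on synthetic labels (an assumption for illustration, not the streaming API used in the demo):

import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.multioutput import MultiOutputClassifier

rng = np.random.RandomState(0)
X = rng.randn(100, 4)
Y = (X[:, :2] > 0).astype(int)  # two binary labels per sample

mol = MultiOutputClassifier(SGDClassifier(max_iter=100)).fit(X, Y)
pred = mol.predict(X)  # one fitted SGDClassifier per output column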
Example #34
def begin_test(train_x, test_x, train_y, test_y):
    x = train_x + test_x
    y = train_y + test_y

    clf1 = LinearRegression()
    clf2 = LogisticRegression()
    clf3 = SGDClassifier()
    clf4 = SVC()
    clf5 = KNeighborsClassifier()
    clf6 = MLPClassifier()
    clf7 = DecisionTreeClassifier()
    clf8 = MultinomialNB()
    # clf1.fit(train_x, train_y)
    # y_pred = clf1.predict(test_x)
    # print("LinearRegressionAccuracy   ", accuracy_score(test_y, y_pred.round()))
    eclf = VotingClassifier(estimators=[('logr', clf2), ('sgd', clf3),
                                        ('svm', clf4), ('kn', clf5),
                                        ('nn', clf6), ('dt', clf7)],
                            voting='hard')

    # for label, clf in zip(
    #         ['LogisticRegressionClassifier', 'SGDClassifierClassifier', 'SVCClassifier',
    #          'NearestNeighbourClassifier', 'NeuralNetworkClassifier', 'DecisionTreeClassifier',
    #          'MultinomialNB', 'EnsembleClassifier'],
    #         [clf2, clf3, clf4, clf5, clf6, clf7, clf8, eclf]):

    arr = [[16, 54], [18, 45], [23, 54], [33, 54], [37, 45]]

    for k, j in arr:
        clf = DecisionTreeClassifier(splitter='best',
                                     max_depth=j,
                                     min_samples_split=k)
        clf.fit(train_x, train_y)
        y_pred = clf.predict(test_x)
        correct_count = 0
        for i in range(len(y_pred)):
            if (y_pred[i] == test_y[i]):
                correct_count += 1
        # if(correct_count/len(y_pred)>=0.74):
        print(
            str(k) + "    " + str(j) + "   " +
            str(correct_count / len(y_pred)))
Example #35
def demo(output_file=None, instances=40000):
    """ _test_prequential_mol

    This demo shows the evaluation process of a MOL classifier, initialized 
    with sklearn's SGDClassifier.

    Parameters
    ----------
    output_file: string
        The name of the csv output file

    instances: int
        The evaluation's max number of instances

    """
    # Setup the File Stream
    stream = MultilabelGenerator(n_samples=instances)
    # stream = WaveformGenerator()

    # Setup the classifier
    classifier = MultiOutputLearner(SGDClassifier(n_iter=100))
    # classifier = SGDClassifier()
    # classifier = PassiveAggressiveClassifier()
    # classifier = SGDRegressor()
    # classifier = PerceptronMask()

    # Setup the pipeline
    pipe = Pipeline([('Classifier', classifier)])

    # Setup the evaluator
    evaluator = EvaluatePrequential(
        pretrain_size=5000,
        max_samples=instances - 10000,
        batch_size=1,
        n_wait=200,
        max_time=1000,
        output_file=output_file,
        show_plot=True,
        metrics=['hamming_score', 'j_index', 'exact_match'])

    # Evaluate
    evaluator.evaluate(stream=stream, model=pipe)
Example #36
        tfile.extractall(self.data_path)
        print("done !")

    def iterdocs(self):
        """Iterate doc by doc, yield a dict."""
        for root, _dirnames, filenames in os.walk(self.data_path):
            for filename in fnmatch.filter(filenames, '*.sgm'):
                path = os.path.join(root, filename)
                parser = ReutersParser()
                for doc in parser.parse(open(path)):
                    yield doc


hasher = HashingVectorizer(decode_error='ignore', n_features=2 ** 18)

classifier = SGDClassifier()

data_streamer = ReutersStreamReader('reuters').iterdocs()

all_classes = np.array([0, 1])
positive_class = 'acq'


def get_minibatch(doc_iter, size, transformer=hasher,
                  pos_class=positive_class):
    """Extract a minibatch of examples, return a tuple X, y.

    Note: size is before excluding invalid docs with no topics assigned.

    """
    data = [('{title}\n\n{body}'.format(**doc), pos_class in doc['topics'])
Example #37
cancer = fetch_mldata("Lung cancer (Ontario)")
X = cancer.target.T
ytrue = np.copy(cancer.data).flatten()
ytrue[ytrue > 0] = 1

# label a few points
labeled_N = 4
ys = np.array([-1] * len(ytrue))  # -1 denotes unlabeled point
random_labeled_points = random.sample(list(np.where(ytrue == 0)[0]), labeled_N // 2) + random.sample(
    list(np.where(ytrue == 1)[0]), labeled_N // 2
)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
# basemodel = WQDA() # weighted Quadratic Discriminant Analysis
basemodel = SGDClassifier(loss="log", penalty="l1")  # scikit logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
print "supervised log.reg. score", basemodel.score(X, ytrue)

# fast (but naive, unsafe) self learning framework
ssmodel = SelfLearningModel(basemodel)
ssmodel.fit(X, ys)
print "self-learning log.reg. score", ssmodel.score(X, ytrue)

# semi-supervised score (base model has to be able to take weighted samples)
ssmodel = CPLELearningModel(basemodel)
ssmodel.fit(X, ys)
print "CPLE semi-supervised log.reg. score", ssmodel.score(X, ytrue)

# semi-supervised score, WQDA model
ssmodel = CPLELearningModel(WQDA(), predict_from_probabilities=True)  # weighted Quadratic Discriminant Analysis
Example #38
class SGD(ParamSklearnClassificationAlgorithm):
    def __init__(self, loss, penalty, alpha, fit_intercept, n_iter,
                 learning_rate, class_weight=None, l1_ratio=0.15, epsilon=0.1,
                 eta0=0.01, power_t=0.5, average=False, random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        self.class_weight = class_weight
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None

    def fit(self, X, y):
        while not self.configuration_fully_fitted():
            self.iterative_fit(X, y, n_iter=1)

        return self

    def iterative_fit(self, X, y, n_iter=1, refit=False):
        if refit:
            self.estimator = None

        if self.estimator is None:
            self.alpha = float(self.alpha)
            self.fit_intercept = self.fit_intercept == 'True'
            self.n_iter = int(self.n_iter)
            if self.class_weight == "None":
                self.class_weight = None
            self.l1_ratio = float(self.l1_ratio) if self.l1_ratio is not None else 0.15
            self.epsilon = float(self.epsilon) if self.epsilon is not None else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(self.power_t) if self.power_t is not None else 0.25
            self.average = self.average == 'True'
            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           n_iter=self.n_iter,
                                           learning_rate=self.learning_rate,
                                           class_weight=self.class_weight,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state)

        self.estimator.n_iter += n_iter
        self.estimator.fit(X, y)
        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        return not self.estimator.n_iter < self.n_iter

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'SGD Classifier',
                'name': 'Stochastic Gradient Descent Classifier',
                'handles_missing_values': False,
                'handles_nominal_values': False,
                'handles_numerical_features': True,
                'prefers_data_scaled': True,
                'prefers_data_normalized': True,
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': False,
                'is_deterministic': True,
                'handles_sparse': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,),
                # TODO find out what is best used here!
                'preferred_dtype' : None}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()

        loss = cs.add_hyperparameter(CategoricalHyperparameter("loss",
            ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
            default="hinge"))
        penalty = cs.add_hyperparameter(CategoricalHyperparameter(
            "penalty", ["l1", "l2", "elasticnet"], default="l2"))
        alpha = cs.add_hyperparameter(UniformFloatHyperparameter(
            "alpha", 10e-7, 1e-1, log=True, default=0.0001))
        l1_ratio = cs.add_hyperparameter(UniformFloatHyperparameter(
            "l1_ratio", 0, 1, default=0.15))
        fit_intercept = cs.add_hyperparameter(UnParametrizedHyperparameter(
            "fit_intercept", "True"))
        n_iter = cs.add_hyperparameter(UniformIntegerHyperparameter(
            "n_iter", 5, 1000, default=20))
        epsilon = cs.add_hyperparameter(UniformFloatHyperparameter(
            "epsilon", 1e-5, 1e-1, default=1e-4, log=True))
        learning_rate = cs.add_hyperparameter(CategoricalHyperparameter(
            "learning_rate", ["optimal", "invscaling", "constant"],
            default="optimal"))
        eta0 = cs.add_hyperparameter(UniformFloatHyperparameter(
            "eta0", 10**-7, 0.1, default=0.01))
        power_t = cs.add_hyperparameter(UniformFloatHyperparameter(
            "power_t", 1e-5, 1, default=0.25))
        average = cs.add_hyperparameter(CategoricalHyperparameter(
            "average", ["False", "True"], default="False"))

        # TODO add passive/aggressive here, although not properly documented?
        elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
        epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")
        # eta0 seems to be always active according to the source code; when
        # learning_rate is set to optimal, eta0 is the starting value:
        # https://github.com/scikit-learn/scikit-learn/blob/0.15.X/sklearn/linear_model/sgd_fast.pyx
        #eta0_and_inv = EqualsCondition(eta0, learning_rate, "invscaling")
        #eta0_and_constant = EqualsCondition(eta0, learning_rate, "constant")
        #eta0_condition = OrConjunction(eta0_and_inv, eta0_and_constant)
        power_t_condition = EqualsCondition(power_t, learning_rate, "invscaling")

        cs.add_condition(elasticnet)
        cs.add_condition(epsilon_condition)
        cs.add_condition(power_t_condition)

        return cs

    def __str__(self):
        return "ParamSklearn StochasticGradientClassifier"
Example #39
class PositiveClassClassifier(object):
    hvectorizer = HashingVectorizer(tokenizer = LemmaTokenizer(),
                                    n_features = 2 ** 15,
                                    stop_words = 'english',
                                    lowercase = True,
                                    non_negative = True)
 
    all_classes = np.array([0, 1])
    
    def __init__(self, positive_class):
        # Create an online classifier i.e. supporting `partial_fit()`
        self.classifier = SGDClassifier(loss = 'log')

        # Here we propose to learn a binary classification of the positive class
        # and all other documents
        self.positive_class = positive_class

        # structure to track accuracy history
        self.stats = {'n_train': 0, 'n_train_pos': 0, 'accuracy': 0.0, 
            'accuracy_history': [(0, 0)], 't0': time.time(), 
            'runtime_history': [(0, 0)]}

    def progress(self):
        """Report progress information, return a string."""
        duration = time.time() - self.stats['t0']
        s = "%(n_train)6d train docs (%(n_train_pos)6d positive) " % self.stats
        s += "accuracy: %(accuracy).6f " % self.stats
        s += "in %.2fs (%5d docs/s)" % (duration, self.stats['n_train'] / duration)
        return s

    def train(self):
        minibatch_iterator = iter_minibatchs(OVA_TRAIN_FILE, self.hvectorizer, self.positive_class)
 
        # Main loop: iterate on mini-batches of examples
        for i, (x_train, y_train) in enumerate(minibatch_iterator):
            # update estimator with examples in the current mini-batch
            self.classifier.partial_fit(x_train, y_train, classes=self.all_classes)

            # accumulate test accuracy stats
            self.stats['n_train'] += x_train.shape[0]
            self.stats['n_train_pos'] += sum(y_train)
            self.stats['accuracy'] = self.score()
            self.stats['accuracy_history'].append((self.stats['accuracy'], 
                                                   self.stats['n_train']))
            self.stats['runtime_history'].append((self.stats['accuracy'],
                                                  time.time() - self.stats['t0']))
            #if i % 10 == 0:
            #    print self.progress()

    def score(self): 
        TEST_BATCHES_NO = 20
        minibatch_iterator = iter_minibatchs(TEST_FILE, self.hvectorizer, self.positive_class)
        score = 0
        
        for i, (x_test, y_test) in enumerate(minibatch_iterator):
            y_test = np.asarray(y_test)
            score += self.classifier.score(x_test, y_test)

            if i >= TEST_BATCHES_NO - 1:
                break

        return score / TEST_BATCHES_NO
Example #40
    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        from sklearn.linear_model.stochastic_gradient import SGDClassifier

        # Need to fit at least two iterations, otherwise early stopping will not
        # work because we cannot determine whether the algorithm actually
        # converged. The only way of finding this out is if the sgd spends less
        # iterations than max_iter. If max_iter == 1, it has to spend at least
        # one iteration and will always spend at least one iteration, so we
        # cannot know about convergence.

        if refit:
            self.estimator = None

        if self.estimator is None:
            self.fully_fit_ = False

            self.alpha = float(self.alpha)
            self.l1_ratio = float(self.l1_ratio) if self.l1_ratio is not None \
                else 0.15
            self.epsilon = float(self.epsilon) if self.epsilon is not None \
                else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(self.power_t) if self.power_t is not None \
                else 0.5
            self.average = check_for_bool(self.average)
            self.fit_intercept = check_for_bool(self.fit_intercept)
            self.tol = float(self.tol)

            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           max_iter=n_iter,
                                           tol=self.tol,
                                           learning_rate=self.learning_rate,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state,
                                           warm_start=True)
            self.estimator.fit(X, y, sample_weight=sample_weight)
        else:
            self.estimator.max_iter += n_iter
            self.estimator.max_iter = min(self.estimator.max_iter, 512)
            self.estimator._validate_params()
            self.estimator._partial_fit(
                X, y,
                alpha=self.estimator.alpha,
                C=1.0,
                loss=self.estimator.loss,
                learning_rate=self.estimator.learning_rate,
                max_iter=n_iter,
                sample_weight=sample_weight,
                classes=None,
                coef_init=None,
                intercept_init=None
            )

        if self.estimator._max_iter >= 512 or n_iter > self.estimator.n_iter_:
            self.fully_fit_ = True

        return self
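
A hedged sketch of how a driver loop might call `iterative_fit()` above, growing the iteration budget until the early-stopping logic sets `fully_fit_`. The `model` instance and the doubling schedule are assumptions for illustration, not auto-sklearn's actual driver code.

n_iter = 2
model.iterative_fit(X_train, y_train, n_iter=n_iter, refit=True)
while not getattr(model, 'fully_fit_', False):
    n_iter *= 2  # grow the budget; iterative_fit() caps max_iter at 512
    model.iterative_fit(X_train, y_train, n_iter=n_iter)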
Example #41
from sklearn.linear_model import SGDClassifier

def SGD_c_fit(X, y):
    clf = SGDClassifier(loss='log', penalty='l2', alpha=1e-3, n_iter=5,
                        shuffle=True)
    return clf.fit(X, y)
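
A small usage sketch for `SGD_c_fit()`, assuming a scikit-learn release old enough that `SGDClassifier` still accepts `n_iter` (newer releases use `max_iter`); the synthetic data via `make_classification` is an assumption for illustration.

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=20, random_state=0)
clf = SGD_c_fit(X, y)
print(clf.predict_proba(X[:5]))  # loss='log' enables probability estimates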
Example #42
class SGD(
    IterativeComponentWithSampleWeight,
    AutoSklearnClassificationAlgorithm,
):
    def __init__(self, loss, penalty, alpha, fit_intercept, tol,
                 learning_rate, l1_ratio=0.15, epsilon=0.1,
                 eta0=0.01, power_t=0.5, average=False, random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.tol = tol
        self.learning_rate = learning_rate
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None

    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        from sklearn.linear_model.stochastic_gradient import SGDClassifier

        # Need to fit for at least two iterations, otherwise early stopping
        # cannot work: the only way to detect convergence is that SGD spends
        # fewer iterations than max_iter. If max_iter == 1, the algorithm must
        # and will spend exactly one iteration, so we can never tell whether
        # it converged.

        if refit:
            self.estimator = None

        if self.estimator is None:
            self.fully_fit_ = False

            self.alpha = float(self.alpha)
            self.l1_ratio = float(self.l1_ratio) if self.l1_ratio is not None \
                else 0.15
            self.epsilon = float(self.epsilon) if self.epsilon is not None \
                else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(self.power_t) if self.power_t is not None \
                else 0.5
            self.average = check_for_bool(self.average)
            self.fit_intercept = check_for_bool(self.fit_intercept)
            self.tol = float(self.tol)

            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           max_iter=n_iter,
                                           tol=self.tol,
                                           learning_rate=self.learning_rate,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state,
                                           warm_start=True)
            self.estimator.fit(X, y, sample_weight=sample_weight)
        else:
            self.estimator.max_iter += n_iter
            self.estimator.max_iter = min(self.estimator.max_iter, 512)
            self.estimator._validate_params()
            self.estimator._partial_fit(
                X, y,
                alpha=self.estimator.alpha,
                C=1.0,
                loss=self.estimator.loss,
                learning_rate=self.estimator.learning_rate,
                max_iter=n_iter,
                sample_weight=sample_weight,
                classes=None,
                coef_init=None,
                intercept_init=None
            )

        if self.estimator._max_iter >= 512 or n_iter > self.estimator.n_iter_:
            self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'SGD Classifier',
                'name': 'Stochastic Gradient Descent Classifier',
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': False,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()

        loss = CategoricalHyperparameter("loss",
            ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
            default_value="log")
        penalty = CategoricalHyperparameter(
            "penalty", ["l1", "l2", "elasticnet"], default_value="l2")
        alpha = UniformFloatHyperparameter(
            "alpha", 1e-7, 1e-1, log=True, default_value=0.0001)
        l1_ratio = UniformFloatHyperparameter(
            "l1_ratio", 1e-9, 1,  log=True, default_value=0.15)
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        tol = UniformFloatHyperparameter("tol", 1e-5, 1e-1, log=True,
                                         default_value=1e-4)
        epsilon = UniformFloatHyperparameter(
            "epsilon", 1e-5, 1e-1, default_value=1e-4, log=True)
        learning_rate = CategoricalHyperparameter(
            "learning_rate", ["optimal", "invscaling", "constant"],
            default_value="invscaling")
        eta0 = UniformFloatHyperparameter(
            "eta0", 1e-7, 1e-1, default_value=0.01, log=True)
        power_t = UniformFloatHyperparameter("power_t", 1e-5, 1,
                                             default_value=0.5)
        average = CategoricalHyperparameter(
            "average", ["False", "True"], default_value="False")
        cs.add_hyperparameters([loss, penalty, alpha, l1_ratio, fit_intercept,
                                tol, epsilon, learning_rate, eta0, power_t,
                                average])

        # TODO add passive/aggressive here, although not properly documented?
        elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
        epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")

        power_t_condition = EqualsCondition(power_t, learning_rate,
                                            "invscaling")

        # eta0 is only relevant if learning_rate!='optimal' according to code
        # https://github.com/scikit-learn/scikit-learn/blob/0.19.X/sklearn/
        # linear_model/sgd_fast.pyx#L603
        eta0_in_inv_con = InCondition(eta0, learning_rate, ["invscaling",
                                                            "constant"])
        cs.add_conditions([elasticnet, epsilon_condition, power_t_condition,
                           eta0_in_inv_con])

        return cs
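
In `predict_proba()` above, losses without native probability estimates fall back to a softmax over `decision_function` scores. The `softmax` helper is imported from auto-sklearn's utilities elsewhere in the source; below is a minimal numpy sketch of the idea, where the binary-case column stacking is an assumption about the expected input shape.

import numpy as np

def softmax(df):
    # decision_function returns shape (n_samples,) for binary problems;
    # stack the negated scores so every row has one column per class.
    if df.ndim == 1:
        df = np.column_stack([-df, df])
    df = df - df.max(axis=1, keepdims=True)  # guard against overflow in exp
    exp = np.exp(df)
    return exp / exp.sum(axis=1, keepdims=True)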
Example #43
plt.figure(2)
plt.scatter(features[:, 1], features[:, 2], c=labels)
plt.plot(x1, x_2)



# sklearn implementation
from sklearn.linear_model import Perceptron, SGDClassifier
from sklearn.metrics import accuracy_score


# Fit an sklearn Perceptron and an SGDClassifier with the perceptron loss
# (these should be identical)
clf = Perceptron(random_state=None, eta0=0.1, shuffle=False, penalty=None,
                 class_weight=None, fit_intercept=False)
clf2 = SGDClassifier(loss="perceptron", eta0=0.1, learning_rate="constant",
                     penalty=None, random_state=None, shuffle=False,
                     fit_intercept=False, warm_start=False, average=False,
                     n_iter=1000)
clf.fit(x_train, y_train)
clf2.fit(x_train, y_train)

y_predict = clf.predict(x_test)
y_preSGD = clf2.predict(x_test)

print "sklearn Perceptron accuracy:"
print accuracy_score(y_test, y_predict)

print "sklearn SGDClassifier accuracy:"
print accuracy_score(y_test, y_preSGD)

print "my perceptron accuracy:"
print accuracy_score(y_test, y_pred)
print "\n"