Example #1
    def SGD(self, train_features, test_features):
        print("in SGD")
        self.train_features = train_features
        self.test_features = test_features
        scores = []
        submission = pd.DataFrame.from_dict({'id': test['Id']})
        SGD_file = 'SGD.pckl'
        SGD_model_pkl = open(SGD_file, 'wb')
        for class_name in class_names:
            train_target = train[class_name]
            classifier = SGDClassifier(loss='modified_huber',
                                       penalty='l2',
                                       alpha=0.001,
                                       random_state=42,
                                       max_iter=200,
                                       tol=0.20,
                                       learning_rate='optimal')

            cv_score = np.mean(
                cross_val_score(classifier,
                                train_features,
                                train_target,
                                cv=3,
                                scoring='roc_auc'))
            scores.append(cv_score)
            print('CV score for class {} is {}'.format(class_name, cv_score))

            classifier.fit(train_features, train_target)
            pickle.dump(classifier, SGD_model_pkl)
            submission[class_name] = classifier.predict_proba(test_features)[:, 1]

        print('Total CV score is {}'.format(np.mean(scores)))
        SGD_model_pkl.close()
        submission.to_csv('SGD.csv', index=False)
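Because the loop above dumps every per-class classifier into the same open file handle, the models sit back-to-back in SGD.pckl and must be unpickled in the same order. A minimal loading sketch (assuming the same class_names list that was used at training time):

import pickle

classifiers = {}
with open('SGD.pckl', 'rb') as f:
    for class_name in class_names:  # same order as when the models were dumped
        classifiers[class_name] = pickle.load(f)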
Example #2
def training(processed_train_csv_file):
    processed_train_samples = pd.read_csv(processed_train_csv_file)
    processed_train_samples = processed_train_samples.replace([np.inf, -np.inf], np.nan)
    processed_train_samples = processed_train_samples.fillna(value=0)
    processed_train_samples_index_lst = processed_train_samples.index.tolist()
    random.shuffle(processed_train_samples_index_lst)
    # .ix was removed from pandas; .loc does the label-based lookup here
    shuffled_train_samples = processed_train_samples.loc[processed_train_samples_index_lst]
    col_names = shuffled_train_samples.columns.tolist()
    col_names.remove("booking_bool")
    features = shuffled_train_samples[col_names].values
    labels = shuffled_train_samples["booking_bool"].values

    print "Training Random Forest Classifier"
    rf_classifier = RandomForestClassifier(n_estimators=150, verbose=2, learning_rate=0.1, min_samples_split=10)
    rf_classifier.fit(features, labels)
    print "Saving the Random Forest Classifier"
    data_io.save_model(rf_classifier, model_name="rf_classifier.pkl")

    print "Training Gradient Boosting Classifier"
    gb_classifier = GradientBoostingClassifier(n_estimators=150, verbose=2, learning_rate=0.1, min_samples_split=10)
    gb_classifier.fit(features, labels)
    print "Saving the Gradient Boosting Classifier"
    data_io.save_model(gb_classifier, model_name="gb_classifier.pkl")

    print "Training SGD Classifier"
    sgd_classifier = SGDClassifier(loss="modifier_huber", verbose=2, n_jobs=-1)
    sgd_classifier.fit(features, labels)
    print "Saving the SGD Classifier"
    data_io.save_model(sgd_classifier, model_name="sgd_classifier.pkl")
Example #3
def classifyTestSamples(trainingFeatures, trainingCategories, testFeatures):
    clf = SGDClassifier()

    clf.fit(trainingFeatures, trainingCategories)
    predictedCategories = clf.predict(testFeatures)

    return predictedCategories
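A minimal usage sketch, assuming SGDClassifier has been imported as in the other examples and using scikit-learn's bundled iris data:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

iris = load_iris()
trainX, testX, trainY, testY = train_test_split(iris.data, iris.target, random_state=0)
predicted = classifyTestSamples(trainX, trainY, testX)
print((predicted == testY).mean())  # fraction of correct predictions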
Example #4
class CreateSGDClassifier(CreateModel):
    def fit(self, data, args):
        self.model = SGDClassifier(loss="log")

        with Timer() as t:
            self.model.fit(data.X_train, data.y_train)

        return t.interval
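CreateModel and Timer come from the surrounding benchmark harness and are not shown; a plausible minimal Timer with an .interval attribute (an assumption, not the harness's actual code) is:

import time

class Timer:
    # context manager that records elapsed wall-clock time in .interval
    def __enter__(self):
        self.start = time.perf_counter()
        return self

    def __exit__(self, *exc):
        self.interval = time.perf_counter() - self.start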
Example #5
class MachineLearning:
    def __init__(self, Master_DF):
        self.Data_Frame = Master_DF

    def Encoder(self, df):
        encoder = LabelEncoder()
        print("Fitting")
        encoder.fit(df)
        return encoder.transform(df)

    def Perceptron_PreProcessing(self, x, y):
        X = self.Encoder(self.Data_Frame[x].factorize()[0])
        Y = self.Encoder(self.Data_Frame[y].factorize()[0])
        X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                            Y,
                                                            test_size=0.3,
                                                            random_state=0)
        # StandardScaler expects 2-D input; the single feature is
        # reshaped to a column vector before scaling
        sc = StandardScaler()
        sc.fit(X_train.reshape(-1, 1))
        X_train_std = sc.transform(X_train.reshape(-1, 1))
        X_test_std = sc.transform(X_test.reshape(-1, 1))
        return X_train_std, Y_train, X_test_std, Y_test

    def ppn_model(self, n_iter, eta0, random_state):
        self.X_train_std_uri, self.Y_train_category, self.X_test_std_uri, self.Y_test_category = self.Perceptron_PreProcessing(
            'uri', 'category')
        # n_iter was replaced by max_iter in recent scikit-learn releases
        ppn = Perceptron(max_iter=n_iter, eta0=eta0, random_state=random_state)
        ppn.fit(self.X_train_std_uri.reshape(len(self.X_train_std_uri), 1),
                self.Y_train_category)
        y_pred = ppn.predict(
            self.X_test_std_uri.reshape(len(self.X_test_std_uri), 1))
        return (self.Y_test_category != y_pred).sum(), accuracy_score(
            self.Y_test_category, y_pred), y_pred

    def lr_model(self, c, random_state):
        lr = LogisticRegression(C=c, random_state=random_state)
        lr.fit(self.X_train_std_uri.reshape(len(self.X_train_std_uri), 1),
               self.Y_train_category)
        y_pred = lr.predict(
            self.X_test_std_uri.reshape(len(self.X_test_std_uri), 1))
        return (self.Y_test_category != y_pred).sum(), accuracy_score(
            self.Y_test_category, y_pred), y_pred

    def ada_sd(self, n_iter, eta, random_state):
        # SGDClassifier takes eta0 (with a constant schedule) rather than eta,
        # and max_iter rather than n_iter in recent scikit-learn releases
        self.ada = SGDClassifier(max_iter=n_iter,
                                 eta0=eta,
                                 learning_rate='constant',
                                 random_state=random_state)
        self.ada.fit(
            self.X_train_std_uri.reshape(len(self.X_train_std_uri), 1),
            self.Y_train_category)
        self.y_pred = self.ada.predict(
            self.X_test_std_uri.reshape(len(self.X_test_std_uri), 1))
        return (self.Y_test_category != self.y_pred).sum(), accuracy_score(
            self.Y_test_category, self.y_pred), self.y_pred
Example #6
def do_training(processed_train_csv_file):
    ## Processed train samples reading
    # read saved processed train samples from the given csv file
    processed_train_samples = pd.read_csv(processed_train_csv_file)

    # inf to nan
    processed_train_samples = processed_train_samples.replace([np.inf, -np.inf], np.nan)
    # nan to 0
    processed_train_samples = processed_train_samples.fillna(value=0)

    processed_train_samples_index_lst = processed_train_samples.index.tolist()
    # the samples were sorted earlier; shuffling them here gives better results
    random.shuffle(processed_train_samples_index_lst)

    # organize new train samples and targets
    shuffled_train_samples = processed_train_samples.loc[processed_train_samples_index_lst]
    col_names = shuffled_train_samples.columns.tolist()
    col_names.remove("booking_bool")
    features = shuffled_train_samples[col_names].values
    labels = shuffled_train_samples['booking_bool'].values

    # Model training
    # 1 Random Forest Classifier

    print("Training Random Forest Classifier")
    rf_classifier = RandomForestClassifier(n_estimators=150,
                                           verbose=2,
                                           n_jobs=-1,
                                           min_samples_split=10)
    rf_classifier.fit(features, labels)

    print("Saving the Random Forest Classifier")
    data_io.save_model(rf_classifier, model_name='rf_classifier.pkl')

    # 2 Gradient Boosting Classifier
    print("Gradient Boosting  Classifier")
    gb_classifier = GradientBoostingClassifier(n_estimators=150,
                                               verbose=2,
                                               learning_rate=0.1,
                                               min_samples_split=10)
    gb_classifier.fit(features, labels)
    print("Saving the Gradient Boosting  Classifier")
    data_io.save_model(gb_classifier, model_name='gb_classifier.pkl')

    # 3 SGD Classifier
    print("SGD Classifier")
    sgd_classifier = SGDClassifier(loss="modified_huber", verbose=2,
                                   n_jobs=-1)
    sgd_classifier.fit(features, labels)

    print("saved the SGD Classifier")
    data_io.save_model(sgd_classifier, model_name='sgd_classifier.pkl')
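data_io is project-specific and not shown; judging by its use here, save_model presumably serialises the fitted estimator to disk. A hypothetical stand-in (the name and signature are assumptions):

import pickle

def save_model(model, model_name):
    # hypothetical stand-in for data_io.save_model: pickle the fitted estimator
    with open(model_name, 'wb') as f:
        pickle.dump(model, f)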
Example #7
def train_model(texts, points, num_classes, model_dir, text_encoding='utf-8'):
	""" Given an iterable of (text, lat, lon) items, cluster the points into num_classes clusters and use
	them as labels, then extract unigram features, train a classifier and save it in model_dir
	for future use.

	Args:
	texts -- an iterable (e.g. a list) of texts e.g. ['this is the first text', 'this is the second text'].
	points -- an iterable (e.g. a list) of (lat, lon) tuples with float coordinates e.g. [(1.2343, -10.239834), (5.634534, -12.47563)].
	num_classes -- the number of desired clusters/labels/classes of the model.
	model_dir -- the directory within models/ where the model will be saved.
	"""
	
	if os.path.exists(model_dir):
		logging.error("Model directory " + model_dir + " already exists, please try another address.")
		sys.exit(-1)
	else:
		os.mkdir(model_dir)
	
	from sklearn.cluster import KMeans
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.linear_model.stochastic_gradient import SGDClassifier
	
	kmeans = KMeans(n_clusters=num_classes, random_state=0)
	points_arr = numpy.array(points)
	kmeans.fit_transform(points_arr)
	cluster_centers = kmeans.cluster_centers_
	sample_clusters = kmeans.labels_
	label_coordinate = {}
	for i in range(cluster_centers.shape[0]):
		lat, lon = cluster_centers[i, 0], cluster_centers[i, 1]
		label_coordinate[i] = (lat, lon)
	
	logging.info('extracting features from text...')
	vectorizer = TfidfVectorizer(encoding=text_encoding, stop_words='english', ngram_range=(1,1), max_df=0.5, min_df=0, binary=True, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True)
	X_train = vectorizer.fit_transform(texts)
	Y_train = sample_clusters
	vectorizer.stop_words_ = None
	logging.info('the number of samples is %d and the number of features is %d' % (X_train.shape[0], X_train.shape[1]))
	
	logging.info('training the classifier...')
	logging.warning('Note that alpha (regularisation strength) should be tuned based on the performance on validation data.')
	# n_iter here predates scikit-learn 0.21; newer releases use max_iter/tol instead
	clf = SGDClassifier(loss='log', penalty='elasticnet', alpha=5e-5, l1_ratio=0.9, fit_intercept=True, n_iter=5, n_jobs=2, random_state=0, learning_rate="optimal")
	clf.fit(X_train, Y_train)
	clf.coef_ = csr_matrix(clf.coef_)
	
	logging.info('retrieving address of the given points using geopy (requires internet access).')
	coordinate_address = retrieve_location_from_coordinates(label_coordinate.values())

	logging.info('dumping the vectorizer, clf (trained model), label_coordinates and coordinate_locations into pickle files in ' + model_dir)
	dump_model(clf, vectorizer, coordinate_address, label_coordinate, model_dir)
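A sketch of how the dumped artifacts might be used at prediction time (a hypothetical helper; the actual dump_model/loading code is not shown): vectorise the new texts, predict a cluster label, then map the label back to its cluster-centre coordinates.

def predict_coordinates(texts, clf, vectorizer, label_coordinate):
    # hypothetical inference helper built from the objects trained above
    X = vectorizer.transform(texts)
    labels = clf.predict(X)
    return [label_coordinate[label] for label in labels]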
Example #8
def main():
    raw_data = np.loadtxt(sys.stdin)
    samples = raw_data.shape[0]
    X = np.empty((samples, 401))
    Y = raw_data[:,0]
    for i in range(samples):
        X[i] = transform(raw_data[i, 1:])
    clf = SGDClassifier(loss = _LOSS, penalty = _PENALTY,
                        fit_intercept = False, shuffle = True,
                        alpha = _REGULARIZATION)
    clf.fit(X, Y)
    sys.stdout.write('%s\t' % _KEY)
    for coeff in clf.coef_.flatten():
        sys.stdout.write("%f " % coeff)
    sys.stdout.write("\n")
Example #9
class SGDClassifierImpl():

    def __init__(self, loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, max_iter=None, tol=None, shuffle=True, verbose=0, epsilon=0.1, n_jobs=None, random_state=None, learning_rate='optimal', eta0=0.0, power_t=0.5, early_stopping=False, validation_fraction=0.1, n_iter_no_change=5, class_weight='balanced', warm_start=False, average=False):
        self._hyperparams = {
            'loss': loss,
            'penalty': penalty,
            'alpha': alpha,
            'l1_ratio': l1_ratio,
            'fit_intercept': fit_intercept,
            'max_iter': max_iter,
            'tol': tol,
            'shuffle': shuffle,
            'verbose': verbose,
            'epsilon': epsilon,
            'n_jobs': n_jobs,
            'random_state': random_state,
            'learning_rate': learning_rate,
            'eta0': eta0,
            'power_t': power_t,
            'early_stopping': early_stopping,
            'validation_fraction': validation_fraction,
            'n_iter_no_change': n_iter_no_change,
            'class_weight': class_weight,
            'warm_start': warm_start,
            'average': average}
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if (y is not None):
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)

    def partial_fit(self, X, y=None, classes=None):
        if not hasattr(self, "_wrapped_model"):
            self._wrapped_model = SKLModel(**self._hyperparams)
        self._wrapped_model.partial_fit(X, y, classes=classes)
        return self
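Example #10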
def stochastic_descent(xtrain, ytrain, xtest):
    clf = SGDClassifier(loss="hinge",
                        penalty="l2",
                        max_iter=10,
                        random_state=42,
                        alpha=1e-3,
                        tol=None)
    print("SGD Fitting")
    clf.fit(xtrain, ytrain)
    # Saving the model with pickle
    with open(base_dir + "Model", 'wb') as f:
        pickle.dump(clf, f)
    print("SGD Predicting")
    ytest = clf.predict(xtest)

    return ytest
Example #11
def SGD_c_fit(X, y):
    clf = SGDClassifier(loss='log',
                        penalty='l2',
                        alpha=1e-3,
                        n_iter=5,
                        shuffle=True)
    return clf.fit(X, y)
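n_iter was removed from SGDClassifier in scikit-learn 0.21 in favour of max_iter and tol, and the 'log' loss was later renamed 'log_loss'. A sketch of the equivalent fit for recent releases:

from sklearn.linear_model import SGDClassifier

def SGD_c_fit(X, y):
    # n_iter -> max_iter/tol, loss='log' -> 'log_loss' in current scikit-learn
    clf = SGDClassifier(loss='log_loss', penalty='l2', alpha=1e-3,
                        max_iter=5, tol=None, shuffle=True)
    return clf.fit(X, y)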
Example #12
def withoutPipeline(x_train, y_train, x_test, y_test):
    # arrays are passed in explicitly; reassigning module-level x_train
    # inside the function would otherwise raise UnboundLocalError
    scaler = StandardScaler().fit(x_train) # for each x, (x - mean(all x))/std. dev. of x
                                           # this step computes the mean and std. dev.
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)
    clfer = SGDClassifier()
    clfer.fit(x_train, y_train) # this will try to separate the three classes based
                                # on the two features we gave it. Hence, we will get
                                # back three lines. I.e., three sets of coefficients
                                # and three intercepts
    if DEBUG:
        #print(clfer.coef_)
        #print(clfer.intercept_)
        #print(clfer.predict(scaler.transform([[4.7, 3.1]])))
        #print(clfer.decision_function(scaler.transform([[4.7, 3.1]]))) # the algorithm evaluates the distance from all
                                                                        # three lines and picks the largest one (here [0])
        pass

    # validate results
    y_predict_train = clfer.predict(x_train)
    print("% Correct results on training set:")
    print(metrics.accuracy_score(y_train, y_predict_train))
    y_predict_test = clfer.predict(x_test)
    print("\n% Correct results on testing set:")
    print(metrics.accuracy_score(y_test, y_predict_test))
    # Understanding the classification report:
    # Precision: TP/(TP + FP) - ideal 1 - all instances reported as x were x. In other words,
    #                                     there were no instances reported as x that were NOT x
    # Recall:    TP/(TP + FN) - ideal 1 - all instances OF x were reported as x
    # Although accuracy does not appear in the report, it is important to know what it means:
    # Accuracy: (TP + TN) / (TP + TN + FP + FN)
    print("\nClassification Report:")
    print(metrics.classification_report(y_test, y_predict_test))
    # Understanding the confusion matrix:
    # how many of class i were predicted as j
    # ideally, an identity matrix
    print("Confusion Matrix:")
    print(metrics.confusion_matrix(y_test, y_predict_test))
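The function name suggests a pipeline-based counterpart; a minimal sketch of the same flow using sklearn.pipeline, so the scaler is fitted only on the training data and reapplied transparently at predict time:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier

def withPipeline(x_train, y_train, x_test, y_test):
    pipe = make_pipeline(StandardScaler(), SGDClassifier())
    pipe.fit(x_train, y_train)
    return pipe.score(x_test, y_test)  # accuracy on the test split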
Example #13
def getSGDClassifier(X,Y):
	# n_iter predates scikit-learn 0.21 (max_iter/tol in newer releases)
	sgdclassifier = SGDClassifier(loss='log', penalty='l1', n_iter=10, shuffle=True,random_state=0)
	print("[SGD Classifier] train on full data...-> 42k samples")
	sgdclassifier.fit(X,Y)

	return sgdclassifier
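Example #14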
plt.scatter(features[:,1], features[:,2], c = labels)
plt.plot(x1, x_2)



# sklearn implementation
from sklearn.linear_model import Perceptron
# sklearn.linear_model.stochastic_gradient is a pre-0.24 module path;
# newer releases import SGDClassifier directly from sklearn.linear_model
from sklearn.linear_model.stochastic_gradient import SGDClassifier
from sklearn.metrics import accuracy_score


# Fitting an sklearn Perceptron and SGDClassifier with perceptron loss function (these should be identical)
clf = Perceptron(random_state=None, eta0= 0.1, shuffle=False, penalty=None, class_weight=None, fit_intercept=False)
clf2 = SGDClassifier(loss="perceptron",eta0=0.1,learning_rate="constant",penalty=None,random_state=None,shuffle=False,fit_intercept=False,warm_start=False,average=False,n_iter=1000)
clf.fit(x_train, y_train)
clf2.fit(x_train, y_train)

y_predict = clf.predict(x_test)
y_preSGD = clf2.predict(x_test)

print "sklearn Perceptron accuracy:"
print accuracy_score(y_test, y_predict)

print "sklearn SGDClassifier accuracy:"
print accuracy_score(y_test, y_preSGD)

print "my perceptron accuracy:"
print accuracy_score(y_test, y_pred)
print "\n"
#print clf.coef_
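Example #15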
def SGD_c_fit(X,y):
    clf = SGDClassifier(loss='log', penalty='l2', alpha=1e-3, n_iter=5, shuffle=True)
    return clf.fit(X, y)
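Example #16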
    xi = X_train[i].reshape((nb_features, 1))
    
    loss = max(0, 1 - (Y_train[i] * np.dot(w.T, xi)))
    tau = loss / (np.power(np.linalg.norm(xi, ord=2), 2) + (1 / (2*C)))
    
    coeff = tau * Y_train[i]
    w += coeff * xi
    
# Compute accuracy
Y_pred = np.sign(np.dot(w.T, X_test.T))
c = np.count_nonzero(Y_pred - Y_test)
print('PA accuracy: {}'.format(1 - float(c) / X_test.shape[0]))


# Train a Stochastic Gradient Descent Classifier

poly = PolynomialFeatures(degree=2)
X_train = poly.fit_transform(X_train)
X_test = poly.transform(X_test)  # reuse the fit from the training split

SGDC = SGDClassifier(alpha=0.01, loss='hinge', penalty='l2', fit_intercept = True, tol= 1e-3, n_jobs=-1)
SGDC.fit(X_train, Y_train)
print('SGDClassifier score: {}'.format(SGDC.score(X_test, Y_test)))

#  Passive Aggressive Classifier 

PA = PassiveAggressiveClassifier(C=0.01, loss='squared_hinge', n_jobs=-1)
PA.fit(X_train, Y_train)
print('PA score: {}'.format(PA.score(X_test, Y_test)))

Example #17
class SGD(AutoSklearnClassificationAlgorithm):
    def __init__(self,
                 loss,
                 penalty,
                 alpha,
                 fit_intercept,
                 n_iter,
                 learning_rate,
                 class_weight,
                 l1_ratio=0.15,
                 epsilon=0.1,
                 eta0=0.01,
                 power_t=0.5,
                 random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        self.class_weight = class_weight
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.estimator = None

    def fit(self, X, Y):
        # TODO: maybe scale training data so that its norm becomes 1?
        # http://scikit-learn.org/stable/modules/sgd.html#id1
        self.alpha = float(self.alpha)
        self.fit_intercept = bool(self.fit_intercept)
        self.n_iter = int(self.n_iter)
        if self.class_weight == "None":
            self.class_weight = None
        self.l1_ratio = float(self.l1_ratio)
        self.epsilon = float(self.epsilon)
        self.eta0 = float(self.eta0)
        self.power_t = float(self.power_t)

        self.estimator = SGDClassifier(loss=self.loss,
                                       penalty=self.penalty,
                                       alpha=self.alpha,
                                       fit_intercept=self.fit_intercept,
                                       n_iter=self.n_iter,
                                       learning_rate=self.learning_rate,
                                       class_weight=self.class_weight,
                                       l1_ratio=self.l1_ratio,
                                       epsilon=self.epsilon,
                                       eta0=self.eta0,
                                       power_t=self.power_t,
                                       shuffle=True,
                                       random_state=self.random_state)
        self.estimator.fit(X, Y)
        return self

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties():
        return {
            'shortname': 'SGD Classifier',
            'name': 'Stochastic Gradient Descent Classifier',
            'handles_missing_values': False,
            'handles_nominal_values': False,
            'handles_numerical_features': True,
            'prefers_data_scaled': True,
            'prefers_data_normalized': True,
            'handles_multiclass': True,
            'handles_multilabel': False,
            'is_deterministic': True,
            'handles_sparse': True,
            # TODO find out what is best used here!
            'preferred_dtype': None
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        loss = CategoricalHyperparameter(
            "loss",
            ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
            default="hinge")
        penalty = CategoricalHyperparameter("penalty",
                                            ["l1", "l2", "elasticnet"],
                                            default="l2")
        alpha = UniformFloatHyperparameter("alpha",
                                           10**-7,
                                           10**-1,
                                           log=True,
                                           default=0.0001)
        l1_ratio = UniformFloatHyperparameter("l1_ratio", 0, 1, default=0.15)
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        n_iter = UniformIntegerHyperparameter("n_iter", 5, 1000, default=20)
        epsilon = UniformFloatHyperparameter("epsilon",
                                             1e-5,
                                             1e-1,
                                             default=1e-4,
                                             log=True)
        learning_rate = CategoricalHyperparameter(
            "learning_rate", ["optimal", "invscaling", "constant"],
            default="optimal")
        eta0 = UniformFloatHyperparameter("eta0", 10**-7, 0.1, default=0.01)
        power_t = UniformFloatHyperparameter("power_t", 1e-5, 1, default=0.5)
        # This does not allow for other resampling methods!
        class_weight = CategoricalHyperparameter("class_weight",
                                                 ["None", "auto"],
                                                 default="None")
        cs = ConfigurationSpace()
        cs.add_hyperparameter(loss)
        cs.add_hyperparameter(penalty)
        cs.add_hyperparameter(alpha)
        cs.add_hyperparameter(l1_ratio)
        cs.add_hyperparameter(fit_intercept)
        cs.add_hyperparameter(n_iter)
        cs.add_hyperparameter(epsilon)
        cs.add_hyperparameter(learning_rate)
        cs.add_hyperparameter(eta0)
        cs.add_hyperparameter(power_t)
        cs.add_hyperparameter(class_weight)

        # TODO add passive/aggressive here, although not properly documented?
        elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
        epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")
        # eta0 seems to be always active according to the source code; when
        # learning_rate is set to optimal, eta0 is the starting value:
        # https://github.com/scikit-learn/scikit-learn/blob/0.15.X/sklearn/linear_model/sgd_fast.pyx
        #eta0_and_inv = EqualsCondition(eta0, learning_rate, "invscaling")
        #eta0_and_constant = EqualsCondition(eta0, learning_rate, "constant")
        #eta0_condition = OrConjunction(eta0_and_inv, eta0_and_constant)
        power_t_condition = EqualsCondition(power_t, learning_rate,
                                            "invscaling")

        cs.add_condition(elasticnet)
        cs.add_condition(epsilon_condition)
        cs.add_condition(power_t_condition)

        return cs

    def __str__(self):
        return "AutoSklearn StochasticGradientClassifier"
Example #18
                                                    data[column_names[10]],
                                                    test_size=0.25,
                                                    random_state=33)
# check the number and class distribution of the training samples
print(y_train.value_counts())
# check the number and class distribution of the test samples
print(y_test.value_counts())

# use linear classification models for benign/malignant breast-cancer tumour prediction
# standardize the data so every feature dimension has variance 1 and mean 0, so that
# predictions are not dominated by features with large values
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)
# initialize LogisticRegression and SGDClassifier
lr = LogisticRegression()
sgdc = SGDClassifier()
# call LogisticRegression's fit to train the model parameters
lr.fit(X_train, y_train)
# use the trained model lr to predict X_test; store the result in lr_y_predict
lr_y_predict = lr.predict(X_test)
# call SGDClassifier's fit to train the model parameters
sgdc.fit(X_train, y_train)
# use the trained model sgdc to predict X_test; store the result in sgdc_y_predict
sgdc_y_predict = sgdc.predict(X_test)

# performance analysis of the linear classifiers on the benign/malignant tumour task
# use the logistic regression model's built-in score function for test-set accuracy
print('Accuracy of LR Classifier:', lr.score(X_test, y_test))
# use classification_report for LogisticRegression's other three metrics
print(classification_report(y_test, lr_y_predict, target_names=['Benign', 'Malignant']))
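The snippet computes sgdc_y_predict but is cut off before reporting on it; the matching evaluation for the SGD model, mirroring the LR block above (and Example #27), would be:

# accuracy and per-class metrics for the SGD classifier
print('Accuracy of SGD Classifier:', sgdc.score(X_test, y_test))
print(classification_report(y_test, sgdc_y_predict, target_names=['Benign', 'Malignant']))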
Example #19
class SGD(AutoSklearnClassificationAlgorithm):
    def __init__(self,
                 loss,
                 penalty,
                 alpha,
                 fit_intercept,
                 tol,
                 learning_rate,
                 l1_ratio=0.15,
                 epsilon=0.1,
                 eta0=0.01,
                 power_t=0.5,
                 average=False,
                 random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.tol = tol
        self.learning_rate = learning_rate
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None

    def fit(self, X, y, sample_weight=None):
        n_iter = 2
        self.iterative_fit(X,
                           y,
                           n_iter=n_iter,
                           sample_weight=sample_weight,
                           refit=True)
        while not self.configuration_fully_fitted():
            n_iter *= 2
            self.iterative_fit(X,
                               y,
                               n_iter=n_iter,
                               sample_weight=sample_weight)

        return self

    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        from sklearn.linear_model.stochastic_gradient import SGDClassifier

        # Need to fit at least two iterations, otherwise early stopping will not
        # work because we cannot determine whether the algorithm actually
        # converged. The only way of finding this out is if the sgd spends less
        # iterations than max_iter. If max_iter == 1, it has to spend at least
        # one iteration and will always spend at least one iteration, so we
        # cannot know about convergence.

        if refit:
            self.estimator = None

        if self.estimator is None:

            self.alpha = float(self.alpha)
            self.fit_intercept = self.fit_intercept == 'True'
            self.l1_ratio = float(
                self.l1_ratio) if self.l1_ratio is not None else 0.15
            self.epsilon = float(
                self.epsilon) if self.epsilon is not None else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(
                self.power_t) if self.power_t is not None else 0.25
            self.average = self.average == 'True'
            self.tol = float(self.tol)

            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           max_iter=n_iter,
                                           tol=self.tol,
                                           learning_rate=self.learning_rate,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state,
                                           warm_start=True)
            self.estimator.fit(X, y, sample_weight=sample_weight)
        else:
            self.estimator.max_iter += n_iter
            self.estimator._validate_params()
            self.estimator._partial_fit(
                X,
                y,
                alpha=self.estimator.alpha,
                C=1.0,
                loss=self.estimator.loss,
                learning_rate=self.estimator.learning_rate,
                max_iter=n_iter,
                sample_weight=sample_weight,
                classes=None,
                coef_init=None,
                intercept_init=None)

        if self.estimator._max_iter >= 1000 or n_iter > self.estimator.n_iter_:
            self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            'shortname': 'SGD Classifier',
            'name': 'Stochastic Gradient Descent Classifier',
            'handles_regression': False,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': False,
            'is_deterministic': True,
            'input': (DENSE, SPARSE, UNSIGNED_DATA),
            'output': (PREDICTIONS, )
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()

        loss = CategoricalHyperparameter(
            "loss",
            ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
            default_value="log")
        penalty = CategoricalHyperparameter("penalty",
                                            ["l1", "l2", "elasticnet"],
                                            default_value="l2")
        alpha = UniformFloatHyperparameter("alpha",
                                           1e-7,
                                           1e-1,
                                           log=True,
                                           default_value=0.0001)
        l1_ratio = UniformFloatHyperparameter("l1_ratio",
                                              1e-9,
                                              1,
                                              log=True,
                                              default_value=0.15)
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        tol = UniformFloatHyperparameter("tol",
                                         1e-5,
                                         1e-1,
                                         log=True,
                                         default_value=1e-4)
        epsilon = UniformFloatHyperparameter("epsilon",
                                             1e-5,
                                             1e-1,
                                             default_value=1e-4,
                                             log=True)
        learning_rate = CategoricalHyperparameter(
            "learning_rate", ["optimal", "invscaling", "constant"],
            default_value="invscaling")
        eta0 = UniformFloatHyperparameter("eta0",
                                          1e-7,
                                          1e-1,
                                          default_value=0.01)
        power_t = UniformFloatHyperparameter("power_t",
                                             1e-5,
                                             1,
                                             default_value=0.25)
        average = CategoricalHyperparameter("average", ["False", "True"],
                                            default_value="False")
        cs.add_hyperparameters([
            loss, penalty, alpha, l1_ratio, fit_intercept, tol, epsilon,
            learning_rate, eta0, power_t, average
        ])

        # TODO add passive/aggressive here, although not properly documented?
        elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
        epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")
        # eta0 seems to be always active according to the source code; when
        # learning_rate is set to optimal, eta0 is the starting value:
        # https://github.com/scikit-learn/scikit-learn/blob/0.15.X/sklearn/linear_model/sgd_fast.pyx
        #eta0_and_inv = EqualsCondition(eta0, learning_rate, "invscaling")
        #eta0_and_constant = EqualsCondition(eta0, learning_rate, "constant")
        #eta0_condition = OrConjunction(eta0_and_inv, eta0_and_constant)
        power_t_condition = EqualsCondition(power_t, learning_rate,
                                            "invscaling")

        cs.add_conditions([elasticnet, epsilon_condition, power_t_condition])

        return cs
Example #20
X = cancer.data  # feature matrix; cancer.target holds the labels
ytrue = np.copy(cancer.target).flatten()
ytrue[ytrue > 0] = 1

# label a few points
labeled_N = 4
ys = np.array([-1] * len(ytrue))  # -1 denotes unlabeled point
# random.sample needs a sequence and an int count, so cast explicitly
random_labeled_points = random.sample(list(np.where(ytrue == 0)[0]), labeled_N // 2) + \
                        random.sample(list(np.where(ytrue == 1)[0]), labeled_N // 2)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
#basemodel = WQDA() # weighted Quadratic Discriminant Analysis
basemodel = SGDClassifier(loss='log',
                          penalty='l1')  # scikit logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
print("supervised log.reg. score", basemodel.score(X, ytrue))

# fast (but naive, unsafe) self learning framework
ssmodel = SelfLearningModel(basemodel)
ssmodel.fit(X, ys)
print("self-learning log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score (base model has to be able to take weighted samples)
ssmodel = CPLELearningModel(basemodel)
ssmodel.fit(X, ys)
print("CPLE semi-supervised log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score, WQDA model
ssmodel = CPLELearningModel(WQDA(), predict_from_probabilities=True
                            )  # weighted Quadratic Discriminant Analysis
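Example #21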
		[190,90,47],[175,64,39],[177,70,40],[159,55,37],
		[171,75,42],[181,85,43]]

#Corresponding gender tags
Y = ['male','female','female','female','male','male',
		'male','female','male','female','male']

# Decision Tree classifier- takes in the input data to predict whether male or female
classifier = tree.DecisionTreeClassifier()
classifier = classifier.fit(X,Y)

#Support Vector Machine Classifier
classifier1 = svm.SVC()
classifier1 = classifier1.fit(X,Y)

#Stochastic Gradient Descent 
clf = SGDClassifier()
clf = clf.fit(X,Y)

#Prediction step
#prediction for decision Trees
prediction = classifier.predict([[172,75,35]])
print(prediction)

#Prediction for Support Vector Machines
prediction1 =classifier1.predict([[177,70,43]])
print(prediction1)

#Prediction for Stochastic Gradient Descent
pred = clf.predict([[172,75,35]])
print(pred)
Example #22
class SGD:
    def __init__(self,
                 loss,
                 penalty,
                 alpha,
                 fit_intercept,
                 tol,
                 learning_rate,
                 l1_ratio=0.15,
                 epsilon=0.1,
                 eta0=0.01,
                 power_t=0.5,
                 average=False,
                 random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.tol = tol
        self.learning_rate = learning_rate
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None

    def fit(self, X, y, sample_weight=None):
        self.iterative_fit(X,
                           y,
                           n_iter=2,
                           refit=True,
                           sample_weight=sample_weight)
        iteration = 2
        while not self.configuration_fully_fitted():
            n_iter = int(2**iteration / 2)
            self.iterative_fit(X,
                               y,
                               n_iter=n_iter,
                               sample_weight=sample_weight)
            iteration += 1
        return self

    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        from sklearn.linear_model.stochastic_gradient import SGDClassifier

        # Need to fit at least two iterations, otherwise early stopping will not
        # work because we cannot determine whether the algorithm actually
        # converged. The only way of finding this out is if the sgd spends less
        # iterations than max_iter. If max_iter == 1, it has to spend at least
        # one iteration and will always spend at least one iteration, so we
        # cannot know about convergence.

        if refit:
            self.estimator = None

        if self.estimator is None:
            self.fully_fit_ = False

            self.alpha = float(self.alpha)
            self.l1_ratio = float(self.l1_ratio) if self.l1_ratio is not None \
                else 0.15
            self.epsilon = float(self.epsilon) if self.epsilon is not None \
                else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(self.power_t) if self.power_t is not None \
                else 0.5
            self.average = check_for_bool(self.average)
            self.fit_intercept = check_for_bool(self.fit_intercept)
            self.tol = float(self.tol)

            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           max_iter=n_iter,
                                           tol=self.tol,
                                           learning_rate=self.learning_rate,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state,
                                           warm_start=True)
            self.estimator.fit(X, y, sample_weight=sample_weight)
        else:
            self.estimator.max_iter += n_iter
            self.estimator.max_iter = min(self.estimator.max_iter, 512)
            self.estimator._validate_params()
            self.estimator._partial_fit(
                X,
                y,
                alpha=self.estimator.alpha,
                C=1.0,
                loss=self.estimator.loss,
                learning_rate=self.estimator.learning_rate,
                max_iter=n_iter,
                sample_weight=sample_weight,
                classes=None,
                coef_init=None,
                intercept_init=None)

        if self.estimator._max_iter >= 512 or n_iter > self.estimator.n_iter_:
            self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)
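The iterative_fit logic above leans on warm_start=True: with warm starting, each further fit call resumes from the current coefficients instead of re-initialising, so the iteration budget can be grown geometrically until convergence. A standalone sketch of the idea (X and y are assumed to be an existing feature matrix and label vector):

from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(max_iter=2, tol=1e-4, warm_start=True)
budget = 2
while budget < 512:
    clf.fit(X, y)                    # warm_start resumes from the current coef_
    if clf.n_iter_ < clf.max_iter:   # stopped early, i.e. converged
        break
    budget *= 2
    clf.max_iter = budget            # grow the budget and keep training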
Example #23
class SGD(ParamSklearnClassificationAlgorithm):
    def __init__(self, loss, penalty, alpha, fit_intercept, n_iter,
                 learning_rate, class_weight=None, l1_ratio=0.15, epsilon=0.1,
                 eta0=0.01, power_t=0.5, average=False, random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        self.class_weight = class_weight
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None

    def fit(self, X, y):
        while not self.configuration_fully_fitted():
            self.iterative_fit(X, y, n_iter=1)

        return self

    def iterative_fit(self, X, y, n_iter=1, refit=False):
        if refit:
            self.estimator = None

        if self.estimator is None:
            self.alpha = float(self.alpha)
            self.fit_intercept = self.fit_intercept == 'True'
            self.n_iter = int(self.n_iter)
            if self.class_weight == "None":
                self.class_weight = None
            self.l1_ratio = float(self.l1_ratio) if self.l1_ratio is not None else 0.15
            self.epsilon = float(self.epsilon) if self.epsilon is not None else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(self.power_t) if self.power_t is not None else 0.25
            self.average = self.average == 'True'
            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           n_iter=self.n_iter,
                                           learning_rate=self.learning_rate,
                                           class_weight=self.class_weight,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state)

        self.estimator.n_iter += n_iter
        self.estimator.fit(X, y)
        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        return not self.estimator.n_iter < self.n_iter

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'SGD Classifier',
                'name': 'Stochastic Gradient Descent Classifier',
                'handles_missing_values': False,
                'handles_nominal_values': False,
                'handles_numerical_features': True,
                'prefers_data_scaled': True,
                'prefers_data_normalized': True,
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': False,
                'is_deterministic': True,
                'handles_sparse': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,),
                # TODO find out what is best used here!
                'preferred_dtype' : None}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()

        loss = cs.add_hyperparameter(CategoricalHyperparameter("loss",
            ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
            default="hinge"))
        penalty = cs.add_hyperparameter(CategoricalHyperparameter(
            "penalty", ["l1", "l2", "elasticnet"], default="l2"))
        alpha = cs.add_hyperparameter(UniformFloatHyperparameter(
            "alpha", 10e-7, 1e-1, log=True, default=0.0001))
        l1_ratio = cs.add_hyperparameter(UniformFloatHyperparameter(
            "l1_ratio", 0, 1, default=0.15))
        fit_intercept = cs.add_hyperparameter(UnParametrizedHyperparameter(
            "fit_intercept", "True"))
        n_iter = cs.add_hyperparameter(UniformIntegerHyperparameter(
            "n_iter", 5, 1000, default=20))
        epsilon = cs.add_hyperparameter(UniformFloatHyperparameter(
            "epsilon", 1e-5, 1e-1, default=1e-4, log=True))
        learning_rate = cs.add_hyperparameter(CategoricalHyperparameter(
            "learning_rate", ["optimal", "invscaling", "constant"],
            default="optimal"))
        eta0 = cs.add_hyperparameter(UniformFloatHyperparameter(
            "eta0", 10**-7, 0.1, default=0.01))
        power_t = cs.add_hyperparameter(UniformFloatHyperparameter(
            "power_t", 1e-5, 1, default=0.25))
        average = cs.add_hyperparameter(CategoricalHyperparameter(
            "average", ["False", "True"], default="False"))

        # TODO add passive/aggressive here, although not properly documented?
        elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
        epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")
        # eta0 seems to be always active according to the source code; when
        # learning_rate is set to optimal, eta0 is the starting value:
        # https://github.com/scikit-learn/scikit-learn/blob/0.15.X/sklearn/linear_model/sgd_fast.pyx
        #eta0_and_inv = EqualsCondition(eta0, learning_rate, "invscaling")
        #eta0_and_constant = EqualsCondition(eta0, learning_rate, "constant")
        #eta0_condition = OrConjunction(eta0_and_inv, eta0_and_constant)
        power_t_condition = EqualsCondition(power_t, learning_rate, "invscaling")

        cs.add_condition(elasticnet)
        cs.add_condition(epsilon_condition)
        cs.add_condition(power_t_condition)

        return cs

    def __str__(self):
        return "ParamSklearn StochasticGradientClassifier"

# # SGD Classifier

# In[121]:


import gc
from sklearn.metrics import mean_squared_error
from sklearn.linear_model.stochastic_gradient import SGDClassifier

estimator = SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15, fit_intercept=True, 
                          max_iter=None, tol=None, shuffle=True, verbose=0, epsilon=0.1, n_jobs=1, random_state=None, 
                          learning_rate='optimal', eta0=0.0, power_t=0.5, class_weight=None, warm_start=False, average=False, 
                          n_iter=None)
mean_squared_error(y_test,estimator.fit(X_train,y_train).predict(X_test))


# In[46]:


from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier


pca = PCA(n_components=175)
pca.fit(X)
X_pca = pca.transform(X)


# In[47]:
Example #25
                      use_idf=True,
                      smooth_idf=True,
                      sublinear_tf=True)
X_train_feature = vec.fit_transform(train_data['word_seg'])
X_test_feature = vec.transform(test_data['word_seg'])

# -------------- sentiment value prediction starts ------------------------
y_train_sent = train_data['sentiment_value'].astype(int)
X_train_sent,X_test_sent,y_train_sent,y_test_sent=\
    train_test_split(X_train_feature,y_train_sent,test_size=0.1,random_state=42)
# clf = LogisticRegression(C=4, dual=True)
# clf =svm.LinearSVC()
# clf =RandomForestClassifier()
clf = SGDClassifier(n_iter=80)
# tune_params(X_train_sent,y_train_sent)
clf.fit(X_train_sent, y_train_sent)

# evaluate the model on the held-out split of the training data
pred_test_sent = clf.predict(X_test_sent)
# precision = TP / (TP + FP)
precision = precision_score(y_test_sent,
                            pred_test_sent,
                            pos_label=None,
                            average='weighted')
# recall = TP / (TP + FN)
recall = recall_score(y_test_sent,
                      pred_test_sent,
                      pos_label=None,
                      average='weighted')
# F1
f1 = f1_score(y_test_sent, pred_test_sent, pos_label=None, average='weighted')
Example #26
                          fit_intercept=True,
                          shuffle=True,
                          verbose=0,
                          epsilon=0.1,
                          n_jobs=1,
                          random_state=None,
                          learning_rate='optimal',
                          eta0=0.0,
                          power_t=0.5,
                          class_weight=None,
                          warm_start=False,
                          average=False)

print(
    mean_squared_error(y_test,
                       estimator.fit(X_train, y_train).predict(X_test)))
print("Accuracy score of model: {}".format(
    accuracy_score(y_test,
                   estimator.fit(X_train, y_train).predict(X_test))))

# # Reading the Dataset in chunks & applying Partial_fit to check the Training and Testing loss variation

# In[133]:

import numpy as np
chunksize = 4000
estimator = SGDClassifier(loss='squared_hinge',
                          penalty='l2',
                          alpha=0.0001,
                          l1_ratio=0.15,
                          fit_intercept=True,
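The fragment above is cut off mid-call; a minimal sketch of the chunked partial_fit loop it describes, assuming the estimator construction is completed and using a hypothetical file name and label column:

import numpy as np
import pandas as pd

classes = np.array([0, 1])  # assumed label set; partial_fit needs it up front
for chunk in pd.read_csv('train.csv', chunksize=chunksize):
    X_chunk = chunk.drop(columns=['label']).values
    y_chunk = chunk['label'].values
    estimator.partial_fit(X_chunk, y_chunk, classes=classes)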
Example #27
https://blog.csdn.net/anecdotegyb/article/details/74857055
https://blog.csdn.net/quiet_girl/article/details/72517053
"""

ss = StandardScaler()
X_train = ss.fit_transform(X_train)  # fit to the data, then transform it into standardized form
X_test = ss.transform(X_test)  # perform standardization by centering and scaling

# initialize the Stochastic Gradient Descent Classifier & Logistic Regression
lr = LogisticRegression()
sgdc = SGDClassifier()
lr.fit(X_train, y_train)  # train the LR classifier
lr_y_predict = lr.predict(X_test)  # predict on X_test

sgdc.fit(X_train, y_train)  # train the stochastic gradient descent classifier
sgdc_y_predict = sgdc.predict(X_test)  # predict on X_test

# performance analysis
print('Accuracy of LR Classifier:', lr.score(X_test, y_test))
print(
    classification_report(y_test,
                          lr_y_predict,
                          target_names=['Benign', 'Malignant']))

print('Accuracy of SGD Classifier:', sgdc.score(X_test, y_test))
print(
    classification_report(y_test,
                          sgdc_y_predict,
                          target_names=['Benign', 'Malignant']))
Example #28
              random.sample(list(np.where(y_train == 2)[0]), 500) + \
              random.sample(list(np.where(y_train == 3)[0]), 200)

# two category
# select_list = random.sample(list(np.where(y_train == 0)[0]), 1000) + \
# random.sample(list(np.where(y_train == 1)[0]), 1000)

# set the supervised instance
ys[select_list] = y_train[select_list]

# the base model
# there is no improvement
basemodel = SGDClassifier(loss='log',
                          penalty='l1')  # scikit logistic regression
# model fit
basemodel.fit(X_train[select_list, :], ys[select_list])
print("supervised log.reg. score", basemodel.score(X_test, y_test))
print('\n')

# ###########################################
print('_______LogisticRegression running results___40% unlabeled data_______')
model_lr = LogisticRegression(penalty='l2')
# model_lr.fit(X_train[select_list, :], ys[select_list])
print(model_lr)
# print("Binary classification LogisticRegression score", model_lr.score(X_test, y_test))
# print("Binary classification LogisticRegression score 95.6%")
# print("Four-category classification LogisticRegression score", model_lr.score(X_test, y_test))
print("Four-category classification LogisticRegression score 85.2%")
print()

# ########################## SVM ##########################
Example #29
class SGD(
    IterativeComponentWithSampleWeight,
    AutoSklearnClassificationAlgorithm,
):
    def __init__(self, loss, penalty, alpha, fit_intercept, tol,
                 learning_rate, l1_ratio=0.15, epsilon=0.1,
                 eta0=0.01, power_t=0.5, average=False, random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.tol = tol
        self.learning_rate = learning_rate
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None

    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        from sklearn.linear_model.stochastic_gradient import SGDClassifier

        # Need to fit at least two iterations, otherwise early stopping will not
        # work because we cannot determine whether the algorithm actually
        # converged. The only way of finding this out is if the sgd spends less
        # iterations than max_iter. If max_iter == 1, it has to spend at least
        # one iteration and will always spend at least one iteration, so we
        # cannot know about convergence.

        if refit:
            self.estimator = None

        if self.estimator is None:
            self.fully_fit_ = False

            self.alpha = float(self.alpha)
            self.l1_ratio = float(self.l1_ratio) if self.l1_ratio is not None \
                else 0.15
            self.epsilon = float(self.epsilon) if self.epsilon is not None \
                else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(self.power_t) if self.power_t is not None \
                else 0.5
            self.average = check_for_bool(self.average)
            self.fit_intercept = check_for_bool(self.fit_intercept)
            self.tol = float(self.tol)

            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           max_iter=n_iter,
                                           tol=self.tol,
                                           learning_rate=self.learning_rate,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state,
                                           warm_start=True)
            self.estimator.fit(X, y, sample_weight=sample_weight)
        else:
            self.estimator.max_iter += n_iter
            self.estimator.max_iter = min(self.estimator.max_iter, 512)
            self.estimator._validate_params()
            self.estimator._partial_fit(
                X, y,
                alpha=self.estimator.alpha,
                C=1.0,
                loss=self.estimator.loss,
                learning_rate=self.estimator.learning_rate,
                max_iter=n_iter,
                sample_weight=sample_weight,
                classes=None,
                coef_init=None,
                intercept_init=None
            )

        if self.estimator.max_iter >= 512 or n_iter > self.estimator.n_iter_:
            self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'SGD Classifier',
                'name': 'Stochastic Gradient Descent Classifier',
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': False,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()

        loss = CategoricalHyperparameter("loss",
            ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
            default_value="log")
        penalty = CategoricalHyperparameter(
            "penalty", ["l1", "l2", "elasticnet"], default_value="l2")
        alpha = UniformFloatHyperparameter(
            "alpha", 1e-7, 1e-1, log=True, default_value=0.0001)
        l1_ratio = UniformFloatHyperparameter(
            "l1_ratio", 1e-9, 1,  log=True, default_value=0.15)
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        tol = UniformFloatHyperparameter("tol", 1e-5, 1e-1, log=True,
                                         default_value=1e-4)
        epsilon = UniformFloatHyperparameter(
            "epsilon", 1e-5, 1e-1, default_value=1e-4, log=True)
        learning_rate = CategoricalHyperparameter(
            "learning_rate", ["optimal", "invscaling", "constant"],
            default_value="invscaling")
        eta0 = UniformFloatHyperparameter(
            "eta0", 1e-7, 1e-1, default_value=0.01, log=True)
        power_t = UniformFloatHyperparameter("power_t", 1e-5, 1,
                                             default_value=0.5)
        average = CategoricalHyperparameter(
            "average", ["False", "True"], default_value="False")
        cs.add_hyperparameters([loss, penalty, alpha, l1_ratio, fit_intercept,
                                tol, epsilon, learning_rate, eta0, power_t,
                                average])

        # TODO add passive/aggressive here, although not properly documented?
        elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
        epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")

        power_t_condition = EqualsCondition(power_t, learning_rate,
                                            "invscaling")

        # eta0 is only relevant if learning_rate!='optimal' according to code
        # https://github.com/scikit-learn/scikit-learn/blob/0.19.X/sklearn/
        # linear_model/sgd_fast.pyx#L603
        eta0_in_inv_con = InCondition(eta0, learning_rate, ["invscaling",
                                                            "constant"])
        cs.add_conditions([elasticnet, epsilon_condition, power_t_condition,
                           eta0_in_inv_con])

        return cs
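
# For losses without predict_proba (e.g. hinge), predict_proba above falls
# back to a softmax over decision_function. A standalone sketch of that
# fallback; the softmax here is a plain NumPy stand-in for the helper the
# class imports, and the synthetic dataset is an assumption:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier

def softmax_demo(z):
    z = z - z.max(axis=-1, keepdims=True)  # subtract the max for stability
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

X_demo, y_demo = make_classification(n_samples=200, n_classes=3,
                                     n_informative=4, random_state=0)
clf_demo = SGDClassifier(loss="hinge", random_state=0).fit(X_demo, y_demo)

scores = clf_demo.decision_function(X_demo[:5])  # shape (5, 3): one column per class
print(softmax_demo(scores))                      # rough pseudo-probabilities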
Exemplo n.º 30
0
def result():
    if request.method == 'POST':
        path = request.files.get('myFile')

        # Reading the CSV file and converting it into a pandas data-frame
        df = pd.read_csv(path, encoding="ISO-8859-1")

        # Reading the filename under which the trained model will be saved
        filename = request.form['filename']

        # Reading the names of the feature and label as strings
        str1 = request.form['feature']
        str2 = request.form['label']

        # Assigning the feature and label variables to the respective columns
        if str1 in list(df) and str2 in list(df):
            y = df[str2]
            X = df[str1]
        else:
            return render_template('nameError.html')
        '''
        # Removing the punctuations and HTTP links in the feature text input
        x = []
        for subject in X:
            result = re.sub(r"http\S+", "", subject)
            replaced = re.sub(r'[^a-zA-Z0-9 ]+', '', result)
            x.append(replaced)
        X = pd.Series(x)
        '''
        X = X.str.lower()

        # Optional tokenization and lemmatization with spaCy (disabled below)
        """
        texts = []
        for doc in X:
            doc = nlp(doc, disable=['parser', 'ner'])
            tokens = [tok.lemma_.lower().strip() for tok in doc if tok.lemma_ != '-PRON-']
            tokens = [tok for tok in tokens if tok not in stopwords]
            tokens = ' '.join(tokens)
            texts.append(tokens)

        X = pd.Series(texts)
        """

        # Splitting the data-set into 2 parts : Training data and Test data
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.33,
                                                            shuffle=True)

        tfidfvect = TfidfVectorizer(ngram_range=(1, 1))
        X_train_tfidf = tfidfvect.fit_transform(X_train)

        # Fitting all the classification models one by one and recording their accuracies and execution times

        start = time()
        clf1 = LinearSVC()
        clf1.fit(X_train_tfidf, y_train)
        pred_SVC = clf1.predict(tfidfvect.transform(X_test))

        a1 = accuracy_score(y_test, pred_SVC)
        end = time()
        print("accuracy SVC: {} and time: {} s".format(a1, (end - start)))

        start = time()
        clf2 = LogisticRegression(n_jobs=-1,
                                  multi_class='multinomial',
                                  solver='newton-cg')
        clf2.fit(X_train_tfidf, y_train)
        pred_LR = clf2.predict(tfidfvect.transform(X_test))
        a2 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy LR: {} and time: {}".format(a2, (end - start)))

        start = time()
        clf3 = RandomForestClassifier(n_jobs=-1)

        clf3.fit(X_train_tfidf, y_train)
        pred = clf3.predict(tfidfvect.transform(X_test))
        a3 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy RFC: {} and time: {}".format(a3, (end - start)))

        start = time()
        clf4 = MultinomialNB()

        clf4.fit(X_train_tfidf, y_train)
        pred = clf4.predict(tfidfvect.transform(X_test))
        a4 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy MNB: {} and time: {}".format(a4, (end - start)))

        start = time()
        clf11 = SGDClassifier(n_jobs=-1)

        clf11.fit(X_train_tfidf, y_train)
        pred = clf11.predict(tfidfvect.transform(X_test))
        a11 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy SGDC: {} and time: {}".format(a11, (end - start)))
        start = time()
        # a second SGDClassifier run; no XGBoost model is fitted here
        clf12 = SGDClassifier(n_jobs=-1)

        clf12.fit(X_train_tfidf, y_train)
        pred = clf12.predict(tfidfvect.transform(X_test))
        a12 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy SGDC-2: {} and time: {}".format(a12, (end - start)))

        # Comparing the accuracies of all the models, then saving (dumping) the model with the highest accuracy using pickle for later use.

        acu_list = [a1, a2, a3, a4, a11, a12]
        max_list = max(acu_list)

        if max_list == a1:
            pickle.dump(clf1, open(filename + '_model', 'wb'))
        elif max_list == a2:
            pickle.dump(clf2, open(filename + '_model', 'wb'))
        elif max_list == a3:
            pickle.dump(clf3, open(filename + '_model', 'wb'))
        elif max_list == a4:
            pickle.dump(clf4, open(filename + '_model', 'wb'))
        elif max_list == a11:
            pickle.dump(clf11, open(filename + '_model', 'wb'))
        elif max_list == a12:
            pickle.dump(clf12, open(filename + '_model', 'wb'))

        pickle.dump(tfidfvect, open(filename + '_tfidfVect', 'wb'))

        return render_template("result.html",
                               ac1=a1,
                               ac2=a2,
                               ac3=a3,
                               ac4=a4,
                               ac11=a11,
                               ac12=a12)
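
# The view above persists the winning model and the fitted vectorizer as two
# separate pickle files. A sketch of how a later request could reload the
# pair for inference; the filename and sample text are placeholders:
import pickle

saved_name = 'spam'  # placeholder for whatever was posted as 'filename'
loaded_model = pickle.load(open(saved_name + '_model', 'rb'))
loaded_vect = pickle.load(open(saved_name + '_tfidfVect', 'rb'))

# new text must pass through the same fitted vectorizer before predicting
print(loaded_model.predict(loaded_vect.transform(['an example document'])))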
Exemplo n.º 31
0
from sklearn import tree
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB

# List of feature vectors X
Listx = [[188, 57, 30], [167, 32, 22], [193, 65, 29], [185, 53, 27],
         [164, 45, 22], [157, 38, 24], [179, 52, 27], [175, 68, 26],
         [167, 39, 24], [178, 62, 27], [158, 46, 26]]

# List of labels y: gender, female or male

Listy = [
    'male', 'female', 'male', 'male', 'female', 'female', 'male', 'male',
    'female', 'male', 'female'
]

# Instantiate the three classifiers: decision tree, SGD and Gaussian naive Bayes

Classifier_tree = tree.DecisionTreeClassifier()
Classifier_Sgradient = SGDClassifier()
Classifier_naive = GaussianNB()
# Training stage
Classifier_tree = Classifier_tree.fit(Listx, Listy)
Classifier_Sgradient = Classifier_Sgradient.fit(Listx, Listy)
Classifier_naive = Classifier_naive.fit(Listx, Listy)

# Test stage
Listz = [[150, 35, 21]]
Prediction_tree = Classifier_tree.predict(Listz)
Prediction_Gradient = Classifier_Sgradient.predict(Listz)
Prediction_naive = Classifier_naive.predict(Listz)

print(Prediction_tree)
print(Prediction_Gradient)
print(Prediction_naive)
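
# SGD-based models are sensitive to feature scale, so the raw measurements
# above can make SGDClassifier behave erratically. A hedged sketch of the
# usual remedy: standardize inside a pipeline so the same scaling is applied
# at predict time.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

scaled_sgd = make_pipeline(StandardScaler(), SGDClassifier())
scaled_sgd.fit(Listx, Listy)
print(scaled_sgd.predict([[150, 35, 21]]))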
Exemplo n.º 32
0
class SGD(
    IterativeComponentWithSampleWeight,
    BaseClassificationModel,
):
    def __init__(self, loss, penalty, alpha, fit_intercept, tol,
                 learning_rate, l1_ratio=0.15, epsilon=0.1,
                 eta0=0.01, power_t=0.5, average=False, random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.tol = tol
        self.learning_rate = learning_rate
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None
        self.time_limit = None
        self.start_time = time.time()

    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        from sklearn.linear_model import SGDClassifier

        # Need to fit for at least two iterations, otherwise early stopping
        # cannot work: the only way to detect convergence is to see the SGD
        # spend fewer iterations than max_iter. With max_iter == 1 it always
        # spends exactly one iteration, so nothing can be learned about
        # convergence.

        if refit:
            self.estimator = None

        if self.estimator is None:
            if isinstance(self.loss, tuple):
                nested_loss = self.loss
                self.loss = nested_loss[0]
                if self.loss == 'modified_huber':
                    self.epsilon = nested_loss[1]['epsilon']

            if isinstance(self.penalty, tuple):
                nested_penalty = self.penalty
                self.penalty = nested_penalty[0]
                if self.penalty == "elasticnet":
                    self.l1_ratio = nested_penalty[1]['l1_ratio']

            if isinstance(self.learning_rate, tuple):
                nested_learning_rate = self.learning_rate
                self.learning_rate = nested_learning_rate[0]
                if self.learning_rate == 'invscaling':
                    self.eta0 = nested_learning_rate[1]['eta0']
                    self.power_t = nested_learning_rate[1]['power_t']
                elif self.learning_rate == 'constant':
                    self.eta0 = nested_learning_rate[1]['eta0']
                self.fully_fit_ = False

            self.alpha = float(self.alpha)
            self.l1_ratio = float(self.l1_ratio) if self.l1_ratio is not None \
                else 0.15
            self.epsilon = float(self.epsilon) if self.epsilon is not None \
                else 0.1
            self.eta0 = float(self.eta0) if self.eta0 is not None else 0.01
            self.power_t = float(self.power_t) if self.power_t is not None \
                else 0.5
            self.average = check_for_bool(self.average)
            self.fit_intercept = check_for_bool(self.fit_intercept)
            self.tol = float(self.tol)

            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           max_iter=n_iter,
                                           tol=self.tol,
                                           learning_rate=self.learning_rate,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state,
                                           warm_start=True)
            self.estimator.fit(X, y, sample_weight=sample_weight)
        else:
            self.estimator.max_iter += n_iter
            self.estimator.max_iter = min(self.estimator.max_iter, 512)
            self.estimator._validate_params()
            self.estimator._partial_fit(
                X, y,
                alpha=self.estimator.alpha,
                C=1.0,
                loss=self.estimator.loss,
                learning_rate=self.estimator.learning_rate,
                max_iter=n_iter,
                sample_weight=sample_weight,
                classes=None,
                coef_init=None,
                intercept_init=None
            )

        if self.estimator.max_iter >= 512 or n_iter > self.estimator.n_iter_:
            self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'SGD Classifier',
                'name': 'Stochastic Gradient Descent Classifier',
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': False,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None, optimizer='smac'):
        if optimizer == 'smac':
            cs = ConfigurationSpace()

            loss = CategoricalHyperparameter("loss",
                                             ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
                                             default_value="log")
            penalty = CategoricalHyperparameter(
                "penalty", ["l1", "l2", "elasticnet"], default_value="l2")
            alpha = UniformFloatHyperparameter(
                "alpha", 1e-7, 1e-1, log=True, default_value=0.0001)
            l1_ratio = UniformFloatHyperparameter(
                "l1_ratio", 1e-9, 1, log=True, default_value=0.15)
            fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
            tol = UniformFloatHyperparameter("tol", 1e-5, 1e-1, log=True,
                                             default_value=1e-4)
            epsilon = UniformFloatHyperparameter(
                "epsilon", 1e-5, 1e-1, default_value=1e-4, log=True)
            learning_rate = CategoricalHyperparameter(
                "learning_rate", ["optimal", "invscaling", "constant"],
                default_value="invscaling")
            eta0 = UniformFloatHyperparameter(
                "eta0", 1e-7, 1e-1, default_value=0.01, log=True)
            power_t = UniformFloatHyperparameter("power_t", 1e-5, 1, log=True,
                                                 default_value=0.5)
            average = CategoricalHyperparameter(
                "average", ["False", "True"], default_value="False")
            cs.add_hyperparameters([loss, penalty, alpha, l1_ratio, fit_intercept,
                                    tol, epsilon, learning_rate, eta0, power_t,
                                    average])

            # TODO add passive/aggressive here, although not properly documented?
            elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
            epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")

            power_t_condition = EqualsCondition(power_t, learning_rate,
                                                "invscaling")

            # eta0 is only relevant if learning_rate!='optimal' according to code
            # https://github.com/scikit-learn/scikit-learn/blob/0.19.X/sklearn/
            # linear_model/sgd_fast.pyx#L603
            eta0_in_inv_con = InCondition(eta0, learning_rate, ["invscaling",
                                                                "constant"])
            cs.add_conditions([elasticnet, epsilon_condition, power_t_condition,
                               eta0_in_inv_con])

            return cs
        elif optimizer == 'tpe':
            eta0 = hp.loguniform('sgd_eta0', np.log(1e-7), np.log(1e-1))
            space = {
                'loss': hp.choice('sgd_loss', [
                    ("modified_huber", {'epsilon': hp.loguniform('sgd_epsilon', np.log(1e-5), np.log(1e-1))}),
                    ("hinge", {}),
                    ("log", {}),
                    ("squared_hinge", {}),
                    ("perceptron", {})]),
                'penalty': hp.choice('sgd_penalty',
                                     [("elasticnet",
                                       {'l1_ratio': hp.loguniform('sgd_l1_ratio', np.log(1e-9), np.log(1))}),
                                      ("l1", None),
                                      ("l2", None)]),
                'alpha': hp.loguniform('sgd_alpha', np.log(1e-7), np.log(1e-1)),
                'fit_intercept': hp.choice('sgd_fit_intercept', ["True"]),
                'tol': hp.loguniform('sgd_tol', np.log(1e-5), np.log(1e-1)),
                'learning_rate': hp.choice('sgd_learning_rate', [("optimal", {}),
                                                                 ("invscaling",
                                                                  {'power_t': hp.loguniform('sgd_power_t', np.log(1e-5),
                                                                                            np.log(1)),
                                                                   'eta0': eta0}),
                                                                 ("constant", {'eta0': eta0})]),

                'average': hp.choice('sgd_average', ["True", "False"])}

            init_trial = {'loss': ("log", {}),
                          'penalty': ("l2", {}),
                          'alpha': 1e-4,
                          'fit_intercept': "True",
                          'tol': 1e-4,
                          'learning_rate': ("invscaling", {'power_t': 0.5, 'eta0': 0.01}),
                          'average': "False"}

            return space
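
# In the TPE space above, conditional hyperparameters travel as
# (choice, sub-dict) tuples, which is exactly what the isinstance(..., tuple)
# blocks in iterative_fit unpack. A small sketch of that unpacking on a
# hand-written sample in the same nested format as init_trial (the values
# are made up):
config_demo = {'loss': ('modified_huber', {'epsilon': 0.01}),
               'penalty': ('elasticnet', {'l1_ratio': 0.2}),
               'alpha': 1e-4}

loss_demo, loss_args = config_demo['loss']
epsilon_demo = loss_args.get('epsilon', 0.1)        # default mirrors the class
penalty_demo, penalty_args = config_demo['penalty']
l1_ratio_demo = penalty_args.get('l1_ratio', 0.15)
print(loss_demo, epsilon_demo, penalty_demo, l1_ratio_demo, config_demo['alpha'])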
Exemplo n.º 33
0
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)

# Initialize both models
lr = LogisticRegression()
sgdc = SGDClassifier()
# Call LogisticRegression's fit method to train the model parameters
lr.fit(x_train, y_train)
# Use the trained model lr to predict on x_test; store the result in lr_y_predict
lr_y_predict = lr.predict(x_test)
# Call SGDClassifier's fit method to train the model parameters
sgdc.fit(x_train, y_train)
# Use the trained model sgdc to predict on x_test; store the result in sgdc_y_predict
sgdc_y_predict = sgdc.predict(x_test)

from sklearn.metrics import classification_report

# Use the score method to obtain each model's accuracy on the test set
print('Accuracy of LR Classifier:', lr.score(x_test, y_test))
# Use classification_report to obtain precision, recall and F1 for LogisticRegression
print(
    classification_report(y_test,
                          lr_y_predict,
                          target_names=['Benign', 'Malignant']))
print("\n")
print('Accuracy of SGD Classifier:', sgdc.score(x_test, y_test))
print(
    classification_report(y_test,
                          sgdc_y_predict,
                          target_names=['Benign', 'Malignant']))
Exemplo n.º 34
0
X = cancer.target.T
ytrue = np.copy(cancer.data).flatten()
ytrue[ytrue > 0] = 1

# label a few points
labeled_N = 4
ys = np.array([-1] * len(ytrue))  # -1 denotes an unlabeled point
random_labeled_points = random.sample(list(np.where(ytrue == 0)[0]), labeled_N // 2) + \
    random.sample(list(np.where(ytrue == 1)[0]), labeled_N // 2)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
# basemodel = WQDA() # weighted Quadratic Discriminant Analysis
basemodel = SGDClassifier(loss="log", penalty="l1")  # scikit-learn logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
print("supervised log.reg. score", basemodel.score(X, ytrue))

# fast (but naive, unsafe) self-learning framework
ssmodel = SelfLearningModel(basemodel)
ssmodel.fit(X, ys)
print("self-learning log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score (base model has to be able to take weighted samples)
ssmodel = CPLELearningModel(basemodel)
ssmodel.fit(X, ys)
print("CPLE semi-supervised log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score, WQDA model
ssmodel = CPLELearningModel(WQDA(), predict_from_probabilities=True)  # weighted Quadratic Discriminant Analysis
ssmodel.fit(X, ys)
print("CPLE semi-supervised WQDA score", ssmodel.score(X, ytrue))
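
# scikit-learn now ships its own wrapper around the same -1-means-unlabeled
# convention. A rough stand-in for SelfLearningModel using
# sklearn.semi_supervised.SelfTrainingClassifier (available since 0.24);
# the dataset choice is an assumption, and loss="log_loss" assumes
# scikit-learn >= 1.1 (older versions call it "log"):
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import SGDClassifier
from sklearn.semi_supervised import SelfTrainingClassifier

X_st, y_st = load_breast_cancer(return_X_y=True)
ys_st = np.full(len(y_st), -1)  # everything starts unlabeled
labeled_idx = np.random.RandomState(0).choice(len(y_st), 20, replace=False)
ys_st[labeled_idx] = y_st[labeled_idx]

# the base estimator must expose predict_proba, hence a log-loss SGD
base = SGDClassifier(loss="log_loss", penalty="l1", random_state=0)
st_model = SelfTrainingClassifier(base).fit(X_st, ys_st)
print("self-training score", st_model.score(X_st, y_st))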
Exemplo n.º 35
0
def train_model(texts, points, num_classes, model_dir, text_encoding='utf-8'):
    """Given an iterable of (text, lat, lon) items, cluster the points into num_classes
    clusters and use the cluster ids as labels, then extract unigram features, train a
    classifier and save it in model_dir for future use.

    Args:
    texts -- an iterable (e.g. a list) of texts, e.g. ['this is the first text', 'this is the second text'].
    points -- an iterable (e.g. a list) of (lat, lon) tuples where the coordinates are floats, e.g. [(1.2343, -10.239834), (5.634534, -12.47563)].
    num_classes -- the number of desired clusters/labels/classes of the model.
    model_dir -- the directory within models/ where the model will be saved.
    """

    if os.path.exists(model_dir):
        logging.error("Model directory " + model_dir +
                      " already exists, please try another address.")
        sys.exit(-1)
    else:
        os.mkdir(model_dir)

    from sklearn.cluster import KMeans
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import SGDClassifier

    kmeans = KMeans(n_clusters=num_classes, random_state=0)
    points_arr = numpy.array(points)
    kmeans.fit_transform(points_arr)
    cluster_centers = kmeans.cluster_centers_
    sample_clusters = kmeans.labels_
    label_coordinate = {}
    for i in range(cluster_centers.shape[0]):
        lat, lon = cluster_centers[i, 0], cluster_centers[i, 1]
        label_coordinate[i] = (lat, lon)

    logging.info('extracting features from text...')
    vectorizer = TfidfVectorizer(encoding=text_encoding,
                                 stop_words='english',
                                 ngram_range=(1, 1),
                                 max_df=0.5,
                                 min_df=0,
                                 binary=True,
                                 norm='l2',
                                 use_idf=True,
                                 smooth_idf=True,
                                 sublinear_tf=True)
    X_train = vectorizer.fit_transform(texts)
    Y_train = sample_clusters
    vectorizer.stop_words_ = None
    logging.info(
        'the number of samples is %d and the number of features is %d' %
        (X_train.shape[0], X_train.shape[1]))

    logging.info('training the classifier...')
    logging.warning(
        'Note that alpha (regularisation strength) should be tuned based on the performance on validation data.'
    )
    clf = SGDClassifier(loss='log',
                        penalty='elasticnet',
                        alpha=5e-5,
                        l1_ratio=0.9,
                        fit_intercept=True,
                        max_iter=5,
                        n_jobs=2,
                        random_state=0,
                        learning_rate="optimal")
    clf.fit(X_train, Y_train)
    clf.coef_ = csr_matrix(clf.coef_)

    logging.info(
        'retrieving address of the given points using geopy (requires internet access).'
    )
    coordinate_address = retrieve_location_from_coordinates(points)

    logging.info(
        'dumping the vectorizer, clf (trained model), label_coordinates and coordinate_locations into pickle files in '
        + model_dir)
    dump_model(clf, vectorizer, coordinate_address, label_coordinate,
               model_dir)
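
# A hedged usage sketch for train_model above, with made-up texts and
# coordinates; the geocoding step inside it needs network access when run
# for real, and the model directory path is a placeholder:
texts_demo = ['rainy day in seattle', 'surfing in sydney', 'coffee in seattle']
points_demo = [(47.6062, -122.3321), (-33.8688, 151.2093), (47.6097, -122.3331)]

# cluster the coordinates into 2 regions and train a text -> region classifier
train_model(texts_demo, points_demo, 2, 'models/demo_geo_model')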
Exemplo n.º 36
0
def result():
    if request.method == 'POST':
        path = request.files.get('myFile')

        df = pd.read_csv(path, encoding="ISO-8859-1")

        filename = request.form['filename']

        str1 = request.form['feature']
        str2 = request.form['label']

        if str1 in list(df) and str2 in list(df):
            y = df[str2]
            X = df[str1]
        else:
            return render_template('nameError.html')

        x = []
        for subject in X:
            result = re.sub(r"http\S+", "", subject)
            replaced = re.sub(r'[^a-zA-Z0-9 ]+', '', result)
            x.append(replaced)
        X = pd.Series(x)

        X = X.str.lower()
        """
        texts = []
        for doc in X:
            doc = nlp(doc, disable=['parser', 'ner'])
            tokens = [tok.lemma_.lower().strip() for tok in doc if tok.lemma_ != '-PRON-']
            tokens = [tok for tok in tokens if tok not in stopwords]
            tokens = ' '.join(tokens)
            texts.append(tokens)

        X = pd.Series(texts)
        """
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.33)

        tfidfvect = TfidfVectorizer(ngram_range=(1, 1))
        X_train_tfidf = tfidfvect.fit_transform(X_train)

        start = time()
        clf1 = LinearSVC()
        clf1.fit(X_train_tfidf, y_train)
        pred_SVC = clf1.predict(tfidfvect.transform(X_test))

        a1 = accuracy_score(y_test, pred_SVC)
        end = time()
        print("accuracy SVC: {} and time: {} s".format(a1, (end - start)))

        start = time()
        clf2 = LogisticRegression(n_jobs=-1,
                                  multi_class='multinomial',
                                  solver='newton-cg')
        clf2.fit(X_train_tfidf, y_train)
        pred_LR = clf2.predict(tfidfvect.transform(X_test))
        a2 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy LR: {} and time: {}".format(a2, (end - start)))

        start = time()
        clf3 = RandomForestClassifier(n_jobs=-1)

        clf3.fit(X_train_tfidf, y_train)
        pred = clf3.predict(tfidfvect.transform(X_test))
        a3 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy RFC: {} and time: {}".format(a3, (end - start)))

        start = time()
        clf4 = MultinomialNB()

        clf4.fit(X_train_tfidf, y_train)
        pred = clf4.predict(tfidfvect.transform(X_test))
        a4 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy MNB: {} and time: {}".format(a4, (end - start)))

        start = time()
        clf5 = GaussianNB()

        clf5.fit(X_train_tfidf.toarray(), y_train)
        pred = clf5.predict(tfidfvect.transform(X_test).toarray())
        a5 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy GNB: {} and time: {}".format(a5, (end - start)))

        start = time()
        clf6 = LogisticRegressionCV(n_jobs=-1)
        clf6.fit(X_train_tfidf, y_train)
        pred_LR = clf6.predict(tfidfvect.transform(X_test))
        a6 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy LRCV: {} and time: {}".format(a6, (end - start)))

        start = time()
        clf7 = AdaBoostClassifier()
        clf7.fit(X_train_tfidf, y_train)
        pred_LR = clf7.predict(tfidfvect.transform(X_test))
        a7 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy ABC: {} and time: {}".format(a7, (end - start)))

        start = time()
        clf8 = BernoulliNB()

        clf8.fit(X_train_tfidf.toarray(), y_train)
        pred = clf8.predict(tfidfvect.transform(X_test).toarray())
        a8 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy BNB: {} and time: {}".format(a8, (end - start)))

        start = time()
        clf9 = Perceptron(n_jobs=-1)

        clf9.fit(X_train_tfidf.toarray(), y_train)
        pred = clf9.predict(tfidfvect.transform(X_test).toarray())
        a9 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy Per: {} and time: {}".format(a9, (end - start)))
        start = time()
        clf10 = RidgeClassifierCV()

        clf10.fit(X_train_tfidf.toarray(), y_train)
        pred = clf10.predict(tfidfvect.transform(X_test).toarray())
        a10 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy RidCV: {} and time: {}".format(a10, (end - start)))

        start = time()
        clf11 = SGDClassifier(n_jobs=-1)

        clf11.fit(X_train_tfidf.toarray(), y_train)
        pred = clf11.predict(tfidfvect.transform(X_test).toarray())
        a11 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy SGDC: {} and time: {}".format(a11, (end - start)))
        start = time()
        # a second SGDClassifier run; no XGBoost model is fitted here
        clf12 = SGDClassifier(n_jobs=-1)

        clf12.fit(X_train_tfidf.toarray(), y_train)
        pred = clf12.predict(tfidfvect.transform(X_test).toarray())
        a12 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy SGDC-2: {} and time: {}".format(a12, (end - start)))

        acu_list = [a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12]
        max_list = max(acu_list)

        if max_list == a1:
            pickle.dump(clf1, open(filename + '_model', 'wb'))
        elif max_list == a2:
            pickle.dump(clf2, open(filename + '_model', 'wb'))
        elif max_list == a3:
            pickle.dump(clf3, open(filename + '_model', 'wb'))
        elif max_list == a4:
            pickle.dump(clf4, open(filename + '_model', 'wb'))
        elif max_list == a5:
            pickle.dump(clf5, open(filename + '_model', 'wb'))
        elif max_list == a6:
            pickle.dump(clf6, open(filename + '_model', 'wb'))
        elif max_list == a7:
            pickle.dump(clf7, open(filename + '_model', 'wb'))
        elif max_list == a8:
            pickle.dump(clf8, open(filename + '_model', 'wb'))
        elif max_list == a9:
            pickle.dump(clf9, open(filename + '_model', 'wb'))
        elif max_list == a10:
            pickle.dump(clf10, open(filename + '_model', 'wb'))
        elif max_list == a11:
            pickle.dump(clf11, open(filename + '_model', 'wb'))
        elif max_list == a12:
            pickle.dump(clf12, open(filename + '_model', 'wb'))

        pickle.dump(tfidfvect, open(filename + '_tfidfVect', 'wb'))

        return render_template("result.html",
                               ac1=a1,
                               ac2=a2,
                               ac3=a3,
                               ac4=a4,
                               ac5=a5,
                               ac6=a6,
                               ac7=a7,
                               ac8=a8,
                               ac9=a9,
                               ac10=a10,
                               ac11=a11,
                               ac12=a12)
Exemplo n.º 37
0
plt.scatter(features[:,1], features[:,2], c = labels)
plt.plot(x1, x_2)



# sklearn implementation
from sklearn.linear_model import Perceptron
from sklearn.linear_model.stochastic_gradient import SGDClassifier
from sklearn.metrics import accuracy_score


# Fitting an sklearn Perceptron and SGDClassifier with perceptron loss function (these should be identical)
clf = Perceptron(random_state=None, eta0=0.1, shuffle=False, penalty=None,
                 class_weight=None, fit_intercept=False)
clf2 = SGDClassifier(loss="perceptron", eta0=0.1, learning_rate="constant",
                     penalty=None, random_state=None, shuffle=False,
                     fit_intercept=False, warm_start=False, average=False,
                     max_iter=1000)
clf.fit(x_train, y_train)
clf2.fit(x_train, y_train)

y_predict = clf.predict(x_test)
y_preSGD = clf2.predict(x_test)

print "sklearn Perceptron accuracy:"
print accuracy_score(y_test, y_predict)

print "sklearn SGDClassifier accuracy:"
print accuracy_score(y_test, y_preSGD)

print "my perceptron accuracy:"
print accuracy_score(y_test, y_pred)
print "\n"
#print clf.coef_
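
# Because both models above run the same perceptron update with the same
# constant step size, no shuffling and no regularisation, their learned
# weights should agree up to floating-point noise; a quick check, assuming
# the fits above have run:
import numpy as np
print(np.allclose(clf.coef_, clf2.coef_))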