from sklearn.linear_model import SGDClassifier


def classifyTestSamples(trainingFeatures, trainingCategories, testFeatures):
    clf = SGDClassifier()
    clf.fit(trainingFeatures, trainingCategories)
    predictedCategories = clf.predict(testFeatures)
    return predictedCategories
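A minimal usage sketch for the helper above, using synthetic data from scikit-learn's make_classification; the variable names below are illustrative and not part of the original source:

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# generate a small synthetic classification problem and split it
features, categories = make_classification(n_samples=200, n_features=5,
                                           random_state=0)
train_X, test_X, train_y, test_y = train_test_split(features, categories,
                                                    test_size=0.25,
                                                    random_state=0)
predicted = classifyTestSamples(train_X, train_y, test_X)
print(predicted[:10])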
class MachineLearning:
    def __init__(self, Master_DF):
        self.Data_Frame = Master_DF

    def Encoder(self, df):
        encoder = LabelEncoder()
        print("Fitting")
        encoder.fit(df)
        return encoder.transform(df)

    def Perceptron_PreProcessing(self, x, y):
        # factorize() already yields integer codes, so the LabelEncoder pass
        # only re-encodes those codes
        X = self.Encoder(self.Data_Frame[x].factorize()[0])
        Y = self.Encoder(self.Data_Frame[y].factorize()[0])
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.3, random_state=0)
        sc = StandardScaler()
        sc.fit(X_train)
        X_train_std = sc.transform(X_train)
        X_test_std = sc.transform(X_test)
        return X_train_std, Y_train, X_test_std, Y_test

    def ppn_model(self, n_iter, eta0, random_state):
        (self.X_train_std_uri, self.Y_train_category,
         self.X_test_std_uri, self.Y_test_category) = \
            self.Perceptron_PreProcessing('uri', 'category')
        # n_iter was renamed max_iter in scikit-learn >= 0.19
        ppn = Perceptron(max_iter=n_iter, eta0=eta0, random_state=random_state)
        ppn.fit(self.X_train_std_uri.reshape(-1, 1), self.Y_train_category)
        y_pred = ppn.predict(self.X_test_std_uri.reshape(-1, 1))
        return ((self.Y_test_category != y_pred).sum(),
                accuracy_score(self.Y_test_category, y_pred),
                y_pred)

    def lr_model(self, c, random_state):
        lr = LogisticRegression(C=c, random_state=random_state)
        lr.fit(self.X_train_std_uri.reshape(-1, 1), self.Y_train_category)
        y_pred = lr.predict(self.X_test_std_uri.reshape(-1, 1))
        return ((self.Y_test_category != y_pred).sum(),
                accuracy_score(self.Y_test_category, y_pred),
                y_pred)

    def ada_sd(self, n_iter, eta, random_state):
        # SGDClassifier has no "eta" parameter; eta0 is the closest match and
        # only takes effect with a 'constant' or 'invscaling' schedule
        self.ada = SGDClassifier(max_iter=n_iter, eta0=eta,
                                 random_state=random_state)
        self.ada.fit(self.X_train_std_uri.reshape(-1, 1),
                     self.Y_train_category)
        self.y_pred = self.ada.predict(self.X_test_std_uri.reshape(-1, 1))
        return ((self.Y_test_category != self.y_pred).sum(),
                accuracy_score(self.Y_test_category, self.y_pred),
                self.y_pred)
from sklearn import metrics
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler

DEBUG = False  # assumed module-level flag


def withoutPipeline(x_train, x_test, y_train, y_test):
    # for each x, (x - mean(all x)) / std. dev. of x;
    # this step computes the mean and std. dev.
    scaler = StandardScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_test = scaler.transform(x_test)

    clfer = SGDClassifier()
    # this will try to separate the three classes based on the two features
    # we gave it. Hence, we will get back three lines, i.e. three sets of
    # coefficients and three intercepts
    clfer.fit(x_train, y_train)

    if DEBUG:
        # print(clfer.coef_)
        # print(clfer.intercept_)
        # print(clfer.predict(scaler.transform([[4.7, 3.1]])))
        # print(clfer.decision_function(scaler.transform([[4.7, 3.1]])))
        # the algorithm evaluates the distance from all three lines and picks
        # the largest one (in this case [0])
        pass

    # validate results
    y_predict_train = clfer.predict(x_train)
    print("% Correct results on training set:")
    print(metrics.accuracy_score(y_train, y_predict_train))
    y_predict_test = clfer.predict(x_test)
    print("\n% Correct results on testing set:")
    print(metrics.accuracy_score(y_test, y_predict_test))

    # Understanding the classification report:
    # Precision: TP/(TP + FP) - ideal 1 - all instances reported as x were x.
    # In other words, there were no instances reported as x that were NOT x.
    # Recall: TP/(TP + FN) - ideal 1 - all instances OF x were reported as x.
    # Although accuracy does not appear in the report, it is important to
    # know what it means:
    # Accuracy: (TP + TN) / (TP + TN + FP + FN)
    print("\nClassification Report:")
    print(metrics.classification_report(y_test, y_predict_test))

    # Understanding the confusion matrix:
    # entry (i, j) counts how many samples of class i were predicted as j;
    # ideally a diagonal (identity-like) matrix
    print("Confusion Matrix:")
    print(metrics.confusion_matrix(y_test, y_predict_test))
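Since the function is named withoutPipeline, a pipeline-based counterpart may be useful for contrast. This is a minimal sketch, not part of the original source; it reuses the imports above and scikit-learn's public make_pipeline API:

from sklearn.pipeline import make_pipeline


def withPipeline(x_train, x_test, y_train, y_test):
    # chaining the scaler and the classifier guarantees the same scaling is
    # applied at both fit and predict time
    model = make_pipeline(StandardScaler(), SGDClassifier())
    model.fit(x_train, y_train)
    print("% Correct results on testing set:")
    print(metrics.accuracy_score(y_test, model.predict(x_test)))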
# assumed alias: the wrapped estimator is scikit-learn's SGDClassifier
from sklearn.linear_model import SGDClassifier as SKLModel


class SGDClassifierImpl:
    def __init__(self, loss='hinge', penalty='l2', alpha=0.0001,
                 l1_ratio=0.15, fit_intercept=True, max_iter=None, tol=None,
                 shuffle=True, verbose=0, epsilon=0.1, n_jobs=None,
                 random_state=None, learning_rate='optimal', eta0=0.0,
                 power_t=0.5, early_stopping=False, validation_fraction=0.1,
                 n_iter_no_change=5, class_weight='balanced',
                 warm_start=False, average=False):
        self._hyperparams = {
            'loss': loss,
            'penalty': penalty,
            'alpha': alpha,
            'l1_ratio': l1_ratio,
            'fit_intercept': fit_intercept,
            'max_iter': max_iter,
            'tol': tol,
            'shuffle': shuffle,
            'verbose': verbose,
            'epsilon': epsilon,
            'n_jobs': n_jobs,
            'random_state': random_state,
            'learning_rate': learning_rate,
            'eta0': eta0,
            'power_t': power_t,
            'early_stopping': early_stopping,
            'validation_fraction': validation_fraction,
            'n_iter_no_change': n_iter_no_change,
            'class_weight': class_weight,
            'warm_start': warm_start,
            'average': average}
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)

    def partial_fit(self, X, y=None, classes=None):
        if not hasattr(self, "_wrapped_model"):
            self._wrapped_model = SKLModel(**self._hyperparams)
        self._wrapped_model.partial_fit(X, y, classes=classes)
        return self
import pickle

from sklearn.linear_model import SGDClassifier


def stochastic_descent(xtrain, ytrain, xtest):
    clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=10,
                        random_state=42, alpha=1e-3, tol=None)
    print("SGD Fitting")
    clf.fit(xtrain, ytrain)
    # Saving the model with pickle
    with open(base_dir + "Model", 'wb') as f:
        pickle.dump(clf, f)
    print("SGD Predicting")
    ytest = clf.predict(xtest)
    return ytest
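A matching sketch (not from the original source) for loading the pickled model back and reusing it; base_dir is assumed to be the same directory variable used above:

def load_and_predict(xnew):
    # unpickle the classifier saved by stochastic_descent, then predict
    with open(base_dir + "Model", 'rb') as f:
        clf = pickle.load(f)
    return clf.predict(xnew)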
# the head of this snippet was truncated in the source; an SGD classifier
# constructed with n_iter=10 is assumed here
estimator = SGDClassifier(n_iter=10)

trainloss = []
testloss = []
for i, chunk in enumerate(pd.read_csv("cancer2.csv", chunksize=chunksize,
                                      header=None, iterator=True)):
    X = chunk.iloc[:, :-1]
    y = chunk.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                        random_state=42)
    # note: the model is updated on the full chunk, so the "test" split below
    # is not truly held out
    estimator.partial_fit(X, y, classes=np.unique(y))
    # despite the original "R2" names, these are mean squared errors
    train_mse = mean_squared_error(y_train, estimator.predict(X_train))
    test_mse = mean_squared_error(y_test, estimator.predict(X_test))
    trainloss.append(train_mse)
    testloss.append(test_mse)
    print("trainloss:{:.4f},testloss:{:.4f} ".format(trainloss[-1],
                                                     testloss[-1]))
    if i > 3:
        break

import matplotlib.pyplot as plt

plt.plot(trainloss)
plt.plot(testloss)
plt.legend(('train', 'test'))
plt.show()
# the left-hand side and first argument of this call were truncated in the
# source; the split below is the assumed reconstruction from surrounding usage
X_train, X_test, y_train, y_test = train_test_split(data[column_names[1:10]],
                                                    data[column_names[10]],
                                                    test_size=0.25,
                                                    random_state=33)

# Check the number and class distribution of the training samples
print(y_train.value_counts())
# Check the number and class distribution of the test samples
print(y_test.value_counts())

# Use linear classification models for the benign/malignant breast-cancer
# tumor prediction task.
# Standardize the data so every feature dimension has zero mean and unit
# variance; this keeps features with large values from dominating the result
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Initialize LogisticRegression and SGDClassifier
lr = LogisticRegression()
sgdc = SGDClassifier()

# Call fit on LogisticRegression to train the model parameters
lr.fit(X_train, y_train)
# Use the trained model lr to predict X_test; store the result in lr_y_predict
lr_y_predict = lr.predict(X_test)

# Call fit on SGDClassifier to train the model parameters
sgdc.fit(X_train, y_train)
# Use the trained model sgdc to predict X_test; store the result in
# sgdc_y_predict
sgdc_y_predict = sgdc.predict(X_test)

# Performance analysis of the linear classifiers on the benign/malignant task.
# Use LogisticRegression's built-in score function for test-set accuracy
print('Accuracy of the LR classifier:', lr.score(X_test, y_test))
# Use classification_report for the other three LogisticRegression metrics
print(classification_report(y_test, lr_y_predict,
                            target_names=['Benign', 'Malignant']))
class SGD(
    IterativeComponentWithSampleWeight,
    AutoSklearnClassificationAlgorithm,
):
    def __init__(self, loss, penalty, alpha, fit_intercept, tol,
                 learning_rate, l1_ratio=0.15, epsilon=0.1,
                 eta0=0.01, power_t=0.5, average=False, random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.tol = tol
        self.learning_rate = learning_rate
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None

    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        from sklearn.linear_model.stochastic_gradient import SGDClassifier

        # Need to fit at least two iterations, otherwise early stopping will
        # not work because we cannot determine whether the algorithm actually
        # converged. The only way of finding this out is if the sgd spends
        # fewer iterations than max_iter. If max_iter == 1, it has to spend at
        # least one iteration and will always spend at least one iteration, so
        # we cannot know about convergence.

        if refit:
            self.estimator = None

        if self.estimator is None:
            self.fully_fit_ = False

            self.alpha = float(self.alpha)
            self.l1_ratio = float(self.l1_ratio) if self.l1_ratio is not None \
                else 0.15
            self.epsilon = float(self.epsilon) if self.epsilon is not None \
                else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(self.power_t) if self.power_t is not None \
                else 0.5
            self.average = check_for_bool(self.average)
            self.fit_intercept = check_for_bool(self.fit_intercept)
            self.tol = float(self.tol)

            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           max_iter=n_iter,
                                           tol=self.tol,
                                           learning_rate=self.learning_rate,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state,
                                           warm_start=True)
            self.estimator.fit(X, y, sample_weight=sample_weight)
        else:
            self.estimator.max_iter += n_iter
            self.estimator.max_iter = min(self.estimator.max_iter, 512)
            self.estimator._validate_params()
            self.estimator._partial_fit(
                X, y,
                alpha=self.estimator.alpha,
                C=1.0,
                loss=self.estimator.loss,
                learning_rate=self.estimator.learning_rate,
                max_iter=n_iter,
                sample_weight=sample_weight,
                classes=None,
                coef_init=None,
                intercept_init=None
            )

        if self.estimator._max_iter >= 512 or n_iter > self.estimator.n_iter_:
            self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'SGD Classifier',
                'name': 'Stochastic Gradient Descent Classifier',
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': False,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()

        loss = CategoricalHyperparameter("loss",
                                         ["hinge", "log", "modified_huber",
                                          "squared_hinge", "perceptron"],
                                         default_value="log")
        penalty = CategoricalHyperparameter(
            "penalty", ["l1", "l2", "elasticnet"], default_value="l2")
        alpha = UniformFloatHyperparameter(
            "alpha", 1e-7, 1e-1, log=True, default_value=0.0001)
        l1_ratio = UniformFloatHyperparameter(
            "l1_ratio", 1e-9, 1, log=True, default_value=0.15)
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        tol = UniformFloatHyperparameter("tol", 1e-5, 1e-1, log=True,
                                         default_value=1e-4)
        epsilon = UniformFloatHyperparameter(
            "epsilon", 1e-5, 1e-1, default_value=1e-4, log=True)
        learning_rate = CategoricalHyperparameter(
            "learning_rate", ["optimal", "invscaling", "constant"],
            default_value="invscaling")
        eta0 = UniformFloatHyperparameter(
            "eta0", 1e-7, 1e-1, default_value=0.01, log=True)
        power_t = UniformFloatHyperparameter("power_t", 1e-5, 1,
                                             default_value=0.5)
        average = CategoricalHyperparameter(
            "average", ["False", "True"], default_value="False")

        cs.add_hyperparameters([loss, penalty, alpha, l1_ratio, fit_intercept,
                                tol, epsilon, learning_rate, eta0, power_t,
                                average])

        # TODO add passive/aggressive here, although not properly documented?
        elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
        epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")
        power_t_condition = EqualsCondition(power_t, learning_rate,
                                            "invscaling")

        # eta0 is only relevant if learning_rate!='optimal' according to code
        # https://github.com/scikit-learn/scikit-learn/blob/0.19.X/sklearn/
        # linear_model/sgd_fast.pyx#L603
        eta0_in_inv_con = InCondition(eta0, learning_rate,
                                      ["invscaling", "constant"])
        cs.add_conditions([elasticnet, epsilon_condition, power_t_condition,
                           eta0_in_inv_con])

        return cs
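A short usage sketch for the search space above; it assumes the ConfigSpace package (whose hyperparameter classes are already imported by the snippet) and that the SGD class is in scope. Drawing a configuration shows how the conditions work: hyperparameters whose conditions are not met are simply absent from the dictionary.

# sample one random configuration from the space defined above
cs = SGD.get_hyperparameter_search_space()
config = cs.sample_configuration()
print(config.get_dictionary())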
"""
https://blog.csdn.net/quiet_girl/article/details/72517053
"""
ss = StandardScaler()
X_train = ss.fit_transform(X_train)  # fit to the data, then transform it into standardized form
X_test = ss.transform(X_test)  # perform standardization by centering and scaling

# Initialize the Stochastic Gradient Descent Classifier & Logistic Regression
lr = LogisticRegression()
sgdc = SGDClassifier()

lr.fit(X_train, y_train)  # train the LR classifier
lr_y_predict = lr.predict(X_test)  # predict on X_test

sgdc.fit(X_train, y_train)  # train the stochastic gradient descent classifier
sgdc_y_predict = sgdc.predict(X_test)  # predict on X_test

# Performance analysis
print('Accuracy of LR Classifier:', lr.score(X_test, y_test))
print(classification_report(y_test, lr_y_predict,
                            target_names=['Benign', 'Malignant']))

print('Accuracy of SGD Classifier:', sgdc.score(X_test, y_test))
print(classification_report(y_test, sgdc_y_predict,
                            target_names=['Benign', 'Malignant']))

# sklearn classification reference: https://blog.csdn.net/u012526003/article/details/79054012
ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)

# Initialization
lr = LogisticRegression()
sgdc = SGDClassifier()

# Call fit on LogisticRegression to train the model parameters
lr.fit(x_train, y_train)
# Use the trained model lr to predict x_test; store the result in lr_y_predict
lr_y_predict = lr.predict(x_test)

# Call fit on SGDClassifier to train the model parameters
sgdc.fit(x_train, y_train)
# Use the trained model sgdc to predict x_test; store the result in
# sgdc_y_predict
sgdc_y_predict = sgdc.predict(x_test)

from sklearn.metrics import classification_report

# Use the score function to get the model's accuracy on the test set
print('Accuracy of LR Classifier:', lr.score(x_test, y_test))
# Use classification_report for the other three LogisticRegression metrics
print(classification_report(y_test, lr_y_predict,
                            target_names=['Benign', 'Malignant']))
print("\n")
print('Accuracy of SGD Classifier:', sgdc.score(x_test, y_test))
# the closing arguments of this call were truncated in the source;
# target_names mirrors the LR report above
print(classification_report(y_test, sgdc_y_predict,
                            target_names=['Benign', 'Malignant']))
class SGD(AutoSklearnClassificationAlgorithm):
    def __init__(self, loss, penalty, alpha, fit_intercept, n_iter,
                 learning_rate, l1_ratio=0.15, epsilon=0.1,
                 eta0=0.01, power_t=0.5, average=False, random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None

    def fit(self, X, y, sample_weight=None):
        self.iterative_fit(X, y, n_iter=1, sample_weight=sample_weight,
                           refit=True)
        while not self.configuration_fully_fitted():
            self.iterative_fit(X, y, n_iter=1, sample_weight=sample_weight)
        return self

    def iterative_fit(self, X, y, n_iter=1, refit=False, sample_weight=None):
        from sklearn.linear_model.stochastic_gradient import SGDClassifier

        if refit:
            self.estimator = None

        if self.estimator is None:
            self.alpha = float(self.alpha)
            self.fit_intercept = self.fit_intercept == 'True'
            self.n_iter = int(self.n_iter)
            self.l1_ratio = float(
                self.l1_ratio) if self.l1_ratio is not None else 0.15
            self.epsilon = float(
                self.epsilon) if self.epsilon is not None else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(
                self.power_t) if self.power_t is not None else 0.25
            self.average = self.average == 'True'
            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           n_iter=n_iter,
                                           learning_rate=self.learning_rate,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state)
        else:
            self.estimator.n_iter += n_iter

        self.estimator.partial_fit(X, y, classes=np.unique(y),
                                   sample_weight=sample_weight)

        if self.estimator.n_iter >= self.n_iter:
            self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            'shortname': 'SGD Classifier',
            'name': 'Stochastic Gradient Descent Classifier',
            'handles_regression': False,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': False,
            'is_deterministic': True,
            'input': (DENSE, SPARSE, UNSIGNED_DATA),
            'output': (PREDICTIONS, )
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()

        loss = CategoricalHyperparameter(
            "loss",
            ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
            default="log")
        penalty = CategoricalHyperparameter("penalty",
                                            ["l1", "l2", "elasticnet"],
                                            default="l2")
        alpha = UniformFloatHyperparameter("alpha", 10e-7, 1e-1, log=True,
                                           default=0.0001)
        l1_ratio = UniformFloatHyperparameter("l1_ratio", 1e-9, 1, log=True,
                                              default=0.15)
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        n_iter = UniformIntegerHyperparameter("n_iter", 5, 1000, log=True,
                                              default=20)
        epsilon = UniformFloatHyperparameter("epsilon", 1e-5, 1e-1,
                                             default=1e-4, log=True)
        learning_rate = CategoricalHyperparameter(
            "learning_rate", ["optimal", "invscaling", "constant"],
            default="optimal")
        eta0 = UniformFloatHyperparameter("eta0", 10**-7, 0.1, default=0.01)
        power_t = UniformFloatHyperparameter("power_t", 1e-5, 1, default=0.25)
        average = CategoricalHyperparameter("average", ["False", "True"],
                                            default="False")

        cs.add_hyperparameters([
            loss, penalty, alpha, l1_ratio, fit_intercept, n_iter, epsilon,
            learning_rate, eta0, power_t, average
        ])

        # TODO add passive/aggressive here, although not properly documented?
        elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
        epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")
        # eta0 seems to be always active according to the source code; when
        # learning_rate is set to optimal, eta0 is the starting value:
        # https://github.com/scikit-learn/scikit-learn/blob/0.15.X/sklearn/linear_model/sgd_fast.pyx
        #eta0_and_inv = EqualsCondition(eta0, learning_rate, "invscaling")
        #eta0_and_constant = EqualsCondition(eta0, learning_rate, "constant")
        #eta0_condition = OrConjunction(eta0_and_inv, eta0_and_constant)
        power_t_condition = EqualsCondition(power_t, learning_rate,
                                            "invscaling")

        cs.add_conditions([elasticnet, epsilon_condition, power_t_condition])

        return cs
from sklearn import svm, tree
from sklearn.linear_model import SGDClassifier

# the first rows of X were truncated in the source; the five leading rows
# below are an assumed reconstruction so that X matches the eleven labels in Y
X = [[181, 80, 44], [177, 70, 43], [160, 60, 38], [154, 54, 37],
     [166, 65, 40], [190, 90, 47], [175, 64, 39], [177, 70, 40],
     [159, 55, 37], [171, 75, 42], [181, 85, 43]]
# Corresponding gender tags
Y = ['male', 'female', 'female', 'female', 'male', 'male',
     'male', 'female', 'male', 'female', 'male']

# Decision Tree classifier - takes in the input data to predict whether male
# or female
classifier = tree.DecisionTreeClassifier()
classifier = classifier.fit(X, Y)

# Support Vector Machine classifier
classifier1 = svm.SVC()
classifier1 = classifier1.fit(X, Y)

# Stochastic Gradient Descent
clf = SGDClassifier()
clf = clf.fit(X, Y)

# Prediction step
# prediction for decision trees
prediction = classifier.predict([[172, 75, 35]])
print(prediction)

# prediction for Support Vector Machines
prediction1 = classifier1.predict([[177, 70, 43]])
print(prediction1)

# prediction for Stochastic Gradient Descent
pred = clf.predict([[172, 75, 35]])
print(pred)
# body of the mini-batch training loop (the enclosing for-loop header is not
# part of this excerpt)
X_batch_kernel_approx, y_batch_onehot = encode(X_batch, y_batch,
                                               one_hot_encoder,
                                               column_transformer,
                                               rbf_sampler)

# make one pass of stochastic gradient descent over the batch.
sgd_classifier.partial_fit(X_batch_kernel_approx, y_batch, classes=[0, 1])

# print train/test average-precision metrics every 5 batches
if (batch_no % 5) == 0:
    message = "batch {:>4} ".format(batch_no)
    for origin, X, y_true_onehot in zip(
            ('train', 'val'),
            (X_batch_kernel_approx, X_test_kernel_approx),
            (y_batch_onehot, y_true_test_onehot)):
        y_pred = sgd_classifier.predict(X)

        # preprocess the labels and predictions correctly to match
        # average_precision_score expectations
        y_pred_onehot = one_hot_encoder.transform(y_pred.reshape(-1, 1))

        score = average_precision_score(y_true_onehot, y_pred_onehot)
        message += "{} precision: {:.4f} ".format(origin, score)
        if origin == 'val':
            test_scores_rbf.append(score)
            train_times_rbf.append(time.perf_counter() - t0)
            online_train_set_sizes.append((batch_no + 1) * batchsize)

    print(message)

###############################################################################
def run(keyn, nPart):
    all_classes = np.array([0, 1])
    allKeys = [l.split()[0] for l in open('keywordsAll.txt').readlines()]
    keyFreqs = [
        float(l.split()[1]) / 4205907
        for l in open('keywordsAll.txt').readlines()
    ]
    key = allKeys[keyn]
    freq = keyFreqs[keyn]
    opt = 'body+title+code'
    bv = 'True'
    nneg = 'True'
    nv = 'None'
    #testopt = 'c'
    #testopt = 'w'
    #testopt = 'l2'
    testopt = 'l1'
    if testopt == 'c':
        cls = SGDClassifier(loss='hinge', learning_rate="constant",
                            alpha=1e-6, eta0=1e-2, penalty='l2')
    elif testopt == 'w':
        cls = SGDClassifier(class_weight={1: 1.0 / freq / 8.0, 0: 1})
    elif testopt == 'l2':
        cls = SGDClassifier(loss='log', alpha=1e-5, penalty='l2')
    elif testopt == 'l1':
        cls = SGDClassifier(loss='log', alpha=1e-5, penalty='l1')

    outputName = ('key_' + str(keyn) + '_SGDtune_' + opt + '_partialfit_' +
                  testopt + '.txt')
    pklName = 'SGD_key_' + str(keyn) + '_' + testopt + '.pkl'
    n0, ntrain = resumeJob(outputName, pklName)

    body_test, y_test = getTestSet(10, key, opt, testSize=0.2, seed=123)
    tot_pos = sum(y_test)
    # note: non_negative was removed from HashingVectorizer in newer
    # scikit-learn releases (see alternate_sign)
    vectorizer = HashingVectorizer(decode_error='ignore',
                                   n_features=2**20,
                                   token_pattern=r"\b\w[\w#+.-]*(?<!\.$)",
                                   binary=str2bool(bv),
                                   norm=normOpt(nv),
                                   non_negative=str2bool(nneg))
    X_test = vectorizer.transform(body_test)
    #print 'test case:', len(y_test), 'positive', tot_pos, 'key:', key, 'X norm:', X_test.sum(), 'binary:', bv, 'norm:', nv, 'nneg:', nneg

    if n0 >= 2:
        cls = joblib.load(pklName)

    # xrange and integer division updated for Python 3
    for n in range(n0, 10):
        outfile = open(outputName, 'a')
        data = json.load(gzip.open('Train.rdup.' + str(n) + '.json.gz'))
        minibatch_size = len(data) // nPart + 1
        for i in range(nPart):
            n1 = i * minibatch_size
            n2 = (i + 1) * minibatch_size
            if i == nPart - 1:
                n2 = len(data)
            ntrain += (n2 - n1)
            body_train, y_train = getMiniBatch(data, n1, n2, key, opt)
            X_train = vectorizer.transform(body_train)
            shuffledRange = range(n2 - n1)  # unused in this loop
            for n_iter in range(5):
                X_train, y_train = shuffle(X_train, y_train)
                cls.partial_fit(X_train, y_train, classes=all_classes)
            y_pred = cls.predict(X_test)
            f1 = metrics.f1_score(y_test, y_pred)
            p = metrics.precision_score(y_test, y_pred)
            r = metrics.recall_score(y_test, y_pred)
            accu = cls.score(X_train, y_train)
            y_pred = cls.predict(X_train)
            f1t = metrics.f1_score(y_train, y_pred)
            outfile.write("%3d %8d %.4f %.3f %.3f %.3f %.3f %5d %5d\n" %
                          (n, ntrain, accu, f1t, f1, p, r, sum(y_pred),
                           tot_pos))
        _ = joblib.dump(cls, pklName, compress=9)
        outfile.close()
class SGD(ParamSklearnClassificationAlgorithm):
    def __init__(self, loss, penalty, alpha, fit_intercept, n_iter,
                 learning_rate, class_weight=None, l1_ratio=0.15, epsilon=0.1,
                 eta0=0.01, power_t=0.5, average=False, random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        self.class_weight = class_weight
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None

    def fit(self, X, y):
        while not self.configuration_fully_fitted():
            self.iterative_fit(X, y, n_iter=1)
        return self

    def iterative_fit(self, X, y, n_iter=1, refit=False):
        if refit:
            self.estimator = None

        if self.estimator is None:
            self.alpha = float(self.alpha)
            self.fit_intercept = self.fit_intercept == 'True'
            self.n_iter = int(self.n_iter)
            if self.class_weight == "None":
                self.class_weight = None
            self.l1_ratio = float(self.l1_ratio) \
                if self.l1_ratio is not None else 0.15
            self.epsilon = float(self.epsilon) \
                if self.epsilon is not None else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(self.power_t) \
                if self.power_t is not None else 0.25
            self.average = self.average == 'True'
            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           n_iter=self.n_iter,
                                           learning_rate=self.learning_rate,
                                           class_weight=self.class_weight,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state)

        self.estimator.n_iter += n_iter
        self.estimator.fit(X, y)
        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        return not self.estimator.n_iter < self.n_iter

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'SGD Classifier',
                'name': 'Stochastic Gradient Descent Classifier',
                'handles_missing_values': False,
                'handles_nominal_values': False,
                'handles_numerical_features': True,
                'prefers_data_scaled': True,
                'prefers_data_normalized': True,
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': False,
                'is_deterministic': True,
                'handles_sparse': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,),
                # TODO find out what is best used here!
                'preferred_dtype': None}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()
        loss = cs.add_hyperparameter(CategoricalHyperparameter(
            "loss", ["hinge", "log", "modified_huber", "squared_hinge",
                     "perceptron"], default="hinge"))
        penalty = cs.add_hyperparameter(CategoricalHyperparameter(
            "penalty", ["l1", "l2", "elasticnet"], default="l2"))
        alpha = cs.add_hyperparameter(UniformFloatHyperparameter(
            "alpha", 10e-7, 1e-1, log=True, default=0.0001))
        l1_ratio = cs.add_hyperparameter(UniformFloatHyperparameter(
            "l1_ratio", 0, 1, default=0.15))
        fit_intercept = cs.add_hyperparameter(UnParametrizedHyperparameter(
            "fit_intercept", "True"))
        n_iter = cs.add_hyperparameter(UniformIntegerHyperparameter(
            "n_iter", 5, 1000, default=20))
        epsilon = cs.add_hyperparameter(UniformFloatHyperparameter(
            "epsilon", 1e-5, 1e-1, default=1e-4, log=True))
        learning_rate = cs.add_hyperparameter(CategoricalHyperparameter(
            "learning_rate", ["optimal", "invscaling", "constant"],
            default="optimal"))
        eta0 = cs.add_hyperparameter(UniformFloatHyperparameter(
            "eta0", 10**-7, 0.1, default=0.01))
        power_t = cs.add_hyperparameter(UniformFloatHyperparameter(
            "power_t", 1e-5, 1, default=0.25))
        average = cs.add_hyperparameter(CategoricalHyperparameter(
            "average", ["False", "True"], default="False"))

        # TODO add passive/aggressive here, although not properly documented?
        elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
        epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")
        # eta0 seems to be always active according to the source code; when
        # learning_rate is set to optimal, eta0 is the starting value:
        # https://github.com/scikit-learn/scikit-learn/blob/0.15.X/sklearn/linear_model/sgd_fast.pyx
        #eta0_and_inv = EqualsCondition(eta0, learning_rate, "invscaling")
        #eta0_and_constant = EqualsCondition(eta0, learning_rate, "constant")
        #eta0_condition = OrConjunction(eta0_and_inv, eta0_and_constant)
        power_t_condition = EqualsCondition(power_t, learning_rate,
                                            "invscaling")
        cs.add_condition(elasticnet)
        cs.add_condition(epsilon_condition)
        cs.add_condition(power_t_condition)

        return cs

    def __str__(self):
        return "ParamSklearn StochasticGradientClassifier"
class SGD(AutoSklearnClassificationAlgorithm):
    def __init__(self, loss, penalty, alpha, fit_intercept, n_iter,
                 learning_rate, class_weight, l1_ratio=0.15, epsilon=0.1,
                 eta0=0.01, power_t=0.5, random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        self.class_weight = class_weight
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.estimator = None

    def fit(self, X, Y):
        # TODO: maybe scale training data that its norm becomes 1?
        # http://scikit-learn.org/stable/modules/sgd.html#id1
        self.alpha = float(self.alpha)
        self.fit_intercept = bool(self.fit_intercept)
        self.n_iter = int(self.n_iter)
        if self.class_weight == "None":
            self.class_weight = None
        self.l1_ratio = float(self.l1_ratio)
        self.epsilon = float(self.epsilon)
        self.eta0 = float(self.eta0)
        self.power_t = float(self.power_t)

        self.estimator = SGDClassifier(loss=self.loss,
                                       penalty=self.penalty,
                                       alpha=self.alpha,
                                       fit_intercept=self.fit_intercept,
                                       n_iter=self.n_iter,
                                       learning_rate=self.learning_rate,
                                       class_weight=self.class_weight,
                                       l1_ratio=self.l1_ratio,
                                       epsilon=self.epsilon,
                                       eta0=self.eta0,
                                       power_t=self.power_t,
                                       shuffle=True,
                                       random_state=self.random_state)
        self.estimator.fit(X, Y)
        return self

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties():
        return {
            'shortname': 'SGD Classifier',
            'name': 'Stochastic Gradient Descent Classifier',
            'handles_missing_values': False,
            'handles_nominal_values': False,
            'handles_numerical_features': True,
            'prefers_data_scaled': True,
            'prefers_data_normalized': True,
            'handles_multiclass': True,
            'handles_multilabel': False,
            'is_deterministic': True,
            'handles_sparse': True,
            # TODO find out what is best used here!
            'preferred_dtype': None
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        loss = CategoricalHyperparameter(
            "loss",
            ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
            default="hinge")
        penalty = CategoricalHyperparameter("penalty",
                                            ["l1", "l2", "elasticnet"],
                                            default="l2")
        alpha = UniformFloatHyperparameter("alpha", 10**-7, 10**-1, log=True,
                                           default=0.0001)
        l1_ratio = UniformFloatHyperparameter("l1_ratio", 0, 1, default=0.15)
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        n_iter = UniformIntegerHyperparameter("n_iter", 5, 1000, default=20)
        epsilon = UniformFloatHyperparameter("epsilon", 1e-5, 1e-1,
                                             default=1e-4, log=True)
        learning_rate = CategoricalHyperparameter(
            "learning_rate", ["optimal", "invscaling", "constant"],
            default="optimal")
        eta0 = UniformFloatHyperparameter("eta0", 10**-7, 0.1, default=0.01)
        power_t = UniformFloatHyperparameter("power_t", 1e-5, 1, default=0.5)
        # This does not allow for other resampling methods!
        class_weight = CategoricalHyperparameter("class_weight",
                                                 ["None", "auto"],
                                                 default="None")

        cs = ConfigurationSpace()
        cs.add_hyperparameter(loss)
        cs.add_hyperparameter(penalty)
        cs.add_hyperparameter(alpha)
        cs.add_hyperparameter(l1_ratio)
        cs.add_hyperparameter(fit_intercept)
        cs.add_hyperparameter(n_iter)
        cs.add_hyperparameter(epsilon)
        cs.add_hyperparameter(learning_rate)
        cs.add_hyperparameter(eta0)
        cs.add_hyperparameter(power_t)
        cs.add_hyperparameter(class_weight)

        # TODO add passive/aggressive here, although not properly documented?
        elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
        epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")
        # eta0 seems to be always active according to the source code; when
        # learning_rate is set to optimal, eta0 is the starting value:
        # https://github.com/scikit-learn/scikit-learn/blob/0.15.X/sklearn/linear_model/sgd_fast.pyx
        #eta0_and_inv = EqualsCondition(eta0, learning_rate, "invscaling")
        #eta0_and_constant = EqualsCondition(eta0, learning_rate, "constant")
        #eta0_condition = OrConjunction(eta0_and_inv, eta0_and_constant)
        power_t_condition = EqualsCondition(power_t, learning_rate,
                                            "invscaling")
        cs.add_condition(elasticnet)
        cs.add_condition(epsilon_condition)
        cs.add_condition(power_t_condition)

        return cs

    def __str__(self):
        return "AutoSklearn StochasticGradientClassifier"
from sklearn import tree
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB

# List of features X
Listx = [[188, 57, 30], [167, 32, 22], [193, 65, 29], [185, 53, 27],
         [164, 45, 22], [157, 38, 24], [179, 52, 27], [175, 68, 26],
         [167, 39, 24], [178, 62, 27], [158, 46, 26]]
# List of labels Y. Gender: female or male
Listy = [
    'male', 'female', 'male', 'male', 'female', 'female', 'male', 'male',
    'female', 'male', 'female'
]

# Models to store the classifiers
Classifier_tree = tree.DecisionTreeClassifier()
Classifier_Sgradient = SGDClassifier()
Classifier_naive = GaussianNB()

# Training stage
Classifier_tree = Classifier_tree.fit(Listx, Listy)
Classifier_Sgradient = Classifier_Sgradient.fit(Listx, Listy)
Classifier_naive = Classifier_naive.fit(Listx, Listy)

# Test stage
Listz = [[150, 35, 21]]
Prediction_tree = Classifier_tree.predict(Listz)
Prediction_Gradient = Classifier_Sgradient.predict(Listz)
Prediction_naive = Classifier_naive.predict(Listz)
print(Prediction_tree)
print(Prediction_Gradient)
print(Prediction_naive)
class SGD:
    def __init__(self, loss, penalty, alpha, fit_intercept, tol,
                 learning_rate, l1_ratio=0.15, epsilon=0.1,
                 eta0=0.01, power_t=0.5, average=False, random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.tol = tol
        self.learning_rate = learning_rate
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None

    def fit(self, X, y, sample_weight=None):
        self.iterative_fit(X, y, n_iter=2, refit=True,
                           sample_weight=sample_weight)
        iteration = 2
        while not self.configuration_fully_fitted():
            n_iter = int(2**iteration / 2)
            self.iterative_fit(X, y, n_iter=n_iter,
                               sample_weight=sample_weight)
            iteration += 1
        return self

    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        from sklearn.linear_model.stochastic_gradient import SGDClassifier

        # Need to fit at least two iterations, otherwise early stopping will
        # not work because we cannot determine whether the algorithm actually
        # converged. The only way of finding this out is if the sgd spends
        # fewer iterations than max_iter. If max_iter == 1, it has to spend at
        # least one iteration and will always spend at least one iteration, so
        # we cannot know about convergence.

        if refit:
            self.estimator = None

        if self.estimator is None:
            self.fully_fit_ = False

            self.alpha = float(self.alpha)
            self.l1_ratio = float(self.l1_ratio) if self.l1_ratio is not None \
                else 0.15
            self.epsilon = float(self.epsilon) if self.epsilon is not None \
                else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(self.power_t) if self.power_t is not None \
                else 0.5
            self.average = check_for_bool(self.average)
            self.fit_intercept = check_for_bool(self.fit_intercept)
            self.tol = float(self.tol)

            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           max_iter=n_iter,
                                           tol=self.tol,
                                           learning_rate=self.learning_rate,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state,
                                           warm_start=True)
            self.estimator.fit(X, y, sample_weight=sample_weight)
        else:
            self.estimator.max_iter += n_iter
            self.estimator.max_iter = min(self.estimator.max_iter, 512)
            self.estimator._validate_params()
            self.estimator._partial_fit(
                X, y,
                alpha=self.estimator.alpha,
                C=1.0,
                loss=self.estimator.loss,
                learning_rate=self.estimator.learning_rate,
                max_iter=n_iter,
                sample_weight=sample_weight,
                classes=None,
                coef_init=None,
                intercept_init=None)

        if self.estimator._max_iter >= 512 or n_iter > self.estimator.n_iter_:
            self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)
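The budget-doubling pattern in fit() above can be reproduced with only the public scikit-learn API. This is a minimal standalone sketch (assumed illustration, not the wrapper's actual code path) of warm-started fitting where convergence is detected because the estimator spends fewer iterations than its budget:

from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier

X, y = make_classification(n_samples=500, random_state=0)
clf = SGDClassifier(max_iter=2, tol=1e-4, warm_start=True, random_state=0)

budget = 2
while True:
    clf.max_iter = budget
    # warm_start=True means each fit continues from the previous coefficients
    clf.fit(X, y)
    # n_iter_ < max_iter means the tol-based stopping criterion fired, i.e.
    # the model converged before exhausting its budget; 512 caps the budget
    # like the wrapper above
    if clf.n_iter_ < clf.max_iter or budget >= 512:
        break
    budget *= 2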
class SGD(AutoSklearnClassificationAlgorithm):
    def __init__(self, loss, penalty, alpha, fit_intercept, tol,
                 learning_rate, l1_ratio=0.15, epsilon=0.1,
                 eta0=0.01, power_t=0.5, average=False, random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.tol = tol
        self.learning_rate = learning_rate
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None

    def fit(self, X, y, sample_weight=None):
        n_iter = 2
        self.iterative_fit(X, y, n_iter=n_iter, sample_weight=sample_weight,
                           refit=True)
        while not self.configuration_fully_fitted():
            n_iter *= 2
            self.iterative_fit(X, y, n_iter=n_iter,
                               sample_weight=sample_weight)
        return self

    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        from sklearn.linear_model.stochastic_gradient import SGDClassifier

        # Need to fit at least two iterations, otherwise early stopping will
        # not work because we cannot determine whether the algorithm actually
        # converged. The only way of finding this out is if the sgd spends
        # fewer iterations than max_iter. If max_iter == 1, it has to spend at
        # least one iteration and will always spend at least one iteration, so
        # we cannot know about convergence.

        if refit:
            self.estimator = None

        if self.estimator is None:
            self.alpha = float(self.alpha)
            self.fit_intercept = self.fit_intercept == 'True'
            self.l1_ratio = float(
                self.l1_ratio) if self.l1_ratio is not None else 0.15
            self.epsilon = float(
                self.epsilon) if self.epsilon is not None else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(
                self.power_t) if self.power_t is not None else 0.25
            self.average = self.average == 'True'
            self.tol = float(self.tol)

            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           max_iter=n_iter,
                                           tol=self.tol,
                                           learning_rate=self.learning_rate,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state,
                                           warm_start=True)
            self.estimator.fit(X, y, sample_weight=sample_weight)
        else:
            self.estimator.max_iter += n_iter
            self.estimator._validate_params()
            self.estimator._partial_fit(
                X, y,
                alpha=self.estimator.alpha,
                C=1.0,
                loss=self.estimator.loss,
                learning_rate=self.estimator.learning_rate,
                max_iter=n_iter,
                sample_weight=sample_weight,
                classes=None,
                coef_init=None,
                intercept_init=None)

        if self.estimator._max_iter >= 1000 or n_iter > self.estimator.n_iter_:
            self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {
            'shortname': 'SGD Classifier',
            'name': 'Stochastic Gradient Descent Classifier',
            'handles_regression': False,
            'handles_classification': True,
            'handles_multiclass': True,
            'handles_multilabel': False,
            'is_deterministic': True,
            'input': (DENSE, SPARSE, UNSIGNED_DATA),
            'output': (PREDICTIONS, )
        }

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()

        loss = CategoricalHyperparameter(
            "loss",
            ["hinge", "log", "modified_huber", "squared_hinge", "perceptron"],
            default_value="log")
        penalty = CategoricalHyperparameter("penalty",
                                            ["l1", "l2", "elasticnet"],
                                            default_value="l2")
        alpha = UniformFloatHyperparameter("alpha", 1e-7, 1e-1, log=True,
                                           default_value=0.0001)
        l1_ratio = UniformFloatHyperparameter("l1_ratio", 1e-9, 1, log=True,
                                              default_value=0.15)
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        tol = UniformFloatHyperparameter("tol", 1e-5, 1e-1, log=True,
                                         default_value=1e-4)
        epsilon = UniformFloatHyperparameter("epsilon", 1e-5, 1e-1,
                                             default_value=1e-4, log=True)
        learning_rate = CategoricalHyperparameter(
            "learning_rate", ["optimal", "invscaling", "constant"],
            default_value="invscaling")
        eta0 = UniformFloatHyperparameter("eta0", 1e-7, 1e-1,
                                          default_value=0.01)
        power_t = UniformFloatHyperparameter("power_t", 1e-5, 1,
                                             default_value=0.25)
        average = CategoricalHyperparameter("average", ["False", "True"],
                                            default_value="False")

        cs.add_hyperparameters([
            loss, penalty, alpha, l1_ratio, fit_intercept, tol, epsilon,
            learning_rate, eta0, power_t, average
        ])

        # TODO add passive/aggressive here, although not properly documented?
        elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
        epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")
        # eta0 seems to be always active according to the source code; when
        # learning_rate is set to optimal, eta0 is the starting value:
        # https://github.com/scikit-learn/scikit-learn/blob/0.15.X/sklearn/linear_model/sgd_fast.pyx
        #eta0_and_inv = EqualsCondition(eta0, learning_rate, "invscaling")
        #eta0_and_constant = EqualsCondition(eta0, learning_rate, "constant")
        #eta0_condition = OrConjunction(eta0_and_inv, eta0_and_constant)
        power_t_condition = EqualsCondition(power_t, learning_rate,
                                            "invscaling")

        cs.add_conditions([elasticnet, epsilon_condition, power_t_condition])

        return cs
X_train_feature = vec.fit_transform(train_data['word_seg'])
X_test_feature = vec.transform(test_data['word_seg'])

# -------------- sentiment-value prediction starts here --------------
y_train_sent = train_data['sentiment_value'].astype(int)
X_train_sent, X_test_sent, y_train_sent, y_test_sent = \
    train_test_split(X_train_feature, y_train_sent, test_size=0.1,
                     random_state=42)

# clf = LogisticRegression(C=4, dual=True)
# clf = svm.LinearSVC()
# clf = RandomForestClassifier()
clf = SGDClassifier(n_iter=80)  # n_iter was renamed max_iter in newer scikit-learn
# tune_params(X_train_sent, y_train_sent)
clf.fit(X_train_sent, y_train_sent)

# evaluate the model on the held-out split
pred_test_sent = clf.predict(X_test_sent)
# precision = TP / (TP + FP)
precision = precision_score(y_test_sent, pred_test_sent, pos_label=None,
                            average='weighted')
# recall = TP / (TP + FN)
recall = recall_score(y_test_sent, pred_test_sent, pos_label=None,
                      average='weighted')
# F1
f1 = f1_score(y_test_sent, pred_test_sent, pos_label=None,
              average='weighted')
# accuracy
accuracy = accuracy_score(y_test_sent, pred_test_sent)
# the argument list of this print was truncated in the source; the four
# metrics computed above are assumed here
print("precision:{:.4f}-recall:{:.4f}-f1:{:.4f}-accuracy:{:.4f}".format(
    precision, recall, f1, accuracy))
# sklearn implementation
from sklearn.linear_model import Perceptron, SGDClassifier
from sklearn.metrics import accuracy_score

# Fitting an sklearn Perceptron and an SGDClassifier with the perceptron loss
# function (these should be identical)
clf = Perceptron(random_state=None, eta0=0.1, shuffle=False, penalty=None,
                 class_weight=None, fit_intercept=False)
clf2 = SGDClassifier(loss="perceptron", eta0=0.1, learning_rate="constant",
                     penalty=None, random_state=None, shuffle=False,
                     fit_intercept=False, warm_start=False, average=False,
                     max_iter=1000)  # n_iter in older scikit-learn releases
clf.fit(x_train, y_train)
clf2.fit(x_train, y_train)
y_predict = clf.predict(x_test)
y_preSGD = clf2.predict(x_test)

print("sklearn Perceptron accuracy:")
print(accuracy_score(y_test, y_predict))
print("sklearn SGDClassifier accuracy:")
print(accuracy_score(y_test, y_preSGD))
print("my perceptron accuracy:")
print(accuracy_score(y_test, y_pred))
print("\n")

#print(clf.coef_)


def x22(x1, w):
    w0 = clf.coef_[0][0]
    w1 = clf.coef_[0][1]
class SGD(
    IterativeComponentWithSampleWeight,
    BaseClassificationModel,
):
    def __init__(self, loss, penalty, alpha, fit_intercept, tol,
                 learning_rate, l1_ratio=0.15, epsilon=0.1,
                 eta0=0.01, power_t=0.5, average=False, random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.tol = tol
        self.learning_rate = learning_rate
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None
        self.time_limit = None
        self.start_time = time.time()

    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        from sklearn.linear_model.stochastic_gradient import SGDClassifier

        # Need to fit at least two iterations, otherwise early stopping will
        # not work because we cannot determine whether the algorithm actually
        # converged. The only way of finding this out is if the sgd spends
        # fewer iterations than max_iter. If max_iter == 1, it has to spend at
        # least one iteration and will always spend at least one iteration, so
        # we cannot know about convergence.

        if refit:
            self.estimator = None

        if self.estimator is None:
            if isinstance(self.loss, tuple):
                nested_loss = self.loss
                self.loss = nested_loss[0]
                if self.loss == 'modified_huber':
                    self.epsilon = nested_loss[1]['epsilon']

            if isinstance(self.penalty, tuple):
                nested_penalty = self.penalty
                self.penalty = nested_penalty[0]
                if self.penalty == "elasticnet":
                    self.l1_ratio = nested_penalty[1]['l1_ratio']

            if isinstance(self.learning_rate, tuple):
                nested_learning_rate = self.learning_rate
                self.learning_rate = nested_learning_rate[0]
                if self.learning_rate == 'invscaling':
                    self.eta0 = nested_learning_rate[1]['eta0']
                    self.power_t = nested_learning_rate[1]['power_t']
                elif self.learning_rate == 'constant':
                    self.eta0 = nested_learning_rate[1]['eta0']

            self.fully_fit_ = False

            self.alpha = float(self.alpha)
            self.l1_ratio = float(self.l1_ratio) if self.l1_ratio is not None \
                else 0.15
            self.epsilon = float(self.epsilon) if self.epsilon is not None \
                else 0.1
            self.eta0 = float(self.eta0) if self.eta0 is not None else 0.01
            self.power_t = float(self.power_t) if self.power_t is not None \
                else 0.5
            self.average = check_for_bool(self.average)
            self.fit_intercept = check_for_bool(self.fit_intercept)
            self.tol = float(self.tol)

            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           max_iter=n_iter,
                                           tol=self.tol,
                                           learning_rate=self.learning_rate,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state,
                                           warm_start=True)
            self.estimator.fit(X, y, sample_weight=sample_weight)
        else:
            self.estimator.max_iter += n_iter
            self.estimator.max_iter = min(self.estimator.max_iter, 512)
            self.estimator._validate_params()
            self.estimator._partial_fit(
                X, y,
                alpha=self.estimator.alpha,
                C=1.0,
                loss=self.estimator.loss,
                learning_rate=self.estimator.learning_rate,
                max_iter=n_iter,
                sample_weight=sample_weight,
                classes=None,
                coef_init=None,
                intercept_init=None
            )

        if self.estimator.max_iter >= 512 or n_iter > self.estimator.n_iter_:
            self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()

        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'SGD Classifier',
                'name': 'Stochastic Gradient Descent Classifier',
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': False,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None,
                                        optimizer='smac'):
        if optimizer == 'smac':
            cs = ConfigurationSpace()

            loss = CategoricalHyperparameter("loss",
                                             ["hinge", "log",
                                              "modified_huber",
                                              "squared_hinge", "perceptron"],
                                             default_value="log")
            penalty = CategoricalHyperparameter(
                "penalty", ["l1", "l2", "elasticnet"], default_value="l2")
            alpha = UniformFloatHyperparameter(
                "alpha", 1e-7, 1e-1, log=True, default_value=0.0001)
            l1_ratio = UniformFloatHyperparameter(
                "l1_ratio", 1e-9, 1, log=True, default_value=0.15)
            fit_intercept = UnParametrizedHyperparameter("fit_intercept",
                                                         "True")
            tol = UniformFloatHyperparameter("tol", 1e-5, 1e-1, log=True,
                                             default_value=1e-4)
            epsilon = UniformFloatHyperparameter(
                "epsilon", 1e-5, 1e-1, default_value=1e-4, log=True)
            learning_rate = CategoricalHyperparameter(
                "learning_rate", ["optimal", "invscaling", "constant"],
                default_value="invscaling")
            eta0 = UniformFloatHyperparameter(
                "eta0", 1e-7, 1e-1, default_value=0.01, log=True)
            power_t = UniformFloatHyperparameter("power_t", 1e-5, 1, log=True,
                                                 default_value=0.5)
            average = CategoricalHyperparameter(
                "average", ["False", "True"], default_value="False")

            cs.add_hyperparameters([loss, penalty, alpha, l1_ratio,
                                    fit_intercept, tol, epsilon,
                                    learning_rate, eta0, power_t, average])

            # TODO add passive/aggressive here, although not properly
            # documented?
            elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
            epsilon_condition = EqualsCondition(epsilon, loss,
                                                "modified_huber")
            power_t_condition = EqualsCondition(power_t, learning_rate,
                                                "invscaling")

            # eta0 is only relevant if learning_rate!='optimal' according to
            # code
            # https://github.com/scikit-learn/scikit-learn/blob/0.19.X/sklearn/
            # linear_model/sgd_fast.pyx#L603
            eta0_in_inv_con = InCondition(eta0, learning_rate,
                                          ["invscaling", "constant"])
            cs.add_conditions([elasticnet, epsilon_condition,
                               power_t_condition, eta0_in_inv_con])
            return cs
        elif optimizer == 'tpe':
            eta0 = hp.loguniform('sgd_eta0', np.log(1e-7), np.log(1e-1))
            space = {
                'loss': hp.choice('sgd_loss', [
                    ("modified_huber",
                     {'epsilon': hp.loguniform('sgd_epsilon', np.log(1e-5),
                                               np.log(1e-1))}),
                    ("hinge", {}),
                    ("log", {}),
                    ("squared_hinge", {}),
                    ("perceptron", {})]),
                'penalty': hp.choice('sgd_penalty', [
                    ("elasticnet",
                     {'l1_ratio': hp.loguniform('sgd_l1_ratio', np.log(1e-9),
                                                np.log(1))}),
                    ("l1", None),
                    ("l2", None)]),
                'alpha': hp.loguniform('sgd_alpha', np.log(1e-7),
                                       np.log(1e-1)),
                'fit_intercept': hp.choice('sgd_fit_intercept', ["True"]),
                'tol': hp.loguniform('sgd_tol', np.log(1e-5), np.log(1e-1)),
                'learning_rate': hp.choice('sgd_learning_rate', [
                    ("optimal", {}),
                    ("invscaling",
                     {'power_t': hp.loguniform('sgd_power_t', np.log(1e-5),
                                               np.log(1)),
                      'eta0': eta0}),
                    ("constant", {'eta0': eta0})]),
                'average': hp.choice('sgd_average', ["True", "False"])}

            init_trial = {'loss': ("log", {}),
                          'penalty': ("l2", {}),
                          'alpha': 1e-4,
                          'fit_intercept': "True",
                          'tol': 1e-4,
                          'learning_rate': ("invscaling",
                                            {'power_t': 0.5, 'eta0': 0.01}),
                          'average': "False"}

            return space
def result():
    if request.method == 'POST':
        path = request.files.get('myFile')

        # Reading the CSV file and converting it into a pandas data-frame
        df = pd.read_csv(path, encoding="ISO-8859-1")

        # Reading the name for the file for the model that will be saved
        filename = request.form['filename']

        # Reading the names of the feature and label as strings
        str1 = request.form['feature']
        str2 = request.form['label']

        # Assigning the feature and label variables to the respective columns
        if str1 in list(df) and str2 in list(df):
            y = df[str2]
            X = df[str1]
        else:
            return render_template('nameError.html')

        '''
        # Removing the punctuations and HTTP links in the feature text input
        x = []
        for subject in X:
            result = re.sub(r"http\S+", "", subject)
            replaced = re.sub(r'[^a-zA-Z0-9 ]+', '', result)
            x.append(replaced)
        X = pd.Series(x)
        '''
        X = X.str.lower()

        # Optional use of Tokenization and Lemmatization using Natural
        # Language Processing in SpaCy
        """
        texts = []
        for doc in X:
            doc = nlp(doc, disable=['parser', 'ner'])
            tokens = [tok.lemma_.lower().strip() for tok in doc
                      if tok.lemma_ != '-PRON-']
            tokens = [tok for tok in tokens if tok not in stopwords]
            tokens = ' '.join(tokens)
            texts.append(tokens)
        X = pd.Series(texts)
        """

        # Splitting the data-set into 2 parts: training data and test data
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=0.33,
                                                            shuffle=True)

        tfidfvect = TfidfVectorizer(ngram_range=(1, 1))
        X_train_tfidf = tfidfvect.fit_transform(X_train)

        # Fitting all the classification models one by one and recording
        # their accuracies and execution times
        start = time()
        clf1 = LinearSVC()
        clf1.fit(X_train_tfidf, y_train)
        pred_SVC = clf1.predict(tfidfvect.transform(X_test))
        a1 = accuracy_score(y_test, pred_SVC)
        end = time()
        print("accuracy SVC: {} and time: {} s".format(a1, (end - start)))

        start = time()
        clf2 = LogisticRegression(n_jobs=-1, multi_class='multinomial',
                                  solver='newton-cg')
        clf2.fit(X_train_tfidf, y_train)
        pred_LR = clf2.predict(tfidfvect.transform(X_test))
        a2 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy LR: {} and time: {}".format(a2, (end - start)))

        start = time()
        clf3 = RandomForestClassifier(n_jobs=-1)
        clf3.fit(X_train_tfidf, y_train)
        pred = clf3.predict(tfidfvect.transform(X_test))
        a3 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy RFC: {} and time: {}".format(a3, (end - start)))

        start = time()
        clf4 = MultinomialNB()
        clf4.fit(X_train_tfidf, y_train)
        pred = clf4.predict(tfidfvect.transform(X_test))
        a4 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy MNB: {} and time: {}".format(a4, (end - start)))

        start = time()
        clf11 = SGDClassifier(n_jobs=-1)
        clf11.fit(X_train_tfidf, y_train)
        pred = clf11.predict(tfidfvect.transform(X_test))
        a11 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy SGDC: {} and time: {}".format(a11, (end - start)))

        start = time()
        # NOTE: clf12 duplicates clf11 (another SGDClassifier); the "XGBC"
        # label in the print below appears to be a leftover
        clf12 = SGDClassifier(n_jobs=-1)
        clf12.fit(X_train_tfidf, y_train)
        pred = clf12.predict(tfidfvect.transform(X_test))
        a12 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy XGBC: {} and time: {}".format(a12, (end - start)))

        # Comparing the accuracies of all the models and then saving (dumping)
        # the model with the highest accuracy using pickle for later use.
        acu_list = [a1, a2, a3, a4, a11, a12]
        max_list = max(acu_list)

        if max_list == a1:
            pickle.dump(clf1, open(filename + '_model', 'wb'))
        elif max_list == a2:
            pickle.dump(clf2, open(filename + '_model', 'wb'))
        elif max_list == a3:
            pickle.dump(clf3, open(filename + '_model', 'wb'))
        elif max_list == a4:
            pickle.dump(clf4, open(filename + '_model', 'wb'))
        elif max_list == a11:
            pickle.dump(clf11, open(filename + '_model', 'wb'))
        elif max_list == a12:
            pickle.dump(clf12, open(filename + '_model', 'wb'))

        pickle.dump(tfidfvect, open(filename + '_tfidfVect', 'wb'))

        return render_template("result.html", ac1=a1, ac2=a2, ac3=a3, ac4=a4,
                               ac11=a11, ac12=a12)
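The if/elif chain above can be collapsed. A minimal alternative sketch (illustrative, not part of the original source) pairs each accuracy with its fitted classifier and pickles the best one:

# map each accuracy to its classifier, then save the highest-scoring model
candidates = [(a1, clf1), (a2, clf2), (a3, clf3),
              (a4, clf4), (a11, clf11), (a12, clf12)]
best_acc, best_clf = max(candidates, key=lambda pair: pair[0])
pickle.dump(best_clf, open(filename + '_model', 'wb'))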
def result():
    if request.method == 'POST':
        path = request.files.get('myFile')
        df = pd.read_csv(path, encoding="ISO-8859-1")
        filename = request.form['filename']
        str1 = request.form['feature']
        str2 = request.form['label']
        if str1 in list(df) and str2 in list(df):
            y = df[str2]
            X = df[str1]
        else:
            return render_template('nameError.html')

        # strip HTTP links and punctuation from the feature text
        x = []
        for subject in X:
            result = re.sub(r"http\S+", "", subject)
            replaced = re.sub(r'[^a-zA-Z0-9 ]+', '', result)
            x.append(replaced)
        X = pd.Series(x)
        X = X.str.lower()

        """
        texts = []
        for doc in X:
            doc = nlp(doc, disable=['parser', 'ner'])
            tokens = [tok.lemma_.lower().strip() for tok in doc
                      if tok.lemma_ != '-PRON-']
            tokens = [tok for tok in tokens if tok not in stopwords]
            tokens = ' '.join(tokens)
            texts.append(tokens)
        X = pd.Series(texts)
        """

        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=0.33)
        tfidfvect = TfidfVectorizer(ngram_range=(1, 1))
        X_train_tfidf = tfidfvect.fit_transform(X_train)

        start = time()
        clf1 = LinearSVC()
        clf1.fit(X_train_tfidf, y_train)
        pred_SVC = clf1.predict(tfidfvect.transform(X_test))
        a1 = accuracy_score(y_test, pred_SVC)
        end = time()
        print("accuracy SVC: {} and time: {} s".format(a1, (end - start)))

        start = time()
        clf2 = LogisticRegression(n_jobs=-1, multi_class='multinomial',
                                  solver='newton-cg')
        clf2.fit(X_train_tfidf, y_train)
        pred_LR = clf2.predict(tfidfvect.transform(X_test))
        a2 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy LR: {} and time: {}".format(a2, (end - start)))

        start = time()
        clf3 = RandomForestClassifier(n_jobs=-1)
        clf3.fit(X_train_tfidf, y_train)
        pred = clf3.predict(tfidfvect.transform(X_test))
        a3 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy RFC: {} and time: {}".format(a3, (end - start)))

        start = time()
        clf4 = MultinomialNB()
        clf4.fit(X_train_tfidf, y_train)
        pred = clf4.predict(tfidfvect.transform(X_test))
        a4 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy MNB: {} and time: {}".format(a4, (end - start)))

        start = time()
        clf5 = GaussianNB()
        clf5.fit(X_train_tfidf.toarray(), y_train)
        pred = clf5.predict(tfidfvect.transform(X_test).toarray())
        a5 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy GNB: {} and time: {}".format(a5, (end - start)))

        start = time()
        clf6 = LogisticRegressionCV(n_jobs=-1)
        clf6.fit(X_train_tfidf, y_train)
        pred_LR = clf6.predict(tfidfvect.transform(X_test))
        a6 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy LRCV: {} and time: {}".format(a6, (end - start)))

        start = time()
        clf7 = AdaBoostClassifier()
        clf7.fit(X_train_tfidf, y_train)
        pred_LR = clf7.predict(tfidfvect.transform(X_test))
        a7 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy ABC: {} and time: {}".format(a7, (end - start)))

        start = time()
        clf8 = BernoulliNB()
        clf8.fit(X_train_tfidf.toarray(), y_train)
        pred = clf8.predict(tfidfvect.transform(X_test).toarray())
        a8 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy BNB: {} and time: {}".format(a8, (end - start)))

        start = time()
        clf9 = Perceptron(n_jobs=-1)
        clf9.fit(X_train_tfidf.toarray(), y_train)
        pred = clf9.predict(tfidfvect.transform(X_test).toarray())
        a9 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy Per: {} and time: {}".format(a9, (end - start)))

        start = time()
        clf10 = RidgeClassifierCV()
        clf10.fit(X_train_tfidf.toarray(), y_train)
        pred = clf10.predict(tfidfvect.transform(X_test).toarray())
        a10 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy RidCV: {} and time: {}".format(a10, (end - start)))

        start = time()
        clf11 = SGDClassifier(n_jobs=-1)
        clf11.fit(X_train_tfidf.toarray(), y_train)
        pred = clf11.predict(tfidfvect.transform(X_test).toarray())
        a11 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy SGDC: {} and time: {}".format(a11, (end - start)))

        start = time()
        # NOTE: clf12 duplicates clf11 (another SGDClassifier); the "XGBC"
        # label in the print below appears to be a leftover
        clf12 = SGDClassifier(n_jobs=-1)
        clf12.fit(X_train_tfidf.toarray(), y_train)
        pred = clf12.predict(tfidfvect.transform(X_test).toarray())
        a12 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy XGBC: {} and time: {}".format(a12, (end - start)))

        acu_list = [a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12]
        max_list = max(acu_list)

        if max_list == a1:
            pickle.dump(clf1, open(filename + '_model', 'wb'))
        elif max_list == a2:
            pickle.dump(clf2, open(filename + '_model', 'wb'))
        elif max_list == a3:
            pickle.dump(clf3, open(filename + '_model', 'wb'))
        elif max_list == a4:
            pickle.dump(clf4, open(filename + '_model', 'wb'))
        elif max_list == a5:
            pickle.dump(clf5, open(filename + '_model', 'wb'))
        elif max_list == a6:
            pickle.dump(clf6, open(filename + '_model', 'wb'))
        elif max_list == a7:
            pickle.dump(clf7, open(filename + '_model', 'wb'))
        elif max_list == a8:
            pickle.dump(clf8, open(filename + '_model', 'wb'))
        elif max_list == a9:
            pickle.dump(clf9, open(filename + '_model', 'wb'))
        elif max_list == a10:
            pickle.dump(clf10, open(filename + '_model', 'wb'))
        elif max_list == a11:
            pickle.dump(clf11, open(filename + '_model', 'wb'))
        elif max_list == a12:
            pickle.dump(clf12, open(filename + '_model', 'wb'))

        pickle.dump(tfidfvect, open(filename + '_tfidfVect', 'wb'))

        return render_template("result.html", ac1=a1, ac2=a2, ac3=a3, ac4=a4,
                               ac5=a5, ac6=a6, ac7=a7, ac8=a8, ac9=a9,
                               ac10=a10, ac11=a11, ac12=a12)