def SGD(self, train_features, test_features):
    print("in SGD")
    self.train_features = train_features
    self.test_features = test_features
    scores = []
    submission = pd.DataFrame.from_dict({'id': test['Id']})
    SGD_file = 'SGD.pckl'
    SGD_model_pkl = open(SGD_file, 'wb')
    for class_name in class_names:
        train_target = train[class_name]
        classifier = SGDClassifier(loss='modified_huber', penalty='l2',
                                   alpha=0.001, random_state=42, max_iter=200,
                                   tol=0.20, learning_rate='optimal')
        cv_score = np.mean(cross_val_score(classifier, train_features,
                                           train_target, cv=3,
                                           scoring='roc_auc'))
        scores.append(cv_score)
        print('CV score for class {} is {}'.format(class_name, cv_score))
        classifier.fit(train_features, train_target)
        pickle.dump(classifier, SGD_model_pkl)
        submission[class_name] = classifier.predict_proba(test_features)[:, 1]
    print('Total CV score is {}'.format(np.mean(scores)))
    SGD_model_pkl.close()
    submission.to_csv('SGD.csv', index=False)
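Because the loop above dumps one pickled classifier per class into a single file, reading the models back requires one pickle.load per dump, in the same order. A minimal loading sketch under that assumption (the helper name is hypothetical):

import pickle

def load_sgd_models(path, class_names):
    # Hypothetical helper: one pickle.load per classifier, in the order
    # the per-class models were dumped above
    models = {}
    with open(path, 'rb') as f:
        for class_name in class_names:
            models[class_name] = pickle.load(f)
    return models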
def training(processed_train_csv_file):
    processed_train_samples = pd.read_csv(processed_train_csv_file)
    processed_train_samples = processed_train_samples.replace(
        [np.inf, -np.inf], np.nan)
    processed_train_samples = processed_train_samples.fillna(value=0)
    processed_train_samples_index_lst = processed_train_samples.index.tolist()
    random.shuffle(processed_train_samples_index_lst)
    # .loc replaces the long-deprecated .ix indexer
    shuffled_train_samples = processed_train_samples.loc[
        processed_train_samples_index_lst]
    col_names = shuffled_train_samples.columns.tolist()
    col_names.remove("booking_bool")
    features = shuffled_train_samples[col_names].values
    labels = shuffled_train_samples["booking_bool"].values

    print("Training Random Forest Classifier")
    # RandomForestClassifier has no learning_rate parameter; use n_jobs=-1
    # instead, as in the sibling do_training() version of this function
    rf_classifier = RandomForestClassifier(n_estimators=150, verbose=2,
                                           n_jobs=-1, min_samples_split=10)
    rf_classifier.fit(features, labels)
    print("Saving the Random Forest Classifier")
    data_io.save_model(rf_classifier, model_name="rf_classifier.pkl")

    print("Training Gradient Boosting Classifier")
    gb_classifier = GradientBoostingClassifier(n_estimators=150, verbose=2,
                                               learning_rate=0.1,
                                               min_samples_split=10)
    gb_classifier.fit(features, labels)
    print("Saving the Gradient Boosting Classifier")
    data_io.save_model(gb_classifier, model_name="gb_classifier.pkl")

    print("Training SGD Classifier")
    # "modifier_huber" is not a valid loss; the intended value is "modified_huber"
    sgd_classifier = SGDClassifier(loss="modified_huber", verbose=2, n_jobs=-1)
    sgd_classifier.fit(features, labels)
    print("Saving the SGD Classifier")
    data_io.save_model(sgd_classifier, model_name="sgd_classifier.pkl")
def classifyTestSamples(trainingFeatures, trainingCategories, testFeatures):
    clf = SGDClassifier()
    clf.fit(trainingFeatures, trainingCategories)
    predictedCategories = clf.predict(testFeatures)
    return predictedCategories
class CreateSGDClassifier(CreateModel):
    def fit(self, data, args):
        self.model = SGDClassifier(loss="log")
        with Timer() as t:
            self.model.fit(data.X_train, data.y_train)
        return t.interval
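Timer is not defined in this snippet; it is presumably a small context manager that records the elapsed wall-clock time of the fit. A minimal sketch of what such a helper could look like, under that assumption:

import time

class Timer:
    # Hypothetical stand-in for the Timer used above: records the elapsed
    # wall-clock time of the `with` block in .interval
    def __enter__(self):
        self.start = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.interval = time.perf_counter() - self.start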
class MachineLearning:
    def __init__(self, Master_DF):
        self.Data_Frame = Master_DF

    def Encoder(self, df):
        encoder = LabelEncoder()
        print("Fitting")
        encoder.fit(df)
        return encoder.transform(df)

    def Perceptron_PreProcessing(self, x, y):
        X = self.Encoder(self.Data_Frame[x].factorize()[0])
        Y = self.Encoder(self.Data_Frame[y].factorize()[0])
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.3, random_state=0)
        # StandardScaler expects 2-D input, so reshape the single feature
        X_train = X_train.reshape(-1, 1)
        X_test = X_test.reshape(-1, 1)
        sc = StandardScaler()
        sc.fit(X_train)
        X_train_std = sc.transform(X_train)
        X_test_std = sc.transform(X_test)
        return X_train_std, Y_train, X_test_std, Y_test

    def ppn_model(self, n_iter, eta0, random_state):
        (self.X_train_std_uri, self.Y_train_category,
         self.X_test_std_uri,
         self.Y_test_category) = self.Perceptron_PreProcessing('uri',
                                                               'category')
        # n_iter was renamed max_iter in newer scikit-learn
        ppn = Perceptron(max_iter=n_iter, eta0=eta0,
                         random_state=random_state)
        ppn.fit(self.X_train_std_uri.reshape(len(self.X_train_std_uri), 1),
                self.Y_train_category)
        y_pred = ppn.predict(
            self.X_test_std_uri.reshape(len(self.X_test_std_uri), 1))
        return ((self.Y_test_category != y_pred).sum(),
                accuracy_score(self.Y_test_category, y_pred), y_pred)

    def lr_model(self, c, random_state):
        lr = LogisticRegression(C=c, random_state=random_state)
        lr.fit(self.X_train_std_uri.reshape(len(self.X_train_std_uri), 1),
               self.Y_train_category)
        y_pred = lr.predict(
            self.X_test_std_uri.reshape(len(self.X_test_std_uri), 1))
        return ((self.Y_test_category != y_pred).sum(),
                accuracy_score(self.Y_test_category, y_pred), y_pred)

    def ada_sd(self, n_iter, eta, random_state):
        # SGDClassifier has no `eta` parameter; the fixed step size goes in
        # eta0, which only takes effect with a constant learning-rate schedule
        self.ada = SGDClassifier(max_iter=n_iter, eta0=eta,
                                 learning_rate='constant',
                                 random_state=random_state)
        self.ada.fit(
            self.X_train_std_uri.reshape(len(self.X_train_std_uri), 1),
            self.Y_train_category)
        self.y_pred = self.ada.predict(
            self.X_test_std_uri.reshape(len(self.X_test_std_uri), 1))
        return ((self.Y_test_category != self.y_pred).sum(),
                accuracy_score(self.Y_test_category, self.y_pred),
                self.y_pred)
def do_training(processed_train_csv_file):
    ## Processed train samples reading
    # read saved processed train samples from the given csv file
    processed_train_samples = pd.read_csv(processed_train_csv_file)
    # inf to nan
    processed_train_samples = processed_train_samples.replace(
        [np.inf, -np.inf], np.nan)
    # nan to 0
    processed_train_samples = processed_train_samples.fillna(value=0)
    processed_train_samples_index_lst = processed_train_samples.index.tolist()
    # the samples were sorted earlier; shuffling here gives better results
    random.shuffle(processed_train_samples_index_lst)
    # organize new train samples and targets
    # (.loc replaces the long-deprecated .ix indexer)
    shuffled_train_samples = processed_train_samples.loc[
        processed_train_samples_index_lst]
    col_names = shuffled_train_samples.columns.tolist()
    col_names.remove("booking_bool")
    features = shuffled_train_samples[col_names].values
    labels = shuffled_train_samples['booking_bool'].values

    # Model training
    # 1 Random Forest Classifier
    print("Training Random Forest Classifier")
    rf_classifier = RandomForestClassifier(n_estimators=150, verbose=2,
                                           n_jobs=-1, min_samples_split=10)
    rf_classifier.fit(features, labels)
    print("Saving the Random Forest Classifier")
    data_io.save_model(rf_classifier, model_name='rf_classifier.pkl')

    # 2 Gradient Boosting Classifier
    print("Training Gradient Boosting Classifier")
    gb_classifier = GradientBoostingClassifier(n_estimators=150, verbose=2,
                                               learning_rate=0.1,
                                               min_samples_split=10)
    gb_classifier.fit(features, labels)
    print("Saving the Gradient Boosting Classifier")
    data_io.save_model(gb_classifier, model_name='gb_classifier.pkl')

    # 3 SGD Classifier
    print("Training SGD Classifier")
    sgd_classifier = SGDClassifier(loss="modified_huber", verbose=2,
                                   n_jobs=-1)
    sgd_classifier.fit(features, labels)
    print("Saving the SGD Classifier")
    data_io.save_model(sgd_classifier, model_name='sgd_classifier.pkl')
def train_model(texts, points, num_classes, model_dir, text_encoding='utf-8'):
    """
    Given an iterable of (text, lat, lon) items, cluster the points into
    #num_classes and use them as labels, then extract unigram features,
    train a classifier and save it in models/model_name for future use.

    Args:
        texts -- an iterable (e.g. a list) of texts,
            e.g. ['this is the first text', 'this is the second text'].
        points -- an iterable (e.g. a list) of tuples in the form of
            (lat, lon) where coordinates are of type float,
            e.g. [(1.2343, -10.239834), (5.634534, -12.47563)]
        num_classes -- the number of desired clusters/labels/classes of the model.
        model_dir -- the name of the directory within models/ that the model
            will be saved in.
    """
    if os.path.exists(model_dir):
        logging.error("Model directory " + model_dir +
                      " already exists, please try another address.")
        sys.exit(-1)
    else:
        os.mkdir(model_dir)

    from sklearn.cluster import KMeans
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model.stochastic_gradient import SGDClassifier

    kmeans = KMeans(n_clusters=num_classes, random_state=0)
    points_arr = numpy.array(points)
    kmeans.fit_transform(points_arr)
    cluster_centers = kmeans.cluster_centers_
    sample_clusters = kmeans.labels_
    label_coordinate = {}
    for i in range(cluster_centers.shape[0]):
        lat, lon = cluster_centers[i, 0], cluster_centers[i, 1]
        label_coordinate[i] = (lat, lon)

    logging.info('extracting features from text...')
    vectorizer = TfidfVectorizer(encoding=text_encoding, stop_words='english',
                                 ngram_range=(1, 1), max_df=0.5, min_df=0,
                                 binary=True, norm='l2', use_idf=True,
                                 smooth_idf=True, sublinear_tf=True)
    X_train = vectorizer.fit_transform(texts)
    Y_train = sample_clusters
    vectorizer.stop_words_ = None
    logging.info('the number of samples is %d and the number of features is %d'
                 % (X_train.shape[0], X_train.shape[1]))

    logging.info('training the classifier...')
    logging.warn('Note that alpha (regularisation strength) should be tuned '
                 'based on the performance on validation data.')
    clf = SGDClassifier(loss='log', penalty='elasticnet', alpha=5e-5,
                        l1_ratio=0.9, fit_intercept=True, n_iter=5, n_jobs=2,
                        random_state=0, learning_rate="optimal")
    clf.fit(X_train, Y_train)
    clf.coef_ = csr_matrix(clf.coef_)

    logging.info('retrieving address of the given points using geopy '
                 '(requires internet access).')
    coordinate_address = retrieve_location_from_coordinates(
        label_coordinate.values())

    logging.info('dumping the vectorizer, clf (trained model), '
                 'label_coordinates and coordinate_locations into pickle '
                 'files in ' + model_dir)
    dump_model(clf, vectorizer, coordinate_address, label_coordinate,
               model_dir)
def main():
    raw_data = np.loadtxt(sys.stdin)
    samples = raw_data.shape[0]
    X = np.empty((samples, 401))
    Y = raw_data[:, 0]
    for i in range(samples):
        X[i] = transform(raw_data[i, 1:])
    clf = SGDClassifier(loss=_LOSS, penalty=_PENALTY, fit_intercept=False,
                        shuffle=True, alpha=_REGULARIZATION)
    clf.fit(X, Y)
    sys.stdout.write('%s\t' % _KEY)
    for coeff in clf.coef_.flatten():
        sys.stdout.write("%f " % coeff)
    sys.stdout.write("\n")
class SGDClassifierImpl():
    def __init__(self, loss='hinge', penalty='l2', alpha=0.0001,
                 l1_ratio=0.15, fit_intercept=True, max_iter=None, tol=None,
                 shuffle=True, verbose=0, epsilon=0.1, n_jobs=None,
                 random_state=None, learning_rate='optimal', eta0=0.0,
                 power_t=0.5, early_stopping=False, validation_fraction=0.1,
                 n_iter_no_change=5, class_weight='balanced',
                 warm_start=False, average=False):
        self._hyperparams = {
            'loss': loss,
            'penalty': penalty,
            'alpha': alpha,
            'l1_ratio': l1_ratio,
            'fit_intercept': fit_intercept,
            'max_iter': max_iter,
            'tol': tol,
            'shuffle': shuffle,
            'verbose': verbose,
            'epsilon': epsilon,
            'n_jobs': n_jobs,
            'random_state': random_state,
            'learning_rate': learning_rate,
            'eta0': eta0,
            'power_t': power_t,
            'early_stopping': early_stopping,
            'validation_fraction': validation_fraction,
            'n_iter_no_change': n_iter_no_change,
            'class_weight': class_weight,
            'warm_start': warm_start,
            'average': average}
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)

    def partial_fit(self, X, y=None, classes=None):
        if not hasattr(self, "_wrapped_model"):
            self._wrapped_model = SKLModel(**self._hyperparams)
        self._wrapped_model.partial_fit(X, y, classes=classes)
        return self
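A possible usage sketch for this wrapper, assuming SKLModel is bound to sklearn.linear_model.SGDClassifier. Note that scikit-learn rejects class_weight='balanced' (the wrapper's default) in partial_fit, so it is overridden here:

import numpy as np
from sklearn.linear_model import SGDClassifier as SKLModel  # binding assumed by the wrapper

clf = SGDClassifierImpl(class_weight=None, random_state=0)
X1, y1 = np.random.rand(20, 3), np.tile([0, 1], 10)
X2, y2 = np.random.rand(20, 3), np.tile([1, 0], 10)
clf.partial_fit(X1, y1, classes=np.array([0, 1]))  # classes required on the first call
clf.partial_fit(X2, y2)                            # later calls continue training
print(clf.predict(X2[:3]))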
def stochastic_descent(xtrain, ytrain, xtest):
    clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=10,
                        random_state=42, alpha=1e-3, tol=None)
    print("SGD Fitting")
    clf.fit(xtrain, ytrain)
    # Saving the model with pickle
    with open(base_dir + "Model", 'wb') as f:
        pickle.dump(clf, f)
    print("SGD Predicting")
    ytest = clf.predict(xtest)
    return ytest
def SGD_c_fit(X, y):
    clf = SGDClassifier(loss='log', penalty='l2', alpha=1e-3, n_iter=5,
                        shuffle=True)
    return clf.fit(X, y)
def withoutPipeline():
    scaler = StandardScaler().fit(x_train)
    # for each x, (x - mean(all x)) / std. dev. of x
    # this step computes the mean and std. dev.
    # (new local names avoid rebinding the module-level x_train/x_test, which
    # would otherwise raise UnboundLocalError at the fit() call above)
    x_train_std = scaler.transform(x_train)
    x_test_std = scaler.transform(x_test)
    clfer = SGDClassifier()
    clfer.fit(x_train_std, y_train)
    # this will try to separate the three classes based
    # on the two features we gave it. Hence, we will get
    # back three lines. I.e., three sets of coefficients
    # and three intercepts
    if DEBUG:
        # print(clfer.coef_)
        # print(clfer.intercept_)
        # print(clfer.predict(scaler.transform([[4.7, 3.1]])))
        # print(clfer.decision_function(scaler.transform([[4.7, 3.1]])))
        # the algorithm evaluates distance from all three
        # lines and picks the largest one (in this case [0])
        pass

    # validate results
    y_predict_train = clfer.predict(x_train_std)
    print("% Correct results on training set:")
    print(metrics.accuracy_score(y_train, y_predict_train))
    y_predict_test = clfer.predict(x_test_std)
    print("\n% Correct results on testing set:")
    print(metrics.accuracy_score(y_test, y_predict_test))

    # Understanding the classification report:
    # Precision: TP/(TP + FP) - ideal 1 - all instances reported as x were x.
    # In other words, there were no instances reported as x that were NOT x
    # Recall: TP/(TP + FN) - ideal 1 - all instances OF x were reported as x
    # Although accuracy does not appear in the report, it is important to
    # know what it means:
    # Accuracy: (TP + TN) / (TP + TN + FP + FN)
    print("\nClassification Report:")
    print(metrics.classification_report(y_test, y_predict_test))

    # Understanding the confusion matrix
    # how many of class i were predicted as j
    # ideally, an identity matrix
    print("Confusion Matrix:")
    print(metrics.confusion_matrix(y_test, y_predict_test))
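To make the precision/recall/accuracy definitions in the comments above concrete, a tiny self-contained check on toy labels (not from the dataset used above):

from sklearn import metrics

y_true = [1, 1, 1, 0, 0, 0, 0, 0]
y_pred = [1, 1, 0, 1, 0, 0, 0, 0]
# For class 1: TP=2, FP=1, FN=1, TN=4
# precision = TP/(TP+FP) = 2/3, recall = TP/(TP+FN) = 2/3
# accuracy = (TP+TN)/total = (2+4)/8 = 0.75
print(metrics.precision_score(y_true, y_pred))  # 0.666...
print(metrics.recall_score(y_true, y_pred))     # 0.666...
print(metrics.accuracy_score(y_true, y_pred))   # 0.75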
def getSGDClassifier(X, Y):
    sgdclassifier = SGDClassifier(loss='log', penalty='l1', n_iter=10,
                                  shuffle=True, random_state=0)
    print("[SGD Classifier] train on full data... -> 42k samples")
    sgdclassifier.fit(X, Y)
    return sgdclassifier
plt.scatter(features[:, 1], features[:, 2], c=labels)
plt.plot(x1, x_2)

# sklearn implementation
from sklearn.linear_model import Perceptron
from sklearn.linear_model.stochastic_gradient import SGDClassifier
from sklearn.metrics import accuracy_score

# Fitting an sklearn Perceptron and an SGDClassifier with perceptron loss
# function (these should be identical)
clf = Perceptron(random_state=None, eta0=0.1, shuffle=False, penalty=None,
                 class_weight=None, fit_intercept=False)
clf2 = SGDClassifier(loss="perceptron", eta0=0.1, learning_rate="constant",
                     penalty=None, random_state=None, shuffle=False,
                     fit_intercept=False, warm_start=False, average=False,
                     n_iter=1000)
clf.fit(x_train, y_train)
clf2.fit(x_train, y_train)
y_predict = clf.predict(x_test)
y_preSGD = clf2.predict(x_test)
print("sklearn Perceptron accuracy:")
print(accuracy_score(y_test, y_predict))
print("sklearn SGDClassifier accuracy:")
print(accuracy_score(y_test, y_preSGD))
print("my perceptron accuracy:")
print(accuracy_score(y_test, y_pred))
print("\n")
# print(clf.coef_)
xi = X_train[i].reshape((nb_features, 1))
loss = max(0, 1 - (Y_train[i] * np.dot(w.T, xi)))
tau = loss / (np.power(np.linalg.norm(xi, ord=2), 2) + (1 / (2 * C)))
coeff = tau * Y_train[i]
w += coeff * xi

# Compute accuracy
Y_pred = np.sign(np.dot(w.T, X_test.T))
c = np.count_nonzero(Y_pred - Y_test)
print('PA accuracy: {}'.format(1 - float(c) / X_test.shape[0]))

# Train a Stochastic Gradient Descent classifier
poly = PolynomialFeatures(degree=2)
X_train = poly.fit_transform(X_train)
X_test = poly.transform(X_test)  # transform (not fit_transform) the test set
SGDC = SGDClassifier(alpha=0.01, loss='hinge', penalty='l2',
                     fit_intercept=True, tol=1e-3, n_jobs=-1)
SGDC.fit(X_train, Y_train)
print('SGDClassifier score: {}'.format(SGDC.score(X_test, Y_test)))

# Passive Aggressive Classifier
PA = PassiveAggressiveClassifier(C=0.01, loss='squared_hinge', n_jobs=-1)
PA.fit(X_train, Y_train)
print('PA score: {}'.format(PA.score(X_test, Y_test)))
class SGD(AutoSklearnClassificationAlgorithm):
    def __init__(self, loss, penalty, alpha, fit_intercept, n_iter,
                 learning_rate, class_weight, l1_ratio=0.15, epsilon=0.1,
                 eta0=0.01, power_t=0.5, random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        self.class_weight = class_weight
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.estimator = None

    def fit(self, X, Y):
        # TODO: maybe scale training data that its norm becomes 1?
        # http://scikit-learn.org/stable/modules/sgd.html#id1
        self.alpha = float(self.alpha)
        self.fit_intercept = bool(self.fit_intercept)
        self.n_iter = int(self.n_iter)
        if self.class_weight == "None":
            self.class_weight = None
        self.l1_ratio = float(self.l1_ratio)
        self.epsilon = float(self.epsilon)
        self.eta0 = float(self.eta0)
        self.power_t = float(self.power_t)
        self.estimator = SGDClassifier(loss=self.loss,
                                       penalty=self.penalty,
                                       alpha=self.alpha,
                                       fit_intercept=self.fit_intercept,
                                       n_iter=self.n_iter,
                                       learning_rate=self.learning_rate,
                                       class_weight=self.class_weight,
                                       l1_ratio=self.l1_ratio,
                                       epsilon=self.epsilon,
                                       eta0=self.eta0,
                                       power_t=self.power_t,
                                       shuffle=True,
                                       random_state=self.random_state)
        self.estimator.fit(X, Y)
        return self

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties():
        return {'shortname': 'SGD Classifier',
                'name': 'Stochastic Gradient Descent Classifier',
                'handles_missing_values': False,
                'handles_nominal_values': False,
                'handles_numerical_features': True,
                'prefers_data_scaled': True,
                'prefers_data_normalized': True,
                'handles_multiclass': True,
                'handles_multilabel': False,
                'is_deterministic': True,
                'handles_sparse': True,
                # TODO find out what is best used here!
                'preferred_dtype': None}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        loss = CategoricalHyperparameter(
            "loss", ["hinge", "log", "modified_huber", "squared_hinge",
                     "perceptron"], default="hinge")
        penalty = CategoricalHyperparameter(
            "penalty", ["l1", "l2", "elasticnet"], default="l2")
        alpha = UniformFloatHyperparameter(
            "alpha", 10**-7, 10**-1, log=True, default=0.0001)
        l1_ratio = UniformFloatHyperparameter("l1_ratio", 0, 1, default=0.15)
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        n_iter = UniformIntegerHyperparameter("n_iter", 5, 1000, default=20)
        epsilon = UniformFloatHyperparameter(
            "epsilon", 1e-5, 1e-1, default=1e-4, log=True)
        learning_rate = CategoricalHyperparameter(
            "learning_rate", ["optimal", "invscaling", "constant"],
            default="optimal")
        eta0 = UniformFloatHyperparameter("eta0", 10**-7, 0.1, default=0.01)
        power_t = UniformFloatHyperparameter("power_t", 1e-5, 1, default=0.5)
        # This does not allow for other resampling methods!
        class_weight = CategoricalHyperparameter(
            "class_weight", ["None", "auto"], default="None")
        cs = ConfigurationSpace()
        cs.add_hyperparameter(loss)
        cs.add_hyperparameter(penalty)
        cs.add_hyperparameter(alpha)
        cs.add_hyperparameter(l1_ratio)
        cs.add_hyperparameter(fit_intercept)
        cs.add_hyperparameter(n_iter)
        cs.add_hyperparameter(epsilon)
        cs.add_hyperparameter(learning_rate)
        cs.add_hyperparameter(eta0)
        cs.add_hyperparameter(power_t)
        cs.add_hyperparameter(class_weight)

        # TODO add passive/aggressive here, although not properly documented?
        elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
        epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")
        # eta0 seems to be always active according to the source code; when
        # learning_rate is set to optimal, eta0 is the starting value:
        # https://github.com/scikit-learn/scikit-learn/blob/0.15.X/sklearn/linear_model/sgd_fast.pyx
        # eta0_and_inv = EqualsCondition(eta0, learning_rate, "invscaling")
        # eta0_and_constant = EqualsCondition(eta0, learning_rate, "constant")
        # eta0_condition = OrConjunction(eta0_and_inv, eta0_and_constant)
        power_t_condition = EqualsCondition(power_t, learning_rate,
                                            "invscaling")

        cs.add_condition(elasticnet)
        cs.add_condition(epsilon_condition)
        cs.add_condition(power_t_condition)

        return cs

    def __str__(self):
        return "AutoSklearn StochasticGradientClassifier"
data[column_names[10]], test_size=0.25, random_state=33)

# Check the number and class distribution of the training samples
print(y_train.value_counts())
# Check the number and class distribution of the test samples
print(y_test.value_counts())

# Use linear classification models for benign/malignant breast cancer tumour prediction
# Standardize the data so every feature dimension has variance 1 and mean 0,
# so that predictions are not dominated by features with large values
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Initialize LogisticRegression and SGDClassifier
lr = LogisticRegression()
sgdc = SGDClassifier()

# Call fit on LogisticRegression to train the model parameters
lr.fit(X_train, y_train)
# Predict X_test with the trained model lr; store the result in lr_y_predict
lr_y_predict = lr.predict(X_test)

# Call fit on SGDClassifier to train the model parameters
sgdc.fit(X_train, y_train)
# Predict X_test with the trained model sgdc; store the result in sgdc_y_predict
sgdc_y_predict = sgdc.predict(X_test)

# Performance analysis of the linear classifiers on the tumour prediction task
# Use the logistic regression model's own score function for test-set accuracy
print('Accuracy of the LR classifier:', lr.score(X_test, y_test))
# Use classification_report for the other three LogisticRegression metrics
print(classification_report(y_test, lr_y_predict,
                            target_names=['Benign', 'Malignant']))
class SGD(AutoSklearnClassificationAlgorithm):
    def __init__(self, loss, penalty, alpha, fit_intercept, tol,
                 learning_rate, l1_ratio=0.15, epsilon=0.1, eta0=0.01,
                 power_t=0.5, average=False, random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.tol = tol
        self.learning_rate = learning_rate
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None

    def fit(self, X, y, sample_weight=None):
        n_iter = 2
        self.iterative_fit(X, y, n_iter=n_iter, sample_weight=sample_weight,
                           refit=True)
        while not self.configuration_fully_fitted():
            n_iter *= 2
            self.iterative_fit(X, y, n_iter=n_iter,
                               sample_weight=sample_weight)
        return self

    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        from sklearn.linear_model.stochastic_gradient import SGDClassifier

        # Need to fit at least two iterations, otherwise early stopping will not
        # work because we cannot determine whether the algorithm actually
        # converged. The only way of finding this out is if the sgd spends less
        # iterations than max_iter. If max_iter == 1, it has to spend at least
        # one iteration and will always spend at least one iteration, so we
        # cannot know about convergence.

        if refit:
            self.estimator = None

        if self.estimator is None:
            self.alpha = float(self.alpha)
            self.fit_intercept = self.fit_intercept == 'True'
            self.l1_ratio = float(self.l1_ratio) \
                if self.l1_ratio is not None else 0.15
            self.epsilon = float(self.epsilon) \
                if self.epsilon is not None else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(self.power_t) \
                if self.power_t is not None else 0.25
            self.average = self.average == 'True'
            self.tol = float(self.tol)

            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           max_iter=n_iter,
                                           tol=self.tol,
                                           learning_rate=self.learning_rate,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state,
                                           warm_start=True)
            self.estimator.fit(X, y, sample_weight=sample_weight)
        else:
            self.estimator.max_iter += n_iter
            self.estimator._validate_params()
            self.estimator._partial_fit(
                X, y,
                alpha=self.estimator.alpha,
                C=1.0,
                loss=self.estimator.loss,
                learning_rate=self.estimator.learning_rate,
                max_iter=n_iter,
                sample_weight=sample_weight,
                classes=None,
                coef_init=None,
                intercept_init=None)

        if self.estimator._max_iter >= 1000 or n_iter > self.estimator.n_iter_:
            self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'SGD Classifier',
                'name': 'Stochastic Gradient Descent Classifier',
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': False,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()
        loss = CategoricalHyperparameter(
            "loss", ["hinge", "log", "modified_huber", "squared_hinge",
                     "perceptron"], default_value="log")
        penalty = CategoricalHyperparameter(
            "penalty", ["l1", "l2", "elasticnet"], default_value="l2")
        alpha = UniformFloatHyperparameter(
            "alpha", 1e-7, 1e-1, log=True, default_value=0.0001)
        l1_ratio = UniformFloatHyperparameter(
            "l1_ratio", 1e-9, 1, log=True, default_value=0.15)
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        tol = UniformFloatHyperparameter(
            "tol", 1e-5, 1e-1, log=True, default_value=1e-4)
        epsilon = UniformFloatHyperparameter(
            "epsilon", 1e-5, 1e-1, default_value=1e-4, log=True)
        learning_rate = CategoricalHyperparameter(
            "learning_rate", ["optimal", "invscaling", "constant"],
            default_value="invscaling")
        eta0 = UniformFloatHyperparameter(
            "eta0", 1e-7, 1e-1, default_value=0.01)
        power_t = UniformFloatHyperparameter(
            "power_t", 1e-5, 1, default_value=0.25)
        average = CategoricalHyperparameter(
            "average", ["False", "True"], default_value="False")
        cs.add_hyperparameters([loss, penalty, alpha, l1_ratio, fit_intercept,
                                tol, epsilon, learning_rate, eta0, power_t,
                                average])

        # TODO add passive/aggressive here, although not properly documented?
        elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
        epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")
        # eta0 seems to be always active according to the source code; when
        # learning_rate is set to optimal, eta0 is the starting value:
        # https://github.com/scikit-learn/scikit-learn/blob/0.15.X/sklearn/linear_model/sgd_fast.pyx
        # eta0_and_inv = EqualsCondition(eta0, learning_rate, "invscaling")
        # eta0_and_constant = EqualsCondition(eta0, learning_rate, "constant")
        # eta0_condition = OrConjunction(eta0_and_inv, eta0_and_constant)
        power_t_condition = EqualsCondition(power_t, learning_rate,
                                            "invscaling")
        cs.add_conditions([elasticnet, epsilon_condition, power_t_condition])

        return cs
# features and labels (the target/data assignments were swapped in the original)
X = cancer.data
ytrue = np.copy(cancer.target).flatten()
ytrue[ytrue > 0] = 1

# label a few points
labeled_N = 4
ys = np.array([-1] * len(ytrue))  # -1 denotes unlabeled point
random_labeled_points = random.sample(list(np.where(ytrue == 0)[0]), labeled_N // 2) + \
                        random.sample(list(np.where(ytrue == 1)[0]), labeled_N // 2)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
# basemodel = WQDA()  # weighted Quadratic Discriminant Analysis
basemodel = SGDClassifier(loss='log', penalty='l1')  # scikit logistic regression
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
print("supervised log.reg. score", basemodel.score(X, ytrue))

# fast (but naive, unsafe) self-learning framework
ssmodel = SelfLearningModel(basemodel)
ssmodel.fit(X, ys)
print("self-learning log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score (base model has to be able to take weighted samples)
ssmodel = CPLELearningModel(basemodel)
ssmodel.fit(X, ys)
print("CPLE semi-supervised log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score, WQDA model
ssmodel = CPLELearningModel(WQDA(), predict_from_probabilities=True)  # weighted Quadratic Discriminant Analysis
     [190, 90, 47], [175, 64, 39], [177, 70, 40], [159, 55, 37],
     [171, 75, 42], [181, 85, 43]]

# Corresponding gender tags
Y = ['male', 'female', 'female', 'female', 'male', 'male',
     'male', 'female', 'male', 'female', 'male']

# Decision Tree classifier - takes in the input data to predict whether male or female
classifier = tree.DecisionTreeClassifier()
classifier = classifier.fit(X, Y)

# Support Vector Machine classifier
classifier1 = svm.SVC()
classifier1 = classifier1.fit(X, Y)

# Stochastic Gradient Descent
clf = SGDClassifier()
clf = clf.fit(X, Y)

# Prediction step
# prediction for decision trees
prediction = classifier.predict([[172, 75, 35]])
print(prediction)

# prediction for Support Vector Machines
prediction1 = classifier1.predict([[177, 70, 43]])
print(prediction1)

# prediction for Stochastic Gradient Descent
pred = clf.predict([[172, 75, 35]])
print(pred)
class SGD:
    def __init__(self, loss, penalty, alpha, fit_intercept, tol,
                 learning_rate, l1_ratio=0.15, epsilon=0.1, eta0=0.01,
                 power_t=0.5, average=False, random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.tol = tol
        self.learning_rate = learning_rate
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None

    def fit(self, X, y, sample_weight=None):
        self.iterative_fit(X, y, n_iter=2, refit=True,
                           sample_weight=sample_weight)
        iteration = 2
        while not self.configuration_fully_fitted():
            n_iter = int(2**iteration / 2)
            self.iterative_fit(X, y, n_iter=n_iter,
                               sample_weight=sample_weight)
            iteration += 1
        return self

    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        from sklearn.linear_model.stochastic_gradient import SGDClassifier

        # Need to fit at least two iterations, otherwise early stopping will not
        # work because we cannot determine whether the algorithm actually
        # converged. The only way of finding this out is if the sgd spends less
        # iterations than max_iter. If max_iter == 1, it has to spend at least
        # one iteration and will always spend at least one iteration, so we
        # cannot know about convergence.

        if refit:
            self.estimator = None

        if self.estimator is None:
            self.fully_fit_ = False

            self.alpha = float(self.alpha)
            self.l1_ratio = float(self.l1_ratio) \
                if self.l1_ratio is not None else 0.15
            self.epsilon = float(self.epsilon) \
                if self.epsilon is not None else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(self.power_t) \
                if self.power_t is not None else 0.5
            self.average = check_for_bool(self.average)
            self.fit_intercept = check_for_bool(self.fit_intercept)
            self.tol = float(self.tol)

            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           max_iter=n_iter,
                                           tol=self.tol,
                                           learning_rate=self.learning_rate,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state,
                                           warm_start=True)
            self.estimator.fit(X, y, sample_weight=sample_weight)
        else:
            self.estimator.max_iter += n_iter
            self.estimator.max_iter = min(self.estimator.max_iter, 512)
            self.estimator._validate_params()
            self.estimator._partial_fit(
                X, y,
                alpha=self.estimator.alpha,
                C=1.0,
                loss=self.estimator.loss,
                learning_rate=self.estimator.learning_rate,
                max_iter=n_iter,
                sample_weight=sample_weight,
                classes=None,
                coef_init=None,
                intercept_init=None)

        if self.estimator._max_iter >= 512 or n_iter > self.estimator.n_iter_:
            self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)
class SGD(ParamSklearnClassificationAlgorithm):
    def __init__(self, loss, penalty, alpha, fit_intercept, n_iter,
                 learning_rate, class_weight=None, l1_ratio=0.15,
                 epsilon=0.1, eta0=0.01, power_t=0.5, average=False,
                 random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        self.class_weight = class_weight
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None

    def fit(self, X, y):
        while not self.configuration_fully_fitted():
            self.iterative_fit(X, y, n_iter=1)
        return self

    def iterative_fit(self, X, y, n_iter=1, refit=False):
        if refit:
            self.estimator = None

        if self.estimator is None:
            self.alpha = float(self.alpha)
            self.fit_intercept = self.fit_intercept == 'True'
            self.n_iter = int(self.n_iter)
            if self.class_weight == "None":
                self.class_weight = None
            self.l1_ratio = float(self.l1_ratio) \
                if self.l1_ratio is not None else 0.15
            self.epsilon = float(self.epsilon) \
                if self.epsilon is not None else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(self.power_t) \
                if self.power_t is not None else 0.25
            self.average = self.average == 'True'
            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           n_iter=self.n_iter,
                                           learning_rate=self.learning_rate,
                                           class_weight=self.class_weight,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state)

        self.estimator.n_iter += n_iter
        self.estimator.fit(X, y)
        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        return not self.estimator.n_iter < self.n_iter

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'SGD Classifier',
                'name': 'Stochastic Gradient Descent Classifier',
                'handles_missing_values': False,
                'handles_nominal_values': False,
                'handles_numerical_features': True,
                'prefers_data_scaled': True,
                'prefers_data_normalized': True,
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': False,
                'is_deterministic': True,
                'handles_sparse': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,),
                # TODO find out what is best used here!
                'preferred_dtype': None}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()
        loss = cs.add_hyperparameter(CategoricalHyperparameter(
            "loss", ["hinge", "log", "modified_huber", "squared_hinge",
                     "perceptron"], default="hinge"))
        penalty = cs.add_hyperparameter(CategoricalHyperparameter(
            "penalty", ["l1", "l2", "elasticnet"], default="l2"))
        alpha = cs.add_hyperparameter(UniformFloatHyperparameter(
            "alpha", 10e-7, 1e-1, log=True, default=0.0001))
        l1_ratio = cs.add_hyperparameter(UniformFloatHyperparameter(
            "l1_ratio", 0, 1, default=0.15))
        fit_intercept = cs.add_hyperparameter(UnParametrizedHyperparameter(
            "fit_intercept", "True"))
        n_iter = cs.add_hyperparameter(UniformIntegerHyperparameter(
            "n_iter", 5, 1000, default=20))
        epsilon = cs.add_hyperparameter(UniformFloatHyperparameter(
            "epsilon", 1e-5, 1e-1, default=1e-4, log=True))
        learning_rate = cs.add_hyperparameter(CategoricalHyperparameter(
            "learning_rate", ["optimal", "invscaling", "constant"],
            default="optimal"))
        eta0 = cs.add_hyperparameter(UniformFloatHyperparameter(
            "eta0", 10**-7, 0.1, default=0.01))
        power_t = cs.add_hyperparameter(UniformFloatHyperparameter(
            "power_t", 1e-5, 1, default=0.25))
        average = cs.add_hyperparameter(CategoricalHyperparameter(
            "average", ["False", "True"], default="False"))

        # TODO add passive/aggressive here, although not properly documented?
        elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
        epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")
        # eta0 seems to be always active according to the source code; when
        # learning_rate is set to optimal, eta0 is the starting value:
        # https://github.com/scikit-learn/scikit-learn/blob/0.15.X/sklearn/linear_model/sgd_fast.pyx
        # eta0_and_inv = EqualsCondition(eta0, learning_rate, "invscaling")
        # eta0_and_constant = EqualsCondition(eta0, learning_rate, "constant")
        # eta0_condition = OrConjunction(eta0_and_inv, eta0_and_constant)
        power_t_condition = EqualsCondition(power_t, learning_rate,
                                            "invscaling")

        cs.add_condition(elasticnet)
        cs.add_condition(epsilon_condition)
        cs.add_condition(power_t_condition)

        return cs

    def __str__(self):
        return "ParamSklearn StochasticGradientClassifier"
# # SGD Classifier

# In[121]:

import gc
from sklearn.metrics import mean_squared_error
from sklearn.linear_model.stochastic_gradient import SGDClassifier

estimator = SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001,
                          l1_ratio=0.15, fit_intercept=True, max_iter=None,
                          tol=None, shuffle=True, verbose=0, epsilon=0.1,
                          n_jobs=1, random_state=None,
                          learning_rate='optimal', eta0=0.0, power_t=0.5,
                          class_weight=None, warm_start=False, average=False,
                          n_iter=None)
mean_squared_error(y_test, estimator.fit(X_train, y_train).predict(X_test))

# In[46]:

from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier

pca = PCA(n_components=175)
pca.fit(X)
X_pca = pca.transform(X)

# In[47]:
                      use_idf=True, smooth_idf=True, sublinear_tf=True)
X_train_feature = vec.fit_transform(train_data['word_seg'])
X_test_feature = vec.transform(test_data['word_seg'])

# -------------- sentiment value prediction starts ------------------------
y_train_sent = train_data['sentiment_value'].astype(int)
X_train_sent, X_test_sent, y_train_sent, y_test_sent = \
    train_test_split(X_train_feature, y_train_sent, test_size=0.1,
                     random_state=42)
# clf = LogisticRegression(C=4, dual=True)
# clf = svm.LinearSVC()
# clf = RandomForestClassifier()
clf = SGDClassifier(n_iter=80)
# tune_params(X_train_sent, y_train_sent)
clf.fit(X_train_sent, y_train_sent)

# Evaluate the model on the held-out split
pred_test_sent = clf.predict(X_test_sent)
# precision = true positives / (true positives + false positives)
precision = precision_score(y_test_sent, pred_test_sent, pos_label=None,
                            average='weighted')
# recall = true positives / (true positives + false negatives)
recall = recall_score(y_test_sent, pred_test_sent, pos_label=None,
                      average='weighted')
# F1
f1 = f1_score(y_test_sent, pred_test_sent, pos_label=None,
              average='weighted')
                          fit_intercept=True, shuffle=True, verbose=0,
                          epsilon=0.1, n_jobs=1, random_state=None,
                          learning_rate='optimal', eta0=0.0, power_t=0.5,
                          class_weight=None, warm_start=False, average=False)
print(mean_squared_error(
    y_test, estimator.fit(X_train, y_train).predict(X_test)))
print("Accuracy score of model: {}".format(
    accuracy_score(y_test, estimator.fit(X_train, y_train).predict(X_test))))

# # Reading the Dataset in chunks & applying partial_fit to check the Training and Testing loss variation

# In[133]:

import numpy as np

chunksize = 4000
estimator = SGDClassifier(loss='squared_hinge', penalty='l2', alpha=0.0001,
                          l1_ratio=0.15, fit_intercept=True,
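The snippet above breaks off mid-call; a minimal sketch of the chunked partial_fit pattern its heading announces, assuming a CSV whose first column is the label (the file name and column layout are placeholders):

import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier(loss='squared_hinge', penalty='l2', alpha=0.0001)
classes = np.array([0, 1])  # all labels must be declared up front
for chunk in pd.read_csv('train.csv', chunksize=4000):  # hypothetical file
    X_chunk = chunk.iloc[:, 1:].values  # assumed layout: label first, then features
    y_chunk = chunk.iloc[:, 0].values
    clf.partial_fit(X_chunk, y_chunk, classes=classes)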
https://blog.csdn.net/anecdotegyb/article/details/74857055
https://blog.csdn.net/quiet_girl/article/details/72517053
"""
ss = StandardScaler()
X_train = ss.fit_transform(X_train)  # fit to the data, then convert it to the standardized form
X_test = ss.transform(X_test)  # perform standardization by centering and scaling

# Initialize Stochastic Gradient Descent Classifier & Logistic Regression
lr = LogisticRegression()
sgdc = SGDClassifier()

lr.fit(X_train, y_train)  # train the LR classifier
lr_y_predict = lr.predict(X_test)  # predict on X_test

sgdc.fit(X_train, y_train)  # train the stochastic gradient descent classifier
sgdc_y_predict = sgdc.predict(X_test)  # predict on X_test

# Performance analysis
print('Accuracy of LR Classifier:', lr.score(X_test, y_test))
print(classification_report(y_test, lr_y_predict,
                            target_names=['Benign', 'Malignant']))
print('Accuracy of SGD Classifier:', sgdc.score(X_test, y_test))
print(classification_report(y_test, sgdc_y_predict,
                            target_names=['Benign', 'Malignant']))
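Several of these breast-cancer snippets assume X_train, X_test, y_train, y_test already exist. One plausible way to produce them, using scikit-learn's bundled dataset rather than the original CSV (an assumption), with the same split parameters seen earlier in this section:

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, test_size=0.25, random_state=33)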
              random.sample(list(np.where(y_train == 2)[0]), 500) + \
              random.sample(list(np.where(y_train == 3)[0]), 200)
# two category
# select_list = random.sample(list(np.where(y_train == 0)[0]), 1000) + \
#               random.sample(list(np.where(y_train == 1)[0]), 1000)

# set the supervised instances
ys[select_list] = y_train[select_list]

# the base model
# there is no improvement
basemodel = SGDClassifier(loss='log', penalty='l1')  # scikit logistic regression
# model fit
basemodel.fit(X_train[select_list, :], ys[select_list])
print("supervised log.reg. score", basemodel.score(X_test, y_test))
print('\n')

# ###########################################
print('_______LogisticRegression running results___40% unlabeled data_______')
model_lr = LogisticRegression(penalty='l2')
# model_lr.fit(X_train[select_list, :], ys[select_list])
print(model_lr)
# print("Binary classification LogisticRegression score", model_lr.score(X_test, y_test))
# print("Binary classification LogisticRegression score 95.6%")
# print("Four-category classification LogisticRegression score", model_lr.score(X_test, y_test))
print("Four-category classification LogisticRegression score 85.2%")
print()

# ########################## SVM ##########################
class SGD(
    IterativeComponentWithSampleWeight,
    AutoSklearnClassificationAlgorithm,
):
    def __init__(self, loss, penalty, alpha, fit_intercept, tol,
                 learning_rate, l1_ratio=0.15, epsilon=0.1,
                 eta0=0.01, power_t=0.5, average=False, random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.tol = tol
        self.learning_rate = learning_rate
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None

    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        from sklearn.linear_model.stochastic_gradient import SGDClassifier

        # Need to fit at least two iterations, otherwise early stopping will not
        # work because we cannot determine whether the algorithm actually
        # converged. The only way of finding this out is if the sgd spends less
        # iterations than max_iter. If max_iter == 1, it has to spend at least
        # one iteration and will always spend at least one iteration, so we
        # cannot know about convergence.

        if refit:
            self.estimator = None

        if self.estimator is None:
            self.fully_fit_ = False

            self.alpha = float(self.alpha)
            self.l1_ratio = float(self.l1_ratio) \
                if self.l1_ratio is not None else 0.15
            self.epsilon = float(self.epsilon) \
                if self.epsilon is not None else 0.1
            self.eta0 = float(self.eta0)
            self.power_t = float(self.power_t) \
                if self.power_t is not None else 0.5
            self.average = check_for_bool(self.average)
            self.fit_intercept = check_for_bool(self.fit_intercept)
            self.tol = float(self.tol)

            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           max_iter=n_iter,
                                           tol=self.tol,
                                           learning_rate=self.learning_rate,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state,
                                           warm_start=True)
            self.estimator.fit(X, y, sample_weight=sample_weight)
        else:
            self.estimator.max_iter += n_iter
            self.estimator.max_iter = min(self.estimator.max_iter, 512)
            self.estimator._validate_params()
            self.estimator._partial_fit(
                X, y,
                alpha=self.estimator.alpha,
                C=1.0,
                loss=self.estimator.loss,
                learning_rate=self.estimator.learning_rate,
                max_iter=n_iter,
                sample_weight=sample_weight,
                classes=None,
                coef_init=None,
                intercept_init=None
            )

        if self.estimator._max_iter >= 512 or n_iter > self.estimator.n_iter_:
            self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'SGD Classifier',
                'name': 'Stochastic Gradient Descent Classifier',
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': False,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None):
        cs = ConfigurationSpace()
        loss = CategoricalHyperparameter(
            "loss", ["hinge", "log", "modified_huber", "squared_hinge",
                     "perceptron"], default_value="log")
        penalty = CategoricalHyperparameter(
            "penalty", ["l1", "l2", "elasticnet"], default_value="l2")
        alpha = UniformFloatHyperparameter(
            "alpha", 1e-7, 1e-1, log=True, default_value=0.0001)
        l1_ratio = UniformFloatHyperparameter(
            "l1_ratio", 1e-9, 1, log=True, default_value=0.15)
        fit_intercept = UnParametrizedHyperparameter("fit_intercept", "True")
        tol = UniformFloatHyperparameter(
            "tol", 1e-5, 1e-1, log=True, default_value=1e-4)
        epsilon = UniformFloatHyperparameter(
            "epsilon", 1e-5, 1e-1, default_value=1e-4, log=True)
        learning_rate = CategoricalHyperparameter(
            "learning_rate", ["optimal", "invscaling", "constant"],
            default_value="invscaling")
        eta0 = UniformFloatHyperparameter(
            "eta0", 1e-7, 1e-1, default_value=0.01, log=True)
        power_t = UniformFloatHyperparameter(
            "power_t", 1e-5, 1, default_value=0.5)
        average = CategoricalHyperparameter(
            "average", ["False", "True"], default_value="False")
        cs.add_hyperparameters([loss, penalty, alpha, l1_ratio, fit_intercept,
                                tol, epsilon, learning_rate, eta0, power_t,
                                average])

        # TODO add passive/aggressive here, although not properly documented?
        elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
        epsilon_condition = EqualsCondition(epsilon, loss, "modified_huber")
        power_t_condition = EqualsCondition(power_t, learning_rate,
                                            "invscaling")
        # eta0 is only relevant if learning_rate!='optimal' according to code
        # https://github.com/scikit-learn/scikit-learn/blob/0.19.X/sklearn/
        # linear_model/sgd_fast.pyx#L603
        eta0_in_inv_con = InCondition(eta0, learning_rate,
                                      ["invscaling", "constant"])
        cs.add_conditions([elasticnet, epsilon_condition, power_t_condition,
                           eta0_in_inv_con])

        return cs
def result():
    if request.method == 'POST':
        path = request.files.get('myFile')

        # Reading the CSV file and converting it into a pandas data-frame
        df = pd.read_csv(path, encoding="ISO-8859-1")

        # Reading the name for the file for the model that will be saved
        filename = request.form['filename']

        # Reading the names of the feature and label as strings
        str1 = request.form['feature']
        str2 = request.form['label']

        # Assigning the feature and label variables to the respective columns
        if str1 in list(df) and str2 in list(df):
            y = df[str2]
            X = df[str1]
        else:
            return render_template('nameError.html')

        '''
        # Removing the punctuations and HTTP links in the feature text input
        x = []
        for subject in X:
            result = re.sub(r"http\S+", "", subject)
            replaced = re.sub(r'[^a-zA-Z0-9 ]+', '', result)
            x.append(replaced)
        X = pd.Series(x)
        '''

        X = X.str.lower()

        # Optional use of Tokenization and Lemmatization using Natural
        # Language Processing in SpaCy
        """
        texts = []
        for doc in X:
            doc = nlp(doc, disable=['parser', 'ner'])
            tokens = [tok.lemma_.lower().strip() for tok in doc
                      if tok.lemma_ != '-PRON-']
            tokens = [tok for tok in tokens if tok not in stopwords]
            tokens = ' '.join(tokens)
            texts.append(tokens)
        X = pd.Series(texts)
        """

        # Splitting the data-set into 2 parts: training data and test data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.33, shuffle=True)

        tfidfvect = TfidfVectorizer(ngram_range=(1, 1))
        X_train_tfidf = tfidfvect.fit_transform(X_train)

        # Fitting all the classification models one by one and recording
        # their accuracies and execution times
        start = time()
        clf1 = LinearSVC()
        clf1.fit(X_train_tfidf, y_train)
        pred_SVC = clf1.predict(tfidfvect.transform(X_test))
        a1 = accuracy_score(y_test, pred_SVC)
        end = time()
        print("accuracy SVC: {} and time: {} s".format(a1, (end - start)))

        start = time()
        clf2 = LogisticRegression(n_jobs=-1, multi_class='multinomial',
                                  solver='newton-cg')
        clf2.fit(X_train_tfidf, y_train)
        pred_LR = clf2.predict(tfidfvect.transform(X_test))
        a2 = accuracy_score(y_test, pred_LR)
        end = time()
        print("accuracy LR: {} and time: {}".format(a2, (end - start)))

        start = time()
        clf3 = RandomForestClassifier(n_jobs=-1)
        clf3.fit(X_train_tfidf, y_train)
        pred = clf3.predict(tfidfvect.transform(X_test))
        a3 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy RFC: {} and time: {}".format(a3, (end - start)))

        start = time()
        clf4 = MultinomialNB()
        clf4.fit(X_train_tfidf, y_train)
        pred = clf4.predict(tfidfvect.transform(X_test))
        a4 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy MNB: {} and time: {}".format(a4, (end - start)))

        start = time()
        clf11 = SGDClassifier(n_jobs=-1)
        clf11.fit(X_train_tfidf, y_train)
        pred = clf11.predict(tfidfvect.transform(X_test))
        a11 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy SGDC: {} and time: {}".format(a11, (end - start)))

        start = time()
        # NOTE: despite the "XGBC" label in the print below, clf12 is a
        # second SGDClassifier, identical to clf11
        clf12 = SGDClassifier(n_jobs=-1)
        clf12.fit(X_train_tfidf, y_train)
        pred = clf12.predict(tfidfvect.transform(X_test))
        a12 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy XGBC: {} and time: {}".format(a12, (end - start)))

        # Comparing the accuracies of all the models and then saving (dumping)
        # the model with the highest accuracy using pickle for later use
        acu_list = [a1, a2, a3, a4, a11, a12]
        max_list = max(acu_list)

        if max_list == a1:
            pickle.dump(clf1, open(filename + '_model', 'wb'))
        elif max_list == a2:
            pickle.dump(clf2, open(filename + '_model', 'wb'))
        elif max_list == a3:
            pickle.dump(clf3, open(filename + '_model', 'wb'))
        elif max_list == a4:
            pickle.dump(clf4, open(filename + '_model', 'wb'))
        elif max_list == a11:
            pickle.dump(clf11, open(filename + '_model', 'wb'))
        elif max_list == a12:
            pickle.dump(clf12, open(filename + '_model', 'wb'))

        pickle.dump(tfidfvect, open(filename + '_tfidfVect', 'wb'))

        return render_template("result.html", ac1=a1, ac2=a2, ac3=a3, ac4=a4,
                               ac11=a11, ac12=a12)
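A minimal sketch of how the dumped artifacts could be loaded later to serve predictions; the file names follow the pattern above, and predict_texts is a hypothetical helper:

import pickle

def predict_texts(filename, texts):
    # Hypothetical helper: load the winning classifier and the fitted
    # vectorizer dumped above, then vectorize and classify new texts
    model = pickle.load(open(filename + '_model', 'rb'))
    tfidfvect = pickle.load(open(filename + '_tfidfVect', 'rb'))
    return model.predict(tfidfvect.transform(texts))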
Listx = [[188, 57, 30], [167, 32, 22], [193, 65, 29], [185, 53, 27],
         [164, 45, 22], [157, 38, 24], [179, 52, 27], [175, 68, 26],
         [167, 39, 24], [178, 62, 27], [158, 46, 26]]

# List of labels Y. Gender female or male
Listy = ['male', 'female', 'male', 'male', 'female', 'female', 'male',
         'male', 'female', 'male', 'female']

# Models: decision tree, stochastic gradient descent, and naive Bayes
Clasifier_tree = tree.DecisionTreeClassifier()
Clasifier_Sgradient = SGDClassifier()
Clasifier_naive = GaussianNB()

# Training stage
Clasifier_tree = Clasifier_tree.fit(Listx, Listy)
Clasifier_Sgradient = Clasifier_Sgradient.fit(Listx, Listy)
Clasifier_naive = Clasifier_naive.fit(Listx, Listy)

# Test stage
Listz = [[150, 35, 21]]
Prediction_tree = Clasifier_tree.predict(Listz)
Prediction_Gradient = Clasifier_Sgradient.predict(Listz)
Prediction_naive = Clasifier_naive.predict(Listz)
print(Prediction_tree)
print(Prediction_Gradient)
print(Prediction_naive)
class SGD(
    IterativeComponentWithSampleWeight,
    BaseClassificationModel,
):
    def __init__(self, loss, penalty, alpha, fit_intercept, tol,
                 learning_rate, l1_ratio=0.15, epsilon=0.1,
                 eta0=0.01, power_t=0.5, average=False, random_state=None):
        self.loss = loss
        self.penalty = penalty
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.tol = tol
        self.learning_rate = learning_rate
        self.l1_ratio = l1_ratio
        self.epsilon = epsilon
        self.eta0 = eta0
        self.power_t = power_t
        self.random_state = random_state
        self.average = average
        self.estimator = None
        self.time_limit = None
        self.start_time = time.time()

    def iterative_fit(self, X, y, n_iter=2, refit=False, sample_weight=None):
        from sklearn.linear_model.stochastic_gradient import SGDClassifier

        # Need to fit at least two iterations, otherwise early stopping will not
        # work because we cannot determine whether the algorithm actually
        # converged. The only way of finding this out is if the sgd spends less
        # iterations than max_iter. If max_iter == 1, it has to spend at least
        # one iteration and will always spend at least one iteration, so we
        # cannot know about convergence.

        if refit:
            self.estimator = None

        if self.estimator is None:
            # Unpack nested (value, sub-configuration) tuples produced by the
            # TPE search space defined below
            if isinstance(self.loss, tuple):
                nested_loss = self.loss
                self.loss = nested_loss[0]
                if self.loss == 'modified_huber':
                    self.epsilon = nested_loss[1]['epsilon']

            if isinstance(self.penalty, tuple):
                nested_penalty = self.penalty
                self.penalty = nested_penalty[0]
                if self.penalty == "elasticnet":
                    self.l1_ratio = nested_penalty[1]['l1_ratio']

            if isinstance(self.learning_rate, tuple):
                nested_learning_rate = self.learning_rate
                self.learning_rate = nested_learning_rate[0]
                if self.learning_rate == 'invscaling':
                    self.eta0 = nested_learning_rate[1]['eta0']
                    self.power_t = nested_learning_rate[1]['power_t']
                elif self.learning_rate == 'constant':
                    self.eta0 = nested_learning_rate[1]['eta0']

            self.fully_fit_ = False

            self.alpha = float(self.alpha)
            self.l1_ratio = float(self.l1_ratio) \
                if self.l1_ratio is not None else 0.15
            self.epsilon = float(self.epsilon) \
                if self.epsilon is not None else 0.1
            self.eta0 = float(self.eta0) if self.eta0 is not None else 0.01
            self.power_t = float(self.power_t) \
                if self.power_t is not None else 0.5
            self.average = check_for_bool(self.average)
            self.fit_intercept = check_for_bool(self.fit_intercept)
            self.tol = float(self.tol)

            self.estimator = SGDClassifier(loss=self.loss,
                                           penalty=self.penalty,
                                           alpha=self.alpha,
                                           fit_intercept=self.fit_intercept,
                                           max_iter=n_iter,
                                           tol=self.tol,
                                           learning_rate=self.learning_rate,
                                           l1_ratio=self.l1_ratio,
                                           epsilon=self.epsilon,
                                           eta0=self.eta0,
                                           power_t=self.power_t,
                                           shuffle=True,
                                           average=self.average,
                                           random_state=self.random_state,
                                           warm_start=True)
            self.estimator.fit(X, y, sample_weight=sample_weight)
        else:
            self.estimator.max_iter += n_iter
            self.estimator.max_iter = min(self.estimator.max_iter, 512)
            self.estimator._validate_params()
            self.estimator._partial_fit(
                X, y,
                alpha=self.estimator.alpha,
                C=1.0,
                loss=self.estimator.loss,
                learning_rate=self.estimator.learning_rate,
                max_iter=n_iter,
                sample_weight=sample_weight,
                classes=None,
                coef_init=None,
                intercept_init=None
            )

        if self.estimator.max_iter >= 512 or n_iter > self.estimator.n_iter_:
            self.fully_fit_ = True

        return self

    def configuration_fully_fitted(self):
        if self.estimator is None:
            return False
        elif not hasattr(self, 'fully_fit_'):
            return False
        else:
            return self.fully_fit_

    def predict(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        return self.estimator.predict(X)

    def predict_proba(self, X):
        if self.estimator is None:
            raise NotImplementedError()
        if self.loss in ["log", "modified_huber"]:
            return self.estimator.predict_proba(X)
        else:
            df = self.estimator.decision_function(X)
            return softmax(df)

    @staticmethod
    def get_properties(dataset_properties=None):
        return {'shortname': 'SGD Classifier',
                'name': 'Stochastic Gradient Descent Classifier',
                'handles_regression': False,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': False,
                'is_deterministic': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (PREDICTIONS,)}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties=None,
                                        optimizer='smac'):
        if optimizer == 'smac':
            cs = ConfigurationSpace()
            loss = CategoricalHyperparameter(
                "loss", ["hinge", "log", "modified_huber", "squared_hinge",
                         "perceptron"], default_value="log")
            penalty = CategoricalHyperparameter(
                "penalty", ["l1", "l2", "elasticnet"], default_value="l2")
            alpha = UniformFloatHyperparameter(
                "alpha", 1e-7, 1e-1, log=True, default_value=0.0001)
            l1_ratio = UniformFloatHyperparameter(
                "l1_ratio", 1e-9, 1, log=True, default_value=0.15)
            fit_intercept = UnParametrizedHyperparameter(
                "fit_intercept", "True")
            tol = UniformFloatHyperparameter(
                "tol", 1e-5, 1e-1, log=True, default_value=1e-4)
            epsilon = UniformFloatHyperparameter(
                "epsilon", 1e-5, 1e-1, default_value=1e-4, log=True)
            learning_rate = CategoricalHyperparameter(
                "learning_rate", ["optimal", "invscaling", "constant"],
                default_value="invscaling")
            eta0 = UniformFloatHyperparameter(
                "eta0", 1e-7, 1e-1, default_value=0.01, log=True)
            power_t = UniformFloatHyperparameter(
                "power_t", 1e-5, 1, log=True, default_value=0.5)
            average = CategoricalHyperparameter(
                "average", ["False", "True"], default_value="False")

            cs.add_hyperparameters([loss, penalty, alpha, l1_ratio,
                                    fit_intercept, tol, epsilon,
                                    learning_rate, eta0, power_t, average])

            # TODO add passive/aggressive here, although not properly documented?
            elasticnet = EqualsCondition(l1_ratio, penalty, "elasticnet")
            epsilon_condition = EqualsCondition(epsilon, loss,
                                                "modified_huber")
            power_t_condition = EqualsCondition(power_t, learning_rate,
                                                "invscaling")
            # eta0 is only relevant if learning_rate!='optimal' according to code
            # https://github.com/scikit-learn/scikit-learn/blob/0.19.X/sklearn/
            # linear_model/sgd_fast.pyx#L603
            eta0_in_inv_con = InCondition(eta0, learning_rate,
                                          ["invscaling", "constant"])
            cs.add_conditions([elasticnet, epsilon_condition,
                               power_t_condition, eta0_in_inv_con])
            return cs
        elif optimizer == 'tpe':
            eta0 = hp.loguniform('sgd_eta0', np.log(1e-7), np.log(1e-1))
            space = {
                'loss': hp.choice('sgd_loss', [
                    ("modified_huber",
                     {'epsilon': hp.loguniform('sgd_epsilon', np.log(1e-5),
                                               np.log(1e-1))}),
                    ("hinge", {}),
                    ("log", {}),
                    ("squared_hinge", {}),
                    ("perceptron", {})]),
                'penalty': hp.choice('sgd_penalty', [
                    ("elasticnet",
                     {'l1_ratio': hp.loguniform('sgd_l1_ratio', np.log(1e-9),
                                                np.log(1))}),
                    ("l1", None),
                    ("l2", None)]),
                'alpha': hp.loguniform('sgd_alpha', np.log(1e-7),
                                       np.log(1e-1)),
                'fit_intercept': hp.choice('sgd_fit_intercept', ["True"]),
                'tol': hp.loguniform('sgd_tol', np.log(1e-5), np.log(1e-1)),
                'learning_rate': hp.choice('sgd_learning_rate', [
                    ("optimal", {}),
                    ("invscaling",
                     {'power_t': hp.loguniform('sgd_power_t', np.log(1e-5),
                                               np.log(1)),
                      'eta0': eta0}),
                    ("constant", {'eta0': eta0})]),
                'average': hp.choice('sgd_average', ["True", "False"])}

            init_trial = {'loss': ("log", {}),
                          'penalty': ("l2", {}),
                          'alpha': 1e-4,
                          'fit_intercept': "True",
                          'tol': 1e-4,
                          'learning_rate': ("invscaling",
                                            {'power_t': 0.5, 'eta0': 0.01}),
                          'average': "False"}

            return space
# Assumes x_train, x_test, y_train, y_test were produced earlier from the
# breast-cancer data (see the preamble sketch below).
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier

# Standardise the features: fit the scaler on the training data only.
ss = StandardScaler()
x_train = ss.fit_transform(x_train)
x_test = ss.transform(x_test)

# Initialise both linear classifiers.
lr = LogisticRegression()
sgdc = SGDClassifier()

# Train the LogisticRegression model and predict on the test set; the
# predictions are stored in lr_y_predict.
lr.fit(x_train, y_train)
lr_y_predict = lr.predict(x_test)

# Train the SGDClassifier model and predict on the test set; the predictions
# are stored in sgdc_y_predict.
sgdc.fit(x_train, y_train)
sgdc_y_predict = sgdc.predict(x_test)

from sklearn.metrics import classification_report

# Use the score method for test-set accuracy, and classification_report for
# the precision, recall and F1 of each model.
print('Accuracy of LR Classifier:', lr.score(x_test, y_test))
print(classification_report(y_test, lr_y_predict,
                            target_names=['Benign', 'Malignant']))
print("\n")
print('Accuracy of SGD Classifier:', sgdc.score(x_test, y_test))
print(classification_report(y_test, sgdc_y_predict,
                            target_names=['Benign', 'Malignant']))
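# Assumed preamble for the fragment above (an assumption, not in the
# original): load the Wisconsin breast-cancer data and create the split the
# fragment relies on. The split parameters are illustrative only.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()
x_train, x_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, test_size=0.25, random_state=33)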
import random

import numpy as np
from sklearn.linear_model import SGDClassifier

# WQDA, SelfLearningModel and CPLELearningModel come from the semisup-learn
# package; `cancer` is assumed to be a loaded breast-cancer dataset.

# NOTE: the original assigned the target to X and the data to ytrue, which
# cannot work with the 2-D indexing below; the two are swapped here.
X = cancer.data
ytrue = np.copy(cancer.target).flatten()
ytrue[ytrue > 0] = 1

# label a few points
labeled_N = 4
ys = np.array([-1] * len(ytrue))  # -1 denotes an unlabeled point
random_labeled_points = \
    random.sample(list(np.where(ytrue == 0)[0]), labeled_N // 2) + \
    random.sample(list(np.where(ytrue == 1)[0]), labeled_N // 2)
ys[random_labeled_points] = ytrue[random_labeled_points]

# supervised score
# basemodel = WQDA()  # weighted Quadratic Discriminant Analysis
basemodel = SGDClassifier(loss="log", penalty="l1")  # scikit logistic regression
# (newer scikit-learn versions rename this loss to "log_loss")
basemodel.fit(X[random_labeled_points, :], ys[random_labeled_points])
print("supervised log.reg. score", basemodel.score(X, ytrue))

# fast (but naive, unsafe) self-learning framework
ssmodel = SelfLearningModel(basemodel)
ssmodel.fit(X, ys)
print("self-learning log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score (base model has to be able to take weighted samples)
ssmodel = CPLELearningModel(basemodel)
ssmodel.fit(X, ys)
print("CPLE semi-supervised log.reg. score", ssmodel.score(X, ytrue))

# semi-supervised score, WQDA model
ssmodel = CPLELearningModel(WQDA(), predict_from_probabilities=True)  # weighted Quadratic Discriminant Analysis
ssmodel.fit(X, ys)
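# For orientation only: the essence of the naive self-training loop that a
# wrapper like SelfLearningModel implements -- a sketch under assumptions
# (classes encoded 0..k-1, model exposes predict_proba), not the library code.
import numpy as np

def self_train(model, X, ys, max_rounds=10, threshold=0.8):
    labels = ys.copy()  # -1 marks unlabeled points, as above
    for _ in range(max_rounds):
        # Refit on everything labeled so far.
        model.fit(X[labels != -1], labels[labels != -1])
        unlabeled = np.where(labels == -1)[0]
        if len(unlabeled) == 0:
            break
        # Adopt the model's own confident predictions as new labels.
        proba = model.predict_proba(X[unlabeled])
        confident = proba.max(axis=1) > threshold
        if not confident.any():
            break
        labels[unlabeled[confident]] = proba[confident].argmax(axis=1)
    return model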
import os
import sys
import logging

import numpy
from scipy.sparse import csr_matrix

# retrieve_location_from_coordinates and dump_model are assumed to be defined
# elsewhere in the surrounding module.


def train_model(texts, points, num_classes, model_dir, text_encoding='utf-8'):
    """
    Given an iterable of (text, lat, lon) items, cluster the points into
    num_classes clusters and use them as labels, then extract unigram
    features, train a classifier and save it in model_dir for future use.

    Args:
        texts -- an iterable (e.g. a list) of texts,
            e.g. ['this is the first text', 'this is the second text'].
        points -- an iterable (e.g. a list) of (lat, lon) tuples with float
            coordinates, e.g. [(1.2343, -10.239834), (5.634534, -12.47563)].
        num_classes -- the number of desired clusters/labels/classes of the model.
        model_dir -- the directory within models/ where the model will be saved.
    """
    if os.path.exists(model_dir):
        logging.error("Model directory " + model_dir +
                      " already exists, please try another address.")
        sys.exit(-1)
    else:
        os.mkdir(model_dir)

    from sklearn.cluster import KMeans
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import SGDClassifier

    # Cluster the coordinates; the cluster ids become the class labels.
    kmeans = KMeans(n_clusters=num_classes, random_state=0)
    points_arr = numpy.array(points)
    kmeans.fit(points_arr)
    cluster_centers = kmeans.cluster_centers_
    sample_clusters = kmeans.labels_
    label_coordinate = {}
    for i in range(cluster_centers.shape[0]):
        lat, lon = cluster_centers[i, 0], cluster_centers[i, 1]
        label_coordinate[i] = (lat, lon)

    logging.info('extracting features from text...')
    vectorizer = TfidfVectorizer(encoding=text_encoding, stop_words='english',
                                 ngram_range=(1, 1), max_df=0.5, min_df=0,
                                 binary=True, norm='l2', use_idf=True,
                                 smooth_idf=True, sublinear_tf=True)
    X_train = vectorizer.fit_transform(texts)
    Y_train = sample_clusters
    vectorizer.stop_words_ = None  # drop the fitted stop-word set to shrink the pickle
    logging.info('the number of samples is %d and the number of features is %d'
                 % (X_train.shape[0], X_train.shape[1]))

    logging.info('training the classifier...')
    logging.warning('Note that alpha (the regularisation strength) should be '
                    'tuned on validation data.')
    # n_iter was renamed max_iter in scikit-learn 0.19+.
    clf = SGDClassifier(loss='log', penalty='elasticnet', alpha=5e-5,
                        l1_ratio=0.9, fit_intercept=True, max_iter=5,
                        n_jobs=2, random_state=0, learning_rate="optimal")
    clf.fit(X_train, Y_train)
    clf.coef_ = csr_matrix(clf.coef_)  # sparsify the coefficients to save space

    logging.info('retrieving addresses of the given points using geopy '
                 '(requires internet access).')
    coordinate_address = retrieve_location_from_coordinates(points)

    logging.info('dumping the vectorizer, clf (trained model), '
                 'label_coordinate and coordinate_address into pickle files '
                 'in ' + model_dir)
    dump_model(clf, vectorizer, coordinate_address, label_coordinate, model_dir)
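# Hypothetical invocation of train_model -- the texts, coordinates and
# directory name below are illustrative values only, not from the original.
sample_texts = ["pub crawl tonight in shoreditch",
                "surf's up at bondi this morning"]
sample_points = [(51.5264, -0.0778), (-33.8908, 151.2743)]
train_model(sample_texts, sample_points, num_classes=2,
            model_dir="models/geo_sgd")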
import pickle
import re
from time import time

import pandas as pd
from flask import request, render_template
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import (LogisticRegression, LogisticRegressionCV,
                                  Perceptron, RidgeClassifierCV, SGDClassifier)
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier


# Assumed to be registered as a Flask view, e.g.
# @app.route('/result', methods=['POST']).
def result():
    if request.method == 'POST':
        path = request.files.get('myFile')
        df = pd.read_csv(path, encoding="ISO-8859-1")
        filename = request.form['filename']
        str1 = request.form['feature']
        str2 = request.form['label']
        if str1 in list(df) and str2 in list(df):
            y = df[str2]
            X = df[str1]
        else:
            return render_template('nameError.html')

        # Strip URLs and non-alphanumeric characters, then lowercase.
        # (The local variable was named `result` in the original, shadowing
        # the view function; renamed here.)
        x = []
        for subject in X:
            no_urls = re.sub(r"http\S+", "", subject)
            replaced = re.sub(r'[^a-zA-Z0-9 ]+', '', no_urls)
            x.append(replaced)
        X = pd.Series(x)
        X = X.str.lower()

        """
        texts = []
        for doc in X:
            doc = nlp(doc, disable=['parser', 'ner'])
            tokens = [tok.lemma_.lower().strip() for tok in doc
                      if tok.lemma_ != '-PRON-']
            tokens = [tok for tok in tokens if tok not in stopwords]
            tokens = ' '.join(tokens)
            texts.append(tokens)
        X = pd.Series(texts)
        """

        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=0.33)
        tfidfvect = TfidfVectorizer(ngram_range=(1, 1))
        X_train_tfidf = tfidfvect.fit_transform(X_train)
        # Transform the test set once instead of once per classifier.
        X_test_tfidf = tfidfvect.transform(X_test)
        # Dense copies are only strictly required by the naive-Bayes models,
        # but the original also densified for the later classifiers.
        X_train_dense = X_train_tfidf.toarray()
        X_test_dense = X_test_tfidf.toarray()

        start = time()
        clf1 = LinearSVC()
        clf1.fit(X_train_tfidf, y_train)
        pred = clf1.predict(X_test_tfidf)
        a1 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy SVC: {} and time: {} s".format(a1, (end - start)))

        start = time()
        clf2 = LogisticRegression(n_jobs=-1, multi_class='multinomial',
                                  solver='newton-cg')
        clf2.fit(X_train_tfidf, y_train)
        pred = clf2.predict(X_test_tfidf)
        a2 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy LR: {} and time: {}".format(a2, (end - start)))

        start = time()
        clf3 = RandomForestClassifier(n_jobs=-1)
        clf3.fit(X_train_tfidf, y_train)
        pred = clf3.predict(X_test_tfidf)
        a3 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy RFC: {} and time: {}".format(a3, (end - start)))

        start = time()
        clf4 = MultinomialNB()
        clf4.fit(X_train_tfidf, y_train)
        pred = clf4.predict(X_test_tfidf)
        a4 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy MNB: {} and time: {}".format(a4, (end - start)))

        start = time()
        clf5 = GaussianNB()
        clf5.fit(X_train_dense, y_train)
        pred = clf5.predict(X_test_dense)
        a5 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy GNB: {} and time: {}".format(a5, (end - start)))

        start = time()
        clf6 = LogisticRegressionCV(n_jobs=-1)
        clf6.fit(X_train_tfidf, y_train)
        pred = clf6.predict(X_test_tfidf)
        a6 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy LRCV: {} and time: {}".format(a6, (end - start)))

        start = time()
        clf7 = AdaBoostClassifier()
        clf7.fit(X_train_tfidf, y_train)
        pred = clf7.predict(X_test_tfidf)
        a7 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy ABC: {} and time: {}".format(a7, (end - start)))

        start = time()
        clf8 = BernoulliNB()
        clf8.fit(X_train_dense, y_train)
        pred = clf8.predict(X_test_dense)
        a8 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy BNB: {} and time: {}".format(a8, (end - start)))

        start = time()
        clf9 = Perceptron(n_jobs=-1)
        clf9.fit(X_train_dense, y_train)
        pred = clf9.predict(X_test_dense)
        a9 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy Per: {} and time: {}".format(a9, (end - start)))

        start = time()
        clf10 = RidgeClassifierCV()
        clf10.fit(X_train_dense, y_train)
        pred = clf10.predict(X_test_dense)
        a10 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy RidCV: {} and time: {}".format(a10, (end - start)))

        start = time()
        clf11 = SGDClassifier(n_jobs=-1)
        clf11.fit(X_train_dense, y_train)
        pred = clf11.predict(X_test_dense)
        a11 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy SGDC: {} and time: {}".format(a11, (end - start)))

        start = time()
        # The original built a second SGDClassifier here while reporting it
        # as "XGBC"; an XGBClassifier (presumably the intent) is used instead.
        clf12 = XGBClassifier(n_jobs=-1)
        clf12.fit(X_train_dense, y_train)
        pred = clf12.predict(X_test_dense)
        a12 = accuracy_score(y_test, pred)
        end = time()
        print("accuracy XGBC: {} and time: {}".format(a12, (end - start)))

        # Persist whichever classifier scored highest (the first one wins
        # ties, as in the original if/elif chain), plus the fitted vectorizer.
        classifiers = [clf1, clf2, clf3, clf4, clf5, clf6,
                       clf7, clf8, clf9, clf10, clf11, clf12]
        acu_list = [a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12]
        best_clf = classifiers[acu_list.index(max(acu_list))]
        pickle.dump(best_clf, open(filename + '_model', 'wb'))
        pickle.dump(tfidfvect, open(filename + '_tfidfVect', 'wb'))

        return render_template("result.html", ac1=a1, ac2=a2, ac3=a3, ac4=a4,
                               ac5=a5, ac6=a6, ac7=a7, ac8=a8, ac9=a9,
                               ac10=a10, ac11=a11, ac12=a12)
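# Sketch of the matching inference step (an assumption -- not part of the
# original): reload the pickled vectorizer and best model to label new text.
# The input is densified because some candidate models (e.g. GaussianNB)
# require dense arrays; the sparse-friendly models accept this too.
import pickle

def predict_label(filename, text):
    vect = pickle.load(open(filename + '_tfidfVect', 'rb'))
    model = pickle.load(open(filename + '_model', 'rb'))
    return model.predict(vect.transform([text]).toarray())[0]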