from sklearn.linear_model import LogisticRegressionCV


class LogisticRegressionDensityRatioEstimator(DensityRatioBase):

    def __init__(self, Cs=10, solver='lbfgs', epochs=100, seed=None,
                 *args, **kwargs):
        self.model = LogisticRegressionCV(Cs=Cs, solver=solver,
                                          max_iter=epochs, random_state=seed)

    def logit(self, X, y=None):
        # The classifier's decision function estimates the log density ratio.
        return self.model.decision_function(X)

    def fit(self, X_top, X_bot, *args, **kwargs):
        X, y = make_classification_dataset(X_top, X_bot)
        return self.model.fit(X, y, *args, **kwargs)

    def evaluate(self, X_top, X_bot, *args, **kwargs):
        X, y = make_classification_dataset(X_top, X_bot)
        return self.model.score(X, y, *args, **kwargs)
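# A minimal, self-contained sketch of the idea behind this estimator (not from
# the original source): with equal class priors, the classifier's logit
# approximates log p_top(x) / p_bot(x). `make_classification_dataset` is
# assumed to simply stack the two samples with labels 1 and 0, which is done
# inline here on synthetic data.
import numpy as np
from sklearn.linear_model import LogisticRegressionCV

rng = np.random.RandomState(0)
X_top = rng.normal(loc=1.0, size=(500, 1))   # samples from p_top = N(1, 1)
X_bot = rng.normal(loc=0.0, size=(500, 1))   # samples from p_bot = N(0, 1)

# Stack the samples and label them: 1 for p_top, 0 for p_bot.
X = np.vstack([X_top, X_bot])
y = np.concatenate([np.ones(len(X_top)), np.zeros(len(X_bot))])

clf = LogisticRegressionCV(Cs=10, solver='lbfgs', max_iter=100).fit(X, y)

# For these two unit-variance Gaussians, log r(x) = x - 0.5 analytically,
# so the fitted logit should lie close to that line.
x_query = np.array([[0.0], [0.5], [1.0]])
print(clf.decision_function(x_query))  # roughly [-0.5, 0.0, 0.5]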
import numpy as np
from sklearn.linear_model import LogisticRegressionCV


class LogisticRegressionCV_(ProbabilisticModel):
    """LogisticRegressionCV classifier wrapped in the ProbabilisticModel
    interface."""

    def __init__(self, *args, **kwargs):
        self.model = LogisticRegressionCV(*args, **kwargs)
        self.name = "lrcv"

    def train(self, dataset, *args, **kwargs):
        return self.model.fit(*(dataset.format_sklearn() + args), **kwargs)

    def predict(self, feature, *args, **kwargs):
        return self.model.predict(feature, *args, **kwargs)

    def score(self, testing_dataset, *args, **kwargs):
        return self.model.score(*(testing_dataset.format_sklearn() + args),
                                **kwargs)

    def predict_real(self, feature, *args, **kwargs):
        dvalue = self.model.decision_function(feature, *args, **kwargs)
        if len(np.shape(dvalue)) == 1:  # n_classes == 2
            return np.vstack((-dvalue, dvalue)).T
        return dvalue

    def predict_proba(self, feature, *args, **kwargs):
        return self.model.predict_proba(feature, *args, **kwargs)

    def feature_importances_(self):
        return self.model.coef_.ravel()

    def get_params(self):
        # Call get_params() rather than returning the bound method object.
        return self.model.get_params()
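# This wrapper follows the libact model interface; a hedged usage sketch,
# assuming libact is installed (its Dataset exposes format_sklearn(), which
# returns an (X, y) tuple). The data below is synthetic and illustrative.
import numpy as np
from libact.base.dataset import Dataset  # assumed available

X = np.random.RandomState(0).randn(20, 3)
y = (X[:, 0] > 0).astype(int)

dataset = Dataset(X, y.tolist())
model = LogisticRegressionCV_(Cs=5, max_iter=200)
model.train(dataset)
print(model.predict(X[:5]))       # hard labels
print(model.predict_real(X[:5]))  # signed decision values, one column per class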
def Log_Classifier(X_train, X_test, y_train, y_test):
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.metrics import (roc_curve, auc, recall_score,
                                 precision_score, f1_score, accuracy_score)

    clf_log = LogisticRegressionCV(n_jobs=4)
    clf_log.fit(X_train, y_train)
    y_pred = clf_log.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    y_score = clf_log.decision_function(X_test)
    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    # Use a distinct name so the `auc` function is not shadowed.
    auc_value = auc(fpr, tpr)
    return accuracy, recall, precision, f1, auc_value, y_score
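# A quick smoke test for the function above (not from the original source);
# the synthetic binary dataset is illustrative.
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=400, n_features=10, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=0)

accuracy, recall, precision, f1, auc_value, y_score = Log_Classifier(
    X_tr, X_te, y_tr, y_te)
print("acc=%.3f f1=%.3f auc=%.3f" % (accuracy, f1, auc_value))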
class LogisticRegressionCVImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        # `Op` is expected to be bound to sklearn's LogisticRegressionCV by
        # the surrounding (auto-generated) module.
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

### 3. Feature selection (none performed here)
### 4. Dimensionality reduction (none performed here)

### 5. Model construction
lr = LogisticRegressionCV(Cs=np.logspace(-4, 1, 50), fit_intercept=True,
                          penalty='l2', solver='lbfgs', tol=0.01,
                          multi_class='multinomial')
lr.fit(X_train, Y_train)

### 6. Model evaluation
## Binarize the true labels into one-hot form
y_test_hot = label_binarize(Y_test, classes=(1, 2, 3))
## Decision scores for the test set
lr_y_score = lr.decision_function(X_test)
## Micro-averaged ROC over the flattened one-hot labels and scores
lr_fpr, lr_tpr, lr_thresholds = metrics.roc_curve(y_test_hot.ravel(), lr_y_score.ravel())
## AUC
lr_auc = metrics.auc(lr_fpr, lr_tpr)
print("Logistic training accuracy:", lr.score(X_train, Y_train))
print("Logistic AUC:", lr_auc)

### 7. Model prediction
lr_y_predict = lr.predict(X_test)

##### KNN implementation
# a. Model construction
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, Y_train)
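# The ravel-and-flatten pattern above computes a micro-averaged multiclass
# ROC; a minimal illustration of why the shapes line up (three classes
# assumed, as in the snippet; the labels below are illustrative).
import numpy as np
from sklearn.preprocessing import label_binarize

Y_demo = np.array([1, 2, 3, 2])
y_hot = label_binarize(Y_demo, classes=(1, 2, 3))
print(y_hot.shape)    # (4, 3): one indicator column per class
print(y_hot.ravel())  # flattened to 12 binary labels, matching the 12
                      # flattened per-class decision scores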
svcg = svc_gs.best_estimator_

# In[43]:

accuracy_score(y_test, svcg.predict(X_test_std))

# ## Validating the model using roc curve and auc

# In[44]:

from sklearn.metrics import roc_curve, auc

# In[45]:

log_fpr, log_tpr, log_thrs = roc_curve(y_test, lrc.decision_function(X_test))
auc_logistic = auc(log_fpr, log_tpr)

# In[46]:

# The SVM was evaluated on the standardized features above, so score the
# standardized test set here as well.
svm_fpr, svm_tpr, thresh = roc_curve(y_test, svc_gs.decision_function(X_test_std))
auc_svc = auc(svm_fpr, svm_tpr)

# In[47]:

plt.figure(figsize=(10, 10), dpi=100)
plt.plot(svm_fpr, svm_tpr, linestyle='-', label='SVM (auc = %0.3f)' % auc_svc)
plt.plot(log_fpr, log_tpr, marker='.', label='Logistic (auc = %0.3f)' % auc_logistic)
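# In[48]:

# Hedged completion of the ROC figure above: the chance diagonal, axis
# labels, and legend are conventional finishing touches, not from the
# original cell.
plt.plot([0, 1], [0, 1], linestyle='--', color='grey', label='Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curves: SVM vs Logistic Regression')
plt.legend(loc='lower right')
plt.show()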
class DetectorLID:
    _name = 'lid'

    def __init__(self, neighborhood_constant=NEIGHBORHOOD_CONST, n_neighbors=None,
                 metric='euclidean', metric_kwargs=None,
                 n_cv_folds=CROSS_VAL_SIZE,
                 c_search_values=None,
                 approx_nearest_neighbors=True,
                 skip_dim_reduction=True,
                 model_dim_reduction=None,
                 n_jobs=1,
                 max_iter=200,
                 balanced_classification=True,
                 low_memory=False,
                 save_knn_indices_to_file=True,
                 seed_rng=SEED_DEFAULT):
        """
        :param neighborhood_constant: float value in (0, 1) that specifies the number of nearest neighbors as
            a function of the number of samples (data size). If `N` is the number of samples, then the number
            of neighbors is set to `N^neighborhood_constant`. It is recommended to set this value in the
            range 0.4 to 0.5.
        :param n_neighbors: None or int value specifying the number of nearest neighbors. If this value is
            specified, the `neighborhood_constant` is ignored. It is sufficient to specify either
            `neighborhood_constant` or `n_neighbors`.
        :param metric: string or a callable that specifies the distance metric to use.
        :param metric_kwargs: optional keyword arguments required by the distance metric, specified in the
            form of a dictionary.
        :param n_cv_folds: number of cross-validation folds.
        :param c_search_values: list or array of search values for the logistic regression hyperparameter
            `C`. The default value is `None`.
        :param approx_nearest_neighbors: Set to True in order to use an approximate nearest-neighbor
            algorithm to find the nearest neighbors. The NN-descent method is used for approximate
            nearest-neighbor searches.
        :param skip_dim_reduction: Set to True in order to skip dimension reduction of the layer embeddings.
        :param model_dim_reduction:
            1. None, if dimension reduction is not required; (OR)
            2. Path to a file containing the saved dimension-reduction model. This will be a pickle file that
               loads into a list of model dictionaries; (OR)
            3. The dimension-reduction model loaded into memory from the pickle file.
        :param n_jobs: Number of parallel jobs or processes. Set to -1 to use all the available CPU cores.
        :param max_iter: Maximum number of iterations for the optimization of the logistic classifier. The
            default value set by the scikit-learn library is 100, but sometimes this does not allow for
            convergence. Hence, it is increased to 200 here.
        :param balanced_classification: Set to True to assign sample weights to balance the binary
            classification problem separating adversarial from non-adversarial samples.
        :param low_memory: Set to True to enable the low-memory option of the `NN-descent` method. Note that
            this is likely to increase the running time.
        :param save_knn_indices_to_file: Set to True in order to save the KNN indices from each layer to a
            pickle file to reduce memory usage. This may not be needed when the data size and/or the number
            of layers is small. It avoids potential out-of-memory errors at the expense of the time taken to
            write and read the files.
        :param seed_rng: int value specifying the seed for the random number generator. This is passed around
            to all the classes/functions that require random number generation. Set this to a fixed value for
            reproducible results.
""" self.neighborhood_constant = neighborhood_constant self.n_neighbors = n_neighbors self.metric = metric self.metric_kwargs = metric_kwargs self.n_cv_folds = n_cv_folds self.c_search_values = c_search_values self.approx_nearest_neighbors = approx_nearest_neighbors self.skip_dim_reduction = skip_dim_reduction self.n_jobs = get_num_jobs(n_jobs) self.max_iter = max_iter self.balanced_classification = balanced_classification self.low_memory = low_memory self.save_knn_indices_to_file = save_knn_indices_to_file self.seed_rng = seed_rng np.random.seed(self.seed_rng) # Load the dimension reduction models per-layer if required self.transform_models = None if not self.skip_dim_reduction: if model_dim_reduction is None: raise ValueError("Model file for dimension reduction is required but not specified as input.") elif isinstance(model_dim_reduction, str): # Pickle file is specified self.transform_models = load_dimension_reduction_models(model_dim_reduction) elif isinstance(model_dim_reduction, list): # Model already loaded from pickle file self.transform_models = model_dim_reduction else: raise ValueError("Invalid format for the dimension reduction model input.") if self.c_search_values is None: # Default search values for the `C` parameter of logistic regression self.c_search_values = np.logspace(-4, 4, num=10) self.n_layers = None self.n_samples = [] self.index_knn = None self.model_logistic = None self.scaler = None # Temporary directory to save the KNN index files self.temp_direc = None self.temp_knn_files = None def fit(self, layer_embeddings_normal, layer_embeddings_adversarial, layer_embeddings_noisy=None): """ Extract the LID feature vector for normal, noisy, and adversarial samples and train a logistic classifier to separate adversarial samples from (normal + noisy). Cross-validation is used to select the hyper-parameter `C` using area under the ROC curve as the validation metric. :param layer_embeddings_normal: list of numpy arrays with the layer embeddings for normal samples. Length of the list is equal to the number of layers. The numpy array at index `i` has shape `(n, d_i)`, where `n` is the number of samples and `d_i` is the dimension of the embeddings at layer `i`. :param layer_embeddings_adversarial: Same format as `layer_embeddings_normal`, but corresponding to adversarial data. :param layer_embeddings_noisy: Same format as `layer_embeddings_normal`, but corresponding to noisy data. Can be set to `None` to exclude noisy data from training. :return: (self, scores_normal, scores_adversarial) if layer_embeddings_noise is None (self, scores_normal, scores_adversarial, scores_noisy) otherwise. ------------------------------------------------------- - self: trained instance of the class. - scores_normal: numpy array with the scores (decision function of the logistic classifier) for normal samples. 1d array with the same number of samples as `layer_embeddings_normal`. - scores_noisy: scores corresponding to `layer_embeddings_noisy` if noisy training data is provided. - scores_adversarial: scores corresponding to `layer_embeddings_adversarial`. 
""" self.n_layers = len(layer_embeddings_normal) logger.info("Number of layer embeddings: {:d}.".format(self.n_layers)) if layer_embeddings_noisy is None: logger.info("Noisy training data not provided.") cond1 = False noisy_data = False else: cond1 = (len(layer_embeddings_noisy) != self.n_layers) noisy_data = True if cond1 or (len(layer_embeddings_adversarial) != self.n_layers): raise ValueError("The layer embeddings for noisy and attack samples must have the same length as that " "of normal samples") # Number of samples in each of the categories self.n_samples = [ layer_embeddings_normal[0].shape[0], layer_embeddings_noisy[0].shape[0] if noisy_data else 0, layer_embeddings_adversarial[0].shape[0] ] # Number of nearest neighbors if self.n_neighbors is None: # Set number of nearest neighbors based on the data size (of normal samples) and the # neighborhood constant self.n_neighbors = int(np.ceil(self.n_samples[0] ** self.neighborhood_constant)) # The data arrays at all layers should have the same number of samples if not all([layer_embeddings_normal[i].shape[0] == self.n_samples[0] for i in range(self.n_layers)]): raise ValueError("Input 'layer_embeddings_normal' does not have the expected format") if noisy_data: if not all([layer_embeddings_noisy[i].shape[0] == self.n_samples[1] for i in range(self.n_layers)]): raise ValueError("Input 'layer_embeddings_noisy' does not have the expected format") if not all([layer_embeddings_adversarial[i].shape[0] == self.n_samples[2] for i in range(self.n_layers)]): raise ValueError("Input 'layer_embeddings_adversarial' does not have the expected format") if self.save_knn_indices_to_file: # Create a temporary directory for saving the KNN indices self.temp_direc = tempfile.mkdtemp(dir=os.getcwd()) self.temp_knn_files = [''] * self.n_layers self.index_knn = [None for _ in range(self.n_layers)] features_lid_normal = np.zeros((self.n_samples[0], self.n_layers)) features_lid_noisy = np.zeros((self.n_samples[1], self.n_layers)) features_lid_adversarial = np.zeros((self.n_samples[2], self.n_layers)) for i in range(self.n_layers): logger.info("Processing layer {:d}:".format(i + 1)) if self.transform_models: data_normal = transform_data_from_model(layer_embeddings_normal[i], self.transform_models[i]) data_adver = transform_data_from_model(layer_embeddings_adversarial[i], self.transform_models[i]) if noisy_data: data_noisy = transform_data_from_model(layer_embeddings_noisy[i], self.transform_models[i]) else: data_noisy = None d1 = layer_embeddings_normal[i].shape[1] d2 = data_normal.shape[1] if d2 < d1: logger.info("Input dimension = {:d}, projected dimension = {:d}".format(d1, d2)) else: data_normal = layer_embeddings_normal[i] data_adver = layer_embeddings_adversarial[i] if noisy_data: data_noisy = layer_embeddings_noisy[i] else: data_noisy = None logger.info("Building a KNN index on the feature embeddings of normal samples.") # Build a KNN index on the set of feature embeddings from normal samples from layer `i` self.index_knn[i] = KNNIndex( data_normal, n_neighbors=self.n_neighbors, metric=self.metric, metric_kwargs=self.metric_kwargs, approx_nearest_neighbors=self.approx_nearest_neighbors, n_jobs=self.n_jobs, low_memory=self.low_memory, seed_rng=self.seed_rng ) logger.info("Calculating LID estimates for the feature embeddings of normal samples.") # Nearest neighbors of the normal feature embeddings from this layer nn_indices, nn_distances = self.index_knn[i].query_self(k=self.n_neighbors) # LID estimates of the normal feature embeddings from this layer 
            features_lid_normal[:, i] = lid_mle_amsaleg(nn_distances)

            if noisy_data:
                logger.info("Calculating LID estimates for the feature embeddings of noisy samples.")
                # Nearest neighbors of the noisy feature embeddings from this layer
                nn_indices, nn_distances = self.index_knn[i].query(data_noisy, k=self.n_neighbors)
                # LID estimates of the noisy feature embeddings from this layer
                features_lid_noisy[:, i] = lid_mle_amsaleg(nn_distances)

            logger.info("Calculating LID estimates for the feature embeddings of adversarial samples.")
            # Nearest neighbors of the adversarial feature embeddings from this layer
            nn_indices, nn_distances = self.index_knn[i].query(data_adver, k=self.n_neighbors)
            # LID estimates of the adversarial feature embeddings from this layer
            features_lid_adversarial[:, i] = lid_mle_amsaleg(nn_distances)

            if self.save_knn_indices_to_file:
                logger.info("Saving the KNN index from layer {:d} to a pickle file".format(i + 1))
                self.temp_knn_files[i] = os.path.join(self.temp_direc, 'knn_index_layer_{:d}.pkl'.format(i + 1))
                with open(self.temp_knn_files[i], 'wb') as fp:
                    pickle.dump(self.index_knn[i], fp)

                # Free up the allocated memory
                self.index_knn[i] = None

        # Feature vectors and labels for the binary logistic classifier.
        # Normal and noisy samples are given label 0 and adversarial samples are given label 1.
        n_pos = features_lid_adversarial.shape[0]
        if noisy_data:
            features_lid = np.concatenate([features_lid_normal, features_lid_noisy, features_lid_adversarial],
                                          axis=0)
            labels = np.concatenate([np.zeros(features_lid_normal.shape[0], dtype=int),
                                     np.zeros(features_lid_noisy.shape[0], dtype=int),
                                     np.ones(n_pos, dtype=int)])
        else:
            features_lid = np.concatenate([features_lid_normal, features_lid_adversarial], axis=0)
            labels = np.concatenate([np.zeros(features_lid_normal.shape[0], dtype=int),
                                     np.ones(n_pos, dtype=int)])

        pos_prop = n_pos / float(labels.shape[0])
        # Randomly shuffle the samples to avoid determinism
        ind_perm = np.random.permutation(labels.shape[0])
        features_lid = features_lid[ind_perm, :]
        labels = labels[ind_perm]

        # Min-max scaling for the LID features
        self.scaler = MinMaxScaler().fit(features_lid)
        features_lid = self.scaler.transform(features_lid)
        logger.info("Training a binary logistic classifier with {:d} samples and {:d} LID features.".
                    format(*features_lid.shape))
        logger.info("Using {:d}-fold cross-validation with area under ROC curve as the metric to select "
                    "the best regularization hyperparameter.".format(self.n_cv_folds))
        logger.info("Proportion of positive (adversarial or OOD) samples in the training data: {:.4f}".
                    format(pos_prop))
        class_weight = None
        if self.balanced_classification:
            if (pos_prop < 0.45) or (pos_prop > 0.55):
                class_weight = {0: 1.0 / (1 - pos_prop),
                                1: 1.0 / pos_prop}
                logger.info("Balancing the classes by assigning sample weight {:.4f} to class 0 and sample "
                            "weight {:.4f} to class 1".format(class_weight[0], class_weight[1]))

        self.model_logistic = LogisticRegressionCV(
            Cs=self.c_search_values,
            cv=self.n_cv_folds,
            penalty='l2',
            scoring='roc_auc',
            multi_class='auto',
            class_weight=class_weight,
            max_iter=self.max_iter,
            refit=True,
            n_jobs=self.n_jobs,
            random_state=self.seed_rng
        ).fit(features_lid, labels)

        # Larger values of this score correspond to a higher probability of predicting class 1 (adversarial)
        scores_normal = self.model_logistic.decision_function(self.scaler.transform(features_lid_normal))
        scores_adversarial = self.model_logistic.decision_function(self.scaler.transform(features_lid_adversarial))
        if noisy_data:
            scores_noisy = self.model_logistic.decision_function(self.scaler.transform(features_lid_noisy))
            return self, scores_normal, scores_adversarial, scores_noisy
        else:
            return self, scores_normal, scores_adversarial

    def score(self, layer_embeddings, cleanup=True):
        """
        Given a list of layer embeddings for test samples, extract the layer-wise LID feature vector and
        return the decision function of the logistic classifier.

        :param layer_embeddings: list of numpy arrays with the layer embeddings for test samples. Length of
            the list is equal to the number of layers. The numpy array at index `i` has shape `(n, d_i)`,
            where `n` is the number of samples and `d_i` is the dimension of the embeddings at layer `i`.
        :param cleanup: If set to True, the temporary directory where the KNN index files are saved will be
            deleted after scoring. If this method is to be called multiple times, set `cleanup=False` for all
            calls except the last one.
        :return: numpy array of detection scores for the test samples. Has shape `(n, )`, where `n` is the
            number of samples. Larger values correspond to a higher confidence that the sample is adversarial.
        """
        n_test = layer_embeddings[0].shape[0]
        n_layers_input = len(layer_embeddings)
        if n_layers_input != self.n_layers:
            raise ValueError("Expecting {:d} layers in the input 'layer_embeddings', but received {:d} layers.".
                             format(self.n_layers, n_layers_input))

        features_lid = np.zeros((n_test, self.n_layers))
        for i in range(self.n_layers):
            logger.info("Calculating LID features for layer {:d}".format(i + 1))
            if self.transform_models:
                # Dimension reduction
                data_proj = transform_data_from_model(layer_embeddings[i], self.transform_models[i])
            else:
                data_proj = layer_embeddings[i]

            if self.save_knn_indices_to_file:
                with open(self.temp_knn_files[i], 'rb') as fp:
                    self.index_knn[i] = pickle.load(fp)

            _, nn_distances = self.index_knn[i].query(data_proj, k=self.n_neighbors)
            features_lid[:, i] = lid_mle_amsaleg(nn_distances)
            if self.save_knn_indices_to_file:
                self.index_knn[i] = None

        if cleanup and self.save_knn_indices_to_file:
            _ = subprocess.check_call(['rm', '-rf', self.temp_direc])

        features_lid = self.scaler.transform(features_lid)
        return self.model_logistic.decision_function(features_lid)
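# Usage sketch for DetectorLID (not from the original source): random arrays
# stand in for real DNN layer embeddings, and the module-level dependencies
# (KNNIndex, lid_mle_amsaleg, logger, etc.) are assumed to be importable
# alongside the class.
import numpy as np

rng = np.random.RandomState(0)
n_layers, n_train, n_test, dim = 3, 200, 50, 16

# Hypothetical layer embeddings; in practice these come from a trained network.
emb_normal = [rng.randn(n_train, dim) for _ in range(n_layers)]
emb_adver = [rng.randn(n_train, dim) + 1.0 for _ in range(n_layers)]
emb_test = [rng.randn(n_test, dim) for _ in range(n_layers)]

detector = DetectorLID(n_neighbors=10, n_cv_folds=5, seed_rng=123)
detector, scores_normal, scores_adver = detector.fit(emb_normal, emb_adver)

# Larger scores indicate higher confidence that a sample is adversarial.
test_scores = detector.score(emb_test)
print(test_scores.shape)  # (50,)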
                          penalty='l1', solver='liblinear', tol=0.01)
re = lr.fit(X_train, Y_train)

# 4. Model evaluation
r = re.score(X_train, Y_train)
print("Accuracy: ", r)
# Percentage of coefficients driven exactly to zero by the L1 penalty
print("Sparsity ratio of the features: %.2f%%" % (np.mean(lr.coef_.ravel() == 0) * 100))
print("Coefficients: ", re.coef_)
print("Intercept: ", re.intercept_)
print(re.predict(X_train))
y_hat = re.predict(X_train)
print(y_hat)
print(lr.decision_function(X_train))

# 5. Model persistence
# Imports (note: `sklearn.externals.joblib` is deprecated; use `import joblib`)
# from sklearn.externals import joblib
# The target directory must already exist
# Save the standardization model
# joblib.dump(ss, "datas/logistic/ss.model")
# Save the classifier
# joblib.dump(lr, "datas/models/logistic/lr.model")

# Loading the models back
# from sklearn.externals import joblib
# oss = joblib.load("models/logistic/ss.model")
# olr = joblib.load("models/logistic/lr.model")
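# A minimal persistence round trip with the standalone joblib package
# (the file path is illustrative, not from the original source).
import joblib

joblib.dump(lr, "lr.model")           # save the fitted classifier
lr_loaded = joblib.load("lr.model")   # load it back
assert (lr_loaded.predict(X_train) == lr.predict(X_train)).all()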
def main(file_in, file_out):
    # file_in = '../Re__Research_on_detecting_air_pollution_related_terms_searches_/keywords_data_rescaled_joined.csv'
    # air_data_raw = readData(file_in)

    # Create an Excel workbook
    book = xlwt.Workbook()
    sheet0 = book.add_sheet('first_page')
    book.save(file_out)

    parameters = []
    # for lag_days in [3, 5, 7]:
    #     for kernel_size in range(2, lag_days):
    #         for pollution_value in [60]:
    #             for search_lag in [0, 1, 2, 3]:
    #                 parameters.append((lag_days, kernel_size, pollution_value, search_lag))

    '''============Summary: 2009 90==============
    no polluted days in training data
    '''
    for lag_days in [7]:
        for kernel_size in [2]:
            for pollution_value in [70]:
                for search_lag in [2]:
                    parameters.append((lag_days, kernel_size, pollution_value, search_lag))

    for parameter_index in range(len(parameters)):
        data = xlrd.open_workbook(file_out)
        ws = xlutils.copy.copy(data)
        data.release_resources()
        del data

        lag_days, kernel_size, pollution_value, search_lag = parameters[parameter_index]
        seq_length = lag_days

        sheet1 = ws.add_sheet('model' + str(parameter_index))
        row_index = 0
        col_index = 0
        sheet1.write(row_index, col_index, 'Input_Features')
        col_index = col_index + 1
        sheet1.write(row_index, col_index, 'Accuracy')
        col_index = col_index + 1
        sheet1.write(row_index, col_index, 'F1_score')
        col_index = col_index + 1
        sheet1.write(row_index, col_index, 'AUC_val')
        col_index = col_index + 1
        sheet1.write(row_index, col_index + 2,
                     'CNN: ' + '(seq_length, kernel_size, pollution_value, search_lag):'
                     + str(parameters[parameter_index]))
        col_index = 0
        row_index = row_index + 1
        # with open(file_out, 'w') as fo:
        #     fo.write('Input_Features' + ',' + 'Accuracy' + ',' + 'F1_score' + ',' + 'AUC_val' + '\n')

        for season in ['summer']:
        # for season in ['summer', 'winter']:
            sheet1.write(row_index, col_index, "============" + season + "=============")
            row_index = row_index + 1
            # fo.write("============" + season + "=============" + '\n')

            # for final_year in [2009, 2010, 2011, 2012, 2013]:
            # for final_year in [2009]:
            for final_year in [2012]:
                sheet1.write(row_index, col_index, 'Final year: ' + str(final_year))
                row_index = row_index + 1
                # fo.write('Final year: ' + str(final_year) + '\n')
                # air_data = selectData(air_data_raw.copy(), season=season, final_year=final_year)

                for shift_days in [0]:
                    # fo.write('Shift days: ' + str(shift_days) + '\n')
                    print("============Summary: " + str(final_year) + ' ' + str(pollution_value) + '==============')
                    single_feature = False
                    data_split = DataSplit(file_path=file_in, season=season, final_year=final_year)
                    X_train, X_valid, X_test, y_train, y_valid, y_test = data_split.generateTrainTest()
                    train_len = len(y_train)
                    valid_len = len(y_valid)
                    test_len = len(y_test)
                    # lag_days = 3
                    # seq_length = 3
                    # kernel_size = 2
                    # pollution_value = 50

                    raw_values = np.concatenate((y_train, y_valid, y_test), axis=0)
                    # Transform the series into a supervised learning problem
                    # supervised_values = timeseries_to_supervised(raw_values, 5)
                    supervised_values = timeseries_to_supervised(raw_values, lag=lag_days)
                    # normalize to 0 to 1
                    # supervised_values = supervised_values / supervised_values.max()
                    # Normalize supervised_values
                    supervised_values -= np.mean(supervised_values, axis=0)  # zero-center
                    supervised_values /= np.std(supervised_values, axis=0)   # normalize

                    # for input_features in ['pollution_val', 'one-hot-encoding+', 'glove-embedding+']:
                    for with_pollution_val in ['pollution_val', 'with_pol_val', 'without_pol_val']:
                        for input_features in ['one-hot+', 'one-hot+glove+']:
                            if with_pollution_val == 'pollution_val':
                                x_train_concat = supervised_values.copy()
                                input_features = ''
                            else:
                                X_concat_frames = pd.concat([X_train, X_valid, X_test])
                                feature_embeddings = generate_search_embedding(X_concat_frames, representation='one-hot')
                                feature_embeddings = lag_search_features(feature_embeddings, lag=search_lag)
                                if input_features == 'one-hot+':
                                    if with_pollution_val == 'with_pol_val':
                                        x_train_concat = np.concatenate((supervised_values, feature_embeddings), axis=1)
                                    else:
                                        x_train_concat = feature_embeddings.copy()
                                else:
                                    glove_feature_embeddings = generate_search_embedding(X_concat_frames, representation='glove')
                                    glove_feature_embeddings = lag_search_features(glove_feature_embeddings, lag=search_lag)
                                    if with_pollution_val == 'with_pol_val':
                                        x_train_concat = np.concatenate(
                                            (supervised_values, feature_embeddings, glove_feature_embeddings), axis=1)
                                    else:
                                        x_train_concat = np.concatenate((feature_embeddings, glove_feature_embeddings), axis=1)

                            input_embedding = generate_input_sequence(x_train_concat, seq_length=seq_length)
                            input_embedding = input_embedding.reshape(len(input_embedding), -1)
                            y_class = [1 if i > pollution_value else 0 for i in raw_values]

                            # Generate the train/validation set and the test set
                            x_train_valid = input_embedding[:train_len + valid_len]
                            y_train_valid = y_class[:train_len + valid_len]
                            x_test = input_embedding[train_len + valid_len:]
                            y_test = y_class[train_len + valid_len:]

                            # PredefinedSplit: samples with fold index -1 are always in training;
                            # the validation block gets fold index 0.
                            valid_index = [i for i in range(train_len, train_len + valid_len)]
                            test_fold = [-1 if i not in valid_index else 0 for i in range(0, len(x_train_valid))]
                            ps = PredefinedSplit(test_fold=test_fold)

                            clf = LogisticRegressionCV(
                                Cs=list(np.power(10.0, np.arange(-20, 6))),
                                penalty='l2',
                                scoring='f1',
                                cv=ps,
                                random_state=0,
                                max_iter=10000,
                                class_weight="balanced",
                                # fit_intercept=True,
                                fit_intercept=False,
                                solver='newton-cg',
                                tol=10,
                                refit=True
                            )
                            clf.fit(x_train_valid, y_train_valid)
                            pred = clf.predict(x_test)
                            accuracy = accuracy_score(y_test, pred)
                            f1_value = f1_score(y_test, pred)
                            y_score = clf.decision_function(x_test)
                            fpr, tpr, threshold = roc_curve(y_test, y_score)
                            auc_value = auc(fpr, tpr)

                            sheet1.write(row_index, col_index, with_pollution_val + '+' + input_features)
                            col_index = col_index + 1
                            sheet1.write(row_index, col_index, str(accuracy))
                            col_index = col_index + 1
                            sheet1.write(row_index, col_index, str(f1_value))
                            col_index = col_index + 1
                            sheet1.write(row_index, col_index, str(auc_value))
                            col_index = 0
                            row_index = row_index + 1
                            if with_pollution_val == 'pollution_val':
                                break
                            # fo.write(input_features + ',' + str(accuracy) + ',' + str(f1_value) + ',' + str(auc_value) + '\n')

        ws.save(file_out)
        del ws
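# The PredefinedSplit pattern above is the standard way to make
# LogisticRegressionCV select `C` on one fixed train/validation split
# instead of k-fold CV; a minimal sketch with toy data (all names below
# are illustrative, not from the original source).
import numpy as np
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import PredefinedSplit

rng = np.random.RandomState(0)
X = rng.randn(100, 5)
y = (X[:, 0] > 0).astype(int)

# First 80 samples always train (-1); last 20 form the single validation fold (0).
test_fold = np.r_[np.full(80, -1), np.zeros(20, dtype=int)]
ps = PredefinedSplit(test_fold=test_fold)

clf = LogisticRegressionCV(Cs=10, cv=ps, scoring='f1', refit=True)
clf.fit(X, y)
print(clf.C_)  # the C selected on the held-out validation block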
plt.plot(x1_len, Y1_test, 'ro', markersize=8, zorder=3, label=u'Actual values')
plt.plot(x1_len, Y1_predict, 'go', markersize=12, zorder=2,
         label=u'Predicted values, accuracy=%.3f' % lr2.score(X1_train, Y1_train))
plt.legend(loc='upper left')
plt.xlabel(u'Sample index', fontsize=18)
plt.ylabel(u'Wine quality', fontsize=18)
plt.title(u'Wine quality prediction (with dimensionality reduction)', fontsize=20)
plt.show()

# Evaluate the models from the AUC perspective
from sklearn.preprocessing import label_binarize
from sklearn import metrics

y_test_hot = label_binarize(Y_test, classes=(3, 4, 5, 6, 7, 8, 9)).ravel()

# Model on the original data
# Decision scores for the test set
lr_y_score = lr.decision_function(X_test).ravel()
# ROC
lr_fpr, lr_tpr, lr_thresholds = metrics.roc_curve(y_test_hot, lr_y_score)
# AUC
lr_auc = metrics.auc(lr_fpr, lr_tpr)

# Model on the reduced data
# decision_function equals X1_test multiplied by the coefficient matrix
lr2_y_score = lr2.decision_function(X1_test).ravel()
# ROC
lr2_fpr, lr2_tpr, lr2_thresholds = metrics.roc_curve(y_test_hot, lr2_y_score)
# AUC
lr2_auc = metrics.auc(lr2_fpr, lr2_tpr)

print("AUC on the original data:", lr_auc)
print("AUC on the reduced data:", lr2_auc)
x_min = np.min(X_train).astype(np.float32) - 0.5
x_max = np.max(X_train).astype(np.float32) + 0.5

Y_train = np.array(Y_train).reshape((-1, 1))
test = np.concatenate([X_train, Y_train], axis=1)
test.sort(axis=0)
y_predict = lr.predict(test[:, 0].reshape(-1, 1))

plt.figure(figsize=(12, 9), facecolor='w')
plt.plot(test[:, 0], test[:, 1], 'ro', markersize=6, zorder=3, label=u'Actual values')
plt.plot(test[:, 0], y_predict, 'go', markersize=10, zorder=2,
         label=u'Logistic prediction, accuracy=%.3f' % lr.score(X_test, Y_test))
# Draw the per-class decision lines
plt.plot([x_min, x_max], [theta11 * x_min + theta10, theta11 * x_max + theta10], 'r-', label=u'Line 1')
plt.plot([x_min, x_max], [theta21 * x_min + theta20, theta21 * x_max + theta20], 'b-', label=u'Line 2')
plt.plot([x_min, x_max], [theta31 * x_min + theta30, theta31 * x_max + theta30], 'g-', label=u'Line 3')
plt.legend(loc='lower right')
plt.xlabel(u'{}'.format(names[0:1]), fontsize=18)
plt.ylabel(u'Class', fontsize=18)
plt.title(u'Iris data classification', fontsize=20)
plt.grid()
plt.show()

x_t = test[-10:, 0].reshape(-1, 1)
y_t = lr.decision_function(x_t)
print(y_t)
print(x_t.reshape(-1))
print(lr.predict(x_t))
print(test[-10:, 1])
print([theta31 * x_min + theta30, theta31 * 7.1 + theta30])