Example #1
class LogisticRegressionDensityRatioEstimator(DensityRatioBase):
    def __init__(self,
                 Cs=10,
                 solver='lbfgs',
                 epochs=100,
                 seed=None,
                 *args,
                 **kwargs):

        self.model = LogisticRegressionCV(Cs=Cs,
                                          solver=solver,
                                          max_iter=epochs,
                                          random_state=seed)

    def logit(self, X, y=None):

        return self.model.decision_function(X)

    def fit(self, X_top, X_bot, *args, **kwargs):

        X, y = make_classification_dataset(X_top, X_bot)
        return self.model.fit(X, y, *args, **kwargs)

    def evaluate(self, X_top, X_bot, *args, **kwargs):

        X, y = make_classification_dataset(X_top, X_bot)
        return self.model.score(X, y, *args, **kwargs)
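A minimal, self-contained usage sketch of the density-ratio-by-classification idea behind this class. `make_classification_dataset` is not shown above, so the stacking and labelling is done inline here; the Gaussian samples and the convention of labelling the numerator sample 1 are illustrative assumptions.

import numpy as np
from sklearn.linear_model import LogisticRegressionCV

rng = np.random.RandomState(0)
X_top = rng.normal(loc=0.0, scale=1.0, size=(500, 1))  # numerator sample
X_bot = rng.normal(loc=1.0, scale=1.5, size=(500, 1))  # denominator sample

# Inline stand-in for make_classification_dataset: stack the two samples and
# label the numerator sample 1 and the denominator sample 0.
X = np.vstack([X_top, X_bot])
y = np.concatenate([np.ones(len(X_top)), np.zeros(len(X_bot))])

clf = LogisticRegressionCV(Cs=10, solver='lbfgs', max_iter=100, random_state=0)
clf.fit(X, y)

# With equally sized samples, the classifier logit (decision_function)
# approximates the log density ratio log p_top(x) / p_bot(x).
log_ratio = clf.decision_function(X_top)
print("mean estimated log-ratio on X_top:", log_ratio.mean())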
Example #2
class LogisticRegressionCV_(ProbabilisticModel):

    """LogisticRegressionCV Classifier
    """

    def __init__(self, *args, **kwargs):
        self.model = LogisticRegressionCV(*args, **kwargs)
        self.name = "lrcv"        

    def train(self, dataset, *args, **kwargs):
        return self.model.fit(*(dataset.format_sklearn() + args), **kwargs)

    def predict(self, feature, *args, **kwargs):
        return self.model.predict(feature, *args, **kwargs)

    def score(self, testing_dataset, *args, **kwargs):
        return self.model.score(*(testing_dataset.format_sklearn() + args),
                                **kwargs)
    
    def predict_real(self, feature, *args, **kwargs):
        dvalue = self.model.decision_function(feature, *args, **kwargs)
        if len(np.shape(dvalue)) == 1:  # n_classes == 2
            return np.vstack((-dvalue, dvalue)).T
        else:
            return dvalue
    
    def predict_proba(self, feature, *args, **kwargs):
        return self.model.predict_proba(feature, *args, **kwargs)
    
    def feature_importances_(self):
        return self.model.coef_.ravel()
    
    def get_params(self):
        return self.model.get_params()
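A small usage sketch, assuming the `LogisticRegressionCV_` class above and its `numpy`/`LogisticRegressionCV` imports are in scope. The wrapper expects dataset objects exposing a libact-style `format_sklearn()` method returning `(X, y)`; since that dependency is not shown, a minimal stand-in is defined here for illustration.

import numpy as np
from sklearn.datasets import make_classification

class _ArrayDataset:
    """Minimal stand-in for a dataset object with a format_sklearn() method."""
    def __init__(self, X, y):
        self.X, self.y = X, y

    def format_sklearn(self):
        return (self.X, self.y)

X, y = make_classification(n_samples=200, n_features=5, random_state=0)
train, test = _ArrayDataset(X[:150], y[:150]), _ArrayDataset(X[150:], y[150:])

model = LogisticRegressionCV_(Cs=5, max_iter=200)
model.train(train)
print("accuracy:", model.score(test))
print("per-class decision values:", model.predict_real(X[150:155]))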
Example #3
def Log_Classifier(X_train, X_test, y_train, y_test):
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.metrics import roc_curve, auc, recall_score, precision_score, f1_score, accuracy_score
    clf_log = LogisticRegressionCV(n_jobs=4)
    clf_log.fit(X_train, y_train)
    y_pred = clf_log.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    y_score = clf_log.decision_function(X_test)
    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)  # avoid rebinding the imported `auc` function

    return accuracy, recall, precision, f1, roc_auc, y_score
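A quick usage sketch for the helper above; the synthetic dataset and split are purely illustrative.

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, n_features=10, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

accuracy, recall, precision, f1, roc_auc, y_score = Log_Classifier(X_train, X_test, y_train, y_test)
print("accuracy=%.3f recall=%.3f precision=%.3f f1=%.3f auc=%.3f"
      % (accuracy, recall, precision, f1, roc_auc))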
Example #4
class LogisticRegressionCVImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        # `Op` is expected to be bound to the wrapped scikit-learn estimator
        # (LogisticRegressionCV) in the enclosing module.
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)

    def predict_proba(self, X):
        return self._wrapped_model.predict_proba(X)

    def decision_function(self, X):
        return self._wrapped_model.decision_function(X)
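In the original module `Op` is presumably bound to the wrapped scikit-learn estimator; the following hypothetical binding is only a sketch that makes the wrapper above runnable on its own.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegressionCV as Op  # assumed alias for the wrapper above

X, y = make_classification(n_samples=200, n_features=8, random_state=0)
impl = LogisticRegressionCVImpl(Cs=5, cv=3, max_iter=500)
print(impl.fit(X, y).predict(X[:5]))
print(impl.predict_proba(X[:5]).shape)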
Example #5
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

### 3. Feature selection (skipped here)

### 4. Dimensionality reduction (skipped here)

### 5. Model construction
lr = LogisticRegressionCV(Cs=np.logspace(-4, 1, 50), fit_intercept=True, penalty='l2', solver='lbfgs', tol=0.01, multi_class='multinomial')
lr.fit(X_train, Y_train)

### 6. Model evaluation output
## Binarize the true labels into one-hot form
y_test_hot = label_binarize(Y_test, classes=(1, 2, 3))
## Get the decision scores for the test set
lr_y_score = lr.decision_function(X_test)
## Compute the ROC curve
lr_fpr, lr_tpr, lr_thresholds = metrics.roc_curve(y_test_hot.ravel(), lr_y_score.ravel())
## Compute the AUC
lr_auc = metrics.auc(lr_fpr, lr_tpr)
print("Logistic regression training score (accuracy):", lr.score(X_train, Y_train))
print("Logistic regression AUC:", lr_auc)

### 7. Model prediction
lr_y_predict = lr.predict(X_test)

##### KNN implementation
# a. Model construction
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, Y_train)
Example #6
svcg = svc_gs.best_estimator_

# In[43]:

accuracy_score(y_test, svcg.predict(X_test_std))

# ## Validating the model using roc curve and auc

# In[44]:

from sklearn.metrics import roc_curve, auc

# In[45]:

log_fpr, log_tpr, log_thrs = roc_curve(y_test, lrc.decision_function(X_test))
auc_logistic = auc(log_fpr, log_tpr)

# In[46]:

svm_fpr, svm_tpr, thresh = roc_curve(y_test, svc_gs.decision_function(X_test))
auc_svc = auc(svm_fpr, svm_tpr)

# In[47]:

plt.figure(figsize=(10, 10), dpi=100)
plt.plot(svm_fpr, svm_tpr, linestyle='-', label='SVM (auc = %0.3f)' % auc_svc)
plt.plot(log_fpr,
         log_tpr,
         marker='.',
         label='Logistic (auc = %0.3f)' % auc_logistic)
Example #7
class DetectorLID:
    _name = 'lid'
    def __init__(self,
                 neighborhood_constant=NEIGHBORHOOD_CONST, n_neighbors=None,
                 metric='euclidean', metric_kwargs=None,
                 n_cv_folds=CROSS_VAL_SIZE,
                 c_search_values=None,
                 approx_nearest_neighbors=True,
                 skip_dim_reduction=True,
                 model_dim_reduction=None,
                 n_jobs=1,
                 max_iter=200,
                 balanced_classification=True,
                 low_memory=False,
                 save_knn_indices_to_file=True,
                 seed_rng=SEED_DEFAULT):
        """

        :param neighborhood_constant: float value in (0, 1), that specifies the number of nearest neighbors as a
                                      function of the number of samples (data size). If `N` is the number of samples,
                                      then the number of neighbors is set to `N^neighborhood_constant`. It is
                                      recommended to set this value in the range 0.4 to 0.5.
        :param n_neighbors: None or int value specifying the number of nearest neighbors. If this value is specified,
                            the `neighborhood_constant` is ignored. It is sufficient to specify either
                            `neighborhood_constant` or `n_neighbors`.
        :param metric: string or a callable that specifies the distance metric to use.
        :param metric_kwargs: optional keyword arguments required by the distance metric specified in the form of a
                              dictionary.
        :param n_cv_folds: number of cross-validation folds.
        :param c_search_values: list or array of search values for the logistic regression hyper-parameter `C`. The
                                default value is `None`.
        :param approx_nearest_neighbors: Set to True in order to use an approximate nearest neighbor algorithm to
                                         find the nearest neighbors. The NN-descent method is used for approximate
                                         nearest neighbor searches.
        :param skip_dim_reduction: Set to True in order to skip dimension reduction of the layer embeddings.
        :param model_dim_reduction: 1. None if dimension reduction is not required; (OR)
                                    2. Path to a file containing the saved dimension reduction model. This will be
                                       a pickle file that loads into a list of model dictionaries; (OR)
                                    3. The dimension reduction model loaded into memory from the pickle file.
        :param n_jobs: Number of parallel jobs or processes. Set to -1 to use all the available cpu cores.
        :param max_iter: Maximum number of iterations for the optimization of the logistic classifier. The default
                         value set by the scikit-learn library is 100, but sometimes this does not allow for
                         convergence. Hence, increasing it to 200 here.
        :param balanced_classification: Set to True to assign sample weights to balance the binary classification
                                        problem separating adversarial from non-adversarial samples.
        :param low_memory: Set to True to enable the low memory option of the `NN-descent` method. Note that this
                           is likely to increase the running time.
        :param save_knn_indices_to_file: Set to True in order to save the KNN indices from each layer to a pickle
                                         file to reduce memory usage. This may not be needed when the data size
                                         and/or the number of layers is small. It avoids potential out-of-memory
                                         errors at the expense of time taken to write and read the files.
        :param seed_rng: int value specifying the seed for the random number generator. This is passed around to
                         all the classes/functions that require random number generation. Set this to a fixed value
                         for reproducible results.
        """
        self.neighborhood_constant = neighborhood_constant
        self.n_neighbors = n_neighbors
        self.metric = metric
        self.metric_kwargs = metric_kwargs
        self.n_cv_folds = n_cv_folds
        self.c_search_values = c_search_values
        self.approx_nearest_neighbors = approx_nearest_neighbors
        self.skip_dim_reduction = skip_dim_reduction
        self.n_jobs = get_num_jobs(n_jobs)
        self.max_iter = max_iter
        self.balanced_classification = balanced_classification
        self.low_memory = low_memory
        self.save_knn_indices_to_file = save_knn_indices_to_file
        self.seed_rng = seed_rng

        np.random.seed(self.seed_rng)
        # Load the dimension reduction models per-layer if required
        self.transform_models = None
        if not self.skip_dim_reduction:
            if model_dim_reduction is None:
                raise ValueError("Model file for dimension reduction is required but not specified as input.")
            elif isinstance(model_dim_reduction, str):
                # Pickle file is specified
                self.transform_models = load_dimension_reduction_models(model_dim_reduction)
            elif isinstance(model_dim_reduction, list):
                # Model already loaded from pickle file
                self.transform_models = model_dim_reduction
            else:
                raise ValueError("Invalid format for the dimension reduction model input.")

        if self.c_search_values is None:
            # Default search values for the `C` parameter of logistic regression
            self.c_search_values = np.logspace(-4, 4, num=10)

        self.n_layers = None
        self.n_samples = []
        self.index_knn = None
        self.model_logistic = None
        self.scaler = None
        # Temporary directory to save the KNN index files
        self.temp_direc = None
        self.temp_knn_files = None

    def fit(self, layer_embeddings_normal, layer_embeddings_adversarial, layer_embeddings_noisy=None):
        """
        Extract the LID feature vector for normal, noisy, and adversarial samples and train a logistic classifier
        to separate adversarial samples from (normal + noisy). Cross-validation is used to select the hyper-parameter
        `C` using area under the ROC curve as the validation metric.

        :param layer_embeddings_normal: list of numpy arrays with the layer embeddings for normal samples.
                                        Length of the list is equal to the number of layers. The numpy array at
                                        index `i` has shape `(n, d_i)`, where `n` is the number of samples and `d_i`
                                        is the dimension of the embeddings at layer `i`.
        :param layer_embeddings_adversarial: Same format as `layer_embeddings_normal`, but corresponding to
                                             adversarial data.
        :param layer_embeddings_noisy: Same format as `layer_embeddings_normal`, but corresponding to noisy data.
                                       Can be set to `None` to exclude noisy data from training.
        :return:
            (self, scores_normal, scores_adversarial) if layer_embeddings_noisy is None
            (self, scores_normal, scores_adversarial, scores_noisy) otherwise.
            -------------------------------------------------------
            - self: trained instance of the class.
            - scores_normal: numpy array with the scores (decision function of the logistic classifier) for normal
                             samples. 1d array with the same number of samples as `layer_embeddings_normal`.
            - scores_adversarial: scores corresponding to `layer_embeddings_adversarial`.
            - scores_noisy: scores corresponding to `layer_embeddings_noisy` if noisy training data is provided.
        """
        self.n_layers = len(layer_embeddings_normal)
        logger.info("Number of layer embeddings: {:d}.".format(self.n_layers))
        if layer_embeddings_noisy is None:
            logger.info("Noisy training data not provided.")
            cond1 = False
            noisy_data = False
        else:
            cond1 = (len(layer_embeddings_noisy) != self.n_layers)
            noisy_data = True

        if cond1 or (len(layer_embeddings_adversarial) != self.n_layers):
            raise ValueError("The layer embeddings for noisy and attack samples must have the same length as that "
                             "of normal samples")

        # Number of samples in each of the categories
        self.n_samples = [
            layer_embeddings_normal[0].shape[0],
            layer_embeddings_noisy[0].shape[0] if noisy_data else 0,
            layer_embeddings_adversarial[0].shape[0]
        ]
        # Number of nearest neighbors
        if self.n_neighbors is None:
            # Set number of nearest neighbors based on the data size (of normal samples) and the
            # neighborhood constant
            self.n_neighbors = int(np.ceil(self.n_samples[0] ** self.neighborhood_constant))

        # The data arrays at all layers should have the same number of samples
        if not all([layer_embeddings_normal[i].shape[0] == self.n_samples[0] for i in range(self.n_layers)]):
            raise ValueError("Input 'layer_embeddings_normal' does not have the expected format")

        if noisy_data:
            if not all([layer_embeddings_noisy[i].shape[0] == self.n_samples[1] for i in range(self.n_layers)]):
                raise ValueError("Input 'layer_embeddings_noisy' does not have the expected format")

        if not all([layer_embeddings_adversarial[i].shape[0] == self.n_samples[2] for i in range(self.n_layers)]):
            raise ValueError("Input 'layer_embeddings_adversarial' does not have the expected format")

        if self.save_knn_indices_to_file:
            # Create a temporary directory for saving the KNN indices
            self.temp_direc = tempfile.mkdtemp(dir=os.getcwd())
            self.temp_knn_files = [''] * self.n_layers

        self.index_knn = [None for _ in range(self.n_layers)]
        features_lid_normal = np.zeros((self.n_samples[0], self.n_layers))
        features_lid_noisy = np.zeros((self.n_samples[1], self.n_layers))
        features_lid_adversarial = np.zeros((self.n_samples[2], self.n_layers))
        for i in range(self.n_layers):
            logger.info("Processing layer {:d}:".format(i + 1))
            if self.transform_models:
                data_normal = transform_data_from_model(layer_embeddings_normal[i], self.transform_models[i])
                data_adver = transform_data_from_model(layer_embeddings_adversarial[i], self.transform_models[i])
                if noisy_data:
                    data_noisy = transform_data_from_model(layer_embeddings_noisy[i], self.transform_models[i])
                else:
                    data_noisy = None

                d1 = layer_embeddings_normal[i].shape[1]
                d2 = data_normal.shape[1]
                if d2 < d1:
                    logger.info("Input dimension = {:d}, projected dimension = {:d}".format(d1, d2))
            else:
                data_normal = layer_embeddings_normal[i]
                data_adver = layer_embeddings_adversarial[i]
                if noisy_data:
                    data_noisy = layer_embeddings_noisy[i]
                else:
                    data_noisy = None

            logger.info("Building a KNN index on the feature embeddings of normal samples.")
            # Build a KNN index on the set of feature embeddings from normal samples from layer `i`
            self.index_knn[i] = KNNIndex(
                data_normal, n_neighbors=self.n_neighbors,
                metric=self.metric, metric_kwargs=self.metric_kwargs,
                approx_nearest_neighbors=self.approx_nearest_neighbors,
                n_jobs=self.n_jobs,
                low_memory=self.low_memory,
                seed_rng=self.seed_rng
            )
            logger.info("Calculating LID estimates for the feature embeddings of normal samples.")
            # Nearest neighbors of the normal feature embeddings from this layer
            nn_indices, nn_distances = self.index_knn[i].query_self(k=self.n_neighbors)
            # LID estimates of the normal feature embeddings from this layer
            features_lid_normal[:, i] = lid_mle_amsaleg(nn_distances)

            if noisy_data:
                logger.info("Calculating LID estimates for the feature embeddings of noisy samples.")
                # Nearest neighbors of the noisy feature embeddings from this layer
                nn_indices, nn_distances = self.index_knn[i].query(data_noisy, k=self.n_neighbors)
                # LID estimates of the noisy feature embeddings from this layer
                features_lid_noisy[:, i] = lid_mle_amsaleg(nn_distances)

            logger.info("Calculating LID estimates for the feature embeddings of adversarial samples.")
            # Nearest neighbors of the adversarial feature embeddings from this layer
            nn_indices, nn_distances = self.index_knn[i].query(data_adver, k=self.n_neighbors)
            # LID estimates of the adversarial feature embeddings from this layer
            features_lid_adversarial[:, i] = lid_mle_amsaleg(nn_distances)

            if self.save_knn_indices_to_file:
                logger.info("Saving the KNN index from layer {:d} to a pickle file".format(i + 1))
                self.temp_knn_files[i] = os.path.join(self.temp_direc, 'knn_index_layer_{:d}.pkl'.format(i + 1))
                with open(self.temp_knn_files[i], 'wb') as fp:
                    pickle.dump(self.index_knn[i], fp)

                # Free up the allocated memory
                self.index_knn[i] = None

        # Feature vector and labels for the binary logistic classifier.
        # Normal and noisy samples are given labels 0 and adversarial samples are given label 1
        n_pos = features_lid_adversarial.shape[0]
        if noisy_data:
            features_lid = np.concatenate([features_lid_normal, features_lid_noisy, features_lid_adversarial],
                                          axis=0)
            labels = np.concatenate([np.zeros(features_lid_normal.shape[0], dtype=int),
                                     np.zeros(features_lid_noisy.shape[0], dtype=int),
                                     np.ones(n_pos, dtype=int)])
        else:
            features_lid = np.concatenate([features_lid_normal, features_lid_adversarial], axis=0)
            labels = np.concatenate([np.zeros(features_lid_normal.shape[0], dtype=int),
                                     np.ones(n_pos, dtype=int)])

        pos_prop = n_pos / float(labels.shape[0])
        # Randomly shuffle the samples to avoid determinism
        ind_perm = np.random.permutation(labels.shape[0])
        features_lid = features_lid[ind_perm, :]
        labels = labels[ind_perm]
        # Min-max scaling for the LID features
        self.scaler = MinMaxScaler().fit(features_lid)
        features_lid = self.scaler.transform(features_lid)
        logger.info("Training a binary logistic classifier with {:d} samples and {:d} LID features.".
                    format(*features_lid.shape))
        logger.info("Using {:d}-fold cross-validation with area under ROC curve as the metric to select "
                    "the best regularization hyperparameter.".format(self.n_cv_folds))
        logger.info("Proportion of positive (adversarial or OOD) samples in the training data: {:.4f}".
                    format(pos_prop))
        class_weight = None
        if self.balanced_classification:
            if (pos_prop < 0.45) or (pos_prop > 0.55):
                class_weight = {0: 1.0 / (1 - pos_prop),
                                1: 1.0 / pos_prop}
                logger.info("Balancing the classes by assigning sample weight {:.4f} to class 0 and sample weight "
                            "{:.4f} to class 1".format(class_weight[0], class_weight[1]))

        self.model_logistic = LogisticRegressionCV(
            Cs=self.c_search_values,
            cv=self.n_cv_folds,
            penalty='l2',
            scoring='roc_auc',
            multi_class='auto',
            class_weight=class_weight,
            max_iter=self.max_iter,
            refit=True,
            n_jobs=self.n_jobs,
            random_state=self.seed_rng
        ).fit(features_lid, labels)

        # Larger values of this score correspond to a higher probability of predicting class 1 (adversarial)
        scores_normal = self.model_logistic.decision_function(self.scaler.transform(features_lid_normal))
        scores_adversarial = self.model_logistic.decision_function(self.scaler.transform(features_lid_adversarial))
        if noisy_data:
            scores_noisy = self.model_logistic.decision_function(self.scaler.transform(features_lid_noisy))
            return self, scores_normal, scores_adversarial, scores_noisy
        else:
            return self, scores_normal, scores_adversarial

    def score(self, layer_embeddings, cleanup=True):
        """
        Given a list of layer embeddings for test samples, extract the layer-wise LID feature vector and return the
        decision function of the logistic classifier.

        :param layer_embeddings: list of numpy arrays with the layer embeddings for normal samples. Length of the
                                 list is equal to the number of layers. The numpy array at index `i` has shape
                                 `(n, d_i)`, where `n` is the number of samples and `d_i` is the dimension of the
                                 embeddings at layer `i`.
        :param cleanup: If set to True, the temporary directory where the KNN index files are saved will be deleted
                        after scoring. If this method is to be called multiple times, set `cleanup = False` for all
                        calls except the last one.
        :return:
            - numpy array of detection scores for the test samples. Has shape `(n, )` where `n` is the number of
              samples. Larger values correspond to a higher confidence that the sample is adversarial.
        """
        n_test = layer_embeddings[0].shape[0]
        l = len(layer_embeddings)
        if l != self.n_layers:
            raise ValueError("Expecting {:d} layers in the input 'layer_embeddings', but received {:d} layers.".
                             format(self.n_layers, l))

        features_lid = np.zeros((n_test, self.n_layers))
        for i in range(self.n_layers):
            logger.info("Calculating LID features for layer {:d}".format(i + 1))
            if self.transform_models:
                # Dimension reduction
                data_proj = transform_data_from_model(layer_embeddings[i], self.transform_models[i])
            else:
                data_proj = layer_embeddings[i]

            if self.save_knn_indices_to_file:
                with open(self.temp_knn_files[i], 'rb') as fp:
                    self.index_knn[i] = pickle.load(fp)

            _, nn_distances = self.index_knn[i].query(data_proj, k=self.n_neighbors)
            features_lid[:, i] = lid_mle_amsaleg(nn_distances)

            if self.save_knn_indices_to_file:
                self.index_knn[i] = None

        if cleanup and self.save_knn_indices_to_file:
            _ = subprocess.check_call(['rm', '-rf', self.temp_direc])

        features_lid = self.scaler.transform(features_lid)
        return self.model_logistic.decision_function(features_lid)
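The helper `lid_mle_amsaleg` used above is not shown. The following is a minimal sketch of the maximum-likelihood LID estimator it presumably implements (the Amsaleg et al. estimator used in LID-based adversarial detection), computed per sample from its sorted k-nearest-neighbor distances:

import numpy as np

def lid_mle(nn_distances, eps=1e-12):
    """MLE of local intrinsic dimensionality from an (n_samples, k) array of
    sorted nearest-neighbor distances: LID = -1 / mean_i(log(r_i / r_k))."""
    d = np.asarray(nn_distances, dtype=float)
    r_max = d[:, -1:]                                   # distance to the k-th (farthest) neighbor
    log_ratio = np.log(np.maximum(d, eps) / np.maximum(r_max, eps))
    return -1.0 / np.mean(log_ratio, axis=1)

# Illustrative call on random sorted distance matrices
rng = np.random.RandomState(0)
dists = np.sort(rng.rand(100, 20), axis=1)
print(lid_mle(dists)[:5])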
Example #8
# (the leading arguments of this constructor call were truncated in the source snippet)
lr = LogisticRegressionCV(penalty='l1',
                          solver='liblinear',
                          tol=0.01)
re = lr.fit(X_train, Y_train)

# 4. Model evaluation
r = re.score(X_train, Y_train)
print("Accuracy: ", r)
print("Proportion of zeroed (sparse) coefficients: %.2f%%" % (np.mean(lr.coef_.ravel() == 0) * 100))
print("Coefficients: ", re.coef_)
print("Intercept: ", re.intercept_)
print(re.predict(X_train))
y_hat = re.predict(X_train)
print(y_hat)

print(lr.decision_function(X_train))

# 5. Saving the model artifacts
# Import the package
# from sklearn.externals import joblib  (removed in recent scikit-learn; use `import joblib` instead)
# The target directory must already exist
# Save the standardization model
# joblib.dump(ss, "datas/logistic/ss.model")
# Save the classifier
# joblib.dump(lr, "datas/models/logistic/lr.model")

# Loading the models
# Import the package
# from sklearn.externals import joblib
# oss = joblib.load("models/logistic/ss.model")
# olr = joblib.load("models/logistic/lr.model")
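A runnable version of the commented-out persistence step, assuming the fitted `ss` scaler and `lr` model from above and an illustrative output directory; `sklearn.externals.joblib` has been removed from scikit-learn, so the standalone `joblib` package is used.

import os
import joblib  # replaces the removed sklearn.externals.joblib

# Save the fitted scaler and classifier (create the target directory first).
os.makedirs("datas/models/logistic", exist_ok=True)
joblib.dump(ss, "datas/models/logistic/ss.model")
joblib.dump(lr, "datas/models/logistic/lr.model")

# Load them back later.
oss = joblib.load("datas/models/logistic/ss.model")
olr = joblib.load("datas/models/logistic/lr.model")
print(olr.predict(oss.transform(X_train))[:10])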
Example #9
def main(file_in, file_out):
    # file_in = '../Re__Research_on_detecting_air_pollution_related_terms_searches_/keywords_data_rescaled_joined.csv'
    # air_data_raw = readData(file_in)

    # create an excel book
    book = xlwt.Workbook() 
    sheet0 = book.add_sheet('first_page')
    book.save(file_out)

    parameters = []
    # for lag_days in [3, 5, 7]: 
    #     for kernel_size in range(2, lag_days):
    #         for pollution_value in [60]:
    #             for search_lag in [0, 1, 2, 3]:
    #                 parameters.append((lag_days, kernel_size, pollution_value, search_lag))

    '''============Summary: 2009 90==============
    no polluted days in training data
    '''
    for lag_days in [7]: 
        for kernel_size in [2]:
            for pollution_value in [70]:
                for search_lag in [2]:
                    parameters.append((lag_days, kernel_size, pollution_value, search_lag))

    for parameter_index in range(len(parameters)):
        data = xlrd.open_workbook(file_out)
        ws = xlutils.copy.copy(data)
        data.release_resources()
        del data
        lag_days, kernel_size, pollution_value, search_lag= parameters[parameter_index]
        seq_length = lag_days

        sheet1 = ws.add_sheet('model' + str(parameter_index))
        row_index = 0
        col_index = 0
        
        sheet1.write(row_index,col_index,'Input_Features') 
        col_index = col_index + 1
        sheet1.write(row_index,col_index,'Accuracy') 
        col_index = col_index + 1
        sheet1.write(row_index,col_index, 'F1_score')
        col_index = col_index + 1
        sheet1.write(row_index,col_index, 'AUC_val')
        col_index = col_index + 1
        sheet1.write(row_index,col_index+2, 'CNN: ' + '(seq_length, kernel_size, pollution_value, search_lag):' + str(parameters[parameter_index]))
        col_index=0
        row_index = row_index + 1

        # with open(file_out, 'w') as fo:
        # fo.write('Input_Features'+',' + 'Accuracy'+ ',' + 'F1_score' + ',' + 'AUC_val' + '\n')
        for season in ['summer']:
        # for season in ['summer', 'winter']:
            sheet1.write(row_index, col_index, "============" + season + "=============")
            row_index = row_index + 1
            # fo.write("============" + season + "============="+ '\n')
            # for final_year in [2009,2010,2011,2012,2013]:
            for final_year in [2012]:
            # for final_year in [2012]:
            # for final_year in [2009]:
                sheet1.write(row_index, col_index, 'Final year: ' + str(final_year))
                row_index = row_index + 1
                # fo.write('Final year: ' + str(final_year) + '\n')
                # air_data = selectData(air_data_raw.copy(), season = season, final_year=final_year)
                for shift_days in [0]:
                    # fo.write('Shift days: ' + str(shift_days)+ '\n')
                    print("============Summary: " + str(final_year) + ' ' + str(pollution_value) + '==============' )
                    single_feature = False
                    data_split = DataSplit(file_path = file_in, season = season, final_year = final_year)
                    X_train, X_valid, X_test, y_train, y_valid, y_test = data_split.generateTrainTest()
                    train_len = len(y_train)
                    valid_len = len(y_valid)
                    test_len = len(y_test)

                    # lag_days = 3
                    # seq_length = 3
                    # kernel_size = 2
                    # pollution_value = 50

                    raw_values = np.concatenate((y_train, y_valid, y_test), axis=0)
                    # transform data to be supervised learning
                    # supervised_values = timeseries_to_supervised(raw_values, 5)
                    supervised_values = timeseries_to_supervised(raw_values, lag = lag_days)
                    # normalize to 0 to 1
                    # supervised_values = supervised_values/supervised_values.max()
                    # normalize supervised_values
                    supervised_values -= np.mean(supervised_values, axis = 0) # zero-center
                    supervised_values /= np.std(supervised_values, axis = 0) # normalize
                        
                    # for input_features in ['pollution_val', 'one-hot-encoding+', 'glove-embedding+']:
                    for with_pollution_val in ['pollution_val', 'with_pol_val', 'without_pol_val']:
                        for input_features in ['one-hot+', 'one-hot+glove+']:
                            if with_pollution_val == 'pollution_val':
                                x_train_concat = supervised_values.copy()
                                input_features = ''
                            else:
                                X_concat_frames = pd.concat([X_train, X_valid, X_test])
                                feature_embeddings = generate_search_embedding(X_concat_frames, representation = 'one-hot')
                                feature_embeddings = lag_search_features(feature_embeddings, lag = search_lag)
                                if input_features == 'one-hot+':
                                    if with_pollution_val == 'with_pol_val':
                                        x_train_concat = np.concatenate((supervised_values, feature_embeddings), axis=1)
                                    else:
                                        x_train_concat = feature_embeddings.copy()
                                else:
                                    glove_feature_embeddings = generate_search_embedding(X_concat_frames, representation = 'glove')
                                    glove_feature_embeddings = lag_search_features(glove_feature_embeddings, lag = search_lag)
                                    if with_pollution_val == 'with_pol_val':
                                        x_train_concat = np.concatenate((supervised_values, feature_embeddings, glove_feature_embeddings), axis=1)
                                    else:
                                        x_train_concat = np.concatenate((feature_embeddings, glove_feature_embeddings), axis=1)

                            input_embedding = generate_input_sequence(x_train_concat, seq_length = seq_length)
                            input_embedding = input_embedding.reshape(len(input_embedding), -1)
                            
                            y_class = [1 if i>pollution_value else 0 for i in raw_values]

                            # generate train_validation set 
                            x_train_valid = input_embedding[:train_len+valid_len]
                            y_train_valid = y_class[:train_len+valid_len]

                            x_test = input_embedding[train_len+valid_len:]
                            y_test = y_class[train_len+valid_len:]

                            valid_index = [i for i in range(train_len, train_len+valid_len)]
                            test_fold = [ -1 if i not in valid_index else 0 for i in range(0, len(x_train_valid)) ]
                            ps = PredefinedSplit(test_fold=test_fold)

                            clf = LogisticRegressionCV(
                                Cs=list(np.power(10.0, np.arange(-20, 6)))
                                ,penalty='l2'
                                ,scoring='f1'
                                ,cv=ps
                                ,random_state=0
                                ,max_iter=10000
                                ,class_weight="balanced"
                                # ,fit_intercept=True
                                ,fit_intercept=False
                                ,solver='newton-cg'
                                ,tol=10
                                ,refit=True
                            )

                            clf.fit(x_train_valid, y_train_valid)
                            pred = clf.predict(x_test)
                            accuracy = accuracy_score(y_test,pred)
                            f1_value = f1_score(y_test,pred)
                            y_score = clf.decision_function(x_test)
                            fpr,tpr,threshold = roc_curve(y_test, y_score) 
                            auc_value = auc(fpr,tpr) 

                            sheet1.write(row_index, col_index, with_pollution_val + '+' + input_features)
                            col_index = col_index + 1
                            sheet1.write(row_index, col_index, str(accuracy))
                            col_index = col_index + 1
                            sheet1.write(row_index, col_index,  str(f1_value))
                            col_index = col_index + 1
                            sheet1.write(row_index, col_index,  str(auc_value))
                            col_index = 0
                            row_index = row_index + 1

                            if with_pollution_val == 'pollution_val':
                                break
                            # fo.write(input_features + ',' + str(accuracy) +',' + str(f1_value) + ',' + str(auc_value)+ '\n')
        ws.save(file_out)
        del ws
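The key mechanism in Example #9 is the use of `PredefinedSplit` so that `LogisticRegressionCV` validates on one fixed, pre-chosen fold rather than on random CV folds. A self-contained sketch of just that mechanism, with synthetic data and illustrative sizes:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import PredefinedSplit

X, y = make_classification(n_samples=300, n_features=10, random_state=0)
train_len, valid_len = 200, 50          # the last 50 samples are held out as a test set

X_train_valid, y_train_valid = X[:train_len + valid_len], y[:train_len + valid_len]

# -1 keeps a sample in the training portion of every split; 0 assigns it to the single validation fold.
test_fold = [-1] * train_len + [0] * valid_len
ps = PredefinedSplit(test_fold=test_fold)

clf = LogisticRegressionCV(Cs=np.logspace(-4, 4, 9), cv=ps, scoring='f1',
                           class_weight='balanced', max_iter=10000, random_state=0)
clf.fit(X_train_valid, y_train_valid)
print("chosen C:", clf.C_[0])
print("held-out accuracy:", clf.score(X[train_len + valid_len:], y[train_len + valid_len:]))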
Example #10
plt.plot(x1_len, Y1_test, 'ro', markersize=8, zorder=3, label=u'Actual value')
plt.plot(x1_len, Y1_predict, 'go', markersize=12, zorder=2, label=u'Predicted value, accuracy=%.3f' % lr2.score(X1_train, Y1_train))
plt.legend(loc='upper left')
plt.xlabel(u'Sample index', fontsize=18)
plt.ylabel(u'Wine quality', fontsize=18)
plt.title(u'Wine quality prediction (with dimensionality reduction)', fontsize=20)
plt.show()

# Evaluate the models from the AUC perspective
from sklearn.preprocessing import label_binarize
from sklearn import metrics
y_test_hot = label_binarize(Y_test, classes=(3, 4, 5, 6, 7, 8, 9)).ravel()

# Model trained on the original data
# Get the decision scores
lr_y_score = lr.decision_function(X_test).ravel()
# Compute the ROC curve
lr_fpr, lr_tpr, lr_thresholds = metrics.roc_curve(y_test_hot, lr_y_score)
# Compute the AUC
lr_auc = metrics.auc(lr_fpr, lr_tpr)

# Model trained on the reduced-dimension data
# decision_function equals X1_test multiplied by the coefficients (plus the intercept)
lr2_y_score = lr2.decision_function(X1_test).ravel()
# Compute the ROC curve
lr2_fpr, lr2_tpr, lr2_thresholds = metrics.roc_curve(y_test_hot, lr2_y_score)
# Compute the AUC
lr2_auc = metrics.auc(lr2_fpr, lr2_tpr)

print("AUC on the original data:", lr_auc)
print("AUC on the reduced-dimension data:", lr2_auc)
Example #11
x_min = np.min(X_train).astype(np.float32) - 0.5
x_max = np.max(X_train).astype(np.float32) + 0.5

Y_train = np.array(Y_train).reshape((-1, 1))
test = np.concatenate([X_train, Y_train], axis=1)
test.sort(axis=0)
y_predict = lr.predict(test[:, 0].reshape(-1, 1))
plt.figure(figsize=(12, 9), facecolor='w')
plt.plot(test[:, 0], test[:, 1], 'ro', markersize=6, zorder=3, label=u'Actual value')
plt.plot(test[:, 0], y_predict, 'go', markersize=10, zorder=2,
         label=u'Logistic prediction, accuracy=%.3f' % lr.score(X_test, Y_test))

# Plot the three per-class decision lines
plt.plot([x_min, x_max], [theta11 * x_min + theta10, theta11 * x_max + theta10], 'r-', label=u'Line 1')
plt.plot([x_min, x_max], [theta21 * x_min + theta20, theta21 * x_max + theta20], 'b-', label=u'Line 2')
plt.plot([x_min, x_max], [theta31 * x_min + theta30, theta31 * x_max + theta30], 'g-', label=u'Line 3')
plt.legend(loc='lower right')
plt.xlabel(u'{}'.format(names[0:1]), fontsize=18)
plt.ylabel(u'Class', fontsize=18)
plt.title(u'Iris data classification', fontsize=20)
plt.grid()
plt.show()

x_t = test[-10:, 0].reshape(-1,1)
y_t = lr.decision_function(x_t)
print(y_t)
print(x_t.reshape(-1))
print(lr.predict(x_t))
print(test[-10:, 1])
print([theta31 * x_min + theta30, theta31 * 7.1 + theta30])