Example #1
    def normalize_features(self, scaler: StandardScaler = None) \
            -> StandardScaler:
        '''
        Normalizes the features of the dataset using a StandardScaler
        (subtract mean, divide by standard deviation).

        If a scaler is provided, uses that scaler to perform the normalization.
        Otherwise fits a scaler to the features in the dataset and then
        performs the normalization.

        :param scaler: A fitted StandardScaler. Used if provided.
        Otherwise a StandardScaler is fit on this dataset and is then used.
        :return: A fitted StandardScaler. If a scaler is provided, this is the
        same scaler. Otherwise, this is a scaler fit on this dataset.
        '''
        if not self.data or not self.data[0].features:
            return None

        if scaler is None:
            scaler = StandardScaler()
            features = np.vstack([d.features for d in self.data])
            scaler.fit(features)

        for d in self.data:
            d.set_features(scaler.transform(d.features.reshape(1, -1))[0])

        return scaler
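
A minimal usage sketch of the pattern the docstring describes, assuming hypothetical train_data and test_data instances of the same dataset class: the scaler is fit on the training split, and the returned fitted scaler is reused for the test split.

# train_data / test_data are hypothetical dataset instances
scaler = train_data.normalize_features()       # fits a StandardScaler on the training features
test_data.normalize_features(scaler=scaler)    # reuses the train-fitted scaler on the test features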
Example #2
def test_scaler_1d():
    """Test scaling of dataset along single axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(5)
    X_orig_copy = X.copy()

    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_array_almost_equal(X_scaled_back, X_orig_copy)

    # Test with 1D list
    X = [0., 1., 2, 0.4, 1.]
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    X_scaled = scale(X)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)
Example #3
def test_scaler_1d():
    """Test scaling of dataset along single axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(5)
    X_orig_copy = X.copy()

    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_array_almost_equal(X_scaled_back, X_orig_copy)

    # Test with 1D list
    X = [0., 1., 2, 0.4, 1.]
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    X_scaled = scale(X)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    X = np.ones(5)
    assert_array_equal(scale(X, with_mean=False), X)
Example #4
    def prepare_time_data(data):
        data_scaler = StandardScaler()
        data_concat = np.concatenate(data, axis=0)
        data_scaler.fit(data_concat)
        new_data = [data_scaler.transform(data_) for data_ in data]

        return data_scaler, new_data
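
A minimal usage sketch, assuming data is a list of 2-D arrays that share a feature dimension but may differ in length (which is what the concatenation along axis 0 implies); prepare_time_data is called here as a plain function for illustration.

import numpy as np

# two hypothetical sequences with 3 features each and different lengths
data = [np.random.randn(50, 3), np.random.randn(80, 3)]
data_scaler, scaled_data = prepare_time_data(data)
# the fitted scaler can then be applied to held-out sequences of the same width
new_sequence_scaled = data_scaler.transform(np.random.randn(40, 3))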
Example #5
 def preprocess(self):
     sc = StandardScaler()
     sc.fit(self.X_train)
     X_train_std = sc.transform(self.X_train)
     X_test_std = sc.transform(self.X_test)
     self.train_dataset = self.Dataset(data=X_train_std,
                                       target=self.y_train)
     self.test_dataset = self.Dataset(data=X_test_std, target=self.y_test)
Example #6
 def __stdScaler(self):
     all_cols = list(self.data_df.columns.values)
     for col in all_cols:
         if col not in self.non_numeric_cols and col != 'time_to_failure':
             stdScaler = StandardScaler()
             stdScaler.fit(self.data_df[[col]])
             self.data_df[col] = stdScaler.transform(self.data_df[[col]])
     print('Standard Scaler applied ... ')
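
The loop above fits an independent StandardScaler per numeric column. Because StandardScaler already computes per-column statistics, a single scaler fit on all numeric columns at once gives the same result; a minimal sketch with a hypothetical frame (column names are made up):

import pandas as pd
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame({'feat_a': [1.0, 2.0, 3.0],
                   'feat_b': [10.0, 20.0, 30.0],
                   'time_to_failure': [0.1, 0.2, 0.3]})
numeric_cols = [c for c in df.columns if c != 'time_to_failure']
df[numeric_cols] = StandardScaler().fit_transform(df[numeric_cols])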
Example #7
def main():
    args = parse()
    n_rollout = args.nrollout
    n_epoch = args.epoch
    savename = args.savename if args.savename is not None else 'model-' + str(
        n_rollout) + 'unroll'

    np.random.seed(1098)
    path = args.filename
    names = ['target_pos', 'target_speed', 'pos', 'vel', 'effort']
    with h5py.File(path, 'r') as f:
        (target_pos, target_speed, pos, vel,
         effort) = [[np.array(val) for val in f[name].values()]
                    for name in names]

    x_target = np.array(target_pos)
    x_first = np.array([pos_[0] for pos_ in pos])
    x_speed = np.array(target_speed).reshape((-1, 1))
    aux_output = [np.ones(eff.shape[0]).reshape((-1, 1)) for eff in effort]

    x = np.concatenate((x_target, x_first, x_speed), axis=1)

    input_scaler = StandardScaler()
    x = input_scaler.fit_transform(x)
    output_scaler = StandardScaler()
    effort_concat = np.concatenate([a for a in effort], axis=0)
    output_scaler.fit(effort_concat)
    effort = [output_scaler.transform(eff) for eff in effort]

    y = pad_sequences(effort, padding='post', value=0.)
    aux_output = pad_sequences(aux_output, padding='post', value=0.)
    x, x_test, y, y_test, y_aux, y_aux_test = train_test_split(x,
                                                               y,
                                                               aux_output,
                                                               test_size=0.2)

    y_mask, y_test_mask = [this_y[:, :, 0] for this_y in (y_aux, y_aux_test)]
    y_aux_mask, y_aux_test_mask = [
        np.ones(this_y.shape[:2]) for this_y in (y_aux, y_aux_test)
    ]

    model = MyModel(train=[x, [y, y_aux]],
                    val=[x_test, [y_test, y_aux_test]],
                    train_mask=[y_mask, y_aux_mask],
                    val_mask=[y_test_mask, y_aux_test_mask],
                    max_unroll=n_rollout,
                    name=savename)

    if not os.path.exists('save'):
        os.makedirs('save')

    if args.train:
        model.fit(nb_epoch=n_epoch, batch_size=32)
    elif args.resume:
        model.resume(nb_epoch=n_epoch, batch_size=32)
Example #8
File: ml.py Project: nvhuy/LM
def xval(feature_file, removed_columns=None):
    """
    Load features into file
    :param feature_file: feature file
    :param removed_columns: index of feature columns to remove
    """
    module_logger.info('------ Load feature data ::: {}'.format(feature_file))
    clf = svm_clf()

    fs = numpy.loadtxt(feature_file, delimiter='\t', skiprows=1)
    _, n = fs.shape
    iX = fs[:, 0]
    X = fs[:, 1:n - 1]
    y = fs[:, n - 1]

    if removed_columns is not None and len(removed_columns) > 0:
        X = numpy.delete(X, removed_columns, 1)
    module_logger.info('------ data dimension ::: {} ::: {}'.format(X.shape, n))

    y_true = numpy.array([])
    y_out = numpy.array([])
    y_prob = numpy.array([])
    y_i = numpy.array([])

    std_scaler = StandardScaler()

    skf = StratifiedKFold(n_splits=5)
    for train_index, test_index in skf.split(X, y):
        # print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        std_scaler.fit(X_train)
        X_train_scaled = std_scaler.transform(X_train, copy=True)
        X_test_scaled = std_scaler.transform(X_test, copy=True)

        clf.fit(X_train_scaled, y_train)
        y_pred = clf.predict(X_test_scaled)
        y_logp = clf.predict_proba(X_test_scaled)

        y_true = numpy.hstack((y_true, y_test))
        y_out = numpy.hstack((y_out, y_pred))
        y_prob = numpy.hstack((y_prob, numpy.max(y_logp, axis=1)))

        iX_test = iX[test_index]
        y_i = numpy.hstack((y_i, iX_test))

    return write_prediction_output(y_i, y_true, y_out, feature_file.replace('.csv', '_pred.csv'), y_prob)
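
Fitting the scaler inside each fold, on the training split only, keeps test-fold statistics out of the preprocessing. The same pattern can be written with a Pipeline so the scaler is re-fit per split automatically; a minimal self-contained sketch with synthetic stand-in data (the real code would pass the X and y loaded above):

import numpy
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_val_predict

rng = numpy.random.RandomState(0)
X_demo = rng.randn(100, 6)               # stand-in for the loaded feature matrix
y_demo = rng.randint(0, 2, size=100)     # stand-in for the labels
pipe = make_pipeline(StandardScaler(), SVC(probability=True))
# out-of-fold predictions; StandardScaler is re-fit on each training split
y_oof = cross_val_predict(pipe, X_demo, y_demo, cv=StratifiedKFold(n_splits=5))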
Example #9
def test_scaler_2d_arrays():
    """Test scaling of 2d array along first axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero

    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))

    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has been copied
    assert_true(X_scaled is not X)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    X_scaled = scale(X, axis=1, with_std=False)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    X_scaled = scale(X, axis=1, with_std=True)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=1), 4 * [1.0])
    # Check that the data hasn't been modified
    assert_true(X_scaled is not X)

    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert_true(X_scaled is X)

    X = rng.randn(4, 5)
    X[:, 0] = 1.0  # first feature is a constant, non zero feature
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert_true(X_scaled is not X)
Example #10
def test_scaler_2d_arrays():
    """Test scaling of 2d array along first axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always zero

    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))

    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has been copied
    assert_true(X_scaled is not X)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    X_scaled = scale(X, axis=1, with_std=False)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    X_scaled = scale(X, axis=1, with_std=True)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=1), 4 * [1.0])
    # Check that the data hasn't been modified
    assert_true(X_scaled is not X)

    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert_true(X_scaled is X)

    X = rng.randn(4, 5)
    X[:, 0] = 1.0  # first feature is a constant, non zero feature
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert_true(X_scaled is not X)
Example #11
 def obtain_sets(self, psychological_construct, percentage):
     index = self.get_index(psychological_construct)
     logging.info("Psychological construct under analysis:" + 
                  psychological_construct)
     negative_students, positive_students = self.get_instances(index)
     train_set, dev_set, test_set = self.divide_sets(negative_students,
                                                     positive_students,
                                                     percentage)
     train_set_x, train_set_y = self.get_x_and_y(train_set, index)
     logging.info("Training set shape:" + str(train_set_x.shape))
     if self.norm == z_norm_literal:
         logging.info("Z-Normalizing")
         reshaped_train_set_x = self.reshape_numpy_array(train_set_x)
         scaler = StandardScaler()
         scaler.fit(reshaped_train_set_x)
         normalized_reshaped_train_x = scaler.transform(reshaped_train_set_x)
         normalized_train_set_x = np.reshape(normalized_reshaped_train_x,
                                             (train_set_x.shape[0],
                                              train_set_x.shape[1],
                                              train_set_x.shape[2],
                                              train_set_x.shape[3]))
     dev_set_x, dev_set_y = self.get_x_and_y(dev_set, index)
     if self.norm == z_norm_literal:
         logging.info("Z-Normalizing")
         reshaped_dev_x = self.reshape_numpy_array(dev_set_x)
         normalized_reshaped_dev_x = scaler.transform(reshaped_dev_x)
         normalized_dev_x = np.reshape(normalized_reshaped_dev_x,
                                       (dev_set_x.shape[0],
                                        dev_set_x.shape[1],
                                        dev_set_x.shape[2],
                                        dev_set_x.shape[3]))
     test_set_x, test_set_y = self.get_x_and_y(test_set, index,
                                               test_flag=True)
     if self.norm == z_norm_literal:
         logging.info("Z-Normalizing")
         reshaped_test_x = self.reshape_numpy_array(test_set_x)
         normalized_reshaped_test_x = scaler.transform(reshaped_test_x)
         normalized_test_x = np.reshape(normalized_reshaped_test_x,
                                        (test_set_x.shape[0],
                                         test_set_x.shape[1],
                                         test_set_x.shape[2],
                                         test_set_x.shape[3]))
         return normalized_train_set_x, train_set_y, normalized_dev_x, dev_set_y, normalized_test_x, test_set_y
     else:
         return train_set_x, train_set_y, dev_set_x, dev_set_y, test_set_x, test_set_y
Example #12
class StandardScalerImpl():
    def __init__(self, copy=True, with_mean=True, with_std=True):
        self._hyperparams = {
            'copy': copy,
            'with_mean': with_mean,
            'with_std': with_std
        }
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if (y is not None):
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
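
A minimal usage sketch of the wrapper above, under the assumption that Op is an alias for sklearn.preprocessing.StandardScaler (which the copy / with_mean / with_std hyperparameters suggest):

import numpy as np

X = np.array([[0.0, 10.0], [1.0, 20.0], [2.0, 30.0]])
impl = StandardScalerImpl(with_mean=True, with_std=True)
X_std = impl.fit(X).transform(X)   # each column now has zero mean and unit variance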
Example #13
    def test_simple_poly_dataset_scaled_cv(self):
        model = Model.create_model(
            model_type=Model.MODEL_TYPE_SVR,
            cross_validation=True,
            feature_scaling=True,
            C_range=[0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10],
            kernel=Model.KERNEL_RBF
        )
        train_dataset, test_dataset = test_datasets.get_simple_polynomial_datasets(n=1000)

        scaler = StandardScaler()
        scaler.fit(train_dataset.data)
        print("Train mean: " + str(scaler.transform(train_dataset.data).mean(axis=0)))
        print("Test mean: " + str( scaler.transform(test_dataset.data).mean(axis=0)))
        print("Train std: " + str(scaler.transform(train_dataset.data).std(axis=0)))
        print("Test str: " + str( scaler.transform(test_dataset.data).std(axis=0)))

        self._test_dataset(model, train_dataset, test_dataset, 0, title="SVR with RBF kernel, scaled CV on poly dataset")
Example #14
    def _proccess_input(self, target_pos, target_speed, pos, vel, effort):
        x_target = np.array(target_pos)
        x_first = np.array([pos_[0] for pos_ in pos])
        x_speed = np.array(target_speed).reshape((-1, 1))
        aux_output = [np.ones(eff.shape[0]).reshape((-1, 1)) for eff in effort]

        x = np.concatenate((x_target, x_first, x_speed), axis=1)

        input_scaler = StandardScaler()
        x = input_scaler.fit_transform(x)
        output_scaler = StandardScaler()
        effort_concat = np.concatenate([a for a in effort], axis=0)
        output_scaler.fit(effort_concat)
        effort = [output_scaler.transform(eff) for eff in effort]

        y = pad_sequences(effort, padding='post', value=0.)
        aux_output = pad_sequences(aux_output, padding='post', value=0.)
        x, x_test, y, y_test, y_aux, y_aux_test = train_test_split(x, y, aux_output, test_size=0.2)
        return x, x_test, y, y_test, y_aux, y_aux_test
Example #15
class CreateStandardScaler(CreateModel):
    def fit(self, data, args):
        self.model = StandardScaler()

        with Timer() as t:
            self.model.fit(data.X_train, data.y_train)

        return t.interval

    def test(self, data):
        assert self.model is not None

        return self.model.transform(data.X_test)

    def predict(self, data):
        with Timer() as t:
            self.predictions = self.test(data)

        data.learning_task = LearningTask.REGRESSION
        return t.interval
Example #16
def neural_net_2(train, test, val, train_out, test_out, val_out, BigSigma_inv):
    clf = MLPClassifier(solver='sgd',
                        alpha=1e-5,
                        hidden_layer_sizes=(100, 1),
                        activation='logistic',
                        batch_size=BATCH_HUMAN,
                        shuffle=True,
                        max_iter=5000)

    scaler = StandardScaler()
    scaler.fit(train)
    train1 = scaler.transform(train)
    # apply same transformation to test and validation data
    test = scaler.transform(test)
    val = scaler.transform(val)
    train_out = train_out.astype(float)
    clf.fit(X=train1, y=train_out)
    predict_test = clf.predict(test)
    predict_val = clf.predict(val)
    print("TEST ERMS ACCURACY", mean_squared_error(test_out, predict_test),
          acc_manual(test_out, predict_test))
    print("VAL ERMS ACCURACY", mean_squared_error(val_out, predict_val),
          acc_manual(val_out, predict_val))
Example #17
def test_center_kernel():
    """Test that KernelCenterer is equivalent to StandardScaler
       in feature space"""
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    scaler = StandardScaler(with_std=False)
    scaler.fit(X_fit)
    X_fit_centered = scaler.transform(X_fit)
    K_fit = np.dot(X_fit, X_fit.T)

    # center fit time matrix
    centerer = KernelCenterer()
    K_fit_centered = np.dot(X_fit_centered, X_fit_centered.T)
    K_fit_centered2 = centerer.fit_transform(K_fit)
    assert_array_almost_equal(K_fit_centered, K_fit_centered2)

    # center predict time matrix
    X_pred = rng.random_sample((2, 4))
    K_pred = np.dot(X_pred, X_fit.T)
    X_pred_centered = scaler.transform(X_pred)
    K_pred_centered = np.dot(X_pred_centered, X_fit_centered.T)
    K_pred_centered2 = centerer.transform(K_pred)
    assert_array_almost_equal(K_pred_centered, K_pred_centered2)
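
The equivalence this test relies on is the double-centering identity for the fit-time Gram matrix: with X_c = X_fit - mean(X_fit), one has X_c X_c^T = K - 1_n K - K 1_n + 1_n K 1_n, where K = X_fit X_fit^T and 1_n is the n x n matrix filled with 1/n. A small standalone numpy check of that identity:

import numpy as np

rng = np.random.RandomState(0)
X = rng.random_sample((5, 4))
K = X @ X.T
n = K.shape[0]
one_n = np.full((n, n), 1.0 / n)
K_centered = K - one_n @ K - K @ one_n + one_n @ K @ one_n   # double-centering of K
X_c = X - X.mean(axis=0)
assert np.allclose(K_centered, X_c @ X_c.T)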
Example #18
def test_center_kernel():
    """Test that KernelCenterer is equivalent to StandardScaler
       in feature space"""
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    scaler = StandardScaler(with_std=False)
    scaler.fit(X_fit)
    X_fit_centered = scaler.transform(X_fit)
    K_fit = np.dot(X_fit, X_fit.T)

    # center fit time matrix
    centerer = KernelCenterer()
    K_fit_centered = np.dot(X_fit_centered, X_fit_centered.T)
    K_fit_centered2 = centerer.fit_transform(K_fit)
    assert_array_almost_equal(K_fit_centered, K_fit_centered2)

    # center predict time matrix
    X_pred = rng.random_sample((2, 4))
    K_pred = np.dot(X_pred, X_fit.T)
    X_pred_centered = scaler.transform(X_pred)
    K_pred_centered = np.dot(X_pred_centered, X_fit_centered.T)
    K_pred_centered2 = centerer.transform(K_pred)
    assert_array_almost_equal(K_pred_centered, K_pred_centered2)
Example #19
File: ml.py Project: nvhuy/LM
def train_test(feature_file, test_file, removed_columns=None):
    """
    Load features into file
    :param feature_file: feature file
    :param test_file: test file
    :param removed_columns: index of feature columns to remove
    """
    module_logger.info('------ Train/test model ::: {} ::: {}'.format(feature_file, test_file))

    clf = svm_clf()

    fs = numpy.loadtxt(feature_file, delimiter='\t', skiprows=1)
    _, n = fs.shape
    X_train = fs[:, 1:n - 1]
    y_train = fs[:, n - 1]

    fs = numpy.loadtxt(test_file, delimiter='\t', skiprows=1)
    _, n = fs.shape
    X_test = fs[:, 1:n - 1]
    y_test = fs[:, n - 1]
    y_i = fs[:, 0]

    if removed_columns is not None and len(removed_columns) > 0:
        X_test = numpy.delete(X_test, removed_columns, 1)
        X_train = numpy.delete(X_train, removed_columns, 1)
    module_logger.info('------ data dimension ::: {} ::: {} ::: {}'.format(X_train.shape, X_test.shape, n))

    std_scaler = StandardScaler()
    std_scaler.fit(X_train)
    X_train_scaled = std_scaler.transform(X_train, copy=True)
    X_test_scaled = std_scaler.transform(X_test, copy=True)

    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)
    y_logp = clf.predict_proba(X_test_scaled)

    return write_prediction_output(y_i, y_test, y_pred, test_file.replace('.csv', '_pred.csv'), y_logp)
Example #20
 def preprocess(self):
     #         X_train, X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.3, random_state=0)
     sc = StandardScaler()
     sc.fit(self.X_train)
     self.X_train_std = sc.transform(self.X_train)
     self.X_test_std = sc.transform(self.X_test)
Example #21
def train(args):
    timestr = time.strftime("%Y%m%d-%H%M%S-")
    output_dir = args.out_dir_path + '/' + time.strftime("%m%d")
    mkdir(output_dir)
    setLogger(timestr, out_dir=output_dir)
    print_args(args)

    if args.load_input_pkl == '':
        # process train and test data
        logger.info('Loading training file...')
        _, train_question1, train_question2, train_y = get_pdTable(
            args.train_path)
        # train_question1, train_question2, train_y = csv_processing(args.train_path)
        logger.info('Train csv: %d line loaded ' % len(train_question1))
        logger.info('Loading test file...')
        if args.predict_test:
            test_ids, test_question1, test_question2 = get_pdTable(
                args.test_path, notag=True)
        else:
            test_ids, test_question1, test_question2, test_y = get_pdTable(
                args.test_path)
        # test_question1, test_question2, test_ids = csv_processing(args.test_path, test=True)
        logger.info('Test csv: %d line loaded ' % len(test_question1))

        logger.info('Text cleaning... ')
        train_question1, train_maxLen1 = text_cleaner(train_question1)
        train_question2, train_maxLen2 = text_cleaner(train_question2)
        test_question1, test_maxLen1 = text_cleaner(test_question1)
        test_question2, test_maxLen2 = text_cleaner(test_question2)
        # 	train_question1, train_maxLen1 = tokenizeIt(train_question1, clean=args.rawMaterial)
        # 	train_question2, train_maxLen2 = tokenizeIt(train_question2, clean=args.rawMaterial)
        # 	test_question1, test_maxLen1 = tokenizeIt(test_question1, clean=args.rawMaterial)
        # 	test_question2, test_maxLen2 = tokenizeIt(test_question2, clean=args.rawMaterial)
        inputLength = max(train_maxLen1, train_maxLen2, test_maxLen1,
                          test_maxLen2)
        logger.info('Max input length: %d ' % inputLength)
        inputLength = 30
        logger.info('Reset max length to 30')

        tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
        tokenizer.fit_on_texts(train_question1 + train_question2 +
                               test_question1 + test_question2)

        sequences_1 = tokenizer.texts_to_sequences(train_question1)
        sequences_2 = tokenizer.texts_to_sequences(train_question2)
        test_sequences_1 = tokenizer.texts_to_sequences(test_question1)
        test_sequences_2 = tokenizer.texts_to_sequences(test_question2)
        del train_question1, train_question2, test_question1, test_question2

        word_index = tokenizer.word_index
        logger.info('Found %s unique tokens' % len(word_index))

        train_x1 = pad_sequences(sequences_1, maxlen=inputLength)
        train_x2 = pad_sequences(sequences_2, maxlen=inputLength)
        train_y = array(train_y)
        logger.info('Shape of data tensor: (%d, %d)' % train_x1.shape)
        logger.info('Shape of label tensor: (%d, )' % train_y.shape)

        test_x1 = pad_sequences(test_sequences_1, maxlen=inputLength)
        test_x2 = pad_sequences(test_sequences_2, maxlen=inputLength)
        test_ids = array(test_ids)
        if not args.predict_test:
            test_y = array(test_y)

        del sequences_1, sequences_2, test_sequences_1, test_sequences_2
        if args.save_model:
            with open(output_dir + '/' + timestr + 'input_train_test.pkl',
                      'wb') as input_file:
                logger.info('Dumping processed input to pickle...')
                pkl.dump((train_x1, train_x2, train_y, test_x1, test_x2,
                          test_ids, tokenizer), input_file)
    else:
        with open(args.load_input_pkl, 'rb') as input_file:
            train_x1, train_x2, train_y, test_x1, test_x2, test_ids, tokenizer = pkl.load(
                input_file)
            logger.info('Shape of data tensor: (%d, %d)' % train_x1.shape)
            logger.info('Shape of label tensor: (%d, )' % train_y.shape)
            word_index = tokenizer.word_index
        inputLength = 30
        logger.info('Reset max length to 30')

    if args.w2v or args.ft_src:
        if args.w2v.endswith('.pkl'):
            with open(args.w2v, 'rb') as embd_file:
                logger.info('Loading word embedding from pickle...')
                embdw2v, vocabReverseDict = pkl.load(embd_file)
                if not len(vocabReverseDict) == len(word_index):
                    logger.info('WARNING: reversed dict len incorrect %d , but word dict len %d ' % \
                            (len(vocabReverseDict), len(word_index)))
        else:
            logger.info('Loading word embedding from text file...')
            embdw2v, vocabReverseDict = embdReader(
                args.w2v,
                args.embd_dim,
                word_index,
                MAX_NB_WORDS,
                fasttext_source=args.ft_src,
                ft_dim=args.ft_dim,
                skip_header=args.skip_header,
                initializer=args.embd_init)
            if args.save_model:
                with open(
                        output_dir + '/' + timestr + 'embd_dump.' +
                        str(args.embd_dim + args.ft_dim) + 'd.pkl',
                        'wb') as embd_file:
                    logger.info('Dumping word embedding to pickle...')
                    pkl.dump((embdw2v, vocabReverseDict), embd_file)
    else:
        embdw2v = None

# 	if args.load_vocab_from_file:
# 		with open(args.load_vocab_from_file, 'rb') as vocab_file:
# 			(vocabDict, vocabReverseDict) = pkl.load(vocab_file)
# 			unk = None
# 			if args.w2v:
# 				if args.w2v.endswith('.pkl'):
# 					with open(args.w2v, 'rb') as embd_file:
# 						embdw2v = pkl.load(embd_file)
# 				else:
# 					from util.data_processing import w2vEmbdReader
# 					embdw2v = w2vEmbdReader(args.w2v, vocabReverseDict, args.embd_dim)
# 					with open(output_dir + '/'+ timestr + 'embd_dump.' + str(args.embd_dim) + 'd.pkl', 'wb') as embd_file:
# 						pkl.dump(embdw2v, embd_file)
# 			else:
# 				embdw2v = None
# 	else:
# 		from util.data_processing import createVocab
# 		vocabDict, vocabReverseDict = createVocab([train_question1, train_question2, test_question1, test_question2],
# 												min_count=3, reservedList=['<pad>', '<unk>'])
# 		embdw2v = None
# 		unk = '<unk>'
## 	logger.info(vocabDict)

# 	# word to padded numerical np array
# 	from util.data_processing import word2num
# 	train_x1 = word2num(train_question1, vocabDict, unk, inputLength, padding='pre')
# 	train_x2 = word2num(train_question2, vocabDict, unk, inputLength, padding='pre')
# 	test_x1 = word2num(test_question1, vocabDict, unk, inputLength, padding='pre')
# 	test_x2 = word2num(test_question2, vocabDict, unk, inputLength, padding='pre')

# Loading train features
    if not args.train_feature_path == '':
        logger.info('Loading train features from file %s ' %
                    args.train_feature_path)
        df_train = read_csv(args.train_feature_path, encoding="ISO-8859-1")
        if not args.feature_list == '':
            feature_list = args.feature_list.split(',')
            train_features = DataFrame()
            for feature_name in feature_list:
                train_features[feature_name.strip()] = df_train[
                    feature_name.strip()]
        elif args.fidx_end == 0:
            train_features = df_train.iloc[:, args.fidx_start:]
        else:
            train_features = df_train.iloc[:, args.fidx_start:args.fidx_end]

        if not args.train_bowl_feature_path == '':
            logger.info('Loading train 1bowl features from file %s ' %
                        args.train_bowl_feature_path)
            df_train = read_csv(args.train_bowl_feature_path,
                                encoding="ISO-8859-1")
            if not args.bowl_feat_list == '':
                bowl_feat_list = args.bowl_feat_list.split(',')
                for feature_name in bowl_feat_list:
                    train_features[feature_name.strip()] = df_train[
                        feature_name.strip()]
            else:
                for feature_name in df_train.columns:
                    if feature_name.startswith('z_'):
                        train_features[feature_name] = df_train[feature_name]

        logger.info('Final train feature list: \n %s ' %
                    ','.join(list(train_features.columns.values)))
        feature_length = len(train_features.columns)
        train_features = train_features.replace([inf, -inf, nan], 0)
        train_features = array(train_features)
        logger.info('Loaded train feature shape: (%d, %d) ' %
                    train_features.shape)
        del df_train

        # Loading test features
        logger.info('Loading test features from file %s ' %
                    args.test_feature_path)
        df_test = read_csv(args.test_feature_path, encoding="ISO-8859-1")
        if not args.feature_list == '':
            feature_list = args.feature_list.split(',')
            test_features = DataFrame()
            for feature_name in feature_list:
                test_features[feature_name.strip()] = df_test[
                    feature_name.strip()]
        elif args.fidx_end == 0:
            test_features = df_test.iloc[:, args.fidx_start:]
        else:
            test_features = df_test.iloc[:, args.fidx_start:args.fidx_end]

        if not args.test_bowl_feature_path == '':
            logger.info('Loading test 1bowl features from file %s ' %
                        args.test_bowl_feature_path)
            df_test = read_csv(args.test_bowl_feature_path,
                               encoding="ISO-8859-1")
            if not args.bowl_feat_list == '':
                bowl_feat_list = args.bowl_feat_list.split(',')
                for feature_name in bowl_feat_list:
                    test_features[feature_name.strip()] = df_test[
                        feature_name.strip()]
            else:
                for feature_name in df_test.columns:
                    if feature_name.startswith('z_'):
                        test_features[feature_name] = df_test[feature_name]

        test_features = test_features.replace([inf, -inf, nan], 0)
        test_features = array(test_features)
        logger.info('Loaded test feature shape: (%d, %d) ' %
                    test_features.shape)
        del df_test

        # Normalize Data
        ss = StandardScaler()
        ss.fit(vstack((train_features, test_features)))
        train_features = ss.transform(train_features)
        test_features = ss.transform(test_features)
        del ss
        logger.info('Features normalized ')

    train_x1_aug = vstack((train_x1, train_x2))
    train_x2_aug = vstack((train_x2, train_x1))
    train_y = concatenate((train_y, train_y))
    train_x = [train_x1_aug, train_x2_aug]
    test_x = [test_x1, test_x2]
    if not args.train_feature_path == '':
        train_features = vstack((train_features, train_features))
        train_x += [train_features]
        test_x += [test_features]

    # ########################################
    # ## sample train/validation data
    # ########################################
    # # np.random.seed(1234)
    # perm = random.permutation(len(train_x1))
    # idx_train = perm[:int(len(train_x1) * (1 - args.valid_split))]
    # idx_val = perm[int(len(train_x1) * (1 - args.valid_split)):]
    #
    # data_1_train = vstack((train_x1[idx_train], train_x2[idx_train]))
    # data_2_train = vstack((train_x2[idx_train], train_x1[idx_train]))
    # leaks_train = vstack((train_features[idx_train], train_features[idx_train]))
    # labels_train = concatenate((train_y[idx_train], train_y[idx_train]))
    #
    # data_1_val = vstack((train_x1[idx_val], train_x2[idx_val]))
    # data_2_val = vstack((train_x2[idx_val], train_x1[idx_val]))
    # leaks_val = vstack((train_features[idx_val], train_features[idx_val]))
    # labels_val = concatenate((train_y[idx_val], train_y[idx_val]))

    # re_weight = True  # whether to re-weight classes to fit the 17.5% share in test set
    # weight_val = ones(len(labels_val))
    # if re_weight:
    # 	weight_val *= 0.472001959
    # 	weight_val[labels_val == 0] = 1.309028344

    ########################################
    ## add class weight
    ########################################
    if args.re_weight:
        class_weight = {0: 1.309028344, 1: 0.472001959}
    else:
        class_weight = None

    # 	# Dump vocab


# 	if not args.load_vocab_from_file:
# 		with open(output_dir + '/'+ timestr + 'vocab.pkl', 'wb') as vocab_file:
# 			pkl.dump((vocabDict, vocabReverseDict), vocab_file)

    if args.load_model_json:
        with open(args.load_model_json, 'r') as json_file:
            rnnmodel = model_from_json(json_file.read(),
                                       custom_objects={
                                           "DenseWithMasking":
                                           DenseWithMasking,
                                           "Conv1DWithMasking":
                                           Conv1DWithMasking,
                                           "MaxOverTime": MaxOverTime,
                                           "MeanOverTime": MeanOverTime
                                       })
        logger.info('Loaded model from saved json')
    else:
        if args.train_feature_path == '':
            rnnmodel = getModel(args,
                                inputLength,
                                len(word_index) + 1,
                                embd=embdw2v)
        else:
            rnnmodel = getModel(args,
                                inputLength,
                                len(word_index) + 1,
                                embd=embdw2v,
                                feature_length=feature_length)

    if args.load_model_weights:
        rnnmodel.load_weights(args.load_model_weights)
        logger.info('Loaded model from saved weights')

    if args.optimizer == 'rmsprop':
        optimizer = RMSprop(lr=args.learning_rate)
    else:
        optimizer = args.optimizer

    myMetrics = 'acc'  # 'binary_accuracy' # 'mse'
    rnnmodel.compile(loss=args.loss, optimizer=optimizer, metrics=[myMetrics])
    rnnmodel.summary()

    ## Plotting model
    logger.info('Plotting model architecture')
    plot_model(rnnmodel, to_file=output_dir + '/' + timestr + 'model_plot.png')
    logger.info('  Done')

    if args.save_model:
        ## Save model architecture
        logger.info('Saving model architecture')
        with open(output_dir + '/' + timestr + 'model_config.json',
                  'w') as arch:
            arch.write(rnnmodel.to_json(indent=2))
        logger.info('  Done')

    # train and test model
    myCallbacks = []
    train_logger = TrainLogger()
    myCallbacks.append(train_logger)
    if args.eval_on_epoch:
        from util.model_eval import Evaluator
        evl = Evaluator(args, output_dir, timestr, myMetrics, test_x, test_y,
                        vocabReverseDict)
        myCallbacks.append(evl)
    if args.save_model:
        bst_model_path = output_dir + '/' + timestr + 'best_model_weights.h5'
        model_checkpoint = ModelCheckpoint(bst_model_path,
                                           save_best_only=True,
                                           save_weights_only=True,
                                           verbose=1)
        myCallbacks.append(model_checkpoint)
    if args.plot:
        if not args.eval_on_epoch:
            plot_pic = PlotPic(args, output_dir, timestr, myMetrics)
            myCallbacks.append(plot_pic)
    if args.earlystop:
        earlystop = EarlyStopping(patience=args.earlystop,
                                  verbose=1,
                                  mode='auto')
        myCallbacks.append(earlystop)

    rnnmodel.fit(train_x,
                 train_y,
                 validation_split=args.valid_split,
                 batch_size=args.train_batch_size,
                 epochs=args.epochs,
                 class_weight=class_weight,
                 callbacks=myCallbacks)
    # rnnmodel.fit([data_1_train, data_2_train, leaks_train], labels_train,
    # 				 validation_data=([data_1_val, data_2_val, leaks_val], labels_val, weight_val),
    # 				 epochs=args.epochs, batch_size=args.train_batch_size, shuffle=True,
    # 				 class_weight=class_weight, callbacks=myCallbacks)

    if args.predict_test:
        logger.info("Tuning model to best record...")
        rnnmodel.load_weights(bst_model_path)
        logger.info("Predicting test file result...")
        preds = rnnmodel.predict(test_x,
                                 batch_size=args.eval_batch_size,
                                 verbose=1)
        preds = squeeze(preds)
        logger.info('Write predictions into file... Total line: %d ' %
                    len(preds))
        with open(output_dir + '/' + timestr + 'predict.csv',
                  'w',
                  encoding='utf8') as fwrt:
            writer_sub = csv.writer(fwrt)
            writer_sub.writerow(['test_id', 'is_duplicate'])
            idx = 0
            for itm in tqdm(preds):
                writer_sub.writerow([idx, itm])
                idx += 1
    elif not args.eval_on_epoch:
        logger.info("Evaluating test set...")
        tloss, tacc = rnnmodel.evaluate(test_x,
                                        test_y,
                                        batch_size=args.eval_batch_size,
                                        verbose=1)
        logger.info("Test loss: %.4f   Test Accuracy: %.2f%%" %
                    (tloss, 100 * tacc))
Example #22
def inference(args):

    timestr = time.strftime("%Y%m%d-%H%M%S-")
    output_dir = args.out_dir_path + '/' + time.strftime("%m%d")
    mkdir(output_dir)
    setLogger(timestr, out_dir=output_dir)
    print_args(args)

    if args.load_input_pkl == '':
        raise NotImplementedError(
            'only support loading testing materials from pickle')
    else:
        with open(args.load_input_pkl, 'rb') as input_file:
            train_x1, train_x2, train_y, test_x1, test_x2, test_ids, tokenizer = pkl.load(
                input_file)
            logger.info('Shape of test data tensor: (%d, %d)' % test_x1.shape)
            word_index = tokenizer.word_index
            logger.info('Loaded %s unique tokens' % len(word_index))

    if not args.test_path == '':
        if args.predict_test:
            test_ids, test_question1, test_question2 = get_pdTable(
                args.test_path, notag=True)
        else:
            test_ids, test_question1, test_question2, test_y = get_pdTable(
                args.test_path)
        test_question1, test_maxLen1 = text_cleaner(test_question1)
        test_question2, test_maxLen2 = text_cleaner(test_question2)
        inputLength = max(test_maxLen1, test_maxLen2)
        logger.info('Max input length: %d ' % inputLength)
        inputLength = 30
        logger.info('Reset max length to 30')
        test_sequences_1 = tokenizer.texts_to_sequences(test_question1)
        test_sequences_2 = tokenizer.texts_to_sequences(test_question2)
        test_x1 = pad_sequences(test_sequences_1, maxlen=inputLength)
        test_x2 = pad_sequences(test_sequences_2, maxlen=inputLength)
        test_ids = array(test_ids)
        if not args.predict_test:
            test_y = array(test_y)

    # Loading train features
    if not args.train_feature_path == '':
        logger.info('Loading train features from file %s ' %
                    args.train_feature_path)
        df_train = read_csv(args.train_feature_path, encoding="ISO-8859-1")
        if not args.feature_list == '':
            feature_list = args.feature_list.split(',')
            train_features = DataFrame()
            for feature_name in feature_list:
                train_features[feature_name.strip()] = df_train[
                    feature_name.strip()]
        elif args.fidx_end == 0:
            train_features = df_train.iloc[:, args.fidx_start:]
        else:
            train_features = df_train.iloc[:, args.fidx_start:args.fidx_end]

        if not args.train_bowl_feature_path == '':
            logger.info('Loading train 1bowl features from file %s ' %
                        args.train_bowl_feature_path)
            df_train = read_csv(args.train_bowl_feature_path,
                                encoding="ISO-8859-1")
            if not args.bowl_feat_list == '':
                bowl_feat_list = args.bowl_feat_list.split(',')
                for feature_name in bowl_feat_list:
                    train_features[feature_name.strip()] = df_train[
                        feature_name.strip()]
            else:
                for feature_name in df_train.columns:
                    if feature_name.startswith('z_'):
                        train_features[feature_name] = df_train[feature_name]

        logger.info('Final train feature list: \n %s ' %
                    ','.join(list(train_features.columns.values)))
        feature_length = len(train_features.columns)
        train_features = train_features.replace([inf, -inf, nan], 0)
        train_features = array(train_features)
        logger.info('Loaded train feature shape: (%d, %d) ' %
                    train_features.shape)
        del df_train

        # Loading test features
        logger.info('Loading test features from file %s ' %
                    args.test_feature_path)
        df_test = read_csv(args.test_feature_path, encoding="ISO-8859-1")
        if not args.feature_list == '':
            feature_list = args.feature_list.split(',')
            test_features = DataFrame()
            for feature_name in feature_list:
                test_features[feature_name.strip()] = df_test[
                    feature_name.strip()]
        elif args.fidx_end == 0:
            test_features = df_test.iloc[:, args.fidx_start:]
        else:
            test_features = df_test.iloc[:, args.fidx_start:args.fidx_end]

        if not args.test_bowl_feature_path == '':
            logger.info('Loading test 1bowl features from file %s ' %
                        args.test_bowl_feature_path)
            df_test = read_csv(args.test_bowl_feature_path,
                               encoding="ISO-8859-1")
            if not args.bowl_feat_list == '':
                bowl_feat_list = args.bowl_feat_list.split(',')
                for feature_name in bowl_feat_list:
                    test_features[feature_name.strip()] = df_test[
                        feature_name.strip()]
            else:
                for feature_name in df_test.columns:
                    if feature_name.startswith('z_'):
                        test_features[feature_name] = df_test[feature_name]

        test_features = test_features.replace([inf, -inf, nan], 0)
        test_features = array(test_features)
        logger.info('Loaded test feature shape: (%d, %d) ' %
                    test_features.shape)
        del df_test

        # Normalize Data
        ss = StandardScaler()
        ss.fit(vstack((train_features, test_features)))
        # train_features = ss.transform(train_features)
        test_features = ss.transform(test_features)
        del ss
        logger.info('Test Features normalized ')

    test_x = [test_x1, test_x2]
    if not args.test_feature_path == '':
        test_x += [test_features]

    if args.load_model_json:
        with open(args.load_model_json, 'r') as json_file:
            rnnmodel = model_from_json(json_file.read(),
                                       custom_objects={
                                           "DenseWithMasking":
                                           DenseWithMasking,
                                           "Conv1DWithMasking":
                                           Conv1DWithMasking,
                                           "MaxOverTime": MaxOverTime,
                                           "MeanOverTime": MeanOverTime
                                       })
        logger.info('Loaded model from saved json')

    if args.load_model_weights:
        logger.info('Loading model from saved weights')
        rnnmodel.load_weights(args.load_model_weights)

    if args.predict_test:
        logger.info("Predicting test file result...")
        preds = rnnmodel.predict(test_x,
                                 batch_size=args.eval_batch_size,
                                 verbose=1)
        preds = squeeze(preds)
        logger.info('Write predictions into file... Total line: %d ' %
                    len(preds))
        with open(output_dir + '/' + timestr + 'predict.csv',
                  'w',
                  encoding='utf8') as fwrt:
            writer_sub = csv.writer(fwrt)
            writer_sub.writerow(['test_id', 'is_duplicate'])
            idx = 0
            for itm in tqdm(preds):
                writer_sub.writerow([idx, itm])
                idx += 1
        logger.info('Predicted results written to file: %s' %
                    (output_dir + '/' + timestr + 'predict.csv'))
    else:
        if args.optimizer == 'rmsprop':
            optimizer = RMSprop(lr=args.learning_rate)
        else:
            optimizer = args.optimizer

        myMetrics = 'acc'  # 'binary_accuracy' # 'mse'
        rnnmodel.compile(loss=args.loss,
                         optimizer=optimizer,
                         metrics=[myMetrics])
        rnnmodel.summary()

        logger.info("Evaluating test set...")
        tloss, tacc = rnnmodel.evaluate(test_x,
                                        test_y,
                                        batch_size=args.eval_batch_size,
                                        verbose=1)
        logger.info("Test loss: %.4f   Test Accuracy: %.2f%%" %
                    (tloss, 100 * tacc))
Example #23
                      checkpoint_dir=checkpoint_dir,
                      loss=loss_function)
 predicted_values = []
 real_values = []
 for student in students_gender_train:
     train_students = students_gender_train - set([student])
     print(train_students)
     test_student = set([student])
     print(test_student)
     train_x, train_y = dataset_loader.get_x_and_y(
         students_set=train_students, index=index, test_flag=False)
     test_x, test_y = dataset_loader.get_x_and_y(
         students_set=test_student, index=index, test_flag=True)
     reshaped_train_set_x = dataset_loader.reshape_numpy_array(train_x)
     scaler = StandardScaler()
     scaler.fit(reshaped_train_set_x)
     normalized_reshaped_train_x = scaler.transform(
         reshaped_train_set_x)
     normalized_train_set_x = np.reshape(
         normalized_reshaped_train_x,
         (train_x.shape[0], train_x.shape[1], train_x.shape[2],
          train_x.shape[3]))
     reshaped_test_x = dataset_loader.reshape_numpy_array(test_x)
     normalized_reshaped_test_x = scaler.transform(reshaped_test_x)
     normalized_test_x = np.reshape(normalized_reshaped_test_x,
                                    (test_x.shape[0], test_x.shape[1],
                                     test_x.shape[2], test_x.shape[3]))
     predicted_values.extend(
         cnn_classifier.train(normalized_train_set_x,
                              train_y,
                              normalized_test_x,
Example #24
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from mpl_toolkits.mplot3d import Axes3D

irisdata = load_iris()
iris_X = irisdata.data
iris_y = irisdata.target
scale = StandardScaler()
scale.fit(iris_X)
iris_x = scale.transform(iris_X)
pca = PCA(n_components=3)
iris_x = pca.fit_transform(iris_x)
fig = plt.figure()
ax = fig.add_subplot(111)
# ax.scatter(iris_x[:, 0], iris_x[:, 1], iris_x[:, 2], marker='o', c=iris_y)
x_tran, x_test, y_tran, y_test = train_test_split(iris_x,
                                                  iris_y,
                                                  test_size=0.3,
                                                  random_state=42)
result = {}
test_number = len(y_test)
for i in range(1, 11, 1):
    clf = Lasso(alpha=i / 10).fit(x_tran, y_tran)
    y_pre = clf.predict(x_test)
    result[i / 10] = sum(m < 0.5 for m in abs(y_test - y_pre)) / test_number
print(result)
ax.plot(list(result.keys()), list(result.values()))
Example #25
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import mglearn

cancer = load_breast_cancer()

scaler = StandardScaler()
scaler.fit(cancer.data)
X_scaled = scaler.transform(cancer.data)

pca = PCA(n_components=2)
pca.fit(X_scaled)
X_pca = pca.transform(X_scaled)

print("original {}, reduction {}".format(X_scaled.shape, X_pca.shape))

plt.figure(figsize=(8,8))
mglearn.discrete_scatter(X_pca[:,0], X_pca[:,1], cancer.target)
plt.legend(["malignancy(cancer)", "benign"], loc="best")
plt.gca().set_aspect("equal")
plt.xlabel("1st principal component")
plt.ylabel("2nd principal component")
plt.draw()

print("PCA PC shape:{}".format(pca.components_.shape))
print("PCA PC {}".format(pca.components_))
plt.matshow(pca.components_, cmap='viridis')
plt.yticks([0,1], ["first principal component", "second principal component"])
plt.colorbar()
Example #26
class DataTransformer:
    """DataTransformer transforms CRN traces into training examples with optional scaling."""
    def __init__(
        self,
        dataset_address,
        with_timestamps=True,
        nb_randomized_params=0,
    ):
        """
        Initialize transformer.

        Parameters
        ----------
        dataset_address : filepath to the dataset containing CRN traces.
            Data in the file should be of shape [nb_traces, nb_steps, nb_features].
            If with_timestamps is True, the first feature is considered as time.
        with_timestamps : boolean, whether time is included in data (as the very first feature).
            Data produced by scripts/simulate_data_gillespy.py has time, therefore the default value is True.
        """
        self.nb_trajectories = None
        self.nb_timesteps = None
        self.nb_features = None
        self.labels = None
        self.with_labels = False
        self.with_timestamps = with_timestamps
        self.nb_randomized_params = nb_randomized_params

        self._scaler = None
        self.scaler_is_fitted = False
        self.scaler_positivity = None
        self.dtype = np.float32

        self.read_data(dataset_address)

    @property
    def scaler(self):
        return self._scaler

    def read_data(self, dataset_address):
        """Read data and memorize shape."""
        with open(dataset_address, 'rb') as data_file:
            self.data = np.asarray(np.load(data_file), dtype=self.dtype)
            self._memorize_dataset_shape()

    def _memorize_dataset_shape(self):
        """Memorize data shape."""
        if self.data.ndim != 3:
            raise ShapeError(f"The dataset is not properly formatted.\n"
                             f"We expect the following shape: "
                             f"(nb_trajectories, nb_timesteps, nb_features),\n"
                             f"got: {self.data.shape}")
        self.nb_trajectories, self.nb_timesteps, self.nb_features = self.data.shape

    def set_labels(self, labels):
        """
        Set labels for species.

        Parameters
        ----------
        labels : list of species names. Length of the list and the order of names
            should coincide with the species presented in data (excluding `time`).

        """
        if labels is None:
            self.labels = None
            self.with_labels = False
        else:
            if self.with_timestamps:
                labels = ['timestamp'] + labels

            if len(labels) != self.nb_features:
                raise ShapeError(
                    f"There needs to be exactly one label for each feature.\n"
                    f"We have {len(labels)} labels for {self.nb_features} features."
                )
            self.labels = bidict(zip(range(len(labels)), labels))
            self.with_labels = True

    def drop_timestamps(self):
        """Drop time from data."""
        if self.with_timestamps is True:

            self.data = self.data[..., 1:]
            self.nb_features = self.nb_features - 1
            self.with_timestamps = False
            self._memorize_dataset_shape()

            if self.with_labels is True:
                self.labels.inv.pop('timestamp')
                self.labels = bidict(
                    zip([k - 1 for k in self.labels.keys()],
                        self.labels.values()))

    def _create_scaler(self, positivity):
        self.scaler_positivity = positivity
        if positivity is True:
            eps = 1e-9
            self._scaler = MinMaxScaler(feature_range=(eps, 1))
        else:
            self._scaler = StandardScaler()
        self.scaler_is_fitted = False

    def _fit_scaler(self, positivity=False, slice_size=None):
        if (self._scaler is None) or (self.scaler_positivity != positivity):
            self._create_scaler(positivity)

        if not self.scaler_is_fitted:

            LOGGER.info(f"Fitting scaler, positivity={positivity}")

            if slice_size is None:
                self._scaler.fit(self.data.reshape(-1, self.nb_features))
            else:
                n_slices = self.nb_trajectories // slice_size

                for i in tqdm(range(n_slices)):
                    data_slice = self.data[i * slice_size:(i + 1) * slice_size,
                                           ...]
                    data_slice = data_slice.reshape(-1, self.nb_features)
                    self._scaler.partial_fit(data_slice)

                if self.nb_trajectories % slice_size != 0:
                    data_slice = self.data[n_slices * slice_size:, ...]
                    data_slice = data_slice.reshape(-1, self.nb_features)
                    self._scaler.partial_fit(data_slice)

            self.scaler_is_fitted = True

    def rescale(self, data):
        """
        Apply scaler to data.

        Parameters
        ----------
        data : data to rescale.

        Returns
        -------
        data : rescaled data.

        """
        # return self.scaler.transform(data)
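        # Note: manual arithmetic (rather than self.scaler.transform) lets the ValueError
        # fallback below handle inputs that lack the trailing nb_randomized_params columns.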
        if isinstance(self.scaler, StandardScaler):
            try:
                data = (data - self.scaler.mean_) / self.scaler.scale_
            except ValueError:
                data = (data - self.scaler.mean_[:-self.nb_randomized_params]) \
                       / self.scaler.scale_[:-self.nb_randomized_params]
        elif isinstance(self.scaler, MinMaxScaler):
            try:
                data = (data * self.scaler.scale_) + self.scaler.min_
            except ValueError:
                data = (data * self.scaler.scale_[:-self.nb_randomized_params]) \
                       + self.scaler.min_[:-self.nb_randomized_params]
        return data

    def scale_back(self, data):
        """
        Apply scaler inverse transform, returning data to the original scale.
        Parameters
        ----------
        data : data (rescaled).

        Returns
        -------
        data : data scaled back.

        """
        # Equivalent to self.scaler.inverse_transform(data), computed manually so
        # that inputs missing the trailing randomized-parameter columns are handled too.
        if isinstance(self.scaler, StandardScaler):
            try:
                data = data * self.scaler.scale_ + self.scaler.mean_
            except ValueError:
                data = data * self.scaler.scale_[:-self.nb_randomized_params] \
                       + self.scaler.mean_[:-self.nb_randomized_params]
        elif isinstance(self.scaler, MinMaxScaler):
            try:
                data = (data - self.scaler.min_) / self.scaler.scale_
            except ValueError:
                data = (data - self.scaler.min_[:-self.nb_randomized_params]) \
                       / self.scaler.scale_[:-self.nb_randomized_params]
        return data

    def _shuffle_data(self):
        np.random.shuffle(self.data)

    def _transitions_from_a_batch_of_trajectories(
        self,
        trajectories,
        nb_past_timesteps,
    ):
        x_data = []
        y_data = []

        for timestep in range(self.nb_timesteps - nb_past_timesteps):
            x_data.append(trajectories[:, timestep:(timestep +
                                                    nb_past_timesteps), :])
            y_data.append(trajectories[:, timestep + nb_past_timesteps, :])

        x_data = np.concatenate(x_data, axis=0)
        y_data = np.concatenate(y_data, axis=0)

        return x_data, y_data

    def _transitions_generator(
        self,
        trajectories,
        nb_past_timesteps,
        slice_size=None,
        rescale=False,
    ):
        self._check_nb_past_timesteps(nb_past_timesteps)
        nb_trajectories = trajectories.shape[0]

        if slice_size:
            n_slices = nb_trajectories // slice_size
            additive = 0 if nb_trajectories % slice_size == 0 else 1
        else:
            n_slices = 1
            additive = 0
            slice_size = nb_trajectories

        for i in range(n_slices + additive):
            if i == n_slices:
                x_data, y_data = self._transitions_from_a_batch_of_trajectories(
                    trajectories[slice_size * n_slices:nb_trajectories],
                    nb_past_timesteps,
                )
            else:
                x_data, y_data = self._transitions_from_a_batch_of_trajectories(
                    trajectories[slice_size * i:slice_size * (i + 1)],
                    nb_past_timesteps,
                )
            if rescale:
                x_data = self.rescale(x_data)
                y_data = self.rescale(y_data)

            yield x_data, y_data

    def _train_test_generators(
        self,
        nb_past_timesteps,
        test_fraction=0.2,
        slice_size=None,
        rescale=False,
    ):
        n_train_trajectories = int((1. - test_fraction) * self.nb_trajectories)

        train_gen = self._transitions_generator(
            self.data[:n_train_trajectories],
            nb_past_timesteps,
            slice_size,
            rescale,
        )
        test_gen = self._transitions_generator(
            self.data[n_train_trajectories:],
            nb_past_timesteps,
            slice_size,
            rescale,
        )
        return train_gen, test_gen

    def get_train_test_data_generators(
        self,
        nb_past_timesteps=1,
        test_fraction=0.2,
        keep_timestamps=False,
        rescale=True,
        positivity=True,
        shuffle=True,
        slice_size=None,
    ):
        """
        Produce train and test data generators that yield chunks of
        (optionally rescaled) training examples.
        Each training example is a single transition between states of the system:
            (x, y) = (trajectory[i:i+nb_past_timesteps], trajectory[i+nb_past_timesteps])

        Parameters
        ----------
        nb_past_timesteps : number of steps observed before each transition.
        test_fraction : float, fraction of data that will be used for test.
        keep_timestamps : boolean, whether to keep timestamps in data, default is False.
        rescale : boolean, whether data should be rescaled.
        positivity : boolean, if True, data will be rescaled between 0 and 1, otherwise standardized.
        shuffle : boolean, if True trajectories will be shuffled before producing training examples.
        slice_size : int, optional, number of trajectories to process at once. May be useful for
            large datasets to reduce memory consumption. If None, all trajectories are processed at once.

        Returns
        -------
        (train_generator, test_generator) : iterable generators of training examples.
            Each iteration of a generator yields the training examples produced from
            `slice_size` trajectories (see the usage sketch after this example).
        """
        if keep_timestamps is False:
            self.drop_timestamps()

        if rescale is True:
            self._fit_scaler(positivity, slice_size)

        if shuffle is True:
            self._shuffle_data()

        return self._train_test_generators(
            nb_past_timesteps,
            test_fraction,
            slice_size,
            rescale,
        )

    def _check_nb_past_timesteps(self, nb_past_timesteps):
        if nb_past_timesteps + 1 > self.nb_timesteps:
            raise ValueError('Too many past timesteps.')
        elif nb_past_timesteps < 1:
            raise ValueError(
                'You need to consider at least 1 timestep in the past.')

    def _save_scaler(self, dataset_folder):
        scaler_fp = os.path.join(dataset_folder, 'scaler.pickle')
        with open(scaler_fp, 'wb') as file:
            pickle.dump(self.scaler, file)

    def save_data_for_ml_hdf5(
        self,
        dataset_folder,
        nb_past_timesteps=1,
        test_fraction=0.2,
        keep_timestamps=False,
        rescale=True,
        positivity=True,
        shuffle=True,
        slice_size=None,
        force_rewrite=False,
    ):
        """
        Write training and test datasets to hdf5 files.
        Original trajectories are optionally scaled and split into training examples:
            (x, y) = (trajectory[i:i+nb_past_timesteps], trajectory[i+nb_past_timesteps])

        Parameters
        ----------
        dataset_folder : folder to save datasets
        nb_past_timesteps : number of steps observed before each transition.
        test_fraction : float, fraction of data that will be used for test.
        keep_timestamps : boolean, whether to keep timestamps in data, default is False.
        rescale : boolean, whether data should be rescaled.
        positivity : boolean, if True, data will be rescaled between 0 and 1, otherwise standardized.
        shuffle : boolean, if True trajectories will be shuffled before producing training examples.
        slice_size : int, optional, number of trajectories to process at once. May be useful for
            large datasets to reduce memory consumption. If None, all trajectories are processed at once.
        force_rewrite : boolean, if True, existing files will be rewritten.

        Returns
        -------
        None

        """
        train_gen, test_gen = self.get_train_test_data_generators(
            nb_past_timesteps=nb_past_timesteps,
            test_fraction=test_fraction,
            keep_timestamps=keep_timestamps,
            rescale=rescale,
            positivity=positivity,
            shuffle=shuffle,
            slice_size=slice_size,
        )

        if rescale:
            self._save_scaler(dataset_folder)
            train_fp = os.path.join(dataset_folder, 'train_rescaled.hdf5')
            test_fp = os.path.join(dataset_folder, 'test_rescaled.hdf5')
        else:
            train_fp = os.path.join(dataset_folder, 'train.hdf5')
            test_fp = os.path.join(dataset_folder, 'test.hdf5')

        if force_rewrite:
            if os.path.exists(train_fp):
                os.remove(train_fp)
            if os.path.exists(test_fp):
                os.remove(test_fp)

        with h5py.File(train_fp, 'a', libver='latest') as df:
            df.create_dataset(
                'x',
                shape=(0, nb_past_timesteps, self.nb_features),
                maxshape=(None, nb_past_timesteps, self.nb_features),
                chunks=True,
            )
            df.create_dataset(
                'y',
                shape=(0, self.nb_features - self.nb_randomized_params),
                maxshape=(None, self.nb_features - self.nb_randomized_params),
                chunks=True,
            )
            for x, y in train_gen:
                n_new_items = x.shape[0]
                df['x'].resize(df['x'].shape[0] + n_new_items, axis=0)
                df['x'][-n_new_items:] = x

                df['y'].resize(df['y'].shape[0] + n_new_items, axis=0)
                df['y'][-n_new_items:] = y[..., :-self.nb_randomized_params]

            LOGGER.info(f"Train data saved to {train_fp}, \n"
                        f"Shapes: x: {df['x'].shape}, y: {df['y'].shape}")

        with h5py.File(test_fp, 'a', libver='latest') as df:
            df.create_dataset(
                'x',
                shape=(0, nb_past_timesteps, self.nb_features),
                maxshape=(None, nb_past_timesteps, self.nb_features),
                chunks=True,
            )
            df.create_dataset(
                'y',
                shape=(0, self.nb_features - self.nb_randomized_params),
                maxshape=(None, self.nb_features - self.nb_randomized_params),
                chunks=True,
            )
            for x, y in test_gen:
                n_new_items = x.shape[0]
                df['x'].resize(df['x'].shape[0] + n_new_items, axis=0)
                df['x'][-n_new_items:] = x

                df['y'].resize(df['y'].shape[0] + n_new_items, axis=0)
                df['y'][-n_new_items:] = y[..., :-self.nb_randomized_params]

            LOGGER.info(f"Test data saved to {test_fp}, \n"
                        f"Shapes: x: {df['x'].shape}, y: {df['y'].shape}")

    def save_data_for_ml_tfrecord(
        self,
        dataset_folder,
        nb_past_timesteps,
        test_fraction=0.2,
        keep_timestamps=False,
        rescale=True,
        positivity=True,
        shuffle=True,
        slice_size=None,
        force_rewrite=False,
    ):
        """
        Write training and test datasets to TFRecord files.
        Original trajectories are optionally scaled and split into training examples:
            (x, y) = (trajectory[i:i+nb_past_timesteps], trajectory[i+nb_past_timesteps])

        Parameters
        ----------
        dataset_folder : folder to save datasets
        nb_past_timesteps : number of steps observed before each transition.
        test_fraction : float, fraction of data that will be used for test.
        keep_timestamps : boolean, whether to keep timestamps in data, default is False.
        rescale : boolean, whether data should be rescaled.
        positivity : boolean, if True, data will be rescaled between 0 and 1, otherwise standardized.
        shuffle : boolean, if True trajectories will be shuffled before producing training examples.
        slice_size : int, optional, number of trajectories to process at once. May be useful for
            large datasets to reduce memory consumption. If None, all trajectories are processed at once.
        force_rewrite : boolean, if True, existing files will be rewritten.

        Returns
        -------
        None

        """
        train_gen, test_gen = self.get_train_test_data_generators(
            nb_past_timesteps=nb_past_timesteps,
            test_fraction=test_fraction,
            keep_timestamps=keep_timestamps,
            rescale=rescale,
            positivity=positivity,
            shuffle=shuffle,
            slice_size=slice_size,
        )

        if rescale:
            self._save_scaler(dataset_folder)
            train_fp = os.path.join(dataset_folder, 'train_rescaled.tfrecords')
            test_fp = os.path.join(dataset_folder, 'test_rescaled.tfrecords')
        else:
            train_fp = os.path.join(dataset_folder, 'train.tfrecords')
            test_fp = os.path.join(dataset_folder, 'test.tfrecords')

        if force_rewrite:
            if os.path.exists(train_fp):
                os.remove(train_fp)
            if os.path.exists(test_fp):
                os.remove(test_fp)

        def _float_feature(value):
            return tf.train.Feature(float_list=tf.train.FloatList(value=value))

        def _int64_feature(value):
            return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

        def _create_example(x_arr, y_arr):
            x_shape = x_arr.shape
            x_arr = x_arr.reshape(-1)
            features = tf.train.Features(
                feature={
                    'x': _float_feature(x_arr),
                    'y': _float_feature(y_arr),
                    'x_shape': _int64_feature(x_shape)
                })
            return tf.train.Example(features=features)

        def _process_chunk(writer, x, y):
            for idx in range(x.shape[0]):
                example = _create_example(x[idx], y[idx])
                writer.write(example.SerializeToString())

        # Use context managers so the TFRecord files are properly flushed and closed.
        with tf.io.TFRecordWriter(train_fp) as writer:
            for x, y in train_gen:
                _process_chunk(writer, x, y)

        with tf.io.TFRecordWriter(test_fp) as writer:
            for x, y in test_gen:
                _process_chunk(writer, x, y)
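# Usage sketch (not part of the original example): a minimal, self-contained
# illustration of the two ideas documented in the class above -- (1) turning
# trajectories of shape (nb_trajectories, nb_timesteps, nb_features) into
# (x, y) transitions with x = trajectory[i:i+nb_past_timesteps] and
# y = trajectory[i+nb_past_timesteps], and (2) the manual StandardScaler
# arithmetic used by `rescale`, i.e. (x - mean_) / scale_.
# All variable names below are illustrative only.
import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
trajectories = rng.random((4, 10, 3))          # (trajectories, timesteps, features)
nb_past_timesteps = 2

x_chunks, y_chunks = [], []
for t in range(trajectories.shape[1] - nb_past_timesteps):
    x_chunks.append(trajectories[:, t:t + nb_past_timesteps, :])
    y_chunks.append(trajectories[:, t + nb_past_timesteps, :])
x_data = np.concatenate(x_chunks, axis=0)      # (n_examples, nb_past_timesteps, nb_features)
y_data = np.concatenate(y_chunks, axis=0)      # (n_examples, nb_features)

scaler = StandardScaler().fit(trajectories.reshape(-1, trajectories.shape[-1]))
manual = (y_data - scaler.mean_) / scaler.scale_
assert np.allclose(manual, scaler.transform(y_data))   # manual formula == transform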
예제 #27
0
def split_train_validation_test(multi_time_series_df,
                                valid_start_time,
                                test_start_time,
                                features,
                                time_step_lag=1,
                                horizon=1,
                                target='target',
                                time_format='%Y-%m-%d %H:%M:%S',
                                freq='H'):

    if not isinstance(features, list) or len(features) < 1:
        raise ValueError(
            "Bad input for features: it must be a non-empty list of dataframe column names."
        )

    train = multi_time_series_df.copy()[
        multi_time_series_df.index < valid_start_time]
    train_features = train[features]
    train_targets = train[target]

    # X_scaler = MinMaxScaler()
    # target_scaler = MinMaxScaler()
    # y_scaler = MinMaxScaler()

    X_scaler = StandardScaler()
    target_scaler = StandardScaler()
    y_scaler = StandardScaler()

    # 'load' is the key target. If it is among the features, fit y_scaler on it;
    # otherwise fit y_scaler on the target column.
    if 'load' in features:
        tg = train[['load']]
        y_scaler.fit(tg)
    else:
        tg = train[target]
        y_scaler.fit(tg.values.reshape(-1, 1))

    train[target] = target_scaler.fit_transform(train_targets)

    X_scaler.fit(train_features)
    train[features] = X_scaler.transform(train_features)

    tensor_structure = {'X': (range(-time_step_lag + 1, 1), features)}
    train_inputs = TimeSeriesTensor(train,
                                    target=target,
                                    H=horizon,
                                    freq=freq,
                                    tensor_structure=tensor_structure)

    print(train_inputs.dataframe.head())

    look_back_dt = dt.datetime.strptime(
        valid_start_time, time_format) - dt.timedelta(hours=time_step_lag - 1)
    valid = multi_time_series_df.copy()[
        (multi_time_series_df.index >= look_back_dt)
        & (multi_time_series_df.index < test_start_time)]
    valid_features = valid[features]
    valid[features] = X_scaler.transform(valid_features)
    tensor_structure = {'X': (range(-time_step_lag + 1, 1), features)}
    valid_inputs = TimeSeriesTensor(valid,
                                    target=target,
                                    H=horizon,
                                    freq=freq,
                                    tensor_structure=tensor_structure)

    print(valid_inputs.dataframe.head())

    # test set
    # look_back_dt = dt.datetime.strptime(test_start_time, '%Y-%m-%d %H:%M:%S') - dt.timedelta(hours=time_step_lag - 1)
    test = multi_time_series_df.copy()[test_start_time:]
    test_features = test[features]
    test[features] = X_scaler.transform(test_features)
    test_inputs = TimeSeriesTensor(test,
                                   target=target,
                                   H=horizon,
                                   freq=freq,
                                   tensor_structure=tensor_structure)

    print("time lag:", time_step_lag, "original_feature:", len(features))

    return train_inputs, valid_inputs, test_inputs, y_scaler
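# Usage sketch (an assumption, not from the original source): the y_scaler
# returned above is a fitted sklearn StandardScaler, so predictions made in the
# scaled space can be mapped back to original units with inverse_transform.
# `predictions` below is a placeholder array standing in for model output.
import numpy as np
from sklearn.preprocessing import StandardScaler

y_scaler_demo = StandardScaler().fit(np.array([[10.0], [12.0], [15.0], [11.0]]))
predictions = np.array([[-0.5], [0.3], [1.2]])            # scaled-space predictions
predictions_original_units = y_scaler_demo.inverse_transform(predictions)
print(predictions_original_units.ravel())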
예제 #28
0
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler  # sklearn.preprocessing.data is a private module; use the public path
from sklearn.linear_model import ElasticNet  # ElasticNet has two important hyperparameters: alpha and l1_ratio
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
from matplotlib.ticker import LinearLocator

bostondata = load_boston()
boston_X = bostondata.data
boston_y = bostondata.target
scale = StandardScaler()
scale.fit(boston_X)
boston_x = scale.transform(boston_X)
pca = PCA(n_components=3)
# boston_x = pca.fit_transform(boston_x)
fig = plt.figure()
ax = fig.add_subplot(projection='3d')  # plt.gca(projection=...) was removed in newer matplotlib
# ax.scatter(boston_x[:, 0], boston_x[:, 1], boston_x[:, 2], marker='o', c=boston_y)
x_tran, x_test, y_tran, y_test = train_test_split(boston_x, boston_y, test_size=0.3, random_state=42)
result = []
z = np.zeros(shape=(10, 10))
test_number = len(y_test)
for i in range(1, 11, 1):
    for j in range(1, 11, 1):
        clf = ElasticNet(alpha=i / 10, l1_ratio=j / 10).fit(x_tran, y_tran)
        y_pre = clf.predict(x_test)
        result.append([i, j, clf.score(x_test, y_test)])
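# Follow-up sketch (an assumption, not in the original snippet): pick the
# (alpha, l1_ratio) pair with the highest test score from a `result` list like
# the one built in the loop above. Shown on a small hard-coded list so this
# block runs on its own; `best` can be computed the same way on the real list.
result_demo = [[1, 1, 0.61], [1, 2, 0.63], [2, 1, 0.58]]  # [alpha*10, l1_ratio*10, score]
best = max(result_demo, key=lambda row: row[2])
print(f"best alpha={best[0] / 10}, l1_ratio={best[1] / 10}, score={best[2]:.2f}")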
예제 #29
0
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler  # sklearn.preprocessing.data is a private module; use the public path

bostondata = load_boston()  # load the boston dataset
boston_X = bostondata.data
boston_y = bostondata.target
scale_boston = StandardScaler()  # standardization
scale_boston.fit(boston_X)
boston_x = scale_boston.transform(boston_X)
pca = PCA(n_components=2)  # reduce to two principal components
pca.fit(boston_X)
dimesionpower = pca.explained_variance_ratio_
print(dimesionpower)
boston_x_train, boston_x_test, boston_y_train, boston_y_test = train_test_split(
    boston_x, boston_y, test_size=0.3, random_state=42)
result = {
    i / 10:
    Lasso(alpha=i / 10).fit(boston_x_train,
                            boston_y_train).score(boston_x_test, boston_y_test)
    for i in range(1, 11, 1)
}
plt.plot(list(result.keys()), list(result.values()))
plt.show()
print(result)  # Judging by these results, the smaller the regularization coefficient the better (at least before removing outliers); alpha=0 works best here.
# TODO: clean the raw data with a clustering method and drop some outlier records
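# Sketch toward the TODO above (an assumption about one possible approach, not
# the author's implementation): DBSCAN labels points that fit no cluster as -1,
# which can be used to drop outliers before re-running the Lasso alpha sweep.
# The data below is synthetic so the block runs on its own.
import numpy as np
from sklearn.cluster import DBSCAN

demo_X = np.vstack([np.random.RandomState(0).normal(size=(50, 2)),
                    [[8.0, 8.0]]])                  # 50 inliers plus one outlier
labels = DBSCAN(eps=1.0, min_samples=5).fit_predict(demo_X)
inlier_mask = labels != -1                          # -1 marks noise/outliers
clean_X = demo_X[inlier_mask]
print(f"kept {clean_X.shape[0]} of {demo_X.shape[0]} points")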
예제 #30
0
n = len(X_all)
# Read the last tab-separated column of diff_X.txt and group the values
# into rows of len(X_all) entries each.
with open('diff_X.txt') as inFile:
    for line in inFile:
        if n == len(X_all):
            n = 0
            X_d.append([])
        X_d[-1].append(float(line.split('\t')[-1]))
        n += 1

plt.figure(figsize=(7, 5))

scaler = StandardScaler()
X_scaled = scaler.fit(X_d).transform(X_d)

pca = PCA(n_components=2)
X_r = pca.fit(X_scaled).transform(X_scaled)

X_rx = [i[0] for i in X_r]
X_ry = [i[1] for i in X_r]

country_sp = []
for x in h_run:
    if run_toCountry[x] in ('Fiji', 'United Republic of Tanzania',
                            'Madagascar', 'Peru'):
        if run_toCountry[x] == 'United Republic of Tanzania':
            country_sp.append('Tanzania')
        else:
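# Sketch (an assumption; the snippet above is truncated): the complete pattern it
# follows is standardize -> PCA to 2 components -> scatter the projected points
# coloured by group. Shown self-contained on synthetic data with placeholder names.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

demo = np.random.RandomState(0).normal(size=(60, 5))
groups = np.repeat([0, 1, 2], 20)                  # stand-in for the country labels
demo_scaled = StandardScaler().fit_transform(demo)
demo_2d = PCA(n_components=2).fit_transform(demo_scaled)
plt.scatter(demo_2d[:, 0], demo_2d[:, 1], c=groups)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()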
예제 #31
0
                    label='test set')


iris = load_iris()
iris_data = iris.data[:, [2, 3]]
print(iris_data)

# split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(iris_data,
                                                    iris.target,
                                                    test_size=0.3,
                                                    random_state=0)

# standardize features using statistics computed on the training set
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

X_combined_std = np.vstack((X_train_std, X_test_std))
y_combined = np.hstack((y_train, y_test))

# Train and evaluate the model; the L2 penalty is the default, equivalent to model = LogisticRegression(penalty="l2")
model = LogisticRegression(C=1000.0, random_state=0)
model.fit(X_train_std, y_train)
model.predict_proba(np.array(X_test_std[0, :]).reshape(1, -1))
plot_decision_regions(X_combined_std,
                      y_combined,
                      classifier=model,
                      test_idx=range(105, 150))