def normalize_features(self, scaler: StandardScaler = None) -> StandardScaler:
    '''
    Normalizes the features of the dataset using a StandardScaler
    (subtract mean, divide by standard deviation). If a scaler is provided,
    uses that scaler to perform the normalization. Otherwise fits a scaler
    to the features in the dataset and then performs the normalization.

    :param scaler: A fitted StandardScaler. Used if provided. Otherwise a
                   StandardScaler is fit on this dataset and is then used.
    :return: A fitted StandardScaler. If a scaler is provided, this is the
             same scaler. Otherwise, this is a scaler fit on this dataset.
    '''
    if not self.data or not self.data[0].features:
        return None

    if not scaler:
        scaler = StandardScaler()
        features = np.vstack([d.features for d in self.data])
        scaler.fit(features)

    for d in self.data:
        d.set_features(scaler.transform(d.features.reshape(1, -1))[0])

    return scaler
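# Usage sketch (assumption: `train_data` and `test_data` are instances of the dataset
# class that owns normalize_features; the class itself is not shown in this snippet).
# Fit the scaler on the training set only, then reuse it on the test set so both are
# normalized with the same statistics.
train_scaler = train_data.normalize_features()
test_data.normalize_features(scaler=train_scaler)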
def test_scaler_1d():
    """Test scaling of dataset along single axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(5)
    X_orig_copy = X.copy()

    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_array_almost_equal(X_scaled_back, X_orig_copy)

    # Test with 1D list
    X = [0., 1., 2, 0.4, 1.]
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    X_scaled = scale(X)
    assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
    assert_array_almost_equal(X_scaled.std(axis=0), 1.0)

    X = np.ones(5)
    assert_array_equal(scale(X, with_mean=False), X)
def prepare_time_data(data):
    data_scaler = StandardScaler()
    data_concat = np.concatenate(data, axis=0)
    data_scaler.fit(data_concat)
    new_data = [data_scaler.transform(data_) for data_ in data]
    return data_scaler, new_data
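# Usage sketch (assumption: every sequence in the list is a 2-D array of shape
# [timesteps, n_features]; the arrays below are illustrative random data).
seqs = [np.random.randn(50, 3), np.random.randn(80, 3)]
scaler, scaled_seqs = prepare_time_data(seqs)  # one scaler fit on all concatenated steps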
def preprocess(self):
    sc = StandardScaler()
    sc.fit(self.X_train)
    X_train_std = sc.transform(self.X_train)
    X_test_std = sc.transform(self.X_test)
    self.train_dataset = self.Dataset(data=X_train_std, target=self.y_train)
    self.test_dataset = self.Dataset(data=X_test_std, target=self.y_test)
def __stdScaler(self):
    all_cols = list(self.data_df.columns.values)
    for col in all_cols:
        if col not in self.non_numeric_cols and col != 'time_to_failure':
            stdScaler = StandardScaler()
            stdScaler.fit(self.data_df[[col]])
            self.data_df[col] = stdScaler.transform(self.data_df[[col]])
    print('Standard Scaler applied ... ')
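# Equivalent sketch (assumption: `df` is a pandas DataFrame and `numeric_cols` lists the
# columns to scale). Standardization is computed independently per column, so a single
# StandardScaler over all numeric columns gives the same result as one scaler per column.
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])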
def main():
    args = parse()
    n_rollout = args.nrollout
    n_epoch = args.epoch
    savename = args.savename if args.savename is not None else 'model-' + str(n_rollout) + 'unroll'
    np.random.seed(1098)
    path = args.filename
    names = ['target_pos', 'target_speed', 'pos', 'vel', 'effort']
    with h5py.File(path, 'r') as f:
        (target_pos, target_speed, pos, vel, effort) = [
            [np.array(val) for val in f[name].values()] for name in names
        ]

    x_target = np.array(target_pos)
    x_first = np.array([pos_[0] for pos_ in pos])
    x_speed = np.array(target_speed).reshape((-1, 1))
    aux_output = [np.ones(eff.shape[0]).reshape((-1, 1)) for eff in effort]
    x = np.concatenate((x_target, x_first, x_speed), axis=1)

    input_scaler = StandardScaler()
    x = input_scaler.fit_transform(x)
    output_scaler = StandardScaler()
    effort_concat = np.concatenate([a for a in effort], axis=0)
    output_scaler.fit(effort_concat)
    effort = [output_scaler.transform(eff) for eff in effort]

    y = pad_sequences(effort, padding='post', value=0.)
    aux_output = pad_sequences(aux_output, padding='post', value=0.)
    x, x_test, y, y_test, y_aux, y_aux_test = train_test_split(x, y, aux_output, test_size=0.2)
    y_mask, y_test_mask = [this_y[:, :, 0] for this_y in (y_aux, y_aux_test)]
    y_aux_mask, y_aux_test_mask = [
        np.ones(this_y.shape[:2]) for this_y in (y_aux, y_aux_test)
    ]

    model = MyModel(train=[x, [y, y_aux]],
                    val=[x_test, [y_test, y_aux_test]],
                    train_mask=[y_mask, y_aux_mask],
                    val_mask=[y_test_mask, y_aux_test_mask],
                    max_unroll=n_rollout,
                    name=savename)

    if not os.path.exists('save'):
        os.makedirs('save')
    if args.train:
        model.fit(nb_epoch=n_epoch, batch_size=32)
    elif args.resume:
        model.resume(nb_epoch=n_epoch, batch_size=32)
def xval(feature_file, removed_columns=None):
    """
    Load features into file
    :param feature_file: feature file
    :param removed_columns: index of feature columns to remove
    """
    module_logger.info('------ Load feature data ::: {}'.format(feature_file))
    clf = svm_clf()
    fs = numpy.loadtxt(feature_file, delimiter='\t', skiprows=1)
    _, n = fs.shape
    iX = fs[:, 0]
    X = fs[:, 1:n - 1]
    y = fs[:, n - 1]
    if removed_columns is not None and len(removed_columns) > 0:
        X = numpy.delete(X, removed_columns, 1)
    module_logger.info('------ data dimension ::: {} ::: {}'.format(X.shape, n))

    y_true = numpy.array([])
    y_out = numpy.array([])
    y_prob = numpy.array([])
    y_i = numpy.array([])

    std_scaler = StandardScaler()
    skf = StratifiedKFold(n_splits=5)
    for train_index, test_index in skf.split(X, y):
        # print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        std_scaler.fit(X_train)
        X_train_scaled = std_scaler.transform(X_train, copy=True)
        X_test_scaled = std_scaler.transform(X_test, copy=True)

        clf.fit(X_train_scaled, y_train)
        y_pred = clf.predict(X_test_scaled)
        y_logp = clf.predict_proba(X_test_scaled)

        y_true = numpy.hstack((y_true, y_test))
        y_out = numpy.hstack((y_out, y_pred))
        y_prob = numpy.hstack((y_prob, numpy.max(y_logp, axis=1)))
        iX_test = iX[test_index]
        y_i = numpy.hstack((y_i, iX_test))

    return write_prediction_output(y_i, y_true, y_out,
                                   feature_file.replace('.csv', '_pred.csv'), y_prob)
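# Equivalent sketch using a Pipeline (assumption: svm_clf() returns an sklearn estimator
# with predict_proba enabled, and X, y are the same arrays as above). Wrapping the scaler
# and classifier in a Pipeline refits the scaler on each training fold automatically,
# which is the same leakage-free pattern as the explicit loop above.
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_predict

pipe = make_pipeline(StandardScaler(), svm_clf())
y_out_cv = cross_val_predict(pipe, X, y, cv=StratifiedKFold(n_splits=5))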
def test_scaler_2d_arrays():
    """Test scaling of 2d array along first axis"""
    rng = np.random.RandomState(0)
    X = rng.randn(4, 5)
    X[:, 0] = 0.0  # first feature is always of zero

    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has been copied
    assert_true(X_scaled is not X)

    # check inverse transform
    X_scaled_back = scaler.inverse_transform(X_scaled)
    assert_true(X_scaled_back is not X)
    assert_true(X_scaled_back is not X_scaled)
    assert_array_almost_equal(X_scaled_back, X)

    X_scaled = scale(X, axis=1, with_std=False)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    X_scaled = scale(X, axis=1, with_std=True)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=1), 4 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=1), 4 * [1.0])
    # Check that the data hasn't been modified
    assert_true(X_scaled is not X)

    X_scaled = scaler.fit(X).transform(X, copy=False)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert_true(X_scaled is X)

    X = rng.randn(4, 5)
    X[:, 0] = 1.0  # first feature is a constant, non zero feature
    scaler = StandardScaler()
    X_scaled = scaler.fit(X).transform(X, copy=True)
    assert_false(np.any(np.isnan(X_scaled)))
    assert_array_almost_equal(X_scaled.mean(axis=0), 5 * [0.0])
    assert_array_almost_equal(X_scaled.std(axis=0), [0., 1., 1., 1., 1.])
    # Check that X has not been copied
    assert_true(X_scaled is not X)
def obtain_sets(self, psychological_construct, percentage):
    index = self.get_index(psychological_construct)
    logging.info("Psychological construct under analysis:" + psychological_construct)
    negative_students, positive_students = self.get_instances(index)
    train_set, dev_set, test_set = self.divide_sets(negative_students,
                                                    positive_students, percentage)
    train_set_x, train_set_y = self.get_x_and_y(train_set, index)
    logging.info("Training set shape:" + str(train_set_x.shape))

    if self.norm == z_norm_literal:
        logging.info("Z-Normalizing")
        reshaped_train_set_x = self.reshape_numpy_array(train_set_x)
        scaler = StandardScaler()
        scaler.fit(reshaped_train_set_x)
        normalized_reshaped_train_x = scaler.transform(reshaped_train_set_x)
        normalized_train_set_x = np.reshape(normalized_reshaped_train_x,
                                            (train_set_x.shape[0], train_set_x.shape[1],
                                             train_set_x.shape[2], train_set_x.shape[3]))

    dev_set_x, dev_set_y = self.get_x_and_y(dev_set, index)
    if self.norm == z_norm_literal:
        logging.info("Z-Normalizing")
        reshaped_dev_x = self.reshape_numpy_array(dev_set_x)
        normalized_reshaped_dev_x = scaler.transform(reshaped_dev_x)
        normalized_dev_x = np.reshape(normalized_reshaped_dev_x,
                                      (dev_set_x.shape[0], dev_set_x.shape[1],
                                       dev_set_x.shape[2], dev_set_x.shape[3]))

    test_set_x, test_set_y = self.get_x_and_y(test_set, index, test_flag=True)
    if self.norm == z_norm_literal:
        logging.info("Z-Normalizing")
        reshaped_test_x = self.reshape_numpy_array(test_set_x)
        normalized_reshaped_test_x = scaler.transform(reshaped_test_x)
        normalized_test_x = np.reshape(normalized_reshaped_test_x,
                                       (test_set_x.shape[0], test_set_x.shape[1],
                                        test_set_x.shape[2], test_set_x.shape[3]))
        return normalized_train_set_x, train_set_y, normalized_dev_x, dev_set_y, \
            normalized_test_x, test_set_y
    else:
        return train_set_x, train_set_y, dev_set_x, dev_set_y, test_set_x, test_set_y
class StandardScalerImpl():

    def __init__(self, copy=True, with_mean=True, with_std=True):
        self._hyperparams = {
            'copy': copy,
            'with_mean': with_mean,
            'with_std': with_std
        }
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if (y is not None):
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
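# Usage sketch (assumption: `Op` is bound to sklearn.preprocessing.StandardScaler, which
# is how this wrapper pattern is typically instantiated; the data below is illustrative).
impl = StandardScalerImpl(with_mean=True, with_std=True)
X_demo = np.array([[0.0, 10.0], [1.0, 20.0], [2.0, 30.0]])
X_std = impl.fit(X_demo).transform(X_demo)  # columns now have zero mean and unit variance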
def test_simple_poly_dataset_scaled_cv(self):
    model = Model.create_model(
        model_type=Model.MODEL_TYPE_SVR,
        cross_validation=True,
        feature_scaling=True,
        C_range=[0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10],
        kernel=Model.KERNEL_RBF
    )
    train_dataset, test_dataset = test_datasets.get_simple_polynomial_datasets(n=1000)

    scaler = StandardScaler()
    scaler.fit(train_dataset.data)
    print("Train mean: " + str(scaler.transform(train_dataset.data).mean(axis=0)))
    print("Test mean: " + str(scaler.transform(test_dataset.data).mean(axis=0)))
    print("Train std: " + str(scaler.transform(train_dataset.data).std(axis=0)))
    print("Test std: " + str(scaler.transform(test_dataset.data).std(axis=0)))

    self._test_dataset(model, train_dataset, test_dataset, 0,
                       title="SVR with RBF kernel, scaled CV on poly dataset")
def _proccess_input(self, target_pos, target_speed, pos, vel, effort):
    x_target = np.array(target_pos)
    x_first = np.array([pos_[0] for pos_ in pos])
    x_speed = np.array(target_speed).reshape((-1, 1))
    aux_output = [np.ones(eff.shape[0]).reshape((-1, 1)) for eff in effort]
    x = np.concatenate((x_target, x_first, x_speed), axis=1)

    input_scaler = StandardScaler()
    x = input_scaler.fit_transform(x)
    output_scaler = StandardScaler()
    effort_concat = np.concatenate([a for a in effort], axis=0)
    output_scaler.fit(effort_concat)
    effort = [output_scaler.transform(eff) for eff in effort]

    y = pad_sequences(effort, padding='post', value=0.)
    aux_output = pad_sequences(aux_output, padding='post', value=0.)
    x, x_test, y, y_test, y_aux, y_aux_test = train_test_split(x, y, aux_output, test_size=0.2)
    return x, x_test, y, y_test, y_aux, y_aux_test
class CreateStandardScaler(CreateModel):

    def fit(self, data, args):
        self.model = StandardScaler()
        with Timer() as t:
            self.model.fit(data.X_train, data.y_train)
        return t.interval

    def test(self, data):
        assert self.model is not None
        return self.model.transform(data.X_test)

    def predict(self, data):
        with Timer() as t:
            self.predictions = self.test(data)
        data.learning_task = LearningTask.REGRESSION
        return t.interval
def neural_net_2(train, test, val, train_out, test_out, val_out, BigSigma_inv):
    clf = MLPClassifier(solver='sgd', alpha=1e-5, hidden_layer_sizes=(100, 1),
                        activation='logistic', batch_size=BATCH_HUMAN,
                        shuffle=True, max_iter=5000)
    scaler = StandardScaler()
    scaler.fit(train)
    train1 = scaler.transform(train)
    # apply same transformation to test and validation data
    test = scaler.transform(test)
    val = scaler.transform(val)
    train_out = train_out.astype(float)

    clf.fit(X=train1, y=train_out)
    predict_test = clf.predict(test)
    predict_val = clf.predict(val)
    print("TEST ERMS ACCURACY", mean_squared_error(test_out, predict_test),
          acc_manual(test_out, predict_test))
    print("VAL ERMS ACCURACY", mean_squared_error(val_out, predict_val),
          acc_manual(val_out, predict_val))
def test_center_kernel():
    """Test that KernelCenterer is equivalent to StandardScaler
       in feature space"""
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    scaler = StandardScaler(with_std=False)
    scaler.fit(X_fit)
    X_fit_centered = scaler.transform(X_fit)
    K_fit = np.dot(X_fit, X_fit.T)

    # center fit time matrix
    centerer = KernelCenterer()
    K_fit_centered = np.dot(X_fit_centered, X_fit_centered.T)
    K_fit_centered2 = centerer.fit_transform(K_fit)
    assert_array_almost_equal(K_fit_centered, K_fit_centered2)

    # center predict time matrix
    X_pred = rng.random_sample((2, 4))
    K_pred = np.dot(X_pred, X_fit.T)
    X_pred_centered = scaler.transform(X_pred)
    K_pred_centered = np.dot(X_pred_centered, X_fit_centered.T)
    K_pred_centered2 = centerer.transform(K_pred)
    assert_array_almost_equal(K_pred_centered, K_pred_centered2)
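# A minimal sketch of the same identity via the explicit centering formula
# K_c = K - 1_n K - K 1_n + 1_n K 1_n, where 1_n is the n x n matrix filled with 1/n
# (assumes numpy as np and K_fit / K_fit_centered2 from the test above).
n = K_fit.shape[0]
one_n = np.ones((n, n)) / n
K_fit_centered3 = K_fit - one_n @ K_fit - K_fit @ one_n + one_n @ K_fit @ one_n
# np.allclose(K_fit_centered3, K_fit_centered2) should hold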
def train_test(feature_file, test_file, removed_columns=None):
    """
    Load features into file
    :param feature_file: feature file
    :param test_file: test file
    :param removed_columns: index of feature columns to remove
    """
    module_logger.info('------ Train/test model ::: {} ::: {}'.format(feature_file, test_file))
    clf = svm_clf()

    fs = numpy.loadtxt(feature_file, delimiter='\t', skiprows=1)
    _, n = fs.shape
    X_train = fs[:, 1:n - 1]
    y_train = fs[:, n - 1]

    fs = numpy.loadtxt(test_file, delimiter='\t', skiprows=1)
    _, n = fs.shape
    X_test = fs[:, 1:n - 1]
    y_test = fs[:, n - 1]
    y_i = fs[:, 0]

    if removed_columns is not None and len(removed_columns) > 0:
        X_test = numpy.delete(X_test, removed_columns, 1)
        X_train = numpy.delete(X_train, removed_columns, 1)
    module_logger.info('------ data dimension ::: {} ::: {} ::: {}'.format(
        X_train.shape, X_test.shape, n))

    std_scaler = StandardScaler()
    std_scaler.fit(X_train)
    X_train_scaled = std_scaler.transform(X_train, copy=True)
    X_test_scaled = std_scaler.transform(X_test, copy=True)

    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)
    y_logp = clf.predict_proba(X_test_scaled)

    return write_prediction_output(y_i, y_test, y_pred,
                                   test_file.replace('.csv', '_pred.csv'), y_logp)
def preprocess(self):
    # X_train, X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=0.3, random_state=0)
    sc = StandardScaler()
    sc.fit(self.X_train)
    self.X_train_std = sc.transform(self.X_train)
    self.X_test_std = sc.transform(self.X_test)
def train(args): timestr = time.strftime("%Y%m%d-%H%M%S-") output_dir = args.out_dir_path + '/' + time.strftime("%m%d") mkdir(output_dir) setLogger(timestr, out_dir=output_dir) print_args(args) if args.load_input_pkl == '': # process train and test data logger.info('Loading training file...') _, train_question1, train_question2, train_y = get_pdTable( args.train_path) # train_question1, train_question2, train_y = csv_processing(args.train_path) logger.info('Train csv: %d line loaded ' % len(train_question1)) logger.info('Loading test file...') if args.predict_test: test_ids, test_question1, test_question2 = get_pdTable( args.test_path, notag=True) else: test_ids, test_question1, test_question2, test_y = get_pdTable( args.test_path) # test_question1, test_question2, test_ids = csv_processing(args.test_path, test=True) logger.info('Test csv: %d line loaded ' % len(test_question1)) logger.info('Text cleaning... ') train_question1, train_maxLen1 = text_cleaner(train_question1) train_question2, train_maxLen2 = text_cleaner(train_question2) test_question1, test_maxLen1 = text_cleaner(test_question1) test_question2, test_maxLen2 = text_cleaner(test_question2) # train_question1, train_maxLen1 = tokenizeIt(train_question1, clean=args.rawMaterial) # train_question2, train_maxLen2 = tokenizeIt(train_question2, clean=args.rawMaterial) # test_question1, test_maxLen1 = tokenizeIt(test_question1, clean=args.rawMaterial) # test_question2, test_maxLen2 = tokenizeIt(test_question2, clean=args.rawMaterial) inputLength = max(train_maxLen1, train_maxLen2, test_maxLen1, test_maxLen2) logger.info('Max input length: %d ' % inputLength) inputLength = 30 logger.info('Reset max length to 30') tokenizer = Tokenizer(num_words=MAX_NB_WORDS) tokenizer.fit_on_texts(train_question1 + train_question2 + test_question1 + test_question2) sequences_1 = tokenizer.texts_to_sequences(train_question1) sequences_2 = tokenizer.texts_to_sequences(train_question2) test_sequences_1 = tokenizer.texts_to_sequences(test_question1) test_sequences_2 = tokenizer.texts_to_sequences(test_question2) del train_question1, train_question2, test_question1, test_question2 word_index = tokenizer.word_index logger.info('Found %s unique tokens' % len(word_index)) train_x1 = pad_sequences(sequences_1, maxlen=inputLength) train_x2 = pad_sequences(sequences_2, maxlen=inputLength) train_y = array(train_y) logger.info('Shape of data tensor: (%d, %d)' % train_x1.shape) logger.info('Shape of label tensor: (%d, )' % train_y.shape) test_x1 = pad_sequences(test_sequences_1, maxlen=inputLength) test_x2 = pad_sequences(test_sequences_2, maxlen=inputLength) test_ids = array(test_ids) if not args.predict_test: test_y = array(test_y) del sequences_1, sequences_2, test_sequences_1, test_sequences_2 if args.save_model: with open(output_dir + '/' + timestr + 'input_train_test.pkl', 'wb') as input_file: logger.info('Dumping processed input to pickle...') pkl.dump((train_x1, train_x2, train_y, test_x1, test_x2, test_ids, tokenizer), input_file) else: with open(args.load_input_pkl, 'rb') as input_file: train_x1, train_x2, train_y, test_x1, test_x2, test_ids, tokenizer = pkl.load( input_file) logger.info('Shape of data tensor: (%d, %d)' % train_x1.shape) logger.info('Shape of label tensor: (%d, )' % train_y.shape) word_index = tokenizer.word_index inputLength = 30 logger.info('Reset max length to 30') if args.w2v or args.ft_src: if args.w2v.endswith('.pkl'): with open(args.w2v, 'rb') as embd_file: logger.info('Loading word embedding from pickle...') embdw2v, 
vocabReverseDict = pkl.load(embd_file) if not len(vocabReverseDict) == len(word_index): logger.info('WARNING: reversed dict len incorrect %d , but word dict len %d ' % \ (len(vocabReverseDict), len(word_index))) else: logger.info('Loading word embedding from text file...') embdw2v, vocabReverseDict = embdReader( args.w2v, args.embd_dim, word_index, MAX_NB_WORDS, fasttext_source=args.ft_src, ft_dim=args.ft_dim, skip_header=args.skip_header, initializer=args.embd_init) if args.save_model: with open( output_dir + '/' + timestr + 'embd_dump.' + str(args.embd_dim + args.ft_dim) + 'd.pkl', 'wb') as embd_file: logger.info('Dumping word embedding to pickle...') pkl.dump((embdw2v, vocabReverseDict), embd_file) else: embdw2v = None # if args.load_vocab_from_file: # with open(args.load_vocab_from_file, 'rb') as vocab_file: # (vocabDict, vocabReverseDict) = pkl.load(vocab_file) # unk = None # if args.w2v: # if args.w2v.endswith('.pkl'): # with open(args.w2v, 'rb') as embd_file: # embdw2v = pkl.load(embd_file) # else: # from util.data_processing import w2vEmbdReader # embdw2v = w2vEmbdReader(args.w2v, vocabReverseDict, args.embd_dim) # with open(output_dir + '/'+ timestr + 'embd_dump.' + str(args.embd_dim) + 'd.pkl', 'wb') as embd_file: # pkl.dump(embdw2v, embd_file) # else: # embdw2v = None # else: # from util.data_processing import createVocab # vocabDict, vocabReverseDict = createVocab([train_question1, train_question2, test_question1, test_question2], # min_count=3, reservedList=['<pad>', '<unk>']) # embdw2v = None # unk = '<unk>' ## logger.info(vocabDict) # # word to padded numerical np array # from util.data_processing import word2num # train_x1 = word2num(train_question1, vocabDict, unk, inputLength, padding='pre') # train_x2 = word2num(train_question2, vocabDict, unk, inputLength, padding='pre') # test_x1 = word2num(test_question1, vocabDict, unk, inputLength, padding='pre') # test_x2 = word2num(test_question2, vocabDict, unk, inputLength, padding='pre') # Loading train features if not args.train_feature_path == '': logger.info('Loading train features from file %s ' % args.train_feature_path) df_train = read_csv(args.train_feature_path, encoding="ISO-8859-1") if not args.feature_list == '': feature_list = args.feature_list.split(',') train_features = DataFrame() for feature_name in feature_list: train_features[feature_name.strip()] = df_train[ feature_name.strip()] elif args.fidx_end == 0: train_features = df_train.iloc[:, args.fidx_start:] else: train_features = df_train.iloc[:, args.fidx_start:args.fidx_end] if not args.train_bowl_feature_path == '': logger.info('Loading train 1bowl features from file %s ' % args.train_bowl_feature_path) df_train = read_csv(args.train_bowl_feature_path, encoding="ISO-8859-1") if not args.bowl_feat_list == '': bowl_feat_list = args.bowl_feat_list.split(',') for feature_name in bowl_feat_list: train_features[feature_name.strip()] = df_train[ feature_name.strip()] else: for feature_name in df_train.columns: if feature_name.startswith('z_'): train_features[feature_name] = df_train[feature_name] logger.info('Final train feature list: \n %s ' % ','.join(list(train_features.columns.values))) feature_length = len(train_features.columns) train_features = train_features.replace([inf, -inf, nan], 0) train_features = array(train_features) logger.info('Loaded train feature shape: (%d, %d) ' % train_features.shape) del df_train # Loading test features logger.info('Loading test features from file %s ' % args.test_feature_path) df_test = read_csv(args.test_feature_path, 
encoding="ISO-8859-1") if not args.feature_list == '': feature_list = args.feature_list.split(',') test_features = DataFrame() for feature_name in feature_list: test_features[feature_name.strip()] = df_test[ feature_name.strip()] elif args.fidx_end == 0: test_features = df_test.iloc[:, args.fidx_start:] else: test_features = df_test.iloc[:, args.fidx_start:args.fidx_end] if not args.test_bowl_feature_path == '': logger.info('Loading test 1bowl features from file %s ' % args.test_bowl_feature_path) df_test = read_csv(args.test_bowl_feature_path, encoding="ISO-8859-1") if not args.bowl_feat_list == '': bowl_feat_list = args.bowl_feat_list.split(',') for feature_name in bowl_feat_list: test_features[feature_name.strip()] = df_test[ feature_name.strip()] else: for feature_name in df_test.columns: if feature_name.startswith('z_'): test_features[feature_name] = df_test[feature_name] test_features = test_features.replace([inf, -inf, nan], 0) test_features = array(test_features) logger.info('Loaded test feature shape: (%d, %d) ' % test_features.shape) del df_test # Normalize Data ss = StandardScaler() ss.fit(vstack((train_features, test_features))) train_features = ss.transform(train_features) test_features = ss.transform(test_features) del ss logger.info('Features normalized ') train_x1_aug = vstack((train_x1, train_x2)) train_x2_aug = vstack((train_x2, train_x1)) train_y = concatenate((train_y, train_y)) train_x = [train_x1_aug, train_x2_aug] test_x = [test_x1, test_x2] if not args.train_feature_path == '': train_features = vstack((train_features, train_features)) train_x += [train_features] test_x += [test_features] # ######################################## # ## sample train/validation data # ######################################## # # np.random.seed(1234) # perm = random.permutation(len(train_x1)) # idx_train = perm[:int(len(train_x1) * (1 - args.valid_split))] # idx_val = perm[int(len(train_x1) * (1 - args.valid_split)):] # # data_1_train = vstack((train_x1[idx_train], train_x2[idx_train])) # data_2_train = vstack((train_x2[idx_train], train_x1[idx_train])) # leaks_train = vstack((train_features[idx_train], train_features[idx_train])) # labels_train = concatenate((train_y[idx_train], train_y[idx_train])) # # data_1_val = vstack((train_x1[idx_val], train_x2[idx_val])) # data_2_val = vstack((train_x2[idx_val], train_x1[idx_val])) # leaks_val = vstack((train_features[idx_val], train_features[idx_val])) # labels_val = concatenate((train_y[idx_val], train_y[idx_val])) # re_weight = True # whether to re-weight classes to fit the 17.5% share in test set # weight_val = ones(len(labels_val)) # if re_weight: # weight_val *= 0.472001959 # weight_val[labels_val == 0] = 1.309028344 ######################################## ## add class weight ######################################## if args.re_weight: class_weight = {0: 1.309028344, 1: 0.472001959} else: class_weight = None # # Dump vocab # if not args.load_vocab_from_file: # with open(output_dir + '/'+ timestr + 'vocab.pkl', 'wb') as vocab_file: # pkl.dump((vocabDict, vocabReverseDict), vocab_file) if args.load_model_json: with open(args.load_model_json, 'r') as json_file: rnnmodel = model_from_json(json_file.read(), custom_objects={ "DenseWithMasking": DenseWithMasking, "Conv1DWithMasking": Conv1DWithMasking, "MaxOverTime": MaxOverTime, "MeanOverTime": MeanOverTime }) logger.info('Loaded model from saved json') else: if args.train_feature_path == '': rnnmodel = getModel(args, inputLength, len(word_index) + 1, embd=embdw2v) else: rnnmodel = 
getModel(args, inputLength, len(word_index) + 1, embd=embdw2v, feature_length=feature_length) if args.load_model_weights: rnnmodel.load_weights(args.load_model_weights) logger.info('Loaded model from saved weights') if args.optimizer == 'rmsprop': optimizer = RMSprop(lr=args.learning_rate) else: optimizer = args.optimizer myMetrics = 'acc' # 'binary_accuracy' # 'mse' rnnmodel.compile(loss=args.loss, optimizer=optimizer, metrics=[myMetrics]) rnnmodel.summary() ## Plotting model logger.info('Plotting model architecture') plot_model(rnnmodel, to_file=output_dir + '/' + timestr + 'model_plot.png') logger.info(' Done') if args.save_model: ## Save model architecture logger.info('Saving model architecture') with open(output_dir + '/' + timestr + 'model_config.json', 'w') as arch: arch.write(rnnmodel.to_json(indent=2)) logger.info(' Done') # train and test model myCallbacks = [] train_logger = TrainLogger() myCallbacks.append(train_logger) if args.eval_on_epoch: from util.model_eval import Evaluator evl = Evaluator(args, output_dir, timestr, myMetrics, test_x, test_y, vocabReverseDict) myCallbacks.append(evl) if args.save_model: bst_model_path = output_dir + '/' + timestr + 'best_model_weights.h5' model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True, verbose=1) myCallbacks.append(model_checkpoint) if args.plot: if not args.eval_on_epoch: plot_pic = PlotPic(args, output_dir, timestr, myMetrics) myCallbacks.append(plot_pic) if args.earlystop: earlystop = EarlyStopping(patience=args.earlystop, verbose=1, mode='auto') myCallbacks.append(earlystop) rnnmodel.fit(train_x, train_y, validation_split=args.valid_split, batch_size=args.train_batch_size, epochs=args.epochs, class_weight=class_weight, callbacks=myCallbacks) # rnnmodel.fit([data_1_train, data_2_train, leaks_train], labels_train, # validation_data=([data_1_val, data_2_val, leaks_val], labels_val, weight_val), # epochs=args.epochs, batch_size=args.train_batch_size, shuffle=True, # class_weight=class_weight, callbacks=myCallbacks) if args.predict_test: logger.info("Tuning model to best record...") rnnmodel.load_weights(bst_model_path) logger.info("Predicting test file result...") preds = rnnmodel.predict(test_x, batch_size=args.eval_batch_size, verbose=1) preds = squeeze(preds) logger.info('Write predictions into file... Total line: %d ' % len(preds)) with open(output_dir + '/' + timestr + 'predict.csv', 'w', encoding='utf8') as fwrt: writer_sub = csv.writer(fwrt) writer_sub.writerow(['test_id', 'is_duplicate']) idx = 0 for itm in tqdm(preds): writer_sub.writerow([idx, itm]) idx += 1 elif not args.eval_on_epoch: logger.info("Evaluating test set...") tloss, tacc = rnnmodel.evaluate(test_x, test_y, batch_size=args.eval_batch_size, verbose=1) logger.info("Test loss: %.4f Test Accuracy: %.2f%%" % (tloss, 100 * tacc))
def inference(args): timestr = time.strftime("%Y%m%d-%H%M%S-") output_dir = args.out_dir_path + '/' + time.strftime("%m%d") mkdir(output_dir) setLogger(timestr, out_dir=output_dir) print_args(args) if args.load_input_pkl == '': raise NotImplementedError( 'only support loading testing materials from pickle') else: with open(args.load_input_pkl, 'rb') as input_file: train_x1, train_x2, train_y, test_x1, test_x2, test_ids, tokenizer = pkl.load( input_file) logger.info('Shape of test data tensor: (%d, %d)' % test_x1.shape) word_index = tokenizer.word_index logger.info('Loaded %s unique tokens' % len(word_index)) if not args.test_path == '': if args.predict_test: test_ids, test_question1, test_question2 = get_pdTable( args.test_path, notag=True) else: test_ids, test_question1, test_question2, test_y = get_pdTable( args.test_path) test_question1, test_maxLen1 = text_cleaner(test_question1) test_question2, test_maxLen2 = text_cleaner(test_question2) inputLength = max(test_maxLen1, test_maxLen2) logger.info('Max input length: %d ' % inputLength) inputLength = 30 logger.info('Reset max length to 30') test_sequences_1 = tokenizer.texts_to_sequences(test_question1) test_sequences_2 = tokenizer.texts_to_sequences(test_question2) test_x1 = pad_sequences(test_sequences_1, maxlen=inputLength) test_x2 = pad_sequences(test_sequences_2, maxlen=inputLength) test_ids = array(test_ids) if not args.predict_test: test_y = array(test_y) # Loading train features if not args.train_feature_path == '': logger.info('Loading train features from file %s ' % args.train_feature_path) df_train = read_csv(args.train_feature_path, encoding="ISO-8859-1") if not args.feature_list == '': feature_list = args.feature_list.split(',') train_features = DataFrame() for feature_name in feature_list: train_features[feature_name.strip()] = df_train[ feature_name.strip()] elif args.fidx_end == 0: train_features = df_train.iloc[:, args.fidx_start:] else: train_features = df_train.iloc[:, args.fidx_start:args.fidx_end] if not args.train_bowl_feature_path == '': logger.info('Loading train 1bowl features from file %s ' % args.train_bowl_feature_path) df_train = read_csv(args.train_bowl_feature_path, encoding="ISO-8859-1") if not args.bowl_feat_list == '': bowl_feat_list = args.bowl_feat_list.split(',') for feature_name in bowl_feat_list: train_features[feature_name.strip()] = df_train[ feature_name.strip()] else: for feature_name in df_train.columns: if feature_name.startswith('z_'): train_features[feature_name] = df_train[feature_name] logger.info('Final train feature list: \n %s ' % ','.join(list(train_features.columns.values))) feature_length = len(train_features.columns) train_features = train_features.replace([inf, -inf, nan], 0) train_features = array(train_features) logger.info('Loaded train feature shape: (%d, %d) ' % train_features.shape) del df_train # Loading test features logger.info('Loading test features from file %s ' % args.test_feature_path) df_test = read_csv(args.test_feature_path, encoding="ISO-8859-1") if not args.feature_list == '': feature_list = args.feature_list.split(',') test_features = DataFrame() for feature_name in feature_list: test_features[feature_name.strip()] = df_test[ feature_name.strip()] elif args.fidx_end == 0: test_features = df_test.iloc[:, args.fidx_start:] else: test_features = df_test.iloc[:, args.fidx_start:args.fidx_end] if not args.test_bowl_feature_path == '': logger.info('Loading test 1bowl features from file %s ' % args.test_bowl_feature_path) df_test = read_csv(args.test_bowl_feature_path, 
encoding="ISO-8859-1") if not args.bowl_feat_list == '': bowl_feat_list = args.bowl_feat_list.split(',') for feature_name in bowl_feat_list: test_features[feature_name.strip()] = df_test[ feature_name.strip()] else: for feature_name in df_test.columns: if feature_name.startswith('z_'): test_features[feature_name] = df_test[feature_name] test_features = test_features.replace([inf, -inf, nan], 0) test_features = array(test_features) logger.info('Loaded test feature shape: (%d, %d) ' % test_features.shape) del df_test # Normalize Data ss = StandardScaler() ss.fit(vstack((train_features, test_features))) # train_features = ss.transform(train_features) test_features = ss.transform(test_features) del ss logger.info('Test Features normalized ') test_x = [test_x1, test_x2] if not args.test_feature_path == '': test_x += [test_features] if args.load_model_json: with open(args.load_model_json, 'r') as json_file: rnnmodel = model_from_json(json_file.read(), custom_objects={ "DenseWithMasking": DenseWithMasking, "Conv1DWithMasking": Conv1DWithMasking, "MaxOverTime": MaxOverTime, "MeanOverTime": MeanOverTime }) logger.info('Loaded model from saved json') if args.load_model_weights: logger.info('Loading model from saved weights') rnnmodel.load_weights(args.load_model_weights) if args.predict_test: logger.info("Predicting test file result...") preds = rnnmodel.predict(test_x, batch_size=args.eval_batch_size, verbose=1) preds = squeeze(preds) logger.info('Write predictions into file... Total line: %d ' % len(preds)) with open(output_dir + '/' + timestr + 'predict.csv', 'w', encoding='utf8') as fwrt: writer_sub = csv.writer(fwrt) writer_sub.writerow(['test_id', 'is_duplicate']) idx = 0 for itm in tqdm(preds): writer_sub.writerow([idx, itm]) idx += 1 logger.info('Predicted results written to file: %s' % (output_dir + '/' + timestr + 'predict.csv')) else: if args.optimizer == 'rmsprop': optimizer = RMSprop(lr=args.learning_rate) else: optimizer = args.optimizer myMetrics = 'acc' # 'binary_accuracy' # 'mse' rnnmodel.compile(loss=args.loss, optimizer=optimizer, metrics=[myMetrics]) rnnmodel.summary() logger.info("Evaluating test set...") tloss, tacc = rnnmodel.evaluate(test_x, test_y, batch_size=args.eval_batch_size, verbose=1) logger.info("Test loss: %.4f Test Accuracy: %.2f%%" % (tloss, 100 * tacc))
                                 checkpoint_dir=checkpoint_dir,
                                 loss=loss_function)
predicted_values = []
real_values = []
for student in students_gender_train:
    train_students = students_gender_train - set([student])
    print(train_students)
    test_student = set([student])
    print(test_student)
    train_x, train_y = dataset_loader.get_x_and_y(
        students_set=train_students, index=index, test_flag=False)
    test_x, test_y = dataset_loader.get_x_and_y(
        students_set=test_student, index=index, test_flag=True)

    reshaped_train_set_x = dataset_loader.reshape_numpy_array(train_x)
    scaler = StandardScaler()
    scaler.fit(reshaped_train_set_x)
    normalized_reshaped_train_x = scaler.transform(reshaped_train_set_x)
    normalized_train_set_x = np.reshape(
        normalized_reshaped_train_x,
        (train_x.shape[0], train_x.shape[1], train_x.shape[2], train_x.shape[3]))

    reshaped_test_x = dataset_loader.reshape_numpy_array(test_x)
    normalized_reshaped_test_x = scaler.transform(reshaped_test_x)
    normalized_test_x = np.reshape(
        normalized_reshaped_test_x,
        (test_x.shape[0], test_x.shape[1], test_x.shape[2], test_x.shape[3]))

    predicted_values.extend(
        cnn_classifier.train(normalized_train_set_x, train_y, normalized_test_x,
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from mpl_toolkits.mplot3d import Axes3D

irisdata = load_iris()
iris_X = irisdata.data
iris_y = irisdata.target

scale = StandardScaler()
scale.fit(iris_X)
iris_x = scale.transform(iris_X)

pca = PCA(n_components=3)
iris_x = pca.fit_transform(iris_x)

fig = plt.figure()
ax = fig.add_subplot(111)
# ax.scatter(iris_x[:, 0], iris_x[:, 1], iris_x[:, 2], marker='o', c=iris_y)

x_tran, x_test, y_tran, y_test = train_test_split(iris_x, iris_y, test_size=0.3, random_state=42)
result = {}
test_number = len(y_test)
for i in range(1, 11, 1):
    clf = Lasso(alpha=i / 10).fit(x_tran, y_tran)
    y_pre = clf.predict(x_test)
    result[i / 10] = sum(m < 0.5 for m in abs(y_test - y_pre)) / test_number
print(result)
ax.plot(list(result.keys()), list(result.values()))
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import mglearn

cancer = load_breast_cancer()

scaler = StandardScaler()
scaler.fit(cancer.data)
X_scaled = scaler.transform(cancer.data)

pca = PCA(n_components=2)
pca.fit(X_scaled)
X_pca = pca.transform(X_scaled)
print("original {}, reduction {}".format(X_scaled.shape, X_pca.shape))

plt.figure(figsize=(8, 8))
mglearn.discrete_scatter(X_pca[:, 0], X_pca[:, 1], cancer.target)
plt.legend(["malignancy(cancer)", "benign"], loc="best")
plt.gca().set_aspect("equal")
plt.xlabel("1st principal component")
plt.ylabel("2nd principal component")
plt.draw()

print("PCA PC shape:{}".format(pca.components_.shape))
print("PCA PC {}".format(pca.components_))
plt.matshow(pca.components_, cmap='viridis')
plt.yticks([0, 1], ["first principal component", "second principal component"])
plt.colorbar()
class DataTransformer: """DataTransformer transforms CRN traces into training examples with optional scaling.""" def __init__( self, dataset_address, with_timestamps=True, nb_randomized_params=0, ): """ Initialize transformer. Parameters ---------- dataset_address : filepath to the dataset containing CRN traces. Data in the file should be of shape [nb_traces, nb_steps, nb_features]. If with_timestamps is True, the first feature is considered as time. with_timestamps : boolean, whether time is included in data (as the very first feature) Data produced by scripts/simulate_data_gillespy.py has time, therefore default values is True. """ self.nb_trajectories = None self.nb_timesteps = None self.nb_features = None self.labels = None self.with_labels = False self.with_timestamps = with_timestamps self.nb_randomized_params = nb_randomized_params self._scaler = None self.scaler_is_fitted = False self.scaler_positivity = None self.dtype = np.float32 self.read_data(dataset_address) @property def scaler(self): return self._scaler def read_data(self, dataset_address): """Read data and memorize shape.""" with open(dataset_address, 'rb') as data_file: self.data = np.asarray(np.load(data_file), dtype=self.dtype) self._memorize_dataset_shape() def _memorize_dataset_shape(self): """Memorize data shape.""" if self.data.ndim != 3: raise ShapeError(f"The dataset is not properly formatted.\n" f"We expect the following shape: " f"(nb_trajectories, nb_timesteps, nb_features),\n" f"got: {self.data.shape}") self.nb_trajectories, self.nb_timesteps, self.nb_features = self.data.shape def set_labels(self, labels): """ Set labels for species. Parameters ---------- labels : list of species names. Length of the list and the order of names should coincide with the species presented in data (excluding `time`). """ if labels is None: self.labels = None self.with_labels = False else: if self.with_timestamps: labels = ['timestamp'] + labels if len(labels) != self.nb_features: raise ShapeError( f"There needs to be exactly one label for each feature.\n" f"We have {len(labels)} labels for {self.nb_features} features." ) self.labels = bidict(zip(range(len(labels)), labels)) self.with_labels = True def drop_timestamps(self): """Drop time from data.""" if self.with_timestamps is True: self.data = self.data[..., 1:] self.nb_features = self.nb_features - 1 self.with_timestamps = False self._memorize_dataset_shape() if self.with_labels is True: self.labels.inv.pop('timestamp') self.labels = bidict( zip([k - 1 for k in self.labels.keys()], self.labels.values())) def _create_scaler(self, positivity): self.scaler_positivity = positivity if positivity is True: eps = 1e-9 self._scaler = MinMaxScaler(feature_range=(eps, 1)) else: self._scaler = StandardScaler() self.scaler_is_fitted = False def _fit_scaler(self, positivity=False, slice_size=None): if (self._scaler is None) or (self.scaler_positivity != positivity): self._create_scaler(positivity) if not self.scaler_is_fitted: LOGGER.info(f"Fitting scaler, positivity={positivity}") if slice_size is None: self._scaler.fit(self.data.reshape(-1, self.nb_features)) else: n_slices = self.nb_trajectories // slice_size for i in tqdm(range(n_slices)): data_slice = self.data[i * slice_size:(i + 1) * slice_size, ...] data_slice = data_slice.reshape(-1, self.nb_features) self._scaler.partial_fit(data_slice) if self.nb_trajectories % slice_size != 0: data_slice = self.data[n_slices * slice_size:, ...] 
data_slice = data_slice.reshape(-1, self.nb_features) self._scaler.partial_fit(data_slice) self.scaler_is_fitted = True def rescale(self, data): """ Apply scaler to data. Parameters ---------- data : data to rescale. Returns ------- data : rescaled data. """ # return self.scaler.transform(data) if isinstance(self.scaler, StandardScaler): try: data = (data - self.scaler.mean_) / self.scaler.scale_ except ValueError: data = (data - self.scaler.mean_[:-self.nb_randomized_params]) \ / self.scaler.scale_[:-self.nb_randomized_params] elif isinstance(self.scaler, MinMaxScaler): try: data = (data * self.scaler.scale_) + self.scaler.min_ except ValueError: data = (data * self.scaler.scale_[:-self.nb_randomized_params]) \ + self.scaler.min_[:-self.nb_randomized_params] return data def scale_back(self, data): """ Apply scaler inverse transform, returning data to the original scale. Parameters ---------- data : data (rescaled). Returns ------- data : data scaled back. """ # return self.scaler.inverse_transform(data) if isinstance(self.scaler, StandardScaler): try: data = data * self.scaler.scale_ + self.scaler.mean_ except ValueError: data = data * self.scaler.scale_[:-self.nb_randomized_params] \ + self.scaler.mean_[:-self.nb_randomized_params] elif isinstance(self.scaler, MinMaxScaler): try: data = (data - self.scaler.min_) / self.scaler.scale_ except ValueError: data = (data - self.scaler.min_[:-self.nb_randomized_params]) \ / self.scaler.scale_[:-self.nb_randomized_params] return data def _shuffle_data(self): np.random.shuffle(self.data) def _transitions_from_a_batch_of_trajectories( self, trajectories, nb_past_timesteps, ): x_data = [] y_data = [] for timestep in range(self.nb_timesteps - nb_past_timesteps): x_data.append(trajectories[:, timestep:(timestep + nb_past_timesteps), :]) y_data.append(trajectories[:, timestep + nb_past_timesteps, :]) x_data = np.concatenate(x_data, axis=0) y_data = np.concatenate(y_data, axis=0) return x_data, y_data def _transitions_generator( self, trajectories, nb_past_timesteps, slice_size=None, rescale=False, ): self._check_nb_past_timesteps(nb_past_timesteps) nb_trajectories = trajectories.shape[0] if slice_size: n_slices = nb_trajectories // slice_size additive = 0 if nb_trajectories % slice_size == 0 else 1 else: n_slices = 1 additive = 0 slice_size = nb_trajectories for i in range(n_slices + additive): if i == n_slices: x_data, y_data = self._transitions_from_a_batch_of_trajectories( trajectories[slice_size * n_slices:nb_trajectories], nb_past_timesteps, ) else: x_data, y_data = self._transitions_from_a_batch_of_trajectories( trajectories[slice_size * i:slice_size * (i + 1)], nb_past_timesteps, ) if rescale: x_data = self.rescale(x_data) y_data = self.rescale(y_data) yield x_data, y_data def _train_test_generators( self, nb_past_timesteps, test_fraction=0.2, slice_size=None, rescale=False, ): n_train_trajectories = int((1. - test_fraction) * self.nb_trajectories) train_gen = self._transitions_generator( self.data[:n_train_trajectories], nb_past_timesteps, slice_size, rescale, ) test_gen = self._transitions_generator( self.data[n_train_trajectories:], nb_past_timesteps, slice_size, rescale, ) return train_gen, test_gen def get_train_test_data_generators( self, nb_past_timesteps=1, test_fraction=0.2, keep_timestamps=False, rescale=True, positivity=True, shuffle=True, slice_size=None, ): """ Produce data generators, yielding chunks of transformed data, containing (optionally) rescaled training examples. 
Each training example is a single transition between states of the system: (x, y) = (trajectory[i:i+nb_past_timesteps], trajectory[i+nb_past_timesteps]) Parameters ---------- nb_past_timesteps : number of steps observed before each transition. test_fraction : float, fraction of data that will be used for test. keep_timestamps : boolean, whether to keep timestamps in data, default is False. rescale : boolean, whether data should be rescaled. positivity : boolean, if True, data will be rescaled between 0 and 1, otherwise standardized. shuffle : boolean, if True trajectories will be shuffled before producing training examples. slice_size : int, number of trajectories to process at once, optional. May be useful for large datasets to reduce memory consumption. If None, all trajectories used. Returns ------- (train_generator, test_generator) : iterable generators of training examples. Every iteration of generator yields training examples produced from `slice_size` number of trajectories. """ if keep_timestamps is False: self.drop_timestamps() if rescale is True: self._fit_scaler(positivity, slice_size) if shuffle is True: self._shuffle_data() return self._train_test_generators( nb_past_timesteps, test_fraction, slice_size, rescale, ) def _check_nb_past_timesteps(self, nb_past_timesteps): if nb_past_timesteps + 1 > self.nb_timesteps: raise ValueError('Too many past timesteps.') elif nb_past_timesteps < 1: raise ValueError( 'You need to consider at least 1 timestep in the past.') def _save_scaler(self, dataset_folder): scaler_fp = os.path.join(dataset_folder, 'scaler.pickle') with open(scaler_fp, 'wb') as file: pickle.dump(self.scaler, file) def save_data_for_ml_hdf5( self, dataset_folder, nb_past_timesteps=1, test_fraction=0.2, keep_timestamps=False, rescale=True, positivity=True, shuffle=True, slice_size=None, force_rewrite=False, ): """ Write training and test datasets to hdf5 files. Original trajectories are optionally scaled and split into training examples: (x, y) = (trajectory[i:i+nb_past_timesteps], trajectory[i+nb_past_timesteps]) Parameters ---------- dataset_folder : folder to save datasets nb_past_timesteps : number of steps observed before each transition. test_fraction : float, fraction of data that will be used for test. keep_timestamps : boolean, whether to keep timestamps in data, default is False. rescale : boolean, whether data should be rescaled. positivity : boolean, if True, data will be rescaled between 0 and 1, otherwise standardized. shuffle : boolean, if True trajectories will be shuffled before producing training examples. slice_size : int, number of trajectories to process at once, optional. May be useful for large datasets to reduce memory consumption. If None, all trajectories used. force_rewrite : boolean, if True, existing files will be rewritten. 
Returns ------- None """ train_gen, test_gen = self.get_train_test_data_generators( nb_past_timesteps=nb_past_timesteps, test_fraction=test_fraction, keep_timestamps=keep_timestamps, rescale=rescale, positivity=positivity, shuffle=shuffle, slice_size=slice_size, ) if rescale: self._save_scaler(dataset_folder) train_fp = os.path.join(dataset_folder, 'train_rescaled.hdf5') test_fp = os.path.join(dataset_folder, 'test_rescaled.hdf5') else: train_fp = os.path.join(dataset_folder, 'train.hdf5') test_fp = os.path.join(dataset_folder, 'test.hdf5') if force_rewrite: if os.path.exists(train_fp): os.remove(train_fp) if os.path.exists(test_fp): os.remove(test_fp) with h5py.File(train_fp, 'a', libver='latest') as df: df.create_dataset( 'x', shape=(0, nb_past_timesteps, self.nb_features), maxshape=(None, nb_past_timesteps, self.nb_features), chunks=True, ) df.create_dataset( 'y', shape=(0, self.nb_features - self.nb_randomized_params), maxshape=(None, self.nb_features - self.nb_randomized_params), chunks=True, ) for x, y in train_gen: n_new_items = x.shape[0] df['x'].resize(df['x'].shape[0] + n_new_items, axis=0) df['x'][-n_new_items:] = x df['y'].resize(df['y'].shape[0] + n_new_items, axis=0) df['y'][-n_new_items:] = y[..., :-self.nb_randomized_params] LOGGER.info(f"Train data saved to {train_fp}, \n" f"Shapes: x: {df['x'].shape}, y: {df['y'].shape}") with h5py.File(test_fp, 'a', libver='latest') as df: df.create_dataset( 'x', shape=(0, nb_past_timesteps, self.nb_features), maxshape=(None, nb_past_timesteps, self.nb_features), chunks=True, ) df.create_dataset( 'y', shape=(0, self.nb_features - self.nb_randomized_params), maxshape=(None, self.nb_features - self.nb_randomized_params), chunks=True, ) for x, y in test_gen: n_new_items = x.shape[0] df['x'].resize(df['x'].shape[0] + n_new_items, axis=0) df['x'][-n_new_items:] = x df['y'].resize(df['y'].shape[0] + n_new_items, axis=0) df['y'][-n_new_items:] = y[..., :-self.nb_randomized_params] LOGGER.info(f"Test data saved to {test_fp}, \n" f"Shapes: x: {df['x'].shape}, y: {df['y'].shape}") def save_data_for_ml_tfrecord( self, dataset_folder, nb_past_timesteps, test_fraction=0.2, keep_timestamps=False, rescale=True, positivity=True, shuffle=True, slice_size=None, force_rewrite=False, ): """ Write training and test datasets to TFRecord files. Original trajectories are optionally scaled and split into training examples: (x, y) = (trajectory[i:i+nb_past_timesteps], trajectory[i+nb_past_timesteps]) Parameters ---------- dataset_folder : folder to save datasets nb_past_timesteps : number of steps observed before each transition. test_fraction : float, fraction of data that will be used for test. keep_timestamps : boolean, whether to keep timestamps in data, default is False. rescale : boolean, whether data should be rescaled. positivity : boolean, if True, data will be rescaled between 0 and 1, otherwise standardized. shuffle : boolean, if True trajectories will be shuffled before producing training examples. slice_size : int, number of trajectories to process at once, optional. May be useful for large datasets to reduce memory consumption. If None, all trajectories used. force_rewrite : boolean, if True, existing files will be rewritten. 
Returns ------- None """ train_gen, test_gen = self.get_train_test_data_generators( nb_past_timesteps=nb_past_timesteps, test_fraction=test_fraction, keep_timestamps=keep_timestamps, rescale=rescale, positivity=positivity, shuffle=shuffle, slice_size=slice_size, ) if rescale: self._save_scaler(dataset_folder) train_fp = os.path.join(dataset_folder, 'train_rescaled.tfrecords') test_fp = os.path.join(dataset_folder, 'test_rescaled.tfrecords') else: train_fp = os.path.join(dataset_folder, 'train.tfrecords') test_fp = os.path.join(dataset_folder, 'test.tfrecords') if force_rewrite: if os.path.exists(train_fp): os.remove(train_fp) if os.path.exists(test_fp): os.remove(test_fp) def _float_feature(value): return tf.train.Feature(float_list=tf.train.FloatList(value=value)) def _int64_feature(value): return tf.train.Feature(int64_list=tf.train.Int64List(value=value)) def _create_example(x_arr, y_arr): x_shape = x_arr.shape x_arr = x_arr.reshape(-1) features = tf.train.Features( feature={ 'x': _float_feature(x_arr), 'y': _float_feature(y_arr), 'x_shape': _int64_feature(x_shape) }) return tf.train.Example(features=features) def _process_chunk(x, y): for idx in range(x.shape[0]): xi = x[idx] yi = y[idx] example = _create_example(xi, yi) writer.write(example.SerializeToString()) writer = tf.io.TFRecordWriter(train_fp) for x, y in train_gen: _process_chunk(x, y) writer = tf.io.TFRecordWriter(test_fp) for x, y in test_gen: _process_chunk(x, y)
def split_train_validation_test(multi_time_series_df,
                                valid_start_time,
                                test_start_time,
                                features,
                                time_step_lag=1,
                                horizon=1,
                                target='target',
                                time_format='%Y-%m-%d %H:%M:%S',
                                freq='H'):
    if not isinstance(features, list) or len(features) < 1:
        raise Exception(
            "Bad input for features. It must be a list of dataframe columns to use"
        )

    train = multi_time_series_df.copy()[multi_time_series_df.index < valid_start_time]
    train_features = train[features]
    train_targets = train[target]

    # X_scaler = MinMaxScaler()
    # target_scaler = MinMaxScaler()
    # y_scaler = MinMaxScaler()
    X_scaler = StandardScaler()
    target_scaler = StandardScaler()
    y_scaler = StandardScaler()

    # 'load' is our key target. If it is in features, then we scale it.
    # If it is not 'load', then we scale the first column.
    if 'load' in features:
        tg = train[['load']]
        y_scaler.fit(tg)
    else:
        tg = train[target]
        ## scale the first column
        y_scaler.fit(tg.values.reshape(-1, 1))

    train[target] = target_scaler.fit_transform(train_targets)
    X_scaler.fit(train_features)
    train[features] = X_scaler.transform(train_features)

    tensor_structure = {'X': (range(-time_step_lag + 1, 1), features)}
    train_inputs = TimeSeriesTensor(train, target=target, H=horizon, freq=freq,
                                    tensor_structure=tensor_structure)
    print(train_inputs.dataframe.head())

    look_back_dt = dt.datetime.strptime(valid_start_time, time_format) - \
        dt.timedelta(hours=time_step_lag - 1)
    valid = multi_time_series_df.copy()[(multi_time_series_df.index >= look_back_dt) &
                                        (multi_time_series_df.index < test_start_time)]
    valid_features = valid[features]
    valid[features] = X_scaler.transform(valid_features)
    tensor_structure = {'X': (range(-time_step_lag + 1, 1), features)}
    valid_inputs = TimeSeriesTensor(valid, target=target, H=horizon, freq=freq,
                                    tensor_structure=tensor_structure)
    print(valid_inputs.dataframe.head())

    # test set
    # look_back_dt = dt.datetime.strptime(test_start_time, '%Y-%m-%d %H:%M:%S') - dt.timedelta(hours=time_step_lag - 1)
    test = multi_time_series_df.copy()[test_start_time:]
    test_features = test[features]
    test[features] = X_scaler.transform(test_features)
    test_inputs = TimeSeriesTensor(test, target=target, H=horizon, freq=freq,
                                   tensor_structure=tensor_structure)

    print("time lag:", time_step_lag, "original_feature:", len(features))
    return train_inputs, valid_inputs, test_inputs, y_scaler
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet  # ElasticNet has two important parameters: alpha and l1_ratio
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
from matplotlib.ticker import LinearLocator

bostondata = load_boston()
boston_X = bostondata.data
boston_y = bostondata.target

scale = StandardScaler()
scale.fit(boston_X)
boston_x = scale.transform(boston_X)

pca = PCA(n_components=3)
# boston_x = pca.fit_transform(boston_x)

fig = plt.figure()
ax = plt.gca(projection='3d')
# ax.scatter(boston_x[:, 0], boston_x[:, 1], boston_x[:, 2], marker='o', c=boston_y)

x_tran, x_test, y_tran, y_test = train_test_split(boston_x, boston_y, test_size=0.3, random_state=42)
result = []
z = np.zeros(shape=(10, 10))
test_number = len(y_test)
for i in range(1, 11, 1):
    for j in range(1, 11, 1):
        clf = ElasticNet(alpha=i / 10, l1_ratio=j / 10).fit(x_tran, y_tran)
        y_pre = clf.predict(x_test)
        result.append([i, j, clf.score(x_test, y_test)])
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

bostondata = load_boston()  # load the Boston housing data
boston_X = bostondata.data
boston_y = bostondata.target

scale_boston = StandardScaler()  # standardize
scale_boston.fit(boston_X)
boston_x = scale_boston.transform(boston_X)

pca = PCA(n_components=2)  # reduce to two dimensions
pca.fit(boston_X)
dimesionpower = pca.explained_variance_ratio_
print(dimesionpower)

boston_x_train, boston_x_test, boston_y_train, boston_y_test = train_test_split(
    boston_x, boston_y, test_size=0.3, random_state=42)

result = {
    i / 10: Lasso(alpha=i / 10).fit(boston_x_train, boston_y_train).score(boston_x_test, boston_y_test)
    for i in range(1, 11, 1)
}

plt.plot(list(result.keys()), list(result.values()))
plt.show()
print(result)
# Judging by these results, the smaller the regularization coefficient the better;
# of course, this is before removing outliers, and alpha=0 works best.
# TODO: use clustering to clean the raw data and drop some of the outlier records.
n = len(X_all)
with open('diff_X.txt') as inFile:
    for line in inFile:
        if n == len(X_all):
            n = 0
            X_d.append([])
            X_d[-1].append(float(line.split('\t')[-1]))
            n += 1
        else:
            X_d[-1].append(float(line.split('\t')[-1]))
            n += 1

plt.figure(figsize=(7, 5))
scaler = StandardScaler()
X_scaled = scaler.fit(X_d).transform(X_d)
pca = PCA(n_components=2)
X_r = pca.fit(X_scaled).transform(X_scaled)
X_rx = [i[0] for i in X_r]
X_ry = [i[1] for i in X_r]

country_sp = []
for x in h_run:
    if run_toCountry[x] == 'Fiji' or run_toCountry[x] == 'United Republic of Tanzania' \
            or run_toCountry[x] == 'Madagascar' or run_toCountry[x] == 'Peru':
        if run_toCountry[x] == 'United Republic of Tanzania':
            country_sp.append('Tanzania')
        else:
                label='test set')


iris = load_iris()
iris_data = iris.data[:, [2, 3]]
print(iris_data)

# split the dataset
X_train, X_test, y_train, y_test = train_test_split(iris_data, iris.target,
                                                    test_size=0.3, random_state=0)

# standardize the features
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
X_combined_std = np.vstack((X_train_std, X_test_std))
y_combined = np.hstack((y_train, y_test))

# Train and evaluate the model; the L2 penalty is the default, so this is
# equivalent to model = LogisticRegression(penalty="l2")
model = LogisticRegression(C=1000.0, random_state=0)
model.fit(X_train_std, y_train)
model.predict_proba(np.array(X_test_std[0, :]).reshape(1, -1))

plot_decision_regions(X_combined_std, y_combined, classifier=model, test_idx=range(105, 150))