def read_and_split_data_from_file(file_name):
    """Reads a dataset from file and splits it 70/20/10 into train/validate/test."""
    data_X0, data_Z, data_info = read_data(file_name)
    data_info['train_size'], data_info['validate_size'], data_info['test_size'] = 0.7, 0.2, 0.1
    X0_train, Z_train, X0_validate, Z_validate, X0_test, Z_test = train_test_split(
        X=data_X0, Z=data_Z,
        train_size=data_info['train_size'],
        validate_size=data_info['validate_size'])
    return X0_train, Z_train, X0_validate, Z_validate, X0_test, Z_test, data_info
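# NOTE: the `train_test_split` used above is not sklearn's two-way splitter
# but a project-specific three-way (train/validate/test) variant whose
# implementation is not shown here. The helper below is a minimal
# hypothetical sketch consistent with that call site; the name
# `three_way_split` and its internals are assumptions, not this project's
# actual code.
import numpy as np

def three_way_split(X, Z, train_size=0.7, validate_size=0.2, seed=None):
    """Shuffle once, then split X and Z along the first axis."""
    rng = np.random.default_rng(seed)
    idx = rng.permutation(len(X))
    n_train = int(len(X) * train_size)
    n_val = int(len(X) * validate_size)
    train_idx, val_idx, test_idx = (idx[:n_train],
                                    idx[n_train:n_train + n_val],
                                    idx[n_train + n_val:])
    return (X[train_idx], Z[train_idx],
            X[val_idx], Z[val_idx],
            X[test_idx], Z[test_idx])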
def load_CartPole_data():
    """Loads CartPole training data."""
    visited_states_df = np.load('Data/RL_Data/CP_states_df.npy')
    selected_actions_df = np.load(
        'Data/RL_Data/CP_selected_action_sequence_df.npy')
    # CartPole has no input map, so use an all-zeros placeholder.
    input_map_df = np.zeros((len(visited_states_df), 1))
    train_data, val_data, test_data = data.train_test_split(
        list(zip(input_map_df, visited_states_df, selected_actions_df)))
    assert all(len(sample[1]) == len(sample[2]) for sample in train_data), \
        'Lengths of states and actions are not equal'
    return train_data, val_data, test_data
def load_FrozenLake_data():
    """Loads FrozenLake training data."""
    input_map_df = data.load_np_data('Data/RL_Data/FL_input_map_df.npy')
    visited_states_df = np.load('Data/RL_Data/FL_states_df.npy',
                                allow_pickle=True)
    selected_actions_df = np.load(
        'Data/RL_Data/FL_selected_action_sequence_df.npy', allow_pickle=True)
    train_data, val_data, test_data = data.train_test_split(
        list(zip(input_map_df, visited_states_df, selected_actions_df)))
    assert all(len(sample[1]) == len(sample[2]) for sample in train_data), \
        'Lengths of states and actions are not equal'
    return train_data, val_data, test_data
def train_test(list_of_objects, object_names, target):
    """Splits each object into train/test and concatenates the results."""
    input_train = []
    input_test = []
    output_train = []
    output_test = []
    for obj in list_of_objects:
        # Split this object, then append the pieces to the existing lists.
        a, b, c, d = train_test_split(obj, object_names, target)
        input_train += a
        input_test += b
        output_train += c
        output_test += d
    return input_train, input_test, output_train, output_test
def dataset_sequences_experiment(exp_params, path_dict, reuse_sequences=None):
    t_exp = Timer()
    exp_params = dict(exp_params)
    ident = dict_get(exp_params, 'dataset_ident', default='', cast=str)
    start_time = t_exp.tic("Starting a 'dataset sequences' experiment. (%s)" % str(ident))

    # Get parameters
    t = Timer()
    t.tic("Parsing parameters ...")
    train_perc = dict_get(exp_params, 'train_perc', default=1., cast=float)
    gt_dir = dict_get(path_dict, 'gt_dir', default=None)
    check_dir(gt_dir)
    ds_params = dict_get(exp_params, 'dataset_params', default=dict(), cast=dict)

    # Either load and split fresh ground-truth sequences, or reuse a
    # previously computed (train, test) pair.
    if reuse_sequences is None or not isinstance(reuse_sequences, tuple) or len(reuse_sequences) != 2:
        gt_sequences, _, _ = get_dataset_sequences(ident, ds_params, gt_dir)
        train_X, test_X = train_test_split(gt_sequences, train_perc)
    else:
        train_X, test_X = reuse_sequences
        timestamp_msg("Reusing sequences ...")

    # Check gt_sequences: train and test must agree on the number of emissions.
    _, _, n_train_emissions = check_sequences(train_X)
    n_test_emissions = None
    if test_X is not None and len(test_X) > 0:
        _, _, n_test_emissions = check_sequences(test_X)
    _save_data(path_dict, train_X, test_X)
    if n_test_emissions is not None and n_train_emissions != n_test_emissions:
        raise Exception("Number of emissions in train and test sequences differs")
    exp_params['n_emissions'] = n_train_emissions

    exp_params = _parse_base_parameters(exp_params, path_dict)
    exp_params = _parse_standard_and_dense(exp_params, path_dict,
                                           exp_params['n_emissions'])
    _exp_params = _save_experiment_parameters(exp_params, path_dict)
    t.toc("Parameters parsed. Using parameters: %s" % str(_exp_params))

    if 'fair_standard_params' in exp_params:
        _standard_vs_dense(train_X, test_X,
                           (exp_params['standard_params'],
                            exp_params['fair_standard_params']),
                           exp_params['dense_params'])
    else:
        _standard_vs_dense(train_X, test_X,
                           exp_params['standard_params'],
                           exp_params['dense_params'])

    fin_time, diff = t_exp.toc("Finished a 'dataset sequences' experiment.")
def main(dataset, saved_model_path, _config, _log):
    policy = tf.saved_model.load(saved_model_path)
    # The saved policy's loss expects flat tensors; wrap it so callers can
    # pass nested structures.
    flat_loss = policy.loss
    policy.loss = lambda *structs: flat_loss(*tf.nest.flatten(structs))
    learner = Learner(policy=policy, **_config['learner'])

    _, test_paths = data.train_test_split(**dataset)

    embed_controller = embed.embed_controller_discrete  # TODO: configure
    data_config = dict(_config['data'], embed_controller=embed_controller)
    test_data = data.make_source(filenames=test_paths, **data_config)
    test_manager = train_lib.TrainManager(learner, test_data,
                                          dict(train=False))

    total_steps = 0
    for _ in range(1000):
        # now test
        test_stats = test_manager.step()
        train_lib.log_stats(ex, test_stats, total_steps)
        test_loss = test_stats['loss'].numpy()
        print(f'test_loss={test_loss:.4f}')
def load_data(features_dict):
    dataset = f'movielens/{FLAGS.dataset}-ratings'
    ratings = tfds.load(dataset, split='train', data_dir=FLAGS.data_dir)

    # Prepare for binarization: drop neutral ratings. Note that filter()
    # returns a new dataset, so the result must be assigned.
    ratings = ratings.filter(lambda x: x['user_rating'] != 3.0)
    ratings = prepare_dataset(ratings, features_dict)

    # Cache for efficiency
    ratings = ratings.cache(tempfile.NamedTemporaryFile().name)

    features = features_by_type(features_dict)
    categorical_features = features['string'] + features['integer']
    vocabularies = get_vocabularies(ratings, categorical_features)

    train, test = train_test_split(ratings, train_size=0.8, seed=FLAGS.seed)
    train_size = len(train)
    train = train.shuffle(train_size).batch(FLAGS.train_batch_size)
    test = test.batch(FLAGS.eval_batch_size)
    return train, test, vocabularies
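# The `train_test_split` above operates on a tf.data.Dataset, so it cannot be
# the array-based sklearn function. Below is a minimal hypothetical sketch of
# a dataset-level splitter matching that call shape. It assumes the dataset
# has a known, finite cardinality; note that tf.data's filter() makes the
# cardinality unknown, so a real implementation may need to count elements
# first.
import tensorflow as tf

def split_dataset(ds, train_size=0.8, seed=None):
    """Shuffle once with a fixed seed, then split by element count."""
    n = int(ds.cardinality().numpy())  # requires known, finite cardinality
    ds = ds.shuffle(n, seed=seed, reshuffle_each_iteration=False)
    n_train = int(n * train_size)
    return ds.take(n_train), ds.skip(n_train)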
metavar="/path/to/numpy/image/file", help='Path to numpy image file') parser.add_argument('--masks', required=False, metavar="/path/to/numpy/mask/file", help='Path to numpy mask file') args = parser.parse_args() assert args.command in [ 'imgmask', 'img', 'split', 'reset' ], "command must be in ['imgmask', 'img', 'split', 'reset']" train_directory = os.path.join(args.outdir, 'training_data') test_directory = os.path.join(args.outdir, 'testing_data') if args.command == 'imgmask': assert [args.images, args.masks ], 'imgmask requires the --images and --masks arguments' np_to_imgmask(args.images, args.masks, args.outdir) if args.command == 'img': assert args.images, "img requires the --images argument" np_to_img(args.images, args.outdir) if args.command == 'split': train_test_split(train_directory, test_directory, args.outdir, TRAIN_PERCENT) if args.command == 'reset': reset(train_directory, test_directory, args.outdir)
with open(CONFIGFILE, "r") as f:
    config = yaml.safe_load(f)  # safe_load: yaml.load without a Loader is deprecated

ap = ArgumentParser()
ap.add_argument('--inspect_data', action='store_true', default=False,
                help="plot training data for inspection")
ap.add_argument('--train', action='store_true', default=False,
                help="Run training")
ap.add_argument('--test', action='store_true', default=False,
                help="Run test")
args = ap.parse_args()

df = load_data()
milk = clean_data(df)
train, test = train_test_split(milk)

if args.inspect_data:
    print("RAW DATA")
    print(df.head())
    milk.plot()
    plt.show()
elif args.train:
    model = LSTMPredictor(config)
    model.fit(train['Milk Production'].values.reshape(1, -1))
    model.close()
elif args.test:
    model = LSTMPredictor(config)
    y_pred = model.infer(train['Milk Production'].values.reshape(1, -1), 12)
    y_pred = list(y_pred)
    test = test.copy()
print(opt)

root_dir = "/home/szhang67/data/raw_data"
preprocessed_dir = "/home/szhang67/data/preprocessed"

cuda = opt.cuda
if cuda and not torch.cuda.is_available():
    raise Exception("No GPU found, please run without --cuda")

torch.manual_seed(opt.seed)
if cuda:
    torch.cuda.manual_seed(opt.seed)

print('===> Loading datasets')
train_set, test_set = train_test_split(opt.upscale_factor, root_dir,
                                       preprocessed_dir, opt.split_ratio)
training_data_loader = DataLoader(dataset=train_set,
                                  num_workers=opt.threads,
                                  batch_size=opt.batchSize,
                                  shuffle=True)
testing_data_loader = DataLoader(dataset=test_set,
                                 num_workers=opt.threads,
                                 batch_size=opt.testBatchSize,
                                 shuffle=False)

print('===> Building model')
model = Net()
criterion = nn.MSELoss()
if cuda:
    model = model.cuda()
def train(
    dirpath,
    pairs,
    test_pairs=None,
    train_val_split_ratio=0.95,
    batch_size=512,
    num_workers=8,
    seed=1234,
    args={},
):
    set_seed(seed)
    src_lang, trg_lang = PolynomialLanguage.create_vocabs(pairs)

    train_pairs, val_pairs = train_test_split(
        pairs, train_test_split_ratio=train_val_split_ratio)

    train_tensors = pairs_to_tensors(train_pairs, src_lang, trg_lang)
    val_tensors = pairs_to_tensors(val_pairs, src_lang, trg_lang)

    collate_fn = Collater(src_lang, trg_lang)
    train_dataloader = DataLoader(
        SimpleDataset(train_tensors),
        batch_size=batch_size,
        collate_fn=collate_fn,
        num_workers=num_workers,
    )
    val_dataloader = DataLoader(
        SimpleDataset(val_tensors),
        batch_size=batch_size,
        collate_fn=collate_fn,
        num_workers=num_workers,
    )

    save_to_pickle = {
        "src_lang.pickle": src_lang,
        "trg_lang.pickle": trg_lang,
    }
    for k, v in save_to_pickle.items():
        with open(os.path.join(dirpath, k), "wb") as fo:
            pickle.dump(v, fo)

    model = Seq2Seq(src_lang, trg_lang, **vars(args)).to(device)

    checkpoint_callback = ModelCheckpoint(
        monitor="val_loss",
        dirpath=dirpath,
        filename="model",
        save_top_k=1,
        mode="min",
    )
    trainer = pl.Trainer.from_argparse_args(
        args,
        default_root_dir=dirpath,
        callbacks=[checkpoint_callback],
    )
    trainer.fit(model, train_dataloader, val_dataloader)

    # pylint: disable=no-member
    # After trainer.fit the model can end up on the CPU, so send it back to
    # the device so evaluate() doesn't break.
    model.to(device)

    if test_pairs:
        final_score = evaluate(model, test_pairs, batch_size=batch_size)
        with open(os.path.join(dirpath, "eval.txt"), "w") as fo:
            fo.write(f"{final_score:.4f}\n")

    return model
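# Hypothetical usage sketch for train(). `read_pairs` and the file paths are
# illustrative stand-ins, not part of this project's known API; the argparse
# wiring matches the pl.Trainer.from_argparse_args call used inside train().
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser = pl.Trainer.add_argparse_args(parser)
    args = parser.parse_args()

    pairs = read_pairs("data/train_set.txt")      # list of (src, trg) pairs
    test_pairs = read_pairs("data/test_set.txt")  # held out for final eval
    train("checkpoints/", pairs, test_pairs=test_pairs, args=args)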