def testPinObject(self):
    """Round-trip a value pinned in the object store through a remote task."""
    pinned = pin_in_object_store("hello")

    @ray.remote
    def fetch():
        # The pinned object must be retrievable from inside a Ray worker.
        return get_pinned_object(pinned)

    self.assertEqual(ray.get(fetch.remote()), "hello")
def testFetchPinned(self):
    """A tune trainable can fetch a pinned object and terminate cleanly."""
    pinned = pin_in_object_store("hello")

    def train(config, reporter):
        # Accessing the pinned object must work inside the trial process.
        get_pinned_object(pinned)
        reporter(timesteps_total=100, done=True)

    register_trainable("f1", train)
    [trial] = run_experiments({"foo": {
        "run": "f1",
    }})
    self.assertEqual(trial.status, Trial.TERMINATED)
    self.assertEqual(trial.last_result[TIMESTEPS_TOTAL], 100)
# NOTE(review): this snippet is collapsed onto a single line and is truncated
# mid-way through the Experiment `config` dict (braces never close), so it
# cannot be reformatted safely. What is visible: it parses CLI args, starts
# Ray with one GPU, registers torch.Tensor (de)serializers, pins the train
# and dev sets in the object store, and begins building a tune Experiment
# named "speaker classification". The remainder lies outside this chunk.
if __name__ == "__main__": parser = argparse.ArgumentParser("Speech Verification") parser.add_argument("--ray", action='store_true', default=False) parser.add_argument("--data-parallel", action='store_true', default=False) parser.add_argument("--chunks", type=int, default=1) args = parser.parse_args() print(args) ray.init(num_gpus=1) ray.register_custom_serializer(torch.Tensor, serializer=serializer, deserializer=deserializer) # Load train set nspeakers, train_set = load_train_set(args) train_set_id = pin_in_object_store(train_set) print("Loaded train. pinned={}".format(True)) # Load dev set dev_set = load_dev_set(args) dev_set_id = pin_in_object_store(dev_set) print("Loaded dev. pinned={}".format(True)) tune.register_trainable('train_sc', train.Trainer) exp = Experiment( name="speaker classification", run='train_sc', config={ "stop": { 'training_iteration': 500
def run_ray_logistic(latents_path, tags, kf, idx, log_name):
    """Cross-validated MLP classification on PCA/Macau latents via Ray Tune.

    For each CV fold, builds train/validation latent datasets, pins them in
    the Ray object store, then runs a HyperBand-scheduled search over an L2
    penalty with one MLP per fold. Results are logged under *log_name*.

    Args:
        latents_path: directory passed to PCA_macau_samples.
        tags: label array indexed by the full-tensor indexes.
        kf: a scikit-learn KFold-like splitter.
        idx: array of full-tensor indexes to split.
        log_name: experiment name used as the run_experiments key.
    """
    ray.init(num_cpus=5, num_gpus=1)

    # Build one (train, val) latent dataset pair per CV fold.
    data_train_list = []
    data_val_list = []
    for train_split, val_split in kf.split(idx):
        fold_train_idx = idx[train_split]  # indexes from the full tensor
        fold_val_idx = idx[val_split]      # indexes from the full tensor
        latents_train, latents_val = PCA_macau_samples(dir_path=latents_path,
                                                       idx_train=fold_train_idx,
                                                       idx_val=fold_val_idx)
        data_train_list.append(latent_dataset(latents_train, tags[fold_train_idx]))
        data_val_list.append(latent_dataset(latents_val, tags[fold_val_idx]))

    # Pin once; every trial reads the folds from the object store.
    data_train = pin_in_object_store(data_train_list)
    data_val = pin_in_object_store(data_val_list)

    class train_class(Trainable):
        def _setup(self):
            self.device = torch.device("cuda:0")
            mod_opt = {'type': "plain_fact", 'cov': False, 'latents': 20}
            self.nfolds = self.config["nfolds"]
            self.mod = []
            self.dataloader = []
            self.data_val = get_pinned_object(data_val)
            # One model and one dataloader per fold.
            for fold in range(self.nfolds):
                fold_model = MLP_class_mod(
                    get_pinned_object(data_train)[fold].get_dim())
                fold_model.to(self.device)
                self.mod.append(fold_model)
                self.dataloader.append(
                    DataLoader(get_pinned_object(data_train)[fold],
                               batch_size=5000,
                               shuffle=True))
            self.timestep = 0
            print("SETUUUUP")

        def _train(self):
            self.timestep += 1
            print("Timestep")
            print(self.timestep)
            # Select learning rate depending on the epoch.
            if self.timestep < 40:
                l_r = 0.005
            elif self.timestep < 60:
                l_r = 0.0015
            else:
                l_r = 0.0005

            auc_mean_folds = 0
            for fold in range(self.nfolds):
                optimizer = torch.optim.Adam(self.mod[fold].parameters(),
                                             lr=l_r,
                                             weight_decay=self.config["L2"])
                criterion = nn.BCEWithLogitsLoss()
                total_loss = 0
                # One epoch over the fold's training data.
                for batch_idx, sampled_batch in enumerate(self.dataloader[fold]):
                    optimizer.zero_grad()
                    target = sampled_batch[1].to(self.device)
                    preds = self.mod[fold].fwd(sampled_batch[0].to(self.device))
                    loss = criterion(preds, target)
                    loss.backward()
                    optimizer.step()
                # Validation AUC for this fold (no gradients needed).
                with torch.no_grad():
                    loss_val = 0
                    target = self.data_val[fold].tags.to(self.device)
                    preds = self.mod[fold].fwd(
                        self.data_val[fold].latents.to(self.device))
                    loss_val += roc_auc_score(target, preds)
                    auc_mean = loss_val
                auc_mean_folds += auc_mean

            # Report the mean validation AUC across folds.
            return {
                "mean_accuracy": (auc_mean_folds / self.nfolds),
                "time_steps_this_iter": 1
            }

        def _save(self, checkpoint_dir):
            print("Saving")
            path = os.path.join(checkpoint_dir, "checkpoint")
            # Save every fold's weights in one list, plus the timestep.
            state_dict_list = []
            for fold in range(self.nfolds):
                state_dict_list.append(self.mod[fold].state_dict())
            torch.save(state_dict_list, path)
            print("SAVIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIING")
            np.save(path + "_timestep.npy", self.timestep)
            return path

        def _restore(self, checkpoint_path):
            print("LOAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADING")
            state_dict_list = torch.load(checkpoint_path)
            for fold in range(self.nfolds):
                self.mod[fold].load_state_dict(state_dict_list[fold])
            self.timestep = np.load(checkpoint_path + "_timestep.npy").item()

    tune.register_trainable("my_class", train_class)
    hyperband = HyperBandScheduler(time_attr="timesteps_total",
                                   reward_attr="mean_accuracy",
                                   max_t=100)
    exp = {
        'run': "my_class",
        'num_samples': 50,
        'trial_resources': {"gpu": 1},
        'stop': {"training_iteration": 100},
        'config': {
            "L2": lambda spec: 10**(8 * random.random() - 4),
            "nfolds": kf.get_n_splits()
        }
    }
    tune.run_experiments({log_name: exp}, scheduler=hyperband)
# Persist run parameters and an empty counts grid, then reopen the HDF5
# file for read/write access.
h5.attrs['alpha'] = alpha
h5['counts'] = np.zeros(shape=(resolution, resolution), dtype=np.uint32)
h5 = h5py.File(f_save, 'r+')

# Reuse the cached target grid when present; otherwise compute and store it.
if 'zi' not in h5:
    zi = grid_targets(alpha, iterations)
    h5['zi'] = zi
else:
    zi = h5['zi'][...]

# Pin the grid once and fan out one remote task per parallel iteration,
# each with a distinct seed.
object_ids = []
zi_obj = pin_in_object_store(zi)
for k in range(parallel_iterations):
    task = compute_set.remote(N, alpha, iterations, resolution, extent,
                              zi_obj, seed=k)
    object_ids.append(task)

# Accumulate results as tasks complete, drawing them one at a time with
# ray.wait so the progress bar advances per finished task.
counts = np.zeros((resolution, resolution), dtype=np.uint64)
with tqdm(total=len(object_ids)) as progress:
    while object_ids:
        obj, object_ids = ray.wait(object_ids, num_returns=1)
        counts += ray.get(obj[0]).copy()
        progress.update()
# NOTE(review): collapsed single-line chunk that begins mid-way through a
# `_save`-style checkpoint dict (the opening of the dict and the enclosing
# method live outside this chunk), so it cannot be reformatted safely.
# Visible behavior: checkpoint save/restore for model+optimizer state, then
# a __main__ block that builds a random TensorDataset, pins it, registers a
# 'train_sc' trainable, and runs a one-timestep tune Experiment.
'optimizer': self.optimizer.state_dict() } torch.save(cpd, checkpoint_dir + "/save") def _restore(self, path): cpd = torch.load(path) self.iteration = cpd['iteration'] self.sc.load_state_dict(cpd['state_dict']) self.optimizer.load_state_dict(cpd['optimizer']) if __name__ == "__main__": ray.init() dset = TensorDataset( torch.randn(100, 64, 1024), torch.randn(100, 1024), torch.randint(100, size=(100, )).type(torch.LongTensor)) dset_id = pin_in_object_store(dset) tune.register_trainable('train_sc', Trainer) exp = Experiment(name="speaker classification", run='train_sc', stop={"timesteps_total": 1}, config={ "lr": 1e-3, "dset_id": dset_id, "nspeakers": 100, "batch_size": 1, }) tune.run_experiments(exp)
# NOTE(review): collapsed single-line chunk, truncated at both ends — it
# starts with the tail of a `_save` method (`return path`) and stops inside
# the `trial_resources` dict of the tune experiment spec, so it cannot be
# reformatted safely. Visible behavior: checkpoint restore, then a __main__
# block that either trains directly (--unique) or pins MIMIC LSTM datasets
# and launches an AsyncHyperBand-scheduled tune run.
return path def _restore(self,checkpoint_path): print("LOAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADING") self.mod.load_state_dict(torch.load(checkpoint_path)) self.timestep=np.load(checkpoint_path+"_timestep.npy").item() if __name__=="__main__": opt=parser.parse_args() if opt.unique: train(torch.device("cuda:0"),opt.maxepochs,opt.L2) else: ray.init(num_cpus=10,num_gpus=2) data_train=pin_in_object_store(LSTMDataset_ByPat(file_path="~/Data/MIMIC/")) data_val=pin_in_object_store(LSTMDataset_ByPat(csv_file_serie="LSTM_tensor_val.csv",file_path="~/Data/MIMIC/",cov_path="LSTM_covariates_val",tag_path="LSTM_death_tags_val.csv")) means_df=pd.Series.from_csv("~/Data/MIMIC/mean_features.csv") means_vec=pin_in_object_store(torch.tensor(means_df.as_matrix(),dtype=torch.float)) tune.register_trainable("my_class", train_class) hyperband=AsyncHyperBandScheduler(time_attr="training_iteration",reward_attr="mean_accuracy",max_t=350,grace_period=15) exp={ 'run':"my_class", 'repeat':30, 'stop':{"training_iteration":350}, 'trial_resources':{ "gpu":1, "cpu":1
def main():
    """Hyper-parameter search for the constrained-classification model.

    Loads positive/negative article splits, pins the dataframes in the Ray
    object store, then runs a HyperOpt-guided tune search with a median
    stopping rule and prints the best config by MCC.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--train-pos-path",
        default='datasets/constrained_classification/k16/pos.train.article.txt',
    )
    parser.add_argument(
        "--train-neg-path",
        default='datasets/constrained_classification/k16/neg.train.article.txt',
    )
    parser.add_argument(
        "--eval-pos-path",
        default='datasets/constrained_classification/k16/pos.valid.article.txt',
    )
    parser.add_argument(
        "--eval-neg-path",
        default='datasets/constrained_classification/k16/neg.valid.article.txt',
    )
    args = parser.parse_args()

    ray.init()

    # Load a 1% sub-sample of each split for the search.
    train_df = load_df(args.train_pos_path, args.train_neg_path, 0.01)
    eval_df = load_df(args.eval_pos_path, args.eval_neg_path, 0.01)
    class_weights = class_weight.compute_class_weight('balanced', [0, 1],
                                                      train_df.label)

    # Share the dataframes with every trial via the object store.
    train_df_id = pin_in_object_store(train_df)
    eval_df_id = pin_in_object_store(eval_df)

    model_args = {
        'evaluate_during_training': True,
        'log_tune': True,
        'train_batch_size': 32,
        'gradient_accumulation_steps': 1,
        'eval_batch_size': 32,
        'num_train_epochs': 1,
        'eval_steps': 10000000,
        'save_steps': 10000000,
        'cache_dir': os.path.join(os.getcwd(), 'cache_dir'),
        'overwrite_output_dir': True
    }
    config = {
        'warmup_ratio': 0.04,
        'train_df_id': train_df_id,
        'eval_df_id': eval_df_id,
        'model_args': model_args,
        'class_weights': class_weights
    }

    # HyperOpt search space plus a known-good starting point.
    space = {
        'adam_epsilon': hp.loguniform('adam_epsilon', np.log(1e-8),
                                      np.log(1e-7)),
        'weight_decay': hp.choice('weight_decay', [0, 0.01]),
        'learning_rate': hp.loguniform('learning_rate', np.log(1e-6),
                                       np.log(1e-4)),
        'max_seq_length': hp.quniform('max_seq_length', 96, 160, 1),
    }
    current_best_params = [{
        "adam_epsilon": 1e-8,
        'weight_decay': 0,
        'learning_rate': 1e-5,
        'max_seq_length': 128
    }]
    algo = HyperOptSearch(space,
                          metric="mcc",
                          max_concurrent=5,
                          mode="max",
                          points_to_evaluate=current_best_params)

    resources_per_trial = {
        "cpu": 8,
        "gpu": 1,
    }
    analysis = tune.run(Classifier,
                        config=config,
                        search_alg=algo,
                        resources_per_trial=resources_per_trial,
                        scheduler=tune.schedulers.MedianStoppingRule(
                            time_attr='training_iteration',
                            metric='mcc',
                            mode='max',
                            grace_period=3))
    print("Best config: ", analysis.get_best_config(metric='mcc'))
# NOTE(review): collapsed single-line chunk, truncated just after opening the
# experiment's 'config' dict, so it cannot be reformatted safely. Visible
# behavior: splits latents into train/val, reads MIMIC death tags sorted by
# UNIQUE_ID, pins latent_dataset wrappers for both splits, and begins
# configuring an AsyncHyperBand tune run. Uses the deprecated
# DataFrame.as_matrix() — presumably an old pandas; verify before reuse.
latents_train = latents[:n_train, :] latents_val = latents[n_train:n_train + n_val, :] print(latents_train.shape) print(latents_val.shape) tags_train = pd.read_csv("~/Data/MIMIC/Clean_data/LSTM_death_tags_train.csv" ).sort_values("UNIQUE_ID") tag_mat_train = tags_train[["DEATHTAG", "UNIQUE_ID"]].as_matrix()[:, 0] tags_val = pd.read_csv( "~/Data/MIMIC/Clean_data/LSTM_death_tags_val.csv").sort_values("UNIQUE_ID") tag_mat_val = tags_val[["DEATHTAG", "UNIQUE_ID"]].as_matrix()[:, 0] print(tag_mat_train.shape) print(tag_mat_val.shape) data_train = pin_in_object_store(latent_dataset(latents_train, tag_mat_train)) data_val = pin_in_object_store(latent_dataset(latents_val, tag_mat_val)) tune.register_trainable("my_class", train_class) hyperband = AsyncHyperBandScheduler(time_attr="training_iteration", reward_attr="mean_accuracy", max_t=100) exp = { 'run': "my_class", 'num_samples': 30, 'stop': { "training_iteration": 100 }, 'config': {
# NOTE(review): collapsed single-line chunk, truncated at both ends — it
# starts inside a `_save` method and stops inside the 'trial_resources'
# dict, so it cannot be reformatted safely. Visible behavior: checkpoint
# save tail and restore, then Ray init and pinning of TensorFactDataset
# train/val sets, followed by a HyperBand-scheduled tune spec keyed on
# "neg_mean_loss".
#torch.cuda.empty_cache() np.save(path + "_timestep.npy", self.timestep) return path def _restore(self, checkpoint_path): print("LOAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAADING") self.mod.load_state_dict(torch.load(checkpoint_path)) self.timestep = np.load(checkpoint_path + "_timestep.npy").item() #shutil.rmtree("/home/edward/ray_results/my_Experiment/") ray.init(num_gpus=3, num_cpus=10) data_train = pin_in_object_store( TensorFactDataset(csv_file_serie="complete_tensor_train1.csv", cov_path="complete_covariates")) data_val = pin_in_object_store( TensorFactDataset(csv_file_serie="complete_tensor_val1.csv", cov_path="complete_covariates")) tune.register_trainable("my_class", train_class) hyperband = HyperBandScheduler(time_attr="timesteps_total", reward_attr="neg_mean_loss", max_t=100) exp = { 'run': "my_class", 'trial_resources': { "gpu": 1
# NOTE(review): collapsed single-line chunk, truncated inside the final
# `train_step.run(feed_dict={` call, so it cannot be reformatted safely.
# Visible behavior: imports, Ray init, pinning of an MNIST data grabber,
# and a tune train function that builds a TF graph, trains for 2000 steps,
# and reports accuracy every 100 steps via the reporter callback.
import ray import ray.tune as tune from ray.tune.util import pin_in_object_store, get_pinned_object import tensorflow as tf from tf_train_simple.mnist_data_grabber import DataGrab from tf_train_simple.mnist_model_builder import build_model ray.init() data = pin_in_object_store(DataGrab('/tmp/ray/tf/mnist/input_data')) def train_func(config, reporter): # add a reporter arg my_lr = config["lr"] cur_data = get_pinned_object(data) with tf.Session() as sess: x, y_, keep_prob, train_step, accuracy = build_model(my_lr) print('number of global vars:', len(tf.global_variables())) sess.run(tf.global_variables_initializer()) for i in range(2000): batch = cur_data.get_next_train(50) if i % 100 == 0: train_accuracy = accuracy.eval(feed_dict={ x: batch[0], y_: batch[1], keep_prob: 1.0 }) print('step %d, learning rate %f training accuracy %g' % (i, my_lr, train_accuracy)) reporter(timesteps_total=i, mean_accuracy=train_accuracy) train_step.run(feed_dict={
# NOTE(review): collapsed single-line chunk that begins with the tail of a
# `_train`-style method (`return TrainingResult(...)` from an old Ray Tune
# API) whose header lies outside this chunk, so it cannot be reformatted
# safely. Visible behavior: no-op _save/_restore stubs, then a script that
# loads latents from sys.argv[1], pins latents.T and death tags, and builds
# a HyperBand tune spec sampling "C" and "gamma" log-uniformly.
return TrainingResult(mean_accuracy=auc/n_splits,timesteps_this_iter=1) def _save(self,checkpoint_dir): return path def _restore(self,checkpoint_path): return checkpoint_path file_path=sys.argv[1:][0] # This file should contain a numpy array with the latents and the label as first columnself. ray.init(num_cpus=3) latents=np.load(file_path) tags=pd.read_csv("~/Data/MIMIC/complete_death_tags.csv").sort_values("UNIQUE_ID") tag_mat=tags[["DEATHTAG","UNIQUE_ID"]].as_matrix()[:,0] x=pin_in_object_store(latents.T) y=pin_in_object_store(tag_mat) tune.register_trainable("my_class", train_class) hyperband=HyperBandScheduler(time_attr="timesteps_total",reward_attr="mean_accuracy",max_t=100) exp={ 'run':"my_class", 'repeat':50, 'stop':{"training_iteration":1}, 'config':{ "C":lambda spec: 10**(8*random.random()-4), "gamma":lambda spec: 10**(8*random.random()-4), } }
# NOTE(review): collapsed single-line chunk, truncated inside the
# AsyncHyperBandScheduler(...) call at the end, so it cannot be reformatted
# safely. Visible behavior: _save/_restore checkpoint methods (weights plus
# a "_timestep.npy" sidecar), then a __main__ block pinning GRU_teach_dataset
# train/val/test splits from MIMIC clean data and beginning scheduler setup.
def _save(self, checkpoint_dir): path = os.path.join(checkpoint_dir, "checkpoint") torch.save(self.mod.state_dict(), path) np.save(path + "_timestep.npy", self.timestep) return path def _restore(self, checkpoint_path): self.mod.load_state_dict(torch.load(checkpoint_path)) self.timestep = np.load(checkpoint_path + "_timestep.npy").item() if __name__ == "__main__": #train() ray.init(num_cpus=10, num_gpus=2) data_train = pin_in_object_store( GRU_teach_dataset(file_path="~/Data/MIMIC/Clean_data/")) data_val = pin_in_object_store( GRU_teach_dataset(file_path="~/Data/MIMIC/Clean_data/", csv_file_serie="LSTM_tensor_val.csv", cov_path="LSTM_covariates_val.csv", tag_path="LSTM_death_tags_val.csv")) data_test = pin_in_object_store( GRU_teach_dataset(file_path="~/Data/MIMIC/Clean_data/", csv_file_serie="LSTM_tensor_test.csv", cov_path="LSTM_covariates_test.csv", tag_path="LSTM_death_tags_test.csv")) tune.register_trainable("my_class", train_class) hyperband = AsyncHyperBandScheduler(time_attr="training_iteration", reward_attr="mean_accuracy",
def main(args):
    """HyperBand + HyperOpt hyper-parameter search for resnet152.

    Pins the train/valid data loaders and the parsed CLI args in the Ray
    object store, registers the trainable, and launches the experiments,
    optionally starting TensorBoard alongside.
    """
    ray.init(num_cpus=args.rayNumCpu, num_gpus=args.rayNumGpu)

    # Build the data loaders once and share them with every trial through
    # the object store.
    t_loader, v_loader = get_loaders(train_batch_size=16,
                                     num_workers=1,
                                     data_folder=args.dataFolder,
                                     cuda_available=torch.cuda.is_available())
    pinned_obj_dict['data_loader_train'] = pin_in_object_store(t_loader)
    pinned_obj_dict['data_loader_valid'] = pin_in_object_store(v_loader)
    pinned_obj_dict['args'] = pin_in_object_store(args)

    trainable_name = 'hyp_search_train'
    register_trainable(trainable_name, TrainerClass)
    reward_attr = "acc"

    # Async hyperband prunes under-performing trials early.
    hpb = AsyncHyperBandScheduler(time_attr="training_iteration",
                                  reward_attr=reward_attr,
                                  grace_period=40,
                                  max_t=300)

    # HyperOpt search space over optimizer settings.
    # Adadelta was tried and gave the worst results, hence only SGD/Adam.
    space = {
        'lr': hp.uniform('lr', 0.001, 0.1),
        'optimizer': hp.choice("optimizer", ['SGD', 'Adam']),
        'batch_accumulation': hp.choice("batch_accumulation", [4, 8, 16])
    }
    hos = HyperOptSearch(space, max_concurrent=4, reward_attr=reward_attr)

    exp_name = "resnet152_hyp_search_hyperband_hyperopt_{}".format(
        time.strftime("%Y-%m-%d_%H.%M.%S"))
    exp = Experiment(
        name=exp_name,
        run=trainable_name,
        num_samples=args.numSamples,  # the number of experiments
        resources_per_trial={
            "cpu": args.trialNumCpu,
            "gpu": args.trialNumGpu
        },
        checkpoint_freq=args.checkpointFreq,
        checkpoint_at_end=True,
        stop={
            reward_attr: 0.95,
            # how many times a specific config will be trained
            "training_iteration": args.trainingIteration,
        })

    # Optionally surface results in TensorBoard while the search runs.
    if args.runTensorBoard:
        thread = threading.Thread(target=launch_tensorboard, args=[exp_name])
        thread.start()
        launch_tensorboard(exp_name)

    run_experiments(exp, search_alg=hos, scheduler=hpb, verbose=False)
# NOTE(review): collapsed single-line chunk, truncated inside the `exp` dict,
# so it cannot be reformatted safely. Visible behavior: loads patient/feature/
# time latents, reads death tags, splits indexes 80/20, pins reconstructed_ts
# datasets, and begins a HyperBand tune spec.
# NOTE(review): `data_train` is assigned twice — the second pin (built from
# test_idx) presumably should be `data_val`/`data_test`; as written the
# train-split pin is discarded. Confirm against the original source.
ray.init(num_cpus=10) latents_pat = np.load(file_path_pat) latents_feat = np.load(file_path_feat) latents_time = np.load(file_path_time) tags = pd.read_csv("~/Data/MIMIC/LSTM_death_tags_train.csv").sort_values( "UNIQUE_ID") tag_mat = tags[["DEATHTAG", "UNIQUE_ID"]].as_matrix()[:, 0] train_idx, test_idx = train_test_split(np.arange(tag_mat.shape[0]), test_size=0.2, random_state=42) data_train = pin_in_object_store( reconstructed_ts(latents_pat[train_idx], latents_feat[train_idx], latents_time[train_idx], tag_mat[train_idx]), "~/Data/MIMIC/LSTM_tensor_train.csv", train_idx) data_train = pin_in_object_store( reconstructed_ts(latents_pat[test_idx], latents_feat[test_idx], latents_time[test_idx], tag_mat[test_idx]), "~/Data/MIMIC/LSTM_tensor_train.csv", test_idx) tune.register_trainable("my_class", train_class) hyperband = HyperBandScheduler(time_attr="timesteps_total", reward_attr="mean_accuracy", max_t=100) exp = { 'run': "my_class", 'repeat': 50,