def upload_confusion_matrices(task_id, search_name, search_key):
    search_dir = os.path.join(outputs_dir, f'ray_results/task{task_id}/{search_name}')
    for exp_dir in os.listdir(search_dir):
        if not exp_dir.startswith(search_key):
            continue
        exp_dir_path = os.path.join(search_dir, exp_dir)
        comet_dir = ''
        for name in os.listdir(exp_dir_path):
            if name.startswith('comet'):
                comet_dir = name
        if not comet_dir:
            continue
        comet_exp_key = comet_dir.split('-')[1]
        print(comet_exp_key)
        comet_exp = ExistingExperiment(previous_experiment=comet_exp_key)
        comet_dir_path = os.path.join(exp_dir_path, comet_dir)
        p = re.compile(r'confusion-matrix-epoch-(.*)\.json')
        for cm_file in os.listdir(comet_dir_path):
            epoch = int(p.findall(cm_file)[0])
            with open(os.path.join(comet_dir_path, cm_file)) as f:
                cm_json = json.load(f)
            comet_exp._log_asset_data(data=cm_json,
                                      file_name=cm_file,
                                      overwrite=True,
                                      epoch=epoch,
                                      asset_type='confusion-matrix')
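# A minimal usage sketch for the function above (not from the original source).
# It assumes `outputs_dir` is defined at module scope and that the Comet API key
# is available via the COMET_API_KEY environment variable, which
# ExistingExperiment reads when no api_key argument is passed.
if __name__ == '__main__':
    # Hypothetical Ray Tune layout: outputs_dir/ray_results/task3/grid_search/trainable_*
    upload_confusion_matrices(task_id=3,
                              search_name='grid_search',
                              search_key='trainable_')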
def setup_comet_ml(args, rank):
    # Dummy init of the experiment so it can be used without error
    # even if comet is disabled.
    experiment = Experiment(api_key='dummy_key', disabled=True)
    if args.comet_api_key:  # initiating comet
        if args.existing_exp_key:
            if rank == 0:
                print("STARTING FROM AN EXISTING EXPERIMENT")
            experiment = ExistingExperiment(
                api_key=args.comet_api_key,
                workspace=args.comet_workspace,
                project_name=args.project_name,
                previous_experiment=args.existing_exp_key,
                auto_output_logging="simple",
                auto_metric_logging=False,
                parse_args=False,
                disabled=args.disable_comet or rank != 0)
        else:
            if rank == 0:
                print("STARTING A NEW EXPERIMENT")
            experiment = Experiment(
                api_key=args.comet_api_key,
                workspace=args.comet_workspace,
                project_name=args.project_name,
                auto_output_logging="simple",
                auto_metric_logging=False,
                parse_args=False,
                disabled=args.disable_comet or rank != 0)
        experiment.log_asset('config.yaml')
        experiment.log_asset('config_prod.yaml')
        experiment.log_asset('config_prod_prime.yaml')
    return experiment
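# A hedged example of how `setup_comet_ml` might be driven in a multi-process
# run; the `args` fields below (comet_api_key, existing_exp_key, disable_comet,
# ...) are inferred from the attributes the function accesses, not from the
# original CLI definition.
import argparse

def _demo_setup(rank):
    args = argparse.Namespace(comet_api_key='',  # empty -> dummy disabled experiment
                              existing_exp_key='',
                              comet_workspace='my-workspace',
                              project_name='my-project',
                              disable_comet=False)
    experiment = setup_comet_ml(args, rank)
    # Only rank 0 actually reports; other ranks hold a disabled experiment,
    # so this call is a no-op everywhere except the master process.
    experiment.log_metric('rank', rank)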
def load_experiment(path_to_yml_file):
    config = load_yaml(path_to_yml_file)
    api_key = os.getenv('COMET_API_KEY', None)
    exp = None
    if not config['info']['experiment_key']:
        if api_key:
            exp = Experiment(api_key=api_key,
                             project_name=config['info']['project_name'])
            exp_key = exp.get_key()
        else:
            exp_key = make_random_string(20)
        os.environ['EXPERIMENT_KEY'] = exp_key
        _env_variables = env_variables + ['EXPERIMENT_KEY']
        config = load_yaml(path_to_yml_file, _env_variables)
        config['info']['experiment_key'] = exp_key
        path_to_yml_file = save_experiment(config, exp)
    else:
        logging.info(
            f"Experiment is already set up @ {config['info']['output_folder']}!")
        try:
            exp = ExistingExperiment(
                api_key=api_key,
                previous_experiment=config['info']['experiment_key'])
        except Exception:
            pass
    return config, exp, path_to_yml_file
def create_comet_experiment(args):
    if args.resume:
        experiment_key = input("Enter Comet ML key of experiment to resume: ")
        experiment = ExistingExperiment(api_key="jBFVYFo9VUsy0kb0lioKXfTmM",
                                        previous_experiment=experiment_key)
    elif args.no_comet:
        experiment = Experiment(api_key="jBFVYFo9VUsy0kb0lioKXfTmM",
                                project_name="test-runs")
    else:
        experiment = Experiment(api_key="jBFVYFo9VUsy0kb0lioKXfTmM",
                                project_name="fastdepth")
    return experiment
def __init__(self, disabled, is_existing=False, prev_exp_key=None):
    """
    Handles logging of the experiment to comet and also persistence to the
    local file system. Supports resumption of stopped experiments.
    """
    if not is_existing:
        self.experiment = Experiment(api_key=COMET_API_KEY,
                                     workspace=COMET_WORKSPACE,
                                     project_name=PROJECT_NAME,
                                     disabled=disabled)
    else:
        if prev_exp_key is None:
            raise ValueError("Requested existing experiment, but no key provided")
        print("Continuing existing experiment with key: ", prev_exp_key)
        self.experiment = ExistingExperiment(api_key=COMET_API_KEY,
                                             workspace=COMET_WORKSPACE,
                                             project_name=PROJECT_NAME,
                                             disabled=disabled,
                                             previous_experiment=prev_exp_key)
    self.disabled = disabled
    self.name = None
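# Usage sketch for the constructor above (hedged: COMET_API_KEY,
# COMET_WORKSPACE and PROJECT_NAME are module-level constants it assumes).
# A fresh run:
#     logger = CometLogger(disabled=False)
# Resuming a stopped run from its saved key:
#     logger = CometLogger(disabled=False, is_existing=True,
#                          prev_exp_key='previous-run-key')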
def __init__(
    self,
    batch_size: int,
    snapshot_dir: Optional[str] = None,
    snapshot_mode: str = "last",
    snapshot_gap: int = 1,
    exp_set: Optional[str] = None,
    use_print_exp: bool = False,
    saved_exp: Optional[str] = None,
    **kwargs,
):
    """
    :param kwargs: passed to comet's Experiment at init.
    """
    if use_print_exp:
        self.experiment = PrintExperiment()
    else:
        from comet_ml import Experiment, ExistingExperiment, OfflineExperiment

        if saved_exp:
            self.experiment = ExistingExperiment(
                previous_experiment=saved_exp, **kwargs
            )
        else:
            try:
                self.experiment = Experiment(**kwargs)
            except ValueError:  # no API key
                log_dir = Path.home() / "logs"
                log_dir.mkdir(exist_ok=True)
                self.experiment = OfflineExperiment(offline_directory=str(log_dir))

    self.experiment.log_parameter("complete", False)
    if exp_set:
        self.experiment.log_parameter("exp_set", exp_set)
    if snapshot_dir:
        snapshot_dir = Path(snapshot_dir) / self.experiment.get_key()

    # log_traj_window (int): How many trajectories to hold in the deque for
    # computing performance statistics.
    self.log_traj_window = 100
    self._cum_metrics = {
        "n_unsafe_actions": 0,
        "constraint_used": 0,
        "cum_completed_trajs": 0,
        "logging_time": 0,
    }
    self._new_completed_trajs = 0
    self._last_step = 0
    self._start_time = self._last_time = time()
    self._last_snapshot_upload = 0
    self._snaphot_upload_time = 30 * 60

    super().__init__(batch_size, snapshot_dir, snapshot_mode, snapshot_gap)
def launch_parallel_experiment(gpu_rank, api_key, experiment_keys,
                               experiment_params, repo_path):
    torch.cuda.set_device(gpu_rank)
    param = Parameters()
    param.segment_dataset = False
    param.model_backup_destination = (
        param.model_backup_destination + "/process_{}".format(gpu_rank))
    experiment = ExistingExperiment(api_key=api_key,
                                    previous_experiment=experiment_keys[gpu_rank],
                                    log_env_details=True,
                                    log_env_gpu=True,
                                    log_env_cpu=True)
    experiment.params = experiment_params[gpu_rank]
    repo = Repo(repo_path)

    with CometLogger(experiment, gpu_id=gpu_rank, print_to_comet_only=True):
        setup_comet_experiment(experiment, param, repo)

        CometLogger.print("-> loading experiment assets:")
        loss, model, optimizer, train_dataloader, valid_dataloader = \
            load_experiment_assets(param)

        if param.train:
            CometLogger.print("~~ Launching the training ~~")
            CometLogger.print(
                "Sleeping {} secs to reduce chances of deadlock.".format(gpu_rank))
            sleep(gpu_rank)
            launch_training(model, train_dataloader, valid_dataloader,
                            optimizer, loss, param)

        if param.test:
            CometLogger.print("~~ Testing the model ~~")
            launch_testing(model, param)

    del train_dataloader, valid_dataloader, model, optimizer, loss
    torch.cuda.empty_cache()
def __init__(self, comet_params, run_params=None, prev_exp_id=None):
    if prev_exp_id:  # previous experiment
        api_key = comet_params['api_key']
        # Remove the api_key because the remaining items are passed through.
        del comet_params['api_key']
        self.experiment = ExistingExperiment(api_key=api_key,
                                             previous_experiment=prev_exp_id,
                                             **comet_params)
        print(f'In CometTracker: ExistingExperiment initialized with id: {prev_exp_id}')
    else:  # new experiment
        self.experiment = Experiment(**comet_params)
    self.experiment.log_parameters(run_params)
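# A hedged usage sketch for the tracker above: comet_params mirrors the keyword
# arguments Experiment/ExistingExperiment accept; the values below are
# illustrative placeholders, not the original project's configuration.
comet_params_example = {
    'api_key': 'YOUR_API_KEY',
    'workspace': 'my-workspace',
    'project_name': 'my-project',
}
# New run:
#     tracker = CometTracker(dict(comet_params_example), run_params={'lr': 1e-3})
# Resume, reusing the id printed by a previous run:
#     tracker = CometTracker(dict(comet_params_example),
#                            run_params={'lr': 1e-3},
#                            prev_exp_id='previous-run-key')
# Note the defensive dict(...) copies: the constructor mutates comet_params
# (it deletes 'api_key') in the resume path.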
def get_comet_logger(self):
    if not self.paras.load:
        comet_exp = Experiment(project_name=COMET_PROJECT_NAME,
                               workspace=COMET_WORKSPACE,
                               auto_output_logging=None,
                               auto_metric_logging=None,
                               display_summary=False)
        if self.paras.transfer:
            comet_exp.set_name(self.exp_name)
            comet_exp.add_tag(Path(self.ckpdir).parent.name)
            comet_exp.add_tag('transfer')
            comet_exp.add_tag(self.config['data']['corpus']['metas'][0])
        if self.paras.test:
            comet_exp.set_name(Path(self.paras.outdir).name)
            comet_exp.add_tag(Path(self.paras.config).parents[2].name)
            comet_exp.add_tag('test')
            comet_exp.add_tag(Path(self.paras.config).parent.stem)
            # comet_exp.add_tag(Path(self.paras.outdir).name)
        else:
            comet_exp.add_tag('train')
        for name, param in self.config.items():
            if isinstance(param, dict):
                comet_exp.log_parameters(param, prefix=name)
            else:
                comet_exp.log_parameter(name, param)
        comet_exp.log_other('seed', self.paras.seed)
        with open(Path(self.logdir, 'exp_key'), 'w') as f:
            print(comet_exp.get_key(), file=f)
    else:
        with open(Path(self.logdir, 'exp_key'), 'r') as f:
            exp_key = f.read().strip()
        comet_exp = ExistingExperiment(previous_experiment=exp_key,
                                       project_name=COMET_PROJECT_NAME,
                                       workspace=COMET_WORKSPACE,
                                       auto_output_logging=None,
                                       auto_metric_logging=None,
                                       display_summary=False)
    return comet_exp
def log_hyperparameters_to_comet(clf, experiment):
    for i in range(len(clf.cv_results_["params"])):
        exp = Experiment(
            workspace="s0lvang",
            project_name="ideal-pancake-hyperparameter",
            api_key=globals.flags.comet_api_key,
        )
        exp.add_tag("hp_tuning")
        exp.add_tags(globals.comet_logger.get_tags())
        for k, v in clf.cv_results_.items():
            if k == "params":
                exp.log_parameters(v[i])
            else:
                exp.log_metric(k, v[i])
        exp.end()

    old_experiment = ExistingExperiment(
        api_key=globals.flags.comet_api_key,
        previous_experiment=experiment.get_key(),
    )
    globals.comet_logger = old_experiment
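# Sketch of how the helper above might be called after a scikit-learn search.
# It assumes `globals.flags.comet_api_key` and `globals.comet_logger` exist as
# in the snippet; the estimator and grid below are illustrative only.
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

def tune_and_log(X, y, experiment):
    clf = GridSearchCV(RandomForestClassifier(),
                       param_grid={'n_estimators': [50, 100]},
                       cv=3)
    clf.fit(X, y)
    # One comet experiment is created per candidate in cv_results_, then the
    # original experiment is re-attached as an ExistingExperiment.
    log_hyperparameters_to_comet(clf, experiment)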
def _init_comet(self):
    """ For more information on comet, see our doc/Getting Started """
    try:
        if self.comet_key:
            self.comet_exp = ExistingExperiment(
                previous_experiment=self.comet_key)
        elif self.comet_workspace:
            # New experiment: use the trainset name as the comet project name.
            project_name = self.comet_project
            self.comet_exp = CometExperiment(project_name=project_name,
                                             workspace=self.comet_workspace,
                                             log_code=False,
                                             log_graph=True,
                                             auto_param_logging=True,
                                             auto_metric_logging=False,
                                             parse_args=False,
                                             auto_output_logging='native',
                                             log_env_details=True,
                                             log_env_gpu=True,
                                             log_env_cpu=True,
                                             log_env_host=False,
                                             log_git_metadata=True,
                                             log_git_patch=True,
                                             display_summary=False)
            self.comet_exp.set_name(self.experiment_name)
            self.comet_exp.log_parameters(self.params)
            self.comet_key = self.comet_exp.get_key()
    except ConnectionError:
        self.logger.warning(
            "Could not connect to Comet.ml, metrics will not be logged "
            "online...")
        self.comet_exp = None
        self.comet_key = None
    ])
    logger = logging.getLogger()
    logger.info("Running new experiment")
    ex = Experiment(api_key=config.log.comet.api_key,
                    workspace=config.log.comet.workspace,
                    project_name=config.log.comet.project_name,
                    disabled=True,
                    auto_output_logging=None,
                    log_code=False)
    name = 'exp_{}'.format(config_id)
    config.general.exp_name = name
    ex.log_parameters(flatten_dictionary(config))
    ex.set_name(name)
    start(config, ex)
else:
    logging.info("Resuming old experiment with id {}".format(exp_id))
    config = get_config(config_id=config_id)
    logger = logging.getLogger()
    ex = ExistingExperiment(
        api_key=config.log.comet.api_key,
        previous_experiment=exp_id,
        workspace=config.log.comet.workspace,
        project_name=config.log.comet.project_name,
        disabled=config.log.comet.disabled,
        auto_output_logging=None,
        log_code=False,
    )
    name = 'exp_{}'.format(config_id)
    config.general.exp_name = name
    resume(config, ex)
def main():
    args = get_args()
    results_filename = (f"logs/{args.env_name}-seed-{args.seed}-num-steps-"
                        f"{args.num_steps}-num-env-steps-{args.num_env_steps}-results.csv")
    save_path = os.path.join(args.save_dir, args.algo, str(args.seed))

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.join(save_path, args.env_name)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, log_dir, device, False, args.custom_gym)
    if "Train" in args.env_name:
        test_envs = make_vec_envs(args.env_name.replace("Train", "Test"),
                                  args.seed, 1, args.gamma, log_dir, device,
                                  False, args.custom_gym)

    base = NaviBaseTemp
    obs_shape = envs.observation_space.shape
    save_j = 0

    try:
        os.makedirs(save_path)
    except FileExistsError:
        pass

    # Recover from job pre-emption
    try:
        actor_critic, ob_rms = \
            torch.load(os.path.join(save_path, args.env_name + ".pt"),
                       map_location='cpu')
        j = json.load(
            open(os.path.join(save_path, args.env_name + "-state.json"), 'r'))
        save_j = j['save_j']
        episode_total = j['episode_total']
        test_episode_total = j['test_episode_total']
        rollouts = pickle.load(
            open(os.path.join(save_path, args.env_name + "-rollout.pkl"), 'rb'))
        rollouts.to(device)
        obs = envs.reset()
        rollouts.obs[0].copy_(obs)
        rollouts.to(device)
        test_rollouts = pickle.load(
            open(os.path.join(save_path, args.env_name + "-test-rollout.pkl"), 'rb'))
        test_rollouts.to(device)
        test_obs = test_envs.reset()
        test_rollouts.obs[0].copy_(test_obs)
        test_rollouts.to(device)
        optimizer_state_dict = pickle.load(
            open(os.path.join(save_path, args.env_name + "-optim-state-dict.pkl"), 'rb'))
        episode_rewards = pickle.load(
            open(os.path.join(save_path, args.env_name + "-episode_rewards.pkl"), 'rb'))
        episode_length = pickle.load(
            open(os.path.join(save_path, args.env_name + "-episode_length.pkl"), 'rb'))
        episode_success_rate = pickle.load(
            open(os.path.join(save_path, args.env_name + "-episode_success_rate.pkl"), 'rb'))
        test_episode_rewards = pickle.load(
            open(os.path.join(save_path, args.env_name + "-test_episode_rewards.pkl"), 'rb'))
        test_episode_length = pickle.load(
            open(os.path.join(save_path, args.env_name + "-test_episode_length.pkl"), 'rb'))
        test_episode_success_rate = pickle.load(
            open(os.path.join(save_path, args.env_name + "-test_episode_success_rate.pkl"), 'rb'))

        if comet_loaded and len(args.comet) > 0:
            comet_credentials = args.comet.split("/")
            experiment = ExistingExperiment(api_key=comet_credentials[2],
                                            previous_experiment=j['comet_id'])
            for key, value in vars(args).items():
                experiment.log_parameter(key, value)
        else:
            experiment = None
            with open(results_filename, "a") as f:
                for key, value in vars(args).items():
                    f.write(f"{key}, {value}\n")
    except Exception:
        # Create a new model
        actor_critic = Policy(
            obs_shape,
            envs.action_space,
            base_kwargs={'recurrent': args.recurrent_policy},
            base=base,
        )
        rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                  envs.observation_space.shape,
                                  envs.action_space,
                                  actor_critic.recurrent_hidden_state_size)
        obs = envs.reset()
        rollouts.obs[0].copy_(obs)
        rollouts.to(device)
        test_rollouts = RolloutStorage(args.num_steps, 1,
                                       envs.observation_space.shape,
                                       envs.action_space,
                                       actor_critic.recurrent_hidden_state_size)
        if "Train" in args.env_name:
            test_obs = test_envs.reset()
            test_rollouts.obs[0].copy_(test_obs)
            test_rollouts.to(device)

        episode_rewards = deque(maxlen=10)
        episode_length = deque(maxlen=10)
        episode_success_rate = deque(maxlen=100)
        episode_total = 0
        test_episode_rewards = deque(maxlen=10)
        test_episode_length = deque(maxlen=10)
        test_episode_success_rate = deque(maxlen=100)
        test_episode_total = 0

        if comet_loaded and len(args.comet) > 0:
            comet_credentials = args.comet.split("/")
            experiment = Experiment(api_key=comet_credentials[2],
                                    project_name=comet_credentials[1],
                                    workspace=comet_credentials[0])
            for key, value in vars(args).items():
                experiment.log_parameter(key, value)
        else:
            experiment = None
            with open(results_filename, "w+") as f:
                for key, value in vars(args).items():
                    f.write(f"{key}, {value}\n")

    actor_critic.to(device)

    if args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'random':
        agent = algo.RANDOM_AGENT(actor_critic, args.value_loss_coef,
                                  args.entropy_coef)
        actor_critic = RandomPolicy(
            obs_shape,
            envs.action_space,
            base_kwargs={'recurrent': args.recurrent_policy},
            base=base,
        )

    try:
        agent.optimizer.load_state_dict(optimizer_state_dict)
    except Exception:
        pass

    start = time.time()
    num_updates = int(args.num_env_steps) // args.num_steps // args.num_processes
    for j in range(num_updates - save_j):
        j = j + save_j
        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(agent.optimizer, j, num_updates, args.lr)
        print("args.num_steps: " + str(args.num_steps))
        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = \
                    actor_critic.act(rollouts.obs[step],
                                     rollouts.recurrent_hidden_states[step],
                                     rollouts.masks[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for idx, info in enumerate(infos):
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    episode_length.append(info['episode']['l'])
                    if "Explorer" not in args.env_name:
                        episode_success_rate.append(
                            info['was_successful_trajectory'])
                    episode_total += 1

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        # Run on test
        if "Train" in args.env_name:
            for step in range(args.num_steps):
                # Sample actions
                with torch.no_grad():
                    value, action, action_log_prob, recurrent_hidden_states = \
                        actor_critic.act(
                            test_rollouts.obs[step],
                            test_rollouts.recurrent_hidden_states[step],
                            test_rollouts.masks[step])

                # Observe reward and next obs
                obs, reward, done, infos = test_envs.step(action)
                for idx, info in enumerate(infos):
                    if 'episode' in info.keys():
                        test_episode_rewards.append(info['episode']['r'])
                        test_episode_length.append(info['episode']['l'])
                        test_episode_success_rate.append(
                            info['was_successful_trajectory'])
                        test_episode_total += 1

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0 or j == num_updates - 1) \
                and args.save_dir != "" and j > args.save_after:
            if args.save_multiple:
                torch.save([
                    actor_critic,
                    getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
                ], os.path.join(save_path, str(j) + "-" + args.env_name + ".pt"))
            else:
                torch.save([
                    actor_critic,
                    getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
                ], os.path.join(save_path, args.env_name + ".pt"))
            json.dump(
                {
                    'save_j': j,
                    'episode_total': episode_total,
                    'test_episode_total': test_episode_total,
                    'comet_id': experiment.id
                },
                open(os.path.join(save_path, args.env_name + "-state.json"), 'w+'))
            pickle.dump(
                agent.optimizer.state_dict(),
                open(os.path.join(save_path, args.env_name + "-optim-state-dict.pkl"), 'wb+'))
            pickle.dump(
                rollouts,
                open(os.path.join(save_path, args.env_name + "-rollout.pkl"), 'wb+'))
            pickle.dump(
                test_rollouts,
                open(os.path.join(save_path, args.env_name + "-test-rollout.pkl"), 'wb+'))
            pickle.dump(
                episode_rewards,
                open(os.path.join(save_path, args.env_name + "-episode_rewards.pkl"), 'wb+'))
            pickle.dump(
                episode_length,
                open(os.path.join(save_path, args.env_name + "-episode_length.pkl"), 'wb+'))
            pickle.dump(
                episode_success_rate,
                open(os.path.join(save_path, args.env_name + "-episode_success_rate.pkl"), 'wb+'))
            pickle.dump(
                test_episode_rewards,
                open(os.path.join(save_path, args.env_name + "-test_episode_rewards.pkl"), 'wb+'))
            pickle.dump(
                test_episode_length,
                open(os.path.join(save_path, args.env_name + "-test_episode_length.pkl"), 'wb+'))
            pickle.dump(
                test_episode_success_rate,
                open(os.path.join(save_path, args.env_name + "-test_episode_success_rate.pkl"), 'wb+'))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            if experiment is not None:
                experiment.log_metric("Reward Mean",
                                      np.mean(episode_rewards),
                                      step=total_num_steps)
                experiment.log_metric("Reward Min",
                                      np.min(episode_rewards),
                                      step=total_num_steps)
                experiment.log_metric("Reward Max",
                                      np.max(episode_rewards),
                                      step=total_num_steps)
                experiment.log_metric("Episode Length Mean",
                                      np.mean(episode_length),
                                      step=total_num_steps)
                experiment.log_metric("Episode Length Min",
                                      np.min(episode_length),
                                      step=total_num_steps)
                experiment.log_metric("Episode Length Max",
                                      np.max(episode_length),
                                      step=total_num_steps)
                experiment.log_metric("# Trajectories (Total)",
                                      j,
                                      step=total_num_steps)
                if "Explorer" not in args.env_name:
                    experiment.log_metric("Episodic Success Rate",
                                          np.mean(episode_success_rate),
                                          step=total_num_steps)
            else:
                with open(results_filename, "a") as f:
                    f.write(f"Reward Mean, {np.mean(episode_rewards)}, {total_num_steps}\n")
                    f.write(f"Reward Min, {np.min(episode_rewards)}, {total_num_steps}\n")
                    f.write(f"Reward Max, {np.max(episode_rewards)}, {total_num_steps}\n")
                    f.write(f"Episode Length Mean, {np.mean(episode_length)}, {total_num_steps}\n")
                    f.write(f"Episode Length Min, {np.min(episode_length)}, {total_num_steps}\n")
                    f.write(f"Episode Length Max, {np.max(episode_length)}, {total_num_steps}\n")
                    f.write(f"# Trajectories (Total), {j}, {total_num_steps}\n")
                    if "Explorer" not in args.env_name:
                        f.write(f"Episodic Success Rate, {np.mean(episode_success_rate)}, {total_num_steps}\n")

            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: "
                "mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        # Test Generalization
        if "Train" in args.env_name and j % args.log_interval == 0 \
                and len(test_episode_rewards) > 1:
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            test_rollouts.insert(obs, recurrent_hidden_states, action,
                                 action_log_prob, value, reward, masks,
                                 bad_masks)
            with torch.no_grad():
                next_value = actor_critic.get_value(
                    test_rollouts.obs[-1],
                    test_rollouts.recurrent_hidden_states[-1],
                    test_rollouts.masks[-1]).detach()
            test_rollouts.after_update()
            print(f"Test Episode Total: {test_episode_total}, "
                  f"Mean Test rewards: {np.mean(test_episode_rewards)}, "
                  f"Test Episode Length: {np.mean(test_episode_length)}, "
                  f"Test Episode Success Rate: {np.mean(test_episode_success_rate)}")
            test_total_num_steps = (j + 1) * args.num_steps
            experiment.log_metric("Test Reward Mean",
                                  np.mean(test_episode_rewards),
                                  step=test_total_num_steps)
            experiment.log_metric("Test Reward Min",
                                  np.min(test_episode_rewards),
                                  step=test_total_num_steps)
            experiment.log_metric("Test Reward Max",
                                  np.max(test_episode_rewards),
                                  step=test_total_num_steps)
            experiment.log_metric("Test Episode Length Mean",
                                  np.mean(test_episode_length),
                                  step=test_total_num_steps)
            experiment.log_metric("Test Episode Length Min",
                                  np.min(test_episode_length),
                                  step=test_total_num_steps)
            experiment.log_metric("Test Episode Length Max",
                                  np.max(test_episode_length),
                                  step=test_total_num_steps)
            experiment.log_metric("# Test Trajectories (Total)", j)
            experiment.log_metric("Test Episodic Success Rate",
                                  np.mean(test_episode_success_rate),
                                  step=test_total_num_steps)

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)
def main(args, config=None, init_distributed=False):
    utils.import_user_module(args)

    experiment = None
    if config:
        experiment = ExistingExperiment(
            api_key=config["api_key"],
            previous_experiment=config["experiment_key"],
            auto_output_logging=None,
        )

    assert (args.max_tokens is not None or args.max_sentences is not None), \
        "Must specify batch size either with --max-tokens or --max-sentences"

    # Initialize CUDA and distributed training
    if torch.cuda.is_available() and not args.cpu:
        torch.cuda.set_device(args.device_id)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if init_distributed:
        args.distributed_rank = distributed_utils.distributed_init(args)

    if distributed_utils.is_master(args):
        checkpoint_utils.verify_checkpoint_directory(args.save_dir)

    print(args)
    if experiment:
        experiment.log_parameters(vars(args),
                                  prefix="Device {} :: ".format(args.device_id))

    # Setup task, e.g., translation, language modeling, etc.
    task = tasks.setup_task(args)

    # Load valid dataset (we load training data below, based on the latest checkpoint)
    for valid_sub_split in args.valid_subset.split(","):
        task.load_dataset(valid_sub_split, combine=False, epoch=0)

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
    print(model)
    print("| model {}, criterion {}".format(args.arch,
                                            criterion.__class__.__name__))
    print("| num. model params: {} (num. trained: {})".format(
        sum(p.numel() for p in model.parameters()),
        sum(p.numel() for p in model.parameters() if p.requires_grad),
    ))
    if experiment:
        experiment.log_parameters(
            {
                "criterion": criterion.__class__.__name__,
                "num. model params": sum(p.numel() for p in model.parameters()),
                "num. trained params": sum(p.numel() for p in model.parameters()
                                           if p.requires_grad),
            },
            prefix="Device {} :: ".format(args.device_id),
        )

    # Build trainer
    trainer = Trainer(args, task, model, criterion)
    print("| training on {} GPUs".format(args.distributed_world_size))
    print("| max tokens per GPU = {} and max sentences per GPU = {}".format(
        args.max_tokens, args.max_sentences))

    # Load the latest checkpoint if one is available and restore the
    # corresponding train iterator
    extra_state, epoch_itr = checkpoint_utils.load_checkpoint(args, trainer)

    # Train until the learning rate gets too small
    max_epoch = args.max_epoch or math.inf
    max_update = args.max_update or math.inf
    lr = trainer.get_lr()
    train_meter = StopwatchMeter()
    train_meter.start()
    valid_subsets = args.valid_subset.split(",")
    while (lr > args.min_lr and epoch_itr.epoch < max_epoch
           and trainer.get_num_updates() < max_update):
        # train for one epoch
        train(args, trainer, task, epoch_itr, experiment)

        if (not args.disable_validation
                and epoch_itr.epoch % args.validate_interval == 0):
            valid_losses = validate(args, trainer, task, epoch_itr,
                                    valid_subsets, experiment)
        else:
            valid_losses = [None]

        # only use first validation loss to update the learning rate
        lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0])

        # save checkpoint
        if epoch_itr.epoch % args.save_interval == 0:
            checkpoint_utils.save_checkpoint(args, trainer, epoch_itr,
                                             valid_losses[0])

        reload_dataset = ":" in getattr(args, "data", "")
        # sharded data: get train iterator for next epoch
        epoch_itr = trainer.get_train_iterator(epoch_itr.epoch,
                                               load_dataset=reload_dataset)
    train_meter.stop()
    print("| done training in {:.1f} seconds".format(train_meter.sum))

    if experiment:
        experiment.log_metrics(
            {"valid_loss": valid_losses[0], "lr": lr},
            prefix="Device {} ".format(args.device_id),
        )
class CometLogger():
    def __init__(self, enabled, is_existing=False, prev_exp_key=None):
        """
        Handles logging of the experiment to comet and also persistence to the
        local file system. Supports resumption of stopped experiments.
        """
        disabled = not enabled
        if not is_existing:
            self.experiment = Experiment(api_key=COMET_API_KEY,
                                         workspace=COMET_WORKSPACE,
                                         project_name=PROJECT_NAME,
                                         disabled=disabled)
        else:
            if prev_exp_key is None:
                raise ValueError(
                    "Requested existing experiment, but no key provided")
            print("Continuing existing experiment with key: ", prev_exp_key)
            self.experiment = ExistingExperiment(
                api_key=COMET_API_KEY,
                workspace=COMET_WORKSPACE,
                project_name=PROJECT_NAME,
                disabled=disabled,
                previous_experiment=prev_exp_key)
        self.disabled = disabled

    def get_experiment_key(self):
        return self.experiment.get_key()[:9]

    def add_tag(self, tag):
        self.experiment.add_tag(tag)

    def log_metric(self, name, value, step=None):
        self.experiment.log_metric(name, value, step=step)

    def log_metrics(self, metrics_dict, prefix, step=None):
        self.experiment.log_metrics(metrics_dict, prefix=prefix, step=step)

    def log_params(self, params_dict):
        self.experiment.log_parameters(params_dict)

    def set_name(self, name_str):
        self.experiment.set_name(name_str)

    def log_dataset(self, dataset: SpeakerVerificationDataset):
        if self.disabled:
            return
        dataset_string = ""
        dataset_string += "<b>Speakers</b>: %s\n" % len(dataset.speakers)
        dataset_string += "\n" + dataset.get_logs()
        dataset_string = dataset_string.replace("\n", "<br>")
        self.vis.text(dataset_string, opts={"title": "Dataset"})

    def log_implementation(self, params):
        if self.disabled:
            return
        implementation_string = ""
        for param, value in params.items():
            implementation_string += "<b>%s</b>: %s\n" % (param, value)
        implementation_string = implementation_string.replace("\n", "<br>")
        self.implementation_string = implementation_string
        self.implementation_win = self.vis.text(
            implementation_string, opts={"title": "Training implementation"})

    def draw_projections(self, embeds, utterances_per_speaker, step,
                         out_fpath=None, max_speakers=16):
        if self.disabled:
            return
        max_speakers = min(max_speakers, len(colormap))
        embeds = embeds[:max_speakers * utterances_per_speaker]
        n_speakers = len(embeds) // utterances_per_speaker
        ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker)
        colors = [colormap[i] for i in ground_truth]
        reducer = umap.UMAP()
        projected = reducer.fit_transform(embeds)
        plt.scatter(projected[:, 0], projected[:, 1], c=colors)
        plt.gca().set_aspect("equal", "datalim")
        plt.title("UMAP projection (step %d)" % step)
        if out_fpath is not None:
            plt.savefig(out_fpath)
        plt.clf()
        self.experiment.log_image(out_fpath, step=step)
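# A hedged end-to-end sketch of the class above. Note that log_dataset and
# log_implementation still write to `self.vis` (a visdom-style handle this
# class never creates), so only the comet-backed methods are exercised here;
# COMET_API_KEY, COMET_WORKSPACE and PROJECT_NAME are assumed module constants.
if __name__ == "__main__":
    logger = CometLogger(enabled=True)
    logger.set_name("speaker-encoder-run")
    logger.add_tag("debug")
    logger.log_params({"lr": 1e-4, "batch_size": 64})
    logger.log_metric("loss", 0.42, step=100)
    # get_experiment_key() truncates to 9 characters, so it is a display id,
    # not a key you can pass back to ExistingExperiment.
    print("experiment key (truncated):", logger.get_experiment_key())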
def __init__(self, args=args):
    super().__init__()
    self.args = args

    # random seed setting
    random_seed = args.randomseed
    np.random.seed(random_seed)
    torch.manual_seed(random_seed)
    if torch.cuda.device_count() > 1:
        torch.cuda.manual_seed_all(random_seed)
    else:
        torch.cuda.manual_seed(random_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.slomo = model.Slomo(self.args.data_h, self.args.data_w, self.device)
    self.slomo.to(self.device)
    if self.args.init_type != "":
        init_net(self.slomo, self.args.init_type)
        print(self.args.init_type + " initializing slomo done!")

    if self.args.train_continue:
        if not self.args.nocomet and self.args.cometid != "":
            self.comet_exp = ExistingExperiment(
                previous_experiment=self.args.cometid)
        elif not self.args.nocomet and self.args.cometid == "":
            self.comet_exp = Experiment(workspace=self.args.workspace,
                                        project_name=self.args.projectname)
        else:
            self.comet_exp = None
        self.ckpt_dict = torch.load(self.args.checkpoint)
        self.slomo.load_state_dict(self.ckpt_dict["model_state_dict"])
        self.args.init_learning_rate = self.ckpt_dict["learningRate"]
        self.optimizer = optim.Adam(self.slomo.parameters(),
                                    lr=self.args.init_learning_rate)
        self.optimizer.load_state_dict(self.ckpt_dict["opt_state_dict"])
        print("Pretrained model loaded!")
    else:
        # start logging info in comet-ml
        if not self.args.nocomet:
            self.comet_exp = Experiment(workspace=self.args.workspace,
                                        project_name=self.args.projectname)
            # self.comet_exp.log_parameters(flatten_opts(self.args))
        else:
            self.comet_exp = None
        self.ckpt_dict = {
            "trainLoss": {},
            "valLoss": {},
            "valPSNR": {},
            "valSSIM": {},
            "learningRate": {},
            "epoch": -1,
            "detail": "End to end Super SloMo.",
            "trainBatchSz": self.args.train_batch_size,
            "validationBatchSz": self.args.validation_batch_size,
        }
        self.optimizer = optim.Adam(self.slomo.parameters(),
                                    lr=self.args.init_learning_rate)

    self.scheduler = optim.lr_scheduler.MultiStepLR(
        self.optimizer, milestones=self.args.milestones, gamma=0.1)

    # Channel-wise mean calculated on the adobe240-fps training dataset
    mean = [0.5, 0.5, 0.5]
    std = [1, 1, 1]
    self.normalize = transforms.Normalize(mean=mean, std=std)
    self.transform = transforms.Compose([transforms.ToTensor(), self.normalize])

    trainset = dataloader.SuperSloMo(root=self.args.dataset_root + "/train",
                                     transform=self.transform,
                                     train=True)
    self.trainloader = torch.utils.data.DataLoader(
        trainset,
        batch_size=self.args.train_batch_size,
        num_workers=self.args.num_workers,
        shuffle=True,
    )
    validationset = dataloader.SuperSloMo(
        root=self.args.dataset_root + "/validation",
        transform=self.transform,
        # randomCropSize=(128, 128),
        train=False,
    )
    self.validationloader = torch.utils.data.DataLoader(
        validationset,
        batch_size=self.args.validation_batch_size,
        num_workers=self.args.num_workers,
        shuffle=False,
    )

    ### loss
    self.supervisedloss = supervisedLoss()

    self.best = {
        "valLoss": 99999999,
        "valPSNR": -1,
        "valSSIM": -1,
    }
    self.checkpoint_counter = int(
        (self.ckpt_dict["epoch"] + 1) / self.args.checkpoint_epoch)
def main(args):
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    print('Loading data')
    data = np.load(args.boards_file, allow_pickle=True)
    idxs = data['idxs']
    labels = data['values']
    mask = labels != None
    idxs = idxs[mask]
    labels = labels[mask]
    n = len(idxs)
    if args.shuffle:
        perm = np.random.permutation(n)
        idxs = idxs[perm]
        labels = labels[perm]

    if args.experiment is None:
        experiment = Experiment(project_name="chess-axia")
        experiment.log_parameters(vars(args))
    else:
        experiment = ExistingExperiment(previous_experiment=args.experiment)
    key = experiment.get_key()

    print(f'Number of Boards: {n}')
    if torch.cuda.is_available() and args.num_gpus > 0:
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')

    if args.num_train is None:
        args.num_train = n - args.num_test
    if args.num_train + args.num_test > n:
        raise ValueError('num-train and num-test sum to more than dataset size')
    train_idxs = idxs[:args.num_train]
    test_idxs = idxs[-args.num_test:]
    train_labels = labels[:-args.num_test]
    test_labels = labels[-args.num_test:]
    # print(f'Win percentage: {sum(train_labels) / len(train_labels):.1%}')
    print('Train size: ' + str(len(train_labels)))

    train_loader = DataLoader(BoardAndPieces(train_idxs, train_labels),
                              batch_size=args.batch_size,
                              collate_fn=collate_fn,
                              shuffle=True)
    test_loader = DataLoader(BoardAndPieces(test_idxs, test_labels),
                             batch_size=args.batch_size,
                             collate_fn=collate_fn)

    ae = AutoEncoder().to(device)
    ae_file = append_to_modelname(args.ae_model, args.ae_iter)
    ae.load_state_dict(torch.load(ae_file))

    model = BoardValuator(ae).to(device)
    loss_fn = model.loss_fn
    model = DataParallel(model)
    if args.model_loadname:
        model.load_state_dict(torch.load(args.model_loadname))

    if args.ae_freeze:
        print('Freezing AE model')
        for param in ae.parameters():
            param.requires_grad = False

    if torch.cuda.device_count() > 1 and args.num_gpus > 1:
        model = torch.nn.DataParallel(model)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    # These accumulators were commented out in the original, which left
    # cum_loss/count undefined at first use below.
    cum_acc = cum_loss = count = 0
    total_iters = args.init_iter
    for epoch in range(args.init_epoch, args.epochs):
        print(f'Running epoch {epoch} / {args.epochs}\n')
        # for batch_idx, (input, mask, label) in tqdm(enumerate(train_loader),
        #                                             total=len(train_loader)):
        for batch_idx, (input, mask, label) in enumerate(train_loader):
            model.train()
            input = to(input, device)
            mask = to(mask, device)
            label = to(label, device)
            optimizer.zero_grad()
            output = model(input, mask)
            loss = loss_fn(output, label)
            loss.backward()
            optimizer.step()

            cum_loss += loss.item()
            # cum_acc += acc.item()
            count += 1
            if total_iters % args.log_interval == 0:
                tqdm.write(f'Epoch: {epoch}\t Iter: {total_iters:>6}\t '
                           f'Loss: {loss.item():.5f}')
                # experiment.log_metric('accuracy', cum_acc / count,
                #                       step=total_iters)
                experiment.log_metric('loss', cum_loss / count, step=total_iters)
                experiment.log_metric('loss_', cum_loss / count, step=total_iters)
                # cum_acc = cum_loss = count = 0
            if total_iters % args.save_interval == 0:
                path = get_modelpath(args.model_dirname, key,
                                     args.model_savename,
                                     iter=total_iters, epoch=epoch)
                dirname = os.path.dirname(path)
                if not os.path.exists(dirname):
                    os.makedirs(dirname)
                torch.save(model.state_dict(), path)
            if total_iters % args.eval_interval == 0 and total_iters != 0:
                loss = eval_loss(model, test_loader, device, loss_fn)
                tqdm.write(f'\tTEST: Loss: {loss:.5f}')
                # experiment.log_metric('test accuracy', acc, step=total_iters,
                #                       epoch=epoch)
                experiment.log_metric('test loss', loss,
                                      step=total_iters, epoch=epoch)
            total_iters += 1
            name=f"{str(epoch)}_#{logidx}",
        )


if __name__ == "__main__":
    input_dim = args.data_h * args.data_w
    batch_size = args.batchsize
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    encoder = Encoder(input_dim, 256, 256)
    decoder = Decoder(args.ld, 256, input_dim)
    vae = VAE(encoder, decoder)

    if args.train_continue:
        if not args.nocomet:
            comet_exp = ExistingExperiment(previous_experiment=args.cometid)
        else:
            comet_exp = None
        dict1 = torch.load(args.checkpoint)
        vae.load_state_dict(dict1["state_dict"])
        checkpoint_counter = dict1["checkpoint_counter"]
        optimizer = optim.Adam(vae.parameters(), lr=dict1["learningRate"])
    else:
        # start logging info in comet-ml
        if not args.nocomet:
            comet_exp = Experiment(workspace=args.workspace,
                                   project_name=args.projectname)
            # comet_exp.log_parameters(flatten_opts(args))
        else:
            comet_exp = None
        dict1 = {
def experiment(variant, comet_exp_key=None):
    comet_logger = None
    if comet_exp_key is not None:
        # from rllab.misc.comet_logger import CometContinuedLogger, CometLogger
        # from comet_ml import Experiment, ExistingExperiment
        # comet_log = CometContinuedLogger(api_key="KWwx7zh6I2uw6oQMkpEo3smu0",
        #                                  previous_experiment_key=variant['comet_exp_key'])
        comet_logger = ExistingExperiment(
            api_key="KWwx7zh6I2uw6oQMkpEo3smu0",
            previous_experiment=variant['comet_exp_key'])
        # comet_log = CometLogger(api_key="KWwx7zh6I2uw6oQMkpEo3smu0",
        #                         project_name="ml4l3", workspace="glenb")
        comet_logger.set_name("test seq train")
        # comet_log = comet_exp_key
        print("RL!: ", comet_logger)
        print("%%%%%%%%%%%%%%%%%", comet_logger)

    seed = variant['seed']
    log_dir = variant['log_dir']
    n_parallel = variant['n_parallel']
    setup(seed, n_parallel, log_dir)

    init_file = variant['init_file']
    taskIndex = variant['taskIndex']
    n_itr = variant['n_itr']
    default_step = variant['default_step']
    policyType = variant['policyType']
    envType = variant['envType']

    tasksFile = (path_to_multiworld + '/multiworld/envs/goals/' +
                 variant['tasksFile'] + '.pkl')
    tasks = pickle.load(open(tasksFile, 'rb'))
    max_path_length = variant['max_path_length']

    use_images = 'conv' in policyType
    print("$$$$$$$$$$$$$$$ RL-TASK: ", str(tasks[taskIndex]), " $$$$$$$$$$$$$$$")

    if 'MultiDomain' in envType:
        baseEnv = Sawyer_MultiDomainEnv(tasks=tasks, image=use_images,
                                        mpl=max_path_length)
    elif 'Push' in envType:
        baseEnv = SawyerPushEnv(tasks=tasks, image=use_images,
                                mpl=max_path_length)
    elif 'PickPlace' in envType:
        baseEnv = SawyerPickPlaceEnv(tasks=tasks, image=use_images,
                                     mpl=max_path_length)
    elif 'Door' in envType:
        baseEnv = SawyerDoorOpenEnv(tasks=tasks, image=use_images,
                                    mpl=max_path_length)
    elif 'Ant' in envType:
        env = TfEnv(normalize(AntEnvRandGoalRing()))
    elif 'Coffee' in envType:
        baseEnv = SawyerCoffeeEnv(mpl=max_path_length)
    else:
        raise AssertionError('Unknown envType: ' + envType)

    if envType in ['Push', 'PickPlace', 'Door']:
        if use_images:
            obs_keys = ['img_observation']
        else:
            obs_keys = ['state_observation']
        env = TfEnv(
            NormalizedBoxEnv(
                FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=obs_keys),
                            reset_mode='idx')))

    baseline = ZeroBaseline(env_spec=env.spec)
    # baseline = LinearFeatureBaseline(env_spec=env.spec)
    batch_size = variant['batch_size']

    if policyType == 'fullAda_Bias':
        baseline = LinearFeatureBaseline(env_spec=env.spec)
        algo = vpg_fullADA(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            # noise_opt=True,
            default_step=default_step,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir,
            comet_logger=comet_logger,
            outer_iteration=variant['outer_iteration'])
    elif policyType == 'biasAda_Bias':
        algo = vpg_biasADA(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            # noise_opt=True,
            default_step=default_step,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir)
    elif policyType == 'basic':
        algo = vpg_basic(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,
            max_path_length=max_path_length,
            n_itr=n_itr,
            # step_size=10.0,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            reset_arg=taskIndex,
            optimizer=None,
            optimizer_args={
                'init_learning_rate': default_step,
                'tf_optimizer_args': {
                    'learning_rate': 0.5 * default_step
                },
                'tf_optimizer_cls': tf.train.GradientDescentOptimizer
            },
            log_dir=log_dir
            # extra_input="onehot_exploration",  # added by RK 6/19
            # extra_input_dim=5,  # added by RK 6/19
        )
    elif 'conv' in policyType:
        algo = vpg_conv(
            env=env,
            policy=None,
            load_policy=init_file,
            baseline=baseline,
            batch_size=batch_size,  # 2x
            max_path_length=max_path_length,
            n_itr=n_itr,
            sampler_cls=VectorizedSampler,  # added by RK 6/19
            sampler_args=dict(n_envs=1),
            # noise_opt=True,
            default_step=default_step,
            # reset_arg=np.asscalar(taskIndex),
            reset_arg=taskIndex,
            log_dir=log_dir)
    else:
        raise AssertionError('Policy Type must be fullAda_Bias or biasAda_Bias')

    algo.train()
def __init__(self, args=args):
    super().__init__()
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    dtype = torch.float
    self.args = args

    # TODO make all configurable
    self.num_epoch = args.epochs
    self.batch_size = args.train_batch_size
    self.input_time_window = 4
    self.output_time_horizon = 1
    self.temporal_stride = 1
    self.temporal_frames = 1
    self.time_steps = (self.input_time_window - self.temporal_frames +
                       1) // self.temporal_stride

    # Initiate the network: C x T x H x W input
    input_shape = (1, self.temporal_frames, 128, 128)
    output_shape = (1, self.output_time_horizon, 128, 128)
    self.tau = 1
    hidden_size = 64
    kernel = (1, 5, 5)
    lstm_layers = 4
    self.encoder = E3DLSTM(input_shape, hidden_size, lstm_layers, kernel,
                           self.tau).type(dtype)
    self.decoder = nn.Conv3d(hidden_size * self.time_steps,
                             output_shape[0],
                             kernel,
                             padding=(0, 2, 2)).type(dtype)

    if self.args.train_continue:
        if not self.args.nocomet and self.args.cometid != "":
            self.comet_exp = ExistingExperiment(
                previous_experiment=self.args.cometid)
        elif not self.args.nocomet and self.args.cometid == "":
            self.comet_exp = Experiment(workspace=self.args.workspace,
                                        project_name=self.args.projectname)
        else:
            self.comet_exp = None
        self.ckpt_dict = torch.load(self.args.checkpoint)
        self.load_state_dict(self.ckpt_dict["state_dict"])
        self.to(self.device)
        params = self.parameters(recurse=True)
        self.optimizer = torch.optim.Adam(params,
                                          lr=self.args.init_learning_rate,
                                          weight_decay=0)
        self.optimizer.load_state_dict(self.ckpt_dict["opt_state_dict"])
        self.scheduler = torch.optim.lr_scheduler.MultiStepLR(
            self.optimizer, milestones=self.args.milestones, gamma=0.1)
    else:
        # start logging info in comet-ml
        if not self.args.nocomet:
            self.comet_exp = Experiment(workspace=self.args.workspace,
                                        project_name=self.args.projectname)
            # self.comet_exp.log_parameters(flatten_opts(self.args))
        else:
            self.comet_exp = None
        self.ckpt_dict = {
            "trainLoss": {},
            "valLoss": {},
            "valPSNR": {},
            "valSSIM": {},
            "epoch": -1,
            "detail": "End to end E3D",
            "trainBatchSz": self.args.train_batch_size,
        }
        self.to(self.device)
        params = self.parameters(recurse=True)
        # Setup optimizer; weight decay stands for L2 regularization.
        # TODO learning rate scheduler
        self.optimizer = torch.optim.Adam(params,
                                          lr=self.args.init_learning_rate,
                                          weight_decay=0)
        self.scheduler = torch.optim.lr_scheduler.MultiStepLR(
            self.optimizer, milestones=self.args.milestones, gamma=0.1)

    self.apply(weights_init())
def main(args):
    print("Loading config file: ", args.config)
    params = utils.load_config_file(args.config)
    params["test_dataset_paths"] = utils.format_dataset_path(
        params["test_dataset_paths"])

    if args.existing_experiment:
        experiment = ExistingExperiment(
            api_key="jBFVYFo9VUsy0kb0lioKXfTmM",
            previous_experiment=args.existing_experiment)
    else:
        experiment = Experiment(api_key="jBFVYFo9VUsy0kb0lioKXfTmM",
                                project_name="fastdepth")

    # Data loading code
    print("Creating data loaders...")
    if args.nyu:
        from dataloaders.nyu import NYUDataset
        val_dataset = NYUDataset(params["test_dataset_paths"], split='val')
    else:
        val_dataset = Datasets.FastDepthDataset(
            params["test_dataset_paths"],
            split='val',
            depth_min=params["depth_min"],
            depth_max=params["depth_max"],
            input_shape_model=(224, 224))

    # Set batch size to 1 for validation
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=1,
                                             shuffle=True,
                                             num_workers=params["num_workers"],
                                             pin_memory=True)

    # Set GPU
    params["device"] = torch.device(
        "cuda:{}".format(params["device"])
        if params["device"] >= 0 and torch.cuda.is_available() else "cpu")
    print("Using device", params["device"])

    print("Loading model '{}'".format(args.model))
    if not args.nyu:
        model, _ = utils.load_model(params, args.model, params["device"])
    else:
        # Maintain compatibility with the fastdepth NYU model format
        state_dict = torch.load(args.model, map_location=params["device"])
        model = models.MobileNetSkipAdd(output_size=(224, 224), pretrained=True)
        model.load_state_dict(state_dict)
        params["start_epoch"] = 0
    model.to(params["device"])

    # Create output directory
    output_directory = os.path.join(os.path.dirname(args.model), "images")
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    params["experiment_dir"] = output_directory
    print("Saving results to " + output_directory)

    evaluate(params, val_loader, model, experiment)
from comet_ml import ExistingExperiment
import matplotlib.pyplot as plt
import torch

from data import create_dataloader
from model import TransformerClassification

experiment = ExistingExperiment(
    previous_experiment='b8d5b06e99484f8a93dd0d84f8a36f3e')


def main():
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # load data
    _, _, test_dl, TEXT = create_dataloader()

    # load model
    net = TransformerClassification(TEXT.vocab.vectors,
                                    d_model=300,
                                    max_seq_len=256,
                                    output_dim=2)
    net.load_state_dict(torch.load('checkpoints/model.pt'))
    net.to(device)

    epoch_corrects = 0
    for batch in test_dl:
        inputs = batch.Text[0].to(device)
        labels = batch.Label.to(device)
        with torch.set_grad_enabled(False):
            input_pad = 1
if args.init_type != "":
    init_net(flowComp, args.init_type)
    print(args.init_type + " initializing flowComp done")

ArbTimeFlowIntrp = model.UNet(20, 5)
ArbTimeFlowIntrp.to(device)
if args.init_type != "":
    init_net(ArbTimeFlowIntrp, args.init_type)
    print(args.init_type + " initializing ArbTimeFlowIntrp done")

### Initialization
if args.train_continue:
    if not args.nocomet and args.cometid != "":
        comet_exp = ExistingExperiment(previous_experiment=args.cometid)
    elif not args.nocomet and args.cometid == "":
        comet_exp = Experiment(workspace=args.workspace,
                               project_name=args.projectname)
    else:
        comet_exp = None
    dict1 = torch.load(args.checkpoint)
    ArbTimeFlowIntrp.load_state_dict(dict1["state_dictAT"])
    flowComp.load_state_dict(dict1["state_dictFC"])
    print("Pretrained model loaded!")
else:
    # start logging info in comet-ml
    if not args.nocomet:
        comet_exp = Experiment(workspace=args.workspace,
                               project_name=args.projectname)
        # comet_exp.log_parameters(flatten_opts(args))
    else:
        comet_exp = None
def main(opts):
    """
    Opts prevalence:
    1. Load file specified in args.default (or shared/trainer/defaults.yaml
       if none is provided)
    2. Update with file specified in args.config (or no update if none is
       provided)
    3. Update with parsed command-line arguments

    e.g.
    `python train.py args.config=config/large-lr.yaml data.loaders.batch_size=10`
    loads defaults, overrides with values in large-lr.yaml and sets
    batch_size to 10
    """
    # -----------------------------
    # -----  Parse arguments  -----
    # -----------------------------
    hydra_opts = Dict(OmegaConf.to_container(opts))
    args = hydra_opts.pop("args", None)
    auto_resumed = {}

    config_path = args.config
    if hydra_opts.train.resume:
        out_ = str(env_to_path(hydra_opts.output_path))
        config_path = Path(out_) / "opts.yaml"
        if not config_path.exists():
            config_path = None
            print("WARNING: could not reuse the opts in {}".format(out_))

    default = args.default or Path(__file__).parent / "shared/trainer/defaults.yaml"

    # -----------------------
    # -----  Load opts  -----
    # -----------------------
    opts = load_opts(config_path, default=default, commandline_opts=hydra_opts)
    if args.resume:
        opts.train.resume = True

    opts.jobID = os.environ.get("SLURM_JOBID")
    opts.slurm_partition = os.environ.get("SLURM_JOB_PARTITION")
    opts.output_path = str(env_to_path(opts.output_path))
    print("Config output_path:", opts.output_path)

    exp = comet_previous_id = None

    # -------------------------------
    # -----  Check output_path  -----
    # -------------------------------

    # Auto-continue if same slurm job ID (=job was requeued)
    if not opts.train.resume and opts.train.auto_resume:
        print("\n\nTrying to auto-resume...")
        existing_path = find_existing_training(opts)
        if existing_path is not None and existing_path.exists():
            auto_resumed["original output_path"] = str(opts.output_path)
            auto_resumed["existing_path"] = str(existing_path)
            opts.train.resume = True
            opts.output_path = str(existing_path)

    # Still not resuming: creating new output path
    if not opts.train.resume:
        opts.output_path = str(get_increased_path(opts.output_path))
        Path(opts.output_path).mkdir(parents=True, exist_ok=True)

    # Copy the opts's sbatch_file to output_path
    copy_run_files(opts)
    # store git hash
    opts.git_hash = get_git_revision_hash()
    opts.git_branch = get_git_branch()

    if not args.no_comet:
        # ----------------------------------
        # -----  Set Comet Experiment  -----
        # ----------------------------------
        if opts.train.resume:
            # Is resuming: get existing comet exp id
            assert Path(opts.output_path).exists(), "output_path does not exist"

            comet_previous_id = get_existing_comet_id(opts.output_path)
            # Continue existing experiment
            if comet_previous_id is None:
                print("WARNING could not retrieve previous comet id")
                print(f"from {opts.output_path}")
            else:
                print("Continuing previous experiment", comet_previous_id)
                auto_resumed["continuing exp id"] = comet_previous_id
                exp = ExistingExperiment(previous_experiment=comet_previous_id,
                                         **comet_kwargs)
                print("Comet Experiment resumed")

        if exp is None:
            # Create new experiment
            print("Starting new experiment")
            exp = Experiment(project_name="climategan", **comet_kwargs)
            exp.log_asset_folder(
                str(Path(__file__).parent / "climategan"),
                recursive=True,
                log_file_name=True,
            )
            exp.log_asset(str(Path(__file__)))

        # Log note
        if args.note:
            exp.log_parameter("note", args.note)

        # Merge and log tags
        if args.comet_tags or opts.comet.tags:
            tags = set([f"branch:{opts.git_branch}"])
            if args.comet_tags:
                tags.update(args.comet_tags)
            if opts.comet.tags:
                tags.update(opts.comet.tags)
            opts.comet.tags = list(tags)
            print("Logging to comet.ml with tags", opts.comet.tags)
            exp.add_tags(opts.comet.tags)

        # Log all opts
        exp.log_parameters(flatten_opts(opts))
        if auto_resumed:
            exp.log_text("\n".join(f"{k:20}: {v}"
                                   for k, v in auto_resumed.items()))

        # allow some time for comet to get its url
        sleep(1)

        # Save comet exp url
        url_path = get_increased_path(Path(opts.output_path) / "comet_url.txt")
        with open(url_path, "w") as f:
            f.write(exp.url)

    # Save config file
    opts_path = get_increased_path(Path(opts.output_path) / "opts.yaml")
    with (opts_path).open("w") as f:
        yaml.safe_dump(opts.to_dict(), f)

    pprint("Running model in", opts.output_path)

    # -------------------
    # -----  Train  -----
    # -------------------
    trainer = Trainer(opts, comet_exp=exp, verbose=1)
    trainer.logger.time.start_time = time()
    trainer.setup()
    trainer.train()

    # -----------------------------
    # -----  End of training  -----
    # -----------------------------
    pprint("Done training")
    kill_job(opts.jobID)
def experiment(variant, comet_exp_key=None):
    if comet_exp_key is not None:
        from rllab.misc.comet_logger import CometContinuedLogger, CometLogger
        from comet_ml import Experiment, ExistingExperiment

        # comet_log = CometContinuedLogger(api_key="KWwx7zh6I2uw6oQMkpEo3smu0",
        #                                  previous_experiment_key=variant['comet_exp_key'])
        comet_log = ExistingExperiment(
            api_key="KWwx7zh6I2uw6oQMkpEo3smu0",
            previous_experiment=variant['comet_exp_key'])
        # comet_log = CometLogger(api_key="KWwx7zh6I2uw6oQMkpEo3smu0",
        #                         project_name="ml4l3", workspace="glenb")
        comet_log.set_name("test seq train")
        # comet_log = comet_exp_key
        print(comet_log)
    else:
        comet_log = None

    print("loading libraries")
    from sandbox.rocky.tf.algos.maml_il import MAMLIL
    from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
    from rllab.baselines.gaussian_mlp_baseline import GaussianMLPBaseline
    from rllab.baselines.maml_gaussian_mlp_baseline import MAMLGaussianMLPBaseline
    from rllab.baselines.zero_baseline import ZeroBaseline
    from rllab.envs.normalized_env import normalize
    from rllab.misc.instrument import stub, run_experiment_lite
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy import \
        MAMLGaussianMLPPolicy as basic_policy
    # from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep import \
    #     MAMLGaussianMLPPolicy as fullAda_basic_policy
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep_ppo import \
        MAMLGaussianMLPPolicy as PPO_policy
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_adaptivestep_biastransform import \
        MAMLGaussianMLPPolicy as fullAda_Bias_policy
    from sandbox.rocky.tf.policies.maml_minimal_gauss_mlp_policy_biasonlyadaptivestep_biastransform import \
        MAMLGaussianMLPPolicy as biasAda_Bias_policy
    from sandbox.rocky.tf.policies.maml_minimal_conv_gauss_mlp_policy import \
        MAMLGaussianMLPPolicy as conv_policy
    from sandbox.rocky.tf.optimizers.quad_dist_expert_optimizer import QuadDistExpertOptimizer
    from sandbox.rocky.tf.optimizers.first_order_optimizer import FirstOrderOptimizer
    from sandbox.rocky.tf.envs.base import TfEnv
    import sandbox.rocky.tf.core.layers as L
    from rllab.envs.mujoco.ant_env_rand_goal_ring import AntEnvRandGoalRing
    from multiworld.envs.mujoco.sawyer_xyz.push.sawyer_push import SawyerPushEnv
    from multiworld.envs.mujoco.sawyer_xyz.pickPlace.sawyer_pick_and_place import SawyerPickPlaceEnv
    from multiworld.envs.mujoco.sawyer_xyz.door.sawyer_door_open import SawyerDoorOpenEnv
    from multiworld.core.flat_goal_env import FlatGoalEnv
    from multiworld.core.finn_maml_env import FinnMamlEnv
    from multiworld.core.wrapper_env import NormalizedBoxEnv
    import tensorflow as tf
    import time
    from rllab.envs.gym_env import GymEnv
    from maml_examples.maml_experiment_vars import MOD_FUNC
    import numpy as np
    import random as rd
    import pickle
    print("Done loading libraries")

    seed = variant['seed']
    n_parallel = 1
    log_dir = variant['log_dir']
    x = 0
    setup(seed, n_parallel, log_dir)

    fast_batch_size = variant['fbs']
    meta_batch_size = variant['mbs']
    adam_steps = variant['adam_steps']
    max_path_length = variant['max_path_length']
    dagger = variant['dagger']
    expert_policy_loc = variant['expert_policy_loc']
    ldim = variant['ldim']
    init_flr = variant['init_flr']
    policyType = variant['policyType']
    use_maesn = variant['use_maesn']
    EXPERT_TRAJ_LOCATION = variant['expertDataLoc']
    envType = variant['envType']

    tasksFile = (path_to_multiworld + 'multiworld/envs/goals/' +
                 variant['tasksFile'] + '.pkl')
    all_tasks = pickle.load(open(tasksFile, 'rb'))
    assert meta_batch_size <= len(all_tasks), \
        "meta batch size wrong: " + str(meta_batch_size) + " <= " + str(len(all_tasks))
    tasks = all_tasks[:meta_batch_size]
    print("^^^^^^^^^^^^^^^^ meta_tasks: ", tasks, " ^^^^^^^^^^^^^^^^ ")

    use_images = 'conv' in policyType

    if 'Push' == envType:
        baseEnv = SawyerPushEnv(tasks=tasks, image=use_images,
                                mpl=max_path_length)
    elif envType == 'sparsePush':
        baseEnv = SawyerPushEnv(tasks=tasks, image=use_images,
                                mpl=max_path_length, rewMode='l2Sparse')
    elif 'PickPlace' in envType:
        baseEnv = SawyerPickPlaceEnv(tasks=tasks, image=use_images,
                                     mpl=max_path_length)
    elif 'Door' in envType:
        baseEnv = SawyerDoorOpenEnv(tasks=tasks, image=use_images,
                                    mpl=max_path_length)
    elif 'Ant' in envType:
        env = TfEnv(normalize(AntEnvRandGoalRing()))
    elif 'claw' in envType:
        env = TfEnv(DClawScrewRandGoal())
    else:
        raise AssertionError('Unknown envType: ' + envType)

    if envType in ['Push', 'PickPlace', 'Door']:
        if use_images:
            obs_keys = ['img_observation']
        else:
            obs_keys = ['state_observation']
        env = TfEnv(
            NormalizedBoxEnv(
                FinnMamlEnv(FlatGoalEnv(baseEnv, obs_keys=obs_keys),
                            reset_mode='idx')))

    algoClass = MAMLIL
    baseline = LinearFeatureBaseline(env_spec=env.spec)

    load_policy = variant['load_policy']
    if load_policy is not None:
        policy = None
        load_policy = variant['load_policy']
        # if 'conv' in load_policy:
        #     baseline = ZeroBaseline(env_spec=env.spec)
    elif 'fullAda_PPO' in policyType:
        policy = PPO_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            init_flr_full=init_flr,
            latent_dim=ldim)
    elif 'fullAda_Bias' in policyType:
        policy = fullAda_Bias_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            init_flr_full=init_flr,
            latent_dim=ldim)
    elif 'biasAda_Bias' in policyType:
        policy = biasAda_Bias_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            init_flr_full=init_flr,
            latent_dim=ldim)
    elif 'basic' in policyType:
        policy = basic_policy(
            name="policy",
            env_spec=env.spec,
            grad_step_size=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            extra_input_dim=(0 if extra_input == "" else extra_input_dim))
    elif 'conv' in policyType:
        baseline = ZeroBaseline(env_spec=env.spec)
        policy = conv_policy(
            name="policy",
            latent_dim=ldim,
            policyType=policyType,
            env_spec=env.spec,
            init_flr=init_flr,
            hidden_nonlinearity=tf.nn.relu,
            hidden_sizes=(100, 100),
            extra_input_dim=(0 if extra_input == "" else extra_input_dim))

    print("|||||||||||||||||||||||||||||||||||||||||||||||", variant['n_itr'])

    beta_steps = 1
    meta_step_size = 0.01
    num_grad_updates = 1
    pre_std_modifier = 1.0
    post_std_modifier = 0.00001
    limit_demos_num = None

    algo = algoClass(
        env=env,
        policy=policy,
        load_policy=load_policy,
        baseline=baseline,
        batch_size=fast_batch_size,  # number of trajs for alpha grad update
        max_path_length=max_path_length,
        meta_batch_size=meta_batch_size,  # number of tasks sampled for beta grad update
        num_grad_updates=num_grad_updates,  # number of alpha grad updates
        n_itr=variant['n_itr'],
        make_video=False,
        use_maml=True,
        use_pooled_goals=True,
        use_corr_term=use_corr_term,
        test_on_training_goals=test_on_training_goals,
        metalearn_baseline=False,
        limit_demos_num=limit_demos_num,
        test_goals_mult=1,
        step_size=meta_step_size,
        plot=False,
        beta_steps=beta_steps,
        adam_curve=None,
        adam_steps=adam_steps,
        pre_std_modifier=pre_std_modifier,
        l2loss_std_mult=l2loss_std_mult,
        importance_sampling_modifier=MOD_FUNC[''],
        post_std_modifier=post_std_modifier,
        expert_trajs_dir=EXPERT_TRAJ_LOCATION,
        expert_trajs_suffix='',
        seed=seed,
        extra_input=extra_input,
        extra_input_dim=(0 if extra_input == "" else extra_input_dim),
        plotDirPrefix=None,
        latent_dim=ldim,
        dagger=dagger,
        expert_policy_loc=expert_policy_loc,
        comet_logger=comet_log,
        outerIteration=variant['outer_Iteration'],
        use_ppo=True)
    algo.train()
parser.add_argument('-span', default=.5, type=float)
parser.add_argument('-seed', default=1234, type=int)
parser.add_argument('-eig', action='store_true')
parser.add_argument('-ckpt', default='poison-filtnorm-weaker', type=str)
parser.add_argument('-gpu', default='0', type=str)
parser.add_argument('-svhn', action='store_true')
args = parser.parse_args()

# comet stuff
if not os.path.exists('comet_expt_key_surface.txt'):
    experiment = Experiment(api_key="vPCPPZrcrUBitgoQkvzxdsh9k",
                            parse_args=False,
                            project_name='landscape',
                            workspace="wronnyhuang")
    open('comet_expt_key_surface.txt', 'w+').write(experiment.get_key())
else:
    comet_key = open('comet_expt_key_surface.txt', 'r').read()
    experiment = ExistingExperiment(api_key="vPCPPZrcrUBitgoQkvzxdsh9k",
                                    previous_experiment=comet_key,
                                    parse_args=False)

# apply settings
np.random.seed(args.seed)
os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

# load data and model
cleanloader, _, _ = get_loader(join(home, 'datasets'), batchsize=2 * 64,
                               fracdirty=.5, nogan=True, svhn=args.svhn)
evaluator = Evaluator(cleanloader)
evaluator.restore_weights_dropbox('ckpt/' + args.ckpt)

# plot along which direction
if args.eig:
    eigfile = join('pickle', args.ckpt)
    if exists(eigfile):
        # load from file if hessian eigvec already computed
        # (pickle.load needs a file object, not a path)
        dw1 = pickle.load(open(eigfile, 'rb'))
    else:
        # compute otherwise
        pred_2d,
        pred_3d,
        keep_matching=True,
    )
    pck = np.mean(list(pcks.values()))
    auc = np.mean(list(aucs.values()))
    values.append(pck)
    values.append(auc)
    print(" %4.1f %4.1f " % (pck, auc), end="")
    print()

    exp.log_metrics({f"{prefix}-{k}": v for k, v in zip(keys, values)})


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "model_name",
        help="Name of the model (either 'normal' or 'universal')")
    parser.add_argument(
        "-r",
        "--pose-refine",
        action="store_true",
        help="Apply pose-refinement after TPN",
    )
    args = parser.parse_args()

    exp = ExistingExperiment(previous_experiment=args.model_name)
    main(args.model_name, args.pose_refine, exp)
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
    else:
        raise Exception
else:
    start_epoch = 0
    best_acc = 0

experiment = None
if args.api_key:
    project_dir, experiment_name = split(dirname(realpath(__file__)))
    project_name = basename(project_dir)
    if args.resume:
        experiment = ExistingExperiment(
            api_key=args.api_key,
            previous_experiment=checkpoint['experiment_key'],
            auto_param_logging=False,
            auto_metric_logging=False,
            parse_args=False)
    else:
        experiment = Experiment(
            api_key=args.api_key,
            project_name=project_name,
            auto_param_logging=False,
            auto_metric_logging=False,
            parse_args=False)
    experiment.log_other('experiment_name', experiment_name)
    experiment.log_parameters(vars(args))
    for k in hyperparameters:
        if type(hyperparameters[k]) == dict:
            experiment.log_parameters(hyperparameters[k], prefix=k)
        else:
DO_INTENSITY_SHIFT = True
RANDOM_CROP = [128, 128, 128]
DO_MIXUP = False
ROT_DEGREES = 20
SCALE_FACTOR = 1.1
SIGMA = 10
MAX_INTENSITY_SHIFT = 0.1

if LOG_COMETML:
    if "LOG_COMETML_EXISTING_EXPERIMENT" not in locals():
        experiment = Experiment(api_key="", project_name="", workspace="")
    else:
        experiment = ExistingExperiment(
            api_key="",
            previous_experiment=LOG_COMETML_EXISTING_EXPERIMENT,
            project_name="",
            workspace="")
else:
    experiment = None

# network functions
if TRAIN_ORIGINAL_CLASSES:
    loss = bratsUtils.bratsDiceLossOriginal5
else:
    # loss = bratsUtils.bratsDiceLoss
    def loss(outputs, labels):
        return bratsUtils.bratsDiceLoss(outputs, labels, nonSquared=True)


class ResidualInner(nn.Module):
from comet_ml import ExistingExperiment
import matplotlib.pyplot as plt
import torch

from model import Generator, Discriminator
from data import make_datapath_list, GAN_Img_Dataset, ImageTransform

experiment = ExistingExperiment(
    previous_experiment='e746c2c19f194d588fdfdbb7dc573602')


def main():
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # load model
    G = Generator(z_dim=20, image_size=64)
    D = Discriminator(z_dim=20, image_size=64)
    G.load_state_dict(torch.load('checkpoints/G.pt'))
    D.load_state_dict(torch.load('checkpoints/D.pt'))
    G.to(device)
    D.to(device)

    batch_size = 8
    z_dim = 20
    fixed_z = torch.randn(batch_size, z_dim)
    fixed_z = fixed_z.view(fixed_z.size(0), fixed_z.size(1), 1, 1)

    # generate fake images
    fake_images, am1, am2 = G(fixed_z.to(device))

    # real images
    train_img_list = make_datapath_list()