class Experiment():
    def __init__(self, api_key=None, **kwargs):
        self._exp = None
        self._id = uuid4().hex
        if api_key:
            self._exp = CometExperiment(api_key,
                                        log_code=False,
                                        auto_param_logging=False,
                                        auto_metric_logging=False,
                                        **kwargs)
            self._id = self._exp.get_key()

    def log_metric(self, name, value, step=None, epoch=None):
        if self._exp:
            self._exp.log_metric(name, value, step, epoch)

    def log_epoch_end(self, epoch_cnt, step=None):
        if self._exp:
            self._exp.log_epoch_end(epoch_cnt, step=step)

    def log_parameters(self, hp):
        if self._exp:
            self._exp.log_parameters(flatten(hp, reducer='underscore'))

    @property
    def id(self):
        return self._id[:12]
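# Hypothetical usage sketch for the Experiment wrapper defined above (not part of the
# original source). It assumes the wrapper and its dependencies are in scope
# (uuid.uuid4, flatten_dict.flatten, and comet_ml.Experiment imported as
# CometExperiment). Without an API key the wrapper degrades to a no-op logger that
# still provides a stable short run id.
exp = Experiment(api_key=None)                # offline: no Comet experiment created
exp.log_parameters({'optim': {'lr': 1e-3}})   # skipped, _exp is None
exp.log_metric('train_loss', 0.42, step=1)    # skipped as well
print(exp.id)                                 # 12-character run identifier
# With a real key (placeholder), the same calls are forwarded to Comet:
# exp = Experiment(api_key='YOUR_COMET_API_KEY', project_name='my-project')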
def load_experiment(path_to_yml_file):
    config = load_yaml(path_to_yml_file)
    api_key = os.getenv('COMET_API_KEY', None)
    exp = None

    if not config['info']['experiment_key']:
        if api_key:
            exp = Experiment(api_key=api_key,
                             project_name=config['info']['project_name'])
            exp_key = exp.get_key()
        else:
            exp_key = make_random_string(20)
        os.environ['EXPERIMENT_KEY'] = exp_key
        _env_variables = env_variables + ['EXPERIMENT_KEY']
        config = load_yaml(path_to_yml_file, _env_variables)
        config['info']['experiment_key'] = exp_key
        path_to_yml_file = save_experiment(config, exp)
    else:
        logging.info(
            f"Experiment is already set up @ {config['info']['output_folder']}!"
        )
        try:
            exp = ExistingExperiment(
                api_key=api_key,
                previous_experiment=config['info']['experiment_key'])
        except Exception:
            pass
    return config, exp, path_to_yml_file
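# Hypothetical call sketch for load_experiment (not part of the original source). It
# assumes the surrounding module provides load_yaml, save_experiment,
# make_random_string and env_variables, and that 'experiment.yml' follows the expected
# config layout with an 'info' section. The Comet API key is read from the
# environment, so the function also works offline without one.
# os.environ['COMET_API_KEY'] = 'YOUR_COMET_API_KEY'   # optional: enables online logging
# config, exp, yml_path = load_experiment('experiment.yml')
# print(config['info']['experiment_key'], yml_path)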
def __init__(self, experiment: Experiment, gpu_id=None,
             print_to_comet_only=False):
    if CometLogger.__experiment is not None:
        raise Exception(
            "Cannot re-instantiate since this class is a singleton.")
    else:
        CometLogger.__experiment = experiment
        CometLogger.__APIExperiment = APIExperiment(
            previous_experiment=experiment.get_key())
        CometLogger.gpu_id = gpu_id
        CometLogger.print_to_comet_only = print_to_comet_only
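# Hypothetical sketch of the singleton guard above (not part of the original source).
# It assumes this __init__ belongs to a CometLogger class whose __experiment class
# attribute starts out as None, and that a comet_ml Experiment instance is available.
# A second instantiation is expected to raise.
# logger = CometLogger(experiment, gpu_id=0)
# try:
#     CometLogger(experiment)   # second call
# except Exception as err:
#     print(err)                # "Cannot re-instantiate since this class is a singleton."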
def get_comet_logger(self):
    if not self.paras.load:
        comet_exp = Experiment(project_name=COMET_PROJECT_NAME,
                               workspace=COMET_WORKSPACE,
                               auto_output_logging=None,
                               auto_metric_logging=None,
                               display_summary=False,
                               )
        if self.paras.transfer:
            comet_exp.set_name(self.exp_name)
            comet_exp.add_tag(Path(self.ckpdir).parent.name)
            comet_exp.add_tag('transfer')
            comet_exp.add_tag(self.config['data']['corpus']['metas'][0])
        if self.paras.test:
            comet_exp.set_name(Path(self.paras.outdir).name)
            comet_exp.add_tag(Path(self.paras.config).parents[2].name)
            comet_exp.add_tag('test')
            comet_exp.add_tag(Path(self.paras.config).parent.stem)
            #comet_exp.add_tag(Path(self.paras.outdir).name)
        else:
            comet_exp.add_tag('train')
        for name, param in self.config.items():
            if isinstance(param, dict):
                comet_exp.log_parameters(param, prefix=name)
            else:
                comet_exp.log_parameter(name, param)
        comet_exp.log_other('seed', self.paras.seed)
        with open(Path(self.logdir, 'exp_key'), 'w') as f:
            print(comet_exp.get_key(), file=f)
    else:
        with open(Path(self.logdir, 'exp_key'), 'r') as f:
            exp_key = f.read().strip()
        comet_exp = ExistingExperiment(previous_experiment=exp_key,
                                       project_name=COMET_PROJECT_NAME,
                                       workspace=COMET_WORKSPACE,
                                       auto_output_logging=None,
                                       auto_metric_logging=None,
                                       display_summary=False,
                                       )
    return comet_exp
def main():
    # Training settings
    args = configure_arguments()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    # sets seeds to prevent any unwanted randomness.
    torch.manual_seed(args.seed)
    if use_cuda:
        torch.cuda.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    device = torch.device("cuda:1" if use_cuda else "cpu")

    train_loader, val_loader, test_loader = create_dataloaders(args)

    # get instance of model.
    print('Loading the {0} model...'.format(args.model))
    model_class = models.find_model(args.model)
    model = model_class().to(device)
    optimizer = optim.SGD(model.parameters(), lr=args.lr)

    no_of_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('Model has {0} parameters'.format(no_of_params))

    if args.mode == 'train':
        print('Running in train mode...')
        # set up logging.
        experiment = Experiment(api_key="w7QuiECYXbNiOozveTpjc9uPg",
                                project_name="project1-ac2g",
                                workspace="ift6135")
        create_folder('output/' + args.model + '/' + experiment.get_key())
        hyper_params = vars(args)
        experiment.log_parameters(hyper_params)
        train(args, model, device, (train_loader, val_loader), optimizer,
              experiment)
    elif args.mode == 'test':
        print('Running in test mode...')
        test(args, model, device, test_loader)
class CometMLLogger(ExperimentLogger):
    def __init__(self, provider_args: EasyDict, config, **kwargs):
        self.experiment = Experiment(api_key=provider_args.api_key,
                                     project_name=provider_args.project_name,
                                     workspace=provider_args.workspace,
                                     auto_param_logging=False,
                                     auto_metric_logging=False)
        super().__init__(config)
        self.run_key = self.experiment.get_key()
        self.log_url = self.experiment.url

    def log_on_hyperparameters(self, config: EasyDict):
        hyper_params = {}
        if config is not None:
            hyper_params['model'] = config.model
            hyper_params['trainer'] = config.trainer
            if 'train' in config.dataset and 'augmentations' in config.dataset.train:
                hyper_params['augmentations'] = config.dataset.train.augmentations
        self.experiment.log_parameters(flatten(hyper_params, reducer='path'))

    def log_on_step_update(self, metrics_log: dict):
        step = metrics_log['step']
        metrics_log.pop('step')
        self.experiment.log_metrics(metrics_log, step=step)

    def log_on_epoch_update(self, metrics_log: dict):
        epoch = metrics_log['epoch']
        metrics_log.pop('epoch')
        self.experiment.log_metrics(metrics_log, epoch=epoch)

    def log_on_model_save(self, file_log: dict):
        pass

    def log_on_validation_result(self, metrics_log: dict):
        epoch = metrics_log['epoch']
        metrics_log.pop('epoch')
        self.experiment.log_metrics(metrics_log, epoch=epoch)
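# Hypothetical usage sketch for CometMLLogger (not part of the original source). It
# assumes an EasyDict with Comet credentials and a config object accepted by
# ExperimentLogger. It illustrates the convention used above: the 'step'/'epoch'
# entry is popped from the metrics dict before the remaining values are sent to Comet.
# provider_args = EasyDict({'api_key': 'YOUR_COMET_API_KEY',
#                           'project_name': 'my-project',
#                           'workspace': 'my-workspace'})
# logger = CometMLLogger(provider_args, config)
# logger.log_on_step_update({'step': 10, 'train_loss': 0.31})
# logger.log_on_epoch_update({'epoch': 1, 'val_accuracy': 0.87})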
class CometML:
    def __init__(self, api_key, project_name, workspace, debug=True,
                 tags=None):
        self._exp = Experiment(
            api_key=api_key,
            project_name=project_name,
            workspace=workspace,
            disabled=debug,
        )
        if not (self._exp.alive or debug):
            raise RuntimeError("Cannot connect to Comet ML")
        self._exp.disable_mp()

        if tags is not None:
            self._exp.add_tags(tags)

    @property
    def run_name(self):
        return self._exp.get_key()

    def args(self, arg_text):
        self._exp.log_parameter("cmd args", arg_text)

    def meta(self, params):
        self._exp.log_parameters(params)

    def log(self, name, value, step):
        self._exp.log_metric(
            name=name,
            value=value,
            step=step,
        )
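# Hypothetical usage sketch for the CometML helper above (not part of the original
# source). With debug=True the underlying comet_ml Experiment is created with
# disabled=True, so nothing is sent online and the calls below are safe without real
# credentials; the key, project and workspace names are placeholders.
# tracker = CometML(api_key='YOUR_COMET_API_KEY',
#                   project_name='my-project',
#                   workspace='my-workspace',
#                   debug=True,
#                   tags=['baseline'])
# tracker.meta({'lr': 1e-3, 'batch_size': 64})
# tracker.log('train_loss', 0.5, step=100)
# print(tracker.run_name)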
parser.add_argument('--lambda_kernel_reg', type=float, default=1.0)
parser.add_argument('--lambda_fgsd_kernel_reg', type=float, default=1.0)
parser.add_argument('--lambda_spk_kernel_reg', type=float, default=1.0)
parser.add_argument('--lambda_adj_reconst_reg', type=float, default=1.0)
parser.add_argument('--warmup_epochs', type=float, nargs='*', default=[2.0],
                    help='Number of epochs during which learning rate increases linearly from init_lr to max_lr. Afterwards, learning rate decreases exponentially from max_lr to final_lr.')
parser.add_argument('--init_lr', type=float, nargs='*', default=[1e-4],
                    help='Initial learning rate')
parser.add_argument('--max_lr', type=float, nargs='*', default=[1e-3],
                    help='Maximum learning rate')
parser.add_argument('--final_lr', type=float, nargs='*', default=[1e-4],
                    help='Final learning rate')
parser.add_argument('--lr_scaler', type=float, nargs='*', default=[1.0],
                    help='Amount by which to scale init_lr, max_lr, and final_lr (for convenience)')
parser.add_argument('--lr_decay_rate', type=float, default=0.9,
                    help='lr decay per epoch, for decay scheduler')

args, unknown = parser.parse_known_args()

experiment = Experiment(api_key=API_KEY,
                        project_name="universal-graph-embedding",
                        workspace="saurabh08",
                        disabled=not args.run_on_comet)
experiment_id = experiment.get_key()

data_path = os.path.join(args.data_dir, args.dataset_name)
log_path = os.path.join(args.log_dir, experiment_id)
if not os.path.exists(log_path):
    os.makedirs(log_path)

logging.basicConfig(format='%(message)s', level=logging.INFO,
                    handlers=[logging.StreamHandler(),
                              logging.FileHandler(os.path.join(log_path, 'console_output.txt'))])

run_filepath = os.path.abspath(__file__)
shutil.copy(run_filepath, log_path)
src_list = ['./train', './utils', './torch_dgl', './dataloader', './config']
dest_list = [os.path.join(log_path, 'train'),
             os.path.join(log_path, 'utils'),
             os.path.join(log_path, 'torch_dgl'),
             os.path.join(log_path, 'dataloader'),
             os.path.join(log_path, 'config')]
for src, dest in zip(src_list, dest_list):
    shutil.copytree(src, dest)

for arg, value in sorted(vars(args).items()):
step_time = round(time.time() - start_time, 1)
metrics = {metric_name: log(metric)
           for metric_name, metric in trainer.metrics.items()}
metrics['step_time'] = step_time

# validation plotting
progbar.add(valid_inc, [('Train Loss', metrics['train_loss']),
                        ('Validation Loss', metrics['valid_loss']),
                        ('Time (s)', step_time)])

# Plot on Comet
# experiment.log_metrics(metrics, step=t)

# Plot on WandB
wandb.log(metrics, step=t)

if (t+0) % save_inc == 0:  # zero while we test this
    trainer.save_weights(model_path, run_id=wandb.run.id,
                         experiment_key=experiment.get_key())

    # if not args.images:
    #     # How we plot the cluster figs
    #     try:
    #         if not args.discrete:
    #             batches = [trainer.make_sequences_variable_length(dataset_coordinator.plotting_background_dataset.next()) for i in range(0, 4)]
    #             super_batch = {}
    #             for k in batches[0].keys():
    #                 super_batch[k] = np.concatenate([b[k] for b in batches])
    #             lang_batch = dataset_coordinator.labelled_test_ds.next()
    #             fig_enc, fig_plan, z_enc, z_plan = lfp.plotting.produce_cluster_fig(super_batch, lang_batch, trainer, args=args)
    #             # if not args.gcbc and not args.images:
    #             #     z_enc, z_plan = produce_cluster_fig(next(plotting_dataset), encoder, planner, TEST_DATA_PATHS[0], num_take=dl.batch_size//4)
    #             # # Comet
def experiment(doodad_config, variant):
    from rlkit.core import logger
    from rlkit.launchers.launcher_util import setup_logger
    print("doodad_config.base_log_dir: ", doodad_config.base_log_dir)
    from datetime import datetime
    timestamp = datetime.now().strftime('%Y_%m_%d_%H_%M_%S_%f')
    setup_logger('wrapped_' + variant['env'], variant=variant,
                 log_dir=doodad_config.base_log_dir + "/smirl/" +
                 variant['exp_name'] + "/" + timestamp + "/")
    if (variant["log_comet"]):
        try:
            comet_logger = Experiment(api_key=launchers.config.COMET_API_KEY,
                                      project_name=launchers.config.COMET_PROJECT_NAME,
                                      workspace=launchers.config.COMET_WORKSPACE)
            logger.set_comet_logger(comet_logger)
            comet_logger.set_name(str(variant['env']) + "_" + str(variant['exp_name']))
            print("variant: ", variant)
            variant['comet_key'] = comet_logger.get_key()
            comet_logger.log_parameters(variant)
            print(comet_logger)
        except Exception as inst:
            print("Not tracking training via comet.ml")
            print("Error: ", inst)

    import gym
    from torch import nn as nn
    import rlkit.torch.pytorch_util as ptu
    import torch
    from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
    from rlkit.exploration_strategies.base import \
        PolicyWrappedWithExplorationStrategy
    from rlkit.policies.argmax import ArgmaxDiscretePolicy
    from rlkit.torch.dqn.dqn import DQNTrainer
    from rlkit.data_management.env_replay_buffer import EnvReplayBuffer
    from rlkit.samplers.data_collector import MdpPathCollector
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    from surprise.utils.rendering_algorithm import TorchBatchRLRenderAlgorithm
    from surprise.envs.tetris.tetris import TetrisEnv
    from surprise.wrappers.obsresize import ResizeObservationWrapper, RenderingObservationWrapper, SoftResetWrapper
    import pdb

    base_env = get_env(variant)
    base_env2 = get_env(variant)
    print("GPU_BUS_Index", variant["GPU_BUS_Index"])
    if torch.cuda.is_available() and doodad_config.use_gpu:
        print("Using the GPU for learning")
        # ptu.set_gpu_mode(True, gpu_id=doodad_config.gpu_id)
        ptu.set_gpu_mode(True, gpu_id=variant["GPU_BUS_Index"])
    else:
        print("NOT Using the GPU for learning")

    # base_env2 = RenderingObservationWrapper(base_env2)
    expl_env, network = add_wrappers(base_env, variant, device=ptu.device)
    eval_env, _ = add_wrappers(base_env2, variant, device=ptu.device,
                               eval=True, network=network)
    if ("vae_wrapper" in variant["wrappers"]):
        eval_env._network = base_env._network

    obs_dim = expl_env.observation_space.low.shape
    print("Final obs dim", obs_dim)
    action_dim = eval_env.action_space.n
    print("Action dimension: ", action_dim)
    qf, target_qf = get_network(variant["network_args"], obs_dim, action_dim)
    qf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(qf)
    if "prob_random_action" in variant:
        expl_policy = PolicyWrappedWithExplorationStrategy(
            EpsilonGreedy(expl_env.action_space,
                          prob_random_action=variant["prob_random_action"],
                          prob_end=variant["prob_end"],
                          steps=variant["steps"]),
            eval_policy,
        )
    else:
        expl_policy = PolicyWrappedWithExplorationStrategy(
            EpsilonGreedy(expl_env.action_space, prob_random_action=0.8,
                          prob_end=0.05),
            eval_policy,
        )
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
        render_kwargs=variant['render_kwargs']
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    trainer = DQNTrainer(
        qf=qf,
        target_qf=target_qf,
        qf_criterion=qf_criterion,
        **variant['trainer_kwargs']
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLRenderAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
class Reptile(Task):
    """
    A meta-learning task that teaches an agent over a set of other tasks
    """
    def __init__(self, data_handler, load_key=None, sender=True,
                 receiver=True, image_captioner=True, image_selector=False,
                 track_results=True):
        self.sess = Agent.sess
        self.N = 1  # number of steps taken for each task - should be > 1
        self.S = SenderAgent()
        self.R = ReceiverAgent(*self.S.get_output())
        self.IC = ImageCaptioner()
        # self.IS = ImageSelector()
        self.S.all_agents_initialized(load_key)
        self.R.all_agents_initialized(load_key)

        self.train_metrics = {}
        self.val_metrics = {}
        self.experiment = Experiment(api_key='1jl4lQOnJsVdZR6oekS6WO5FI',
                                     project_name='Reptile',
                                     auto_param_logging=False,
                                     auto_metric_logging=False,
                                     disabled=(not track_results))
        self.params = {}
        self.params.update(Agent.get_params())
        self.params.update(data_handler.get_params())
        self.experiment.log_parameters(self.params)

        self.T = {}
        if image_captioner:
            self.ic = ImageCaptioning(self.IC, experiment=self.experiment,
                                      track_results=False)
            self.T["Image Captioner"] = lambda img, capts: self.ic.train_batch(
                (img, capts), mode="train")
        if image_selector:
            self.is_ = ImageSelection(self.IS, experiment=self.experiment,
                                      track_results=False)
            self.T["Image Selector"] = lambda img, capts: self.is_.train_batch(
                (img, capts), mode="train")
        if sender or receiver:
            self.rg = ReferentialGame(self.S, self.R,
                                      experiment=self.experiment,
                                      track_results=False)
            if receiver:
                self.T["Receiver"] = lambda img, capts: self.rg.train_batch(
                    img, mode="receiver_train")
            if sender:
                self.T["Sender"] = lambda img, capts: self.rg.train_batch(
                    img, mode="sender_train")

        # Initialize TF
        variables_to_initialize = tf.global_variables()
        if load_key is not None:
            dont_initialize = []
            if SenderAgent.loaded:
                dont_initialize += SenderAgent.get_all_weights()
            if ReceiverAgent.loaded:
                dont_initialize += ReceiverAgent.get_all_weights()
            if ImageCaptioner.loaded:
                dont_initialize += ImageCaptioner.get_all_weights()
            variables_to_initialize = [
                v for v in tf.global_variables() if v not in dont_initialize
            ]
        # REMOVE LATER
        #variables_to_initialize += ImageCaptioner.optimizer.variables()
        Agent.sess.run(tf.variables_initializer(variables_to_initialize))

        self.sender_shared_state = VariableState(
            self.sess, SenderAgent.get_shared_weights())
        self.receiver_shared_state = VariableState(
            self.sess, ReceiverAgent.get_shared_weights())
        self.sender_own_state = VariableState(self.sess,
                                              SenderAgent.get_weights())
        self.receiver_own_state = VariableState(self.sess,
                                                ReceiverAgent.get_weights())
        # print(SenderAgent.get_shared_weights())
        # print(ReceiverAgent.get_shared_weights())
        # print(SenderAgent.get_weights())
        # print(ReceiverAgent.get_weights())
        # print(tf.trainable_variables())

        self.shared_states = {
            "shared_sender": self.sender_shared_state,
            "shared_receiver": self.receiver_shared_state
        }
        self.own_states = {
            "own_sender": self.sender_own_state,
            "own_receiver": self.receiver_own_state
        }

        shared_average = []
        for k, v in self.shared_states.items():
            shared_average.append(v.export_variables())
        shared_average = np.mean(shared_average, axis=0)
        self.set_weights(new_shared_weights=shared_average)

        self.dh = data_handler

        with open(
                "{}/data/csv_loss_{}.csv".format(project_path,
                                                 self.experiment.get_key()),
                'w+') as csv_loss_file:
            csv_loss_file.write(
                "Image Captioner Loss,Image Selector Loss,Sender Loss,Receiver Loss\n"
            )
        with open(
                "{}/data/csv_accuracy_{}.csv".format(
                    project_path, self.experiment.get_key()),
                'w+') as csv_acc_file:
            csv_acc_file.write(
                "Image Captioner Loss,Image Selector Loss,Sender Loss,Receiver Loss\n"
            )

        self.step = 0

    def get_diff(self, a, b):
        diff = 0.
        if isinstance(a, (np.ndarray, np.generic)):
            return np.sum(np.abs(a - b))
        elif isinstance(a, list):
            for i in range(len(a)):
                diff += self.get_diff(a[i], b[i])
        elif isinstance(a, dict):
            for k in a:
                diff += self.get_diff(a[k], b[k])
        return diff

    def set_weights(self, new_own_weights=None, new_shared_weights=None):
        if new_own_weights is not None:
            for k, s in self.own_states.items():
                s.import_variables(new_own_weights[k])
        if new_shared_weights is not None:
            for k, s in self.shared_states.items():
                s.import_variables(new_shared_weights)

    def train_epoch(self, e, mode=None):
        self.dh.set_params(distractors=0)
        image_gen = self.dh.get_images(return_captions=True, mode="train")

        # Get current variables
        start_vars = {
            k: s.export_variables() for k, s in self.own_states.items()
        }
        start_vars["shared"] = self.shared_states[
            "shared_sender"].export_variables()

        while True:
            try:
                # Save current variables
                old_own = {
                    k: s.export_variables()
                    for k, s in self.own_states.items()
                }
                new_own = {k: [] for k, s in self.own_states.items()}
                old_shared = self.shared_states[
                    "shared_sender"].export_variables()
                new_shared = []

                # For each task
                for task in ["Image Captioner", "Sender", "Receiver"]:
                    # parameter setup to not waste data
                    if task in ["Sender", "Receiver", "Image Selector"]:
                        self.dh.set_params(distractors=Agent.D)
                    else:
                        self.dh.set_params(distractors=0)

                    # Run task n times
                    for _ in range(self.N):
                        images, captions = next(image_gen)
                        acc, loss = self.T[task](images, captions)
                        self.train_metrics[task + " Accuracy"] = acc
                        self.train_metrics[task + " Loss"] = loss

                    # Store new variables
                    [
                        new_own[k].append(s.export_variables())
                        for k, s in self.own_states.items()
                    ]
                    [
                        new_shared.append(s.export_variables())
                        for k, s in self.shared_states.items()
                    ]
                    # Reset to old variables for next task
                    [
                        s.import_variables(old_own[k])
                        for k, s in self.own_states.items()
                    ]
                    [
                        s.import_variables(old_shared)
                        for k, s in self.shared_states.items()
                    ]

                self.step += 1
                self.experiment.set_step(self.step)
                self.experiment.log_metrics(self.train_metrics)

                # Average new variables
                new_own = {
                    k: interpolate_vars(old_own[k], average_vars(new_own[k]),
                                        0.2)
                    for k, s in self.own_states.items()
                }
                new_shared = interpolate_vars(old_shared,
                                              average_vars(new_shared), 0.2)
                # Set variables to new variables
                self.set_weights(new_own_weights=new_own,
                                 new_shared_weights=new_shared)

            except StopIteration:
                break

        # Get change in weights
        end_vars = {
            k: s.export_variables() for k, s in self.own_states.items()
        }
        end_vars["shared"] = self.shared_states[
            "shared_sender"].export_variables()
        weight_diff = self.get_diff(start_vars, end_vars)

        #self.experiment.set_step(e)
        self.val_metrics["Weight Change"] = weight_diff
        self.experiment.log_metrics(self.val_metrics)

        # Log data to a csv
        with open("{}/data/csv_loss_{}.csv".format(project_path, self.experiment.get_key()), 'a') as csv_loss_file, \
                open("{}/data/csv_accuracy_{}.csv".format(project_path, self.experiment.get_key()), 'a') as csv_acc_file:
            losses = []
            accs = []
            for task in ["Image Captioner", "Sender", "Receiver"]:
                losses.append(str(self.train_metrics[task + " Loss"]))
                accs.append(str(self.train_metrics[task + " Accuracy"]))
            csv_loss_file.write(",".join(losses))
            csv_loss_file.write("\n")
            csv_acc_file.write(",".join(accs))
            csv_acc_file.write("\n")

        return 0, weight_diff
from allennlp.modules.augmented_lstm import AugmentedLstm
from torch import nn
from torch.utils.data.sampler import SubsetRandomSampler
from sklearn.metrics import classification_report
from tqdm import tqdm

from dataset import ProielDataset

COMET_API_KEY = os.getenv('COMET_API_KEY')

experiment = Experiment(
    api_key=COMET_API_KEY,
    project_name='deep-latin-tagger',
    workspace='tylerkirby',
)
EXPERIMENT_HASH = experiment.get_key()


class BayesianDropoutLSTM(nn.Module):
    def __init__(self,
                 vocab_size,
                 tag_size,
                 X_lengths,
                 embedding_dim,
                 hidden_size,
                 recurrent_dropout_probability=0):
        super(BayesianDropoutLSTM, self).__init__()
        self.X_lengths = X_lengths
        self.embedding_layer = nn.Embedding(vocab_size, embedding_dim)
        self.augmented_lstm = AugmentedLstm(
            input_size=embedding_dim,
if len(modes) == 0 or len([x for x in modes if x not in ['train', 'test']]):
    print('please provide train or test modes')
    exit(1)

# OPTIONAL COMET DATA LOGGING SETUP #
experiment = None
if log_comet:
    from comet_ml import Experiment
    config = util.load_config()
    experiment = Experiment(api_key=config['comet']['api_key'],
                            project_name=config['comet']['project_name'],
                            workspace=config['comet']['workspace'])
# END OPTIONAL COMET DATA LOGGING SETUP #

dir_name = experiment.get_key() if experiment is not None else str(
    int(time.time()))

checkpoints_dir = None
if save_checkpoints:
    checkpoints_dir = '{}/{}'.format(checkpoints_root_dir, dir_name)
if 'test' in modes:
    results_dir = '{}/{}'.format(results_root_dir, dir_name)

# ADDITIONAL IMPORTS
# - imports are split because comet_ml requires being imported before torch
from dataset.dataset_loader import DatasetLoader
from model.agent import DDPG
from model.util import determine_device
from train import train
from test import test
# OPTIONAL COMET DATA LOGGING SETUP #
experiment = None
if log_comet:
    from comet_ml import Experiment
    config = util.load_config()
    experiment = Experiment(api_key=config['comet']['api_key'],
                            project_name=config['comet']['project_name'],
                            workspace=config['comet']['workspace'])
# END OPTIONAL COMET DATA LOGGING SETUP #

# SETUP CHECKPOINTS DIR #
if save_checkpoints:
    checkpoints_dir_name = experiment.get_key() if experiment is not None else str(int(start))
    checkpoints_dir = '{}/{}'.format(checkpoints_root_dir,
                                     checkpoints_dir_name)
    os.makedirs(checkpoints_dir, exist_ok=True)
else:
    checkpoints_dir = None
# END SETUP CHECKPOINTS DIR #

# SETUP RESULTS DIR #
if 'test' in modes:
    results_dir_name = experiment.get_key() if experiment is not None else str(int(start))
    if 'train' in modes and save_checkpoints:
        results_dir_name = checkpoints_dir_name
    elif load_model:
class Dashboard:
    """Record training/evaluation statistics to comet
    :param Path log_dir
    :param list taskid_to_name
    """

    def __init__(self, config, paras, log_dir, train_type, resume=False):
        self.log_dir = log_dir
        self.expkey_f = Path(self.log_dir, 'exp_key')
        self.global_step = 1

        if resume:
            assert self.expkey_f.exists(), \
                f"Cannot find comet exp key in {self.log_dir}"
            with open(Path(self.log_dir, 'exp_key'), 'r') as f:
                exp_key = f.read().strip()
            self.exp = ExistingExperiment(
                previous_experiment=exp_key,
                project_name=COMET_PROJECT_NAME,
                workspace=COMET_WORKSPACE,
                auto_output_logging=None,
                auto_metric_logging=None,
                display_summary_level=0,
            )
        else:
            self.exp = Experiment(
                project_name=COMET_PROJECT_NAME,
                workspace=COMET_WORKSPACE,
                auto_output_logging=None,
                auto_metric_logging=None,
                display_summary_level=0,
            )
            #TODO: is there exists better way to do this?
            with open(self.expkey_f, 'w') as f:
                print(self.exp.get_key(), file=f)

            self.exp.log_other('seed', paras.seed)
            self.log_config(config)

            if train_type == 'evaluation':
                if paras.pretrain:
                    self.exp.set_name(
                        f"{paras.pretrain_suffix}-{paras.eval_suffix}")
                    self.exp.add_tags([
                        paras.pretrain_suffix, config['solver']['setting'],
                        paras.accent, paras.algo, paras.eval_suffix
                    ])
                    if paras.pretrain_model_path:
                        self.exp.log_other("pretrain-model-path",
                                           paras.pretrain_model_path)
                    else:
                        self.exp.log_other("pretrain-runs",
                                           paras.pretrain_runs)
                        self.exp.log_other("pretrain-setting",
                                           paras.pretrain_setting)
                        self.exp.log_other("pretrain-tgt-accent",
                                           paras.pretrain_tgt_accent)
                else:
                    self.exp.set_name(paras.eval_suffix)
                    self.exp.add_tags(
                        ["mono", config['solver']['setting'], paras.accent])
            else:
                self.exp.set_name(paras.pretrain_suffix)
                self.exp.log_others({
                    f"accent{i}": k
                    for i, k in enumerate(paras.pretrain_accents)
                })
                self.exp.log_other('accent', paras.tgt_accent)
                self.exp.add_tags([
                    paras.algo, config['solver']['setting'], paras.tgt_accent
                ])
            #TODO: Need to add pretrain setting

        ##slurm-related
        hostname = os.uname()[1]
        if len(hostname.split('.')) == 2 and hostname.split('.')[1] == 'speech':
            logger.notice(f"Running on Battleship {hostname}")
            self.exp.log_other('jobid', int(os.getenv('SLURM_JOBID')))
        else:
            logger.notice(f"Running on {hostname}")

    def log_config(self, config):
        #NOTE: depth at most 2
        for block in config:
            for n, p in config[block].items():
                if isinstance(p, dict):
                    self.exp.log_parameters(p, prefix=f'{block}-{n}')
                else:
                    self.exp.log_parameter(f'{block}-{n}', p)

    def set_status(self, status):
        self.exp.log_other('status', status)

    def step(self, n=1):
        self.global_step += n

    def set_step(self, global_step=1):
        self.global_step = global_step

    def log_info(self, prefix, info):
        self.exp.log_metrics({k: float(v) for k, v in info.items()},
                             prefix=prefix, step=self.global_step)

    def log_other(self, name, value):
        self.exp.log_metric(name, value, step=self.global_step)

    def log_step(self):
        self.exp.log_other('step', self.global_step)

    def add_figure(self, fig_name, data):
        self.exp.log_figure(figure_name=fig_name, figure=data,
                            step=self.global_step)

    def check(self):
        if not self.exp.alive:
            logger.warning("Comet logging stopped")
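# Hypothetical usage sketch for Dashboard (not part of the original source). It
# assumes COMET_PROJECT_NAME and COMET_WORKSPACE are defined in the module, and that
# `config` (nested dict, depth at most 2) and `paras` (a namespace with the attributes
# referenced above, e.g. seed, eval_suffix, accent) come from the caller.
# dash = Dashboard(config, paras, log_dir='log/run1', train_type='evaluation')
# dash.set_status('training')
# dash.log_info('train', {'loss': 1.23, 'uer': 0.45})   # logged at the current global_step
# dash.step(1)                                          # advance the step counter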
model = torch.nn.DataParallel(model).cuda()
opt = torch.optim.Adam(model.parameters(), lr=hparams['learning_rate'])

all_losses = [back_loss_tr_loss_name] + \
             [k for k in sorted(val_losses.keys())] + \
             [k for k in sorted(tr_val_losses.keys())]

tr_step = 0
val_step = 0
for i in range(hparams['n_epochs']):
    res_dic = {}
    for loss_name in all_losses:
        res_dic[loss_name] = {'mean': 0., 'std': 0., 'acc': []}
    print("Experiment: {} - {} || Epoch: {}/{}".format(experiment.get_key(),
                                                       experiment.get_tags(),
                                                       i + 1,
                                                       hparams['n_epochs']))
    model.train()

    for data in tqdm(train_gen, desc='Training'):
        opt.zero_grad()
        m1wavs = data[0].unsqueeze(1).cuda()
        clean_wavs = data[-1].cuda()

        rec_sources_wavs = model(m1wavs)
        l = back_loss_tr_loss(rec_sources_wavs, clean_wavs,
                              initial_mixtures=m1wavs)
def cli_main():
    parser = options.get_training_parser()
    parser.add_argument(
        "--comet-logging",
        action="store_true",
        help="Whether to use Comet.ML for logging",
    )
    args = options.parse_args_and_arch(parser)
    logging = getattr(args, "comet_logging", False)
    config = None
    if logging:
        PROJECT = "machine-translation"
        if not keyring.get_password("comet", PROJECT):
            comet_ml_api_key = getpass("Please enter the comet.ml API key: ")
            keyring.set_password("comet", PROJECT, comet_ml_api_key)
        else:
            comet_ml_api_key = keyring.get_password("comet", PROJECT)
        experiment = Experiment(
            api_key=comet_ml_api_key,
            project_name="machine-translation",
            workspace="machine-translation",
            auto_output_logging=None,
        )
        config = {
            "api_key": comet_ml_api_key,
            "experiment_key": experiment.get_key()
        }
        print("Proceeding with Comet.ML logging...")

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, config, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args, config)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = "tcp://localhost:{port}".format(
            port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != "no_c10d":
            print(
                "| NOTE: you may get better performance with: --ddp-backend=no_c10d"
            )
        torch.multiprocessing.spawn(fn=distributed_main,
                                    args=(args, config),
                                    nprocs=args.distributed_world_size)
    else:
        # single GPU training
        main(args, config=config)

    if config:
        experiment.end()
class DWIMLAbstractTrainer: """ This Trainer class's train_and_validate() method: - Creates DataLoaders from the data_loaders. Collate_fn will be the loader.load_batch() method, and the dataset will be sampler.source_data. - Trains each epoch by using compute_batch_loss, which should be implemented in each project's child class. Comet is used to save training information, but some logs will also be saved locally in the saving_path. """ def __init__(self, model: MainModelAbstract, experiments_path: str, experiment_name: str, batch_sampler_training: DWIMLBatchSampler, batch_loader_training: AbstractBatchLoader, batch_sampler_validation: DWIMLBatchSampler = None, batch_loader_validation: AbstractBatchLoader = None, model_uses_streamlines: bool = False, learning_rate: float = 0.001, weight_decay: float = 0.01, max_epochs: int = 10, max_batches_per_epoch: int = 1000, patience: int = None, nb_cpu_processes: int = 0, use_gpu: bool = False, comet_workspace: str = None, comet_project: str = None, from_checkpoint: bool = False, log_level=logging.root.level): """ Parameters ---------- model: MainModelAbstract Instatiated class containing your model. experiments_path: str Path where to save this experiment's results and checkpoints. Will be saved in experiments_path/experiment_name. experiment_name: str Name of this experiment. This will also be the name that will appear online for comet.ml experiment. batch_sampler_training: DWIMLBatchSampler Instantiated class used for sampling batches of training data. Data in batch_sampler_training.source_data must be already loaded. batch_loader_training: AbstractBatchLoader Instantiated class with a load_batch method able to load data associated to sampled batch ids. batch_sampler_validation: DWIMLBatchSampler Similar as before, for the validation set. Can be set to None if no validation is used. Then, best model is based on training loss. batch_loader_validation: AbstractBatchLoader Again, similar as before but can be set to None. model_uses_streamlines: bool If true, the batch streamlines will be sent to the model when calling the forward method. Else, only the inputs. Default: False. learning_rate: float Learning rate. Default: 0.001 (torch's default) weight_decay: float Add a weight decay penalty on the parameters. Default: 0.01. (torch's default). max_epochs: int Maximum number of epochs. Default = 10, for no good reason. max_batches_per_epoch: int Maximum number of batches per epoch. Default = 10000, for no good reason. patience: int Use early stopping. Defines the number of epochs after which the model should stop if the loss hasn't improved. Default: None (i.e. no early stopping). nb_cpu_processes: int Number of parallel CPU workers. Use 0 to avoid parallel threads. Default : 0. use_gpu: bool If true, use GPU device when possible instead of CPU. Default = False comet_workspace: str Your comet workspace. See our docs/Getting Started for more information on comet and its API key. Default= None (comet.ml will not be used). comet_project: str Send your experiment to a specific comet.ml project. Default: None (it will be sent to Uncategorized Experiments). from_checkpoint: bool If true, we do not create the output dir, as it should already exist. Default: False. """ # To developers: do not forget that changes here must be reflected # in the save_checkpoint method! # ---------------------- # Values given by the user # ---------------------- # Trainer's logging level can be changed separately from main # scripts. 
self.logger = logger self.logger.setLevel(log_level) # Experiment if not os.path.isdir(experiments_path): raise NotADirectoryError("The experiments path does not exist! " "({})".format(experiments_path)) self.experiments_path = experiments_path self.saving_path = os.path.join(experiments_path, experiment_name) if not from_checkpoint and not os.path.isdir(self.saving_path): logging.info('Creating directory {}'.format(self.saving_path)) os.mkdir(self.saving_path) self.experiment_name = experiment_name self.saving_path = os.path.join(self.experiments_path, self.experiment_name) if not from_checkpoint and not os.path.isdir(self.saving_path): logger.info('Creating directory {}'.format(self.saving_path)) os.mkdir(self.saving_path) # Note that the training/validation sets are contained in the # data_loaders.data_source self.train_batch_sampler = batch_sampler_training self.valid_batch_sampler = batch_sampler_validation if self.valid_batch_sampler is None: self.use_validation = False self.logger.warning( "WARNING! There is not validation set. Loss for best epoch " "monitoring will be the training loss. \n" "Best practice is to have a validation set.") else: self.use_validation = True self.train_batch_loader = batch_loader_training self.valid_batch_loader = batch_loader_validation self.model = model self.model_uses_streamlines = model_uses_streamlines self.max_epochs = max_epochs self.max_batches_per_epochs = max_batches_per_epoch self.learning_rate = learning_rate self.weight_decay = weight_decay self.patience = patience self.nb_cpu_processes = nb_cpu_processes self.use_gpu = use_gpu self.comet_workspace = comet_workspace self.comet_project = comet_project # ---------------------- # Values fixed by us # ---------------------- # Device and rng value. Note that if loading from a checkpoint, the # complete state should be updated. if use_gpu: if torch.cuda.is_available(): self.device = torch.device('cuda') # Setting the rng seed if (self.use_validation and self.train_batch_sampler.rng != self.valid_batch_sampler.rng): raise ValueError("Training and validation batch samplers " "do not have the same rng. Please verify " "the code.") # If you see a hint error below, upgrade torch. torch.cuda.manual_seed(self.train_batch_sampler.rng) else: raise ValueError("You chose GPU (cuda) device but it is not " "available!") else: self.device = torch.device('cpu') # ---------------------- # Values that will be modified later on. If initializing experiment # from a checkpoint, these values should be updated after # initialization. # ---------------------- if patience: self.best_epoch_monitoring = BestEpochMonitoring( patience=self.patience) else: # We won't use early stopping to stop the epoch, but we will use # it as monitor of the best epochs. self.best_epoch_monitoring = BestEpochMonitoring( patience=self.max_batches_per_epochs + 1) self.current_epoch = 0 # Nb of batches with be estimated later on self.nb_train_batches_per_epoch = None self.nb_valid_batches_per_epoch = None # RNG state # Nothing to to here. # Setup monitors self.train_loss_monitor = ValueHistoryMonitor("Training loss") self.valid_loss_monitor = ValueHistoryMonitor("Validation loss") self.grad_norm_monitor = ValueHistoryMonitor("Grad Norm") # Comet values will be instantiated in train(). self.comet_exp = None self.comet_key = None # ---------------------- # Launching optimizer! # ---------------------- # Prepare optimizer # Send model to device. Reminder, contrary to tensors, model.to # overwrites the model. # NOTE: This ordering is important! 
The optimizer needs to use the cuda # Tensors if using the GPU... self.model.to(device=self.device) # Build optimizer (Optimizer is built here since it needs the model # parameters) list_params = [n for n, _ in self.model.named_parameters()] self.logger.debug("Initiating trainer: {}".format(type(self))) self.logger.debug( "This trainer will use Adam optimization on the " "following model.parameters: \n\n".join(list_params) + "\n") self.optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate, weight_decay=weight_decay) @property def params_for_checkpoint(self): # These are the parameters necessary to use _init_ params = { 'model_uses_streamlines': self.model_uses_streamlines, 'learning_rate': self.learning_rate, 'weight_decay': self.weight_decay, 'max_epochs': self.max_epochs, 'max_batches_per_epoch': self.max_batches_per_epochs, 'patience': self.patience, 'nb_cpu_processes': self.nb_cpu_processes, 'use_gpu': self.use_gpu, 'comet_workspace': self.comet_workspace, 'comet_project': self.comet_project } return params @property def params(self) -> dict: params = self.params_for_checkpoint params.update({ 'experiments_path': self.experiments_path, 'experiment_name': self.experiment_name, 'comet_key': self.comet_key, 'computed_values': { 'nb_training_batches_per_epoch': self.nb_train_batches_per_epoch, 'nb_validation_batches_per_epoch': self.nb_valid_batches_per_epoch } }) return params def _init_comet(self): """ For more information on comet, see our doc/Getting Started """ try: if self.comet_key: self.comet_exp = ExistingExperiment( previous_experiment=self.comet_key) elif self.comet_workspace: # New experiment # Use trainset name as comet project name project_name = self.comet_project self.comet_exp = CometExperiment( project_name=project_name, workspace=self.comet_workspace, log_code=False, log_graph=True, auto_param_logging=True, auto_metric_logging=False, parse_args=False, auto_output_logging='native', log_env_details=True, log_env_gpu=True, log_env_cpu=True, log_env_host=False, log_git_metadata=True, log_git_patch=True, display_summary=False) self.comet_exp.set_name(self.experiment_name) self.comet_exp.log_parameters(self.params) self.comet_key = self.comet_exp.get_key() except ConnectionError: self.logger.warning( "Could not connect to Comet.ml, metrics will not be logged " "online...") self.comet_exp = None self.comet_key = None def estimate_nb_batches_per_epoch(self): """ Please override in your child class if you have a better way to define the epochs sizes. Returns: (nb_training_batches_per_epoch, nb_validation_batches_per_epoch) """ return self.max_batches_per_epochs, self.max_batches_per_epochs def train_and_validate(self, *args): """ Train + validates the model (+ computes loss) - Starts comet, - Creates DataLoaders from the BatchSamplers, - For each epoch - uses _train_one_epoch and _validate_one_epoch, - checks for earlyStopping if the loss is bad, - saves the model if the loss is good. - Checks if allowed training time is exceeded. Parameters ---------- All *args will be passed all the way to _train_one_epoch and _train_one_batch, in case you want to override them. 
""" self.logger.debug("Trainer {}: \n" "Running the model {}.\n\n".format( type(self), type(self.model))) # If data comes from checkpoint, this is already computed if self.nb_train_batches_per_epoch is None: self.logger.info("Estimating batch sizes.") (self.nb_train_batches_per_epoch, self.nb_valid_batches_per_epoch) = \ self.estimate_nb_batches_per_epoch() # Instantiate comet experiment # If self.comet_key is None: new experiment, will create a key # Else, resuming from checkpoint. Will continue with given key. self._init_comet() if self.comet_exp: train_context = self.comet_exp.train_and_validate valid_context = self.comet_exp.validate else: # Instantiating contexts doing nothing instead train_context = contextlib2.nullcontext valid_context = contextlib2.nullcontext # Create DataLoaders from the BatchSamplers # * Pin memory if interpolation is done by workers; this means that # dataloader output is on GPU, ready to be fed to the model. # Otherwise, dataloader output is kept on CPU, and the main thread # sends volumes and coords on GPU for interpolation. self.logger.debug("- Instantiating dataloaders...") # toDo We wouldn't need training / valid batch samplers and loaders if # I knew how to add option 'training' and 'validation' to the # __iter__ method or to the collate_fn (load_batch). But maybe the # user wants separate options. During validation and training. Ex: # less on-the-fly noise addition to the streamlines during validation? # But I don't see why we wouldn't want the same batch sampler. We # could have only one and use the same.copy() and change the value of # the subset to training or validation. # If we also don't think users want different load_batch, solution # could be (for the dataloader) the collate_fn could be nothing, and # we call load_data() ourselves after, with options. train_dataloader = DataLoader( self.train_batch_sampler.dataset, batch_sampler=self.train_batch_sampler, num_workers=self.nb_cpu_processes, collate_fn=self.train_batch_loader.load_batch, pin_memory=self.use_gpu) valid_dataloader = None if self.use_validation: valid_dataloader = DataLoader( self.valid_batch_sampler.dataset, batch_sampler=self.valid_batch_sampler, num_workers=self.nb_cpu_processes, collate_fn=self.valid_batch_loader.load_batch, pin_memory=self.use_gpu) # Instantiating our IterTimer. # After each iteration, checks that the maximum allowed time has not # been reached. iter_timer = IterTimer(history_len=20) # Start from current_epoch in case the experiment is resuming # Train each epoch for epoch in iter_timer(range(self.current_epoch, self.max_epochs)): # Updating current epoch. First epoch is 0! self.current_epoch = epoch # Training self.logger.info( "**********TRAINING: Epoch #{}*************".format(epoch)) self.train_one_epoch(train_dataloader, train_context, epoch) # Validation if self.use_validation: self.logger.info( "**********VALIDATION: Epoch #{}*************".format( epoch)) self.validate_one_epoch(valid_dataloader, valid_context, epoch, *args) last_loss = self.valid_loss_monitor.epochs_means_history[-1] else: last_loss = self.train_loss_monitor.epochs_means_history[-1] # Updating info self.best_epoch_monitoring.update(last_loss, epoch) # Check for early stopping if self.best_epoch_monitoring.is_patience_reached: self.save_checkpoint() raise EarlyStoppingError( "Early stopping! 
Loss has not improved after {} epochs!\n" "Best result: {}; At epoch #{}".format( self.patience, self.best_epoch_monitoring.best_value, self.best_epoch_monitoring.best_epoch)) # Else, check if current best has been reached # If that is the case, the monitor has just reset its n_bad_epochs # to 0 if self.best_epoch_monitoring.n_bad_epochs == 0: self.logger.info("Best epoch yet! Saving model and loss " "history.") # Save model self.model.update_best_model() self.model.save(self.saving_path) # Save losses (i.e. mean over all batches) losses = { 'train_loss': self.train_loss_monitor.epochs_means_history[ self.best_epoch_monitoring.best_epoch], 'valid_loss': self.best_epoch_monitoring.best_value if self.use_validation else None } with open(os.path.join(self.saving_path, "losses.json"), 'w') as json_file: json_file.write( json.dumps(losses, indent=4, separators=(',', ': '))) # Save information online if self.comet_exp: self.comet_exp.log_metric( "best_loss", self.best_epoch_monitoring.best_value) self.comet_exp.log_metric( "best_epoch", self.best_epoch_monitoring.best_epoch) # End of epoch, save checkpoint for resuming later self.save_checkpoint() def save_model(self): self.model.save(self.saving_path) def train_one_epoch(self, train_dataloader, train_context, epoch): """ Train one epoch of the model: loop on all batches. All *args will be passed all to run_one_batch, which you should implement, in case you need some variables. """ # Make sure there are no existing HDF handles if using parallel workers if (self.nb_cpu_processes > 0 and self.train_batch_sampler.dataset.is_lazy): self.train_batch_sampler.dataset.hdf_handle = None self.train_batch_sampler.dataset.volume_cache_manager = None if self.comet_exp: self.comet_exp.log_metric("current_epoch", self.current_epoch) # Improving loggers for tqdm make_logger_tqdm_fitted(self.logger) make_logger_tqdm_fitted(self.model.logger) make_logger_tqdm_fitted(self.train_batch_sampler.logger) make_logger_tqdm_fitted(self.train_batch_loader.logger) if self.valid_batch_sampler: make_logger_tqdm_fitted(self.valid_batch_sampler.logger) make_logger_tqdm_fitted(self.valid_batch_loader.logger) # Training all batches self.logger.debug("Training one epoch: iterating on batches using " "tqdm on the dataloader...") with tqdm(train_dataloader, ncols=100, total=self.nb_train_batches_per_epoch) as pbar: train_iterator = enumerate(pbar) with train_context(): for batch_id, data in train_iterator: # Break if maximum number of epochs has been reached if batch_id == self.nb_train_batches_per_epoch: # Explicitly close tqdm's progress bar to fix possible # bugs when breaking the loop pbar.close() break mean_loss, grad_norm = self.run_one_batch( data, is_training=True, batch_loader=self.train_batch_loader) self.logger.debug("Updated loss: {}".format(mean_loss)) self.train_loss_monitor.update(mean_loss) self.grad_norm_monitor.update(grad_norm) # Update information every 10 updates if not self.use_validation and batch_id % 10 == 0: self._update_logs(batch_id, mean_loss) # Explicitly delete iterator to kill threads and free memory before # running validation del train_iterator # Making loggers normal make_logger_normal(self.logger) make_logger_normal(self.model.logger) make_logger_normal(self.train_batch_sampler.logger) make_logger_normal(self.train_batch_loader.logger) if self.valid_batch_sampler: make_logger_normal(self.valid_batch_sampler.logger) make_logger_normal(self.valid_batch_loader.logger) # Saving epoch's information self.logger.info("Finishing epoch...") 
self.train_loss_monitor.end_epoch() self.grad_norm_monitor.end_epoch() self._save_log_from_array(self.train_loss_monitor.epochs_means_history, "train_loss.npy") self._save_log_from_array(self.grad_norm_monitor.epochs_means_history, "gradient_norm.npy") with train_context(): if self.comet_exp: self.comet_exp.log_metric( "gradient_norm_epoch", self.grad_norm_monitor.epochs_means_history[-1], step=epoch) self.comet_exp.log_metric( "loss_epoch", self.train_loss_monitor.epochs_means_history[-1], step=epoch) self.logger.info("Mean gradient norm : {}".format( self.grad_norm_monitor.epochs_means_history[-1])) self.logger.info("Mean training loss : {}".format( self.train_loss_monitor.epochs_means_history[-1])) def validate_one_epoch(self, valid_dataloader, valid_context, epoch, *args): """ Validate one epoch of the model: loop on all batches. All *args will be passed all to run_one_batch, which you should implement, in case you need some variables. """ self.logger.debug('Unused args in validate: {}'.format(args)) # Make sure there are no existing HDF handles if using parallel workers if (self.nb_cpu_processes > 0 and self.valid_batch_sampler.dataset.is_lazy): self.valid_batch_sampler.dataset.hdf_handle = None self.valid_batch_sampler.dataset.volume_cache_manager = None # Validate all batches with tqdm(valid_dataloader, ncols=100, total=self.nb_valid_batches_per_epoch) as pbar: valid_iterator = enumerate(pbar) for batch_id, data in valid_iterator: # Break if maximum number of epochs has been reached if batch_id == self.nb_valid_batches_per_epoch: # Explicitly close tqdm's progress bar to fix possible bugs # when breaking the loop pbar.close() break # Validate this batch: forward propagation + loss mean_loss, _ = self.run_one_batch( data, is_training=False, batch_loader=self.valid_batch_loader) self.valid_loss_monitor.update(mean_loss) # Update information every 10 updates if batch_id % 10 == 0: self._update_logs(batch_id, mean_loss) # Explicitly delete iterator to kill threads and free memory before # running training again del valid_iterator # Save this epoch's information self.valid_loss_monitor.end_epoch() self._save_log_from_array(self.valid_loss_monitor.epochs_means_history, "valid_loss.npy") with valid_context(): if self.comet_exp: self.comet_exp.log_metric( "loss_epoch", self.valid_loss_monitor.epochs_means_history[-1], step=epoch) self.logger.info("Validation loss : {}".format( self.valid_loss_monitor.epochs_means_history[-1])) def _update_logs(self, batch_id, mean_loss): if self.comet_exp: self.comet_exp.log_metric("loss_step", mean_loss, step=batch_id) self.comet_exp.log_metric( "gradient_norm_step", self.grad_norm_monitor.current_epoch_history[-1], step=batch_id) def run_one_batch(self, data, is_training: bool, batch_loader): """ Run a batch of data through the model (calling its forward method) and return the mean loss. If training, run the backward method too. If the sampler was instantiated with wait_for_gpu, then we need to compute the inputs here; not done yet. Parameters ---------- data : tuple of (List, dict) This is the output of the AbstractBatchLoader's load_batch() method. If wait_for_gpu, data is (batch_streamlines, final_streamline_ids_per_subj). Else, data is (batch_streamlines, final_streamline_ids_per_subj, inputs) batch_loader: AbstractBatchLoader Either self.train_batch_loader or valid_batch_loader, depending on the case. is_training : bool If True, record the computation graph and backprop through the model parameters. 
Returns ------- mean_loss : float The mean loss of the provided batch grad_norm: float The total norm (sqrt(sum(params**2))) of parameters before gradient clipping, if any. """ raise NotImplementedError def compute_loss(self, model_outputs, targets): """ Calls the compute_loss method of the model. Reimplement in a child class if targets needs to be formatted in any way before the call. """ mean_loss = self.model.compute_loss(model_outputs, targets) return mean_loss def fix_parameters(self): """ This function is called during training, after the forward and backward propagation, but before updating the parameters through the optimizer. User may define their own functions here if some modification on the parameters is necessary. Ex: in the case of vanishing or exploding gradients problem, this would be the place to fix the parameters based on the gradient. """ pass @classmethod def init_from_checkpoint(cls, model: MainModelAbstract, experiments_path, experiment_name, train_batch_sampler: DWIMLBatchSampler, train_batch_loader: AbstractBatchLoader, valid_batch_sampler: Union[DWIMLBatchSampler, None], valid_batch_loader: Union[AbstractBatchLoader, None], checkpoint_state: dict, new_patience, new_max_epochs): """ During save_checkpoint(), checkpoint_state.pkl is saved. Loading it back offers a dict that can be used to instantiate an experiment and set it at the same state as previously. (Current_epoch is updated +1). Hint: If you want to use this in your child class, use: experiment, checkpoint_state = super(cls, cls).init_from_checkpoint(... """ trainer = cls(model, experiments_path, experiment_name, batch_sampler_training=train_batch_sampler, batch_loader_training=train_batch_loader, batch_sampler_validation=valid_batch_sampler, batch_loader_validation=valid_batch_loader, from_checkpoint=True, **checkpoint_state['params_for_init']) current_states = checkpoint_state['current_states'] # Overriding values if new_patience: trainer.patience = new_patience if new_max_epochs: trainer.max_epochs = new_max_epochs # Set RNG states torch.set_rng_state(current_states['torch_rng_state']) trainer.train_batch_sampler.np_rng.set_state( current_states['numpy_rng_state']) if trainer.use_validation: trainer.valid_batch_sampler.np_rng.set_state( current_states['numpy_rng_state']) if trainer.use_gpu: torch.cuda.set_rng_state(current_states['torch_cuda_state']) # Set other objects trainer.comet_key = current_states['comet_key'] trainer.current_epoch = current_states['current_epoch'] + 1 trainer.nb_train_batches_per_epoch = \ current_states['nb_train_batches_per_epoch'] trainer.nb_valid_batches_per_epoch = \ current_states['nb_valid_batches_per_epoch'] trainer.best_epoch_monitoring.set_state( current_states['best_epoch_monitoring_state']) trainer.train_loss_monitor.set_state( current_states['train_loss_monitor_state']) trainer.valid_loss_monitor.set_state( current_states['valid_loss_monitor_state']) trainer.grad_norm_monitor.set_state( current_states['grad_norm_monitor_state']) trainer.optimizer.load_state_dict(current_states['optimizer_state']) logger.info( "Resuming from checkpoint! Next epoch will be epoch #{}".format( trainer.current_epoch)) return trainer def save_checkpoint(self): """ Save an experiment checkpoint that can be resumed from. 
""" self.logger.info("Saving checkpoint...") # Make checkpoint directory checkpoint_dir = os.path.join(self.saving_path, "checkpoint") # Backup old checkpoint before saving, and erase it afterwards to_remove = None if os.path.exists(checkpoint_dir): to_remove = os.path.join(self.saving_path, "checkpoint_old") shutil.move(checkpoint_dir, to_remove) os.mkdir(checkpoint_dir) # Save experiment # Separated function to be re-implemented by child classes to fit your # needs. Below is one working example. checkpoint_state = self._prepare_checkpoint_state() torch.save(checkpoint_state, os.path.join(checkpoint_dir, "checkpoint_state.pkl")) # Save model inside the checkpoint dir self.model.save(checkpoint_dir) if to_remove: shutil.rmtree(to_remove) def _prepare_checkpoint_state(self) -> dict: # These are parameters that should be updated after instantiating cls. current_states = { 'comet_key': self.comet_key, 'current_epoch': self.current_epoch, 'nb_train_batches_per_epoch': self.nb_train_batches_per_epoch, 'nb_valid_batches_per_epoch': self.nb_valid_batches_per_epoch, 'torch_rng_state': torch.random.get_rng_state(), 'torch_cuda_state': torch.cuda.get_rng_state() if self.use_gpu else None, 'numpy_rng_state': self.train_batch_sampler.np_rng.get_state(), 'best_epoch_monitoring_state': self.best_epoch_monitoring.get_state() if self.best_epoch_monitoring else None, 'train_loss_monitor_state': self.train_loss_monitor.get_state(), 'valid_loss_monitor_state': self.valid_loss_monitor.get_state(), 'grad_norm_monitor_state': self.grad_norm_monitor.get_state(), 'optimizer_state': self.optimizer.state_dict(), } # Additional params are the parameters necessary to load data, batch # samplers/loaders (see the example script dwiml_train_model.py). # Note that the training set and validation set attributes should be # the same in theory. #toDo to be checked? checkpoint_state = { 'train_sampler_params': self.train_batch_sampler.params, 'valid_sampler_params': None, 'train_data_params': self.train_batch_sampler.dataset.params, 'valid_data_params': None, 'train_loader_params': self.train_batch_loader.params, 'valid_loader_params': None, 'params_for_init': self.params_for_checkpoint, 'current_states': current_states } if self.use_validation: checkpoint_state.update({ 'valid_sampler_params': self.valid_batch_sampler.params, 'valid_data_params': self.valid_batch_sampler.dataset.params, 'valid_loader_params': self.valid_batch_loader.params }) return checkpoint_state def _save_log_from_array(self, array: np.ndarray, fname: str): log_dir = os.path.join(self.saving_path, "logs") if not os.path.exists(log_dir): os.makedirs(log_dir) fpath = os.path.join(log_dir, fname) np.save(fpath, array) @staticmethod def load_params_from_checkpoint(experiments_path: str, experiment_name: str): total_path = os.path.join(experiments_path, experiment_name, "checkpoint", "checkpoint_state.pkl") if not os.path.isfile(total_path): raise FileNotFoundError( 'Checkpoint was not found! ({})'.format(total_path)) checkpoint_state = torch.load(total_path) return checkpoint_state @staticmethod def check_stopping_cause(checkpoint_state, new_patience=None, new_max_epochs=None): # 1. Check if early stopping had been triggered. best_monitoring_state = \ checkpoint_state['current_states']['best_epoch_monitoring_state'] bad_epochs = best_monitoring_state['n_bad_epochs'] if new_patience is None: # No new patience: checking if early stopping had been triggered. 
if bad_epochs >= best_monitoring_state['patience']: raise EarlyStoppingError( "Resumed experiment was stopped because of early " "stopping, increase patience in order to resume training!") elif bad_epochs >= new_patience: # New patience: checking if will be able to continue raise EarlyStoppingError( "In resumed experiment, we had reach {} bad epochs (i.e. with " "no improvement). You have now overriden patience to {} but " "that won't be enough. Please increase that value in " "order to resume training.".format( best_monitoring_state['n_bad_epochs'], new_patience)) # 2. Checking that max_epochs had not been reached. current_epoch = checkpoint_state['current_states']['current_epoch'] if new_max_epochs is None: if current_epoch == \ checkpoint_state['params_for_init']['max_epochs'] - 1: raise ValueError( "Resumed experiment had stopped after reaching the " "maximum number of epochs allowed (max_epochs = {}). " "Please increase that value in order to resume training.". format(checkpoint_state['params_for_init']['max_epochs'])) else: if current_epoch > new_max_epochs: raise ValueError( "In resumed experiment, we had performed {} epochs). You " "have now overriden max_epoch to {} but that won't be " "enough. Please increase that value in order to resume " "training.".format(current_epoch, new_max_epochs))
GMPS_PATH=/home/gberseth/playground/GMPS MULTIWORLD_PATH=/home/gberseth/playground/multiworld/ python3 functional_scripts/seq_train.py
"""
import sys
import os

GMPS_PATH = os.environ['GMPS_PATH']
MULTIWORLD_PATH = os.environ['MULTIWORLD_PATH']

from comet_ml import Experiment

comet_logger = Experiment(api_key="KWwx7zh6I2uw6oQMkpEo3smu0",
                          project_name="ml4l3", workspace="glenb")
comet_logger.set_name("test seq train with vpg")
print(comet_logger.get_key())
# comet_logger.end()

import tensorflow as tf
from functional_scripts.remote_train import experiment as train_experiment
from functional_scripts.local_test_ppo import experiment as rl_experiment

path_to_gmps = GMPS_PATH
test_dir = path_to_gmps + '/seq_test/'
meta_log_dir = test_dir + '/meta_data/'
EXPERT_DATA_LOC = test_dir + '/seq_expert_traj/'


def train_seq(meta_variant, rl_variant, comet_logger=comet_logger):
    from multiprocessing import Process
def main(args=None): # parse arguments if args is None: args = sys.argv[1:] args = parse_args(args) configs = configparser.ConfigParser() if args.config is not None: configs = read_config_file(args.config) if args.comet_api_key is not None: comet_experiment = Experiment(api_key=args.comet_api_key, project_name=args.comet_project_name, workspace=args.comet_workspace) comet_experiment.add_tag(args.experiment_tag) comet_experiment.set_name(args.experiment_tag) # get the experiment key from comet and replace the one passed throught the arguments args.experiment_key = comet_experiment.get_key() args_dict = vars(args) for arg_key, arg_val in args_dict.items(): if isinstance(arg_val, argparse.Namespace): comet_experiment.log_parameters(vars(arg_val),arg_key) else: comet_experiment.log_parameter(arg_key, arg_val) # store the transformer configuration arg_key = 'init' comet_experiment.log_parameters(configs._sections['init'], arg_key) snapshot_path = helper.make_dir(os.path.join(args.snapshot_path, args.experiment_key)) result_path = helper.make_dir(os.path.join(args.log_path, args.experiment_key)) mfile = snapshot_path + 'transformer.h5' # store the args and configs helper.store_settings(store_object=args, json_file=result_path + 'script_arguments.args') write_config_file(configs, result_path + 'config.ini') train_generator = CSVGenerator(args.annotations, batch_size=args.batch_size, tokens_file=args.vocab, i_embedding_matrix_file=args.i_embedding_matrix, o_embedding_matrix_file=args.o_embedding_matrix, sequence_max_length=int(configs['init']['len_limit'])) i_tokens = train_generator.i_tokens o_tokens = train_generator.o_tokens i_embedding_matrix = train_generator.i_embedding_matrix o_embedding_matrix = train_generator.o_embedding_matrix if args.val_annotations: validation_generator = CSVGenerator(args.val_annotations, batch_size=args.batch_size, i_tokens=i_tokens, o_tokens=o_tokens, sequence_max_length=int(configs['init']['len_limit'])) val_size = validation_generator.size() else: validation_generator = None val_size = None if args.steps is not None: train_size = args.steps else: train_size = train_generator.size() print('seq 1 words:', i_tokens.num()) print('seq 2 words:', o_tokens.num()) s2s = Transformer(i_tokens, o_tokens, i_embedding_matrix=i_embedding_matrix, o_embedding_matrix=o_embedding_matrix, **configs['init']) training_model = transformer(transformer_structure=s2s, inputs=None) lr_scheduler = LRSchedulerPerStep(configs['init']['d_model'], 4000) training_model.compile( metrics={'transformer_classification': metrics.masked_accuracy(layer_size=int(configs['init']['len_limit']))}, loss={'transformer_classification': losses.masked_ce(layer_size=int(configs['init']['len_limit']))}, optimizer=deserialize({'class_name': configs['optimizer']['class_name'], 'config':eval(configs['optimizer']['config'])})) model_saver = ModelCheckpoint(mfile, save_best_only=True, save_weights_only=True) csv_logger = CSVLogger(result_path + 'results.csv', append=True) training_model.summary() plot_model(training_model, to_file=snapshot_path + 'architecture.png', show_shapes=True, show_layer_names=True) try: training_model.load_weights(mfile) except: print('\n\nnew model') training_model.fit_generator(train_generator, epochs=args.epochs, shuffle=False, steps_per_epoch=train_size, callbacks=[lr_scheduler, model_saver, csv_logger], validation_data=validation_generator, validation_steps=val_size)
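# The training script above passes LRSchedulerPerStep(d_model, 4000) as a
# callback. Schedulers of that name commonly implement the per-step warmup
# schedule from "Attention Is All You Need"; the sketch below is an assumption
# about that behaviour (written against tf.keras), not the project's actual
# class.
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import Callback


class WarmupLRPerStep(Callback):
    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()
        self.d_model = float(d_model)
        self.warmup_steps = warmup_steps
        self.step = 0

    def on_batch_begin(self, batch, logs=None):
        self.step += 1
        # lr = d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)
        lr = self.d_model ** -0.5 * min(
            self.step ** -0.5, self.step * self.warmup_steps ** -1.5)
        K.set_value(self.model.optimizer.lr, lr)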
def main(args): torch.manual_seed(args.seed) np.random.seed(args.seed) print('Loading data') data = np.load(args.boards_file, allow_pickle=True) idxs = data['idxs'] labels = data['values'] mask = labels != None idxs = idxs[mask] labels = labels[mask] n = len(idxs) if args.shuffle: perm = np.random.permutation(n) idxs = idxs[perm] labels = labels[perm] if args.experiment is None: experiment = Experiment(project_name="chess-axia") experiment.log_parameters(vars(args)) else: experiment = ExistingExperiment(previous_experiment=args.experiment) key = experiment.get_key() print(f'Number of Boards: {n}') if torch.cuda.is_available() and args.num_gpus > 0: device = torch.device('cuda:0') else: device = torch.device('cpu') if args.num_train is None: args.num_train = n - args.num_test if args.num_train + args.num_test > n: raise ValueError('num-train and num-test sum to more than dataset size') train_idxs = idxs[:args.num_train] test_idxs = idxs[-args.num_test:] train_labels = labels[:-args.num_test] test_labels = labels[-args.num_test:] #print(f'Win percentage: {sum(train_labels)/ len(train_labels):.1%}') print('Train size: ' + str(len(train_labels))) train_loader = DataLoader(BoardAndPieces(train_idxs, train_labels), batch_size=args.batch_size, collate_fn=collate_fn, shuffle=True) test_loader = DataLoader(BoardAndPieces(test_idxs, test_labels), batch_size=args.batch_size, collate_fn=collate_fn) ae = AutoEncoder().to(device) ae_file = append_to_modelname(args.ae_model, args.ae_iter) ae.load_state_dict(torch.load(ae_file)) model = BoardValuator(ae).to(device) loss_fn = model.loss_fn model = DataParallel(model) if args.model_loadname: model.load_state_dict(torch.load(args.model_loadname)) if args.ae_freeze: print('Freezing AE model') for param in ae.parameters(): param.requires_grad = False if torch.cuda.device_count() > 1 and args.num_gpus > 1: model = torch.nn.DataParallel(model) optimizer = optim.Adam(model.parameters(), lr=args.lr) #cum_acc = cum_loss = count = 0 total_iters = args.init_iter for epoch in range(args.init_epoch, args.epochs): print(f'Running epoch {epoch} / {args.epochs}\n') #for batch_idx, (input, mask, label) in tqdm(enumerate(train_loader), # total=len(train_loader)): for batch_idx, (input, mask, label) in enumerate(train_loader): model.train() input = to(input, device) mask = to(mask, device) label = to(label, device) optimizer.zero_grad() output = model(input, mask) loss = loss_fn(output, label) loss.backward() optimizer.step() cum_loss += loss.item() # cum_acc += acc.item() count += 1 if total_iters % args.log_interval == 0: tqdm.write(f'Epoch: {epoch}\t Iter: {total_iters:>6}\t Loss: {loss.item():.5f}') # experiment.log_metric('accuracy', cum_acc / count, # step=total_iters) experiment.log_metric('loss', cum_loss / count, step=total_iters) experiment.log_metric('loss_', cum_loss / count, step=total_iters) #cum_acc = cum_loss = count = 0 if total_iters % args.save_interval == 0: path = get_modelpath(args.model_dirname, key, args.model_savename, iter=total_iters, epoch=epoch) dirname = os.path.dirname(path) if not os.path.exists(dirname): os.makedirs(dirname) torch.save(model.state_dict(), path) if total_iters % args.eval_interval == 0 and total_iters != 0: loss = eval_loss(model, test_loader, device, loss_fn) tqdm.write(f'\tTEST: Loss: {loss:.5f}') #experiment.log_metric('test accuracy', acc, step=total_iters, # epoch=epoch) experiment.log_metric('test loss', loss, step=total_iters, epoch=epoch) total_iters += 1
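# `eval_loss` is called in the loop above but not defined in this excerpt, and
# the running averages `cum_loss` / `count` are used before being initialised
# (their initialisation line is commented out). A minimal sketch of both,
# assuming the loader yields (input, mask, label) batches as above:
import torch


def _to(x, device):
    # Move a tensor, or a tuple/list of tensors, onto the given device.
    if isinstance(x, (list, tuple)):
        return type(x)(_to(t, device) for t in x)
    return x.to(device)


def eval_loss(model, loader, device, loss_fn):
    model.eval()
    total, n_batches = 0.0, 0
    with torch.no_grad():
        for input, mask, label in loader:
            output = model(_to(input, device), _to(mask, device))
            total += loss_fn(output, _to(label, device)).item()
            n_batches += 1
    return total / max(n_batches, 1)


# Before the epoch loop (and again after each logging interval, so the logged
# value is a per-interval average rather than a running mean over the run):
cum_loss = count = 0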
def main(datafile='./data/train_.pt', epochs=1000, learning_rate=1e-3, dim_out=10, device='cuda:0', project_name='em_showers_net_training', work_space='schattengenie', graph_embedder='GraphNN_KNN_v2', edge_classifier='EdgeClassifier_v1', patience=10): experiment = Experiment(project_name=project_name, workspace=work_space) early_stopping = EarlyStopping_(patience=patience, verbose=True) device = torch.device(device) showers = preprocess_dataset(datafile) showers_train, showers_test = train_test_split(showers, random_state=1337) train_loader = DataLoader(showers_train, batch_size=1, shuffle=True) test_loader = DataLoader(showers_test, batch_size=1, shuffle=True) k = showers[0].x.shape[1] print(k) graph_embedder = str_to_class(graph_embedder)(dim_out=dim_out, k=k).to(device) edge_classifier = str_to_class(edge_classifier)(dim_out=dim_out).to(device) criterion = FocalLoss(gamma=2.) optimizer = torch.optim.Adam(list(graph_embedder.parameters()) + list(edge_classifier.parameters()), lr=learning_rate) loss_train = RunningAverageMeter() loss_test = RunningAverageMeter() roc_auc_test = RunningAverageMeter() pr_auc_test = RunningAverageMeter() acc_test = RunningAverageMeter() class_disbalance = RunningAverageMeter() for _ in tqdm(range(epochs)): for shower in train_loader: shower = shower.to(device) edge_labels_true, edge_labels_predicted = predict_one_shower( shower, graph_embedder=graph_embedder, edge_classifier=edge_classifier) # calculate the batch loss loss = criterion(edge_labels_predicted, edge_labels_true.float()) # Zero gradients, perform a backward pass, and update the weights. optimizer.zero_grad() loss.backward() optimizer.step() loss_train.update(loss.item()) class_disbalance.update((edge_labels_true.sum().float() / len(edge_labels_true)).item()) y_true_list = deque() y_pred_list = deque() for shower in test_loader: shower = shower.to(device) edge_labels_true, edge_labels_predicted = predict_one_shower( shower, graph_embedder=graph_embedder, edge_classifier=edge_classifier) # calculate the batch loss loss = criterion(edge_labels_predicted, edge_labels_true.float()) y_true, y_pred = edge_labels_true.detach().cpu().numpy( ), edge_labels_predicted.detach().cpu().numpy() y_true_list.append(y_true) y_pred_list.append(y_pred) acc = accuracy_score(y_true, y_pred.round()) roc_auc = roc_auc_score(y_true, y_pred) pr_auc = average_precision_score(y_true, y_pred) loss_test.update(loss.item()) acc_test.update(acc) roc_auc_test.update(roc_auc) pr_auc_test.update(pr_auc) class_disbalance.update((edge_labels_true.sum().float() / len(edge_labels_true)).item()) #f = plot_aucs(y_true=y_true, y_pred=y_pred) #experiment.log_figure("Optimization dynamic", f, overwrite=True) experiment_key = experiment.get_key() eval_loss = loss_test.val early_stopping(eval_loss, graph_embedder, edge_classifier, experiment_key) #### if early_stopping.early_stop: print("Early stopping") break # TODO: save best #torch.save(graph_embedder.state_dict(), "graph_embedder_{}.pt".format(experiment_key)) #torch.save(edge_classifier.state_dict(), "edge_classifier_{}.pt".format(experiment_key)) experiment.log_metric('loss_test', loss_test.val) experiment.log_metric('acc_test', acc_test.val) experiment.log_metric('roc_auc_test', roc_auc_test.val) experiment.log_metric('pr_auc_test', pr_auc_test.val) experiment.log_metric('class_disbalance', class_disbalance.val) y_true = np.concatenate(y_true_list) y_pred = np.concatenate(y_pred_list) # load the last checkpoint with the best model graph_embedder.load_state_dict( 
torch.load("graph_embedder_{}.pt".format(experiment_key))) edge_classifier.load_state_dict( torch.load("edge_classifier_{}.pt".format(experiment_key)))
return new_sources tr_step = 0 val_step = 0 for i in range(hparams['n_epochs']): res_dic = {} histograms_dic = {} for loss_name in all_losses: res_dic[loss_name] = {'mean': 0., 'std': 0., 'acc': []} res_dic[loss_name+'i'] = {'mean': 0., 'std': 0., 'acc': []} for hist_name in histogram_names: histograms_dic[hist_name] = [] histograms_dic[hist_name+'i'] = [] print("Higher Order Sudo-RM-RF: {} - {} || Epoch: {}/{}".format( experiment.get_key(), experiment.get_tags(), i+1, hparams['n_epochs'])) model.train() for data in tqdm(generators['train'], desc='Training'): opt.zero_grad() #m1wavs = data[0].cuda() clean_wavs = data[-1].cuda() if hparams['max_abs_snr'] > 0.: clean_wavs = mix_with_random_snr(clean_wavs, hparams['max_abs_snr']) histograms_dic['tr_input_snr'] += (10. * torch.log10( (clean_wavs[:, 0] ** 2).sum(-1) / (1e-8 + ( clean_wavs[:, 1] ** 2).sum(-1)))).tolist() # # Online mixing over samples of the batch. (This might cause to get
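# `mix_with_random_snr` is called above but not defined in this excerpt. A
# typical implementation rescales the second source so that the pairwise SNR
# (in dB) is uniform in [-max_abs_snr, +max_abs_snr]; the sketch below is an
# assumption about that behaviour, not the project's implementation.
import torch


def mix_with_random_snr_sketch(clean_wavs, max_abs_snr, eps=1e-8):
    # clean_wavs: [batch, 2, time] tensor holding the two sources to be mixed.
    batch = clean_wavs.shape[0]
    target_snr_db = (torch.rand(batch, device=clean_wavs.device) * 2. - 1.) \
        * max_abs_snr

    energy_0 = (clean_wavs[:, 0] ** 2).sum(-1)
    energy_1 = (clean_wavs[:, 1] ** 2).sum(-1)
    current_snr_db = 10. * torch.log10(energy_0 / (energy_1 + eps))

    # Amplitude gain for source 1 so that the new SNR equals the target:
    # SNR' = SNR - 20*log10(gain)  =>  gain = 10^((SNR - target) / 20).
    gain = 10. ** ((current_snr_db - target_snr_db) / 20.)
    rescaled = clean_wavs.clone()
    rescaled[:, 1] = clean_wavs[:, 1] * gain.unsqueeze(-1)
    return rescaled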
def main(): # Training settings parser = argparse.ArgumentParser(description='Cifar10 Example') parser.add_argument('--batch-size', type=int, default=128, metavar='N', help='input batch size for training (default: 128)') parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N', help='input batch size for testing (default: 1000)') parser.add_argument('--epochs', type=int, default=25, metavar='N', help='number of epochs to train (default: 25)') parser.add_argument('--lr', type=float, default=0.1, metavar='LR', help='learning rate (default: 0.1)') parser.add_argument('--momentum', type=float, default=0.9, metavar='M', help='SGD momentum (default: 0.9)') parser.add_argument('--model-path', type=str, default='', metavar='M', help='model param path') parser.add_argument('--loss-type', type=str, default='CE', metavar='L', help='B or CE or F or ICF_CE or ICF_F or CB_CE or CB_F') parser.add_argument('--beta', type=float, default=0.999, metavar='B', help='Beta for ClassBalancedLoss') parser.add_argument('--gamma', type=float, default=2.0, metavar='G', help='Gamma for FocalLoss') parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training') parser.add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)') parser.add_argument('--log-interval', type=int, default=10, metavar='N', help='how many batches to wait before logging training status') parser.add_argument('--balanced-data', action='store_true', default=False, help='For sampling rate. Default is Imbalanced-data.') parser.add_argument('--save-model', action='store_true', default=False, help='For Saving the current Model') args = parser.parse_args() # Add the following code anywhere in your machine learning file experiment = Experiment(api_key="5Yl3Rxz9S3E0PUKQTBpA0QJPi", project_name="imbalanced-cifar-10", workspace="tancoro") # ブラウザの実験ページを開く # experiment.display(clear=True, wait=True, new=0, autoraise=True) # 実験キー(実験を一意に特定するためのキー)の取得 exp_key = experiment.get_key() print('KEY: ' + exp_key) # HyperParamの記録 hyper_params = { 'batch_size': args.batch_size, 'epoch': args.epochs, 'learning_rate': args.lr, 'sgd_momentum' : args.momentum, 'model_path' : args.model_path, 'loss_type' : args.loss_type, 'beta' : args.beta, 'gamma' : args.gamma, 'torch_manual_seed': args.seed, 'balanced_data' : args.balanced_data } experiment.log_parameters(hyper_params) use_cuda = not args.no_cuda and torch.cuda.is_available() print('use_cuda {}'.format(use_cuda)) torch.manual_seed(args.seed) device = torch.device("cuda" if use_cuda else "cpu") kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {} # train dataset cifar10_train_dataset = datasets.CIFAR10('./data', train=True, download=True, transform=transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)) ])) # train sampling rate sampling_rate = {} if not args.balanced_data: sampling_rate = {1:0.05, 4:0.05, 6:0.05} print(sampling_rate) # train Sampler train_sampler = ReductionSampler(cifar10_train_dataset, sampling_rate=sampling_rate) # train loader train_loader = torch.utils.data.DataLoader(cifar10_train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs) # test dataset cifar10_test_dataset = datasets.CIFAR10('./data', train=False, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)) ])) # test majority 
loader test_majority_sampler = ReductionSampler(cifar10_test_dataset, sampling_rate={1:0, 4:0, 6:0}) test_majority_loader = torch.utils.data.DataLoader(cifar10_test_dataset, batch_size=args.test_batch_size, sampler=test_majority_sampler, **kwargs) # test minority loader test_minority_sampler = ReductionSampler(cifar10_test_dataset, sampling_rate={0:0, 2:0, 3:0, 5:0, 7:0, 8:0, 9:0}) test_minority_loader = torch.utils.data.DataLoader(cifar10_test_dataset, batch_size=args.test_batch_size, sampler=test_minority_sampler, **kwargs) # test alldata loader test_alldata_loader = torch.utils.data.DataLoader(cifar10_test_dataset, batch_size=args.test_batch_size, shuffle=True, **kwargs) model = ResNet18().to(device) # train loss train_loss = BasicCrossEntropyLoss() if args.loss_type == 'CE': train_loss = CrossEntropyLoss(train_sampler.get_data_count_map(), device) elif args.loss_type == 'F': train_loss = FocalLoss(train_sampler.get_data_count_map(), device, gamma=args.gamma) elif args.loss_type == 'ICF_CE': train_loss = InverseClassFrequencyCrossEntropyLoss(train_sampler.get_data_count_map(), device) elif args.loss_type == 'ICF_F': train_loss = InverseClassFrequencyFocalLoss(train_sampler.get_data_count_map(), device, gamma=args.gamma) elif args.loss_type == 'CB_CE': train_loss = ClassBalancedCrossEntropyLoss(train_sampler.get_data_count_map(), device, beta=args.beta) elif args.loss_type == 'CB_F': train_loss = ClassBalancedFocalLoss(train_sampler.get_data_count_map(), device, beta=args.beta, gamma=args.gamma) print('Train Loss Type: {}'.format(type(train_loss))) # load param if len(args.model_path) > 0: model.load_state_dict(torch.load(args.model_path)) optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=5e-4) # lr = 0.1 if epoch < 15 # lr = 0.01 if 15 <= epoch < 20 # lr = 0.001 if 20 <= epoch < 25 scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[15,20], gamma=0.1) for epoch in range(1, args.epochs + 1): with experiment.train(): experiment.log_current_epoch(epoch) train(args, model, device, train_loader, len(train_sampler), optimizer, epoch, experiment, lossfunc=train_loss) with experiment.test(): test(args, model, device, test_minority_loader, len(test_minority_sampler), epoch, experiment, pref='minority') test(args, model, device, test_majority_loader, len(test_majority_sampler), epoch, experiment, pref='majority') test(args, model, device, test_alldata_loader, len(test_alldata_loader.dataset), epoch, experiment, pref='all') if (args.save_model) and (epoch % 10 == 0): print('saving model to ./model/cifar10_{0}_{1:04d}.pt'.format(exp_key, epoch)) torch.save(model.state_dict(), "./model/cifar10_{0}_{1:04d}.pt".format(exp_key, epoch)) scheduler.step()
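# The ClassBalanced* losses above weight classes by the "effective number of
# samples" (Cui et al., 2019): w_c = (1 - beta) / (1 - beta^{n_c}). A minimal
# sketch of computing those weights from the sampler's per-class counts; the
# {class_index: count} format is an assumption based on the
# get_data_count_map() calls above.
import torch


def class_balanced_weights(data_count_map, beta, device):
    num_classes = len(data_count_map)
    counts = torch.tensor([data_count_map[c] for c in range(num_classes)],
                          dtype=torch.float, device=device)
    effective_num = 1.0 - torch.pow(torch.tensor(beta, device=device), counts)
    weights = (1.0 - beta) / effective_num
    # Normalise so the weights sum to the number of classes.
    weights = weights / weights.sum() * num_classes
    return weights


# Such weights can then be fed to a standard weighted cross entropy, e.g.:
# criterion = torch.nn.CrossEntropyLoss(
#     weight=class_balanced_weights(counts, beta=0.999, device=device))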
def comet_lgbm(save_path): from comet_ml import Experiment exp = Experiment(api_key="sqMrI9jc8kzJYobRXRuptF5Tj", project_name="baseline", workspace="gdreiman1") exp.log_code = True import pickle import pandas as pd import lightgbm as lgb import numpy as np import sklearn import matplotlib.pyplot as plt from sklearn.metrics import precision_recall_fscore_support as prf #%% def single_roc(y_preds,y_true): from sklearn.metrics import roc_curve, auc,precision_recall_curve fpr, tpr, _ = roc_curve(y_true, y_preds) roc_auc = auc(fpr, tpr) plt.figure() lw = 2 plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc) plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('Receiver operating characteristic example') precision, recall, thresholds = precision_recall_curve(y_true, y_preds) plt.plot(recall, precision, color='blue', lw=lw, label='Precision vs Recall') # show the plot plt.legend(loc="lower right") plt.show() def multi_roc(y_preds,y_true,name,n_classes): import collections nested_dict = lambda: collections.defaultdict(nested_dict) data_store = nested_dict() from sklearn.metrics import roc_curve, auc from scipy import interp from itertools import cycle lw = 2 name_store = ['Active', 'Inactive', 'Inconclusive'] fpr = dict() tpr = dict() roc_auc = dict() for i in range(n_classes): fpr[i], tpr[i], _ = roc_curve(y_true[:, i], y_preds[:, i]) roc_auc[i] = auc(fpr[i], tpr[i]) # Compute micro-average ROC curve and ROC area fpr["micro"], tpr["micro"], _ = roc_curve(y_true[:, i].ravel(), y_preds[:, i].ravel()) roc_auc["micro"] = auc(fpr["micro"], tpr["micro"]) # Compute macro-average ROC curve and ROC area # First aggregate all false positive rates all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)])) # Then interpolate all ROC curves at this points mean_tpr = np.zeros_like(all_fpr) for i in range(n_classes): mean_tpr += interp(all_fpr, fpr[i], tpr[i]) # Finally average it and compute AUC mean_tpr /= n_classes fpr["macro"] = all_fpr tpr["macro"] = mean_tpr roc_auc["macro"] = auc(fpr["macro"], tpr["macro"]) # Plot all ROC curves plt.figure() plt.plot(fpr["micro"], tpr["micro"], label='micro-average ROC curve (area = {0:0.2f})' ''.format(roc_auc["micro"]), color='deeppink', linestyle=':', linewidth=4) plt.plot(fpr["macro"], tpr["macro"], label='macro-average ROC curve (area = {0:0.2f})' ''.format(roc_auc["macro"]), color='navy', linestyle=':', linewidth=4) colors = cycle(['aqua', 'darkorange', 'cornflowerblue','green']) for i, color in zip(range(n_classes), colors): plt.plot(fpr[i], tpr[i], color=color, lw=lw, label='ROC curve of '+ name_store[i]+'(area = {1:0.2f})' ''.format(i, roc_auc[i])) plt.plot([0, 1], [0, 1], 'k--', lw=lw) plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') #plt.title('Multi-class ROC for '+name+' Split= '+str(count+1)) plt.title('Multi-class ROC for '+name) plt.legend(loc="lower right") #plt.show() #%% #save_path = r'C:\Users\gdrei\Dropbox\UCL\Thesis\May_13\AID_1345083_processed.pkl' model_type = 'lgbm' #get data cleaned pickle_off = open(save_path,'rb') activity_table=pickle.load(pickle_off) pickle_off.close() #get length of MFP fp_length = len(activity_table.iloc[5]['MFP']) from sklearn.preprocessing import StandardScaler, LabelEncoder scaler = StandardScaler(copy = False) le = LabelEncoder() labels = 
le.fit_transform(activity_table['PUBCHEM_ACTIVITY_OUTCOME']) #split data: from sklearn.model_selection import StratifiedShuffleSplit splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.5, train_size=None, random_state=2562) X_mfp = np.concatenate(np.array(activity_table['MFP'])).ravel() X_mfp = X_mfp.reshape((-1,fp_length)) for train_ind, test_ind in splitter.split(X_mfp,labels): # standardize data X_train_molchars_std = scaler.fit_transform(np.array(activity_table.iloc[train_ind,4:])) X_test_molchars_std = scaler.transform(np.array(activity_table.iloc[test_ind,4:])) X_train = np.concatenate((X_mfp[train_ind,:],X_train_molchars_std),axis = 1) X_test = np.concatenate((X_mfp[test_ind,:],X_test_molchars_std),axis = 1) y_train = labels[train_ind] y_test = labels[test_ind] #X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X,labels,test_size = .5, shuffle = True, stratify = labels, random_state = 2562) bin_y_train, bin_y_test = [1 if x ==2 else x for x in y_train],[1 if x ==2 else x for x in y_test] #do light gbm #need to make a lib svm file train_data = lgb.Dataset(X_train,label=y_train) test_data = lgb.Dataset(X_test,label=y_test) #make model class lgbm_model = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=31, max_depth=-1, learning_rate=0.1, n_estimators=500, subsample_for_bin=200000, objective='binary', is_unbalance=True, min_split_gain=0.0, min_child_weight=0.001, min_child_samples=20, subsample=1.0, subsample_freq=0, colsample_bytree=1.0, reg_alpha=0.0, reg_lambda=0.0, random_state=None, n_jobs=-1, silent=True, importance_type='split') #train model trained_mod = lgbm_model.fit(X_train,y_train) #predict classes and class_probs test_class_preds = lgbm_model.predict(X_test) test_prob_preds = lgbm_model.predict_proba(X_test) #calculate Class report class_rep = sklearn.metrics.classification_report(y_test,test_class_preds) print(class_rep) if len(set(y_test)) == 2: single_roc(test_prob_preds[:,1],y_test) prec,rec,f_1,supp = prf(y_test, test_class_preds, average=None) else: from tensorflow.keras.utils import to_categorical multi_roc(test_prob_preds,to_categorical(y_test),'',3) prec,rec,f_1,supp = prf(y_test, test_class_preds, average=None) #%% '''Comet Saving Zone''' #get AID number import ntpath #get base file name folder,base = ntpath.split(save_path) #split file name at second _ assumes file save in AID_xxx_endinfo.pkl AID, _,end_info = base.rpartition('_') #save data location, AID info, and version info exp.log_dataset_info(name = AID, version = end_info, path = save_path) #save model params exp.log_parameters(trained_mod.get_params()) #save metrics report to comet if len(f_1) == 2: for i,name in enumerate(['Active','Inactive']): exp.log_metric('f1 class '+name, f_1[i]) exp.log_metric('Recall class'+name,rec[i]) exp.log_metric('Precision class'+name, prec[i]) else: for i,name in enumerate(['Active','Inconclusive','Inactive']): exp.log_metric('f1 class '+str(i), f_1[i]) exp.log_metric('Recall class'+str(i),rec[i]) exp.log_metric('Precision class'+str(i), prec[i]) #exp.log_metric('f1 class '+str(i), f_1[i]) #exp.log_metric('Recall class'+str(i),rec[i]) #exp.log_metric('Precision class'+str(i), prec[i]) exp.log_other('Classification Report',class_rep) #save model in data_folder with comet experiement number associated exp_num = exp.get_key() model_save = folder+'\\'+model_type+'_'+exp_num+'.pkl' pickle_on = open(model_save,'wb') pickle.dump(trained_mod,pickle_on) pickle_on.close() #log trained model location exp.log_other('Trained Model Path',model_save) 
    # save some informative tags:
    tags = [AID, end_info, model_type]
    exp.add_tags(tags)
    # save ROC curve
    exp.log_figure(figure_name='ROC-Pres/Recall', figure=plt)
    plt.show()
    # tell comet that the experiment is over
    exp.end()
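# In addition to the per-class precision/recall/F1 logged above, recent
# comet_ml versions can log a confusion matrix directly. A small sketch under
# that assumption; the label names match the three-class case used above.
from sklearn.metrics import confusion_matrix


def log_confusion_to_comet(exp, y_test, test_class_preds,
                           labels=('Active', 'Inactive', 'Inconclusive')):
    # `matrix=` accepts a plain nested list of counts.
    cm = confusion_matrix(y_test, test_class_preds)
    exp.log_confusion_matrix(matrix=cm.tolist(),
                             labels=list(labels[:cm.shape[0]]),
                             title='LightGBM confusion matrix')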
def train_cifar10(batch_size: int, learning_rate: float, epochs: int, experiment: Experiment, model: Sequential = get_model(), initial_epoch: int = 0, training_datagen: ImageDataGenerator = ImageDataGenerator(), scheduler: Callable[[int], float] = None, early_stopping_th: Optional[int] = 250, data_portion: float = 1.0, find_lr: bool = False) -> None: preprocessing_fnc = training_datagen.preprocessing_function name = experiment.get_key() log_path, model_path = get_output_paths(name) data = get_cifar10_data(data_portion=data_portion) training_datagen.fit(data.x_train) log_images(data.x_train, training_datagen, experiment) log_input_images(data.x_train, data.y_train, training_datagen, experiment) opt = Adam(lr=learning_rate) model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy']) log_model_plot(experiment, model) csv_cb = CSVLogger(log_path) keep_best_cb = KeepBest('val_acc') callbacks = [csv_cb, keep_best_cb] # [csv_cb, early_stopping_cb, keep_best_cb] if early_stopping_th is not None: early_stopping_cb = EarlyStopping('val_acc', patience=early_stopping_th, restore_best_weights=True, verbose=2) callbacks.append(early_stopping_cb) if scheduler is not None: scheduler.experiment_log(experiment=experiment, epochs=list(range(epochs))) callbacks.append(LearningRateScheduler(scheduler)) if find_lr: lrf = LearningRateFinder(model=model) lrf.lrMult = (10e-1 / learning_rate)**( 1.0 / (epochs * len(data.x_train) / batch_size)) callbacks = [ LambdaCallback( on_batch_end=lambda batch, logs: lrf.on_batch_end(batch, logs)) ] model.fit_generator(training_datagen.flow(data.x_train, data.y_train, batch_size=batch_size), steps_per_epoch=len(data.x_train) / batch_size, epochs=epochs, validation_data=(preprocessing_fnc(data.x_dev), data.y_dev), shuffle=True, callbacks=callbacks, verbose=2, initial_epoch=initial_epoch) model.save(model_path) experiment.log_asset(model_path) experiment.log_asset(log_path) if find_lr: experiment.log_figure('lr vs acc', lrf.plot_loss()) log_final_metrics(experiment, model, data, preprocessing_fnc)
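# The LearningRateFinder above multiplies the learning rate by a constant
# factor (`lrMult`) after every batch, so the rate sweeps exponentially from
# the initial value to an upper bound over one run. A minimal sketch of that
# mechanic as a Keras callback; the class and attribute names here are
# illustrative, not the project's LearningRateFinder.
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import Callback


class LRRangeTestSketch(Callback):
    def __init__(self, start_lr, end_lr, total_batches):
        super().__init__()
        self.start_lr = start_lr
        # Per-batch multiplier so that start_lr * mult**total_batches == end_lr.
        self.mult = (end_lr / start_lr) ** (1.0 / total_batches)
        self.lrs, self.losses = [], []

    def on_train_begin(self, logs=None):
        K.set_value(self.model.optimizer.lr, self.start_lr)

    def on_batch_end(self, batch, logs=None):
        lr = float(K.get_value(self.model.optimizer.lr))
        self.lrs.append(lr)
        self.losses.append((logs or {}).get('loss'))
        K.set_value(self.model.optimizer.lr, lr * self.mult)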
} metrics['step_time'] = step_time # validation plotting progbar.add(valid_inc, [('Train Loss', metrics['train_loss']), ('Validation Loss', metrics['valid_loss']), ('Time (s)', step_time)]) #Plot on Comet experiment.log_metrics(metrics, step=t) # Plot on WandB wandb.log(metrics, step=t) if (t + 1) % save_inc == 0: trainer.save_weights(model_path, run_id=wandb.run.id, experiment_key=experiment.get_key()) if not args.gcbc and not args.images: z_enc, z_plan = produce_cluster_fig(next(plotting_dataset), encoder, planner, TEST_DATA_PATHS[0], num_take=dl.batch_size // 4) #Comet experiment.log_figure('z_enc', z_enc, step=t) experiment.log_figure('z_plan', z_plan, step=t) # WandB wandb.log({'z_enc': z_enc, 'z_plan': z_plan}, step=t) #latent_fig = project_enc_and_plan(ze, zp)
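# The loop above logs every metric and figure twice, once to Comet and once to
# WandB. A small fan-out helper keeps the two sinks in sync and avoids the
# duplication; this is a sketch, not part of the original script.
import wandb


def log_everywhere(experiment, metrics, step, figures=None):
    # Scalar metrics.
    experiment.log_metrics(metrics, step=step)
    wandb.log(metrics, step=step)
    # Optional matplotlib figures, keyed by name.
    if figures:
        for name, fig in figures.items():
            experiment.log_figure(name, fig, step=step)
        wandb.log(figures, step=step)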
home = os.environ['HOME'] parser = argparse.ArgumentParser() parser.add_argument('-span', default=.5, type=float) parser.add_argument('-seed', default=1234, type=int) parser.add_argument('-eig', action='store_true') parser.add_argument('-ckpt', default='poison-filtnorm-weaker', type=str) parser.add_argument('-gpu', default='0', type=str) parser.add_argument('-svhn', action='store_true') args = parser.parse_args() # comet stuff if not os.path.exists('comet_expt_key_surface.txt'): experiment = Experiment(api_key="vPCPPZrcrUBitgoQkvzxdsh9k", parse_args=False, project_name='landscape', workspace="wronnyhuang") open('comet_expt_key_surface.txt', 'w+').write(experiment.get_key()) else: comet_key = open('comet_expt_key_surface.txt', 'r').read() experiment = ExistingExperiment(api_key="vPCPPZrcrUBitgoQkvzxdsh9k", previous_experiment=comet_key, parse_args=False) # apply settings np.random.seed(args.seed) os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu # load data and model cleanloader, _, _ = get_loader(join(home, 'datasets'), batchsize=2 * 64, fracdirty=.5, nogan=True, svhn=args.svhn) evaluator = Evaluator(cleanloader) evaluator.restore_weights_dropbox('ckpt/'+args.ckpt) # plot along which direction if args.eig:
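# The persist-or-resume pattern above (write the experiment key to a text file
# on the first run, reuse it afterwards) can be wrapped in a small helper. A
# sketch, with the file name and project settings taken from the code above:
import os
from comet_ml import Experiment, ExistingExperiment


def get_or_resume_experiment(api_key, key_file='comet_expt_key_surface.txt',
                             project_name='landscape',
                             workspace='wronnyhuang'):
    if not os.path.exists(key_file):
        experiment = Experiment(api_key=api_key, parse_args=False,
                                project_name=project_name,
                                workspace=workspace)
        with open(key_file, 'w') as f:
            f.write(experiment.get_key())
        return experiment
    with open(key_file, 'r') as f:
        comet_key = f.read().strip()
    return ExistingExperiment(api_key=api_key,
                              previous_experiment=comet_key,
                              parse_args=False)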
max=50.) errors = torch.cat([er_00 + er_01, er_10 + er_11], 1) return torch.mean(torch.min(errors, 1)[0]) num_available_nodes = len(federated_generators_list) tr_step = 0 val_step = 0 prev_epoch_val_loss = 0. for i in range(hparams['n_global_epochs']): res_dic = {} for loss_name in all_losses: res_dic[loss_name] = {'mean': 0., 'std': 0., 'median': 0., 'acc': []} print("Individual Federated Sudo-RM-RF: {} - {} || Epoch: {}/{}".format( experiment.get_key(), experiment.get_tags(), i + 1, hparams['n_global_epochs'])) training_nodes = federated_generators_list sum_global_loss = 0. for train_node_id, node_dic in enumerate(training_nodes): local_model = node_dic['local_model'] local_model = local_model.cuda() local_model.train() local_opt = torch.optim.Adam(local_model.parameters(), lr=hparams['learning_rate']) if hparams['patience'] > 0: if tr_step % hparams['patience'] == 0: new_lr = (hparams['learning_rate'] / (hparams['divide_lr_by']
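# The block above (cut off in this excerpt) periodically divides the learning
# rate by `divide_lr_by` once per `patience` training steps. A sketch of that
# step schedule and of applying the new rate to an existing optimizer; the
# exponent choice is an assumption, since the original expression is truncated.
def maybe_decay_lr(optimizer, base_lr, divide_lr_by, patience, tr_step):
    if patience > 0 and tr_step % patience == 0:
        # Divide once per completed "patience" window.
        new_lr = base_lr / (divide_lr_by ** (tr_step // patience))
        for param_group in optimizer.param_groups:
            param_group['lr'] = new_lr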
def main(args=None): # parse arguments if args is None: args = sys.argv[1:] args = parse_args(args) # create object that stores backbone information backbone = models.backbone(args.backbone) # make sure keras is the minimum required version check_keras_version() # optionally choose specific GPU if args.gpu: os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu keras.backend.tensorflow_backend.set_session(get_session()) # optionally load config parameters if args.config: args.config = read_config_file(args.config) if args.comet_api_key is not None: comet_experiment = Experiment(api_key=args.comet_api_key, project_name=args.comet_project_name, workspace=args.comet_workspace) comet_experiment.add_tag(args.experiment_tag) comet_experiment.set_name(args.experiment_tag) # get the experiment key from comet and replace the one passed through the arguments args.experiment_key = comet_experiment.get_key() # modify the snapshot path to include the experiment key args.snapshot_path = make_dir( os.path.join(args.snapshot_path, args.experiment_key)) # create the generators train_generator, validation_generator = create_generators( args, backbone.preprocess_image) # create the model if args.snapshot is not None: print('Loading model, this may take a second...') model = models.load_model(args.snapshot, backbone_name=args.backbone) training_model = model anchor_params = None if args.config and 'anchor_parameters' in args.config: anchor_params = parse_anchor_parameters(args.config) prediction_model = retinanet_bbox(model=model, anchor_params=anchor_params) else: weights = args.weights # default to imagenet if nothing else is specified if weights is None and args.imagenet_weights: weights = backbone.download_imagenet() print('Creating model, this may take a second...') model, training_model, prediction_model = create_models( backbone_retinanet=backbone.retinanet, num_classes=train_generator.num_classes(), weights=weights, multi_gpu=args.multi_gpu, freeze_backbone=args.freeze_backbone, config=args.config) # print model summary print(model.summary()) # this lets the generator compute backbone layer shapes using the actual backbone model if 'vgg' in args.backbone or 'densenet' in args.backbone: train_generator.compute_shapes = make_shapes_callback(model) if validation_generator: validation_generator.compute_shapes = train_generator.compute_shapes # create the callbacks callbacks = create_callbacks( model, training_model, prediction_model, validation_generator, args, ) # start training training_model.fit_generator( generator=train_generator, steps_per_epoch=args.steps, epochs=args.epochs, verbose=1, callbacks=callbacks, )
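# The callbacks above come from create_callbacks(); if per-epoch metrics
# should also reach Comet explicitly, a tiny Keras callback can be appended to
# that list. This is a sketch, not part of the original script.
import keras


class CometEpochLogger(keras.callbacks.Callback):
    def __init__(self, comet_experiment):
        super().__init__()
        self.comet_experiment = comet_experiment

    def on_epoch_end(self, epoch, logs=None):
        # `logs` holds Keras' loss/metric values for the finished epoch.
        self.comet_experiment.log_metrics(logs or {}, epoch=epoch)


# Usage (only when a Comet experiment was created above):
# if args.comet_api_key is not None:
#     callbacks.append(CometEpochLogger(comet_experiment))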