def _load_eval_models(model_config, model_save_path):
    """Load the embedding and cross models used for evaluation.

    Falls back to ``model_save_path`` when no explicit checkpoint path is
    configured. The image cache of the embedding model is shared with the
    cross model so it is not loaded twice.

    Returns:
        ``(emb_model, cross_model)`` tuple.
    """
    emb_path = model_config.get("model_path_embedding", model_save_path)
    # Fix: logging.info("msg ", arg) silently dropped the path (the format
    # string had no placeholder) -- use lazy %-style formatting instead.
    logging.info("Loading model from %s", emb_path)
    emb_model = MultimodalTransformer(model_name_or_path=emb_path)
    cross_path = model_config.get("model_path_cross", model_save_path)
    logging.info("Loading model from %s", cross_path)
    cross_model = MultimodalTransformer(model_name_or_path=cross_path)
    # Share the (potentially large) image dict instead of loading it twice.
    cross_model.image_dict = emb_model.image_dict
    return emb_model, cross_model


def run(config):
    """Run dev/test evaluation as configured and archive the config.

    Args:
        config: Experiment configuration dict. Relevant keys: ``seed``,
            ``model``, ``model_path``, ``use_wandb``/``wandb``,
            ``do_dev_test``, ``do_test``, ``test``, ``data``.
    """
    if "seed" in config:
        torch.manual_seed(config["seed"])
        np.random.seed(config["seed"])
        random.seed(config["seed"])

    model_config = config["model"]
    model_folder_name = f"{model_config['name']}-{datetime.now().strftime('%Y-%m-%d_%H-%M')}"
    model_save_path = os.path.join(config["model_path"], model_folder_name)
    os.makedirs(model_save_path, exist_ok=True)

    if config.get("use_wandb", False):
        wandb_config = config["wandb"]
        run = wandb.init(
            config=config,
            project=wandb_config.get("wandb_project", "mmemb"),
            name=wandb_config.get("wandb_name", model_folder_name),
            reinit=True,
        )
    logging.info(config)

    if config.get("do_dev_test", False):
        logging.info("### Testing on Dev ###")
        emb_model, cross_model = _load_eval_models(model_config, model_save_path)
        dev_config = config["test"]
        dev_evaluator = get_evaluator(config["data"], dev_config, emb_model, "dev")
        dev_evaluator([emb_model, cross_model], output_path=model_save_path)

    if config.get("do_test", True):
        logging.info("### Testing ###")
        test_config = config["test"]
        emb_model, cross_model = _load_eval_models(model_config, model_save_path)
        test_evaluator = get_evaluator(config["data"], test_config, emb_model, "test")
        test_evaluator([emb_model, cross_model], output_path=model_save_path)

    # Persist the exact configuration next to the results for reproducibility.
    with open(os.path.join(model_save_path, "config.yaml"), "w") as f:
        yaml.dump(config, f)

    if config.get("use_wandb", False):
        wandb.save(os.path.join(model_save_path, "*.csv"))
        wandb.join()
        run.finish()
def test_logging(self):
    """End-to-end check that training metrics go to W&B rather than local logs."""
    net = WandbLoggedNet.easy_init()
    net.set_save_valid_conditions('valid', 'every', 1, 'epochs')
    net.train_one_epoch()
    net.train_one_epoch()
    # ensure that we are not saving log in net (should be online instead)
    self.assertTrue(net.logs['train'] == {})
    self.assertTrue(net.logs['valid'] == {})
    net.to_cuda()
    net.train_one_epoch()
    self.assertTrue(wandb.run.project_name() == net.wandb_project)
    url = wandb.run.get_url()
    # Finish the run so the API query below sees a completed run.
    wandb.join()
    api = wandb.Api()
    # The run path is the trailing "<entity>/<project>/<run_id>" of the URL.
    path = '/'.join(url.split('/')[-3:])
    run = api.run(path)
    self.assertTrue(run.state == 'finished')
    # Init kwargs must round-trip into the stored run config.
    for key in ['seed', 'nn_args', 'name_prefix']:
        self.assertTrue(run.config[key] == net.default_init_kwargs[key])
    history = run.history(pandas=False)
    self.assertTrue(history[0]['train-two'] == 2)
    self.assertTrue(history[2]['iter'] == history[2]['_step'])
    self.assertTrue(history[-1]['epoch'] == net.epochs)
def train():
    """Sweep entry point: finetune BERT for multi-class sector classification.

    NOTE(review): references ``self``, ``startTime``, ``log`` etc. that are
    not parameters -- this function appears to be defined inside a method so
    those names resolve via closure; confirm against the enclosing scope.
    """
    wandb.init(WAND_PROJECT_NAME)
    modelArgs = {
        "max_seq_length": self.maxSeqLength,
        "output_dir": self.modelOutputDir,
        "overwrite_output_dir": True,
        "best_model_dir": self.bestModelOutputDir,
        "wandb_project": WAND_PROJECT_NAME,
        "num_training_epochs": wandb.config.epochs,
        "learning_rate": wandb.config.learning_rate,
        "do_lower_case": True,
        "cache_dir": self.modelCacheDir,
        "encoding": "utf-8",
        "train_batch_size": 5,
        "eval_batch_size": 5,
        "evaluate_during_training_steps": 50,
        "evaluate_during_training_verbose": True,
        "logging_steps": 5,
        "sliding_window": True,
        "reprocess_input_data": True,
        "evaluate_during_training": True,
        "use_multiprocessing": True,
        "labels_list": SECTOR_LABELS,
    }
    model = ClassificationModel(
        self.modelType,
        self.modelNameOrPath,
        args=modelArgs,
        sweep_config=wandb.config,
        use_cuda=torch.cuda.is_available(),
        num_labels=len(SECTOR_LABELS),
    )
    # Training and evaluation
    try:
        log.info(f"Started training/finetuning BERT on multi-class classification task..")
        model.train_model(
            train_df=self.trainDataset,
            eval_df=self.evalDataset,
            show_running_loss=True,
            output_dir=self.modelOutputDir,
            mcc=sklearn.metrics.matthews_corrcoef,
            acc=sklearn.metrics.balanced_accuracy_score,
        )
        log.info(f"Finished finetuning and evaluating our fine-tuned model on multi-class classification task. Check the folder '{self.modelOutputDir}' for finetuned weights.")
        log.info(f"It took {round((time.time() - startTime) / 3600, 1)} hours to finetune and evaluate our fine-tuned model on multi-class classification task.")
    except Exception:
        # Fix: was a bare ``except:`` which also swallowed
        # KeyboardInterrupt/SystemExit during long training runs.
        exc_type, exc_value, exc_traceback = sys.exc_info()
        err = f"Error occurred while training and evaluating the finetuned model on multi-class classification task. Error is: {exc_type}; {exc_value}."
        log.error(err)
    finally:
        # Always close the W&B run, even when training fails.
        wandb.join()
def log_fn(self, stop_event: Event):
    """Logger-process main loop: forward queued log commands to the real logger.

    Creates the loggers in this process, publishes a few attributes back to
    the parent via the response queue, then drains ``draw_queue`` until
    ``stop_event`` is set and the queue is empty.

    Args:
        stop_event: Set by the parent to request shutdown; polled whenever
            the queue read times out.
    """
    try:
        self._super_create_loggers()
        # NOTE: "resposne_queue" is misspelled, but the name must match the
        # attribute used by the rest of the class.
        self.resposne_queue.put({
            k: self.__dict__[k] for k in ["save_dir", "tb_logdir", "is_sweep"]
        })
        while True:
            try:
                # Short timeout so stop_event is checked regularly.
                cmd = self.draw_queue.get(True, 0.1)
            except EmptyQueue:
                if stop_event.is_set():
                    break
                else:
                    continue
            self._super_log(*cmd)
            # Acknowledge each command so the sender can block until done.
            self.resposne_queue.put(True)
    except:
        # Deliberately broad: report the crash, then re-raise unchanged.
        print("Logger process crashed.")
        raise
    finally:
        print("Logger: syncing")
        if self.use_wandb:
            wandb.join()
        # Ensure the parent sees shutdown even on a crash path.
        stop_event.set()
        print("Logger process terminating...")
def launch_training_on_all_splits(experiment: str, splits: List, base_model: str, dropout: float, learning_rate: float):
    """Finetune a rxnfp BERT regressor on each Buchwald-Hartwig split.

    One W&B run per split (``reinit=True``); labels are standardised with
    the train-split mean/std before training.

    Args:
        experiment: Tag used in the project and output-dir names.
        splits: ``(sheet_name, split_index)`` pairs; split indices are
            1-based as in the original paper.
        base_model: Name suffix of the pretrained rxnfp BERT model.
        dropout: Hidden-layer dropout probability.
        learning_rate: Finetuning learning rate.
    """
    project = f'buchwald_hartwig_training_{experiment}_{base_model}'
    model_args = {
        'wandb_project': project, 'num_train_epochs': 10, 'overwrite_output_dir': True,
        'learning_rate': learning_rate, 'gradient_accumulation_steps': 1, 'regression': True,
        "num_labels": 1, "fp16": False, "evaluate_during_training": True, 'manual_seed': 42,
        "max_seq_length": 300, "train_batch_size": 16, "warmup_ratio": 0.00,
        "config": {'hidden_dropout_prob': dropout}
    }
    for (name, split) in splits:
        if wandb_available:
            wandb.init(name=name, project=project, reinit=True)  # fresh run per split
        df_doyle = pd.read_excel('../data/Buchwald-Hartwig/Dreher_and_Doyle_input_data.xlsx', sheet_name=name)
        df_doyle['rxn'] = generate_buchwald_hartwig_rxns(df_doyle)
        train_df = df_doyle.iloc[:split-1][['rxn', 'Output']] # paper has starting index 1 not 0
        test_df = df_doyle.iloc[split-1:][['rxn', 'Output']] # paper has starting index 1 not 0
        train_df.columns = ['text', 'labels']
        test_df.columns = ['text', 'labels']
        # Standardise labels using train statistics only (no test leakage).
        mean = train_df.labels.mean()
        std = train_df.labels.std()
        train_df['labels'] = (train_df['labels'] - mean) / std
        test_df['labels'] = (test_df['labels'] - mean) / std
        model_path = pkg_resources.resource_filename("rxnfp", f"models/transformers/bert_{base_model}")
        pretrained_bert = SmilesClassificationModel("bert", model_path, num_labels=1, args=model_args,
                                                    use_cuda=torch.cuda.is_available())
        pretrained_bert.train_model(train_df,
                                    output_dir=f"outputs_buchwald_hartwig_{experiment}_{base_model}_{name}_split_{str(split).replace('-','_')}",
                                    eval_df=test_df, r2=sklearn.metrics.r2_score)
        if wandb_available:
            wandb.join() # multiple runs in same script
def run_job(self, job):
    """Execute one sweep job: persist its config, set env vars, run the function.

    Returns:
        True to signal the agent to stop (on interrupt or error),
        None otherwise.
    """
    run_id = job.run_id
    # Persist the sweep-chosen config where wandb expects to find it.
    config_file = os.path.join("wandb", "sweep-" + self._sweep_id, "config-" + run_id + ".yaml")
    config_util.save_config_file_from_dict(config_file, job.config)
    # Point the child run at this run id / config / sweep via the environment.
    os.environ[wandb.env.RUN_ID] = run_id
    os.environ[wandb.env.CONFIG_PATHS] = config_file
    os.environ[wandb.env.SWEEP_ID] = self._sweep_id
    wandb.setup(_reset=True)  # reset singleton state so the next init is clean
    print("wandb: Agent Starting Run: {} with config:\n".format(run_id) + "\n".join([
        "\t{}: {}".format(k, v["value"]) for k, v in job.config.items()
    ]))
    try:
        self._function()
        # Finish the run if the user function left one open.
        if wandb.run:
            wandb.join()
    except KeyboardInterrupt as e:
        print("Keyboard interrupt", e)
        return True
    except Exception as e:
        # Broad on purpose: any failure stops the agent loop.
        print("Problem", e)
        return True
def test_wandb_experiment(csv_filename):
    """Smoke-test the Ludwig W&B contrib hook on a tiny image experiment."""
    # Test W&B integration

    # add wandb arg and detect flag
    sys.argv.append('--wandb')
    ludwig.contrib.contrib_import()

    # disable sync to cloud
    os.environ['WANDB_MODE'] = 'dryrun'

    # Image Inputs
    image_dest_folder = os.path.join(os.getcwd(), 'generated_images')

    # Inputs & Outputs
    input_features = [image_feature(folder=image_dest_folder)]
    output_features = [category_feature()]
    rel_path = generate_data(input_features, output_features, csv_filename)

    # Run experiment
    run_experiment(input_features, output_features, data_csv=rel_path)

    # Check a W&B run was created
    assert wandb.run is not None

    # End session
    wandb.join()

    # Remove instance from contrib_registry
    ludwig.contrib.contrib_registry['instances'].pop()

    # Delete the temporary data created
    shutil.rmtree(image_dest_folder)
def main(CONFIG):
    """Train MagNet with PyTorch Lightning, logging the run to W&B."""
    # Seed every RNG source and pin cuDNN to deterministic kernels so the
    # run is reproducible; do this before any model construction.
    random.seed(CONFIG.SEED)
    np.random.seed(CONFIG.SEED)
    torch.manual_seed(CONFIG.SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Build the network in double precision.
    network = Net(CONFIG).double()

    # Start a fresh W&B run and track gradients/parameters of the network.
    wandb.init(project="MagNet", config=OmegaConf.to_container(CONFIG), reinit=True)
    wandb.watch(network)

    # Record the parameter count in the config for later inspection.
    CONFIG.NUM_PARAMETERS = count_parameters(network)

    lightning_trainer = pl.Trainer(
        # TODO: Add CONFIG parameters for devices
        gpus=1,
        # Don't show progress bar
        progress_bar_refresh_rate=0,
        check_val_every_n_epoch=1,
        # TODO: Try early stopping
        max_epochs=CONFIG.NUM_EPOCH,
    )
    lightning_trainer.fit(network)

    # Flush and close the W&B run.
    wandb.join()
def test():
    """Evaluate the (module-level) ``model`` on ``test_data_loader``.

    Logs accuracy to W&B -- as test or validation accuracy depending on
    ``args.is_test`` -- and returns the accuracy in percent.
    """
    print('Testing')
    model.eval()
    pos = 0
    total = 0
    prediction_list = []
    groundtruth_list = []
    for _, (data, target) in enumerate(tqdm(test_data_loader)):
        data, target = data.cuda(), target.long().cuda()
        with torch.no_grad():
            out = model(data)
        # argmax over the last dimension gives the predicted class.
        pred = torch.max(out, out.dim() - 1)[1]
        pos = pos + torch.eq(pred.cpu().long(), target.data.cpu().long()).sum().item()
        groundtruth_list += target.data.tolist()
        # NOTE(review): out[:, 1] keeps the positive-class score, which
        # assumes a binary classifier -- confirm against the model.
        prediction_list += out[:, 1].tolist()
        total = total + data.size(0)
    acc = pos * 1.0 / total * 100
    print('Acc: %.2f' % acc)
    # Fix: accuracy was recomputed from pos/total in each branch -- reuse
    # the already-computed ``acc`` instead.
    if args.is_test:
        # Log test accuracy in wandb
        wandb.log({"Test Accuracy": acc})
        wandb.join()
    else:
        # Log validation accuracy in wandb
        wandb.log({"Validation Accuracy": acc})
    return acc
def run(self):
    """Finetune distilgpt2 on the task's train split and record the run name.

    Trains with the HuggingFace ``Trainer``, saves model and tokenizer into
    the configured result folder, then writes the run name (W&B run name,
    or a timestamp when W&B is disabled) to the task output.
    """
    result_folder = luigi.configuration.get_config().get(
        'GlobalConfig', 'result_folder')
    model = GPT2LMHeadModel.from_pretrained("distilgpt2")
    tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
    train_dataset = TextDataset(tokenizer, self.input()['train'].path, block_size=self.block_size)
    test_dataset = TextDataset(tokenizer, self.input()['test'].path, block_size=self.block_size)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    training_args = TrainingArguments(
        do_eval=self.do_eval,
        do_train=self.do_train,
        eval_steps=self.eval_steps,
        evaluate_during_training=self.evaluate_during_training,
        gradient_accumulation_steps=self.gradient_accumulation_steps,
        logging_dir='./logs',
        logging_steps=self.logging_steps,
        learning_rate=self.learning_rate,
        max_grad_norm=self.max_grad_norm,
        num_train_epochs=self.num_train_epochs,
        output_dir=result_folder,
        overwrite_output_dir=True,
        per_device_train_batch_size=self.per_device_train_batch_size,
        per_device_eval_batch_size=self.per_device_eval_batch_size,
        save_steps=self.save_steps,
        seed=self.seed,
        warmup_steps=self.warmup_steps,
        weight_decay=self.weight_decay,
    )
    trainer = Trainer(model=model, args=training_args, data_collator=data_collator,
                      train_dataset=train_dataset, eval_dataset=test_dataset)
    trainer.train()
    trainer.save_model()
    tokenizer.save_pretrained(result_folder)
    # Fix: the env var is a *string*, so the old truthiness test treated
    # WANDB_DISABLED="false" as disabled. Parse common true-values instead.
    # (Also fixes the "wanb_disabled" local-variable typo.)
    wandb_disabled = os.environ.get('WANDB_DISABLED', '').strip().lower() in ('1', 'true', 'yes')
    if wandb_disabled:
        run_name = time.strftime('%Y%m%d-%H%M%S')
    else:
        wandb.run.save()
        wandb.join()
        run_name = wandb.run.name
    with open(self.output()['run_name'].path, 'w') as f:
        f.write(run_name)
def main():
    """Run DG-SSR training for every argument combination in the sweep."""
    # This flag allows you to enable the inbuilt cudnn auto-tuner to
    # find the best algorithm to use for your hardware.
    torch.backends.cudnn.benchmark = True
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False
    args = get_args()
    # One full training run per expanded argument combination.
    for args in iterate_args(args):
        output_dir = f'{args.output_dir}/{socket.gethostname()}/{args.experiment}/{args.network}/' + \
            '_'.join([str(_) for _ in args.params]) + '/'
        writer = Writer(output_dir=output_dir, file=f'{args.source[0]}_{args.target}')
        model = model_fns[args.network](num_usv_classes=args.num_usv_classes, num_classes=args.num_classes)
        # monitor_memory()
        if args.wandb:
            tags = [
                args.source[0] + '_' + args.target,
                "_".join([str(_) for _ in args.params])
            ]
            wandb.init(
                project=f'{args.experiment}_{args.network}',
                tags=tags,
                dir=dirname(__file__),
                config=args,
                reinit=True,
                name=f'{"-".join([str(_) for _ in args.params])}-{args.source[0]}-{args.target}'
            )
        if args.redirect_to_file and args.redirect_to_file != 'null':
            print('redirect to ', output_dir + args.redirect_to_file)
            # Redirect all further prints to a per-run log file.
            sys.stdout = open(output_dir + args.redirect_to_file, 'a')
        # wandb.watch(model, log='all')  # can lead to continuous increment of GPU memory usage
        data_loaders = get_DGSSR_data_loader(args.source, args.target, args.data_dir, args.val_size,
                                             args.original_img_prob, args.batch_size, args.max_num_s_img, args)
        optimizer = get_optimizer(model, lr=args.learning_rate, train_all=args.train_all_param)
        # Single LR step at 80% of the epoch budget.
        scheduler = optim.lr_scheduler.StepLR(optimizer, int(args.epochs * .8))
        Trainer(args, model, data_loaders, optimizer, scheduler, writer)
        save_model_dir = f'{args.data_dir}/cache/{socket.gethostname()}/{args.experiment}/{args.network}/' + \
            '_'.join([str(_) for _ in args.params])
        if not os.path.exists(save_model_dir):
            os.makedirs(save_model_dir)
        torch.save(model.state_dict(), save_model_dir + f'/{args.source[0]}_{args.target}.pkl')
        # wandb.save(args.data_dir+'/cache/model.pkl')
        if args.wandb:
            wandb.join()
def train_from_cache(
    *,
    architecture: str = "BiT",
    size: int = 256,
    augmentation: int = 1,
    epochs: int = 20,
    batch_size: int = 32,
    learning_rate: float = 0.001,
    lr_decay_rate: float = 0.99,
    lr_decay_steps: int = 500,  # fix: was 5e2, a float despite the int annotation
    **model_params,
) -> float:
    """
    Trains the model using embeddings that were previously cached.

    Args:
        architecture: Embedding architecture whose cache to load.
        size: Image size the embeddings were computed at.
        augmentation: Augmentation variant of the cached embeddings.
        epochs: Number of training epochs.
        batch_size: Mini-batch size.
        learning_rate: Initial learning rate for Adam.
        lr_decay_rate: Multiplicative LR decay per ``lr_decay_steps``.
        lr_decay_steps: Steps between learning-rate decays.
        **model_params: Forwarded to ``PathologistModel``.

    Returns:
        Final dev-set categorical accuracy.
    """
    train_data, dev_data = load_embeddings(architecture, size, augmentation)

    # Make and init the wandb run.
    wandb.init(project="Plant Pathology", reinit=True)
    wandb.config.update(
        {
            "architecture": architecture,
            "epochs": epochs,
            "batch_size": batch_size,
            "learning_rate": learning_rate,
            "lr_decay_rate": lr_decay_rate,
            "lr_decay_steps": lr_decay_steps,
            "augmentation": augmentation,
            "size": size,
        }
    )

    model = PathologistModel(nclasses=constants.NCLASSES, **model_params)
    model.compile(
        optimizer=Adam(
            learning_rate=ExponentialDecay(learning_rate, lr_decay_steps, lr_decay_rate)
        ),
        loss="categorical_crossentropy",
        metrics=["categorical_accuracy"],
    )
    model.fit(
        train_data["X"],
        train_data["y"],
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(dev_data["X"], dev_data["y"]),
        callbacks=[WandbCallback(save_model=False)],
    )

    # Log the final scores to the run summary.
    train_loss, train_acc = model.evaluate(train_data["X"], train_data["y"])
    _, dev_acc = model.evaluate(dev_data["X"], dev_data["y"])
    wandb.run.summary.update(
        {"final_train_loss": train_loss, "final_train_acc": train_acc}
    )
    wandb.join()
    return dev_acc
def run_batch_episode_exp(total_eps: int, update_every: int, use_norm: bool,
                          wandb_project: str, wandb_group: str):
    """REINFORCE on CartPole-v1, updating from batches of whole episodes.

    Args:
        total_eps: Number of episodes to run.
        update_every: Update the policy every this many episodes.
        use_norm: Whether to normalise returns in the policy update.
        wandb_project: W&B project name.
        wandb_group: W&B group for this experiment family.
    """
    # NOTE:
    # This code doesn't run properly on Windows 10.
    # The result can be reproduced on Ubuntu and Mac OS.
    config = dict()
    config['update_every'] = update_every
    config['use_norm'] = use_norm
    wandb.init(project=wandb_project, entity='junyoung-park', reinit=True,
               group=wandb_group, config=config)
    env = gym.make('CartPole-v1')
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.n
    net = MLP(s_dim, a_dim, [128])
    agent = REINFORCE(net)
    memory = EpisodicMemory(max_size=100, gamma=1.0)
    n_update = 0
    for ep in range(total_eps):
        s = env.reset()
        cum_r = 0
        while True:
            s = to_tensor(s, size=(1, 4))  # CartPole observations are 4-dim
            a = agent.get_action(s)
            ns, r, done, info = env.step(a.item())
            # preprocess data: keep everything as (1, 1) tensors for the memory
            r = torch.ones(1, 1) * r
            done = torch.ones(1, 1) * done
            memory.push(s, a, r, torch.tensor(ns), done)
            s = ns
            cum_r += r
            if done:
                break
        if ep % update_every == 0:
            s, a, _, _, done, g = memory.get_samples()
            agent.update_episodes(s, a, g, use_norm=use_norm)
            memory.reset()
            n_update += 1
            # NOTE(review): logging only on update episodes -- if per-episode
            # returns are wanted this belongs one level up; confirm intent.
            wandb.log({"episode return": cum_r, "num_update": n_update})
    torch.save(agent.state_dict(), join(wandb.run.dir, "agent.pt"))
    wandb.join()
def run_experiment(cfg: Dict, save_weights: bool = False):
    """Run a training experiment. Configuration file can be generated using deepblink config.

    NOTE - There are currently only one type of dataset and model option.
    This is intentional to make future development easier of new models
    such as 3D / 4D options.

    Args:
        cfg: Dictionary configuration file.
        save_weights: If model weights should be saved separately.
            The complete model is automatically saved.
    """
    # Resolve all configured components by name.
    dataset_class = get_from_module("deepblink.datasets", cfg["dataset"])
    model_class = get_from_module("deepblink.models", cfg["model"])
    network_fn = get_from_module("deepblink.networks", cfg["network"])
    optimizer_fn = get_from_module("deepblink.optimizers", cfg["optimizer"])
    loss_fn = get_from_module("deepblink.losses", cfg["loss"])

    # An explicit null in the config is treated the same as a missing key.
    network_args = cfg.get("network_args", {})
    if network_args is None:
        network_args = {}
    dataset_args = cfg.get("dataset_args", {})
    train_args = cfg.get("train_args", {})
    network_args["cell_size"] = dataset_args["cell_size"]

    dataset = dataset_class(dataset_args["version"], dataset_args["cell_size"])
    use_wandb = cfg["use_wandb"]
    model = model_class(
        dataset_args=dataset_args,
        dataset_cls=dataset,
        loss_fn=loss_fn,
        network_args=network_args,
        network_fn=network_fn,
        optimizer_fn=optimizer_fn,
        train_args=train_args,
    )

    # Record environment details alongside the experiment config.
    cfg["system"] = {
        "gpus": tf.config.list_logical_devices("GPU"),
        "version": platform.version(),
        "platform": platform.platform(),
    }

    timestamp = datetime.datetime.now().strftime("%y%m%d_%H%M%S")
    run_name = f"{timestamp}_{cfg['run_name']}"
    if use_wandb:
        wandb.init(name=run_name, project=cfg["name"], config=cfg)

    model = train_model(model, dataset, cfg, run_name, use_wandb)

    if use_wandb:
        wandb.join()
    if save_weights:
        model.save_weights()
def train():
    """Sweep entry point: finetune BERT for sentiment analysis/regression.

    NOTE(review): references ``self``, ``startTime``, ``log`` etc. that are
    not parameters -- this function appears to be defined inside a method so
    those names resolve via closure; confirm against the enclosing scope.
    """
    wandb.init(WAND_PROJECT_NAME)
    modelArgs = {
        "max_seq_length": self.maxSeqLength,
        "output_dir": self.modelOutputDir,
        "overwrite_output_dir": True,
        "best_model_dir": self.bestModelOutputDir,
        "wandb_project": WAND_PROJECT_NAME,
        "num_training_epochs": wandb.config.epochs,
        "learning_rate": wandb.config.learning_rate,
        "do_lower_case": True,
        "cache_dir": self.modelCacheDir,
        "encoding": "utf-8",
        "train_batch_size": 5,
        "eval_batch_size": 5,
        "evaluate_during_training_steps": 50,
        "evaluate_during_training_verbose": True,
        "logging_steps": 5,
        "sliding_window": True,
        "reprocess_input_data": True,
        "evaluate_during_training": True,
        "use_multiprocessing": False,
        "regression": True,
    }
    model = ClassificationModel(self.modelType, self.modelNameOrPath, args=modelArgs,
                                sweep_config=wandb.config,
                                use_cuda=torch.cuda.is_available(), num_labels=1)
    # Training
    try:
        log.info(
            f"Started finetuning BERT on sentiment analysis/regression task.."
        )
        model.train_model(
            train_df=self.trainDataFrame,
            eval_df=self.evalDataFrame,
            show_running_loss=True,
            output_dir=self.modelOutputDir,
            mse=sklearn.metrics.mean_squared_error,
            r2Score=sklearn.metrics.r2_score,
        )
        log.info(
            f"Finished training and evaluation of our finetuned model on sentiment analysis/regression task. Check the folder '{self.modelOutputDir}' for finetuned weights."
        )
        log.info(
            f"It took {round((time.time() - startTime) / 3600, 1)} hours to train/finetune BERT model on sentiment analysis/regression task."
        )
    except Exception:
        # Fix: was a bare ``except:`` which also swallowed
        # KeyboardInterrupt/SystemExit during long training runs.
        exc_type, exc_value, exc_traceback = sys.exc_info()
        err = f"Error occurred while training finetuned model on sentiment analysis/regression task. Error is: {str(exc_type)}; {str(exc_value)}."
        log.error(err)
    finally:
        # Always close the W&B run, even when training fails.
        wandb.join()
def run(self):
    """Consume queued results and mirror them to a W&B run.

    Blocks until the ``_WANDB_QUEUE_END`` sentinel arrives, then syncs
    and finishes the run.
    """
    wandb.init(*self.args, **self.kwargs)
    # Loop until the sentinel value is received from the producer.
    while (item := self.queue.get()) != _WANDB_QUEUE_END:
        metrics, cfg_update = self._handle_result(item)
        wandb.config.update(cfg_update, allow_val_change=True)
        wandb.log(metrics)
    wandb.join()
def launch_training_on_all_splits(experiment: str, splits: List, base_model: str, dropout: float, learning_rate: float):
    """Finetune a rxnfp BERT regressor on each Suzuki-Miyaura random split.

    One W&B run per split (``reinit=True``); labels are standardised with
    the train-split mean/std before training.

    Args:
        experiment: Tag used in the project and output-dir names.
        splits: ``(split_file_name, split_index)`` pairs; rows before the
            index form the train set, the rest the test set.
        base_model: Name suffix of the pretrained rxnfp BERT model.
        dropout: Hidden-layer dropout probability.
        learning_rate: Finetuning learning rate.
    """
    project = f'suzuki_miyaura_training_{experiment}_{base_model}'
    model_args = {
        'wandb_project': project, 'num_train_epochs': 15, 'overwrite_output_dir': True,
        'learning_rate': learning_rate, 'gradient_accumulation_steps': 1, 'regression': True,
        "num_labels": 1, "fp16": False, "evaluate_during_training": False, 'manual_seed': 42,
        "max_seq_length": 300, "train_batch_size": 16, "warmup_ratio": 0.00,
        "config": {'hidden_dropout_prob': dropout}
    }
    for (name, split) in splits:
        if wandb_available:
            wandb.init(name=name, project=project, reinit=True)  # fresh run per split
        df = pd.read_csv(f'../data/Suzuki-Miyaura/random_splits/{name}.tsv', sep='\t')
        train_df = df.iloc[:split][['rxn', 'y']]
        test_df = df.iloc[split:][['rxn', 'y']]
        train_df.columns = ['text', 'labels']
        test_df.columns = ['text', 'labels']
        # Standardise labels using train statistics only (no test leakage).
        mean = train_df.labels.mean()
        std = train_df.labels.std()
        train_df['labels'] = (train_df['labels'] - mean) / std
        test_df['labels'] = (test_df['labels'] - mean) / std
        model_path = pkg_resources.resource_filename(
            "rxnfp", f"models/transformers/bert_{base_model}")
        pretrained_bert = SmilesClassificationModel(
            "bert", model_path, num_labels=1, args=model_args,
            use_cuda=torch.cuda.is_available())
        pretrained_bert.train_model(
            train_df,
            output_dir=f"outputs_suzuki_miyaura_{experiment}_{base_model}_{name}_split_{str(split).replace('-','_')}",
            eval_df=test_df, r2=sklearn.metrics.r2_score)
        if wandb_available:
            wandb.join()
def offline_log_to_wandb(project_name, args_dict, early_stop_results_dict, summary_df,
                         workdir=None, wandb_log_subset_of_metrics=False):
    """Replay a finished run's metrics into W&B after the fact.

    Args:
        project_name: Target W&B project. When None it is derived from
            ``args_dict['exp']['project_name']`` plus an ``_offline`` suffix.
        args_dict: Experiment arguments; also stored as the run config.
        early_stop_results_dict: Early-stopping results written to the run
            summary and used to dump predictions.
        summary_df: Per-step metrics table; must contain an 'epoch' column.
        workdir: Optional working directory for W&B files / predictions.
        wandb_log_subset_of_metrics: Log only the curated curve metrics.
    """
    if project_name is None:
        project_name = args_dict['exp']['project_name'] + '_offline'
        # NOTE(review): '_subset' suffix reconstructed as applying only to
        # auto-derived project names -- confirm against the original layout.
        if wandb_log_subset_of_metrics:
            project_name += '_subset'
    print(f'Writing to W&B project {project_name}')
    curve_metric_names = None
    if wandb_log_subset_of_metrics:
        curve_metric_names = get_wandb_curve_metrics()
    print(f'Start dump results to W&B project: {project_name}')
    wandb_myinit(project_name=project_name,
                 experiment_name=args_dict['exp']['experiment_name'],
                 instance_name=args_dict['exp']['instance_name'],
                 config=args_dict, workdir=workdir)
    global_step_name = 'epoch'
    summary_df = summary_df.set_index(global_step_name)
    print(f'Dump run curves')
    first_iter = True
    for global_step, step_metrics in summary_df.iterrows():
        if first_iter:
            first_iter = False
            # Warn once (on the first row) about requested metrics that are
            # missing from the table.
            if curve_metric_names is not None:
                for metric in curve_metric_names:
                    if metric not in step_metrics:
                        warnings.warn(
                            f"Can't log '{metric}'. It doesn't exists.")
        if wandb_log_subset_of_metrics:
            metrics_to_log = slice_dict_to_dict(step_metrics.to_dict(),
                                                curve_metric_names,
                                                ignore_missing_keys=True)
        else:
            # log all metrics
            metrics_to_log = step_metrics.to_dict()
        metrics_to_log[global_step_name] = global_step
        wandb.log(metrics_to_log)
    early_stop_results_to_wandb_summary(early_stop_results_dict)
    dump_preds_at_early_stop(early_stop_results_dict, workdir, use_wandb=True)
    # terminate nicely offline w&b run
    wandb.join()
def training():
    """Sweep entry point: train a RoBERTa NER model and sync the run."""
    wandb.init()
    ner_model = NERModel(
        "roberta",
        "roberta-base",
        use_cuda=True,
        args=model_args,
        sweep_config=wandb.config,
    )
    ner_model.train_model(train_df, eval_data=trial_df)
    wandb.join()
def evaluate_model(model_dir, wandb_ID=None, predict=True): """Predicts on the dataset and makes performance plots induced by the model_dir. If wanted, the results are logged to W&B. Arguments: model_dir {str} -- Full or partial path to a trained model Keyword Arguments: wandb_ID {str} -- The unique W&B-ID of the experiment. If None, no logging is performed. (default: {None}) """ # ======================================================================== # SAVE OPERATION PLOTS # ======================================================================== if wandb_ID is not None: hyper_pars, data_pars, arch_pars, meta_pars = load_model_pars(model_dir) WANDB_DIR = get_project_root()+'/models' PROJECT = meta_pars['project'] wandb.init(resume=True, id=wandb_ID, dir=WANDB_DIR, project=PROJECT) print(model_dir) log_operation_plots(model_dir, wandb_ID=wandb_ID) # ======================================================================== # PREDICT USING BEST MODEL # ======================================================================== if predict: hyper_pars, data_pars, arch_pars, meta_pars = load_model_pars(model_dir) if data_pars['dataloader'] == 'PickleLoader' or data_pars['dataloader'] == 'SqliteLoader' : calc_predictions_pickle(model_dir, wandb_ID=wandb_ID) else: calc_predictions(model_dir, wandb_ID=wandb_ID) # ======================================================================== # # REPORT PERFORMANCE # ======================================================================== # log_performance_plots(model_dir, wandb_ID=wandb_ID) summarize_model_performance(model_dir, wandb_ID=wandb_ID) if wandb_ID is not None: wandb.log() wandb.join() # Update the meta_pars-file with open(model_dir+'/meta_pars.json') as json_file: meta_pars = json.load(json_file) meta_pars['status'] = 'Finished' with open(model_dir+'/meta_pars.json', 'w') as fp: json.dump(meta_pars, fp) # Close all open figures plt.close('all')
def setup_and_evaluate(run: Run, blueprints: List[Tuple[BlueprintGenome, int]], in_size: List[int], feature_mul: int):
    """Prepare the fully-train W&B run, then evaluate every blueprint.

    Resumes an existing fully-train run when configured, otherwise starts a
    new one; finishes the W&B run once all blueprints are evaluated.
    """
    if config.use_wandb:
        if config.resume_fully_train:
            resume_ft_run(True)
        else:
            new_ft_run(True)

    # Evaluate each (blueprint, generation) pair, retrying on failure.
    for bp, generation in blueprints:
        eval_with_retries(run, bp, generation, in_size, feature_mul)

    wandb.join()
def _start(self, finished_q, env, function, run_id, in_jupyter):
    """Agent worker: export env vars, run the user function, finish the run.

    Args:
        finished_q: Completion queue (not used in this method's body).
        env: Mapping of environment variables to export before running.
        function: The user-supplied run function (may be falsy).
        run_id: W&B run id being executed; used in console messages.
        in_jupyter: Jupyter flag (not used in this method's body).
    """
    if env:
        for k, v in env.items():
            os.environ[k] = v
    # call user function
    print("wandb: Agent Started Run:", run_id)
    if function:
        function()
    print("wandb: Agent Finished Run:", run_id, "\n")
    # Fix: the walrus binding in ``if run := wandb.run`` was never used --
    # a plain truthiness check is all that is needed.
    if wandb.run:
        wandb.join()
def test_resume_allow_success(live_mock_server, test_settings):
    """Resuming with resume='allow' continues history from the server state."""
    res = live_mock_server.set_ctx({"resume": True})
    print("CTX AFTER UPDATE", res)
    print("GET RIGHT AWAY", live_mock_server.get_ctx())
    wandb.init(reinit=True, resume="allow", settings=test_settings)
    wandb.log({"acc": 10})
    wandb.join()
    server_ctx = live_mock_server.get_ctx()
    print("CTX", server_ctx)
    first_stream_hist = first_filestream(server_ctx)["files"]["wandb-history.jsonl"]
    print(first_stream_hist)
    # Presumably the mock server's resumed run already holds 15 history
    # rows, so the new upload starts at offset 15 and the single logged
    # row continues at step 16 -- verify against the mock server fixture.
    assert first_stream_hist["offset"] == 15
    assert json.loads(first_stream_hist["content"][0])["_step"] == 16
def transfer_train(
    *,
    train_set: str,
    size: int = 256,
    epochs: int = 20,
    batch_size: int = 32,
    learning_rate: float = 0.001,
    lr_decay_rate: float = 0.99,
    lr_decay_steps: int = 500,  # fix: was 5e2, a float despite the int annotation
    make_submission: bool = False,
    **model_params,
):
    """
    Trains a new head on top of a transfer model. No fine tuning of the
    transfer model is conducted. Transfer model embeddings are computed
    once at the beginning of the training run.

    If `make_submission==True`, returns this model's scores on the test
    set, along with the corresponding image ids.

    Args:
        train_set: Name of the training split to fit the head on.
        size: Image size used by the transfer model.
        epochs: Number of training epochs.
        batch_size: Mini-batch size.
        learning_rate: Initial learning rate for Adam.
        lr_decay_rate: Multiplicative LR decay per ``lr_decay_steps``.
        lr_decay_steps: Steps between learning-rate decays.
        make_submission: When True, predict on the test set and return it.
        **model_params: Forwarded to ``TransferModel``.
    """
    # Make and init the wandb run.
    wandb.init(project="Plant Pathology", reinit=True)
    wandb.config.update({
        "epochs": epochs,
        "batch_size": batch_size,
        "learning_rate": learning_rate,
        "lr_decay_rate": lr_decay_rate,
        "lr_decay_steps": lr_decay_steps,
        "size": size,
    })

    model = TransferModel(constants.NCLASSES, size, batch_size, **model_params)
    model.compile(
        optimizer=Adam(learning_rate=ExponentialDecay(
            learning_rate, lr_decay_steps, lr_decay_rate)),
        loss="categorical_crossentropy",
        metrics=["categorical_accuracy"],
    )

    # Train the model (just the new layers on top of the transfer model)
    model.fit_head(
        train_set,
        "dev",
        epochs=epochs,
        callbacks=[WandbCallback(save_model=False)],
    )

    # Log the scores
    wandb.join()
    if make_submission:
        return model.predict_on_test()
def train():
    """Sweep entry point: train a multi-label classifier and sync the run."""
    wandb.init()
    classifier = MultiLabelClassificationModel(
        model_type,
        model_name,
        num_labels=len(labels),
        args=model_args,
        use_cuda=cuda_available,
    )
    classifier.train_model(train_data, eval_df=eval_data)
    wandb.join()
def train():
    """Sweep entry point: train and evaluate a RoBERTa classifier."""
    # Start a fresh wandb run for this sweep trial.
    wandb.init()

    # Build the transformer with the sweep-selected hyperparameters.
    classifier = ClassificationModel(
        "roberta",
        "roberta-base",
        use_cuda=True,
        args=model_args,
        sweep_config=wandb.config,
    )

    # Train on the training frame, then evaluate on the held-out frame.
    classifier.train_model(train_df, eval_df=eval_df)
    classifier.eval_model(eval_df)

    # Sync wandb
    wandb.join()
def main(self):
    """Log two epochs of metrics to W&B (TensorBoard events are synced)."""
    wandb.init(config=self.cli_args, sync_tensorboard=True)
    for epoch in range(2):
        self.logMetrics(epoch, 'trn')
        # self.logMetrics(epoch, 'val')
    # Close whichever TensorBoard writers were opened.
    for writer in (self.trn_writer, self.val_writer):
        if writer is not None:
            writer.close()
    wandb.join()
def train(sweep_q, worker_q):
    """Worker process for one cross-validation fold of a sweep.

    Waits for its init payload on ``worker_q``, runs the fold as a grouped
    W&B run, and reports the validation accuracy back on ``sweep_q``.
    """
    reset_wandb_env()
    worker_data = worker_q.get()
    fold_run = wandb.init(
        group=worker_data.sweep_id,
        job_type=worker_data.sweep_run_name,
        name="{}-{}".format(worker_data.sweep_run_name, worker_data.num),
        config=worker_data.config,
    )
    # Placeholder metric -- a real fold would train and evaluate here.
    val_accuracy = random.random()
    fold_run.log(dict(val_accuracy=val_accuracy))
    wandb.join()
    sweep_q.put(WorkerDoneData(val_accuracy=val_accuracy))
def main(): num_folds = 5 # Spin up workers before calling wandb.init() # Workers will be blocked on a queue waiting to start sweep_q = multiprocessing.Queue() workers = [] for num in range(num_folds): q = multiprocessing.Queue() p = multiprocessing.Process(target=train, kwargs=dict(sweep_q=sweep_q, worker_q=q)) p.start() workers.append(Worker(queue=q, process=p)) sweep_run = wandb.init() sweep_id = sweep_run.sweep_id or "unknown" sweep_url = sweep_run.get_sweep_url() project_url = sweep_run.get_project_url() sweep_group_url = "{}/groups/{}".format(project_url, sweep_id) sweep_run.notes = sweep_group_url sweep_run.save() sweep_run_name = sweep_run.name or sweep_run.id or "unknown" metrics = [] for num in range(num_folds): worker = workers[num] # start worker worker.queue.put( WorkerInitData( sweep_id=sweep_id, num=num, sweep_run_name=sweep_run_name, config=dict(sweep_run.config), )) # get metric from worker result = sweep_q.get() # wait for worker to finish worker.process.join() # log metric to sweep_run metrics.append(result.val_accuracy) sweep_run.log(dict(val_accuracy=sum(metrics) / len(metrics))) wandb.join() print("*" * 40) print("Sweep URL: ", sweep_url) print("Sweep Group URL: ", sweep_group_url) print("*" * 40)
def train():
    """Sweep entry point: multi-class RoBERTa training with a custom F1 metric."""
    wandb.init()
    # Tie simpletransformers' internal wandb usage to this exact run id.
    model_args.wandb_kwargs = {"id": wandb.run.id}
    classifier = ClassificationModel(
        "roberta",
        "roberta-base",
        num_labels=num_of_labels,
        use_cuda=True,
        args=model_args,
        sweep_config=wandb.config,
    )
    classifier.train_model(train_df, eval_df=eval_df, f1=f1_multiclass)
    wandb.join()