def test_CP_kde_unlabelled(self):
    utils.set_seed(self.seed)
    N = utils.N
    K = utils.K
    X, x_test = utils.generate_unlabelled_dataset(N, K)
    pred = self.cp_kde.predict_unlabelled(x_test, X, self.epsilon)
    self.assertTrue(pred, 'CP with KDE NCM mispredicted an object in unlabelled setting.')

def test_CP_kde_labelled(self):
    utils.set_seed(self.seed)
    N = utils.N
    K = utils.K
    X, Y, x_test, y_test = utils.generate_labelled_dataset(N, K)
    pred = self.cp_kde.predict_labelled(x_test, X, Y, self.epsilon)
    self.assertIn(y_test, pred, 'CP with KDE NCM mispredicted an object. Objects had dimension {}'.format(K))

def test_CP_kde_labelled_1(self):
    """Consider objects with dimension 1."""
    utils.set_seed(self.seed)
    N = utils.N
    K = 1
    X, Y, x_test, y_test = utils.generate_labelled_dataset(N, K)
    pred = self.cp_kde.predict_labelled(x_test, X, Y, self.epsilon)
    self.assertIn(y_test, pred, 'CP with KDE NCM mispredicted an object. Objects had dimension 1.')

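# NOTE: every snippet in this section calls a project-specific seeding helper
# (utils.set_seed(seed), set_seed(args), or set_seed() depending on the repository).
# The sketch below is a minimal, assumed implementation of such a helper, shown
# for reference only; the actual helpers used by these projects may differ
# (e.g., some also configure cuDNN determinism or accept an argparse namespace).
import random

import numpy as np
import torch


def set_seed(seed=0):
    """Seed the Python, NumPy and PyTorch RNGs for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
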
def main():
    parser = argparse.ArgumentParser(description='Negotiator')
    parser.add_argument('--dataset', type=str, default='./data/negotiate/val.txt', help='location of the dataset')
    parser.add_argument('--model_file', type=str, help='model file')
    parser.add_argument('--smart_ai', action='store_true', default=False, help='to use rollouts')
    parser.add_argument('--seed', type=int, default=1, help='random seed')
    parser.add_argument('--temperature', type=float, default=1.0, help='temperature')
    parser.add_argument('--domain', type=str, default='object_division', help='domain for the dialogue')
    parser.add_argument('--log_file', type=str, default='', help='log file')
    args = parser.parse_args()

    utils.set_seed(args.seed)

    model = utils.load_model(args.model_file)
    ai = LstmAgent(model, args)
    logger = DialogLogger(verbose=True, log_file=args.log_file)
    domain = get_domain(args.domain)

    score_func = rollout if args.smart_ai else likelihood

    dataset, sents = read_dataset(args.dataset)
    ranks, n, k = 0, 0, 0
    for ctx, dialog in dataset:
        start_time = time.time()
        # start new conversation
        ai.feed_context(ctx)
        for sent, you in dialog:
            if you:
                # if it is your turn to say, take the target word and compute its rank
                rank = compute_rank(sent, sents, ai, domain, args.temperature, score_func)
                # compute lang_h for the groundtruth sentence
                enc = ai._encode(sent, ai.model.word_dict)
                _, ai.lang_h, lang_hs = ai.model.score_sent(enc, ai.lang_h, ai.ctx_h, args.temperature)
                # save hidden states and the utterance
                ai.lang_hs.append(lang_hs)
                ai.words.append(ai.model.word2var('YOU:'))
                ai.words.append(Variable(enc))
                ranks += rank
                n += 1
            else:
                ai.read(sent)
        k += 1
        time_elapsed = time.time() - start_time
        logger.dump('dialogue %d | avg rank %.3f | raw %d/%d | time %.3f' % (
            k, 1. * ranks / n, ranks, n, time_elapsed))

    logger.dump('final avg rank %.3f' % (1. * ranks / n))

def main():
    parser = argparse.ArgumentParser(description='selfplaying script')
    parser.add_argument('--alice_model_file', type=str, help='Alice model file')
    parser.add_argument('--bob_model_file', type=str, help='Bob model file')
    parser.add_argument('--context_file', type=str, help='context file')
    parser.add_argument('--temperature', type=float, default=1.0, help='temperature')
    parser.add_argument('--verbose', action='store_true', default=False, help='print out conversations')
    parser.add_argument('--seed', type=int, default=1, help='random seed')
    parser.add_argument('--score_threshold', type=int, default=6,
                        help='successful dialog should have more than score_threshold in score')
    parser.add_argument('--max_turns', type=int, default=20, help='maximum number of turns in a dialog')
    parser.add_argument('--log_file', type=str, default='', help='log successful dialogs to file for training')
    parser.add_argument('--smart_alice', action='store_true', default=False, help='make Alice smart again')
    parser.add_argument('--fast_rollout', action='store_true', default=False, help='to use faster rollouts')
    parser.add_argument('--rollout_bsz', type=int, default=100, help='rollout batch size')
    parser.add_argument('--rollout_count_threshold', type=int, default=3, help='rollout count threshold')
    parser.add_argument('--smart_bob', action='store_true', default=False, help='make Bob smart again')
    parser.add_argument('--ref_text', type=str, help='file with the reference text')
    parser.add_argument('--domain', type=str, default='object_division', help='domain for the dialogue')
    args = parser.parse_args()

    utils.set_seed(args.seed)

    alice_model = utils.load_model(args.alice_model_file)
    alice_ty = get_agent_type(alice_model, args.smart_alice, args.fast_rollout)
    alice = alice_ty(alice_model, args, name='Alice')

    bob_model = utils.load_model(args.bob_model_file)
    bob_ty = get_agent_type(bob_model, args.smart_bob, args.fast_rollout)
    bob = bob_ty(bob_model, args, name='Bob')

    dialog = Dialog([alice, bob], args)
    logger = DialogLogger(verbose=args.verbose, log_file=args.log_file)
    ctx_gen = ContextGenerator(args.context_file)

    selfplay = SelfPlay(dialog, ctx_gen, args, logger)
    selfplay.run()

def main():
    parser = argparse.ArgumentParser(description='chat utility')
    parser.add_argument('--model_file', type=str, help='model file')
    parser.add_argument('--domain', type=str, default='object_division', help='domain for the dialogue')
    parser.add_argument('--context_file', type=str, default='', help='context file')
    parser.add_argument('--temperature', type=float, default=1.0, help='temperature')
    parser.add_argument('--num_types', type=int, default=3, help='number of object types')
    parser.add_argument('--num_objects', type=int, default=6, help='total number of objects')
    parser.add_argument('--max_score', type=int, default=10, help='max score per object')
    parser.add_argument('--score_threshold', type=int, default=6,
                        help='successful dialog should have more than score_threshold in score')
    parser.add_argument('--seed', type=int, default=1, help='random seed')
    parser.add_argument('--smart_ai', action='store_true', default=False, help='make AI smart again')
    parser.add_argument('--ai_starts', action='store_true', default=False, help='allow AI to start the dialog')
    parser.add_argument('--ref_text', type=str, help='file with the reference text')
    args = parser.parse_args()

    utils.set_seed(args.seed)

    human = HumanAgent(domain.get_domain(args.domain))

    alice_ty = LstmRolloutAgent if args.smart_ai else LstmAgent
    ai = alice_ty(utils.load_model(args.model_file), args)

    agents = [ai, human] if args.ai_starts else [human, ai]

    dialog = Dialog(agents, args)
    logger = DialogLogger(verbose=True)
    # either take manually produced contexts, or rely on the ones from the dataset
    if args.context_file == '':
        ctx_gen = ManualContextGenerator(args.num_types, args.num_objects, args.max_score)
    else:
        ctx_gen = ContextGenerator(args.context_file)

    chat = Chat(dialog, ctx_gen, logger)
    chat.run()

def main():
    parser = argparse.ArgumentParser(description='testing script')
    parser.add_argument('--data', type=str, default='data/negotiate', help='location of the data corpus')
    parser.add_argument('--unk_threshold', type=int, default=20, help='minimum word frequency to be in dictionary')
    parser.add_argument('--model_file', type=str, help='pretrained model file')
    parser.add_argument('--seed', type=int, default=1, help='random seed')
    parser.add_argument('--hierarchical', action='store_true', default=False, help='use hierarchical model')
    parser.add_argument('--bsz', type=int, default=16, help='batch size')
    parser.add_argument('--cuda', action='store_true', default=False, help='use CUDA')
    args = parser.parse_args()

    device_id = utils.use_cuda(args.cuda)
    utils.set_seed(args.seed)

    corpus = data.WordCorpus(args.data, freq_cutoff=args.unk_threshold, verbose=True)
    model = utils.load_model(args.model_file)

    crit = Criterion(model.word_dict, device_id=device_id)
    sel_crit = Criterion(model.item_dict, device_id=device_id, bad_toks=['<disconnect>', '<disagree>'])

    testset, testset_stats = corpus.test_dataset(args.bsz, device_id=device_id)
    test_loss, test_select_loss = 0, 0

    N = len(corpus.word_dict)
    for batch in testset:
        # run forward on the batch, produces output, hidden, target,
        # selection output and selection target
        out, hid, tgt, sel_out, sel_tgt = Engine.forward(model, batch, volatile=False)

        # compute LM and selection losses
        test_loss += tgt.size(0) * crit(out.view(-1, N), tgt).data[0]
        test_select_loss += sel_crit(sel_out, sel_tgt).data[0]

    test_loss /= testset_stats['nonpadn']
    test_select_loss /= len(testset)

    print('testloss %.3f | testppl %.3f' % (test_loss, np.exp(test_loss)))
    print('testselectloss %.3f | testselectppl %.3f' % (test_select_loss, np.exp(test_select_loss)))

def create_param_space(params, n_runs):
    seed = np.random.randint(1000)
    param_space = []
    for i in range(n_runs):
        set_seed(seed + i)
        param_choice = {}
        for param, value in params.items():
            if isinstance(value, list):
                if len(value) == 2:
                    mode = 'choice'
                    param_choice[param] = sample_param_space(value, mode)
                else:
                    mode = value[-1]
                    param_choice[param] = sample_param_space(value[:-1], mode)
            else:
                param_choice[param] = value
        param_space.append(param_choice)
    return param_space

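# A hedged usage sketch for create_param_space: the parameter names, ranges and
# the 'log' sampling mode below are illustrative assumptions, not taken from the
# project. Only the structure follows from the function above: two-element lists
# are sampled in 'choice' mode, longer lists carry the sampling mode as their
# last element, and scalar values are passed through unchanged.
example_params = {
    'lr': [1e-4, 1e-1, 'log'],   # last element names the sampling mode (assumed)
    'batch_size': [32, 128],     # two-element list -> 'choice' mode
    'epochs': 20,                # scalar -> kept as-is in every run
}
example_space = create_param_space(example_params, n_runs=5)
for run_id, param_choice in enumerate(example_space):
    print(run_id, param_choice)
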
def main():
    parser = get_parser()
    args = parser.parse_args()

    if not args.model_name:
        args.model_name = args.model_path

    if args.doc_stride >= args.max_seq_length - args.max_query_length:
        logger.warning(
            "WARNING - You've set a doc stride which may be superior to the document length in some "
            "examples. This could result in errors when building features from the examples. Please reduce the doc "
            "stride or increase the maximum length to ensure the features are correctly built."
        )

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and not args.overwrite_output_dir
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Set device
    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.DEBUG if args.debug else logging.INFO
    )

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    config = GPT2Config.from_pretrained(
        args.config_name if args.config_name else args.model_path,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer = GPT2Tokenizer.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer.add_tokens(['question:', ':question'])
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.sep_token = tokenizer.eos_token
    tokenizer.encode = partial(tokenizer.encode, is_pretokenized=True, truncation=True)
    tokenizer.encode_plus = partial(tokenizer.encode_plus, is_pretokenized=True, truncation=True)
    model = GPT2LMHeadModel.from_pretrained(
        args.model_path,
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    model.resize_token_embeddings(len(tokenizer))
    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is set.
    # Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running `--fp16_opt_level="O2"` will
    # remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex
            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

    # Training
    train_dataset = load_and_cache_examples(args, tokenizer, 'quest_gen', evaluate=False, gpt=True)
    train_dataset = preprocess_dataset(train_dataset, tokenizer)
    dev_dataset = load_and_cache_examples(args, tokenizer, 'quest_gen', evaluate=True, gpt=True)
    dev_dataset = preprocess_dataset(dev_dataset, tokenizer)

    train(args, train_dataset, dev_dataset, model, tokenizer)
    logging.info('Finished training!')

    # Save a trained model, configuration and tokenizer using `save_pretrained()`.
    # They can then be reloaded using `from_pretrained()`.
    # Good practice: save your training arguments together with the trained model.
    logger.info("Saving final model checkpoint to %s", args.output_dir)
    model.save_pretrained(args.output_dir)
    tokenizer.save_pretrained(args.output_dir)
    torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

def train(args, train_dataset, dev_dataset, model, tokenizer):
    """ Train the model """
    tb_writer = SummaryWriter(os.path.join(args.output_dir, 'TB_writer'))

    if args.dynamic_batching:
        train_sampler = CustomBatchSampler(train_dataset, args.train_batch_size)
        train_dataloader = DataLoader(
            train_dataset, batch_sampler=train_sampler, num_workers=0,
            collate_fn=dynamic_padding_collate_fn
        )
    else:
        train_sampler = RandomSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                      batch_size=args.train_batch_size, num_workers=0)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(args.model_path, "optimizer.pt")) and os.path.isfile(
        os.path.join(args.model_path, "scheduler.pt")
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
            model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Batch size = %d", args.train_batch_size)
    logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 1
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_path):
        try:
            # set global_step to the global_step of the last saved checkpoint from the model path
            checkpoint_suffix = args.model_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info(" Continuing training from checkpoint, will skip to saved global_step")
            logger.info(" Continuing training from epoch %d", epochs_trained)
            logger.info(" Continuing training from global step %d", global_step)
            logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            logger.info(" Starting fine-tuning.")

    model.train()
    model.zero_grad()
    train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch")
    # Added here for reproducibility
    set_seed(args)
    loss_cum = None
    # torch.autograd.set_detect_anomaly(True)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", smoothing=0.05)
        for step, batch_cpu in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            batch = tuple(t.to(args.device) for t in batch_cpu)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "labels": batch[0]
            }

            outputs = model(**inputs)
            loss = outputs[0]

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                if loss_cum is None:
                    loss_cum = loss.detach()
                else:
                    loss_cum += loss.detach()
            else:
                loss.backward()
                if loss_cum is None:
                    loss_cum = loss.detach()
                else:
                    loss_cum += loss.detach()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                # Log train metrics
                if (not global_step % args.train_logging_steps) and args.train_logging_steps > 0:
                    tb_writer.add_scalar('train_loss', loss_cum.item() / args.train_logging_steps, global_step)
                    loss_cum = None

                # Log dev metrics
                if args.dev_logging_steps > 0 and global_step % args.dev_logging_steps == 0 and args.evaluate_during_training:
                    dev_loss = evaluate(args, dev_dataset, model)
                    tb_writer.add_scalar("dev_loss", dev_loss, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)

                # Save model checkpoint
                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    logger.info("Saving model checkpoint to %s", output_dir)
                    model.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    tb_writer.close()

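# The dynamic-batching branch above hands a dynamic_padding_collate_fn to the
# DataLoader, but its definition is not part of this snippet. The sketch below is
# a hypothetical stand-in, assuming each dataset item is a tuple of 1-D tensors
# (input_ids, attention_mask, token_type_ids); the project's real collate
# function may differ.
import torch
import torch.nn.functional as F


def dynamic_padding_collate_fn(batch, pad_token_id=0):
    """Pad every example to the longest sequence in the batch and stack the fields."""
    max_len = max(item[0].size(0) for item in batch)
    pad_values = (pad_token_id, 0, 0)  # pad ids with pad_token_id, masks/segments with 0
    columns = []
    for field_idx, pad_value in enumerate(pad_values):
        rows = [F.pad(item[field_idx], (0, max_len - item[field_idx].size(0)), value=pad_value)
                for item in batch]
        columns.append(torch.stack(rows))
    return tuple(columns)
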
def main():
    ''' set default hyperparams in default_hyperparams.py '''
    parser = argparse.ArgumentParser()

    # Required arguments
    parser.add_argument('-d', '--dataset', choices=wilds.supported_datasets, required=True)
    parser.add_argument('--algorithm', required=True, choices=supported.algorithms)
    parser.add_argument('--root_dir', required=True,
                        help='The directory where [dataset]/data can be found (or should be downloaded to, if it does not exist).')

    # Dataset
    parser.add_argument('--split_scheme',
                        help='Identifies how the train/val/test split is constructed. Choices are dataset-specific.')
    parser.add_argument('--dataset_kwargs', nargs='*', action=ParseKwargs, default={})
    parser.add_argument('--download', default=False, type=parse_bool, const=True, nargs='?',
                        help='If true, tries to download the dataset if it does not exist in root_dir.')
    parser.add_argument('--frac', type=float, default=1.0,
                        help='Convenience parameter that scales all dataset splits down to the specified fraction, for development purposes. Note that this also scales the test set down, so the reported numbers are not comparable with the full test set.')
    parser.add_argument('--version', default=None, type=str)

    # Loaders
    parser.add_argument('--loader_kwargs', nargs='*', action=ParseKwargs, default={})
    parser.add_argument('--train_loader', choices=['standard', 'group'])
    parser.add_argument('--uniform_over_groups', type=parse_bool, const=True, nargs='?')
    parser.add_argument('--distinct_groups', type=parse_bool, const=True, nargs='?')
    parser.add_argument('--n_groups_per_batch', type=int)
    parser.add_argument('--batch_size', type=int)
    parser.add_argument('--eval_loader', choices=['standard'], default='standard')

    # Model
    parser.add_argument('--model', choices=supported.models)
    parser.add_argument('--model_kwargs', nargs='*', action=ParseKwargs, default={},
                        help='keyword arguments for model initialization passed as key1=value1 key2=value2')

    # Transforms
    parser.add_argument('--train_transform', choices=supported.transforms)
    parser.add_argument('--eval_transform', choices=supported.transforms)
    parser.add_argument('--target_resolution', nargs='+', type=int,
                        help='The input resolution that images will be resized to before being passed into the model. For example, use --target_resolution 224 224 for a standard ResNet.')
    parser.add_argument('--resize_scale', type=float)
    parser.add_argument('--max_token_length', type=int)

    # Objective
    parser.add_argument('--loss_function', choices=supported.losses)

    # Algorithm
    parser.add_argument('--groupby_fields', nargs='+')
    parser.add_argument('--group_dro_step_size', type=float)
    parser.add_argument('--coral_penalty_weight', type=float)
    parser.add_argument('--dann_lambda', type=float)
    parser.add_argument('--dann_domain_layers', type=int, default=1)  # hidden layers
    parser.add_argument('--dann_label_layers', type=int, default=1)  # hidden layers
    parser.add_argument('--domain_loss_function', choices=supported.losses)
    parser.add_argument('--irm_lambda', type=float)
    parser.add_argument('--irm_penalty_anneal_iters', type=int)
    parser.add_argument('--algo_log_metric')

    # Model selection
    parser.add_argument('--val_metric')
    parser.add_argument('--val_metric_decreasing', type=parse_bool, const=True, nargs='?')

    # Optimization
    parser.add_argument('--n_epochs', type=int)
    parser.add_argument('--optimizer', choices=supported.optimizers)
    parser.add_argument('--lr', type=float)
    parser.add_argument('--weight_decay', type=float)
    parser.add_argument('--max_grad_norm', type=float)
    parser.add_argument('--optimizer_kwargs', nargs='*', action=ParseKwargs, default={})

    # Scheduler
    parser.add_argument('--scheduler', choices=supported.schedulers)
    parser.add_argument('--scheduler_kwargs', nargs='*', action=ParseKwargs, default={})
    parser.add_argument('--scheduler_metric_split', choices=['train', 'val'], default='val')
    parser.add_argument('--scheduler_metric_name')

    # Evaluation
    parser.add_argument('--process_outputs_function', choices=supported.process_outputs_functions)
    parser.add_argument('--evaluate_all_splits', type=parse_bool, const=True, nargs='?', default=True)
    parser.add_argument('--eval_splits', nargs='+', default=[])
    parser.add_argument('--eval_only', type=parse_bool, const=True, nargs='?', default=False)
    parser.add_argument('--eval_epoch', default=None, type=int,
                        help='If eval_only is set, then eval_epoch allows you to specify evaluating at a particular epoch. By default, it evaluates the best epoch by validation performance.')

    # Misc
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--log_dir', default='./logs')
    parser.add_argument('--log_every', default=50, type=int)
    parser.add_argument('--save_step', type=int)
    parser.add_argument('--save_best', type=parse_bool, const=True, nargs='?', default=True)
    parser.add_argument('--save_last', type=parse_bool, const=True, nargs='?', default=True)
    parser.add_argument('--save_pred', type=parse_bool, const=True, nargs='?', default=True)
    parser.add_argument('--no_group_logging', type=parse_bool, const=True, nargs='?')
    parser.add_argument('--use_wandb', type=parse_bool, const=True, nargs='?', default=False)
    parser.add_argument('--progress_bar', type=parse_bool, const=True, nargs='?', default=False)
    parser.add_argument('--resume', type=parse_bool, const=True, nargs='?', default=False)

    config = parser.parse_args()
    config = populate_defaults(config)

    # set device
    config.device = torch.device("cuda:" + str(config.device)) if torch.cuda.is_available() else torch.device("cpu")

    ## Initialize logs
    if os.path.exists(config.log_dir) and config.resume:
        resume = True
        mode = 'a'
    elif os.path.exists(config.log_dir) and config.eval_only:
        resume = False
        mode = 'a'
    else:
        resume = False
        mode = 'w'

    if not os.path.exists(config.log_dir):
        os.makedirs(config.log_dir)
    logger = Logger(os.path.join(config.log_dir, 'log.txt'), mode)

    # Record config
    log_config(config, logger)

    # Set random seed
    set_seed(config.seed)

    # Data
    full_dataset = wilds.get_dataset(dataset=config.dataset,
                                     version=config.version,
                                     root_dir=config.root_dir,
                                     download=config.download,
                                     split_scheme=config.split_scheme,
                                     **config.dataset_kwargs)

    # To implement data augmentation (i.e., have different transforms
    # at training time vs. test time), modify these two lines:
    train_transform = initialize_transform(transform_name=config.train_transform,
                                           config=config,
                                           dataset=full_dataset)
    eval_transform = initialize_transform(transform_name=config.eval_transform,
                                          config=config,
                                          dataset=full_dataset)

    train_grouper = CombinatorialGrouper(dataset=full_dataset,
                                         groupby_fields=config.groupby_fields)

    datasets = defaultdict(dict)
    for split in full_dataset.split_dict.keys():
        if split == 'train':
            transform = train_transform
            verbose = True
        elif split == 'val':
            transform = eval_transform
            verbose = True
        else:
            transform = eval_transform
            verbose = False

        # Get subset
        datasets[split]['dataset'] = full_dataset.get_subset(split,
                                                             frac=config.frac,
                                                             transform=transform)

        if split == 'train':
            datasets[split]['loader'] = get_train_loader(
                loader=config.train_loader,
                dataset=datasets[split]['dataset'],
                batch_size=config.batch_size,
                uniform_over_groups=config.uniform_over_groups,
                grouper=train_grouper,
                distinct_groups=config.distinct_groups,
                n_groups_per_batch=config.n_groups_per_batch,
                **config.loader_kwargs)
        else:
            datasets[split]['loader'] = get_eval_loader(
                loader=config.eval_loader,
                dataset=datasets[split]['dataset'],
                grouper=train_grouper,
                batch_size=config.batch_size,
                **config.loader_kwargs)

        # Set fields
        datasets[split]['split'] = split
        datasets[split]['name'] = full_dataset.split_names[split]
        datasets[split]['verbose'] = verbose

        # Loggers
        datasets[split]['eval_logger'] = BatchLogger(
            os.path.join(config.log_dir, f'{split}_eval.csv'),
            mode=mode,
            use_wandb=(config.use_wandb and verbose))
        datasets[split]['algo_logger'] = BatchLogger(
            os.path.join(config.log_dir, f'{split}_algo.csv'),
            mode=mode,
            use_wandb=(config.use_wandb and verbose))

        if config.use_wandb:
            initialize_wandb(config)

    # Logging dataset info
    # Show class breakdown if feasible
    if config.no_group_logging and full_dataset.is_classification and full_dataset.y_size == 1 and full_dataset.n_classes <= 10:
        log_grouper = CombinatorialGrouper(dataset=full_dataset, groupby_fields=['y'])
    elif config.no_group_logging:
        log_grouper = None
    else:
        log_grouper = train_grouper
    log_group_data(datasets, log_grouper, logger)

    ## Initialize algorithm
    algorithm = initialize_algorithm(config=config,
                                     datasets=datasets,
                                     train_grouper=train_grouper)

    model_prefix = get_model_prefix(datasets['train'], config)
    if not config.eval_only:
        ## Load saved results if resuming
        resume_success = False
        if resume:
            save_path = model_prefix + 'epoch:last_model.pth'
            if not os.path.exists(save_path):
                epochs = [
                    int(file.split('epoch:')[1].split('_')[0])
                    for file in os.listdir(config.log_dir) if file.endswith('.pth')
                ]
                if len(epochs) > 0:
                    latest_epoch = max(epochs)
                    save_path = model_prefix + f'epoch:{latest_epoch}_model.pth'
            try:
                prev_epoch, best_val_metric = load(algorithm, save_path)
                epoch_offset = prev_epoch + 1
                logger.write(f'Resuming from epoch {epoch_offset} with best val metric {best_val_metric}')
                resume_success = True
            except FileNotFoundError:
                pass

        if resume_success == False:
            epoch_offset = 0
            best_val_metric = None

        train(algorithm=algorithm,
              datasets=datasets,
              general_logger=logger,
              config=config,
              epoch_offset=epoch_offset,
              best_val_metric=best_val_metric)
    else:
        if config.eval_epoch is None:
            eval_model_path = model_prefix + 'epoch:best_model.pth'
        else:
            eval_model_path = model_prefix + f'epoch:{config.eval_epoch}_model.pth'
        best_epoch, best_val_metric = load(algorithm, eval_model_path)
        if config.eval_epoch is None:
            epoch = best_epoch
        else:
            epoch = config.eval_epoch
        evaluate(algorithm=algorithm,
                 datasets=datasets,
                 epoch=epoch,
                 general_logger=logger,
                 config=config)

    logger.close()
    for split in datasets:
        datasets[split]['eval_logger'].close()
        datasets[split]['algo_logger'].close()

import os
# import copy
import fire
import torch
import utils
import stats
import torch.nn.functional as F

utils.set_seed(2019)
logger = utils.setup_logger()
device = torch.device(f"cuda" if torch.cuda.is_available() else "cpu")
out_path = os.getcwd() + '/out/'


def main(algorithm, optimizer, dataset, num_classes=10,
         optim_params={'lr': 0.05, 'weight_decay': 5e-4, 'momentum': 0.9}):
    filename = algorithm + '_' + optimizer + '_' + dataset

    # prepare dataset
    logger.info("====== Evaluation ======")
    logger.info("Preparing dataset...{}".format(dataset))
    db = utils.Datasets(dataset)

    utils.plot_loss(trainer.validation_history["loss"], label="Validation loss")
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.title("Accuracy")
    utils.plot_loss(trainer.validation_history["accuracy"], label="Validation Accuracy")
    utils.plot_loss(trainer.train_history["accuracy"], label="Training Accuracy")
    print(trainer.train_history["accuracy"].popitem(last=True), " train acc")
    print(trainer.train_history["loss"].popitem(last=True), " train loss")
    plt.legend()
    plt.savefig(plot_path.joinpath(f"{name}_final_.png"))
    plt.show()


if __name__ == "__main__":
    # Set the random generator seed (parameters, shuffling etc).
    # You can try to change this and check if you still get the same result!
    utils.set_seed(0)
    epochs = 10
    batch_size = 64
    learning_rate = 5e-4  # 5e-4?
    early_stop_count = 10
    dataloaders = load_cifar10(batch_size)
    model = ConvModel1(image_channels=3, num_classes=10)
    trainer = Trainer(batch_size, learning_rate, early_stop_count, epochs, model, dataloaders)
    trainer.train()
    create_plots(trainer, "task2")

def main():
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('mode', choices=['train', 'validate', 'predict'])
    arg('run_root')
    arg('--batch-size', type=int, default=64)
    arg('--step', type=int, default=1)
    arg('--workers', type=int, default=2)
    arg('--lr', type=float, default=0.00001)
    arg('--patience', type=int, default=4)
    arg('--clean', action='store_true')
    arg('--n-epochs', type=int, default=5)
    arg('--limit', type=int)
    arg('--fold', type=int, default=0)
    arg('--multi-gpu', type=int, default=0)
    arg('--lr_layerdecay', type=float, default=0.95)
    args = parser.parse_args()

    set_seed()

    run_root = Path('../experiments/' + args.run_root)
    DATA_ROOT = Path('../byebyejuly/')
    folds = pd.read_pickle(DATA_ROOT / 'folds.pkl')
    train_fold = folds[folds['fold'] != args.fold]
    valid_fold = folds[folds['fold'] == args.fold]
    if args.limit:
        train_fold = train_fold[:args.limit]
        valid_fold = valid_fold[:args.limit]

    if args.mode == 'train':
        if run_root.exists() and args.clean:
            shutil.rmtree(run_root)
        run_root.mkdir(exist_ok=True, parents=True)
        (run_root / 'params.json').write_text(
            json.dumps(vars(args), indent=4, sort_keys=True))

        training_set = TrainDataset(train_fold, do_lower=True, shuffle=True)
        training_loader = DataLoader(training_set, collate_fn=collate_fn, shuffle=True,
                                     batch_size=args.batch_size, num_workers=args.workers)
        valid_set = TrainDataset(valid_fold)
        valid_loader = DataLoader(valid_set, batch_size=args.batch_size, shuffle=False,
                                  collate_fn=collate_fn, num_workers=args.workers)

        model = PairModel(BERT_PRETRAIN_PATH)
        model.cuda()

        # param_optimizer = list(model.named_parameters())
        # no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        # optimizer_grouped_parameters = [
        #     {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and p.requires_grad],
        #      'weight_decay': 0.01},
        #     {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and p.requires_grad],
        #      'weight_decay': 0.0}
        # ]
        NUM_LAYERS = 12
        optimizer_grouped_parameters = [
            {'params': model.bert.bert.embeddings.parameters(),
             'lr': args.lr * (args.lr_layerdecay ** NUM_LAYERS)},
            {'params': model.head.parameters(), 'lr': args.lr},
            {'params': model.bert.bert.pooler.parameters(), 'lr': args.lr},
        ]
        for layer in range(NUM_LAYERS):
            optimizer_grouped_parameters.append(
                {'params': model.bert.bert.encoder.layer.__getattr__(
                    '%d' % (NUM_LAYERS - 1 - layer)).parameters(),
                 'lr': args.lr * (args.lr_layerdecay ** layer)},
            )

        optimizer = BertAdam(optimizer_grouped_parameters, lr=args.lr, warmup=0.05,
                             t_total=len(training_loader) * args.n_epochs // args.step)

        model, optimizer = amp.initialize(model, optimizer, opt_level="O2", verbosity=0)
        optimizer.zero_grad()

        if args.multi_gpu == 1:
            model = nn.DataParallel(model)

        train(args, model, optimizer, None, train_loader=training_loader,
              valid_df=valid_fold, valid_loader=valid_loader, epoch_length=len(training_set))

    elif args.mode == 'validate':
        valid_fold = pd.read_table('../byebyejuly/test.txt', names=['a', 'b', 'label'])
        valid_set = TrainDataset(valid_fold)
        valid_loader = DataLoader(valid_set, batch_size=args.batch_size, shuffle=False,
                                  collate_fn=collate_fn, num_workers=args.workers)
        model = PairModel(BERT_PRETRAIN_PATH)
        load_model(model, run_root / ('best-model-%d.pt' % args.fold), multi2single=False)
        model.cuda()
        if args.multi_gpu == 1:
            model = nn.DataParallel(model)
        validation(model, valid_fold, valid_loader, args, False, progress=True)

                    help='resume from checkpoint')
parser.add_argument('--seed', default=0, type=int)
parser.add_argument('--mask_method', default="label-square", type=str)
parser.add_argument('--num_samples', default=1024, type=int)
parser.add_argument('--update_mask_epochs', default=500, type=int)
parser.add_argument('--save_file', default="default_accs.bin", type=str)
parser.add_argument('--max_epoch', default=200, type=int)
parser.add_argument('--pretrain_epoch', default=0, type=int)
parser.add_argument('--sparsity', default=0.005, type=float)
args = parser.parse_args()

sample_type, grad_type = args.mask_method.split("-")

set_seed(args)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
best_acc = 0  # best test accuracy
start_epoch = 0  # start from epoch 0 or last checkpoint epoch

# Data
print('==> Preparing data..')
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
transform_test = transforms.Compose([

parser.add_argument('--eps', type=float)
parser.add_argument('--nb_iter', type=int, default=40, help='number of attack iterations')
parser.add_argument('--resume', type=int, default=0)
parser.add_argument('--save_model_loc', type=str, default=None)

args = parser.parse_args()
print(args)

device = "cuda"

set_seed(0)

trainset, normalize, unnormalize = str2dataset(args.dataset, train=True)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size, shuffle=True, num_workers=2)

net = str2model(path=args.save_model_loc, dataset=args.dataset, pretrained=args.resume).eval().to(device)

if args.attack == "frank":
    attacker = FrankWolfe(predict=lambda x: net(normalize(x)),
                          loss_fn=nn.CrossEntropyLoss(reduction="sum"),
                          eps=args.eps,

from train import *
from test import *
from model import *
from tensorboard_logger import Logger
from thumos_features import *

if __name__ == "__main__":
    args = parse_args()
    if args.debug:
        pdb.set_trace()

    config = Config(args)
    worker_init_fn = None

    if config.seed >= 0:
        utils.set_seed(config.seed)
        worker_init_fn = np.random.seed(config.seed)

    utils.save_config(config, os.path.join(config.output_path, "config.txt"))

    net = Model(config.len_feature, config.num_classes, config.r_act, config.r_bkg)
    net = net.cuda()

    train_loader = data.DataLoader(
        ThumosFeature(data_path=config.data_path, mode='train',
                      modal=config.modal, feature_fps=config.feature_fps,
                      num_segments=config.num_segments, supervision='weak',

from systems import *
from integrator import Integrator
from utils import set_seed
from lkf import LKF
from kf import KF
from typing import Callable

import numpy as np
import pandas as pd
import pdb
import scipy.stats as stats
import matplotlib.pyplot as plt

set_seed(9001)

dt = 1e-3
T = 60.
z = TimeVarying(dt, 0.0, 1.0, f=1 / 20)
F_hat = lambda t: z.F(0)
eta = lambda t: F_hat(t) - z.F(t)
print(F_hat(0))

f1 = KF(z.x0, F_hat, z.H, z.Q, z.R, dt)
f2 = LKF(z.x0, F_hat, z.H, z.Q, z.R, dt, tau=0.25, eps=3e-2, gamma=0.9)

max_err = 2.
max_eta_err = 100
max_zz = 100.

def train(args, train_dataset, model, dev_dataset):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter(log_dir=args.tensorboard_dir)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(train_dataset))
    logger.info(" Num Epochs = %d", args.num_train_epochs)
    logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for epoch in train_iterator:
        set_seed(args, epoch + 10)  # Added here for reproducibility (even between python 2 and 3)
        # logger.info(" seed = %d", torch.initial_seed())
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'entity_a': batch[1],
                'entity_b': batch[2],
                'attention_mask': batch[3],
                'token_type_ids': batch[4] if args.model_type in ['bert', 'xlnet'] else None,  # XLM doesn't use segment_ids
                'labels': batch[5]
            }
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                scheduler.step()  # Update learning rate schedule
                optimizer.step()
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results, eval_loss = evaluate(args, model, dev_dataset)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    logger.info(" global step = %d", global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(model, 'module') else model
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if 0 < args.max_steps < global_step:
                epoch_iterator.close()
                break
        if 0 < args.max_steps < global_step:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step

def train(self):
    train_sampler = RandomSampler(self.train_dataset)
    train_dataloader = DataLoader(self.train_dataset, sampler=train_sampler, batch_size=self.args.batch_size)

    t_total = len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    # n is a parameter name, e.g. embeddings.word_embeddings.weight or encoder.layer.5.output.LayerNorm.bias.
    # The code below applies L2 weight decay to a parameter only if none of the no_decay substrings occur in
    # its name; otherwise its weight decay is set to 0.0, i.e. bias- and LayerNorm-related parameters are not decayed.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': self.args.weight_decay},
        {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon)
    # Schedule the learning rate to increase during warmup and decay afterwards
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=self.args.warmup_steps,
                                                num_training_steps=t_total)

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(self.train_dataset))
    logger.info(" Num Epochs = %d", self.args.num_train_epochs)
    logger.info(" Total train batch size = %d", self.args.batch_size)
    logger.info(" Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 0  # total number of optimization steps
    tr_loss = 0.0
    self.model.zero_grad()  # clear gradients

    train_iterator = trange(int(self.args.num_train_epochs), desc="Epoch")
    set_seed(self.args)

    for _ in train_iterator:  # one full pass over the dataset per epoch
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        # fetch one batch: the dataset is a tuple of five Tensors of length 4478, so a batch is a tuple of five Tensors of length 16
        for step, batch in enumerate(epoch_iterator):
            self.model.train()  # tell PyTorch we are training, not predicting
            batch = tuple(t.to(self.device) for t in batch)  # GPU or CPU
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'segment_ids': batch[2],
                'start_positions': batch[3],
                'end_positions': batch[4]
            }
            # calling the model runs forward() automatically; unlike an explicit forward() call, this also triggers registered hooks
            outputs = self.model(**inputs)
            loss = outputs[0]  # a whole batch is fed in, so the loss is the batch average

            if self.args.gradient_accumulation_steps > 1:
                # average the loss over the accumulation steps
                loss = loss / self.args.gradient_accumulation_steps

            loss.backward()

            tr_loss += loss.item()
            if (step + 1) % self.args.gradient_accumulation_steps == 0:
                # one accumulation step finished, update the parameters
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm)  # gradient clipping

                optimizer.step()   # end of one loss-accumulation cycle, update parameters
                scheduler.step()   # Update learning rate schedule
                self.model.zero_grad()  # clear gradients
                global_step += 1

                if self.args.save_steps > 0 and global_step % self.args.save_steps == 0:
                    # save the model every save_steps (e.g. 200) steps
                    self.save_model()
                    self.evaluate()

    return global_step, tr_loss / global_step

def main():
    parser = argparse.ArgumentParser()

    # Settings
    parser.add_argument('-d', '--dataset', choices=dataset_attributes.keys(), required=True)
    parser.add_argument('-s', '--shift_type', choices=shift_types, required=True)
    # Confounders
    parser.add_argument('-t', '--target_name')
    parser.add_argument('-c', '--confounder_names', nargs='+')
    # Resume?
    parser.add_argument('--resume', default=False, action='store_true')
    # Label shifts
    parser.add_argument('--minority_fraction', type=float)
    parser.add_argument('--imbalance_ratio', type=float)
    # Data
    parser.add_argument('--fraction', type=float, default=1.0)
    parser.add_argument('--root_dir', default=None)
    parser.add_argument('--subsample_to_minority', action='store_true', default=False)
    parser.add_argument('--reweight_groups', action='store_true', default=False)
    parser.add_argument('--augment_data', action='store_true', default=False)
    parser.add_argument('--val_fraction', type=float, default=0.1)
    # Objective
    parser.add_argument('--robust', default=False, action='store_true')
    parser.add_argument('--alpha', type=float, default=0.2)
    parser.add_argument('--generalization_adjustment', default="0.0")
    parser.add_argument('--automatic_adjustment', default=False, action='store_true')
    parser.add_argument('--robust_step_size', default=0.01, type=float)
    parser.add_argument('--use_normalized_loss', default=False, action='store_true')
    parser.add_argument('--btl', default=False, action='store_true')
    parser.add_argument('--hinge', default=False, action='store_true')
    # Model
    parser.add_argument('--model', choices=model_attributes.keys(), default='resnet50')
    parser.add_argument('--train_from_scratch', action='store_true', default=False)
    parser.add_argument('--resnet_width', type=int, default=None)
    # Optimization
    parser.add_argument('--n_epochs', type=int, default=4)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--scheduler', action='store_true', default=False)
    parser.add_argument('--weight_decay', type=float, default=5e-5)
    parser.add_argument('--gamma', type=float, default=0.1)
    parser.add_argument('--minimum_variational_weight', type=float, default=0)
    # Misc
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--show_progress', default=False, action='store_true')
    parser.add_argument('--log_dir', default='./logs')
    parser.add_argument('--log_every', default=50, type=int)
    parser.add_argument('--save_step', type=int, default=10)
    parser.add_argument('--save_best', action='store_true', default=False)
    parser.add_argument('--save_last', action='store_true', default=True)
    parser.add_argument('--student_width', type=int)
    parser.add_argument('--teacher_dir', type=str)
    parser.add_argument('--teacher_width', type=int)
    parser.add_argument('--gpu', type=str)
    parser.add_argument('--temp', type=str)

    args = parser.parse_args()

    gpu = args.gpu
    temp = args.temp
    check_args(args)
    teacher_dir = args.teacher_dir
    student_width = args.student_width
    teacher_width = args.teacher_width
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = gpu

    def DistillationLoss(temperature):
        cross_entropy = torch.nn.CrossEntropyLoss()

        def loss(student_logits, teacher_logits, target):
            last_dim = len(student_logits.shape) - 1
            p_t = nn.functional.softmax(teacher_logits / temperature, dim=last_dim)
            log_p_s = nn.functional.log_softmax(student_logits / temperature, dim=last_dim)
            return cross_entropy(student_logits, target) - (p_t * log_p_s).sum(dim=last_dim).mean() * temperature**2

        return loss

    # BERT-specific configs copied over from run_glue.py
    if args.model == 'bert':
        args.max_grad_norm = 1.0
        args.adam_epsilon = 1e-8
        args.warmup_steps = 0

    if os.path.exists(args.log_dir) and args.resume:
        resume = True
        mode = 'a'
    else:
        resume = False
        mode = 'w'

    ## Initialize logs
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    logger = Logger(os.path.join(args.log_dir, 'log.txt'), mode)
    # Record args
    log_args(args, logger)
    set_seed(args.seed)
    print("starting prep")

    # Data
    # Test data for label_shift_step is not implemented yet
    test_data = None
    test_loader = None
    if args.shift_type == 'confounder':
        train_data, val_data, test_data = prepare_data(args, train=True)
    elif args.shift_type == 'label_shift_step':
        train_data, val_data = prepare_data(args, train=True)
    print("done prep")

    loader_kwargs = {
        'batch_size': args.batch_size,
        'num_workers': 16,
        'pin_memory': True
    }

    train_loader = train_data.get_loader(train=True, reweight_groups=args.reweight_groups, **loader_kwargs)
    val_loader = val_data.get_loader(train=False, reweight_groups=None, **loader_kwargs)
    if test_data is not None:
        test_loader = test_data.get_loader(train=False, reweight_groups=None, **loader_kwargs)

    data = {}
    data['train_loader'] = train_loader
    data['val_loader'] = val_loader
    data['test_loader'] = test_loader
    data['train_data'] = train_data
    data['val_data'] = val_data
    data['test_data'] = test_data
    n_classes = train_data.n_classes

    log_data(data, logger)
    logger.flush()

    ## Define the objective
    if args.hinge:
        assert args.dataset in ['CelebA', 'CUB']  # Only supports binary

        def hinge_loss(yhat, y):
            # The torch loss takes in three arguments so we need to split yhat.
            # It also expects classes in {+1.0, -1.0} whereas by default we give them in {0, 1}.
            # Furthermore, if y = 1 it expects the first input to be higher instead of the second,
            # so we need to swap yhat[:, 0] and yhat[:, 1]...
            torch_loss = torch.nn.MarginRankingLoss(margin=1.0, reduction='none')
            y = (y.float() * 2.0) - 1.0
            return torch_loss(yhat[:, 1], yhat[:, 0], y)

        criterion = hinge_loss
    else:
        criterion = torch.nn.CrossEntropyLoss(reduction='none')

    if resume:
        df = pd.read_csv(os.path.join(args.log_dir, 'test.csv'))
        epoch_offset = df.loc[len(df) - 1, 'epoch'] + 1
        logger.write(f'starting from epoch {epoch_offset}')
    else:
        epoch_offset = 0

    train_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, 'train.csv'), train_data.n_groups, mode=mode)
    val_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, 'val.csv'), train_data.n_groups, mode=mode)
    test_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, 'test.csv'), train_data.n_groups, mode=mode)
    strain_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, 'strain.csv'), train_data.n_groups, mode=mode)
    sval_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, 'sval.csv'), train_data.n_groups, mode=mode)
    stest_csv_logger = CSVBatchLogger(os.path.join(args.log_dir, 'stest.csv'), train_data.n_groups, mode=mode)

    teacher = resnet10vw(teacher_width, num_classes=n_classes)
    teacher_old = torch.load(teacher_dir + "/10_model.pth")
    for k, m in teacher_old.named_modules():
        m._non_persistent_buffers_set = set()  # pytorch 1.6.0 compatibility
    teacher.load_state_dict(teacher_old.state_dict())
    teacher = teacher.to('cuda')

    # def DistillationLoss(temperature):
    #     cross_entropy = torch.nn.CrossEntropyLoss()
    #
    #     def loss(student_logits, teacher_logits, target):
    #         last_dim = len(student_logits.shape) - 1
    #
    #         p_t = nn.functional.softmax(teacher_logits/temperature, dim=last_dim)
    #         log_p_s = nn.functional.log_softmax(student_logits/temperature, dim=last_dim)
    #
    #         return cross_entropy(student_logits, target) - (p_t * log_p_s).sum(dim=last_dim).mean()
    #
    #     return loss

    distill_criterion = DistillationLoss(float(temp))

    student = resnet10vw(int(student_width), num_classes=n_classes).to('cuda')
    # student.to(device)

    train(teacher, student, criterion, distill_criterion, data, logger,
          train_csv_logger, val_csv_logger, test_csv_logger,
          strain_csv_logger, sval_csv_logger, test_csv_logger,
          args, epoch_offset=epoch_offset)

    train_csv_logger.close()
    val_csv_logger.close()
    test_csv_logger.close()
    strain_csv_logger.close()
    sval_csv_logger.close()
    stest_csv_logger.close()

                    default=1.,
                    help="Learning rate")
parser.add_argument("--train", type=bool, default=False, help="train or test")
parser.add_argument("--test", type=bool, default=False, help="train or test")
args = parser.parse_args()

# Set the seed
set_seed(args.seed)

# Set the hyperparameters
LR = args.lr
WD = args.wd
EPOCHS = args.epochs
OFFSET = args.offset

# pdb.set_trace()
dataset_train = get_data_loader_list('./data/CUB_2011_train.txt', 1, train=True)
dataset_test = get_data_loader_list('./data/CUB_2011_test.txt', 1,

def train(self):
    train_sampler = RandomSampler(self.train_dataset)
    train_dataloader = DataLoader(self.train_dataset, sampler=train_sampler, batch_size=self.args.batch_size)

    if self.args.max_steps > 0:
        t_total = self.args.max_steps
        self.args.num_train_epochs = self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': self.args.weight_decay},
        {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.args.learning_rate, eps=self.args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=self.args.warmup_steps,
                                                num_training_steps=t_total)

    # Train!
    logger.info("***** Running training *****")
    logger.info(" Num examples = %d", len(self.train_dataset))
    logger.info(" Num Epochs = %d", self.args.num_train_epochs)
    logger.info(" Total train batch size = %d", self.args.batch_size)
    logger.info(" Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)
    logger.info(" Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss = 0.0
    self.model.zero_grad()

    train_iterator = trange(int(self.args.num_train_epochs), desc="Epoch")
    set_seed(self.args)

    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            self.model.train()
            batch = tuple(t.to(self.device) for t in batch)  # GPU or CPU
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'intent_label_ids': batch[3],
                      'slot_labels_ids': batch[4]}
            if self.args.model_type != 'distilbert':
                inputs['token_type_ids'] = batch[2]
            outputs = self.model(**inputs)
            loss = outputs[0]

            if self.args.gradient_accumulation_steps > 1:
                loss = loss / self.args.gradient_accumulation_steps

            loss.backward()

            tr_loss += loss.item()
            if (step + 1) % self.args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                self.model.zero_grad()
                global_step += 1

                if self.args.logging_steps > 0 and global_step % self.args.logging_steps == 0:
                    self.evaluate("dev")

                if self.args.save_steps > 0 and global_step % self.args.save_steps == 0:
                    self.save_model()

            if 0 < self.args.max_steps < global_step:
                epoch_iterator.close()
                break

        if 0 < self.args.max_steps < global_step:
            train_iterator.close()
            break

    return global_step, tr_loss / global_step

import torch.optim as optim
from progressbar import ProgressBar
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from torch.distributions.multivariate_normal import MultivariateNormal

from config import gen_args
from data import PhysicsDataset, load_data
from data_d4rl import D4RLDataset
from models_kp import KeyPointNet
from models_dy import DynaNetGNN, HLoss
from utils import rand_int, count_parameters, Tee, AverageMeter, get_lr, to_np, set_seed

args = gen_args()
set_seed(args.random_seed)
torch.manual_seed(args.random_seed)
np.random.seed(args.random_seed)

os.system('mkdir -p ' + args.outf_kp)
os.system('mkdir -p ' + args.dataf)

if args.stage == 'dy':
    os.system('mkdir -p ' + args.outf_dy)
    tee = Tee(os.path.join(args.outf_dy, 'train.log'), 'w')
else:
    raise AssertionError("Unsupported env %s" % args.stage)

print(args)

def main():
    parser = argparse.ArgumentParser(description='selfplaying script')
    parser.add_argument('--alice_model_file', type=str, help='Alice model file')
    parser.add_argument('--bob_model_file', type=str, help='Bob model file')
    parser.add_argument('--context_file', type=str, help='context file')
    parser.add_argument('--temperature', type=float, default=1.0, help='temperature')
    parser.add_argument('--verbose', action='store_true', default=False, help='print out conversations')
    parser.add_argument('--seed', type=int, default=1, help='random seed')
    parser.add_argument('--score_threshold', type=int, default=6,
                        help='successful dialog should have more than score_threshold in score')
    parser.add_argument('--max_turns', type=int, default=20, help='maximum number of turns in a dialog')
    parser.add_argument('--log_file', type=str, default='', help='log successful dialogs to file for training')
    parser.add_argument('--smart_alice', action='store_true', default=False, help='make Alice smart again')
    parser.add_argument('--fast_rollout', action='store_true', default=False, help='to use faster rollouts')
    parser.add_argument('--rollout_bsz', type=int, default=100, help='rollout batch size')
    parser.add_argument('--rollout_count_threshold', type=int, default=3, help='rollout count threshold')
    parser.add_argument('--smart_bob', action='store_true', default=False, help='make Bob smart again')
    parser.add_argument('--ref_text', type=str, help='file with the reference text')
    parser.add_argument('--domain', type=str, default='object_division', help='domain for the dialogue')
    parser.add_argument('--fixed_bob', action='store_true', default=False, help='make Bob smart again')
    args = parser.parse_args()

    utils.set_seed(args.seed)

    alice_model = utils.load_model(args.alice_model_file)
    alice_ty = get_agent_type(alice_model, args.smart_alice, args.fast_rollout)
    alice = alice_ty(alice_model, args, name='Alice')

    bob_model = utils.load_model(args.bob_model_file)
    bob_ty = get_agent_type(bob_model, args.smart_bob, args.fast_rollout)
    bob = bob_ty(bob_model, args, name='Bob')

    dialog = Dialog([alice, bob], args)
    logger = DialogLogger(verbose=args.verbose, log_file=args.log_file)
    ctx_gen = ContextGenerator(args.context_file)

    selfplay = SelfPlay(dialog, ctx_gen, args, logger)
    selfplay.run()

def run():
    parser = ArgumentParser()
    parser.add_argument("--run_name", type=str, default='run1',
                        help="The name of the run (subdirectory in ./runs)")
    parser.add_argument("--model", type=str, default="openai-gpt",
                        help="Model type (openai-gpt or gpt2)",
                        choices=['openai-gpt', 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--max_history", type=int, default=2,
                        help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=40, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature", type=float, default=1.0, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0,
                        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.8,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    # set seed
    set_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    model_path = os.path.join('runs', args.run_name)
    tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' \
        else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(model_path)
    model = model_class.from_pretrained(model_path)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    history = []
    while True:
        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")
        history.append(tokenizer.encode(raw_text))
        with torch.no_grad():
            out_ids = sample_sequence(history, tokenizer, model, args)
        history.append(out_ids)
        history = history[-(2 * args.max_history + 1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print(out_text)
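# sample_sequence(history, tokenizer, model, args) above consumes args.top_k / args.top_p, so it
# presumably filters the next-token logits before sampling. That helper is not shown here; the
# sketch below is one common way to implement the filtering for a 1-D logits tensor (the name
# `top_filtering` is an assumption, not taken from this excerpt):
import torch
import torch.nn.functional as F


def top_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('inf')):
    if top_k > 0:
        # Drop every token whose logit is below the k-th largest logit.
        threshold = torch.topk(logits, top_k)[0][-1]
        logits[logits < threshold] = filter_value
    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        # Drop tokens once the cumulative probability exceeds top_p, shifting the mask
        # right so that the most probable token is always kept.
        sorted_indices_to_remove = cumulative_probs > top_p
        sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].clone()
        sorted_indices_to_remove[0] = False
        logits[sorted_indices[sorted_indices_to_remove]] = filter_value
    return logits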
import utils import torch import time import Procedure import numpy as np from parse import args import model from pprint import pprint import dataloader from parse import para_dict import torch.optim as optim pprint(vars(args)) utils.set_seed(args.seed) # dataset dataset = dataloader.Loader(path=args.datadir + args.dataset) # model n_users = para_dict['user_num'] m_items = para_dict['item_num'] Recmodel = model.LightGCN(n_users, m_items).to(args.device) weight_file = utils.getFileName() print(f"model will be save in {weight_file}") # loss opt = optim.Adam(Recmodel.parameters(), lr=args.lr) # result best_val = {'recall': np.array([0.0]),
def main(rank, dev_id, args): set_seed() # Remove the line below will result in problems for multiprocess if args['num_devices'] > 1: torch.set_num_threads(1) if dev_id == -1: args['device'] = torch.device('cpu') else: args['device'] = torch.device('cuda:{}'.format(dev_id)) # Set current device torch.cuda.set_device(args['device']) train_set, val_set = load_dataset(args) get_center_subset(train_set, rank, args['num_devices']) train_loader = DataLoader(train_set, batch_size=args['batch_size'], collate_fn=collate_center, shuffle=True) val_loader = DataLoader(val_set, batch_size=args['batch_size'], collate_fn=collate_center, shuffle=False) model = WLNReactionCenter(node_in_feats=args['node_in_feats'], edge_in_feats=args['edge_in_feats'], node_pair_in_feats=args['node_pair_in_feats'], node_out_feats=args['node_out_feats'], n_layers=args['n_layers'], n_tasks=args['n_tasks']).to(args['device']) model.train() if rank == 0: print('# trainable parameters in the model: ', count_parameters(model)) criterion = BCEWithLogitsLoss(reduction='sum') optimizer = Adam(model.parameters(), lr=args['lr']) if args['num_devices'] <= 1: from utils import Optimizer optimizer = Optimizer(model, args['lr'], optimizer, max_grad_norm=args['max_norm']) else: from utils import MultiProcessOptimizer optimizer = MultiProcessOptimizer(args['num_devices'], model, args['lr'], optimizer, max_grad_norm=args['max_norm']) total_iter = 0 rank_iter = 0 grad_norm_sum = 0 loss_sum = 0 dur = [] for epoch in range(args['num_epochs']): t0 = time.time() for batch_id, batch_data in enumerate(train_loader): total_iter += args['num_devices'] rank_iter += 1 batch_reactions, batch_graph_edits, batch_mol_graphs, \ batch_complete_graphs, batch_atom_pair_labels = batch_data labels = batch_atom_pair_labels.to(args['device']) pred, biased_pred = reaction_center_prediction( args['device'], model, batch_mol_graphs, batch_complete_graphs) loss = criterion(pred, labels) / len(batch_reactions) loss_sum += loss.cpu().detach().data.item() grad_norm_sum += optimizer.backward_and_step(loss) if rank_iter % args['print_every'] == 0 and rank == 0: progress = 'Epoch {:d}/{:d}, iter {:d}/{:d} | ' \ 'loss {:.4f} | grad norm {:.4f}'.format( epoch + 1, args['num_epochs'], batch_id + 1, len(train_loader), loss_sum / args['print_every'], grad_norm_sum / args['print_every']) print(progress) grad_norm_sum = 0 loss_sum = 0 if total_iter % args['decay_every'] == 0: optimizer.decay_lr(args['lr_decay_factor']) if total_iter % args['decay_every'] == 0 and rank == 0: if epoch >= 1: dur.append(time.time() - t0) print('Training time per {:d} iterations: {:.4f}'.format( rank_iter, np.mean(dur))) total_samples = total_iter * args['batch_size'] prediction_summary = 'total samples {:d}, (epoch {:d}/{:d}, iter {:d}/{:d}) '.format( total_samples, epoch + 1, args['num_epochs'], batch_id + 1, len(train_loader)) + \ reaction_center_final_eval(args, args['top_ks_val'], model, val_loader, easy=True) print(prediction_summary) with open(args['result_path'] + '/val_eval.txt', 'a') as f: f.write(prediction_summary) torch.save({'model_state_dict': model.state_dict()}, args['result_path'] + '/model_{:d}.pkl'.format(total_samples)) t0 = time.time() model.train() synchronize(args['num_devices'])
def train(args):
    # Load the data
    trainset = IMDBDataset(is_training=True)
    testset = IMDBDataset(is_training=False)
    # Wrap the datasets as MapDataset
    train_ds = MapDataset(trainset, label_list=[0, 1])
    test_ds = MapDataset(testset, label_list=[0, 1])

    # Build the XLNet tokenizer
    tokenizer = XLNetTokenizer.from_pretrained(args.model_name_or_path)
    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         label_list=train_ds.label_list,
                         max_seq_length=args.max_seq_length)

    # Build train_data_loader and dev_data_loader
    train_ds = train_ds.map(trans_func, lazy=True)
    train_batch_sampler = paddle.io.DistributedBatchSampler(
        train_ds, batch_size=args.batch_size, shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, pad_right=False),  # input
        Pad(axis=0, pad_val=tokenizer.pad_token_type_id, pad_right=False),  # token_type
        Pad(axis=0, pad_val=0, pad_right=False),  # attention_mask
        Stack(dtype="int64" if train_ds.label_list else "float32"),  # label
    ): fn(samples)
    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)

    dev_ds = MapDataset(testset)
    dev_ds = dev_ds.map(trans_func, lazy=True)
    dev_batch_sampler = paddle.io.BatchSampler(dev_ds,
                                               batch_size=args.batch_size,
                                               shuffle=False)
    dev_data_loader = DataLoader(dataset=dev_ds,
                                 batch_sampler=dev_batch_sampler,
                                 collate_fn=batchify_fn,
                                 num_workers=0,
                                 return_list=True)

    # Training configuration
    # Fix the random seed
    set_seed(args)
    # Set the runtime device
    use_gpu = True if paddle.get_device().startswith("gpu") else False
    if use_gpu:
        paddle.set_device('gpu:0')

    num_classes = len(train_ds.label_list)
    model = XLNetForSequenceClassification.from_pretrained(
        args.model_name_or_path, num_classes=num_classes)
    # paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
        model = paddle.DataParallel(model)

    # Set up the lr_scheduler
    if args.max_steps > 0:
        num_training_steps = args.max_steps
        num_train_epochs = ceil(num_training_steps / len(train_data_loader))
    else:
        num_training_steps = len(train_data_loader) * args.num_train_epochs
        num_train_epochs = args.num_train_epochs

    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, warmup)

    # Set up the optimizer
    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=args.max_grad_norm)
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "layer_norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        grad_clip=clip,
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params)

    # Model training
    metric = Accuracy()
    # Define the loss function
    loss_fct = paddle.nn.loss.CrossEntropyLoss() if train_ds.label_list else paddle.nn.loss.MSELoss()

    global_step = 0
    tic_train = time.time()
    model.train()
    for epoch in range(num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, token_type_ids, attention_mask, labels = batch
            logits = model(input_ids, token_type_ids, attention_mask)
            loss = loss_fct(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_step % args.logging_steps == 0:
                print(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss, optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                tic_eval = time.time()
                evaluate(model, loss_fct, metric, dev_data_loader)
                print("eval done total : %s s" % (time.time() - tic_eval))
                if (not paddle.distributed.get_world_size() > 1
                        ) or paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(
                        args.output_dir,
                        "%s_ft_model_%d" % (args.task_name, global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Need better way to get inner model of DataParallel
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    if global_step == num_training_steps:
                        exit(0)
                tic_train += time.time() - tic_eval
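# evaluate(model, loss_fct, metric, dev_data_loader) is called above but not defined in this
# excerpt. A plausible sketch using the same paddle Accuracy metric; the exact body and the
# printed format are assumptions:
import numpy as np
import paddle


def evaluate(model, loss_fct, metric, data_loader):
    model.eval()
    metric.reset()
    losses = []
    with paddle.no_grad():
        for input_ids, token_type_ids, attention_mask, labels in data_loader:
            logits = model(input_ids, token_type_ids, attention_mask)
            losses.append(float(loss_fct(logits, labels)))
            # Accuracy.compute returns per-batch correctness consumed by update/accumulate.
            metric.update(metric.compute(logits, labels))
    print("eval loss: %.5f, accuracy: %.5f" % (np.mean(losses), metric.accumulate()))
    model.train()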
utils.set_path(('..', '../gibbs')) from Demo.Data.HMLN.GeneratorRobotMapping import generate_rel_graph, load_data from RelationalGraph import * from MLNPotential import * from Potential import QuadraticPotential, TablePotential, HybridQuadraticPotential from EPBPLogVersion import EPBP from OneShot import OneShot, LiftedOneShot from NPVI import NPVI, LiftedNPVI from CompressedGraphSorted import CompressedGraphSorted import numpy as np import time from copy import copy seed = 0 utils.set_seed(seed) from mixture_beliefs import joint_map from utils import eval_joint_assignment_energy # from hybrid_gaussian_mrf import HybridGaussianSampler # from hybrid_gaussian_mrf import convert_to_bn, block_gibbs_sample, get_crv_marg, get_drv_marg, \ # get_rv_marg_map_from_bn_params # import sampling_utils from utils import kl_continuous_logpdf import argparse parser = argparse.ArgumentParser() # parser.add_argument('algo', type=str) # any of OSI, LOSI, NPVI, LNPVI parser.add_argument('K', type=int) parser.add_argument('-n', '--num_tests', type=int, default=5)
        the dataset is structured.''')
    args = parser.parse_args()

    # Fetch parameters
    parameters = read_yaml(args.yaml_file)
    check_folder(parameters['output_dir'])
    save_yaml(parameters, os.path.join(parameters['output_dir'], 'config.yml'))
    logging.basicConfig(filename=os.path.join(parameters['output_dir'], parameters['log_file']),
                        filemode='w+',
                        level=logging.INFO)
    logging.info("Parameters fetched.")

    logging.info("Setting seed for reproducibility...")
    set_seed(parameters['seed'])
    logging.info("\tDone.")

    logging.info("Set and retrieve the device on which to run...")
    device = get_device()
    task = parameters['task'].lower()
    logging.info("\tDone.")

    logging.info("Instantiating dataset and data processor...")
    if task in ['language_modeling']:
        data = LMDataset(task, parameters['dataset_name'].lower(), dataset_dir=parameters['dataset_dir'])
        processor = LMProcessor()
    logging.info("\tDone.")
from __future__ import print_function import warnings import os import torch import numpy as np from time import time from termcolor import colored from parameter import parse_arguments, net_args_are_same from architectures import get_net import utils as u from data import extract_patches warnings.filterwarnings("ignore") u.set_seed() class Training: def __init__(self, args, outpath, dtype=torch.cuda.FloatTensor): self.args = args self.dtype = dtype self.outpath = outpath if args.loss == 'mse': self.loss_fn = torch.nn.MSELoss().type(self.dtype) else: self.loss_fn = torch.nn.L1Loss().type(self.dtype) self.loss_reg_fn = torch.nn.MSELoss().type(self.dtype) self.elapsed = None self.iiter = 0 self.iter_to_be_saved = list(range(0, self.args.epochs, int(self.args.save_every))) \
CustomArgs(['--percent', '--percent'], type=float, target=('trainer', 'percent')), CustomArgs(['--conv', '--conv_layer'], type=str, target=('arch', 'args', 'conv_layer_type')), CustomArgs(['--norm', '--norm_layer'], type=str, target=('arch', 'args', 'norm_layer_type')), CustomArgs(['--subset_percent', '--subset_percent'], type=float, target=('trainer', 'subset_percent')), CustomArgs(['--asym', '--asym'], type=bool, target=('trainer', 'asym')), CustomArgs(['--sym', '--sym'], type=bool, target=('trainer', 'sym')), CustomArgs(['--name', '--exp_name'], type=str, target=('name', )), CustomArgs(['--key', '--comet_key'], type=str, target=('comet', 'api')), CustomArgs(['--offline', '--comet_offline'], type=str, target=('comet', 'offline')), CustomArgs(['--seed', '--seed'], type=int, target=('seed', )), CustomArgs(['--wd', '--weight_decay'], type=float, target=('optimizer', 'args', 'weight_decay')) ] config = ConfigParser.get_instance(args, options) set_seed(manualSeed=config['seed']) main(config)
def train(args): paddle.set_device(args.device) world_size = dist.get_world_size() if world_size > 1: dist.init_parallel_env() set_seed(args.seed) model = UnifiedTransformerLMHeadModel.from_pretrained( args.model_name_or_path) tokenizer = UnifiedTransformerTokenizer.from_pretrained( args.model_name_or_path) if world_size > 1: model = paddle.DataParallel(model) train_ds, dev_ds = load_dataset('duconv', splits=('train', 'dev')) train_ds, train_data_loader = create_data_loader(train_ds, tokenizer, args, 'train') dev_ds, dev_data_loader = create_data_loader(dev_ds, tokenizer, args, 'dev') lr_scheduler = NoamDecay(1 / (args.warmup_steps * (args.lr**2)), args.warmup_steps) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded. decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = AdamW(learning_rate=lr_scheduler, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params, grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm)) step = 0 total_time = 0.0 best_ppl = 1e9 for epoch in range(args.epochs): print('\nEpoch %d/%d' % (epoch + 1, args.epochs)) batch_start_time = time.time() for inputs in train_data_loader: step += 1 labels = inputs[-1] logits = model(*inputs[:-1]) loss = F.cross_entropy(logits, labels) loss.backward() optimizer.step() lr_scheduler.step() optimizer.clear_grad() total_time += (time.time() - batch_start_time) if step % args.logging_steps == 0: ppl = paddle.exp(loss) print( 'step %d - loss: %.4f - ppl: %.4f - lr: %.7f - %.3fs/step' % (step, loss, ppl, optimizer.get_lr(), total_time / args.logging_steps)) total_time = 0.0 if step % args.save_steps == 0: ppl = evaluation(model, dev_data_loader) if dist.get_rank() == 0: save_ckpt(model, tokenizer, args.save_dir, step) if ppl < best_ppl: best_ppl = ppl save_ckpt(model, tokenizer, args.save_dir, 'best') print('Saved step {} as best model.\n'.format(step)) batch_start_time = time.time() print('\nTraining completed.')
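# The NoamDecay construction above passes d_model = 1 / (warmup_steps * lr**2). Assuming
# paddle's Noam schedule lr(step) = d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5),
# that choice makes the learning rate peak at exactly args.lr once warmup finishes.
# A quick numeric check (the concrete lr/warmup values below are illustrative only):
def noam_lr(d_model, warmup_steps, step):
    # Noam schedule: linear warmup followed by inverse square-root decay.
    return d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)


lr, warmup = 1e-4, 4000
d_model = 1 / (warmup * lr ** 2)
assert abs(noam_lr(d_model, warmup, warmup) - lr) < 1e-12  # peaks at lr after warmup steps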
    t = perf_counter()
    for epoch in range(epochs):
        optimizer.step(closure)  # LBFGS-specific: step() takes a closure
    train_time = perf_counter() - t
    return model, train_time


def test_regression(model, test_features, test_labels):
    with torch.no_grad():
        model.eval()
        return f1(model(test_features), test_labels)


# Fix the random seed for reproducible results
set_seed(args.seed, args.cuda)

# Full adjacency matrix, features, labels, and train/val/test index sets
adj, features, labels, idx_train, idx_val, idx_test = \
    load_reddit_data(normalization=args.normalization, cuda=args.cuda)
print("Finished data loading.")

if args.model == 'SGC':
    model = SGC(features.size(1), labels.max().item() + 1)
if args.cuda:
    model.cuda()

# precompute
processed_features, precompute_time = sgc_precompute(features, adj, args.degree)

# train
train_features = processed_features[idx_train]
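# sgc_precompute(features, adj, args.degree) is used above but defined elsewhere. A sketch of
# the usual SGC precomputation, assuming `adj` is an already-normalized sparse adjacency
# tensor; the timing return value mirrors how `precompute_time` is consumed above:
from time import perf_counter

import torch


def sgc_precompute(features, adj, degree):
    # Propagate features `degree` hops: X <- A_hat @ X, repeated.
    t = perf_counter()
    for _ in range(degree):
        features = torch.spmm(adj, features)
    return features, perf_counter() - t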
train1 = [(x[0] + ' ' + x[1], x[2]) for x in train_data] train2 = [(x[1] + ' ' + x[0], x[2]) for x in train_data] train_data = train1 + train2 train_dataset = BuildDataSet(train_data) train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset) train_load = DataLoader(dataset=train_dataset, batch_size=config.batch_size, shuffle=False, collate_fn=collate_fn, sampler=train_sampler) for model_name in config.model_name: if config.local_rank in [0, -1]: msg = 'model_name:{},train_nums:{},train_iter:{},batch_size:{}' print( msg.format(model_name, len(train_data), len(train_load), config.batch_size)) train_process(config, train_load, train_sampler, model_name) torch.distributed.barrier() if __name__ == '__main__': config = roBerta_Config() config.local_rank = local_rank config.device = device config.nprocs = torch.cuda.device_count() set_seed(config) train(config)
                    help='Multiprocessing number.')
parser.add_argument('--Ks', nargs='?', default='[1,5,10]',
                    help='Output sizes of every layer')
parser.add_argument('--skip', type=int, default=0,
                    help='Skip epochs.')
parser.add_argument('--seed', type=int, default=42,
                    help='Random Seed.')
args, _ = parser.parse_known_args()
args.layers = eval(args.layers)
print('#' * 70)

if not args.if_stack:
    args.if_raw = True
if args.if_output:
    print('\n'.join([(str(_) + ':' + str(vars(args)[_])) for _ in vars(args).keys()]))

args.cuda = not args.no_cuda and torch.cuda.is_available()
utils.set_seed(args.seed, args.cuda)
args.device = torch.device("cuda:0" if args.cuda else "cpu")
print(args.device)

if args.dataset == 'wechat':
    args.out_epoch = 1
    args.loss = 'bpr'

ndcg.init(args)

# In[4]:

para_dict = pickle.load(open(args.datadir + args.dataset + '/warm_dict.pkl', 'rb'))
uuid_code = str(uuid.uuid4())[:4]
root_path = os.getcwd() + '/'
save_path = root_path + 'model_save/'
import os import shutil import click import pandas as pd from deepsense import neptune from sklearn.metrics import roc_auc_score import pipeline_config as cfg from pipelines import PIPELINES from utils import init_logger, read_params, create_submission, set_seed, save_evaluation_predictions, \ read_csv_time_chunks, cut_data_in_time_chunks, data_hash_channel_send, get_submission_hours_index set_seed(1234) logger = init_logger() ctx = neptune.Context() params = read_params(ctx) @click.group() def action(): pass @action.command() def prepare_data(): logger.info('chunking train') train = pd.read_csv(params.raw_train_filepath) cut_data_in_time_chunks(train, timestamp_column='click_time', chunks_dir=params.train_chunks_dir,
default=None, help='Checkpoint location.') parser.add_argument('-save', metavar='save', type=utils.str2bool, help='Boolean', default=False) parser.add_argument( '-conf', metavar='config', default="./conf/ner/rnn.json", help='model configuration. JSON files defined in ./configs/') parser.add_argument('-ckpt', metavar='ckpt', help='Checkpoint location') args = parser.parse_args() # Set Seed for reproducibility utils.set_seed() if args.load is None: # Load Config file conf = json.load(open(args.conf, "r")) # Load Dataset object dset = Dataset(batch_size=conf["train"]["batch_size"]) # Main training loop Main(dset, conf, save=args.save) else: # Load Config file # Load Model pass
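# The '-save' flag above is parsed with utils.str2bool, which is not included in this excerpt.
# A common pattern for such a converter (the accepted spellings are an assumption):
import argparse


def str2bool(v):
    # Interpret common true/false spellings; raise so argparse reports anything else.
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')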