def train_translation_model(data_dir, arch, extra_flags=None):
    train_parser = options.get_training_parser()
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            '--task', 'translation',
            data_dir,
            '--save-dir', data_dir,
            '--arch', arch,
            '--optimizer', 'nag',
            '--lr', '0.05',
            '--max-tokens', '500',
            '--max-epoch', '1',
            '--no-progress-bar',
            '--distributed-world-size', '1',
            '--source-lang', 'in',
            '--target-lang', 'out',
        ] + (extra_flags or []),
    )
    train.main(train_args)
def train_language_model(data_dir, arch):
    train_parser = options.get_training_parser()
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            '--task', 'language_modeling',
            data_dir,
            '--arch', arch,
            '--optimizer', 'nag',
            '--lr', '1.0',
            '--criterion', 'adaptive_loss',
            '--adaptive-softmax-cutoff', '5,10,15',
            '--decoder-layers', '[(850, 3)] * 2 + [(1024,4)]',
            '--decoder-embed-dim', '280',
            '--max-tokens', '500',
            '--tokens-per-sample', '500',
            '--save-dir', data_dir,
            '--max-epoch', '1',
            '--no-progress-bar',
            '--distributed-world-size', '1',
        ],
    )
    train.main(train_args)
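# Usage sketch for the two helpers above (not part of the original code): both
# expect a directory that already contains fairseq-preprocessed data, and the
# path and architecture names below are illustrative assumptions only.
def _example_smoke_run():
    data_dir = "/tmp/fairseq_test_data"  # hypothetical preprocessed data directory
    # run one quick epoch of translation training, then one of language modeling
    train_translation_model(data_dir, "fconv_iwslt_de_en")
    train_language_model(data_dir, "fconv_lm")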
def load_dataset_splits(task, splits):
    for split in splits:
        if split == 'train':
            task.load_dataset(split, combine=True)
        else:
            for k in itertools.count():
                split_k = split + (str(k) if k > 0 else '')
                try:
                    task.load_dataset(split_k, combine=False)
                except FileNotFoundError as e:
                    if k > 0:
                        break
                    raise e


if __name__ == '__main__':
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser)

    if args.distributed_port > 0 or args.distributed_init_method is not None:
        from distributed_train import main as distributed_main
        distributed_main(args)
    elif args.distributed_world_size > 1:
        from multiprocessing_train import main as multiprocessing_main
        multiprocessing_main(args)
    else:
        main(args)
def cli_main(): parser = options.get_training_parser() parser.add_argument( "--comet-logging", action="store_true", help="Whether to use Comet.ML for logging", ) args = options.parse_args_and_arch(parser) logging = getattr(args, "comet_logging", False) config = None if logging: PROJECT = "machine-translation" if not keyring.get_password("comet", PROJECT): comet_ml_api_key = getpass("Please enter the comet.ml API key: ") keyring.set_password("comet", PROJECT, comet_ml_api_key) else: comet_ml_api_key = keyring.get_password("comet", PROJECT) experiment = Experiment( api_key=comet_ml_api_key, project_name="machine-translation", workspace="machine-translation", auto_output_logging=None, ) config = { "api_key": comet_ml_api_key, "experiment_key": experiment.get_key() } print("Proceeding with Comet.ML logging...") if args.distributed_init_method is None: distributed_utils.infer_init_method(args) if args.distributed_init_method is not None: # distributed training if torch.cuda.device_count() > 1 and not args.distributed_no_spawn: start_rank = args.distributed_rank args.distributed_rank = None # assign automatically torch.multiprocessing.spawn( fn=distributed_main, args=(args, config, start_rank), nprocs=torch.cuda.device_count(), ) else: distributed_main(args.device_id, args, config) elif args.distributed_world_size > 1: # fallback for single node with multiple GPUs assert args.distributed_world_size <= torch.cuda.device_count() port = random.randint(10000, 20000) args.distributed_init_method = "tcp://localhost:{port}".format( port=port) args.distributed_rank = None # set based on device id if max(args.update_freq) > 1 and args.ddp_backend != "no_c10d": print( "| NOTE: you may get better performance with: --ddp-backend=no_c10d" ) torch.multiprocessing.spawn(fn=distributed_main, args=(args, config), nprocs=args.distributed_world_size) else: # single GPU training main(args, config=config) if config: experiment.end()
def train_translation_model(
    data_dir,
    arch,
    extra_flags=None,
    task="translation",
    run_validation=False,
    lang_flags=None,
    extra_valid_flags=None,
):
    if lang_flags is None:
        lang_flags = [
            "--source-lang", "in",
            "--target-lang", "out",
        ]
    train_parser = options.get_training_parser()
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            "--task", task,
            data_dir,
            "--save-dir", data_dir,
            "--arch", arch,
            "--optimizer", "nag",
            "--lr", "0.05",
            "--max-tokens", "500",
            "--max-epoch", "1",
            "--no-progress-bar",
            "--distributed-world-size", "1",
            "--num-workers", "0",
        ]
        + lang_flags
        + (extra_flags or []),
    )
    train.main(train_args)

    if run_validation:
        # test validation
        validate_parser = options.get_validation_parser()
        validate_args = options.parse_args_and_arch(
            validate_parser,
            [
                "--task", task,
                data_dir,
                "--path", os.path.join(data_dir, "checkpoint_last.pt"),
                "--valid-subset", "valid",
                "--max-tokens", "500",
                "--no-progress-bar",
                "--num-workers", "0",
            ]
            + lang_flags
            + (extra_valid_flags or []),
        )
        validate.main(validate_args)
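# Hypothetical example of exercising the extended helper above: train for one
# epoch and then validate the last checkpoint. The data directory and the extra
# flags are placeholders, not values taken from the original code.
def _example_translation_with_validation(data_dir):
    train_translation_model(
        data_dir,
        "transformer_iwslt_de_en",
        extra_flags=["--encoder-layers", "2", "--decoder-layers", "2"],
        run_validation=True,
    )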
def _quantize_language_model(data_dir, arch, extra_flags=None, run_validation=False):
    train_parser = options.get_training_parser()
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            "--task", "language_modeling",
            data_dir,
            "--arch", arch,
            "--optimizer", "adam",
            "--lr", "0.0001",
            "--criterion", "adaptive_loss",
            "--adaptive-softmax-cutoff", "5,10,15",
            "--max-tokens", "500",
            "--tokens-per-sample", "500",
            "--save-dir", data_dir,
            "--max-epoch", "1",
            "--no-progress-bar",
            "--distributed-world-size", "1",
            "--ddp-backend", "no_c10d",
            "--num-workers", "0",
        ]
        + (extra_flags or []),
    )
    train.main(train_args)

    # try scalar quantization
    scalar_quant_train_parser = options.get_training_parser()
    scalar_quant_train_args = options.parse_args_and_arch(
        scalar_quant_train_parser,
        [
            "--task", "language_modeling",
            data_dir,
            "--arch", arch,
            "--optimizer", "adam",
            "--lr", "0.0001",
            "--criterion", "adaptive_loss",
            "--adaptive-softmax-cutoff", "5,10,15",
            "--max-tokens", "500",
            "--tokens-per-sample", "500",
            "--save-dir", data_dir,
            "--max-update", "3",
            "--no-progress-bar",
            "--distributed-world-size", "1",
            "--ddp-backend", "no_c10d",
            "--num-workers", "0",
            "--quant-noise-scalar", "0.5",
        ]
        + (extra_flags or []),
    )
    train.main(scalar_quant_train_args)

    # try iterative PQ quantization
    quantize_parser = options.get_training_parser()
    quantize_args = options.parse_args_and_arch(
        quantize_parser,
        [
            "--task", "language_modeling",
            data_dir,
            "--arch", arch,
            "--optimizer", "adam",
            "--lr", "0.0001",
            "--criterion", "adaptive_loss",
            "--adaptive-softmax-cutoff", "5,10,15",
            "--max-tokens", "50",
            "--tokens-per-sample", "50",
            "--max-update", "6",
            "--no-progress-bar",
            "--distributed-world-size", "1",
            "--ddp-backend", "no_c10d",
            "--num-workers", "0",
            "--restore-file", os.path.join(data_dir, "checkpoint_last.pt"),
            "--reset-optimizer",
            "--quantization-config-path",
            os.path.join(os.path.dirname(__file__), "transformer_quantization_config.yaml"),
        ]
        + (extra_flags or []),
    )
    train.main(quantize_args)
def train_language_model(
    data_dir,
    arch,
    extra_flags=None,
    run_validation=False,
    extra_valid_flags=None,
    task="language_modeling",
    world_size=1,
):
    train_parser = options.get_training_parser()
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            "--task", task,
            data_dir,
            "--arch", arch,
            "--optimizer", "adam",
            "--lr", "0.0001",
            "--max-tokens", "500",
            "--tokens-per-sample", "500",
            "--save-dir", data_dir,
            "--max-epoch", "1",
            "--no-progress-bar",
            "--distributed-world-size", str(world_size),
            "--ddp-backend", "no_c10d",
            "--num-workers", "0",
        ]
        + (extra_flags or []),
    )
    cfg = convert_namespace_to_omegaconf(train_args)
    distributed_utils.call_main(cfg, train.main)

    if run_validation:
        # test validation
        validate_parser = options.get_validation_parser()
        validate_args = options.parse_args_and_arch(
            validate_parser,
            [
                "--task", task,
                data_dir,
                "--path", os.path.join(data_dir, "checkpoint_last.pt"),
                "--valid-subset", "valid",
                "--max-tokens", "500",
                "--no-progress-bar",
                "--num-workers", "0",
            ]
            + (extra_valid_flags or []),
        )
        validate.main(validate_args)
def train_main(alpha, beta, save_path):
    parser = options.get_training_parser()
    input_args = [
        data_set,
        '--share-decoder-input-output-embed',
        '--arch', 'transformer_iwslt_de_en',
        '--max-tokens', '4000',
        '--lr', '5e-4',
        '--save-interval', '2',
        '--max-epoch', '85',
        '--patience', '5',
        '--optimizer', 'adam',
        '--adam-betas', '(0.9, 0.98)',
        '--clip-norm', '0.0',
        '--weight-decay', '0.0001',
        '--dropout', '0.3',
        '--lr-scheduler', 'inverse_sqrt',
        '--warmup-updates', '4000',
        '--keep-last-epochs', '4',
        '--criterion', 'jensen_cross_entropy',
        '--alpha', str(alpha),
        '--beta', str(beta),
        '--use-uniform',
        '--fp16',
        '--save-dir', save_path,
    ]
    args = options.parse_args_and_arch(parser, input_args=input_args)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            print('| NOTE: you may get better performance with: --ddp-backend=no_c10d')
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args,),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)

    ckpts = os.listdir(args.save_dir)
    try:
        ckpts.remove('checkpoint_last.pt')
    except ValueError:
        print("no checkpoint_last.pt in folder", args.save_dir)

    f = open(os.path.join(args.save_dir, "final_entropies.txt"), "a+")
    results = {}
    entropies = {}
    for ckpt in ckpts:
        if '.pt' in ckpt:
            path = os.path.join(args.save_dir, ckpt)
            f.write(path + '\n')
            run_generation(path, results, entropies)
            f.write('{entropy: ' + str(entropies[path]) + ', bleu: ' + str(results[path]) + '}\n')
    f.close()
    return results
def cli_main():
    parser = options.get_training_parser()
    parser.add_argument(
        '--config',
        type=str,
        nargs='*',
        help='paths to JSON files of experiment configurations, from high to low priority',
    )
    parser.add_argument(
        '--exp-name', type=str, default='', help='name of the experiment')
    parser.add_argument(
        '--debug',
        default=False,
        action='store_true',
        help='run training in the debugging mode',
    )
    parser.add_argument(
        '--path-attributes', type=str, nargs='*', default=['task', 'arch', 'lr'])
    parser.add_argument(
        '--filter_best_last_ckpts',
        type=str,
        default=False,
        help='whether to filter out checkpoint_best and checkpoint_last from checkpoint list',
    )
    parser.add_argument(
        '--log_valid_progress',
        type=str,
        default=False,
        help='whether to log validation progress',
    )

    pre_parsed_args, unknown = parser.parse_known_args()
    config_dict = {}
    for config_path in pre_parsed_args.config:
        config_dict = update_config(config_dict, compose_configs(config_path))

    parser_modifier = modify_factory(config_dict)
    args = options.parse_args_and_arch(parser, modify_parser=parser_modifier)
    update_namespace(args, config_dict)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        if (args.update_freq is not None
                and max(args.update_freq) > 1
                and args.ddp_backend != 'no_c10d'):
            logger.info('NOTE: you may get faster training with: --ddp-backend=no_c10d')
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args,),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
def setup():
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser)

    # make sure everything is reset before loading the model
    args.reset_optimizer = True
    args.reset_meters = True
    args.reset_dataloader = True
    args.reset_lr_scheduler = True
    args.path = args.restore_file
    args.max_sentences_valid = 1  # We attack batch size 1 at the moment
    args.beam = 1  # beam size 1 for inference on the model, could use higher
    utils.import_user_module(args)
    torch.manual_seed(args.seed)

    # setup task, model, loss function, and trainer
    task = tasks.setup_task(args)
    if not args.interactive_attacks:
        for valid_sub_split in args.valid_subset.split(','):  # load validation data
            task.load_dataset(valid_sub_split, combine=False, epoch=0)
    models, _ = checkpoint_utils.load_model_ensemble(
        args.path.split(':'), arg_overrides={}, task=task)
    assert len(models) == 1  # Make sure you didn't pass an ensemble of models in
    model = models[0]

    if torch.cuda.is_available() and not args.cpu:
        assert torch.cuda.device_count() == 1  # only works on 1 GPU for now
        torch.cuda.set_device(0)
        model.cuda()
    args.beam = 1  # beam size 1 for now
    model.make_generation_fast_(beamable_mm_beam_size=args.beam, need_attn=False)

    criterion = task.build_criterion(args)
    trainer = Trainer(args, task, model, criterion)
    generator = task.build_generator(args)

    bpe_vocab_size = trainer.get_model().encoder.embed_tokens.weight.shape[0]
    add_hooks(trainer.get_model(), bpe_vocab_size)  # add gradient hooks to embeddings
    embedding_weight = get_embedding_weight(
        trainer.get_model(), bpe_vocab_size)  # save the embedding matrix

    if not args.interactive_attacks:
        subset = args.valid_subset.split(',')[0]  # only one validation subset handled
        itr = trainer.task.get_batch_iterator(
            dataset=trainer.task.dataset(subset),
            max_tokens=args.max_tokens_valid,
            max_sentences=args.max_sentences_valid,
            max_positions=utils.resolve_max_positions(
                trainer.task.max_positions(),
                trainer.get_model().max_positions(),
            ),
            ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
            required_batch_size_multiple=args.required_batch_size_multiple,
            seed=args.seed,
            num_shards=args.distributed_world_size,
            shard_id=args.distributed_rank,
            num_workers=args.num_workers,
        ).next_epoch_itr(shuffle=False)
    else:
        # a fake dataset to go through, overwritten when doing interactive attacks
        itr = [None] * 100000

    # Handle BPE
    bpe = encoders.build_bpe(args)
    assert bpe is not None
    return args, trainer, generator, embedding_weight, itr, bpe
def fairseq_train(
        preprocessed_dir,
        exp_dir,
        ngpus=None,
        max_tokens=2000,
        arch='fconv_iwslt_de_en',
        pretrained_emb_path=None,
        embeddings_dim=None,
        # Transformer (decoder is the same as encoder for now)
        encoder_embed_dim=512,
        encoder_layers=6,
        encoder_attention_heads=8,
        # encoder_decoder_dim_ratio=1,
        # share_embeddings=True,
        max_epoch=50,
        warmup_updates=None,
        lr=0.1,
        min_lr=1e-9,
        dropout=0.2,
        label_smoothing=0.1,
        lr_scheduler='fixed',
        weight_decay=0.0001,
        criterion='label_smoothed_cross_entropy',
        optimizer='nag',
        validations_before_sari_early_stopping=10,
        fp16=False):
    exp_dir = Path(exp_dir)
    with log_stdout(exp_dir / 'fairseq_train.stdout'):
        preprocessed_dir = Path(preprocessed_dir)
        exp_dir.mkdir(exist_ok=True, parents=True)
        # Copy dictionaries to exp_dir for generation
        shutil.copy(preprocessed_dir / 'dict.complex.txt', exp_dir)
        shutil.copy(preprocessed_dir / 'dict.simple.txt', exp_dir)
        train_parser = options.get_training_parser()
        # if share_embeddings:
        #     assert encoder_decoder_dim_ratio == 1
        args = [
            '--task', 'translation',
            preprocessed_dir,
            '--raw-text',
            '--source-lang', 'complex',
            '--target-lang', 'simple',
            '--save-dir', os.path.join(exp_dir, 'checkpoints'),
            '--clip-norm', 0.1,
            '--criterion', criterion,
            '--no-epoch-checkpoints',
            '--save-interval-updates', 5000,  # Validate every n updates
            '--validations-before-sari-early-stopping', validations_before_sari_early_stopping,
            '--arch', arch,
            # '--decoder-out-embed-dim', int(embeddings_dim * encoder_decoder_dim_ratio),  # Output dim of decoder
            '--max-tokens', max_tokens,
            '--max-epoch', max_epoch,
            '--lr-scheduler', lr_scheduler,
            '--dropout', dropout,
            '--lr', lr,
            '--lr-shrink', 0.5,  # For reduce lr on plateau scheduler
            '--min-lr', min_lr,
            '--weight-decay', weight_decay,
            '--optimizer', optimizer,
            '--label-smoothing', label_smoothing,
            '--seed', random.randint(1, 1000),
            # '--force-anneal', '200',
            # '--distributed-world-size', '1',
        ]
        if arch == 'transformer':
            args.extend([
                '--encoder-embed-dim', encoder_embed_dim,
                '--encoder-ffn-embed-dim', 4 * encoder_embed_dim,
                '--encoder-layers', encoder_layers,
                '--encoder-attention-heads', encoder_attention_heads,
                '--decoder-layers', encoder_layers,
                '--decoder-attention-heads', encoder_attention_heads,
            ])
        if pretrained_emb_path is not None:
            args.extend([
                '--encoder-embed-path',
                pretrained_emb_path if pretrained_emb_path is not None else '',
            ])
            args.extend([
                '--decoder-embed-path',
                pretrained_emb_path if pretrained_emb_path is not None else '',
            ])
        if embeddings_dim is not None:
            args.extend(['--encoder-embed-dim', embeddings_dim])  # Input and output dim of encoder
            args.extend(['--decoder-embed-dim', embeddings_dim])  # Input dim of decoder
        if ngpus is not None:
            args.extend(['--distributed-world-size', ngpus])
        # if share_embeddings:
        #     args.append('--share-input-output-embed')
        if fp16:
            args.append('--fp16')
        if warmup_updates is not None:
            args.extend(['--warmup-updates', warmup_updates])
        args = [str(arg) for arg in args]
        train_args = options.parse_args_and_arch(train_parser, args)
        train.main(train_args)
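# A minimal sketch of calling fairseq_train above, assuming a preprocessed
# simplification corpus with 'complex'/'simple' sides; both paths are
# hypothetical placeholders, not values from the original code.
def _example_fairseq_train():
    fairseq_train(
        preprocessed_dir='data/preprocessed',   # output of the preprocessing step
        exp_dir='experiments/transformer_run',  # checkpoints and logs end up here
        arch='transformer',                     # triggers the transformer-specific flags above
        ngpus=1,
        max_epoch=10,
        fp16=True,
    )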
def test_masks_token_spans(self):
    with TemporaryDirectory() as dirname:
        # prep input file
        raw_file = os.path.join(dirname, "raw")
        data = make_data(out_file=raw_file)
        vocab = build_vocab(data)

        # binarize
        binarizer = VocabularyDatasetBinarizer(vocab, append_eos=False)
        split = "train"
        bin_file = os.path.join(dirname, split)
        dataset_impl = "mmap"
        FileBinarizer.multiprocess_dataset(
            input_file=raw_file,
            binarizer=binarizer,
            dataset_impl=dataset_impl,
            vocab_size=len(vocab),
            output_prefix=bin_file,
        )

        # adding sentinel tokens
        for i in range(100):
            vocab.add_symbol(f"<extra_id_{i}>")

        # setup task
        train_args = options.parse_args_and_arch(
            options.get_training_parser(),
            [
                "--task", "span_masked_lm",
                "--arch", "bart_base",
                "--seed", "42",
                dirname,
            ],
        )
        cfg = convert_namespace_to_omegaconf(train_args)
        task = SpanMaskedLMTask(cfg.task, binarizer.dict)

        # load datasets
        original_dataset = task._load_dataset_split(bin_file, 1, False)
        task.load_dataset(split)
        masked_dataset = task.dataset(split)

        iterator = task.get_batch_iterator(
            dataset=masked_dataset,
            max_tokens=65_536,
            max_positions=4_096,
        ).next_epoch_itr(shuffle=False)
        num_tokens = len(vocab)
        for batch in iterator:
            for sample in range(len(batch)):
                sample_id = batch["id"][sample]
                original_tokens = original_dataset[sample_id]
                masked_src_tokens = batch["net_input"]["src_tokens"][sample]
                masked_src_length = batch["net_input"]["src_lengths"][sample]
                masked_tgt_tokens = batch["target"][sample]

                original_offset = 0
                masked_tgt_offset = 0
                extra_id_token = len(vocab) - 1
                for masked_src_token in masked_src_tokens[:masked_src_length]:
                    if masked_src_token == extra_id_token:
                        assert masked_src_token == masked_tgt_tokens[masked_tgt_offset]
                        extra_id_token -= 1
                        masked_tgt_offset += 1
                        while (
                            original_offset < len(original_tokens)
                            and masked_tgt_tokens[masked_tgt_offset] != extra_id_token
                        ):
                            assert (original_tokens[original_offset]
                                    == masked_tgt_tokens[masked_tgt_offset])
                            original_offset += 1
                            masked_tgt_offset += 1
                    else:
                        assert original_tokens[original_offset] == masked_src_token
                        original_offset += 1
def test_denoising(self):
    with TemporaryDirectory() as dirname:
        # prep input file
        raw_file = os.path.join(dirname, "raw")
        data = make_data(out_file=raw_file)
        vocab = build_vocab(data)

        # binarize
        binarizer = VocabularyDatasetBinarizer(vocab, append_eos=False)
        split = "train"
        bin_file = os.path.join(dirname, split)
        dataset_impl = "mmap"
        FileBinarizer.multiprocess_dataset(
            input_file=raw_file,
            binarizer=binarizer,
            dataset_impl=dataset_impl,
            vocab_size=len(vocab),
            output_prefix=bin_file,
        )

        # setup task
        train_args = options.parse_args_and_arch(
            options.get_training_parser(),
            [
                "--task", "denoising",
                "--arch", "bart_base",
                "--seed", "42",
                "--mask-length", "word",
                "--permute-sentences", "1",
                "--rotate", "0",
                "--replace-length", "-1",
                "--mask", "0.2",
                dirname,
            ],
        )
        cfg = convert_namespace_to_omegaconf(train_args)
        task = DenoisingTask(cfg.task, binarizer.dict)

        # load datasets
        original_dataset = task._load_dataset_split(bin_file, 1, False)
        task.load_dataset(split)
        masked_dataset = task.dataset(split)

        iterator = task.get_batch_iterator(
            dataset=masked_dataset,
            max_tokens=65_536,
            max_positions=4_096,
        ).next_epoch_itr(shuffle=False)
        mask_index = task.source_dictionary.index("<mask>")
        for batch in iterator:
            for sample in range(len(batch)):
                net_input = batch["net_input"]
                masked_src_tokens = net_input["src_tokens"][sample]
                masked_src_length = net_input["src_lengths"][sample]
                masked_tgt_tokens = batch["target"][sample]

                sample_id = batch["id"][sample]
                original_tokens = original_dataset[sample_id]
                original_tokens = original_tokens.masked_select(
                    masked_src_tokens[:masked_src_length] == mask_index
                )
                masked_tokens = masked_tgt_tokens.masked_select(
                    masked_src_tokens == mask_index
                )
                assert masked_tokens.equal(original_tokens)
def train_masked_language_model(data_dir, arch, extra_args=()):
    train_parser = options.get_training_parser()
    # TODO: langs should be in and out right?
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            "--task", "cross_lingual_lm",
            data_dir,
            "--arch", arch,
            # Optimizer args
            "--optimizer", "adam",
            "--lr-scheduler", "reduce_lr_on_plateau",
            "--lr-shrink", "0.5",
            "--lr", "0.0001",
            "--min-lr", "1e-09",
            # dropout, attention args
            "--dropout", "0.1",
            "--attention-dropout", "0.1",
            # MLM args
            "--criterion", "masked_lm_loss",
            "--masked-lm-only",
            "--monolingual-langs", "in,out",
            "--num-segment", "5",
            # Transformer args: use a small transformer model for fast training
            "--encoder-layers", "1",
            "--encoder-embed-dim", "32",
            "--encoder-attention-heads", "1",
            "--encoder-ffn-embed-dim", "32",
            # Other training args
            "--max-tokens", "500",
            "--tokens-per-sample", "500",
            "--save-dir", data_dir,
            "--max-epoch", "1",
            "--no-progress-bar",
            "--distributed-world-size", "1",
            "--dataset-impl", "raw",
        ]
        + list(extra_args),
    )
    train.main(train_args)
def cli_main():
    parser = options.get_training_parser()
    parser.add_argument(
        '--train-subtransformer',
        action='store_true',
        default=False,
        help='whether train SuperTransformer or SubTransformer')
    parser.add_argument(
        '--sub-configs',
        required=False,
        is_config_file=True,
        help='when training SubTransformer, use --configs to specify architecture '
             'and --sub-configs to specify other settings')

    # for profiling
    parser.add_argument(
        '--profile-flops',
        action='store_true',
        help='measure the FLOPs of a SubTransformer')

    parser.add_argument(
        '--latgpu',
        action='store_true',
        help='measure SubTransformer latency on GPU')
    parser.add_argument(
        '--latcpu',
        action='store_true',
        help='measure SubTransformer latency on CPU')
    parser.add_argument(
        '--latiter',
        type=int,
        default=300,
        help='how many iterations to run when measure the latency')
    parser.add_argument(
        '--latsilent',
        action='store_true',
        help='keep silent when measure latency')

    parser.add_argument(
        '--validate-subtransformer',
        action='store_true',
        help='evaluate the SubTransformer on the validation set')

    options.add_generation_args(parser)

    args = options.parse_args_and_arch(parser)

    if args.latcpu:
        args.cpu = True
        args.fp16 = False

    if args.latgpu or args.latcpu or args.profile_flops:
        args.distributed_world_size = 1

    # if args.distributed_init_method is None:
    #     distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        # if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
        if not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                # nprocs=torch.cuda.device_count(),
                nprocs=8,  # Use all TPU cores
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        # assert args.distributed_world_size <= torch.cuda.device_count()
        import torch_xla.distributed.xla_multiprocessing as xmp
        torch.multiprocessing.set_sharing_strategy("file_system")
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            print('| NOTE: you may get better performance with: --ddp-backend=no_c10d')
        xmp.spawn(
            fn=distributed_main,
            args=(args,),
            nprocs=8,  # use all 8 TPU cores
        )
        # torch.multiprocessing.spawn(
        #     fn=distributed_main,
        #     args=(args, ),
        #     nprocs=args.distributed_world_size,
        # )
    else:
        # single GPU training
        main(args)
def main(args):
    if args.distributed_init_method is None and args.distributed_port > 0:
        # We can determine the init method automatically for Slurm.
        node_list = os.environ.get('SLURM_JOB_NODELIST')
        if node_list is not None:
            try:
                hostnames = subprocess.check_output(
                    ['scontrol', 'show', 'hostnames', node_list])
                args.distributed_init_method = 'tcp://{host}:{port}'.format(
                    host=hostnames.split()[0].decode('utf-8'),
                    port=args.distributed_port)
                args.distributed_rank = int(os.environ.get('SLURM_PROCID'))
                args.device_id = int(os.environ.get('SLURM_LOCALID'))
            except subprocess.CalledProcessError as e:  # scontrol failed
                raise e
            except FileNotFoundError:  # Slurm is not installed
                pass

    if args.distributed_init_method is None:
        raise ValueError('--distributed-init-method or --distributed-port '
                         'must be specified for distributed training')

    args.distributed_rank = distributed_utils.distributed_init(args)
    print('| initialized host {} as rank {}'.format(
        socket.gethostname(), args.distributed_rank))
    single_process_main(args)


if __name__ == '__main__':
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser)
    main(args)
def cli_main():
    parser = options.get_training_parser()
    parser.add_argument(
        '--config',
        type=str,
        nargs='*',
        help='paths to JSON files of experiment configurations, from high to low priority',
    )
    parser.add_argument(
        '--exp-name', type=str, default='', help='name of the experiment')
    parser.add_argument(
        '--debug',
        default=False,
        action='store_true',
        help='run training in the debugging mode',
    )
    parser.add_argument(
        '--path-attributes', type=str, nargs='*', default=['task', 'arch', 'lr'])
    parser.add_argument('--torch-file-system', action='store_true')

    pre_parsed_args, unknown = parser.parse_known_args()
    config_dict = {}
    for config_path in pre_parsed_args.config:
        config_dict = update_config(config_dict, compose_configs(config_path))

    parser_modifier = modify_factory(config_dict)
    args = options.parse_args_and_arch(parser, modify_parser=parser_modifier)
    update_namespace(args, config_dict)

    # set sharing strategy file system in case /dev/shm/ limits are small
    if args.torch_file_system:
        torch.multiprocessing.set_sharing_strategy('file_system')

    training_name = get_training_name(args)
    base_save_dir = generate_save_dir(args, training_name, sys.argv[1:])
    setattr(args, 'training_name', training_name)
    setattr(args, 'save_dir', os.path.join(base_save_dir, 'checkpoints'))
    setattr(args, 'tensorboard_logdir', os.path.join(base_save_dir, 'tensorboard'))
    save_config(vars(args), base_save_dir)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        if (args.update_freq is not None
                and max(args.update_freq) > 1
                and args.ddp_backend != 'no_c10d'):
            logger.info('NOTE: you may get faster training with: --ddp-backend=no_c10d')
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args,),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)