Example No. 1
def cli_main():
    parser = options.get_training_parser()
    parser.add_argument('--train-subtransformer', action='store_true', default=False, help='whether to train a SubTransformer (instead of the SuperTransformer)')
    parser.add_argument('--sub-configs', required=False, is_config_file=True, help='when training SubTransformer, use --configs to specify architecture and --sub-configs to specify other settings')

    # for profiling
    parser.add_argument('--profile-flops', action='store_true', help='measure the FLOPs of a SubTransformer')

    parser.add_argument('--latgpu', action='store_true', help='measure SubTransformer latency on GPU')
    parser.add_argument('--latcpu', action='store_true', help='measure SubTransformer latency on CPU')
    parser.add_argument('--latiter', type=int, default=300, help='how many iterations to run when measuring latency')
    parser.add_argument('--latsilent', action='store_true', help='keep silent when measuring latency')

    parser.add_argument('--validate-subtransformer', action='store_true', help='evaluate the SubTransformer on the validation set')

    options.add_generation_args(parser)

    args = options.parse_args_and_arch(parser)

    if args.latcpu:
        args.cpu = True
        args.fp16 = False

    if args.latgpu or args.latcpu or args.profile_flops:
        args.distributed_world_size = 1

    if args.pdb:
        pdb.set_trace()

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            print('| NOTE: you may get better performance with: --ddp-backend=no_c10d')
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
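A hedged usage sketch of the flags added above: the extended CLI could be driven programmatically by patching sys.argv before calling cli_main(). The dataset path and flag values below are placeholders, not taken from the original project.

import sys

if __name__ == '__main__':
    sys.argv = [
        'train.py', 'data-bin/wmt14_en_de',   # hypothetical binarized dataset
        '--train-subtransformer',             # train a sampled SubTransformer
        '--latgpu', '--latiter', '100',       # measure GPU latency over 100 iterations
    ]
    cli_main()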
Example No. 2
def cli_main(modify_parser=None):
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser, modify_parser=modify_parser)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(
            port=port)
        args.distributed_rank = None  # set based on device id
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
Example No. 3
def cli_main():
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser)
    
    # params_file = os.path.join(os.path.dirname(args.save_dir),"({0})-params.log".format(os.path.basename(args.save_dir)))
    # with open(params_file,"w",encoding="utf-8") as w:
    #     w.write(str(args).replace(", ",",\n"))
    #     print("saving params file into{}...".format(params_file))

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.debug:
        args.distributed_world_size = 1
        args.train_subset = args.valid_subset
    #args.cpu = True
    if args.distributed_init_method is not None:
        # distributed training
        distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            print('| NOTE: you may get better performance with: --ddp-backend=no_c10d')
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
Example No. 4
def cli_main(main_fn):
    argv = sys.argv[1:]
    # This is a marker that separates meta-learning arguments from downstream training arguments (a toy split is sketched after this example)
    split_index = argv.index('---')
    meta_argv = argv[:split_index]
    maybe_downstream_argv = argv[split_index + 1:]
    parser = options.get_meta_training_parser()
    meta_learning_args = options.parse_args_and_arch(parser,
                                                     input_args=meta_argv)
    fine_tune_args = None
    if meta_learning_args.baseline:
        split_index = maybe_downstream_argv.index('---')
        downstream_argv = maybe_downstream_argv[:split_index]
        baseline_argv = maybe_downstream_argv[split_index + 1:]
        parser = options.get_meta_learning_parser()
        fine_tune_args = options.parse_args_and_arch(parser,
                                                     input_args=baseline_argv)
    else:
        downstream_argv = maybe_downstream_argv
    parser = options.get_meta_learning_parser()
    downstream_args = options.parse_args_and_arch(parser,
                                                  input_args=downstream_argv)
    print('Meta-learning Arguments: ')
    print(meta_learning_args)
    print('Downstream Arguments: ')
    print(downstream_args)
    print('Fine-tune Args: ')
    print(fine_tune_args)
    if meta_learning_args.distributed_init_method is None:
        distributed_utils.infer_init_method(meta_learning_args)

    if meta_learning_args.distributed_init_method is not None:
        # distributed training
        distributed_main(meta_learning_args.device_id,
                         meta_learning_args=meta_learning_args,
                         downstream_args=downstream_args,
                         fine_tune_args=fine_tune_args,
                         main_fn=main_fn)
    elif meta_learning_args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        port = random.randint(10000, 20000)
        meta_learning_args.distributed_init_method = 'tcp://localhost:{port}'.format(
            port=port)
        meta_learning_args.distributed_rank = None  # set based on device id
        if max(meta_learning_args.update_freq
               ) > 1 and meta_learning_args.ddp_backend != 'no_c10d':
            print(
                '| NOTE: you may get better performance with: --ddp-backend=no_c10d'
            )
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(meta_learning_args, downstream_args, fine_tune_args,
                  main_fn),
            nprocs=meta_learning_args.distributed_world_size,
        )
    else:
        # single GPU training
        main_fn(meta_learning_args=meta_learning_args,
                downstream_args=downstream_args,
                fine_tune_args=fine_tune_args)
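A toy illustration of the '---' separator convention parsed above; the flag names are made up and only show how the argument vector is split into meta-learning and downstream halves.

argv = ['--task', 'meta_learning', '--lr', '1e-4',
        '---',
        '--task', 'translation', '--max-epoch', '5']
split_index = argv.index('---')
meta_argv = argv[:split_index]            # ['--task', 'meta_learning', '--lr', '1e-4']
downstream_argv = argv[split_index + 1:]  # ['--task', 'translation', '--max-epoch', '5']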
Example No. 5
def cli_main():
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser)

    # print the model hparams
    for k, v in sorted(args.__dict__.items(), key=lambda x: x[0]):
        print('{:40s} = {}'.format(k, v))

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(
            port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            print(
                '| NOTE: you may get better performance with: --ddp-backend=no_c10d'
            )
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
Example No. 6
def cli_main():
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    #args.distributed_world_size = 1
    #args.cpu = True
    if args.debug:
        args.distributed_world_size = 1
        args.train_subset = args.valid_subset
    if args.distributed_init_method is not None:
        # distributed training
        distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(
            port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            print(
                '| NOTE: you may get better performance with: --ddp-backend=no_c10d'
            )
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
Example No. 7
def cli_main():
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser)

    try:
        git_branch = subprocess.check_output(['git', 'symbolic-ref', '--short', 'HEAD']).decode().strip()
        git_revision = subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode().strip()
    except (subprocess.CalledProcessError, OSError):
        # not inside a git checkout, or git is not installed
        git_branch = 'unknown'
        git_revision = 'unknown'
    print('GIT: {} {}'.format(git_branch, git_revision))
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
    print('-' * 80)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            print('| NOTE: you may get better performance with: --ddp-backend=no_c10d')
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
Example No. 8
def cli_main():
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            print('| NOTE: you may get better performance with: --ddp-backend=no_c10d')
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
Example No. 9
def load_megatron_lm(args):
    """
    Load Megatron_lm in fp16. A Tesla V100 is enough for inference.
    I haven't implement the parallel method for fine-tuning.
    You can refer to fairseq_eval_lm.py implementation for parallel function.
    :return: TransformerLanguageModelWrapper
    """
    megatron_path = join(args.checkpoint_dir, 'Megatron_11b', 'megatron_11b')
    # init args for task initialization
    if os.path.exists(join(megatron_path, 'task.pkl')):
        task = _pickle.load(open(join(megatron_path, 'task.pkl'), 'rb'))
    else:
        sys.argv.append(megatron_path)
        task_args = get_task_args()
        distributed_utils.infer_init_method(task_args)
        task_args.distributed_rank = None
        task = tasks.setup_task(task_args)
        _pickle.dump(task, open(join(megatron_path, 'task.pkl'), 'wb'))

    # load model partitions
    if os.path.exists(join(megatron_path, 'model.pt')):
        merge_partition = torch.load(join(megatron_path, 'model.pt'))
    else:
        merge_partition = dict()
        for i in range(8):
            # load checkpoints
            ckpt = torch.load(join(megatron_path,
                                   'model-model_part-{}.pt'.format(i)),
                              map_location='cpu')
            if i == 0:
                merge_partition = ckpt
            else:
                print("Load from partition {}".format(i))
                for param_name, param in tqdm(ckpt['model'].items()):
                    if 'version' in param_name:
                        continue
                    src_param, tgt_param = merge_partition['model'][
                        param_name], param
                    if param_name.endswith(
                            'out_proj.weight') or param_name.endswith(
                                'fc2.weight'):
                        res = torch.cat((src_param, tgt_param), dim=1)
                    elif param_name.endswith('k_proj.weight') or param_name.endswith('k_proj.bias') or \
                            param_name.endswith('v_proj.weight') or param_name.endswith('v_proj.bias') or \
                            param_name.endswith('q_proj.weight') or param_name.endswith('q_proj.bias') or \
                            param_name.endswith('fc1.weight') or param_name.endswith('fc1.bias') or \
                            param_name.endswith('output_projection.weight') or param_name.endswith('embed_tokens.weight'):
                        res = torch.cat((src_param, tgt_param), dim=0)
                    else:
                        res = src_param
                    merge_partition['model'][param_name] = res

    # build model
    args = merge_partition['args']
    args.model_parallel_size = 0
    # torch.save(merge_partition, join(CKPT_DIR, 'Megatron_11b/megatron_11b/model.pt'))
    model = TransformerLanguageModelWrapper.build_model(args, task)
    model.load_state_dict(merge_partition['model'])
    return model.half().cuda()
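A hedged usage sketch for load_megatron_lm: the checkpoint directory is a placeholder and is assumed to contain the Megatron_11b/megatron_11b partition files referenced above.

from argparse import Namespace

if __name__ == '__main__':
    args = Namespace(checkpoint_dir='/path/to/checkpoints')  # placeholder path
    model = load_megatron_lm(args)   # fp16 model moved to the GPU
    model.eval()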
Example No. 10
def cli_main():
    parser = options.get_training_parser()
    parser.add_argument("--mask", action="store_true")
    parser.add_argument("--decoder-wise-training", action="store_true")
    parser.add_argument("--load", default="", type=str)
    parser.add_argument("--focus", default=-1, type=int)
    parser.add_argument("--masking", action="store_true")
    parser.add_argument("--early-stop", action="store_true")
    parser.add_argument("--save-path", default="", type=str)
    parser.add_argument("--train-decoder-only", action="store_true")
    args = options.parse_args_and_arch(parser)

    if getattr(args, "pnet", False) and args.load == "":
        print("training pnet requires loading a pretrained model")
        sys.exit()

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(
            port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            print(
                '| NOTE: you may get better performance with: --ddp-backend=no_c10d'
            )
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)

    if args.mask:
        args.masking = True
        main(args)
Example No. 11
def cli_main(modify_parser=None):
    parser = options.get_training_parser()
    parser.add_argument(
        '--remove-bpe',
        nargs='?',
        const='@@ ',
        default=None,
        help='remove BPE tokens before scoring '
        '(can be set to sentencepiece); used for monitoring '
        'and validation')
    args = options.parse_args_and_arch(parser, modify_parser=modify_parser)
    print_options_meaning_changes(args)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(
            port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            logger.info(
                'NOTE: you may get faster training with: --ddp-backend=no_c10d'
            )
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
Example No. 12
def cli_main(training_args, modify_parser=None):
    # print(training_args)
    # get args for FairSeq by converting the hyperparameters as if they
    # were command-line arguments
    argv_copy = copy.deepcopy(sys.argv)
    # some arguments are pre-defined for SageMaker
    sys.argv[1:] = training_args

    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser, modify_parser=modify_parser)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    try:
        logger.info("ENV MASTER_ADDR={}, MASTER_PORT={}".format(os.environ['MASTER_ADDR'],
                                                                os.environ['MASTER_PORT']))
    except KeyError:
        logger.info("ENV MASTER_ADDR and MASTER_PORT not configured! Probably running outside SageMaker.")

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
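A hedged usage sketch: the hyperparameters a SageMaker estimator would pass are forwarded as if they were command-line flags. All values below are placeholders.

if __name__ == '__main__':
    cli_main([
        'data-bin/my_dataset',
        '--arch', 'transformer',
        '--max-tokens', '4000',
        '--optimizer', 'adam',
    ])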
Example No. 13
def cli_main():

    torch.backends.cudnn.deterministic = False
    torch.backends.cudnn.benchmark = True

    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser)

    print(args)
    # if os.path.exists(f"{args.save_dir}/checkpoint_best.pt"):
    # raise Exception('Already Trained!')

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(
            port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            print(
                '| NOTE: you may get better performance with: --ddp-backend=no_c10d'
            )
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
Example No. 14
def cli_main(modify_parser=None):
    """
    Dongxu:
    1) Parse arguments; 2) choose distribution strategy; 3) call main() with args.

    :param modify_parser:
    :return:
    """
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser, modify_parser=modify_parser)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(
            port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            logger.info(
                'NOTE: you may get faster training with: --ddp-backend=no_c10d'
            )
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
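A minimal entry-point sketch, assuming this function lives in a train.py that is executed directly, as in the other examples.

if __name__ == '__main__':
    cli_main()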
Example No. 15
def cli_main(modify_parser=None):
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser, modify_parser=modify_parser)
    print_options_meaning_changes(args)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        if not getattr(args, 'tpu', False):
            # fallback for single node with multiple GPUs
            assert args.distributed_world_size <= torch.cuda.device_count()
            port = random.randint(10000, 20000)
            args.distributed_init_method = 'tcp://localhost:{port}'.format(
                port=port)
            args.distributed_rank = None  # set based on device id
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, ),
                nprocs=args.distributed_world_size,
            )
        else:
            import torch_xla.distributed.xla_multiprocessing as xmp
            torch.multiprocessing.set_sharing_strategy('file_system')
            xmp.spawn(
                fn=distributed_main,
                args=(args, ),
                nprocs=8,  # use all 8 TPU cores
            )
    else:
        # single GPU training
        main(args)
Example No. 16
def cli_main():
    parser = options.get_training_parser()

    parser.add_argument("--adv_sr",          action='store_true', default=False,  help='whether to train with Adv SR') 
    parser.add_argument("--num_cands",       default=9,           type=int,       help='pre-defined number of segmentation candidates')
    parser.add_argument("--src_pert_prob",   default=0.33,        type=float,     help='perturbation ratio for the source sentence')
    parser.add_argument("--tgt_pert_prob",   default=0.33,        type=float,     help='perturbation ratio for the target sentence')
    parser.add_argument("--sp_model",        help='directory to sentencepiece module for pre-segmenting candidates')
    
    args = options.parse_args_and_arch(parser)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            print('| NOTE: you may get better performance with: --ddp-backend=no_c10d')
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
Example No. 17
File: train.py Project: yf1291/nlp4
def cli_main():
    parser = options.get_training_parser()
    parser.add_argument('--do-evaluate', action='store_true', default=False,
                        help='Only do evaluation (for squad)')
    parser.add_argument('--do-layer-decay', action='store_true', default=False,
                        help='Do layer-wise learning rate decay ')
    parser.add_argument('--layer-decay', default=1.0, type=float,
                        help='The coefficient of layer decay')
    args = options.parse_args_and_arch(parser)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            print('| NOTE: you may get better performance with: --ddp-backend=no_c10d')
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
Example No. 18
def cli_main():
    parser = options.get_training_parser()  ## get the training argument parser
    args = options.parse_args_and_arch(parser)

    if args.distributed_init_method is None:  ## distributed_init_method normally keeps its default of None; it configures multi-node training
        # each process receives the GPU id it uses on this node as its local_rank argument when the code is launched
        # infer_init_method sets args.distributed_world_size, the total number of GPUs across all nodes (nnodes * nproc_per_node),
        # and args.distributed_rank, the ID of the current GPU among all GPUs on all nodes
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:  ## multi-node, multi-GPU training
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            ## here the current process would use torch.multiprocessing to spawn one process per GPU,
            ## but torch.distributed.launch already creates one process per GPU by invoking train.py repeatedly,
            ## so it is best to set distributed_no_spawn to True and let each process use a single GPU
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:  ## with distributed_no_spawn set to True, each process created by torch.distributed.launch calls distributed_main directly (a minimal per-process sketch follows this example)
            distributed_main(args.device_id, args)  ## args.device_id is args.local_rank, i.e. the GPU id on the current node
    elif args.distributed_world_size > 1:  ## single-node, multi-GPU training
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            print('| NOTE: you may get better performance with: --ddp-backend=no_c10d')
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
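A minimal sketch, not the fairseq implementation, of the per-process path the comments above describe: with --distributed-no-spawn, each process created by torch.distributed.launch owns exactly one GPU and joins the process group itself. The argument names mirror the fairseq args used above; the function name and backend choice are assumptions.

import torch
import torch.distributed as dist

def single_gpu_worker(local_rank, args):
    args.device_id = local_rank                # GPU id on the current node
    torch.cuda.set_device(args.device_id)
    dist.init_process_group(
        backend='nccl',
        init_method=args.distributed_init_method,
        world_size=args.distributed_world_size,
        rank=args.distributed_rank,
    )
    # ...the training loop for this rank would follow, e.g. main(args)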
Example No. 19
def cli_main_helper(args):
    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)
    if args.distributed_backend == "ccl":
        # with the CCL backend, run training directly and skip the spawn logic below
        main(args, init_distributed=True)
        return
    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        if not getattr(args, "tpu", False):
            # fallback for single node with multiple GPUs
            assert args.distributed_world_size <= torch.cuda.device_count()
            port = random.randint(10000, 20000)
            args.distributed_init_method = "tcp://localhost:{port}".format(
                port=port)
            args.distributed_rank = None  # set based on device id
            torch.multiprocessing.spawn(fn=distributed_main,
                                        args=(args, ),
                                        nprocs=args.distributed_world_size)
        else:
            import torch_xla.distributed.xla_multiprocessing as xmp

            torch.multiprocessing.set_sharing_strategy("file_system")
            xmp.spawn(
                fn=distributed_main,
                args=(args, ),
                nprocs=8  # use all 8 TPU cores
            )
    else:
        # single GPU training
        main(args)
Example No. 20
def cli_main(modify_parser=None):
    parser = options.get_training_parser()
    parser.add_argument('--comet', action='store_true', help='Log results on comet')
    parser.add_argument('--comet-tag', default="",
                        help='Set experiment.set_tag(args.comet_tag), or use an auto-generated tag if this is empty')
    parser.add_argument('--comet-real-tag', default="", type=str, help='Log results on comet')
    parser.add_argument('--comet-project', default='normalizations', type=str, help='Log results on comet')
    parser.add_argument('--api-key', default="", type=str)

    args = options.parse_args_and_arch(parser, modify_parser=modify_parser)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
Example No. 21
def cli_main():
    args = parse()

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(port=port)
        args.distributed_rank = None  # set based on device id
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
Example No. 22
def cli_main():
    parser = options.get_training_parser()
    add_damethod_args(parser)
    args = options.parse_args_and_arch(parser)
    os.makedirs(args.save_dir, exist_ok=True)

    sys.stdout = Logger(args.save_dir + "/log.txt", "w", sys.stdout)

    if args.multidatasource == 'mixed' and args.damethod != "naive":
        args.task = "translation_da"
        assert args.criterion == "cross_entropy_da"
    else:
        args.task = "translation"

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(
            port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            print(
                '| NOTE: you may get better performance with: --ddp-backend=no_c10d'
            )
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
Example No. 23
def cli_main():

    parser = options.get_training_parser()
    parser.add_argument(
        '--config',
        type=str,
        nargs='*',
        help=
        'paths to JSON files of experiment configurations, from high to low priority',
    )
    parser.add_argument('--exp-name',
                        type=str,
                        default='',
                        help='name of the experiment')
    parser.add_argument(
        '--debug',
        default=False,
        action='store_true',
        help='run training in the debugging mode',
    )
    parser.add_argument('--path-attributes',
                        type=str,
                        nargs='*',
                        default=['task', 'arch', 'lr'])
    parser.add_argument(
        '--filter_best_last_ckpts',
        type=str,
        default=False,
        help=
        'whether to filter out checkpoint_best and checkpoint_last from checkpoint list'
    )
    parser.add_argument('--log_valid_progress',
                        type=str,
                        default=False,
                        help='whether to log validation progress')
    pre_parsed_args, unknown = parser.parse_known_args()

    config_dict = {}
    for config_path in pre_parsed_args.config:
        config_dict = update_config(config_dict, compose_configs(config_path))

    parser_modifier = modify_factory(config_dict)

    args = options.parse_args_and_arch(parser, modify_parser=parser_modifier)

    update_namespace(args, config_dict)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(
            port=port)
        args.distributed_rank = None  # set based on device id
        if (args.update_freq is not None and max(args.update_freq) > 1
                and args.ddp_backend != 'no_c10d'):
            logger.info(
                'NOTE: you may get faster training with: --ddp-backend=no_c10d'
            )
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)
Example No. 24
def cli_main():
    parser = options.get_training_parser()
    parser.add_argument(
        "--comet-logging",
        action="store_true",
        help="Whether to use Comet.ML for logging",
    )
    args = options.parse_args_and_arch(parser)

    logging = getattr(args, "comet_logging", False)
    config = None
    if logging:
        PROJECT = "machine-translation"
        if not keyring.get_password("comet", PROJECT):
            comet_ml_api_key = getpass("Please enter the comet.ml API key: ")
            keyring.set_password("comet", PROJECT, comet_ml_api_key)
        else:
            comet_ml_api_key = keyring.get_password("comet", PROJECT)

        experiment = Experiment(
            api_key=comet_ml_api_key,
            project_name="machine-translation",
            workspace="machine-translation",
            auto_output_logging=None,
        )
        config = {
            "api_key": comet_ml_api_key,
            "experiment_key": experiment.get_key()
        }
        print("Proceeding with Comet.ML logging...")

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, config, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args, config)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = "tcp://localhost:{port}".format(
            port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != "no_c10d":
            print(
                "| NOTE: you may get better performance with: --ddp-backend=no_c10d"
            )
        torch.multiprocessing.spawn(fn=distributed_main,
                                    args=(args, config),
                                    nprocs=args.distributed_world_size)
    else:
        # single GPU training
        main(args, config=config)
    if config:
        experiment.end()
Example No. 25
def train_main(alpha, beta, save_path):
    parser = options.get_training_parser()
    input_args = [
        data_set, '--share-decoder-input-output-embed', '--arch',
        'transformer_iwslt_de_en', '--max-tokens', '4000', '--lr', '5e-4',
        '--save-interval', '2', '--max-epoch', '85', '--patience', '5',
        '--optimizer', 'adam', '--adam-betas', '(0.9, 0.98)', '--clip-norm',
        '0.0', '--weight-decay', '0.0001', '--dropout', '0.3',
        '--lr-scheduler', 'inverse_sqrt', '--warmup-updates', '4000',
        '--keep-last-epochs', '4', '--criterion', 'jensen_cross_entropy',
        '--alpha',
        str(alpha), '--beta',
        str(beta), '--use-uniform', '--fp16', '--save-dir', save_path
    ]

    args = options.parse_args_and_arch(parser, input_args=input_args)
    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(
            port=port)
        args.distributed_rank = None  # set based on device id
        if max(args.update_freq) > 1 and args.ddp_backend != 'no_c10d':
            print(
                '| NOTE: you may get better performance with: --ddp-backend=no_c10d'
            )
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )
    else:
        # single GPU training
        main(args)

    ckpts = os.listdir(args.save_dir)
    try:
        ckpts.remove('checkpoint_last.pt')
    except ValueError:
        print("no checkpoint_last.pt in folder", args.save_dir)

    f = open(os.path.join(args.save_dir, "final_entropies.txt"), "a+")
    results = {}
    entropies = {}
    for ckpt in ckpts:
        if '.pt' in ckpt:
            path = os.path.join(args.save_dir, ckpt)
            f.write(path + '\n')
            run_generation(path, results, entropies)

            f.write('{entropy: ' + str(entropies[path]) + ', bleu: ' +
                    str(results[path]) + '}\n')

    f.close()
    return results
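A hypothetical sweep over the jensen_cross_entropy mixing weights alpha and beta; the save directories are placeholders, and the module-level data_set path used inside train_main is assumed to be configured already.

if __name__ == '__main__':
    for alpha, beta in [(0.3, 0.7), (0.5, 0.5)]:
        save_path = 'checkpoints/jensen_a{}_b{}'.format(alpha, beta)
        bleu_by_ckpt = train_main(alpha, beta, save_path)
        print(save_path, bleu_by_ckpt)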
Example No. 26
def distributed_main(i, args):  # signature inferred from the spawn/call sites in the examples above
    import socket
    args.device_id = i
    if args.distributed_rank is None:  # torch.multiprocessing.spawn
        args.distributed_rank = i
    args.distributed_rank = distributed_utils.distributed_init(args)
    print('| initialized host {} as rank {}'.format(socket.gethostname(),
                                                    args.distributed_rank))
    main(args)


if __name__ == '__main__':
    parser = options.get_training_parser()
    args = options.parse_args_and_arch(parser)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(
            port=port)
        args.distributed_rank = None  # set based on device id
        print('''| NOTE: you may get better performance with:

            python -m torch.distributed.launch --nproc_per_node {ngpu} train.py {no_c10d}(...)
            '''.format(
            ngpu=args.distributed_world_size,
            no_c10d=('--ddp-backend=no_c10d ' if max(args.update_freq) > 1 else ''),  # reconstructed; the source listing is truncated here
        ))
        # the remaining (truncated) lines spawn one worker per GPU and fall back to
        # single-GPU training, following the same pattern as the other examples
Example No. 27
def cli_main():

    parser = options.get_training_parser()
    parser.add_argument(
        '--config',
        type=str,
        nargs='*',
        help=
        'paths to JSON files of experiment configurations, from high to low priority',
    )
    parser.add_argument('--exp-name',
                        type=str,
                        default='',
                        help='name of the experiment')
    parser.add_argument(
        '--debug',
        default=False,
        action='store_true',
        help='run training in the debugging mode',
    )
    parser.add_argument('--path-attributes',
                        type=str,
                        nargs='*',
                        default=['task', 'arch', 'lr'])
    parser.add_argument('--torch-file-system', action='store_true')

    pre_parsed_args, unknown = parser.parse_known_args()

    config_dict = {}
    for config_path in pre_parsed_args.config:
        config_dict = update_config(config_dict, compose_configs(config_path))

    parser_modifier = modify_factory(config_dict)

    args = options.parse_args_and_arch(parser, modify_parser=parser_modifier)

    update_namespace(args, config_dict)

    # set sharing strategy file system in case /dev/shm/ limits are small
    if args.torch_file_system:
        torch.multiprocessing.set_sharing_strategy('file_system')

    training_name = get_training_name(args)
    base_save_dir = generate_save_dir(args, training_name, sys.argv[1:])
    setattr(args, 'training_name', training_name)
    setattr(args, 'save_dir', os.path.join(base_save_dir, 'checkpoints'))
    setattr(args, 'tensorboard_logdir',
            os.path.join(base_save_dir, 'tensorboard'))

    save_config(vars(args), base_save_dir)

    if args.distributed_init_method is None:
        distributed_utils.infer_init_method(args)

    if args.distributed_init_method is not None:
        # distributed training
        if torch.cuda.device_count() > 1 and not args.distributed_no_spawn:
            start_rank = args.distributed_rank
            args.distributed_rank = None  # assign automatically
            torch.multiprocessing.spawn(
                fn=distributed_main,
                args=(args, start_rank),
                nprocs=torch.cuda.device_count(),
            )
        else:
            distributed_main(args.device_id, args)
    elif args.distributed_world_size > 1:
        # fallback for single node with multiple GPUs
        assert args.distributed_world_size <= torch.cuda.device_count()
        port = random.randint(10000, 20000)
        args.distributed_init_method = 'tcp://localhost:{port}'.format(
            port=port)
        args.distributed_rank = None  # set based on device id
        if (args.update_freq is not None and max(args.update_freq) > 1
                and args.ddp_backend != 'no_c10d'):
            logger.info(
                'NOTE: you may get faster training with: --ddp-backend=no_c10d'
            )
        torch.multiprocessing.spawn(
            fn=distributed_main,
            args=(args, ),
            nprocs=args.distributed_world_size,
        )

    else:
        # single GPU training
        main(args)