import json
import logging
import os
import random
import sys
import time

import numpy as np
import torch
import torch.distributed as dist

# Project-local helpers used below (get_arguments, Logger, get_sample_writer)
# and BertTokenizer are assumed to be imported from the surrounding project.


def construct_arguments():
    args = get_arguments()

    # Prepare Logger
    logger = Logger(cuda=torch.cuda.is_available() and not args.no_cuda)
    args.logger = logger
    config = json.load(open(args.config_file, 'r', encoding='utf-8'))

    # Choose dataset and training config based on the given sequence length
    seq_len = str(args.max_seq_length)
    datasets = config["data"]["mixed_seq_datasets"][seq_len]
    del config["data"]["mixed_seq_datasets"]
    training = config["mixed_seq_training"][seq_len]
    del config["mixed_seq_training"]
    config["data"]["datasets"] = datasets
    config["training"] = training

    args.config = config
    args.job_name = config['name'] if args.job_name is None else args.job_name
    print("Running Config File: ", args.job_name)
    # Setting the distributed variables
    print("Args = {}".format(args))

    # Setting all the seeds so that the task is random but same across processes
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    os.makedirs(args.output_dir, exist_ok=True)

    args.saved_model_path = os.path.join(args.output_dir, "saved_models/",
                                         args.job_name)

    args.n_gpu = 1

    # Loading Tokenizer
    tokenizer = BertTokenizer.from_pretrained(config["bert_token_file"])
    args.tokenizer = tokenizer

    # Set validation dataset path
    if args.validation_data_path_prefix is None:
        logging.warning(
            'Skipping validation because validation_data_path_prefix is unspecified'
        )

    # Issue warning if early exit from epoch is configured
    if args.max_steps < sys.maxsize:
        logging.warning(
            'Early training exit is set after {} global steps'.format(
                args.max_steps))

    if args.max_steps_per_epoch < sys.maxsize:
        logging.warning(
            'Early epoch exit is set after {} global steps'.format(
                args.max_steps_per_epoch))

    return args
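# Illustrative only: a minimal config shape for the mixed-sequence variant above,
# inferred from the keys construct_arguments() reads ("name", "bert_token_file",
# "data" -> "mixed_seq_datasets", and "mixed_seq_training"). The paths and
# hyperparameter names inside each block are hypothetical placeholders, not the
# project's actual schema.
_EXAMPLE_MIXED_SEQ_CONFIG = {
    "name": "bert_pretrain_mixed_seq",
    "bert_token_file": "bert-base-uncased",  # passed to BertTokenizer.from_pretrained
    "data": {
        "mixed_seq_datasets": {
            # Keyed by str(max_seq_length); the chosen entry is re-keyed as
            # config["data"]["datasets"].
            "128": {"wiki_pretrain_dataset": "data/seq128"},  # placeholder paths
            "512": {"wiki_pretrain_dataset": "data/seq512"},
        }
    },
    "mixed_seq_training": {
        # Keyed by str(max_seq_length); the chosen entry becomes config["training"].
        "128": {"total_training_steps": 200000},  # placeholder hyperparameters
        "512": {"total_training_steps": 20000},
    },
}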
def construct_arguments():
    args = get_arguments()

    # Prepare Logger
    logger = Logger(cuda=torch.cuda.is_available() and not args.no_cuda)
    args.logger = logger
    config = json.load(open(args.config_file, 'r', encoding='utf-8'))
    args.config = config

    args.job_name = config['name'] if args.job_name is None else args.job_name
    print("Running Config File: ", args.job_name)
    # Setting the distributed variables
    print("Args = {}".format(args))

    # Setting all the seeds so that the task is random but same across processes
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    os.makedirs(args.output_dir, exist_ok=True)

    args.saved_model_path = os.path.join(args.output_dir, "saved_models/",
                                         args.job_name)

    args.n_gpu = 1

    # Loading Tokenizer
    tokenizer = BertTokenizer.from_pretrained(config["bert_token_file"])
    args.tokenizer = tokenizer

    # Issue warning if early exit from epoch is configured
    if args.max_steps < sys.maxsize:
        logging.warning(
            'Early training exit is set after {} global steps'.format(
                args.max_steps))

    if args.max_steps_per_epoch < sys.maxsize:
        logging.warning(
            'Early epoch exit is set after {} global steps'.format(
                args.max_steps_per_epoch))

    return args
def construct_arguments():
    args = get_arguments()

    # Prepare Logger
    logger = Logger(cuda=torch.cuda.is_available() and not args.no_cuda)
    args.logger = logger
    config = json.load(open(args.config_file, 'r', encoding='utf-8'))
    args.config = config

    job_name = config['name'] if args.job_name is None else args.job_name
    print("Running Config File: ", job_name)
    # Setting the distributed variables
    print("Args = {}".format(args))

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs
        start_time = time.time()
        torch.distributed.init_process_group(backend='nccl')
        end_time = time.time()
        logger.info("Init_process_group takes %f sec" %
                    (end_time - start_time))

    if args.fp16:
        logger.info(
            "16-bits distributed training not officially supported but seems to be working."
        )
        args.fp16 = True  # (see https://github.com/pytorch/pytorch/pull/13496)

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    # Setting all the seeds so that the task is random but same across processes
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    # if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
    #     raise ValueError("Output directory () already exists and is not empty.")

    os.makedirs(args.output_dir, exist_ok=True)

    args.saved_model_path = os.path.join(args.output_dir, "saved_models/",
                                         job_name)

    # Prepare Summary Writer and saved_models path
    if (not args.no_cuda and dist.get_rank() == 0) or (args.no_cuda
                                                       and args.local_rank == -1):
        summary_writer = get_sample_writer(name=job_name, base=args.output_dir)
        args.summary_writer = summary_writer
        os.makedirs(args.saved_model_path, exist_ok=True)

    # Set device
    args.device = device
    args.n_gpu = n_gpu

    # Loading Tokenizer
    tokenizer = BertTokenizer.from_pretrained(config["bert_token_file"])
    args.tokenizer = tokenizer

    # Issue warning if early exit from epoch is configured
    if args.max_steps < sys.maxsize:
        logging.warning(
            'Early training exit is set after {} global steps'.format(
                args.max_steps))

    if args.max_steps_per_epoch < sys.maxsize:
        logging.warning(
            'Early epoch exit is set after {} global steps'.format(
                args.max_steps_per_epoch))

    return args
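# Hypothetical usage sketch (not part of the original file): how the distributed
# variant above is typically driven from a per-process training entry point. The
# launch command in the comment and the train(...) call are assumptions; only the
# attributes read from `args` (logger, device, n_gpu, saved_model_path) are ones
# construct_arguments() itself populates.
#
# Usually launched once per GPU with something like:
#   python -m torch.distributed.launch --nproc_per_node=8 train.py \
#       --config_file bert_config.json --output_dir ./out ...
if __name__ == "__main__":
    args = construct_arguments()
    args.logger.info("device: {}, n_gpu: {}, saving models to {}".format(
        args.device, args.n_gpu, args.saved_model_path))
    # train(args)  # placeholder: hand the fully populated namespace to the training loop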