def multi_train(args):
    init_logger()
    gpu_number = args.world_size
    mp = torch.multiprocessing.get_context('spawn')
    # Create a queue and a handler thread to listen for errors in the
    # child processes.
    error_queue = mp.SimpleQueue()
    error_handler = ErrorHandler(error_queue)
    # Train with multiprocessing.
    procs = []
    for i in range(gpu_number):
        device_id = i
        procs.append(
            mp.Process(target=run,
                       args=(args, device_id, error_queue),
                       daemon=True))
        procs[i].start()
        logger.info("Starting process pid: {:d}".format(procs[i].pid))
        error_handler.add_child(procs[i].pid)
    for p in procs:
        p.join()

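# `ErrorHandler` above is referenced but not defined in this section. A
# minimal sketch of what it is assumed to do (modeled on the common
# OpenNMT-style handler): a daemon thread blocks on the shared error queue
# and, when a child reports a traceback, terminates the registered children
# and surfaces the error in the parent. Hypothetical implementation, not
# necessarily the project's actual class.
import os
import signal
import sys
import threading


class ErrorHandler(object):
    def __init__(self, error_queue):
        self.error_queue = error_queue
        self.children_pids = []
        # Daemon thread: it must not keep the parent alive on shutdown.
        self.error_thread = threading.Thread(target=self.error_listener,
                                             daemon=True)
        self.error_thread.start()

    def add_child(self, pid):
        self.children_pids.append(pid)

    def error_listener(self):
        # Blocks until any child puts (rank, traceback) on the queue.
        rank, original_trace = self.error_queue.get()
        for pid in self.children_pids:
            os.kill(pid, signal.SIGINT)
        sys.stderr.write("Child process (gpu_rank {}) failed:\n{}".format(
            rank, original_trace))
        # Interrupt the parent's main thread as well.
        os.kill(os.getpid(), signal.SIGINT)
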
def build_trainer(args, device_id, model, optim):
    grad_accum_count = args.grad_accum_count
    n_gpu = args.world_size
    if device_id >= 0:
        gpu_rank = int(args.gpu_ranks[device_id])
    else:
        gpu_rank = 0
        n_gpu = 0
    logger.info("gpu_rank %d" % gpu_rank)

    tensorboard_log_dir = args.tensorboard_log_dir
    writer = SummaryWriter(tensorboard_log_dir,
                           comment='bert_coherence_measurement')
    report_manager = ReportMgr(args.report_every,
                               start_time=-1,
                               tensorboard_writer=writer)
    trainer = Trainer(args,
                      model,
                      optim,
                      grad_accum_count,
                      n_gpu,
                      gpu_rank,
                      report_manager=report_manager)
    if model:
        n_params = _tally_parameters(model)
        logger.info("* number of parameters: {:d}".format(n_params))
    return trainer

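# `_tally_parameters` is used above but defined elsewhere. A minimal sketch,
# assuming it simply counts the model's parameters:
def _tally_parameters(model):
    return sum(p.numel() for p in model.parameters())
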
def write_mapping(params):
    init_logger(
        "/sdc/xli/Datasets/cnn_daily/data_nsp/shard/mapping/mapping.log")
    paths, save_file = params
    with open(save_file, 'w') as file:
        for path in paths:
            file.write(path + "\n")
    logger.info(
        "{:d} file paths written to mapping file".format(len(paths)))

def run(device_id, gpu_ranks, world_size, args):
    gpu_rank = multi_init(device_id, world_size, gpu_ranks)
    logger.info("GPU Rank: gpu_rank {:d}".format(gpu_rank))
    if gpu_rank != gpu_ranks[device_id]:
        raise AssertionError(
            "An error occurred during distributed initialization")
    n_gpu = world_size
    train(args, 10000, n_gpu, device_id, gpu_rank)

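# `multi_init` is assumed to wrap torch.distributed process-group setup; a
# hedged sketch, where the rendezvous address and the nccl backend are
# assumptions:
import torch.distributed


def multi_init(device_id, world_size, gpu_ranks):
    dist_init_method = 'tcp://localhost:10000'  # hypothetical rendezvous URL
    torch.distributed.init_process_group(backend='nccl',
                                         init_method=dist_init_method,
                                         world_size=world_size,
                                         rank=gpu_ranks[device_id])
    gpu_rank = torch.distributed.get_rank()
    torch.cuda.set_device(device_id)
    return gpu_rank
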
def save_json(save_path, file_id, samples):
    init_logger()
    for i, sample in enumerate(samples):
        save_ = os.path.join(save_path, "{:s}_{:d}.json".format(file_id, i))
        with open(save_, 'w') as file:
            json.dump(sample, file)
        logger.info("{:s} saved at {:s}".format(save_, save_path))

def _lazy_load_dataset(pt_file, corpus_type):
    dataset = torch.load(pt_file, map_location=torch.device('cpu'))
    logger.info(
        "Loading {:s} dataset from {:s}, number of examples: {:d}".format(
            corpus_type, pt_file, len(dataset)))
    return dataset

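# `load_dataset` (consumed by `single_train` below via DataLoaderBert) is
# assumed to iterate lazily over the sharded .pt files and defer to
# `_lazy_load_dataset`; a sketch in the BertSum style, where the glob
# pattern is an assumption:
def load_dataset(args, corpus_type, shuffle=True):
    assert corpus_type in ["train", "valid", "test"]
    pts = sorted(
        glob.glob(
            os.path.join(args.data_path, "*_{:s}_*.pt".format(corpus_type))))
    if shuffle:
        random.shuffle(pts)
    for pt in pts:
        yield _lazy_load_dataset(pt, corpus_type)
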
def shard(self):
    init_logger()

    def clear_json_files(root_path):
        # Delete every json file under root_path before writing new shards.
        for f in glob.glob(os.path.join(root_path, "*.json")):
            file_path = pathlib.Path(f)
            if file_path.exists():
                os.unlink(file_path)

    pairs_train_mapping, pairs_test_mapping = \
        self.args.pairs_train_mapping, self.args.pairs_test_mapping
    train_files, test_files = map(self.read_mapping,
                                  (pairs_train_mapping, pairs_test_mapping))
    divided_corpus = {'train': train_files, 'test': test_files}
    # Delete all files under the save_path before writing.
    clear_json_files(self.args.save_path)

    pool = Pool(mp.cpu_count())
    for corpus_type in ['train', 'test']:
        files = divided_corpus.get(corpus_type)
        dataset = []
        file_no = 0
        for d in pool.imap_unordered(self.load_pairs, files):
            if d is None:
                continue
            dataset.append(d)
            if len(dataset) > self.args.shard_size:
                pt_file = os.path.join(
                    self.args.save_path,
                    "{:s}/cd_{:s}_{:d}.json".format(corpus_type, corpus_type,
                                                    file_no))
                with open(pt_file, 'w') as save:
                    save.write(json.dumps(dataset))
                logger.info("cd_{:s}_{:d}.json saved at {:s}/{:s}.".format(
                    corpus_type, file_no, self.args.save_path, corpus_type))
                file_no += 1
                dataset = []
        if len(dataset) > 0:
            pt_file = os.path.join(
                self.args.save_path,
                "{:s}/cd_{:s}_{:d}.json".format(corpus_type, corpus_type,
                                                file_no))
            with open(pt_file, 'w') as save:
                save.write(json.dumps(dataset))
            file_no += 1
    pool.close()
    pool.join()
    logger.info("Shard task is finished!")

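# `read_mapping` and `load_pairs` are used by both `shard` variants but are
# not shown here. Given that `write_mapping` stores one path per line and
# `save_pair` stores {"pair": ..., "coherence": ...} dicts, a minimal sketch:
def read_mapping(self, mapping_file):
    with open(mapping_file, 'r') as file:
        return [line.strip() for line in file if line.strip()]


def load_pairs(self, path):
    # Returns the sample dict written by `save_pair`, or None on bad files
    # so the caller can skip them.
    try:
        with open(path, 'r') as file:
            return json.load(file)
    except (OSError, json.JSONDecodeError):
        return None
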
def delete_tgt():
    init_logger()
    root_path = "/sdc/xli/Datasets/cnn_daily/tgts"
    for root, dirs, file_list in os.walk(root_path):
        for file in file_list:
            file_path = os.path.join(root, file)
            os.unlink(file_path)
            logger.info("{:s} deleted from {:s}".format(file, root))
    os.removedirs(root_path)
    logger.info("{:s} dir deleted.".format(root_path))

def check_and_delete(self, path):
    init_logger()
    for f in glob.glob(os.path.join(path, "*.json")):
        file_path = pathlib.Path(f)
        if file_path.exists():
            os.unlink(file_path)
            logger.info("{:s} deleted from {:s}".format(f, path))

def save_pair(pairs, coherence, mark, file_id, save_path):
    init_logger()
    if len(pairs) > 0:
        for i, pair in enumerate(pairs):
            pair_dict = {"pair": pair, "coherence": coherence}
            save_file = os.path.join(
                save_path, "{:s}_{:s}_{:d}.json".format(file_id, mark, i))
            with open(save_file, 'w') as file:
                json.dump(pair_dict, file)
            logger.info("{:s} saved".format(save_file))

def inner(self, locator, value=""):
    try:
        return method(self, locator, value)
    except Exception as e:
        # A popup may be blocking the target element: dismiss the first
        # known popup we can find, then retry once.
        for popup in self._popup_list:
            ret = self.find_all(popup)
            if ret:
                logger.info(f'found popup {popup}')
                ret[0].click()
                return method(self, locator, value)
        raise e

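# `inner` above reads like the wrapper of a popup-handling decorator. A
# hedged sketch of the presumed surrounding code; the decorator name
# `handle_popup` and the client class are hypothetical:
import functools


def handle_popup(method):
    @functools.wraps(method)
    def inner(self, locator, value=""):
        try:
            return method(self, locator, value)
        except Exception as e:
            for popup in self._popup_list:
                ret = self.find_all(popup)
                if ret:
                    logger.info(f'found popup {popup}')
                    ret[0].click()
                    return method(self, locator, value)
            raise e
    return inner


class Page:
    _popup_list = ["ok_button", "close_button"]  # hypothetical locators

    @handle_popup
    def click(self, locator, value=""):
        ...
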
def _save(self, step):
    real_model = self.model
    model_state_dict = real_model.state_dict()
    checkpoint = {'model': model_state_dict, 'optim': self.optim}
    checkpoint_path = os.path.join(self.args.model_path,
                                   'model_step_{:d}.pt'.format(step))
    logger.info('Saving checkpoint {:s}'.format(checkpoint_path))
    if not os.path.exists(checkpoint_path):
        torch.save(checkpoint, checkpoint_path)
        return checkpoint, checkpoint_path

def train(self, train_iter_method, train_steps):
    logger.info("Start training...")
    step = self.optim._step + 1
    true_batches = []
    accum, normalization = 0, 0
    train_iter = train_iter_method()
    # Statistics() initializes loss=0, n_docs=0, start_time=time.time().
    total_stats = Statistics()
    report_stats = Statistics()
    self._start_report_manager(start_time=total_stats.start_time)

    while step <= train_steps:
        for i, batch in enumerate(train_iter):
            # In the multi-GPU case, each rank only trains on its own
            # slice of the batches.
            if self.n_gpu == 0 or (i % self.n_gpu == self.gpu_rank):
                true_batches.append(batch)
                normalization = len(true_batches)
                accum += 1
                if accum == self.grad_accum_count:
                    self._gradient_accumulation(true_batches, normalization,
                                                total_stats, report_stats)
                    report_stats = self._maybe_report_training(
                        step, train_steps, self.optim.learning_rate,
                        report_stats)
                    true_batches = []
                    accum, normalization = 0, 0
                    if (step % self.save_checkpoint_step == 0
                            and self.gpu_rank == 0):
                        self._save(step)
                    step += 1
                    if step > train_steps:
                        break
        train_iter = train_iter_method()
    return total_stats

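# `_gradient_accumulation` is called above but not shown. A minimal sketch of
# the usual pattern (forward/backward per micro-batch, one optimizer step per
# accumulation window); the loss call, the Statistics update API, and the
# all-reduce helper are assumptions:
def _gradient_accumulation(self, true_batches, normalization, total_stats,
                           report_stats):
    if self.grad_accum_count > 1:
        self.model.zero_grad()
    for batch in true_batches:
        if self.grad_accum_count == 1:
            self.model.zero_grad()
        loss = self.model(batch)  # assumed to return a scalar training loss
        (loss / normalization).backward()
        batch_stats = Statistics(loss=loss.item(), n_docs=1)
        total_stats.update(batch_stats)
        report_stats.update(batch_stats)
        if self.grad_accum_count == 1:
            self.optim.step()
    if self.grad_accum_count > 1:
        if self.n_gpu > 1:
            # Average gradients across ranks before the shared step.
            grads = [p.grad.data for p in self.model.parameters()
                     if p.requires_grad and p.grad is not None]
            distributed.all_reduce_and_rescale_tensors(grads, float(1))
        self.optim.step()
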
def run(args, device_id, error_queue):
    setattr(args, 'gpu_ranks', [int(i) for i in args.gpu_ranks])
    try:
        gpu_rank = distributed.multi_init(device_id, args.world_size,
                                          args.gpu_ranks)
        logger.info("GPU Rank: gpu_rank {:d}".format(gpu_rank))
        if gpu_rank != args.gpu_ranks[device_id]:
            raise AssertionError(
                "An error occurred during distributed initialization")
        single_train(args, device_id)
    except KeyboardInterrupt:
        pass  # killed by the error handler
    except Exception:
        # Propagate the exception to the parent process, keeping the
        # traceback.
        import traceback
        error_queue.put((args.gpu_ranks[device_id], traceback.format_exc()))

def output(self, step, num_steps, learning_rate, start):
    """Write out statistics to stdout.

    Args:
        step (int): current step
        num_steps (int): total number of steps
        learning_rate (float): current learning rate
        start (float): start time of the step
    """
    t = self.elapsed_time()
    step_fmt = "%2d" % step
    if num_steps > 0:
        step_fmt = "%s/%5d" % (step_fmt, num_steps)
    logger.info(
        ("Step %s; xent: %4.4f; " + "lr: %7.7f; %3.0f docs/s; %6.0f sec") %
        (step_fmt, self.xent(), learning_rate, self.n_docs /
         (t + 1e-5), time.time() - start))
    sys.stdout.flush()

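# `output` above reads like a method of the `Statistics` accumulator used in
# `train`. A minimal sketch of the assumed state (loss=0, n_docs=0,
# start_time=time.time(), per the comment in `train`) and the helpers it
# relies on; the exact fields and the xent definition are assumptions:
import time


class Statistics(object):
    def __init__(self, loss=0.0, n_docs=0):
        self.loss = loss
        self.n_docs = n_docs
        self.start_time = time.time()

    def update(self, stat):
        self.loss += stat.loss
        self.n_docs += stat.n_docs

    def elapsed_time(self):
        return time.time() - self.start_time

    def xent(self):
        # Cross-entropy averaged over the documents seen so far.
        return self.loss / max(self.n_docs, 1)
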
def _format_to_bert_one_sample(self, params):
    init_logger(
        "/sdc/xli/Datasets/cnn_daily/data_nsp/logs/_format_to_bert_one_sample.log"
    )
    tokenizer, file, save_file, sample_type = params
    with open(file, 'r') as json_file:
        sample = json.load(json_file)
    pair, coherence = sample['pair'], sample['coherence']
    if isinstance(pair, list) and len(pair) > 0 \
            and isinstance(pair[0][0], type(pair[1][0])):
        os.environ["CUDA_VISIBLE_DEVICES"] = "1"
        encode = tokenizer(pair[0],
                           pair[1],
                           return_tensors='pt',
                           is_pretokenized=True)
        if encode['input_ids'].numel() <= self.args.bert_max_position:
            sample_dict = {
                'input_ids': encode['input_ids'],
                'token_type_ids': encode['token_type_ids'],
                'attention_mask': encode['attention_mask']
            }
            sample_tuple = (sample_dict, coherence)
            torch.save(sample_tuple, save_file)
            logger.info("{:s} has been converted and saved at {:s}".format(
                file, save_file))
            file_name = file.split("/")[-1]
            dst_file = os.path.join(
                "/sdc/xli/Datasets/cnn_daily/data_nsp/pts_and_back/processed",
                "{:s}/{:s}".format(sample_type, file_name))
            shutil.move(file, dst_file)
            logger.info("{:s} has been moved to {:s}".format(
                file_name, dst_file))
    gc.collect()

def shard(self):
    init_logger("/sdc/xli/Datasets/cnn_daily/data_nsp/logs/shard.log")
    pairs_train_mapping, pairs_test_mapping, pairs_valid_mapping = \
        self.args.pairs_train_mapping, self.args.pairs_test_mapping, \
        self.args.pairs_valid_mapping
    train_files = self.read_mapping(pairs_train_mapping)
    test_files = self.read_mapping(pairs_test_mapping)
    valid_files = self.read_mapping(pairs_valid_mapping)
    divided_corpus = {
        'train': train_files,
        'test': test_files,
        'valid': valid_files
    }
    pool = Pool(mp.cpu_count())
    for corpus_type in ['train', 'test', 'valid']:
        files = divided_corpus.get(corpus_type)
        dataset = []
        file_no = 0
        for d in pool.imap_unordered(self.load_pairs, files):
            if d is None:
                continue
            dataset.append(d)
            if len(dataset) >= self.args.shard_size:
                pt_file = os.path.join(
                    self.args.save_path,
                    "{:s}/cd_{:s}_{:d}.json".format(corpus_type, corpus_type,
                                                    file_no))
                with open(pt_file, 'w') as save:
                    save.write(json.dumps(dataset))
                logger.info("{:s} saved at {:s}/{:s}".format(
                    pt_file.split("/")[-1], self.args.save_path, corpus_type))
                file_no += 1
                dataset = []
        if len(dataset) > 0:
            pt_file = os.path.join(
                self.args.save_path,
                "{:s}/cd_{:s}_{:d}.json".format(corpus_type, corpus_type,
                                                file_no))
            with open(pt_file, 'w') as save:
                save.write(json.dumps(dataset))
            logger.info("{:s} saved at {:s}/{:s}".format(
                pt_file.split("/")[-1], self.args.save_path, corpus_type))
            file_no += 1
    pool.close()
    pool.join()
    logger.info("Shard task is finished!")

def single_train(args, device_id):
    init_logger(args.log_file)
    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)

    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True
    if device_id >= 0:
        # Use the specified GPU.
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    if args.train_from != '':
        logger.info('Loading checkpoint from %s' % args.train_from)
        checkpoint = torch.load(args.train_from,
                                map_location=lambda storage, loc: storage)
    else:
        checkpoint = None

    def train_iter_method():
        return DataLoaderBert(load_dataset(args, 'train', shuffle=True),
                              args.batch_size,
                              shuffle=True,
                              is_test=False)

    model = NextSentencePrediction(args, device, checkpoint)
    optim = build_optim(args, model, checkpoint)
    logger.info(model)
    trainer = build_trainer(args, device_id, model, optim)
    trainer.train(train_iter_method, args.train_steps)

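# A presumed entry point tying `single_train` and `multi_train` together;
# the dispatch logic and the `parse_args` wrapper are assumptions based on
# the attributes referenced above:
if __name__ == '__main__':
    args = parse_args()  # hypothetical argparse wrapper
    if args.world_size > 1:
        multi_train(args)
    else:
        device_id = 0 if args.visible_gpus != '-1' else -1
        single_train(args, device_id)
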
def _format_to_bert(self, params):
    init_logger(
        "/sdc/xli/Datasets/cnn_daily/data_nsp/logs/_format_to_bert_one_sample.log"
    )
    tokenizer, mapping_file, save_file = params
    logger.info("Processing {:s}".format(mapping_file))
    with open(mapping_file, 'r') as m_file:
        json_paths = [line.strip() for line in m_file.readlines()]
    samples = []
    for json_file in json_paths:
        with open(json_file, 'r') as j_file:
            sample = json.load(j_file)
        pair = sample['pair']
        label = sample['coherence']
        try:
            encode = tokenizer(pair[0],
                               pair[1],
                               return_tensors='pt',
                               is_pretokenized=True)
            if encode['input_ids'].numel() <= self.args.bert_max_position:
                sample_dict = {
                    'input_ids': encode['input_ids'].to('cuda'),
                    'token_type_ids': encode['token_type_ids'].to('cuda'),
                    'attention_mask': encode['attention_mask'].to('cuda')
                }
                samples.append((sample_dict, label))
            else:
                logger.info("Skipping over-long sample, length: {}".format(
                    encode['input_ids'].numel()))
        except ValueError:
            logger.warning("Value Error! And your data is {}".format(pair))
    torch.save(samples, save_file)
    logger.info("{:s} has been converted and saved at {:s}".format(
        mapping_file, save_file))
    del samples
    gc.collect()

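# Note: `is_pretokenized` matches the transformers 3.x tokenizer API; on
# transformers >= 4.0 the same flag is spelled `is_split_into_words`, so a
# version-tolerant call might look like this (sketch, not from the source):
# encode = tokenizer(pair[0], pair[1], return_tensors='pt',
#                    is_split_into_words=True)
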
def log(self, *args, **kwargs):
    logger.info(*args, **kwargs)

def _lazy_load_dataset(json_file, corpus_type):
    with open(json_file, 'r') as file:
        dataset = json.load(file)
    logger.info(
        "Loading {:s} dataset from {:s}, number of examples: {:d}".format(
            corpus_type, json_file, len(dataset)))
    return dataset

def save(self, sample):
    init_logger()
    with open(self.args.save_file, 'a+') as file:
        json.dump(sample, file)
    logger.info("Sample appended to {:s}".format(self.args.save_file))