def __init__(self, root_dir, ids_file, mode='train', length=None):
    '''
    root_dir: folder with the data stored as json files (gpt2_1024_data)
    ids_file: json file with the train/valid/test id splits
    mode: one of 'train', 'valid' or 'test'
    length: optional cap on the number of examples
    '''
    self.root_dir = root_dir
    self.tokenizer = add_special_tokens()
    self.pad = self.tokenizer.encode(self.tokenizer.pad_token)
    self.files = np.sort([x for x in os.listdir(root_dir) if x.endswith('.json')])
    self.mode = mode
    with open(ids_file, 'r') as f:
        self.data = json.load(f)
    if mode == 'train':
        self.idxs = self.data['train_ids']
    elif mode == 'valid':
        self.idxs = self.data['valid_ids']
    else:
        self.idxs = self.data['test_ids']
    if length is None:
        self.len = len(self.idxs)
    else:
        self.len = length
def __init__(self, root_dir, ids_file, mode='train', length=None):
    self.root_dir = root_dir
    self.tokenizer = add_special_tokens()
    with open(ids_file, 'r') as f:
        data = json.load(f)
    if mode == 'train':
        self.idxs = data['train_ids']
    elif mode == 'valid':
        self.idxs = data['valid_ids']
    else:
        self.idxs = data['test_ids']
    if length is None:
        self.len = len(self.idxs)
    else:
        self.len = length
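# For reference, the dataset class above is used the same way the training and PPO
# scripts below use it: build one instance per split and wrap it in a DataLoader.
# A minimal usage sketch (paths follow the argparse defaults used later):
from torch.utils.data import DataLoader, RandomSampler

train_dataset = GPT21024Dataset('./CNN-DM/gpt2_1024_data', './CNN-DM/ids.json', mode='train')
train_loader = DataLoader(train_dataset,
                          sampler=RandomSampler(train_dataset),
                          batch_size=1, num_workers=2)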
def main(file_names, directory):
    """
    Reads txt files, extracts articles and summaries, tokenizes them and saves them as json files.
    Args:
        file_names: list, all the articles with total no of tokens less than 1024
        directory: string, directory where the files in file_names are stored
    """
    tokenizer = add_special_tokens()
    print("Execution Started...")
    train_ids = []
    file_id_map = {}
    i = 0
    for file in file_names:
        file = os.path.join(os.getcwd(), directory, file)
        with open(file, 'r', encoding='utf-8') as f:
            lines = f.read().split('\n\n')
        article, abstract = get_art_abs(lines)
        article, abstract = tokenizer.encode(article), tokenizer.encode(abstract)
        if len(article) > 0 and len(abstract) > 0:  # and (len(article) + len(abstract)) <= 1023
            if len(article) > 923:
                article = article[:923]
            if len(abstract) > 100:
                abstract = abstract[:100]
            train_ids.append(i)
            write_json(i, article, abstract)
            file_id_map[i] = os.path.basename(file).replace('.story', '')
            i += 1
            if i % 100 == 0:
                print(i, " files written")

    x, y = int(len(train_ids) * 0.8), int(len(train_ids) * 0.9)
    valid_ids = train_ids[x:y]
    test_ids = train_ids[y:]
    train_ids = train_ids[:x]
    with open("ids.json", 'w') as f:
        js = dict()
        js['train_ids'] = train_ids
        js['valid_ids'] = valid_ids
        js['test_ids'] = test_ids
        json.dump(js, f)

    # file_id_map maps the json file ids to actual cnn/dm file names ending with ".story"
    print("saving file_id_map...")
    with open("file_id_map.pickle", 'wb') as f:
        pickle.dump(file_id_map, f)
    print("file_id_map saved.")
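# write_json and get_art_abs are helper functions that are not shown in this section.
# A plausible write_json would simply dump the tokenized (article, abstract) pair into
# the per-example json file that GPT21024Dataset later reads; the output directory and
# key names below are assumptions, not the original implementation.
def write_json(i, article, abstract):
    """Hypothetical helper: persist one tokenized (article, abstract) pair as <i>.json."""
    example = {'article': article, 'abstract': abstract}
    with open(os.path.join('gpt2_1024_data', '{}.json'.format(i)), 'w') as f:
        json.dump(example, f)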
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--lr", default=5e-5, type=float, required=False, help="learning rate")
    parser.add_argument("--seed", default=42, type=int, required=False, help="seed to replicate results")
    parser.add_argument("--num_workers", default=4, type=int, required=False, help="num of cpus available")
    parser.add_argument("--device", default=3, required=False, help="torch.device object")
    parser.add_argument("--output_dir", default='./output', type=str, required=True, help="path to save evaluation results")
    parser.add_argument("--model_dir", default='./weights', type=str, required=True, help="path to save trained model")
    parser.add_argument("--root_dir", default='./CNN-DM/gpt2_1024_data', type=str, help="location of json dataset.")
    parser.add_argument("--ids_file", default='./CNN-DM/ids.json', type=str, help="location of train, valid and test file indexes")
    all_args = parser.parse_args()

    dataset = GPT21024Dataset(all_args.root_dir, all_args.ids_file, mode='test')
    tokenizer = add_special_tokens()
    model = GPT2LMHeadModel.from_pretrained(all_args.model_dir)
    all_args.device = torch.device('cuda:' + str(all_args.device))
    model.to(all_args.device)
    test(all_args, model, tokenizer, dataset)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--lr", default=5e-5, type=float, required=False, help="learning rate")
    parser.add_argument("--seed", default=42, type=int, required=False, help="seed to replicate results")
    parser.add_argument("--n_gpu", default=1, type=int, required=False, help="no of gpu available")
    parser.add_argument("--gradient_accumulation_steps", default=4, type=int, required=True, help="gradient_accumulation_steps")
    parser.add_argument("--batch_size", default=1, type=int, required=True, help="batch_size")
    parser.add_argument("--num_workers", default=2, type=int, required=False, help="num of cpus available")
    parser.add_argument("--device", default=0, required=False, help="torch.device object")
    parser.add_argument("--num_train_epochs", default=5, type=int, required=True, help="no of epochs of training")
    parser.add_argument("--output_dir", default='./output', type=str, required=True, help="path to save evaluation results")
    parser.add_argument("--model_dir", default='./weights', type=str, required=True, help="path to save trained model")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="max gradient norm.")
    parser.add_argument("--root_dir", default='./CNN-DM/gpt2_1024_data', type=str, help="location of json dataset.")
    parser.add_argument("--ids_file", default='./CNN-DM/ids.json', type=str, help="location of train, valid and test file indexes")
    all_args = parser.parse_args()

    # load the train and validation datasets and the tokenizer
    train_data = GPT21024Dataset(all_args.root_dir, all_args.ids_file, mode='train')
    valid_data = GPT21024Dataset(all_args.root_dir, all_args.ids_file, mode='valid', length=500)
    tokenizer = add_special_tokens()
    ignore_idx = tokenizer.pad_token_id

    # load gpt2-small
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model.resize_token_embeddings(len(tokenizer))
    if all_args.n_gpu > 1:
        model = SaveModelDataParallel(model, device_ids=[i for i in range(all_args.n_gpu)])
    all_args.device = torch.device('cuda:' + str(all_args.device))
    model.to(all_args.device)

    train(all_args, model, tokenizer, train_data, valid_data, ignore_idx)
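# ignore_idx above is the pad token id; inside train() it is presumably passed to the
# loss so that padded positions do not contribute to the gradient. A self-contained
# sketch of that pattern (the exact loss wiring inside train() is an assumption):
import torch
import torch.nn as nn

pad_id = 50257                                     # placeholder for tokenizer.pad_token_id
loss_fct = nn.CrossEntropyLoss(ignore_index=pad_id)
logits = torch.randn(1, 4, 50258)                  # (batch, seq_len, vocab) - dummy values
labels = torch.tensor([[11, 12, pad_id, pad_id]])  # pad positions are ignored by the loss
loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))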
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--lr", default=5e-5, type=float, required=True, help="learning rate")
    parser.add_argument("--seed", default=42, type=int, required=False, help="seed to replicate results")
    parser.add_argument("--n_gpu", default=1, type=int, required=False, help="no of gpu available")
    parser.add_argument("--gradient_accumulation_steps", default=32, type=int, required=True, help="gradient_accumulation_steps")
    parser.add_argument("--batch_size", default=1, type=int, required=True, help="batch_size")
    parser.add_argument("--num_workers", default=4, type=int, required=False, help="num of cpus available")
    parser.add_argument("--device", default=torch.device('cpu'), required=False, help="torch.device object")
    parser.add_argument("--num_train_epochs", default=1, type=int, required=True, help="no of epochs of training")
    parser.add_argument("--output_dir", default='./output', type=str, required=True, help="path to save evaluation results")
    parser.add_argument("--model_dir", default='./weights', type=str, required=True, help="path to save trained model")
    parser.add_argument("--fp16", default=True, type=bool, required=False, help="whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument("--fp16_opt_level", default='O0', type=str, required=False, help="apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3'].")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="max gradient norm.")
    parser.add_argument("--root_dir", default='./CNN/gpt2_1024_data', type=str, help="location of json dataset.")
    parser.add_argument("--ids_file", default='./CNN/ids.json', type=str, help="location of train, valid and test file indexes")
    args = parser.parse_args()

    train_data = GPT21024Dataset(args.root_dir, args.ids_file, mode='train', length=3000)  # train on only 3000 examples
    valid_data = GPT21024Dataset(args.root_dir, args.ids_file, mode='valid', length=500)   # validate on only 500 examples
    tokenizer = add_special_tokens()
    ignore_idx = tokenizer.pad_token_id
    model = GPT2LMHeadModel.from_pretrained('gpt2')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--lr", default=5e-5, type=float, required=True, help="learning rate")
    parser.add_argument("--seed", default=42, type=int, required=False, help="seed to replicate results")
    parser.add_argument("--n_gpu", default=1, type=int, required=False, help="no of gpu available")
    parser.add_argument("--gradient_accumulation_steps", default=32, type=int, required=True, help="gradient_accumulation_steps")
    parser.add_argument("--batch_size", default=1, type=int, required=True, help="batch_size")
    parser.add_argument("--num_workers", default=4, type=int, required=False, help="num of cpus available")
    parser.add_argument("--device", default=torch.device('cuda'), required=False, help="torch.device object")
    parser.add_argument("--num_train_epochs", default=5, type=int, required=True, help="no of epochs of training")
    parser.add_argument("--output_dir", default='./output', type=str, required=True, help="path to save evaluation results")
    parser.add_argument("--model_dir", default='./weights', type=str, required=True, help="path to save trained model")
    parser.add_argument("--fp16", default=True, type=bool, required=False, help="whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument("--fp16_opt_level", default='O0', type=str, required=False, help="apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3'].")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="max gradient norm.")
    parser.add_argument("--root_dir", default='./CNN/gpt2_1024_data', type=str, help="location of json dataset.")
    parser.add_argument("--ids_file", default='./CNN/ids.json', type=str, help="location of train, valid and test file indexes")
    args = parser.parse_args()

    train_data = GPT21024Dataset(args.root_dir, args.ids_file, mode='train', length=3000)  # train on only 3000 examples
    valid_data = GPT21024Dataset(args.root_dir, args.ids_file, mode='valid', length=500)   # validate on only 500 examples
    tokenizer = add_special_tokens()
    ignore_idx = tokenizer.pad_token_id
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model.resize_token_embeddings(len(tokenizer))
    model.to(args.device)

    start = time.time()
    train(args, model, tokenizer, train_data, valid_data, ignore_idx)
    print('total time: ', (time.time() - start) / 60, " minutes", end='\n\n')

    print('Saving trained model...')
    model_file = os.path.join(args.model_dir,
                              'model_{}_data{}_trained_after_{}_epochs_only_sum_loss_ignr_pad.bin'.format(
                                  args.fp16_opt_level, 3000, args.num_train_epochs))
    config_file = os.path.join(args.model_dir,
                               'config_{}_data{}_trained_after_{}_epochs_only_sum_loss_ignr_pad.json'.format(
                                   args.fp16_opt_level, 3000, args.num_train_epochs))
    torch.save(model.state_dict(), model_file)
    model.config.to_json_file(config_file)
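# Because the script saves a raw state_dict plus a config json (rather than calling
# save_pretrained), reloading the trained model for evaluation would look roughly like
# the sketch below; the file names assume the default arguments used above.
from transformers import GPT2Config, GPT2LMHeadModel
import torch

config = GPT2Config.from_json_file('./weights/config_O0_data3000_trained_after_5_epochs_only_sum_loss_ignr_pad.json')
model = GPT2LMHeadModel(config)
state_dict = torch.load('./weights/model_O0_data3000_trained_after_5_epochs_only_sum_loss_ignr_pad.bin', map_location='cpu')
model.load_state_dict(state_dict)
model.eval()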
lr = 5e-5
num_train_epochs = 5
max_grad_norm = 1.
txt_gen_len = 100
ppo_config = {'batch_size': 1, 'forward_batch_size': 1}
batch_size = ppo_config['batch_size']
num_workers = 4  # not set in the original snippet; value assumed here

train_dataset = GPT21024Dataset('CNN/gpt2_1024_data', 'CNN/ids.json', mode='train', length=3000)
val_dataset = GPT21024Dataset('CNN/gpt2_1024_data', 'CNN/ids.json', mode='valid', length=500)
tokenizer = add_special_tokens()
ignore_idx = tokenizer.pad_token_id

train_sampler = RandomSampler(train_dataset)
train_dl = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size, num_workers=num_workers)
val_sampler = RandomSampler(val_dataset)
val_dl = DataLoader(val_dataset, sampler=val_sampler, batch_size=batch_size, num_workers=num_workers)

gpt2_model = GPT2HeadWithValueModel.from_pretrained('./weights/partial_masked/')
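# PPO needs a scalar reward per generated summary, and the reward used in the original
# run is not shown here. As a placeholder, a simple unigram-overlap F1 between the
# generated token ids and the reference abstract ids could serve as the reward signal;
# this is an illustrative assumption, not the original reward.
from collections import Counter

def overlap_f1_reward(generated_ids, reference_ids):
    """Illustrative reward: unigram-overlap F1 between generated and reference token ids."""
    gen_counts, ref_counts = Counter(generated_ids), Counter(reference_ids)
    overlap = sum((gen_counts & ref_counts).values())
    if overlap == 0:
        return 0.0
    precision = overlap / len(generated_ids)
    recall = overlap / len(reference_ids)
    return 2 * precision * recall / (precision + recall)

print(overlap_f1_reward([5, 7, 7, 9], [5, 7, 11]))  # example reward for a toy pair, ~0.57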
def main():
    # used from arguments.py
    args = argparser().parse_args()

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count() if torch.cuda.is_available() and not args.no_cuda else 0
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16,
    )

    # Set seed
    set_seed(args)

    # Prepare our task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config = AutoConfig.from_pretrained(
        args.config_name if args.config_name else args.model_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        cache_dir=args.cache_dir if args.cache_dir else None)
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_path,
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    add_special_tokens(model, tokenizer, processor)

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)
    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(tokenizer, "train", args)
        global_step, tr_loss = train(model, tokenizer, train_dataset, processor, args)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use default names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (model.module if hasattr(model, "module") else model)  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = AutoModelForSequenceClassification.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    assert not (args.do_test and args.do_eval)
    results = {}
    if (args.do_eval or args.do_test) and args.local_rank in [-1, 0]:
        mode = "dev" if args.do_eval else "test"
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c)
                for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate(%s) the following checkpoints: %s", mode, checkpoints)
        for checkpoint in checkpoints:
            logger.info("Checkpoint: %s", checkpoint)
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(model, tokenizer, processor, mode, args, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results
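# main() only returns the aggregated results dict. If the metrics should also land on
# disk, a small wrapper like the one below would do; the eval_results.txt name and the
# ./output directory are assumed conventions, not something this script defines.
if __name__ == "__main__":
    results = main()
    output_eval_file = os.path.join("./output", "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        for key in sorted(results.keys()):
            writer.write("{} = {}\n".format(key, results[key]))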