def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels
    self.bert = BertModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)
    self.static_embedding = nn.Embedding(25419, config.hidden_size)  # (num_entities, hidden_size)
    self.scorer = nn.CosineSimilarity()
    self.init_weights()
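# --- Hypothetical usage sketch (not part of the original class) ---
# Illustrates how the CosineSimilarity scorer above could compare a pooled
# mention representation against every row of the static entity embedding
# table. The variable names, shapes, and the hidden size of 768 are
# assumptions for illustration only.
import torch
import torch.nn as nn

scorer = nn.CosineSimilarity()                        # default dim=1 is the feature dimension here
entity_table = nn.Embedding(25419, 768)               # (num_entities, hidden_size)
mention_repr = torch.randn(1, 768)                    # e.g. a pooled [CLS] vector
scores = scorer(mention_repr, entity_table.weight)    # (25419,) one similarity per entity
best_entity = scores.argmax().item()                  # index of the closest entity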
def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels
    self.bert = BertModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)
    # self.scorer = nn.CosineSimilarity()
    self.el_criterion = CrossEntropyLoss()
    self.init_weights()
def __init__(self, config):
    super().__init__(config)
    self.num_labels = 10
    self.mention_boundary_embeddings = nn.Embedding(
        config.type_vocab_size, config.hidden_size)
    self.bert = BertModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, 1)
    self.init_weights()
def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels
    self.start_tag_idx = config.num_labels
    self.stop_tag_idx = config.num_labels + 1
    self.tagset_size = config.num_labels + 2
    self.bert = BertModel(config)
    # self.bert.init_weights()
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, self.tagset_size)
    # Matrix of transition parameters. Entry i,j is the score of
    # transitioning *to* i *from* j.
    self.transitions = nn.Parameter(
        torch.randn(self.tagset_size, self.tagset_size))
    # These two statements enforce the constraint that we never transfer
    # to the start tag and we never transfer from the stop tag
    self.transitions.data[self.start_tag_idx, :] = -10000
    self.transitions.data[:, self.stop_tag_idx] = -10000
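# --- Hypothetical sketch (not part of the original class) ---
# Shows how a gold tag sequence could be scored under the convention stated
# above: transitions[i, j] is the score of moving *to* tag i *from* tag j,
# and the -10000 entries make transitions into START and out of STOP
# effectively impossible. `emissions` stands for the per-token classifier
# logits; all names and shapes here are assumptions for illustration only.
import torch

def score_gold_sequence(emissions, tags, transitions, start_tag_idx, stop_tag_idx):
    # emissions: (seq_len, tagset_size); tags: (seq_len,) gold tag indices
    score = torch.zeros(())
    prev_tag = start_tag_idx
    for t, tag in enumerate(tags.tolist()):
        # transition into `tag` from the previous tag, plus its emission score
        score = score + transitions[tag, prev_tag] + emissions[t, tag]
        prev_tag = tag
    # close the sequence by transitioning into the STOP tag
    return score + transitions[stop_tag_idx, prev_tag]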
def __init__(self, config, dec_config, n_op, n_domain, update_id, mask_word_id,
             eos_id, pad_id, val_sep_id, type_vocab_size, exclude_domain=False):
    super(TransformerDST, self).__init__(config)
    self.val_sep_id = val_sep_id  # TODO: v2 special
    print("### word index of '-', ", self.val_sep_id)
    self.hidden_size = config.hidden_size
    self.n_op = n_op
    self.update_id = update_id
    self.mask_word_id = mask_word_id
    self.bert = BertModel(config, type_vocab_size)
    # predictor
    self.encoder = Encoder(config, self.bert, n_op, n_domain, update_id,
                           exclude_domain)
    self.decoder = BertForSeq2SeqDecoder(config, dec_config, self.bert,
                                         self.bert.embeddings.word_embeddings.weight,
                                         mask_word_id, eos_id, pad_id)
    self.apply(self.init_weights)
def __init__(self, opt):
    self.opt = opt
    out_file = './stat/{}_{}_domain{}_adv{}_aux{}_resplit{}_epoch{}'.format(
        self.opt.model_name, self.opt.dataset, self.opt.domain,
        str(self.opt.adv), str(self.opt.aux), str(self.opt.resplit),
        self.opt.num_epoch)
    print(out_file)
    if 'bert' in opt.model_name:
        # if opt.model_name == 'bert_kg':
        #     tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
        #     bert = BertForTokenClassification.from_pretrained('ernie_base')
        #     self.model = opt.model_class(bert, opt).to(opt.device)
        #     self.model.to(opt.device)
        if opt.model_name == 'lcf_bert':
            from pytorch_transformers import BertModel, BertForTokenClassification, BertConfig
            tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
            config = BertConfig.from_pretrained(opt.pretrained_bert_name,
                                                output_attentions=False)
            bert = BertModel.from_pretrained(opt.pretrained_bert_name, config=config)
            self.model = opt.model_class(bert, opt).to(opt.device)
        elif opt.model_name == 'bert':
            from pytorch_transformers import BertModel, BertForTokenClassification, BertConfig
            tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
            config = BertConfig.from_pretrained(opt.pretrained_bert_name,
                                                output_attentions=True)
            bert = BertModel.from_pretrained(opt.pretrained_bert_name, config=config)
            self.model = opt.model_class(bert, opt).to(opt.device)
        elif opt.model_name in ['bert_spc', 'td_bert']:
            from pytorch_transformers import BertModel, BertForTokenClassification, BertConfig
            tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
            config = BertConfig.from_pretrained(opt.pretrained_bert_name,
                                                output_attentions=True)
            bert = BertModel.from_pretrained(opt.pretrained_bert_name, config=config)
            self.model = opt.model_class(bert, opt).to(opt.device)
            # self.model.load_state_dict(torch.load('./state_dict/bert_multi_target_val_acc0.7714'))
        elif opt.model_name == 'bert_label':
            tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
            config = BertConfig.from_pretrained(opt.pretrained_bert_name,
                                                output_attentions=True)
            bert = BertModel.from_pretrained(opt.pretrained_bert_name, config=config)
            self.model = opt.model_class(bert, opt).to(opt.device)
        elif opt.model_name == 'bert_compete':
            tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
            config = BertConfig.from_pretrained(opt.pretrained_bert_name,
                                                output_attentions=True)
            bert = BertModel.from_pretrained(opt.pretrained_bert_name, config=config)
            num_added_tokens = tokenizer.add_tokens(['[aspect_b]', '[aspect_e]'])
            bert.resize_token_embeddings(len(tokenizer.tokenizer))
            self.model = opt.model_class(bert, opt).to(opt.device)
        else:
            from modeling_bert import BertModel, BertForTokenClassification, BertConfig  # bert_multi_target
            tokenizer = Tokenizer4Bert(opt.max_seq_len, opt.pretrained_bert_name)
            config = BertConfig.from_pretrained(opt.pretrained_bert_name,
                                                output_attentions=True)
            bert = BertModel.from_pretrained(opt.pretrained_bert_name, config=config)
            if opt.domain == 'pt':
                bert = BertModel.from_pretrained(
                    './bert_models/pt_bert-base-uncased_amazon_yelp')
            if opt.domain == 'joint':
                bert = BertModel.from_pretrained(
                    './bert_models/laptops_and_restaurants_2mio_ep15')
            if opt.domain == 'res':
                bert = BertModel.from_pretrained(
                    './bert_models/restaurants_10mio_ep3')
            if opt.domain == 'laptop':
                bert = BertModel.from_pretrained(
                    './bert_models/laptops_1mio_ep30')
            if opt.domain == 'ernie':
                bert = BertModel.from_pretrained(
                    './bert_models/ERNIE_Base_en_stable-2.0.0_pytorch')
            # num_added_tokens = tokenizer.add_tokens(['[target_b]', '[target_e]'])
            # num_added_tokens = tokenizer.add_tokens(['[aspect_b]', '[aspect_e]'])
            for i in range(20):
                b = '[' + str(i) + 'b]'
                e = '[' + str(i) + 'e]'
                num_added_tokens = tokenizer.add_tokens([b, e])
            bert.resize_token_embeddings(len(tokenizer.tokenizer))
            self.model = opt.model_class(bert, opt).to(opt.device)
            # self.model.load_state_dict(torch.load('./state_dict/state_dict/bert_multi_target_restaurant_doamin-res_can0_adv0_aux1.0_val_acc0.8688'))
    else:
        tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(
                str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
    self.trainset = ABSADataset(opt.dataset_file['train'], tokenizer, 'train', opt)
    self.testset = ABSADataset(opt.dataset_file['test'], tokenizer, 'test', opt)
    if int(opt.resplit) == 0:
        valset_ratio = 0.05
        assert 0 <= opt.valset_ratio < 1
        if opt.valset_ratio > 0:
            valset_len = int(len(self.trainset) * opt.valset_ratio)
            self.trainset, self.valset = random_split(
                self.trainset, (len(self.trainset) - valset_len, valset_len))
    else:
        if int(self.opt.resplit) == 1 or int(self.opt.resplit) == 2:
            self.valset = ABSADataset('valid', tokenizer, 'valid', opt)
        else:
            self.valset = self.testset
    if opt.device.type == 'cuda':
        logger.info('cuda memory allocated: {}'.format(
            torch.cuda.memory_allocated(device=opt.device.index)))
    # if opt.load_mode == 1:
    #     self.model.load_state_dict(torch.load('/home/nus/temp/ABSA-PyTorch/state_dict/bert_spc_twitter_val_acc0.7384'))
    # find the highest
    # model.load_state_dict(torch.load(PATH))
    self._print_args()
def __init__(self, config):
    super().__init__(config)
    self.bert = BertModel(config)
def main():
    # Required parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', dest='data_dir', required=True,
                        help='Which directory contains a {train,val,test}.jsonl file?')
    parser.add_argument('--output_dir', dest='output_dir', required=True,
                        help='Where shall we write intermediate models + final data to?')
    parser.add_argument('--model_params', dest='model_params', required=True,
                        help='JSON file for loading arbitrary model parameters (e.g. optimizers, pre-saved files, etc.)')
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--eval_split", type=str, default="val")
    parser.add_argument("--evaluate_during_training", action='store_true',
                        help="Run evaluation during training at each logging step.")
    parser.add_argument("--debug", action="store_true", default=False)
    parser.add_argument("--tf_summary", action="store_true", default=False)
    parser.add_argument("--out_domain", action="store_true", default=False)
    parser.add_argument("--random_evidence", action="store_true", default=False)
    parser.add_argument("--low_resource", action="store_true", default=False)

    # Input parameters
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--max_query_length", default=64, type=int)

    # Variants of baselines that change what input is loaded
    parser.add_argument("--full_doc", action="store_true", default=False)
    parser.add_argument("--gold_evidence", action="store_true", default=False)
    parser.add_argument("--focus_attention", action="store_true", default=False)
    parser.add_argument("--pal_attention", action="store_true", default=False)
    parser.add_argument("--multitask", action="store_true", default=False)
    parser.add_argument("--predicted_train_evidence_file", type=str, default=None)
    parser.add_argument("--predicted_eval_evidence_file", type=str, default=None)
    parser.add_argument("--gamma", type=float, default=1.0,
                        help="How much gold to feed to the supervision branch")

    # Training parameters
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--wait_step", default=5, type=int)
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=50.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Overrides num_train_epochs.")
    parser.add_argument("--warmup_proportion", default=0.0, type=float,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")

    # Logging
    parser.add_argument('--logging_steps', type=int, default=50,
                        help="Log every X update steps.")
    parser.add_argument('--save_steps', type=int, default=200,
                        help="Save checkpoint every X update steps.")
    parser.add_argument("--eval_all_checkpoints", action='store_true',
                        help="Evaluate all checkpoints starting with the same prefix as model_name and ending with a step number")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument('--overwrite_cache', action='store_true',
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")

    # Multi-GPU
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument('--server_ip', type=str, default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    # Parse model args json
    with open(args.model_params, 'r') as fp:
        logging.debug(f'Loading model parameters from {args.model_params}')
        model_params = json.load(fp)

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) \
            and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty. "
                         "Use --overwrite_output_dir to overcome.".format(args.output_dir))

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
                   args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(model_params["config_name"] if model_params["config_name"]
                                          else model_params["model_name_or_path"])
    config.num_labels = len(model_params['classes'])
    tokenizer = tokenizer_class.from_pretrained(model_params["tokenizer_name"] if model_params["tokenizer_name"]
                                                else model_params["model_name_or_path"],
                                                do_lower_case=model_params["do_lower_case"])
    model = model_class.from_pretrained(model_params["model_name_or_path"],
                                        from_tf=bool('.ckpt' in model_params["model_name_or_path"]),
                                        config=config)
    if args.pal_attention:
        model.bert_pal = BertModel.from_pretrained(model_params["model_name_or_path"], config=config)
    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    if args.do_train:
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        train_dataset, _ = load_and_cache_examples(args, model_params, tokenizer, evaluate=False,
                                                   split="train", output_examples=False)
        global_step, tr_loss = train(args, model_params, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)
        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
        # model = model_class.from_pretrained(args.output_dir)
        # tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=model_params["do_lower_case"])
        # model.to(args.device)

    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir + "/best_model"]
        if args.eval_all_checkpoints:
            checkpoints = list(os.path.dirname(c) for c in
                               sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
            model = model_class.from_pretrained(checkpoint, force_download=True)
            # if args.pal_attention:
            #     # TODO: Wrong since we need to initialize using exact params?
            #     model.bert_pal = BertModel.from_pretrained(model_params["model_name_or_path"], config=config)
            model.to(args.device)
            # Evaluate
            result = evaluate(args, model_params, model, tokenizer, prefix=global_step,
                              output_examples=True, split=args.eval_split)
            results = {"Best F1": result[0], "Best Accuracy": result[1]}
    logger.info("Results on the split {} : {}".format(args.eval_split, results))
    return results
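# Example invocation (illustrative only: the script name, paths, and the
# model_type value are placeholders; MODEL_CLASSES defines the accepted types):
#   python run.py --data_dir ./data --output_dir ./out \
#       --model_params params.json --model_type bert --do_train --do_eval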