def __init__(self, pretrained_model: str, requires_grad: bool = False, top_layer_only: bool = False) -> None:
    model = BertModel.from_pretrained(pretrained_model)
    for param in model.parameters():
        param.requires_grad = requires_grad
    super().__init__(bert_model=model, top_layer_only=top_layer_only)
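# A minimal usage sketch for the constructor above. It appears to be the __init__ of an
# AllenNLP-style pretrained-BERT embedder subclass; the class name `FrozenBertEmbedder`
# below is hypothetical and only for illustration:
# embedder = FrozenBertEmbedder('bert-base-uncased')                      # BERT weights frozen (default)
# embedder = FrozenBertEmbedder('bert-base-uncased', requires_grad=True)  # BERT weights fine-tunable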
def __init__(self, name, **kwargs):
    super(BERTBaseEmbeddings, self).__init__(name=name, **kwargs)
    global BERT_TOKENIZER
    self.dsz = kwargs.get('dsz')
    if BERT_TOKENIZER is None:
        BERT_TOKENIZER = BertTokenizer.from_pretrained(kwargs.get('embed_file'))
    self.model = BertModel.from_pretrained(kwargs.get('embed_file'))
    self.vocab = BERT_TOKENIZER.vocab
    self.vsz = len(BERT_TOKENIZER.vocab)  # 30522 == self.model.embeddings.word_embeddings.num_embeddings
    self.layer_indices = kwargs.get('layers', [-1, -2, -3, -4])
    self.operator = kwargs.get('operator', 'concat')
def __init__(self, update_embedding=False, embedding_reduction='none',
             pretrained_model_name='bert-base-uncased', cache_dir='../data/bert_cache'):
    super().__init__()
    # Check if choice of pretrained model is valid
    assert pretrained_model_name in ('bert-base-uncased', 'bert-large-uncased', 'bert-base-cased')
    # Load pre-trained BERT model
    self.bert = BertModel.from_pretrained(
        pretrained_model_name_or_path=pretrained_model_name,
        cache_dir=cache_dir)
    self.embedding = self.bert.embeddings
    self.embedding_size = self.embedding.word_embeddings.embedding_dim
    self.reduction = embedding_reduction
    # (Remove or not) BERT model parameters from optimization
    for param in self.bert.parameters():
        param.requires_grad = update_embedding
def __init__(self, hiddenDim, tagsetSize, batchSize):
    super(NetEDU, self).__init__()
    self.hiddenDim = hiddenDim  # 768
    self.batchSize = batchSize
    self.tagsetSize = tagsetSize
    self.bert = BertModel.from_pretrained('bert-base-chinese').cuda()
    # classification layer
    self.hidden2tag = nn.Linear(self.hiddenDim, self.tagsetSize)  # convert to label set size
    # dropout layer
    self.dropout = nn.Dropout(0.1)
    # CRF layer
    self.transitions = nn.Parameter(
        torch.randn(self.tagsetSize, self.tagsetSize).cuda())
    # initialize
    self.transitions.data[tagToIdx['[START]'], :] = -10000.  # no transition to SOS
    self.transitions.data[:, tagToIdx['[END]']] = -10000.    # no transition from EOS except to PAD
    self.transitions.data[:, tagToIdx['[PAD]']] = -10000.    # no transition from PAD except to PAD
    self.transitions.data[tagToIdx['[PAD]'], :] = -10000.    # no transition to PAD except from EOS
    self.transitions.data[tagToIdx['[PAD]'], tagToIdx['[END]']] = 0.
    self.transitions.data[tagToIdx['[PAD]'], tagToIdx['[PAD]']] = 0.
def __init__(self, bert_model_path, n_tgt_vocab, len_max_seq,
             d_word_vec=768, d_model=768, d_inner=3072,
             n_layers=12, n_head=12, d_k=64, d_v=64, dropout=0.1):
    super().__init__()
    self.encoder = BertModel.from_pretrained(bert_model_path)
    self.config = BertConfig(bert_model_path + 'bert_config.json')
    self.decoder = Decoder(
        n_tgt_vocab=n_tgt_vocab, len_max_seq=len_max_seq,
        d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
        n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v,
        dropout=dropout)
    self.tgt_word_prj = nn.Linear(d_model, n_tgt_vocab, bias=False)
    nn.init.xavier_normal_(self.tgt_word_prj.weight)
    self.tgt_word_prj.weight = self.decoder.tgt_word_emb.weight
    self.x_logit_scale = (d_model ** -0.5)
    self.o_l = nn.Linear(d_model, 512, bias=False)
    self.h_l = nn.Linear(512, 1, bias=True)
    nn.init.xavier_normal_(self.o_l.weight)
    nn.init.xavier_normal_(self.h_l.weight)
    self.a_l_1 = nn.Linear(d_model, 512, bias=False)
    self.a_l_2 = nn.Linear(d_model, 512, bias=False)
    nn.init.xavier_normal_(self.a_l_1.weight)
    nn.init.xavier_normal_(self.a_l_2.weight)
def __init__(self, config, cls_sup: bool = False, evidence_lambda=0.8, extra_yesno_lambda=0.5):
    super(BertQAYesnoCLSHierarchical, self).__init__(config)
    print(f'The model {self.__class__.__name__} is loading...')
    print(f'The coefficient of evidence loss is {evidence_lambda}')
    print(f'Use cls extra supervision: {cls_sup}')
    print(f'The extra yesno loss lambda is {extra_yesno_lambda}')
    layers.set_seq_dropout(True)
    layers.set_my_dropout_prob(config.hidden_dropout_prob)
    self.bert = BertModel(config)
    # self.dropout = nn.Dropout(config.hidden_dropout_prob)
    # self.answer_choice = nn.Linear(config.hidden_size, 2)
    self.doc_word_sum = layers.AttentionScore(config.hidden_size, 250, do_similarity=False)
    self.que_word_sum = layers.AttentionScore(config.hidden_size, 250, do_similarity=False)
    self.doc_sen_sum = layers.AttentionScore(config.hidden_size, 250, do_similarity=False)
    self.cls_sup = cls_sup
    self.extra_yesno_lam = extra_yesno_lambda
    if cls_sup:
        self.extra_predictor = nn.Linear(config.hidden_size, 3)
    self.yesno_predictor = nn.Linear(config.hidden_size * 2, 3)
    self.evidence_lam = evidence_lambda
    self.apply(self.init_bert_weights)
def __init__(self, config, output_attentions=False, keep_multihead_output=False,
             cls_alpha=1.0, mask_p=0.0):
    super(BertForCoQA, self).__init__(config)
    self.cls_alpha = cls_alpha
    self.mask_p = mask_p
    self.output_attentions = output_attentions
    self.bert = BertModel(
        config,
        output_attentions=output_attentions,
        keep_multihead_output=keep_multihead_output,
    )
    # self.qa_outputs_mid = nn.Linear(config.hidden_size, config.hidden_size)
    # self.dropout = nn.Dropout(config.hidden_dropout_prob)  # NOTE: It hurts.
    self.qa_outputs = nn.Linear(config.hidden_size, 2)
    # self.cls_outputs_mid = nn.Linear(config.hidden_size, config.hidden_size)
    self.cls_outputs = nn.Linear(config.hidden_size, 4)
    self.apply(self.init_bert_weights)
def __init__(self, config, evidence_lambda: float = 0.8, my_dropout_p: float = 0.2,
             tf_layers: int = 1, tf_inter_size: int = 3072):
    super(BertHierarchicalTransformer, self).__init__(config)
    logger.info(f'Model {__class__.__name__} is loading...')
    logger.info(f'Model parameters:')
    logger.info(f'Evidence lambda: {evidence_lambda}')
    layers.set_seq_dropout(True)
    layers.set_my_dropout_prob(my_dropout_p)
    self.bert = BertModel(config)
    self.query_self_attn = layers.MultiHeadPooling(config.hidden_size, 6)
    self.value_self_attn = layers.MultiHeadPooling(config.hidden_size, 6)
    # self.sentence_input = layers.BertSentInput(config)
    config.num_hidden_layers = tf_layers
    config.intermediate_size = tf_inter_size
    self.sentence_encoder = BertEncoder(config)
    self.attention_score = layers.AttentionScore(config.hidden_size, 256)
    # Output layer
    self.evidence_lambda = evidence_lambda
    self.predictor = nn.Linear(config.hidden_size * 2, 3)
def __init__(self, num_choices, bert_config_file, init_embeddings):
    self.num_choices = num_choices
    self.bert_config = BertConfig.from_json_file(bert_config_file)
    BertPreTrainedModel.__init__(self, self.bert_config)
    self.bert = BertModel(self.bert_config)
    self.apply(self.init_bert_weights)
    self.dropout = nn.Dropout(self.bert_config.hidden_dropout_prob)
    self.vocab_size, self.embed_size = np.shape(init_embeddings)
    self.embed = nn.Embedding.from_pretrained(
        torch.FloatTensor(init_embeddings), freeze=False)
    # self.classifier = nn.Linear(self.bert_config.hidden_size + self.embed_size, 1)
    self.classifier = nn.Linear(self.bert_config.hidden_size, 1)
    self.reshape = nn.Linear(self.bert_config.hidden_size, self.embed_size, bias=False)
    self.reshape_know = nn.Linear(self.embed_size, self.bert_config.hidden_size, bias=True)
    self.relu = nn.ReLU()
    self.softmax = nn.Softmax(dim=-1)
    self.activation = nn.Sigmoid()
def __init__(self, config, num_classes, encoding_type='bio', target_vocab=None, dropout=0.2):
    super(SubjectModel, self).__init__(config)
    self.bert = BertModel(config)
    self.apply(self.init_bert_weights)
    self.dropout = nn.Dropout(dropout)
    self.fc = nn.Linear(config.hidden_size, num_classes)
    trans = None
    if target_vocab is not None and encoding_type is not None:
        trans = allowed_transitions(target_vocab, encoding_type=encoding_type, include_start_end=True)
    self.crf = ConditionalRandomField(num_classes, include_start_end_trans=True, allowed_transitions=trans)
def __init__(self, config, evidence_lambda=0.8, negative_lambda=1.0, add_entropy: bool = False):
    super(BertQAYesnoHierarchicalNeg, self).__init__(config)
    print(f'The model {self.__class__.__name__} is loading...')
    print(f'The coefficient of evidence loss is {evidence_lambda}')
    print(f'The coefficient of negative samples loss is {negative_lambda}')
    print(f'Add entropy loss: {add_entropy}')
    layers.set_seq_dropout(True)
    layers.set_my_dropout_prob(config.hidden_dropout_prob)
    self.bert = BertModel(config)
    # self.dropout = nn.Dropout(config.hidden_dropout_prob)
    # self.answer_choice = nn.Linear(config.hidden_size, 2)
    self.doc_sen_self_attn = layers.LinearSelfAttnAllennlp(config.hidden_size)
    self.que_self_attn = layers.LinearSelfAttn(config.hidden_size)
    self.word_similarity = layers.AttentionScore(config.hidden_size, 250, do_similarity=False)
    self.vector_similarity = layers.AttentionScore(config.hidden_size, 250, do_similarity=False)
    # self.yesno_predictor = nn.Linear(config.hidden_size, 2)
    self.yesno_predictor = nn.Linear(config.hidden_size * 2, 3)
    self.evidence_lam = evidence_lambda
    self.negative_lam = negative_lambda
    self.add_entropy = add_entropy
    self.apply(self.init_bert_weights)
def __init__(self, config, evidence_lambda=0.8, num_choices=4):
    super(BertRACEHierarchicalTopK, self).__init__(config)
    logger.info(f'The model {self.__class__.__name__} is loading...')
    logger.info(f'Currently the number of choices is {num_choices}')
    logger.info(f'The coefficient of evidence loss is {evidence_lambda}')
    layers.set_seq_dropout(True)
    layers.set_my_dropout_prob(config.hidden_dropout_prob)
    rep_layers.set_seq_dropout(True)
    rep_layers.set_my_dropout_prob(config.hidden_dropout_prob)
    self.bert = BertModel(config)
    self.doc_sen_self_attn = rep_layers.LinearSelfAttention(config.hidden_size)
    self.que_self_attn = rep_layers.LinearSelfAttention(config.hidden_size)
    self.word_similarity = layers.AttentionScore(config.hidden_size, 250, do_similarity=False)
    self.vector_similarity = layers.AttentionScore(config.hidden_size, 250, do_similarity=False)
    # self.yesno_predictor = nn.Linear(config.hidden_size, 2)
    self.classifier = nn.Linear(config.hidden_size * 2, 1)
    self.evidence_lam = evidence_lambda
    self.num_choices = num_choices
    self.apply(self.init_bert_weights)
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--input_file", default=None, type=str, required=True)
    parser.add_argument("--output_file", default=None, type=str, required=True)
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")

    ## Other parameters
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
                             "than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {} distributed training: {}".format(
        device, n_gpu, bool(args.local_rank != -1)))

    layer_indexes = [int(x) for x in args.layers.split(",")]

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    examples = read_examples(args.input_file)
    features = convert_examples_to_features(
        examples=examples, seq_length=args.max_seq_length, tokenizer=tokenizer)

    unique_id_to_feature = {}
    for feature in features:
        unique_id_to_feature[feature.unique_id] = feature

    model = BertModel.from_pretrained(args.bert_model)
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
    if args.local_rank == -1:
        eval_sampler = SequentialSampler(eval_data)
    else:
        eval_sampler = DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)

    model.eval()
    with open(args.output_file, "w", encoding='utf-8') as writer:
        for input_ids, input_mask, example_indices in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)

            all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)

            for b, example_index in enumerate(example_indices):
                feature = features[example_index.item()]
                unique_id = int(feature.unique_id)
                # feature = unique_id_to_feature[unique_id]
                output_json = collections.OrderedDict()
                output_json["linex_index"] = unique_id
                all_out_features = []
                for (i, token) in enumerate(feature.tokens):
                    all_layers = []
                    for (j, layer_index) in enumerate(layer_indexes):
                        layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
                        layer_output = layer_output[b]
                        layers = collections.OrderedDict()
                        layers["index"] = layer_index
                        layers["values"] = [round(x.item(), 6) for x in layer_output[i]]
                        all_layers.append(layers)
                    out_features = collections.OrderedDict()
                    out_features["token"] = token
                    out_features["layers"] = all_layers
                    all_out_features.append(out_features)
                output_json["features"] = all_out_features
                writer.write(json.dumps(output_json) + "\n")
def __init__(self, params, vocab_size, hidden_size, emb_dim, dropout, tok2id):
    global CUDA

    super(Seq2Seq, self).__init__()

    self.vocab_size = vocab_size
    self.hidden_dim = hidden_size
    self.emb_dim = emb_dim
    self.dropout = dropout
    self.pad_id = 0
    self.tok2id = tok2id
    self.params = params

    self.embeddings = nn.Embedding(self.vocab_size, self.emb_dim, self.pad_id)

    self.encoder = LSTMEncoder(
        self.emb_dim, self.hidden_dim, layers=1, bidirectional=True, dropout=self.dropout)

    self.h_t_projection = nn.Linear(hidden_size, hidden_size)
    self.c_t_projection = nn.Linear(hidden_size, hidden_size)

    self.bridge = nn.Linear(
        768 if self.params['bert_encoder'] else self.hidden_dim, self.hidden_dim)

    if self.params['transformer_decoder']:
        self.decoder = transformer.TransformerDecoder(
            num_layers=self.params['transformer_layers'],
            d_model=self.hidden_dim,
            heads=8,
            d_ff=self.hidden_dim,
            copy_attn=False,
            self_attn_type='scaled-dot',
            dropout=self.dropout,
            embeddings=self.embeddings,
            max_relative_positions=0)
    else:
        self.decoder = StackedAttentionLSTM(
            params, self.emb_dim, self.hidden_dim, layers=1, dropout=self.dropout)

    self.output_projection = nn.Linear(self.hidden_dim, self.vocab_size)

    self.softmax = nn.Softmax(dim=-1)
    # for training
    self.log_softmax = nn.LogSoftmax(dim=-1)

    self.init_weights()

    # pretrained embs from bert (after init to avoid overwrite)
    if self.params['bert_word_embeddings'] or \
            self.params['bert_full_embeddings'] or \
            self.params['bert_encoder']:
        model = BertModel.from_pretrained(
            self.params['bert_model'],
            self.params['working_dir'] + '/cache')
        if self.params['bert_word_embeddings']:
            self.embeddings = model.embeddings.word_embeddings
        if self.params['bert_encoder']:
            self.encoder = model
            # share bert word embeddings with decoder
            self.embeddings = model.embeddings.word_embeddings
        if self.params['bert_full_embeddings']:
            self.embeddings = model.embeddings
        if self.params['freeze_embeddings']:
            for param in self.embeddings.parameters():
                param.requires_grad = False

    self.enrich_input = torch.ones(hidden_size)
    if CUDA:
        self.enrich_input = self.enrich_input.cuda()
    self.enricher = nn.Linear(hidden_size, hidden_size)
                         path=args.db_fi, align_strat=args.align_strat, subsample_all=args.subsample_all)
else:
    sentdb = data.SentDB(args.sent_fi, args.tag_fi, tokenizer, args.val_sent_fi, args.val_tag_fi,
                         lower=args.lower, align_strat=args.align_strat, subsample_all=args.subsample_all)

nebert = model.bert
if args.zero_shot and "newne" not in args.just_eval:
    nebert = BertModel.from_pretrained(args.bert_model, cache_dir=CACHEDIR)
    nebert = nebert.to(device)

def avg_bert_emb(x):
    mask = (x != 0)
    rep, _ = nebert(x, attention_mask=mask.long(), output_all_encoded_layers=False)
    mask = mask.float().unsqueeze(2)  # bsz x T x 1
    avgs = (rep * mask).sum(1) / mask.sum(1)  # bsz x hid
    return avgs

nebsz, nne = 128, 500
# we always compute neighbors w/ cosine; seems to be a bit better
model.eval()
sentdb.compute_top_neighbs(nebsz,
def __init__(self, config):
    super(Model, self).__init__()
    self.bert = BertModel.from_pretrained(config.bert_path)
    for param in self.bert.parameters():
        param.requires_grad = True
    self.fc = nn.Linear(config.hidden_size, config.num_classes)
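# A possible forward pass for the classifier above (a sketch under the
# pytorch_pretrained_bert API, not necessarily the original repo's code):
# BertModel returns (encoded_layers, pooled_output); the pooled [CLS] vector
# feeds the linear head.
# def forward(self, input_ids, attention_mask=None):
#     _, pooled = self.bert(input_ids, attention_mask=attention_mask,
#                           output_all_encoded_layers=False)
#     return self.fc(pooled)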
def __init__(self, config):
    super(DMCNN_Encoder_argument0, self).__init__(config)
    self.bert = BertModel(config)
    self.dropout = nn.Dropout(p=keepProb)
    self.maxpooling = nn.MaxPool1d(SenLen)
def __init__(self, config):
    super(BertForUtteranceEncoding, self).__init__(config)
    self.config = config
    self.bert = BertModel(config)
    token_embedding = ElmoTokenEmbedder(options_file, weights_file, dropout=DROPOUT, projection_dim=PROJECT_DIM)
elif EMBEDDING_TYPE == "_elmo_retrained_2":
    options_file = os.path.join("data", "bilm-tf", "elmo_retrained", "options_2.json")
    weights_file = os.path.join("data", "bilm-tf", "elmo_retrained", "weights_2.hdf5")
    token_embedding = ElmoTokenEmbedder(options_file, weights_file, dropout=DROPOUT, projection_dim=PROJECT_DIM)
elif EMBEDDING_TYPE == "_bert":
    print("Loading bert model")
    model = BertModel.from_pretrained('bert-base-uncased')
    token_embedding = BertEmbedder(model)
    PROJECT_DIM = 768
else:
    print("Error: Some weird Embedding type", EMBEDDING_TYPE)
    exit()

word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
HIDDEN_DIM = 200
params = Params({
    'input_dim': PROJECT_DIM,
    'hidden_dims': HIDDEN_DIM,
    'activations': 'relu',
    'num_layers': NUM_LAYERS,
    'dropout': DROPOUT
})
attend_feedforward = FeedForward.from_params(params)
def main():
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    args = get_args()
    log_info.print_args(args)

    device, n_gpu = initialization.init_cuda_from_args(args, logger=logger)
    initialization.init_seed(args, n_gpu=n_gpu, logger=logger)
    initialization.init_train_batch_size(args)
    initialization.init_output_dir(args)
    initialization.save_args(args)
    task = get_task(args.task_name, args.data_dir)

    # prepare examples, load model as encoder
    tokenizer = shared_model_setup.create_tokenizer(
        bert_model_name=args.bert_model,
        bert_load_mode=args.bert_load_mode,
        do_lower_case=args.do_lower_case,
        bert_vocab_path=args.bert_vocab_path,
    )
    all_state = shared_model_setup.load_overall_state(args.bert_load_path, relaxed=True)

    # Load Model...
    if args.bert_load_mode == "state_model_only":
        state_dict = all_state['model']
        bert_as_encoder = BertModel.from_state_dict(
            config_file=args.bert_config_json_path, state_dict=state_dict)
    else:
        assert args.bert_load_mode == "from_pretrained"
        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(args.local_rank)
        bert_as_encoder = BertModel.from_pretrained(
            pretrained_model_name_or_path=args.bert_model, cache_dir=cache_dir)

    bert_as_encoder.to(device)

    runner_param = RunnerParameters(
        max_seq_length=args.max_seq_length,
        local_rank=args.local_rank,
        n_gpu=n_gpu,
        fp16=args.fp16,
        learning_rate=args.learning_rate,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        t_total=None,
        warmup_proportion=args.warmup_proportion,
        num_train_epochs=args.num_train_epochs,
        train_batch_size=args.train_batch_size,
        eval_batch_size=args.eval_batch_size,
    )

    runner = EmbeddingTaskRunner(
        bert_model=bert_as_encoder,
        optimizer=None,
        tokenizer=tokenizer,
        label_list=task.get_labels(),
        device=device,
        rparams=runner_param)

    # Run training set encoding ...
    print("Run training set encoding ... ")
    train_examples = task.get_train_examples()
    train_dataset = runner.run_encoding(train_examples, verbose=True, mode='train')
    print("saving embeddings ... ")
    torch.save(train_dataset, os.path.join(args.output_dir, "train.dataset"))

    # Run development set encoding ...
    eval_examples = task.get_dev_examples()
    eval_dataset = runner.run_encoding(eval_examples, verbose=True, mode='eval')
    print("saving embeddings ... ")
    torch.save(eval_dataset, os.path.join(args.output_dir, 'dev.dataset'))

    # Run test set encoding ...
    test_examples = task.get_test_examples()
    test_dataset = runner.run_encoding(test_examples, verbose=True, mode='test')
    print("saving embeddings ... ")
    torch.save(test_dataset, os.path.join(args.output_dir, "test.dataset"))

    # HACK for MNLI mis-matched set ...
    if args.task_name == 'mnli':
        print("=== Start embedding task for MNLI mis-matched ===")
        mm_eval_examples = MnliMismatchedProcessor().get_dev_examples(task.data_dir)
        mm_eval_dataset = runner.run_encoding(mm_eval_examples, verbose=True, mode='eval')
        print("=== Saving eval dataset ===")
        torch.save(mm_eval_dataset, os.path.join(args.output_dir, "mm_dev.dataset"))
        print("=== Saved ===")

        mm_test_examples = MnliMismatchedProcessor().get_test_examples(task.data_dir)
        mm_test_dataset = runner.run_encoding(mm_test_examples, verbose=True, mode='test')
        print("=== Saving tensor dataset ===")
        torch.save(mm_test_dataset, os.path.join(args.output_dir, "mm_test.dataset"))
        print("=== Saved ===")
class BertQAYesnoHierarchicalReinforceRACE(BertPreTrainedModel):
    """
    Hard attention using reinforce learning
    """

    def __init__(self, config, evidence_lambda=0.8, num_choices=4, sample_steps: int = 5,
                 reward_func: int = 0, freeze_bert=False):
        super(BertQAYesnoHierarchicalReinforceRACE, self).__init__(config)
        logger.info(f'The model {self.__class__.__name__} is loading...')
        logger.info(f'The coefficient of evidence loss is {evidence_lambda}')
        logger.info(f'Currently the number of choices is {num_choices}')
        logger.info(f'Sample steps: {sample_steps}')
        logger.info(f'Reward function: {reward_func}')
        logger.info(f'If freeze BERT\'s parameters: {freeze_bert} ')

        layers.set_seq_dropout(True)
        layers.set_my_dropout_prob(config.hidden_dropout_prob)
        rep_layers.set_seq_dropout(True)
        rep_layers.set_my_dropout_prob(config.hidden_dropout_prob)

        self.bert = BertModel(config)
        if freeze_bert:
            for p in self.bert.parameters():
                p.requires_grad = False

        self.doc_sen_self_attn = rep_layers.LinearSelfAttention(config.hidden_size)
        self.que_self_attn = rep_layers.LinearSelfAttention(config.hidden_size)

        self.word_similarity = layers.AttentionScore(config.hidden_size, 250, do_similarity=False)
        self.vector_similarity = layers.AttentionScore(config.hidden_size, 250, do_similarity=False)

        # self.yesno_predictor = nn.Linear(config.hidden_size * 2, 3)
        self.classifier = nn.Linear(config.hidden_size * 2, 1)
        self.evidence_lam = evidence_lambda
        self.sample_steps = sample_steps
        self.reward_func = [self.reinforce_step, self.reinforce_step_1][reward_func]
        self.num_choices = num_choices

        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
                sentence_span_list=None, sentence_ids=None, max_sentences: int = 0):
        flat_input_ids = input_ids.view(-1, input_ids.size(-1))
        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None

        sequence_output, _ = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask,
                                       output_all_encoded_layers=False)

        # mask: 1 for masked value and 0 for true value
        # doc, que, doc_mask, que_mask = layers.split_doc_que(sequence_output, token_type_ids, attention_mask)
        doc_sen, que, doc_sen_mask, que_mask, sentence_mask = \
            rep_layers.split_doc_sen_que(sequence_output, flat_token_type_ids, flat_attention_mask,
                                         sentence_span_list, max_sentences=max_sentences)

        batch, max_sen, doc_len = doc_sen_mask.size()

        que_vec = self.que_self_attn(que, que_mask).view(batch, 1, -1)

        doc = doc_sen.reshape(batch, max_sen * doc_len, -1)
        word_sim = self.word_similarity(que_vec, doc).view(batch * max_sen, doc_len)
        doc = doc_sen.reshape(batch * max_sen, doc_len, -1)
        doc_mask = doc_sen_mask.reshape(batch * max_sen, doc_len)
        word_hidden = rep_layers.masked_softmax(word_sim, doc_mask, dim=1).unsqueeze(1).bmm(doc)
        word_hidden = word_hidden.view(batch, max_sen, -1)

        doc_vecs = self.doc_sen_self_attn(doc, doc_mask).view(batch, max_sen, -1)

        sentence_sim = self.vector_similarity(que_vec, doc_vecs)

        if self.training:
            _sample_prob, _sample_log_prob = self.sample_one_hot(sentence_sim, sentence_mask)
            loss_and_reward, _ = self.reward_func(word_hidden, que_vec, labels, _sample_prob, _sample_log_prob)
            output_dict = {'loss': loss_and_reward}
        else:
            _prob, _ = self.sample_one_hot(sentence_sim, sentence_mask)
            loss, _choice_logits = self.simple_step(word_hidden, que_vec, labels, _prob)
            sentence_scores = rep_layers.masked_softmax(sentence_sim, sentence_mask, dim=-1).squeeze_(1)
            output_dict = {
                'sentence_logits': sentence_scores.float(),
                'loss': loss,
                'choice_logits': _choice_logits.float()
            }
        return output_dict

    def sample_one_hot(self, _similarity, _mask):
        _probability = rep_layers.masked_softmax(_similarity, _mask)
        dtype = _probability.dtype
        _probability = _probability.float()
        # _log_probability = masked_log_softmax(_similarity, _mask)
        if self.training:
            _distribution = Categorical(_probability)
            _sample_index = _distribution.sample((self.sample_steps,))
            logger.debug(str(_sample_index.size()))
            new_shape = (self.sample_steps,) + _similarity.size()
            logger.debug(str(new_shape))
            _sample_one_hot = F.one_hot(_sample_index, num_classes=_similarity.size(-1))
            # _sample_one_hot = _similarity.new_zeros(new_shape).scatter(-1, _sample_index.unsqueeze(-1), 1.0)
            logger.debug(str(_sample_one_hot.size()))
            _log_prob = _distribution.log_prob(_sample_index)  # sample_steps, batch, 1
            assert _log_prob.size() == new_shape[:-1], (_log_prob.size(), new_shape)
            _sample_one_hot = _sample_one_hot.transpose(0, 1)  # batch, sample_steps, 1, max_sen
            _log_prob = _log_prob.transpose(0, 1)  # batch, sample_steps, 1
            return _sample_one_hot.to(dtype=dtype), _log_prob.to(dtype=dtype)
        else:
            _max_index = _probability.float().max(dim=-1, keepdim=True)[1]
            _one_hot = torch.zeros_like(_similarity).scatter_(-1, _max_index, 1.0)
            # _log_prob = _log_probability.gather(-1, _max_index)
            return _one_hot, None

    def reinforce_step(self, hidden, q_vec, label, prob, log_prob):
        batch, max_sen, hidden_dim = hidden.size()
        assert q_vec.size() == (batch, 1, hidden_dim)
        assert prob.size() == (batch, self.sample_steps, 1, max_sen)
        assert log_prob.size() == (batch, self.sample_steps, 1)

        expanded_hidden = hidden.unsqueeze(1).expand(-1, self.sample_steps, -1, -1)
        h = prob.matmul(expanded_hidden).squeeze(2)  # batch, sample_steps, hidden_dim
        q = q_vec.expand(-1, self.sample_steps, -1)

        # _logits = self.classifier(torch.cat([h, q], dim=2)).view(-1, self.num_choices)  # batch, sample_steps, 3
        # Note the rank of dimension here
        _logits = self.classifier(torch.cat([h, q], dim=2)).view(label.size(0), self.num_choices, self.sample_steps)\
            .transpose(1, 2).reshape(-1, self.num_choices)
        expanded_label = label.unsqueeze(1).expand(-1, self.sample_steps).reshape(-1)
        _loss = F.cross_entropy(_logits, expanded_label)

        corrects = (_logits.max(dim=-1)[1] == expanded_label).to(hidden.dtype)
        log_prob = log_prob.reshape(label.size(0), self.num_choices, self.sample_steps).transpose(1, 2).mean(dim=-1)
        reward1 = (log_prob.reshape(-1) * corrects).sum() / (self.sample_steps * label.size(0))

        return _loss - reward1, _logits

    def reinforce_step_1(self, hidden, q_vec, label, prob, log_prob):
        batch, max_sen, hidden_dim = hidden.size()
        assert q_vec.size() == (batch, 1, hidden_dim)
        assert prob.size() == (batch, self.sample_steps, 1, max_sen)
        assert log_prob.size() == (batch, self.sample_steps, 1)

        expanded_hidden = hidden.unsqueeze(1).expand(-1, self.sample_steps, -1, -1)
        h = prob.matmul(expanded_hidden).squeeze(2)  # batch, sample_steps, hidden_dim
        q = q_vec.expand(-1, self.sample_steps, -1)

        # _logits = self.classifier(torch.cat([h, q], dim=2)).view(-1, self.num_choices)  # batch * sample_steps, 3
        _logits = self.classifier(torch.cat([h, q], dim=2)).view(label.size(0), self.num_choices, self.sample_steps)\
            .transpose(1, 2).reshape(-1, self.num_choices)
        expanded_label = label.unsqueeze(1).expand(-1, self.sample_steps).reshape(-1)  # batch * sample_steps
        _loss = F.cross_entropy(_logits, expanded_label)

        _final_log_prob = F.log_softmax(_logits, dim=-1)
        # ignore_mask = (expanded_label == -1)
        # expanded_label = expanded_label.masked_fill(ignore_mask, 0)
        selected_log_prob = _final_log_prob.gather(1, expanded_label.unsqueeze(1)).squeeze(-1)  # batch * sample_steps
        assert selected_log_prob.size() == (label.size(0) * self.sample_steps,), selected_log_prob.size()
        log_prob = log_prob.reshape(label.size(0), self.num_choices, self.sample_steps).transpose(1, 2).mean(dim=-1)
        # reward2 = - (log_prob.reshape(-1) * (selected_log_prob * (1 - ignore_mask).to(log_prob.dtype))).sum() / (
        #     self.sample_steps * batch)
        reward2 = -(log_prob.reshape(-1) * selected_log_prob).sum() / (self.sample_steps * label.size(0))

        return _loss - reward2, _logits

    def simple_step(self, hidden, q_vec, label, prob):
        batch, max_sen, hidden_dim = hidden.size()
        assert q_vec.size() == (batch, 1, hidden_dim)
        assert prob.size() == (batch, 1, max_sen)

        h = prob.bmm(hidden)
        _logits = self.classifier(torch.cat([h, q_vec], dim=2)).view(-1, self.num_choices)
        if label is not None:
            _loss = F.cross_entropy(_logits, label)
        else:
            _loss = _logits.new_zeros(1)
        return _loss, _logits
def extract_embeddings(dataname, layer_indexes=[-1, -2, -3, -4], bert_model='bert-large-uncased',
                       max_seq_length=128, batch_size=32, data_dir='../data/'):
    input_corpus = data_dir + dataname + '/corpus.txt'
    input_vocab = data_dir + dataname + '/vocab.txt'
    output_embedding = data_dir + dataname + '/bert_embeddings.pickle'
    # output_tokenized_corpus = data_dir + dataname + '/tokenized_corpus.pickle'

    reader = open(input_corpus, 'r', encoding='utf8')
    total_lines = get_num_lines(input_corpus)
    vocab = set([
        each.split('\t')[0] + '||'
        for each in open(input_vocab, 'r', encoding='utf8').read().split('\n')
    ])

    tokenizer = BertTokenizer.from_pretrained(bert_model)
    model = BertModel.from_pretrained(bert_model)
    model.cuda()
    model.eval()

    batch_sentences = {}
    fout = open(output_embedding, 'wb')
    with torch.no_grad():
        for sent_id, line in enumerate(tqdm(reader, total=total_lines)):
            if len(batch_sentences) < batch_size and sent_id < total_lines - 1:
                line = line.strip()
                terms = line.split(' ')
                intersection = set(terms).intersection(vocab)
                if intersection:
                    raw_line = re.sub("\|\|.+?\|\|", '', line).replace('_', ' ')
                    sent_info = get_sent_info(raw_line, intersection, sent_id, max_seq_length)
                    batch_sentences[sent_id] = {
                        'raw_sent': raw_line,
                        'sent_info': sent_info
                    }
            else:
                batch_tokenized_sents = [
                    tokenize_sent(sent_id, values['raw_sent'], max_seq_length, tokenizer)
                    for sent_id, values in batch_sentences.items()
                ]
                batch_input_ids = torch.tensor(
                    [sent['input_ids'] for sent in batch_tokenized_sents], dtype=torch.long).cuda()
                batch_input_mask = torch.tensor(
                    [sent['input_mask'] for sent in batch_tokenized_sents], dtype=torch.long).cuda()

                all_encoder_layers, _ = model(batch_input_ids, attention_mask=batch_input_mask)

                ### performance bottleneck
                # s = time.time()
                emb_encoder_layers = torch.stack(all_encoder_layers)[layer_indexes]
                for idx, sent_id in enumerate(batch_sentences):
                    sent_info = batch_sentences[sent_id]['sent_info']
                    [term_info.__setitem__(
                        'embedding',
                        emb_encoder_layers[:, idx, term_info['loc'][0]:term_info['loc'][1], :]
                        .detach().cpu().numpy().astype(np.float16))
                     for term_info in sent_info]
                # print(time.time()-s)
                ### performance bottleneck

                pickle.dump(batch_sentences, fout)
                batch_sentences = {}

                line = line.strip()
                terms = line.split(' ')
                intersection = set(terms).intersection(vocab)
                if intersection:
                    raw_line = re.sub("\|\|.+?\|\|", '', line).replace('_', ' ')
                    sent_info = get_sent_info(raw_line, intersection, sent_id, max_seq_length)
                    batch_sentences[sent_id] = {
                        'raw_sent': raw_line,
                        'sent_info': sent_info
                    }
    reader.close()
    fout.close()
def __init__(self, corpus=None, emb_size=256, jemb_drop_out=0.1, bert_model='bert-base-uncased',
             coordmap=True, leaky=False, dataset=None, light=False, seg=False, att=False, args=None):
    super(grounding_model, self).__init__()
    self.coordmap = coordmap
    self.light = light
    self.seg = seg
    self.att = att
    self.lstm = (corpus is not None)
    self.emb_size = emb_size
    if bert_model == 'bert-base-uncased':
        self.textdim = 768
    else:
        self.textdim = 1024
    ## Visual model
    self.visumodel = Darknet(config_path='./model/yolov3.cfg')
    self.visumodel.load_weights('./saved_models/yolov3.weights')
    # self.visumodel = torch.hub.load('pytorch/vision:v0.6.0', 'deeplabv3_resnet101', pretrained=True)
    # self.visumodel = deeplabv3_resnet101(pretrained=False, progress=True, num_classes=21, aux_loss=None)
    self.intmd_fea = []
    ## Text model
    if self.lstm:
        self.textdim, self.embdim = 1024, 512
        self.textmodel = RNNEncoder(vocab_size=len(corpus),
                                    word_embedding_size=self.embdim,
                                    word_vec_size=self.textdim // 2,
                                    hidden_size=self.textdim // 2,
                                    bidirectional=True,
                                    input_dropout_p=0.2,
                                    variable_lengths=True)
    else:
        self.textmodel = BertModel.from_pretrained(bert_model)
    ## Mapping module
    self.mapping_visu = nn.Sequential(OrderedDict([
        ('0', ConvBatchNormReLU(1024, emb_size, 1, 1, 0, 1, leaky=leaky)),
        ('1', ConvBatchNormReLU(512, emb_size, 1, 1, 0, 1, leaky=leaky)),
        ('2', ConvBatchNormReLU(256, emb_size, 1, 1, 0, 1, leaky=leaky))
    ]))
    self.mapping_lang = torch.nn.Sequential(
        nn.Linear(self.textdim, emb_size),
        nn.BatchNorm1d(emb_size),
        nn.ReLU(),
        nn.Dropout(jemb_drop_out),
        nn.Linear(emb_size, emb_size),
        nn.BatchNorm1d(emb_size),
        nn.ReLU(),
    )
    embin_size = emb_size * 2
    if self.coordmap:
        embin_size += 8
    if self.light:
        self.fcn_emb = nn.Sequential(OrderedDict([
            ('0', torch.nn.Sequential(
                ConvBatchNormReLU(embin_size, emb_size, 1, 1, 0, 1, leaky=leaky),
            )),
            ('1', torch.nn.Sequential(
                ConvBatchNormReLU(embin_size, emb_size, 1, 1, 0, 1, leaky=leaky),
            )),
            ('2', torch.nn.Sequential(
                ConvBatchNormReLU(embin_size, emb_size, 1, 1, 0, 1, leaky=leaky),
            )),
        ]))
        self.fcn_out = nn.Sequential(OrderedDict([
            ('0', torch.nn.Sequential(
                nn.Conv2d(emb_size, 3 * 5, kernel_size=1),
            )),
            ('1', torch.nn.Sequential(
                nn.Conv2d(emb_size, 3 * 5, kernel_size=1),
            )),
            ('2', torch.nn.Sequential(
                nn.Conv2d(emb_size, 3 * 5, kernel_size=1),
            )),
        ]))
    else:
        self.fcn_emb = nn.Sequential(OrderedDict([
            ('0', torch.nn.Sequential(
                ConvBatchNormReLU(embin_size, emb_size, 1, 1, 0, 1, leaky=leaky),
                # Self_Attn(emb_size,'relu'),
                ConvBatchNormReLU(emb_size, emb_size, 3, 1, 1, 1, leaky=leaky),
                # Self_Attn(emb_size, 'relu'),
                ConvBatchNormReLU(emb_size, emb_size, 1, 1, 0, 1, leaky=leaky),
                # NLBlockND(in_channels=emb_size, dimension=2)
            )),
            ('1', torch.nn.Sequential(
                ConvBatchNormReLU(embin_size, emb_size, 1, 1, 0, 1, leaky=leaky),
                # Self_Attn(emb_size, 'relu'),
                ConvBatchNormReLU(emb_size, emb_size, 3, 1, 1, 1, leaky=leaky),
                # Self_Attn(emb_size, 'relu'),
                ConvBatchNormReLU(emb_size, emb_size, 1, 1, 0, 1, leaky=leaky),
                # NLBlockND(in_channels=emb_size, dimension=2)
            )),
            ('2', torch.nn.Sequential(
                ConvBatchNormReLU(embin_size, emb_size, 1, 1, 0, 1, leaky=leaky),
                # Self_Attn(emb_size, 'relu'),
                ConvBatchNormReLU(emb_size, emb_size, 3, 1, 1, 1, leaky=leaky),
                # Self_Attn(emb_size, 'relu'),
                ConvBatchNormReLU(emb_size, emb_size, 1, 1, 0, 1, leaky=leaky),
                # NLBlockND(in_channels=emb_size, dimension=2)
            )),
        ]))
        self.fcn_out = nn.Sequential(OrderedDict([
            ('0', torch.nn.Sequential(
                ConvBatchNormReLU(emb_size, emb_size // 2, 1, 1, 0, 1, leaky=leaky),
                nn.Conv2d(emb_size // 2, 3, kernel_size=1),
            )),
            ('1', torch.nn.Sequential(
                ConvBatchNormReLU(emb_size, emb_size // 2, 1, 1, 0, 1, leaky=leaky),
                nn.Conv2d(emb_size // 2, 3, kernel_size=1),
            )),
            ('2', torch.nn.Sequential(
                ConvBatchNormReLU(emb_size, emb_size // 2, 1, 1, 0, 1, leaky=leaky),
                nn.Conv2d(emb_size // 2, 3, kernel_size=1),
            )),
        ]))
    if self.att:
        self.attn_emb = Self_Attn(4, emb_size, 'relu')
        # self.fcn_emb = torch.nn.Sequential(
        #     ConvBatchNormReLU(embin_size, emb_size, 1, 1, 0, 1, leaky=leaky),
        #     # Self_Attn(emb_size,'relu'),
        #     ConvBatchNormReLU(emb_size, emb_size, 3, 1, 1, 1, leaky=leaky),
        #     # Self_Attn(emb_size, 'relu'),
        #     ConvBatchNormReLU(emb_size, emb_size, 1, 1, 0, 1, leaky=leaky),)
    self.fcn_out_offset = nn.Sequential(OrderedDict([
        ('0', torch.nn.Sequential(
            ConvBatchNormReLU(emb_size, emb_size // 2, 1, 1, 0, 1, leaky=leaky),
            nn.Conv2d(emb_size // 2, 2, kernel_size=1),
        )),
        ('1', torch.nn.Sequential(
            ConvBatchNormReLU(emb_size, emb_size // 2, 1, 1, 0, 1, leaky=leaky),
            nn.Conv2d(emb_size // 2, 2, kernel_size=1),
        )),
        ('2', torch.nn.Sequential(
            ConvBatchNormReLU(emb_size, emb_size // 2, 1, 1, 0, 1, leaky=leaky),
            nn.Conv2d(emb_size // 2, 2, kernel_size=1),
        )),
    ]))
    self.fcn_out_center = nn.Sequential(OrderedDict([
        ('0', torch.nn.Sequential(
            ConvBatchNormReLU(emb_size, emb_size, 1, 1, 0, 1, leaky=leaky),
            ConvBatchNormReLU(emb_size, emb_size, 3, 1, 1, 1, leaky=leaky),
            # NLBlockND(in_channels=emb_size, dimension=2),
            ConvBatchNormReLU(emb_size, emb_size // 2, 1, 1, 0, 1, leaky=leaky),
            nn.Conv2d(emb_size // 2, int(args.size / 32) * int(args.size / 32), kernel_size=1),
        )),
        ('1', torch.nn.Sequential(
            ConvBatchNormReLU(emb_size, emb_size, 1, 1, 0, 1, leaky=leaky),
            ConvBatchNormReLU(emb_size, emb_size, 3, 1, 1, 1, leaky=leaky),
            # NLBlockND(in_channels=emb_size, dimension=2),
            ConvBatchNormReLU(emb_size, emb_size // 2, 1, 1, 0, 1, leaky=leaky),
            nn.Conv2d(emb_size // 2, int(args.size / 16) * int(args.size / 16), kernel_size=1),
        )),
        ('2', torch.nn.Sequential(
            ConvBatchNormReLU(emb_size, emb_size, 1, 1, 0, 1, leaky=leaky),
            ConvBatchNormReLU(emb_size, emb_size, 3, 1, 1, 1, leaky=leaky),
            # NLBlockND(in_channels=emb_size, dimension=2),
            ConvBatchNormReLU(emb_size, emb_size, 1, 1, 0, 1, leaky=leaky),
            nn.Conv2d(emb_size, int(args.size / 8) * int(args.size / 8), kernel_size=1),
        )),
    ]))
    # if self.seg:
    self.segmentation = ReferCam()
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--input_file", default=None, type=str, required=True)
    parser.add_argument("--output_file", default=None, type=str, required=True)
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")

    # Other parameters
    # parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using uncased model.")
    # parser.add_argument("--layers", default="-2", type=str)
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
                             "than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.")
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")

    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {} n_gpu: {}".format(device, n_gpu))

    # layer_indexes = [int(x) for x in args.layers.split(",")]
    layer_index = -2  # second-to-last, which showed reasonable performance in BERT paper

    dset = BertSingleSeqDataset(args.input_file, args.bert_model, args.max_seq_length)

    model = BertModel.from_pretrained(args.bert_model)
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    eval_sampler = SequentialSampler(dset)
    eval_dataloader = DataLoader(dset, sampler=eval_sampler, batch_size=args.batch_size,
                                 collate_fn=pad_collate, num_workers=8)

    model.eval()
    torch.set_grad_enabled(False)
    with h5py.File(args.output_file, "w") as h5_f:
        for batch in tqdm(eval_dataloader):
            input_ids = batch.token_ids.to(device)
            input_mask = batch.token_ids_mask.to(device)
            unique_ids = batch.unique_id

            all_encoder_layers, _ = model(input_ids, token_type_ids=None,
                                          attention_mask=input_mask)  # (#layers, bsz, #tokens, hsz)
            layer_output = all_encoder_layers[layer_index].detach().cpu().numpy()  # (bsz, #tokens, hsz)
            print("layer_output", layer_output.shape)
            for batch_idx, unique_id in enumerate(unique_ids):
                original_token_embeddings = get_original_token_embedding(
                    layer_output[batch_idx],
                    batch.token_ids_mask[batch_idx],
                    batch.token_map[batch_idx])
                h5_f.create_dataset(str(unique_id), data=original_token_embeddings, dtype=np.float32)
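# A small read-back sketch (not part of the script above): each HDF5 dataset is keyed
# by the example's unique_id written in main(); the file path below is illustrative.
# import h5py
# with h5py.File("bert_features.h5", "r") as h5_f:
#     token_embeddings = h5_f[str(unique_id)][...]  # (#original tokens, hidden_size) float32 array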
class BertQAYesnoHierarchicalHardRACE(BertPreTrainedModel):
    """
    Hard: Hard attention, using gumbel softmax of reinforcement learning.
    """

    def __init__(self, config, evidence_lambda=0.8, num_choices=4, use_gumbel=True, freeze_bert=False):
        super(BertQAYesnoHierarchicalHardRACE, self).__init__(config)
        logger.info(f'The model {self.__class__.__name__} is loading...')
        logger.info(f'The coefficient of evidence loss is {evidence_lambda}')
        logger.info(f'Currently the number of choices is {num_choices}')
        logger.info(f'Use gumbel: {use_gumbel}')
        logger.info(f'If freeze BERT\'s parameters: {freeze_bert} ')

        layers.set_seq_dropout(True)
        layers.set_my_dropout_prob(config.hidden_dropout_prob)
        rep_layers.set_seq_dropout(True)
        rep_layers.set_my_dropout_prob(config.hidden_dropout_prob)

        self.bert = BertModel(config)
        if freeze_bert:
            for p in self.bert.parameters():
                p.requires_grad = False

        # self.doc_sen_self_attn = layers.LinearSelfAttnAllennlp(config.hidden_size)
        # self.que_self_attn = layers.LinearSelfAttn(config.hidden_size)
        self.doc_sen_self_attn = rep_layers.LinearSelfAttention(config.hidden_size)
        self.que_self_attn = rep_layers.LinearSelfAttention(config.hidden_size)

        self.word_similarity = layers.AttentionScore(config.hidden_size, 250, do_similarity=False)
        self.vector_similarity = layers.AttentionScore(config.hidden_size, 250, do_similarity=False)

        self.classifier = nn.Linear(config.hidden_size * 2, 1)
        self.evidence_lam = evidence_lambda
        self.use_gumbel = use_gumbel
        self.num_choices = num_choices

        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
                sentence_span_list=None, sentence_ids=None, max_sentences: int = 0):
        flat_input_ids = input_ids.view(-1, input_ids.size(-1))
        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None

        sequence_output, _ = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask,
                                       output_all_encoded_layers=False)

        # mask: 1 for masked value and 0 for true value
        # doc, que, doc_mask, que_mask = layers.split_doc_que(sequence_output, token_type_ids, attention_mask)
        doc_sen, que, doc_sen_mask, que_mask, sentence_mask = \
            rep_layers.split_doc_sen_que(sequence_output, flat_token_type_ids, flat_attention_mask,
                                         sentence_span_list, max_sentences=max_sentences)

        batch, max_sen, doc_len = doc_sen_mask.size()
        # que_len = que_mask.size(1)

        # que_vec = layers.weighted_avg(que, self.que_self_attn(que, que_mask)).view(batch, 1, -1)
        que_vec = self.que_self_attn(que, que_mask).view(batch, 1, -1)

        doc = doc_sen.reshape(batch, max_sen * doc_len, -1)
        word_sim = self.word_similarity(que_vec, doc).view(batch * max_sen, doc_len)
        doc = doc_sen.reshape(batch * max_sen, doc_len, -1)
        doc_mask = doc_sen_mask.reshape(batch * max_sen, doc_len)
        word_hidden = rep_layers.masked_softmax(word_sim, doc_mask, dim=1).unsqueeze(1).bmm(doc)
        word_hidden = word_hidden.view(batch, max_sen, -1)

        doc_vecs = self.doc_sen_self_attn(doc, doc_mask).view(batch, max_sen, -1)

        sentence_sim = self.vector_similarity(que_vec, doc_vecs)
        sentence_hidden = self.hard_sample(sentence_sim, use_gumbel=self.use_gumbel, dim=-1,
                                           hard=True, mask=sentence_mask).bmm(word_hidden).squeeze(1)
        choice_logits = self.classifier(
            torch.cat([sentence_hidden, que_vec.squeeze(1)], dim=1)).reshape(-1, self.num_choices)

        sentence_scores = rep_layers.masked_softmax(sentence_sim, sentence_mask, dim=-1).squeeze_(1)
        output_dict = {
            'choice_logits': choice_logits.float(),
            'sentence_logits': sentence_scores.reshape(
                choice_logits.size(0), self.num_choices, max_sen).detach().cpu().float(),
        }
        loss = 0
        if labels is not None:
            choice_loss = F.cross_entropy(choice_logits, labels)
            loss += choice_loss
        if sentence_ids is not None:
            log_sentence_sim = rep_layers.masked_log_softmax(sentence_sim.squeeze(1), sentence_mask, dim=-1)
            sentence_loss = F.nll_loss(log_sentence_sim, sentence_ids.view(batch),
                                       reduction='sum', ignore_index=-1)
            loss += self.evidence_lam * sentence_loss / choice_logits.size(0)
        output_dict['loss'] = loss
        return output_dict

    def hard_sample(self, logits, use_gumbel, dim=-1, hard=True, mask=None):
        if use_gumbel:
            if self.training:
                probs = rep_layers.gumbel_softmax(logits, mask=mask, hard=hard, dim=dim)
                return probs
            else:
                probs = rep_layers.masked_softmax(logits, mask, dim=dim)
                index = probs.max(dim, keepdim=True)[1]
                y_hard = torch.zeros_like(logits).scatter_(dim, index, 1.0)
                return y_hard
        else:
            pass
class SANBertNetwork(nn.Module):
    def __init__(self, opt, bert_config=None):
        super(SANBertNetwork, self).__init__()
        self.dropout_list = nn.ModuleList()
        self.bert_config = BertConfig.from_dict(opt)
        self.bert = BertModel(self.bert_config)
        if opt.get('dump_feature', False):
            self.opt = opt
            return
        if opt['update_bert_opt'] > 0:
            for p in self.bert.parameters():
                p.requires_grad = False
        mem_size = self.bert_config.hidden_size
        self.decoder_opt = opt['answer_opt']
        self.scoring_list = nn.ModuleList()
        labels = [int(ls) for ls in opt['label_size'].split(',')]
        task_dropout_p = opt['tasks_dropout_p']
        self.bert_pooler = None

        for task, lab in enumerate(labels):
            decoder_opt = self.decoder_opt[task]
            dropout = DropoutWrapper(task_dropout_p[task], opt['vb_dropout'])
            self.dropout_list.append(dropout)
            if decoder_opt == 1:
                out_proj = SANClassifier(mem_size, mem_size, lab, opt, prefix='answer', dropout=dropout)
                self.scoring_list.append(out_proj)
            else:
                out_proj = nn.Linear(self.bert_config.hidden_size, lab)
                self.scoring_list.append(out_proj)

        self.opt = opt
        self._my_init()
        self.set_embed(opt)

    def _my_init(self):
        def init_weights(module):
            if isinstance(module, (nn.Linear, nn.Embedding)):
                # Slightly different from the TF version which uses truncated_normal for initialization
                # cf https://github.com/pytorch/pytorch/pull/5617
                module.weight.data.normal_(mean=0.0,
                                           std=self.bert_config.initializer_range * self.opt['init_ratio'])
            elif isinstance(module, BertLayerNorm):
                # Slightly different from the BERT pytorch version, which should be a bug.
                # Note that it only affects on training from scratch. For detailed discussions, please contact xiaodl@.
                # Layer normalization (https://arxiv.org/abs/1607.06450)
                # support both old/latest version
                if 'beta' in dir(module) and 'gamma' in dir(module):
                    module.beta.data.zero_()
                    module.gamma.data.fill_(1.0)
                else:
                    module.bias.data.zero_()
                    module.weight.data.fill_(1.0)
            if isinstance(module, nn.Linear):
                module.bias.data.zero_()

        self.apply(init_weights)

    def nbert_layer(self):
        return len(self.bert.encoder.layer)

    def freeze_layers(self, max_n):
        assert max_n < self.nbert_layer()
        for i in range(0, max_n):
            self.freeze_layer(i)

    def freeze_layer(self, n):
        assert n < self.nbert_layer()
        layer = self.bert.encoder.layer[n]
        for p in layer.parameters():
            p.requires_grad = False

    def set_embed(self, opt):
        bert_embeddings = self.bert.embeddings
        emb_opt = opt['embedding_opt']
        if emb_opt == 1:
            for p in bert_embeddings.word_embeddings.parameters():
                p.requires_grad = False
        elif emb_opt == 2:
            for p in bert_embeddings.position_embeddings.parameters():
                p.requires_grad = False
        elif emb_opt == 3:
            for p in bert_embeddings.token_type_embeddings.parameters():
                p.requires_grad = False
        elif emb_opt == 4:
            for p in bert_embeddings.token_type_embeddings.parameters():
                p.requires_grad = False
            for p in bert_embeddings.position_embeddings.parameters():
                p.requires_grad = False

    def forward(self, input_ids, token_type_ids, attention_mask, premise_mask=None, hyp_mask=None, task_id=0):
        all_encoder_layers, pooled_output = self.bert(input_ids, token_type_ids, attention_mask)
        sequence_output = all_encoder_layers[-1]
        if self.bert_pooler is not None:
            pooled_output = self.bert_pooler(sequence_output)
        decoder_opt = self.decoder_opt[task_id]
        if decoder_opt == 1:
            max_query = hyp_mask.size(1)
            assert max_query > 0
            assert premise_mask is not None
            assert hyp_mask is not None
            hyp_mem = sequence_output[:, :max_query, :]
            logits = self.scoring_list[task_id](sequence_output, hyp_mem, premise_mask, hyp_mask)
        else:
            pooled_output = self.dropout_list[task_id](pooled_output)
            logits = self.scoring_list[task_id](pooled_output)
        return logits
def inference(params):
    voc, all_def_embs, id2def = load_req(params)
    mapping = load_model(params, torch.tensor(voc.embedding))
    mapping.eval()
    torch.set_grad_enabled(False)

    if params.model_type == 'baseline':
        import tensorflow_hub as hub
        import tensorflow as tf
        sent_embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder-large/3")
        input_sent = tf.placeholder(tf.string, shape=(None))
        encoded = sent_embed(input_sent)
        with tf.Session() as sess:
            sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
            sess.graph.finalize()
            while True:
                _, w_id, ctx = get_input(voc)
                if w_id == None:
                    continue
                ctx_emb = np.round(
                    sess.run(encoded, {input_sent: [ctx]}).astype(np.float64), 6)  # (1, 512)
                answer(mapping, all_def_embs, id2def, ctx_emb, w_id)
    elif params.model_type == 'ELMo':
        from allennlp.commands.elmo import ElmoEmbedder
        elmo = ElmoEmbedder()
        while True:
            word, w_id, ctx = get_input(voc)
            if w_id == None:
                continue
            ctx = ctx.split()
            ctx_emb = elmo.embed_sentence(ctx)  # (3, seq_len, 1024)
            word_pos, _ = find_varaint_word(word, ctx)
            if word_pos == None:
                continue
            ctx_emb = ctx_emb[:, word_pos][np.newaxis, :]
            answer(mapping, all_def_embs, id2def, ctx_emb, w_id)
    else:
        from pytorch_pretrained_bert.tokenization import BertTokenizer
        from pytorch_pretrained_bert.modeling import BertModel
        tokenizer = BertTokenizer.from_pretrained(params.bert_model, do_lower_case=True)
        model = BertModel.from_pretrained(params.bert_model)
        model.to(device)
        model.eval()
        while True:
            word, w_id, ctx = get_input(voc)
            if w_id == None:
                continue
            _, word = find_varaint_word(word, ctx.split())
            if word == None:
                continue
            input_ids, input_mask, key_ids = convert_examples_to_features(word, ctx, 128, tokenizer)
            all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)
            ctx_emb = np.zeros((params.n_feats, 3, params.emb1_dim))
            for j, ly_id in enumerate(reversed(range(-params.n_feats, 0))):  # -1, -2, -3 ...
                layer_output = all_encoder_layers[ly_id].detach().cpu().numpy().astype(np.float64).squeeze()
                ctx_emb[j, :len(key_ids)] = np.round(layer_output[key_ids], 6).tolist()  # (3, 768/1024)
            ctx_emb = np.transpose(ctx_emb, (0, 2, 1))  # (n_feats, 768/1024, 3)
            answer(mapping, all_def_embs, id2def, ctx_emb, w_id)
def __init__(self, label_num=4):
    super(FlatBertModel, self).__init__()
    self.sentences_encoder = BertModel.from_pretrained('bert-base-cased')
    self.hidden_size = self.sentences_encoder.config.hidden_size
    self.classifer = nn.Linear(self.hidden_size, label_num)
def main(args):
    # to pick up here.
    if args.data_type == "matres":
        label_map = matres_label_map
    elif args.data_type == "tbd":
        label_map = tbd_label_map
    all_labels = list(OrderedDict.fromkeys(label_map.values()))

    args._label_to_id = OrderedDict([(all_labels[l], l) for l in range(len(all_labels))])
    args._id_to_label = OrderedDict([(l, all_labels[l]) for l in range(len(all_labels))])
    print(args._label_to_id)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    if args.load_model_dir:
        output_model_file = os.path.join(args.load_model_dir, "pytorch_model.bin")
        model_state_dict = torch.load(output_model_file)
        bert_model = BertModel.from_pretrained('bert-base-uncased', state_dict=model_state_dict)
    else:
        bert_model = BertModel.from_pretrained('bert-base-uncased')

    train_data = pickle.load(open(args.data_dir + "/train.pickle", "rb"))
    print("process train...")
    data = [
        parallel(v, k, args, tokenizer, bert_model)
        for k, v in train_data.items()
    ]

    if args.data_type in ['tbd']:
        print("process dev...")
        dev_data = pickle.load(open(args.data_dir + "/dev.pickle", "rb"))
        dev_data = [
            parallel(v, k, args, tokenizer, bert_model)
            for k, v in dev_data.items()
        ]
        data += dev_data

    # doc splits
    if args.data_type in ['matres']:
        train_docs, dev_docs = train_test_split(args.train_docs, test_size=0.2, random_state=args.seed)
    # TBDense data has given splits on train/dev/test
    else:
        train_docs = args.train_docs
        dev_docs = args.dev_docs

    if not os.path.isdir(args.save_data_dir):
        os.mkdir(args.save_data_dir)

    if 'all' in args.split:
        print("process test...")
        test_data = pickle.load(open(args.data_dir + "/test.pickle", "rb"))
        test_data = [
            parallel(v, k, args, tokenizer, bert_model)
            for k, v in test_data.items()
        ]
        print(len(test_data))
        print(args.save_data_dir)
        with open(args.save_data_dir + '/test.pickle', 'wb') as handle:
            pickle.dump(test_data, handle, protocol=pickle.HIGHEST_PROTOCOL)
        handle.close()

    split_and_save(train_docs, dev_docs, data, args.seed, args.save_data_dir)

    # quick trick to reduce number of tokens in GloVe
    reduce_vocab(data + test_data, args.save_data_dir, args.w2i, args.glove)

    return
    # this forward is just for predict, not for train
    # dont confuse this with _forward_alg above.
    def forward(self, input_ids, segment_ids, input_mask):
        # Get the emission scores from the BiLSTM
        bert_feats = self._get_bert_features(input_ids, segment_ids, input_mask)

        # Find the best path, given the features.
        value, score, label_seq_ids = self._viterbi_decode(bert_feats)
        return value, score, label_seq_ids


start_label_id = conllProcessor.get_start_label_id()
stop_label_id = conllProcessor.get_stop_label_id()

bert_model = BertModel.from_pretrained(bert_model_scale)
model = BERT_CRF_NER(bert_model, start_label_id, stop_label_id, len(label_list),
                     max_seq_length, batch_size, device)

#%%
if load_checkpoint and os.path.exists(output_dir + '/ner_bert_crf_checkpoint.pt'):
    checkpoint = torch.load(output_dir + '/ner_bert_crf_checkpoint.pt', map_location='cpu')
    start_epoch = checkpoint['epoch'] + 1
    valid_acc_prev = checkpoint['valid_acc']
    valid_f1_prev = checkpoint['valid_f1']
    pretrained_dict = checkpoint['model_state']
    net_state_dict = model.state_dict()
    pretrained_dict_selected = {
        k: v
def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name: str):
    """
    :param model: BertModel Pytorch model instance to be converted
    :param ckpt_dir: Tensorflow model directory
    :param model_name: model name
    :return:

    Currently supported HF models:
        Y BertModel
        N BertForMaskedLM
        N BertForPreTraining
        N BertForMultipleChoice
        N BertForNextSentencePrediction
        N BertForSequenceClassification
        N BertForQuestionAnswering
    """

    tensors_to_transpose = ("dense.weight", "attention.self.query", "attention.self.key",
                            "attention.self.value")

    var_map = (('layer.', 'layer_'),
               ('word_embeddings.weight', 'word_embeddings'),
               ('position_embeddings.weight', 'position_embeddings'),
               ('token_type_embeddings.weight', 'token_type_embeddings'),
               ('.', '/'),
               ('LayerNorm/weight', 'LayerNorm/gamma'),
               ('LayerNorm/bias', 'LayerNorm/beta'),
               ('weight', 'kernel'))

    if not os.path.isdir(ckpt_dir):
        os.makedirs(ckpt_dir)

    state_dict = model.state_dict()

    def to_tf_var_name(name: str):
        for patt, repl in iter(var_map):
            name = name.replace(patt, repl)
        return 'bert/{}'.format(name)

    def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session):
        tf_dtype = tf.dtypes.as_dtype(tensor.dtype)
        tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name,
                                 initializer=tf.zeros_initializer())
        session.run(tf.variables_initializer([tf_var]))
        session.run(tf_var)
        return tf_var

    tf.reset_default_graph()
    with tf.Session() as session:
        for var_name in state_dict:
            tf_name = to_tf_var_name(var_name)
            torch_tensor = state_dict[var_name].numpy()
            if any([x in var_name for x in tensors_to_transpose]):
                torch_tensor = torch_tensor.T
            tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session)
            tf.keras.backend.set_value(tf_var, torch_tensor)
            tf_weight = session.run(tf_var)
            print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor)))

        saver = tf.train.Saver(tf.trainable_variables())
        saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt"))
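# A usage sketch for the converter above; the checkpoint directory and model name are illustrative:
# model = BertModel.from_pretrained("bert-base-uncased")
# convert_pytorch_checkpoint_to_tf(model=model,
#                                  ckpt_dir="tf_checkpoint/",
#                                  model_name="bert-base-uncased")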
random.shuffle(train_examples)
random.shuffle(val_examples)

train_dataloader = DataLoader(dataset=DomainData(train_examples, label_list, max_seq_length, tokenizer),
                              batch_size=batch_size, shuffle=True, drop_last=False)
val_dataset = DomainData(val_examples, label_list, max_seq_length, tokenizer)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size)

num_train_steps = int(len(train_examples) / batch_size * total_epoch_num)

bert = BertModel.from_pretrained(bert_model, PYTORCH_PRETRAINED_BERT_CACHE)
generator = Generator1(noise_size=noise_size, output_size=768, hidden_sizes=[768], dropout_rate=0.1)
discriminator = Discriminator(input_size=768, hidden_sizes=[768], num_labels=len(label_list), dropout_rate=0.1)

bert.to(device)
if multi_gpu:
    bert = torch.nn.DataParallel(bert, device_ids=device_ids)
generator.to(device)
discriminator.to(device)