Example #1
def eval(args):
    """Decode the test set with the trained seq2seq model and write predictions as JSON lines."""
    batch_size = 32

    enc = RNNEncoder(300, args.embedding_file)
    dec = RNNDecoder(300, args.embedding_file)

    device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

    model = Seq2Seq(enc, dec, device).to(device)
    ckpt = torch.load(args.model_path, map_location=device)
    model.load_state_dict(ckpt['state_dict'])

    model.eval()

    embedding_matrix = pickle.load(open(args.embedding_file, 'rb'))
    tokenizer = Tokenizer(lower=True)
    tokenizer.set_vocab(embedding_matrix.vocab)
    eval_data = pickle.load(open(args.test_data_path, 'rb'))
    eval_loader = DataLoader(eval_data,
                             batch_size=batch_size,
                             num_workers=0,
                             shuffle=False,
                             collate_fn=eval_data.collate_fn)

    output_file = open(args.output_path, 'w')
    prediction = {}
    for batch in tqdm(eval_loader):
        # second argument 0 presumably disables teacher forcing
        pred = model(batch, 0)
        pred = torch.argmax(pred, dim=2)
        # pred: (batch, seq_len) token ids

        for i in range(len(pred)):
            # truncate at the first </s> and drop the leading start token
            prediction[batch['id'][i]] = tokenizer.decode(
                pred[i]).split('</s>')[0].split(' ', 1)[1]
    pred_output = [
        json.dumps({
            'id': key,
            'predict': value
        })
        for key, value in sorted(prediction.items(), key=lambda item: item[0])
    ]
    output_file.write('\n'.join(pred_output))
    output_file.write('\n')
    output_file.close()
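
The eval function above only reads args.embedding_file, args.model_path, args.test_data_path, and args.output_path, so a minimal driver could look like the sketch below. The CLI flag names are illustrative assumptions; only the attribute names are taken from the code.

import argparse

if __name__ == '__main__':
    # Illustrative driver; the flag names are assumptions, only the attribute
    # names (embedding_file, model_path, test_data_path, output_path) are
    # required by eval() above.
    parser = argparse.ArgumentParser(description='Evaluate a seq2seq model')
    parser.add_argument('--embedding_file', required=True)
    parser.add_argument('--model_path', required=True)
    parser.add_argument('--test_data_path', required=True)
    parser.add_argument('--output_path', required=True)
    eval(parser.parse_args())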
Example #2
def eval(args):
    """Decode the test set with the attention-based seq2seq model and write predictions as JSON lines."""
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    BATCH_SIZE = 32

    ENC_HID_DIM = 128
    DEC_HID_DIM = 128
    N_LAYERS = 1
    ENC_DROPOUT = 0.5
    DEC_DROPOUT = 0.5
    PADDING_INDEX = 0

    embedding = pickle.load(open(args.embedding_file, 'rb'))
    tokenizer = Tokenizer(lower=True)
    tokenizer.set_vocab(embedding.vocab)
    embedding_matrix = embedding.vectors.to(device)

    output_dim = len(embedding.vectors)
    embedding_dim = 300


    attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
    encoder = Encoder(embedding_dim, ENC_HID_DIM, DEC_HID_DIM,
                      embedding_matrix, N_LAYERS, ENC_DROPOUT)
    decoder = Decoder(output_dim, embedding_dim,
                      ENC_HID_DIM, DEC_HID_DIM, embedding_matrix, N_LAYERS, DEC_DROPOUT, attn)

    model = Seq2Seq(encoder, decoder, PADDING_INDEX, device).to(device)


    ckpt = torch.load(args.model_path, map_location=device)
    model.load_state_dict(ckpt['state_dict'])
    model.eval()

    eval_data = pickle.load(open(args.test_data_path, 'rb'))
    eval_loader = DataLoader(eval_data,
                             batch_size=BATCH_SIZE,
                             num_workers=0,
                             shuffle=False,
                             collate_fn=eval_data.collate_fn)

    output_file = open(args.output_path, 'w')
    prediction = {}
    for batch in tqdm(eval_loader):
        # second argument 0 presumably disables teacher forcing
        pred, attention = model(batch, 0)
        pred = torch.argmax(pred, dim=2)
        # decoder output is (seq_len, batch); transpose so each row is one example
        pred = pred.permute(1, 0)

        for i in range(len(pred)):
            # truncate at the first </s> and drop the leading start token
            prediction[batch['id'][i]] = tokenizer.decode(
                pred[i]).split('</s>')[0].split(' ', 1)[1]
    pred_output = [
        json.dumps({'id': key, 'predict': value})
        for key, value in sorted(prediction.items(), key=lambda item: item[0])
    ]
    output_file.write('\n'.join(pred_output))
    output_file.write('\n')
    output_file.close()
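
Examples #1 and #2 post-process decoded token ids identically; below is a minimal sketch of that step as a standalone helper, assuming tokenizer.decode returns a space-joined string that begins with a start-of-sequence token.

def postprocess(tokenizer, ids):
    # Decode ids to text, keep everything before the first </s>,
    # then drop the leading start-of-sequence token.
    text = tokenizer.decode(ids).split('</s>')[0]
    return text.split(' ', 1)[1]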
Example #3
class Summarizer:
    """Use a test model to generate fielded query sentences from documents"""
    def __init__(self,
                 f_abs,
                 n_best=1,
                 min_length=1,
                 max_length=50,
                 beam_size=4,
                 bert_model='bert-base-uncased'):
        self.n_best = n_best
        self.min_length = min_length
        self.max_length = max_length
        self.beam_size = beam_size
        self.abs_model = self.load_abs_model(f_abs)
        self.eval()
        logger.info(f'Loading BERT Tokenizer [{bert_model}]...')
        self.tokenizerB = BertTokenizer.from_pretrained(bert_model)
        self.spt_ids_B, self.spt_ids_C, self.eos_mapping = get_special_tokens()
        logger.info('Loading custom Tokenizer for using WBMET embeddings')
        self.tokenizerC = Tokenizer(self.abs_model.args.vocab_size)
        self.tokenizerC.from_pretrained(self.abs_model.args.file_dec_emb)

    @staticmethod
    def load_abs_model(f_abs):
        """Load a pre-trained abs model"""
        logger.info(f'Loading an abstractive test model from {f_abs}...')
        data = torch.load(f_abs, map_location=lambda storage, loc: storage)
        mdl = AbstractiveSummarizer(data['args'])
        mdl.load_state_dict(data['model'])
        mdl.cuda()
        return mdl

    def translate(self, docs):
        """Translate a batch of documents."""
        batch_size = docs.inp.size(0)
        spt_ids = self.spt_ids_C
        decode_strategy = BeamSearch(self.beam_size, batch_size, self.n_best,
                                     self.min_length, self.max_length, spt_ids,
                                     self.eos_mapping)
        return self._translate_batch_with_strategy(docs, decode_strategy)

    def _translate_batch_with_strategy(self, batch, decode_strategy):
        """Translate a batch of documents step by step using cache

        :param batch (dict): A batch of documents
        :param decode_strategy (DecodeStrategy): A decode strategy for
            generating translations step by step. I.e., BeamSearch
        """

        # (1) Run the encoder on the src
        ext_scores, hidden_states = \
            self.abs_model.encoder(batch.inp,
                                   attention_mask=batch.mask_inp,
                                   token_type_ids=batch.segs)

        # (2) Prepare decoder and decode_strategy
        self.abs_model.decoder.init_state(batch.inp)
        field_signals = batch.tgt[:, 0]
        fn_map_state, memory_bank, memory_pad_mask = \
            decode_strategy.initialize(hidden_states[-1], batch.src_lens,
                                       field_signals)
        if fn_map_state is not None:
            self.abs_model.decoder.map_state(fn_map_state)

        # (3) Begin decoding step by step:
        for step in range(decode_strategy.max_length):
            decoder_input = decode_strategy.current_predictions.unsqueeze(-1)
            dec_out, attns = self.abs_model.decoder(decoder_input,
                                                    memory_bank,
                                                    memory_pad_mask,
                                                    step=step)
            log_probs = self.abs_model.generator(dec_out[:, -1, :].squeeze(1))
            # Beam advance
            decode_strategy.advance(log_probs, attns)

            any_finished = decode_strategy.is_finished.any()
            if any_finished:
                decode_strategy.update_finished()
                if decode_strategy.done:
                    break

            select_indices = decode_strategy.select_indices
            if any_finished:
                # Reorder states.
                memory_bank = memory_bank.index_select(0, select_indices)
                memory_pad_mask = memory_pad_mask.index_select(
                    0, select_indices)

            if self.beam_size > 1 or any_finished:
                self.abs_model.decoder.map_state(
                    lambda state, dim: state.index_select(dim, select_indices))
        res = {
            'batch': batch,
            'gold_scores': self._gold_score(batch, hidden_states[-1],
                                            batch.mask_inp),
            'scores': decode_strategy.scores,
            'predictions': decode_strategy.predictions,
            'ext_scores': ext_scores,
            'attentions': decode_strategy.attention
        }
        return res

    def results_to_translations(self, data):
        """Convert results into Translation object"""
        batch = data['batch']
        translations = []
        for i, did in enumerate(batch.did):
            src_input_ = batch.inp[i]
            src_ = self.tokenizerB.decode(src_input_)
            topic_ = \
                self.tokenizerC.convert_id_to_token(batch.tgt[i][0].item())
            pred_sents_ = [
                self.tokenizerC.decode(data['predictions'][i][n])
                for n in range(self.n_best)
            ]
            gold_sent_ = self.tokenizerC.decode(batch.tgt[i])
            x = Translation(did=did,
                            src_input=src_input_,
                            src=src_,
                            topic=topic_,
                            ext_scores=data['ext_scores'][i],
                            pred_sents=pred_sents_,
                            pred_scores=data['scores'][i],
                            gold_sent=gold_sent_,
                            gold_score=data['gold_scores'][i],
                            attentions=data['attentions'][i])
            translations.append(x)
        return translations

    def _gold_score(self, batch, memory_bank, memory_pad_mask):
        if hasattr(batch, 'tgt'):
            gs = self._score_target(batch, memory_bank, memory_pad_mask)
            self.abs_model.decoder.init_state(batch.inp)
        else:
            gs = [0] * batch.batch_size
        return gs

    def _score_target(self, batch, memory_bank, memory_pad_mask):
        tgt_in = batch.tgt[:, :-1]
        dec_out, _ = self.abs_model.decoder(tgt_in, memory_bank,
                                            memory_pad_mask)
        log_probs = self.abs_model.generator(dec_out)
        gold = batch.tgt[:, 1:]
        tgt_pad_mask = gold.eq(self.spt_ids_C['[PAD]'])
        log_probs[tgt_pad_mask] = 0
        gold_scores = log_probs.gather(2, gold.unsqueeze(-1))
        gold_scores = gold_scores.sum(dim=1).view(-1)
        return gold_scores.tolist()

    def eval(self):
        self.abs_model.eval()
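
A hedged usage sketch for Summarizer follows. The batch fields (inp, mask_inp, segs, tgt, src_lens, did) match what _translate_batch_with_strategy and results_to_translations read, but how such batches and the data iterator are built is not shown above, so those parts are assumptions.

# Minimal sketch; 'abs_model.pt' and test_iter are placeholders, not names
# from the original project.
summarizer = Summarizer(f_abs='abs_model.pt', n_best=1, beam_size=4)
for batch in test_iter:
    results = summarizer.translate(batch)
    for t in summarizer.results_to_translations(results):
        print(t.did, t.pred_sents[0])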
Example #4
# hparams, dataset, and tokenizer are assumed to be defined earlier in the script
with open(hparams.embedding_path, 'rb') as f:
    embedding = pickle.load(f)

tokenizer.set_vocab(embedding.vocab)

dataLoader = DataLoader(dataset,
                        hparams.batch_size,
                        shuffle=False,
                        collate_fn=dataset.collate_fn)

model = Model(hparams)
model.load(hparams.load_model_path)

st_time = datetime.now()
predicts = model.eval(dataLoader)

results = []
for predict in predicts:
    res = {
        "id": predict["id"],
        "predict": tokenizer.decode(predict["predict"])
    }
    results.append(res)

with open(args.store_path, 'w') as fp:
    for result in results:
        s = json.dumps(result)
        fp.write(f"{s}\n")

print(f"Cost time: {datetime.now()-st_time}")