예제 #1
0
 def __call__(self, samples):
     '''
     Run the BART conditional generator over the samples, one chunk at a time.

     The samples are first numericalised with self.BART_numericalise_transform,
     then generated in batches of self.chunk_size; the generated token ids are
     stored on each sample under "pred_ids" before the samples are passed
     through self.BART_denumericalise_transform.

     samples: [dict]: [{'input_text':'text to condition on'}]
     returns: [dict]: [{'input_text':'text to condition on', 'pred_text':"text from BARTs decoder"}]
     '''
     samples = self.BART_numericalise_transform(samples)
     # Always iterate over chunks; the progress bar is only a wrapper around
     # them. Iterating the raw sample list here would hand a single dict to
     # the batching code below.
     batches = list(chunks(samples, self.chunk_size))
     if self.show_tqdm:
         batches = tqdm(batches, desc="BART is thinking:")
     for chunk in batches:
         # pad_sequence stacks as (max_len, batch); transpose to (batch, max_len).
         input_tensor = torch.nn.utils.rnn.pad_sequence(
             [
                 torch.tensor(sample_obj["input_ids"], dtype=torch.long)
                 for sample_obj in chunk
             ],
             padding_value=self.PAD).T.to(self.device)
         # 1.0 for real tokens, 0.0 for padding positions.
         attention_mask = (input_tensor != self.PAD).type(torch.float)
         output_ids = self.BART_conditional_generator.generate(
             input_tensor,
             attention_mask=attention_mask,
             pad_token_id=self.PAD,
             num_beams=4,
             max_length=512,
             early_stopping=False)
         # Convert once to plain Python lists so no tensor view outlives the
         # explicit clean-up below.
         for sample_obj, pred_ids in zip(chunk, output_ids.tolist()):
             sample_obj["pred_ids"] = pred_ids
         # Drop the batch tensors before the next batch is allocated.
         del input_tensor
         del attention_mask
         del output_ids
     samples = self.BART_denumericalise_transform(samples)
     return samples
예제 #2
0
 def __call__(self, samples):
     '''
     Score every sample with self.BERT_Reranker and store it under "score".

     Samples are processed in batches of self.batch_size without gradient
     tracking. The score is column 1 of the first element of the model
     output (presumably the "relevant" class logit - confirm against the
     model head). The sample dicts are updated in place.

     samples: [dict]: [{'input_ids':[34,2,8...], 'type_ids':[0,0,1,1]}]
     returns: [dict]: [{'input_ids':[34,2,8...], 'type_ids':[0,0,1,1], "score":0.56}]
     '''
     for sample_obj_batch in chunks(samples, self.batch_size):
         with torch.no_grad():
             # pad_sequence stacks as (max_len, batch); transpose to
             # (batch, max_len) and move the whole batch to the device once.
             input_tensor = torch.nn.utils.rnn.pad_sequence(
                 [
                     torch.tensor(sample_obj["input_ids"], dtype=torch.long)
                     for sample_obj in sample_obj_batch
                 ],
                 padding_value=self.PAD).T.to(self.device)
             # NOTE(review): type ids are padded with self.PAD as well;
             # padded positions are masked out by attention_mask below.
             type_ids = torch.nn.utils.rnn.pad_sequence(
                 [
                     torch.tensor(sample_obj["type_ids"], dtype=torch.long)
                     for sample_obj in sample_obj_batch
                 ],
                 padding_value=self.PAD).T.to(self.device)
             # 1.0 for real tokens, 0.0 for padding positions.
             attention_mask = (input_tensor != self.PAD).type(torch.float)
             scores = self.BERT_Reranker(
                 input_tensor,
                 attention_mask=attention_mask,
                 token_type_ids=type_ids)[0][:, 1].tolist()
         for sample_obj, score in zip(sample_obj_batch, scores):
             sample_obj["score"] = score
     return samples
예제 #3
0
 def __call__(self, samples):
     '''
     Score every sample with self.duoBERT_Reranker and store it under "score".

     The score given corresponds to the likelihood that A is more relevant
     than B, so a higher score is favourable for A. It is read from column 1
     of the first element of the model output. Samples are processed in
     batches of self.batch_size without gradient tracking, and the sample
     dicts are updated in place.

     samples: [dict]: [{'input_ids':[34,2,8...], 'type_ids':[0,0,1,1], ...}]
     returns: [dict]: [{'input_ids':[34,2,8...], 'type_ids':[0,0,1,1], 'score':0.95, ...}]
     '''
     for sample_obj_batch in chunks(samples, self.batch_size):
         with torch.no_grad():
             # pad_sequence stacks as (max_len, batch); transpose to
             # (batch, max_len) and move the whole batch to the device once.
             input_tensor = torch.nn.utils.rnn.pad_sequence(
                 [
                     torch.tensor(sample_obj["input_ids"], dtype=torch.long)
                     for sample_obj in sample_obj_batch
                 ],
                 padding_value=self.PAD).T.to(self.device)
             # NOTE(review): type ids are padded with self.PAD as well;
             # padded positions are masked out by attention_mask below.
             type_ids = torch.nn.utils.rnn.pad_sequence(
                 [
                     torch.tensor(sample_obj["type_ids"], dtype=torch.long)
                     for sample_obj in sample_obj_batch
                 ],
                 padding_value=self.PAD).T.to(self.device)
             # 1.0 for real tokens, 0.0 for padding positions.
             attention_mask = (input_tensor != self.PAD).type(torch.float)
             scores = self.duoBERT_Reranker(
                 input_tensor,
                 attention_mask=attention_mask,
                 token_type_ids=type_ids)[0][:, 1].tolist()
         for sample_obj, score in zip(sample_obj_batch, scores):
             sample_obj["score"] = score
     return samples
예제 #4
0
    def __init__(self,
                 samples,
                 slow_pipe,
                 real_time_pipe,
                 valid_sample_fn=None,
                 sort_key_fn=None,
                 batch_bucket_size=1,
                 shuffle=False,
                 **kwargs):
        """Pre-process the samples once and optionally bucket them by a sort key.

        Args:
            samples: Initial collection of samples; it is passed through every
                transform in ``slow_pipe`` in order.
            slow_pipe: Iterable of callables, each applied once to the whole
                sample collection at construction time.
            real_time_pipe: Stored on ``self.real_time_pipe`` (presumably
                applied per item in ``__getitem__`` - confirm in the class).
            valid_sample_fn: Optional predicate on a processed sample. Samples
                for which it returns a value equal to ``False`` are dropped.
                It is only consulted when ``sort_key_fn`` is given.
            sort_key_fn: Optional callable mapping a processed sample (as
                returned by ``self.__getitem__``) to a sortable key. When
                given, samples are ordered by descending key and grouped into
                buckets of ``batch_bucket_size``.
            batch_bucket_size: Number of samples per bucket; must be smaller
                than the number of samples when ``sort_key_fn`` is given.
            shuffle: If true, the order of the middle buckets is shuffled. The
                first bucket (largest keys) and the last bucket (possibly
                shorter) always keep their positions.
            **kwargs: Accepted and ignored.

        Raises:
            AssertionError: If ``sort_key_fn`` is given and
                ``batch_bucket_size`` is not smaller than the sample count.
        """
        self.real_time_pipe = real_time_pipe
        self.PAD = 0

        pbar = tqdm(slow_pipe)
        for transform in pbar:
            pbar.set_description(transform.__class__.__name__)
            samples = transform(samples)
        self.samples = samples

        if sort_key_fn:
            assert batch_bucket_size < len(
                self.samples), 'Bucket size too large'
            # A set keeps the membership test below O(1) per index.
            invalid_idxs = set()
            items_keys = []
            for i in tqdm(range(len(self.samples)),
                          desc='pre-sort processing'):
                sample = self.__getitem__(i)
                # Deliberately `== False` rather than `not ...`: a predicate
                # returning None keeps the sample, as it always has.
                if valid_sample_fn and valid_sample_fn(sample) == False:
                    invalid_idxs.add(i)
                items_keys.append(sort_key_fn(sample))
            # Descending key order, as plain ints, with invalid samples removed.
            sort_idxs = [
                idx for idx in np.argsort(items_keys)[::-1].tolist()
                if idx not in invalid_idxs
            ]
            idx_chunks = list(chunks(sort_idxs, batch_bucket_size))
            # Pin the first and last buckets in place. Slicing (rather than
            # indexing [0] and [-1]) avoids emitting the same bucket twice when
            # filtering leaves a single bucket, and avoids an IndexError when
            # every sample was filtered out.
            first_chunk = idx_chunks[:1]
            even_chunks = idx_chunks[1:-1]
            last_chunk = idx_chunks[-1:] if len(idx_chunks) > 1 else []
            if shuffle:
                random.shuffle(even_chunks)
            bucketed_idxs = [
                idx
                for bucket in first_chunk + even_chunks + last_chunk
                for idx in bucket
            ]
            self.samples = [self.samples[i] for i in bucketed_idxs]

        super().__init__()