Example #1
    def predict(self, text_list, return_text=True):
        result = []
        test_examples = self.processor.get_ifrn_examples(text_list)

        test_features = convert_examples_to_features(test_examples,
                                                     self.label_list,
                                                     self.args.max_seq_length,
                                                     self.tokenizer,
                                                     show_exp=False)
        all_input_ids = torch.tensor([f.input_ids for f in test_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in test_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in test_features],
                                       dtype=torch.long)
        test_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids)
        # Run prediction for the full data
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data,
                                     sampler=test_sampler,
                                     batch_size=self.args.eval_batch_size)

        # NOTE: indexing test_examples by the batch index and calling
        # pred.item() below both assume eval_batch_size == 1.
        for idx, (input_ids, input_mask,
                  segment_ids) in enumerate(test_dataloader):
            item = {}
            input_ids = input_ids.to(self.device)
            input_mask = input_mask.to(self.device)
            segment_ids = segment_ids.to(self.device)
            text = test_examples[idx].text_a
            with torch.no_grad():
                logits = self.model(input_ids, segment_ids, input_mask)
                # Turn raw logits into class probabilities
                probs = F.softmax(logits, dim=1)
                pred = probs.max(1)[1]
                scores = probs.detach().cpu().numpy()[0].tolist()
                if return_text:
                    item['text'] = text
                item['label'] = pred.item()
                item['scores'] = {0: scores[0], 1: scores[1]}
                result.append(item)
        return result
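
A minimal usage sketch (not part of the original snippet): the owning class and its constructor are not shown above, so `clf` below is a hypothetical instance, and `args.eval_batch_size` must be 1 for the per-example indexing noted in the code to line up.

# Hypothetical usage; `clf` stands in for an instance of the class
# that defines predict() -- its constructor is not shown in the snippet.
texts = [
    "the movie was surprisingly good",
    "a complete waste of two hours",
]
for item in clf.predict(texts, return_text=True):
    # Each item holds the input text, the argmax label, and the
    # softmax probability of each of the two classes.
    print(item['text'], item['label'], item['scores'])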
Example #2
def get_matrix_of_similarities(arguments=None):
    tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=True)
    # Avoid a mutable default argument; fall back to the module-level list
    if not arguments:
        arguments = arguments_init[:]
    input_examples, output_examples, index_mapping = create_examples(arguments)
    eval_features = convert_examples_to_features(input_examples,
                                                 max_seq_length, tokenizer)

    all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                   dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=eval_batch_size)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = SigmoidBERT.from_pretrained(model_path)
    model.to(device)
    model.eval()

    predicted_logits = []
    with torch.no_grad():
        for input_ids, input_mask, segment_ids in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)

            logits = model(input_ids, segment_ids, input_mask)
            logits = logits.detach().cpu().numpy()
            predicted_logits.extend(logits[:, 0])

    distance_matrix = np.zeros(shape=(len(arguments), len(arguments)))

    # Attach the predicted similarity to each [text_a, text_b] pair
    for idx in range(len(predicted_logits)):
        output_examples[idx].append(predicted_logits[idx])

    # Each unordered pair was scored once; index_mapping records which row of
    # predicted_logits holds the score, so look the pair up in either order.
    for i in range(len(arguments)):
        for j in range(len(arguments)):
            if i == j:
                distance_matrix[i, j] = 0.0
            elif (i, j) in index_mapping:
                distance_matrix[i, j] = 1.0 - predicted_logits[index_mapping[(i, j)]]
            else:
                distance_matrix[i, j] = 1.0 - predicted_logits[index_mapping[(j, i)]]

    # Sort pairs by similarity, most similar first; each entry of
    # output_examples is [text_a, text_b, similarity] for inspection.
    output_examples = sorted(output_examples, key=lambda x: x[2], reverse=True)

    return distance_matrix
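
A sketch of how the returned matrix can be consumed (an assumption, not shown in the original): because the same score is used for (i, j) and (j, i) and the diagonal is zero, the matrix is symmetric and can be condensed with SciPy and fed to hierarchical clustering. The distance threshold below is illustrative.

from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import squareform

# The matrix is symmetric with a zero diagonal, so squareform() can
# condense it into the form scipy's linkage() expects.
dist = get_matrix_of_similarities()
condensed = squareform(dist)
Z = linkage(condensed, method='average')
# Cut the dendrogram at an illustrative distance threshold of 0.5
labels = fcluster(Z, t=0.5, criterion='distance')
print(labels)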
Example #3
    'Overwhelming scientific consensus says human activity is primarily responsible for global climate change.',
    'Rising levels of human-produced gases released into the atmosphere create a greenhouse effect that traps heat and causes global warming.'
]

# Compare every argument with every other argument
# (the pair enumeration order is sketched after this snippet)
input_examples = []
output_examples = []

for i in range(0, len(arguments) - 1):
    for j in range(i + 1, len(arguments)):
        input_examples.append(
            InputExample(text_a=arguments[i], text_b=arguments[j], label=-1))
        output_examples.append([arguments[i], arguments[j]])

tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=True)
eval_features = convert_examples_to_features(input_examples, max_seq_length,
                                             tokenizer)

all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                             dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                              dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                               dtype=torch.long)

eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data,
                             sampler=eval_sampler,
                             batch_size=eval_batch_size)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
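
As a worked check of the pair enumeration above (a sketch, not part of the original snippet): the double loop visits each unordered pair (i, j) with i < j exactly once, in row-major order, so a pair's score sits at a predictable offset in input_examples.

# For n arguments, the pair (i, j) with i < j is appended at this
# offset in input_examples (and hence in the predicted scores).
def pair_index(i, j, n):
    assert 0 <= i < j < n
    return i * n - i * (i + 1) // 2 + (j - i - 1)

# With n = 4 the loop order is (0,1) (0,2) (0,3) (1,2) (1,3) (2,3):
assert pair_index(0, 1, n=4) == 0
assert pair_index(1, 2, n=4) == 3
assert pair_index(2, 3, n=4) == 5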