Example #1
def read_examples_from_file(self, data_dir,
                            mode: Union[Split, str]) -> List[InputExample]:
    """Read CoNLL-style examples from `{mode}.txt`: one token per line with
    whitespace-separated columns, blank lines (or -DOCSTART-) between sentences."""
    if isinstance(mode, Split):
        mode = mode.value
    file_path = os.path.join(data_dir, f"{mode}.txt")
    guid_index = 1
    examples = []
    with open(file_path, encoding="utf-8") as f:
        words = []
        labels = []
        for line in f:
            if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                if words:
                    examples.append(
                        InputExample(guid=f"{mode}-{guid_index}",
                                     words=words,
                                     labels=labels))
                    guid_index += 1
                    words = []
                    labels = []
            else:
                splits = line.split(" ")
                words.append(splits[0])
                if len(splits) > 1:
                    labels.append(splits[self.label_idx].replace("\n", ""))
                else:
                    # Examples could have no label for mode = "test"
                    labels.append("O")
        if words:
            examples.append(
                InputExample(guid=f"{mode}-{guid_index}",
                             words=words,
                             labels=labels))
    return examples
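For context, this reader expects one token per line with whitespace-separated columns and a blank line (or a -DOCSTART- marker) between sentences. A minimal sketch of driving it, assuming the function above is available at module level and wrapped in a tiny task class with label_idx = -1 (label in the last column); the class and file names here are illustrative, not from the original source:

import os

class DemoTask:
    label_idx = -1  # read the label from the last whitespace-separated column
    read_examples_from_file = read_examples_from_file  # the reader shown above

os.makedirs("data", exist_ok=True)
with open(os.path.join("data", "dev.txt"), "w", encoding="utf-8") as f:
    f.write("John B-PER\nlives O\nin O\nBerlin B-LOC\n\n")

examples = DemoTask().read_examples_from_file("data", "dev")
print(examples[0].words)   # ['John', 'lives', 'in', 'Berlin']
print(examples[0].labels)  # ['B-PER', 'O', 'O', 'B-LOC']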
Example #2
def read_examples_from_line(line):
    """Wrap one pre-tokenized line in a single InputExample for prediction,
    labelling every token "O" as a placeholder."""
    guid_index = 1
    examples = []
    words = []
    labels = []
    for word in line:
        words.append(word)
        labels.append("O")

    if words:
        examples.append(
            InputExample(guid="{}-{}".format('predict', guid_index),
                         words=words,
                         labels=labels))

    return examples
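A hypothetical call (the function iterates line word by word, so it expects a list of tokens; a raw string would be iterated character by character):

examples = read_examples_from_line(["John", "lives", "in", "Berlin"])
print(examples[0].guid)    # 'predict-1'
print(examples[0].labels)  # ['O', 'O', 'O', 'O']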
Example #3
    def read_examples_from_file(self, data_dir, mode: Union[Split, str]) -> List[InputExample]:
        if isinstance(mode, Split):
            mode = mode.value
        file_path = os.path.join(data_dir, f"{mode}.txt")
        guid_index = 1
        examples = []

        with open(file_path, encoding="utf-8") as f:
            for sentence in parse_incr(f):
                words = []
                labels = []
                for token in sentence:
                    words.append(token["form"])
                    labels.append(token["upos"])
                assert len(words) == len(labels)
                if words:
                    examples.append(InputExample(guid=f"{mode}-{guid_index}", words=words, labels=labels))
                    guid_index += 1
        return examples
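This variant reads CoNLL-U files rather than the column format of Example #1; parse_incr is the streaming parser from the conllu package, so the snippet presumes imports along these lines:

import os
from typing import List, Union

from conllu import parse_incr  # yields one parsed sentence (a TokenList) at a time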
Example #4
    def set_data(self, tok_sents: List[List[str]]):
        """Expects a document given as a list of sentences
        where each sentence is tokenized already."""
        examples = []
        for guid, sent in enumerate(tok_sents):
            words = [x + "\n" for x in sent]  # note: a trailing newline is appended to every token
            labels = ["O" for _ in range(len(sent))]
            examples.append(InputExample(guid=f"pred-{guid}", words=words, labels=labels))

        data = NerDataset(
            tokenizer=self.tokenizer,
            examples=examples,
            labels=["B", "O"],
            model_type="BertForTokenClassification",
            max_seq_length=256,
            mode=Split.pred
        )

        self.data = data
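A hypothetical call, assuming the surrounding object (here called pipeline, an illustrative name) already has its tokenizer set; each inner list is one pre-tokenized sentence of the document:

pipeline.set_data([
    ["John", "lives", "in", "Berlin"],
    ["He", "works", "remotely", "."],
])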
Example #5
def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""):
    # features are built per-document below instead of via the cached loader:
    # eval_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode=mode)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    
    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Eval!
    logger.info("***** Running evaluation %s *****", prefix)
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    model.eval()

    # leftover from the cached-loading helper this was copied from: `evaluate`
    # names the enclosing function here, so `not evaluate` is always False and
    # the barrier never fires
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()

    # Build examples directly from the dataset file (no caching here)
    logger.info("Creating features from dataset file at %s", args.data_dir)
    examples = read_examples_from_file(args.data_dir, mode)
    print(len(examples))
    print(examples[0])  # list of words in one document (sentence)

    # per-document gold and predicted label sequences, accumulated across documents
    out_label_listX = []
    preds_listX = []

    for example in tqdm(examples, desc="Evaluating"):
        max_length = 500
        min_context = 128
        # subword count for each word (len(tokenizer.tokenize(word))), used to
        # size the sliding windows below
        word_tokens_lengths = [len(tokenizer.tokenize(word)) for word in example.words]

        ws = windows(word_tokens_lengths, max_length, min_context)
        print(ws)
        
        text_examples=[]
        for start_all, start_content, end_content, end_all in ws:
            ex=InputExample(guid=example.guid, words=example.words[start_all:end_all], labels=example.labels[start_all:end_all])
            text_examples.append(ex)

        # the example has to be split into windows before this step
        features = convert_examples_to_features(
            text_examples,
            labels,
            512,  # hard-coded instead of args.max_seq_length; windows above hold at most ~500 subwords
            tokenizer,
            cls_token_at_end=bool(args.model_type in ["xlnet"]),
            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=bool(args.model_type in ["roberta"]),
            # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=bool(args.model_type in ["xlnet"]),
            # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
            pad_token_label_id=pad_token_label_id,
        )
    
        # leftover barrier from the copied loader; as above, `not evaluate` is
        # always False, so this never fires
        if args.local_rank == 0 and not evaluate:
            torch.distributed.barrier()
    
        # Convert to Tensors and build dataset
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
    
        eval_dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        logger.info("  Num examples = %d", len(eval_dataset))
        
        # gold and predicted label sequences for this document, stitched from windows
        gold_doc = []
        pred_doc = []
        for batch, (start_all, start_content, end_content, end_all) in tqdm(zip(eval_dataloader, ws), desc="Evaluating"):
            # NOTE: this pairing of batches with window spans assumes one window per batch
            batch = tuple(t.to(args.device) for t in batch)
    
            with torch.no_grad():
                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
                if args.model_type != "distilbert":
                    inputs["token_type_ids"] = (
                        batch[2] if args.model_type in ["bert", "xlnet"] else None
                    )  # XLM and RoBERTa don't use segment_ids
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
    
                if args.n_gpu > 1:
                    tmp_eval_loss = tmp_eval_loss.mean()  # mean() to average on multi-gpu parallel evaluating
    
                eval_loss += tmp_eval_loss.item()
            nb_eval_steps += 1
            # one window per batch (given the zip above), so collect directly
            preds = logits.detach().cpu().numpy()
            out_label_ids = inputs["labels"].detach().cpu().numpy()

            preds = np.argmax(preds, axis=2)
            label_map = {i: label for i, label in enumerate(labels)}
        
            out_label_list = [[] for _ in range(out_label_ids.shape[0])]
            preds_list = [[] for _ in range(out_label_ids.shape[0])]
        
            for i in range(out_label_ids.shape[0]):
                for j in range(out_label_ids.shape[1]):
                    if out_label_ids[i, j] != pad_token_label_id:
                        out_label_list[i].append(label_map[out_label_ids[i][j]])
                        preds_list[i].append(label_map[preds[i][j]])
    
            # stitch windows together: keep only each window's content region
            for i in range(len(out_label_list)):
                gold_doc.extend(out_label_list[i][start_content - start_all:end_content - start_all])
                pred_doc.extend(preds_list[i][start_content - start_all:end_content - start_all])
    
        out_label_listX.append(gold_doc)
        preds_listX.append(pred_doc)

    eval_loss = eval_loss / nb_eval_steps

    try:
        results = {
            "loss": eval_loss,
            "precision": precision_score(out_label_listX, preds_listX),
            "recall": recall_score(out_label_listX, preds_listX),
            "f1": f1_score(out_label_listX, preds_listX),
        }
        
        logger.info("***** Eval results %s *****", prefix)
        for key in sorted(results.keys()):
            logger.info("  %s = %s", key, str(results[key]))
        
        return results, preds_listX
    except IndexError:  # no gold labels in the file (e.g. an unlabeled test set)
        return {}, preds_listX
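Example #5 depends on a windows helper that is not shown. From its call sites, it takes per-word subword counts plus max_length and min_context and returns (start_all, start_content, end_content, end_all) word-index spans: the slice start_all:end_all is fed to the model, while predictions are kept only for start_content:end_content, so consecutive content regions tile the document exactly once with overlapping context at the window edges. A minimal sketch under those assumptions (not the original implementation):

def windows(token_lengths, max_length, min_context):
    """Split a document into overlapping windows of at most max_length subword
    tokens, keeping roughly min_context subwords of context on each side.
    Returns a list of (start_all, start_content, end_content, end_all) spans."""
    n = len(token_lengths)
    spans = []
    start_content = 0
    while start_content < n:
        # walk back until about min_context subwords of left context are included
        start_all, ctx = start_content, 0
        while start_all > 0 and ctx < min_context:
            start_all -= 1
            ctx += token_lengths[start_all]
        # fill the remaining subword budget with words to the right
        budget = max_length - sum(token_lengths[start_all:start_content])
        end_all = start_content
        while end_all < n and token_lengths[end_all] <= budget:
            budget -= token_lengths[end_all]
            end_all += 1
        end_all = max(end_all, start_content + 1)  # always advance, even past an over-long word
        # reserve roughly min_context subwords at the right edge as context only,
        # unless this is the last window (nothing follows to re-predict the tail)
        end_content = end_all
        if end_all < n:
            ctx = 0
            while end_content > start_content + 1 and ctx < min_context:
                end_content -= 1
                ctx += token_lengths[end_content]
        spans.append((start_all, start_content, end_content, end_all))
        start_content = end_content
    return spans

Since the evaluation loop pairs these spans with the dataloader, max_length is kept at 500 so a window plus special tokens fits under the hard-coded max_seq_length of 512.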