def __init__(self,
                 squad_model_config: str,
                 vocab_file: str,
                 do_lower_case: bool,
                 max_seq_length: int = 512,
                 batch_size: int = 10,
                 lang: str = 'en',
                 **kwargs) -> None:
        with open(squad_model_config) as f:
            config = json.load(f)
        config['chainer']['pipe'][0]['max_seq_length'] = max_seq_length
        self.model = build_model(config)
        self.max_seq_length = max_seq_length

        if Path(vocab_file).is_file():
            vocab_file = str(expand_path(vocab_file))
            # AutoTokenizer cannot be instantiated directly; a concrete class
            # (BertTokenizer, assuming a WordPiece vocab) loads a bare vocab file.
            self.tokenizer = BertTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(
                vocab_file, do_lower_case=do_lower_case)

        self.batch_size = batch_size

        if lang == 'en':
            from nltk import sent_tokenize
            self.sent_tokenizer = sent_tokenize
        elif lang == 'ru':
            from ru_sent_tokenize import ru_sent_tokenize
            self.sent_tokenizer = ru_sent_tokenize
        else:
            raise RuntimeError("only 'en' and 'ru' languages are supported")
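
A minimal, self-contained sketch of the sentence splitting picked above for lang == 'en'; the chunking helper and its name are illustrative, not part of the original class:

from nltk import sent_tokenize  # needs a one-time nltk.download('punkt')

def split_into_batches(text: str, batch_size: int = 10):
    """Split raw text into sentences and yield them in batch_size groups."""
    sentences = sent_tokenize(text)
    for i in range(0, len(sentences), batch_size):
        yield sentences[i:i + batch_size]

# e.g. list(split_into_batches("First sentence. Second one. Third.", batch_size=2))
# -> [['First sentence.', 'Second one.'], ['Third.']]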
Example #2
def predict(self, sentence: str) -> str:
    # Tokenize with the instance's tokenizer (AutoTokenizer is a factory class,
    # not a callable tokenizer); self.tokenizer is assumed to be set in __init__.
    tokenized_input = self.tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        output = self.model(tokenized_input["input_ids"],
                            tokenized_input["attention_mask"])
    logits = output.logits
    pred_idx = torch.argmax(logits, dim=-1).item()
    return self.idx_2_label[pred_idx]
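
The pieces predict() relies on can be sketched end to end; the checkpoint name and the idx_2_label mapping below are placeholders, not values from the original class:

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2)
idx_2_label = {0: "negative", 1: "positive"}

enc = tokenizer("This library is great", return_tensors="pt")
with torch.no_grad():
    logits = model(enc["input_ids"], attention_mask=enc["attention_mask"]).logits
print(idx_2_label[logits.argmax(dim=-1).item()])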
Example #3
def main(test_dir_pattern):
    test_files = glob.glob(test_dir_pattern)
    tokenizer = AutoTokenizer.from_pretrained(config.CHECKPOINT)
    ner_model = NERModel()
    ner_model.load(config.MODEL_CHECKPOINT)

    bert = ner_model.bert.eval().cuda()
    last_layer = ner_model.last_layer.eval().cuda()

    mapping = {}
    submission_mapping = defaultdict(list)
    doc_index = 0
    n_files = len(test_files)

    # Process the test files in chunks of 50 documents at a time.
    for i in range(0, n_files, 50):
        files_to_process = test_files[i:i + 50]
        test_examples = []
        for file in files_to_process:
            doc = Document.read_from_path(file)
            sentences = doc.get_all_sentences()
            mapping[doc_index] = doc.document_id
            for sentence in sentences:
                te = TestExample(doc_index, sentence)
                test_examples.append(te)
            doc_index += 1

        dataset = NERTestDataset(test_examples, tokenizer)
        data_loader = DataLoader(dataset, batch_size=config.TEST_BATCH_SIZE)

        for batch in data_loader:
            o = bert(
                input_ids=batch['input_ids'].to('cuda'),
                attention_mask=batch['attention_mask'].to('cuda'),
                token_type_ids=batch['token_type_ids'].to('cuda'),
            )
            logits = last_layer(o['last_hidden_state']).detach()
            # Take per-token confidences from the logits *before* the argmax;
            # the original order would have passed the largest label index
            # instead of a confidence. Softmax scores are assumed to be what
            # LabelEncoding.extract_candidates expects.
            probabilities = logits.softmax(dim=-1).max(dim=-1).values
            predictions = logits.argmax(dim=-1)
            for j, sentence_prediction in enumerate(predictions):
                # Index of the first padded position in this sentence.
                last_valid = batch['attention_mask'][j].argmin().item()
                candidates = LabelEncoding.extract_candidates(
                    sentence_prediction,
                    probabilities[j],
                    last_valid,
                )
                if len(candidates) == 0:
                    continue

                id_ = batch['id'][j].item()
                ids = batch['input_ids'][j]
                for start_index, end_index in candidates:
                    submission_mapping[mapping[id_]].append(
                        tokenizer.decode(ids[start_index:end_index]))

    return submission_mapping
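
LabelEncoding.extract_candidates is project-specific and not shown above; assuming BIO-style label ids with 0 as the "outside" tag and a per-token confidence threshold, a candidate extractor could look roughly like this sketch (names and the threshold are illustrative):

import torch

O_LABEL = 0            # assumed id of the "O" / outside tag
MIN_CONFIDENCE = 0.5   # illustrative threshold

def extract_candidates(predictions: torch.Tensor,
                       probabilities: torch.Tensor,
                       last_valid: int):
    """Collect (start, end) token spans of contiguous confident non-O predictions."""
    candidates, start = [], None
    for idx in range(last_valid):
        is_entity = (predictions[idx].item() != O_LABEL
                     and probabilities[idx].item() >= MIN_CONFIDENCE)
        if is_entity and start is None:
            start = idx                      # open a new span
        elif not is_entity and start is not None:
            candidates.append((start, idx))  # close the current span
            start = None
    if start is not None:                    # span running up to the last real token
        candidates.append((start, last_valid))
    return candidates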
Example #4
 def __init__(self,
              vocab_file: str,
              do_lower_case: bool = True,
              max_seq_length: int = 512,
              return_tokens: bool = False,
              **kwargs) -> None:
     self.max_seq_length = max_seq_length
     self.return_tokens = return_tokens
     if Path(vocab_file).is_file():
         vocab_file = str(expand_path(vocab_file))
         # AutoTokenizer cannot be instantiated directly; a concrete class
         # (BertTokenizer, assuming a WordPiece vocab) loads a bare vocab file.
         self.tokenizer = BertTokenizer(vocab_file=vocab_file,
                                        do_lower_case=do_lower_case)
     else:
         self.tokenizer = AutoTokenizer.from_pretrained(vocab_file, do_lower_case=do_lower_case)
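
A hedged sketch of how a preprocessor like the one above typically turns raw text into model features; the checkpoint name and the standalone preprocess() function are illustrative, not the class's actual __call__:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

def preprocess(texts, max_seq_length=512, return_tokens=False):
    """Tokenize a batch of texts, truncated and padded to max_seq_length."""
    features = tokenizer(texts,
                         max_length=max_seq_length,
                         truncation=True,
                         padding="max_length",
                         return_tensors="pt")
    if return_tokens:
        tokens = [tokenizer.convert_ids_to_tokens(ids)
                  for ids in features["input_ids"].tolist()]
        return features, tokens
    return features

# preprocess(["How does BERT tokenize text?"])["input_ids"].shape -> (1, 512)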
Example #5
 def __init__(self,
              vocab_file: str,
              do_lower_case: bool = False,
              max_seq_length: int = 512,
              max_subword_length: int = None,
              token_masking_prob: float = 0.0,
              provide_subword_tags: bool = False,
              subword_mask_mode: str = "first",
              **kwargs):
     self._re_tokenizer = re.compile(r"[\w']+|[^\w ]")
     self.provide_subword_tags = provide_subword_tags
     self.mode = kwargs.get('mode')
     self.max_seq_length = max_seq_length
     self.max_subword_length = max_subword_length
     self.subword_mask_mode = subword_mask_mode
     if Path(vocab_file).is_file():
         vocab_file = str(expand_path(vocab_file))
         # AutoTokenizer cannot be instantiated directly; a concrete class
         # (BertTokenizer, assuming a WordPiece vocab) loads a bare vocab file.
         self.tokenizer = BertTokenizer(vocab_file=vocab_file,
                                        do_lower_case=do_lower_case)
     else:
         self.tokenizer = AutoTokenizer.from_pretrained(vocab_file, do_lower_case=do_lower_case)
     self.token_masking_prob = token_masking_prob
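
The subword_mask_mode option above typically decides which subword piece of each word keeps the word-level tag; a hypothetical illustration follows (not the class's actual code, and the tokenizer checkpoint is a placeholder):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def subword_mask(words, mode="first"):
    """1 marks the subword that carries the word-level tag, 0 marks the rest."""
    mask = []
    for word in words:
        pieces = tokenizer.tokenize(word)
        if not pieces:
            continue
        flags = [0] * len(pieces)
        flags[0 if mode == "first" else -1] = 1
        mask.extend(flags)
    return mask

words = ["Washington", "unaffordable"]
# Exact values depend on how each word is split into WordPiece subwords.
print(tokenizer.tokenize(" ".join(words)))
print(subword_mask(words, mode="first"))
print(subword_mask(words, mode="last"))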