def create_and_check_for_token_classification(
    self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
    """Run ``AlbertForTokenClassification`` once and verify the shape of its logits.

    The model is built from ``config`` (with ``num_labels`` overridden by
    ``self.num_labels``), moved to ``torch_device`` and switched to eval mode
    before the forward pass. ``sequence_labels`` and ``choice_labels`` are
    accepted but not used by this check.

    The assertion expects ``result.logits.shape`` to equal
    ``(self.batch_size, self.seq_length, self.num_labels)``.
    """
    config.num_labels = self.num_labels

    classifier = AlbertForTokenClassification(config=config)
    classifier.to(torch_device)
    classifier.eval()

    forward_kwargs = {
        "attention_mask": input_mask,
        "token_type_ids": token_type_ids,
        "labels": token_labels,
    }
    outputs = classifier(input_ids, **forward_kwargs)

    actual_shape = outputs.logits.shape
    expected_shape = (self.batch_size, self.seq_length, self.num_labels)
    self.parent.assertEqual(actual_shape, expected_shape)
def create_and_check_albert_for_token_classification(
    self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
    """Run ``AlbertForTokenClassification`` once and verify logits size and loss output.

    The model is built from ``config`` (with ``num_labels`` overridden by
    ``self.num_labels``), moved to ``torch_device`` and switched to eval mode
    before the forward pass. ``sequence_labels`` and ``choice_labels`` are
    accepted but not used by this check.

    Two checks are performed on the forward-pass result: the size of
    ``result["logits"]`` must be
    ``[self.batch_size, self.seq_length, self.num_labels]``, and the result is
    handed to ``self.check_loss_output``.
    """
    config.num_labels = self.num_labels

    classifier = AlbertForTokenClassification(config=config)
    classifier.to(torch_device)
    classifier.eval()

    forward_kwargs = {
        "attention_mask": input_mask,
        "token_type_ids": token_type_ids,
        "labels": token_labels,
    }
    outputs = classifier(input_ids, **forward_kwargs)

    actual_size = list(outputs["logits"].size())
    expected_size = [self.batch_size, self.seq_length, self.num_labels]
    self.parent.assertListEqual(actual_size, expected_size)
    self.check_loss_output(outputs)
class AlbertNER:
    """Named-entity extractor backed by an ALBERT token-classification model.

    TODO: Update model and checkpoints to work with last versions of transformers
    """

    def __init__(self, path: str, device: str = 'cpu'):
        """Load the tokenizer, configuration and weights stored in ``path``.

        :param path: Directory containing the model files (config, tokenizer, weights).
        :param device: ``"cuda"`` moves the model to the GPU; any other value
            leaves the model where the weights were loaded.
        :raises NotADirectoryError: If ``path`` is not an existing directory.
        :raises FileNotFoundError: If the config file or the weights file is
            missing from ``path``.
        """
        # isdir (rather than exists) so that a path pointing at a regular file
        # is rejected here with the descriptive message below.
        if not os.path.isdir(path):
            raise NotADirectoryError(
                f"{os.path.abspath(path)} must be a directory containing the model files: config, tokenizer, weights.")

        files = os.listdir(path)
        if CONFIG_JSON_FILE not in files:
            raise FileNotFoundError(f"{CONFIG_JSON_FILE} must be in {path}.")
        if WEIGHTS_FILE not in files:
            raise FileNotFoundError(f"{WEIGHTS_FILE} must be in {path}.")

        with open(os.path.join(path, CONFIG_JSON_FILE), "r", encoding="utf-8") as f:
            config = json.load(f)

        self.tokenizer = AutoTokenizer.from_pretrained(path)

        # NOTE(security): torch.load unpickles the file, so only load
        # checkpoints from a trusted source.
        # The map_location callable returns the storage unchanged instead of
        # moving it to the device recorded in the checkpoint.
        weights = torch.load(os.path.join(path, WEIGHTS_FILE),
                             map_location=lambda storage, loc: storage)

        # Load pretrained model/tokenizer
        config = AlbertConfig.from_dict(config)
        self.model = AlbertForTokenClassification(config)
        self.model.load_state_dict(weights)
        self.model = self.model.eval()

        # Per-instance copy: the settings are modified below and in extract(),
        # and those changes must not leak into the module-level defaults
        # shared by every other instance.
        self.args = dict(albert_args_ner)

        if device == "cuda":
            logger.debug("Setting model with CUDA")
            self.args['device'] = 'cuda'
            self.model.to('cuda')

    def extract(self, text: str, **kwargs: dict) -> List[Tuple[str, str]]:
        """Extract named entities from text.

        Keyword arguments whose name matches an existing entry of ``self.args``
        overwrite that entry (the new value is kept for later calls). All
        keyword arguments are also forwarded to the model call.
        NOTE(review): a keyword consumed by ``self.args`` is still passed to
        the model - confirm that this is intended.

        :param text: Text to extract entities from
        :return: List of ``(token, label)`` pairs, one per token paired with a
            prediction
        """
        for key, value in kwargs.items():
            if key in self.args:
                self.args[key] = value

        # Encode once and reuse the ids for both the model input and the
        # token strings.
        encoded = self.tokenizer.encode(text, return_tensors="pt")

        # Tokens are rebuilt from the decoded ids rather than from `text`
        # directly - presumably so that any tokens added by encode() are part
        # of the list and line up with the predictions (TODO confirm).
        tokens = self.tokenizer.tokenize(self.tokenizer.decode(encoded[0].tolist()))

        # The inputs must live on the same device as the model weights,
        # otherwise the forward pass fails once the model has been moved to CUDA.
        model_device = next(self.model.parameters()).device
        inputs = encoded.to(model_device)

        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            outputs = self.model(inputs, **kwargs)[0]

        predictions = torch.argmax(outputs, dim=2)
        return [(token, label_list[prediction])
                for token, prediction in zip(tokens, predictions[0].tolist())]