class PyramidNer(object):

    class _Model(nn.Module):

        def __init__(self, encoder, pyramid, classifier):
            super(PyramidNer._Model, self).__init__()
            self.encoder = encoder
            self.pyramid = pyramid
            self.classifier = classifier

        def forward(self, *args, **kwargs):
            x, mask = self.encoder(*args, **kwargs)
            h, h_remedy = self.pyramid(x, mask)
            return self.classifier(h, h_remedy)

    def __init__(self,
                 word_lexicon,
                 word_embeddings,
                 entities_lexicon,
                 language_model=None,
                 language_model_casing=True,
                 char_embeddings_dim=60,
                 encoder_hidden_size=100,
                 encoder_output_size=200,
                 decoder_hidden_size=100,
                 pyramid_max_depth=None,
                 inverse_pyramid=False,
                 custom_tokenizer=None,
                 decoder_dropout=0.4,
                 encoder_dropout=0.4,
                 device='cpu'):
        if isinstance(word_embeddings, str):
            word_embeddings = [word_embeddings]
        if isinstance(word_embeddings, list) and isinstance(
                word_embeddings[0], str):
            word_embeddings = FlairWordEmbeddings(word_embeddings,
                                                  lexicon=word_lexicon,
                                                  padding_idx=0)
        self._model_args = {
            'word_embeddings': word_embeddings.to(device),
            'language_model': language_model,
            'char_embeddings_dim': char_embeddings_dim,
            'encoder_hidden_size': encoder_hidden_size,
            'encoder_output_size': encoder_output_size,
            'decoder_hidden_size': decoder_hidden_size,
            'pyramid_max_depth': pyramid_max_depth,
            'batch_first': True,
            'inverse_pyramid': inverse_pyramid,
            'decoder_dropout': decoder_dropout,
            'encoder_dropout': encoder_dropout,
            'device': device
        }
        self.tokenizer = custom_tokenizer or (lambda t: t.split())
        self.label_encoder = PyramidLabelEncoder()
        self.word_vectorizer = WordVectorizer()
        self.char_vectorizer = CharVectorizer()
        self.label_encoder.fit(entities_lexicon)
        self.word_vectorizer.fit(word_lexicon)
        self.char_vectorizer.fit()
        for component in [
                self.word_vectorizer, self.char_vectorizer, self.label_encoder
        ]:
            component.set_tokenizer(self.tokenizer)
        if language_model is not None:
            self._model_args['language_model'] = TransformerWordEmbeddings(
                language_model,
                word_lexicon,
                padding_idx=0,
                device=device,
                casing=language_model_casing)
        self.nnet = self._init_nnet()

    @property
    def device(self):
        return self._model_args['device']

    def reset_weights(self):
        self.nnet = self._init_nnet()
        print('New model created!')

    def logits_to_classes(self, logits):
        return [
            torch.argmax(nn.functional.softmax(logit, dim=-1), dim=-1)
            for logit in logits
        ]

    def remedy_to_classes(self, logits):
        if logits is None:
            return None
        return torch.round(torch.sigmoid(logits))

    def classes_to_iob2(self, classes, remedy=None):
        labels = self.label_encoder.inverse_transform(classes)
        if remedy is not None:
            labels += self.label_encoder.inverse_remedy_transform(remedy)
        return labels

    def parse(self, x):
        if isinstance(x, list):
            return [self._parse_text(text) for text in x]
        return self._parse_text(x)

    def _parse_text(self, text):
        assert isinstance(text, str), f'Can not parse {text} (not a string).'
x = " ".join(self.tokenizer(text)) # tokenization device = self.device x_word, word_mask = self.word_vectorizer.pad_sequences( self.word_vectorizer.transform([DataPoint(x)])) x_char, char_mask = self.char_vectorizer.pad_sequences( self.char_vectorizer.transform([DataPoint(x)])) self.nnet.eval() with torch.no_grad(): layers, remedy = self.nnet(x_word.to(device), word_mask.to(device), x_char.to(device), char_mask.to(device)) self.nnet.train(mode=True) layers_classes = self.logits_to_classes(layers) remedy_classes = self.remedy_to_classes(remedy) labels = self.classes_to_iob2(layers_classes, remedy=remedy_classes) entities = list() tokens = x.split() for l, layer in enumerate(labels): assert len(layer) == 1 sequence = layer[0] for token, tag in enumerate(sequence): if tag == 'O': continue entity = tag[2:] # discard the IOB2 notation value = " ".join(tokens[token:token + l + 1]) stop = len(" ".join(tokens[:token + l + 1])) start = stop - len(value) entities.append(Entity(entity, value, start, stop)) return DataPoint(x, entities) def _build_char_embeddings(self, char_embeddings_dim): if not char_embeddings_dim: return None if char_embeddings_dim % 2: raise ValueError(f'Dimension of character embeddings must be ' f'an even number (got {char_embeddings_dim})') return CharEmbedding(self.char_vectorizer.X, rnn=nn.LSTM, bidirectional=True, embedding_dim=int(char_embeddings_dim / 2)) def _init_nnet(self): sentence_encoder = SentenceEncoder( self._model_args['word_embeddings'], char_embeddings=self._build_char_embeddings( self._model_args['char_embeddings_dim']), hidden_size=self._model_args['encoder_hidden_size'], output_size=self._model_args['encoder_output_size'], rnn_class=nn.LSTM, language_model=self._model_args['language_model'], dropout=self._model_args['encoder_dropout'], ) if self._model_args['inverse_pyramid']: pyramid_cls = BidirectionalPyramidDecoder else: pyramid_cls = PyramidDecoder pyramid_decoder = pyramid_cls( input_size=self._model_args['encoder_output_size'], hidden_size=self._model_args['decoder_hidden_size'], dropout=self._model_args['decoder_dropout'], max_depth=self._model_args['pyramid_max_depth']) decoder_output_size = self._model_args['decoder_hidden_size'] * 2 * ( 1 + int(self._model_args['inverse_pyramid'])) classifier = LinearDecoder(decoder_output_size, classes=len(self.label_encoder.entities)) return self._Model(sentence_encoder, pyramid_decoder, classifier).to(self.device) def save(self, path, name='pyramid_ner'): """ Saves the model weights and metadata to the specified path, so it can be loaded later using the `load` class method. 
        A commented usage sketch follows this class definition.

        :param path: existing directory in which the model folder is created.
        :param name: name of the model folder written inside `path`.
        :return: None
        """
        if not os.path.isdir(path):
            raise ValueError(f"{path} is not a directory.")
        folder = os.path.join(path, name)
        os.makedirs(folder, exist_ok=True)
        # work on a copy so the live model arguments are not overwritten
        model_metadata = dict(self._model_args)
        if isinstance(self._model_args['word_embeddings'],
                      FlairWordEmbeddings):
            model_metadata['word_embeddings'] = self._model_args[
                'word_embeddings'].word_embeddings_names
        else:
            embedding_layer = model_metadata['word_embeddings']
            model_metadata['word_embeddings'] = {
                'num_embeddings': embedding_layer.num_embeddings,
                'embedding_dim': embedding_layer.embedding_dim,
                'padding_idx': embedding_layer.padding_idx,
                'freeze': not embedding_layer.weight.requires_grad
            }
        if isinstance(self._model_args['language_model'],
                      TransformerWordEmbeddings):
            model_metadata['language_model'] = self._model_args[
                'language_model'].transformer
        # persist metadata
        with open(os.path.join(folder, 'metadata.yml'), 'w') as meta:
            yaml.safe_dump(model_metadata, meta)
        # persist token lexicon
        with open(os.path.join(folder, 'lexicon.txt'), 'w') as lex:
            for token in self.word_vectorizer.lexicon:
                lex.write(f"{token}\n")
        # persist label lexicon
        with open(os.path.join(folder, 'entities.txt'), 'w') as en:
            for entity in self.label_encoder.entities:
                if entity is not None:
                    en.write(f"{entity}\n")
        state_dict = self.nnet.state_dict()
        # persist state_dict (model weights)
        torch.save(state_dict, os.path.join(folder, 'weights.bin'))

    @classmethod
    def load(cls,
             path,
             custom_tokenizer=None,
             force_device=None,
             force_language_model=None,
             force_embeddings=None):
        if not os.path.isdir(path):
            raise ValueError(f"{path} is not a directory.")
        try:
            with open(os.path.join(path, 'metadata.yml'), 'r') as meta:
                model_metadata = yaml.safe_load(meta)
        except Exception as e:
            raise ValueError(
                f"Could not load 'metadata.yml' file at {path}: {e}")
        model_metadata['device'] = force_device or model_metadata['device']
        model_metadata[
            'language_model'] = force_language_model or model_metadata[
                'language_model']
        model_metadata['word_embeddings'] = force_embeddings or model_metadata[
            'word_embeddings']
        if isinstance(model_metadata['word_embeddings'], dict):
            # rebuild the word embeddings matrix (the weights will be loaded
            # from the state_dict)
            freeze = model_metadata['word_embeddings'].pop('freeze')
            place_holder = nn.Embedding(**model_metadata['word_embeddings'])
            place_holder.weight.requires_grad = not freeze
            model_metadata['word_embeddings'] = place_holder
        with open(os.path.join(path, 'lexicon.txt'), 'r') as lex:
            lexicon = [token for token in lex.read().split('\n') if token]
        with open(os.path.join(path, 'entities.txt'), 'r') as en:
            entities = [entity for entity in en.read().split('\n') if entity]
        kwargs = model_metadata
        # 'batch_first' is stored in the metadata but is not an __init__ kwarg
        kwargs.pop('batch_first', None)
        kwargs['word_lexicon'] = lexicon
        kwargs['entities_lexicon'] = entities
        kwargs['custom_tokenizer'] = custom_tokenizer
        obj = cls(**kwargs)
        state_dict = torch.load(os.path.join(path, 'weights.bin'),
                                map_location=obj.device)
        obj.nnet.load_state_dict(state_dict)
        return obj
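
# Hypothetical usage sketch (kept as a comment so nothing runs on import): the
# lexicon, embedding names, entity labels and paths below are illustrative
# placeholders only, and predictions are meaningful only after training.
#
#     ner = PyramidNer(word_lexicon=['John', 'Smith', 'works', 'at', 'Acme'],
#                      word_embeddings=['glove'],
#                      entities_lexicon=['PER', 'ORG'],
#                      device='cpu')
#     parsed = ner.parse('John Smith works at Acme')  # DataPoint with entities
#     ner.save('/tmp/models', name='pyramid_ner')
#     restored = PyramidNer.load('/tmp/models/pyramid_ner')
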
class PyramidNerDataset(Dataset):
    """
    Dataset class. Use its `get_dataloader` method to obtain a DataLoader that
    can properly assemble batches of samples from this Dataset, since this
    Dataset performs dynamic padding of tensors and collation is not
    straightforward (a minimal smoke-test sketch is given under the `__main__`
    guard at the end of this module).
    """

    class _Collator(Dataset):

        def __init__(self, dataset, device='cpu'):
            self._device = device
            self._wrapped_dataset = dataset
            self._indices = torch.arange(len(dataset))

        def collate_fn(self, batch):
            # `batch` is a list of sample indices: stack them and let the
            # wrapped dataset build the padded tensors for the whole batch
            batch = torch.stack(batch)
            actual_batch = dict()
            for name, tensors in self._wrapped_dataset[batch].items():
                if name == 'y':
                    actual_batch[name] = [
                        tensor.to(self._device) for tensor in tensors
                    ]
                elif isinstance(tensors, torch.Tensor):
                    actual_batch[name] = tensors.to(self._device)
                elif isinstance(tensors, dict):
                    actual_batch[name] = self.collate_fn(tensors)
            return actual_batch

        def __len__(self):
            return len(self._wrapped_dataset)

        def __getitem__(self, i):
            return self._indices[i]

    def __init__(self,
                 data_reader,
                 token_lexicon=None,
                 custom_tokenizer=None,
                 char_vectorizer=False,
                 pyramid_max_depth=None):
        """
        :param data_reader: generator of DataPoint objects representing the
         samples in the dataset.
        :param token_lexicon: iterable of strings containing the lexicon. If
         it is None, it will be automatically generated from the data.
        :param custom_tokenizer: callable that performs tokenization given a
         single text input. If left to None, uses
         utils.text.default_tokenizer.
        :param char_vectorizer: adds char encodings.
        :param pyramid_max_depth: None for infinite depth.
        """
        self.data = [data_point for data_point in data_reader]
        self.tokenizer = custom_tokenizer or default_tokenizer
        if not token_lexicon:
            token_lexicon = {
                token
                for x in self.data for token in self.tokenizer(x.text)
            }
        self.word_vectorizer = WordVectorizer()
        self.word_vectorizer.set_tokenizer(self.tokenizer)
        self.word_vectorizer.fit(token_lexicon)
        if char_vectorizer:
            self.char_vectorizer = CharVectorizer()
            self.char_vectorizer.set_tokenizer(self.tokenizer)
            self.char_vectorizer.fit()
        else:
            self.char_vectorizer = None
        self.pyramid_max_depth = pyramid_max_depth
        self.label_encoder = PyramidLabelEncoder()
        self.label_encoder.set_tokenizer(self.tokenizer)
        self.label_encoder.fit([e.name for x in self.data for e in x.entities])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        if isinstance(i, int):
            ids = torch.tensor([i])
            sample = [self.data[i]]
        else:
            indices = torch.arange(len(self.data)).long()
            sample = [self.data[index] for index in indices[i]]
            ids = torch.tensor([index for index in indices[i]])
        data = self._transform_x(sample)
        max_depth = self.pyramid_max_depth
        data['y'], data['y_remedy'] = self.label_encoder.transform(
            sample, max_depth=max_depth)
        data['id'] = ids.long()
        return data

    def _transform_x(self, sample):
        x = dict()
        for vect, role in [(self.word_vectorizer, 'word'),
                           (self.char_vectorizer, 'char')]:
            if vect is not None:
                vectors, mask = vect.pad_sequences(vect.transform(sample))
                x[f'{role}_vectors'], x[f'{role}_mask'] = vectors, mask
        return x

    def get_dataloader(self,
                       batch_size=32,
                       shuffle=True,
                       device='cpu',
                       bucketing=False):

        def _collate_fn(batch, device=device):
            batch = batch[0]
            for name in batch.keys():
                if name == 'y':
                    batch[name] = [tensor.to(device) for tensor in batch[name]]
                elif isinstance(batch[name], torch.Tensor):
                    batch[name] = batch[name].to(device)
                elif isinstance(batch[name], dict):
                    batch[name] = _collate_fn([batch[name]], device)
            return batch

        if bucketing:
            # use sequence bucketing
            sequence_lengths = torch.tensor(
                [len(self.tokenizer(sample.text)) for sample in self.data])
            dataloader = SequenceBucketing.as_dataloader(
                self, sequence_lengths, batch_size, shuffle)
        else:
            collator = self._Collator(self, device)
            dataloader = DataLoader(collator,
                                    batch_size=batch_size,
                                    shuffle=shuffle)
            _collate_fn = collator.collate_fn
        dataloader.collate_fn = _collate_fn
        return dataloader
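
# The sketch below is illustrative only: it builds a tiny in-memory dataset of
# hand-written DataPoint/Entity samples (offsets follow the
# Entity(name, value, start, stop) convention used in PyramidNer._parse_text
# above) and iterates one padded batch, as described in the PyramidNerDataset
# docstring.
if __name__ == '__main__':
    samples = [
        DataPoint('John Smith lives in Paris',
                  [Entity('PER', 'John Smith', 0, 10),
                   Entity('LOC', 'Paris', 20, 25)]),
        DataPoint('Acme Corp hired Jane Doe',
                  [Entity('ORG', 'Acme Corp', 0, 9),
                   Entity('PER', 'Jane Doe', 16, 24)]),
    ]
    dataset = PyramidNerDataset(samples, char_vectorizer=True)
    dataloader = dataset.get_dataloader(batch_size=2, shuffle=False)
    for batch in dataloader:
        # each batch is a dict of padded tensors (plus the list of per-layer
        # label tensors under 'y'), already moved to the requested device
        print({name: type(value) for name, value in batch.items()})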