def init_on_reset(self, input_text: Union[List[str], str]):
    """Embed the episode's input text once and cache per-token embeddings.

    The embeddings are detached and moved to CPU so the cache holds no
    autograd graph or GPU memory; the sentence's own embedding storage is
    released afterwards.
    """
    sentence = Sentence(input_text)
    self.doc_embeddings.embed(sentence)
    cached = []
    for tok in sentence:
        cached.append(tok.embedding.cpu().detach())
    self._current_token_embeddings = cached
    sentence.clear_embeddings()
def _get_sentence_embedding(self, text: str) -> torch.Tensor:
    """Embed *text* with the document embedder and return a (1, dim) CPU tensor.

    Empty input is replaced by the placeholder "..." so the tokenizer always
    produces at least one token.
    """
    text = "..." if len(text) == 0 else text
    sent = Sentence(text)
    self.doc_embeddings.embed(sent)
    # Bug fix: the original had an `else` branch (for len(sent) < 1) that
    # read `sent[0].embedding` — indexing an EMPTY sentence, which would
    # raise IndexError if it ever ran. It was also unreachable, since empty
    # text is coerced to "..." above, so the document-level embedding is
    # always the value returned; the broken branch is removed.
    embedding = torch.tensor(sent.embedding.cpu().detach().numpy()).reshape(1, -1)
    sent.clear_embeddings()
    return embedding
def test_load_use_classifier():
    """Smoke test: load the 'sentiment' classifier and predict on a normal
    sentence, a mixed batch, and an empty-only batch."""
    model: TextClassifier = TextClassifier.load("sentiment")
    filled = Sentence("I love Berlin")
    blank = Sentence(" ")
    for batch in (filled, [filled, blank], [blank]):
        model.predict(batch)
    del model
    filled.clear_embeddings()
    blank.clear_embeddings()
def test_load_use_serialized_tagger():
    """Smoke test: load serialized NER and POS taggers and predict on a
    normal sentence, a mixed batch, and an empty-only batch."""
    sentence = Sentence(u'I love Berlin')
    sentence_empty = Sentence(u' ')

    tagger = SequenceTagger.load(u'ner')
    tagger.predict(sentence)
    tagger.predict([sentence, sentence_empty])
    tagger.predict([sentence_empty])

    # Release NER embeddings before re-tagging with the POS model.
    sentence.clear_embeddings()
    sentence_empty.clear_embeddings()

    tagger = SequenceTagger.load(u'pos')
    tagger.predict(sentence)
    tagger.predict([sentence, sentence_empty])
    tagger.predict([sentence_empty])
def test_load_use_serialized_tagger():
    """Smoke test: run the "ner" then the "pos" serialized tagger over a
    normal and an empty sentence, clearing embeddings in between."""
    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence(" ")
    for model_name in ("ner", "pos"):
        tagger: SequenceTagger = SequenceTagger.load(model_name)
        tagger.predict(sentence)
        tagger.predict([sentence, sentence_empty])
        tagger.predict([sentence_empty])
        if model_name == "ner":
            # Drop NER embeddings before the POS pass (matches the
            # original statement order exactly).
            sentence.clear_embeddings()
            sentence_empty.clear_embeddings()
def create_emb(doc):
    # Match each sentence of every section of *doc* against the embedded
    # search terms in `poss_sections` (per-token cosine similarity) and
    # collect section records whose text matched a known part.
    batch_size = 60
    dataset_section = []
    for section_title, section_text in doc['raw']['sections'].items():
        text = section_text.strip()
        # tokenize each sentence
        # skip language detection, embedding wont match anyway
        nlp_doc = nlp(text)
        texts = []
        flair_sentences = []
        for sentence in nlp_doc.sents:
            flair_sent = Sentence(sentence.text)
            flair_sentences.append(flair_sent)
            texts.append(sentence.text)
        # Embed in fixed-size batches to bound memory usage.
        for i in range(0, len(flair_sentences), batch_size):
            flair_emb.embed(flair_sentences[i:i + batch_size])
        for flair_sent, text in zip(flair_sentences, texts):
            for token in flair_sent.tokens:
                mean_vector = token.embedding
                max_value = -2
                max_part = None
                for possible_part, candidates in poss_sections.items():
                    for candidate in candidates:
                        score = cos(mean_vector, candidate)
                        if score > 0.9:
                            # treat as a valid match ("consideramos valido")
                            if max_value < score:
                                max_value = score
                                max_part = possible_part
            # NOTE(review): the source was collapsed to one line, so the
            # nesting here is ambiguous. As reconstructed, only the LAST
            # token's max_part/max_value survive to this point — confirm
            # whether the append was meant to run once per matching token.
            flair_sent.clear_embeddings()
            if max_part is not None:
                dataset_section.append({
                    'hash_id': doc['hash_id'],
                    'title': section_title,
                    'text': text,
                    'match': max_part
                })
    return dataset_section
def test_fine_tunable_flair_embedding():
    """Fine-tunable Flair embeddings wrapped in document embeddings should
    yield document vectors of the configured length, and clearing should
    empty them again."""
    lm_forward = LanguageModel(
        Dictionary.load('chars'), is_forward_lm=True, hidden_size=32, nlayers=1)

    # Document RNN on top of the fine-tunable LM: output size = hidden_size.
    rnn_embeddings = DocumentRNNEmbeddings(
        [FlairEmbeddings(lm_forward, fine_tune=True)],
        hidden_size=128,
        bidirectional=False)
    sent = Sentence('I love Berlin.')
    rnn_embeddings.embed(sent)
    assert len(sent.get_embedding()) == 128
    assert len(sent.get_embedding()) == rnn_embeddings.embedding_length
    sent.clear_embeddings()
    assert len(sent.get_embedding()) == 0

    # Plain document LM embedding: output size = the LM's hidden size.
    lm_embeddings = DocumentLMEmbeddings(
        [FlairEmbeddings(lm_forward, fine_tune=True)])
    sent = Sentence('I love Berlin.')
    lm_embeddings.embed(sent)
    assert len(sent.get_embedding()) == 32
    assert len(sent.get_embedding()) == lm_embeddings.embedding_length
    sent.clear_embeddings()
    assert len(sent.get_embedding()) == 0
# Cosine similarity over embedding vectors (dim 0 is the feature axis).
cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6)

# Search terms per section "part"; each term string is replaced below by
# its Flair embedding so sections can be matched by vector similarity.
poss_sections = {
    '#gender': ['gender', 'sex', 'gentleman'],
    '#male': ['male', 'man', 'men'],
    '#female': ['female', 'woman', 'women'],
}

# substitute term by its embedding
for list_candidates in tqdm(poss_sections.values(), desc='Embedding search terms'):
    for i in range(len(list_candidates)):
        sentence = Sentence(list_candidates[i].lower())
        flair_emb.embed(sentence)
        # Keep only the sentence-level embedding tensor; the sentence's
        # own cached embeddings are released right after.
        list_candidates[i] = sentence.embedding
        sentence.clear_embeddings()

dataset = []
print('Retrieving documents from database...')
documents = Database.list_raw_documents()

def create_emb(doc):
    # NOTE(review): this definition is truncated in the visible chunk —
    # only its opening statements appear here (a complete version exists
    # elsewhere in the file).
    batch_size = 60
    dataset_section = []
    for section_title, section_text in doc['raw']['sections'].items():
        text = section_text.strip()
        # tokenize each sentence
        # skip language detection, embedding wont match anyway
        nlp_doc = nlp(text)
def _get_input_dim(self):
    """Return the per-token embedding dimensionality of the document embedder.

    Embeds a throwaway probe sentence and reads the first token's embedding
    shape; the probe's embeddings are released before returning.
    """
    probe = Sentence("A random text to get the embedding dimension")
    self.doc_embeddings.embed(probe)
    dimension = probe[0].embedding.shape[0]
    probe.clear_embeddings()
    return dimension