Example No. 1
def init_on_reset(self, input_text: Union[List[str], str]):
    # Embed the input once per reset and cache the token embeddings on the CPU.
    sent = Sentence(input_text)
    self.doc_embeddings.embed(sent)
    self._current_token_embeddings = [
        token.embedding.cpu().detach() for token in sent
    ]
    # Release the embeddings held by the Sentence to free memory.
    sent.clear_embeddings()
Example No. 2
def _get_sentence_embedding(self, text: str) -> torch.Tensor:
    # Replace empty input with a placeholder so the Sentence is never empty.
    text = "..." if len(text) == 0 else text
    sent = Sentence(text)
    self.doc_embeddings.embed(sent)
    if len(sent) >= 1:
        # Use the document-level embedding of the whole sentence.
        embedding = torch.tensor(sent.embedding.cpu().detach().numpy()).reshape(1, -1)
    else:
        # Fallback branch; effectively unreachable because empty text is
        # replaced with "..." above.
        embedding = torch.tensor(sent[0].embedding.cpu().detach().numpy()).reshape(1, -1)
    # Free the embeddings attached to the Sentence before returning.
    sent.clear_embeddings()
    return embedding
Example No. 3
def test_load_use_classifier():
    loaded_model: TextClassifier = TextClassifier.load("sentiment")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence("       ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])
    del loaded_model

    sentence.clear_embeddings()
    sentence_empty.clear_embeddings()
Example No. 4
def test_load_use_serialized_tagger():
    loaded_model = SequenceTagger.load('ner')
    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence('       ')
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])
    sentence.clear_embeddings()
    sentence_empty.clear_embeddings()
    loaded_model = SequenceTagger.load('pos')
    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])
Example No. 5
def test_load_use_serialized_tagger():
    loaded_model: SequenceTagger = SequenceTagger.load("ner")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence("       ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    sentence.clear_embeddings()
    sentence_empty.clear_embeddings()

    loaded_model: SequenceTagger = SequenceTagger.load("pos")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])
Example No. 6
def create_emb(doc):
    batch_size = 60

    dataset_section = []
    for section_title, section_text in doc['raw']['sections'].items():
        text = section_text.strip()
        # tokenize each sentence
        # skip language detection, the embedding won't match anyway
        nlp_doc = nlp(text)

        texts = []
        flair_sentences = []
        for sentence in nlp_doc.sents:
            flair_sent = Sentence(sentence.text)
            flair_sentences.append(flair_sent)
            texts.append(sentence.text)

        for i in range(0, len(flair_sentences), batch_size):
            flair_emb.embed(flair_sentences[i:i + batch_size])

        for flair_sent, text in zip(flair_sentences, texts):
            # Track the best-matching section label across all tokens in the sentence.
            max_value = -2
            max_part = None
            for token in flair_sent.tokens:
                mean_vector = token.embedding
                for possible_part, candidates in poss_sections.items():
                    for candidate in candidates:
                        score = cos(mean_vector, candidate)
                        if score > 0.9:  # consider the match valid
                            if max_value < score:
                                max_value = score
                                max_part = possible_part

            flair_sent.clear_embeddings()

            if max_part is not None:
                dataset_section.append({
                    'hash_id': doc['hash_id'],
                    'title': section_title,
                    'text': text,
                    'match': max_part
                })

    return dataset_section
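For context, a hedged sketch (not part of the original snippets) of how create_emb might be driven from the document list retrieved in Example No. 8 below; the driver loop itself and the use of dataset / documents at the call site are assumptions:

# Assumed driver loop: run create_emb over every raw document retrieved from the
# database and collect the matched sections into a single dataset.
dataset = []
for doc in documents:
    dataset.extend(create_emb(doc))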
Example No. 7
def test_fine_tunable_flair_embedding():
    language_model_forward = LanguageModel(Dictionary.load(
        'chars'), is_forward_lm=True, hidden_size=32, nlayers=1)
    embeddings = DocumentRNNEmbeddings([FlairEmbeddings(
        language_model_forward, fine_tune=True)], hidden_size=128, bidirectional=False)
    sentence = Sentence('I love Berlin.')
    embeddings.embed(sentence)
    assert (len(sentence.get_embedding()) == 128)
    assert (len(sentence.get_embedding()) == embeddings.embedding_length)
    sentence.clear_embeddings()
    assert (len(sentence.get_embedding()) == 0)
    embeddings = DocumentLMEmbeddings(
        [FlairEmbeddings(language_model_forward, fine_tune=True)])
    sentence = Sentence('I love Berlin.')
    embeddings.embed(sentence)
    assert (len(sentence.get_embedding()) == 32)
    assert (len(sentence.get_embedding()) == embeddings.embedding_length)
    sentence.clear_embeddings()
    assert (len(sentence.get_embedding()) == 0)
Example No. 8
cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6)

poss_sections = {
    '#gender': ['gender', 'sex', 'gentleman'],
    '#male': ['male', 'man', 'men'],
    '#female': ['female', 'woman', 'women'],
}

# replace each search term with its embedding
for list_candidates in tqdm(poss_sections.values(),
                            desc='Embedding search terms'):
    for i in range(len(list_candidates)):
        sentence = Sentence(list_candidates[i].lower())
        flair_emb.embed(sentence)
        list_candidates[i] = sentence.embedding
        sentence.clear_embeddings()

dataset = []
print('Retrieving documents from database...')
documents = Database.list_raw_documents()


def create_emb(doc):
    batch_size = 60

    dataset_section = []
    for section_title, section_text in doc['raw']['sections'].items():
        text = section_text.strip()
        # tokenize each sentence
        # skip language detection, the embedding won't match anyway
        nlp_doc = nlp(text)
Example No. 9
def _get_input_dim(self):
    # Embed a short dummy sentence to discover the embedding dimensionality.
    sent = Sentence("A random text to get the embedding dimension")
    self.doc_embeddings.embed(sent)
    dim = sent[0].embedding.shape[0]
    sent.clear_embeddings()
    return dim
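All of the examples above follow the same pattern: embed a Sentence, read the vectors that are needed, then call clear_embeddings() so that long-running loops do not accumulate memory. A minimal self-contained sketch of that pattern, assuming classic GloVe word embeddings (any flair embedding class works the same way):

from flair.data import Sentence
from flair.embeddings import WordEmbeddings

# Assumed embedding stack, chosen for illustration only.
embedding = WordEmbeddings("glove")

sentence = Sentence("I love Berlin")
embedding.embed(sentence)

# Read the vectors while they are still attached to the Sentence...
vectors = [token.embedding.cpu().detach() for token in sentence]

# ...then drop them to release the memory they hold.
sentence.clear_embeddings()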