def process_df(xdf, ydf=None, passing_y=False):
    args1 = [' '.join(nltk.sent_tokenize(x)[0:5]) for x in xdf['argument1'].tolist()]
    args1 = [x[0:500] for x in args1]
    args2 = [' '.join(nltk.sent_tokenize(x)[0:5]) for x in xdf['argument2'].tolist()]
    args2 = [x[0:500] for x in args2]
    # ydf may be None when no labels are passed along
    ys = ydf['is_same_side'].tolist() if ydf is not None else []

    x1_out = []
    for c in chunks(args1, 2):
        sents  = [Sentence(x, use_tokenizer=True) for x in c]
        document_embeddings.embed(sents)
        for sent in sents:
            x1_out.append(sent.get_embedding().detach())
        
        del sents

    x2_out = []
    for c in chunks(args2, 2):
        sents  = [Sentence(x, use_tokenizer=True) for x in c]
        document_embeddings.embed(sents)
        for sent in sents:
            x2_out.append(sent.get_embedding().detach())
        
        del sents

    if passing_y:
        ys = [1 if y else 0 for y in ys]
    else:
        ys = [0] * len(x1_out)

    return x1_out, x2_out, ys
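# The snippet above relies on a module-level `chunks` helper and a `document_embeddings`
# object that are not shown; a minimal sketch, assuming a pooled GloVe document
# embedding (the names and setup below are assumptions, not part of the original):
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings


def chunks(items, size):
    """Yield successive fixed-size slices of a list."""
    for i in range(0, len(items), size):
        yield items[i:i + size]


document_embeddings = DocumentPoolEmbeddings([WordEmbeddings('glove')])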
Example #2
    def chunked_embed(corpus, embeddings, chunk_size=256):
        def find_nth(n, substring, text, start):
            index = start
            for _ in range(n):
                index = text.find(substring, index + 1)
            return index

        try:
            partial_embeddings = []
            i = 0
            while i < len(corpus):
                next_i = find_nth(chunk_size, " ", corpus, i)
                if next_i < i:
                    next_i = len(corpus)
                chunk = corpus[i:next_i]
                sentence = Sentence(chunk, use_tokenizer=False)
                embeddings.embed(sentence)
                partial_embeddings.append(sentence.get_embedding().numpy())
                i = next_i
            avg = np.average(np.asarray(partial_embeddings), axis=0)
            return avg
        except RuntimeError:
            print(
                "Please ignore the message above about the sentence being too long; "
                "retrying with a smaller chunk size."
            )
            return FeatureExtractor.chunked_embed(corpus, embeddings,
                                                  chunk_size // 2)
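# Usage sketch for chunked_embed, which appears to be a static method on a
# FeatureExtractor class; the pooled GloVe setup below is an assumption, not part
# of the original:
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings

pooled = DocumentPoolEmbeddings([WordEmbeddings('glove')])
doc_vector = FeatureExtractor.chunked_embed("a very long document " * 500, pooled, chunk_size=256)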
Example #3
    def get_embeddings(self, sentence):

        # document_embeddings = DocumentPoolEmbeddings(
        #    [self.glove_embedding,  # initialize the document embeddings, mode = mean
        #     self.flair_embedding_backward,
        #     self.flair_embedding_forward])

        # Glove + BPE
        document_embeddings = DocumentPoolEmbeddings(
            [self.glove_embedding, self.bpe_embedding])

        # NILC fastText 600 embedding
        #document_embeddings = DocumentPoolEmbeddings(
        #            [self.fast_text_embedding])

        # Flair
        #document_embeddings = DocumentPoolEmbeddings(
        #    [self.flair_embedding_forward])

        # ELMo
        #document_embeddings = DocumentPoolEmbeddings(
        #    [self.elmo_embedding])

        # create an example sentence
        sentence = Sentence(sentence)

        # embed the sentence with our document embedding
        document_embeddings.embed(sentence)

        # now check out the embedded sentence.
        return sentence.get_embedding()
Example #4
def get_pooling_embedding(document):
    tokens = [token.text for token in nlp(document)]
    text = ' '.join(tokens)
    sentence = Sentence(text)
    document_pooling_embeddings.embed(sentence)

    return sentence.get_embedding().squeeze().tolist()
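# get_pooling_embedding above assumes module-level `nlp` (a spaCy pipeline) and
# `document_pooling_embeddings` objects; one possible setup (an assumption, not part
# of the original):
import spacy
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings

nlp = spacy.load('en_core_web_sm')
document_pooling_embeddings = DocumentPoolEmbeddings([WordEmbeddings('glove')])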
Example #5
 def get_sentence_vector(self, text):
     sentence = Sentence(clean_text(text))
     _ = self.embeddings.embed(sentence)
     a = sentence.get_embedding()
     result = a.cpu().detach().numpy()
     if np.sum(result[0:5]) == 0:
         result = np.random.randn(self.n_dims)
     return result
Example #6
def get_fastText_embeding(text_str):
    '''
    Returns the embedding of a text.
    :param text_str: text to embed
    :return: the embedding vector corresponding to the text
    '''
    text = Sentence(text_str)
    pool_embeddings.embed(text)
    return text.get_embedding()
  def construct_vector(self, original_sentence):
    """
    Given a sentence, construct and return a vector based on the stacked embeddings.
    """
    
    sentence = Sentence(original_sentence)
    self.stacked_embedding.embed(sentence)
    sentence_embedding = sentence.get_embedding()
    sentence_embedding_array = sentence_embedding.detach().numpy()

    return sentence_embedding_array
def other_embeddings(embd):
    sess = tf.InteractiveSession()  # TF1-style session so tf.constant(...).eval() works below
    train_data_list = []
    test_data_list = []
    val_data_list = []
    if embd == 'glove':
        print('Starting Glove Embedding...')
        glove_embedding = WordEmbeddings('glove')
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[glove_embedding])
    elif embd == 'xlnet':
        print('Starting XLNet Embedding...')
        xlnet_embedding = XLNetEmbeddings('xlnet-large-cased')
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[xlnet_embedding])
    elif embd == 'fasttext':
        print('Starting Fasttext Embedding...')
        fasttext_embedding = WordEmbeddings('en')
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[fasttext_embedding])
    elif embd == 'elmo':
        print('Starting ELMo Embedding...')
        elmo_embedding = ELMoEmbeddings()
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[elmo_embedding])
    else:
        # init Flair embeddings
        flair_forward_embedding = FlairEmbeddings('multi-forward')
        flair_backward_embedding = FlairEmbeddings('multi-backward')
        glove_embedding = WordEmbeddings('glove')
        # now create the DocumentPoolEmbeddings object that combines all embeddings
        document_embeddings = DocumentPoolEmbeddings(embeddings=[
            glove_embedding, flair_forward_embedding, flair_backward_embedding
        ])
    print('Train embedding Started...')
    for text in final_train['text'].tolist():
        text = Sentence(text)
        document_embeddings.embed(text)
        emb = text.get_embedding().detach().numpy()
        emb = tf.constant(emb).eval()
        train_data_list.append(emb)
    print('Embedded Train data!!')
    print('Test embedding Started...')
    for text in final_test['text'].tolist():
        text = Sentence(text)
        document_embeddings.embed(text)
        emb = text.get_embedding().detach().numpy()
        emb = tf.constant(emb).eval()
        test_data_list.append(emb)
    print('Embedded Test data!!')
    print('Val embedding Started...')
    for text in final_val['text'].tolist():
        text = Sentence(text)
        document_embeddings.embed(text)
        emb = text.get_embedding().detach().numpy()
        emb = tf.constant(emb).eval()
        val_data_list.append(emb)
    print('Embedded Val data!!')
    return train_data_list, test_data_list, val_data_list
def save_json(jfile):
    global dataset

    user_id = os.path.basename(jfile).split('.')[0]
    save_path = os.path.join('..', DATA_DIR,
                             'pos_tags_{}_embeds'.format(dataset),
                             '{}.json'.format(user_id))

    if os.path.isfile(save_path):
        print("Skipping user {} file already exists".format(user_id))
        return

    document_embeddings = get_doc_embeddings()

    with open(jfile, encoding='utf-8') as f:
        user_data = json.load(f)

        if len(user_data['tokens']) > 500:
            print('User {}.json has {} posts NOT skipping'.format(
                user_id, len(user_data['tokens'])))
            # return

        posts_list = user_data['tokens']  # each post is a list of tokens
        pos_tags_list = user_data['posTags']
        posts_lowercase_list = []
        posts_embeddings_list = []
        pos_tags_list_lowercase = []

        for i, (post, pos_tags) in enumerate(zip(posts_list, pos_tags_list)):
            post_lowercase = [token.lower() for token in post]
            if any("http" in word for word in post_lowercase):
                continue
            if len(post_lowercase) > 100:
                print('long post')
            if len(post_lowercase) > 0:
                posts_lowercase_list.append(post_lowercase)
                pos_tags_list_lowercase.append(pos_tags)
                post_sentence = Sentence(' '.join(post_lowercase))
                document_embeddings.embed(post_sentence)
                posts_embeddings_list.append(
                    post_sentence.get_embedding().tolist())

        user_data["tokens"] = posts_lowercase_list
        user_data["posTags"] = pos_tags_list_lowercase
        user_data["embeddings"] = posts_embeddings_list

        with open(save_path, 'w') as out_file:
            json.dump(user_data, out_file)
        print('Finished with file {}.json'.format(user_id))
Example #10
def embed_flair(texts, max_length=100, max_words=1000):
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    texts = tokenizer.sequences_to_texts(sequences)

    sentence_embeddings = []
    padding = np.zeros(embedding_features)
    count = 0
    step = 3
    total = len(texts)
    for text in texts:
        sentence_embedding = []
        paddings = []
        sentence = Sentence(text)
        embeddings_flair.embed(sentence)
        for token in sentence:
            sentence_embedding.append(token.embedding.cpu().numpy())
        for i in range(max_length - len(sentence_embedding)):
            paddings.append(padding)
        if len(paddings) > 0:
            sentence_embedding = np.concatenate([paddings, sentence_embedding],
                                                axis=0)
        else:
            sentence_embedding = np.array(sentence_embedding[:max_length])
        count += 1
        if 100 * count / total > step:
            print(str(step) + '%')
            step += 3
        sentence_embeddings.append(sentence_embedding)

    return np.array(sentence_embeddings)
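# embed_flair above assumes module-level `embeddings_flair` and `embedding_features`
# globals, plus Keras' Tokenizer; one possible setup (an assumption, not part of the
# original):
from keras.preprocessing.text import Tokenizer
from flair.embeddings import FlairEmbeddings

embeddings_flair = FlairEmbeddings('news-forward')
embedding_features = embeddings_flair.embedding_length  # width of each token vector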
Example #11
    def get(self, keys):

        if self._embeddings is None:

            if self._no_cuda:
                import flair
                import torch
                flair.device = torch.device('cpu')

            from .flair_bert import BertEmbeddings

            self._embeddings = BertEmbeddings(
                bert_model_or_path=self._path,
                layers=self._layers,
                pooling_operation=self._pooling_operation,
                use_scalar_mix=self._use_scalar_mix)

        sentences = [Sentence(key) for key in keys]

        # noinspection PyUnresolvedReferences
        self._embeddings.embed(sentences)

        for s_idx, sentence in enumerate(sentences):

            for t_idx, token in enumerate(sentence):

                emb = token.embedding.cpu().numpy()
                tok = str(token)

                yield tok, emb

                del token

            del sentence
Example #12
def embed_tweet(tweetList):
    # initialize the word embeddings
    tr_embedding = WordEmbeddings('tr')
    char_embedding = CharacterEmbeddings()

    # initialize the document embeddings, mode = mean
    document_embeddings = DocumentPoolEmbeddings(
        [tr_embedding, char_embedding])

    tweetTensors = []
    for tweet in tweetList:
        #print(norm_tweet(tweet))
        sentence = Sentence(norm_tweet(tweet))
        document_embeddings.embed(sentence)
        tweetTensors.append(sentence.get_embedding().data)
    return tweetTensors
Example #13
    def get_word_vectors(self, words: List[str]) -> List[np.ndarray]:
        """
        Vectorizes the list of words, using pretrained Flair embeddings. These embeddings are context dependent, so this
        method is preferred over fetching word vectors for single words.

        :param words: The list of words to vectorize.
        :return: A list of word vectors.
        """
        sentence = Sentence(' '.join(words))
        self.model.embed(sentence)
        return list(
            map(lambda token: np.array(token.embedding), list(sentence)))
Example #14
    def embed_data(self, sentences):
        sentences = [Sentence(s) for s in sentences]
        self.embedding.embed(sentences)

        if self.method == "average":
            sentences = [torch.stack([word.embedding.detach().cpu() for word in s]).mean(
                0) for s in sentences]
        else:
            sentences = [torch.stack(
                [word.embedding.detach().cpu() for word in s]) for s in sentences]

        return sentences
Example #15
    def find(self, text):
        if not self.compiled:
            raise Exception('You need to compile the vocabulary first.')

        # Apply the general tagger
        if self.ner_tagger:
            text_ = Sentence(text)
            self.ner_tagger.predict(text_)
            for ent in text_.to_dict(tag_type='ner')['entities']:
                yield {
                    'text': ent['text'],
                    'start_pos': ent['start_pos'],
                    'end_pos': ent['end_pos']
                }

        # Apply the specialized vocabulary
        if self.compiled:
            for item in self.compiled.finditer(text):
                span = item.span(0)
                yield {'text': item.group(0), 'start_pos': span[0], 'end_pos': span[1]}
Example #16
    def encode(self, data: 'np.ndarray', *args, **kwargs) -> 'np.ndarray':
        """

        :param data: a 1d array of string type in size `B`
        :return: an ndarray in size `B x D`
        """
        import torch
        from flair.embeddings import Sentence
        c_batch = [Sentence(row) for row in data]
        self.model.embed(c_batch)
        return torch.stack([c_text.get_embedding()
                            for c_text in c_batch]).detach().numpy()
Example #17
 def transform(self, X: dt.Frame):
     X.replace([None, math.inf, -math.inf], self._repl_val)
     from flair.embeddings import WordEmbeddings, BertEmbeddings, DocumentPoolEmbeddings, Sentence
     if self.embedding_name in ["glove", "en"]:
         self.embedding = WordEmbeddings(self.embedding_name)
     elif self.embedding_name in ["bert"]:
         self.embedding = BertEmbeddings()
     self.doc_embedding = DocumentPoolEmbeddings([self.embedding])
     output = []
     X = X.to_pandas()
     text1_arr = X.iloc[:, 0].values
     text2_arr = X.iloc[:, 1].values
     for ind, text1 in enumerate(text1_arr):
         try:
             text1 = Sentence(str(text1).lower())
             self.doc_embedding.embed(text1)
             text2 = text2_arr[ind]
             text2 = Sentence(str(text2).lower())
             self.doc_embedding.embed(text2)
             score = cosine_similarity(
                 text1.get_embedding().reshape(1, -1),
                 text2.get_embedding().reshape(1, -1))[0, 0]
             output.append(score)
         except Exception:
             output.append(-99)
     return np.array(output)
Example #18
    def load_documents_into_embedding(self):
        print("Embedding ", len(self.documents_orig), " Documents")
        #self.documents_orig = self.documents_orig[0:50]
        self.documents = [
            self.elmo.embed(Sentence(elem)) for elem in self.documents_orig
        ]

        self.documents = torch.stack([
            torch.cat([token.embedding.unsqueeze(0) for token in elem[0]],
                      dim=0)[0] for elem in self.documents
        ])

        np.save("./documents_embedded.npy", self.documents)
Example #19
    def get_word_vector(self, word: str) -> Optional[np.ndarray]:
        """
        Returns the word vector for |word|, or None. Using this method is discouraged, as it defeats the purpose of
        Flair's contextual embeddings. Instead, include the surrounding context for more accurate vectorization.

        In reality, Flair embeddings never return None, even for bogus words.

        :param word: The word to vectorize.
        :return: Either the word vector or None.
        """
        dummy_sentence = Sentence(word)
        self.model.embed(dummy_sentence)
        return np.array(list(dummy_sentence)[0].embedding)
Example #20
def answer_similarity(ans1, real):
    sent1 = Sentence(ans1)
    sent2 = Sentence(real)
    document_embeddings.embed(sent1)
    document_embeddings.embed(sent2)
    emb1 = sent1.get_embedding()
    emb2 = sent2.get_embedding()
    emb1 /= torch.sqrt((emb1**2).sum())
    emb2 /= torch.sqrt((emb2**2).sum())

    return max(0., (emb1.T @ emb2).item())
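# answer_similarity above is cosine similarity clamped at zero; an equivalent sketch
# using torch.nn.functional (the same module-level `document_embeddings` is assumed):
import torch.nn.functional as F

def answer_similarity_alt(ans1, real):
    sent1, sent2 = Sentence(ans1), Sentence(real)
    document_embeddings.embed(sent1)
    document_embeddings.embed(sent2)
    sim = F.cosine_similarity(sent1.get_embedding().unsqueeze(0),
                              sent2.get_embedding().unsqueeze(0)).item()
    return max(0., sim)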
Example #21
    def get(self, keys, return_positions):

        from flair.embeddings import Sentence

        sentences = [Sentence(key, use_tokenizer=self._use_tokenizer) for key in keys]

        # noinspection PyUnresolvedReferences
        self._embeddings.embed(sentences)

        for s_idx, (sentence, ret_positions) in enumerate(zip(sentences, return_positions)):

            for t_idx, token in enumerate(sentence):

                if t_idx not in ret_positions:
                    continue  # ignore tokens where embeddings have not been requested

                yield s_idx, token.text, token.embedding.cpu().numpy()
Example #22
def flair_embeddings(x, *args):
    from flair.embeddings import DocumentPoolEmbeddings, DocumentRNNEmbeddings, Sentence

    word_embedders, aggregating_strategy, aggregating_params = args[0], args[1], args[2]
    embedding = None
    if aggregating_strategy == 'pooling':
        # TODO: check if kwargs work
        embedding = DocumentPoolEmbeddings(word_embedders,
                                           **aggregating_params)
    if aggregating_strategy == 'rnn':
        # TODO: check if kwargs work
        embedding = DocumentRNNEmbeddings(word_embedders, **aggregating_params)
    if embedding is None:
        raise ValueError("Unknown aggregating_strategy: expected 'pooling' or 'rnn'")
    sentence = Sentence(x)
    embedding.embed(sentence)
    return sentence.embedding.detach().numpy().reshape(-1, 1)
Example #23
def criterion(str1, str2, embed):
    try:
        s1 = Sentence(str1)
        s2 = Sentence(str2)
        embed.embed(s1)
        s1_emb = s1.get_embedding()
        embed.embed(s2)
        s2_emb = s2.get_embedding()

        return torch.cosine_similarity(s1_emb.unsqueeze(0), s2_emb.unsqueeze(0)).item()
    
    except Exception:
        return 0.5
Example #24
    def run_query(self, query, k=None):
        """Run a query on the given documents based on word embeddings
        
        Arguments:
            query {str} -- Query string.
        
        Keyword Arguments:
            k {int} -- The top documents to return (default: 10)
        
        Returns:
            list[tuple[float, int]] -- Sorted list of tuples, which contain the score and the document id.
                Made up example to show the formatting with k=5:
                        [(0.89316645860672, 1567), 
                        (0.6174346804618835, 125), 
                        (0.5975501537321234, 1181), 
                        (0.5779426293373108, 3979), 
                        (0.5110726475715637, 7155)]
        """
        if k is None:
            k = 10

        sentence = Sentence(query)

        #self.embedding.embed(sentence)

        self.elmo.embed(sentence)

        sentence = [token.embedding.unsqueeze(0) for token in sentence][0]

        #print(sentence)

        # A returned list should look like this for k=5. Btw. the numbers are made up!

        #[
        #            (0.89316645860672, 1567),
        #            (0.6174346804618835, 125),
        #            (0.5975501537321234, 1181),
        #            (0.5779426293373108, 3979),
        #            (0.5110726475715637, 7155),
        #        ]

        return self.knn(sentence, query, k=k)
def get_embeddings(encoder, sentence, input_lang):
    with torch.no_grad():
        if word_vecs == "flair":
            flair_embedding = StackedEmbeddings([
                FlairEmbeddings('de-forward'),
                FlairEmbeddings('de-backward'),
            ])

            sent = Sentence(sentence + " <EOS>")
            flair_embedding.embed(sent)
            input_tensor = [token.embedding for token in sent.tokens]
            input_length = len(input_tensor)
        else:
            input_tensor = tensorFromSentence(input_lang, sentence)
            input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        for ei in range(input_length):
            _, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        return encoder_hidden
def glove_eucleadian(question, sentence_list):
    question = Sentence(question)
    euc = nn.PairwiseDistance(p=2)
    document_embeddings.embed(question)
    q_emd = question.get_embedding()
    q_emd = q_emd.unsqueeze(0)
    sentence_vectors = torch.empty((1, EMBEDDING_DIM))  # .to(device)
    for idx, sent in enumerate(sentence_list):
        sent = Sentence(sent)
        document_embeddings.embed(sent)
        sent_emd = sent.get_embedding()
        if idx == 0:
            sentence_vectors = sent_emd.unsqueeze(0)
        else:
            sentence_vectors = torch.cat(
                (sentence_vectors, sent_emd.unsqueeze(0)))

    output = euc(q_emd, sentence_vectors)
    return output
def glove_cosine_similarity(question, sentence_list):
    question = Sentence(question)
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    document_embeddings.embed(question)
    q_emd = question.get_embedding()
    q_emd = q_emd.unsqueeze(0)
    sentence_vectors = torch.empty((1, EMBEDDING_DIM))  # .to(device)
    for idx, sent in enumerate(sentence_list):
        sent = Sentence(sent)
        document_embeddings.embed(sent)
        sent_emd = sent.get_embedding()
        if idx == 0:
            sentence_vectors = sent_emd.unsqueeze(0)
        else:
            sentence_vectors = torch.cat(
                (sentence_vectors, sent_emd.unsqueeze(0)))

    output = cos(q_emd, sentence_vectors)
    return output
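# glove_eucleadian and glove_cosine_similarity above assume module-level
# `document_embeddings`, `EMBEDDING_DIM`, and `nn` (torch.nn) objects; one possible
# setup (an assumption, not part of the original):
import torch.nn as nn
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings

document_embeddings = DocumentPoolEmbeddings([WordEmbeddings('glove')])
EMBEDDING_DIM = document_embeddings.embedding_length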
Example #28
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, Sentence, OneHotEmbeddings, \
 DocumentRNNEmbeddings

# initialize the word embeddings
glove_embedding = WordEmbeddings('glove')
flair_embedding_forward = FlairEmbeddings('news-forward')
flair_embedding_backward = FlairEmbeddings('news-backward')
# embeddings = OneHotEmbeddings(corpus)


# initialize the document embeddings, mode = mean
document_embeddings = DocumentPoolEmbeddings(
    [glove_embedding],
    # flair_embedding_backward, flair_embedding_forward],
    # pooling='min',
    fine_tune_mode='nonlinear')
document_embeddings = DocumentRNNEmbeddings([glove_embedding])  # note: this overwrites the pooled embeddings above

document_lstm_embeddings = DocumentRNNEmbeddings([glove_embedding],
                                                 rnn_type='LSTM')

# create an example sentence
sentence = Sentence('The grass is green . And the sky is blue .')

# embed the sentence with our document embedding
document_embeddings.embed(sentence)

# now check out the embedded sentence.
print(sentence.get_embedding())
Example #29
import numpy as np
from pandas import read_csv
import pickle

from flair.embeddings import WordEmbeddings, DocumentRNNEmbeddings, Sentence

data = read_csv('data/abcnews-date-text.csv', error_bad_lines=False)
documents = data[['headline_text']].values.reshape(-1).tolist()
# documents = list(pickle.load(open( "./corpus/df_proyectosFECYT.pkl", "rb" ) )['LEMAS_UC3M'])

glove_embedding = WordEmbeddings('glove')
document_embeddings = DocumentRNNEmbeddings([glove_embedding], hidden_size=512)
embeddings = []

count = 0

try:
    for document in documents:
        count += 1
        sentence = Sentence(document)

        document_embeddings.embed(sentence)

        embeddings.append(sentence.get_embedding().tolist())

        if (count % 1000 == 0): print(count)

finally:  # In case an error occurs before finish, we store previous results
    embeddings_array = np.array(embeddings)
    np.save("embeds_abcnews_512_2.npy", embeddings_array)
corpus = pickle.load(open(inputFileName, 'br'))

# In[5]:

vectors = {}

# In[6]:

for d in corpus:
    print("processing ", d)
    totLen = len(corpus[d]['text'])
    for i, s in enumerate(corpus[d]['text']):
        if i % 10 == 0:
            print("processed {}/{}        ".format(i, totLen), end='\r')
        sentence = Sentence(s)
        char_lm_embeddings.embed(sentence)
        for token in sentence:
            if token.text not in vectors:
                string = token.text
                for v in token.embedding.cpu().numpy():
                    string += ' {}'.format(v)
                vectors[token.text] = string
    print("processed {}/{}        ".format(i, totLen))

# In[11]:

with open(outputFileName, 'wt') as f:
    for k in vectors:
        f.write(vectors[k])
        f.write("\n")
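# The loop above assumes `inputFileName`, `outputFileName`, and a token-level
# `char_lm_embeddings` object defined in earlier notebook cells; one possible setup
# (an assumption, not part of the original):
from flair.embeddings import FlairEmbeddings

char_lm_embeddings = FlairEmbeddings('news-forward')  # character language-model embeddings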