Example #1
from flair.data import Sentence
from flair.embeddings import ELMoEmbeddings


def elmo_embeddings_flair_lib_tokenization(sentences, output_file=None):
    if output_file:
        f = open(output_file, 'w')
    # init embedding
    # For English biomedical data you can use 'pubmed'
    embedding = ELMoEmbeddings(
        'original')  # English: 4096-hidden, 2 layers, 93.6M parameters
    for i, sent in enumerate(sentences):
        print("Encoding the {}th input sentence!".format(i))
        # create a sentence (flair tokenizes the raw text itself)
        sentence = Sentence(sent[0])

        # embed words in sentence
        embedding.embed(sentence)
        for token in sentence:
            if output_file:
                f.write(
                    token.text + " " +
                    " ".join([str(num)
                              for num in token.embedding.tolist()]) + '\n')
            else:
                print(token.text + " " +
                      " ".join([str(num)
                                for num in token.embedding.tolist()]) + '\n')
        # blank line separates sentences in the output file
        if output_file:
            f.write('\n')
    if output_file:
        f.close()
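
A minimal usage sketch for the function above; the sentence list and output path are made-up placeholders:

# Hypothetical call: each element is a one-item list holding the raw sentence
# text, matching the sent[0] lookup inside the function.
sentences = [["The patient was given aspirin."],
             ["No adverse events were observed."]]
elmo_embeddings_flair_lib_tokenization(sentences, output_file="elmo_tokens.txt")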
Example #2

def main(directory, embeddings, strategy):
    # 1. find corpora in data directory
    corpora = {"train": None, "dev": None, "test": None}
    for labelset in corpora:
        for file in sorted(os.listdir(directory)):
            if infer_split(file) == labelset:
                corpora[labelset] = pd.read_csv(
                    os.path.join(directory, file),
                    sep="\t",
                    names=["text", "pos", "lemma", "label"],
                    engine="python",
                    error_bad_lines=False,
                    quoting=csv.QUOTE_NONE).fillna("")
                break

    if embeddings == "elmo":
        embedder = ELMoEmbeddings("original")
    elif embeddings == "flair":
        embedder = FlairEmbeddings("news-forward")
    elif embeddings == "bert":
        embedder = TransformerWordEmbeddings('bert-base-cased')
    else:
        raise ValueError(f"unsupported embeddings type: {embeddings}")

    embeddings_dir = os.path.join(directory, embeddings + "_embeddings")
    os.makedirs(embeddings_dir, exist_ok=True)

    # map the strategy name to the pooling function applied over token embeddings
    strategy = {"mean": np.mean, "max": np.max, "sum": np.sum}.get(strategy)

    for labelset, corpus in corpora.items():
        if corpus is None:
            print(f"empty corpus: {labelset}")
            continue
        voc = sorted(corpus["text"].unique())
        print(f"Unique tokens: {len(voc)}")

        with open(os.path.join(embeddings_dir, labelset + ".w2v"), "w") as f:
            for word in voc:
                sentence = Sentence(word)
                if len(sentence) == 0:
                    continue
                embedder.embed(sentence)
                token_embedding = strategy(
                    [token.embedding.cpu().numpy() for token in sentence],
                    axis=0)
                f.write(
                    word + " " +
                    " ".join([str(num)
                              for num in token_embedding.tolist()]) + '\n')
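
A hedged invocation sketch for main(); the data directory and its train/dev/test TSV files (recognized by infer_split) are assumptions:

# Hypothetical call: reads the corpora from ./data and writes one <split>.w2v
# file per split into ./data/elmo_embeddings/.
main("./data", embeddings="elmo", strategy="mean")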
Example #3
def use_flair_to_extract_context_embeddings(file, dest_folder, embedding_type, embedding_size, pretrained_model=None):
    if embedding_type.lower() == 'elmo':
        context_embedding = ELMoEmbeddings(model='pubmed')
    elif embedding_type.lower() == 'elmo_transformer':
        context_embedding = ELMoTransformerEmbeddings()
    elif embedding_type.lower() == 'flair':
        context_embedding = PooledFlairEmbeddings('news-forward')
    elif embedding_type.lower() == 'bioflair':
        flair_1 = PooledFlairEmbeddings('pubmed-forward')
        flair_2 = PooledFlairEmbeddings('pubmed-backward')
        elmo = ELMoEmbeddings(model='pubmed')
        #bert = BertEmbeddings(bert_model_or_path='bert-base-multilingual-cased', layers='-1')
        context_embedding = StackedEmbeddings(embeddings=[flair_1, flair_2, elmo])
    elif embedding_type.lower() == 'biobert' or embedding_type.lower() == 'bert':
        context_embedding = BertEmbeddings(bert_model_or_path=pretrained_model, layers='-1')
    else:
        raise ValueError('unknown embedding_type: {}'.format(embedding_type))

    data = {}
    dest_name = os.path.basename(file).split('.')

    print(dest_folder)
    with open(file, 'r') as f, open('{}/{}.pickle'.format(dest_folder, dest_name[0]), 'wb') as d:
        sentence = ''
        instance = []
        j = 0
        for i in f.readlines():
            if i != '\n':
                i = i.split()
                sentence += ' '+i[0]
            elif i == '\n':
                sent = Sentence(sentence.strip())
                context_embedding.embed(sent)
                for token in sent:
                    instance.append((token.text, token.embedding[:embedding_size]))
                sentence = ''

                if instance:
                    data[j] = list(zip(*(instance.copy())))
                    j += 1
                instance.clear()
        pickle.dump(data, d)
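
A hedged usage sketch for the extractor above; the CoNLL-style input file and destination folder are placeholders:

# Hypothetical call: reads a token-per-line file with blank lines between
# sentences and pickles the (tokens, embeddings) pairs into ./embeddings/train.pickle.
use_flair_to_extract_context_embeddings('train.conll', './embeddings',
                                        embedding_type='elmo', embedding_size=1024)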
Example #4
from typing import List

from flair.data import Sentence, Token
from flair.embeddings import ELMoEmbeddings


def elmo_embeddings(sentences, tokenized_contents, output_file=None):
    if output_file:
        f = open(output_file, 'w')
    # init embedding
    # For English biomedical data you can use 'pubmed'
    embedding = ELMoEmbeddings(
        'original')  # English: 4096-hidden, 2 layers, 93.6M parameters
    for i, (sent, sent_tokens) in enumerate(zip(sentences,
                                                tokenized_contents)):
        print("Encoding the {}th input sentence for ELMO embedding!".format(i))
        # Getting the tokens from our own tokenized sentence!
        tokens: List[Token] = [Token(token) for token in sent_tokens]

        if len(tokens) != len(sent_tokens):
            raise ValueError("token length does not match sent_tokens length")

        # Create new empty sentence
        sentence = Sentence()

        # add our own tokens
        sentence.tokens = tokens

        embedding.embed(sentence)

        for token in sentence:

            if output_file:
                f.write(
                    token.text + " " +
                    " ".join([str(num)
                              for num in token.embedding.tolist()]) + '\n')
            else:
                print(token.text + " " +
                      " ".join([str(num)
                                for num in token.embedding.tolist()]) + '\n')
        # blank line separates sentences in the output file
        if output_file:
            f.write('\n')
    if output_file:
        f.close()
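
A minimal sketch of calling elmo_embeddings() with pre-tokenized input; the sentence and file name are made up:

# Hypothetical usage: pass the raw sentences together with their own token lists
# so the ELMo vectors line up with the external tokenization.
sentences = ["Aspirin was administered daily ."]
tokenized_contents = [["Aspirin", "was", "administered", "daily", "."]]
elmo_embeddings(sentences, tokenized_contents, output_file="elmo_pretok.txt")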
Example #5
import numpy as np
import torch
from flair.data import Sentence
from flair.embeddings import ELMoEmbeddings


class Database(object):
    def __init__(self, docs):
        #self.documents_orig = np.loadtxt(docs, delimiter='\n', dtype = str)   # only 9999 documents
        self.documents_orig = []
        with open(docs, 'r') as f:  # getting 10k docs using this
            self.documents_orig = f.readlines()

        self.documents = []
        self.elmo = ELMoEmbeddings()
        #self.embedding = DocumentPoolEmbeddings([self.elmo])
        self.debug = True

    def knn(self, query, query_txt, k):
        #cos_sim = torch.mm(self.documents, query) / (torch.norm(query) * torch.norm(self.documents))

        cos_sim = torch.nn.functional.cosine_similarity(self.documents, query)

        topk, topk_indices = torch.topk(cos_sim, k, 0, True)

        topk_indices = topk_indices.numpy().astype('int')
        topk = topk.numpy().astype('float')
        top_combined = np.vstack((topk, topk_indices)).T

        if self.debug:
            print("\n")
            print("Query: ", query_txt, " | index: ", topk_indices.T)
            [
                print(self.documents_orig[int(i[1])], " --- ", i[0])
                for i in top_combined
            ]

        return list(zip(topk, topk_indices))  #used to return tuples

    def load_documents_into_embedding(self):
        print("Embedding ", len(self.documents_orig), " Documents")
        #self.documents_orig = self.documents_orig[0:50]
        self.documents = [
            self.elmo.embed(Sentence(elem)) for elem in self.documents_orig
        ]

        self.documents = torch.stack([
            torch.cat([token.embedding.unsqueeze(0) for token in elem[0]],
                      dim=0)[0] for elem in self.documents
        ])

        np.save("./documents_embedded.npy", self.documents)

    def run_query(self, query, k=None):
        """Run a query on the given documents based on word embeddings
        
        Arguments:
            query {str} -- Query string.
        
        Keyword Arguments:
            k {int} -- The top documents to return (default: 10)
        
        Returns:
            list[tuple[float, int]] -- Sorted list of tuples, which contain the score and the document id.
                Made up example to show the formatting with k=5:
                        [(0.89316645860672, 1567), 
                        (0.6174346804618835, 125), 
                        (0.5975501537321234, 1181), 
                        (0.5779426293373108, 3979), 
                        (0.5110726475715637, 7155)]
        """
        if k is None:
            k = 10

        sentence = Sentence(query)

        #self.embedding.embed(sentence)

        self.elmo.embed(sentence)

        sentence = [token.embedding.unsqueeze(0) for token in sentence][0]

        #print(sentence)

        # A returned list should look like this for k=5. Btw. the numbers are made up!

        #[
        #            (0.89316645860672, 1567),
        #            (0.6174346804618835, 125),
        #            (0.5975501537321234, 1181),
        #            (0.5779426293373108, 3979),
        #            (0.5110726475715637, 7155),
        #        ]

        return self.knn(sentence, query, k=k)

    def run_query_txt(self, text):
        self.queries = np.loadtxt(text, delimiter='\n', dtype=str)

        results = []

        for query in self.queries:
            out = self.run_query(query)
            results.append(out)

        #saving results

        with open("results.txt", 'w') as file:
            for elem in results:
                out = ""
                for res in elem:
                    out += str(res[0]) + "," + str(res[1]) + ";"
                out = out[:-1]
                out += '\n'
                file.write(out)
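
An end-to-end sketch for the Database class; docs.txt and queries.txt are placeholder file names holding one document or query per line:

# Hypothetical usage: embed all documents once, then answer single queries
# or a whole query file (which writes results.txt).
db = Database("docs.txt")
db.load_documents_into_embedding()
print(db.run_query("word embeddings for document retrieval", k=5))
db.run_query_txt("queries.txt")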
Example #6
)

parser.add_argument(
    '--lm_emb_save_path',
    default='./wv/elmo.emb.pkl',
    action='store',
)

args = parser.parse_args()

embedding = ELMoEmbeddings(args.model_name)

flag = args.dataset
dataset = []
with open(f'./datasets/unified/train.{flag}.json') as f:
    dataset += json.load(f)
with open(f'./datasets/unified/valid.{flag}.json') as f:
    dataset += json.load(f)
with open(f'./datasets/unified/test.{flag}.json') as f:
    dataset += json.load(f)

bert_emb_dict = {}
for item in tqdm(dataset):
    tokens = tuple(item['tokens'])
    s = form_sentence(tokens)
    embedding.embed(s)
    emb = get_embs(s)
    bert_emb_dict[tokens] = emb.astype('float16')

with open(args.lm_emb_save_path, 'wb') as f:
    pickle.dump(bert_emb_dict, f)
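
The helpers form_sentence and get_embs come from elsewhere in this project; a plausible sketch of what they might do, reusing the pre-tokenized Sentence trick from Example #4 (assumptions, not the original implementations):

# Hypothetical helpers: build a flair Sentence from pre-split tokens and stack
# the per-token embedding vectors into a single numpy array.
import numpy as np
from flair.data import Sentence, Token

def form_sentence(tokens):
    s = Sentence()
    s.tokens = [Token(t) for t in tokens]
    return s

def get_embs(sentence):
    return np.stack([token.embedding.cpu().numpy() for token in sentence])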
Example #7

import numpy as np
import pickle
from tqdm import tqdm

from flair.data import Sentence
from flair.embeddings import ELMoEmbeddings


def get_elmo(word2idx):
    """Returns an ELMo embedding of the word tokens.

    Args:
        word2idx (dict): word to index pairs {'word': index}
    Returns:
        embedding_matrix (np.array): embedding matrix of shape (vocabulary_size + 1, embedding_dim=3072)

    """
    # vocabulary_size is assumed to be defined at module level (e.g. len(word2idx))
    embedding_matrix = np.zeros((vocabulary_size + 1, 3072))
    elmo = ELMoEmbeddings()
    for word, index in tqdm(word2idx.items()):
        try:
            word_ = Sentence(word)
            elmo.embed(word_)
            embedding_vector = word_[0].embedding.cpu().detach().numpy()
            embedding_matrix[index] = embedding_vector
        except KeyError:
            embedding_matrix[index] = np.random.normal(0, np.sqrt(0.25), 3072)
    
    return embedding_matrix
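
A hedged sketch of how word2idx and vocabulary_size might be prepared before saving; the token list below is a made-up placeholder:

# Hypothetical setup: indices start at 1 so that row 0 of the
# (vocabulary_size + 1)-row matrix stays free for padding.
tokens = ["the", "patient", "received", "aspirin", "daily"]
word2idx = {w: i + 1 for i, w in enumerate(sorted(set(tokens)))}
vocabulary_size = len(word2idx)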

# save embeddings
embedding_matrix = get_elmo(word2idx)
pickle_out = open("elmo_3072.pickle", "wb")
pickle.dump(embedding_matrix, pickle_out)
pickle_out.close()