# Assumes module-level constants EMBEDDING_DIM (100 for Kazuma char vectors),
# MAX_NB_WORDS, and MAX_SEQUENCE_LENGTH; Embedding is the Keras 2.x layer.
import numpy as np
from embeddings import KazumaCharEmbedding
from keras.layers import Embedding


def get_embedding_layer(tokenizer):
    word_index = tokenizer.word_index
    num_words = len(word_index) + 1  # word_index is 1-indexed
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

    print('about to get kz')
    kz = KazumaCharEmbedding()
    print('got kz')

    for word, i in word_index.items():
        if i >= MAX_NB_WORDS:
            continue
        embedding_vector = kz.emb(word)
        if embedding_vector is not None:
            if sum(embedding_vector) == 0:
                print("failed to find embedding for: " + word)
            # words not found in the embedding index will be all zeros
            embedding_matrix[i] = embedding_vector

    print("Number of words: " + str(num_words))

    # frozen embedding layer initialized with the Kazuma character embeddings
    embedding_layer = Embedding(num_words,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
    return embedding_layer
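# Usage sketch (not from the original source): fit a Keras Tokenizer on a toy
# corpus and hand it to get_embedding_layer(). The corpus strings and the
# Tokenizer import path are assumptions; everything else comes from the
# function above.
from keras.preprocessing.text import Tokenizer

texts = ["minimum metal1 width is 0.065", "metal2 spacing to metal2 is 0.07"]
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)   # integer-encoded sentences
embedding_layer = get_embedding_layer(tokenizer)  # frozen, Kazuma-initialized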
def load_embedding(self):
    glove = GloveEmbedding()        # 300-d common_crawl_840 vectors
    kazuma = KazumaCharEmbedding()  # 100-d character n-gram vectors
    embed = self.context_encoder.embedding.weight.data
    for word, idx in self.vocab.word2idx.items():
        # glove.emb() and kazuma.emb() return Python lists, so `+` concatenates
        # them into one 400-d vector (it does not add them element-wise)
        embed[idx] = torch.tensor(glove.emb(word, default="zero") +
                                  kazuma.emb(word, default="zero"))
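# Context sketch (assumption, not from the original snippet): load_embedding()
# presupposes a vocab exposing word2idx and a context_encoder whose embedding
# table is 400-d, matching the concatenated GloVe(300) + Kazuma(100) vectors.
# The class and attribute names below are illustrative only.
import torch
import torch.nn as nn

class Vocab:
    def __init__(self, words):
        self.word2idx = {w: i for i, w in enumerate(words)}

class ContextEncoder(nn.Module):
    def __init__(self, vocab_size, d_emb=400):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_emb)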
def get_embeddings(self):
    num_words = len(self.word2idx)
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

    print('about to get kz')
    kz = KazumaCharEmbedding()
    print('got kz')

    for word, i in self.word2idx.items():
        if i >= MAX_NB_WORDS:
            continue
        embedding_vector = kz.emb(word)
        if embedding_vector is not None:
            if sum(embedding_vector) == 0:
                print("failed to find embedding for: " + word)
            # words not found in the embedding index will be all zeros
            embedding_matrix[i] = embedding_vector

    self.idx_to_embedding = embedding_matrix
from embeddings import GloveEmbedding, FastTextEmbedding, KazumaCharEmbedding, ConcatEmbedding
import numpy as np
from read_rules import read_rul, read_csv

g = GloveEmbedding('common_crawl_840', d_emb=300, show_progress=True, default='zero')
k = KazumaCharEmbedding()
c = ConcatEmbedding([g, k])

for w in ['metal1', 'metal', 'M', 'EXT', '<', '0.035', 'MSMG2', 'MSMG', 'A']:
    word = np.array(g.emb(w))
    word1 = np.array(k.emb(w))
    if None in word:
        print(w, ":\tbad embedding")
    else:
        print(w, ":\tgood embedding")
    out = np.append(word1, word)
    print(out.shape)

diff1 = np.array(k.emb('metal1')) - np.array(k.emb('METAL1'))
diff2 = np.array(k.emb('metal1')) - np.array(k.emb('layer'))
# print(np.abs(np.mean(diff1)))
# print(np.abs(np.mean(diff2)))

pdk15_csv = read_csv("calibreDRC_15.csv")
pdk45_csv = read_csv("calibreDRC_45.csv")
# prepare embedding matrix
# num_words = min(MAX_NB_WORDS, len(word_index))
num_words = len(word_index) + 1  # word_index is indexed from 1..N
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
kz = KazumaCharEmbedding()
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    embedding_vector = kz.emb(word)
    if embedding_vector is not None:
        if sum(embedding_vector) == 0:
            print("failed to find embedding for: " + word)
        # words not found in the embedding index will be all zeros
        embedding_matrix[i] = embedding_vector
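# Follow-up sketch (assumption, not from the original source): check how many
# vocabulary rows ended up all-zero (row 0 is the unused padding index and is
# always zero) and look up a single word's vector; 'metal1' is a hypothetical
# vocabulary entry.
zero_rows = int(np.sum(~embedding_matrix.any(axis=1)))
print("all-zero rows (incl. padding row 0): %d / %d" % (zero_rows, num_words))
vec = embedding_matrix[word_index['metal1']]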
from embeddings import GloveEmbedding, FastTextEmbedding, KazumaCharEmbedding, ConcatEmbedding

g = GloveEmbedding('common_crawl_840', d_emb=300, show_progress=True)
f = FastTextEmbedding()
k = KazumaCharEmbedding()
c = ConcatEmbedding([g, f, k])

for w in ['canada', 'vancouver', 'toronto']:
    print('embedding {}'.format(w))
    print(g.emb(w))
    print(f.emb(w))
    print(k.emb(w))
    print(c.emb(w))
class RuleEmbedding:
    def __init__(self, embedding_type, inputs):
        if embedding_type == "char":
            k = KazumaCharEmbedding()
            self.wordEmbed = k.emb
            self.sentenceEmbed = self.embed_sentence
            self.size = 100
        elif embedding_type == "glove":
            g = GloveEmbedding('common_crawl_840', d_emb=300, show_progress=True, default='zero')
            self.wordEmbed = g.emb
            self.sentenceEmbed = self.embed_sentence
            self.size = 300
        elif embedding_type == "concat":
            self.k = KazumaCharEmbedding()
            self.g = GloveEmbedding('common_crawl_840', d_emb=300, show_progress=True, default='zero')
            self.wordEmbed = self.concatEmbed
            self.sentenceEmbed = self.embed_sentence
            self.size = 400
        # elif embedding_type == "bert":
        #     try:
        #         bertEmbed = SentenceTransformer('./src/bert-base-nli-mean-tokens')
        #     except OSError as e:
        #         print(e)
        #         print("Could not find model in current directory: %s" % os.getcwd())
        #         exit(1)
        #     self.sentenceEmbed = bertEmbed.encode
        #     self.size = 768
        # elif embedding_type == "bert-stsb":
        #     try:
        #         bertEmbed = SentenceTransformer('./src/bert-base-nli-stsb-mean-tokens')
        #     except OSError as e:
        #         print(e)
        #         print("Could not find model in current directory: %s" % os.getcwd())
        #         exit(1)
        #     self.sentenceEmbed = bertEmbed.encode
        #     self.size = 768
        # elif embedding_type == "universal":
        #     try:
        #         univEmbed = hub.load("./src/universal-sentence-encoder_4")
        #     except OSError as e:
        #         print(e)
        #         print("Could not find model in current directory: %s" % os.getcwd())
        #         exit(1)
        #     self.sentenceEmbed = univEmbed
        #     self.size = 512
        else:
            print("Error: Embedding type \"%s\" not recognized" % embedding_type)
            print("Supported types: \"char\", \"glove\", \"concat\"")
            exit(1)

        self.type = embedding_type
        self.pdk = inputs['pdk']
        self.features = inputs['features']
        self.weights = inputs['weights']
        self.word_counts = inputs['word_counts']
        self.a = inputs['a']
        self.number_replacement = inputs['number_replacement']
        self.remove_pc = inputs['remove_pc']
        self.weigh_capitals = inputs['weigh_capitals']

    ###########################################################################
    # concatEmbed():
    # Concatenates char and glove embeddings.
    ###########################################################################
    def concatEmbed(self, word):
        one = np.array(self.k.emb(word))
        two = np.array(self.g.emb(word))
        return np.append(one, two)

    ###########################################################################
    # embed_sentence():
    # Returns a list of embeddings for the provided sentences.
    # If self.word_counts is not None, computes a weighted average of the word
    # embeddings, following the SIF weighting of Arora et al.:
    # https://github.com/PrincetonML/SIF
    ###########################################################################
    def embed_sentence(self, text):
        embeddings = []
        N = len(text)
        for i in range(N):
            sentence = text[i]
            words = sentence.split(' ')
            num_words = len(words)
            total = np.zeros(self.size)
            for j in range(num_words):
                w = words[j].strip()
                # replace bare numbers with the placeholder token
                if self.number_replacement and w.replace('.', '', 1).isdigit():
                    w = self.number_replacement
                embed = np.array(self.wordEmbed(w))
                # add weight to words that are all caps
                if self.weigh_capitals and w.isalpha() and w.isupper():
                    embed = self.weigh_capitals * embed
                # weigh words by the inverse of their corpus probability (SIF)
                if self.word_counts and w in self.word_counts.keys():
                    prob = self.word_counts[w] / self.word_counts['total-words']
                    weight = self.a / (self.a + prob)
                    embed = weight * embed
                total += embed
            result = total / num_words
            embeddings.append(result)
        return embeddings

    ###########################################################################
    # embed_key():
    # Returns a matrix of sentence embeddings for the designated rule feature.
    # This can be "rule", "description", layer, name, etc.
    # Embedding type is set by self.type.
    ###########################################################################
    def embed_key(self, key):
        pdk = self.pdk
        N = len(pdk)
        sentences = []
        for i in range(N):
            # in case we embed a feature like name, which is not a list
            if isinstance(pdk[i][key], list):
                s = ' '.join(pdk[i][key])
            else:
                s = pdk[i][key]
            sentences.append(s)
        result = np.array(self.sentenceEmbed(sentences))
        return result

    ###########################################################################
    # embed_all():
    # Compute rule embeddings using a weighted sum of the features.
    # Weights are stored in self.weights and features are stored in self.features.
    # Removes the first principal component if self.remove_pc is True.
    ###########################################################################
    def embed_all(self):
        num_features = len(self.features)
        N = len(self.pdk)
        partial_embed = np.zeros((num_features, N, self.size))

        for i in range(num_features):
            result = self.embed_key(self.features[i])
            # remove first principal component
            if self.remove_pc:
                emb = remove_pc(result, 1)
                partial_embed[i] = emb
            else:
                partial_embed[i] = result

        # compute the weighted sum of feature embeddings (f[0]*w[0] + f[1]*w[1] + ...)
        output = np.tensordot(partial_embed, self.weights, axes=(0, 0))
        return output
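# Sketch of the helper assumed by embed_all() but not shown in the snippet:
# remove_pc(X, npc) subtracts the projection of X onto its first npc principal
# components, the SIF post-processing of Arora et al.
# (https://github.com/PrincetonML/SIF). This TruncatedSVD-based version is an
# assumption about the missing code, not the original implementation.
import numpy as np
from sklearn.decomposition import TruncatedSVD

def remove_pc(X, npc=1):
    svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
    svd.fit(X)
    pc = svd.components_  # shape (npc, dim)
    if npc == 1:
        return X - X.dot(pc.transpose()) * pc
    return X - X.dot(pc.transpose()).dot(pc)

# e.g. inside embed_all(): partial_embed[i] = remove_pc(result, 1)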