class Word2vecUtils():

    def __init__(self):
        super(Word2vecUtils, self).__init__()
        self.word_embed = GloveEmbedding('common_crawl_48', d_emb=300)
        self.initializer = lambda: np.random.normal(size=300).tolist()

    def load_embeddings(self, module, vocab, device='cpu'):
        """ Initialize the embedding with glove and char embedding """
        emb_size = module.weight.data.size(-1)
        assert emb_size == 300, 'Embedding size is not 300, cannot be initialized by GLOVE'
        outliers = 0
        for word in vocab.word2id:
            if word == PAD:  # PAD symbol is always 0-vector
                module.weight.data[vocab[PAD]] = torch.zeros(emb_size, dtype=torch.float, device=device)
                continue
            word_emb = self.word_embed.emb(word, default='none')
            if word_emb[0] is None:  # oov
                word_emb = self.initializer()
                outliers += 1
            module.weight.data[vocab[word]] = torch.tensor(word_emb, dtype=torch.float, device=device)
        return 1 - outliers / float(len(vocab))

    def emb(self, word):
        word_emb = self.word_embed.emb(word, default='none')
        if word_emb[0] is None:
            return None
        else:
            return word_emb
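# A minimal usage sketch for the class above. The vocabulary object and the
# PAD value are hypothetical stand-ins: load_embeddings() only assumes that
# `vocab` exposes a `word2id` dict, index lookup via `vocab[word]`, and
# `len(vocab)`, and that PAD is the padding symbol defined alongside the class.
import numpy as np
import torch
import torch.nn as nn

PAD = '<pad>'  # assumed value of the PAD constant used by the class

class ToyVocab:
    def __init__(self, words):
        self.word2id = {w: i for i, w in enumerate(words)}

    def __getitem__(self, word):
        return self.word2id[word]

    def __len__(self):
        return len(self.word2id)

vocab = ToyVocab([PAD, 'the', 'database', 'qzxv_unlikely_token'])
embedding_layer = nn.Embedding(len(vocab), 300)

utils = Word2vecUtils()
coverage = utils.load_embeddings(embedding_layer, vocab, device='cpu')
print('GloVe coverage: {:.2%}'.format(coverage))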
def load_embedding(self):
    glove = GloveEmbedding()
    kazuma = KazumaCharEmbedding()
    embed = self.context_encoder.embedding.weight.data
    for word, idx in self.vocab.word2idx.items():
        embed[idx] = torch.tensor(
            glove.emb(word, default="zero") + kazuma.emb(word, default="zero"))
def init_word_embeddings(embed_file_name, word_set, edim):
    embeddings = {}
    tokens = embed_file_name.split('-')
    embedding = None
    if tokens[0] == 'glove':
        embedding = GloveEmbedding(tokens[1], d_emb=edim, show_progress=True)
    if embedding:
        for word in word_set:
            emb = embedding.emb(word)
            # With the library's default 'none' setting, OOV words come back as
            # a list of None entries rather than None itself, so check the
            # first entry as well.
            if emb is not None and emb[0] is not None:
                embeddings[word] = emb
    return embeddings
def cluster_partioning_glove(sorted_sent_ids, id_sentence_map, num_clusters):
    d_emb = 300
    embeddings = GloveEmbedding('common_crawl_840', d_emb=d_emb, show_progress=True)

    sents = []
    for sent_id in sorted_sent_ids:
        sents.append([t[0] for t in id_sentence_map[sent_id]])

    sent_vecs = sents_to_embeddings(embeddings, d_emb, sents)

    n_vecs = 18
    base_vecs = random_unit_vecs(d_emb, n_vecs)

    sims = cosine_similarity(sent_vecs, base_vecs)
    sims = sims >= 0

    partition_ids = {}
    sent_partitions = {}
    curr_partition_id = 0
    for idx, sim_vec in enumerate(sims):
        part_id = partition_ids.get(tuple(sim_vec))
        if part_id is None:
            part_id = curr_partition_id
            curr_partition_id += 1
            partition_ids[tuple(sim_vec)] = part_id
        sent_partitions[idx] = part_id

    return sent_partitions
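# random_unit_vecs() is referenced above (and in later snippets) but not shown.
# A minimal sketch, assuming it simply draws `n_vecs` random directions in
# R^{d_emb} (Gaussian samples normalised to unit length), which is the usual
# way to obtain random hyperplanes for this kind of sign-based partitioning:
import numpy as np

def random_unit_vecs(d_emb, n_vecs, seed=None):
    rng = np.random.default_rng(seed)
    vecs = rng.standard_normal((n_vecs, d_emb))
    # Normalise each row so the sign of the dot product with a sentence vector
    # depends only on direction, not magnitude.
    return vecs / np.linalg.norm(vecs, axis=1, keepdims=True)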
def __init__(self, id_sentence_map, threshold=0.5):
    self.id_sentence_map = id_sentence_map
    sorted_sent_ids = sorted(id_sentence_map.keys())

    d_emb = 300
    embeddings = GloveEmbedding('common_crawl_840', d_emb=d_emb, show_progress=True)

    model = TfidfVectorizer(stop_words="english", max_features=5000, min_df=2)
    tf_idf = model.fit_transform([
        " ".join([t[0] for t in id_sentence_map[sid]]) for sid in sorted_sent_ids
    ])

    #sent_vecs = fast_sents_to_embeddings(embeddings, d_emb, )

    self.tf_idf = tf_idf
    #self.id_vec_map = dict((sorted_sent_ids[idx], vec) for idx, vec in enumerate(sent_vecs))
    self.threshold = threshold
def from_sentences(cls, rewards, doc_sents, id_sentence_map, normalize=False):
    #model = TfidfVectorizer(stop_words="english")
    start_time = time.time()
    sorted_sent_ids = sorted(id_sentence_map.keys())

    d_emb = 300
    embeddings = GloveEmbedding('common_crawl_840', d_emb=d_emb, show_progress=True)
    sent_vecs = fast_sents_to_embeddings(
        embeddings, d_emb,
        [s.as_token_attr_sequence("form_lowercase") for s in doc_sents])

    n_vecs = 16
    base_vecs = random_unit_vecs(d_emb, n_vecs)

    doc_sims = cosine_similarity(sent_vecs, base_vecs)
    doc_hashes = doc_sims >= 0

    logger.debug("Computed doc sentences hashes (time: {}s)".format(time.time() - start_time))
    start_time = time.time()

    buckets = Counter(map(tuple, doc_hashes))

    candidate_vecs = fast_sents_to_embeddings(
        embeddings, d_emb,
        [[t[0] for t in id_sentence_map[sid]] for sid in sorted_sent_ids])
    cand_sims = cosine_similarity(candidate_vecs, base_vecs)
    cand_hashes = [tuple(h) for h in cand_sims >= 0]

    logger.debug("Computed candidate hashes (time: {}s)".format(time.time() - start_time))
    start_time = time.time()

    precomputed_hash_sims = fast_precompute_hash_sims(cand_hashes, buckets)

    overlaps = {}
    per_cand_hashes = {}
    for sent_id, hash_ in zip(sorted_sent_ids, cand_hashes):
        #overlaps[sent_id] = precomputed_hash_sims[hash_]
        per_cand_hashes[sent_id] = hash_

    return BucketedRedundancyFactor(rewards, buckets, per_cand_hashes,
                                    precomputed_hash_sims, normalize=normalize)
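# fast_sents_to_embeddings() / sents_to_embeddings() are used throughout these
# clustering snippets but never shown. A minimal sketch, assuming they average
# the GloVe vectors of each sentence's lowercased tokens (the same pattern the
# k-means variant below spells out token by token), with OOV tokens mapped to
# zero vectors:
import numpy as np

def fast_sents_to_embeddings(embeddings, d_emb, sents):
    """sents: list of token lists; returns an array of shape (len(sents), d_emb)."""
    vecs = np.zeros((len(sents), d_emb))
    for idx, tokens in enumerate(sents):
        for token in tokens:
            vecs[idx] += np.array(embeddings.emb(token.lower(), 'zero'))
        if tokens:
            vecs[idx] /= len(tokens)
    return vecs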
def process_raw_dataset(
        self,
        models_path="models",
        train_path: Text = None,
        dev_path: Text = None,
        test_path: Text = None,
):
    """Annotate, numericalize and dump the raw dataset splits.

    Args:
        train_path: path to the raw train split JSON file.
        dev_path: path to the raw dev split JSON file.
        test_path: path to the raw test split JSON file.
        models_path: directory where the ontology, vocabulary and
            embedding files are written.
    """
    if not os.path.isdir(models_path):
        os.makedirs(models_path)

    splits_path = {}
    if train_path:
        splits_path.update({'train': train_path})
    if dev_path:
        splits_path.update({'dev': dev_path})
    if test_path:
        splits_path.update({'test': test_path})

    for name, path in splits_path.items():
        self.dataset[name] = Dataset.annotate_raw(path)
        self.dataset[name].numericalize_(self.vocab)
        self.ontology += self.dataset[name].extract_ontology()
        ann_path = path[:-5] + "_ann.json"
        with open(ann_path, 'wt') as f:
            json.dump(self.dataset[name].to_dict(), f, indent=4)

    self.ontology.numericalize_(self.vocab)
    with open(os.path.join(models_path, 'ontology.json'), 'wt') as f:
        json.dump(self.ontology.to_dict(), f, indent=4)
    with open(os.path.join(models_path, 'vocab.json'), 'wt') as f:
        json.dump(self.vocab.to_dict(), f, indent=4)

    # Generate embedding file
    embeddings = [GloveEmbedding(), KazumaCharEmbedding()]
    E = []
    for w in tqdm(self.vocab._index2word):
        e = []
        for emb in embeddings:
            e += emb.emb(w, default='zero')
        E.append(e)
    self.embeddings = E
    with open(os.path.join(models_path, 'emb.json'), 'wt') as f:
        json.dump(E, f)
def get_pretrained_embeddings(dataset, words, slots, intents):
    vocab = set(words + slots + intents)
    for symbol in [BOS, EOS, UNK, EQUAL]:
        vocab.add(symbol)

    # GK Embedding: 300-dim GloVe word vectors + 100-dim Kazuma char n-gram vectors
    word_embed, char_embed = GloveEmbedding(default='zero'), KazumaCharEmbedding()
    embed_size = word_embed.d_emb + char_embed.d_emb

    progress = 0
    with open(EMBEDDING(dataset), 'w') as out_file:
        for word in vocab:
            progress += 1
            vector = word_embed.emb(word) + char_embed.emb(word)
            string = ' '.join([str(v) for v in vector])
            out_file.write(word + ' ' + string + '\n')
            if progress % 1000 == 0:
                print("Retrieved the 400-dim GK embedding for the %d-th word ..." % progress)
    print('In total, processed %d words in %s' % (len(vocab), dataset))
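# The file written above stores one vocabulary entry per line: the word,
# followed by its 400 float components separated by spaces. A minimal sketch
# of a matching loader (a hypothetical helper, not part of the original
# snippet), assuming vocabulary entries contain no spaces, as the write
# format above requires:
import numpy as np

def load_gk_embeddings(path):
    word2vec = {}
    with open(path) as in_file:
        for line in in_file:
            parts = line.rstrip('\n').split(' ')
            word, values = parts[0], parts[1:]
            word2vec[word] = np.array([float(v) for v in values], dtype=np.float32)
    return word2vec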
def cluster_kmeans_glove(sorted_sent_ids, id_sentence_map, num_clusters):
    d_emb = 300
    embeddings = GloveEmbedding('common_crawl_840', d_emb=d_emb, show_progress=True)

    vecs = np.zeros(shape=(len(sorted_sent_ids), 300))
    for idx, sent_id in enumerate(sorted_sent_ids):
        for token, _ in id_sentence_map[sent_id]:
            vecs[idx] += np.array(embeddings.emb(token.lower(), "zero"))
        vecs[idx] /= len(id_sentence_map[sent_id])

    if num_clusters is None:
        num_clusters = max(len(id_sentence_map) // 25, 2)

    #clusterer = AgglomerativeClustering(n_clusters=num_clusters)
    #clustering = clusterer.fit_predict(vecs)
    clustering = KMeans(n_clusters=num_clusters).fit_predict(vecs)

    return clustering
def dump_pretrained_emb(word2index, index2word, dump_path):
    print("Dumping pretrained embeddings...")
    embeddings = [GloveEmbedding(), KazumaCharEmbedding()]
    E = []
    for i in tqdm(range(len(word2index.keys()))):
        w = index2word[i]
        e = []
        for emb in embeddings:
            e += emb.emb(w, default='zero')
        E.append(e)
    with open(dump_path, 'wt') as f:
        json.dump(E, f)
def dump_pretrained_emb(word2index, index2word, dump_path):
    print("Dumping pretrained embeddings...")
    os.environ["HOME"] = "D:/ANAHOME"  # add HOME directory temporarily
    embeddings = [GloveEmbedding(), KazumaCharEmbedding()]
    E = []
    for i in tqdm(range(len(word2index.keys()))):
        w = index2word[i]
        e = []
        for emb in embeddings:
            e += emb.emb(w, default='zero')
        E.append(e)
    with open(dump_path, 'wt') as f:
        json.dump(E, f)
def dump_pretrained_emb(word2index, index2word, dump_path):
    print("Dumping pretrained embeddings...")
    # import ssl
    # ssl._create_default_https_context = ssl._create_unverified_context
    embeddings = [GloveEmbedding(), KazumaCharEmbedding()]
    E = []
    for i in range(len(word2index.keys())):
        w = index2word[i]
        e = []
        for emb in embeddings:
            e += emb.emb(w, default='zero')
        E.append(e)
    with open(dump_path, 'wt') as f:
        json.dump(E, f)
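# A minimal sketch of how the JSON file produced by the dump functions above
# could be loaded back and used to initialise a PyTorch embedding layer. Row
# order matches index2word, and each row is the 400-dim concatenation of the
# GloVe (300) and Kazuma char (100) vectors. 'emb.json' here stands in for
# whatever dump_path was passed above.
import json
import torch
import torch.nn as nn

with open('emb.json') as f:
    E = json.load(f)

weights = torch.tensor(E, dtype=torch.float)            # (vocab_size, 400)
embedding_layer = nn.Embedding.from_pretrained(weights, freeze=True)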
def gen_slot_embed_for_each_dom_from_glove(dom2slots, slot2desc, save_file):
    ## 1. generate slot2embs
    slots = list(sorted(slot2desc.keys()))
    desps = [slot2desc[k] for k in slots]

    word2emb = {}
    # collect words
    for des in desps:
        splits = des.split()
        for word in splits:
            if word not in word2emb:
                word2emb[word] = []

    # load embeddings
    glove_emb = GloveEmbedding()

    # calculate slot embs
    slot2embs = {}
    for i, slot in enumerate(slots):
        word_list = slot2desc[slot].split()
        embs = np.zeros(300)
        for word in word_list:
            embs = embs + glove_emb.emb(word, default='zero')
        slot2embs[slot] = embs

    ## 2. generate slot2embs based on each domain
    slot_embs_based_on_each_domain = {}
    for domain, slot_names in dom2slots.items():
        slot_embs = np.zeros((len(slot_names), 300))
        for i, slot in enumerate(slot_names):
            embs = slot2embs[slot]
            slot_embs[i] = embs
        slot_embs_based_on_each_domain[domain] = slot_embs

    with open(save_file, "wb") as f:
        pickle.dump(slot_embs_based_on_each_domain, f)

    return slot2embs
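# A minimal sketch of reading back the pickle written above: it maps each
# domain name to a (num_slots, 300) numpy array whose rows follow the order of
# dom2slots[domain]. 'slot_embs.pkl' stands in for whatever save_file was.
import pickle

with open('slot_embs.pkl', 'rb') as f:
    slot_embs_based_on_each_domain = pickle.load(f)

for domain, embs in slot_embs_based_on_each_domain.items():
    print(domain, embs.shape)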
def dump_pretrained_emb_new(tokenizer, dump_path):
    print("Dumping pretrained embeddings...")
    embeddings = [GloveEmbedding(), KazumaCharEmbedding()]
    E = []
    word_list = []
    for w, i in sorted(tokenizer.vocab.items(), key=lambda i: i[1]):
        word_list.append(w)
    for i in tqdm(range(len(word_list))):
        w = word_list[i]
        e = []
        for emb in embeddings:
            e += emb.emb(w, default='zero')
        E.append(e)
    with open(dump_path, 'wt') as f:
        json.dump(E, f)
def __init__(self, device, use_glove=True, use_elmo=True):
    glove_size = 300 if use_glove else 0
    elmo_size = 1024 if use_elmo else 0
    super(WordEmbeddingModel, self).__init__(glove_size + elmo_size)

    if not use_glove and not use_elmo:
        raise ValueError("Should use at least one form of embedding.")

    if use_elmo:
        self._elmo = ElmoEmbedding(device=device)
    if use_glove:
        self._glove = GloveEmbedding(GLOVE_TRAIN_FILE, device=device)
    # if use_bert:
    #     self._bert = BertEmbedding(model_type='bert-large-cased', device=device)

    self._use_elmo = use_elmo
    # self._use_bert = use_bert
    self._use_glove = use_glove
def cluster_db_scan(sorted_sent_ids, id_sentence_map, num_clusters, cluster_id_map):
    clusters = cluster_id_map
    d_emb = 300
    embeddings = GloveEmbedding('common_crawl_840', d_emb=d_emb, show_progress=True)
    X = fast_sents_to_embeddings(
        embeddings, 300,
        [[t[0] for t in id_sentence_map[sid]] for sid in sorted_sent_ids])

    avg_distances = []
    for cl, ids in clusters.items():
        X_cl = X[ids, :]
        dists = np.abs(euclidean_distances(X_cl))
        avg_distances.append(
            np.sum(dists) / max(1, X_cl.shape[0]**2 - X_cl.shape[0]))

    avg_intra_cl_dist = sum(avg_distances) / len(avg_distances)

    return DBSCAN(avg_intra_cl_dist, n_jobs=-1).fit_predict(X)
def cluster_clustering_kmeans(sorted_sent_ids, id_sentence_map, num_clusters, cluster_id_map):
    clusters = cluster_id_map
    d_emb = 300
    embeddings = GloveEmbedding('common_crawl_840', d_emb=d_emb, show_progress=True)
    X = fast_sents_to_embeddings(
        embeddings, 300,
        [[t[0] for t in id_sentence_map[sid]] for sid in sorted_sent_ids])

    avg_distances = []
    X_cls = []
    for cl, ids in sorted(clusters.items()):
        X_cl = X[ids, :]
        dists = np.abs(euclidean_distances(X_cl))
        avg_distances.append(
            np.sum(dists) / max(1, X_cl.shape[0]**2 - X_cl.shape[0]))
        X_cls.append(X_cl.sum(axis=0))

    X_cls = np.stack(X_cls)
    avg_intra_cl_dist = sum(avg_distances) / len(avg_distances)

    cluster_clusters = KMeans(len(cluster_id_map) // 5).fit_predict(X_cls)

    clusters = [0 for _ in range(len(sorted_sent_ids))]
    for cl_id, cl_cluster_id in zip(sorted(cluster_id_map), cluster_clusters):
        for sent_id in cluster_id_map[cl_id]:
            clusters[sent_id] = cl_cluster_id

    return clusters
def dump_pretrained_emb(word2index, index2word, dump_path, mode='en'):
    print("Dumping pretrained embeddings...")
    if mode == 'cn':
        embeddings = [CNEmbedding()]
    else:
        # embeddings = [GloveEmbedding(), KazumaCharEmbedding()]
        embeddings = [GloveEmbedding()]
    E = []
    count = [0., 0.]
    for i in tqdm(range(len(word2index.keys()))):
        w = index2word[i]
        e = []
        for emb in embeddings:
            e += emb.emb(w, default='zero')
        # stat embed existence
        count[1] += 1.
        if w in embeddings[0].word2vec:
            count[0] += 1.
            # e += [0.] * 300
        E.append(e)
    with open(dump_path, 'wt') as f:
        json.dump(E, f)
    print(f'word exists in embedding mat: {count[0]/count[1]*100}')
from embeddings import GloveEmbedding, FastTextEmbedding, KazumaCharEmbedding, ConcatEmbedding
import numpy as np
from read_rules import read_rul, read_csv

g = GloveEmbedding('common_crawl_840', d_emb=300, show_progress=True, default='zero')
k = KazumaCharEmbedding()
c = ConcatEmbedding([g, k])

for w in ['metal1', 'metal', 'M', 'EXT', '<', '0.035', 'MSMG2', 'MSMG', 'A']:
    word = np.array(g.emb(w))
    word1 = np.array(k.emb(w))
    # With default='zero', OOV words come back as all-zero vectors rather than
    # containing None, so test for a zero vector to spot missing embeddings.
    if not word.any():
        print(w, ":\tbad embedding")
    else:
        print(w, ":\tgood embedding")
    out = np.append(word1, word)
    print(out.shape)

diff1 = np.array(k.emb('metal1')) - np.array(k.emb('METAL1'))
diff2 = np.array(k.emb('metal1')) - np.array(k.emb('layer'))
# print(np.abs(np.mean(diff1)))
# print(np.abs(np.mean(diff2)))

pdk15_csv = read_csv("calibreDRC_15.csv")
pdk45_csv = read_csv("calibreDRC_45.csv")
class RuleEmbedding:

    def __init__(self, embedding_type, inputs):
        if embedding_type == "char":
            k = KazumaCharEmbedding()
            self.wordEmbed = k.emb
            self.sentenceEmbed = self.embed_sentence
            self.size = 100
        elif embedding_type == "glove":
            g = GloveEmbedding('common_crawl_840', d_emb=300, show_progress=True, default='zero')
            self.wordEmbed = g.emb
            self.sentenceEmbed = self.embed_sentence
            self.size = 300
        elif embedding_type == "concat":
            self.k = KazumaCharEmbedding()
            self.g = GloveEmbedding('common_crawl_840', d_emb=300, show_progress=True, default='zero')
            self.wordEmbed = self.concatEmbed
            self.sentenceEmbed = self.embed_sentence
            self.size = 400
        # elif embedding_type == "bert":
        #     try:
        #         bertEmbed = SentenceTransformer('./src/bert-base-nli-mean-tokens')
        #     except OSError as e:
        #         print(e)
        #         print("Could not find model in current directory: %s" % os.getcwd())
        #         exit(1)
        #     self.sentenceEmbed = bertEmbed.encode
        #     self.size = 768
        # elif embedding_type == "bert-stsb":
        #     try:
        #         bertEmbed = SentenceTransformer('./src/bert-base-nli-stsb-mean-tokens')
        #     except OSError as e:
        #         print(e)
        #         print("Could not find model in current directory: %s" % os.getcwd())
        #         exit(1)
        #     self.sentenceEmbed = bertEmbed.encode
        #     self.size = 768
        # elif embedding_type == "universal":
        #     try:
        #         univEmbed = hub.load("./src/universal-sentence-encoder_4")
        #     except OSError as e:
        #         print(e)
        #         print("Could not find model in current directory: %s" % os.getcwd())
        #         exit(1)
        #     self.sentenceEmbed = univEmbed
        #     self.size = 512
        else:
            print("Error: Embedding type \"%s\" not recognized" % embedding_type)
            print("Supported types: \"char\", \"glove\", \"concat\"")
            exit(1)

        self.type = embedding_type
        self.pdk = inputs['pdk']
        self.features = inputs['features']
        self.weights = inputs['weights']
        self.word_counts = inputs['word_counts']
        self.a = inputs['a']
        self.number_replacement = inputs['number_replacement']
        self.remove_pc = inputs['remove_pc']
        self.weigh_capitals = inputs['weigh_capitals']

    ###########################################################################
    # Concatenates char and glove embeddings
    ###########################################################################
    def concatEmbed(self, word):
        one = np.array(self.k.emb(word))
        two = np.array(self.g.emb(word))
        return np.append(one, two)

    ###########################################################################
    # embed_sentence():
    # Returns list of embeddings for the provided sentences.
    # If self.word_counts != None, computes a weighted average of the word
    # embeddings, based on the paper by Arora et al.:
    # https://github.com/PrincetonML/SIF
    ###########################################################################
    def embed_sentence(self, text):
        embeddings = []
        N = len(text)
        for i in range(N):
            sentence = text[i]
            words = sentence.split(' ')
            num_words = len(words)
            total = np.zeros(self.size)
            for j in range(num_words):
                w = words[j].strip()
                # remove numbers
                if self.number_replacement and w.replace('.', '', 1).isdigit():
                    w = self.number_replacement
                embed = np.array(self.wordEmbed(w))
                # add weight to words that are all caps
                if self.weigh_capitals and w.isalpha() and w.isupper():
                    embed = self.weigh_capitals * embed
                # weigh words based on inverse of probability
                if self.word_counts and w in self.word_counts.keys():
                    prob = self.word_counts[w] / self.word_counts['total-words']
                    weight = self.a / (self.a + prob)
                    embed = weight * embed
                total += embed
            result = total / num_words
            embeddings.append(result)
        return embeddings

    ###########################################################################
    # embed_key():
    # Returns a matrix of sentence embeddings for the designated rule feature.
    # This can be "rule", "description", layer, name, etc.
    # Embedding type is set by self.type.
    ###########################################################################
    def embed_key(self, key):
        pdk = self.pdk
        N = len(pdk)
        sentences = []
        for i in range(N):
            # in case we embed a feature like name, which is not a list
            if isinstance(pdk[i][key], list):
                s = ' '.join(pdk[i][key])
            else:
                s = pdk[i][key]
            sentences.append(s)
        result = np.array(self.sentenceEmbed(sentences))
        return result

    ###########################################################################
    # embed_all():
    # Compute rule embeddings using a weighted sum of the features.
    # Weights are stored in self.weights and features are stored in self.features.
    # Remove the first principal component if self.remove_pc == True.
    ###########################################################################
    def embed_all(self):
        num_features = len(self.features)
        N = len(self.pdk)
        partial_embed = np.zeros((num_features, N, self.size))
        for i in range(num_features):
            result = self.embed_key(self.features[i])
            # remove first principal component
            if self.remove_pc:
                emb = remove_pc(result, 1)
                partial_embed[i] = emb
            else:
                partial_embed[i] = result
        # compute weighted sum of embeddings (f[1]*w[1] + f[2]*w[2])
        output = np.tensordot(partial_embed, self.weights, axes=(0, 0))
        return output
if not os.path.isdir(dann):
    os.makedirs(dann)

dataset = {}
ontology = Ontology()
vocab = Vocab()
vocab.word2index(['<sos>', '<eos>'], train=True)

for s in splits:
    fname = '{}.json'.format(s)
    logging.warn('Annotating {}'.format(s))
    dataset[s] = Dataset.annotate_raw(os.path.join(draw, fname))
    dataset[s].numericalize_(vocab)
    ontology = ontology + dataset[s].extract_ontology()
    with open(os.path.join(dann, fname), 'wt') as f:
        json.dump(dataset[s].to_dict(), f)

ontology.numericalize_(vocab)
with open(os.path.join(dann, 'ontology.json'), 'wt') as f:
    json.dump(ontology.to_dict(), f)
with open(os.path.join(dann, 'vocab.json'), 'wt') as f:
    json.dump(vocab.to_dict(), f)

logging.warn('Computing word embeddings')
embeddings = [GloveEmbedding(), KazumaCharEmbedding()]
E = []
for w in tqdm(vocab._index2word):
    e = []
    for emb in embeddings:
        e += emb.emb(w, default='zero')
    E.append(e)
with open(os.path.join(dann, 'emb.json'), 'wt') as f:
    json.dump(E, f)
from embeddings import GloveEmbedding, FastTextEmbedding, KazumaCharEmbedding, ConcatEmbedding

g = GloveEmbedding('common_crawl_840', d_emb=300, show_progress=True)
f = FastTextEmbedding()
k = KazumaCharEmbedding()
c = ConcatEmbedding([g, f, k])

for w in ['canada', 'vancouver', 'toronto']:
    print('embedding {}'.format(w))
    print(g.emb(w))
    print(f.emb(w))
    print(k.emb(w))
    print(c.emb(w))
import json
import sys
sys.path.append('..')
import mgnn.config_train as args
import paths
import re
import csv
import pickle
import numpy as np
from nltk.tokenize import word_tokenize
from pretreatment.DataExtract import EntityLinking, GetPredicateList, Entity_Link_Falcon
from pretreatment.QueryFilter import *
from torchnlp.word_to_vector import FastText, GloVe

fasttext = FastText()

from embeddings import GloveEmbedding
g = GloveEmbedding('common_crawl_840', d_emb=300)

import math


def get_ngram(text, n):
    word_list = text
    res = []
    for i in range(len(word_list)):
        if i + n > len(word_list):
            break
        res.append(word_list[i:i + n])
    return res


def get_ngram_embedding(text, n):
    embeddings = []
    for i in range(len(text)):
from embeddings import GloveEmbedding

embeddings_name = 'common_crawl_840'
embeddings_dimension = 300

glove_embeddings = GloveEmbedding(name=embeddings_name, d_emb=embeddings_dimension, show_progress=True)
from embeddings import GloveEmbedding
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score
from collections import OrderedDict
import numpy as np

seed = 7
np.random.seed(seed)

ID_TEXT_DELIMITER = " <:sep:> "

""" 0. initialize twitter word embeddings """
glove = GloveEmbedding(name="twitter", d_emb=50, show_progress=True)

""" 1. Index tweets and their labels for error analysis """
labels = OrderedDict()
tweets = OrderedDict()
fp = open("../../../../resources/n-tweets-id_tokens.txt", 'r')
idx = 0
for sample in fp.readlines():
    label, tweet = sample.split(ID_TEXT_DELIMITER)
    if label.strip() == 'YES':
        labels[idx] = 1
    elif label.strip() == 'NO':
        labels[idx] = 0
    tokens_as_string = ''
    for tok in tweet.strip().replace("[", "").replace(']',
                       fields=[('src', srcF), ('tgt', tgtF), ('tgt_be', tgt_beF),
                               ('dis', disF), ('label', labelF)])
dev = TabularDataset(path=args.dev_data,
                     format='tsv',
                     fields=[('src', srcF), ('tgt', tgtF), ('tgt_be', tgt_beF),
                             ('dis', disF), ('label', labelF)])

tgt_beF.build_vocab(all_data, min_freq=1)
disF.build_vocab(all_data, min_freq=1)
srcF.build_vocab(all_data, min_freq=1)
vocab = srcF.vocab
tgtF.vocab = vocab
args.vocab_size = len(vocab)

g = GloveEmbedding('common_crawl_840', d_emb=300)
embedding = []
for i in range(len(vocab)):
    # With the library's default 'none' setting, OOV words come back as a list
    # of None entries, so check the first entry explicitly instead of relying
    # on truthiness (which would also misclassify a legitimate 0.0 component).
    if g.emb(vocab.itos[i])[0] is None:
        embedding.append(np.random.uniform(-0.25, 0.25, size=(1, 300))[0])
    else:
        embedding.append(np.array(g.emb(vocab.itos[i])))
embedding = np.array(embedding, dtype=np.float32)

args.pre_embedding = True
args.embedding = embedding
args.update_embedding = False

print('build batch iterator...')
train_batch_iterator = BucketIterator(dataset=train,
                                      batch_size=args.batch_size,
                                      sort=False,
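# The snippet above stores the GloVe matrix on args and disables updates; a
# minimal sketch of how a model might consume it, assuming the downstream code
# initialises a torch.nn.Embedding from args.embedding and freezes it when
# args.update_embedding is False (the actual model code is not shown here):
import torch
import torch.nn as nn

embedding_layer = nn.Embedding.from_pretrained(
    torch.from_numpy(args.embedding),
    freeze=not args.update_embedding,
)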
import sys
from embeddings import GloveEmbedding

if len(sys.argv) < 3:
    print("please provide embeddings and pos conl file")
    exit(0)

embs = GloveEmbedding(sys.argv[1], default="random")
unk = "<UNK>"

outFile = open(sys.argv[2] + ".glove", "w")
curSent = ""
for line in open(sys.argv[2]):
    if len(line) < 2:
        outFile.write(curSent + "\n")
        curSent = ""
    else:
        tok = line.strip().split("\t")
        emb = embs.emb(tok[0])
        embStr = "emb=" + ",".join([str(x) for x in emb])
        curSent += "\t".join(tok + [embStr]) + "\n"
outFile.close()
def from_sentences(cls,
                   doc_sents,
                   id_sentence_map,
                   id_date_map,
                   num_date_anchors=100,
                   num_base_vecs=100,
                   normalize=False):
    start_time = time.time()

    date_freqs = Counter(s.predicted_date for s in doc_sents)
    sorted_dates_with_freq = sorted(date_freqs.items())

    num_sents = len(doc_sents)
    sents_per_bucket = num_sents // num_date_anchors

    date_buckets = {}
    curr_bucket = 0
    curr_freq_sum = 0
    for date, freq in sorted_dates_with_freq:
        date_buckets[date] = curr_bucket
        curr_freq_sum += freq
        if curr_freq_sum >= sents_per_bucket and curr_bucket < num_date_anchors:
            curr_bucket += 1
            curr_freq_sum = 0

    sent_and_date_buckets = defaultdict(lambda: ([], set()))
    for sent in doc_sents:
        date = sent.predicted_date
        bucket_sents, bucket_dates = sent_and_date_buckets[date_buckets[date]]
        bucket_sents.append(sent)
        bucket_dates.add(date)

    logger.debug("Computed date buckets (time: {}s)".format(time.time() - start_time))
    start_time = time.time()

    d_emb = 300
    embeddings = GloveEmbedding('common_crawl_840', d_emb=d_emb, show_progress=True)

    checkpoints = []
    for sents, dates in sent_and_date_buckets.values():
        sent_vecs = sents_to_embeddings(
            embeddings, d_emb,
            [s.as_token_attr_sequence("form_lowercase") for s in sents])
        base_vecs, num_matches = create_compressed_sent_repr(sent_vecs, num_base_vecs)
        center_date = min(dates) + (max(dates) - min(dates)) / 2
        checkpoints.append((center_date, base_vecs, num_matches, len(sents)))

    logger.debug("Computed checkpoints (time: {}s)".format(time.time() - start_time))
    start_time = time.time()

    sents_by_date = defaultdict(list)
    for sent_id, sent in id_sentence_map.items():
        sent_date = id_date_map[sent_id]
        sents_by_date[sent_date].append(sent_id)

    sent_scores = {}
    for idx, (sents_date, sent_ids) in enumerate(sents_by_date.items()):
        print("{}/{}".format(idx, len(sents_by_date)))
        sent_sims = np.zeros(len(sent_ids))
        factor_sum = 0.0
        sent_vecs = fast_sents_to_embeddings(
            embeddings, d_emb,
            [[t[0] for t in id_sentence_map[id_]] for id_ in sent_ids])

        for check_date, base_vecs, num_matches, num_members in checkpoints:
            factor = 1.0 / (abs((check_date - sents_date).days) + 1)
            sent_signatures = (cosine_similarity(sent_vecs, base_vecs) >= 0.0).astype(np.float32)
            sent_signatures *= num_matches.reshape(1, len(base_vecs))
            sent_signatures /= num_members
            cosine_sims = np.average(sent_signatures, axis=1)
            sent_sims += cosine_sims * factor
            factor_sum += factor

        sent_sims /= factor_sum
        for sid, score in zip(sent_ids, sent_sims):
            sent_scores[sid] = score

    logger.debug("Computed scores (time: {}s)".format(time.time() - start_time))

    return BucketedCoverageFactor(sent_scores, normalize=normalize)