def train_mimic_model(polyglot_embedding_path: str, mimic_model_path: str,
                      max_word_length: int, num_epochs: int,
                      learning_rate: float, use_dev_set: bool):
    full_embedding = PolyEmbedding.load(str(polyglot_embedding_path))
    embedding_size = len(full_embedding.zero_vector())
    all_X, all_Y = compose_dataset(full_embedding, max_word_length)
    if use_dev_set:
        train_X = all_X[TEST_SET_SIZE:]
        train_Y = all_Y[TEST_SET_SIZE:]
        validation_data = (all_X[:TEST_SET_SIZE], all_Y[:TEST_SET_SIZE])
    else:
        train_X, train_Y = all_X, all_Y
        validation_data = None
    model = create_mimic_model(max_word_length, embedding_size)
    optimizer = optimizers.Adam(lr=learning_rate)
    model.compile(optimizer, loss=mse_loss)
    if os.path.exists(mimic_model_path):
        model.load_weights(mimic_model_path)
    loss_to_monitor = 'val_loss' if use_dev_set else 'loss'
    save_model = ModelCheckpoint(mimic_model_path, verbose=1,
                                 monitor=loss_to_monitor, save_best_only=True)
    lr_reducer = ReduceLROnPlateau(verbose=1, factor=0.2, min_lr=1e-7,
                                   monitor=loss_to_monitor, cooldown=100)
    model.fit(train_X, train_Y, batch_size=1024, epochs=num_epochs,
              callbacks=[save_model, lr_reducer],
              validation_data=validation_data)

def cache(self, name, cache, url=None, max_vectors=None):
    if self.emb_format in ['polyglot', 'glove']:
        from polyglot.mapping import Embedding
        if self.emb_format == 'polyglot':
            embeddings = Embedding.load(name)
        else:
            embeddings = Embedding.from_glove(name)
        self.itos = embeddings.vocabulary.id_word
        self.stoi = embeddings.vocabulary.word_id
        self.dim = embeddings.shape[1]
        self.vectors = torch.Tensor(embeddings.vectors).view(-1, self.dim)
    elif self.emb_format in ['word2vec', 'fasttext']:
        try:
            from gensim.models import KeyedVectors
        except ImportError:
            logging.error('Please install `gensim` package first.')
        embeddings = KeyedVectors.load_word2vec_format(
            name, unicode_errors='ignore', binary=self.binary)
        self.itos = embeddings.index2word
        self.stoi = dict(zip(self.itos, range(len(self.itos))))
        self.dim = embeddings.vector_size
        self.vectors = torch.Tensor(embeddings.vectors).view(-1, self.dim)
    elif self.emb_format == 'fonseca':
        import numpy as np
        import os
        embeddings = np.load(os.path.join(name, 'types-features.npy'))
        texts = open(os.path.join(name, 'vocabulary.txt'), 'r').read()
        words = set([w.strip() for w in texts.split('\n')])
        self.itos = list(words)
        self.stoi = dict(zip(self.itos, range(len(self.itos))))
        self.dim = embeddings.shape[1]
        self.vectors = torch.Tensor(embeddings).view(-1, self.dim)
    self.unk_vector = self.vectors.mean(0).unsqueeze(0)

def __init__(self, student_summary=[]):
    self.punctuations = ['.', ',', '[', ']', '(', ')']
    self.stop_words = [
        "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
        "your", "yours", "yourself", "yourselves", "he", "him", "his",
        "himself", "she", "her", "hers", "herself", "it", "its", "itself",
        "they", "them", "their", "theirs", "themselves", "what", "which",
        "who", "whom", "this", "that", "these", "those", "am", "is", "are",
        "was", "were", "be", "been", "being", "have", "has", "had", "having",
        "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if",
        "or", "because", "as", "until", "while", "of", "at", "by", "for",
        "with", "about", "against", "between", "into", "through", "during",
        "before", "after", "above", "below", "to", "from", "up", "down", "in",
        "out", "on", "off", "over", "under", "again", "further", "then",
        "once", "here", "there", "when", "where", "why", "how", "all", "any",
        "both", "each", "few", "more", "most", "other", "some", "such", "no",
        "nor", "not", "only", "own", "same", "so", "than", "too", "very",
        "s", "t", "can", "will", "just", "don", "should", "now"
    ]
    self.student_summary = [self.clean_doc(s) for s in student_summary]
    self.embeddings = Embedding.load("data/embeddings_pkl.tar.bz2")
    self.summary_vetors = []
    for summary in self.student_summary:
        self.summary_vetors.append(self.calculate_doc2vec(summary))

def getEmbeddings(lng):
    if lng not in EMBEDDINGS:
        home = expanduser("~")
        embeddings = Embedding.load(
            home + "/polyglot_data/embeddings2/" + lng + "/embeddings_pkl.tar.bz2")
        embeddings.apply_expansion(CaseExpander)
        EMBEDDINGS[lng] = embeddings
    return EMBEDDINGS[lng]

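# Usage sketch (assumptions: EMBEDDINGS is a module-level dict, e.g. EMBEDDINGS = {},
# defined elsewhere in the source module). The cache means each language's model is
# loaded from disk only once; later calls return the already-loaded Embedding.
en_embeddings = getEmbeddings("en")
print(en_embeddings.nearest_neighbors("green"))
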
def cache(self, name, cache, url=None, max_vectors=None):
    if self.emb_format in ['polyglot', 'glove']:
        try:
            from polyglot.mapping import Embedding
        except ImportError:
            logger.error('Please install `polyglot` package first.')
            return None
        if self.emb_format == 'polyglot':
            embeddings = Embedding.load(name)
        else:
            embeddings = Embedding.from_glove(name)
        self.itos = embeddings.vocabulary.id_word
        self.stoi = embeddings.vocabulary.word_id
        self.dim = embeddings.shape[1]
        self.vectors = torch.Tensor(embeddings.vectors).view(-1, self.dim)
    elif self.emb_format in ['word2vec', 'fasttext']:
        try:
            from gensim.models import KeyedVectors
        except ImportError:
            logger.error('Please install `gensim` package first.')
            return None
        embeddings = KeyedVectors.load_word2vec_format(
            name, unicode_errors='ignore', binary=self.binary
        )
        self.itos = embeddings.index2word
        self.stoi = dict(zip(self.itos, range(len(self.itos))))
        self.dim = embeddings.vector_size
        self.vectors = torch.Tensor(embeddings.vectors).view(-1, self.dim)
    elif self.emb_format == 'text':
        tokens = []
        vectors = []
        if self.binary:
            import pickle
            # vectors should be a dict mapping str keys to numpy arrays
            with open(name, 'rb') as f:
                d = pickle.load(f)
                tokens = list(d.keys())
                vectors = list(d.values())
        else:
            # each line should contain a token and its following fields
            # <token> <vector_value_1> ... <vector_value_n>
            with open(name, 'r', encoding='utf8') as f:
                for line in f:
                    if line:  # ignore empty lines
                        fields = line.rstrip().split()
                        tokens.append(fields[0])
                        vectors.append(list(map(float, fields[1:])))
        self.itos = tokens
        self.stoi = dict(zip(self.itos, range(len(self.itos))))
        self.vectors = torch.Tensor(vectors)
        self.dim = self.vectors.shape[1]

def external_polygot_embedding(ver):
    ver = ver.replace('hn', 'hi')
    home = path.expanduser('~')
    emb = Embedding.load(
        path.join(home, 'polyglot_data/embeddings2/%s/embeddings_pkl.tar.bz2' % ver))
    word_idx = {w: i for i, w in enumerate(emb.words)}
    if ver == 'hi':
        word_idx = transdict_stl(word_idx)
    embedding = emb.vectors
    return embedding, word_idx

def loadembedding(filename):
    """Loads a precomputed embedding into memory

    Input:
        filename: of the model file
    Output:
        embedding object
    """
    embedding = Embedding.load(filename)
    # Apply useful extensions
    embedding.apply_expansion(DigitExpander)
    # We might need this if we want to ignore case
    # embedding.apply_expansion(CaseExpander)
    return embedding

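# Minimal usage sketch for the loader above; the archive path is hypothetical
# and follows the usual polyglot_data layout seen in the other examples.
import os.path

emb = loadembedding(os.path.expanduser(
    "~/polyglot_data/embeddings2/en/embeddings_pkl.tar.bz2"))
neighbors = emb.nearest_neighbors("green")
for w, d in zip(neighbors, emb.distances("green", neighbors)):
    print("{:<12}{:.4f}".format(w, d))
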
def train_word_embeddings(train, polyglot_data):
    embeddings = Embedding.load(polyglot_data)
    zpadd = [0] * 64
    train_embds = []
    for t in train:
        t_e = []
        for w in t:
            if w == u'*':
                t_e.append(zpadd)
            else:
                e = embeddings.get(w)
                if e is not None:
                    t_e.append(e)
                else:
                    t_e.append(zpadd)
        train_embds.append(t_e)
    return train_embds

def train_word_embeddings(train):
    embeddings = Embedding.load(
        "/home/amir/polyglot_data/embeddings2/fa/embeddings_pkl.tar.bz2")
    zpadd = [0] * 64
    train_embds = []
    for t in train:
        t_e = []
        for w in t:
            if w == u'*':
                t_e.append(zpadd)
            else:
                e = embeddings.get(w)
                if e is not None:
                    t_e.append(e)
                else:
                    t_e.append(zpadd)
        train_embds.append(t_e)
    return train_embds

def create_list_file():
    try:
        embeddings = Embedding.load(
            os.path.join(DOWNLOAD_DIR, "embeddings2/en/embeddings_pkl.tar.bz2"))
    except Exception as e:
        print e.message
        ActivityLog.objects.create_log(
            None,
            level='C',
            view_name='scrappers_miners.utils.utils.create_list_file',
            message='Error in loading library (polyglot) - %s' % e.message,
            traceback=traceback.format_exc())
        return False
    else:
        neighbors = []
        for word in FILTER_LIST_WORDS:
            try:
                neighbors += embeddings.nearest_neighbors(
                    word, top_k=NEAREST_NEIGHBORS)
            except Exception as e:
                ActivityLog.objects.create_log(
                    None,
                    level='W',
                    view_name='scrappers_miners.utils.utils.create_list_file',
                    message='Error in finding neighbors of a word in '
                            'FILTER_LIST_WORDS with a message - %s' % e.message,
                    traceback=traceback.format_exc())
        filter_words_file = open(FILTER_WORD_FILE_PATH, 'w')
        for n in set(neighbors + FILTER_LIST_WORDS):
            filter_words_file.write(n.lower() + '\n')
        filter_words_file.close()
        return True

def _extract_we_polyglot(output_file, vocab_file, we_dic):
    # vocabulary
    vocabf = codecs.open(vocab_file, "r", "utf-8")
    vocab = []
    vecList = []
    for line in vocabf:
        vocab.append(line.split(" ")[0])
    # export
    embeddings = Embedding.load(we_dic)
    f = codecs.open(output_file, "w", "utf-8")
    for token in vocab:
        token = token.decode("utf-8")
        if token in embeddings:
            vector = embeddings[token].tolist()
            vector.insert(0, token)
            vecList.append(vector)
        else:
            print "====", token
    f.write("\n".join(" ".join(map(str, x)) for x in vecList))
    f.close()
    vocabf.close()

def loadExternalTools(self):
    ### Load external tools ###
    # get ContoPt
    wordnetLoadTimeStart = time.time()
    wordnet = ContoPtReader.ContoPtLoader()
    elapsedTimeWordnetLoad = time.time() - wordnetLoadTimeStart
    print "\nWordnet loaded in " + str(elapsedTimeWordnetLoad) + " sec.]\n"

    # get word2vec model
    wordEmbeddingLoadTimeStart = time.time()
    wordEmbeddingsModel = Embedding.load(
        parameters.paths["wordEmbeddings"] + "/polyglot-pt.pkl")
    # wordEmbeddingsModel = (self.wordEmbeddingsModel).normalize_words()
    elapsedTimeWordEmbeddingLoad = time.time() - wordEmbeddingLoadTimeStart
    print "\nWord2vec model loaded in " + str(elapsedTimeWordEmbeddingLoad) + " sec.]\n"

    return (wordnet, wordEmbeddingsModel)

def load_embeddings(data_root, languages):
    return {
        l: Embedding.load(data_root + f"embeddings/{l}.tar.bz2")
        for l in languages
    }

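# Usage sketch with a hypothetical data_root and language codes; the result is
# a dict mapping each language code to its loaded polyglot Embedding.
embeddings_by_lang = load_embeddings("/data/polyglot/", ["en", "pt"])
print(embeddings_by_lang["en"].nearest_neighbors("green"))
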
def load(cls, polyglot_embedding_path: str, mimic_model_path: str):
    e = PolyEmbedding.load(polyglot_embedding_path)
    model = load_model(mimic_model_path, compile=False)
    return cls(e, model)

def cache(self, name, cache, url=None, max_vectors=None):
    if self.emb_format == 'polyglot':
        try:
            from polyglot.mapping import Embedding
        except ImportError:
            logger.error('Please install `polyglot` package first.')
            return None
        embeddings = Embedding.load(name)
        self.itos = embeddings.vocabulary.id_word
        self.stoi = embeddings.vocabulary.word_id
        self.dim = embeddings.shape[1]
        self.vectors = torch.tensor(embeddings.vectors).view(-1, self.dim)
    elif self.emb_format == 'glove':
        itos = []
        vectors = []
        with open(name, 'r', encoding='utf8') as f:
            for line in f:
                try:
                    values = line.rstrip().split()
                    itos.append(values[0])
                    vectors.append([float(x) for x in values[1:]])
                except ValueError as e:
                    # ignore entries that look like:
                    # by [email protected] 0.6882 -0.36436 ...
                    continue
        self.itos = itos
        self.stoi = dict(zip(self.itos, range(len(self.itos))))
        self.dim = len(vectors[0])
        self.vectors = torch.tensor(vectors).view(-1, self.dim)
    elif self.emb_format == 'fasttext':
        try:
            from gensim.models import FastText
        except ImportError:
            logger.error('Please install `gensim` package first.')
            return None
        self.vectors = FastText.load_fasttext_format(name)
        self.itos = list(self.vectors.wv.vocab.keys())
        self.stoi = dict(zip(self.itos, range(len(self.itos))))
        self.unk_vector = self.vectors['<unk>']
        self.dim = self.vectors.vector_size
    elif self.emb_format == 'word2vec':
        try:
            from gensim.models import KeyedVectors
        except ImportError:
            logger.error('Please install `gensim` package first.')
            return None
        embeddings = KeyedVectors.load_word2vec_format(
            name, unicode_errors='ignore', binary=self.binary)
        self.itos = embeddings.index2word
        self.stoi = dict(zip(self.itos, range(len(self.itos))))
        self.dim = embeddings.vector_size
        self.vectors = torch.tensor(embeddings.vectors).view(-1, self.dim)
    elif self.emb_format == 'text':
        tokens = []
        vectors = []
        if self.binary:
            import pickle
            # vectors should be a dict mapping str keys to numpy arrays
            with open(name, 'rb') as f:
                d = pickle.load(f)
                tokens = list(d.keys())
                vectors = list(d.values())
        else:
            # each line should contain a token and its following fields
            # <token> <vector_value_1> ... <vector_value_n>
            with open(name, 'r', encoding='utf8') as f:
                for line in f:
                    if line:  # ignore empty lines
                        fields = line.rstrip().split()
                        tokens.append(fields[0])
                        vectors.append(list(map(float, fields[1:])))
        self.itos = tokens
        self.stoi = dict(zip(self.itos, range(len(self.itos))))
        self.vectors = torch.Tensor(vectors)
        self.dim = self.vectors.shape[1]
    elif self.emb_format == 'fonseca':
        import numpy as np
        import os
        embeddings = np.load(os.path.join(name, 'types-features.npy'))
        texts = open(os.path.join(name, 'vocabulary.txt'), 'r').read()
        words = set([w.strip() for w in texts.split('\n')])
        self.itos = list(words)
        self.stoi = dict(zip(self.itos, range(len(self.itos))))
        self.dim = embeddings.shape[1]
        self.vectors = torch.tensor(embeddings).view(-1, self.dim)
    if self.unk_vector is None:
        self.unk_vector = self.vectors.mean(0).unsqueeze(0)

def test_polyglot(self):
    from polyglot.mapping import Embedding
    embeddings = Embedding.load(
        "/home/rmyeid/polyglot_data/embeddings2/en/embeddings_pkl.tar.bz2")
    neighbors = embeddings.nearest_neighbors("green")

def categorize_tweets(currentTwitterAccount, n_max_tweets=5, settings=None):
    if not settings:
        settings = load_from_config()
    subscription_key = settings["subscription_key"]
    api_url = "https://westcentralus.api.cognitive.microsoft.com/text/analytics/v2.0/"
    key_phrase_api_url = api_url + "keyPhrases"
    language_api_url = api_url + "languages"
    embeddings = Embedding.load(settings["model_location"])
    consumer_key = settings["consumer_key"]
    consumer_secret = settings["consumer_secret"]
    access_token = settings["access_token"]
    access_token_secret = settings["access_token_secret"]
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)

    # Fetch swedish tweets
    def language_check(string):
        headers = {"Ocp-Apim-Subscription-Key": subscription_key}
        response = requests.post(language_api_url, headers=headers,
                                 json={"documents": [{"id": 1, "text": string}]})
        if response.ok:
            return response.json()["documents"][0]["detectedLanguages"][0]["iso6391Name"]
        else:
            if response.status_code == 429:
                time.sleep(1)
                return language_check(string)
            response.raise_for_status()

    documents = {"documents": []}
    tweets_raw = []
    i = 0
    for tweet in tweepy.Cursor(api.user_timeline, id=currentTwitterAccount,
                               tweet_mode="extended").items(n_max_tweets):
        # removing the http link at the end of the text
        result = re.sub(r"http\S+", "", tweet.full_text)
        if language_check(result) == "sv":
            documents['documents'].append({'id': i, 'language': 'sv', 'text': result})
            tweets_raw.append((result, tweet.created_at))
            i += 1

    ### Extract key words
    headers = {"Ocp-Apim-Subscription-Key": subscription_key}
    response = requests.post(key_phrase_api_url, headers=headers, json=documents)
    key_phrases = response.json()

    # Parse key words
    key_words = [[y for y in x.values()][0] for x in key_phrases["documents"]]
    key_words = [[y.split(" ") for y in x] for x in key_words]
    key_words = [[y.strip() for sublist in l for y in sublist] for l in key_words]

    ### Determine closest category for the sets of key words
    def embedding_distances(word, category):
        # Adapter to handle missing words for embedding model
        try:
            return embeddings.distances(word, category)
        except:
            return [1e16]  # If word is not present, return big integer..

    def topic(word):
        # Determine category score for word
        topic_list = [embedding_distances(word.lower(), category)
                      for category in CATEGORIES]  # compute distances to categories
        topic_list = [min(l) for l in topic_list]  # compute average of each sublist
        min_value = min(topic_list)
        return topic_list.index(min_value), min_value

    topic_dists = [[topic(word) for word in l] for l in key_words]

    def cluster_topics(topic_dist):
        topic_dict = {}
        for t in topic_dist:
            if t[0] in topic_dict:
                topic_dict[t[0]] = (min(topic_dict[t[0]][0], t[1]),
                                    topic_dict[t[0]][1] + 1)
            else:
                topic_dict[t[0]] = (t[1], 1)
        topics = [(key, value[0]) for key, value in topic_dict.items()]
        values = [x[1] for x in topics]
        return topics[values.index(min(values))]

    categorized_tweets = [{"text": tweets_raw[i][0],
                           "category": CATEGORY_NAMES[cluster_topics(topic_dists[i])[0]],
                           "time": str(tweets_raw[i][1])}
                          for i in range(len(topic_dists))]
    return categorized_tweets

import os
import glob
import sqlite3

from polyglot.text import Text, Word
from polyglot.downloader import downloader
from polyglot.mapping import Embedding

downloader.download("embeddings2.pt")
downloader.download("pos2.pt")
downloader.download("morph2.pt")
downloader.supported_tasks(lang="pt")

embeddings = Embedding.load(
    "/Users/emersonantonio/polyglot_data/embeddings2/pt/embeddings_pkl.tar.bz2")

# neighbors = embeddings.nearest_neighbors("verde")
# for w, d in zip(neighbors, embeddings.distances("green", neighbors)):
#     print("{:<8}{:.4f}".format(w, d))

# Create the database
con = sqlite3.connect('./db/dadosDipolNLTK.db')
cur = con.cursor()

sql_create = 'CREATE TABLE IF NOT EXISTS miniDicionario '\
             '(' \
             '  id integer primary key AUTOINCREMENT, '\
             '  word varchar(50), ' \
             '  radical varchar(50), ' \
             '  tag varchar(50)' \
             ')'

from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
from os import curdir, sep
from word2vec import transform_text, getKthNeighbour, closest_k_points_tsne
from polyglot.mapping import Embedding
import json
from tsne import tsne
# from sklearn.manifold import TSNE

PORT_NUMBER = 8080

polish_embeddings = Embedding.load("polyglot-pl.pkl")

# ------------- t-SNE init ------------------------------
# model = TSNE(n_components=2, random_state=0)
# np.set_printoptions(suppress=True)
# tsne_rep = tsne(polish_embeddings.vectors)

print json.dumps(closest_k_points_tsne(polish_embeddings, "Beata", 10))


# This class handles any incoming request from the browser
class myHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        if self.path == "/sentence-find-near":

def langmodelload(language):
    ########################
    global stop_words
    global question_words
    global embeddings
    global model
    global lang_dict
    ########################
    LibLocLang = "./udpipe-ud/"
    ########################
    if language == "en":
        model = Model(LibLocLang + 'english-ewt-ud-2.5-191206.udpipe')
    elif language == "ar":
        model = Model(LibLocLang + 'arabic-padt-ud-2.5-191206.udpipe')
    elif language == "zh":
        model = Model(LibLocLang + 'chinese-gsdsimp-ud-2.5-191206.udpipe')
    elif language == "id":
        model = Model(LibLocLang + 'indonesian-gsd-ud-2.5-191206.udpipe')
    elif language == "ko":
        model = Model(LibLocLang + 'korean-gsd-ud-2.5-191206.udpipe')
    elif language == "pt":
        model = Model(LibLocLang + 'portuguese-gsd-ud-2.5-191206.udpipe')
    elif language == "vi":
        model = Model(LibLocLang + 'vietnamese-vtb-ud-2.5-191206.udpipe')
    elif language == "hi":
        model = Model(LibLocLang + 'hindi-hdtb-ud-2.5-191206.udpipe')
    elif language == "jp":
        model = Model(LibLocLang + 'japanese-gsd-ud-2.5-191206.udpipe')
    elif language == 'es':
        model = Model(LibLocLang + 'spanish-gsd-ud-2.5-191206.udpipe')
    ########################
    base_question_words = [
        'where', 'which', "who", "why", "what", "when", "please", "how",
        "is", "are", "will", "could", "should", "was", "were", "do", "did", "can"
    ]
    question_words = []
    for i in range(0, len(base_question_words)):
        question_words.append(
            Text(base_question_words[i]).transliterate(language))
    ########################
    if stopwords.has_lang(language) and language != "hi" and language != "ar" \
            and language != "zh" and language != "vi" and language != "ko" \
            and language != "jp" and language != "id" and language != "ms":
        ########################
        stop_words = list(stopwords.stopwords(language))
        stop_words_list = []
        ########################
        for i in range(0, len(stop_words)):
            try:
                text = Text(stop_words[i], hint_language_code=language)
                ########################
                if (text.pos_tags[0][1] != "NOUN") and \
                        (text.pos_tags[0][1] != "VERB") and \
                        (text.pos_tags[0][1] != "PRON"):
                    stop_words_list.append(text.pos_tags[0][0])
            except Exception as e:
                print(e)
        stop_words = stop_words_list
    else:
        print(language + " has errors.")
        stop_words = []
    ########################
    ########################
    embeddings = Embedding.load("./polyglot_data/embeddings2/" + language +
                                "/embeddings_pkl.tar.bz2")
    lang_dict[language] = {
        'model': model,
        'embeddings': embeddings,
        'stop_words': stop_words
    }

def embedding(text, embeddingPATH):
    embeddings = Embedding.load(embeddingPATH)
    neighbors = embeddings.nearest_neighbors(text)
    for w, d in zip(neighbors, embeddings.distances(text, neighbors)):
        print("{}\n{}".format(w, d))

def load_embedding(self):
    path = os.path.join(self.c["data_root"], "embeddings",
                        self.c["language"] + ".tar.bz2")
    return PolyglotEmbedding.load(path)

# This gives only the polyglot embeddings.
import numpy as np
from polyglot.mapping import Embedding
import pickle
from pos_helper import *
from nltk import pos_tag

src_embeddings = Embedding.load(
    "/home/krishna/polyglot_data/embeddings2/en/embeddings_pkl.tar.bz2")
tar_embeddings = Embedding.load(
    "/home/krishna/polyglot_data/embeddings2/de/embeddings_pkl.tar.bz2")


def make_align_dict(inp, nwords):
    inplist = inp.split()
    aldict = {}
    for j in range(nwords):
        aldict[j] = []
    for j in inplist:
        a, b = j.split('-')
        a, b = int(a), int(b)
        if b not in aldict:
            aldict[b] = []
        aldict[b].append(a)
    return aldict


def get_target_embedding(ind, inlist):
    try:
        e2 = tar_embeddings[inlist[ind]]

from polyglot.mapping import Embedding
import processitem
import numpy as np

embeddings = Embedding.load(
    '/home/luka/polyglot_data/embeddings2/nl/embeddings_pkl.tar.bz2')

words = []
vectors = []
for w, v in embeddings:
    words.append(w)
    vectors.append(v)

file = open('./RELPRON/RELPRON/translation_basic.txt', 'r', encoding='latin-1')
items_raw = file.readlines()
file.close()

items_neat = []
for i in items_raw:
    neat = processitem.Item(i)
    items_neat.append(neat)

# calculate lexical baselines: headN and V vectors
NN_dist = []
NV_dist = []
for i in items_neat:
    t = i.termN
    h = i.headN
    v = i.V
    if t in words: