Example #1
def train_mimic_model(polyglot_embedding_path: str, mimic_model_path: str,
                      max_word_length: int, num_epochs: int,
                      learning_rate: float, use_dev_set: bool):
    full_embedding = PolyEmbedding.load(str(polyglot_embedding_path))
    embedding_size = len(full_embedding.zero_vector())
    all_X, all_Y = compose_dataset(full_embedding, max_word_length)
    if use_dev_set:
        train_X = all_X[TEST_SET_SIZE:]
        train_Y = all_Y[TEST_SET_SIZE:]
        validation_data = (all_X[:TEST_SET_SIZE], all_Y[:TEST_SET_SIZE])
    else:
        train_X, train_Y = all_X, all_Y
        validation_data = None
    model = create_mimic_model(max_word_length, embedding_size)
    optimizer = optimizers.Adam(lr=learning_rate)
    model.compile(optimizer, loss=mse_loss)
    if os.path.exists(mimic_model_path):
        model.load_weights(mimic_model_path)
    loss_to_monitor = 'val_loss' if use_dev_set else 'loss'
    save_model = ModelCheckpoint(mimic_model_path,
                                 verbose=1,
                                 monitor=loss_to_monitor,
                                 save_best_only=True)
    lr_reducer = ReduceLROnPlateau(verbose=1,
                                   factor=0.2,
                                   min_lr=1e-7,
                                   monitor=loss_to_monitor,
                                   cooldown=100)
    model.fit(train_X,
              train_Y,
              batch_size=1024,
              epochs=num_epochs,
              callbacks=[save_model, lr_reducer],
              validation_data=validation_data)
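A minimal invocation sketch for the function above; the paths and hyperparameters here are placeholders, not values taken from the original project:
# hypothetical call; both paths are placeholders
train_mimic_model(
    polyglot_embedding_path="embeddings_pkl.tar.bz2",
    mimic_model_path="mimic_model.h5",
    max_word_length=20,
    num_epochs=100,
    learning_rate=1e-3,
    use_dev_set=True,
)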
Example #2
    def cache(self, name, cache, url=None, max_vectors=None):
        if self.emb_format in ['polyglot', 'glove']:
            from polyglot.mapping import Embedding
            if self.emb_format == 'polyglot':
                embeddings = Embedding.load(name)
            else:
                embeddings = Embedding.from_glove(name)
            self.itos = embeddings.vocabulary.id_word
            self.stoi = embeddings.vocabulary.word_id
            self.dim = embeddings.shape[1]
            self.vectors = torch.Tensor(embeddings.vectors).view(-1, self.dim)
        elif self.emb_format in ['word2vec', 'fasttext']:
            try:
                from gensim.models import KeyedVectors
            except ImportError:
                logging.error('Please install `gensim` package first.')
                return None

            embeddings = KeyedVectors.load_word2vec_format(
                name, unicode_errors='ignore', binary=self.binary)
            self.itos = embeddings.index2word
            self.stoi = dict(zip(self.itos, range(len(self.itos))))
            self.dim = embeddings.vector_size
            self.vectors = torch.Tensor(embeddings.vectors).view(-1, self.dim)
        elif self.emb_format == 'fonseca':
            import numpy as np
            import os
            embeddings = np.load(os.path.join(name, 'types-features.npy'))
            texts = open(os.path.join(name, 'vocabulary.txt'), 'r').read()
            words = set([w.strip() for w in texts.split('\n')])
            self.itos = list(words)
            self.stoi = dict(zip(self.itos, range(len(self.itos))))
            self.dim = embeddings.shape[1]
            self.vectors = torch.Tensor(embeddings).view(-1, self.dim)
        self.unk_vector = self.vectors.mean(0).unsqueeze(0)
Example #3
 def __init__(self, student_summary=[]):
     self.punctuations = ['.', ',', '[', ']', '(', ')']
     self.stop_words = [
         "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you",
         "your", "yours", "yourself", "yourselves", "he", "him", "his",
         "himself", "she", "her", "hers", "herself", "it", "its", "itself",
         "they", "them", "their", "theirs", "themselves", "what", "which",
         "who", "whom", "this", "that", "these", "those", "am", "is", "are",
         "was", "were", "be", "been", "being", "have", "has", "had",
         "having", "do", "does", "did", "doing", "a", "an", "the", "and",
         "but", "if", "or", "because", "as", "until", "while", "of", "at",
         "by", "for", "with", "about", "against", "between", "into",
         "through", "during", "before", "after", "above", "below", "to",
         "from", "up", "down", "in", "out", "on", "off", "over", "under",
         "again", "further", "then", "once", "here", "there", "when",
         "where", "why", "how", "all", "any", "both", "each", "few", "more",
         "most", "other", "some", "such", "no", "nor", "not", "only", "own",
         "same", "so", "than", "too", "very", "s", "t", "can", "will",
         "just", "don", "should", "now"
     ]
     self.student_summary = [self.clean_doc(s) for s in student_summary]
     self.embeddings = Embedding.load("data/embeddings_pkl.tar.bz2")
     self.summary_vetors = []
     for summary in self.student_summary:
         self.summary_vetors.append(self.calculate_doc2vec(summary))
Example #4
def getEmbeddings(lng):
    if lng not in EMBEDDINGS:
        home = expanduser("~")
        embeddings = Embedding.load(home + "/polyglot_data/embeddings2/" +
                                    lng + "/embeddings_pkl.tar.bz2")
        embeddings.apply_expansion(CaseExpander)
        EMBEDDINGS[lng] = embeddings
    return EMBEDDINGS[lng]
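A hedged usage sketch for getEmbeddings; it assumes the embeddings for the requested language were already downloaded into ~/polyglot_data (e.g. via polyglot's downloader):
# hypothetical usage
embeddings = getEmbeddings("en")
print(embeddings.nearest_neighbors("green", top_k=5))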
Example #5
    def cache(self, name, cache, url=None, max_vectors=None):
        if self.emb_format in ['polyglot', 'glove']:
            try:
                from polyglot.mapping import Embedding
            except ImportError:
                logger.error('Please install `polyglot` package first.')
                return None
            if self.emb_format == 'polyglot':
                embeddings = Embedding.load(name)
            else:
                embeddings = Embedding.from_glove(name)
            self.itos = embeddings.vocabulary.id_word
            self.stoi = embeddings.vocabulary.word_id
            self.dim = embeddings.shape[1]
            self.vectors = torch.Tensor(embeddings.vectors).view(-1, self.dim)

        elif self.emb_format in ['word2vec', 'fasttext']:
            try:
                from gensim.models import KeyedVectors
            except ImportError:
                logger.error('Please install `gensim` package first.')
                return None
            embeddings = KeyedVectors.load_word2vec_format(
                name, unicode_errors='ignore', binary=self.binary
            )
            self.itos = embeddings.index2word
            self.stoi = dict(zip(self.itos, range(len(self.itos))))
            self.dim = embeddings.vector_size
            self.vectors = torch.Tensor(embeddings.vectors).view(-1, self.dim)

        elif self.emb_format == 'text':
            tokens = []
            vectors = []
            if self.binary:
                import pickle

                # vectors should be a dict mapping str keys to numpy arrays
                with open(name, 'rb') as f:
                    d = pickle.load(f)
                    tokens = list(d.keys())
                    vectors = list(d.values())
            else:
                # each line should contain a token and its following fields
                # <token> <vector_value_1> ... <vector_value_n>
                with open(name, 'r', encoding='utf8') as f:
                    for line in f:
                        if line:  # ignore empty lines
                            fields = line.rstrip().split()
                            tokens.append(fields[0])
                            vectors.append(list(map(float, fields[1:])))
            self.itos = tokens
            self.stoi = dict(zip(self.itos, range(len(self.itos))))
            self.vectors = torch.Tensor(vectors)
            self.dim = self.vectors.shape[1]
Example #6
def external_polygot_embedding(ver):
    ver = ver.replace('hn', 'hi')
    home = path.expanduser('~')
    emb = Embedding.load(
        path.join(home,
                  'polyglot_data/embeddings2/%s/embeddings_pkl.tar.bz2' % ver))
    word_idx = {w: i for i, w in enumerate(emb.words)}
    if (ver == 'hi'):
        word_idx = transdict_stl(word_idx)
    embedding = emb.vectors
    return embedding, word_idx
Example #7
def loadembedding(filename):
    """Loads a precomputed embedding into memory
    
    Input:
        filename: of the model file
    Output:
        embedding object
    """
    embedding = Embedding.load(filename)
    # Apply useful extensions
    embedding.apply_expansion(DigitExpander)
    # We might need this if we want to ignore case
    # embedding.apply_expansion(CaseExpander)
    return embedding
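A typical follow-up once the embedding is loaded, mirroring the query pattern used elsewhere in these examples (the path is a placeholder):
embedding = loadembedding("embeddings_pkl.tar.bz2")
neighbors = embedding.nearest_neighbors("green")
for w, d in zip(neighbors, embedding.distances("green", neighbors)):
    print("{:<8}{:.4f}".format(w, d))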
Example #8
def train_word_embeddings(train, polyglot_data):
    embeddings = Embedding.load(polyglot_data)
    zpadd = [0] * 64
    train_embds = []
    for t in train:
        t_e = []
        for w in t:
            if w == u'*':
                t_e.append(zpadd)
            else:
                e = embeddings.get(w)
                if e is not None:
                    t_e.append(e)
                else:
                    t_e.append(zpadd)
        train_embds.append(t_e)
    return train_embds
Example #9
def train_word_embeddings(train):
    embeddings = Embedding.load(
        "/home/amir/polyglot_data/embeddings2/fa/embeddings_pkl.tar.bz2")
    zpadd = [0] * 64
    train_embds = []
    for t in train:
        t_e = []
        for w in t:
            if w == u'*':
                t_e.append(zpadd)
            else:
                e = embeddings.get(w)
                if e is not None:
                    t_e.append(e)
                else:
                    t_e.append(zpadd)
        train_embds.append(t_e)
    return train_embds
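A small usage sketch, assuming the hard-coded Farsi embedding file exists; the tokens are purely illustrative and u'*' marks padding:
sentences = [[u'*', u'token_a', u'token_b']]
embedded = train_word_embeddings(sentences)
# each token becomes a 64-dimensional vector; padding and OOV tokens map to zero vectors
print(len(embedded[0]), len(embedded[0][0]))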
Example #10
def create_list_file():
    try:
        embeddings = Embedding.load(
            os.path.join(DOWNLOAD_DIR,
                         "embeddings2/en/embeddings_pkl.tar.bz2"))
    except Exception as e:
        print e.message
        ActivityLog.objects.create_log(
            None,
            level='C',
            view_name='scrappers_miners.utils.utils.create_list_file',
            message='Error in loading library (polyglot) - %s' % e.message,
            traceback=traceback.format_exc())
        return False
    else:

        neighbors = []

        for word in FILTER_LIST_WORDS:
            try:
                neighbors += embeddings.nearest_neighbors(
                    word, top_k=NEAREST_NEIGHBORS)
            except Exception as e:
                ActivityLog.objects.create_log(
                    None,
                    level='W',
                    view_name='scrappers_miners.utils.utils.create_list_file',
                    message=
                    'Error in finding neighbors of a word in FILTER_LIST_WORDS with a message - %s'
                    % e.message,
                    traceback=traceback.format_exc())

        filter_words_file = open(FILTER_WORD_FILE_PATH, 'w')

        for n in set(neighbors + FILTER_LIST_WORDS):
            filter_words_file.write(n.lower() + '\n')

        filter_words_file.close()

        return True
Example #11
def _extract_we_polyglot(output_file, vocab_file, we_dic):
    #vocabulary
    vocabf = codecs.open(vocab_file, "r", "utf-8")
    vocab = []
    vecList = []
    for line in vocabf:
        vocab.append(line.split(" ")[0])
    #export
    embeddings = Embedding.load(we_dic)
    f = codecs.open(output_file, "w", "utf-8")
    for token in vocab:

        token = token.decode("utf-8")
        if token in embeddings:
            vector = embeddings[token].tolist()
            vector.insert(0, token)
            vecList.append(vector)
        else:
            print "====", token
    f.write("\n".join(" ".join(map(str, x)) for x in vecList))
    f.close()
    vocabf.close()
Example #12
    def loadExternalTools(self):
        ###   Load external tools   ###

        # get ContoPt
        wordnetLoadTimeStart = time.time()

        wordnet = ContoPtReader.ContoPtLoader()

        elapsedTimeWordnetLoad = time.time() - wordnetLoadTimeStart
        print "\nWordnet loaded in " + str(elapsedTimeWordnetLoad) + " sec.]\n"

        #  get word2vec model
        wordEmbeddingLoadTimeStart = time.time()

        wordEmbeddingsModel = Embedding.load(
            parameters.paths["wordEmbeddings"] + "/polyglot-pt.pkl")
        #wordEmbeddingsModel = (self.wordEmbeddingsModel).normalize_words()

        elapsedTimeWordEmbeddingLoad = time.time() - wordEmbeddingLoadTimeStart
        print "\nWord2vec model loaded in " + str(
            elapsedTimeWordEmbeddingLoad) + " sec.]\n"

        return (wordnet, wordEmbeddingsModel)
Example #13
def load_embeddings(data_root, languages):
    return {
        l: Embedding.load(data_root + (f"embeddings/{l}.tar.bz2"))
        for l in languages
    }
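A hypothetical call; note that load_embeddings concatenates data_root and the relative path directly, so data_root must end with a slash:
embeddings_by_lang = load_embeddings("/data/polyglot/", ["en", "de"])  # placeholder root
print(embeddings_by_lang["en"].nearest_neighbors("green"))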
Example #14
 def load(cls, polyglot_embedding_path: str, mimic_model_path: str):
     e = PolyEmbedding.load(polyglot_embedding_path)
     model = load_model(mimic_model_path, compile=False)
     return cls(e, model)
Example #15
    def cache(self, name, cache, url=None, max_vectors=None):
        if self.emb_format == 'polyglot':
            try:
                from polyglot.mapping import Embedding
            except ImportError:
                logger.error('Please install `polyglot` package first.')
                return None
            embeddings = Embedding.load(name)
            self.itos = embeddings.vocabulary.id_word
            self.stoi = embeddings.vocabulary.word_id
            self.dim = embeddings.shape[1]
            self.vectors = torch.tensor(embeddings.vectors).view(-1, self.dim)

        elif self.emb_format == 'glove':
            itos = []
            vectors = []
            with open(name, 'r', encoding='utf8') as f:
                for line in f:
                    try:
                        values = line.rstrip().split()
                        itos.append(values[0])
                        vectors.append([float(x) for x in values[1:]])
                    except ValueError as e:
                        # ignore entries that look like:
                        # by [email protected] 0.6882 -0.36436 ...
                        continue
            self.itos = itos
            self.stoi = dict(zip(self.itos, range(len(self.itos))))
            self.dim = len(vectors[0])
            self.vectors = torch.tensor(vectors).view(-1, self.dim)

        elif self.emb_format == 'fasttext':
            try:
                from gensim.models import FastText
            except ImportError:
                logger.error('Please install `gensim` package first.')
                return None
            self.vectors = FastText.load_fasttext_format(name)
            self.itos = list(self.vectors.wv.vocab.keys())
            self.stoi = dict(zip(self.itos, range(len(self.itos))))
            self.unk_vector = self.vectors['<unk>']
            self.dim = self.vectors.vector_size

        elif self.emb_format == 'word2vec':
            try:
                from gensim.models import KeyedVectors
            except ImportError:
                logger.error('Please install `gensim` package first.')
                return None
            embeddings = KeyedVectors.load_word2vec_format(
                name, unicode_errors='ignore', binary=self.binary)
            self.itos = embeddings.index2word
            self.stoi = dict(zip(self.itos, range(len(self.itos))))
            self.dim = embeddings.vector_size
            self.vectors = torch.tensor(embeddings.vectors).view(-1, self.dim)

        elif self.emb_format == 'text':
            tokens = []
            vectors = []
            if self.binary:
                import pickle

                # vectors should be a dict mapping str keys to numpy arrays
                with open(name, 'rb') as f:
                    d = pickle.load(f)
                    tokens = list(d.keys())
                    vectors = list(d.values())
            else:
                # each line should contain a token and its following fields
                # <token> <vector_value_1> ... <vector_value_n>
                with open(name, 'r', encoding='utf8') as f:
                    for line in f:
                        if line:  # ignore empty lines
                            fields = line.rstrip().split()
                            tokens.append(fields[0])
                            vectors.append(list(map(float, fields[1:])))
            self.itos = tokens
            self.stoi = dict(zip(self.itos, range(len(self.itos))))
            self.vectors = torch.tensor(vectors)
            self.dim = self.vectors.shape[1]

        elif self.emb_format == 'fonseca':
            import numpy as np
            import os
            embeddings = np.load(os.path.join(name, 'types-features.npy'))
            texts = open(os.path.join(name, 'vocabulary.txt'), 'r').read()
            words = set([w.strip() for w in texts.split('\n')])
            self.itos = list(words)
            self.stoi = dict(zip(self.itos, range(len(self.itos))))
            self.dim = embeddings.shape[1]
            self.vectors = torch.tensor(embeddings).view(-1, self.dim)

        if self.unk_vector is None:
            self.unk_vector = self.vectors.mean(0).unsqueeze(0)
Example #16
 def test_polyglot(self):
     from polyglot.mapping import Embedding
     embeddings = Embedding.load("/home/rmyeid/polyglot_data/embeddings2/en/embeddings_pkl.tar.bz2")
     
     neighbors = embeddings.nearest_neighbors("green")
Example #17
def categorize_tweets(currentTwitterAccount, n_max_tweets=5, settings=None):
    if not settings:
        settings = load_from_config()

    subscription_key = settings["subscription_key"]
    api_url = "https://westcentralus.api.cognitive.microsoft.com/text/analytics/v2.0/"
    key_phrase_api_url = api_url + "keyPhrases"
    language_api_url = api_url + "languages"

    embeddings = Embedding.load(settings["model_location"])

    consumer_key = settings["consumer_key"]
    consumer_secret = settings["consumer_secret"]
    access_token = settings["access_token"]
    access_token_secret = settings["access_token_secret"]

    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)

    api = tweepy.API(auth)

    # Fetch Swedish tweets

    def language_check(string):
        headers = {"Ocp-Apim-Subscription-Key": subscription_key}
        response = requests.post(language_api_url, headers=headers, json={"documents": [{"id": 1, "text": string}]})
        if response.ok:
            return response.json()["documents"][0]["detectedLanguages"][0]["iso6391Name"]
        else:
            if response.status_code == 429:
                time.sleep(1)
                return language_check(string)
            response.raise_for_status()

    documents = {"documents": []}
    tweets_raw = []
    i = 0
    for tweet in tweepy.Cursor(api.user_timeline, id=currentTwitterAccount, tweet_mode="extended").items(n_max_tweets):
        # removing the http link at the end of the text
        result = re.sub(r"http\S+", "", tweet.full_text)
        if language_check(result) == "sv":
            documents['documents'].append({'id': i, 'language': 'sv', 'text': result})
            tweets_raw.append((result, tweet.created_at))
            i += 1

    ### Extract key words

    headers = {"Ocp-Apim-Subscription-Key": subscription_key}
    response = requests.post(key_phrase_api_url, headers=headers, json=documents)
    key_phrases = response.json()

    # Parse key words
    key_words = [[y for y in x.values()][0] for x in key_phrases["documents"]]
    key_words = [[y.split(" ") for y in x] for x in key_words]
    key_words = [[y.strip() for sublist in l for y in sublist] for l in key_words]

    ### Determine closest category for the sets of key words

    def embedding_distances(word, category):  # Adapter to handle missing words for embedding model
        try:
            return embeddings.distances(word, category)
        except Exception:
            return [1e16]  # if the word is missing, return a very large distance

    def topic(word):  # Determine category score for word
        topic_list = [embedding_distances(word.lower(), category) for category in
                      CATEGORIES]  # compute distances to categories
        topic_list = [min(l) for l in topic_list]  # take the minimum distance in each sublist
        min_value = min(topic_list)
        return topic_list.index(min_value), min_value

    topic_dists = [[topic(word) for word in l] for l in key_words]

    def cluster_topics(topic_dist):
        topic_dict = {}
        for t in topic_dist:
            if t[0] in topic_dict:
                topic_dict[t[0]] = (min(topic_dict[t[0]][0], t[1]), topic_dict[t[0]][1] + 1)
            else:
                topic_dict[t[0]] = (t[1], 1)
        topics = [(key, value[0]) for key, value in topic_dict.items()]
        values = [x[1] for x in topics]
        return topics[values.index(min(values))]

    categorized_tweets = [{"text": tweets_raw[i][0], "category": CATEGORY_NAMES[cluster_topics(topic_dists[i])[0]],
                           "time": str(tweets_raw[i][1])} for i in range(len(topic_dists))]
    return categorized_tweets
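A hedged invocation sketch; it assumes a settings source providing the Azure Text Analytics subscription key, the Twitter API credentials, and "model_location" for the polyglot embedding, none of which are shown here:
# hypothetical call; the account name is a placeholder
tweets = categorize_tweets("some_account", n_max_tweets=10)
for t in tweets:
    print(t["category"], t["time"], t["text"][:60])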
Example #18
import os
import glob
import sqlite3
from polyglot.text import Text, Word
from polyglot.downloader import downloader
from polyglot.mapping import Embedding

downloader.download("embeddings2.pt")
downloader.download("pos2.pt")
downloader.download("morph2.pt")
downloader.supported_tasks(lang="pt")

embeddings = Embedding.load(
    "/Users/emersonantonio/polyglot_data/embeddings2/pt/embeddings_pkl.tar.bz2"
)

#neighbors = embeddings.nearest_neighbors("verde")
#for w,d in zip(neighbors, embeddings.distances("green", neighbors)):
#  print("{:<8}{:.4f}".format(w,d))

# Create the database
con = sqlite3.connect('./db/dadosDipolNLTK.db')
cur = con.cursor()

sql_create = 'CREATE TABLE IF NOT EXISTS miniDicionario '\
'(' \
'  id integer primary key AUTOINCREMENT, '\
'   word varchar(50), ' \
'   radical varchar(50), ' \
'   tag varchar(50)' \
')'
Example #19
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
from os import curdir, sep

from word2vec import transform_text, getKthNeighbour, closest_k_points_tsne
from polyglot.mapping import Embedding

import json

from tsne import tsne
from word2vec import transform_text

# from sklearn.manifold import TSNE

PORT_NUMBER = 8080
polish_embeddings = Embedding.load("polyglot-pl.pkl")

# ------------- t-SNE init ------------------------------
# model = TSNE(n_components=2, random_state=0)
# np.set_printoptions(suppress=True)
# tsne_rep = tsne(polish_embeddings.vectors)

# This class handles any incoming request from
# the browser

print json.dumps(closest_k_points_tsne(polish_embeddings, "Beata", 10))


class myHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        if self.path == "/sentence-find-near":
Example #20
def langmodelload(language):
    ########################
    global stop_words
    global question_words
    global embeddings
    global model
    global lang_dict
    ########################
    LibLocLang = "./udpipe-ud/"
    ########################
    if language == "en":
        model = Model(LibLocLang + 'english-ewt-ud-2.5-191206.udpipe')
    elif language == "ar":
        model = Model(LibLocLang + 'arabic-padt-ud-2.5-191206.udpipe')
    elif language == "zh":
        model = Model(LibLocLang + 'chinese-gsdsimp-ud-2.5-191206.udpipe')
    elif language == "id":
        model = Model(LibLocLang + 'indonesian-gsd-ud-2.5-191206.udpipe')
    elif language == "ko":
        model = Model(LibLocLang + 'korean-gsd-ud-2.5-191206.udpipe')
    elif language == "pt":
        model = Model(LibLocLang + 'portuguese-gsd-ud-2.5-191206.udpipe')
    elif language == "vi":
        model = Model(LibLocLang + 'vietnamese-vtb-ud-2.5-191206.udpipe')
    elif language == "hi":
        model = Model(LibLocLang + 'hindi-hdtb-ud-2.5-191206.udpipe')
    elif language == "jp":
        model = Model(LibLocLang + 'japanese-gsd-ud-2.5-191206.udpipe')
    elif language == 'es':
        model = Model(LibLocLang + 'spanish-gsd-ud-2.5-191206.udpipe')
    ########################
    base_question_words = [
        'where', 'which', "who", "why", "what", "when", "please", "how", "is",
        "are", "will", "could", "should", "was", "were", "do", "did", "can"
    ]
    question_words = []
    for i in range(0, len(base_question_words)):
        question_words.append(
            Text(base_question_words[i]).transliterate(language))
    ########################
    unsupported_stopword_langs = {"hi", "ar", "zh", "vi", "ko", "jp", "id", "ms"}
    if stopwords.has_lang(language) and language not in unsupported_stopword_langs:
        ########################
        stop_words = list(stopwords.stopwords(language))
        stop_words_list = []
        ########################
        for i in range(0, len(stop_words)):
            try:
                text = Text(stop_words[i], hint_language_code=language)
                ########################
                word, pos = text.pos_tags[0]
                if pos not in ("NOUN", "VERB", "PRON"):
                    stop_words_list.append(word)
            except Exception as e:
                print(e)
        stop_words = stop_words_list
    else:
        print(language + " has errors.")
        stop_words = []
    ########################
    ########################

    embeddings = Embedding.load("./polyglot_data/embeddings2/" + language +
                                "/embeddings_pkl.tar.bz2")
    lang_dict[language] = {
        'model': model,
        'embeddings': embeddings,
        'stop_words': stop_words
    }
Example #21
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
from os import curdir, sep

from word2vec import transform_text, getKthNeighbour, closest_k_points_tsne
from polyglot.mapping import Embedding

import json

from tsne import tsne
from word2vec import transform_text

# from sklearn.manifold import TSNE

PORT_NUMBER = 8080
polish_embeddings = Embedding.load("polyglot-pl.pkl")


# ------------- t-SNE init ------------------------------
# model = TSNE(n_components=2, random_state=0)
# np.set_printoptions(suppress=True)
# tsne_rep = tsne(polish_embeddings.vectors)

# This class handles any incoming request from
# the browser

print json.dumps(closest_k_points_tsne(polish_embeddings, "Beata", 10))

class myHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        if self.path=="/sentence-find-near":
Example #22
def embedding(text, embeddingPATH):
    embeddings = Embedding.load(embeddingPATH)
    neighbors = embeddings.nearest_neighbors(text)
    for w, d in zip(neighbors, embeddings.distances(text, neighbors)):
        print("{}\n{}".format(w, d))
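A hypothetical call to the helper above; the path is a placeholder for a downloaded polyglot embedding archive:
embedding("green", "embeddings_pkl.tar.bz2")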
Example #23
 def load_embedding(self):
     path = os.path.join(self.c["data_root"], "embeddings",
                         self.c["language"] + ".tar.bz2")
     return PolyglotEmbedding.load(path)
Example #24
# This gives only the polyglot embeddings.

import numpy as np
from polyglot.mapping import Embedding
import pickle
from pos_helper import *
from nltk import pos_tag

src_embeddings = Embedding.load(
    "/home/krishna/polyglot_data/embeddings2/en/embeddings_pkl.tar.bz2")
tar_embeddings = Embedding.load(
    "/home/krishna/polyglot_data/embeddings2/de/embeddings_pkl.tar.bz2")


def make_align_dict(inp, nwords):
    inplist = inp.split()
    aldict = {}
    for j in range(nwords):
        aldict[j] = []
    for j in inplist:
        a, b = j.split('-')
        a, b = int(a), int(b)
        if b not in aldict:
            aldict[b] = []
        aldict[b].append(a)
    return aldict


def get_target_embedding(ind, inlist):
    try:
        e2 = tar_embeddings[inlist[ind]]
Example #25
from polyglot.mapping import Embedding
import processitem
import numpy as np

embeddings = Embedding.load(
    '/home/luka/polyglot_data/embeddings2/nl/embeddings_pkl.tar.bz2')

words = []
vectors = []

for w, v in embeddings:
    words.append(w)
    vectors.append(v)

file = open('./RELPRON/RELPRON/translation_basic.txt', 'r', encoding='latin-1')
items_raw = file.readlines()
file.close()

items_neat = []
for i in items_raw:
    neat = processitem.Item(i)
    items_neat.append(neat)

#calculate lexical baselines: headN and V vectors
NN_dist = []
NV_dist = []
for i in items_neat:
    t = i.termN
    h = i.headN
    v = i.V
    if t in words: