Example No. 1
def load_word_embedding(data_name='google_news', data_type='bin'):
    logger.info('Start load word2vec word embedding')
    os_name = get_os_name()
    if os_name == "windows":
        file1 = 'D:/Word_Embeddings/GoogleNews-vectors-negative300.bin.gz'
        file2 = 'D:/Word_Embeddings/freebase-vectors-skipgram1000.bin.gz'
        file3 = 'D:/Word_Embeddings/GoogleNews-vectors-negative300.bin'
        file4 = 'D:/Word_Embeddings/freebase-vectors-skipgram1000.bin'
    elif os_name == 'ubuntu':
        file1 = '/home/hs/Data/Word_Embeddings/GoogleNews-vectors-negative300.bin.gz'
        file2 = '/home/hs/Data/Word_Embeddings/freebase-vectors-skipgram1000.bin.gz'
        file3 = '/home/hs/Data/Word_Embeddings/google_news.bin'
        file4 = '/home/hs/Data/Word_Embeddings/freebase.bin'
    if data_name == 'google_news':
        if data_type == 'bin':
            model = Word2Vec.load_word2vec_format(file3, binary=True)
        else:  # load .bin.gz data
            model = Word2Vec.load_word2vec_format(file1, binary=True)
    else:  # load freebase
        if data_type == 'bin':
            model = Word2Vec.load_word2vec_format(file4, binary=True)
        else:
            model = Word2Vec.load_word2vec_format(file2, binary=True)

    # using gzipped/bz2 input works too, no need to unzip:
    logging.info('Loading word embedding complete')
    return model
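All of the snippets on this page use the pre-1.0 gensim API: Word2Vec.load_word2vec_format was deprecated in gensim 1.0 and removed in gensim 4, where the loader lives on KeyedVectors. A minimal sketch of the equivalent call on a current gensim, with the file path shortened to a placeholder:

from gensim.models import KeyedVectors

# gensim >= 4: the word2vec-format loader is a KeyedVectors classmethod.
# Gzipped input still works, no need to unzip first.
model = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin.gz', binary=True)
print(model.most_similar('news', topn=5))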
Example No. 3
def createTransformationMatrix(modelA, modelB):
    # initialize the matrices
    labels = []
    A = []
    B = []
    # keep the common words and add them to the matrices
    nb_words_A = len(modelA.index2word)
    nb_words_B = len(modelB.index2word)
    for i in range(0, nb_words_A):
        word = modelA.index2word[i]
        if word in modelB.index2word:
            # add the word to the matrices (and the labels)
            labels.append(word)
            A.append(modelA[word])
            B.append(modelB[word])
    # create the transformation matrix
    TransM, _ = orthogonal_procrustes(np.asarray(A),
                                      np.asarray(B),
                                      check_finite=False)

    # apply the transformation matrix to the first model matrix
    Z = np.matmul(A, TransM)

    # create the 2 models manually (by first creating a text file and reading it).
    # it would be most efficient not to have to store the results on files like this.
    constructModel(np.asarray(Z), labels, "tmpZ.model.txt")
    constructModel(np.asarray(B), labels, "tmpB.model.txt")

    modelZ_ = Word2Vec.load_word2vec_format('tmpZ.model.txt', binary=False)
    modelB_ = Word2Vec.load_word2vec_format('tmpB.model.txt', binary=False)

    return modelZ_, modelB_
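The comments above note that round-tripping through temporary text files is wasteful. Assuming gensim >= 4.0, the aligned matrices can be wrapped in an in-memory KeyedVectors instead; build_keyedvectors below is a hypothetical helper, not part of the original project:

import numpy as np
from gensim.models import KeyedVectors

def build_keyedvectors(matrix, labels):
    # Wrap a (n_words, dim) matrix and its word labels as a KeyedVectors
    # object without writing a temporary text file (gensim >= 4.0).
    matrix = np.asarray(matrix)
    kv = KeyedVectors(vector_size=matrix.shape[1])
    kv.add_vectors(labels, matrix)
    return kv

# e.g. modelZ_ = build_keyedvectors(Z, labels); modelB_ = build_keyedvectors(B, labels)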
Example No. 4
 def __init__(self, label_vec_f, feature_vec_f, binary=False):
     label2vec = Word2Vec.load_word2vec_format(label_vec_f, binary=binary)
     self.label_embed = label2vec.syn0
     self.dictionary = label2vec.index2word
     self.vocab = label2vec.vocab
     self.feat_embed = Word2Vec.load_word2vec_format(feature_vec_f,
                                                     binary=binary).syn0
Example No. 5
def load_embeddings(path=None):
    path = just.make_path(path)
    binary = path.endswith("gz") or path.endswith("bz2")
    if binary:
        embeddings = Word2Vec.load_word2vec_format(path, binary=True)
    else:
        embeddings = Word2Vec.load_word2vec_format(path, binary=False)
    esize = _get_embedding_size(embeddings)
    return embeddings, esize
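The binary flag above uses compression (.gz/.bz2) as a proxy for the binary word2vec format, which misreads a gzipped text model. A stricter heuristic is sketched below; the .bin naming convention is an assumption, not something the original code guarantees:

def infer_binary(path):
    # Strip any compression suffix first, then decide by the underlying extension:
    # only a .bin payload is treated as the binary word2vec format.
    for ext in ('.gz', '.bz2'):
        if path.endswith(ext):
            path = path[:-len(ext)]
    return path.endswith('.bin')

# e.g. Word2Vec.load_word2vec_format(path, binary=infer_binary(path))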
Example No. 6
    def set_embedding_weights(self, embedding_init):
        # load embedding with gensim
        from gensim.models import Word2Vec
        try:
            m = Word2Vec.load_word2vec_format(embedding_init, binary=False)
            edim = m.layer1_size
        except UnicodeDecodeError:
            try:
                m = Word2Vec.load_word2vec_format(embedding_init, binary=True)
                edim = m.layer1_size
            except UnicodeDecodeError:
                # not in word2vec format
                m = Word2Vec.load(embedding_init)
                edim = m.layer1_size
        except ValueError:
            # glove model
            m = {}
            if embedding_init.endswith('gz'):
                fp = gzip.open(embedding_init)
            else:
                fp = open(embedding_init)
            for l in fp:
                le = l.split()
                m[le[0].decode('utf-8')] = numpy.array(
                    [float(e) for e in le[1:]], dtype=theano.config.floatX)
                edim = len(le) - 1

        if edim != self.edim:
            raise Exception("Embedding dim and edim doesn't match")
        m_lower = {}
        vocab = (m.vocab if hasattr(m, 'vocab') else m)
        for k in vocab:
            if k in ['UNKNOWN', 'PADDING']:
                continue
            if self.num:
                m_lower[replace_numerals(k.lower())] = m[k]
            else:
                m_lower[k.lower()] = m[k]
        # transform weight matrix with using self.w2i
        params = numpy.zeros(
            self.tagger.layers[0].layers[0].get_param_vector().shape,
            dtype=theano.config.floatX)
        e = self.edim
        for w in self.w2i:
            if w in m_lower:
                v = m_lower[w]
                i = self.w2i[w]
                params[i * e:(i + 1) * e] = v
        if 'UNKNOWN' in vocab:
            params[-1 * e:] = vocab['UNKNOWN']
        if 'PADDING' in vocab:
            params[-2 * e:-1 * e] = vocab['PADDING']
        self.tagger.layers[0].layers[0].set_param_vector(params)
Example No. 7
 def load_model(model_fn, model_type):
     logging.info('Loading model: {0}'.format(model_fn))
     if model_type == 'word2vec':
         model = Word2Vec.load_word2vec_format(model_fn, binary=True)
     elif model_type == 'word2vec_txt':
         model = Word2Vec.load_word2vec_format(model_fn, binary=False)
     elif model_type == 'gensim':
         model = Word2Vec.load(model_fn)
     else:
         raise Exception('Unknown model format')
     logging.info('Model loaded: {0}'.format(model_fn))
     return model
def read_glove_model(dim=50, huge=False):
    print "reading gloVe word embedding vectors..."
    if dim == 50:
        return Word2Vec.load_word2vec_format(glove_vector_50, binary=False)
    elif dim == 100:
        return Word2Vec.load_word2vec_format(glove_vector_100, binary=False)
    elif dim == 200:
        return Word2Vec.load_word2vec_format(glove_vector_200, binary=False)
    elif dim == 300:
        return Word2Vec.load_word2vec_format(glove_vector_300, binary=False)
    elif huge:
        return read_glove_to_dict(glove_vector_huge)
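Raw GloVe text files carry no "<vocab_size> <dim>" header line, so the calls above assume the glove_vector_* files were converted to word2vec format beforehand. Two common workarounds, sketched with placeholder paths:

from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# (a) gensim >= 4.0 can read headerless GloVe text directly:
glove = KeyedVectors.load_word2vec_format('glove.6B.50d.txt', binary=False, no_header=True)

# (b) older gensim: convert once, then load the converted copy:
glove2word2vec('glove.6B.50d.txt', 'glove.6B.50d.w2v.txt')
glove = KeyedVectors.load_word2vec_format('glove.6B.50d.w2v.txt', binary=False)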
Example No. 9
    def set_embedding_weights(self, embedding_init):
        # load embedding with gensim
        from gensim.models import Word2Vec
        try:
            m = Word2Vec.load_word2vec_format(embedding_init, binary=False)
            edim = m.layer1_size
        except UnicodeDecodeError:
            try:
                m = Word2Vec.load_word2vec_format(embedding_init, binary=True)
                edim = m.layer1_size
            except UnicodeDecodeError:
                # not in word2vec format
                m = Word2Vec.load(embedding_init)
                edim = m.layer1_size
        except ValueError:
            # glove model
            m = {}
            if embedding_init.endswith('gz'):
                fp = gzip.open(embedding_init)
            else:
                fp = open(embedding_init)
            for l in fp:
                le = l.split()
                m[le[0].decode('utf-8')] = numpy.array(
                    [float(e) for e in le[1:]], dtype=theano.config.floatX)
                edim = len(le) - 1

        if edim != self.edim:
            raise Exception("Embedding dim and edim doesn't match")
        m_lower = {}
        vocab = (m.vocab if hasattr(m, 'vocab') else m)
        for k in vocab:
            if k in ['UNKNOWN', 'PADDING']:
                continue
            if self.num:
                m_lower[replace_numerals(k.lower())] = m[k]
            else:
                m_lower[k.lower()] = m[k]
        # transform weight matrix with using self.w2i
        params = numpy.zeros(
            self.tagger.layers[0].layers[0].get_param_vector().shape, dtype=theano.config.floatX)
        e = self.edim
        for w in self.w2i:
            if w in m_lower:
                v = m_lower[w]
                i = self.w2i[w]
                params[i*e:(i+1)*e] = v
        if 'UNKNOWN' in vocab:
            params[-1*e:] = vocab['UNKNOWN']
        if 'PADDING' in vocab:
            params[-2*e:-1*e] = vocab['PADDING']
        self.tagger.layers[0].layers[0].set_param_vector(params)
Example No. 10
def generate_cnn_train_test(char_name, word_name):
    cnn_vec_dir = 'cnn_vec'
    ensure_path(cnn_vec_dir)

    char_embedding = Word2Vec.load_word2vec_format(char_name, binary=False)
    word_embedding = Word2Vec.load_word2vec_format(word_name, binary=False)
    train_doc, train_label = load_train()
    # test_doc, test_label = load_test()
    train_vec_file = cnn_vec_dir + '/' + 'train.txt'
    # test_vec_file = cnn_vec_dir + '/' + 'test.txt'
    generate_cnn_vec(char_embedding, word_embedding, train_doc, train_label,
                     train_vec_file)
    print 'generate cnn train feature ok'
Example No. 11
def load_matrix_and_dictionary(fn, typ, dict_fn=None, filt_dict=None):
    if typ == 'numpy':
        return np.load(fn), load_dictionary_as_dict(dict_fn)
    elif typ == 'glove':
        from glove import Glove
        m = Glove().load_stanford(fn)
        return m.word_vectors, m.dictionary
    elif typ == 'word2vec':
        from gensim.models import Word2Vec
        if 'txt' in fn or 'w2v' in fn:
            m = Word2Vec.load_word2vec_format(fn, binary=False)
        else:
            m = Word2Vec.load_word2vec_format(fn, binary=True)
        return extract_wordvec_matrix_and_dict(m, filt_dict)
    raise Exception('Unknown matrix format: {}'.format(typ))
Example No. 12
def setup_w2v(word2vec_model, country_names_json):
    ''' Given the path to a word2vec model and a JSON file containing country
    names and codes, setup the indices and vocabulary for geocoding.'''
    prebuilt = Word2Vec.load_word2vec_format(word2vec_model, binary=True)
    vocab_set = set(prebuilt.vocab.keys())
    with open(country_names_json) as f:
        stopword_country_names = json.load(f)
    countries = stopword_country_names.keys()
    idx_country_mapping = {}
    index = numpy.empty(shape=(len(countries), 300), dtype='float64')
    for idx, country in enumerate(countries):
        country = unidecode(country)
        try:
            # fill this row only when the country name is in the embedding vocabulary
            index[idx] = prebuilt[country]
        except KeyError:
            pass
        try:
            idx_country_mapping[idx] = stopword_country_names[country]
        except KeyError:
            pass
    return {
        'prebuilt': prebuilt,
        'vocab_set': vocab_set,
        'index': index,
        'idx_country_mapping': idx_country_mapping
    }
Example No. 13
def __filter_w2v_model(filename, words_to_remove, num_to_keep):
    """Filters the words in the Spanish model, removing all the words in the given list and returning the top x words

    :param filename: The name of the file to read the words in from
    :param words_to_remove: A list of all the words to get rid of
    :param num_to_keep: The number of words to keep
    """
    good_words = list()

    with open(filename, 'r') as f:
        for line in f:
            # keep a line only if it does not start with any of the words to remove
            if not any(line.startswith(word) for word in words_to_remove):
                good_words.append(line)

    random.shuffle(good_words)

    kept_words = good_words[:num_to_keep]

    with open('tempmodel', 'w') as f:
        for word in kept_words:
            # each kept line already ends with a newline read from the source file
            f.write(word)

    return Word2Vec.load_word2vec_format('tempmodel')
Example No. 14
def tokens_to_word2vec(tokens, model):
    if model == 'word2vec':
        model = Word2Vec.load_word2vec_format(
            os.path.join(DIRNAME, '../word2vec/GoogleNews-vectors-negative300.bin'), binary=True)
    elif model == 'glove':
        word_to_vector_glove = {}
        tokens_glove = set(tokens)
        #with open(os.path.join(DIRNAME, '../glove/glove.6B/glove.6B.300d.txt'), 'r') as f:
        with open(os.path.join(DIRNAME, '../glove/glove.42B.300d.txt'), 'r') as f:
            for line in f:
                split_index = line.index(' ')
                word = line[:split_index]
                vector = np.fromstring(line[split_index+1:], dtype=float, sep=' ')
                assert len(vector)==300
                if word == '.':
                    word = '</s>'
                if word in tokens_glove:
                    word_to_vector_glove[word] = vector
        return word_to_vector_glove
    word_to_vector = {}
    for word in tokens:
        try:
            arr = model[word]
        except KeyError:
            continue
        word_to_vector[word] = arr
    return word_to_vector
Example No. 15
def compute_pair_similarity(benchmark_file, embedding_file, binary_embedding=True):
    logger = logging.getLogger()

    logger.info('Loading embeddings from {}...'.format(embedding_file))
    embedding_model = Word2Vec.load_word2vec_format(embedding_file, binary=binary_embedding)

    pair_similarities = {}
    with open(benchmark_file) as bf:
        for line in bf:
            line = line.strip()
            if line:
                pair = tuple(line.split(','))
                term_1, term_2 = pair
                if term_1 in embedding_model and term_2 in embedding_model:
                    sim_score = float(embedding_model.similarity(term_1, term_2))
                    pair_similarities[pair] = sim_score

    accuracy = []
    for threshold in arange(0.0, 1.1, 0.1):
        similar_pair_count = 0
        for pair in pair_similarities:
            if pair_similarities[pair] >= threshold:
                similar_pair_count += 1
        accuracy.append(float(similar_pair_count) / len(pair_similarities))

    logger.info('Accuracy: {}'.format(accuracy))
    return accuracy
Example No. 16
    def populate_entity(self, path_vec, path_entity, prod_model=True):
        self.path_vec = path_vec
        self.path_entity = path_entity
        self.prod_model = prod_model
        self.entity_model = Word2Vec.load_word2vec_format(path_vec)

        self.entity2idx = {}
        self.idx2entity = OrderedDict()

        f = open(path_entity, "r")
        for line in f:
            if self.method == "LDA":
                entity = line[0:-1]
                idx = len(self.entity2idx)

                if entity not in self.entity2idx:
                    self.entity2idx[entity] = idx
                    self.idx2entity[idx] = entity
                else:
                    print("dup?")
            else:
                entity = line[0:line.rindex("_")]
                idx = int(line[1 + line.rindex("_"):])

                self.entity2idx[entity] = idx
                self.idx2entity[idx] = entity
Example No. 17
 def __init__(self, model_file: str) -> None:
     if model_file.endswith(".bin"):
         self.model = Word2Vec.load_word2vec_format(model_file, binary=True)
     elif model_file.endswith(".model"):
         self.model = api.load(model_file[:-6])
     else:
         self.model = Word2Vec.load(model_file)
Example No. 18
def run():
    w2v = True
    l1 = 1
    l2 = 1e-3
    iters = 200
    wiki = True
    words_before = 4
    words_after = 4
    shallow_parse = True

    try:
        opts, args = getopt.getopt(sys.argv[1:], 'w:i:c:l:', ['w2v=', 'iters=', 'l1=', 'l2=', 'wiki=',
                                                              'words_before=', 'words_after=', 'shallow_parse='])
    except getopt.GetoptError as e:
        print(e)
        sys.exit(2)

    for opt, arg in opts:
        if opt in ('-w', '--w2v'):
            option = int(arg)

            if option == 1:
                w2v = True
        elif opt in ('-i', '--iters'):
            iters = int(arg)
        elif opt in ('-c', '--l1'):
            l1 = float(arg)
        elif opt in ('-l', '--l2'):
            l2 = float(arg)
        elif opt == '--wiki':
            option = int(arg)

            if option == 0:
                wiki = False
        elif opt == '--words_before':
            words_before = int(arg)
        elif opt == '--words_after':
            words_after = int(arg)
        elif opt == '--shallow_parse':
            option = int(arg)

            if option == 0:
                shallow_parse = False

        else:
            sys.exit(2)
    if w2v:
        print('Loading word2vec model...')

        if wiki:
            word2vec_model = 'wikipedia-pubmed-and-PMC-w2v.bin'
        else:
            word2vec_model = 'PubMed-w2v.bin'
        w2v = Word2Vec.load_word2vec_format(word2vec_model, binary=True)
        print('Loaded word2vec model')
    else:
        w2v = None
    run_crf(w2v, words_before, words_after, shallow_parse)
Example No. 19
def compare(dataset, model_name, pre_model_name):

    # build model
    if(os.path.isfile(model_name)):
        model = Word2Vec.load(model_name)
        logger.debug("model %s already exist, stop training wordvector", model_name)
    else:
        logger.info("start trainning word vector")
        start_time = timeit.default_timer()
        model = wordvector.build_word_vector(dataset, save=True, save_file=model_name)
        logger.info("model %s trained in %.4lfs", model_name, timeit.default_timer() - start_time)

    # find most similar words:
    for word in keywords:
        print word
        print model.most_similar(word, topn=10);

    # load pre-trained google news model
    logger.info("start loading pre-trained dataset")
    start_time = timeit.default_timer()
    pre_model = Word2Vec.load_word2vec_format(pre_model_name, binary=True)
    logger.info("pre-trained dataset loaded in %.4lfs", timeit.default_timer() - start_time)

    # find most similar words:
    for word in keywords:
        print word
        print pre_model.most_similar(word, topn=10);
Example No. 20
File: tp.py Project: uclnlp/ntp
 def get_model(self):
     if self.word2vec is None:
         print("Loading word2vec...")
         self.word2vec = Word2Vec.load_word2vec_format(self.path,
                                                       binary=True)
         print("Done!")
     return self.word2vec
Example No. 21
    def populate_entity(self, path_vec, path_entity, prod_model=True):
        self.path_vec = path_vec
        self.path_entity = path_entity
        self.prod_model = prod_model
        self.entity_model = Word2Vec.load_word2vec_format(path_vec)

        self.entity2idx = {}
        self.idx2entity = OrderedDict()

        f = open(path_entity, "r")
        for line in f:
            if self.method == "LDA":
                entity = line[0:-1]
                idx = len(self.entity2idx)

                if entity not in self.entity2idx:
                    self.entity2idx[entity] = idx
                    self.idx2entity[idx] = entity
                else:
                    print("dup?")
            else:
                entity = line[0:line.rindex("_")]
                idx = int(line[1 + line.rindex("_"):])

                self.entity2idx[entity] = idx
                self.idx2entity[idx] = entity
Example No. 22
def main():
    print "Loading word2vec"
    global word2vec
    word2vec = Word2Vec.load_word2vec_format(sys.argv[2], binary=True)
    tagger = load("taggers/maxent_treebank_pos_tagger/english.pickle")
    f_sentences = codecs.open(sys.argv[1], encoding="utf-8")
    invalid = list()
    valid = list()
    on = False
    for line in f_sentences:
        if line.startswith("#"):
            continue
        if line.startswith("VALID"):
            on = True
            continue
        sentence = Sentence(line.strip(), "ORG", "LOC", 6, 1, 2, tagger)
        for rel in sentence.relationships:
            t = Tuple(rel.e1, rel.e2, rel.sentence, rel.before, rel.between, rel.after)
            if on is True:
                valid.append(t)
            elif on is False:
                invalid.append(t)
    f_sentences.close()

    for v in valid:
        for i in invalid:
            score = similarity_3_contexts(v, i)
            print "VALID", v.e1, v.e2, "\t", v.bet_words
            print "INVALID", i.e1, i.e2, "\t", i.bet_words
            print score
Example No. 23
def init_word2vec():
    global WORD2VEC_MODEL
    if not WORD2VEC_MODEL:
        print 'loading word2vec model'
        from gensim.models import Word2Vec
        WORD2VEC_MODEL = Word2Vec.load_word2vec_format('model/GoogleNews-vectors-negative300.bin', binary=True)
        print 'loading word2vec model [done]'
Example No. 24
 def __init__(self, config, deterministic=False, *args, **kwargs):
     self.config = config
     self.deterministic = deterministic
     self.word2vec_model = Word2Vec.load_word2vec_format(config.get('word2vec_model'), binary=True)
     self.word2vec_keys = { word.lower(): word for word in self.word2vec_model.vocab }
     self.thesarus = data.get_thesarus(config.get('thesarus'))
     super(SharedIterator, self).__init__(*args, **kwargs)
def main():
    topic_files = make_topic_map("./topic_sentences/selected_topics.txt",
                                 "./data/")

    topic_out_files = make_topic_map("./topic_sentences/selected_topics.txt",
                                     "./topic_sentences/",
                                     write=True)

    num_topics = len(topic_files)

    print "loading files"
    sentence_map = {
        topic: read_data_file(topic_file)
        for topic, topic_file in topic_files.items()
    }

    print "loading word2vec"
    word2vec = Word2Vec.load_word2vec_format(word2vec_filepath, binary=True)

    for topic in topic_files:
        sentences, _ = sentence_map[topic]
        sim_scores = compute_sim(sentences, topic, word2vec)
        outfile = topic_out_files[topic]
        count = 0
        for i, sentence in enumerate(sentences):
            if (sim_scores[i] > .5):
                outfile.write(sentence.encode('utf8', 'replace') + "\n")
                count += 1
        print "count for topic: ", topic, " is ", count
Example No. 26
def evaluate_google():
    # see https://code.google.com/archive/p/word2vec/
    # load pretrained google embeddings and test
    from gensim.models import Word2Vec
    model_google = Word2Vec.load_word2vec_format(
        'data/GoogleNews-vectors-negative300.bin.gz', binary=True)
    _ = accuracy(model_google, "data/questions-words.txt", False)
Example No. 27
def read_test(data_file_name):
    word_name = '/home/niyao/zhaolei/ZhiHu/data/word_embedding.txt'
    word_embedding = Word2Vec.load_word2vec_format(word_name, binary=False)

    x_text = []
    reader = pd.read_table(data_file_name, sep='\t', header=None)
    for i in xrange(reader.shape[0]):
        x_text.append(reader.iloc[i][0])
    # print len(x_text)

    max_document_length = max([len(x.split(',')) for x in x_text])
    print 'the max document length of test is %d' % max_document_length

    j = 0
    x = []
    for features in x_text:
        xi = []
        for id in features.split(','):
            if id in word_embedding:
                xi.append(word_embedding[id])
        for i in xrange(len(xi), max_document_length):
            xi.append(np.zeros(256))
        x.append(xi)
        j += 1
        if j % 1000 == 0:
            print 'load data %d' % j

    x = np.array(x).astype(np.float32).reshape(217360,
                                               max_document_length * 256)
    print x.shape
    return x, max_document_length
Example No. 28
def load_CUI_vectors():
    ''' 
    From De Vine et al., CIKM 2014
    https://github.com/clinicalml/embeddings
    '''
    m = Word2Vec.load_word2vec_format("DeVine_etal_200.txt.gz")
    return m 
def get_word2vec(
        train_fn="data/rap/input.txt",
        saved_model_fn="save/save/GoogleNews-vectors-negative300.bin"):
    try:
        print "loading word2vec model at {0}".format(saved_model_fn)
        model = Word2Vec.load_word2vec_format(saved_model_fn, binary=True)
        print "model loaded"
        return model
    except IOError:
        print "no word2vec model found at {0}".format(saved_model_fn)
        with open(train_fn) as f:
            data = f.read()
            clean = TextLoader.clean_str(data)
            lines = [line.split(" ") for line in clean.split('\n')]
            full_data = brown.sents() + movie_reviews.sents() + treebank.sents(
            ) + lines
            print "training word2vec model"
            model = Word2Vec(workers=8)
            model.build_vocab(full_data)
            for i in xrange(0, 5):
                print "epoch " + str(i + 1)
                # full_data = shuffle(full_data)
                pb = ProgressBar(maxval=len(full_data))
                chunk_size = len(full_data) / 100
                j = 0
                pb.start()
                while j + chunk_size < len(full_data):
                    model.train(full_data[j:j + chunk_size])
                    j += chunk_size
                    pb.update(j)

            print "done training"
            model.save(saved_model_fn)
            return model
Example No. 30
def word_2_vec():
	csv_paths = ['set1.csv','set2.csv','combined.csv']
	model = Word2Vec.load_word2vec_format('/root/libanghuai/homework/GoogleNews-vectors-negative300.bin', binary=True)
	for csv_path in csv_paths:
		print "deal_with %s \n" % csv_path
		out_file_name = "word2vec_result_"+csv_path
		wordpairs = list(csv_parser(csv_path))
		wordpairs = cal_rank(wordpairs,3)
		ans_list=[]
		for wordpair in wordpairs:
			fst_word = wordpair[1]
			sec_word = wordpair[2]
			max_sim = model.similarity(fst_word,sec_word)
			wordpair.append(max_sim)
			ans_list.append(wordpair)

		ans_list = cal_rank(ans_list,5)
		num = 0
		sum_gap = 0
		for line in ans_list:
			num += 1
			sum_gap += (line[4] - line[6])*(line[4] - line[6])
		print num
		output_file(out_file_name,ans_list)
		print (1-sum_gap*6.0/(num*(num*num - 1)))
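The closing print evaluates Spearman's rank correlation, rho = 1 - 6*sum(d^2)/(n*(n^2 - 1)), between the human ranks and the model-similarity ranks. A quick cross-check with scipy, assuming line[4] and line[6] hold the two ranks produced by cal_rank:

from scipy.stats import spearmanr

def spearman_check(ans_list):
    # Compare the hand-rolled formula against scipy's implementation.
    human_ranks = [line[4] for line in ans_list]
    model_ranks = [line[6] for line in ans_list]
    rho, _ = spearmanr(human_ranks, model_ranks)
    return rho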
Example No. 31
    def __init__(self,
                 model_name,
                 glove=False,
                 binary=True,
                 dims=300,
                 models_path="F:\\wiki"):
        """
        Konstruktor wrappera modeli wektorowych
        :param model_name: nazwa pliku modeli
        :param glove: flaga czy model jest w formacie glove czy word2vec
        :param binary: flaga czy model w2v jest w formacie binarnym czy tekstowym
        :param dims: liczba wymiarów wektorów w modelu
        """
        self._model_name = model_name
        self._model_path = os.path.join(models_path, model_name)

        self._glove = glove
        self._binary = binary
        self.dims = dims

        if self._glove:
            self._model = VectorModelWrap.load_stf(self._model_path, self.dims)
        else:
            self._model = Word2Vec.load_word2vec_format(self._model_path, binary=True) if self._binary else\
                Word2Vec.load(self._model_path)
Example No. 32
 def __init__(self, model_path, model_type='fasttext', **kwarg):
     if model_type == "fasttext":
         self._model = FastText.load_fasttext_format(model_path)
     elif model_type == "word2vec":
         self._model = Word2Vec.load_word2vec_format(model_path)
     else:
         raise NotImplementedError("other model is not supported")
Example No. 33
def load_word2vec_model(model):
    """

    """
    embed_data_path = "../data/embed_dat"
    embed_vocab_path = "../data/embed_vocab"
    vector_model_path = "../data/user_vector"

    if os.path.exists(embed_data_path):
        os.remove(embed_data_path)

    if os.path.exists(embed_vocab_path):
        os.remove(embed_vocab_path)

    if not os.path.exists(embed_data_path):
        print("Caching word embeddings in memmapped format...")

        wv = Word2Vec.load_word2vec_format(vector_model_path,  binary=True)

        print "wv syn0norm shape : " + str(wv.syn0norm.shape)
        fp = np.memmap(embed_data_path, dtype= np.double, mode='w+', shape=wv.syn0norm.shape)
        fp[:] = wv.syn0norm[:]
        with open(embed_vocab_path, "w") as f:
            for _, w in sorted((voc.index, word) for word, voc in wv.vocab.items()):
                f.write(w + "\n")

        del fp, wv
    def generate_word2vec_for_all(self):

        print 20 * "*"
        print "GENERATING NETWORK READY FILES."

        model = Word2Vec.load_word2vec_format(self.word2vec_path, binary=True)
        # model = ""
        for folder in self.folder_list:
            l_dir = os.path.join(self.raw_text_path, folder)
            print("Folder : ", folder)
            op_l_dir = os.path.join(self.op_path, folder)
            if not os.path.exists(op_l_dir):
                os.makedirs(op_l_dir)
            questions_dir = self.get_list_of_dirs(l_dir)
            for question_dir in questions_dir:
                file_list = self.get_list_of_files(
                    os.path.join(l_dir, question_dir))
                if not os.path.exists(os.path.join(op_l_dir, question_dir)):
                    os.makedirs(os.path.join(op_l_dir, question_dir))
                print("Question : ", question_dir)
                for fname in file_list:
                    with open(os.path.join(l_dir, question_dir, fname),
                              "r") as f:
                        if fname == 'support.txt':
                            is_closest_para_file = True
                            try:
                                text = f.readlines()[0]
                                raw_data_content = ""
                                count = 0
                                for s in sent_tokenize(text):
                                    if len(s.split()
                                           ) > self.num_of_words_in_sent:
                                        raw_data_content += " ".join(
                                            s.split()
                                            [:self.num_of_words_in_sent])
                                        raw_data_content += ". "
                                    else:
                                        raw_data_content += " ".join(s.split())
                                        raw_data_content += " "
                                    count += 1
                                    if count == self.num_of_sents_in_closest_para:
                                        break
                            except:
                                raw_data_content = f.readlines()
                        else:
                            is_closest_para_file = False
                            raw_data_content = f.readlines()
                    f = open(
                        os.path.join(op_l_dir, question_dir,
                                     fname[:-4] + ".pkl"), "w")
                    self.write_vecs_to_file(model, raw_data_content, f,
                                            is_closest_para_file)
                    f.close()
            print 20 * "***"
        print "saving final unknown word2vec dictionary to file"
        f = open(
            os.path.join(self.common_files_path,
                         self.unknown_words_vec_dict_file), "wb")
        pickle.dump(self.unknown_words_vec_dict, f)
        f.close()
Example No. 35
def load_CUI_vectors():
    ''' 
    From De Vine et al., CIKM 2014
    https://github.com/clinicalml/embeddings
    '''
    m = Word2Vec.load_word2vec_format("DeVine_etal_200.txt.gz")
    return m 
	def __init__(self):
		'''initialize'''

		self.s_window 	= 5
		self.w2v_dim 	= 200
		self.nb_classes = 4

		self.label_id_dict    = {u'S': 0, u'B': 1, u'M': 2, u'E': 3}
		self.train_data_file  = "data/msr_training_taged"
		self.w2v_model_file   = "data/msr_training_single_word.w2v.bin"
		self.model_hdf5_file  = "pkl/w2v-word-segment.model"
		self.loss_history	  = "pkl/w2v-loss.png"
		self.check_point_file = "pkl/weights-{epoch:03d}.hdf5"

		self.NUM_LIST  = [str(i) for i in range(10)]
		self.ENG_LIST  = [i for i in list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")]

		self.w2v_model = Word2Vec.load_word2vec_format(
			self.w2v_model_file,
			binary=True,
			unicode_errors='ignore'
		)

		self.lstm_model = self.create_model(self.s_window,self.w2v_dim, self.nb_classes)

		# if os.name=="nt":os.system("cls")
		# else:os.system("clear")

		self.train_model()
Example No. 37
def load(filename):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    if filename[-6:] == "bin.gz":
        model = Word2Vec.load_word2vec_format(filename, binary=True)
    else:
        model = Word2Vec.load(filename)
    return model
Example No. 38
    def wordvec_sim(self, write_flag=True):
        print 'load wordvec model:%s/%s' % (macro.DICT_DIR, self.w2v_model_file)
        w2v_model = Word2Vec.load_word2vec_format(r'%s/%s' % (macro.DICT_DIR, self.w2v_model_file),
                                                  binary=True)  # C format
        auto_sim_list = []
        for w1, w2, manu_sim in zip(self.word1_list, self.word2_list, self.manu_sim_list):
            try:
                auto_sim = w2v_model.similarity(w1, w2)  # scale the cosine similarity to a 1-10 score
                if auto_sim <= 0:
                    auto_sim = 1.0
                else:
                    auto_sim = auto_sim * 9 + 1
                    # print '%-10s\t%-10s\t%-10s\t%-10s' % (w1, w2, manu_sim, auto_sim)
            except:
                auto_sim = 1  # minimum possible cosine score
                print '%-10s\t%-10s\t%-10s\t%-10s' % (w1, w2, manu_sim, '______Not Found______')
            auto_sim_list.append(auto_sim)

        for w1, w2, manu_sim, auto_sim in zip(self.word1_list, self.word2_list, self.manu_sim_list, auto_sim_list):
            print '%-10s\t%-10s\t%-10s\t%-10s' % (w1, w2, manu_sim, auto_sim)

        if write_flag:
            print 'write result to file...'
            with open('%s/%s' % (macro.RESULTS_DIR, self.ofname), 'w') as fw:
                fw.write(self.headline.strip() + '\tauto_sim_score\n')
                for w1, w2, manu_sim, auto_sim in zip(self.word1_list, self.word2_list, self.manu_sim_list,
                                                      auto_sim_list):
                    fw.write('%s\t%s\t%s\t%s\n' % (w1, w2, manu_sim, auto_sim))

        return self.word1_list, self.word2_list, self.manu_sim_list, auto_sim_list, self.headline
Example No. 39
def load_model(model_file_name):
    w2v_model = Word2Vec.load_word2vec_format(model_file_name, binary=True)
    # info('loaded {}'.format(model_name))
    w2v_model.init_sims(replace=True)  # to save memory
    vocab, vector_dim = w2v_model.syn0.shape
    # info('The model shape: {} {} (Vocabulary, dimension)'.format(vocab, vector_dim))
    return w2v_model, vector_dim
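On gensim >= 4 the internals this example reads were renamed (syn0 became vectors, index2word became index_to_key, vocab became key_to_index) and init_sims is deprecated. A sketch of the same loader under that assumption:

from gensim.models import KeyedVectors

def load_model_gensim4(model_file_name):
    # gensim >= 4: load_word2vec_format lives on KeyedVectors and the vector
    # matrix is exposed as .vectors rather than .syn0
    wv = KeyedVectors.load_word2vec_format(model_file_name, binary=True)
    vocab_size, vector_dim = wv.vectors.shape
    return wv, vector_dim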
Example No. 40
def find_nearest_neighbors(vector_inpath, max_n, wordlist):
    """
	Find the nearest neighbors for a list of words based on their word embeddings.

	Args:
		vector_inpath (str): Path to vector file. File has to have the following format (separated by spaces):
			<index of original vector #1> <index of original vector #2> <Dimension 1> ... <Dimension n>
		max_n (int): Number of nearest neighbors that should be determined.
		wordlist (list): List of words nearest neighbors should be found for.
	"""
    print "Loading vectors...."
    model = w2v.load_word2vec_format(vector_inpath, binary=False)
    print wordlist

    # Find nearest neighbors
    for word in wordlist:
        most_similar_with_score = model.most_similar(positive=[word],
                                                     topn=max_n)
        for v in most_similar_with_score:
            print v
        most_similar_words = [pair[0] for pair in most_similar_with_score
                              ]  # Only use words, not scores

        # Print results
        print u"%i most similar words of %s in dataset %s" % (max_n, word,
                                                              vector_inpath)
        for i in range(len(most_similar_words)):
            print u"%i: %s" % (i + 1, most_similar_words[i])
Example No. 41
def __filter_w2v_model(filename, words_to_remove, num_to_keep):
    """Filters the words in the Spanish model, removing all the words in the given list and returning the top x words

    :param filename: The name of the file to read the words in from
    :param words_to_remove: A list of all the words to get rid of
    :param num_to_keep: The number of words to keep
    """
    good_words = list()

    with open(filename, 'r') as f:
        for line in f:
            # keep a line only if it does not start with any of the words to remove
            if not any(line.startswith(word) for word in words_to_remove):
                good_words.append(line)

    random.shuffle(good_words)

    kept_words = good_words[:num_to_keep]

    with open('tempmodel', 'w') as f:
        for word in kept_words:
            # each kept line already ends with a newline read from the source file
            f.write(word)

    return Word2Vec.load_word2vec_format('tempmodel')
Example No. 42
    def __init__(self, vec_file='models/GoogleNews-vectors-negative300.bin', binary=True):
        """

        :param vec_file: the file storing vectors
        :param binary: if vector are stored in binary. Google news use binary while yelp not
        """
        self._wordvec = Word2Vec.load_word2vec_format(FileIO.filename(vec_file), binary=binary)
Example No. 43
def LoadModel(MakeNew=False, useWiki=False):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    typeOfModel = "wiki" if useWiki else "text8"

    if not MakeNew:
        if os.path.isfile("word2vec/" + typeOfModel + ".model"):
            print("Using " + typeOfModel + ".model file")
            model = Word2Vec.load("word2vec/" + typeOfModel + ".model")
            return model

        if os.path.isfile("word2vec/" + typeOfModel + ".bin"):
            print("Using " + typeOfModel + ".bin file")
            model = Word2Vec.load_word2vec_format(
                "word2vec/" + typeOfModel + ".bin",
                binary=True)  # C binary format
            return model

    print("Generating new model. This may take some time")
    sentences = gensim.models.word2vec.Text8Corpus('word2vec/text8')
    model = Word2Vec(sentences, size=200, workers=4)
    print("Saving model as text8.model")
    model.save('word2vec/text8.model')
    return model
Example No. 44
    def create_lang_similarty_data(self):
        model = Word2Vec.load_word2vec_format(fp.w2vfilepath)
        lang_list = [
            'java', 'python', 'matlab', 'html', 'c++', 'c', 'mysql',
            'javascript', 'sql'
        ]
        '''
        for lang_o,lang_i in zip(lang_list,lang_list):
            for lan_i in lang_list:
                sim = model.similarity(lang_o,lan_i)
                print(lang_o,lan_i,sim)
        '''

        with open(fp.language_sim, 'w') as mycsvfile:
            temp_list = lang_list[:]
            temp_list.insert(0, " ")
            print(temp_list)
            datawriter = csv.writer(mycsvfile)
            datawriter.writerow(temp_list)
            for lang_o in lang_list:
                lang_sim = [
                    "{0:.2f}".format(abs(model.similarity(lang_o, lang)))
                    for lang in lang_list
                ]
                lang_sim.insert(0, lang_o)
                print(lang_sim)
                datawriter.writerow(lang_sim)
Example No. 45
def used_model_m():
    """测试一个词的词向量"""
    model = Word2Vec.load_word2vec_format(
        'data/GoogleNews-vectors-negative300.bin', binary=True)
    # model =Word2Vec.load('temp/temp.bin')
    b = model['spilt']
    print b
Example No. 46
def main():
    analogies_name = config.analogies_name
    model_name = config.model_name

    print("[%i Vocab]" % (config.restrict_vocab_nb or -1))
    print("Analogies: %s | Model: %s" % (analogies_name, model_name))

    start = now()
    analogies = read_analogies(analogies_name)
    analogies = prepare_analogies(analogies)

    model = Word2Vec.load_word2vec_format(model_name,
                                          binary=config.is_model_binary)
    model.init_sims(replace=True)
    model_loaded = now()
    print("Model loaded for [%s]" % str(model_loaded - start))

    correct_guesses = perform_experiment(model, analogies=analogies)
    end = now()

    result_ratio, result_percentage = experiment_result_str(
        correct_guesses, analogies)
    print("Correct number of predictions out of all predictions %s [%i%%]" %
          (result_ratio, result_percentage))
    print(
        "It took %s to load the model. After that, it took %s to perform the check"
        %
        (delta_to_str(model_loaded - start), delta_to_str(end - model_loaded)))
def vectors_to_pickled_dict(desired_words, output_file, norm = True, filename = __BIN_FILE_):
    print("Loading Model")
    model = Word2Vec.load_word2vec_format(filename, binary=True)
    print("Loaded")

    wd2vec = dict()
    if desired_words:
        desired_words = set(desired_words)
    else:
        desired_words = model.vocab.keys()

    for i, wd in enumerate(desired_words):
        if i % 1000 == 0:
            print(i)

        wd = remove_non_ascii(wd).replace("  "," ").strip()
        # for phrases
        wd_key = wd.replace(" ", "_")
        if wd_key in model.vocab:
            ix = model.vocab[wd_key].index
            vector = model.syn0norm[ix] if norm else model.syn0[ix]
            wd2vec[wd.replace("_"," ").strip()] = vector

    with open(output_file, "w+") as f:
        Pickle.dump(wd2vec, f)
Example No. 48
def main():
    model = Word2Vec.load_word2vec_format('comments.bin', binary=True)
    badword_list = json.load(open('badword_list.json'))
    vocabulary = json.load(open('vocabulary.json'))
    badwords = []

    for badword in badword_list:
        for word in vocabulary:
            d = distance(badword, word)
            r = ratio(badword, word)
            if d < 2 and r > 0.8:
                badwords.append(word)
                #print(badword + " = " + word + " | Distance: " + str(d) + " Ratio:" + str(r))

    similarities = {}

    for word1 in badwords:
        biggest = 0
        for word2 in vocabulary:
            if word1 != word2:
                try:
                    s = model.similarity(word1,word2)
                    if s > biggest:
                        similarities[word1] = (word2, s)
                        biggest = s
                except:
                    pass

    for word in similarities:
        print(word + ": " + str(similarities[word]))
Example No. 49
def initialize(fword, tword, modelfn, start, debug):
    juman = Juman()
    # parse and check from_word
    ms_f = juman.analysis(fword).mrph_list()
    if len(ms_f) > 1:
        print(u'{} is parsed multiple words'.format(fword))
        exit(1)
    wm_f = ms_f[0]
    if not wm_f.repname:
        print(u'no repname with {}'.format(fword))
        exit(1)
    fword = wm_f.repname
    # parse and check to_word
    ms_t = juman.analysis(tword).mrph_list()
    if len(ms_t) > 1:
        print(u'{} is parsed multiple words'.format(tword))
        exit(1)
    wm_t = ms_t[0]
    if not wm_t.repname:
        print(u'no repname with {}'.format(tword))
        exit(1)
    tword = wm_t.repname
    # load and check model
    print(u'loading model...')
    if modelfn.split('.')[-1] == 'model':
        model = Word2Vec.load(modelfn)
    elif modelfn.split('.')[-1] == 'bin':
        model = Word2Vec.load_word2vec_format(modelfn, binary=True, unicode_errors='ignore')
    if fword not in model.vocab:
        raise KeyError(u'{} is not found in the model'.format(fword))
        exit(1)
    elif tword not in model.vocab:
        raise KeyError(u'{} is not found in the model'.format(tword))
        exit(1)
    model.save('hs0.100m.500.5.18mgt100.model')

    t1 = time.clock() - start
    if debug:
        printtime(t1)

    print(u'constructing id2vocab map...')
    id2vocab = {}
    for i, v in enumerate(model.vocab):
        id2vocab[i] = v

    t2 = time.clock() - t1
    if debug:
        printtime(t2)

    print(u'constructing V...')
    V = []
    for v in model.vocab:
        V.append(model[v])
    V = np.vstack(V)

    t3 = time.clock() - t2
    if debug:
        printtime(t3)
    return fword, tword, model, V, id2vocab, t3
Example No. 50
def extract_relations(model_path, n_entities, min_relation_count, out_path, shuffle, dump_vectors):
    print 'Loading model...'
    model = Word2Vec.load_word2vec_format(model_path, binary=True)
    print 'Finished loading model'

    relation_vectors = dict()
    if n_entities > 0:
        if shuffle:
            base_entities = random.sample(model.vocab.keys(), n_entities)
        else:
            base_entities = model.vocab.keys()[:n_entities]
    else:
        base_entities = model.vocab.keys()

    for i, base_entity in enumerate(base_entities):
        print i, base_entity
        for (relation, related_entity) in get_relations_from_base_entity(base_entity):
            related_entity = unicode(related_entity).encode('utf8')
            relation = unicode(relation).encode('utf8')
            if related_entity.startswith(DBPEDIA_PREFIX):
                related_entity_without_prefix = related_entity[len(DBPEDIA_PREFIX):]
                if related_entity_without_prefix in model:
                    v1, v2 = model[base_entity], model[related_entity_without_prefix]
                    if relation in relation_vectors:
                        relation_vectors[relation].append(v2 - v1)
                    else:
                        relation_vectors[relation] = [v2 - v1]

    # print vector_entities
    relations_statistics = []
    mean_relation_vectors = dict()

    n_relations = len(relation_vectors)
    for r, relation in enumerate(relation_vectors):
        print r, '/', n_relations, relation
        vectors = relation_vectors[relation]

        if len(vectors) > min_relation_count:
            cosine_distances = []
            for i, vi in enumerate(vectors):
                for j, vj in enumerate(vectors[i + 1:]):
                    cosine_distances.append(1.0 - spatial.distance.cosine(vi, vj))
            if len(cosine_distances) > 1:
                avg_cos, std_cos = np.mean(cosine_distances), np.std(cosine_distances)
                if not isnan(avg_cos) and not isnan(std_cos):
                    count = len(vectors)
                    relations_statistics.append((relation, count, avg_cos, std_cos))
                    if dump_vectors:
                        mean_relation_vectors[relation] = (np.mean(vectors, axis=0), count, avg_cos, std_cos)

    print 'Sorting relations'
    relations_statistics.sort(key=lambda x: x[2], reverse=True)
    print 'Writing to csv'
    write_csv(relations_statistics, out_path)

    if dump_vectors:
        print 'Writing vectors dump'
        f = open(out_path + '.vectors.pkl', 'wb')
        cPickle.dump(mean_relation_vectors, f)
Example No. 51
def InitModel():
    global _WORDVEC_MODEL
    assert (_WORDVEC_MODEL == None),'InitModel has already been called.'
    print >>sys.stderr, "Loading Word2Vec Models ..."
    start = time.time()
    _WORDVEC_MODEL = Word2Vec.load_word2vec_format('/home/limiao/open_tools/Word2Vec/models/wiki_en_models/wiki.en.text.vector', binary=False)
    end = time.time()
    print >>sys.stderr, "Completed! time: ", end-start, "sec."
Example No. 52
File: setup.py Project: Wingie/NLQA
def QA(question):
	# model = Word2Vec.load('out')
	model = Word2Vec.load_word2vec_format('/home/david/Work/googlenews.bin', binary=True)
	extractor = Rake()
	words = extractor.run(question)
	keywords = [words[i][0] for i in xrange(len(words))]

	return model.most_similar(positive=keywords)[0][0]
def build_w2b_mat(filename, vocab):
    model = Word2Vec.load_word2vec_format('GloVe-1.2/vectors.txt', binary=False)


    w2v_mat = np.zeros((len(model[vocab[0]]), len(vocab)))
    for j, word in enumerate(vocab):
        w2v_mat[:, j] = model[word]
    return w2v_mat
Example No. 54
 def __init__(self, corpus):
     self.sequence = []
     self.all_found = set()
     self.entities = dict()
     self.corpus = corpus
     self.seed = set()
     self.model = w2v.load_word2vec_format('news_vectors.bin', binary=True)
     self.candidate_patterns = []
Example No. 55
def w2v_model_load():
    global w2v_model
    global w2v_dimension
    if w2v_model is None:
        # w2v_model = Word2Vec.load_word2vec_format("features/karlo/GoogleNews-vectors-negative300.bin", binary=True)
        #w2v_dimension = 300
        w2v_model = Word2Vec.load_word2vec_format("features/karlo/vectors.6B.50d.txt", binary=False)
        w2v_dimension = 50
Example No. 56
 def __init__(self,tag_data,user_data,k,path):
     self.tag_data = tag_data
     self.user_data = user_data
     self.k = k
     self.model = Word2Vec.load_word2vec_format(path, binary=True)
     self.minimium_model = {}
     self.no_match_tag = []
     self.vec_dict = {}
     self.corr_dict = {}
Example No. 57
def load_model(model_path):
    """Load Word2Vec model and return model, number of features, and word index"""


    model = Word2Vec.load_word2vec_format(model_path, binary=True)
    num_features = model.layer1_size
    model_word_set = set(model.index2word)
    print 'Finished loading model'
    return model, num_features, model_word_set
def main():
    print 'Preprocessing data ...'
    tags = preprocess_data()
    print 'Loading model ...'
    model = Word2Vec.load_word2vec_format(model_file, binary=True)
    print 'Reading and converting data from swda ...'
    data = process_data(model, tags)
    print 'Saving ...'
    save_data(data, data_file)
Example No. 59
def loadWord2Vec(filename):
    try:
        logger.info("Trying to load food2vec model from file: {0}".format(filename))
        food2vec = Word2Vec.load_word2vec_format(filename, binary=False)
        logger.info("Food2vec model has been loaded from file: {0}".format(filename))
        return food2vec
    except IOError as e:
        logger.error("Cannot load food2vec model from file: {0}: IOError: {1}".format(filename, e.strerror))
        sys.exit(e.errno)