Example #1
def test_embedding():
    from gensim.models import KeyedVectors
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import WordRelatedness
    model_wiki = KeyedVectors.load_word2vec_format(FileIO.filename('models/w2v-model-enwiki_w2vformat'), binary=True)
    model_news = KeyedVectors.load_word2vec_format(FileIO.filename('models/googlenews.bin'), binary=True)
    rel = WordRelatedness(model_news)
    print(rel.word_similarity('happy','sad'))
Example #2
    def setUp(self):
        self.source_word_vec_file = datapath("EN.1-10.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt")
        self.target_word_vec_file = datapath("IT.1-10.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt")

        self.word_pairs = [("one", "uno"), ("two", "due"), ("three", "tre"),
            ("four", "quattro"), ("five", "cinque"), ("seven", "sette"), ("eight", "otto"),
            ("dog", "cane"), ("pig", "maiale"), ("fish", "cavallo"), ("birds", "uccelli"),
            ("apple", "mela"), ("orange", "arancione"), ("grape", "acino"), ("banana", "banana")
        ]

        self.test_word_pairs = [("ten", "dieci"), ("cat", "gatto")]

        self.source_word_vec = KeyedVectors.load_word2vec_format(self.source_word_vec_file, binary=False)
        self.target_word_vec = KeyedVectors.load_word2vec_format(self.target_word_vec_file, binary=False)
Example #3
    def __init__(self):
        print("Loading in word vectors...")
        self.word_vectors = KeyedVectors.load_word2vec_format(
            '../large_files/GoogleNews-vectors-negative300.bin',
            binary=True
        )
        print("Finished loading in word vectors")
Example #4
    def load(self, *args, **kwargs) -> KeyedVectors:
        """
        Load dict of embeddings from given file

        Args:
            *args: arguments
            **kwargs: arguments

        Returns:
            KeyedVectors: the loaded embeddings model

        """
        # Check that header with n_words emb_dim present
        with open(self.load_path, encoding='utf8') as f:
            header = f.readline()
            if len(header.split()) != 2:
                raise RuntimeError('The GloVe file must start with number_of_words embeddings_dim line! '
                                   'For example "40000 100" for 40000 words vocabulary and 100 embeddings '
                                   'dimension.')

        if self.load_path and self.load_path.is_file():
            log.info("[loading embeddings from `{}`]".format(self.load_path))
            model_file = str(self.load_path)
            model = KeyedVectors.load_word2vec_format(model_file)
        else:
            log.error('No pretrained GloVe model provided or provided load_path "{}" is incorrect.'
                      .format(self.load_path))
            sys.exit(1)

        return model
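If a raw GloVe file lacks that header line, gensim's glove2word2vec script (also used in Example #26 below) can prepend it before loading; a minimal sketch with hypothetical file names:

from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# Hypothetical paths: prepend the "number_of_words embeddings_dim" header,
# then load the converted file as word2vec-format text vectors.
glove2word2vec('glove.6B.100d.txt', 'glove.6B.100d.w2v.txt')
vectors = KeyedVectors.load_word2vec_format('glove.6B.100d.w2v.txt', binary=False)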
Example #5
    def testConversion(self):
        word2vec2tensor(word2vec_model_path=self.datapath, tensor_filename=self.output_folder)

        with smart_open(self.metadata_file, 'rb') as f:
            metadata = f.readlines()

        with smart_open(self.tensor_file, 'rb') as f:
            vectors = f.readlines()

        # check if number of words and vector size in tensor file line up with word2vec
        with smart_open(self.datapath, 'rb') as f:
            first_line = f.readline().strip()

        number_words, vector_size = map(int, first_line.split(b' '))
        self.assertTrue(len(metadata) == len(vectors) == number_words,
            ('Metadata file %s and tensor file %s imply different number of rows.'
                % (self.metadata_file, self.tensor_file)))

        # grab metadata and vectors from written file
        metadata = [word.strip() for word in metadata]
        vectors = [vector.replace(b'\t', b' ') for vector in vectors]

        # get the original KeyedVectors model
        orig_model = KeyedVectors.load_word2vec_format(self.datapath, binary=False)

        # check that the KV model and tensor files have the same values key-wise
        for word, vector in zip(metadata, vectors):
            word_string = word.decode("utf8")
            vector_string = vector.decode("utf8")
            vector_array = np.array(list(map(float, vector_string.split())))
            np.testing.assert_almost_equal(orig_model[word_string], vector_array, decimal=5)
Example #6
def get_model():
    """
    Download model

    :return: `gensim` model
    """
    return KeyedVectors.load_word2vec_format(_download(), binary=True)
Example #7
    def testAnnoyIndexingOfKeyedVectors(self):
        from gensim.similarities.index import AnnoyIndexer
        keyVectors_file = datapath('lee_fasttext.vec')
        model = KeyedVectors.load_word2vec_format(keyVectors_file)
        index = AnnoyIndexer(model, 10)

        self.assertEqual(index.num_trees, 10)
        self.assertVectorIsSimilarToItself(model, index)
        self.assertApproxNeighborsMatchExact(model, model, index)
Example #8
	def initModel(self):
		path = self.getModelFilePath()
		modelFull = self.config.getBooleanConfig("common.model.full")[0]
		if modelFull:
			if self.model is None:
				self.model = Word2Vec.load(path)
			self.wv = self.model.wv
		else:
			if self.wv is None:
				self.wv = KeyedVectors.load(path, mmap='r')
Example #9
 def load_embeddings(self, file_path):
     # Embeddings must be in fastText .bin format or in word2vec text format
     print('Loading embeddings...')
     if file_path.endswith('.bin'):
         from gensim.models.wrappers import FastText
         embeddings = FastText.load_fasttext_format(file_path)
     else:
         from gensim.models import KeyedVectors
         embeddings = KeyedVectors.load_word2vec_format(file_path)
     return embeddings
Example #10
    def test_add_single(self):
        """Test that adding entity in a manual way works correctly."""
        entities = ['___some_entity{}_not_present_in_keyed_vectors___'.format(i) for i in range(5)]
        vectors = [np.random.randn(self.vectors.vector_size) for _ in range(5)]

        # Test `add` on already filled kv.
        for ent, vector in zip(entities, vectors):
            self.vectors.add(ent, vector)

        for ent, vector in zip(entities, vectors):
            self.assertTrue(np.allclose(self.vectors[ent], vector))

        # Test `add` on empty kv.
        kv = EuclideanKeyedVectors(self.vectors.vector_size)
        for ent, vector in zip(entities, vectors):
            kv.add(ent, vector)

        for ent, vector in zip(entities, vectors):
            self.assertTrue(np.allclose(kv[ent], vector))
Example #11
    def __init__(self):
       self.cmdpairs = {
           "!similar": self.execute_cnb,
           "!similaryle": self.execute_yle,

           "!similarn": self.execute_n_cnb,
           "!similarnyle": self.execute_n_yle,

           "!similarnr": self.execute_n_cnb_r,
           "!similarnyler": self.execute_n_yle_r,

           "!xminusyplusz": self.execute_xyz_cnb,
           "!xminusypluszyle": self.execute_xyz_yle,
#            "!xminusyplusz": self.execute_x_minus_y_plus_z
       }
       self.cnb_wv = gensim.models.Word2Vec.load("./Resources/word2vec_2014-2019_04.model").wv
       self.yle_wv = KeyedVectors.load("./Resources/word2vec_yle_dersb")
Example #12
def load_word2vec_embeddings(filepath, tokenizer, max_features, embedding_size):
    model = KeyedVectors.load_word2vec_format(filepath, binary=True)

    emb_mean, emb_std = model.wv.syn0.mean(), model.wv.syn0.std()

    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embedding_size))
    for word, i in word_index.items():
        if i >= max_features:
            continue
        try:
            embedding_vector = model[word]
            embedding_matrix[i] = embedding_vector
        except KeyError:
            continue
    return embedding_matrix
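A hedged sketch, assuming a Keras model as in several later examples, of how the returned matrix could seed an Embedding layer:

from keras.layers import Embedding

# embedding_matrix is assumed to come from load_word2vec_embeddings() above;
# weights seeds the layer with the pretrained vectors and trainable=False
# keeps them frozen during training.
embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                            output_dim=embedding_matrix.shape[1],
                            weights=[embedding_matrix],
                            trainable=False)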
Example #13
    def load(cls, np2vec_model_file, binary=False, word_ngrams=0):
        """
        Load the np2vec model.

        Args:
            np2vec_model_file (str): the file containing the np2vec model to load
            binary (bool): boolean indicating whether the np2vec model to load is in binary format
            word_ngrams (int {1,0}): If 1, np2vec model to load uses word vectors with subword (
            ngrams) information.

        Returns:
            np2vec model to load
        """
        if word_ngrams == 0:
            return KeyedVectors.load_word2vec_format(
                np2vec_model_file, binary=binary)
        elif word_ngrams == 1:
            return FastText.load(np2vec_model_file)
        else:
            logger.error('invalid value for \'word_ngrams\'')
Example #14
def wv(w1, w2, t):
    # lazy load the wordvector model...
    global wvmodel
    if wvmodel is None:
        print(' *', 'loading wordvector model (', modelFile, ')...')
        wvmodel = KeyedVectors.load_word2vec_format(modelFile, binary=False)
        wvmodel.init_sims(replace=True)  # no more updates, prune memory

    try:
        #
        # since we've got wordnet synset objects (like cat.n.01), we
        # must turn this back into a regular word ('cat') because the
        # word vector GloVe models are plain words with spaces turned
        # into hyphens on phrases (e.g. climate-change, black-and-white)
        #
        wv_w1, wv_w2 = _mk_wv_word(w1), _mk_wv_word(w2)
        distance = wvmodel.similarity(wv_w1, wv_w2)
        return distance if abs(distance) >= t else 0
    except:
        return 0
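The helper `_mk_wv_word` referenced above is not shown in this snippet; a hypothetical sketch of the conversion the comment describes, assuming the inputs arrive as WordNet synset name strings such as 'climate_change.n.01':

def _mk_wv_word(synset_name):
    # Hypothetical helper: 'climate_change.n.01' -> 'climate-change'.
    lemma = str(synset_name).split('.')[0]  # drop the part-of-speech and sense number
    return lemma.replace('_', '-')          # phrases use hyphens in the GloVe vocabulary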
Example #15
def load_embeddings(pytorch_embedding, word2idx, filename, embedding_size):
    print("Copying pretrained word embeddings from ", filename, flush=True)
    en_model = KeyedVectors.load_word2vec_format(filename)
    """ Fetching all of the words in the vocabulary. """
    pretrained_words = set()
    for word in en_model.vocab:
        pretrained_words.add(word)

    arr = [0] * len(word2idx)
    for word in word2idx:
        index = word2idx[word]
        if word in pretrained_words:
            arr[index] = en_model[word]
        else:
            arr[index] = np.random.uniform(-1.0, 1.0, embedding_size)

    """ Creating a numpy dictionary for the index -> embedding mapping """
    arr = np.array(arr)
    """ Add the word embeddings to the empty PyTorch Embedding object """
    pytorch_embedding.weight.data.copy_(torch.from_numpy(arr))
    return pytorch_embedding
Example #16
    def fit(self, X, y=None):

        dw_params = self.get_params()
        print(dw_params)

        if False: #exists(self.output_file):
            model = KeyedVectors.load_word2vec_format(self.output_file)
        else:
            model = run_gensim(dw_params)
        nb_vecs = len(model.wv.vocab)

        # Map nodes to their features (note: assumes nodes are labeled as integers 1:N) 
        features_matrix = np.asarray([model[str(node)] for node in range(nb_vecs)])
        #features_matrix = np.random.randn((4,2))

        if self.normalize:
            norms = np.linalg.norm(features_matrix, axis=1)
            if self.verbose:
                print(norms)
                print(norms.shape)

            assert norms.shape[0] == features_matrix.shape[0]
            for i in range(features_matrix.shape[0]):
                features_matrix[i,:] /= norms[i]

            norms = np.linalg.norm(features_matrix, axis=1)
            if self.verbose:
                print(norms)

        if self.verbose:
            print('features_matrix.shape = %s' % str(features_matrix.shape))

        self.dw_params_ = dw_params
        self.gs_model_ = model
        self.features_matrix_ = features_matrix
        print('fit', self.features_matrix_.shape)
        return self
Example #17
For more information on this file, see
https://docs.djangoproject.com/en/1.10/topics/settings/

For the full list of settings and their values, see
https://docs.djangoproject.com/en/1.10/ref/settings/
"""

import os
from gensim.models import KeyedVectors
# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

settings_dir = os.path.dirname(__file__)
PROJECT_ROOT = os.path.abspath(os.path.dirname(settings_dir))
MODEL_PATH = os.path.join(PROJECT_ROOT, 'apollo/w2v/GoogleNews-vectors-negative300.bin.gz')
MODEL = KeyedVectors.load_word2vec_format(MODEL_PATH, unicode_errors='replace', binary=True, limit=10000)

# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/1.10/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = ')p1#0dnupk$xc59wdfl^%!7)4myi--la+xd4=$krk&a55$%0rz'

# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True

ALLOWED_HOSTS = []


# Application definition
Example #18
def eval_blogcat(embeddings_file, labels_matrix=None, G=None,
                 verbose=1, normalize=1, training_percents=[0.1, 0.6, 0.9]):

    # 0. Files
    #embeddings_file = "/mnt/raid1/deepwalk/blogcatalog.vec"
    if labels_matrix is None and G is None:
        G, labels_matrix = load_blogcat()
    
    # 1. Load Embeddings
    model = KeyedVectors.load_word2vec_format(embeddings_file, binary=False)

    labels = np.argwhere(labels_matrix)
    label_cnts = pd.Series(labels[:,1]).value_counts()

    if verbose > 1:
        print('\nLabel counts:')
        print(label_cnts)

    # delete the least frequent labels, which causes balancing problems
    labels_matrix = labels_matrix[:, :-2]

    # Map nodes to their features (note: assumes nodes are labeled as integers 1:N) 
    features_matrix = np.asarray([model[str(node)] for node in range(len(G))])

    if normalize:
        norms = np.linalg.norm(features_matrix, axis=1)
        if verbose:
            print(norms)
            print(norms.shape)

        assert norms.shape[0] == features_matrix.shape[0]
        for i in range(features_matrix.shape[0]):
            features_matrix[i,:] /= norms[i]

        norms = np.linalg.norm(features_matrix, axis=1)
        if verbose:
            print(norms)

    if verbose:
        print('-'*100)
        print(embeddings_file)
        print('features_matrix.shape = %s' % str(features_matrix.shape))
        print('labels_matrix.shape   = %s' % str(labels_matrix.shape))

    # 2. Shuffle, to create train/test groups
    shuffles = []
    number_shuffles = 1
    for x in range(number_shuffles):
        # if we just have one group, make the split the same every time
        if number_shuffles == 1:
            shuffles.append(skshuffle(features_matrix, labels_matrix, random_state=123))
        else:
            shuffles.append(skshuffle(features_matrix, labels_matrix))

    # 3. to score each train/test group
    all_results = defaultdict(list)

    # uncomment for all training percents
    #training_percents = np.asarray(range(1,10))*.1
    for train_percent in training_percents:
        # print('-'*100)
        # print('pct_train: %.2f' % train_percent)

        for shuf in shuffles:
            X, y = shuf
            training_size = int(train_percent * X.shape[0])

            X_train = X[:training_size, :]
            y_train = y[:training_size]
            X_test = X[training_size:, :]
            y_test = y[training_size:]

            clf = TopKRanker(LogisticRegression())
            clf.fit(X_train, y_train)

            # find out how many labels should be predicted
            #top_k_list = [len(l) for l in y_test]
            top_k_list = np.array(np.sum(y_test, axis=1).flatten()[0])[0].astype(np.int32)
            preds = clf.predict(X_test, top_k_list)

            if y_test.shape[1] != preds.shape[1]:
                raise Exception("imbalance of class dims")
                #continue
            
            results = OrderedDict()
            averages = ["micro", "macro", "samples", "weighted"]
            for average in averages:
                results[average] = f1_score(y_test, preds, average=average)

            all_results[train_percent].append(results)
            #break

    if verbose:
        print('-------------------')
        for train_percent in sorted(all_results.keys()):
            print('Train percent:', train_percent)
            for x in all_results[train_percent]:
                print(x)
            print('-------------------')
    return all_results
Example #19
'''
[[  3.35454009e-03  -2.96757789e-03   8.95642443e-04 ...,   4.16836003e-03
   -3.26405023e-03  -1.91481831e-03]
 ...,
 [  7.19302261e-05   1.70022575e-03   3.59526509e-03 ...,   1.11010019e-03
    3.70053225e-03  -3.61868995e-03]]
'''

# 3. Persist the model
model.save('sample.en.text.model')
model.wv.save_word2vec_format('sample.en.text.vector', binary=True)
'''
save() stores the complete model.
wv.save_word2vec_format() stores only the vocabulary and the corresponding vectors; the tree information is lost, so incremental training is no longer possible.
'''

# 4. Load the persisted model (must match the format saved above); method 1
new_model = Word2Vec.load('sample.en.text.model')
print(new_model)

# 4. Load the persisted model; method 2
from gensim.models import KeyedVectors

filename = 'sample.en.text.vector'
new_model = KeyedVectors.load_word2vec_format(filename, binary=True)

# References:
# [word2vec study notes](https://www.jianshu.com/p/418f27df3968)
# [How to Develop Word Embeddings in Python with Gensim](https://machinelearningmastery.com/develop-word-embeddings-python-gensim/)
# [gensim.model.word2vec API](https://radimrehurek.com/gensim/models/word2vec.html)
Example #20
def calcfeatures(stancesFile, bodiesFile):
    path = os.path.abspath("")
    #gensim.models.KeyedVectors.load_word2vec_format
    #wmd_model = Word2Vec.load_word2vec_format('/data/w2v_googlenews/GoogleNews-vectors-negative300.bin.gz', binary=True)
    wmd_model = KeyedVectors.load_word2vec_format(path+'/data/GoogleNews-vectors-negative300.bin', binary=True)
    wmd_model.init_sims(replace=True)
    tknzr = TweetTokenizer()

    count = 0
    features = []
    classes = []

    #N = getDocCount(path+'/data/training/train_bodies.csv')

    keys = {'agree': 0, 'disagree': 1, 'discuss': 2, 'unrelated': 3}

    bodies = loadBodies(bodiesFile)

    bigram_vectorizer = CountVectorizer(tokenizer=tknzr.tokenize, ngram_range=(1, 2), binary=False, lowercase=True, 
        stop_words='english', min_df=1)
    
    vectorizer = TfidfVectorizer(tokenizer=tknzr.tokenize, ngram_range=(1, 1), binary=False, lowercase=True, 
        stop_words='english', min_df=1)

    tfidfMat = vectorizer.fit_transform(list(bodies.values()))
    tfidfMat = vectorizer.transform(list(bodies.values()))
    tfidfMat = tfidfMat.toarray()
    vocab = vectorizer.get_feature_names()
    k = list(bodies.keys())

    bodiesTokens = loadBodiesTokens(bodiesFile)

    with open(stancesFile, 'r', encoding='UTF-8') as csvDataFile1: 
		 
        csvReader1 = csv.reader(csvDataFile1)
        first = 1
        for row in csvReader1:
            f = []
            if first == 1: 
                first = 0
            else:
                print(count)
                count = count + 1

                #class
                classes.append(keys[row[2]])	

                #canberra distance
                f.append(feat.canberraDist(row[0],bodies[row[1]], bigram_vectorizer))
                         
                #polarity scores
                neg, neu, pos = feat.polarityScores(row[0], bodies[row[1]])
                f.append(neg)
                f.append(neu)
                f.append(pos)

                tokens1 = tknzr.tokenize(row[0])
                tokens1=[token.lower() for token in tokens1 if (token.isalpha() and token not in stop_words)]
                tokens2 = bodiesTokens[row[1]]

                #word movers distance
                f.append(feat.wmd(tokens1, tokens2,wmd_model))

                #common words
                common = (set(tokens1) & set(tokens2))              
                f.append(feat.overlap(common))      
                        
                #tfidf
                f.append(feat.tfidf(tfidfMat, common,vocab,k.index(row[1])))
                               
                #negations
                f.append(feat.negWords(tokens1,tokens2))

                #add all features
                features.append(f)
								
    return np.array(features), np.array(classes)
Example #21
# -*- coding:utf-8 -*-
# First, load the required libraries
import tensorflow as tf
import numpy as np
# import gensim  # used to load the pretrained word vectors
from gensim.models import KeyedVectors
import jieba
import matplotlib.pyplot as plt
import re
import warnings
warnings.filterwarnings("ignore")

# Use gensim to load the pretrained Chinese word embeddings; this may take 1-2 minutes
cn_model = KeyedVectors.load_word2vec_format('../static/embeddings/sgns.zhihu.bigram', binary=False, unicode_errors="ignore")

# As you can see, each word corresponds to a vector of length 300
embedding_dim = cn_model['山东大学'].shape[0]
print("embedding_dim:", embedding_dim)

# Get the sample indices
import pandas as pd
data_neg = pd.read_excel('../static/data/neg9.xlsx')
print('Total number of samples: ' + str(len(data_neg)))

print("data_neg.head(1)", data_neg.head(1))

# Put all the review texts into a single list
train_texts_orig = []
# Labels corresponding to the texts, i.e. the tags
train_target = []
Example #22
def main():
    parser = ArgumentParser("scoring",
                            formatter_class=ArgumentDefaultsHelpFormatter,
                            conflict_handler='resolve')
    parser.add_argument("--emb",
                        default='result.embeddings',
                        help='Embeddings file')
    parser.add_argument(
        "--network",
        default='p2p.edgelist',
        help=
        'A .mat file containing the adjacency matrix and node labels of the input network.'
    )
    parser.add_argument(
        "--adj-matrix-name",
        default='network',
        help='Variable name of the adjacency matrix inside the .mat file.')
    parser.add_argument(
        "--label-matrix-name",
        default='group',
        help='Variable name of the labels matrix inside the .mat file.')
    parser.add_argument("--num-shuffles",
                        default=2,
                        type=int,
                        help='Number of shuffles.')
    parser.add_argument(
        "--all",
        default=False,
        action='store_true',
        help=
        'The embeddings are evaluated on all training percents from 10 to 90 when this flag is set to true. '
        'By default, only training percents of 10, 50 and 90 are used.')

    args = parser.parse_args()
    # 0. Files
    embeddings_file = args.emb
    matfile = args.network

    # 1. Load Embeddings
    model = KeyedVectors.load_word2vec_format(embeddings_file, binary=False)

    # 2. Load labels
    mat = loadmat(matfile)
    A = mat[args.adj_matrix_name]
    graph = sparse2graph(A)
    labels_matrix = mat[args.label_matrix_name]
    labels_count = labels_matrix.shape[1]
    mlb = MultiLabelBinarizer(range(labels_count))

    # Map nodes to their features (note:  assumes nodes are labeled as integers 1:N)
    features_matrix = numpy.asarray(
        [model[str(node)] for node in range(len(graph))])

    # 2. Shuffle, to create train/test groups
    shuffles = []
    for x in range(args.num_shuffles):
        shuffles.append(skshuffle(features_matrix, labels_matrix))

    # 3. to score each train/test group
    all_results = defaultdict(list)

    if args.all:
        training_percents = numpy.asarray(range(1, 10)) * .1
    else:
        training_percents = [0.1, 0.5, 0.9]
    for train_percent in training_percents:
        for shuf in shuffles:

            X, y = shuf

            training_size = int(train_percent * X.shape[0])

            X_train = X[:training_size, :]
            y_train_ = y[:training_size]

            y_train = [[] for x in range(y_train_.shape[0])]

            cy = y_train_.tocoo()
            for i, j in zip(cy.row, cy.col):
                y_train[i].append(j)

            assert sum(len(l) for l in y_train) == y_train_.nnz

            X_test = X[training_size:, :]
            y_test_ = y[training_size:]

            y_test = [[] for _ in range(y_test_.shape[0])]

            cy = y_test_.tocoo()
            for i, j in zip(cy.row, cy.col):
                y_test[i].append(j)

            clf = TopKRanker(LogisticRegression())
            clf.fit(X_train, y_train_)

            # find out how many labels should be predicted
            top_k_list = [len(l) for l in y_test]
            preds = clf.predict(X_test, top_k_list)

            results = {}
            averages = ["micro", "macro"]
            for average in averages:
                results[average] = f1_score(mlb.fit_transform(y_test),
                                            mlb.fit_transform(preds),
                                            average=average)

            all_results[train_percent].append(results)

    print('Results, using embeddings of dimensionality', X.shape[1])
    print('-------------------')
    for train_percent in sorted(all_results.keys()):
        print('Train percent:', train_percent)
        for index, result in enumerate(all_results[train_percent]):
            print('Shuffle #%d:   ' % (index + 1), result)
        avg_score = defaultdict(float)
        for score_dict in all_results[train_percent]:
            for metric, score in iteritems(score_dict):
                avg_score[metric] += score
        for metric in avg_score:
            avg_score[metric] /= len(all_results[train_percent])
        print('Average score:', dict(avg_score))
        print('-------------------')
Example #23
import random
import string
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from gensim.models import KeyedVectors

from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences




print("Loading embedder.......")
# Embedder
embedder = KeyedVectors.load_word2vec_format('/Users/petergramaglia/Documents/GitHub/new_connected/connected_journaling/data/GoogleNews-vectors-negative300.bin',binary=True)
word_vectors = embedder.wv

print("Reading dataset........")
dataset = pd.read_csv(data_path)   # 5k samples






print("Y stuff")
new_train_y = np.zeros(len(train_y))
new_test_y = np.zeros(len(test_y))

for i in range(0,len(train_y)):
Example #24
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors
from keras.utils import to_categorical
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.models import Sequential
from keras.layers import Activation, Dropout, Flatten, Dense
import keras.optimizers
from keras.layers import Embedding
from collections import Counter
import operator
from sklearn.model_selection import train_test_split

modelw2v = KeyedVectors.load_word2vec_format('./data/w2v_model_M.bin',
                                             binary=True)

embeddings_index = {}
for word in modelw2v.wv.vocab.keys():
    embeddings_index[word] = modelw2v.wv[word]

file_path = './data/train_tweets_mod.csv'
dfs = pd.read_csv(file_path)
num_entries = 0
users = dfs['User'].values.tolist()
user_tweet_dict = Counter(users)
sorted_user_tweet_dict = sorted(user_tweet_dict.items(),
                                key=operator.itemgetter(1),
                                reverse=True)
values = user_tweet_dict.values()
count_dict = Counter(values)
Example #25
path_root = './data/' + dataset_name + '/'
path_to_batches = path_root + '/batches_' + dataset_name + '/'
model_path = "./V1/agg=sum_bidir=True_discount=1_cutgradient=False/" + dataset_name + "/run1/"
path_to_save = './'
path_to_functions = './'

path_to_weights = model_path
n_runs = 4
nb_epochs_train = 150

my_prec = 5 # nb of decimals to keep in history files

runs = ['run%i' % i for i in range(n_runs)]

# Loading vectors
gensim_obj = KeyedVectors.load(path_root + 'word_vectors.kv', mmap='r') # needs an absolute path!
word_vecs = gensim_obj.wv.syn0
# add Gaussian initialized vector on top of embedding matrix (for padding)
pad_vec = np.random.normal(size=word_vecs.shape[1]) 
word_vecs = np.insert(word_vecs,0,pad_vec,0)

# Defining Network
## Inputs
sent_ints = Input(shape=(None,))
sent_wv = Embedding(input_dim=word_vecs.shape[0],
                    output_dim=word_vecs.shape[1],
                    weights=[word_vecs],
                    input_length=None, # sentence size vary from batch to batch
                    trainable=True
                    )(sent_ints)
Example #26
#from gensim.models import KeyedVectors
#filename = 'GoogleNews-vectors-negative300.bin'
#model = KeyedVectors.load_word2vec_format(filename, binary=True)

from gensim.scripts.glove2word2vec import glove2word2vec

glove_input_file = 'intents/glove.6B.100d.txt'
word2vec_output_file = 'intents/glove.6B.100d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

# In[3]:

from gensim.models import KeyedVectors
# load the Stanford GloVe model
filename = 'intents/glove.6B.100d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)
word_vec = model.wv
#word_vec.get_vector('cab')

# In[4]:

x = word_vec.get_vector('pub')
#x.shape

# In[6]:

import numpy as np

cab_file = open('intents/cab.dat', 'r')
stopwords = open('intents/stopwords.txt', 'r')
Example #27
File: SVM.py  Project: yc999/-
    transform = TfidfTransformer()
    Y = transform.fit_transform(X)    # the input here is the document count matrix from above
    print(Y.toarray())                # print the Y matrix after conversion to tf-idf
"""

# print(content_train_src)
EMBEDDING_DIM = 200  # word-vector length
EMBEDDING_length = 8824330

word2vec_path = '/public/ycdswork/dnswork/glove/Tencent_AILab_ChineseEmbedding.txt'
stopwords_path = "/public/ycdswork/dnswork/stopwords/cn_stopwords.txt"
webfilepath = "/public/ycdswork/dnswork/httpwebdata/"
file_dir = "/home/yangc/myclass/"
modelsave_path = "/public/ycdswork/modeldir/LSTMmodel"

tc_wv_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=False)
# EMBEDDING_length = 8824330
EMBEDDING_length = len(tc_wv_model.key_to_index)
print('Found %s word vectors.' % EMBEDDING_length)

embeddings_index = {}
embedding_matrix = np.zeros((EMBEDDING_length + 1, EMBEDDING_DIM))
# tc_wv_model.key_to_index
# for counter, key in enumerate(tc_wv_model.vocab.keys()):
for counter, key in enumerate(tc_wv_model.key_to_index):
    # print(counter,key)
    embeddings_index[key] = counter + 1
    coefs = np.asarray(tc_wv_model[key], dtype='float32')
    embedding_matrix[counter + 1] = coefs

del tc_wv_model
Example #28
    'articleType': 'AIDaily',
    'method': 'zh_NER_TF',
    'contentMode': [1, 1, 0],
    'useExpanded': [1, 0, 1],
    'similarity': 50,
    'title_weight': 0.8,
    'cut_method': 'tfidf',
    'top_k': 8,
    'normalize_title_content': True,
    'file': "./media/data.json",
    'use': False
}

# Feed keywords into the model, output the closest domain terms, and record their magnitudes
# Load the Tencent word vectors
model = KeyedVectors.load_word2vec_format(
    './model/dictionary/Tencent_AILab_ChineseEmbedding.txt', binary=False)
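# A hedged sketch of the lookup described above; 'keyword' is a hypothetical
# placeholder (with these Chinese vectors it would be a Chinese term):
# for neighbour, score in model.most_similar('keyword', topn=10):
#     print(neighbour, score)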


def remove_text(text, type='number'):
    """
    从文本中移除特定文本,例如数字或标点

    :param text: 文本
    :param type: 移除的文本类型, 可选'number', 'punc', 'both'
    :return: 移除后的文本
    """
    from zhon.hanzi import punctuation
    import string
    text = str(text)
    #text = re.sub("<>".format(punctuation, string.punctuation), " ", text)
    text = re.sub('<.*?>', '', text)
Example #29
def get_model():
    '''
    Download model.

    :return: `gensim` model
    '''
    return KeyedVectors.load_word2vec_format(download(), binary=False)
Example #30
import datetime
import matplotlib.pyplot as plt
import pickle as pkl
import gzip
import pandas as pd
from gensim.models import KeyedVectors

# File paths
TRAIN_CSV = 'data/train.csv'
TEST_CSV = 'data/test.csv'
EMBEDDING_FILE = 'data/GoogleNews-vectors-negative300.bin.gz'
PROCESSED_DATA_FILE = 'data/processed_data.pkl.gz'
MODEL_FILE = 'model/sensim_adadelta_model_weights.h5'

# Load training and test set
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)
embedding_dim = 300

embeddings, vocabulary = init_data_as_vectors(word2vec, [train_df, test_df],
                                              embedding_dim)

del word2vec

max_seq_length = max(
    train_df.question1.map(lambda x: len(x)).max(),
    train_df.question2.map(lambda x: len(x)).max(),
    test_df.question1.map(lambda x: len(x)).max(),
    test_df.question2.map(lambda x: len(x)).max())

# save processed data
data = {
Example #31
def pega_dados(vecfile, target, ant, syn):
    
    import csv
    from gensim.models import KeyedVectors

    cosine_ant = []
    cosine_syn = []
    subcos_ant = []
    subcos_syn = []
    
    mod = KeyedVectors.load_word2vec_format("/home/bthalenberg/ic/novos novos/"+vecfile, binary=False)
    
    i = 0
    while i != len(target):
        
        #getting cosine similarity between target and antonym
        try:
            cos = mod.similarity(target[i], ant[i])
        except KeyError:
            cos = None
        cosine_ant.append(cos)
        
        #getting cosine similarity between target and synonym
        try:
            cos_s = mod.similarity(target[i], syn[i])
        except KeyError:
            cos_s = None
        cosine_syn.append(cos_s)


        #subtracting the antonym cosine similarity from the synonym similarity for syn input
        try:
            subcos_syn.append(cos_s - cos)
        except TypeError:
            subcos_syn.append(None)
        
        #negating subtracted values for ant input
        try:
            subcos_ant.append(-(cos_s - cos))
        except TypeError:
            subcos_ant.append(None)
        
        i += 1
        
    dirname = vecfile[:-4]
    
    with open(dirname+"/db_ant.csv", "w", encoding="utf-8") as f:
        writer = csv.writer(f)
        i = 0
        while i != len(target):
            writer.writerow([target[i], ant[i], cosine_ant[i]])
            i += 1
            
    with open(dirname+"/db_syn.csv", "w", encoding="utf-8") as f:
        writer = csv.writer(f)
        i = 0
        while i != len(target):
            writer.writerow([target[i], syn[i], cosine_syn[i]])
            i += 1
                        
    with open(dirname+"/db_sub_ant.csv", "w", encoding="utf-8") as f:
        writer = csv.writer(f)
        i = 0
        while i != len(target):
            writer.writerow([target[i], ant[i], subcos_ant[i]])
            i += 1
            
    with open(dirname+"/db_sub_syn.csv", "w", encoding="utf-8") as f:
        writer = csv.writer(f)
        i = 0
        while i != len(target):
            writer.writerow([target[i], syn[i], subcos_syn[i]])
            i += 1
Example #32
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec, KeyedVectors

model = KeyedVectors.load_word2vec_format("/content/drive/My Drive/bio_files/PubMed-shuffle-win-30.bin", binary=True)


def words_to_vector(words):
  # Average the vectors of the given tokens; falls back to None if a lookup fails.
  vector = None
  try:
    vector = sum(model.get_vector(w.text) for w in words)/len(words)
  except Exception as e:
    #print(e, type(words), words)
    pass
  return vector


def get_cosine_similarity(term, entity):
    cosine_similarities = []
    try:
        cosine_similarities.append(model.cosine_similarities(words_to_vector(entity[0]), [words_to_vector(term[0])])[0])
    except Exception as e:
        pass
    for synonym in term[1]['synonyms']:
        try:
            cosine_similarities.append(
                model.cosine_similarities(words_to_vector(entity[0]), [words_to_vector(synonym)])[0])
        except Exception as e:
            pass
    return max(cosine_similarities)
Example #33
from gensim.models import Word2Vec, KeyedVectors

# lst=[['hello', 'this', 'is', 'the', 'sample', 'text']]
# # sentences = gensim.models.word2vec.LineSentence("new_fol.txt")
#
# model = gensim.models.Word2Vec()
# model.build_vocab(lst, min_count=1)
# model.train(lst, epochs=model.epochs, total_examples=model.corpus_count)
model=Word2Vec.load('thousand.txt')

# print(tmp)

cv=['trichy', 'chennai', 'gokul', 'klm', 'fog', 'mist', 'cloud', 'google', 'fb']


model1=KeyedVectors.load('thousand.txt')
lstt=[['trichy', 'chennai', 'gokul', 'klm', 'fog', 'mist', 'cloud', 'google', 'fb']]

model.build_vocab(lstt, update=True)
model.train(lstt, epochs=model.epochs, total_examples=model.corpus_count)
tmp=0
for i in cv:
    t=model.wv.get_vector(i)
    tmp=tmp+t

import numpy as np
model_word_vector = np.array( tmp, dtype='f')

print(model.most_similar([model_word_vector],[],topn=20000))
print(model.most_similar(positive=cv, negative=[], topn=1))
x=model.similar_by_word('issu',topn=1000, restrict_vocab=None)
print("creating word sequences...")
ws, ys = [], []
fin = codecs.open(INPUT_FILE, "r", encoding='utf-8')
for line in fin:
    label, sent = line.strip().split("\t")
    ys.append(int(label))
    words = [x.lower() for x in nltk.word_tokenize(sent)]
    wids = [word2index[word] for word in words]
    ws.append(wids)
fin.close()
W = pad_sequences(ws, maxlen=maxlen)
Y = np_utils.to_categorical(ys)

# Load the pretrained word2vec vectors
print("loading word2vec vectors...")
word2vec = KeyedVectors.load_word2vec_format(WORD2VEC_MODEL, binary=True)

print("transferring embeddings...")
X = np.zeros((W.shape[0], EMBED_SIZE))
for i in range(W.shape[0]):
    E = np.zeros((EMBED_SIZE, maxlen))
    words = [index2word[wid] for wid in W[i].tolist()]
    for j in range(maxlen):
        try:
            E[:, j] = word2vec[words[j]]
        except KeyError:
            pass
    X[i, :] = np.sum(E, axis=1)

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,
                                                Y,
Example #35
import pandas as pd
import numpy as np
import math
import os
import itertools
from tqdm import tqdm
import csv
import re
import multiprocessing as mp
from joblib import Parallel, delayed
from gensim.models import KeyedVectors

server_path = './'
model = KeyedVectors.load_word2vec_format('./word2vec.6B.300d.txt',
                                          binary=False)

global wd_labels
wd_prop_label_path = './'
# def parallelize(row, prefixes, df_GT):
# 	global df_ling
# 	if row[2] <= 0.0:
# 		return
# 	pred1 = [prefix+row[0] for prefix in prefixes]
# 	pred2 = [prefix+row[1] for prefix in prefixes]

# 	for i, j in itertools.product(pred1, pred2):
# 		if len(df_GT.loc[(df_GT['predE'] == i) & (df_GT['predC'] == j)]) > 0:
# 			df_ling = df_ling.append({'predE': i, 'predC': j, 'cosine_sim': row[2]}, ignore_index=True)
# 		if len(df_GT.loc[(df_GT['predE'] == j) & (df_GT['predC'] == i)]) > 0:
# 			df_ling = df_ling.append({'predE': j, 'predC': i, 'cosine_sim': row[2]}, ignore_index=True)
# 		if len(df_GT.loc[(df_GT['predE'] == i+'_inv') & (df_GT['predC'] == j)]) > 0:
Example #36
def main(emb_path='glove.6B.100d.txt', data_path='data/msdialogue/'):
    device = 'cpu'
    if torch.cuda.is_available():
        device = 'cuda'
    print(f'DEVICE : {device}')
    params = {'batch_size': 128, 'shuffle': True}

    # 1) Data loading
    # # For now, just for debugging
    # X, y = load_from_json(data_path)
    # # 1. One-Hot Encode
    # labels = {'O': 0, 'FQ': 1, 'IR': 2,
    #           'OQ': 3, 'GG': 4, 'FD': 5,
    #           'JK': 6, 'NF': 7, 'PF': 8,
    #           'RQ': 9, 'CQ': 10, 'PA': 11}
    # y_train = []
    # for l in y:
    #     l = l.split('_')
    #     cur_y = [0] * len(labels)
    #     for un_l in l:
    #         cur_y[labels[un_l]] = 1
    #     y_train.append(cur_y)
    # y_train = torch.tensor(y_train)
    # # 2. Convert to the required format
    # X_train = []
    # for i in range(len(X)):
    #     for j in range(len(X[i])):
    #         X_train.append(X[i][j])
    print('Building Embedding')
    if emb_path == 'glove.6B.100d.txt':
        tmp_file = get_tmpfile("test_word2vec.txt")
        _ = glove2word2vec(emb_path, tmp_file)
        word2vec = KeyedVectors.load_word2vec_format(tmp_file)
    else:
        word2vec = gensim.models.KeyedVectors.load_word2vec_format(emb_path,
                                                                   binary=True)
    EMB_DIM = word2vec.vectors.shape[1]
    word2vec.add('<UNK>', np.mean(word2vec.vectors.astype('float32'), axis=0))
    word2vec.add('<PAD>', np.array(np.zeros(EMB_DIM)))
    tokenizer = Vocab()
    tokenizer.build(word2vec)

    print('Loading Data')
    X_train = pd.read_csv(data_path + "train.tsv",
                          sep="\t",
                          header=None,
                          index_col=None)
    y_train = encode_label(X_train[0].to_numpy())
    X_train = tokenizer.tokenize(X_train[1].to_numpy(), max_len=MAX_SEQ_LEN)

    X_val = pd.read_csv(data_path + "valid.tsv",
                        sep="\t",
                        header=None,
                        index_col=None)
    y_val = encode_label(X_val[0].to_numpy())
    X_val = tokenizer.tokenize(X_val[1].to_numpy(), max_len=MAX_SEQ_LEN)

    X_test = pd.read_csv(data_path + "test.tsv",
                         sep="\t",
                         header=None,
                         index_col=None)
    y_test = encode_label(X_test[0].to_numpy())
    X_test = tokenizer.tokenize(X_test[1].to_numpy(), max_len=MAX_SEQ_LEN)

    # 2. padding
    pad_val = tokenizer.get_pad()
    X_train = pad_sequence(
        X_train, batch_first=True, padding_value=pad_val).to(
            torch.long)[1:, :MAX_SEQ_LEN]  # size: tensor(batch, max_seq_len)
    X_val = pad_sequence(X_val, batch_first=True, padding_value=pad_val).to(
        torch.long)[1:, :MAX_SEQ_LEN]
    X_test = pad_sequence(X_test, batch_first=True, padding_value=pad_val).to(
        torch.long)[1:, :MAX_SEQ_LEN]

    # 3) Batch iterator
    training = data.DataLoader(MSDialog(X_train, y_train), **params)
    validation = data.DataLoader(MSDialog(X_val, y_val), **params)
    testing = data.DataLoader(MSDialog(X_test, y_test), **params)

    # 4) Model, criterion and optimizer
    model = BaseCNN(word2vec, tokenizer.get_pad(), emb_dim=EMB_DIM).to(device)
    optimizer = Adam(model.parameters(),
                     lr=0.001,
                     betas=(0.9, 0.999),
                     eps=1e-08)
    criterion = nn.BCELoss()
    # 5) training process
    threshold = 0.5
    print('Train')
    #     for X, y in training:
    #         X, y = X.to(device), y.to(device)
    #         break
    for ep in range(N_EPOCHS):
        if ep == 10:
            optimizer = Adam(model.parameters(),
                             lr=0.0001,
                             betas=(0.9, 0.999),
                             eps=1e-08)
        print(f'epoch: {ep}')
        #         j = 0
        #         # model.train()
        #         losses = []
        #         for i in range(50):
        #             optimizer.zero_grad()

        #             output = model(X)
        #             loss = torch.tensor(0.0).to(output)
        #             for i in range(output.shape[1]):
        #                 criterion = nn.BCELoss()
        #                 loss += criterion(output[:, i].unsqueeze(1), y[:, i].unsqueeze(1).to(torch.float32))
        #             losses.append(float(loss.cpu())/output.shape[1])
        #             loss.backward()
        #             optimizer.step()

        #             # print(f'iter: {j}, loss: {loss}')
        #             j += 1
        #         print(f'train loss={np.mean(losses)}')

        j = 0
        model.train()
        losses = []
        for X, y in training:
            optimizer.zero_grad()
            X, y = X.to(device), y.to(device)
            output = model(X)
            loss = torch.tensor(0.0).to(output)
            for i in range(output.shape[1]):
                criterion = nn.BCELoss()
                loss += criterion(output[:, i].unsqueeze(1),
                                  y[:, i].unsqueeze(1).to(torch.float32))
            loss.backward()
            losses.append(float(loss.cpu()))
            optimizer.step()

            # print(f'iter: {j}, loss: {loss}')
            j += 1
        print(f'train loss={np.mean(losses)}')
        with torch.no_grad():
            model.eval()
            # print('EVALUATION________')
            losses = []
            f1_scores = []
            precisions = []
            recalls = []
            accuracies = []
            for X, y in validation:
                criterion = nn.MultiLabelSoftMarginLoss()

                X, y = X.to(device), y.to(device)

                output = model(X)
                loss = torch.tensor(0.0).to(output)
                for i in range(output.shape[1]):
                    criterion = nn.BCELoss()
                    loss += criterion(output[:, i].unsqueeze(1),
                                      y[:, i].unsqueeze(1).to(torch.float32))
                losses.append(float(loss.cpu()))
                output = output.cpu().numpy()
                for i in range(len(output)):
                    pred = output[i] > threshold
                    if sum(pred) == 0:
                        pred = output[i].max(axis=0, keepdims=1) == output[i]
                    output[i] = pred
                precisions.append(get_f1(y, output)[0])
                recalls.append(get_f1(y, output)[1])
                f1_scores.append(get_f1(y, output)[2])
                accuracies.append(get_accuracy(y, output))

            print('VAL:')
            print(f'val_loss={np.mean(losses)}')
            print(f'accuracy={np.mean(accuracies)}')
            print(f'precision={np.mean(precisions)}')
            print(f'recall={np.mean(recalls)}')
            print(f'f1-score={np.mean(f1_scores)}')

            print('__________________')
    torch.save(model.state_dict(), SAVE_PATH)
Example #37
                    default=False,
                    dest='double_cycle',
                    help='Use double cycle')
ap.add_argument('-man', type=str, default=None)
ap.add_argument('-woman', type=str, default=None)
ap.add_argument('-king', type=str, default=None)

args = vars(ap.parse_args())

color = args['color']
word = args['word']
man = args['man']
woman = args['woman']
king = args['king']
double_cycle = args['double_cycle']
word_vectors = KeyedVectors.load_word2vec_format(args['vectors'], binary=False, unicode_errors='ignore')
print("vectors loaded")


def solid_shape(dim:int, color:str, word:str):
    """Draws a solid shape based on word for given vector . Better with 50 dimensions or less."""
    se = word_vectors[word]
    r_s = []
    thetas = []
    for x in range(0, dim):
        thetas.append(2 * np.pi * x / dim)
        r_s.append(se[x])
    thetas.append(2 * np.pi * 0 / dim)
    r_s.append(se[0])

    data = [go.Scatterpolar(r=r_s, theta=thetas, thetaunit="radians", mode='lines', marker=dict(color='peru'),
Example #38
def main():
    # Read the command line arguments
    config_file = 'config/story_config.ini'

    switch = ['server_pmr_clf', 'testing', '2_17']

    parser = argparse.ArgumentParser()
    config = ConfigParser()
    config.read(config_file)

    parser.add_argument('--ref_file',
                        default='result/ref_' + switch[0] + '_' + switch[2] +
                        '.txt',
                        help='self_test')
    parser.add_argument('--hypo_file',
                        default='result/hypo_' + switch[0] + '_' + switch[2] +
                        '.txt',
                        help='self_test')
    parser.add_argument('--self_test', default=False, help='self_test')
    parser.add_argument('--test_story', default=False, help='self_test')
    parser.add_argument('--config_file',
                        default=config_file,
                        type=str,
                        help='Select cuda number')
    parser.add_argument('--switch',
                        default=switch,
                        type=str,
                        help='Select cuda number')
    parser.add_argument('--use_cuda',
                        default=config.getboolean(switch[0], 'use_cuda'),
                        type=str,
                        help='Select cuda number')
    parser.add_argument('--device',
                        default=config.get(switch[0], 'device'),
                        type=str,
                        help='Select cuda number')
    parser.add_argument('--gpu_para',
                        action='store_true',
                        default=config.getboolean(switch[0], 'gpu_para'),
                        help='Whether load checkpoint')  # gpu parallel

    parser.add_argument('--log_path',
                        default=config.get(switch[0], 'log_path').format(
                            switch[1], switch[0], switch[2]),
                        type=str,
                        required=False,
                        help='where to store the training log')
    parser.add_argument(
        '--data_path',
        default=config.get(switch[0], 'data_path'),
        help='load data file path'
    )  # train_sen_char_idx / train_gpt2_idx_12_24 / train_plutchik_12_26
    parser.add_argument('--raw_data_path',
                        default=config.get(switch[0], 'raw_data_path'),
                        help='load data file path')
    parser.add_argument('--num_epochs',
                        type=int,
                        default=config.getint(switch[0], 'num_epochs'),
                        help='num_epochs')
    parser.add_argument('--seed',
                        type=int,
                        default=config.getint(switch[0], 'seed'),
                        help='random seed, so that training results are deterministic')  # None
    parser.add_argument('--batch_size',
                        type=int,
                        default=config.getint(switch[0], 'batch_size'),
                        help='number of batch_size')  # batch_size
    parser.add_argument('--num_workers',
                        type=int,
                        default=config.getint(switch[0], 'num_workers'),
                        help='number of workers')
    parser.add_argument('--lr',
                        type=float,
                        default=config.getfloat(switch[0], 'lr'),
                        help='size of learning rate')
    parser.add_argument('--dropout',
                        type=float,
                        default=config.getfloat(switch[0], 'dropout'),
                        help='size of dropout')
    parser.add_argument('--max_grad_norm',
                        type=float,
                        default=config.getfloat(switch[0], 'max_grad_norm'),
                        help='size of dropout')  # 1. / 5.
    parser.add_argument('--embedding_dim',
                        type=int,
                        default=config.getint(switch[0], 'embedding_dim'),
                        help='embedding_dim')  # 128 / 768 / 50 / 300
    parser.add_argument('--hidden_size',
                        type=int,
                        default=config.getint(switch[0], 'hidden_size'),
                        help='hidden_size')
    parser.add_argument('--max_oovs',
                        type=int,
                        default=config.getint(switch[0], 'max_oovs'),
                        help='number of max_oovs')
    parser.add_argument('--char_num',
                        type=int,
                        default=config.getint(switch[0], 'char_num'),
                        help='number of character')
    parser.add_argument('--pmr_size',
                        type=int,
                        default=config.getint(switch[0], 'pmr_size'),
                        help='number of pmr')
    parser.add_argument('--p_size',
                        type=int,
                        default=config.getint(switch[0], 'p_size'),
                        help='number of plutchik')
    parser.add_argument('--m_size',
                        type=int,
                        default=config.getint(switch[0], 'm_size'),
                        help='number of maslow')
    parser.add_argument('--r_size',
                        type=int,
                        default=config.getint(switch[0], 'r_size'),
                        help='number of reiss')
    parser.add_argument('--embed',
                        default=config.get(switch[0], 'embed'),
                        help='Select 50d or 300d embedding file path')
    parser.add_argument('--word_dict',
                        default=config.get(switch[0], 'word_dict'),
                        help='Select word_dict file path')
    parser.add_argument('--glove',
                        action='store_true',
                        default=config.getboolean(switch[0], 'glove'),
                        help='Whether use glove')

    # model
    parser.add_argument('--opt',
                        action='store_true',
                        default=config.getboolean(switch[0], 'opt'),
                        help='Select Adam or SGD optimizer. True is Adam')
    parser.add_argument('--gpt2',
                        action='store_true',
                        default=config.getboolean(switch[0], 'gpt2'),
                        help='Whether use gpt2')
    parser.add_argument('--bigru',
                        action='store_true',
                        default=config.getboolean(switch[0], 'bigru'),
                        help='Whether use bigru')
    parser.add_argument('--bilstm',
                        action='store_true',
                        default=config.getboolean(switch[0], 'bilstm'),
                        help='Whether use bilstm')
    parser.add_argument('--gate',
                        action='store_true',
                        default=config.getboolean(switch[0], 'gate'),
                        help='Whether use gate mechanism')
    parser.add_argument('--copy',
                        action='store_true',
                        default=config.getboolean(switch[0], 'copy'),
                        help='Whether use copy mechanism')
    parser.add_argument('--teacher_force',
                        action='store_true',
                        default=config.getboolean(switch[0], 'teacher_force'),
                        help='Whether use teacher force')

    # pmr & char
    parser.add_argument('--fix_encoder',
                        action='store_true',
                        default=config.getboolean(switch[0], 'fix_encoder'))
    parser.add_argument('--encoder_merge',
                        action='store_true',
                        default=config.getboolean(switch[0], 'encoder_merge'))
    parser.add_argument('--baseline',
                        action='store_true',
                        default=config.getboolean(switch[0], 'baseline'))
    parser.add_argument('--fix_decoder',
                        action='store_true',
                        default=config.getboolean(switch[0], 'fix_decoder'))
    parser.add_argument('--psy_clf',
                        action='store_true',
                        default=config.getboolean(switch[0], 'psy_clf'))
    parser.add_argument('--seq_attn',
                        action='store_true',
                        default=config.getboolean(switch[0], 'seq_attn'))
    parser.add_argument('--context',
                        action='store_true',
                        default=config.getboolean(switch[0], 'context'),
                        help='Whether add context')
    parser.add_argument('--only_plutchik',
                        action='store_true',
                        default=config.getboolean(switch[0], 'only_plutchik'),
                        help='Whether add pmr_input')
    parser.add_argument('--dynamic',
                        action='store_true',
                        default=config.getboolean(switch[0], 'dynamic'),
                        help='Whether add pmr_input')
    parser.add_argument('--pmr_input',
                        action='store_true',
                        default=config.getboolean(switch[0], 'pmr_input'),
                        help='Whether add pmr_input')
    parser.add_argument('--rep_inp_attn',
                        action='store_true',
                        default=config.getboolean(switch[0], 'rep_inp_attn'),
                        help='Whether add rep_inp_attn')
    parser.add_argument('--pmr_attn',
                        action='store_true',
                        default=config.getboolean(switch[0], 'pmr_attn'),
                        help='Whether use pmr_attn')
    parser.add_argument('--char_attn',
                        action='store_true',
                        default=config.getboolean(switch[0], 'char_attn'),
                        help='Whether use char_attn')

    # load & save model
    parser.add_argument('--load_ckpt',
                        action='store_true',
                        default=config.getboolean(switch[0], 'load_ckpt'),
                        help='Whether load checkpoint')  # load checkpoint
    parser.add_argument('--save_ckpt',
                        action='store_true',
                        default=config.getboolean(switch[0], 'save_ckpt'),
                        help='Whether save checkpoint')  # save checkpoint
    parser.add_argument('--load_ckpt_file',
                        default=config.get(switch[0], 'load_ckpt_file').format(
                            switch[2], switch[0]),
                        help='Set checkpoint file path')  # ckpt_path
    parser.add_argument('--ckpt_path',
                        default=config.get(switch[0],
                                           'ckpt_path').format(switch[2]),
                        help='Set checkpoint file path')  # ckpt_path
    parser.add_argument('--ckpt_file',
                        default=config.get(switch[0],
                                           'ckpt_file').format(switch[0]),
                        help='Set checkpoint file name')  # ckpt_file
    args = parser.parse_args()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    # torch.backends.cudnn.deterministic = True

    global logger
    logger = create_logger(args)
    logger.info('start game!')
    logger.info('switch: {}'.format(switch))
    logger.info(args)

    if args.use_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    else:
        device = torch.device('cpu')
    logger.info(device)

    if not args.gpt2:
        # Load word_dict
        with open(args.word_dict) as f:
            for line in f:
                word_dict = json.loads(line)

        if args.glove:
            # inv_dict = {v: k for k, v in word_dict.items()}
            # Load the pretrained GloVe word vectors
            logger.info('loading: {}'.format(args.embed))
            tmp_word2vec = args.embed
            glove_model = KeyedVectors.load_word2vec_format(tmp_word2vec)
            args.embedding_dim = glove_model.vector_size
            embedding_matrix = np.zeros(
                (len(word_dict),
                 args.embedding_dim))  # [vocab_size,embedding_dim]
            for i in range(len(word_dict)):
                embedding_matrix[i, :] = glove_model[glove_model.index2word[i]]
            embedding_matrix = torch.from_numpy(embedding_matrix).float().to(
                device)
        else:
            embedding_matrix = None

        p_np, m_np, r_np = get_pmr(args, word_dict)  # (32, 1)

        vocab_size = len(word_dict)

    # model
    model = PMRClf(args,
                   device,
                   embedding_matrix,
                   vocab_size,
                   word_dict,
                   args.embedding_dim,
                   args.hidden_size,
                   dropout=args.dropout)
    if args.use_cuda and args.gpu_para:
        # model = nn.DataParallel(model, device_ids=[0, 1])  # multi-GPU
        model = nn.DataParallel(model,
                                device_ids=[
                                    int(i) for i in args.device.split(',')
                                ])  # multi-GPU
        torch.backends.cudnn.benchmark = True
    model = model.to(device)

    model.load_state_dict(torch.load(args.load_ckpt_file + '.pkl'))
    logger.info('loading checkpoint file {}'.format(args.load_ckpt_file))

    dataset = MyDataset_clf(args.data_path)

    train_loader, dev_loader = train_test_split(dataset,
                                                test_size=0.1,
                                                random_state=1)
    logger.info("loading {} data".format('dev_loader'))
    inv_dict = {v: k for k, v in word_dict.items()}
    test(args, model, dev_loader, inv_dict, word_dict, device)
#!/usr/bin/python
# -*- coding: utf-8 -*-

from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format('/home/tj/big_data/data/talk/2j3s.vec', binary=False)
model.save_word2vec_format('/home/tj/big_data/data/talk/2j3s.vec.bin', binary=True)
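A quick round-trip check for the conversion above (a minimal sketch; it reuses the two paths from the script and only assumes numpy is available):

import numpy as np
from gensim.models import KeyedVectors

# Reload both files and confirm the first word's vector survived the text -> binary round trip.
text_kv = KeyedVectors.load_word2vec_format('/home/tj/big_data/data/talk/2j3s.vec', binary=False)
bin_kv = KeyedVectors.load_word2vec_format('/home/tj/big_data/data/talk/2j3s.vec.bin', binary=True)
word = text_kv.index2word[0]
assert np.allclose(text_kv[word], bin_kv[word])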

Пример #40
0
 def setUp(self):
     self.vectors = EuclideanKeyedVectors.load_word2vec_format(
         datapath('euclidean_vectors.bin'), binary=True, datatype=np.float64)
"""Script to rewrite Google's word2vec into a format that loads faster."""

import os

from gensim.models import KeyedVectors

if __name__ == "__main__":
    path = os.path.join("data", "GoogleNews-vectors-negative300.bin.gz")
    w2v = KeyedVectors.load_word2vec_format(path, binary=True)
    w2v.init_sims(replace=True)
    w2v.save(path)
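For completeness, a minimal sketch of reading the re-saved model back; KeyedVectors.load() restores the native gensim format written by save() above, which is what makes subsequent loads faster (the path mirrors the script):

import os

from gensim.models import KeyedVectors

if __name__ == "__main__":
    path = os.path.join("data", "GoogleNews-vectors-negative300.bin.gz")
    # load() reads gensim's native format instead of re-parsing the word2vec binary.
    w2v = KeyedVectors.load(path)
    print(w2v.most_similar("king", topn=3))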
Пример #42
0
def load_google_vec():
    import os
    from gensim.models import KeyedVectors

    # Expand "~" explicitly; load_word2vec_format does not expand it on its own.
    return KeyedVectors.load_word2vec_format(
        os.path.expanduser('~/nlp/w2v/GoogleNews-vectors-negative300.bin.gz'), binary=True)
Пример #43
0
from gensim.models import KeyedVectors
en_vectors = KeyedVectors.load_word2vec_format('data/wiki-news-300d-1M.vec', binary=False)

from gensim.models import Word2Vec
vi_vectors = Word2Vec.load('data/vi.bin').wv

# # Note: for a GloVe model, it must first be converted to the word2vec format
# # Example
# from gensim.scripts.glove2word2vec import glove2word2vec
# glove2word2vec('data/glove.6B.50d.txt', 'data/en.vec')

en_vectors.vocab

en_vectors["cat"]

print ("vector size: ", en_vectors.vector_size)
print ("vocab size: ", len(en_vectors.vocab))

print ("vector size: ", vi_vectors.vector_size)
print ("vocab size: ", len(vi_vectors.vocab))

en_vectors.most_similar("cat")


vi_vectors.most_similar("mèo")


sim_words = en_vectors.most_similar(positive=['queen', 'man'], negative=['king'])
print('Queen is a: ', sim_words[0][0])

sim_words = en_vectors.most_similar(negative=['king'], positive=['kings', 'queen'])
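Beyond most_similar, the same objects support pairwise similarity and odd-one-out queries; a small sketch using the English vectors loaded above (the query words are assumed to be in the vocabulary):

# Cosine similarity between two words
print(en_vectors.similarity("cat", "dog"))

# Pick the word that fits least with the others
print(en_vectors.doesnt_match(["breakfast", "cereal", "dinner", "cat"]))

# vi_vectors exposes the same API, as long as the queried words exist in its vocabulary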
Пример #44
0
import numpy as np
import gensim
import pymorphy2
from sklearn.neighbors import KNeighborsClassifier
from functools import lru_cache
from gensim.models import KeyedVectors
import pickle
import os
import re


model = KeyedVectors.load('my_model')
morph = pymorphy2.MorphAnalyzer()
clf_file = 'trained_knn.clf'
clf = None


@lru_cache(maxsize=10000)
def get_normal_form(i):
    return morph.normal_forms(i)[0]


def normalize_text(x):
    return ' '.join([get_normal_form(i) for i in re.findall(r'\w+', x)])


def get_question_vector(question):
    question_vect = np.zeros(300)
    try:
        for word in re.findall(r'\w+', question):
            question_vect += model.wv.__getitem__(word)
Пример #45
0
 def _get_embedding(self, embedding_path):
     model = KeyedVectors.load_word2vec_format(embedding_path)
     vocab = model.vocab
     vocab_len = len(vocab)
     return np.array([model.word_vec(k) for k in vocab.keys()])
Пример #46
0
 def loadGoogleModel(self, file_name):
     self.model = KeyedVectors.load_word2vec_format(file_name, binary=True)
     self.is_w2v = True
Пример #47
0
File: cnn.py Project: wlf061/nlp
def do_keras_textcnn_w2v(text,stars,trainable):

    # Convert the text into bag-of-words sequences
    max_document_length=200

    embedding_dims = 300


    # Load the pretrained word vectors
    model = KeyedVectors.load_word2vec_format(word2vec_file, binary=True)

    print(model['word'].shape)


    # Set the maximum number of tokens, i.e. the size of the bag-of-words vocabulary
    tokenizer = Tokenizer(num_words=max_features,lower=True)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)

    x=pad_sequences(sequences, maxlen=max_document_length)


    # We can use scikit-learn's LabelEncoder class.
    # It learns the encoding needed for the whole dataset via fit(), then applies it with transform() to create a new output variable.
    encoder=LabelEncoder()
    encoder.fit(stars)
    encoded_y = encoder.transform(stars)

    # labels = to_categorical(np.asarray(labels)) would also work for this preprocessing

    # Get the mapping from each word to its integer id
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))


    # Build the embedding matrix for the word vectors
    embedding_matrix = np.zeros((max_features + 1, embedding_dims))

    for word, i in word_index.items():

        # Ignore words whose id exceeds max_features; the dict is in dictionary order, so the ids are not necessarily sequential
        if i > max_features:
            continue

        try:
            embedding_matrix[i] = model[word].reshape(embedding_dims)

        except KeyError:
            print("%s not found!" % word)


    # Build the neural network
    def baseline_model():

        # CNN parameters

        # The number of filters is usually comparable to the text length, which helps feature extraction
        filters = max_document_length

        # Inputs
        input = Input(shape=[max_document_length])

        # Embedding layer: pretrained word2vec vectors are used here, with trainable set to False
        x = Embedding(max_features + 1,
                                    embedding_dims,
                                    weights=[embedding_matrix],
                                    trainable=trainable)(input)



        # conv layers
        convs = []
        for filter_size in [3,4,5]:
            l_conv = Conv1D(filters=filters, kernel_size=filter_size, activation='relu')(x)
            l_pool = MaxPooling1D()(l_conv)
            l_pool = Flatten()(l_pool)
            convs.append(l_pool)

        merge = concatenate(convs, axis=1)

        out = Dropout(0.2)(merge)

        output = Dense(32, activation='relu')(out)

        output = Dense(units=2, activation='softmax')(output)

        # Output layer
        model = Model([input], output)

        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])

        # Visualize the model
        plot_model(model, to_file='yelp-cnn-model-textcnn.png',show_shapes=True)

        model.summary()

        return model
    # To use a Keras model from scikit-learn, it must be wrapped in KerasClassifier, which creates and returns our neural network model.
    # It takes the parameters needed by fit(), such as the number of iterations and the batch size.
    # In the current API the number of training passes is given as epochs.
    clf = KerasClassifier(build_fn=baseline_model, epochs=10, batch_size=50, verbose=1)

    # 5-fold cross-validation
    scores = cross_val_score(clf, x, encoded_y, cv=5, scoring='f1_micro')
    # print scores
    print("f1_micro: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
Пример #48
0
    for filename in glob.glob(os.path.join(negative_path, '*.txt')):
        with open(filename, 'r') as f:
            dataset.append((neg_label, f.read()))

    shuffle(dataset)

    return dataset


# In[2]:


from nltk.tokenize import TreebankWordTokenizer
from gensim.models import KeyedVectors
word_vectors = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True, limit=200000)


def tokenize_and_vectorize(dataset):
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    expected = []
    for sample in dataset:
        tokens = tokenizer.tokenize(sample[1])
        sample_vecs = []
        for token in tokens:
            try:
                sample_vecs.append(word_vectors[token])

            except KeyError:
                pass  # No matching token in the Google w2v vocab
Пример #49
0
def get_model():
    '''
    :return: The downloaded `gensim` KeyedVectors model.'''
    return KeyedVectors.load_word2vec_format(download(), binary=False)
import os
import random
import tensorflow as tf
import numpy as np

from gensim.models import KeyedVectors
from subjectivity.utils import get_data_from_list
from subjectivity.utils import is_objective, is_subjective

_bucket_size = 10
_path = os.path.dirname(__file__)
_saving_dir = os.path.join(_path, '../data/save')
_subjective_filename = os.path.join(
    _path, '../data/subj_dataset/subjective_test.txt')
_objective_filename = os.path.join(_path,
                                   '../data/subj_dataset/objective_test.txt')
_model = KeyedVectors.load_word2vec_format(
    os.path.join(_path, '../data/word_embeddings/glove.6B.50d.txt'))


def count_true_and_false_positives_and_negatives(prediction, expected):
    true_positives = sum([
        prediction[i] == is_subjective and expected[i] == is_subjective
        for i in range(len(expected))
    ])
    false_positives = sum([
        prediction[i] == is_subjective and expected[i] != is_subjective
        for i in range(len(expected))
    ])
    true_negatives = sum([
        prediction[i] != is_subjective and expected[i] != is_subjective
        for i in range(len(expected))
    ])
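The function is cut off here before the false negatives; for reference, a minimal sketch of how precision, recall and F1 could be derived from such counts (the helper name is illustrative, not from the original project):

def precision_recall_f1(true_positives, false_positives, false_negatives):
    # Standard definitions, guarding against zero denominators.
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) else 0.0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return precision, recall, f1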
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
from gensim.models import Word2Vec
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
from sklearn.metrics import confusion_matrix
import itertools
from sklearn.model_selection import train_test_split
import sys

if (len(sys.argv) != 3):
    print("Usage: script.py <full path to w2v> <path to dataset>")
    sys.exit(1)

print("importing word2vec")
wv_from_bin = KeyedVectors.load_word2vec_format(datapath(sys.argv[1]),
                                                binary=True)  # C binary format
print("imported word2vec")

df = pd.read_csv(sys.argv[2],
                 sep=",",
                 index_col=0,
                 header=0,
                 names=["body", "isAdHominem"])

train, test = train_test_split(df, test_size=0.3, random_state=3)

print("In total, the train contains", sum(train["isAdHominem"] == True),
      "ad hominems")
print("In total, the test contains", sum(test["isAdHominem"] == True),
      "ad hominems")
Пример #52
0
#model = Word2Vec.load(r"C:\Users\Colouree\Desktop\Colouree\word2vec.model")
import time
start = time.time()
####from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
####from gensim.scripts.glove2word2vec import glove2word2vec
####
####
#####glove_file = datapath(r'C:\Users\Colouree\Desktop\Colouree\glove.840B.300d.txt')
#####tmp_file = get_tmpfile(r"glove.840B.300d_word2vec.txt")
#####_ = glove2word2vec(glove_file, tmp_file)
#####model = KeyedVectors.load_word2vec_format(tmp_file)
#####model.save(r"C:\Users\Colouree\Desktop\Colouree\word2vec.model")
#model=KeyedVectors.load(r"C:\Users\Colouree\Desktop\Colouree\word2vec.model")
model1 = KeyedVectors.load(
    r"C:\Users\Colouree\Desktop\Colouree\google_word2vec.model")
print("took {} secs to load the model".format(time.time() - start))
#start = time.time()
#def compare_two_words(tag1,tag2):
#    result=model.similarity(tag1, tag2)
#    return result

final_tags = []
for word in keys:
    x = ''
    for ij in word.split():
        if ij in model1.vocab:
            x += ij + ' '
    if not x == '':
        final_tags.append(x)
import pandas as pd
Frequency threshold = 10
2000 iterations
"""

# Word frequency = 10
frequency_threshold = 10

# Exclude stopwords
stop_words = set(stopwords.words('english'))

# Load embedding model
glove_file = datapath('/Users/jonabenja/Desktop/glove.twitter.27B/glove.twitter.27B.200d.txt')
tmp_file = get_tmpfile('test_word2vec.txt')

wordembeddings = glove2word2vec(glove_file, tmp_file)
word_embedding_model = KeyedVectors.load_word2vec_format(tmp_file)

# This model has 200 dimensions so we set the number of features to 200
num_features = 200

# Transform training data to use
filepath = 'data/MELD/train_sent_emo.csv'
dftrain = pd.read_csv(filepath)

dftrain['Utterance'] = dftrain['Utterance'].str.replace("\x92|\x97|\x91|\x93|\x94|\x85", "'")

filepath = './data/MELD/test_sent_emo.csv'
dftest = pd.read_csv(filepath)
dftest['Utterance'] = dftest['Utterance'].str.replace("\x92|\x97|\x91|\x93|\x94|\x85", "'")

training_instances = tokenize_data(dftrain['Utterance'])
Пример #54
0
    pad_id = word_to_token_map["<pad>"]

    # Convert the sentence to tokens; words never seen in the vocabulary are filled with the unk token
    tokens = [word_to_token_map.get(word, unk_id) for word in sentence]

    if len(tokens) < limit_size:  # pad
        tokens.extend([0] * (limit_size - len(tokens)))
    else:  # truncate
        tokens = tokens[:limit_size]

    return tokens


x_data = [convert_text_to_token(sentence) for sentence in x]
x_data = np.array(x_data)
wvmodel = KeyedVectors.load_word2vec_format('word60.vector')
static_embeddings = np.zeros([VOCAB_SIZE, EMBEDDING_SIZE])
for word, token in tqdm(a.items()):

    if word in wvmodel.vocab.keys():
        static_embeddings[token, :] = wvmodel[word]
    elif word == '<pad>':
        static_embeddings[token, :] = np.zeros(EMBEDDING_SIZE)
    else:
        static_embeddings[
            token, :] = 0.2 * np.random.random(EMBEDDING_SIZE) - 0.1

print(static_embeddings.shape)

X_train, X_test, y_train, y_test = train_test_split(x_data, y, test_size=0.3)
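A common next step, not shown in the snippet, is to hand static_embeddings to an embedding layer; a minimal sketch assuming PyTorch is the framework used downstream:

import torch
import torch.nn as nn

# Wrap the pretrained matrix in an embedding layer; freeze=False keeps it trainable.
embedding_layer = nn.Embedding.from_pretrained(
    torch.from_numpy(static_embeddings).float(), freeze=False)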
Пример #55
0
num_dense = np.random.randint(100, 150)
rate_drop_lstm = 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.15 + np.random.rand() * 0.25

act = 'relu'
re_weight = True # whether to re-weight classes to fit the 17.5% share in test set

STAMP = 'lstm_%d_%d_%.2f_%.2f'%(num_lstm, num_dense, rate_drop_lstm, \
        rate_drop_dense)

########################################
## index word vectors
########################################
print('Indexing word vectors')

word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, \
        binary=True)
print('Found %s word vectors of word2vec' % len(word2vec.vocab))

########################################
## process texts in datasets
########################################
print('Processing text dataset')

# The function "text_to_wordlist" is from
# https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    # Convert words to lower case and split them
    text = text.lower().split()
    # Optionally, remove stop words
    if remove_stopwords:
Пример #56
0
# Initialize the word embedding emb(x) with pretrained word vectors (e.g. vectors trained on the Google News dataset, about 100 billion words) and train the model.

from gensim.models import KeyedVectors
modelvec = KeyedVectors.load_word2vec_format("/content/drive/My Drive/2020年度/勉強会/GoogleNews-vectors-negative300.bin", binary=True)

# Fetch the pretrained word vectors
VOCAB_SIZE = len(set(ids)) + 1
EMB_SIZE = 300
weights = np.zeros((VOCAB_SIZE, EMB_SIZE))
words_in_pretrained = 0
for i, word in enumerate(ids.keys()):
  try:
    weights[i] = modelvec[word]
    words_in_pretrained += 1
  except KeyError:
    weights[i] = np.random.normal(scale=0.4, size=(EMB_SIZE,))
weights = torch.from_numpy(weights.astype((np.float32)))

print(f'Words initialized from pretrained vectors: {words_in_pretrained} / {VOCAB_SIZE}')
print(weights.size())

class RNN(nn.Module):
  def __init__(self, vocab_size, emb_size, padding_idx, output_size, hidden_size, num_layers, emb_weights=None, bidirectional=False):
    super().__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.num_directions = bidirectional + 1  # unidirectional: 1, bidirectional: 2
    if emb_weights is not None:  # initialize the embedding layer weights from emb_weights when given
      self.emb = nn.Embedding.from_pretrained(emb_weights, padding_idx=padding_idx)
    else:
      self.emb = nn.Embedding(vocab_size, emb_size, padding_idx=padding_idx)
Пример #57
0
            continue

        X = np.array(X)/np.linalg.norm(X)
        Y = np.array(Y)/np.linalg.norm(Y)
        o = np.dot(X, Y.T)/np.linalg.norm(X)/np.linalg.norm(Y)

        scores.append(o)

    scores = np.asarray(scores)
    return np.mean(scores), 1.96*np.std(scores)/float(len(scores)), np.std(scores)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('ground_truth', help="ground truth text file, one example per line")
    parser.add_argument('predicted', help="predicted text file, one example per line")
    parser.add_argument('embeddings', help="embeddings bin file")
    args = parser.parse_args()

    print("loading embeddings file...")
    w2v = KeyedVectors.load_word2vec_format(args.embeddings, binary=True)

    r = average(args.ground_truth, args.predicted,  w2v)
    print("Embedding Average Score: %f +/- %f ( %f )" %(r[0], r[1], r[2]))

    r = greedy_match(args.ground_truth, args.predicted, w2v)
    print("Greedy Matching Score: %f +/- %f ( %f )" %(r[0], r[1], r[2]))

    r = extrema_score(args.ground_truth, args.predicted, w2v)
    print("Extrema Score: %f +/- %f ( %f )" %(r[0], r[1], r[2]))
Пример #58
0
from gensim.models import KeyedVectors


model = KeyedVectors.load_word2vec_format(
    './data/GoogleNews-vectors-negative300.bin.gz',
    binary=True
)

# Collect the country names
countries = set()
with open('data/analogy_data_add.txt', 'r') as f:
  for line in f:
    line = line.split()
    if line[0] in ['capital-common-countries', 'capital-world']:
      countries.add(line[2])
    elif line[0] in ['currency', 'gram6-nationality-adjective']:
      countries.add(line[1])
countries = list(countries)

# Get the word vectors for the countries
countries_vec = [model[country] for country in countries]

from sklearn.cluster import KMeans
import numpy as np

# k-means clustering
kmeans = KMeans(n_clusters=5)
kmeans.fit(countries_vec)
for i in range(5):
    cluster = np.where(kmeans.labels_ == i)[0]
    print('cluster', i)
from future.utils import iteritems
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future


from gensim.models import KeyedVectors


# warning: takes quite a while
# https://code.google.com/archive/p/word2vec/
# direct link: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing
# 3 million words and phrases
# D = 300
word_vectors = KeyedVectors.load_word2vec_format(
  '../large_files/GoogleNews-vectors-negative300.bin',
  binary=True
)


# convenience
# result looks like:
# [('athens', 0.6001024842262268),
#  ('albert', 0.5729557275772095),
#  ('holmes', 0.569324254989624),
#  ('donnie', 0.5690680742263794),
#  ('italy', 0.5673537254333496),
#  ('toni', 0.5666348338127136),
#  ('spain', 0.5661854147911072),
#  ('jh', 0.5661597847938538),
#  ('pablo', 0.5631559491157532),
#  ('malta', 0.5620371103286743)]
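The "convenience" comment above appears to introduce an analogy helper that was trimmed from the snippet; a minimal sketch of what such a helper could look like (the function name and the example call are illustrative):

def find_analogies(w1, w2, w3):
    # Solves w1 - w2 + w3, e.g. king - man + woman ~ queen.
    results = word_vectors.most_similar(positive=[w1, w3], negative=[w2])
    print("%s - %s = %s - %s" % (w1, w2, results[0][0], w3))


find_analogies('king', 'man', 'woman')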
Пример #60
0
import h5py

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.optimizers import Adam

from keras.layers import Input, LSTM, GlobalMaxPool1D, Dense, Dropout, Embedding
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.models import Model

from gensim.models import KeyedVectors

# %%
word2vec_model = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin', binary=True, limit=1000000)

# %%
FILTERS = 60
MAXLEN = 100
MAX_FEAUTURE = 50000
DROPOUT_RATE = 0.1
DENSE_UNITS = 50
EMBED_SIZE = 128
LIST_CLASSES = [
    "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
]


# %%
def get_tokenizer(texts):