Example #1
def test_embedding():
    from gensim.models import KeyedVectors
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import WordRelatedness
    model_wiki = KeyedVectors.load_word2vec_format(FileIO.filename('models/w2v-model-enwiki_w2vformat'), binary=True)
    model_news = KeyedVectors.load_word2vec_format(FileIO.filename('models/googlenews.bin'), binary=True)
    rel = WordRelatedness(model_news)
    print(rel.word_similarity('happy','sad'))
    def setUp(self):
        self.source_word_vec_file = datapath("EN.1-10.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt")
        self.target_word_vec_file = datapath("IT.1-10.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt")

        self.word_pairs = [("one", "uno"), ("two", "due"), ("three", "tre"),
            ("four", "quattro"), ("five", "cinque"), ("seven", "sette"), ("eight", "otto"),
            ("dog", "cane"), ("pig", "maiale"), ("fish", "cavallo"), ("birds", "uccelli"),
            ("apple", "mela"), ("orange", "arancione"), ("grape", "acino"), ("banana", "banana")
        ]

        self.test_word_pairs = [("ten", "dieci"), ("cat", "gatto")]

        self.source_word_vec = KeyedVectors.load_word2vec_format(self.source_word_vec_file, binary=False)
        self.target_word_vec = KeyedVectors.load_word2vec_format(self.target_word_vec_file, binary=False)
 def __init__(self):
   print("Loading in word vectors...")
   self.word_vectors = KeyedVectors.load_word2vec_format(
     '../large_files/GoogleNews-vectors-negative300.bin',
     binary=True
   )
   print("Finished loading in word vectors")
Example #4
    def load(self, *args, **kwargs) -> KeyedVectors:
        """
        Load dict of embeddings from given file

        Args:
            *args: arguments
            **kwargs: arguments

        Returns:
            KeyedVectors with the loaded embeddings

        """
        # Check that header with n_words emb_dim present
        with open(self.load_path, encoding='utf8') as f:
            header = f.readline()
            if len(header.split()) != 2:
                raise RuntimeError('The GloVe file must start with number_of_words embeddings_dim line! '
                                   'For example "40000 100" for 40000 words vocabulary and 100 embeddings '
                                   'dimension.')

        if self.load_path and self.load_path.is_file():
            log.info("[loading embeddings from `{}`]".format(self.load_path))
            model_file = str(self.load_path)
            model = KeyedVectors.load_word2vec_format(model_file)
        else:
            log.error('No pretrained GloVe model provided or provided load_path "{}" is incorrect.'
                      .format(self.load_path))
            sys.exit(1)

        return model
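The header check above expects the first line of the file to be "<number_of_words> <embedding_dim>". Raw GloVe dumps do not carry that line; a minimal sketch of preparing such a file with gensim's glove2word2vec helper (file paths are placeholders) might look like this:

from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

# Prepend the "n_words dim" header that the loader checks for (paths are illustrative).
glove2word2vec('glove.6B.100d.txt', 'glove.6B.100d.w2v.txt')
vectors = KeyedVectors.load_word2vec_format('glove.6B.100d.w2v.txt')
print(vectors.vector_size)  # e.g. 100 for glove.6B.100d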
Example #5
    def testConversion(self):
        word2vec2tensor(word2vec_model_path=self.datapath, tensor_filename=self.output_folder)

        with smart_open(self.metadata_file, 'rb') as f:
            metadata = f.readlines()

        with smart_open(self.tensor_file, 'rb') as f:
            vectors = f.readlines()

        # check if number of words and vector size in tensor file line up with word2vec
        with smart_open(self.datapath, 'rb') as f:
            first_line = f.readline().strip()

        number_words, vector_size = map(int, first_line.split(b' '))
        self.assertTrue(len(metadata) == len(vectors) == number_words,
            ('Metadata file %s and tensor file %s imply different number of rows.'
                % (self.metadata_file, self.tensor_file)))

        # grab metadata and vectors from written file
        metadata = [word.strip() for word in metadata]
        vectors = [vector.replace(b'\t', b' ') for vector in vectors]

    # get the original vector KV model
        orig_model = KeyedVectors.load_word2vec_format(self.datapath, binary=False)

        # check that the KV model and tensor files have the same values key-wise
        for word, vector in zip(metadata, vectors):
            word_string = word.decode("utf8")
            vector_string = vector.decode("utf8")
            vector_array = np.array(list(map(float, vector_string.split())))
            np.testing.assert_almost_equal(orig_model[word_string], vector_array, decimal=5)
Example #6
def get_model():
    """
    Download model

    :return: `gensim` model
    """
    return KeyedVectors.load_word2vec_format(_download(), binary=True)
Example #7
    def testAnnoyIndexingOfKeyedVectors(self):
        from gensim.similarities.index import AnnoyIndexer
        keyVectors_file = datapath('lee_fasttext.vec')
        model = KeyedVectors.load_word2vec_format(keyVectors_file)
        index = AnnoyIndexer(model, 10)

        self.assertEqual(index.num_trees, 10)
        self.assertVectorIsSimilarToItself(model, index)
        self.assertApproxNeighborsMatchExact(model, model, index)
Example #8
 def load_embeddings(self, file_path):
     # Embeddings must be in fastText format, either binary (.bin) or word2vec text
     print('Loading embeddings...')
     if file_path.endswith('.bin'):
         from gensim.models.wrappers import FastText
         embeddings = FastText.load_fasttext_format(file_path)
     else:
         from gensim.models import KeyedVectors
         embeddings = KeyedVectors.load_word2vec_format(file_path)
     return embeddings
def load_word2vec_embeddings(filepath, tokenizer, max_features, embedding_size):
    model = KeyedVectors.load_word2vec_format(filepath, binary=True)

    emb_mean, emb_std = model.wv.syn0.mean(), model.wv.syn0.std()

    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embedding_size))
    for word, i in word_index.items():
        if i >= max_features:
            continue
        try:
            embedding_vector = model[word]
            embedding_matrix[i] = embedding_vector
        except KeyError:
            continue
    return embedding_matrix
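The matrix returned above is typically handed to a Keras Embedding layer as frozen initial weights. A minimal sketch follows; the toy corpus, the sizes and the embedding file path are assumptions, not part of the snippet:

from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer

texts = ["the cat sat on the mat", "dogs chase cats"]   # toy corpus
MAX_FEATURES, EMBED_DIM = 20000, 300                    # illustrative sizes

tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(texts)

# 'GoogleNews-vectors-negative300.bin' stands in for any binary word2vec file.
embedding_matrix = load_word2vec_embeddings('GoogleNews-vectors-negative300.bin',
                                             tokenizer, MAX_FEATURES, EMBED_DIM)
embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
                            output_dim=EMBED_DIM,
                            weights=[embedding_matrix],
                            trainable=False)            # keep the pretrained vectors frozen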
Example #10
    def load(cls, np2vec_model_file, binary=False, word_ngrams=0):
        """
        Load the np2vec model.

        Args:
            np2vec_model_file (str): the file containing the np2vec model to load
            binary (bool): boolean indicating whether the np2vec model to load is in binary format
            word_ngrams (int {1,0}): If 1, np2vec model to load uses word vectors with subword (
            ngrams) information.

        Returns:
            np2vec model to load
        """
        if word_ngrams == 0:
            return KeyedVectors.load_word2vec_format(
                np2vec_model_file, binary=binary)
        elif word_ngrams == 1:
            return FastText.load(np2vec_model_file)
        else:
            logger.error('invalid value for \'word_ngrams\'')
def wv(w1, w2, t):
    # lazy load the wordvector model...
    global wvmodel
    if wvmodel is None:
        print(' *', 'loading wordvector model (', modelFile, ')...')
        wvmodel = KeyedVectors.load_word2vec_format(modelFile, binary=False)
        wvmodel.init_sims(replace=True)  # no more updates, prune memory

    try:
        #
        # since we've got wordnet synset objects (like cat.n.01), we
        # must turn this back into a regular word ('cat') because the
        # word vector GloVe models are plain words with spaces turned
        # into hyphens on phrases (e.g. climate-change, black-and-white)
        #
        wv_w1, wv_w2 = _mk_wv_word(w1), _mk_wv_word(w2)
        distance = wvmodel.similarity(wv_w1, wv_w2)
        return distance if abs(distance) >= t else 0
    except:
        return 0
Example #12
def load_embeddings(pytorch_embedding, word2idx, filename, embedding_size):
    print("Copying pretrained word embeddings from ", filename, flush=True)
    en_model = KeyedVectors.load_word2vec_format(filename)
    """ Fetching all of the words in the vocabulary. """
    pretrained_words = set()
    for word in en_model.vocab:
        pretrained_words.add(word)

    arr = [0] * len(word2idx)
    for word in word2idx:
        index = word2idx[word]
        if word in pretrained_words:
            arr[index] = en_model[word]
        else:
            arr[index] = np.random.uniform(-1.0, 1.0, embedding_size)

    """ Creating a numpy dictionary for the index -> embedding mapping """
    arr = np.array(arr)
    """ Add the word embeddings to the empty PyTorch Embedding object """
    pytorch_embedding.weight.data.copy_(torch.from_numpy(arr))
    return pytorch_embedding
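A minimal sketch of how this helper might be called with a freshly constructed nn.Embedding; the toy vocabulary and the embedding file path are assumptions:

import torch.nn as nn

word2idx = {"the": 0, "cat": 1, "sat": 2}          # toy vocabulary
embedding = nn.Embedding(len(word2idx), 300)       # 300 must match the vectors' dimensionality
embedding = load_embeddings(embedding, word2idx,
                            "embeddings.w2v.txt",  # word2vec text file, placeholder path
                            embedding_size=300)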
Example #13
    def fit(self, X, y=None):

        dw_params = self.get_params()
        print(dw_params)

        if False: #exists(self.output_file):
            model = KeyedVectors.load_word2vec_format(self.output_file)
        else:
            model = run_gensim(dw_params)
        nb_vecs = len(model.wv.vocab)

        # Map nodes to their features (note: assumes nodes are labeled as integers 1:N) 
        features_matrix = np.asarray([model[str(node)] for node in range(nb_vecs)])
        #features_matrix = np.random.randn((4,2))

        if self.normalize:
            norms = np.linalg.norm(features_matrix, axis=1)
            if self.verbose:
                print(norms)
                print(norms.shape)

            assert norms.shape[0] == features_matrix.shape[0]
            for i in range(features_matrix.shape[0]):
                features_matrix[i,:] /= norms[i]

            norms = np.linalg.norm(features_matrix, axis=1)
            if self.verbose:
                print(norms)

        if self.verbose:
            print('features_matrix.shape = %s' % str(features_matrix.shape))

        self.dw_params_ = dw_params
        self.gs_model_ = model
        self.features_matrix_ = features_matrix
        print('fit', self.features_matrix_.shape)
        return self
Example #14
def eval_blogcat(embeddings_file, labels_matrix=None, G=None,
                 verbose=1, normalize=1, training_percents=[0.1, 0.6, 0.9]):

    # 0. Files
    #embeddings_file = "/mnt/raid1/deepwalk/blogcatalog.vec"
    if labels_matrix is None and G is None:
        G, labels_matrix = load_blogcat()
    
    # 1. Load Embeddings
    model = KeyedVectors.load_word2vec_format(embeddings_file, binary=False)

    labels = np.argwhere(labels_matrix)
    label_cnts = pd.Series(labels[:,1]).value_counts()

    if verbose > 1:
        print('\nLabel counts:')
        print(label_cnts)

    # delete the least frequent labels, which causes balancing problems
    labels_matrix = labels_matrix[:, :-2]

    # Map nodes to their features (note: assumes nodes are labeled as integers 1:N) 
    features_matrix = np.asarray([model[str(node)] for node in range(len(G))])

    if normalize:
        norms = np.linalg.norm(features_matrix, axis=1)
        if verbose:
            print(norms)
            print(norms.shape)

        assert norms.shape[0] == features_matrix.shape[0]
        for i in range(features_matrix.shape[0]):
            features_matrix[i,:] /= norms[i]

        norms = np.linalg.norm(features_matrix, axis=1)
        if verbose:
            print(norms)

    if verbose:
        print('-'*100)
        print(embeddings_file)
        print('features_matrix.shape = %s' % str(features_matrix.shape))
        print('labels_matrix.shape   = %s' % str(labels_matrix.shape))

    # 2. Shuffle, to create train/test groups
    shuffles = []
    number_shuffles = 1
    for x in range(number_shuffles):
        # if we just have one group, make the split the same every time
        if number_shuffles == 1:
            shuffles.append(skshuffle(features_matrix, labels_matrix, random_state=123))
        else:
            shuffles.append(skshuffle(features_matrix, labels_matrix))

    # 3. to score each train/test group
    all_results = defaultdict(list)

    # uncomment for all training percents
    #training_percents = np.asarray(range(1,10))*.1
    for train_percent in training_percents:
        # print('-'*100)
        # print('pct_train: %.2f' % train_percent)

        for shuf in shuffles:
            X, y = shuf
            training_size = int(train_percent * X.shape[0])

            X_train = X[:training_size, :]
            y_train = y[:training_size]
            X_test = X[training_size:, :]
            y_test = y[training_size:]

            clf = TopKRanker(LogisticRegression())
            clf.fit(X_train, y_train)

            # find out how many labels should be predicted
            #top_k_list = [len(l) for l in y_test]
            top_k_list = np.array(np.sum(y_test, axis=1).flatten()[0])[0].astype(np.int32)
            preds = clf.predict(X_test, top_k_list)

            if y_test.shape[1] != preds.shape[1]:
                raise Exception("imbalance of class dims")
                #continue
            
            results = OrderedDict()
            averages = ["micro", "macro", "samples", "weighted"]
            for average in averages:
                results[average] = f1_score(y_test, preds, average=average)

            all_results[train_percent].append(results)
            #break

    if verbose:
        print('-------------------')
        for train_percent in sorted(all_results.keys()):
            print('Train percent:', train_percent)
            for x in all_results[train_percent]:
                print(x)
            print('-------------------')
    return all_results
Example #15
print("creating word sequences...")
ws, ys = [], []
fin = open(INPUT_FILE, "r")
for line in fin:
    label, sent = line.strip().split("\t")
    ys.append(int(label))
    words = [x.lower() for x in nltk.word_tokenize(sent)]
    wids = [word2index[word] for word in words]
    ws.append(wids)
fin.close()
W = pad_sequences(ws, maxlen=maxlen)
Y = np_utils.to_categorical(ys)

# load word2vec vectors
print("loading word2vec vectors...")
word2vec = KeyedVectors.load_word2vec_format(WORD2VEC_MODEL, binary=True)

print("transferring embeddings...")
X = np.zeros((W.shape[0], EMBED_SIZE))
for i in range(W.shape[0]):
    E = np.zeros((EMBED_SIZE, maxlen))
    words = [index2word[wid] for wid in W[i].tolist()]
    for j in range(maxlen):
        try:
            E[:, j] = word2vec[words[j]]
        except KeyError:
            pass
    X[i, :] = np.sum(E, axis=1)
   
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3, 
                                                random_state=42)
Example #16
def get_word_embeddings():
    word_embeddings = KeyedVectors.load_word2vec_format("wiki-news-300d-1M-subword.vec")
    word_embeddings.init_sims(replace=True)
    return word_embeddings
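init_sims(replace=True) L2-normalizes the vectors in place, so cosine similarity reduces to a dot product. A quick check (the probe word is an assumption):

import numpy as np

word_embeddings = get_word_embeddings()
vec = word_embeddings["king"]      # any in-vocabulary word
print(np.linalg.norm(vec))         # ~1.0 once init_sims(replace=True) has run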
Example #17
""" Main function. """

from util import is_word, load_model_embedding, info_from_line, log, get_tensor, repack_tensors
from data import InputData
from nn import CNN

import time
from gensim.models import KeyedVectors
from functools import partial
import tensorflow
from tensorflow.python import debug

def filter_1(string):
    return is_word(string)

MODEL = KeyedVectors.load_word2vec_format("../data/GoogleNews-vectors-negative300.bin",
                                          binary=True)
    
def readFile(path):
    load_model = partial(load_model_embedding, model=MODEL)
    inp = InputData(path, info_from_line, load_model, [filter_1])
    return inp

def print_info_map(map_name, map_data, top_k, file):
    with open(file, 'w') as f:
        for k, v in sorted(map_data.items(), key=lambda x: x[1], reverse=True)[0:top_k]:
            f.write("'%s'\t%d\n" % (k, v))

def create_file_info(path, prefix="", save_file_pref="", data_f=None, model_f=None):
    inp = readFile(path)

    all_words = {}
Example #18
def get_model():
    '''
    :return: the downloaded `gensim` model.'''
    return KeyedVectors.load_word2vec_format(download(), binary=False)
Example #19
 def _get_embedding(self, embedding_path):
     model = KeyedVectors.load_word2vec_format(embedding_path)
     vocab = model.vocab
     vocab_len = len(vocab)
     return np.array([model.word_vec(k) for k in vocab.keys()])
Example #20
                    nargs="+",
                    help='location of json file with definitions.')

parser.add_argument('--save',
                    type=str,
                    required=True,
                    nargs="+",
                    help='where to save files')

parser.add_argument("--w2v",
                    type=str,
                    required=True,
                    help="location of binary w2v file")
args = parser.parse_args()

if len(args.defs) != len(args.save):
    parser.error("Number of defs files must match number of save locations")

word_vectors = KeyedVectors.load_word2vec_format(args.w2v, binary=True)
for i in range(len(args.defs)):
    vectors = []
    with open(args.defs[i], "r") as infile:
        definitions = json.load(infile)
    for elem in definitions:
        if elem[0][0] in word_vectors:
            vectors.append(word_vectors[elem[0][0]])
        else:
            vectors.append(np.zeros(word_vectors.vector_size))
    vectors = np.array(vectors)
    np.save(args.save[i], vectors)
Example #21
from gensim.models import KeyedVectors
from mysite.settings import BASE_DIR
from pathlib import Path
import random, re
import CaboCha
import sys
sys.path.append(str(Path(BASE_DIR).joinpath('handaioh_NLP/utils/').resolve()))
# from Spotlight_return import Spotlight_return

data_path = str(
    Path(BASE_DIR).joinpath(
        'handaioh_NLP/utils/data/word2vec.300d.ja.txt').resolve())
model = KeyedVectors.load_word2vec_format(data_path)

# Based on the IREX named-entity classification
repl_align = {
    'ORGANIZATION': 'どこの機関',
    'PERSON': '誰',
    'LOCATION': 'どこの場所',
    'DATE': 'いつ',
    'TIME': 'いつ',
    'MONEY': 'いくら',
    'PERCENT': 'どのくらい',
    'ARTIFACT': '何',
    'O': '何',
}


class Chunk:
    def __init__(self):
        self.words = []
Example #22
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# Input file
glove_file = r'E:/Study/Codings/python_work/nlp_pro1/word2vec_model/glove.6B/glove.6B.300d.txt'
# Output file
tmp_file = r'E:/Study/Codings/python_work/nlp_pro1/word2vec_model/glove_vec/glove.6B.300d.txt'

# Command-line equivalent
# python -m gensim.scripts.glove2word2vec --input <glove_file> --output <w2v_file>

# Run the conversion
glove2word2vec(glove_file, tmp_file)

# Load the converted file
model = KeyedVectors.load_word2vec_format(tmp_file)
Example #23
 def __loadModel(self):
     print("Loading model '{0}'...".format(self.__modelPath))
     self.__model = KeyedVectors.load_word2vec_format(self.__modelPath,
                                                      binary=True)
     print("Loaded!")
Example #24
def w2v_export(embedding_file):
    try:
        model = KeyedVectors.load(embedding_file)
    except Exception:
        model = KeyedVectors.load_word2vec_format(embedding_file)
Example #25
def normalize_word(embedding_file):
    # Normalize the word vectors
    try:
        model = KeyedVectors.load(embedding_file)
    except Exception:
        model = KeyedVectors.load_word2vec_format(embedding_file)
Example #26
from gensim.models import KeyedVectors
import time

# Get the time at the beginning of the load
start_time = time.time()
print(time.ctime(start_time))

# Load the model file
loaded_model = KeyedVectors.load_word2vec_format(
    '~/Documents/glove_word2vec/word2vec.840B.300d.txt')

# Get the time at the end of the load and calculate how long it took
end_time = time.time()
print(time.ctime(end_time))
elapsed_time = end_time - start_time

print('Loaded model file in ' + str(elapsed_time / 60.0) + ' minutes')

# Get most similar words to "day"
word1 = loaded_model.get_vector('day')
# print(word1)
print(loaded_model.most_similar(positive=['day']))
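Parsing a text file this large on every run is slow. One option (a sketch, not part of the snippet above) is to save the parsed vectors once in gensim's native format and memory-map them on later runs:

# One-time conversion, then near-instant reloads via mmap (paths are placeholders).
loaded_model.save('word2vec.840B.300d.kv')
fast_model = KeyedVectors.load('word2vec.840B.300d.kv', mmap='r')
print(fast_model.most_similar(positive=['day'])[:3])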
Example #27
from gensim.models import KeyedVectors
en_vectors = KeyedVectors.load_word2vec_format('data/wiki-news-300d-1M.vec', binary=False)

from gensim.models import Word2Vec
vi_vectors = Word2Vec.load('data/vi.bin').wv

# # Note: for GloVe models, convert them to word2vec format first
# # For example
# from gensim.scripts.glove2word2vec import glove2word2vec
# glove2word2vec('data/glove.6B.50d.txt', 'data/en.vec')

en_vectors.vocab

en_vectors["cat"]

print ("vector size: ", en_vectors.vector_size)
print ("vocab size: ", len(en_vectors.vocab))

print ("vector size: ", vi_vectors.vector_size)
print ("vocab size: ", len(vi_vectors.vocab))

en_vectors.most_similar("cat")


vi_vectors.most_similar("mèo")


sim_words = en_vectors.most_similar(positive=['queen', 'man'], negative=['king'])
print('Queen is a: ', sim_words[0][0])

sim_words = en_vectors.most_similar(negative=['king'], positive=['kings', 'queen'])
Example #28
from tensorflow.python.keras.saving import load_model
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding, LSTM, Bidirectional
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.optimizers import Adam
# Split into training and test samples
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau

# Ignore warnings
warnings.filterwarnings("ignore")
# Load the pre-trained word vectors for a quick test
cn_model = KeyedVectors.load_word2vec_format('sgns.zhihu.bigram',
                                             binary=False,
                                             unicode_errors="ignore")
# print(cn_model.similarity('橘子', '橙子'))
# print(cn_model.most_similar(positive=['大学'], topn=10))

# Our dataset has 4000 reviews
# Use only the first 50,000 Chinese words for now; in production all of them can be used
num_words = 50000
# Embedding dimension; matches the dimensionality of sgns.zhihu.bigram
embedding_dim = 300
# Maximum input length, i.e. the number of tokens kept per review
max_tokens = 236
# Checkpoint file for the model weights
path_checkpoint = 'checkpoint.h5'
# Build the model
model = Sequential()
Example #29
File: cnn.py Project: wlf061/nlp
def do_keras_textcnn_w2v(text,stars,trainable):

    # Convert the texts to bag-of-words sequences
    max_document_length=200

    embedding_dims = 300


    # Load the pre-trained word vectors
    model = KeyedVectors.load_word2vec_format(word2vec_file, binary=True)

    print(model['word'].shape)


    # Maximum number of tokens to keep, i.e. the vocabulary size of the bag of words
    tokenizer = Tokenizer(num_words=max_features,lower=True)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)

    x=pad_sequences(sequences, maxlen=max_document_length)


    # We can use scikit-learn's LabelEncoder class.
    # It learns the encoding needed for the dataset via fit(), then transform() applies it to create a new output variable.
    encoder = LabelEncoder()
    encoder.fit(stars)
    encoded_y = encoder.transform(stars)

    # labels = to_categorical(np.asarray(labels)) would also work for this step

    # Get the mapping from word to integer index
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))


    # Build the embedding matrix mapping word indices to vectors
    embedding_matrix = np.zeros((max_features + 1, embedding_dims))

    for word, i in word_index.items():

        # Skip indices greater than max_features; the dict is keyed by word, so indices are not necessarily in order
        if i > max_features:
            continue

        try:
            embedding_matrix[i] = model[word].reshape(embedding_dims)

        except:
            print "%s not found!" % (word)


    # Build the neural network
    def baseline_model():

        # CNN parameters

        # The number of filters is usually comparable to the text length, which helps feature extraction
        filters = max_document_length

        # Inputs
        input = Input(shape=[max_document_length])

        # Embedding layer: uses the pre-trained word2vec vectors; pass trainable=False to freeze them
        x = Embedding(max_features + 1,
                                    embedding_dims,
                                    weights=[embedding_matrix],
                                    trainable=trainable)(input)



        # conv layers
        convs = []
        for filter_size in [3,4,5]:
            l_conv = Conv1D(filters=filters, kernel_size=filter_size, activation='relu')(x)
            l_pool = MaxPooling1D()(l_conv)
            l_pool = Flatten()(l_pool)
            convs.append(l_pool)

        merge = concatenate(convs, axis=1)

        out = Dropout(0.2)(merge)

        output = Dense(32, activation='relu')(out)

        output = Dense(units=2, activation='softmax')(output)

        # Output layer
        model = Model([input], output)

        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])

        # Visualize the architecture
        plot_model(model, to_file='yelp-cnn-model-textcnn.png',show_shapes=True)

        model.summary()

        return model
    # To use a Keras model in scikit-learn we must wrap it with KerasClassifier, which creates and returns our network.
    # It accepts the arguments needed by fit(), such as the number of iterations and the batch size.
    # In the latest interface the number of training passes is specified as `epochs`.
    clf = KerasClassifier(build_fn=baseline_model, epochs=10, batch_size=50, verbose=1)

    # 5-fold cross-validation
    scores = cross_val_score(clf, x, encoded_y, cv=5, scoring='f1_micro')
    # print scores
    print("f1_micro: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
Example #30
def load_model():
    # load the pre-trained word2vec model (here we are using a model
    # pre-trained)
    global model
    filename = 'glove.6B.100d.txt.word2vec'
    model = KeyedVectors.load_word2vec_format(filename, binary=False)
Example #31
import os
import json
from gensim.models import KeyedVectors



print(' *', 'loading wv model')

modelFile = os.environ['HOME'] + "/models/" + "glove.6B.300d_word2vec.txt"

model = KeyedVectors.load_word2vec_format(modelFile, binary=False)
print(' *', 'model ready')

w1 = 'nostalgia'
w2 = 'memory'
print(' *', w1, w2, 'similarity:', model.similarity(w1, w2))

for w in ['nostalgia', 'blurred', 'figurative_art', 'erotic', 'voyeurism']:
    if w in model:
        print(' *', w, model.most_similar(positive=[w]))


words = set(["contemporary conceptualism", "appropriation", "contemporary participation", "colombian", "color photography", "american", "figurative art", "language", "abstract versus figurative art", "consumerism", "art that plays with scale", "architecture in art", "korean", "assemblage", "calarts", "collage", "1980s", "biomorphic", "collective history", "found objects", "grotesque", "cut/ripped", "decay", "united states", "flatness", "group of objects", "china", "chinese", "graffiti", "street art", "graffiti/street art", "color theory", "abstract sculpture", "art in art", "film/video", "singaporean", "cinematic", "brazil", "abstract", "brazilian", "'85 new wave", "city scenes", "drawing", "cultural commentary", "endurance art", "feminism", "bedrooms and bathrooms", "canadian", "columns and totems", "architecture's effects", "close-up", "1918 - 1939", "documentary photography", "black-and-white photography", "italian", "monochromatic", "gender", "globalization", "outdoor art", "mixed-media", "mexican", "mexico", "1990s", "ceramic", "animals", "artists' books", "1970s", "contemporary fact versus fiction", "art and technology", "installation art", "erased and obscured", "erotic", "contemporary grotesque", "etching/engraving", "abstract painting", "photoconceptualism", "bright/vivid", "abstract photography", "dark", "focus on materials", "contemporary traces of memory", "miniature and small-scale paintings", "conceptual", "photography", "japanese", "japan", "dutch", "contemporary vintage photography", "comic",
             "calligraphic", "belgium", "belgian", "contemporary surrealistic", "animation", "1960s", "collecting and modes of display", "cityscapes", "chance", "spain", "spanish", "black and white", "americana", "indian", "contemporary graphic realism", "conflict", "malaysian", "caricature / parody", "cross-cultural dialogue", "neo-conceptualism", "advertising and brands", "vietnamese", "australia and new zealand", "figurative painting", "central america", "el salvador", "food", "german-american", "germany", "puerto rican", "allover composition", "southern cone", "isolation", "sexual identity", "argentinean", "antiquity as subject", "contemporary archaeological", "human figure", "nude", "contemporary pop", "british", "indonesian", "anthropomorphism", "celebrity", "pakistani", "digital culture", "political", "violence", "social action", "contemporary diy", "narrative", "design", "architecture", "hard-edged", "minimalism", "flora", "chicano art", "crime", "color gradient", "contemporary color fields", "childhood", "suburbia", "blurred", "mexican american", "artist as ethnographer", "venezuelan", "humor", "figurative sculpture", "allegory", "focus on the social margins", "neo-concretism", "cuban", "myth/religion", "immersive", "modern", "pakistani-american", "angular", "costa rican", "abstract landscape", "body art", "performance art", "abject art", "light and space movement", "line, form and color", "classical mythology", "sculpture", "work on paper", "argentinian", "peruvian", "individual portrait", "automatism", "cuba", "engagement with mass media", "cubism", "emerging art"])

results = {}
unmatched = []

for w in words:
    x = w.replace(' ', '-')
    if x in model:
Example #32
 def setUp(self):
     self.model = KeyedVectors.load_word2vec_format(
         rocanr.app.config['VECTOR_FILE'], binary=False)
     rocanr.app.testing = True
     self.app = rocanr.app.test_client()
Example #33
weibo_neg = pd.read_table(path.join(path.dirname(__file__), '..', 'data',
                                    'weibo_neg.txt'),
                          header=None,
                          sep='\n',
                          encoding='utf8')
weibo_neg['label'] = 0
all_ = all_.append(weibo_neg, ignore_index=True)
wb_len = len(all_) - jd_len
print('len(all_) = ' + str(len(all_)))

stop_words = load_stop_words()

all_['words'] = all_[0].apply(lambda s: extract_cn_jd(s).split(' '))  # tokenize with jieba
print(all_['words'])
w2v_model = KeyedVectors.load_word2vec_format(path.join(
    path.dirname(__file__), '..', 'data', 'w2v_onlycn_100_c_2.bin'),
                                              binary=True,
                                              unicode_errors='ignore')
word2vec_dim = 100

maxlen = 100  # truncate each text to this many tokens
min_count = 5  # drop words occurring fewer times than this; the simplest form of dimensionality reduction

content = []
for i in all_['words']:
    content.extend(i)  # flatten all tokens ('收到', '少', '一本', ...) into one long list

# Build the dictionary index over all words ({word: integer index}); originally built from status_big_seg.txt
dict_index = pd.Series(content).value_counts()  # index is the word, value is its count

dict_index = dict_index[dict_index >= min_count]  # drop words that occur fewer than min_count times
dict_index[:] = range(1, len(dict_index) + 1)  # re-number the values consecutively starting from 1
from gensim.models import KeyedVectors

def cos_sim(word,define_words,model_novice,model_expert):
    c_n = {}
    c_e = {}
    for i in define_words:
        c_n[i] = model_novice.wv.similarity(word, i)
        c_e[i] = model_expert.wv.similarity(word, i)
        #print(word + "(" + model.wv.similarity(word, i)+ ")"+ i)
    return c_e,c_n
    
model_novice = KeyedVectors.load_word2vec_format("very_novice_epoch4.bin", binary=True) 
model_expert = KeyedVectors.load_word2vec_format("expert_epoch4.bin", binary=True)

word = "badminton"
word1 = "dance"
word2 = "shooter"
word3 = "psychotherapy"

define_words = ["baddy",
"ace",
"alley",
"backcourt",
"baseline",
"carry",
"court",
"deception",
"doubles",
"dribble",
"drive",
"drop",
print('   word freq index ...')
num_freq_words = 0
model = None
file_path = ""
f_words = []
with open("generated/" + args.language + "/word_wiki_freq.txt") as f:
    for line in f:
        parts = unquote(line.strip()).split('\t')
        w = parts[0]
        if not is_stop_word_or_number(w) and parts[1].isdigit():
            freq_words[w] = int(parts[1])
            num_freq_words += 1

if args.word_vecs == "w2v":
    model = KeyedVectors.load_word2vec_format('data/basic_data/wordEmbeddings/w2v/GoogleGoogleNews-vectors-negative300.bin', binary=True)
elif args.word_vecs == "fasttext":
    model = KeyedVectors.load_word2vec_format("data/basic_data/wordEmbeddings/fasttext/cc."+args.language+".300.vec", binary=False)
elif args.word_vecs == "muse":
    file_path = 'data/basic_data/wordEmbeddings/muse/wiki.multi.' + args.language + '.vec'
    if not path.exists(file_path):
        url = "https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi." + args.language + ".vec"
        urllib.request.urlretrieve(url, file_path)
    model = KeyedVectors.load_word2vec_format("data/basic_data/wordEmbeddings/muse/wiki.multi."+args.language+".vec", binary=False)

common_w2v_freq_words = [ word for word in model.vocab if word in freq_words ]
print("common_w2v_freq_words : ", len(common_w2v_freq_words))

we_word2id = {}
we_id2word = {}
if path.exists("generated/" + args.language + "/we_word_id.p"):
cv_file = inDir + "/CVSchema/Prav_CVindices_5folds.csv"
CV_Schema = pd.read_csv(cv_file)

train_df = pd.merge(train_df, CV_Schema, how='left', on=['id', 'qid1', 'qid2'])

act = 'relu'
re_weight = True  # whether to re-weight classes to fit the 17.5% share in test set

STAMP = 'dn51_question_pairs_weights.h5'

########################################
## index word vectors
########################################
print('Indexing word vectors')

word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, \
        binary=True)
print('Found %s word vectors of word2vec' % len(word2vec.vocab))

########################################
## process texts in datasets
########################################
print('Processing text dataset')


# The function "text_to_wordlist" is from
# https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.

    # Convert words to lower case and split them
    text = text.lower().split()
Example #37
def calcfeatures(stancesFile, bodiesFile):
    path = os.path.abspath("")
    #gensim.models.KeyedVectors.load_word2vec_format
    #wmd_model = Word2Vec.load_word2vec_format('/data/w2v_googlenews/GoogleNews-vectors-negative300.bin.gz', binary=True)
    wmd_model = KeyedVectors.load_word2vec_format(path+'/data/GoogleNews-vectors-negative300.bin', binary=True)
    wmd_model.init_sims(replace=True)
    tknzr = TweetTokenizer()

    count = 0
    features = []
    classes = []

    #N = getDocCount(path+'/data/training/train_bodies.csv')

    keys = {'agree': 0, 'disagree': 1, 'discuss': 2, 'unrelated': 3}

    bodies = loadBodies(bodiesFile)

    bigram_vectorizer = CountVectorizer(tokenizer=tknzr.tokenize, ngram_range=(1, 2), binary=False, lowercase=True, 
        stop_words='english', min_df=1)
    
    vectorizer = TfidfVectorizer(tokenizer=tknzr.tokenize, ngram_range=(1, 1), binary=False, lowercase=True, 
        stop_words='english', min_df=1)

    tfidfMat = vectorizer.fit_transform(list(bodies.values()))
    tfidfMat = vectorizer.transform(list(bodies.values()))
    tfidfMat = tfidfMat.toarray()
    vocab = vectorizer.get_feature_names()
    k = list(bodies.keys())

    bodiesTokens = loadBodiesTokens(bodiesFile)

    with open(stancesFile, 'r', encoding='UTF-8') as csvDataFile1: 
		 
        csvReader1 = csv.reader(csvDataFile1)
        first = 1
        for row in csvReader1:
            f = []
            if first == 1: 
                first = 0
            else:
                print(count)
                count = count + 1

                #class
                classes.append(keys[row[2]])	

                #canberra distance
                f.append(feat.canberraDist(row[0],bodies[row[1]], bigram_vectorizer))
                         
                #polarity scores
                neg, neu, pos = feat.polarityScores(row[0], bodies[row[1]])
                f.append(neg)
                f.append(neu)
                f.append(pos)

                tokens1 = tknzr.tokenize(row[0])
                tokens1=[token.lower() for token in tokens1 if (token.isalpha() and token not in stop_words)]
                tokens2 = bodiesTokens[row[1]]

                #word movers distance
                f.append(feat.wmd(tokens1, tokens2,wmd_model))

                #common words
                common = (set(tokens1) & set(tokens2))              
                f.append(feat.overlap(common))      
                        
                #tfidf
                f.append(feat.tfidf(tfidfMat, common,vocab,k.index(row[1])))
                               
                #negations
                f.append(feat.negWords(tokens1,tokens2))

                #add all features
                features.append(f)
								
    return np.array(features), np.array(classes)
Example #38
for language in ['fr', 'en']:

    print "loading resources..."
    start = time.time()

    URIs = config['URI_' + language]
    stopwords = utils.load_stopwords(
        path_to_resources + URIs['stopwords']
    )

    filler_words = utils.load_filler_words(
        path_to_resources + URIs['filler_words']
    )

    word_vectors = KeyedVectors.load_word2vec_format(
        path_to_resources + URIs['word_vectors'],
        binary=True
    )

    language_model = LanguageModel(
        path_to_resources + URIs['language_model']
    )

    pos_tagger = StanfordPOSTagger(
        model_filename=path_to_resources + URIs['pos_tagger_model'],
        path_to_jar=path_to_resources + URIs['pos_tagger_jar']
    )

    print "time_cost = %.2fs" % (time.time() - start)

    resources[language] = {
        'stopwords': stopwords,
Example #39
    return


def main(args):
    '''
	Pipeline for representational learning for all nodes in a graph.
	'''
    nx_G = read_graph(args)
    G = node2vec.Graph(nx_G, args.directed, args.p, args.q)
    G.preprocess_transition_probs()
    walks = G.simulate_walks(args.num_walks, args.walk_length)
    learn_embeddings(walks)


if __name__ == "__main__":
    args = parse_args()
    args.input = 'user_edges'
    args.output = 'user_vec'
    args.walk_length = 5
    args.num_walks = 10
    # args.weighted = True
    # args.directed = True
    args.dimensions = 64
    args.window_size = 2
    args.p = 2
    args.q = 2
    main(args)
    model = KeyedVectors.load_word2vec_format('user_vec')
    print(model.wv.most_similar('4'))
Example #40
def loadVectors(location, model='gensim', binary=True):
    if model == 'gensim':
        return Word2Vec.load(location)
    elif model == 'w2v':
        return KeyedVectors.load_word2vec_format(location, binary=binary)
#!/usr/bin/python
# -*- coding: utf-8 -*-

from  gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format('/home/tj/big_data/data/talk/2j3s.vec', binary=False) 
model.save_word2vec_format('/home/tj/big_data/data/talk/2j3s.vec.bin', binary=True)
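As an illustrative follow-up (not part of the original script), the binary copy can be reloaded to confirm the round trip:

reloaded = KeyedVectors.load_word2vec_format('/home/tj/big_data/data/talk/2j3s.vec.bin', binary=True)
print(len(reloaded.vocab), reloaded.vector_size)  # should match the original text file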

Example #42
# Download the evaluation data of The WordSimilarity-353 Test Collection and compute the Spearman correlation
# between the similarity ranking computed from the word vectors and the ranking of human similarity judgments.
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from tqdm import tqdm


def cosSim(v1, v2):
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))


def culcCosSim(row):
    global model
    w1v = model[row['Word 1']]
    w2v = model[row['Word 2']]
    return cosSim(w1v, w2v)


tqdm.pandas()
model = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)
df = pd.read_csv('./wordsim353/combined.csv')
df['cosSim'] = df.progress_apply(culcCosSim, axis=1)

print(df[['Human (mean)', 'cosSim']].corr(method='spearman'))
Example #43
For more information on this file, see
https://docs.djangoproject.com/en/1.10/topics/settings/

For the full list of settings and their values, see
https://docs.djangoproject.com/en/1.10/ref/settings/
"""

import os
from gensim.models import KeyedVectors
# Build paths inside the project like this: os.path.join(BASE_DIR, ...)
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

settings_dir = os.path.dirname(__file__)
PROJECT_ROOT = os.path.abspath(os.path.dirname(settings_dir))
MODEL_PATH = os.path.join(PROJECT_ROOT, 'apollo/w2v/GoogleNews-vectors-negative300.bin.gz')
MODEL = KeyedVectors.load_word2vec_format(MODEL_PATH, unicode_errors='replace', binary=True, limit=10000)

# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/1.10/howto/deployment/checklist/

# SECURITY WARNING: keep the secret key used in production secret!
SECRET_KEY = ')p1#0dnupk$xc59wdfl^%!7)4myi--la+xd4=$krk&a55$%0rz'

# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True

ALLOWED_HOSTS = []


# Application definition
Example #44
 def setUp(self):
     self.vectors = EuclideanKeyedVectors.load_word2vec_format(
         datapath('euclidean_vectors.bin'), binary=True, datatype=np.float64)
Example #45
def load_google_vec():
    
    from gensim.models import KeyedVectors
    
    return KeyedVectors.load_word2vec_format('~/nlp/w2v/GoogleNews-vectors-negative300.bin.gz', binary=True)
Example #46
def categorizer():
    
    #driver function,
    #returns model output mapped on the input corpora as a dict object
    
    stats = open('stats.txt', 'w', encoding='utf-8')

    st = time.time()
    
    wordmodelfile = "C:/Users/anush/Desktop/Venter_CMS-master/Venter/ML_model/Civis/MAX.bin"
    wordmodel = KeyedVectors.load_word2vec_format(wordmodelfile, binary = True, limit=200000)

    keywords = {
    'test_data': ['bedbugs', 'cctv', 'pipeline', 'Open spaces', 'gutter', 'garbage',
                    'rats', 'mice', 'robbery', 'theft', 'passage', 'galli', 'lane',
                    'light', 'bathrooms not clean', 'toilets not clean', 'playarea', 'mosquito', 'fogging','water'],
    }
    #wordmodelfile = os.path.join(BASE_DIR, 'Venter/ML_model/Civis/MAX.bin')
    wordmodel = KeyedVectors.load_word2vec_format(wordmodelfile, binary=True, limit=200000)
    et = time.time()
    s = 'Word embedding loaded in %f secs.' % (et-st)
    print(s)
    stats.write(s + '\n')

    #filepaths
    #responsePath = os.path('./comments/')
    responsePath=('./comments/')
    responseDomains = os.listdir('./comments/')
    #responseDomains.sort()
    
    #dictionary for populating the json output
    results = {}
    for responseDomain in zip(responseDomains):
        #instantiating the key for the domain
        responseDomain=str(responseDomain)
        domain=responseDomain[2:-7]
        responseDomain=responseDomain[2:-3]
        #domain = responseDomain[:-4]
        print("ResponseDomain is: ",responseDomain)
        print("Domain is: ",domain)
        results[domain] = {}

        print('Categorizing %s domain...' % domain)

        temp = open(os.path.join(responsePath, responseDomain), 'r', encoding='utf-8-sig')
        responses = temp.readlines()
        rows=0
        for response in responses:
            response = list(filter(None, response.lower().split('.'))) 
            num=0
            if '\n' in response:
                num+=1
            rows+=(len(response)-num)

        categories=keywords[domain]
        columns = len(categories)

        #categories = category
        #saving the scores in a similarity matrix
        #initializing the matrix with -1 to catch dump/false entries
        st = time.time()
        similarity_matrix = [[-1 for c in range(columns)] for r in range(rows)]
        et = time.time()
        s = 'Similarity matrix initialized in %f secs.' % (et-st)
        print(s)
        stats.write(s + '\n')

        row = 0
        st = time.time()
        for response in responses:
            response = list(filter(None, response.lower().split('.'))) 
            print("Row: ",row)
            for single_response in response:
                print("Current sentence is: ",single_response)
                if len(single_response) == 1:
                    continue
                #print(single_response)
                if single_response=='\n':
                    continue
                else:
                    column = 0
                    for category in categories:
                        print("Current category is: ",category)
                        similarity_matrix[row][column] = wmd_similarity(single_response, category, wordmodel)
                        column += 1
            row += 1
        et = time.time()
        s = 'Similarity matrix populated in %f secs. ' % (et-st)
        print(s)
        stats.write(s + '\n')

        print('Initializing json output...')
        for catName in categories:
            results[domain][catName] = []

        print('Populating category files...')
        for score_row, response in zip(similarity_matrix, responses):
            #max_sim_index = len(categories)-1
            response = list(filter(None, response.lower().split('.'))) 
            for single_response in response:
                if single_response!='\n':
                    print("Current score row: \n",np.array(score_row))
                    min_sim_index=len(categories)-1
                #if np.array(score_row).sum() > 0:
                    min_sim_index = np.array(score_row).argmin()
                    temp = {}
                    temp['response'] = single_response
                    temp['score'] = float((np.array(score_row).min()))
            # else:
                    #temp = response
                    results[domain][categories[min_sim_index]].append(temp)
        print('Completed.\n')

        #sorting domain wise categorised responses based on scores
        for domain in results:
            for category in results[domain]:                                                                                                                                      
                temp = results[domain][category]
                if len(temp)==0 or category=='Novel':
                    continue
                #print(temp)
                results[domain][category] = sorted(temp, key=lambda k: k['score'], reverse=True)
        #newlist = sorted(list_to_be_sorted, key=lambda k: k['name']) --> to sort list of dictionaries

        print('***********************************************************') 

        with open('out_new_2.json', 'w') as temp:
            json.dump(results, temp)
    
    return results
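The wmd_similarity helper is not shown in this snippet. One possible shape for it, using gensim's wmdistance (an assumption, not the project's code; it needs the pyemd/POT backend installed), is:

def wmd_similarity(sentence, category, wordmodel):
    # Lower Word Mover's Distance means a closer match, which is why the caller takes argmin() over the row.
    return wordmodel.wmdistance(sentence.lower().split(), category.lower().split())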
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
import numpy as np
from sklearn.decomposition import PCA
from matplotlib import pyplot
import pandas as pd
#%%

glove_path = 'F:/year 3/zsl/class_embedding/GloVe/glove.6B.300d.txt'
w2v_path =   'F:/year 3/zsl/class_embedding/GloVe/glove.6B.300d.txt.word2vec'
glove2word2vec(glove_path, w2v_path)

#%%


model = KeyedVectors.load_word2vec_format(w2v_path, binary = False)

results = model.most_similar(positive = ['woman', 'king'], negative = ['man'], topn = 3)
print (results)


words = list (model.vocab)
print (len(words))



#%%

X = model[model.vocab]  # 400000 * 50
print (X.shape)
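The PCA and pyplot imports suggest a 2-D visualization; a minimal sketch of that step (an assumed continuation, not the original notebook cell):

# Project a subset of the vocabulary to 2-D and plot it.
subset = words[:300]
coords = PCA(n_components=2).fit_transform(model[subset])
pyplot.scatter(coords[:, 0], coords[:, 1], s=2)
pyplot.show()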
Example #48
def load_word2vec_dataset():

    words = []
    words.append("airplane")
    words.append("alarm clock")
    words.append("ant")
    words.append("ape")
    words.append("apple")
    words.append("metal")  #armour
    words.append("axe")
    words.append("banana")
    words.append("bat")
    words.append("bear")
    words.append("bee")
    words.append("beetle")
    words.append("bell")
    words.append("bench")
    words.append("bicycle")
    words.append("blimp")
    words.append("bread")
    words.append("butterfly")
    words.append("cabin")
    words.append("camel")
    words.append("candle")
    words.append("cannon")
    words.append("car")
    words.append("castle")
    words.append("cat")
    words.append("chair")
    words.append("chicken")
    words.append("church")
    words.append("couch")
    words.append("cow")
    words.append("crab")
    words.append("crocodile")
    words.append("cup")
    words.append("deer")
    words.append("dog")
    words.append("dolphin")
    words.append("door")
    words.append("duck")
    words.append("elephant")
    words.append("eyeglasses")
    words.append("fan")
    words.append("fish")
    words.append("flower")
    words.append("frog")
    words.append("geyser")
    words.append("giraffe")
    words.append("guitar")
    words.append("hamburger")
    words.append("hammer")
    words.append("harp")
    words.append("hat")
    words.append("hedgehog")
    words.append("helicopter")
    words.append("hermit crab")
    words.append("horse")
    words.append("hot air balloon")
    words.append("hot dog")
    words.append("hour glass")
    words.append("jack o lantern")
    words.append("jelly fish")
    words.append("kangaroo")
    words.append("knife")
    words.append("lion")
    words.append("lizard")
    words.append("lobster")
    words.append("motorcycle")
    words.append("mouse")
    words.append("mushroom")
    words.append("owl")
    words.append("parrot")
    words.append("pear")
    words.append("penguin")
    words.append("piano")
    words.append("pickup truck")
    words.append("pig")
    words.append("pineapple")
    words.append("pistol")
    words.append("pizza")
    words.append("pretzel")
    words.append("Rabbit")
    words.append("raccoon")
    words.append("racket")
    words.append("ray")
    words.append("rhinoceros")
    words.append("rifle")
    words.append("rocket")
    words.append("sail boat")
    words.append("saw")
    words.append("saxophone")
    words.append("scissors")
    words.append("scorpion")
    words.append("seagull")
    words.append("seal")
    words.append("sea turtle")
    words.append("shark")
    words.append("sheep")
    words.append("shoe")
    words.append("skyscraper")
    words.append("snail")
    words.append("snake")
    words.append("songbird")
    words.append("spider")
    words.append("spoon")
    words.append("squirrel")
    words.append("starfish")
    words.append("strawberry")
    words.append("swan")
    words.append("sword")
    words.append("table")
    words.append("tank")
    words.append("teapot")
    words.append("teddy bear")
    words.append("tiger")
    words.append("tree")
    words.append("trumpet")
    words.append("turtle")
    words.append("umbrella")
    words.append("violin")
    words.append("volcano")
    words.append("wading bird")
    words.append("wheel chair")
    words.append("windmill")
    words.append("window")
    words.append("wine bottle")
    words.append("zebra")

    model = KeyedVectors.load_word2vec_format(
        'dataset/GoogleNews-vectors-negative300.bin', binary=True)
    wv_embeddings = np.zeros((125, 300))
    #print(model['cars'])
    #print type(model['cars'])

    for i in range(125):
        if i == 1:
            wv_embeddings[i, :] = (model['alarm'] + model['clock']) / 2
        elif i == 6:
            wv_embeddings[i, :] = model['metal']
        elif i == 53:
            wv_embeddings[i, :] = (model['hermit'] + model['crab']) / 2
        elif i == 55:
            wv_embeddings[i, :] = (model['hot'] + model['air'] +
                                   model['balloon']) / 3
        elif i == 56:
            wv_embeddings[i, :] = (model['hot'] + model['dog']) / 2
        elif i == 57:
            wv_embeddings[i, :] = (model['hour'] + model['glass']) / 2
        elif i == 58:
            wv_embeddings[i, :] = (model['jack'] + model['lantern']) / 2
        elif i == 59:
            wv_embeddings[i, :] = (model['jelly'] + model['fish']) / 2
        elif i == 73:
            wv_embeddings[i, :] = (model['pickup'] + model['truck']) / 2
        elif i == 86:
            wv_embeddings[i, :] = (model['sail'] + model['boat']) / 2
        elif i == 93:
            wv_embeddings[i, :] = (model['sea'] + model['turtle']) / 2
        elif i == 111:
            wv_embeddings[i, :] = (model['teddy'] + model['bear']) / 2
        elif i == 119:
            wv_embeddings[i, :] = (model['wading'] + model['bird']) / 2
        elif i == 120:
            wv_embeddings[i, :] = (model['wheel'] + model['chair']) / 2
        elif i == 123:
            wv_embeddings[i, :] = (model['wine'] + model['bottle']) / 2
        else:
            print(i)
            wv_embeddings[i, :] = model[words[i]]
        scipy.io.savemat('dataset/wv_embeddings.mat',
                         {'features': wv_embeddings})  #saving

    return words
Example #49
        vectors_to_compare.append(vector)
        """
    if len(target_words) > 1:
        if "_".join(target_words) in model:
            target_vector = model["_".join(target_words)]
        else:
            target_vectors = [model[word] for word in target_words if word in model] # and word not in stops)]
            if target_vectors:
                target_vector = np.mean(target_vectors, axis=0)
            else:
                return 0.0
    else:
        try:
            target_vector = model[target_words[0]]
        except:
            return 0.0
        """
    return 1 - cosine(vectors_to_compare[0], vectors_to_compare[1])


if __name__ == "__main__":
    embeddings_path = 'GoogleNews-vectors-negative300.bin'
    #embeddings_path = "numberbatch-en-17.06.txt"
    model = KeyedVectors.load_word2vec_format(embeddings_path, binary=False)

    with open('stopwords.txt', 'r') as f:
        stops = set(line.strip() for line in f.readlines())
    stops = stops.union(string.punctuation)

    #print(cos_similarity(["make", "you", "sneeze"], ["separate"], model, stops))
Example #50
def main():
    parser = ArgumentParser("scoring",
                            formatter_class=ArgumentDefaultsHelpFormatter,
                            conflict_handler='resolve')
    parser.add_argument("--emb", required=True, help='Embeddings file')

    parser.add_argument("--format", default='mat', help='mat or edgelist')

    parser.add_argument(
        "--network",
        required=True,
        help=
        'A .mat file containing the adjacency matrix and node labels of the input network.'
    )

    parser.add_argument(
        "--adj-matrix-name",
        default='network',
        help='Variable name of the adjacency matrix inside the .mat file.')

    parser.add_argument(
        "--label-matrix-name",
        default='group',
        help='Variable name of the labels matrix inside the .mat file.')

    parser.add_argument("--num-shuffles",
                        default=2,
                        type=int,
                        help='Number of shuffles.')
    parser.add_argument(
        "--all",
        default=False,
        action='store_true',
        help=
        'The embeddings are evaluated on all training percents from 10 to 90 when this flag is set to true. '
        'By default, only training percents of 10, 50 and 90 are used.')

    args = parser.parse_args()
    # 0. Files
    embeddings_file = args.emb
    matfile = args.network

    # 1. Load Embeddings
    model = KeyedVectors.load_word2vec_format(embeddings_file, binary=False)

    # 2. Load labels
    mat = loadmat(matfile)
    A = mat[args.adj_matrix_name]
    graph = sparse2graph(A)
    labels_matrix = mat[args.label_matrix_name]
    labels_count = labels_matrix.shape[1]
    mlb = MultiLabelBinarizer(range(labels_count))

    # Map nodes to their features (note:  assumes nodes are labeled as integers 1:N)
    features_matrix = numpy.asarray(
        [model[str(node)] for node in range(len(graph))])

    # 2. Shuffle, to create train/test groups
    shuffles = []
    for x in range(args.num_shuffles):
        shuffles.append(skshuffle(features_matrix, labels_matrix))

    # 3. to score each train/test group
    all_results = defaultdict(list)

    if args.all:
        training_percents = numpy.asarray(range(1, 10)) * .1
    else:
        training_percents = [0.1, 0.5, 0.9]
    for train_percent in training_percents:
        for shuf in shuffles:

            X, y = shuf

            training_size = int(train_percent * X.shape[0])

            X_train = X[:training_size, :]
            y_train_ = y[:training_size]

            y_train = [[] for x in range(y_train_.shape[0])]

            cy = y_train_.tocoo()
            for i, j in zip(cy.row, cy.col):
                y_train[i].append(j)

            assert sum(len(l) for l in y_train) == y_train_.nnz

            X_test = X[training_size:, :]
            y_test_ = y[training_size:]

            y_test = [[] for _ in range(y_test_.shape[0])]

            cy = y_test_.tocoo()
            for i, j in zip(cy.row, cy.col):
                y_test[i].append(j)

            clf = TopKRanker(LogisticRegression())
            clf.fit(X_train, y_train_)

            # find out how many labels should be predicted
            top_k_list = [len(l) for l in y_test]
            preds = clf.predict(X_test, top_k_list)

            results = {}
            averages = ["micro", "macro"]
            for average in averages:
                results[average] = f1_score(mlb.fit_transform(y_test),
                                            mlb.fit_transform(preds),
                                            average=average)

            all_results[train_percent].append(results)

    print('Results, using embeddings of dimensionality', X.shape[1])
    print('-------------------')
    for train_percent in sorted(all_results.keys()):
        print('Train percent:', train_percent)
        for index, result in enumerate(all_results[train_percent]):
            print('Shuffle #%d:   ' % (index + 1), result)
        avg_score = defaultdict(float)
        for score_dict in all_results[train_percent]:
            for metric, score in iteritems(score_dict):
                avg_score[metric] += score
        for metric in avg_score:
            avg_score[metric] /= len(all_results[train_percent])
        print('Average score:', dict(avg_score))
        print('-------------------')
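The evaluation above depends on a TopKRanker helper that is not shown in this excerpt. A minimal sketch of what such a class typically looks like in DeepWalk-style scoring scripts (the details below are an assumption, not the repository's exact code): for each test sample it keeps the k highest-probability labels, with k taken from the true label counts.

import numpy
from sklearn.multiclass import OneVsRestClassifier

class TopKRanker(OneVsRestClassifier):
    # Predict the top-k labels per sample, where k is supplied per sample.
    def predict(self, X, top_k_list):
        probs = numpy.asarray(super(TopKRanker, self).predict_proba(X))
        all_labels = []
        for i, k in enumerate(top_k_list):
            probs_ = probs[i, :]
            labels = self.classes_[probs_.argsort()[-k:]].tolist()
            all_labels.append(labels)
        return all_labels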
Пример #51
0
def pega_dados(vecfile, target, ant, syn):
    
    import csv
    import os
    from gensim.models import KeyedVectors

    cosine_ant = []
    cosine_syn = []
    subcos_ant = []
    subcos_syn = []
    
    mod = KeyedVectors.load_word2vec_format("/home/bthalenberg/ic/novos novos/"+vecfile, binary=False)
    
    i = 0
    while i != len(target):
        
        #getting cosine similary between target and antonym
        try:
            cos = mod.similarity(target[i], ant[i])
        except KeyError:
            cos = None
        cosine_ant.append(cos)
        
        #getting cosine similary between target and synonym
        try:
            cos_s = mod.similarity(target[i], syn[i])
        except KeyError:
            cos_s = None
        cosine_syn.append(cos_s)


        #subtracting the antonym cosine similarity from the synonym similarity for syn input
        try:
            subcos_syn.append(cos_s - cos)
        except TypeError:
            subcos_syn.append(None)
        
        #negating subtracted values for ant input
        try:
            subcos_ant.append(-(cos_s - cos))
        except TypeError:
            subcos_ant.append(None)
        
        i += 1
        
    dirname = vecfile[:-4]
    os.makedirs(dirname, exist_ok=True)  # make sure the output directory exists

    with open(dirname+"/db_ant.csv", "w", encoding="utf-8") as f:
        writer = csv.writer(f)
        i = 0
        while i != len(target):
            writer.writerow([target[i], ant[i], cosine_ant[i]])
            i += 1
            
    with open(dirname+"/db_syn.csv", "w", encoding="utf-8") as f:
        writer = csv.writer(f)
        i = 0
        while i != len(target):
            writer.writerow([target[i], syn[i], cosine_syn[i]])
            i += 1
                        
    with open(dirname+"/db_sub_ant.csv", "w", encoding="utf-8") as f:
        writer = csv.writer(f)
        i = 0
        while i != len(target):
            writer.writerow([target[i], ant[i], subcos_ant[i]])
            i += 1
            
    with open(dirname+"/db_sub_syn.csv", "w", encoding="utf-8") as f:
        writer = csv.writer(f)
        i = 0
        while i != len(target):
            writer.writerow([target[i], syn[i], subcos_syn[i]])
            i += 1
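A hypothetical call, only to illustrate the expected inputs: three word lists aligned by index (targets, their antonyms, their synonyms) plus the name of a vector file under the hard-coded directory. The file name and the Portuguese words below are made up.

targets = ["bom", "rápido"]
antonyms = ["mau", "lento"]
synonyms = ["ótimo", "veloz"]
pega_dados("modelo.txt", targets, antonyms, synonyms)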
Пример #52
0
            word2int_vietnamese.get(word, word2int_vietnamese["<unk>"])
            for word in words
        ]

        # Add int_seq to seq_list
        seq_list.append(int_seq)

    return seq_list


# Define the max length of english and vietnamese
english_max_len = 50
vietnamese_max_len = 50

# load model
word_embed_english_w2v = KeyedVectors.load_word2vec_format(
    args["word_emb_src"], binary=True, unicode_errors='ignore')
# Sort the int2word
int2word_sorted = sorted(int2word_english.items())

# Get the list of word embedding corresponding to int value in ascending order
word_emb_list = list()
embedding_size = len(word_embed_english_w2v['the'])
for int_val, word in int2word_sorted:
    # Add the pretrained word2vec embedding if it exists
    if (word in word_embed_english_w2v):
        word_emb_list.append(word_embed_english_w2v[word])

    # Otherwise, the value of word embedding is 0
    else:
        word_emb_list.append(np.zeros([embedding_size], dtype=np.float32))
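To feed these vectors into a network, the list is usually stacked into a single matrix. A minimal sketch, assuming a Keras-style frozen embedding layer downstream (the layer setup is an assumption, not part of the original script):

import numpy as np
import tensorflow as tf

# Stack the per-word vectors into a (vocab_size, embedding_size) matrix.
embedding_matrix = np.asarray(word_emb_list, dtype=np.float32)

# Hypothetical: initialize a non-trainable embedding layer with the pretrained vectors.
embedding_layer = tf.keras.layers.Embedding(
    input_dim=embedding_matrix.shape[0],
    output_dim=embedding_matrix.shape[1],
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False)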
Пример #53
0
def main(args):

    # Build the correct ensemble information

    print("Building Embeddings")

    if args.ensembler == "Infuser":
        if not (args.skipensembler):
            ensemble_infuser_method_StrucDiff2vec()
        output = args.output + "Infuser" + str(args.dimensions) + ".emb"
        ## Decide ensemble method to choose between multiple vocabs or infuser
        embeddings_model = KeyedVectors.load_word2vec_format(output)

    elif args.ensembler == "strucdiff2vec":
        if not (args.skipensembler):
            ensemble_method_StrucDiff2vec()
        args.dimensions = int(args.dimensions / 2)
        output1 = args.output + "struc2vec" + str(args.dimensions) + ".emb"
        ## Decide ensemble method to choose between multiple vocabs or infuser
        embeddings_model_struc = KeyedVectors.load_word2vec_format(output1)

        output2 = args.output + "diff2vec" + str(args.dimensions) + ".emb"
        ## Decide ensemble method to choose between multiple vocabs or infuser
        embeddings_model_diff = KeyedVectors.load_word2vec_format(output2)

        embeddings_model = [embeddings_model_struc, embeddings_model_diff]
        args.dimensions = int(args.dimensions * 2)

    elif args.ensembler == "Skip ensembler":
        output = "Embeddings/file.emb"
        ## Decide ensemble method to choose between multiple vocabs or infuser
        embeddings_model = KeyedVectors.load_word2vec_format(output)

    else:
        if args.ensembler == "node2vec":
            if not (args.skipensembler):
                no_ensemble_method_node2vec()
            output = args.output + "node2vec" + str(args.dimensions) + ".emb"
            ## Decide ensemble method to choose between multiple vocabs or infuser
            embeddings_model = KeyedVectors.load_word2vec_format(output)

        elif args.ensembler == "struc2vec":
            if not (args.skipensembler):
                no_ensemble_method_struc2vec()
            output = args.output + "struc2vec" + str(args.dimensions) + ".emb"
            ## Decide ensemble method to choose between multiple vocabs or infuser
            embeddings_model = KeyedVectors.load_word2vec_format(output)

        elif args.ensembler == "diff2vec":
            if not (args.skipensembler):
                no_ensemble_method_diff2vec()
            output = args.output + "diff2vec" + str(args.dimensions) + ".emb"
            ## Decide ensemble method to choose between multiple vocabs or infuser
            embeddings_model = KeyedVectors.load_word2vec_format(output)
        else:
            print("No ensembly method chosen try again")
            return

    print("Loaded Embeddings")
    ## Transform the dataset to be trained in the model.
    dataset_pos, dataset_neg = dataset_transformer(embeddings_model,
                                                   args.datasetinput,
                                                   args.dimensions,
                                                   args.usenegativesample,
                                                   args.buildnegativesample)
    ## Transform the testset to be trained in the model.
    testset_pos, testset_neg = dataset_transformer(embeddings_model,
                                                   args.testsetinput,
                                                   args.dimensions,
                                                   args.usenegativesample,
                                                   args.buildnegativesample)
    ## Transform the validset to be trained in the model.
    validset_pos, validset_neg = dataset_transformer(embeddings_model,
                                                     args.validsetinput,
                                                     args.dimensions,
                                                     args.usenegativesample,
                                                     args.buildnegativesample)

    ## Preprocess data
    dataset, targets = dataset_preprocess_hadamard(dataset_pos, dataset_neg)
    testset, testtargets = dataset_preprocess_hadamard(testset_pos,
                                                       testset_neg)
    validset, validtargets = dataset_preprocess_hadamard(
        validset_pos, validset_neg)

    print("Building model")
    ## build mode with tensorflow

    print("Training Model")
    ## Run the training on the dataset.

    model_accuracy = train([dataset, targets], [testset, testtargets],
                           [validset, validtargets], args)
    return model_accuracy
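dataset_preprocess_hadamard is not shown in this excerpt; its name points to the Hadamard edge operator commonly used in link prediction, where an edge is represented by the element-wise product of its endpoint embeddings. A generic sketch of that operator (an illustration, not the repository's actual function):

import numpy as np

def hadamard_edge_features(embeddings, edges):
    # embeddings: mapping from node id (as string) to vector; edges: iterable of (u, v) pairs.
    return np.array([np.asarray(embeddings[str(u)]) * np.asarray(embeddings[str(v)])
                     for u, v in edges])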
Пример #54
0
  # Making an empty column in our test data for predicted labels.
  test['Predicted Label'] = ''

  print("Unique words in Training Data: {}".format(train_unique_words))
  print("Unique words in Test Data: {}".format(test_unique_words))

preprocess()

trained.head()

test.head()

!wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz

word2vec = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)

"""# **Repeating Part 1 with Word2Vec**"""

def extract_features(sentence):
  words = [word for word in sentence.split() if word in word2vec.vocab]
  if words == []:
    return []
  else:
    return np.mean(word2vec[words],axis=0)

train_embeddings = []

for sentence in trained['Tweet']:
  words = extract_features(sentence)
  if len(words) == 0:
Пример #55
0
num_dense = np.random.randint(100, 150)
rate_drop_lstm = 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.15 + np.random.rand() * 0.25

act = 'relu'
re_weight = True # whether to re-weight classes to fit the 17.5% share in test set

STAMP = 'lstm_%d_%d_%.2f_%.2f'%(num_lstm, num_dense, rate_drop_lstm, \
        rate_drop_dense)

########################################
## index word vectors
########################################
print('Indexing word vectors')

word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, \
        binary=True)
print('Found %s word vectors of word2vec' % len(word2vec.vocab))

########################################
## process texts in datasets
########################################
print('Processing text dataset')

# The function "text_to_wordlist" is from
# https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    # Convert words to lower case and split them
    text = text.lower().split()
    # Optionally, remove stop words
    if remove_stopwords:
Пример #56
0
        "United States",
        "United States Minor Outlying Islands",
        "Uruguay",
        "Uzbekistan",
        "Vanuatu",
        "Venezuela",
        "Viet Nam",
        "Virgin Islands, British",
        "Virgin Islands, U.s.",
        "Wallis And Futuna",
        "Western Sahara",
        "Yemen",
        "Zambia",
        "Zimbabwe"]

model = KeyedVectors.load_word2vec_format('section10/matrix_word2vec.txt', binary=True)
country_to_id = defaultdict(int)
matrix = np.empty([0, 300], dtype=np.float64)
cnt = 0

for c in country:
    try:
        matrix = np.vstack([matrix, model[c]])
        country_to_id[c] = cnt
        cnt += 1
    except KeyError:
        pass

io.savemat("section10/country_matrix", {"matrix":matrix})
with open("section10/country_to_id", "wb") as f:
    pickle.dump(country_to_id, f)
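A short sketch of reading the saved artifacts back, assuming the same scipy/pickle layout used above:

import pickle
from scipy import io

country_matrix = io.loadmat("section10/country_matrix")["matrix"]
with open("section10/country_to_id", "rb") as f:
    country_to_id = pickle.load(f)
print(country_matrix.shape, len(country_to_id))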
Пример #57
0
            continue

        X = np.array(X)/np.linalg.norm(X)
        Y = np.array(Y)/np.linalg.norm(Y)
        o = np.dot(X, Y.T)/np.linalg.norm(X)/np.linalg.norm(Y)

        scores.append(o)

    scores = np.asarray(scores)
    return np.mean(scores), 1.96*np.std(scores)/np.sqrt(len(scores)), np.std(scores)  # 95% CI half-width uses the standard error (std/sqrt(n))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('ground_truth', help="ground truth text file, one example per line")
    parser.add_argument('predicted', help="predicted text file, one example per line")
    parser.add_argument('embeddings', help="embeddings bin file")
    args = parser.parse_args()

    print("loading embeddings file...")
    w2v = KeyedVectors.load_word2vec_format(args.embeddings, binary=True)

    r = average(args.ground_truth, args.predicted,  w2v)
    print("Embedding Average Score: %f +/- %f ( %f )" %(r[0], r[1], r[2]))

    r = greedy_match(args.ground_truth, args.predicted, w2v)
    print("Greedy Matching Score: %f +/- %f ( %f )" %(r[0], r[1], r[2]))

    r = extrema_score(args.ground_truth, args.predicted, w2v)
    print("Extrema Score: %f +/- %f ( %f )" %(r[0], r[1], r[2]))
Пример #58
0
    raise FileNotFoundError()
if not os.path.exists(PATH_ENTITY_VECTOR):
    raise FileNotFoundError()

## initialize tokenizer funtion ##
tokenizer_obj = MecabWrapper(dictType='neologd')
get_token = partial(__func_japanese_tokenizer,
                    tokenizer_obj=tokenizer_obj,
                    pos_condition=POS_CONDITION,
                    is_surface=False)

## load word embedding ##
try:
    embedding_model = KeyedVectors.load_word2vec_format(
        PATH_ENTITY_VECTOR, **{
            'binary': True,
            'unicode_errors': 'ignore'
        })
except:
    embedding_model = Word2Vec.load_word2vec_format(
        PATH_ENTITY_VECTOR, **{
            'binary': True,
            'unicode_errors': 'ignore'
        })

## make training data ##
with open(PATH_TRAINING_TEXT, 'r') as f:
    seq_wikipedia_training_text = json.loads(f.read())

seq_training_input_text_obj = []
for i, wikipedia_article_obj in enumerate(seq_wikipedia_training_text):
Пример #59
0
from future.utils import iteritems
from builtins import range
# Note: you may need to update your version of future
# sudo pip install -U future


from gensim.models import KeyedVectors


# warning: takes quite a while
# https://code.google.com/archive/p/word2vec/
# direct link: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing
# 3 million words and phrases
# D = 300
word_vectors = KeyedVectors.load_word2vec_format(
  '../large_files/GoogleNews-vectors-negative300.bin',
  binary=True
)


# convenience
# result looks like:
# [('athens', 0.6001024842262268),
#  ('albert', 0.5729557275772095),
#  ('holmes', 0.569324254989624),
#  ('donnie', 0.5690680742263794),
#  ('italy', 0.5673537254333496),
#  ('toni', 0.5666348338127136),
#  ('spain', 0.5661854147911072),
#  ('jh', 0.5661597847938538),
#  ('pablo', 0.5631559491157532),
#  ('malta', 0.5620371103286743)]
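The ranked list above is the kind of output most_similar returns; the exact query that produced it is not shown in this fragment, so the call below is only an illustration.

# Illustrative query -- any analogy-style lookup returns (word, cosine similarity) pairs.
results = word_vectors.most_similar(positive=['king', 'woman'], negative=['man'])
for word, score in results:
    print(word, score)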
Пример #60
0
def load_w2v_model(file_name: str) -> KeyedVectors:
    print("loading w2v_model...")
    return KeyedVectors.load_word2vec_format(file_name,
                                             binary=True,
                                             encoding='utf-8')
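A hypothetical call, reusing the GoogleNews binary that appears elsewhere in these examples (the path is illustrative):

w2v = load_w2v_model("GoogleNews-vectors-negative300.bin")
print(w2v.similarity('happy', 'sad'))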