Example No. 1
def loadWord2VecAndVectorizeInputs(X_train, X_test, word2vecURI):

    # load the Word2Vec model
    w2v_model = Word2VecKeyedVectors.load_word2vec_format(word2vecURI,
                                                          binary=False)
    print("vocab_size = ", len(w2v_model.vocab))

    # determine the number of features per word in the model
    # (vector_size avoids assuming a probe word such as 'dog' is in the vocabulary)
    WORD2VEC_NO_OF_FEATURES = w2v_model.vector_size

    print("num_features = ", WORD2VEC_NO_OF_FEATURES)
    print("len(X_train) = ", len(X_train))
    print("len(X_test) = ", len(X_test))

    # define the zero vector used for missing words
    empty_word = np.zeros(WORD2VEC_NO_OF_FEATURES, dtype=float)

    # collect the words that are missing from the Word2Vec model
    missedWords = []
    word2index = {}

    # vectorize each input
    X_train_vectorized = vectorizeInput(X_train, w2v_model, empty_word,
                                        missedWords, word2index)
    X_test_vectorized = vectorizeInput(X_test, w2v_model, empty_word,
                                       missedWords, word2index)

    print("Number of used words = ", len(set(word2index)))
    print("Number of words missing = ", len(set(missedWords)))

    return X_train_vectorized, X_test_vectorized, w2v_model, word2index
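
A minimal calling sketch, assuming a plain-text word2vec file and the vectorizeInput helper defined elsewhere in this project; the inputs and the file name below are hypothetical:

X_train = [["the", "dog", "barks"], ["a", "cat", "sleeps"]]
X_test = [["the", "cat", "purrs"]]
X_train_vec, X_test_vec, w2v_model, word2index = loadWord2VecAndVectorizeInputs(
    X_train, X_test, "vectors.w2v.txt")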
Example No. 2
def tokenizeLemmatizeDataSet(X_train, X_test, word2vecURI):
    lemmatizer, tokenizer, stop_words = initTokenizers()

    # load the Word2Vec model
    w2v_model = Word2VecKeyedVectors.load_word2vec_format(word2vecURI,
                                                          binary=False)
    print("vocab_size = ", len(w2v_model.vocab))

    # determine the number of features per word in the model
    WORD2VEC_NO_OF_FEATURES = w2v_model.vector_size

    print("num_features = ", WORD2VEC_NO_OF_FEATURES)
    print("len(X_train) = ", len(X_train))
    print("len(X_test) = ", len(X_test))

    # collect the words that are missing from the Word2Vec model
    missedWords = []
    word2index = {}

    X_train = prepareDataSet(lemmatizer, tokenizer, stop_words, X_train,
                             missedWords, word2index, w2v_model)
    X_test = prepareDataSet(lemmatizer, tokenizer, stop_words, X_test,
                            missedWords, word2index, w2v_model)

    print("Number of used words = ", len(set(word2index)))
    print("Number of words missing = ", len(set(missedWords)))

    return X_train, X_test, w2v_model, word2index
Example No. 3
def load_wv(url):
    """ load KeyedVectors wv

    Args:
        url: url to wv file

    Returns:
        Word2VecKeyedVectors: wv

    """
    return Word2VecKeyedVectors.load_word2vec_format(url, binary=False)
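
A usage sketch with a hypothetical vectors file; most_similar is a standard KeyedVectors query:

wv = load_wv("vectors.w2v.txt")
print(wv.most_similar("dog", topn=5))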
Example No. 4
    def readorg(self):
        dic = {}
        dishnames = DataLoader().load_dish_name()
        basicwords = self.readcorpus()
        regionnames = self.readregion()
        model = Word2VecKeyedVectors.load_word2vec_format(self.txtfilepath, binary=False)

        def add_word(word):
            # KeyedVectors exposes its vocabulary directly; the deprecated
            # .wv alias simply returns the object itself
            if word in model.vocab:
                # store the vector as a plain list of floats
                # (vectors in this embedding are 200-dimensional)
                dic[word] = [float(x) for x in model[word]]

        for word in basicwords:
            add_word(word)
        for dishname in dishnames:
            for name in dishname:
                add_word(name)
        for regionname in regionnames:
            for name in regionname:
                add_word(name)

        # sanity-check the first few vectors
        for index in range(10):
            print(dic[basicwords[index]])

        # write "word v1, v2, ..." lines, stripping brackets and whitespace
        pattern = re.compile(r'[\[\]\n\r\t]')
        with open('./data/ChineseFoodEmbedding.txt', 'w', encoding='utf-8') as f:
            for key in dic:
                f.write(key + " " + re.sub(pattern, "", str(dic[key])) + '\n')
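
A sketch of reading the written file back, assuming the layout produced above (a word, a space, then the ", "-joined vector values):

import numpy as np

embeddings = {}
with open('./data/ChineseFoodEmbedding.txt', encoding='utf-8') as f:
    for line in f:
        word, _, values = line.rstrip('\n').partition(' ')
        embeddings[word] = np.array([float(v) for v in values.split(',')])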
Example No. 5
    def from_pretrained(
            cls,  # type: ignore
            embed_type: str,
            embed_path: Path,
            word_vocab: Vocabulary,
            embed_dim: int,
            freeze=True,
            sparse=False) -> 'Embedding':
        r"""
        Creates an :class:`Embedding` instance from external pretrained embeddings.

        :param embed_type: Type of the embedding, can be ``word2vec`` or ``fasttext``.
        :param embed_path: Path to the embedding file.
        :param word_vocab: A vocabulary mapping words to indices.
        :param embed_dim: Dimension of embeddings.
        :param freeze: If ``True``, embeddings are fixed during training.
        :param sparse: If ``True``, sparse embeddings are used. See PyTorch documentation for details.
        """
        embed_path_str = str(embed_path.resolve())
        if embed_type == 'word2vec':
            from gensim.models.word2vec import Word2VecKeyedVectors
            model = Word2VecKeyedVectors.load(embed_path_str)
        elif embed_type == 'fasttext':
            from gensim.models.fasttext import FastTextKeyedVectors
            model = FastTextKeyedVectors.load(embed_path_str)
        elif embed_type == 'glove':
            raise NotImplementedError
        else:
            raise ValueError(f"Embedding type {embed_type} not supported.")

        assert model.vector_size == embed_dim
        embeddings = np.zeros((len(word_vocab), embed_dim))
        for word, idx in word_vocab.items():
            embeddings[idx] = model.get_vector(word)

        embedding = super().from_pretrained(embeddings,
                                            freeze=freeze,
                                            sparse=sparse)
        # no point in doing the following: `cls` in classmethod points to the subclass
        # embedding.forward = types.MethodType(cls.forward, embedding)
        return embedding
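
Note that get_vector raises KeyError for words absent from the pretrained vocabulary; a hedged variant of the copy loop above skips such words and leaves their rows at zero:

for word, idx in word_vocab.items():
    if word in model:  # KeyedVectors supports membership tests
        embeddings[idx] = model.get_vector(word)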
Example No. 6
    def __init__(self, weights: str = None, size: int = 100, window: int = 5,
                 min_count: int = 1, normalize: str = None, dictionary=None,
                 batch_size: int = 10, **kwargs):

        self.dictionary = dictionary
        self.window = window

        if weights:
            self.obj = Word2VecKeyedVectors.load_word2vec_format(weights, binary=True)
        else:
            super(Word2VecWrapper, self).__init__(
                size=size,
                window=self.window,
                min_count=min_count,
                **kwargs)

            if self.dictionary:
                # build the vocabulary from the dictionary values
                # (self.obj is assumed to be set up by the superclass initializer)
                self.obj.build_vocab([list(self.dictionary.values())])

        self.normalize = normalize
        self.batch_size = batch_size
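
A hypothetical instantiation sketch, assuming the wrapper is importable and a binary word2vec file is at hand (the file name is an assumption):

wrapper = Word2VecWrapper(weights="GoogleNews-vectors-negative300.bin")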
Example No. 7
from gensim.models.word2vec import Word2VecKeyedVectors
wv = Word2VecKeyedVectors.load_word2vec_format(
    "/hd/tecent_ew/Tencent_AILab_ChineseEmbedding.txt", binary=False)

kw = "电话号码"

print(kw, "/".join([word for word, sim in wv.most_similar(kw, topn=10)]))

#https://www.cnblogs.com/bymo/p/8440722.html
'''
>>> kw = u"电话号码"
>>> print kw, "/".join([word for word, sim in wv.most_similar(kw, topn=10)])
/usr/local/lib/python2.7/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.
  if np.issubdtype(vec.dtype, np.int):
电话号码 手机号码/手机号/电话号/座机号码/你的电话号码/联系号码/几个电话号码/座机号/电话/新号码
>>> print kw, "/".join([word for word, sim in wv.most_similar(kw, topn=100)])
电话号码 手机号码/手机号/电话号/座机号码/你的电话号码/联系号码/几个电话号码/座机号/电话/新号码/电话拨打/我的电话/私人号码/联系方式/家里的电话号码/电话信息/个人电话/手机通讯录/通话记录/手机电话/联系人电话/你的手机号/qq号码/家庭地址/办公电话/银行账号/电话记录/你的电话/银行卡号/联络方式/固定电话号码/电话本/短信内容/电话簿/电子邮箱地址/家庭电话/移动电话号码/通讯号码/号码显示/私人手机/父母的电话/姓名地址/卡号/办公室电话/短信/住址/私人电话/银行卡号码/电话薄/拨通/电子邮件地址/陌生号码/号码/qq号/其他联系方式/一串数字/住宅电话/显示号码/两个电话/回拨电话/短信信息/单位电话/手机联系人/家庭号码/拨通电话/联系电话号码/通信录/微信号码/通讯录/家庭住址/联系人姓名/联系人的姓名/常用号码/电话和短信/银行卡账号/手机拨打/短信息/拨打/座机电话号码/详细住址/银行卡密码/银行卡卡号/手机电话号码/办公室号码/信用卡号/座机电话/身份证信息/通讯记录/公用电话/通讯簿/留电话/打电话/收到的短信/空号/网络联系方式/骚扰电话/查号码/联络电话/电话通讯录/email地址
>>> kw = u"短信"
>>> print kw, "/".join([word for word, sim in wv.most_similar(kw, topn=100)])
短信 短息/短信息/一条短信/短信内容/电话和短信/手机短信/微信信息/微信消息/陌生号码/短信里/收到短信/短信回复/诈骗短信/陌生短信/通知短信/短信发送/群发短信/qq信息/打开短信/短信提醒/验证码短信/匿名短信/短信信息/qq消息/短消息/回复短信/垃圾短信/一条信息/发信息/骚扰短信/我的短信/手机号码/广告短信/信息回复/电话/语音留言/发送短信/收到的短信/回复信息/发短息/10086/问候短信/扣费短信/电话信息/群发/群发信息/qq留言/祝福短信/短信轰炸/语音电话/微信提醒/祝福信息/邮件/手机信息/微信短信/条微信/发短信/彩信/语音消息/短信提示/电话号码/那条短信/骗子短信/语音信息/信息提醒/转账信息/手机号/推送消息/看短信/骚扰电话/所有短信/短讯/电子邮件/以及短信/电话短信/我的电话/打电话/短信或电话/发消息/手机短信息/通讯录好友/发送失败/短信通知/短信电话/手机里/一则短信/陌生来电/微信发/陌生电话/道歉短信/接到的电话/短信消息/电话或短信/诈骗电话/回短信/未知号码/短信显示/未读信息/诈骗信息/新号码
'''
Example No. 8
# -*- coding:UTF-8 -*-
"""
@File    : test_wv.py
@Time    : 2019/4/17 22:21
@Author  : Blue Keroro
"""

from gensim.models.word2vec import Word2VecKeyedVectors

if __name__ == '__main__':
    from time import time

    start = time()
    print('loading word vectors')
    wv_from_text = Word2VecKeyedVectors.load_word2vec_format(
        'C:/Tencent_AILab_ChineseEmbedding/Tencent_AILab_ChineseEmbedding.txt',
        binary=False)
    print('word vectors loaded')
    with open('models/nerDict.txt', 'r', encoding='utf-8') as f1, \
            open('models/nerDict_tencent_error.txt', 'w', encoding='utf-8') as f2:
        for line in f1:
            line = line.strip()
            try:
                if line not in wv_from_text:
                    f2.write(line + '\n')
            except Exception as e:
                print('problem encountered', 'line:', line, 'error:', e)

    print('end', time() - start)
Example No. 9
    def __init__(self):
        self.wv = Word2VecKeyedVectors.load(
            '/Users/linjliang/Learning/PROJECT/workspace/functional/Tencent_AILab_smallEmbedding/1M.bin',
            mmap='r')
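
A sketch of how such a native-format file might be produced; limit is a real load_word2vec_format parameter, and limit=1000000 is an assumption matching the "1M" file name:

from gensim.models import KeyedVectors

wv = KeyedVectors.load_word2vec_format('Tencent_AILab_ChineseEmbedding.txt',
                                       binary=False, limit=1000000)
wv.save('1M.bin')  # gensim native format, later loadable with mmap='r'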
Example No. 10
from gensim.models.word2vec import Word2VecKeyedVectors

file = r"F:\NLP_learnings\词向量\腾讯中文词向量\Tencent_AILab_ChineseEmbedding.tar.gz"
wv_from_text = Word2VecKeyedVectors.load_word2vec_format(file,
                                                         binary=False,
                                                         encoding='gbk')
print(wv_from_text)
Example No. 11
    def __init__(self, path):
        self.model = Word2VecKeyedVectors.load_word2vec_format(path, binary=False)
Example No. 12
def load_model(filepath):
    return Word2VecKeyedVectors.load(filepath)
Example No. 13
    def read(self):
        embedding_text = Word2VecKeyedVectors.load_word2vec_format(self.foodfilepath, binary=False)
        return embedding_text