def wmdistance(self, document1, document2):
    len_pre_oov1 = len(document1)
    len_pre_oov2 = len(document2)
    document1 = [token for token in document1 if token in self]
    document2 = [token for token in document2 if token in self]
    diff1 = len_pre_oov1 - len(document1)
    diff2 = len_pre_oov2 - len(document2)
    
    dictionary = Dictionary(documents=[document1, document2])
    vocab_len = len(dictionary)

    # Sets for faster look-up.
    docset1 = set(document1)
    docset2 = set(document2)

    # Compute distance matrix.
    distance_matrix = zeros((vocab_len, vocab_len), dtype=double)
    for i, t1 in dictionary.items():
        for j, t2 in dictionary.items():
            if t1 not in docset1 or t2 not in docset2:
                continue
            # Compute cosine distance (1 - cosine similarity) between word vectors.
            distance_matrix[i, j] = 1.0 - dot(self[t1], self[t2]) / (norm(self[t1]) * norm(self[t2]))
            
    def nbow(document):
        d = zeros(vocab_len, dtype=double)
        nbow = dictionary.doc2bow(document)  # Word frequencies.
        doc_len = len(document)
        for idx, freq in nbow:
            d[idx] = freq / float(doc_len)  # Normalized word frequencies.
        return d
    
    d1 = nbow(document1)
    d2 = nbow(document2)
    return emd(d1, d2, distance_matrix)
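The method above assumes the numpy helpers (`zeros`, `double`, `dot`, `norm`), gensim's `Dictionary` and pyemd's `emd` are imported at module level, and that `self` behaves like a gensim keyed-vectors object (`token in self`, `self[token]`). A minimal, hypothetical usage sketch; the model path and sentences are placeholders, and gensim's built-in `wmdistance` (which uses a Euclidean ground distance) stands in for the method above:

# Hypothetical usage sketch; model path and sentences are placeholders.
from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format('vectors.bin', binary=True)  # pre-trained vectors (assumed)
doc_a = 'obama speaks to the media in illinois'.lower().split()
doc_b = 'the president greets the press in chicago'.lower().split()
print(model.wmdistance(doc_a, doc_b))  # gensim's built-in equivalent of the method above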
Example #2
def wmd(document1, document2, model):
    # Remove out-of-vocabulary words.
    document1 = [token for token in document1 if token in model]
    document2 = [token for token in document2 if token in model]
    if len(document1) == 0 or len(document2) == 0:
        return 1.
    dictionary = Dictionary(documents=[document1, document2])
    vocab_len = len(dictionary)
    # Compute distance matrix.
    distance_matrix = zeros((vocab_len, vocab_len), dtype=double)
    for i, t1 in list(dictionary.items()):
        for j, t2 in list(dictionary.items()):
            distance_matrix[i, j] = scipy.spatial.distance.cosine(
                model[t1], model[t2])
    if np_sum(distance_matrix) == 0.0:
        # `emd` gets stuck if the distance matrix contains only zeros.
        return 0.

    def nbow(document):
        d = zeros(vocab_len, dtype=double)
        nbow = dictionary.doc2bow(document)  # Word frequencies.
        doc_len = len(document)
        for idx, freq in nbow:
            d[idx] = freq / float(doc_len)  # Normalized word frequencies.
        return d

    # Compute nBOW representation of documents.
    d1 = nbow(document1)
    d2 = nbow(document2)
    # Compute WMD.
    res = emd(d1, d2, distance_matrix)
    return res if res >= 0 else 1
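This standalone helper relies on a handful of module-level imports that are not shown; a sketch of the header it assumes (all names match how they are used above):

# Sketch of the imports the wmd() helper above assumes.
import scipy.spatial.distance
from numpy import zeros, double, sum as np_sum
from gensim.corpora import Dictionary
from pyemd import emd  # pyemd must be installed (it needs a C compiler to build)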
Example #3
    def wmdistance(self, document1, document2, tf_idf1, tf_idf2, weight):

        document1 = [
            token for token in document1 if self.model.__contains__(token)
        ]
        document2 = [
            token for token in document2 if self.model.__contains__(token)
        ]
        dictionary = Dictionary(documents=[document1, document2])
        vocab_len = len(dictionary)

        if vocab_len == 1:
            # Both documents consist of a single unique token
            return 0.0

        # Sets for faster look-up.
        docset1 = set(document1)
        docset2 = set(document2)

        # Compute distance matrix.

        distance_matrix = np.zeros((vocab_len, vocab_len), dtype=np.double)
        for i, t1 in dictionary.items():
            for j, t2 in dictionary.items():
                if t1 not in docset1 or t2 not in docset2:
                    continue
                # Compute Euclidean distance between word vectors.
                distance_matrix[i, j] = np.sqrt(
                    np.sum((self.model.get_vector(t1) -
                            self.model.get_vector(t2))**2))

        if np.sum(distance_matrix) == 0.0:
            logger.info(
                'The distance matrix is all zeros. Aborting (returning inf).')
            return float('inf')

        def _tf_idf(word_tfidf):
            d = np.zeros(vocab_len, dtype=np.double)
            for id_, term in dictionary.items():
                d[id_] = word_tfidf[term]
            return d

        def nbow(document):
            d = np.zeros(vocab_len, dtype=np.double)
            nbow = dictionary.doc2bow(document)  # Word frequencies.
            doc_len = len(document)
            for idx, freq in nbow:
                d[idx] = freq / float(doc_len)  # Normalized word frequencies.
            return d

        # Compute nBOW representation of documents.
        if weight == 'TFIDF':
            d1 = _tf_idf(tf_idf1)
            d2 = _tf_idf(tf_idf2)
        elif weight == 'Norm':
            d1 = nbow(document1)
            d2 = nbow(document2)

        # Compute WMD.
        return emd(d1, d2, distance_matrix)
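When `weight == 'TFIDF'`, the `tf_idf1`/`tf_idf2` arguments must map every token of the merged vocabulary to a weight, since the `_tf_idf` helper indexes them for every dictionary term. A hypothetical sketch of building such dicts with gensim's `TfidfModel`; `doc1`/`doc2` are placeholder token lists, and in practice the model would be fitted on a larger corpus:

# Hypothetical sketch of building the tf_idf1/tf_idf2 arguments; doc1/doc2 are placeholders.
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

dct = Dictionary([doc1, doc2])
tfidf = TfidfModel(dictionary=dct)

def tfidf_dict(doc):
    # Map every token in the shared vocabulary to its weight (0.0 if absent from this doc).
    weights = dict(tfidf[dct.doc2bow(doc)])
    return {dct[i]: weights.get(i, 0.0) for i in dct.keys()}

tf_idf1, tf_idf2 = tfidf_dict(doc1), tfidf_dict(doc2)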
Example #4
def create_dictionaries(model = None, sentences = None):
	if (sentences is not None) and (model is not None):
		gensim_dict = Dictionary()
		gensim_dict.doc2bow(model.wv.vocab.keys(),
							allow_update=True)
		w2indx = {v: k+1 for k, v in gensim_dict.items()}
		w2vec = {word: model[word] for word in w2indx.keys()}
		idx2w = {k+1: v for k, v in gensim_dict.items()}
		def parse_dataset(sentences):
			data=[]
			for sentence in sentences:
				new_txt = []
				for word in sentence:
					try:
						new_txt.append(w2indx[word])
					except:
						new_txt.append(0)
				data.append(new_txt)
			return data
			
		sentences=parse_dataset(sentences)
		sentences= sequence.pad_sequences(sentences, maxlen=maxlen)
		return w2indx, w2vec, idx2w, sentences
	else:
		logging.info ('No data provided...')
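A minimal sketch (not part of the original) of how the returned `w2indx`/`w2vec` mappings are typically turned into an embedding weight matrix for a Keras `Embedding` layer; `model`, `sentences` and `vocab_dim` are placeholders:

# Sketch only: build an embedding matrix from the mappings returned above.
import numpy as np

w2indx, w2vec, idx2w, padded = create_dictionaries(model=model, sentences=sentences)
n_symbols = len(w2indx) + 1                     # +1 because index 0 is reserved for unknown/padded words
embedding_weights = np.zeros((n_symbols, vocab_dim))
for word, index in w2indx.items():
    embedding_weights[index, :] = w2vec[word]   # row 0 stays all zeros
# embedding_weights can then be passed to keras.layers.Embedding(..., weights=[embedding_weights]).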
Example #5
def create_dictionaries(p_model):
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(p_model.wv.vocab.keys(), allow_update=True)
    print(gensim_dict.items().__sizeof__())
    w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # word index, numbering starts at 1
    w2vec = {word: p_model[word] for word in w2indx.keys()}  # word vectors
    print('test')
    return w2indx, w2vec
Example #6
def get_emd_score(example, pivot_words=[]):
    #normal emd score calculation
    #------------Code required to get emd score when memory present-------
    doc1 = deepcopy(example.embedding1[1][0])
    doc2 = deepcopy(example.embedding2[1][0])
    embed1 = deepcopy(example.embedding1[0][0])
    embed2 = deepcopy(example.embedding2[0][0])

    map1 = {}
    for x in range(len(doc1)):
        map1[doc1[x]] = embed1[x]

    map2 = {}
    for x in range(len(doc2)):
        map2[doc2[x]] = embed2[x]
    for word in list(doc1):
        if word in blacklist or word in pivot_words or word[0] == '#' or len(
                word) == 1 or word in stopwords.words(
                    'english') or word.isdigit() or word.isdecimal():
            doc1.remove(word)

    for word in list(doc2):
        if word in blacklist or word in pivot_words or word[0] == '#' or len(
                word) == 1 or word in stopwords.words(
                    'english') or word.isdigit() or word.isdecimal():
            doc2.remove(word)
    dic = Dictionary(documents=[doc1, doc2])

    docset1 = set(doc1)
    docset2 = set(doc2)
    vocab_len = len(dic)

    d_matrix = np.zeros((vocab_len, vocab_len), dtype=np.float64)
    if len(doc1) == 0 or len(doc2) == 0:
        return 1.0

    for i, t1 in dic.items():
        for j, t2 in dic.items():
            if t1 not in docset1 or t2 not in docset2:
                continue
            v1, v2 = map1[t1], map2[t2]
            d_matrix[i, j] = np.sqrt(np.sum((v1 - v2)**2))

    def nbow(document):
        d = np.zeros(vocab_len, dtype=np.float64)
        nbow = dic.doc2bow(document)  # Word frequencies.
        doc_len = len(document)
        for idx, freq in nbow:
            d[idx] = freq / float(doc_len)  # Normalized word frequencies.
        return d

    d1 = nbow(doc1)
    d2 = nbow(doc2)

    output_emd = pyemd.emd(d1, d2, d_matrix)
    return emd_to_score(output_emd)
Example #7
File: wmd.py Project: ahare63/embeddings
def wmdistance(document1, document2, embedder):
    """
    Compute the Word Mover's Distance between two documents. When using this
    code, please consider citing the following papers:

    .. Ofir Pele and Michael Werman, "A linear time histogram metric for improved SIFT matching".
    .. Ofir Pele and Michael Werman, "Fast and robust earth mover's distances".
    .. Matt Kusner et al. "From Word Embeddings To Document Distances".
    
    `document1` and `document2` should be lists of words. `embedder` should be an embeddings object from this library, e.g. `FastTextEmbedding()`.
    You should verify that each word in `document1` and `document2` has a valid embedding in the `embedder` object; otherwise this may throw a `NoneTypeError`
    from line 39.
    """

    dictionary = Dictionary(documents=[document1, document2])
    vocab_len = len(dictionary)

    # Sets for faster look-up.
    docset1 = set(document1)
    docset2 = set(document2)

    # Compute distance matrix.
    distance_matrix = np.zeros((vocab_len, vocab_len), dtype=np.double)
    for i, t1 in dictionary.items():
        for j, t2 in dictionary.items():
            if t1 not in docset1 or t2 not in docset2:
                continue
            # Compute Euclidean distance between word vectors.
            distance_matrix[i, j] = sqrt(
                np.sum((np.asarray(embedder.emb(t1)) -
                        np.asarray(embedder.emb(t2)))**2))

    if np.sum(distance_matrix) == 0.0:
        # `emd` gets stuck if the distance matrix contains only zeros.
        logger.info(
            'The distance matrix is all zeros. Aborting (returning inf).')
        return float('inf')

    def nbow(document):
        d = np.zeros(vocab_len, dtype=np.double)
        nbow = dictionary.doc2bow(document)  # Word frequencies.
        doc_len = len(document)
        for idx, freq in nbow:
            d[idx] = freq / float(doc_len)  # Normalized word frequencies.
        return d

    # Compute nBOW representation of documents.
    d1 = nbow(document1)
    d2 = nbow(document2)

    # Compute WMD.
    return emd(d1, d2, distance_matrix)
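All this function requires of `embedder` is an `emb(token)` method that returns a vector for every token passed in; below is a minimal, hypothetical stand-in useful for exercising the function (the real project supplies classes such as `FastTextEmbedding` with the same interface):

# Hypothetical embedder stand-in for exercising wmdistance(); vectors are toy values.
class DictEmbedder:
    def __init__(self, vectors):
        self.vectors = vectors            # dict: token -> vector
    def emb(self, token):
        return self.vectors[token]        # must return a valid vector for every token passed in

vecs = {'cat': [1.0, 0.0], 'dog': [0.9, 0.1], 'car': [0.0, 1.0]}
print(wmdistance(['cat'], ['dog'], DictEmbedder(vecs)))   # small distance
print(wmdistance(['cat'], ['car'], DictEmbedder(vecs)))   # larger distance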
Example #8
def create_dictionaries(model):

    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
    w2indx = {v: k + 1 for k, v in gensim_dict.items()}
    w2vec = {word: model[word] for word in w2indx.keys()}
    return w2indx, w2vec
Example #9
def create_dictionaries(model=None, combined=None):
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)

        w2indx = {v: k + 1
                  for k, v in gensim_dict.items()}  # word index, (k->v)=>(v->k)
        w2vec = {word: model[word]
                 for word in w2indx.keys()}  # word vectors, (word->model(word))

        def parse_dataset(combined):
            """
            :intro: Words become integers
            :param combined:
            :return:
            """
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)  # freq < 10->0
                data.append(new_txt)
            return data

        n_combined = parse_dataset(combined)
        n_combined = sequence.pad_sequences(
            n_combined,
            maxlen=input_length)  # indices of the words in each sentence; words with frequency < 10 get index 0
        return w2indx, w2vec, n_combined
    else:
        print('No data provided...')
Example #10
def create_dictionaries(maxlen, model=None, combined=None):
    """
    :param model: the trained Word2vec_model
    :param combined: the text
    :return:  the model's word-index dict, the model's word-vector dict, and the padded list of text indices
    """
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        # the index of a word which have word vector is not 0
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        # integrate all the corresponding word vectors into the word vector matrix
        w2vec = {word: model[word] for word in w2indx.keys()}

        # Map the text to index values using the dictionary
        def parse_dataset(combined):
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        # Build the list of index sequences
        combined_index = parse_dataset(combined)
        # Pad the sequences: shorter than maxlen are zero-padded at the front, longer ones are truncated; the result is at least 2-D
        combined_pad = sequence.pad_sequences(combined_index, maxlen=maxlen)
        return w2indx, w2vec, combined_pad
    else:
        print('No data provided...')
Example #11
def create_dictionaries(model=None,
                        combined=None):
    ''' Function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries

    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.vocab.keys(),
                            allow_update=True)
        w2indx = {v: k+1 for k, v in gensim_dict.items()}  # indices of all words with frequency > 10
        w2vec = {word: model[word] for word in w2indx.keys()}  # word vectors of all words with frequency > 10

        def parse_dataset(combined):
            ''' Words become integers
            '''
            data=[]
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data
        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(combined, maxlen=maxlen)  # indices of the words in each sentence; words with frequency < 10 get index 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
def create_dictionaries(combined=None):
    # Load the word2vec model:
    model = gensim.models.KeyedVectors.load_word2vec_format(
        'Word2vec_model2.vector', binary=False)  # load the word vectors
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        # Build the word-to-index dictionary
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        # Build the word-to-vector dictionary
        w2vec = {word: model[word] for word in w2indx.keys()}

        # For each tokenized sentence, convert every feature word to its index in the dictionary built from the word2vec model; words without an embedding get index 0.
        # In this way each sentence/sample becomes a vector whose dimension is the number of feature words in that sentence.
        x_idx = []
        for sentence in combined:
            new_txt = []
            sentence = sentence.strip('\r\n')  # strip leading/trailing newline characters
            wordList = sentence.split(' ')  # split the sentence into words
            for word in wordList:
                try:
                    new_txt.append(w2indx[word])
                except:
                    new_txt.append(0)
            x_idx.append(new_txt)

        # Normalize sentence lengths
        x_idx = sequence.pad_sequences(x_idx, maxlen=max_sentence_len)
        return w2indx, w2vec, x_idx
    else:
        print('Error when transferring x_words to x_idx')
Example #13
def corpus_dict(model=None,corpus=None):
    ''' Reorganize the training results and rebuild the dictionary
    '''
    
    if (corpus is not None) and (model is not None):
        gensim_dict=Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        # Count how often each keyword occurs in the documents and return the result as a sparse representation; allow_update lets new documents extend it
        w2indx = {v: k+1 for k, v in gensim_dict.items()}
        
        w2vec = {word: model[word] for word in w2indx.keys()}

        def rebuild_corpus(corpus):
           
            data=[]
            for sentence in corpus:
                new_txt=[]
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data
        corpus=rebuild_corpus(corpus)
        # Parameters can control whether truncation and padding happen at the start or the end
        corpus = sequence.pad_sequences(corpus, maxlen=maxlen)
        return w2indx, w2vec,corpus
    else:
        print("没输入,你让我咋输出...")
Example #14
def create_emb_weights(model=None):
    print("create_emb_weights...")
    if model is not None:
        gensim_dict = Dictionary(
        )  # gensim_dict:  Dictionary(647 unique tokens: ['app', '一套', '久', '买', '代']...)
        gensim_dict.doc2bow(
            model.wv.vocab.keys(),
            allow_update=True)  # for inspection, vocab.keys(): [word1, word2, word3]
        # gensim_dict:  [(19, 1), (29, 1), (32, 1), (45, 1), (102, 1), (108, 1), (130, 1)]
        # Words with frequency < 10 get index 0, hence k+1
        # dct_w2indx = gensim_dict.token2id                        # dict {word: frequency}
        dct_w2indx = {
            v: k + 1
            for k, v in gensim_dict.items()
        }  # indices of all words with frequency > 10, (k->v)=>(v->k), {'hellip': 1, '上菜': 2, '不错': 3, ...}
        dct_w2vec = {word: model[word]
                     for word in dct_w2indx.keys()
                     }  # word vectors of all words with frequency > 10, (word->model(word))
        n_symbols = len(dct_w2indx) + 1  # total number of word indices; words with frequency < 10 get index 0, hence +1
        embedding_weights = np.zeros(
            (n_symbols, VOCAB_DIM))  # initialize; the word with index 0 gets an all-zero vector
        for word, index in dct_w2indx.items():  # starting from index 1, assign each word its word vector
            embedding_weights[index, :] = dct_w2vec[word]
        print("embedding_weights: ", np.shape(embedding_weights))
        return embedding_weights
    else:
        return None
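A hypothetical sketch of plugging the returned matrix into a Keras model; layer sizes and `input_length` are placeholders, `model` is the trained word-vector model assumed above, and `VOCAB_DIM` is the constant already used in the function:

# Hypothetical usage of create_emb_weights(); sizes are placeholders.
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

weights = create_emb_weights(model)
net = Sequential()
net.add(Embedding(input_dim=weights.shape[0], output_dim=VOCAB_DIM,
                  weights=[weights], input_length=100, trainable=False))
net.add(LSTM(128))
net.add(Dense(1, activation='sigmoid'))
net.compile(loss='binary_crossentropy', optimizer='adam')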
Example #15
def create_dictionaries(train=None, test=None, model=None):
    ''' Function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries
    '''
    if (train is not None) and (model is not None) and (test is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.vocab.keys(), allow_update=True)
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(data):
            #import pdb;pdb.set_trace()
            txt = data.lower().replace('\n', '').split()
            new_txt = []
            for word in txt:
                try:
                    new_txt.append(w2indx[word])
                except:
                    new_txt.append(0)
            return new_txt
            #return data

        # read the movie review data
        testData['text'] = test['text'].apply(parse_dataset)
        trainData['text'] = train['text'].apply(parse_dataset)

        return w2indx, w2vec, trainData, testData
    else:
        print('No data provided...')
Example #16
def create_dictionaries(model=None, combined=None):
    """
    :param model: the (word-vector) model passed in
    :param combined: the vectors corresponding to the corpus
    :return: 1 - a dict mapping each word to its index  2 - a dict of word vectors  3 - the transformed training and test data
    """
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()  # create the gensim dictionary
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)

        w2_index = {v: k + 1 for k, v in gensim_dict.items()}  # indices of all words with frequency > 10
        w2_vec = {word: model[word]
                  for word in w2_index.keys()}  # word vectors of all words with frequency > 10

        def parse_dataset(combined):
            """词语变成了整数数字"""

            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2_index[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        # Indices of the words in each sentence; words with frequency < 10 get index 0
        combined = sequence.pad_sequences(
            combined, maxlen=maxlen)  # cap each sentence at maxlen: shorter ones are zero-padded, longer ones truncated
        return w2_index, w2_vec, combined
    else:
        print('No data provided... this will cause an error')
Example #17
def create_dictionaries(model=None, combined=None):
    """
    Return the index mapping, the word-vector matrix and the sentences with uniform length and indices.

    The function does a number of jobs:
        1- creates a word-to-index mapping
        2- creates a word-to-vector mapping
        3- transforms the training and testing dictionaries
    """
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        # Words that have a word vector get a nonzero index
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        # Gather all the corresponding vectors into a word-vector matrix
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(combined):
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        # Unify sentence lengths with Keras' pad_sequences
        combined = sequence.pad_sequences(combined, maxlen=max_len)
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example #18
def create_dictionaries(model=None, combined=None):
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)

        # Indices of all words with frequency > 20
        word2index = {v: k + 1 for k, v in gensim_dict.items()}
        word2vec = {word: model[word] for word in word2index.keys()}

        # Convert words to indices
        def parse_dataset(combined):
            data = []
            for sentence in combined:
                new_text = []
                for word in sentence:
                    try:
                        new_text.append(word2index[word])
                    except:
                        new_text.append(0)
                data.append(new_text)
            return data

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        # (21088, 100)
        # print combined.shape

        return word2index, word2vec, combined
    else:
        print('No data provided...')
Example #19
def create_dictionaries(model=None, combined=None):

    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)

        word2id = {v: k + 1 for k, v in gensim_dict.items()}
        word2vec = {word: model[word] for word in word2id.keys()}

        def parse_dataset(combined):
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(word2id[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(combined, maxlen=maxlen)

        return word2id, word2vec, combined
Example #20
def create_dictionaries(model=None, combined=None):
    ''' Function does a number of jobs:
        1- creates a word-to-index mapping
        2- creates a word-to-vector mapping
        3- transforms the training and testing dictionaries

    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        #  Words with frequency < 10 map to 0, hence k+1
        w2indx = {v: k + 1
                  for k, v in gensim_dict.items()
                  }  # indices of all words with frequency > 10, (k->v)=>(v->k)
        w2vec = {word: model[word]
                 for word in w2indx.keys()
                 }  # word vectors of all words with frequency > 10, (word->model(word))

        def parse_dataset(combined):  # closure --> used locally
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)  # freq < 10 -> 0
                data.append(new_txt)
            return data  # word=>index

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(
            combined, maxlen=maxlen)  # indices of the words in each sentence; words with frequency < 10 get index 0
        # Return each word's index (numbered from 1), the word vectors, and the word indices of each sentence
        '''
        f_3 = open('./data/create_dictionaries_w2indx.txt', 'w', encoding='utf-8')
        for word,index in w2indx.items():   
            f_3.write(str(word))
            f_3.write(' : ')
            f_3.write(str(index))
            f_3.write('\n')
        f_3.close()
        f_4 = open('./data/create_dictionaries_w2vec.txt', 'w', encoding='utf-8')
        for word,index in w2vec.items():   
            f_4.write(str(word))
            f_4.write(' : ')
            f_4.write(str(index))
            f_4.write('\n')
        f_4.close()
        f_5 = open('./data/create_dictionaries_combined.txt', 'w', encoding='utf-8')
        for line in combined:   
            f_5.write(str(line))
            f_5.write('\n')
        f_5.close()
        '''
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example #21
def create_dictionaries(model=None,
                        combined=None):
    ''' Function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries
    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(),
                            allow_update=True)
        w2indx = {v: k+1 for k, v in gensim_dict.items()}  # indices of all words with frequency > 10
        w2vec = {word: model[word] for word in w2indx.keys()}  # word vectors of all words with frequency > 10

        def parse_dataset(combined):
            ''' Words become integers
            '''
            data=[]
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data
        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(combined, maxlen=maxlen)  # indices of the words in each sentence; words with frequency < 10 get index 0
        return w2indx, w2vec,combined
    else:
        print('No data provided...')
def create_dictionaries(model=None, combined=None):

    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        #  Words with frequency < 10 map to 0, hence k+1
        w2indx = {v: k + 1
                  for k, v in gensim_dict.items()
                  }  # indices of all words with frequency > 10, (k->v)=>(v->k)
        w2vec = {word: model[word]
                 for word in w2indx.keys()
                 }  # word vectors of all words with frequency > 10, (word->model(word))

        def parse_dataset(combined):  # closure --> used locally
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)  # freq < 10 -> 0
                data.append(new_txt)
            return data  # word=>index

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(
            combined, maxlen=maxlen)  # indices of the words in each sentence; words with frequency < 10 get index 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
    def create_dictionaries(self, model, trainX):

        gensim_dict = Dictionary()

        # print model.vocab.keys()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # indices of all words with frequency > 10
        w2vec = {word: model[word]
                 for word in w2indx.keys()}  # word vectors of all words with frequency > 10

        data = []
        for sentence in trainX:
            new_txt = []
            for word in sentence:
                try:
                    new_txt.append(w2indx[word])
                except:
                    new_txt.append(0)
            data.append(new_txt)
        # return data

        # combined = parse_dataset(combined)
        trainX = data
        trainX = sequence.pad_sequences(
            trainX,
            maxlen=self.vocabmaxlen)  # indices of the words in each sentence; words with frequency < 10 get index 0
        return w2indx, w2vec, trainX
def create_dictionaries(model=None, combined=None):

    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        w2indx = {v: k + 1
                  for k, v in gensim_dict.items()
                  }  #the index mapping for word frequency > 10
        w2vec = {word: model[word]
                 for word in w2indx.keys()
                 }  #the vector mapping for word frequency > 10

        def parse_dataset(combined):
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example #25
def create_dictionaries(model=None, combined=None):

    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        #  Words with frequency < 10 map to 0, hence k+1
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(combined):  # closure --> used locally
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data  # word=>index

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(combined, maxlen=maxlen)
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example #26
def create_dictionaries(p_model):
    gensim_dict = Dictionary()
    # p_model.build_vocab('./lstm_vocab')
    gensim_dict.doc2bow(p_model.wv.vocab.keys(), allow_update=True)
    w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # word index, numbering starts at 1
    w2vec = {word: p_model[word] for word in w2indx.keys()}  # word vectors
    return w2indx, w2vec
Example #27
def get_corpus(path, train=True):
    severity_index = {
        'blocker': 0,
        'critical': 1,
        'major': 2,
        'minor': 3,
        'trivial': 4
    }
    split_corpus = []
    tag = []
    with open(path, encoding='utf-8') as fin:
        for line in fin.readlines():
            temp = json.loads(line.strip())
            split_corpus.append(temp["summary"])
            tag.append(severity_index[temp["severity"]])
    frequency = defaultdict(int)
    for line in split_corpus:
        for token in line:
            frequency[token] += 1
    split_corpus = [[word for word in x if frequency[word] >= 5]
                    for x in split_corpus]
    word_dict = Dictionary(split_corpus)
    word_index = {word: index for index, word in word_dict.items()}
    print(len(word_index))
    feature_matrix = np.zeros((len(split_corpus), len(word_index)),
                              dtype=int)  # feature vectors; the class labels are returned separately in `tag`
    for i in range(len(split_corpus)):
        for word in split_corpus[i]:
            feature_matrix[i, word_index[word]] = 1
    return feature_matrix, tag
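A hypothetical follow-up showing how the bag-of-words features and severity labels returned by `get_corpus` could feed a classifier; the file path and split are placeholders:

# Hypothetical usage of get_corpus(); file path and split are placeholders.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB

X, y = get_corpus('bug_reports.json')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
clf = BernoulliNB().fit(X_train, y_train)
print(clf.score(X_test, y_test))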
def create_dictionaries(train=None,
                        test=None,
                        model=None):
    ''' Function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries

    '''
    if (train is not None) and (model is not None) and (test is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.vocab.keys(),
                            allow_update=True)
        w2indx = {v: k+1 for k, v in gensim_dict.items()}
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(data):
            ''' Words become integers
            '''
            for key in data.keys():
                txt = data[key].lower().replace('\n', '').split()
                new_txt = []
                for word in txt:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data[key] = new_txt
            return data
        train = parse_dataset(train)
        test = parse_dataset(test)
        return w2indx, w2vec, train, test
    else:
        print('No data provided...')
def create_dictionaries(train=None, test=None, model=None):
    ''' Function does a number of jobs:
        1- Creates a word to index mapping
        2- Creates a word to vector mapping
        3- Transforms the Training and Testing Dictionaries

    '''
    if (train is not None) and (model is not None) and (test is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.vocab.keys(), allow_update=True)
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(data):
            ''' Words become integers
            '''
            for key in data.keys():
                txt = data[key].lower().replace('\n', '').split()
                new_txt = []
                for word in txt:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data[key] = new_txt
            return data

        train = parse_dataset(train)
        test = parse_dataset(test)
        return w2indx, w2vec, train, test
    else:
        print('No data provided...')
Example #30
def create_dictionaries(model=None,
                        combined=None):
    ''' Function does a number of jobs:
        1- creates a word-to-index mapping
        2- creates a word-to-vector mapping
        3- transforms the training and testing dictionaries

    '''
    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(),
                            allow_update=True)
        #  Words with frequency < 10 map to 0, hence k+1
        w2indx = {v: k+1 for k, v in gensim_dict.items()}  # indices of all words with frequency > 10, (k->v)=>(v->k)
        w2vec = {word: model[word] for word in w2indx.keys()}  # word vectors of all words with frequency > 10, (word->model(word))

        def parse_dataset(combined): # closure --> used locally
            data=[]
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0) # freq < 10 -> 0
                data.append(new_txt)
            return data # word=>index
        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(combined, maxlen=maxlen)  # indices of the words in each sentence; words with frequency < 10 get index 0
        return w2indx, w2vec,combined
    else:
        print('No data provided...')
def create_dictionaries(train=None,
                        test=None,
                        predict=None,
                        model=None):
    if (train is not None) and (model is not None) and (test is not None) and (predict is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(),
                            allow_update=True)
        w2indx = {v: k+1 for k, v in gensim_dict.items()}
        w2vec = {word: model[word] for word in w2indx.keys()}

        def parse_dataset(data ):
            ''' Words become integers
            '''
            for key in data.keys():
                txt = data[key].lower().replace('\n', '').split()
                new_txt = []
                for word in txt:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data[key] = new_txt
            return data

        train = parse_dataset(train )
        test = parse_dataset(test )
        predict = parse_dataset(predict )
        return w2indx, w2vec, train, test , predict
    else:
        print('No data provided...')
Example #32
File: LSTM_QG.py Project: yanHaowen/comp
def create_dictionaries(model=None, combined=None):

    if (combined is not None) and (model is not None):
        gensim_dict = Dictionary()
        gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
        w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # indices of all words with frequency > 10
        w2vec = {word: model[word] for word in w2indx.keys()}  # word vectors of all words with frequency > 10

        def parse_dataset(combined):
            ''' Words become integers
            '''
            data = []
            for sentence in combined:
                new_txt = []
                for word in sentence:
                    try:
                        new_txt.append(w2indx[word])
                    except:
                        new_txt.append(0)
                data.append(new_txt)
            return data

        combined = parse_dataset(combined)
        combined = sequence.pad_sequences(
            combined, maxlen=140)  # indices of the words in each sentence; words with frequency < 10 get index 0
        return w2indx, w2vec, combined
    else:
        print('No data provided...')
Example #33
def create_mapping_dicts(wrd_embedding, filter_corpus=False, bodies=None,
                         headlines=None): 
    """Generate word:index, word:vector, index:word dictionaries. 

    Args: 
    ----
        wrd_embedding: gensim.models.word2vec.Word2Vec fitted model
        filter_corpus (optional): boolean  
            Filter the corpus to only those words seen in the bodies/headlines. 
        bodies (optional): list of lists 
            Must be passed in if `filter_corpus` is True. 
        headlines (optional): list of lists  
            Must be passed in if `filter_corpus` is True. 

    Return: 
    ------
        word_idx_dct: dict
        idx_word_dct: dict
        word_vector_dct: dict
    """

    if filter_corpus:
        if (not bodies or not headlines): 
            excep_str = "Must pass in bodies and headlines with filter_corpus True!"
            raise Exception(excep_str)
        else: 
            wrd_embedding = _filter_corpus(bodies, headlines, wrd_embedding)

    gensim_dct = Dictionary()
    gensim_dct.doc2bow(wrd_embedding.vocab.keys(), allow_update=True)

    # Leave index 0 for the newline character
    word_idx_dct = {wrd: (idx + 1) for idx, wrd in gensim_dct.items()}
    idx_word_dct = {(idx + 1): wrd for idx, wrd in gensim_dct.items()}
    word_idx_dct['\n'] = 0
    idx_word_dct[0] = '\n'

    word_vector_dct = {wrd: wrd_embedding[wrd] for idx, wrd in gensim_dct.items()}
    vec_dim = next(len(value) for value in word_vector_dct.values())
    word_vector_dct['\n'] = np.zeros((vec_dim))

    return word_idx_dct, idx_word_dct, word_vector_dct 
Example #34
def create_mapping_dicts(wrd_embedding, reviews=None, vocab_size=None):
    """Generate word:index, word:vector, index:word dictionaries. 

    Args: 
    ----
        wrd_embedding: gensim.models.word2vec.Word2Vec fitted model
        reviews (optional): np.array (or array-like) of lists of strings
            Used to filter the vocabulary, either to only those words in `reviews`
            or the most common `vocab_size` words in `reviews` that are also in 
            the `wrd_embedding`.
        vocab_size (optional): int
            Keep only `vocab_size` most common words from the reviews. 

    Return: 
    ------
        word_idx_dct: dict
        idx_word_dct: dict
        word_vector_dct: dict
    """

    if reviews is not None: 
        wrd_embedding = _filter_corpus(wrd_embedding, reviews, vocab_size)

    gensim_dct = Dictionary()
    gensim_dct.doc2bow(wrd_embedding.vocab.keys(), allow_update=True)

    # Leave index 0 for masking the padding, 1 for the end of sequence
    # character (EOS), and 2 for unknown words (denoted 'UNK')
    wrd_idx_dct = {wrd: (idx + 3) for idx, wrd in gensim_dct.items()}
    idx_wrd_dct = {(idx + 3): wrd for idx, wrd in gensim_dct.items()}
    wrd_idx_dct['EOS'] = 1
    idx_wrd_dct[1] = 'EOS'
    wrd_idx_dct['UNK'] = 2
    idx_wrd_dct[2] = 'UNK'

    wrd_vector_dct = {wrd: wrd_embedding[wrd] for idx, wrd in gensim_dct.items()}
    embedding_dim = wrd_embedding.vector_size
    wrd_vector_dct['EOS'] = np.zeros((embedding_dim))
    wrd_vector_dct['UNK'] = np.zeros((embedding_dim))

    return wrd_idx_dct, idx_wrd_dct, wrd_vector_dct 
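A minimal sketch (not part of the original) of applying the returned mappings to a tokenized review, with the 'UNK' (2) fallback for out-of-vocabulary words and the 'EOS' (1) marker appended:

# Sketch only: convert one tokenized review into an index sequence using the dicts above.
def review_to_indices(review, wrd_idx_dct):
    idxs = [wrd_idx_dct.get(wrd, wrd_idx_dct['UNK']) for wrd in review]
    idxs.append(wrd_idx_dct['EOS'])
    return idxs

# wrd_idx_dct, idx_wrd_dct, wrd_vector_dct = create_mapping_dicts(wrd_embedding, reviews)
# seq = review_to_indices(['great', 'movie'], wrd_idx_dct)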
Example #35
def create_mapping_dicts(wrd_embedding, filter_corpus=False, bodies=None,
                         headlines=None): 
    """Generate word:index, word:vector, index:word dictionaries. 

    Args: 
    ----
        wrd_embedding: gensim.models.word2vec.Word2Vec fitted model
        filter_corpus (optional): boolean  
            Filter the corpus to only those words seen in the articles. Use
            to speed up iteration during initial building/training phases. 
        bodies (optional): list of lists 
            Must be passed in if `filter_corpus` is True. 
        headlines (optional): list of lists  
            Must be passed in if `filter_corpus` is True. 

    Return: 
    ------
        word_idx_dct: dict
        idx_word_dct: dict
        word_vector_dct: dict
    """

    if filter_corpus:
        if (not bodies or not headlines): 
            raise Exception('Must pass in bodies and headlines with filter_corpus as True!')
        else: 
            wrd_embedding = _filter_corpus(bodies, headlines, wrd_embedding)

    gensim_dct = Dictionary()
    gensim_dct.doc2bow(wrd_embedding.vocab.keys(), allow_update=True)

    word_idx_dct = {wrd: idx for idx, wrd in gensim_dct.items()}
    idx_word_dct = {idx: wrd for idx, wrd in gensim_dct.items()}
    word_vector_dct = {wrd: wrd_embedding[wrd] for idx, wrd in gensim_dct.items()}

    return word_idx_dct, idx_word_dct, word_vector_dct 
Example #36
    def wmdistance(self, document1, document2):
        """
        Compute the Word Mover's Distance between two documents. When using this
        code, please consider citing the following papers:

        .. Ofir Pele and Michael Werman, "A linear time histogram metric for improved SIFT matching".
        .. Ofir Pele and Michael Werman, "Fast and robust earth mover's distances".
        .. Matt Kusner et al. "From Word Embeddings To Document Distances".

        Note that if one of the documents have no words that exist in the
        Word2Vec vocab, `float('inf')` (i.e. infinity) will be returned.

        This method only works if `pyemd` is installed (can be installed via pip, but requires a C compiler).

        Example:
            >>> # Train word2vec model.
            >>> model = Word2Vec(sentences)

            >>> # Some sentences to test.
            >>> sentence_obama = 'Obama speaks to the media in Illinois'.lower().split()
            >>> sentence_president = 'The president greets the press in Chicago'.lower().split()

            >>> # Remove their stopwords.
            >>> from nltk.corpus import stopwords
            >>> stopwords = nltk.corpus.stopwords.words('english')
            >>> sentence_obama = [w for w in sentence_obama if w not in stopwords]
            >>> sentence_president = [w for w in sentence_president if w not in stopwords]

            >>> # Compute WMD.
            >>> distance = model.wmdistance(sentence_obama, sentence_president)
        """

        if not PYEMD_EXT:
            raise ImportError("Please install pyemd Python package to compute WMD.")

        # Remove out-of-vocabulary words.
        len_pre_oov1 = len(document1)
        len_pre_oov2 = len(document2)
        document1 = [token for token in document1 if token in self]
        document2 = [token for token in document2 if token in self]
        diff1 = len_pre_oov1 - len(document1)
        diff2 = len_pre_oov2 - len(document2)
        if diff1 > 0 or diff2 > 0:
            logger.info('Removed %d and %d OOV words from document 1 and 2 (respectively).', diff1, diff2)

        if len(document1) == 0 or len(document2) == 0:
            logger.info(
                "At least one of the documents had no words that werein the vocabulary. "
                "Aborting (returning inf)."
            )
            return float('inf')

        dictionary = Dictionary(documents=[document1, document2])
        vocab_len = len(dictionary)

        if vocab_len == 1:
            # Both documents consist of a single unique token
            return 0.0

        # Sets for faster look-up.
        docset1 = set(document1)
        docset2 = set(document2)

        # Compute distance matrix.
        distance_matrix = zeros((vocab_len, vocab_len), dtype=double)
        for i, t1 in dictionary.items():
            for j, t2 in dictionary.items():
                if t1 not in docset1 or t2 not in docset2:
                    continue
                # Compute Euclidean distance between word vectors.
                distance_matrix[i, j] = sqrt(np_sum((self[t1] - self[t2])**2))

        if np_sum(distance_matrix) == 0.0:
            # `emd` gets stuck if the distance matrix contains only zeros.
            logger.info('The distance matrix is all zeros. Aborting (returning inf).')
            return float('inf')

        def nbow(document):
            d = zeros(vocab_len, dtype=double)
            nbow = dictionary.doc2bow(document)  # Word frequencies.
            doc_len = len(document)
            for idx, freq in nbow:
                d[idx] = freq / float(doc_len)  # Normalized word frequencies.
            return d

        # Compute nBOW representation of documents.
        d1 = nbow(document1)
        d2 = nbow(document2)

        # Compute WMD.
        return emd(d1, d2, distance_matrix)