Exemplo n.º 1
0
    def get_sif(self, s1, s2):
        """Score the similarity of two texts using SIF embeddings.

        ``s1`` is embedded with the English resources (``self.en_words``,
        ``self.en_weight4ind``, ``self.en_We``) and ``s2`` with the German
        ones (``self.de_words``, ``self.de_weight4ind``, ``self.de_We``).
        Both embeddings are built with ``self.parameters``.

        Returns:
            ``nan`` when either input is empty, ``-1`` when either embedding
            is entirely zero, otherwise whatever ``self.cosine_similarity``
            returns for the two embeddings.
        """
        # NOTE(review): list() on a str yields one element per character, so
        # plain-string inputs reach sentences2idx as a list of single
        # characters -- confirm this is what the callers intend.
        s1 = list(s1.lower())
        s2 = list(s2.lower())

        if not s1 or not s2:
            # np.nan rather than pd.np.NaN: the pandas.np alias and the
            # np.NaN spelling are both gone from current releases, while
            # np.nan is the same float value and works everywhere.
            return np.nan

        ## English data
        en_x, en_m = data_io.sentences2idx(s1, self.en_words)
        en_w = data_io.seq2weight(en_x, en_m, self.en_weight4ind)
        # Only the first row of the returned embedding is used.
        en_embedding = SIF_embedding.SIF_embedding(
            self.en_We, en_x, en_w, self.parameters)[0]

        ## German data
        de_x, de_m = data_io.sentences2idx(s2, self.de_words)
        de_w = data_io.seq2weight(de_x, de_m, self.de_weight4ind)
        de_embedding = SIF_embedding.SIF_embedding(
            self.de_We, de_x, de_w, self.parameters)[0]

        # An all-zero vector has no direction, so the cosine is undefined;
        # report the sentinel -1 instead.
        if (np.count_nonzero(en_embedding) == 0
                or np.count_nonzero(de_embedding) == 0):
            return -1

        return self.cosine_similarity(en_embedding, de_embedding)
Exemplo n.º 2
0
def get_sentences_embedding(sentences):
    """Build SIF embeddings for ``sentences``.

    Uses the module-level ``words2index``, ``weight4ind``,
    ``words_embedding`` and ``rm_pc`` values.

    return: embedding: ndarray, shape (n_samples, vector_space_dim)
    """
    # Map the sentences to index/mask matrices, then to per-word weights.
    idx, mask = data_io.sentences2idx(sentences, words2index)
    weights = data_io.seq2weight(idx, mask, weight4ind)

    # Fresh parameter object carrying the configured rmpc value.
    sif_params = sparams.params()
    sif_params.rmpc = rm_pc

    return SIF_embedding.SIF_embedding(
        words_embedding, idx, weights, sif_params)
Exemplo n.º 3
0
def sif_embedding(sents, weightpara=1e-3, rmpc=1):
    """Compute SIF sentence embeddings for ``sents``.

    The word map is read from ``wordfile`` and the word weights from
    ``weightfile`` on every call.

    Args:
        sents: list of sentences that have already been word-segmented
            (per the original docstring).
        weightpara: the parameter in the SIF weighting scheme, usually in
            the range [3e-5, 3e-3]. Defaults to the previously hard-coded
            1e-3.
        rmpc: number of principal components to remove in the SIF
            weighting scheme. Defaults to the previously hard-coded 1.

    Returns:
        The result of ``SIF_embedding.SIF_embedding``; ``embedding[i, :]``
        is the embedding for sentence ``i``.
    """
    words, We = data_io.getWordmap(wordfile)

    # word2weight['str'] is the weight for the word 'str'
    word2weight = data_io.getWordWeight(weightfile, weightpara)
    # weight4ind[i] is the weight for the i-th word
    weight4ind = data_io.getWeight(words, word2weight)

    # Convert the sentences to index/mask matrices and look up word weights.
    x, m = data_io.sentences2idx(sents, words)
    w = data_io.seq2weight(x, m, weight4ind)

    sif_params = params.params()
    sif_params.rmpc = rmpc

    return SIF_embedding.SIF_embedding(We, x, w, sif_params)