def extract_instances(self, train_instances):
        """Train a Doc2Vec model over every sentence pair, then turn each
        pair of document vectors into kernel-similarity features.

        Returns:
            (features, infos): one kernel-feature vector per instance;
            infos are left empty.
        """
        tagged = []
        for idx, instance in enumerate(train_instances):
            sa, sb = instance.get_word(type='lemma', lower=True)
            tagged.append(TaggedDocument(words=sa, tags=['sa_%d' % idx]))
            tagged.append(TaggedDocument(words=sb, tags=['sb_%d' % idx]))

        # NOTE(review): `size`/`iter` are the legacy gensim<4 kwarg names —
        # confirm against the pinned gensim version before upgrading.
        model = Doc2Vec(tagged, size=25, window=3, min_count=0,
                        workers=10, iter=1000)

        features, infos = [], []
        for idx, _ in enumerate(train_instances):
            vec_a = model.docvecs['sa_%d' % idx]
            vec_b = model.docvecs['sb_%d' % idx]
            feature, _info = vk.get_all_kernel(vec_a, vec_b)
            features.append(feature)
            infos.append([])

        return features, infos
def vector_similarity(vec1, vec2, normlize=True):
    """Compute every kernel similarity/distance between two vectors.

    Args:
        vec1, vec2: numeric vectors of equal length.
        normlize: when True, normalize both vectors via ``vk.normalize``
            before comparing (parameter name kept for caller compatibility).

    Returns:
        (values, names): parallel lists of scores and kernel names.
        For example, ``vector_similarity([0, 1], [1, 0])`` yields roughly::

            euclidean 1.41421356237    cosine 1.0
            manhattan 2                chebyshev_distance 1
            spearmanr -1.0             kendalltau -1.0
            pearsonr -1.0              polynomial 1.0
            rbf 0.493068691395         laplacian 0.367879441171
            sigmoid 0.761594155956
    """
    if not normlize:
        return vk.get_all_kernel(vec1, vec2)
    return vk.get_all_kernel(vk.normalize(vec1), vk.normalize(vec2))
# 예제 #3 (example marker from the original scrape; commented out —
# 0      a bare `예제` name would raise NameError at import time)
    def extract(self, train_instance):
        """Kernel features over IDF-weighted pooled embeddings of the
        noun tokens (POS tag 'n') of each sentence."""
        from stst.features.features_embedding import minavgmaxpooling

        pos_sa, pos_sb = train_instance.get_pos_tag(stopwords=False)
        nouns_a = [word for word, tag in pos_sa if tag == 'n']
        nouns_b = [word for word, tag in pos_sb if tag == 'n']

        def pool(words):
            # min/avg/max pooling over IDF-weighted word embeddings
            return minavgmaxpooling(words, self.vocab, self.embeddings,
                                    self.dim, convey='idf',
                                    idf_weight=self.idf_weight)

        features, _names = vk.get_all_kernel(pool(nouns_a), pool(nouns_b))
        infos = [self.emb_name]
        return features, infos
# 예제 #4 (example marker from the original scrape; commented out —
# 0      a bare `예제` name would raise NameError at import time)
    def extract(self, train_instance):
        """Kernel features over min/avg/max-pooled word embeddings of the
        two sentences (stopwords removed)."""
        word_sa, word_sb = train_instance.get_word(type='word',
                                                   stopwords=True,
                                                   lower=self.lower)

        vec_a = minavgmaxpooling(word_sa, self.emb_type, self.dim)
        vec_b = minavgmaxpooling(word_sb, self.emb_type, self.dim)

        features, _names = vk.get_all_kernel(vec_a, vec_b)
        infos = [self.emb_type, self.lower]
        return features, infos
    def extract_instances(self, train_instances):
        """Look up precomputed doc2vec vectors for every sentence pair and
        turn each pair into kernel-similarity features.

        Vectors are keyed by '<train-file-basename>_<idx>_sa/sb' in the
        shared doc2vec model loaded from DictLoader.
        """
        model = dict_utils.DictLoader().load_doc2vec()
        file_name = self.train_file.split('/')[-1]

        pairs = [
            vk.get_all_kernel(model.docvecs['%s_%d_sa' % (file_name, idx)],
                              model.docvecs['%s_%d_sb' % (file_name, idx)])
            for idx in range(len(train_instances))
        ]
        features = [feature for feature, _ in pairs]
        infos = [info for _, info in pairs]
        return features, infos
    def extract_instances(self, train_instances):
        """Read precomputed sentence-embedding pairs from the NN feature
        file and turn each pair into kernel-similarity features.

        Args:
            train_instances: unused; features come from the JSON-lines file
                derived from ``self.feature_file`` / ``self.nntype``.

        Returns:
            (features, infos): one kernel-feature vector and one info
            record per line of the input file.
        """
        from contextlib import closing

        features = []
        infos = []
        input_file = self.feature_file.split('/')[-2] + '.txt'
        # closing() guarantees the handle is released even on error —
        # the original leaked it.
        with closing(utils.create_read_file(config.NN_FEATURE_PATH + '/' +
                                            self.nntype + '/' + input_file)) as f_in:
            for line in f_in:
                obj = json.loads(line.strip())
                # obj layout: embeddings at indices 1 and 2 — TODO confirm
                # against the writer of the NN feature file.
                emb1 = vk.normalize(obj[1])
                emb2 = vk.normalize(obj[2])
                feats, info = vk.get_all_kernel(emb1, emb2)
                features.append(feats)
                infos.append(info)

        # Guard the debug print: features[0] raised IndexError on an
        # empty input file.
        if features:
            print(len(features), features[0], infos[0])

        return features, infos
    def extract(self, train_instance):
        """Kernel features over IDF-weighted min/avg/max-pooled embeddings,
        configurable by word type / stopword filtering / lowercasing."""
        word_sa, word_sb = train_instance.get_word(type=self.word_type,
                                                   stopwords=self.stopwords,
                                                   lower=self.lower)

        def pool(words):
            # min/avg/max pooling over IDF-weighted word embeddings
            return minavgmaxpooling(words, self.vocab, self.embeddings,
                                    self.dim, convey='idf',
                                    idf_weight=self.idf_weight)

        features, _names = vk.get_all_kernel(pool(word_sa), pool(word_sb))
        infos = [self.emb_name, self.word_type, self.stopwords, self.lower]
        return features, infos