def extract_instances(self, train_instances):
    sentences = []
    for idx, train_instance in enumerate(train_instances):
        sa, sb = train_instance.get_word(type='lemma', lower=True)
        sentences.append(TaggedDocument(words=sa, tags=['sa_%d' % idx]))
        sentences.append(TaggedDocument(words=sb, tags=['sb_%d' % idx]))

    # Train a shared Doc2Vec model over both sides of every pair
    # (size/iter are the pre-4.0 gensim keyword names).
    model = Doc2Vec(sentences, size=25, window=3, min_count=0, workers=10, iter=1000)

    features = []
    infos = []
    for idx in range(len(train_instances)):
        vec_a = model.docvecs['sa_%d' % idx]
        vec_b = model.docvecs['sb_%d' % idx]
        feature, info = vk.get_all_kernel(vec_a, vec_b)
        features.append(feature)
        infos.append([])
        # infos.append([vec_a, vec_b])
    return features, infos
def vector_similarity(vec1, vec2, normlize=True):
    """
    Example:
        Args:
            vec1 = [0, 1]
            vec2 = [1, 0]
        Returns:
            ['1.414', '1.0', ...], ['euclidean', 'cosine', ...]
        which means:
            euclidean            1.41421356237
            cosine               1.0
            manhattan            2
            chebyshev_distance   1
            spearmanr            -1.0
            kendalltau           -1.0
            pearsonr             -1.0
            polynomial           1.0
            rbf                  0.493068691395
            laplacian            0.367879441171
            sigmoid              0.761594155956
    """
    if normlize:
        vec1 = vk.normalize(vec1)
        vec2 = vk.normalize(vec2)
    return vk.get_all_kernel(vec1, vec2)
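# Illustrative sketch (not part of the original module): the plain distance
# values in the docstring above can be reproduced with numpy/scipy as below.
# The kernel parameters behind vk.get_all_kernel's rbf/polynomial/sigmoid
# entries are not shown here because they are internal to that helper.
import numpy as np
from scipy.stats import pearsonr, spearmanr, kendalltau

def _sketch_similarities(vec1, vec2):
    v1, v2 = np.asarray(vec1, dtype=float), np.asarray(vec2, dtype=float)
    return {
        'euclidean': np.linalg.norm(v1 - v2),                                        # 1.414 for [0,1] vs [1,0]
        'cosine': 1.0 - v1.dot(v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)),      # cosine distance, 1.0
        'manhattan': np.abs(v1 - v2).sum(),                                          # 2
        'chebyshev_distance': np.abs(v1 - v2).max(),                                 # 1
        'pearsonr': pearsonr(v1, v2)[0],                                             # -1.0
        'spearmanr': spearmanr(v1, v2)[0],                                           # -1.0
        'kendalltau': kendalltau(v1, v2)[0],                                         # -1.0
    }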
def extract(self, train_instance):
    from stst.features.features_embedding import minavgmaxpooling

    # Keep only the nouns from each POS-tagged sentence.
    pos_sa, pos_sb = train_instance.get_pos_tag(stopwords=False)
    sa = [w for w, tag in pos_sa if tag == 'n']
    sb = [w for w, tag in pos_sb if tag == 'n']

    pooling_vec_sa = minavgmaxpooling(sa, self.vocab, self.embeddings, self.dim,
                                      convey='idf', idf_weight=self.idf_weight)
    pooling_vec_sb = minavgmaxpooling(sb, self.vocab, self.embeddings, self.dim,
                                      convey='idf', idf_weight=self.idf_weight)

    all_feats, all_names = vk.get_all_kernel(pooling_vec_sa, pooling_vec_sb)
    features = all_feats
    infos = [self.emb_name]
    return features, infos
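# Illustrative sketch (not part of the original module): minavgmaxpooling is
# assumed to concatenate the element-wise min, average and max over a
# sentence's word vectors, with an optional IDF-weighted average. The helper
# below is a simplified, unweighted version for intuition only; the real
# signature in stst.features.features_embedding differs.
import numpy as np

def _sketch_minavgmaxpooling(words, embeddings, dim):
    vecs = [embeddings[w] for w in words if w in embeddings]
    if not vecs:
        return np.zeros(3 * dim)
    vecs = np.asarray(vecs)
    return np.concatenate([vecs.min(axis=0), vecs.mean(axis=0), vecs.max(axis=0)])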
def extract(self, train_instance):
    lower = self.lower
    emb_type = self.emb_type
    dim = self.dim
    word_sa, word_sb = train_instance.get_word(type='word', stopwords=True, lower=lower)
    pooling_vec_sa = minavgmaxpooling(word_sa, emb_type, dim)
    pooling_vec_sb = minavgmaxpooling(word_sb, emb_type, dim)
    all_feats, all_names = vk.get_all_kernel(pooling_vec_sa, pooling_vec_sb)
    features = all_feats
    infos = [emb_type, lower]
    return features, infos
def extract_instances(self, train_instances):
    model = dict_utils.DictLoader().load_doc2vec()
    file_name = self.train_file.split('/')[-1]
    features = []
    infos = []
    for idx in range(len(train_instances)):
        vec_a = model.docvecs['%s_%d_sa' % (file_name, idx)]
        vec_b = model.docvecs['%s_%d_sb' % (file_name, idx)]
        # train_instance = train_instances[idx]
        # sa, sb = train_instance.get_word(type='lemma', stopwords=True, lower=True)
        # vec_a = model.infer_vector(sa)
        # vec_b = model.infer_vector(sb)
        feature, info = vk.get_all_kernel(vec_a, vec_b)
        features.append(feature)
        infos.append(info)
    return features, infos
def extract_instances(self, train_instances):
    features = []
    infos = []
    input_file = self.feature_file.split('/')[-2] + '.txt'
    f_in = utils.create_read_file(config.NN_FEATURE_PATH + '/' + self.nntype + '/' + input_file)
    for line in f_in:
        # Each line is a JSON array; the elements at index 1 and 2 are the two
        # sentence embeddings, which are normalized before the kernel features.
        obj = json.loads(line.strip())
        emb1 = vk.normalize(obj[1])
        emb2 = vk.normalize(obj[2])
        feats, info = vk.get_all_kernel(emb1, emb2)
        features.append(feats)
        infos.append(info)
    print(len(features), features[0], infos[0])
    return features, infos
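# Illustrative sketch (not part of the original module): a plausible line of
# the NN feature file, assuming index 0 carries something like a score or pair
# id (only obj[1] and obj[2] are consumed above).
import json

_example_line = '[4.0, [0.12, -0.03, 0.08], [0.08, 0.41, -0.27]]'
_obj = json.loads(_example_line)
_emb1, _emb2 = _obj[1], _obj[2]   # the two embeddings fed to vk.get_all_kernel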
def extract(self, train_instance):
    word_sa, word_sb = train_instance.get_word(type=self.word_type,
                                               stopwords=self.stopwords,
                                               lower=self.lower)
    pooling_vec_sa = minavgmaxpooling(word_sa, self.vocab, self.embeddings, self.dim,
                                      convey='idf', idf_weight=self.idf_weight)
    pooling_vec_sb = minavgmaxpooling(word_sb, self.vocab, self.embeddings, self.dim,
                                      convey='idf', idf_weight=self.idf_weight)
    all_feats, all_names = vk.get_all_kernel(pooling_vec_sa, pooling_vec_sb)
    features = all_feats
    infos = [self.emb_name, self.word_type, self.stopwords, self.lower]
    return features, infos