def word2vec_test(): # 读入数据 pos_file_path = globe.file_pos neg_file_path = globe.file_neg tmp = data_processing.read_data(pos_file_path, neg_file_path) res = data_processing.data_split(tmp[0], tmp[1]) x_train = res[0] x_train = data_processing.text_clean(x_train) for i in x_train: for j in i: print j, n_dim = 200 min_count = 2 # model = gensim.models.Word2Vec(x_train, min_count=0, size=200, workers=4) model = word2vec_model(x_train, n_dim, min_count) # res = w2c_model.most_similar(positive=['纤维', '批次'], negative=['成分'], topn=1) # # w2c_model.doesnt_match("我 爱 中国".split()) # # var = w2c_model.similarity('纤维', '批次') # print var # res = w2c_model.most_similar("纤维") # for i in res: # print i[0], dd = model.most_similar("批次") for i in dd: print i[0],
def _data_read(pos_file_path, neg_file_path, w2c_model_path):
    """Read the corpora, load a word2vec model, and build document vectors.

    Args:
        pos_file_path: Positive corpus file path.
        neg_file_path: Negative corpus file path.
        w2c_model_path: Path of the saved word2vec model.

    Returns:
        A tuple ``(train_data_vecs, train_labels, test_data_vecs,
        test_labels)`` — document vectors and labels for both splits.

    Raises:
        IOError: If the word2vec model cannot be loaded from
            ``w2c_model_path``.
    """
    tmp = data_processing.read_data(pos_file_path, neg_file_path)
    res = data_processing.data_split(tmp[0], tmp[1])
    (train_data, test_data, train_labels, test_labels) = (res[0], res[1], res[2], res[3])

    train_data = data_processing.text_clean(train_data)
    test_data = data_processing.text_clean(test_data)

    # Word-vector dimensionality, shared project-wide via globe.
    n_dim = globe.n_dim

    # BUG FIX: the original wrapped the load in ``except IOError: pass``,
    # so a missing model file left doc_vecs empty and the function then
    # crashed with an unrelated IndexError on doc_vecs[0].  Let the
    # IOError propagate instead — that matches the documented contract
    # and surfaces the real cause to the caller.
    word2vec_model = Word2Vec.load(w2c_model_path)
    doc_vecs = word2vec_gensim_train.text_vecs(train_data, test_data, n_dim, word2vec_model)

    # Document vectors for the train / test splits.
    train_data_vecs = doc_vecs[0]
    test_data_vecs = doc_vecs[1]
    return train_data_vecs, train_labels, test_data_vecs, test_labels
# NOTE(review): a stray module-level duplicate of word2vec_test's tail
# (``dd = model.most_similar("批次") ...``) referenced the undefined name
# ``model`` and would raise NameError on import; it was removed along
# with the surrounding commented-out experiments.
if __name__ == "__main__":
    # Run the manual smoke test first: trains a throwaway model and
    # prints similar words.
    word2vec_test()

    # Then train a model on the cleaned training split and persist it.
    pos_file_path = globe.file_pos
    neg_file_path = globe.file_neg
    tmp = data_processing.read_data(pos_file_path, neg_file_path)
    res = data_processing.data_split(tmp[0], tmp[1])
    x_train = res[0]
    x_train = data_processing.text_clean(x_train)

    # Word-vector dimensionality and minimum token frequency.
    n_dim = 200
    min_count = 2
    model_path = globe.model_path
    mymodel = word2vec_model(x_train, n_dim, min_count)
    mymodel.save(model_path)