def word2vec_test(): # 读入数据 pos_file_path = globe.file_pos neg_file_path = globe.file_neg tmp = data_processing.read_data(pos_file_path, neg_file_path) res = data_processing.data_split(tmp[0], tmp[1]) x_train = res[0] x_train = data_processing.text_clean(x_train) for i in x_train: for j in i: print j, n_dim = 200 min_count = 2 # model = gensim.models.Word2Vec(x_train, min_count=0, size=200, workers=4) model = word2vec_model(x_train, n_dim, min_count) # res = w2c_model.most_similar(positive=['纤维', '批次'], negative=['成分'], topn=1) # # w2c_model.doesnt_match("我 爱 中国".split()) # # var = w2c_model.similarity('纤维', '批次') # print var # res = w2c_model.most_similar("纤维") # for i in res: # print i[0], dd = model.most_similar("批次") for i in dd: print i[0],
def run_li():
    """Train a logistic-regression classifier on pre-built document vectors
    and plot its ROC curve.

    Side effects:
        Prints the test accuracy to stdout and opens a matplotlib window
        showing the ROC curve.
    """
    # Read the data; data_split is assumed to return
    # (train_vecs, test_vecs, label_train, label_test) -- TODO confirm
    # against data_processing.data_split.
    pos_file_path = globe.file_pos
    neg_file_path = globe.file_neg
    tmp = data_processing.read_data(pos_file_path, neg_file_path)
    res = data_processing.data_split(tmp[0], tmp[1])
    train_vecs = res[0]
    test_vecs = res[1]
    label_train = res[2]
    label_test = res[3]

    # Fit an L1-regularized logistic regression via SGD.
    lr = SGDClassifier(loss='log', penalty='l1')
    lr.fit(train_vecs, label_train)

    print('Test Accuracy: %.2f' % lr.score(test_vecs, label_test))

    # Probability of the positive class, used to draw the ROC curve.
    pred_probas = lr.predict_proba(test_vecs)[:, 1]
    fpr, tpr, _ = roc_curve(label_test, pred_probas)
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, label='area = %.2f' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.legend(loc='lower right')
    plt.show()
def _data_read(pos_file_path, neg_file_path, w2c_model_path):
    """Read data and build document vectors with a saved word2vec model.

    Args:
        pos_file_path: Positive file path.
        neg_file_path: Negative file path.
        w2c_model_path: Path of the persisted word2vec model.

    Returns:
        A tuple (train_data_vecs, train_labels, test_data_vecs, test_labels).

    Raises:
        IOError: If the word2vec model cannot be loaded from w2c_model_path.
    """
    tmp = data_processing.read_data(pos_file_path, neg_file_path)
    res = data_processing.data_split(tmp[0], tmp[1])
    (train_data, test_data, train_labels, test_labels) = (res[0], res[1], res[2], res[3])

    train_data = data_processing.text_clean(train_data)
    test_data = data_processing.text_clean(test_data)

    # Dimensionality of the word vectors.
    n_dim = globe.n_dim

    # BUG FIX: the original wrapped the load in `except IOError: pass`,
    # which left doc_vecs == [] and crashed just below with an opaque
    # IndexError.  Let the IOError propagate so callers see the real
    # failure (this matches the documented Raises contract).
    word2vec_model = Word2Vec.load(w2c_model_path)
    doc_vecs = word2vec_gensim_train.text_vecs(train_data, test_data, n_dim, word2vec_model)

    # Document vectors for the train and test splits.
    train_data_vecs = doc_vecs[0]
    test_data_vecs = doc_vecs[1]

    return train_data_vecs, train_labels, test_data_vecs, test_labels
# NOTE(review): stray module-level `model.most_similar(...)` statements used
# to live here; they referenced an undefined name `model` (raising NameError
# the moment the module was imported) and duplicated code already inside
# word2vec_test().  They have been removed.

if __name__ == "__main__":
    word2vec_test()

    # Read, split and clean the corpus, then train and persist a model.
    pos_file_path = globe.file_pos
    neg_file_path = globe.file_neg
    tmp = data_processing.read_data(pos_file_path, neg_file_path)
    res = data_processing.data_split(tmp[0], tmp[1])
    x_train = res[0]
    x_train = data_processing.text_clean(x_train)

    n_dim = 200    # word-vector dimensionality
    min_count = 2  # drop tokens occurring fewer than this many times
    model_path = globe.model_path
    mymodel = word2vec_model(x_train, n_dim, min_count)
    mymodel.save(model_path)
def get_datas():
    """Load the raw corpus and return its training split."""
    # Fetch the raw dataset, then hand back the training portion.
    raw = data_read()
    return data_split(raw)