def word2vec_test():
    # 读入数据
    pos_file_path = globe.file_pos
    neg_file_path = globe.file_neg

    tmp = data_processing.read_data(pos_file_path, neg_file_path)
    res = data_processing.data_split(tmp[0], tmp[1])
    x_train = res[0]
    x_train = data_processing.text_clean(x_train)

    for i in x_train:
        for j in i:
            print j,
    n_dim = 200
    min_count = 2

    # model = gensim.models.Word2Vec(x_train, min_count=0, size=200, workers=4)

    model = word2vec_model(x_train, n_dim, min_count)

    # res = w2c_model.most_similar(positive=['纤维', '批次'], negative=['成分'], topn=1)
    #
    # w2c_model.doesnt_match("我 爱 中国".split())
    #
    # var = w2c_model.similarity('纤维', '批次')
    # print var
    # res = w2c_model.most_similar("纤维")
    # for i in res:
    #     print i[0],

    dd = model.most_similar("批次")
    for i in dd:
        print i[0],
def run_li():
    """Train an L1-regularised logistic-regression classifier on the
    pre-vectorised corpus and plot its ROC curve.

    Side effects: prints the test-set accuracy and shows a matplotlib
    figure with the ROC curve.
    """
    # Corpus locations come from the shared configuration module.
    raw = data_processing.read_data(globe.file_pos, globe.file_neg)
    split = data_processing.data_split(raw[0], raw[1])
    train_vecs, test_vecs, label_train, label_test = (split[0], split[1],
                                                      split[2], split[3])

    # Logistic regression ('log' loss) fitted with SGD, L1 penalty.
    classifier = SGDClassifier(loss='log', penalty='l1')
    classifier.fit(train_vecs, label_train)

    print('Test Accuracy: %.2f' % classifier.score(test_vecs, label_test))

    # Probability of the positive class for each test sample.
    pred_probas = classifier.predict_proba(test_vecs)[:, 1]

    # ROC curve plus the chance diagonal, annotated with the AUC.
    fpr, tpr, _ = roc_curve(label_test, pred_probas)
    plt.plot(fpr, tpr, label='area = %.2f' % auc(fpr, tpr))
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.legend(loc='lower right')
    plt.show()
def _data_read(pos_file_path, neg_file_path, w2c_model_path):
    """Read the corpora, clean them, and turn them into document vectors.

    Args:
        pos_file_path: Positive-sample file path.
        neg_file_path: Negative-sample file path.
        w2c_model_path: Path of a previously saved gensim Word2Vec model.

    Returns:
        A 4-tuple (train_data_vecs, train_labels, test_data_vecs,
        test_labels).

    Raises:
        IOError: If the word2vec model cannot be loaded from
            w2c_model_path.
    """
    tmp = data_processing.read_data(pos_file_path, neg_file_path)
    res = data_processing.data_split(tmp[0], tmp[1])
    (train_data, test_data, train_labels, test_labels) = (res[0], res[1],
                                                          res[2], res[3])

    train_data = data_processing.text_clean(train_data)
    test_data = data_processing.text_clean(test_data)

    # Word-vector dimensionality comes from the shared configuration.
    n_dim = globe.n_dim

    # NOTE: an IOError here used to be silently swallowed, which left
    # doc_vecs as an empty list and made the doc_vecs[0] lookup below fail
    # with an unrelated IndexError. Let the IOError propagate instead —
    # it is the documented failure mode of this function.
    word2vec_model = Word2Vec.load(w2c_model_path)

    doc_vecs = word2vec_gensim_train.text_vecs(train_data, test_data,
                                               n_dim, word2vec_model)

    # text_vecs returns (train vectors, test vectors).
    train_data_vecs = doc_vecs[0]
    test_data_vecs = doc_vecs[1]

    return train_data_vecs, train_labels, test_data_vecs, test_labels


if __name__ == "__main__":
    word2vec_test()

    # Rebuild the cleaned training corpus and persist a word2vec model.
    corpus = data_processing.read_data(globe.file_pos, globe.file_neg)
    split = data_processing.data_split(corpus[0], corpus[1])
    train_docs = data_processing.text_clean(split[0])

    vec_dim = 200    # word-vector dimensionality
    min_freq = 2     # drop tokens seen fewer than this many times

    trained = word2vec_model(train_docs, vec_dim, min_freq)
    trained.save(globe.model_path)
# 예제 #5 ("Example #5") — stray scraping/concatenation artifact; commented
# out (along with the stray "0" below) so the file remains parseable.
# 0
def get_datas():
    """Load the raw corpus and return its training split."""
    raw = data_read()
    return data_split(raw)