def predict(input_path, output_path):
    # Keep the raw texts so they can be written out next to the predicted labels.
    df_i = pd.read_excel(input_path)
    corpus_i = df_i.iloc[:, [1]]
    corpus_i = np.array(corpus_i).tolist()

    corpus = reader.preprocess(reader.read_excel(input_path, text_column=1),
                               seq_lenth=FLAGS.seq_lenth,
                               seq_num=1,
                               overlap_lenth=0,
                               input_label=False,
                               output_index=True)
    # vocab, word2id = reader.read_glossary()

    test_inputs = []
    test_lenths = []
    test_num = 0
    for item in corpus:
        test_inputs.append(item[0])
        test_lenths.append(item[1])
        test_num += 1

    with tf.Graph().as_default(), tf.Session() as sess:
        model = em_sent(seq_size=FLAGS.seq_lenth,
                        glossary_size=FLAGS.glossary_size,
                        embedding_size=FLAGS.embedding_size,
                        hidden_size=FLAGS.hidden_size,
                        attn_lenth=FLAGS.attn_lenth,
                        is_training=False)
        model.buildTrainGraph()

        saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=10)
        if reader.restore_from_checkpoint(sess, saver, FLAGS.ckpt_dir):
            total_expection = []
            print(test_num)
            # Run the restored model over the test set in batches.
            for piece_inputs, piece_lenths in get_test_batch(
                    test_inputs, test_lenths, None, test_num, input_label=False):
                test_feed_dict = {
                    model.inputs: piece_inputs,
                    model.lenths: piece_lenths,
                    model.lenths_weight: padded_ones_list_like(piece_lenths, FLAGS.seq_lenth),
                }
                expection = sess.run(model.expection, feed_dict=test_feed_dict)
                total_expection.extend(expection)

            # Pair each original text (looked up via its index) with the predicted label.
            zipped = []
            for index in range(test_num):
                zipped.append([corpus_i[corpus[index][2]],
                               'T' if total_expection[index][0] == 0 else 'F'])

            df_o = pd.DataFrame(zipped)
            writer = pd.ExcelWriter(output_path)
            df_o.to_excel(writer, 'Sheet1')
            writer.save()
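# The two helpers used above, `padded_ones_list_like` and `get_test_batch`, are
# defined elsewhere in the project. The sketch below (hypothetical names,
# assumed behaviour) only illustrates the contract the feed dicts rely on:
# a 0/1 mask padded out to seq_lenth, and a generator that yields the test set
# in fixed-size (inputs, lenths) slices when no labels are supplied.
def _padded_ones_mask_sketch(lenths, seq_lenth):
    # One 1 per real token, zero-padded to the fixed sequence length.
    return [[1] * l + [0] * (seq_lenth - l) for l in lenths]


def _test_batch_sketch(inputs, lenths, num, batch_size=128):
    # Yield (inputs, lenths) slices covering all `num` examples.
    for start in range(0, num, batch_size):
        yield inputs[start:start + batch_size], lenths[start:start + batch_size]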
def test_onesent(text):
    corpus = reader.preprocess([[text]],
                               seq_lenth=FLAGS.seq_lenth,
                               seq_num=1,
                               overlap_lenth=0,
                               input_label=False,
                               output_index=False)
    vocab, word2id = reader.read_glossary()
    print(corpus)

    test_inputs = []
    test_lenths = []
    test_num = 0
    for item in corpus:
        test_inputs.append(item[0])
        test_lenths.append(item[1])
        test_num += 1

    with tf.Graph().as_default(), tf.Session() as sess:
        model = em_sent(seq_size=FLAGS.seq_lenth,
                        glossary_size=FLAGS.glossary_size,
                        embedding_size=FLAGS.embedding_size,
                        hidden_size=FLAGS.hidden_size,
                        attn_lenth=FLAGS.attn_lenth,
                        is_training=False)
        model.buildTrainGraph()

        saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=10)
        if reader.restore_from_checkpoint(sess, saver, FLAGS.ckpt_dir):
            test_feed_dict = {
                model.inputs: test_inputs,
                model.lenths: test_lenths,
                model.lenths_weight: padded_ones_list_like(test_lenths, FLAGS.seq_lenth),
            }
            expection, alpha, logits = sess.run(
                [model.expection, model.alpha, model.logits],
                feed_dict=test_feed_dict)

            print([vocab[i] for i in test_inputs[0]])
            # Print each token with its attention weight and logit.
            for i in range(len(test_inputs[0])):
                print(vocab[test_inputs[0][i]], alpha[0][i], logits[0][i])
            # print([vocab[word] for word in test_inputs])
            if expection[0][0] == 1:
                print('负面')    # negative
            else:
                print('正面')    # positive
            return expection[0]
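# A small, self-contained illustration (not part of the original project) of
# how the per-token attention weights printed by test_onesent can be
# summarised: pair each token with its alpha weight and keep the top-k.
def top_attention_tokens(vocab, token_ids, alpha_row, k=5):
    # vocab: id -> word table; token_ids: one padded sentence; alpha_row: its attention weights.
    pairs = [(vocab[tid], float(w)) for tid, w in zip(token_ids, alpha_row)]
    return sorted(pairs, key=lambda p: p[1], reverse=True)[:k]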
def test_for_lime(text):
    # `text` is a list of space-separated strings, as produced by LIME's perturbation step.
    corpus = reader.preprocess([[i] for i in text],
                               seq_lenth=FLAGS.seq_lenth,
                               seq_num=1,
                               overlap_lenth=0,
                               input_label=False,
                               output_index=False,
                               split_func=lambda x: x.split(' '),
                               de_duplicated=False)
    # vocab, word2id = reader.read_glossary()

    test_inputs = []
    test_lenths = []
    test_num = 0
    for item in corpus:
        test_inputs.append(item[0])
        test_lenths.append(item[1])
        test_num += 1

    # Relies on the module-level SESS and model so the graph is built only once
    # across the many perturbed samples LIME generates.
    saver = tf.train.Saver(tf.trainable_variables(), max_to_keep=10)
    if reader.restore_from_checkpoint(SESS, saver, FLAGS.ckpt_dir):
        total_expection = []
        for piece_inputs, piece_lenths in get_test_batch(
                test_inputs, test_lenths, None, test_num, input_label=False):
            test_feed_dict = {
                model.inputs: piece_inputs,
                model.lenths: piece_lenths,
                model.lenths_weight: padded_ones_list_like(piece_lenths, FLAGS.seq_lenth),
            }
            expection = SESS.run(model.raw_expection, feed_dict=test_feed_dict)
            # Two-column probabilities per sample: [p, 1 - p].
            total_expection.extend([[round(i[0], 4), round(1 - i[0], 4)]
                                    for i in expection])
        return np.array(total_expection)
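# `test_for_lime` follows the classifier_fn contract of LIME's text explainer:
# a list of space-separated strings in, an (n, 2) probability array out, so it
# can be handed to lime.lime_text directly. A minimal sketch of the wiring,
# assuming the `lime` package is installed and the module-level SESS, model and
# checkpoint are already in place; the class-name order is an assumption.
def explain_with_lime(sentence, num_features=6):
    from lime.lime_text import LimeTextExplainer
    explainer = LimeTextExplainer(class_names=['负面', '非负面'],
                                  split_expression=' ')
    # LIME perturbs `sentence`, calls test_for_lime on the perturbed batch,
    # and fits a local linear model over the returned probabilities.
    return explainer.explain_instance(sentence, test_for_lime,
                                      num_features=num_features)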
            print('负面')      # negative
            temp.append('负面')
        elif (expection[index][0] == 0):
            print('非负面')    # non-negative
            temp.append('非负面')
        sp.append(temp)
    return sp


if __name__ == '__main__':
    text = ['康宝莱】河南郑州', '花季少女死于康宝莱']
    corpus = reader.preprocess(reader.read_excel('../data/corpus/check/yxcd.xlsx',
                                                 text_column=1,
                                                 label_column=0),
                               seq_lenth=FLAGS.seq_lenth,
                               seq_num=1,
                               overlap_lenth=0,
                               input_label=True,
                               output_index=False,
                               de_duplicated=True)
    # with open('../data/corpus/latest/test.pickle', 'rb') as fp:
    #     corpus = pickle.load(fp)
    test(corpus)
    # predict(input_path='../data/corpus/check/klb.xlsx', output_path='out/x.xlsx')
    # test_onesent('【康宝莱】河南郑州花季少女死于康宝莱传销相关部门难逃其咎: 我叫张云成,男,汉族,现年47岁,家住河南省淮阳县冯塘乡蔡李庄村。2017年6月5日,我的儿子张旭(17岁)从郑州回到家里,向家里要钱,...文字版>> http://t.cn/RonRbWK (新浪长微博>> http://t.cn/zOXAaic)')
    # test_onesent('走进康宝莱之前,我听到了台上的分享嘉宾说了这句话:“也许做康宝莱这件事不是你的梦想,但做好康宝莱能实现你所有的梦想!” 我信了。 果然这是真的。 同时,在这个过程中,帮助更多人获得好身材、健康、财富、和精彩人生,也一并成为了我的梦想!和我其他的梦想一起实现了!——肖珂宇 网页链接')
    # vocab, word2id = reader.read_glossary()
    # sp = get_word_sentiment_polarity(vocab)
    #
    # import pandas as pd
    #