Example #1
def data_preprocess(all_data):
    # Load data
    print("Loading data...")
    if not os.path.exists(os.path.join(out_dir, "data_x.npy")):
        x, y = data_helper_new.load_data_and_labels(FLAGS.clone_data_file)
        # Get embedding vector
        all_sentences, all_max_document_length = data_helper_new.padding_sentences(
            all_data,
            '<PADDING>',
            padding_sentence_length=FLAGS.sequence_length)
        sentences, max_document_length = data_helper_new.padding_sentences(
            x, '<PADDING>', padding_sentence_length=FLAGS.sequence_length)
        print(len(sentences[0]))
        if not os.path.exists(os.path.join(out_dir, "trained_word2vec.model")):
            word2vec_helpers.word2vec_model(all_sentences,
                                            embedding_size=FLAGS.embedding_dim,
                                            file_to_save=os.path.join(
                                                out_dir,
                                                'trained_word2vec.model'))
            x = np.array(
                word2vec_helpers.embedding_sentences(
                    sentences,
                    file_to_load=os.path.join(out_dir,
                                              'trained_word2vec.model')))
        else:
            print('w2v model found...')
            # word2vec_helpers.word2vec_model(all_sentences, embedding_size = FLAGS.embedding_dim, file_to_save = os.path.join(out_dir, 'trained_word2vec.model'),file_to_load=os.path.join(out_dir, 'trained_word2vec.model'))
            x = np.array(
                word2vec_helpers.embedding_sentences(
                    sentences,
                    file_to_load=os.path.join(out_dir,
                                              'trained_word2vec.model')))
            # x = np.array(word2vec_helpers.embedding_sentences(sentences, embedding_size = FLAGS.embedding_dim, file_to_save = os.path.join(out_dir, 'trained_word2vec.model'),file_to_load=os.path.join(out_dir, 'trained_word2vec.model')))
        y = np.array(y)
        del sentences
        del all_sentences
    else:
        print('data found...')
        x = np.load(os.path.join(out_dir, "data_x.npy"))
        y = np.load(os.path.join(out_dir, "data_y.npy"))
    print("x.shape = {}".format(x.shape))
    print("y.shape = {}".format(y.shape))

    # Save params
    if not os.path.exists(os.path.join(out_dir, "training_params.pickle")):
        training_params_file = os.path.join(out_dir, 'training_params.pickle')
        params = {
            'num_labels': FLAGS.num_labels,
            'max_document_length': max_document_length
        }
        data_helper_new.saveDict(params, training_params_file)  # save the params dictionary
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2,
        random_state=42)  # split into training and testing set 80/20 ratio
    del x, y
    return x_train, x_test, y_train, y_test
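# Note: the "data found..." branch above loads data_x.npy / data_y.npy, but this snippet
# never writes those files. A minimal caching sketch (an assumption, not part of the
# original code) that would make that branch usable on later runs:
def cache_preprocessed_data(x, y, out_dir):
    import os
    import numpy as np
    np.save(os.path.join(out_dir, "data_x.npy"), x)
    np.save(os.path.join(out_dir, "data_y.npy"), y)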
Example #2
def preprocess():
    """
    数据准备阶段

    :return:
    """
    # 1. 加载数据文件
    x_text, y = load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)

    # Vectorize the text
    sentences, max_document_length = padding_sentences(x_text, '<PAD>')
    x = np.array(embedding_sentences(sentences, FLAGS.word2vec_fname))
    y = np.array(list(y))
    print("x.shape = {}".format(x.shape))
    print("y.shape = {}".format(y.shape))

    # Shuffle the data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split the data into train and dev sets
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

    del x, y, x_shuffled, y_shuffled

    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    return x_train, y_train, x_dev, y_dev
Example #3
def validData2vec(sentences):
    print('Word embedding...')
    all_vectors = word2vec_helpers.embedding_sentences(
        sentences,
        embedding_size=FLAGS.embedding_dim,
        file_to_load=
        '/home/WXX/WebClassify/cnn_website_text_classify/runs/1503023156/trained_word2vec.model'
    )
    x_valid = np.array(all_vectors)
    return x_valid
Example #4
def pre_type(string, max_document_length):
    x_raw = [data_helpers.clean_str(data_helpers.jieba_line(string))]
    # Get Embedding vector x_test
    sentences, max_document_length = data_helpers.padding_sentences(x_raw, '<PADDING>',
                                                                    padding_sentence_length=max_document_length)
    x_test = np.array(word2vec_helpers.embedding_sentences(sentences, file_to_load=trained_word2vec_model_file))
    # print("x_test.shape = {}".format(x_test.shape))
    pred = sess.run(predictions, {input_x: x_test, dropout_keep_prob: 1.0})[0]
    label = 'neg' if pred == 0 else 'pos'
    return label
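# Usage sketch (not part of the original snippet): assuming the session, graph tensors and
# trained_word2vec_model_file referenced above are already loaded, classify one raw string.
# The input string and length value are made-up examples.
example_text = "这部 电影 很 好看"
print(pre_type(example_text, max_document_length=100))  # prints 'pos' or 'neg'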
Example #5
    def predict(self,url):
        params = data_helper.loadDict(self.training_params_file)
        num_labels = int(params['num_labels'])
        max_document_length = int(params['max_document_length'])
        x_raw = [url]
        sentences, max_document_length = data_helper.padding_sentences(x_raw, '<PADDING>', padding_sentence_length = max_document_length)
        x_test = np.array(word2vec_helpers.embedding_sentences(sentences, file_to_load = self.trained_word2vec_model_file))
        # print(x_test)
        # with self.graph.as_default():
        # with self.sess.as_default():

        result = self.sess.run(self.predictions, {self.input_x: x_test,self.dropout_keep_prob: 1.0})
        result = 'good' if result else 'bad'
        print("Request examples: {}, inference result: {}".format(url,result))
        return result
Example #6
def save_data_vector(contents_dir, labels_dir, out_dir):
    x_text, y = load_files_labels(contents_dir, labels_dir, one_hot=True)
    # Get embedding vector; pad each sentence to the maximum length 190 with '<PADDING>'
    sentences, max_document_length = data_helpers.padding_sentences(
        x_text, '<PADDING>', padding_sentence_length=190)
    embedding_dim = 128
    x = np.array(
        word2vec_helpers.embedding_sentences(sentences,
                                             embedding_size=embedding_dim,
                                             file_to_save=os.path.join(
                                                 out_dir,
                                                 'trained_word2vec.model')))
    print("x.shape = {}".format(x.shape)
          )  # shape=(10000, 190, 128)->(样本个数10000,每个样本的字词个数190,每个字词的向量长度128)
    print("y.shape = {}".format(
        y.shape))  # y.shape = (10000, 2)->样本的labels,以one-hot编码
    np.save(os.path.join(out_dir, 'data_vector.npy'), x)
    np.save(os.path.join(out_dir, 'labels.npy'), y)
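# A minimal loading counterpart (a sketch, not part of the original snippet) for the two
# .npy files written above:
def load_data_vector(out_dir):
    import os
    import numpy as np
    x = np.load(os.path.join(out_dir, 'data_vector.npy'))
    y = np.load(os.path.join(out_dir, 'labels.npy'))
    return x, y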
Example #7
    def predict(self, message):
        # Models trained under either Python 2 or Python 3 can be run in a 2 or 3 environment
        data = message
        # data=list(content)
        sentences, max_document_length = padding_sentences(data, '<PADDING>')
        x = np.array(
            word2vec_helpers.embedding_sentences(
                sentences,
                embedding_size=args.embedding_size,
                file_to_load='./best_model/1568855551/trained_word2vec.model'))
        # print(x.shape)

        feed_dict = {self.model.input_x: x, self.model.dropout_keep_prob: 1.0}
        # Output of the final layer
        y_pred_cls = self.session.run(self.model.predictions,
                                      feed_dict=feed_dict)
        # y_pred_cls = self.session.run(tf.nn.softmax(self.model.scores), feed_dict=feed_dict)
        y_prob = y_pred_cls.tolist()
        print(y_prob)
        return self.categories[y_pred_cls[0]]
Example #8
def data_preprocess():
    # Data preprocess
    # =======================================================
    # Load data
    print("Loading data...")
    if not os.path.exists(os.path.join(out_dir, "data_x.npy")):
        x, y = data_helper.load_data_and_labels(FLAGS.data_file)
        # Get embedding vector
        x = x[:1000]
        y = y[:1000]
        sentences, max_document_length = data_helper.padding_sentences(
            x, '<PADDING>', padding_sentence_length=FLAGS.sequence_length)
        print(len(sentences[0]))
        if not os.path.exists(os.path.join(out_dir, "trained_word2vec.model")):
            x = np.array(
                word2vec_helpers.embedding_sentences(
                    sentences,
                    embedding_size=FLAGS.embedding_dim,
                    file_to_save=os.path.join(out_dir,
                                              'trained_word2vec.model')))
        else:
            print('w2v model found...')
            x = np.array(
                word2vec_helpers.embedding_sentences(
                    sentences,
                    embedding_size=FLAGS.embedding_dim,
                    file_to_save=os.path.join(out_dir,
                                              'trained_word2vec.model'),
                    file_to_load=os.path.join(out_dir,
                                              'trained_word2vec.model')))
        y = np.array(y)
        # np.save(os.path.join(out_dir,"data_x.npy"),x)
        # np.save(os.path.join(out_dir,"data_y.npy"),y)
        del sentences
    else:
        print('data found...')
        x = np.load(os.path.join(out_dir, "data_x.npy"))
        y = np.load(os.path.join(out_dir, "data_y.npy"))
    print("x.shape = {}".format(x.shape))
    print("y.shape = {}".format(y.shape))

    # Save params
    if not os.path.exists(os.path.join(out_dir, "training_params.pickle")):
        training_params_file = os.path.join(out_dir, 'training_params.pickle')
        params = {
            'num_labels': FLAGS.num_labels,
            'max_document_length': max_document_length
        }
        data_helper.saveDict(params, training_params_file)

    # Shuffle data randomly
    # np.random.seed(10)
    # shuffle_indices = np.random.permutation(np.arange(len(y)))
    # x_shuffled = x[shuffle_indices]
    # y_shuffled = y[shuffle_indices]
    # del x,y

    # x_train, x_test, y_train, y_test = train_test_split(x_shuffled, y_shuffled, test_size=0.2, random_state=42)  # split into training and testing set 80/20 ratio
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2,
        random_state=42)  # split into training and testing set 80/20 ratio
    del x, y
    return x_train, x_test, y_train, y_test
Example #9
# Load params
params = data_helpers.loadDict(training_params_file)
num_labels = int(params['num_labels'])
max_document_length = int(params['max_document_length'])

# Load data
if FLAGS.eval_train:
    x_raw, y_test = data_helpers.load_positive_negative_data_files(FLAGS)
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# Get Embedding vector x_test
print(max_document_length)
x_test, max_document_length = data_helpers.padding_sentences(x_raw, '<PADDING>', padding_sentence_length = max_document_length)
_, w2vModel = word2vec_helpers.embedding_sentences(file_to_load = trained_word2vec_model_file)
x_test = np.array(x_test)

print("x_test.shape = {}".format(x_test.shape))


# Evaluation
# ==================================================
print("\nEvaluating...\n")
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
      allow_soft_placement=FLAGS.allow_soft_placement,
      log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
Example #10
def sample(args):
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
    print("Writing to {}\n".format(out_dir))
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    print('Loading data')
    #x_text, y = data_helpers.load_positive_negative_data_files1()
    # Get embedding vector
    #sentences, max_document_length = data_helpers.padding_sentences(x_text, '<PADDING>')
    #x = np.array(word2vec_helpers.embedding_sentences(sentences, embedding_size=FLAGS.embedding_dim,
    #file_to_save=os.path.join(out_dir, 'trained_word2vec.model')))

    checkpoint_file = tf.train.latest_checkpoint(args.checkpoint_dir)
    graph = tf.Graph()
    with graph.as_default():
        sess = tf.Session()
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                "{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]

            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name(
                "output/predictions").outputs[0]
            # Categories: 体育 (sports), 娱乐 (entertainment), 彩票 (lottery), 房产 (real estate)
            textlist = [
                '谁 足 球 踢 得 好 ?', u'彩 票 中 奖 几 乎 是 不 可 能 的',
                u'上 海 的 房 价 始 终 居 高 不 下',
                u'关 晓 彤 主 演 新 版 倚 天 屠 龙 记 让 人 笑 掉 大 牙 ',
                u'杜 兰 特 是 勇 士 的 篮 球 运 动 员 ', u'娱 乐 圈 吸 毒 是 常 有 的 事',
                u'上 海 一 彩 民 中了 二 等 奖', u' 万 达 集 团 再 次 中 标 关键 地 段 的 房 产 开 发 权 ',
                u'很 多 观 众 每 晚 准 时 看 体 育 新 闻 ', u'草 莓 音 乐 节 即 将 开 始',
                u'中 国 福 利 彩 票 是 否 有 黑 幕 不 得 而 知 ', u'房 地 产 行 业 永 远 不 会 倒'
            ]
            for i in textlist:
                single_text = [i]
                print(single_text)
                sentences_padded1, max_document_length = data_helpers.padding_sentences(
                    single_text, '<PADDING>')
                raw_x1 = np.array(
                    word2vec_helpers.embedding_sentences(
                        sentences_padded1,
                        embedding_size=FLAGS.embedding_dim,
                        file_to_load=
                        'C:/Users/I343039/PycharmProjects/nlp-multiclass-text-tf/runs/1508811868/trained_word2vec.model'
                    ))
                predicted_result = sess.run(predictions, {
                    input_x: raw_x1,
                    dropout_keep_prob: 1.0
                })
                if (predicted_result[0] == 0):
                    print(i + ": 体育")
                elif (predicted_result[0] == 1):
                    print(i + ": 娱乐")
                elif (predicted_result[0] == 2):
                    print(i + ": 彩票")
                elif (predicted_result[0] == 3):
                    print(i + ": 房产")
Example #11
    with open(dict_file, 'rb') as f:
        output_dict = pickle.load(f)
    return output_dict


if __name__=='__main__':
    base_path = os.path.abspath(os.path.dirname(__file__))
    positive_file = os.path.join(base_path, 'data/ham_100.utf8')
    negative_file = os.path.join(base_path, 'data/spam_100.utf8')

    x_test,y = load_positive_negative_data_files(positive_file,negative_file)
    sentences, max_document_length = padding_sentences(x_test, '<PADDING>')
    # w2vModel = Word2Vec(sentences, size = 128, window = 5, min_count = 5, workers = multiprocessing.cpu_count())
    # file_to_save = os.path.join(base_path,'model/trained_word2vec.model')
    # w2vModel.save(file_to_save)
    x = np.array(word2vec_helpers.embedding_sentences(sentences, embedding_size=128,file_to_save=os.path.join(base_path, 'model/trained_word2vec.model')))
    training_params_file = os.path.join(base_path, 'model/training_params.pickle')
    params = {'num_labels': 2, 'max_document_length': max_document_length}
    saveDict(params, training_params_file)
    # Shuffle data randomly
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]
    dev_sample_index = -1 * int(0.1 * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    embedded_chars = x_train
    print(x_train.shape)
    embedded_chars_expended = tf.expand_dims(embedded_chars, -1)
    print(embedded_chars_expended.shape)
Example #12
# Load params
params = data_helpers.loadDict(training_params_file)
num_labels = int(params['num_labels'])
max_document_length = int(params['max_document_length'])

# Load data
if FLAGS.eval_train:
    x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.input_text_file, FLAGS.input_label_file, num_labels)
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# Get Embedding vector x_test
sentences, max_document_length = data_helpers.padding_sentences(x_raw, '<PADDING>', padding_sentence_length = max_document_length)
x_test = np.array(word2vec_helpers.embedding_sentences(sentences, file_to_load = trained_word2vec_model_file))
print("x_test.shape = {}".format(x_test.shape))


# Evaluation
# ==================================================
print("\nEvaluating...\n")
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
      allow_soft_placement=FLAGS.allow_soft_placement,
      log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Load the saved meta graph and restore variables
Example #13
import tensorflow as tf
import numpy as np
import data_helpers
import word2vec_helpers
import os

tf.flags.DEFINE_string("test_file", "./data/data_test.csv",
                       "Data source for the mid data.")
FLAGS = tf.flags.FLAGS

print("Loading data...")
x_text, y = data_helpers.load_test_files(FLAGS.test_file)
sentences, max_document_length = data_helpers.padding_sentences(
    x_text, '<PADDING>')
x = np.array(
    word2vec_helpers.embedding_sentences(sentences,
                                         embedding_size=128,
                                         file_to_save=os.path.join(
                                             'data/',
                                             'trained_word2vec.model')))
print("x.shape = {}".format(x.shape))
print("y.shape = {}".format(y.shape))
# data=pd.read_csv('data/data_test.csv',encoding='utf-8')
# test=pd.DataFrame(data)
# W = tf.Variable(np.arange(6).reshape((2, 3)), dtype=tf.float32, name="weights")
# b = tf.Variable(np.arange(3).reshape((1, 3)), dtype=tf.float32, name="biases")
# saver = tf.train.Saver()

with tf.Session() as sess:
    saver = tf.train.import_meta_graph(
        '/home/liqian/liqian/NLP/runs/1525330118/checkpoints/model-1100.meta')
    saver.restore(
        sess,
Example #14
def validate_method(x_raw, y_test, max_document_length):

    # Get Embedding vector x_test
    sentences, max_document_length = data_helpers.padding_sentences(
        x_raw, '<PADDING>', padding_sentence_length=max_document_length)
    x_test = np.array(
        word2vec_helpers.embedding_sentences(
            sentences, file_to_load=trained_word2vec_model_file))
    print("x_test.shape = {}".format(x_test.shape))

    # Evaluation
    # ==================================================
    print("\nEvaluating...\n")
    checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                "{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]

            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name(
                "output/predictions").outputs[0]

            # Generate batches for one epoch
            batches = data_helpers.batch_iter(list(x_test),
                                              FLAGS.batch_size,
                                              1,
                                              shuffle=False)

            # Collect the predictions here
            all_predictions = []

            for x_test_batch in batches:
                batch_predictions = sess.run(predictions, {
                    input_x: x_test_batch,
                    dropout_keep_prob: 1.0
                })
                all_predictions = np.concatenate(
                    [all_predictions, batch_predictions])

    # Print accuracy if y_test is defined
    if y_test is not None:
        correct_predictions = float(sum(all_predictions == y_test))
        print("Total number of test examples: {}".format(len(y_test)))
        print("Accuracy: {:g}".format(correct_predictions /
                                      float(len(y_test))))

    # Save the evaluation to a csv
    predictions_human_readable = np.column_stack(
        (np.array([text.encode('utf-8') for text in x_raw]), all_predictions))
    out_path = os.path.join(FLAGS.checkpoint_dir, "..", "prediction.csv")
    print("Saving evaluation to {0}".format(out_path))
    with open(out_path, 'a+') as f:
        csv.writer(f).writerows(predictions_human_readable)
Example #15
timestamp = str(int(time.time()))
out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
print("Writing to {}\n".format(out_dir))
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

# Data preprocess
# =======================================================

# Load data
print("Loading data...")
x_text, y = data_helpers.load_positive_negative_data_files(FLAGS.positive_data_file, FLAGS.negative_data_file)

# Get embedding vector
sentences, max_document_length = data_helpers.padding_sentences(x_text, '<PADDING>')
x = np.array(word2vec_helpers.embedding_sentences(sentences, embedding_size = FLAGS.embedding_dim, file_to_save = os.path.join(out_dir, 'trained_word2vec.model')))
print("x.shape = {}".format(x.shape))
print("y.shape = {}".format(y.shape))

# Save params
training_params_file = os.path.join(out_dir, 'training_params.pickle')
params = {'num_labels' : FLAGS.num_labels, 'max_document_length' : max_document_length}
data_helpers.saveDict(params, training_params_file)

# Shuffle data randomly
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
Example #16
# Data preprocess
# =======================================================

# Load data
print("Loading data...")
x_text, y = data_helpers.load_positive_negative_data_files(
    FLAGS.positive_data_file, FLAGS.negative_data_file)

# Get embedding vector
sentences, max_document_length = data_helpers.padding_sentences(
    x_text, '<PADDING>')
x = np.array(
    word2vec_helpers.embedding_sentences(sentences,
                                         embedding_size=FLAGS.embedding_dim,
                                         file_to_save=os.path.join(
                                             base_path,
                                             'model/trained_word2vec.model')))
print("x.shape = {}".format(x.shape))
print("y.shape = {}".format(y.shape))

# Save params
training_params_file = os.path.join(base_path, 'training_params.pickle')
params = {
    'num_labels': FLAGS.num_labels,
    'max_document_length': max_document_length
}
data_helpers.saveDict(params, training_params_file)

# Shuffle data randomly
np.random.seed(10)
Example #17
# Get Embedding vector x_test
sentences, max_document_length = new_data_helper.padding_sentences(
    x_raw, '.', padding_sentence_len=max_document_length)
# print(sentences)
# If the test file is too large, x_test.shape can easily come out wrong, so the test set is split and evaluated in chunks
# Collect the predictions here
all_predictions = []
print(len(sentences))
for i in range(len(sentences) // 100):
    print(i)
    print(len(sentences) // 100)
    print(sentences[i * 100:(i + 1) * 100])
    x_test = np.array(
        word2vec_helpers.embedding_sentences(
            sentences[i * 100:(i + 1) * 100],
            file_to_load=trained_word2vec_model_file))
    #print(x_test)
    print("x_test.shape = {}".format(x_test.shape))

    # Evaluation
    # ==================================================
    print("\nEvaluating...\n")
    checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
Example #18
# Data preprocess
# =======================================================

# Load data
print("Loading data...")
x_text, y = data_helpers.load_positive_negative_data_files(
    FLAGS.positive_data_file, FLAGS.negative_data_file)

# Get embedding vector
sentences, max_document_length = data_helpers.padding_sentences(
    x_text, '<PADDING>')
x = np.array(
    word2vec_helpers.embedding_sentences(
        sentences,
        embedding_size=FLAGS.embedding_dim,
        file_to_load="/Users/jiangqy/Code/model/wiki.zh.text.model",
        file_to_save=os.path.join(out_dir, 'trained_word2vec.model')))
print("x.shape = {}".format(x.shape))
print("y.shape = {}".format(y.shape))

# Save params
training_params_file = os.path.join(out_dir, 'training_params.pickle')
params = {
    'num_labels': FLAGS.num_labels,
    'max_document_length': max_document_length
}
data_helpers.saveDict(params, training_params_file)

# Shuffle data randomly
np.random.seed(10)
Example #19
# Load data
print("Loading data...")
positive_data_file = os.path.join('.', args.positive_data_file)
negative_data_file = os.path.join('.', args.negative_data_file)
# print(positive_data_file)

x_text, y = data_deal.load_positive_negative_data_files(
    positive_data_file, negative_data_file)
print(x_text)

sentences, max_document_length = data_deal.padding_sentences(
    x_text, '<PADDING>')
x = np.array(
    word2vec_helpers.embedding_sentences(sentences,
                                         embedding_size=args.embedding_size,
                                         file_to_save=os.path.join(
                                             out_dir,
                                             'trained_word2vec.model')))

print("x.shape = {}".format(x.shape))
print("y.shape = {}".format(y.shape))

# # Save params
training_params_file = os.path.join(out_dir, 'training_params.pickle')
params = {
    'num_classes': args.num_classes,
    'max_document_length': max_document_length
}
data_deal.saveDict(params, training_params_file)

# Shuffle data randomly
Example #20
    y_test = None
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

label2str = data_helper.label2str('input_data/')

# Get Embedding vector x_test
print('Padding sentence...')
sentences, max_document_length = data_helper.padding_sentences(
    x_raw, '<PADDING>', padding_sentence_length=max_document_length)
print('sentences length : %d , max_document_length : %d' % (len(sentences), max_document_length))
sentences, new_x_raw = eval_helper.check_padding_sentences(sentences, x_raw)

all_vectors = word2vec_helpers.embedding_sentences(
    sentences, embedding_size=128, file_to_load=trained_word2vec_model_file)
print('all_vectors length: %d' % len(all_vectors[0]))
x_test = np.array(all_vectors)
print("x_test.shape = {}".format(x_test.shape))
print('x_test_shape: ', x_test.shape, " ", len(x_test), " ", len(x_test[0]), " ", len(x_test[0][0]))
print('list x_test ', len(list(x_test)))

# Evaluation
# ==================================================
print("\nEvaluating...\n")
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
Example #21
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# Map data into vocabulary
"""
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))
"""
sentences, max_document_length = data_helpers.padding_sentences(
    x_raw, '<PADDING>')
x_test = np.array(
    word2vec_helpers.embedding_sentences(sentences,
                                         embedding_size=FLAGS.embedding_dim,
                                         file_to_save=os.path.join(
                                             FLAGS.checkpoint_dir,
                                             'trained_word2vec.model')))
data_sigma = data_scale(eps)
pos_noise = np.random.normal(0, data_sigma,
                             [pos_len, x_test.shape[1], x_test.shape[2]])
neg_noise = np.random.normal(0, data_sigma,
                             [neg_len, x_test.shape[1], x_test.shape[2]])
noise = np.concatenate([pos_noise, neg_noise], 0)
x_test = x_test + noise
print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
Example #22
# print("")

# 2. Load the data, dictionary, and model
# 2.1 Load the data

if FLAGS.eval_train:
    x_raw, y_test = load_data_and_labels(FLAGS.positive_data_file,
                                         FLAGS.negative_data_file)
    y_test = np.argmax(y_test, axis=1)
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [0, 1]

# Vectorize the text
sentences, max_document_length = padding_sentences(x_raw, '<PAD>', 112)
x_test = np.array(embedding_sentences(sentences, word2vec_path))
y_test = np.array(list(y_test))

print("x.shape = {}".format(x_test.shape))
print("y.shape = {}".format(y_test.shape))

# 3. Load the model and run predictions
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
print("latest checkpoint: %s" % checkpoint_file)

graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
Example #23
# Data preprocess
# =======================================================

# Load data
print("Loading data...")
x_text, y = data_helpers.load_positive_negative_data_files(FLAGS)

# Get embedding vector
sentences, max_document_length = data_helpers.padding_sentences(
    x_text,
    '<PADDING>',
    word_segment=FLAGS.word_segment,
    padding_sentence_length=FLAGS.max_document_len)
if not os.path.exists(_w2v_path):
    _, w2vModel = word2vec_helpers.embedding_sentences(
        sentences=sentences,
        embedding_size=FLAGS.embedding_dim,
        file_to_save=_w2v_path)
else:
    _, w2vModel = word2vec_helpers.embedding_sentences(
        sentences=None,
        embedding_size=FLAGS.embedding_dim,
        file_to_load=_w2v_path)
FLAGS.embedding_dim = w2vModel.vector_size
print('wordembedding.dim = {}'.format(FLAGS.embedding_dim))
print('wordembedding.length = {}'.format(len(w2vModel.wv.vocab)))

x = np.array(sentences)
print("x.shape = {}".format(x.shape))
print("y.shape = {}".format(y.shape))

# Save params
Example #24
# x_text is a 2-D list: the first dimension is the sentences, the second is the list of words making up each sentence
#['全国', '少年儿童', '游泳', '锦标赛', '开幕', '新华社', '广州', '月', '日电', '记者', '何惠飞', '年', '喜乐', '杯', '全国', '少年儿童', '游泳', '锦标赛', '昨天', '在', '游泳', '之', '乡', '广东省', '东莞市', '开幕', '参加', '这次', '比赛', '的', '有', '个', '省', '自治区', '直辖市', '的', '名', '男女', '选手', '比赛', '分为', '岁', '组和岁', '以下', '组', '参赛者', '都', '是', '近几年', '涌现', '的', '优秀', '小', '选手', '不少', '是', '本', '年龄组', '的', '全国纪录', '创造者', '这次', '比赛', '是', '对', '我国', '参加', '下', '两届', '奥运会', '游泳赛', '后备力量', '的', '一次', '检阅', '国家体委', '将', '通过', '这次', '比赛', '选拔', '优秀', '选手', '组队参加', '今年', '月', '在', '印度尼西亚', '举行', '的', '亚太区', '年龄组', '游泳', '比赛', '比赛', '将', '于', '日', '结束', '完']
x_text, y = data_helpers.load_data_files(
    FLAGS.sports_file, FLAGS.amusement_file, FLAGS.home_file,
    FLAGS.estate_file, FLAGS.education_file, FLAGS.fashion_file,
    FLAGS.politics_file, FLAGS.game_file, FLAGS.technology_file,
    FLAGS.finance_file)

# Get embedding vector
sentences, max_document_length = data_helpers.padding_sentences(
    x_text, 'PADDING', FLAGS.max_seq_length)
# sentences is now a 2-D list whose elements are the word lists of each sentence; every
# sentence has the same length because each is padded to the maximum length with 'PADDING'
# Convert the returned list to an array
x_embedding = word2vec_helpers.embedding_sentences(
    sentences,
    embedding_size=FLAGS.embedding_dim,
    ext_emb_path=FLAGS.word_embedding_file)
x = np.array(x_embedding)
# The three dimensions of x are: total number of sentences, words per sentence (the longest sentence), and the word-vector dimension
print("x.shape =", x.shape)
print("y.shape =", y.shape)
#Save params
training_params_file = 'train/training_params.pickle'
params = {
    'num_labels': FLAGS.num_labels,
    'max_document_length': max_document_length
}
data_helpers.saveDict(params, training_params_file)

# Mix the data randomly
np.random.seed(10)
new_x_text = []
new_y = []
for i in range(3000):
    rand_idx = random.randint(0, len(x_text) - 1)  # randint is inclusive at both ends; -1 avoids an IndexError
    #rand_y = random.randint(0, len(x_text))
    new_x_text.append(x_text[rand_idx])
    new_y.append(y[rand_idx])
print "new_x_text length: %d" % len(new_x_text)
print "new_y length: %d" % len(new_y)

# embedding vector
print("Padding sentences...")
sentences, max_document_length = data_helper.padding_sentences(new_x_text, '<PADDING>')    #max_document_length = 

print("embedding_sentences...")
all_vectors = word2vec_helpers.embedding_sentences(sentences, embedding_size = FLAGS.embedding_dim, file_to_save = os.path.join(out_dir, 'trained_word2vec.model'))
print "all_vectors length %d  *  %d  *  %d : " %  (len(all_vectors) , len(all_vectors[0]) , len(all_vectors[0][0]))
#x = np.array(all_vectors)   ## this operation  could lead to memory error!!!

#TODO: transform large vectors into sparse matrix
x = np.asarray(all_vectors)
y = np.asarray(new_y)
print("x.shape = {}".format(x.shape))
print("y.shape = {}".format(y.shape))

# Save params
training_params_file = os.path.join(out_dir, 'training_params.pickle')
params = {'num_labels' : FLAGS.num_labels, 'max_document_length' : max_document_length}
data_helper.saveDict(params, training_params_file)

# Shuffle data randomly
Example #26
import numpy as np
import word2vec_helpers
import data_helper
from text_lstm import TextLSTM

# Load data
print("Loading data...")
x_text, y = data_helper.load_positive_negative_data_files(
    'bingyin.txt', 'zhenduan.txt', 'zhiliao.txt', 'zhengzhuang.txt')

# Get embedding vector
embedding_dim = 300
sentences, max_document_length = data_helper.padding_sentences(
    x_text, '<PADDING>')
x = np.array(
    word2vec_helpers.embedding_sentences(
        sentences,
        embedding_size=embedding_dim,
        file_to_save='trained_word2vec.model'))
print("x.shape = {}".format(x.shape))
print("y.shape = {}".format(y.shape))

# Shuffle data randomly
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
dev_sample_index = -1 * int(0.1 * float(len(y)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
Example #27
    # Load data
    if FLAGS.eval_train:
        x_raw, y_test = data_helpers.load_data_and_labels(
            FLAGS.input_text_file, FLAGS.input_label_file, num_labels)
    else:
        x_raw = [
            "a masterpiece four years in the making", "everything is off."
        ]
        y_test = [1, 0]

    # Get Embedding vector x_test
    sentences, max_document_length = data_helpers.padding_sentences(
        x_raw, '<PADDING>', padding_sentence_length=max_document_length)
    x_test = np.array(
        word2vec_helpers.embedding_sentences(
            sentences, file_to_load=trained_word2vec_model_file))
    print("x_test.shape = {}".format(x_test.shape))

    # Evaluation
    # ==================================================
    print("\nEvaluating...\n")
    checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
Example #28
print("Writing to {}\n".format(out_dir))
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

# Data preprocess
# =======================================================

# Load data
print("Loading data...")
#x_text, y = data_helpers.load_positive_negative_data_files(FLAGS.positive_data_file, FLAGS.negative_data_file)
x_text, y = data_helpers.load_positive_negative_data_files1()
# Get embedding vector
sentences, max_document_length = data_helpers.padding_sentences(
    x_text, '<PADDING>')
x = np.asanyarray((word2vec_helpers.embedding_sentences(
    sentences,
    embedding_size=FLAGS.embedding_dim,
    file_to_save=os.path.join(out_dir, 'trained_word2vec.model'))))
#print(x)
print("x.shape = {}".format(x.shape))
print("y.shape = {}".format(y.shape))

# Save params
training_params_file = os.path.join(out_dir, 'training_params.pickle')
params = {
    'num_labels': FLAGS.num_labels,
    'max_document_length': max_document_length
}
data_helpers.saveDict(params, training_params_file)

# Shuffle data randomly
#np.random.seed(10)
Example #29
def predict(filename, input_file, max_document_length):
    # Load data
    if FLAGS.eval_train:
        x_raw, y_test = data_process.load_data_and_labels(
            input_file, FLAGS.input_label_file, num_labels)
    else:
        x_raw = [
            "a masterpiece four years in the making", "everything is off."
        ]
        y_test = [1, 0]

    # Get Embedding vector x_test
    if len(x_raw) == 0:
        return
    sentences, max_document_length = data_process.padding_sentences(
        x_raw, '补', padding_sentence_length=max_document_length)
    # Known issue: data_process.padding_sentences sometimes returns sentences that are one
    # token longer than requested (all of them over-long sentences that were clipped); the
    # cause is unclear, so the second clip below corrects it for now
    sentences = [sentence[:max_document_length] for sentence in sentences]
    x_test = np.array(
        word2vec_helpers.embedding_sentences(
            sentences, file_to_load=trained_word2vec_model_file))
    print(len(x_test))

    print("x_test.shape = {}".format(x_test.shape))

    # Evaluation
    # ==================================================
    print("\nEvaluating...\n")
    checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                "{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]

            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name(
                "output/predictions").outputs[0]

            # Generate batches for one epoch
            batches = data_process.batch_iter(list(x_test),
                                              FLAGS.batch_size,
                                              1,
                                              shuffle=False)

            # Collect the predictions here
            all_predictions = []

            for x_test_batch in batches:
                batch_predictions = sess.run(predictions, {
                    input_x: x_test_batch,
                    dropout_keep_prob: 1.0
                })
                all_predictions = np.concatenate(
                    [all_predictions, batch_predictions])

    # Print accuracy if y_test is defined
    if y_test is not None:
        correct_predictions = float(sum(all_predictions == y_test))
        print("Total number of test examples: {}".format(len(y_test)))
        print("Accuracy: {:g}".format(correct_predictions /
                                      float(len(y_test))))

    # Save the evaluation to a csv
    # predictions_human_readable = np.column_stack((np.array([text.encode('utf-8') for text in x_raw]), all_predictions))
    predictions_human_readable = np.column_stack(
        (np.array(x_raw), all_predictions))
    out_path = os.path.join(FLAGS.checkpoint_dir, "..", filename)
    print("Saving evaluation to {0}".format(out_path))
    with open(out_path, 'w') as f:
        csv.writer(f).writerows(predictions_human_readable)
Example #30
def preprocess():
    # Data Preparation
    # ==================================================
    global out_dir
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
    print("Writing to {}\n".format(out_dir))
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    # Load data
    print("Loading data...")
    x_text, y, pos_len, neg_len = data_helpers.noisy_load_data_and_labels(
        FLAGS.positive_data_file, FLAGS.negative_data_file)
    data_size = len(x_text)
    """
    # Build vocabulary
    max_document_length = max([len(x.split(" ")) for x in x_text])
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_text)))
    """
    # Get embedding vector
    sentences, max_document_length = data_helpers.padding_sentences(
        x_text, '<PADDING>')
    x = np.array(
        word2vec_helpers.embedding_sentences(
            sentences,
            embedding_size=FLAGS.embedding_dim,
            file_to_save=os.path.join(out_dir, 'trained_word2vec.model')))
    #x=tf.cast(x, tf.float32)
    #vectors =word2vec_helpers.embedding_sentences([['first', 'sentence'], ['second', 'sentence']], embedding_size = 4, min_count = 1)
    print(x[0].shape)
    #y =np.reshape(y,(-1,1))
    print("x.shape = {}".format(x.shape))
    print("y.shape = {}".format(y.shape))
    #adding noise according to different classes

    data_sigma = data_scale()
    global gradient_sigma
    gradient_sigma = gradient_scale(data_size)
    pos_noise = np.random.normal(0, data_sigma,
                                 [pos_len, x.shape[1], x.shape[2]])
    neg_noise = np.random.normal(0, data_sigma,
                                 [neg_len, x.shape[1], x.shape[2]])
    noise = np.concatenate([pos_noise, neg_noise], 0)
    x = x + noise
    # Save params
    """
    training_params_file = os.path.join(out_dir, 'training_params.pickle')
    params = {'num_labels': FLAGS.num_labels,'max_document_length' : max_document_length}
    data_helpers.saveDict(params, training_params_file)"""
    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    print(shuffle_indices)
    #x_shuffle_indices=[[index] for index in shuffle_indices]
    print("the shape of x:{}".format(x.shape[0]))
    print("indices shape:{}".format(shuffle_indices))
    """
    x_shuffled=tf.gather_nd(
    x,
    x_shuffle_indices,
    name=None
)"""
    x_shuffled = x[shuffle_indices]
    #x_shuffled = x[x_shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/test set
    # TODO: This is very crude, should use cross-validation

    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[
        dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[
        dev_sample_index:]
    print("shape of x:{}".format(x_train.shape))
    print("shape of y:{}".format(y_train.shape))
    del x, y, x_shuffled, y_shuffled
    """
    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    """
    #return x_train, y_train, vocab_processor, x_dev, y_dev
    return x_train, y_train, x_dev, y_dev
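# The TODO above calls the single random split "very crude". A K-fold cross-validation
# sketch (an assumption; it uses scikit-learn, which the original snippet does not import):
def kfold_splits(x, y, n_splits=5, seed=10):
    from sklearn.model_selection import KFold
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    for train_idx, dev_idx in kf.split(x):
        yield x[train_idx], y[train_idx], x[dev_idx], y[dev_idx]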
Example #31
x_text, y = chinese_data.load_positive_negative_data_files(
    FLAGS.positive_data_file, FLAGS.negative_data_file)

# Prepare output directory for models and summaries
timestamp = str(int(time.time()))
out_dir = os.path.abspath(os.path.join(os.path.curdir, 'runs', timestamp))
if not os.path.exists(out_dir):
    os.mkdir(out_dir)

# Get embedding vector
sentences, max_document_length = chinese_data.padding_sentences(
    x_text, '<PADDING>')
x = np.array(
    word2vec_helpers.embedding_sentences(
        sentences,
        embedding_size=FLAGS.embedding_dim,
        file_to_load=FLAGS.chinese_word2vec_model,
        file_to_save=os.path.join(out_dir, 'trained_word2vec.model')))

# Shuffle data randomly
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]
#
# # Split train/test set
# # TODO: This is very crude, should use cross-validation
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
print(x_train.shape)
Example #32
# Load params
params = data_helpers.loadDict(training_params_file)
window_size = int(params['window_size'])
print("params = {}".format(params))

# Load data
generated_text = []
# seed_text = FLAGS.seed_text.strip()
seed_text = u'白 玉 京'
x_text_current = [] if len(seed_text) == 0 else seed_text.split(' ')
generated_text.extend(x_text_current)
x_text_current = [data_helpers.sentence_start_padding(x_text_current, window_size)]

# Get Embedding vector x_test
x_current = np.array(word2vec_helpers.embedding_sentences(x_text_current, file_to_load = trained_word2vec_model_file))
print("x_current.shape = {}".format(x_current.shape))
print("x_current = {}".format(x_current))

# Generation
# ==================================================
print("\nGenerating...\n")
checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
      allow_soft_placement=FLAGS.allow_soft_placement,
      log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Load the saved meta graph and restore variables