def data_preprocess(all_data):
    # Load data
    print("Loading data...")
    if not os.path.exists(os.path.join(out_dir, "data_x.npy")):
        x, y = data_helper_new.load_data_and_labels(FLAGS.clone_data_file)
        # Get embedding vector
        all_sentences, all_max_document_length = data_helper_new.padding_sentences(
            all_data, '<PADDING>', padding_sentence_length=FLAGS.sequence_length)
        sentences, max_document_length = data_helper_new.padding_sentences(
            x, '<PADDING>', padding_sentence_length=FLAGS.sequence_length)
        print(len(sentences[0]))
        if not os.path.exists(os.path.join(out_dir, "trained_word2vec.model")):
            word2vec_helpers.word2vec_model(
                all_sentences,
                embedding_size=FLAGS.embedding_dim,
                file_to_save=os.path.join(out_dir, 'trained_word2vec.model'))
            x = np.array(
                word2vec_helpers.embedding_sentences(
                    sentences,
                    file_to_load=os.path.join(out_dir, 'trained_word2vec.model')))
        else:
            print('w2v model found...')
            # word2vec_helpers.word2vec_model(all_sentences, embedding_size=FLAGS.embedding_dim, file_to_save=os.path.join(out_dir, 'trained_word2vec.model'), file_to_load=os.path.join(out_dir, 'trained_word2vec.model'))
            x = np.array(
                word2vec_helpers.embedding_sentences(
                    sentences,
                    file_to_load=os.path.join(out_dir, 'trained_word2vec.model')))
            # x = np.array(word2vec_helpers.embedding_sentences(sentences, embedding_size=FLAGS.embedding_dim, file_to_save=os.path.join(out_dir, 'trained_word2vec.model'), file_to_load=os.path.join(out_dir, 'trained_word2vec.model')))
        y = np.array(y)
        del sentences
        del all_sentences
    else:
        print('data found...')
        x = np.load(os.path.join(out_dir, "data_x.npy"))
        y = np.load(os.path.join(out_dir, "data_y.npy"))
    print("x.shape = {}".format(x.shape))
    print("y.shape = {}".format(y.shape))

    # Save params (note: max_document_length is only defined on the first branch)
    if not os.path.exists(os.path.join(out_dir, "training_params.pickle")):
        training_params_file = os.path.join(out_dir, 'training_params.pickle')
        params = {
            'num_labels': FLAGS.num_labels,
            'max_document_length': max_document_length
        }
        data_helper_new.saveDict(params, training_params_file)  # save the params dict

    # Split into training and testing sets, 80/20 ratio
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=42)
    del x, y
    return x_train, x_test, y_train, y_test
def preprocess():
    """Data preparation stage."""
    # 1. Load the data files
    x_text, y = load_data_and_labels(FLAGS.positive_data_file,
                                     FLAGS.negative_data_file)
    # Vectorize the text
    sentences, max_document_length = padding_sentences(x_text, '<PAD>')
    x = np.array(embedding_sentences(sentences, FLAGS.word2vec_fname))
    y = np.array(list(y))
    print("x.shape = {}".format(x.shape))
    print("y.shape = {}".format(y.shape))

    # Shuffle the data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split the data into training and dev sets
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    del x, y, x_shuffled, y_shuffled

    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    return x_train, y_train, x_dev, y_dev
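# All of these snippets lean on a padding_sentences helper from the various
# data_helper* modules, but its source is never shown. A minimal sketch for
# reference, assuming it takes tokenized sentences and pads or truncates each
# one to a common length with the given padding token (the real helpers may
# choose the target length differently):
def padding_sentences(sentences, padding_token, padding_sentence_length=None):
    """Pad or truncate each tokenized sentence to a common length (illustrative only)."""
    max_len = (padding_sentence_length if padding_sentence_length is not None
               else max(len(s) for s in sentences))
    padded = []
    for sentence in sentences:
        if len(sentence) > max_len:
            sentence = sentence[:max_len]  # truncate over-long sentences
        else:
            sentence = sentence + [padding_token] * (max_len - len(sentence))
        padded.append(sentence)
    return padded, max_len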
def validData2vec(sentences):
    print('Word embedding...')
    all_vectors = word2vec_helpers.embedding_sentences(
        sentences,
        embedding_size=FLAGS.embedding_dim,
        file_to_load='/home/WXX/WebClassify/cnn_website_text_classify/runs/1503023156/trained_word2vec.model')
    x_valid = np.array(all_vectors)
    return x_valid
def pre_type(string, max_document_length):
    x_raw = [data_helpers.clean_str(data_helpers.jieba_line(string))]
    # Get embedding vector x_test
    sentences, max_document_length = data_helpers.padding_sentences(
        x_raw, '<PADDING>', padding_sentence_length=max_document_length)
    x_test = np.array(
        word2vec_helpers.embedding_sentences(
            sentences, file_to_load=trained_word2vec_model_file))
    # print("x_test.shape = {}".format(x_test.shape))
    pred = sess.run(predictions, {input_x: x_test, dropout_keep_prob: 1.0})[0]
    # Avoid shadowing the built-in `type`
    label = 'neg' if pred == 0 else 'pos'
    return label
def predict(self, url):
    params = data_helper.loadDict(self.training_params_file)
    num_labels = int(params['num_labels'])
    max_document_length = int(params['max_document_length'])
    x_raw = [url]
    sentences, max_document_length = data_helper.padding_sentences(
        x_raw, '<PADDING>', padding_sentence_length=max_document_length)
    x_test = np.array(
        word2vec_helpers.embedding_sentences(
            sentences, file_to_load=self.trained_word2vec_model_file))
    # print(x_test)
    # with self.graph.as_default():
    #     with self.sess.as_default():
    result = self.sess.run(self.predictions,
                           {self.input_x: x_test, self.dropout_keep_prob: 1.0})
    # result is an array of predictions; use the first element
    result = 'good' if result[0] else 'bad'
    print("Request examples: {}, inference result: {}".format(url, result))
    return result
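# Several snippets persist num_labels and max_document_length with saveDict /
# loadDict. Judging from the pickle usage in a later snippet, these are thin
# pickle wrappers; a minimal sketch (assumed, not the original implementation):
import pickle

def saveDict(input_dict, dict_file):
    # Serialize the params dict to disk
    with open(dict_file, 'wb') as f:
        pickle.dump(input_dict, f)

def loadDict(dict_file):
    # Load a params dict previously written by saveDict
    with open(dict_file, 'rb') as f:
        return pickle.load(f)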
def save_data_vector(contents_dir, labels_dir, out_dir):
    x_text, y = load_files_labels(contents_dir, labels_dir, one_hot=True)
    # Get embedding vector; pad every sentence to the maximum length 190 with '<PADDING>'
    sentences, max_document_length = data_helpers.padding_sentences(
        x_text, '<PADDING>', padding_sentence_length=190)
    embedding_dim = 128
    x = np.array(
        word2vec_helpers.embedding_sentences(
            sentences,
            embedding_size=embedding_dim,
            file_to_save=os.path.join(out_dir, 'trained_word2vec.model')))
    # shape = (10000, 190, 128) -> (number of samples, tokens per sample, embedding dim per token)
    print("x.shape = {}".format(x.shape))
    # y.shape = (10000, 2) -> one-hot encoded sample labels
    print("y.shape = {}".format(y.shape))
    np.save(os.path.join(out_dir, 'data_vector.npy'), x)
    np.save(os.path.join(out_dir, 'labels.npy'), y)
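# word2vec_helpers.embedding_sentences turns padded token lists into dense
# vectors, optionally training and saving a word2vec model on the fly. Its
# source is not included here; a minimal sketch assuming it wraps gensim's
# Word2Vec (zero vectors for out-of-vocabulary tokens is an assumption, and
# some snippets expect a (vectors, model) tuple rather than just the vectors):
import numpy as np
from gensim.models import Word2Vec

def embedding_sentences(sentences, embedding_size=128,
                        file_to_save=None, file_to_load=None):
    if file_to_load:
        model = Word2Vec.load(file_to_load)
    else:
        # Train a fresh word2vec model on the padded sentences
        model = Word2Vec(sentences, size=embedding_size, window=5,
                         min_count=1, workers=4)
        if file_to_save:
            model.save(file_to_save)
    zero_vector = np.zeros(model.vector_size)
    all_vectors = []
    for sentence in sentences:
        all_vectors.append([model.wv[w] if w in model.wv.vocab else zero_vector
                            for w in sentence])
    return all_vectors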
def predict(self, message):
    # Models trained under either Python 2 or Python 3 can be run in either environment
    data = message
    # data = list(content)
    sentences, max_document_length = padding_sentences(data, '<PADDING>')
    x = np.array(
        word2vec_helpers.embedding_sentences(
            sentences,
            embedding_size=args.embedding_size,
            file_to_load='./best_model/1568855551/trained_word2vec.model'))
    # print(x.shape)
    feed_dict = {self.model.input_x: x, self.model.dropout_keep_prob: 1.0}
    # Output of the last layer
    y_pred_cls = self.session.run(self.model.predictions, feed_dict=feed_dict)
    # y_pred_cls = self.session.run(tf.nn.softmax(self.model.scores), feed_dict=feed_dict)
    y_prob = y_pred_cls.tolist()
    print(y_prob)
    return self.categories[y_pred_cls[0]]
def data_preprocess():
    # Data preprocess
    # =======================================================
    # Load data
    print("Loading data...")
    if not os.path.exists(os.path.join(out_dir, "data_x.npy")):
        x, y = data_helper.load_data_and_labels(FLAGS.data_file)
        # Get embedding vector
        x = x[:1000]
        y = y[:1000]
        sentences, max_document_length = data_helper.padding_sentences(
            x, '<PADDING>', padding_sentence_length=FLAGS.sequence_length)
        print(len(sentences[0]))
        if not os.path.exists(os.path.join(out_dir, "trained_word2vec.model")):
            x = np.array(
                word2vec_helpers.embedding_sentences(
                    sentences,
                    embedding_size=FLAGS.embedding_dim,
                    file_to_save=os.path.join(out_dir, 'trained_word2vec.model')))
        else:
            print('w2v model found...')
            x = np.array(
                word2vec_helpers.embedding_sentences(
                    sentences,
                    embedding_size=FLAGS.embedding_dim,
                    file_to_save=os.path.join(out_dir, 'trained_word2vec.model'),
                    file_to_load=os.path.join(out_dir, 'trained_word2vec.model')))
        y = np.array(y)
        # np.save(os.path.join(out_dir, "data_x.npy"), x)
        # np.save(os.path.join(out_dir, "data_y.npy"), y)
        del sentences
    else:
        print('data found...')
        x = np.load(os.path.join(out_dir, "data_x.npy"))
        y = np.load(os.path.join(out_dir, "data_y.npy"))
    print("x.shape = {}".format(x.shape))
    print("y.shape = {}".format(y.shape))

    # Save params
    if not os.path.exists(os.path.join(out_dir, "training_params.pickle")):
        training_params_file = os.path.join(out_dir, 'training_params.pickle')
        params = {
            'num_labels': FLAGS.num_labels,
            'max_document_length': max_document_length
        }
        data_helper.saveDict(params, training_params_file)

    # Shuffle data randomly
    # np.random.seed(10)
    # shuffle_indices = np.random.permutation(np.arange(len(y)))
    # x_shuffled = x[shuffle_indices]
    # y_shuffled = y[shuffle_indices]
    # del x, y
    # x_train, x_test, y_train, y_test = train_test_split(x_shuffled, y_shuffled, test_size=0.2, random_state=42)

    # Split into training and testing sets, 80/20 ratio
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=42)
    del x, y
    return x_train, x_test, y_train, y_test
# Load params
params = data_helpers.loadDict(training_params_file)
num_labels = int(params['num_labels'])
max_document_length = int(params['max_document_length'])

# Load data
if FLAGS.eval_train:
    x_raw, y_test = data_helpers.load_positive_negative_data_files(FLAGS)
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# Get embedding vector x_test
print(max_document_length)
x_test, max_document_length = data_helpers.padding_sentences(
    x_raw, '<PADDING>', padding_sentence_length=max_document_length)
_, w2vModel = word2vec_helpers.embedding_sentences(
    file_to_load=trained_word2vec_model_file)
x_test = np.array(x_test)
print("x_test.shape = {}".format(x_test.shape))

# Evaluation
# ==================================================
print("\nEvaluating...\n")
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
def sample(args):
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
    print("Writing to {}\n".format(out_dir))
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    print('Loading data')
    # x_text, y = data_helpers.load_positive_negative_data_files1()
    # Get embedding vector
    # sentences, max_document_length = data_helpers.padding_sentences(x_text, '<PADDING>')
    # x = np.array(word2vec_helpers.embedding_sentences(sentences, embedding_size=FLAGS.embedding_dim,
    #                                                   file_to_save=os.path.join(out_dir, 'trained_word2vec.model')))

    checkpoint_file = tf.train.latest_checkpoint(args.checkpoint_dir)
    graph = tf.Graph()
    with graph.as_default():
        sess = tf.Session()
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]

            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name(
                "output/predictions").outputs[0]

            # Categories: 体育 (sports), 娱乐 (entertainment), 彩票 (lottery), 房产 (real estate)
            textlist = [
                '谁 足 球 踢 得 好 ?', u'彩 票 中 奖 几 乎 是 不 可 能 的',
                u'上 海 的 房 价 始 终 居 高 不 下',
                u'关 晓 彤 主 演 新 版 倚 天 屠 龙 记 让 人 笑 掉 大 牙 ',
                u'杜 兰 特 是 勇 士 的 篮 球 运 动 员 ',
                u'娱 乐 圈 吸 毒 是 常 有 的 事', u'上 海 一 彩 民 中了 二 等 奖',
                u' 万 达 集 团 再 次 中 标 关键 地 段 的 房 产 开 发 权 ',
                u'很 多 观 众 每 晚 准 时 看 体 育 新 闻 ',
                u'草 莓 音 乐 节 即 将 开 始',
                u'中 国 福 利 彩 票 是 否 有 黑 幕 不 得 而 知 ',
                u'房 地 产 行 业 永 远 不 会 倒'
            ]
            for i in textlist:
                # Embed and classify one sentence at a time
                single = [i]
                print(single)
                sentences_padded1, max_document_length = data_helpers.padding_sentences(
                    single, '<PADDING>')
                raw_x1 = np.array(
                    word2vec_helpers.embedding_sentences(
                        sentences_padded1,
                        embedding_size=FLAGS.embedding_dim,
                        file_to_load='C:/Users/I343039/PycharmProjects/nlp-multiclass-text-tf/runs/1508811868/trained_word2vec.model'))
                predicted_result = sess.run(predictions, {
                    input_x: raw_x1,
                    dropout_keep_prob: 1.0
                })
                if predicted_result[0] == 0:
                    print(i + ": 体育")
                elif predicted_result[0] == 1:
                    print(i + ": 娱乐")
                elif predicted_result[0] == 2:
                    print(i + ": 彩票")
                elif predicted_result[0] == 3:
                    print(i + ": 房产")
    with open(dict_file, 'rb') as f:
        output_dict = pickle.load(f)
    return output_dict


if __name__ == '__main__':
    base_path = os.path.abspath(os.path.dirname(__file__))
    positive_file = os.path.join(base_path, 'data/ham_100.utf8')
    negative_file = os.path.join(base_path, 'data/spam_100.utf8')
    x_test, y = load_positive_negative_data_files(positive_file, negative_file)
    sentences, max_document_length = padding_sentences(x_test, '<PADDING>')
    # w2vModel = Word2Vec(sentences, size=128, window=5, min_count=5, workers=multiprocessing.cpu_count())
    # file_to_save = os.path.join(base_path, 'model/trained_word2vec.model')
    # w2vModel.save(file_to_save)
    x = np.array(
        word2vec_helpers.embedding_sentences(
            sentences,
            embedding_size=128,
            file_to_save=os.path.join(base_path, 'model/trained_word2vec.model')))
    training_params_file = os.path.join(base_path, 'model/training_params.pickle')
    params = {'num_labels': 2, 'max_document_length': max_document_length}
    saveDict(params, training_params_file)

    # Shuffle data randomly
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    dev_sample_index = -1 * int(0.1 * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

    embedded_chars = x_train
    print(x_train.shape)
    embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)
    print(embedded_chars_expanded.shape)
# Load params
params = data_helpers.loadDict(training_params_file)
num_labels = int(params['num_labels'])
max_document_length = int(params['max_document_length'])

# Load data
if FLAGS.eval_train:
    x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.input_text_file,
                                                      FLAGS.input_label_file,
                                                      num_labels)
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# Get embedding vector x_test
sentences, max_document_length = data_helpers.padding_sentences(
    x_raw, '<PADDING>', padding_sentence_length=max_document_length)
x_test = np.array(
    word2vec_helpers.embedding_sentences(
        sentences, file_to_load=trained_word2vec_model_file))
print("x_test.shape = {}".format(x_test.shape))

# Evaluation
# ==================================================
print("\nEvaluating...\n")
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Load the saved meta graph and restore variables
import os

import numpy as np
import tensorflow as tf

import data_helpers
import word2vec_helpers

tf.flags.DEFINE_string("test_file", "./data/data_test.csv",
                       "Data source for the mid data.")
FLAGS = tf.flags.FLAGS

print("Loading data...")
x_text, y = data_helpers.load_test_files(FLAGS.test_file)
sentences, max_document_length = data_helpers.padding_sentences(
    x_text, '<PADDING>')
x = np.array(
    word2vec_helpers.embedding_sentences(
        sentences,
        embedding_size=128,
        file_to_save=os.path.join('data/', 'trained_word2vec.model')))
print("x.shape = {}".format(x.shape))
print("y.shape = {}".format(y.shape))

# data = pd.read_csv('data/data_test.csv', encoding='utf-8')
# test = pd.DataFrame(data)
# W = tf.Variable(np.arange(6).reshape((2, 3)), dtype=tf.float32, name="weights")
# b = tf.Variable(np.arange(3).reshape((1, 3)), dtype=tf.float32, name="biases")
# saver = tf.train.Saver()

with tf.Session() as sess:
    saver = tf.train.import_meta_graph(
        '/home/liqian/liqian/NLP/runs/1525330118/checkpoints/model-1100.meta')
    saver.restore(
        sess,
def validate_method(x_raw, y_test, max_document_length):
    # Get embedding vector x_test
    sentences, max_document_length = data_helpers.padding_sentences(
        x_raw, '<PADDING>', padding_sentence_length=max_document_length)
    x_test = np.array(
        word2vec_helpers.embedding_sentences(
            sentences, file_to_load=trained_word2vec_model_file))
    print("x_test.shape = {}".format(x_test.shape))

    # Evaluation
    # ==================================================
    print("\nEvaluating...\n")
    checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]

            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name(
                "output/predictions").outputs[0]

            # Generate batches for one epoch
            batches = data_helpers.batch_iter(list(x_test), FLAGS.batch_size, 1,
                                              shuffle=False)

            # Collect the predictions here
            all_predictions = []
            for x_test_batch in batches:
                batch_predictions = sess.run(predictions, {
                    input_x: x_test_batch,
                    dropout_keep_prob: 1.0
                })
                all_predictions = np.concatenate(
                    [all_predictions, batch_predictions])

    # Print accuracy if y_test is defined
    if y_test is not None:
        correct_predictions = float(sum(all_predictions == y_test))
        print("Total number of test examples: {}".format(len(y_test)))
        print("Accuracy: {:g}".format(correct_predictions / float(len(y_test))))

    # Save the evaluation to a csv
    predictions_human_readable = np.column_stack(
        (np.array([text.encode('utf-8') for text in x_raw]), all_predictions))
    out_path = os.path.join(FLAGS.checkpoint_dir, "..", "prediction.csv")
    print("Saving evaluation to {0}".format(out_path))
    with open(out_path, 'a+') as f:
        csv.writer(f).writerows(predictions_human_readable)
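# The evaluation loops above iterate over batches from a batch_iter helper that
# is not defined in any of these snippets. A minimal generator sketch matching
# the call sites (data, batch size, number of epochs, shuffle flag); this is an
# assumption, the real helper may differ:
import numpy as np

def batch_iter(data, batch_size, num_epochs, shuffle=True):
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
    for _ in range(num_epochs):
        # Optionally reshuffle the data at the start of each epoch
        shuffled = data[np.random.permutation(data_size)] if shuffle else data
        for batch_num in range(num_batches_per_epoch):
            start = batch_num * batch_size
            end = min((batch_num + 1) * batch_size, data_size)
            yield shuffled[start:end]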
timestamp = str(int(time.time()))
out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
print("Writing to {}\n".format(out_dir))
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

# Data preprocess
# =======================================================
# Load data
print("Loading data...")
x_text, y = data_helpers.load_positive_negative_data_files(
    FLAGS.positive_data_file, FLAGS.negative_data_file)

# Get embedding vector
sentences, max_document_length = data_helpers.padding_sentences(x_text, '<PADDING>')
x = np.array(
    word2vec_helpers.embedding_sentences(
        sentences,
        embedding_size=FLAGS.embedding_dim,
        file_to_save=os.path.join(out_dir, 'trained_word2vec.model')))
print("x.shape = {}".format(x.shape))
print("y.shape = {}".format(y.shape))

# Save params
training_params_file = os.path.join(out_dir, 'training_params.pickle')
params = {'num_labels': FLAGS.num_labels, 'max_document_length': max_document_length}
data_helpers.saveDict(params, training_params_file)

# Shuffle data randomly
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# Data preprocess
# =======================================================
# Load data
print("Loading data...")
x_text, y = data_helpers.load_positive_negative_data_files(
    FLAGS.positive_data_file, FLAGS.negative_data_file)

# Get embedding vector
sentences, max_document_length = data_helpers.padding_sentences(x_text, '<PADDING>')
x = np.array(
    word2vec_helpers.embedding_sentences(
        sentences,
        embedding_size=FLAGS.embedding_dim,
        file_to_save=os.path.join(base_path, 'model/trained_word2vec.model')))
print("x.shape = {}".format(x.shape))
print("y.shape = {}".format(y.shape))

# Save params
training_params_file = os.path.join(base_path, 'training_params.pickle')
params = {
    'num_labels': FLAGS.num_labels,
    'max_document_length': max_document_length
}
data_helpers.saveDict(params, training_params_file)

# Shuffle data randomly
np.random.seed(10)
# Get embedding vector x_test
sentences, max_document_length = new_data_helper.padding_sentences(
    x_raw, '.', padding_sentence_len=max_document_length)
# print(sentences)
# If the test file is too large, building x_test in one go can fail (memory),
# so split the test set into chunks of 100 and evaluate chunk by chunk.
# Collect the predictions here
all_predictions = []
print(len(sentences))
for i in range(len(sentences) // 100):  # integer division for Python 3
    print(i)
    print(len(sentences) // 100)
    print(sentences[i * 100:(i + 1) * 100])
    x_test = np.array(
        word2vec_helpers.embedding_sentences(
            sentences[i * 100:(i + 1) * 100],
            file_to_load=trained_word2vec_model_file))
    # print(x_test)
    print("x_test.shape = {}".format(x_test.shape))

    # Evaluation
    # ==================================================
    print("\nEvaluating...\n")
    checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
# Data preprocess
# =======================================================
# Load data
print("Loading data...")
x_text, y = data_helpers.load_positive_negative_data_files(
    FLAGS.positive_data_file, FLAGS.negative_data_file)

# Get embedding vector
sentences, max_document_length = data_helpers.padding_sentences(x_text, '<PADDING>')
x = np.array(
    word2vec_helpers.embedding_sentences(
        sentences,
        embedding_size=FLAGS.embedding_dim,
        file_to_load="/Users/jiangqy/Code/model/wiki.zh.text.model",
        file_to_save=os.path.join(out_dir, 'trained_word2vec.model')))
print("x.shape = {}".format(x.shape))
print("y.shape = {}".format(y.shape))

# Save params
training_params_file = os.path.join(out_dir, 'training_params.pickle')
params = {
    'num_labels': FLAGS.num_labels,
    'max_document_length': max_document_length
}
data_helpers.saveDict(params, training_params_file)

# Shuffle data randomly
np.random.seed(10)
# Load data
print("Loading data...")
positive_data_file = os.path.join('.', args.positive_data_file)
negative_data_file = os.path.join('.', args.negative_data_file)
# print(positive_data_file)
x_text, y = data_deal.load_positive_negative_data_files(
    positive_data_file, negative_data_file)
print(x_text)

sentences, max_document_length = data_deal.padding_sentences(x_text, '<PADDING>')
x = np.array(
    word2vec_helpers.embedding_sentences(
        sentences,
        embedding_size=args.embedding_size,
        file_to_save=os.path.join(out_dir, 'trained_word2vec.model')))
print("x.shape = {}".format(x.shape))
print("y.shape = {}".format(y.shape))

# Save params
training_params_file = os.path.join(out_dir, 'training_params.pickle')
params = {
    'num_classes': args.num_classes,
    'max_document_length': max_document_length
}
data_deal.saveDict(params, training_params_file)

# Shuffle data randomly
    y_test = None
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

label2str = data_helper.label2str('input_data/')

# Get embedding vector x_test
print('Padding sentence...')
sentences, max_document_length = data_helper.padding_sentences(
    x_raw, '<PADDING>', padding_sentence_length=max_document_length)
print('sentences length : %d , max_document_length : %d' %
      (len(sentences), max_document_length))
sentences, new_x_raw = eval_helper.check_padding_sentences(sentences, x_raw)
all_vectors = word2vec_helpers.embedding_sentences(
    sentences, embedding_size=128, file_to_load=trained_word2vec_model_file)
print('all_vectors length: %d' % len(all_vectors[0]))
x_test = np.array(all_vectors)
print("x_test.shape = {}".format(x_test.shape))
print('x_test_shape: ', x_test.shape, " ", len(x_test), " ", len(x_test[0]),
      " ", len(x_test[0][0]))
print('list x_test ', len(list(x_test)))

# Evaluation
# ==================================================
print("\nEvaluating...\n")
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# Map data into vocabulary
"""
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))
"""
sentences, max_document_length = data_helpers.padding_sentences(x_raw, '<PADDING>')
x_test = np.array(
    word2vec_helpers.embedding_sentences(
        sentences,
        embedding_size=FLAGS.embedding_dim,
        file_to_save=os.path.join(FLAGS.checkpoint_dir, 'trained_word2vec.model')))

# Add Gaussian noise to the embeddings, drawn separately for each class
data_sigma = data_scale(eps)
pos_noise = np.random.normal(0, data_sigma,
                             [pos_len, x_test.shape[1], x_test.shape[2]])
neg_noise = np.random.normal(0, data_sigma,
                             [neg_len, x_test.shape[1], x_test.shape[2]])
noise = np.concatenate([pos_noise, neg_noise], 0)
x_test = x_test + noise

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
# print("") # 2. 加载数据、词典及模型 # 2.1 加载数据 if FLAGS.eval_train: x_raw, y_test = load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file) y_test = np.argmax(y_test, axis=1) else: x_raw = ["a masterpiece four years in the making", "everything is off."] y_test = [0, 1] # 文本进行向量化 sentences, max_document_length = padding_sentences(x_raw, '<PAD>', 112) x_test = np.array(embedding_sentences(sentences, word2vec_path)) y_test = np.array(list(y_test)) print("x.shape = {}".format(x_test.shape)) print("y.shape = {}".format(y_test.shape)) # 3 加载模型及预测 checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir) print("latest checkpoint: %s" % checkpoint_file) graph = tf.Graph() with graph.as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement)
# Data preprocess
# =======================================================
# Load data
print("Loading data...")
x_text, y = data_helpers.load_positive_negative_data_files(FLAGS)

# Get embedding vector
sentences, max_document_length = data_helpers.padding_sentences(
    x_text,
    '<PADDING>',
    word_segment=FLAGS.word_segment,
    padding_sentence_length=FLAGS.max_document_len)
if not os.path.exists(_w2v_path):
    _, w2vModel = word2vec_helpers.embedding_sentences(
        sentences=sentences,
        embedding_size=FLAGS.embedding_dim,
        file_to_save=_w2v_path)
else:
    _, w2vModel = word2vec_helpers.embedding_sentences(
        sentences=None,
        embedding_size=FLAGS.embedding_dim,
        file_to_load=_w2v_path)

FLAGS.embedding_dim = w2vModel.vector_size
print('wordembedding.dim = {}'.format(FLAGS.embedding_dim))
print('wordembedding.length = {}'.format(len(w2vModel.wv.vocab)))

x = np.array(sentences)
print("x.shape = {}".format(x.shape))
print("y.shape = {}".format(y.shape))

# Save params
# x_text is a 2-D list: the first dimension runs over sentences, and each
# element is itself the list of words in that sentence, e.g.:
# ['全国', '少年儿童', '游泳', '锦标赛', '开幕', '新华社', '广州', '月', '日电', '记者', '何惠飞',
#  '年', '喜乐', '杯', '全国', '少年儿童', '游泳', '锦标赛', '昨天', '在', '游泳', '之', '乡',
#  '广东省', '东莞市', '开幕', '参加', '这次', '比赛', '的', '有', '个', '省', '自治区', '直辖市',
#  '的', '名', '男女', '选手', '比赛', '分为', '岁', '组和岁', '以下', '组', '参赛者', '都', '是',
#  '近几年', '涌现', '的', '优秀', '小', '选手', '不少', '是', '本', '年龄组', '的', '全国纪录',
#  '创造者', '这次', '比赛', '是', '对', '我国', '参加', '下', '两届', '奥运会', '游泳赛',
#  '后备力量', '的', '一次', '检阅', '国家体委', '将', '通过', '这次', '比赛', '选拔', '优秀',
#  '选手', '组队参加', '今年', '月', '在', '印度尼西亚', '举行', '的', '亚太区', '年龄组',
#  '游泳', '比赛', '比赛', '将', '于', '日', '结束', '完']
x_text, y = data_helpers.load_data_files(
    FLAGS.sports_file, FLAGS.amusement_file, FLAGS.home_file, FLAGS.estate_file,
    FLAGS.education_file, FLAGS.fashion_file, FLAGS.politics_file,
    FLAGS.game_file, FLAGS.technology_file, FLAGS.finance_file)

# Get embedding vector
sentences, max_document_length = data_helpers.padding_sentences(
    x_text, 'PADDING', FLAGS.max_seq_length)
# sentences is again a 2-D list of word lists, but now every sentence has the
# same length because shorter ones are padded with 'PADDING' up to the maximum.

# Convert the returned list to an array
x_embedding = word2vec_helpers.embedding_sentences(
    sentences,
    embedding_size=FLAGS.embedding_dim,
    ext_emb_path=FLAGS.word_embedding_file)
x = np.array(x_embedding)
# The three dimensions of x are: total number of sentences, words per sentence
# (counting the longest, padded length), and the embedding dimension.
print("x.shape =", x.shape)
print("y.shape =", y.shape)

# Save params
training_params_file = 'train/training_params.pickle'
params = {
    'num_labels': FLAGS.num_labels,
    'max_document_length': max_document_length
}
data_helpers.saveDict(params, training_params_file)

# Shuffle the data
np.random.seed(10)
new_x_text = []
new_y = []
for i in range(3000):
    # randint is inclusive on both ends, so cap at len(x_text) - 1 to avoid an IndexError
    rand_idx = random.randint(0, len(x_text) - 1)
    # rand_y = random.randint(0, len(x_text))
    new_x_text.append(x_text[rand_idx])
    new_y.append(y[rand_idx])
print("new_x_text length: %d" % len(new_x_text))
print("new_y length: %d" % len(new_y))

# Embedding vector
print("Padding sentences...")
sentences, max_document_length = data_helper.padding_sentences(new_x_text, '<PADDING>')
# max_document_length =
print("embedding_sentences...")
all_vectors = word2vec_helpers.embedding_sentences(
    sentences,
    embedding_size=FLAGS.embedding_dim,
    file_to_save=os.path.join(out_dir, 'trained_word2vec.model'))
print("all_vectors length %d * %d * %d : " %
      (len(all_vectors), len(all_vectors[0]), len(all_vectors[0][0])))
# x = np.array(all_vectors)  # this operation could lead to memory error!!!
# TODO: transform large vectors into sparse matrix
x = np.asarray(all_vectors)
y = np.asarray(new_y)
print("x.shape = {}".format(x.shape))
print("y.shape = {}".format(y.shape))

# Save params
training_params_file = os.path.join(out_dir, 'training_params.pickle')
params = {'num_labels': FLAGS.num_labels, 'max_document_length': max_document_length}
data_helper.saveDict(params, training_params_file)

# Shuffle data randomly
import numpy as np

import word2vec_helpers
import data_helper
from text_lstm import TextLSTM

# Load data
print("Loading data...")
x_text, y = data_helper.load_positive_negative_data_files(
    'bingyin.txt', 'zhenduan.txt', 'zhiliao.txt', 'zhengzhuang.txt')

# Get embedding vector
embedding_dim = 300
sentences, max_document_length = data_helper.padding_sentences(x_text, '<PADDING>')
x = np.array(
    word2vec_helpers.embedding_sentences(
        sentences,
        embedding_size=embedding_dim,
        file_to_save='trained_word2vec.model'))
print("x.shape = {}".format(x.shape))
print("y.shape = {}".format(y.shape))

# Shuffle data randomly
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
dev_sample_index = -1 * int(0.1 * float(len(y)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
# Load data
if FLAGS.eval_train:
    x_raw, y_test = data_helpers.load_data_and_labels(
        FLAGS.input_text_file, FLAGS.input_label_file, num_labels)
else:
    x_raw = [
        "a masterpiece four years in the making", "everything is off."
    ]
    y_test = [1, 0]

# Get embedding vector x_test
sentences, max_document_length = data_helpers.padding_sentences(
    x_raw, '<PADDING>', padding_sentence_length=max_document_length)
x_test = np.array(
    word2vec_helpers.embedding_sentences(
        sentences, file_to_load=trained_word2vec_model_file))
print("x_test.shape = {}".format(x_test.shape))

# Evaluation
# ==================================================
print("\nEvaluating...\n")
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Load the saved meta graph and restore variables
        saver = tf.train.import_meta_graph(
print("Writing to {}\n".format(out_dir)) if not os.path.exists(out_dir): os.makedirs(out_dir) # Data preprocess # ======================================================= # Load data print("Loading data...") #x_text, y = data_helpers.load_positive_negative_data_files(FLAGS.positive_data_file, FLAGS.negative_data_file) x_text, y = data_helpers.load_positive_negative_data_files1() # Get embedding vector sentences, max_document_length = data_helpers.padding_sentences( x_text, '<PADDING>') x = np.asanyarray((word2vec_helpers.embedding_sentences( sentences, embedding_size=FLAGS.embedding_dim, file_to_save=os.path.join(out_dir, 'trained_word2vec.model')))) #print(x) print("x.shape = {}".format(x.shape)) print("y.shape = {}".format(y.shape)) # Save params training_params_file = os.path.join(out_dir, 'training_params.pickle') params = { 'num_labels': FLAGS.num_labels, 'max_document_length': max_document_length } data_helpers.saveDict(params, training_params_file) # Shuffle data randomly #np.random.seed(10)
def predict(filename, input_file, max_document_length):
    # Load data
    if FLAGS.eval_train:
        x_raw, y_test = data_process.load_data_and_labels(
            input_file, FLAGS.input_label_file, num_labels)
    else:
        x_raw = [
            "a masterpiece four years in the making", "everything is off."
        ]
        y_test = [1, 0]

    # Get embedding vector x_test
    if len(x_raw) == 0:
        return
    sentences, max_document_length = data_process.padding_sentences(
        x_raw, '补', padding_sentence_length=max_document_length)
    # Known issue: data_process.padding_sentences sometimes returns sentences one
    # token longer than max_document_length (all of them over-long sentences that
    # were clipped). The cause is unclear, so re-clip here as a workaround.
    sentences = [sentence[:max_document_length] for sentence in sentences]
    x_test = np.array(
        word2vec_helpers.embedding_sentences(
            sentences, file_to_load=trained_word2vec_model_file))
    print(len(x_test))
    print("x_test.shape = {}".format(x_test.shape))

    # Evaluation
    # ==================================================
    print("\nEvaluating...\n")
    checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            # input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]

            # Tensors we want to evaluate
            predictions = graph.get_operation_by_name(
                "output/predictions").outputs[0]

            # Generate batches for one epoch
            batches = data_process.batch_iter(list(x_test), FLAGS.batch_size, 1,
                                              shuffle=False)

            # Collect the predictions here
            all_predictions = []
            for x_test_batch in batches:
                batch_predictions = sess.run(predictions, {
                    input_x: x_test_batch,
                    dropout_keep_prob: 1.0
                })
                all_predictions = np.concatenate(
                    [all_predictions, batch_predictions])

    # Print accuracy if y_test is defined
    if y_test is not None:
        correct_predictions = float(sum(all_predictions == y_test))
        print("Total number of test examples: {}".format(len(y_test)))
        print("Accuracy: {:g}".format(correct_predictions / float(len(y_test))))

    # Save the evaluation to a csv
    # predictions_human_readable = np.column_stack((np.array([text.encode('utf-8') for text in x_raw]), all_predictions))
    predictions_human_readable = np.column_stack(
        (np.array(x_raw), all_predictions))
    out_path = os.path.join(FLAGS.checkpoint_dir, "..", filename)
    print("Saving evaluation to {0}".format(out_path))
    with open(out_path, 'w') as f:
        csv.writer(f).writerows(predictions_human_readable)
def preprocess():
    # Data Preparation
    # ==================================================
    global out_dir
    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
    print("Writing to {}\n".format(out_dir))
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # Load data
    print("Loading data...")
    x_text, y, pos_len, neg_len = data_helpers.noisy_load_data_and_labels(
        FLAGS.positive_data_file, FLAGS.negative_data_file)
    data_size = len(x_text)
    """
    # Build vocabulary
    max_document_length = max([len(x.split(" ")) for x in x_text])
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_text)))
    """
    # Get embedding vector
    sentences, max_document_length = data_helpers.padding_sentences(
        x_text, '<PADDING>')
    x = np.array(
        word2vec_helpers.embedding_sentences(
            sentences,
            embedding_size=FLAGS.embedding_dim,
            file_to_save=os.path.join(out_dir, 'trained_word2vec.model')))
    # x = tf.cast(x, tf.float32)
    # vectors = word2vec_helpers.embedding_sentences([['first', 'sentence'], ['second', 'sentence']], embedding_size=4, min_count=1)
    print(x[0].shape)
    # y = np.reshape(y, (-1, 1))
    print("x.shape = {}".format(x.shape))
    print("y.shape = {}".format(y.shape))

    # Adding noise according to the different classes
    data_sigma = data_scale()
    global gradient_sigma
    gradient_sigma = gradient_scale(data_size)
    pos_noise = np.random.normal(0, data_sigma,
                                 [pos_len, x.shape[1], x.shape[2]])
    neg_noise = np.random.normal(0, data_sigma,
                                 [neg_len, x.shape[1], x.shape[2]])
    noise = np.concatenate([pos_noise, neg_noise], 0)
    x = x + noise

    # Save params
    """
    training_params_file = os.path.join(out_dir, 'training_params.pickle')
    params = {'num_labels': FLAGS.num_labels, 'max_document_length': max_document_length}
    data_helpers.saveDict(params, training_params_file)
    """

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    print(shuffle_indices)
    # x_shuffle_indices = [[index] for index in shuffle_indices]
    print("the shape of x:{}".format(x.shape[0]))
    print("indices shape:{}".format(shuffle_indices))
    """
    x_shuffled = tf.gather_nd(x, x_shuffle_indices, name=None)
    """
    x_shuffled = x[shuffle_indices]
    # x_shuffled = x[x_shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
    print("shape of x:{}".format(x_train.shape))
    print("shape of y:{}".format(y_train.shape))
    del x, y, x_shuffled, y_shuffled
    """
    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    """
    # return x_train, y_train, vocab_processor, x_dev, y_dev
    return x_train, y_train, x_dev, y_dev
x_text, y = chinese_data.load_positive_negative_data_files(
    FLAGS.positive_data_file, FLAGS.negative_data_file)

# Prepare output directory for models and summaries
timestamp = str(int(time.time()))
out_dir = os.path.abspath(os.path.join(os.path.curdir, 'runs', timestamp))
if not os.path.exists(out_dir):
    os.mkdir(out_dir)

# Get embedding vector
sentences, max_document_length = chinese_data.padding_sentences(x_text, '<PADDING>')
x = np.array(
    word2vec_helpers.embedding_sentences(
        sentences,
        embedding_size=FLAGS.embedding_dim,
        file_to_load=FLAGS.chinese_word2vec_model,
        file_to_save=os.path.join(out_dir, 'trained_word2vec.model')))

# Shuffle data randomly
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
print(x_train.shape)
# Load params
params = data_helpers.loadDict(training_params_file)
window_size = int(params['window_size'])
print("params = {}".format(params))

# Load data
generated_text = []
# seed_text = FLAGS.seed_text.strip()
seed_text = u'白 玉 京'
x_text_current = [] if len(seed_text) == 0 else seed_text.split(' ')
generated_text.extend(x_text_current)
x_text_current = [data_helpers.sentence_start_padding(x_text_current, window_size)]

# Get embedding vector x_test
x_current = np.array(
    word2vec_helpers.embedding_sentences(
        x_text_current, file_to_load=trained_word2vec_model_file))
print("x_current.shape = {}".format(x_current.shape))
print("x_current = {}".format(x_current))

# Generation
# ==================================================
print("\nGenerating...\n")
checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
graph = tf.Graph()
with graph.as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Load the saved meta graph and restore variables