def predict():
    """Split reviews by vertical (美食/丽人/酒店/休娱/购物 = food/beauty/hotel/
    entertainment/shopping) and score each group with its own model."""
    predict_base_path = "../data/review_relation/predict/content_relation.csv"
    food_reviews = []
    liren_reviews = []
    jiudian_reviews = []
    yule_reviews = []
    gouwu_reviews = []
    with tf.gfile.GFile(predict_base_path, 'r') as reader:
        for line in reader:
            fields = line.strip().split('\t')
            if len(fields) < 3:  # skip malformed lines instead of crashing
                continue
            if fields[2] == "美食":
                food_reviews.append(line)
            elif fields[2] == "丽人":
                liren_reviews.append(line)
            elif fields[2] == "酒店":
                jiudian_reviews.append(line)
            elif fields[2] == "休娱":
                yule_reviews.append(line)
            else:
                gouwu_reviews.append(line)
    words_path = "../data/review_relation/bert_words.csv"
    word_index = load_vocab_ids(words_path, sep='\t')
    model_base_path = "../data/review_relation/version2/"
    predict_result_base_path = "../data/review_relation/predict_result/"
    model_predict(os.path.join(model_base_path, 'food2.h5'), food_reviews,
                  os.path.join(predict_result_base_path, 'food_predict_result2.csv'),
                  word_index)
    model_predict(os.path.join(model_base_path, 'jiudian.h5'), jiudian_reviews,
                  os.path.join(predict_result_base_path, 'jiudian_predict_result2.csv'),
                  word_index)
    model_predict(os.path.join(model_base_path, 'liren.h5'), liren_reviews,
                  os.path.join(predict_result_base_path, 'liren_predict_result2.csv'),
                  word_index)
    model_predict(os.path.join(model_base_path, 'yule.h5'), yule_reviews,
                  os.path.join(predict_result_base_path, 'yule_predict_result2.csv'),
                  word_index)
    model_predict(os.path.join(model_base_path, 'gouwu.h5'), gouwu_reviews,
                  os.path.join(predict_result_base_path, 'gouwu_predict_result2.csv'),
                  word_index)
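# `model_predict` is called above but not defined in this file. A minimal
# sketch of what it presumably does, assuming a Keras .h5 classifier over
# padded character-id sequences; the column holding the review body, the
# maxlen, and the output format are assumptions, not confirmed here.
def model_predict(model_path, reviews, result_path, word_index, maxlen=50):
    # Assumes the review body sits in the second tab-separated column.
    texts = [line.strip().split('\t')[1] for line in reviews]
    ids = [text_to_sequence(t.decode('utf-8'), word_index) for t in texts]
    ids = tf.keras.preprocessing.sequence.pad_sequences(
        ids, value=word_index.get('pad', 0), padding='post', maxlen=maxlen)
    model = tf.keras.models.load_model(model_path)
    labels = np.argmax(model.predict(ids), axis=1)
    with tf.gfile.GFile(result_path, 'w') as writer:
        for line, label in zip(reviews, labels):
            writer.write("%s\t%d\n" % (line.strip(), label))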
def predict():
    word_index = load_vocab_ids(FLAGS.word_path)
    data = pd.read_csv(FLAGS.predict_path, header=None)
    texts = data.values[:, 0]
    text_id = [text_to_sequence(text.decode('utf-8'), word_index) for text in texts]
    text_id = tf.keras.preprocessing.sequence.pad_sequences(
        text_id, value=word_index['pad'], padding='post', maxlen=15)
    print text_id[:10]
    # Dummy labels: the Estimator input_fn expects (features, labels) pairs
    # even at prediction time.
    label = np.zeros((len(texts), 2)).astype(np.int64)
    # Model hyper-parameters, required by BilstmModel below.
    config = BaseConfig.from_json_file(FLAGS.model_config_path).to_dict()
    model = BilstmModel(config)
    model = model.create_model()
    model.compile(optimizer=tf.train.AdamOptimizer(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    model = tf.keras.estimator.model_to_estimator(keras_model=model)
    inference = model.predict(input_fn=lambda: predict_input_fn(text_id, label),
                              checkpoint_path=FLAGS.model_path)
    # 'dense' is the name of the model's output layer.
    result = [0 if ele['dense'][0] > 0.5 else 1 for ele in inference]
    with tf.gfile.GFile("/tmp/1.csv", 'w') as writer:
        for i in xrange(len(result)):
            writer.write("%s\t%d\n" % (texts[i], result[i]))
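# `predict_input_fn` is referenced above but not defined here. A minimal
# sketch, assuming it wraps the padded id matrix and dummy labels in a
# tf.data pipeline keyed by the Keras input layer's name ('input_1' is an
# assumption; it must match model.input_names):
def predict_input_fn(text_id, label, batch_size=100):
    dataset = tf.data.Dataset.from_tensor_slices(({'input_1': text_id}, label))
    return dataset.batch(batch_size)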
def predict(text):
    words_path = '../data/review_relation/bert_words.csv'
    words = load_vocab_ids(words_path, sep='\t')
    word_ids = text_to_ids(text, words)
    signature_key = "xiaoxiang"
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    with tf.Session(graph=tf.Graph(), config=sess_config) as sess:
        meta_graph_def = tf.saved_model.loader.load(
            sess, [tf.saved_model.tag_constants.SERVING], "/tmp/saved_model/1")
        # Resolve input/output tensors from the signature instead of
        # hard-coding graph tensor names; avoid shadowing the `text` argument.
        signature = meta_graph_def.signature_def
        input_name = signature[signature_key].inputs["text"].name
        output_name = signature[signature_key].outputs["label"].name
        input_tensor = sess.graph.get_tensor_by_name(input_name)
        output_tensor = sess.graph.get_tensor_by_name(output_name)
        print(sess.run([output_tensor], feed_dict={input_tensor: [word_ids]}))
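# `text_to_ids` is used above (and in the training code below) but not
# defined in this file. A minimal sketch, assuming a character-level lookup
# against the BERT vocab with '[UNK]'/'[PAD]' fallbacks and a fixed length;
# the token granularity and max_len=50 are assumptions.
def text_to_ids(text, words, max_len=50):
    chars = text.decode('utf-8')[:max_len]  # assumes a byte-string input
    ids = [words.get(ch.encode('utf-8'), words.get('[UNK]'.encode('utf-8'), 0))
           for ch in chars]
    ids += [words.get('[PAD]'.encode('utf-8'), 0)] * (max_len - len(ids))
    return ids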
def predict2():
    predict_base_path = "../data/review_relation/predict/content_relation.csv"
    reviews = []
    contentids = []
    review_bodies = []
    with tf.gfile.GFile(predict_base_path, 'r') as reader:
        for line in reader:
            fields = line.strip().split("\t")
            if len(fields) != 2:
                continue
            contentids.append(fields[0])
            review_bodies.append(fields[1])
            reviews.append(line.strip())
    words_path = "../data/review_relation/bert_words.csv"
    word_index = load_vocab_ids(words_path, sep='\t')
    model_base_path = "../data/review_relation/version2/"
    predict_result_base_path = "../data/review_relation/content_pic_relation/content_relation_res.csv"
    # Score every review with each vertical's model.
    food_label = model_predict2(os.path.join(model_base_path, 'food2.h5'), reviews, word_index)
    jiudian_label = model_predict2(os.path.join(model_base_path, 'jiudian.h5'), reviews, word_index)
    liren_label = model_predict2(os.path.join(model_base_path, 'liren.h5'), reviews, word_index)
    yule_label = model_predict2(os.path.join(model_base_path, 'yule.h5'), reviews, word_index)
    gouwu_label = model_predict2(os.path.join(model_base_path, 'gouwu.h5'), reviews, word_index)
    with tf.gfile.GFile(predict_result_base_path, 'w') as writer:
        writer.write("contentid\tfood\tjiudian\tliren\tyule\tgouwu\tcontentbody\n")
        for i in xrange(len(contentids)):
            # Write the review body rather than the whole input line, so the
            # contentid column is not duplicated inside contentbody.
            writer.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" %
                         (contentids[i], str(food_label[i]), str(jiudian_label[i]),
                          str(liren_label[i]), str(yule_label[i]),
                          str(gouwu_label[i]), review_bodies[i]))
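# `model_predict2` is called above but not defined in this file. A minimal
# sketch, assuming it works like `model_predict` but returns one hard label
# per input line instead of writing a result file (column layout and maxlen
# are assumptions):
def model_predict2(model_path, reviews, word_index, maxlen=50):
    texts = [line.split('\t')[-1] for line in reviews]
    ids = [text_to_sequence(t.decode('utf-8'), word_index) for t in texts]
    ids = tf.keras.preprocessing.sequence.pad_sequences(
        ids, value=word_index.get('pad', 0), padding='post', maxlen=maxlen)
    model = tf.keras.models.load_model(model_path)
    return np.argmax(model.predict(ids), axis=1)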
def input_fn(path, vocab, config, batch_size):
    dataset = tf.data.TextLineDataset(path)
    dataset = dataset.map(lambda line: parse_line(line, vocab, config),
                          num_parallel_calls=4)
    return dataset.batch(batch_size=batch_size)


if __name__ == "__main__":
    config = BaseConfig.from_json_file(
        "../data/dish/dish_similarity/config.json").to_dict()
    path = "/tmp/valid.csv"
    word_path = "../data/dish/dish_similarity/words.csv"
    word_index = load_vocab_ids(word_path)
    signature_key = 'predict_label'
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    with tf.Session(graph=tf.Graph(), config=sess_config) as sess:
        dataset = input_fn(path, word_index, config, 10)
        iterator = dataset.make_one_shot_iterator()
        next_element = iterator.get_next()
        meta_graph_def = tf.saved_model.loader.load(
            sess, [tf.saved_model.tag_constants.SERVING],
            "../data/dish/dish_similarity/saved_model/4")
        signature = meta_graph_def.signature_def
        text_a = signature[signature_key].inputs["text_a"].name
        text_b = signature[signature_key].inputs["text_b"].name
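# `parse_line` (used by input_fn above) is not defined in this file. A
# minimal sketch, assuming each line is "text_a\ttext_b" and reusing the
# Python-side `text_to_ids` helper via tf.py_func; the field layout, the
# returned dict keys, and the fixed output length are all assumptions.
def parse_line(line, vocab, config):
    max_len = config['max_sequence_length']

    def _to_ids(raw):
        fields = raw.strip().split('\t')
        a = np.array(text_to_ids(fields[0], vocab), dtype=np.int64)
        b = np.array(text_to_ids(fields[1], vocab), dtype=np.int64)
        return a, b

    text_a, text_b = tf.py_func(_to_ids, [line], [tf.int64, tf.int64])
    text_a.set_shape([max_len])  # assumes text_to_ids pads to max_len
    text_b.set_shape([max_len])
    return {'text_a': text_a, 'text_b': text_b}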
def process_data(path, word_index, max_len):
    # The start of this function (reading `path` and building the raw
    # feature/tag id lists) is missing from this file; the surviving fragment
    # resumes inside the feature-padding call, reconstructed here to mirror
    # the tag-padding call below.
    feature_list = tf.keras.preprocessing.sequence.pad_sequences(
        feature_list, value=word_index.get('[PAD]'.encode('utf-8')),
        padding='pre', maxlen=max_len)
    tag_list = tf.keras.preprocessing.sequence.pad_sequences(
        tag_list, value=word_index.get('[PAD]'.encode('utf-8')),
        padding='pre', maxlen=max_len)
    return feature_list, tag_list


if __name__ == '__main__':
    # Process the training data
    words_path = "/Users/lionel/Desktop/data/review_relation/bert_words.csv"
    word_index = load_vocab_ids(words_path, sep='\t')
    path = '/Users/lionel/Downloads/review_dish.csv'
    config = {
        'batch_size': 100,
        'max_length': 100,
        'vocab_size': 21128,
        'embedding_size': 100,
        'units': 100,
        'num_tags': 3
    }
    feature_list, tag_list = process_data(path, word_index, config['max_length'])
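    # A hedged sketch of how the padded features/tags might be batched for
    # the tagging model implied by config['num_tags']; this pipeline is an
    # assumption, since the original training loop is not shown in this file.
    dataset = tf.data.Dataset.from_tensor_slices((feature_list, tag_list))
    dataset = dataset.shuffle(1000).batch(config['batch_size'])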
def train():
    train_path = '../train.csv'
    valid_path = '../valid.csv'
    words_path = '../data/review_relation/bert_words.csv'
    words = load_vocab_ids(words_path, sep='\t')
    train_text = []
    train_label = []
    with tf.gfile.GFile(train_path, 'r') as reader:
        for line in reader:
            fields = line.strip().split('\t')
            if len(fields) != 2:
                continue
            train_text.append(fields[0])
            train_label.append(int(fields[1]))
    valid_text = []
    valid_label = []
    with tf.gfile.GFile(valid_path, 'r') as reader:
        for line in reader:
            fields = line.strip().split('\t')
            if len(fields) != 2:
                continue
            valid_text.append(fields[0])
            valid_label.append(int(fields[1]))
    # numpy is imported at module level as np; to_categorical already
    # returns ndarrays, so only the id lists need converting.
    train_data = np.array([text_to_ids(ele, words) for ele in train_text])
    train_label = tf.keras.utils.to_categorical(train_label, 2)
    valid_data = np.array([text_to_ids(ele, words) for ele in valid_text])
    valid_label = tf.keras.utils.to_categorical(valid_label, 2)
    print train_data.shape
    config = {
        'max_sequence_length': 50,
        'vocab_size': 25000,
        'embedding_size': 200,
        'hidden_size': 100,
        'drop_out': 0.2,
        'num_classes': 2,
        'epoch': 10,
        'batch_size': 100,
        'model_path': '/tmp/100'
    }
    model = LstmModel(config).create_model()
    model.summary()
    model.compile(loss=tf.keras.losses.categorical_crossentropy,
                  optimizer=tf.keras.optimizers.Adam(lr=0.01),
                  metrics=['accuracy'])
    model.fit(x=train_data, y=train_label,
              epochs=config['epoch'],
              batch_size=config['batch_size'],
              validation_data=(valid_data, valid_label),
              verbose=1)
    model.save(config['model_path'])
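# `load_vocab_ids` is used throughout this file but not defined here. A
# minimal sketch, assuming one token per line with the (zero-based) line
# number as the id; the file layout and the meaning of `sep` (take the
# first column when the line has several) are assumptions.
def load_vocab_ids(path, sep=None):
    vocab = {}
    with tf.gfile.GFile(path, 'r') as reader:
        for i, line in enumerate(reader):
            token = line.strip().split(sep)[0] if sep else line.strip()
            vocab[token] = i
    return vocab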
def train(config):
    """Train the model.

    :param config: hyper-parameter configuration file
    :return: prediction results
    """
    word_index = load_vocab_ids(os.path.join(FLAGS.data_dir, 'words.csv'))
    if not tf.gfile.Exists(os.path.join(FLAGS.data_dir, 'model')):
        # MkDir takes the directory path itself, not the result of Exists().
        tf.gfile.MkDir(os.path.join(FLAGS.data_dir, 'model'))
    model_path = os.path.join(FLAGS.data_dir, 'model/%s' % FLAGS.model_name)
    train_examples = None
    valid_examples = None
    test_examples = None
    if FLAGS.do_train and FLAGS.do_valid:
        if FLAGS.input_format == 1:
            train_examples = OneInputDataProcessor().get_train_examples(FLAGS.data_dir)
            valid_examples = OneInputDataProcessor().get_valid_examples(FLAGS.data_dir)
        if FLAGS.input_format == 2:
            train_examples = TwoInputDataProcessor().get_train_examples(FLAGS.data_dir)
            valid_examples = TwoInputDataProcessor().get_valid_examples(FLAGS.data_dir)
        train_text_a, train_text_b, train_label_ids = features_labels_digitalize(
            train_examples, word_index, config['max_sequence_length'])
        valid_text_a, valid_text_b, valid_label_ids = features_labels_digitalize(
            valid_examples, word_index, config['max_sequence_length'])
        model = None
        if FLAGS.input_format == 2:
            model = text_similarity_model.BilstmModel(config, merge_mode='multiply')
        if FLAGS.input_format == 1:
            model = text_classification_model.BilstmModel(config)
        model = model.create_model()
        model.compile(optimizer=tf.keras.optimizers.Adam(lr=config['learning_rate']),
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
        start = time.time()
        if train_text_b is not None:
            model.fit(x=[train_text_a, train_text_b], y=train_label_ids,
                      epochs=config['epoch'],
                      batch_size=config['batch_size'],
                      validation_data=([valid_text_a, valid_text_b], valid_label_ids),
                      callbacks=[keras.callbacks.EarlyStopping(patience=2)])
        else:
            model.fit(x=train_text_a, y=train_label_ids,
                      epochs=config['epoch'],
                      batch_size=config['batch_size'],
                      validation_data=(valid_text_a, valid_label_ids),
                      callbacks=[keras.callbacks.EarlyStopping(patience=2)])
        end = time.time()
        tf.logging.info("Train time is %ds", end - start)
        model.save(model_path, overwrite=True)
    if FLAGS.do_test:
        if FLAGS.input_format == 1:
            test_examples = OneInputDataProcessor().get_test_examples(FLAGS.data_dir)
        if FLAGS.input_format == 2:
            test_examples = TwoInputDataProcessor().get_test_examples(FLAGS.data_dir)
        test_text_a, test_text_b, test_label_ids = features_labels_digitalize(
            test_examples, word_index, config['max_sequence_length'])
        model = keras.models.load_model(model_path)
        if FLAGS.do_statistic:
            if test_text_b is not None:
                result = model.predict([test_text_a, test_text_b])
            else:
                result = model.predict(test_text_a)
            print metrics.classification_report(
                np.argmax(test_label_ids, axis=1), np.argmax(result, axis=1))
        else:
            count = len(test_text_a)
            # Ceiling division avoids an empty trailing batch when count is
            # an exact multiple of the batch size.
            n = (count + config['batch_size'] - 1) // config['batch_size']
            with tf.gfile.GFile(os.path.join(FLAGS.data_dir, 'predict.csv'), 'w') as f:
                for i in xrange(n):
                    if (i + 1) * config['batch_size'] >= count:
                        x_a = test_text_a[i * config['batch_size']:]
                        x_b = test_text_b[i * config['batch_size']:]
                    else:
                        x_a = test_text_a[i * config['batch_size']:(i + 1) * config['batch_size']]
                        x_b = test_text_b[i * config['batch_size']:(i + 1) * config['batch_size']]
                    predictions = model.predict_on_batch([x_a, x_b])
                    result = np.argmax(predictions, axis=1)
                    for j in xrange(len(result)):
                        index = i * config['batch_size'] + j
                        f.write('%s\t%s\t%s\n' % (test_examples[index].text_a,
                                                  test_examples[index].text_b,
                                                  result[j]))
    if FLAGS.do_export:
        model = keras.models.load_model(model_path)
        # FLAGS.params lists the signature tensor names: inputs first,
        # then outputs.
        features = dict()
        x = model.input
        y = model.output
        args = FLAGS.params
        if isinstance(x, list):
            for i in range(len(x)):
                features[args[i]] = x[i]
        else:
            features[args[0]] = x
        labels = dict()
        if isinstance(y, list):
            for i in range(len(y)):
                # Offset by the number of inputs so each output gets its
                # own signature name.
                labels[args[len(features) + i]] = y[i]
        else:
            labels[args[len(features)]] = y
        sess = tf.keras.backend.get_session()
        prediction_signature = tf.saved_model.signature_def_utils.predict_signature_def(
            inputs=features, outputs=labels)
        valid_prediction_signature = tf.saved_model.signature_def_utils.is_valid_signature(
            prediction_signature)
        if not valid_prediction_signature:
            raise ValueError("Error: Prediction signature not valid!")
        saved_model_path = os.path.join(
            FLAGS.data_dir, 'saved_model/%d' % FLAGS.saved_model_version)
        if tf.gfile.Exists(saved_model_path):
            tf.gfile.DeleteRecursively(saved_model_path)
        builder = tf.saved_model.builder.SavedModelBuilder(saved_model_path)
        legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
        builder.add_meta_graph_and_variables(
            sess, [tf.saved_model.tag_constants.SERVING],
            signature_def_map={FLAGS.signature_name: prediction_signature},
            legacy_init_op=legacy_init_op)
        builder.save()
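# A hedged sanity check for the export branch above: reload the SavedModel
# and confirm the signature resolves. `check_export` is a hypothetical
# helper; the directory passed in is whatever FLAGS.saved_model_version
# produced (e.g. <data_dir>/saved_model/1).
def check_export(export_dir):
    with tf.Session(graph=tf.Graph()) as sess:
        meta = tf.saved_model.loader.load(
            sess, [tf.saved_model.tag_constants.SERVING], export_dir)
        print(meta.signature_def.keys())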