def main():
    print('loading data...')
    tokenizer = FullTokenizer(config.bert_vocab, do_lower_case=config.to_lower)
    pos_2_id, id_2_pos = read_dict(config.pos_dict)
    tag_2_id, id_2_tag = read_dict(config.tag_dict)
    config.num_pos = len(pos_2_id)
    config.num_tag = len(tag_2_id)

    data_reader = DataReader(config, tokenizer, pos_2_id, tag_2_id)
    input_file = args.input
    print('input file: {}'.format(input_file))
    input_data = data_reader.load_data_from_file(input_file)

    print('building model...')
    model = get_model(config, is_training=False)

    saver = tf.train.Saver(max_to_keep=1)
    with tf.Session(config=sess_config) as sess:
        if tf.train.latest_checkpoint(config.result_dir):
            saver.restore(sess, tf.train.latest_checkpoint(config.result_dir))
            print('loading model from {}'.format(tf.train.latest_checkpoint(config.result_dir)))

            batch_iter = make_batch_iter(list(zip(*input_data)), config.batch_size, shuffle=False)
            outputs = inference(sess, model, batch_iter, verbose=True)

            print('========== Saving Result ==========')
            output_file = args.output
            save_result(outputs, output_file, tokenizer, id_2_tag)
        else:
            print('model not found.')

    print('done')
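# For reference: a minimal sketch of what the read_dict helper used above is
# expected to do, inferred from its call sites (it returns a token -> id map
# and the inverse id -> token map). The name read_dict_sketch and the file
# format (one token per line) are assumptions, not the repo's actual code.
def read_dict_sketch(dict_file):
    token_2_id, id_2_token = {}, {}
    with open(dict_file, 'r', encoding='utf-8') as fin:
        for idx, line in enumerate(fin):
            token = line.rstrip('\n')
            token_2_id[token] = idx
            id_2_token[idx] = token
    return token_2_id, id_2_token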
def test():
    print('loading data...')
    tokenizer = FullTokenizer(config.bert_vocab, do_lower_case=config.to_lower)
    pos_2_id, id_2_pos = read_dict(config.pos_dict)
    tag_2_id, id_2_tag = read_dict(config.tag_dict)
    config.num_pos = len(pos_2_id)
    config.num_tag = len(tag_2_id)

    data_reader = DataReader(config, tokenizer, pos_2_id, tag_2_id)
    test_data = data_reader.read_test_data()

    print('building model...')
    model = get_model(config, is_training=False)

    saver = tf.train.Saver(max_to_keep=1)
    with tf.Session(config=sess_config) as sess:
        if tf.train.latest_checkpoint(config.result_dir):
            saver.restore(sess, tf.train.latest_checkpoint(config.result_dir))
            print('loading model from {}'.format(tf.train.latest_checkpoint(config.result_dir)))

            print('========== Test ==========')
            test_batch_iter = make_batch_iter(list(zip(*test_data)), config.batch_size, shuffle=False)
            outputs, test_loss, test_accu = evaluate(sess, model, test_batch_iter, verbose=True)
            print('The average test loss is {:>.4f}, average test accuracy is {:>.4f}'.format(test_loss, test_accu))

            print('========== Saving Result ==========')
            save_result(outputs, config.test_result, tokenizer, id_2_tag)
        else:
            print('model not found.')

    print('done')
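# For reference: a minimal sketch of make_batch_iter as used above (a list of
# zipped examples, a batch size, and a shuffle flag). This is inferred from
# the call sites only; the repo's actual implementation may differ.
import random

def make_batch_iter_sketch(data, batch_size, shuffle=False):
    data = list(data)
    if shuffle:
        random.shuffle(data)
    for start in range(0, len(data), batch_size):
        yield data[start:start + batch_size]

# Example call, mirroring the usage above:
#   for batch in make_batch_iter_sketch(list(zip(*test_data)), 32, shuffle=False):
#       ...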
def main():
    config = Config('.', 'temp')
    pos_2_id, id_2_pos = read_dict(config.pos_dict)
    tag_2_id, id_2_tag = read_dict(config.tag_dict)
    tokenizer = Tokenizer(config.bert_vocab, do_lower_case=config.to_lower)

    data_reader = DataReader(config, tokenizer, pos_2_id, tag_2_id)
    valid_data = data_reader.read_valid_data()
    check_data(valid_data, tokenizer, id_2_pos, id_2_tag)

    print('done')
def test():
    print('load data...')
    word_2_id, id_2_word = read_dict(config.word_dict)
    accu_2_id, id_2_accu = read_dict(config.accu_dict)
    art_2_id, id_2_art = read_dict(config.art_dict)
    if os.path.exists(config.word2vec_model):
        embedding_matrix = load_embedding(config.word2vec_model, word_2_id.keys())
    else:
        embedding_matrix = np.random.uniform(-0.5, 0.5, [len(word_2_id), config.embedding_size])

    data_reader = DataReader(config)
    test_data = data_reader.read_test_data(word_2_id, accu_2_id, art_2_id)
    art_data = data_reader.read_article(art_2_id.keys(), word_2_id)

    print('build model...')
    with tf.variable_scope('model'):
        model = get_model(config, embedding_matrix, is_training=False)

    saver = tf.train.Saver(max_to_keep=1)
    with tf.Session(config=config_proto) as sess:
        print('load model from: ' + config.model_file)
        saver.restore(sess, config.model_file)

        print('========== Test ==========')
        test_batch_iter = make_batch_iter(list(zip(*test_data)), config.batch_size, shuffle=False)
        outputs = inference(sess, model, test_batch_iter, art_data, verbose=True)

        save_result(outputs, config.test_result, id_2_accu, id_2_art)
        result = judger.get_result(config.test_data, config.test_result)
        accu_micro_f1, accu_macro_f1 = judger.calc_f1(result[0])
        article_micro_f1, article_macro_f1 = judger.calc_f1(result[1])
        score = [(accu_micro_f1 + accu_macro_f1) / 2, (article_micro_f1 + article_macro_f1) / 2]
        print('Micro-F1 of accusation: %.4f' % accu_micro_f1)
        print('Macro-F1 of accusation: %.4f' % accu_macro_f1)
        print('Micro-F1 of relevant articles: %.4f' % article_micro_f1)
        print('Macro-F1 of relevant articles: %.4f' % article_macro_f1)
        print('Score: ', score)
    print(data.columns)
    print(data.iloc[2399:2402])


def get_leaf_file_test(data_reader):
    print(data_reader.data_path)
    all_paths = data_reader.get_all_file_path()
    print(all_paths)
    print(len(all_paths))


def create_dataset_test(dataset):
    dataset.date_dataset_generator(output_directory='../output/', num_workers=8, testset_size=0.4)


if __name__ == '__main__':
    dr = DataReader(data_path='../data/rawdata')
    # read_file_by_stock_code_and_date_test(data_reader)
    ds = Dataset(data_path='../data/rawdata/')
    # create_dataset_test(ds)
    ds.parse_data_by_date()
    print(rtn.iloc[-602:-598]['avg_delta_bid_size_15'])
    return rtn


def volume_init_test(trx):
    rtn = trx.get_volume_init_by_buy_and_sell()
    print(rtn.head(30))
    print(rtn.shape)
    print(rtn.columns)
    print(rtn.describe())
    return rtn


def get_lag_return_test(trx):
    rtn = trx.get_lag_return()
    print(rtn.iloc[298:302])
    print(rtn.shape)
    print(rtn.describe())


if __name__ == '__main__':
    dr = DataReader(data_path='../data/rawdata')
    df = dr.get_stock_info(stock_code='000006', date='20180820')
    tr = Transaction(df)
    data = tr.parse()
    print(np.isnan(data).any())
    print(data.shape)
def main():
    if not os.path.exists(config.result_dir):
        os.makedirs(config.result_dir)
    if not os.path.exists(config.train_log_dir):
        os.makedirs(config.train_log_dir)
    if not os.path.exists(config.valid_log_dir):
        os.makedirs(config.valid_log_dir)

    print('preparing data...')
    config.word_2_id, config.id_2_word = read_dict(config.word_dict)
    config.attr_2_id, config.id_2_attr = read_dict(config.attr_dict)
    config.vocab_size = min(config.vocab_size, len(config.word_2_id))
    config.oov_vocab_size = len(config.word_2_id) - config.vocab_size
    config.attr_size = len(config.attr_2_id)

    embedding_matrix = None
    if args.do_train:
        if os.path.exists(config.glove_file):
            print('loading embedding matrix from file: {}'.format(config.glove_file))
            embedding_matrix, config.word_em_size = load_glove_embedding(config.glove_file, list(config.word_2_id.keys()))
            print('shape of embedding matrix: {}'.format(embedding_matrix.shape))
    else:
        if os.path.exists(config.glove_file):
            with open(config.glove_file, 'r', encoding='utf-8') as fin:
                line = fin.readline()
                config.word_em_size = len(line.strip().split()) - 1

    data_reader = DataReader(config)
    evaluator = Evaluator('description')

    print('building model...')
    model = get_model(config, embedding_matrix)
    saver = tf.train.Saver(max_to_keep=10)

    if args.do_train:
        print('loading data...')
        train_data = data_reader.read_train_data()
        valid_data = data_reader.read_valid_data_small()

        print_title('Trainable Variables')
        for v in tf.trainable_variables():
            print(v)

        print_title('Gradients')
        for g in model.gradients:
            print(g)

        with tf.Session(config=sess_config) as sess:
            model_file = args.model_file
            if model_file is None:
                model_file = tf.train.latest_checkpoint(config.result_dir)
            if model_file is not None:
                print('loading model from {}...'.format(model_file))
                saver.restore(sess, model_file)
            else:
                print('initializing from scratch...')
                tf.global_variables_initializer().run()

            train_writer = tf.summary.FileWriter(config.train_log_dir, sess.graph)
            valid_writer = tf.summary.FileWriter(config.valid_log_dir, sess.graph)
            run_train(sess, model, train_data, valid_data, saver, evaluator, train_writer, valid_writer, verbose=True)

    if args.do_eval:
        print('loading data...')
        valid_data = data_reader.read_valid_data()

        with tf.Session(config=sess_config) as sess:
            model_file = args.model_file
            if model_file is None:
                model_file = tf.train.latest_checkpoint(config.result_dir)
            if model_file is not None:
                print('loading model from {}...'.format(model_file))
                saver.restore(sess, model_file)

                predicted_ids, alignment_history, valid_loss, valid_accu = run_evaluate(sess, model, valid_data, verbose=True)
                print('average valid loss: {:>.4f}, average valid accuracy: {:>.4f}'.format(valid_loss, valid_accu))

                print_title('Saving Result')
                save_result(predicted_ids, alignment_history, config.id_2_word, config.valid_data, config.valid_result)
                evaluator.evaluate(config.valid_data, config.valid_result, config.to_lower)
            else:
                print('model not found!')

    if args.do_test:
        print('loading data...')
        test_data = data_reader.read_test_data()

        with tf.Session(config=sess_config) as sess:
            model_file = args.model_file
            if model_file is None:
                model_file = tf.train.latest_checkpoint(config.result_dir)
            if model_file is not None:
                print('loading model from {}...'.format(model_file))
                saver.restore(sess, model_file)

                predicted_ids, alignment_history = run_test(sess, model, test_data, verbose=True)

                print_title('Saving Result')
                save_result(predicted_ids, alignment_history, config.id_2_word, config.test_data, config.test_result)
                evaluator.evaluate(config.test_data, config.test_result, config.to_lower)
            else:
                print('model not found!')
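# For reference: a hedged sketch of load_glove_embedding, assuming the standard
# GloVe text format ("word v1 v2 ... vd" per line). It returns the matrix and
# the embedding size, matching how the call above unpacks the result; words
# absent from the GloVe file fall back to random vectors. The repo's own
# helper may differ.
import numpy as np

def load_glove_embedding_sketch(glove_file, words):
    vectors = {}
    with open(glove_file, 'r', encoding='utf-8') as fin:
        for line in fin:
            parts = line.rstrip().split(' ')
            vectors[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    word_em_size = len(next(iter(vectors.values())))
    matrix = np.random.uniform(-0.5, 0.5, [len(words), word_em_size]).astype(np.float32)
    for idx, word in enumerate(words):
        if word in vectors:
            matrix[idx] = vectors[word]
    return matrix, word_em_size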
def train():
    if not os.path.exists(config.result_dir):
        os.makedirs(config.result_dir)

    print('load data...')
    word_2_id, id_2_word = read_dict(config.word_dict)
    accu_2_id, id_2_accu = read_dict(config.accu_dict)
    art_2_id, id_2_art = read_dict(config.art_dict)
    if os.path.exists(config.word2vec_model):
        embedding_matrix = load_embedding(config.word2vec_model, word_2_id.keys())
    else:
        embedding_matrix = np.random.uniform(-0.5, 0.5, [len(word_2_id), config.embedding_size])

    data_reader = DataReader(config)
    train_data = data_reader.read_train_data(word_2_id, accu_2_id, art_2_id)
    valid_data = data_reader.read_valid_data(word_2_id, accu_2_id, art_2_id)
    art_data = data_reader.read_article(art_2_id.keys(), word_2_id)

    print('build model...')
    with tf.variable_scope('model'):
        model = get_model(config, embedding_matrix, is_training=True)

    print('========== Trainable Variables ==========')
    for v in tf.trainable_variables():
        print(v)

    saver = tf.train.Saver(max_to_keep=1)
    with tf.Session(config=config_proto) as sess:
        tf.global_variables_initializer().run()
        saver.save(sess, config.model_file)

        for i in range(config.num_epoch):
            print('========== Epoch %2d Train ==========' % (i + 1))
            train_batch_iter = make_batch_iter(list(zip(*train_data)), config.batch_size, shuffle=True)
            train_loss, _ = run_epoch(sess, model, train_batch_iter, art_data, verbose=True)
            print('The average train loss of epoch %2d is %.4f' % ((i + 1), train_loss))

            print('========== Epoch %2d Valid ==========' % (i + 1))
            valid_batch_iter = make_batch_iter(list(zip(*valid_data)), config.batch_size, shuffle=False)
            outputs = inference(sess, model, valid_batch_iter, art_data, verbose=True)

            print('========== Saving model ==========')
            saver.save(sess, config.model_file)
            save_result(outputs, config.valid_result, id_2_accu, id_2_art)
            result = judger.get_result(config.valid_data, config.valid_result)
            accu_micro_f1, accu_macro_f1 = judger.calc_f1(result[0])
            article_micro_f1, article_macro_f1 = judger.calc_f1(result[1])
            score = [(accu_micro_f1 + accu_macro_f1) / 2, (article_micro_f1 + article_macro_f1) / 2]
            print('Micro-F1 of accusation: %.4f' % accu_micro_f1)
            print('Macro-F1 of accusation: %.4f' % accu_macro_f1)
            print('Micro-F1 of relevant articles: %.4f' % article_micro_f1)
            print('Macro-F1 of relevant articles: %.4f' % article_macro_f1)
            print('Score: ', score)
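# For reference: a hedged sketch of load_embedding, assuming config.word2vec_model
# is a gensim word2vec model saved with model.save(); words missing from the
# model keep random vectors, mirroring the np.random.uniform fallback above.
# The library choice and API here are assumptions, not the repo's actual code.
import numpy as np
from gensim.models import Word2Vec

def load_embedding_sketch(model_file, words):
    model = Word2Vec.load(model_file)
    matrix = np.random.uniform(-0.5, 0.5, [len(words), model.vector_size])
    for idx, word in enumerate(words):
        if word in model.wv:
            matrix[idx] = model.wv[word]
    return matrix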
def train():
    if not os.path.exists(config.result_dir):
        os.makedirs(config.result_dir)
    if not os.path.exists(config.train_log_dir):
        os.mkdir(config.train_log_dir)
    if not os.path.exists(config.valid_log_dir):
        os.mkdir(config.valid_log_dir)

    print('loading data...')
    tokenizer = FullTokenizer(config.bert_vocab, do_lower_case=config.to_lower)
    pos_2_id, id_2_pos = read_dict(config.pos_dict)
    tag_2_id, id_2_tag = read_dict(config.tag_dict)
    config.num_pos = len(pos_2_id)
    config.num_tag = len(tag_2_id)

    data_reader = DataReader(config, tokenizer, pos_2_id, tag_2_id)
    train_data = data_reader.read_train_data()
    valid_data = data_reader.read_valid_data()

    print('building model...')
    model = get_model(config, is_training=True)

    tvars = tf.trainable_variables()
    assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(tvars, config.bert_ckpt)
    tf.train.init_from_checkpoint(config.bert_ckpt, assignment_map)

    print('========== Trainable Variables ==========')
    for v in tvars:
        init_string = ''
        if v.name in initialized_variable_names:
            init_string = '<INIT_FROM_CKPT>'
        print(v.name, v.shape, init_string)

    print('========== Gradients ==========')
    for g in model.gradients:
        print(g)

    best_score = 0.0
    saver = tf.train.Saver(max_to_keep=1)
    with tf.Session(config=sess_config) as sess:
        if tf.train.latest_checkpoint(config.result_dir):
            saver.restore(sess, tf.train.latest_checkpoint(config.result_dir))
            print('loading model from {}'.format(tf.train.latest_checkpoint(config.result_dir)))
        else:
            tf.global_variables_initializer().run()
            print('initializing from scratch.')

        train_writer = tf.summary.FileWriter(config.train_log_dir, sess.graph)
        for i in range(config.num_epoch):
            print('========== Epoch {} Train =========='.format(i + 1))
            train_batch_iter = make_batch_iter(list(zip(*train_data)), config.batch_size, shuffle=True)
            train_loss, train_accu = run_epoch(sess, model, train_batch_iter, train_writer, verbose=True)
            print('The average train loss is {:>.4f}, average train accuracy is {:>.4f}'.format(train_loss, train_accu))

            print('========== Epoch {} Valid =========='.format(i + 1))
            valid_batch_iter = make_batch_iter(list(zip(*valid_data)), config.batch_size, shuffle=False)
            outputs, valid_loss, valid_accu = evaluate(sess, model, valid_batch_iter, verbose=True)
            print('The average valid loss is {:>.4f}, average valid accuracy is {:>.4f}'.format(valid_loss, valid_accu))

            print('========== Saving Result ==========')
            save_result(outputs, config.valid_result, tokenizer, id_2_tag)
            if valid_accu > best_score:
                best_score = valid_accu
                saver.save(sess, config.model_file)
def main():
    os.makedirs(config.temp_dir, exist_ok=True)
    os.makedirs(config.result_dir, exist_ok=True)
    os.makedirs(config.train_log_dir, exist_ok=True)

    logger.setLevel(logging.INFO)
    init_logger(logging.INFO, 'temp.log.txt', 'w')

    logger.info('preparing data...')
    config.word_2_id, config.id_2_word = read_json_dict(config.vocab_dict)
    config.vocab_size = min(config.vocab_size, len(config.word_2_id))
    config.oov_vocab_size = min(config.oov_vocab_size, len(config.word_2_id) - config.vocab_size)

    embedding_matrix = None
    if args.do_train:
        if os.path.exists(config.glove_file):
            logger.info('loading embedding matrix from file: {}'.format(config.glove_file))
            embedding_matrix, config.word_em_size = load_glove_embedding(config.glove_file, list(config.word_2_id.keys()))
            logger.info('shape of embedding matrix: {}'.format(embedding_matrix.shape))
    else:
        if os.path.exists(config.glove_file):
            with open(config.glove_file, 'r', encoding='utf-8') as fin:
                line = fin.readline()
                config.word_em_size = len(line.strip().split()) - 1

    data_reader = DataReader(config)
    evaluator = Evaluator('tgt')

    logger.info('building model...')
    model = get_model(config, embedding_matrix)
    saver = tf.train.Saver(max_to_keep=10)

    if args.do_train:
        logger.info('loading data...')
        train_data = data_reader.read_train_data()
        valid_data = data_reader.read_valid_data()

        logger.info(log_title('Trainable Variables'))
        for v in tf.trainable_variables():
            logger.info(v)

        logger.info(log_title('Gradients'))
        for g in model.gradients:
            logger.info(g)

        with tf.Session(config=sess_config) as sess:
            model_file = args.model_file
            if model_file is None:
                model_file = tf.train.latest_checkpoint(os.path.join(config.result_dir, config.current_model))
            if model_file is not None:
                logger.info('loading model from {}...'.format(model_file))
                saver.restore(sess, model_file)
            else:
                logger.info('initializing from scratch...')
                tf.global_variables_initializer().run()

            train_writer = tf.summary.FileWriter(config.train_log_dir, sess.graph)
            valid_log_history = run_train(sess, model, train_data, valid_data, saver, evaluator, train_writer)
            save_json(valid_log_history, os.path.join(config.result_dir, config.current_model, 'valid_log_history.json'))

    if args.do_eval:
        logger.info('loading data...')
        valid_data = data_reader.read_valid_data()

        with tf.Session(config=sess_config) as sess:
            model_file = args.model_file
            if model_file is None:
                model_file = tf.train.latest_checkpoint(os.path.join(config.result_dir, config.current_model))
            if model_file is not None:
                logger.info('loading model from {}...'.format(model_file))
                saver.restore(sess, model_file)

                predicted_ids, valid_loss, valid_accu = run_evaluate(sess, model, valid_data)
                logger.info('average valid loss: {:>.4f}, average valid accuracy: {:>.4f}'.format(valid_loss, valid_accu))

                logger.info(log_title('Saving Result'))
                save_outputs(predicted_ids, config.id_2_word, config.valid_data, config.valid_outputs)
                results = evaluator.evaluate(config.valid_data, config.valid_outputs, config.to_lower)
                save_json(results, config.valid_results)
            else:
                logger.info('model not found!')

    if args.do_test:
        logger.info('loading data...')
        test_data = data_reader.read_test_data()

        with tf.Session(config=sess_config) as sess:
            model_file = args.model_file
            if model_file is None:
                model_file = tf.train.latest_checkpoint(os.path.join(config.result_dir, config.current_model))
            if model_file is not None:
                logger.info('loading model from {}...'.format(model_file))
                saver.restore(sess, model_file)

                predicted_ids = run_test(sess, model, test_data)

                logger.info(log_title('Saving Result'))
                save_outputs(predicted_ids, config.id_2_word, config.test_data, config.test_outputs)
                results = evaluator.evaluate(config.test_data, config.test_outputs, config.to_lower)
                save_json(results, config.test_results)
            else:
                logger.info('model not found!')
def main():
    os.makedirs(config.temp_dir, exist_ok=True)
    os.makedirs(config.result_dir, exist_ok=True)
    os.makedirs(config.train_log_dir, exist_ok=True)

    logger.setLevel(logging.INFO)
    init_logger(logging.INFO)

    logger.info('loading dict...')
    config.src_2_id, config.id_2_src = read_json_dict(config.src_vocab_dict)
    config.src_vocab_size = min(config.src_vocab_size, len(config.src_2_id))
    config.tgt_2_id, config.id_2_tgt = read_json_dict(config.tgt_vocab_dict)
    config.tgt_vocab_size = min(config.tgt_vocab_size, len(config.tgt_2_id))

    data_reader = DataReader(config)
    evaluator = Evaluator('tgt')

    logger.info('building model...')
    model = get_model(config)
    saver = tf.train.Saver(max_to_keep=10)

    if args.do_train:
        logger.info('loading data...')
        train_data = data_reader.load_train_data()
        valid_data = data_reader.load_valid_data()

        logger.info(log_title('Trainable Variables'))
        for v in tf.trainable_variables():
            logger.info(v)

        logger.info(log_title('Gradients'))
        for g in model.gradients:
            logger.info(g)

        with tf.Session(config=sess_config) as sess:
            model_file = args.model_file
            if model_file is None:
                model_file = tf.train.latest_checkpoint(os.path.join(config.result_dir, config.current_model))
            if model_file is not None:
                logger.info('loading model from {}...'.format(model_file))
                saver.restore(sess, model_file)
            else:
                logger.info('initializing from scratch...')
                tf.global_variables_initializer().run()

            train_writer = tf.summary.FileWriter(config.train_log_dir, sess.graph)
            valid_log_history = run_train(sess, model, train_data, valid_data, saver, evaluator, train_writer)
            save_json(valid_log_history, os.path.join(config.result_dir, config.current_model, 'valid_log_history.json'))

    if args.do_eval:
        logger.info('loading data...')
        valid_data = data_reader.load_valid_data()

        with tf.Session(config=sess_config) as sess:
            model_file = args.model_file
            if model_file is None:
                model_file = tf.train.latest_checkpoint(os.path.join(config.result_dir, config.current_model))
            if model_file is not None:
                logger.info('loading model from {}...'.format(model_file))
                saver.restore(sess, model_file)

                predicted_ids, valid_loss, valid_accu = run_evaluate(sess, model, valid_data)
                logger.info('average valid loss: {:>.4f}, average valid accuracy: {:>.4f}'.format(valid_loss, valid_accu))

                logger.info(log_title('Saving Result'))
                save_outputs(predicted_ids, config.id_2_tgt, config.valid_data, config.valid_outputs)
                results = evaluator.evaluate(config.valid_data, config.valid_outputs, config.to_lower)
                save_json(results, config.valid_results)
            else:
                logger.info('model not found!')

    if args.do_test:
        logger.info('loading data...')
        test_data = data_reader.load_test_data()

        with tf.Session(config=sess_config) as sess:
            model_file = args.model_file
            if model_file is None:
                model_file = tf.train.latest_checkpoint(os.path.join(config.result_dir, config.current_model))
            if model_file is not None:
                logger.info('loading model from {}...'.format(model_file))
                saver.restore(sess, model_file)

                predicted_ids = run_test(sess, model, test_data)

                logger.info(log_title('Saving Result'))
                save_outputs(predicted_ids, config.id_2_tgt, config.test_data, config.test_outputs)
                results = evaluator.evaluate(config.test_data, config.test_outputs, config.to_lower)
                save_json(results, config.test_results)
            else:
                logger.info('model not found!')
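# For reference: a minimal sketch of read_json_dict, assuming the vocab file is
# a JSON object mapping token -> id; the inverse map is rebuilt on load. The
# storage format is an assumption inferred from the name and the call sites.
import json

def read_json_dict_sketch(dict_file):
    with open(dict_file, 'r', encoding='utf-8') as fin:
        token_2_id = json.load(fin)
    id_2_token = {idx: token for token, idx in token_2_id.items()}
    return token_2_id, id_2_token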
        all_file_path = self.data_reader.get_all_file_path(flatten=True)[:100]
        batch_size = int(ceil(len(all_file_path) / num_workers))
        pool = mp.Pool(processes=num_workers)
        small_batches = [
            all_file_path[idx:idx + batch_size]
            for idx in range(0, len(all_file_path), batch_size)
        ]
        res = pool.map(self.get_data_pairs, small_batches)
        # pickle.dump expects a writable file object, not a path string
        with open('result.pkl', 'wb') as fout:
            pickle.dump(res, fout)
        return res

    def get_data_pairs(self, file_path):
        transaction = Transaction(
            self.data_reader.get_stock_info(stock_file_path=file_path))
        turnover = transaction.get_return(self.time_frame_size_y)
        avg_delta_in_bid_and_ask = transaction.get_average_delta_bid_ask_size(
            'all', self.time_frame_size_x)
        volume_init_by_buy_and_sell = transaction.get_volume_init_by_buy_and_sell(
            'all', self.time_frame_size_x)
        result = [turnover] + avg_delta_in_bid_and_ask + volume_init_by_buy_and_sell
        # truncate all feature frames to the shortest one before concatenating
        return pd.concat(result, axis=1).iloc[:min(map(len, result))]


if __name__ == '__main__':
    dr = DataReader()
    analyzer = Analyzer(dr)
    res = analyzer.batch_get_data_pairs()
    print(res)
    print(type(res))