def home():
    obj = DataReader()
    obj.parse_country()
    obj.parse_city()
    obj.parse_features()
    obj.parse_prediction()
    return jsonify(obj.final_data)
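
# --- Hypothetical wiring sketch (assumed, not from this file): `jsonify`
# suggests a Flask app, so home() would typically be registered as a route.
from flask import Flask, jsonify

app = Flask(__name__)
app.add_url_rule('/', 'home', home)  # serve the aggregated JSON at '/'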
def pretrain_model(self, src1_path, src2_path, tgt_path, epochs):
    datareader = DataReader()
    data = datareader.read_parallel_data(self.model, src1_path, src2_path, tgt_path)
    self.seq2seq_trainer.train(
        train_data=data,
        val_data=[],
        epochs=epochs,
        pretrain=True,
    )
def init_from_config(self, config):
    # self.model = Model(config)
    self.model = Transformer(config, config.test.devices)
    self.model.build_test_model()

    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.allow_soft_placement = True
    self.sess = tf.Session(config=sess_config, graph=self.model.graph)

    # Restore model.
    self.model.saver.restore(self.sess,
                             tf.train.latest_checkpoint(config.train.logdir))
    self.data_reader = DataReader(config)
def init_from_config(self, config):
    self.model = eval(config.model)(config, config.test.num_gpus)
    self.model.build_test_model()

    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.allow_soft_placement = True
    self.sess = tf.Session(config=sess_config, graph=self.model.graph)
    if is_debug:
        self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)

    # Restore model.
    self.model.saver.restore(self.sess,
                             tf.train.latest_checkpoint(config.model_dir))
    self.data_reader = DataReader(config)
def init_from_config(self, config):
    logger = logging.getLogger('')
    self.model = eval(config.model)(config, config.test.num_gpus)
    self.model.build_test_model()

    # Print the number of total parameters
    print_num_of_total_parameters()

    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.allow_soft_placement = True
    self.sess = tf.Session(config=sess_config, graph=self.model.graph)

    # Restore model.
    self.model.saver.restore(self.sess,
                             tf.train.latest_checkpoint(config.model_dir))
    self.data_reader = DataReader(config)
def init_from_config(self, config):
    self.model = eval(config.model)(config, config.test.num_gpus)
    self.model.build_test_model()

    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.allow_soft_placement = True
    self.sess = tf.Session(config=sess_config)

    # Restore model; if the latest checkpoint is unreadable, roll back to the
    # previous version and retry.
    try:
        tf.train.Saver().restore(self.sess,
                                 tf.train.latest_checkpoint(config.model_dir))
    except tf.errors.NotFoundError:
        roll_back_to_previous_version(config)
        tf.train.Saver().restore(self.sess,
                                 tf.train.latest_checkpoint(config.model_dir))
    self.data_reader = DataReader(config)
def __init__(self, options):
    """Gonna need a db, and some creds."""
    log.info("Starting AG Chatter Bot.")
    self.options = options

    # Build Constructors
    self.idx2word = Database(
        host=options.redis_host,
        pass_=options.redis_pass,
        db=0
    )
    self.word2idx = Database(
        host=options.redis_host,
        pass_=options.redis_pass,
        db=1
    )
    self.dataReader = DataReader(
        self.options,
        self.idx2word,
        self.word2idx
    )
    self.model = Model(
        self.options
    )
    log.debug(options)
    log.info("Init complete.")
class Seq2SeqTester:
    def __init__(self, model, output_name):
        self.model = model
        self.datareader = DataReader()
        self.metrics = ErrorMetrics()
        self.output_name = output_name

    def test(self, src1, src2, tgt):
        if tgt:
            data = self.datareader.read_parallel_data(self.model, src1, src2, tgt)
            output_name = "{}_{}".format(self.output_name, src1.split("/")[-1])
            cer, wer = self.metrics.get_average_cer(
                self.model,
                data,
                output_file=open("{}.output".format(output_name), "w", encoding="utf-8"),
                write_pgens=False,
            )
            with open("{}.metrics".format(output_name), "w") as output_file:
                output_file.write("TEST CER: %0.4f\n" % (cer))
                output_file.write("TEST WER: %0.4f\n" % (wer))
        else:
            output_file = open(
                "{}_{}.output".format(self.output_name, src1.split("/")[-1]),
                "w",
                encoding="utf8",
            )
            data = self.datareader.read_test_data(self.model, src1, src2)
            for src1, src2 in data:
                if len(src1) == 0 or len(src2) == 0:
                    output_file.write("\n")
                    continue
                dy.renew_cg()
                output, _ = self.model.generate_beam(src1, src2)
                output_file.write(str(output) + "\n")
            output_file.close()
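
# --- Hypothetical usage sketch (assumed, not from this file): running the
# tester against a held-out set; the file paths are placeholders.
tester = Seq2SeqTester(model, output_name="results/seq2seq")
# With a target file: writes <name>.output and <name>.metrics (CER/WER).
tester.test("data/test.src1", "data/test.src2", "data/test.tgt")
# Without a target file: beam-search decodes and writes only the outputs.
tester.test("data/unlabeled.src1", "data/unlabeled.src2", None)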
class Chatter(object):
    """Chatter App."""

    def __init__(self, options):
        """Gonna need a db, and some creds."""
        log.info("Starting AG Chatter Bot.")
        self.options = options

        # Build Constructors
        self.idx2word = Database(
            host=options.redis_host,
            pass_=options.redis_pass,
            db=0
        )
        self.word2idx = Database(
            host=options.redis_host,
            pass_=options.redis_pass,
            db=1
        )
        self.dataReader = DataReader(
            self.options,
            self.idx2word,
            self.word2idx
        )
        self.model = Model(
            self.options
        )
        log.debug(options)
        log.info("Init complete.")

    def sanity(self):
        """This kind of thing should be standardized."""
        log.info("Starting Sanity Check")
        key = "stuff"
        value = "morestuff"
        self.idx2word.write_data(key, value)
        new_value = self.idx2word.read_data(key)
        assert value == new_value
        log.debug("Passed Sanity Check")
        return True

    def main(self):
        """This kind of thing should be standardized."""
        if self.sanity():
            # Add the path to files in the config.yaml
            dataset = self.dataReader.make_buckets()
            print(dataset)
            return True
        return False
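
# --- Hypothetical usage sketch (assumed, not from this file): `options` would
# typically come from a CLI/config parser exposing redis_host and redis_pass;
# parse_options() is a placeholder for however that object is built.
options = parse_options()
bot = Chatter(options)
if not bot.main():
    log.error("Sanity check or bucketing failed.")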
logfile = sys.argv[6]
ckpt_file = "ckpt"

x, y_true, y, gap_w, conv3_pool, train_step, accuracy, saver = \
    inference(batch_size)

sess = tf.InteractiveSession()

# Setup summary
merged = tf.summary.merge_all()
writer = tf.summary.FileWriter(logfile, sess.graph)

# Get data reader
data_reader = DataReader(train_dataset_dir,
                         batch_size=batch_size,
                         file_names=False,
                         resize_to=(224, 224))

tf.add_to_collection('x', x)
tf.add_to_collection('y', y)
tf.add_to_collection('gap_w', gap_w)
tf.add_to_collection('conv3', conv3_pool)

# Initialize variables first, then restore: running the initializer after
# saver.restore() would overwrite the restored weights.
sess.run(tf.global_variables_initializer())

ckpt = tf.train.latest_checkpoint(model_path)
if ckpt:
    saver.restore(sess, ckpt)
    print("Model loaded from file: %s" % ckpt)
def train(config, num_epoch, last_pretrain_model_dir, pretrain_model_dir,
          model_dir, block_idx_enc, block_idx_dec):
    """Train a model with a config file."""
    logger = logging.getLogger('')
    config.num_blocks_enc = block_idx_enc
    config.num_blocks_dec = block_idx_dec
    # if block_idx >= 2:
    #     config.train.var_filter = 'encoder/block_' + str(block_idx - 1) + '|' + 'decoder/block_' + str(block_idx - 1) + '|' + 'encoder/src_embedding' + '|' + 'decoder/dst_embedding'
    # if block_idx >= 2:
    #     config.train.var_filter = 'encoder/block_' + str(block_idx - 1) + '|' + 'decoder/block_' + str(block_idx - 1)
    logger.info("config.num_blocks_enc=" + str(config.num_blocks_enc) +
                ",config.num_blocks_dec=" + str(config.num_blocks_dec) +
                ',config.train.var_filter=' + str(config.train.var_filter))

    data_reader = DataReader(config=config)
    model = eval(config.model)(config=config, num_gpus=config.train.num_gpus)
    model.build_train_model(test=config.train.eval_on_dev)

    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.allow_soft_placement = True

    summary_writer = tf.summary.FileWriter(pretrain_model_dir, graph=model.graph)

    with tf.Session(config=sess_config, graph=model.graph) as sess:
        # Initialize all variables.
        sess.run(tf.global_variables_initializer())
        # Reload variables from disk.
        if tf.train.latest_checkpoint(last_pretrain_model_dir):
            available_vars = available_variables_without_global_step(
                last_pretrain_model_dir)
            # available_vars = available_variables(last_pretrain_model_dir)
            if available_vars:
                saver = tf.train.Saver(var_list=available_vars)
                saver.restore(sess,
                              tf.train.latest_checkpoint(last_pretrain_model_dir))
                for v in available_vars:
                    logger.info('Reload {} from disk.'.format(v.name))
            else:
                logger.info('Nothing to reload from disk.')
        else:
            logger.info('Nothing to reload from disk.')

        evaluator = Evaluator()
        evaluator.init_from_existed(model, sess, data_reader)

        global dev_bleu, toleration
        dev_bleu = evaluator.evaluate(**config.dev) if config.train.eval_on_dev else 0
        toleration = config.train.toleration

        def train_one_step(batch):
            feat_batch, target_batch = batch
            feed_dict = expand_feed_dict({
                model.src_pls: feat_batch,
                model.dst_pls: target_batch
            })
            step, lr, loss, _ = sess.run(
                [model.global_step, model.learning_rate, model.loss, model.train_op],
                feed_dict=feed_dict)
            if step % config.train.summary_freq == 0:
                logger.info('pretrain summary_writer...')
                summary = sess.run(model.summary_op, feed_dict=feed_dict)
                summary_writer.add_summary(summary, global_step=step)
                summary_writer.flush()
            return step, lr, loss

        def maybe_save_model(model_dir, is_save_global_step=True):
            global dev_bleu, toleration
            new_dev_bleu = evaluator.evaluate(**config.dev) \
                if config.train.eval_on_dev else dev_bleu + 1
            if new_dev_bleu >= dev_bleu:
                mp = model_dir + '/pretrain_model_step_{}'.format(step)
                # model.saver.save(sess, mp)
                if is_save_global_step:
                    model.saver.save(sess, mp)
                else:
                    # Drop the global step so the next stage restarts its schedule.
                    variables_without_global_step = global_variables_without_global_step()
                    saver = tf.train.Saver(
                        var_list=variables_without_global_step, max_to_keep=10)
                    saver.save(sess, mp)
                logger.info('Save model in %s.' % mp)
                toleration = config.train.toleration
                dev_bleu = new_dev_bleu
            else:
                toleration -= 1

        step = 0
        for epoch in range(1, num_epoch + 1):
            for batch in data_reader.get_training_batches_with_buckets():
                # Train normal instances.
                start_time = time.time()
                step, lr, loss = train_one_step(batch)
                logger.info(
                    'epoch: {0}\tstep: {1}\tlr: {2:.6f}\tloss: {3:.4f}\ttime: {4:.4f}'
                    .format(epoch, step, lr, loss, time.time() - start_time))
                if config.train.num_steps and step >= config.train.num_steps:
                    break
            # Early stop
            if toleration <= 0:
                break
            maybe_save_model(pretrain_model_dir)

        if model_dir:
            maybe_save_model(model_dir, False)

    logger.info("Finish pretrain block_idx_enc=" + str(block_idx_enc) +
                ',block_idx_dec=' + str(block_idx_dec))
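
# --- Hypothetical driver sketch (assumed, not from this file): the signature
# suggests progressive pretraining, growing encoder/decoder depth one block at
# a time and warm-starting each stage from the previous stage's checkpoint.
# `work_dir`, `initial_model_dir`, and `max_blocks` are placeholders.
last_dir = initial_model_dir
for idx in range(1, max_blocks + 1):
    cur_dir = '{}/pretrain_blocks_{}'.format(work_dir, idx)
    train(config, num_epoch, last_dir, cur_dir,
          model_dir if idx == max_blocks else None, idx, idx)
    last_dir = cur_dir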
def init_from_frozen_graphdef(self, config):
    frozen_graph_path = os.path.join(config.model_dir, 'freeze_graph_test.py')
    # If the file doesn't exist, create it.
    if not os.path.exists(frozen_graph_path):
        logging.warning(
            'The frozen graph does not exist; using \'init_from_config\' instead '
            'and creating a frozen graph for next use.')
        self.init_from_config(config)
        saver = tf.train.Saver()
        save_dir = '/tmp/graph-{}'.format(os.getpid())
        os.mkdir(save_dir)
        save_path = '{}/ckpt'.format(save_dir)
        saver.save(sess=self.sess, save_path=save_path)

        with tf.Session(graph=tf.Graph()) as sess:
            clear_devices = True
            output_node_names = ['loss_sum', 'predictions']
            # We import the meta graph into the current default graph.
            saver = tf.train.import_meta_graph(save_path + '.meta',
                                               clear_devices=clear_devices)
            # We restore the weights.
            saver.restore(sess, save_path)
            # We use a built-in TF helper to export variables to constants.
            output_graph_def = tf.graph_util.convert_variables_to_constants(
                sess,  # The session is used to retrieve the weights
                tf.get_default_graph().as_graph_def(),  # The graph_def is used to retrieve the nodes
                output_node_names  # The output node names are used to select the useful nodes
            )
            # Finally we serialize and dump the output graph to the filesystem.
            with tf.gfile.GFile(frozen_graph_path, "wb") as f:
                f.write(output_graph_def.SerializeToString())
            logging.info("%d ops in the final graph." % len(output_graph_def.node))
        # Remove temp files.
        os.system('rm -rf ' + save_dir)
    else:
        sess_config = tf.ConfigProto()
        sess_config.gpu_options.allow_growth = True
        sess_config.allow_soft_placement = True
        self.sess = tf.Session(config=sess_config)
        self.data_reader = DataReader(config)

        # We load the protobuf file from disk and parse it to retrieve the
        # unserialized graph_def.
        with tf.gfile.GFile(frozen_graph_path, "rb") as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())

        # Import the graph_def into the current default graph.
        tf.import_graph_def(graph_def)
        graph = tf.get_default_graph()
        self.model = AttrDict()

        def collect_placeholders(prefix):
            ret = []
            idx = 0
            while True:
                try:
                    ret.append(graph.get_tensor_by_name(
                        'import/{}_{}:0'.format(prefix, idx)))
                    idx += 1
                except KeyError:
                    return tuple(ret)

        self.model['src_pls'] = collect_placeholders('src_pl')
        self.model['dst_pls'] = collect_placeholders('dst_pl')
        self.model['predictions'] = graph.get_tensor_by_name('import/predictions:0')
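
# --- Hypothetical usage sketch (assumed, not from this file): the enclosing
# class appears to be the Evaluator shown later in this collection; `config`
# and `X` (a batch of source indices) are placeholders. The feed mirrors the
# expand_feed_dict pattern used by the other Evaluator methods.
evaluator = Evaluator()
evaluator.init_from_frozen_graphdef(config)
predictions = evaluator.sess.run(
    evaluator.model['predictions'],
    feed_dict=expand_feed_dict({evaluator.model['src_pls']: X}))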
"pert_id": ['BRD-U41416256', 'BRD-U60236422'], "pert_type": ["trt_cp"], "cell_id": ['A375', 'HA1E', 'HELA', 'HT29', 'MCF7', 'PC3', 'YAPC'], "pert_idose": ["0.04 um", "0.12 um", "0.37 um", "1.11 um", "3.33 um", "10.0 um"] } # check cuda if torch.cuda.is_available(): device = torch.device("cuda") else: device = torch.device("cpu") print("Use GPU: %s" % torch.cuda.is_available()) data = DataReader(drug_file, gene_file, gene_expression_file_train, gene_expression_file_dev, gene_expression_file_test, filter, device) print('#Train: %d' % len(data.train_feature['drug'])) print('#Dev: %d' % len(data.dev_feature['drug'])) print('#Test: %d' % len(data.test_feature['drug'])) # model creation model = DeepCE(drug_input_dim=drug_input_dim, drug_emb_dim=drug_embed_dim, conv_size=conv_size, degree=degree, gene_input_dim=np.shape(data.gene)[1], gene_emb_dim=gene_embed_dim, num_gene=np.shape(data.gene)[0], hid_dim=hid_dim, dropout=dropout,
target_col = len(col_name) - 1

# ============================================ #
# Data location
wd = os.path.dirname(os.path.abspath(__file__)) + '/'
data_path = wd + 'data/'
data_path += 'prototype/'
output_path = wd + 'output/'

# ============================================ #
# Read data
data_files = os.listdir(data_path)
for i in range(len(data_files)):
    data_files[i] = data_path + data_files[i]

dr = DataReader(data_files, col_idx)
ds = DataScaler()
dp = DataParser()

print('======== Supplying data ============')
for file_id in range(len(data_files)):
    dr_tmp = DataReader([data_files[file_id]], col_idx)
    dr_tmp.read(delimiter='\t')
    data = dr_tmp.getData()
    data = parse_data(dp, data, col_name, target_col)
    dr.append(data)
    del data
    del dr_tmp
    print(file_id + 1, ' - ', data_files[file_id], ': ',
model_path = sys.argv[5]
logfile = sys.argv[6]
ckpt_file = "ckpt"

x, y_true, y, gap_w, conv3_pool, train_step, accuracy, saver = \
    inference(batch_size)

sess = tf.InteractiveSession()

# Setup summary
merged = tf.summary.merge_all()
writer = tf.summary.FileWriter(logfile, sess.graph)

# Get data reader
data_reader = DataReader(train_dataset_dir,
                         batch_size=batch_size,
                         file_names=False)

tf.add_to_collection('x', x)
tf.add_to_collection('y', y)
tf.add_to_collection('gap_w', gap_w)
tf.add_to_collection('conv3', conv3_pool)

# Initialize variables first, then restore: running the initializer after
# saver.restore() would overwrite the restored weights.
sess.run(tf.global_variables_initializer())

ckpt = tf.train.latest_checkpoint(model_path)
if ckpt:
    saver.restore(sess, ckpt)
    print("Model loaded from file: %s" % ckpt)
class Evaluator(object):
    """Evaluate the model."""

    def __init__(self):
        pass

    def init_from_config(self, config):
        self.model = eval(config.model)(config, config.test.num_gpus)
        self.model.build_test_model()

        sess_config = tf.ConfigProto()
        sess_config.gpu_options.allow_growth = True
        sess_config.allow_soft_placement = True
        self.sess = tf.Session(config=sess_config, graph=self.model.graph)
        if is_debug:
            self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)

        # Restore model.
        self.model.saver.restore(self.sess,
                                 tf.train.latest_checkpoint(config.model_dir))
        self.data_reader = DataReader(config)

    def init_from_existed(self, model, sess, data_reader):
        assert model.graph == sess.graph
        self.sess = sess
        self.model = model
        self.data_reader = data_reader

    def beam_search(self, X):
        return self.sess.run(self.model.prediction,
                             feed_dict=expand_feed_dict({self.model.src_pls: X}))

    def loss(self, X, Y):
        return self.sess.run(self.model.loss_sum,
                             feed_dict=expand_feed_dict({
                                 self.model.src_pls: X,
                                 self.model.dst_pls: Y
                             }))

    def translate(self, src_path, output_path, batch_size):
        logging.info('Translate %s.' % src_path)
        tmp = output_path + '.tmp'
        fd = codecs.open(tmp, 'w', 'utf8')
        count = 0
        token_count = 0
        start = time.time()
        for X, uttids in self.data_reader.get_test_batches(src_path, batch_size):
            Y = self.beam_search(X)
            sents = self.data_reader.indices_to_words(Y)
            assert len(X) == len(sents)
            for sent, uttid in zip(sents, uttids):
                print(uttid + '\t' + sent, file=fd)
            count += len(X)
            token_count += np.sum(np.not_equal(Y, 3))  # 3: </s>
        time_span = time.time() - start
        logging.info(
            '{0} sentences ({1} tokens) processed in {2:.2f} minutes (speed: {3:.4f} sec/token).'
            .format(count, token_count, time_span / 60, time_span / token_count))
        fd.close()

        # Remove BPE markers, if any.
        os.system("sed -r 's/(@@ )|(@@ ?$)//g' %s > %s" % (tmp, output_path))
        os.remove(tmp)
        logging.info('The result file was saved in %s.' % output_path)

    def ppl(self, src_path, dst_path, batch_size):
        logging.info('Calculate PPL for %s and %s.' % (src_path, dst_path))
        token_count = 0
        loss_sum = 0
        for batch in self.data_reader.get_test_batches_with_target(
                src_path, dst_path, batch_size):
            X, Y = batch
            loss_sum += self.loss(X, Y)
            token_count += np.sum(np.greater(Y, 0))
        # Compute PPL
        ppl = np.exp(loss_sum / token_count)
        logging.info('PPL: %.4f' % ppl)
        return ppl

    def evaluate(self, batch_size, **kargs):
        """Evaluate the model on dev set."""
        src_path = kargs['src_path']
        output_path = kargs['output_path']
        cmd = kargs['cmd'] if 'cmd' in kargs else \
            "perl multi-bleu.perl {ref} < {output} 2>/dev/null | awk '{{print($3)}}' | awk -F, '{{print $1}}'"
        self.translate(src_path, output_path, batch_size)
        # if 'ref_path' in kargs:
        #     ref_path = kargs['ref_path']
        #     bleu = commands.getoutput(cmd.format(**{'ref': ref_path, 'output': output_path}))
        #     logging.info('BLEU: {}'.format(bleu))
        #     return float(bleu)
        # if 'dst_path' in kargs:
        #     self.ppl(src_path, kargs['dst_path'], batch_size)
        return None
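
# --- Hypothetical usage sketch (assumed, not from this file): standalone
# evaluation from a config; paths and batch size are placeholders.
evaluator = Evaluator()
evaluator.init_from_config(config)
evaluator.translate('data/dev.src', 'output/dev.hyp', batch_size=32)
evaluator.ppl('data/dev.src', 'data/dev.dst', batch_size=32)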
def train(args):
    vocab = Vocab.load(args.vocab, max_size=args.vocab_size)
    data_reader = DataReader(data_dir=args.data_dir, shuffle=True)
    preprocessor = Preprocessor(
        predict_prev=args.predict_prev, predict_cur=args.predict_cur,
        predict_next=args.predict_next, vocab=vocab,
        max_length=args.max_length, gpu=args.gpu)
    model = SkipThought(
        rnn_type=args.rnn_type, num_words=len(vocab), word_dim=args.word_dim,
        hidden_dim=args.hidden_dim, bidirectional=args.bidirectional,
        predict_prev=args.predict_prev, predict_cur=args.predict_cur,
        predict_next=args.predict_next)
    print(model)
    if args.pretrained is not None:
        print(f'Loading pretrained model from {args.pretrained}')
        model.load_state_dict(
            torch.load(args.pretrained,
                       map_location=lambda storage, loc: storage))
    if args.gpu > -1:
        model.cuda(args.gpu)
    optimizer = optim.Adam(model.parameters())

    summary_writer = SummaryWriter(os.path.join(args.save_dir, 'log'))

    def add_scalar_summary(name, value, step):
        summary_writer.add_scalar(tag=name, scalar_value=value, global_step=step)

    def add_text_summary(name, value, step):
        summary_writer.add_text(tag=name, text_string=value, global_step=step)

    def variable(tensor, volatile=False):
        return Variable(tensor, volatile=volatile)

    def run_train_iter(batch):
        if not model.training:
            model.train()
        src, tgt = preprocessor(batch)
        src = (variable(src[0]), src[1])
        for k in tgt:
            tgt[k] = (variable(tgt[k][0]), tgt[k][1])
        logits = model.forward(src=src, tgt=tgt)
        loss = 0
        for k in tgt:
            logits_k = logits[k]
            tgt_k = tgt[k]
            loss = loss + basic.sequence_cross_entropy(
                logits=logits_k[:-1], targets=tgt_k[0][1:], length=tgt_k[1] - 1)
        optimizer.zero_grad()
        loss.backward()
        clip_grad_norm(model.parameters(), max_norm=10)
        optimizer.step()
        return loss.data[0]

    def ids_to_words(ids):
        words = []
        eos_id = vocab.stoi(vocab.eos)
        for id_ in ids:
            words.append(vocab.itos(id_))
            if id_ == eos_id:
                break
        return words

    def generate_using_decoder(name, src, max_length):
        _, encoder_state = model.encoder(words=src[0], length=src[1])
        if isinstance(encoder_state, tuple):  # LSTM
            encoder_state = encoder_state[0]
        context = (encoder_state.transpose(0, 1).contiguous()
                   .view(-1, args.hidden_dim))
        batch_size = src[1].size(0)
        bos_id = vocab.stoi(vocab.bos)
        bos = Variable(src[1].new(1, batch_size).fill_(bos_id))
        decoder = model.get_decoder(name)
        prev_pred = bos
        done = torch.zeros(batch_size).byte()
        hyps = []
        prev_state = context.unsqueeze(0)
        for t in range(max_length):
            if done.all():
                break
            decoder_input = prev_pred
            logit, prev_state = decoder(words=decoder_input, prev_state=prev_state)
            pred = logit.max(2)[1]
            prev_pred = pred
            hyps.append(pred.data)
        hyps = torch.cat(hyps, dim=0).transpose(0, 1).tolist()
        return hyps

    def generate(batch):
        # Greedy search
        src, tgt = preprocessor(batch)
        src = (variable(src[0]), src[1])
        for k in tgt:
            tgt[k] = (variable(tgt[k][0], volatile=True), tgt[k][1])
        batch_size = src[0].size(1)
        max_length = src[0].size(0) * 2
        generated = {}
        for k in tgt:
            generated[k] = generate_using_decoder(
                name=k, src=src, max_length=max_length)
        results = []
        for i in range(batch_size):
            res = {'src': ' '.join(ids_to_words(src[0][:src[1][i], i].data)),
                   'tgt': {},
                   'out': {}}
            for k in tgt:
                res['tgt'][k] = ' '.join(ids_to_words(tgt[k][0][1:, i].data))
                res['out'][k] = ' '.join(ids_to_words(generated[k][i]))
            results.append(res)
        return results

    def generate_synthetic_batch(real_batch):
        def sort_by_length(tgt_of_key):
            sorted_length, sort_inds = tgt_of_key[1].sort(dim=0, descending=True)
            return tgt_of_key[0][:, sort_inds], sorted_length

        # Forward: given prev, generate cur'
        _, tgt = preprocessor(real_batch)
        tgt_prev, tgt_prev_length = sort_by_length(tgt['prev'])
        syn_src_fw = generate_using_decoder(
            name='next',
            src=(variable(tgt_prev[1:], volatile=True), tgt_prev_length - 1),
            max_length=args.max_length)
        # Backward: given next, generate cur''
        tgt_next, tgt_next_length = sort_by_length(tgt['next'])
        syn_src_bw = generate_using_decoder(
            name='prev',
            src=(variable(tgt_next[1:], volatile=True), tgt_next_length - 1),
            max_length=args.max_length)
        syn_batch_fw = []
        syn_batch_bw = []
        for i in range(len(real_batch)):
            syn_src_fw_str = ' '.join(ids_to_words(syn_src_fw[i]))
            syn_src_bw_str = ' '.join(ids_to_words(syn_src_bw[i]))
            syn_batch_fw.append(
                (real_batch[i][0], syn_src_fw_str, real_batch[i][2]))
            syn_batch_bw.append(
                (real_batch[i][0], syn_src_bw_str, real_batch[i][2]))
        return syn_batch_fw, syn_batch_bw

    global_step = 0

    def print_samples():
        model.eval()
        num_samples = 2
        samples = data_reader.next_batch(size=num_samples, peek=True)
        syn_samples_fw, syn_samples_bw = generate_synthetic_batch(samples)
        gen_results = generate(samples)
        syn_gen_results_fw = generate(syn_samples_fw)
        syn_gen_results_bw = generate(syn_samples_bw)
        text_val = ''
        for i, res in enumerate(gen_results):
            text_val += f'* sample (real) #{i}\n'
            text_val += f'\t* src: {res["src"]}\n'
            for k in res['tgt']:
                tgt_k = res['tgt'][k]
                out_k = res['out'][k]
                text_val += f'\t* {k} (tgt): {tgt_k}\n'
                text_val += f'\t* {k} (out): {out_k}\n'
        for i, res in enumerate(syn_gen_results_fw):
            text_val += f'* sample (syn_fw) #{i}\n'
            text_val += f'\t* src: {res["src"]}\n'
            for k in res['tgt']:
                tgt_k = res['tgt'][k]
                out_k = res['out'][k]
                text_val += f'\t* {k} (tgt): {tgt_k}\n'
                text_val += f'\t* {k} (out): {out_k}\n'
        for i, res in enumerate(syn_gen_results_bw):
            text_val += f'* sample (syn_bw) #{i}\n'
            text_val += f'\t* src: {res["src"]}\n'
            for k in res['tgt']:
                tgt_k = res['tgt'][k]
                out_k = res['out'][k]
                text_val += f'\t* {k} (tgt): {tgt_k}\n'
                text_val += f'\t* {k} (out): {out_k}\n'
        add_text_summary('Sample', value=text_val, step=global_step)

    for epoch in range(args.max_epoch):
        data_reader.start_epoch()
        for batch in tqdm(data_reader.iterator(args.batch_size),
                          desc=f'Epoch {epoch}'):
            # Train on real batch
            real_loss = run_train_iter(batch)
            # Train on synthetic batches
            syn_batch_fw, syn_batch_bw = generate_synthetic_batch(batch)
            syn_loss_fw = run_train_iter(syn_batch_fw)
            syn_loss_bw = run_train_iter(syn_batch_bw)
            global_step += 1
            add_scalar_summary(name='real_loss', value=real_loss, step=global_step)
            add_scalar_summary(name='syn_loss_fw', value=syn_loss_fw, step=global_step)
            add_scalar_summary(name='syn_loss_bw', value=syn_loss_bw, step=global_step)
            if global_step % args.print_every == 0:
                print_samples()
            if global_step % args.save_every == 0:
                model_filename = f'model-{global_step}.pt'
                model_path = os.path.join(args.save_dir, model_filename)
                torch.save(model.state_dict(), model_path)
                print(f'\nIter #{global_step}: '
                      f'Saved checkpoint to {model_path}')
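
# --- Hypothetical CLI wiring sketch (assumed, not from this file): train()
# reads many attributes off `args`; an argparse front end along these lines
# would supply them. Defaults shown are illustrative only.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--vocab', required=True)
parser.add_argument('--vocab-size', type=int, default=20000)
parser.add_argument('--data-dir', required=True)
parser.add_argument('--save-dir', required=True)
parser.add_argument('--rnn-type', default='gru')
parser.add_argument('--word-dim', type=int, default=300)
parser.add_argument('--hidden-dim', type=int, default=1200)
parser.add_argument('--bidirectional', action='store_true')
parser.add_argument('--predict-prev', action='store_true')
parser.add_argument('--predict-cur', action='store_true')
parser.add_argument('--predict-next', action='store_true')
parser.add_argument('--max-length', type=int, default=30)
parser.add_argument('--max-epoch', type=int, default=5)
parser.add_argument('--batch-size', type=int, default=64)
parser.add_argument('--print-every', type=int, default=500)
parser.add_argument('--save-every', type=int, default=5000)
parser.add_argument('--pretrained', default=None)
parser.add_argument('--gpu', type=int, default=-1)
train(parser.parse_args())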
    # (Tail of location_filter: keep only points beyond (300, 300).)
    results = {}
    for color in point_clouds:
        results[color] = []
        for x, y in point_clouds[color]:
            if x > 300 and y > 300:
                results[color].append([x, y])
    return results


if __name__ == "__main__":
    target_case = os.listdir(target_folder)
    failure_count = 0
    cannot_fix = 0
    with tqdm(target_case) as t:
        for case_name in t:
            point_clouds = DataReader.parse_annotation(
                os.path.join(target_folder, case_name, file_name))
            stats = {c: len(point_clouds[c]) for c in point_clouds}
            if not landmark_num_checker(point_clouds):
                print("case '{}' is not legal\n{}".format(
                    case_name, json.dumps(point_clouds)))
                new_point_clouds = location_filter(point_clouds)
                if not landmark_num_checker(new_point_clouds):
                    print("location filtered case '{}' is not legal\n{}".format(
                        case_name, json.dumps(new_point_clouds)))
                    cannot_fix += 1
                failure_count += 1
            t.set_postfix(stats)
    print("Summary")
    print("In folder {}, {} failures found, {} cannot be auto-fixed".format(
        target_folder, failure_count, cannot_fix))
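
# --- Hypothetical helper sketch (assumed, not from this file):
# landmark_num_checker is referenced above but not defined here; a minimal
# version would verify every color carries its expected landmark count.
# EXPECTED_COUNTS is an illustrative placeholder.
EXPECTED_COUNTS = {"red": 4, "green": 4, "blue": 4}

def landmark_num_checker(point_clouds):
    return all(len(point_clouds.get(color, [])) == count
               for color, count in EXPECTED_COUNTS.items())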
def main(unused_argv):
    # prints a message if you've entered flags incorrectly
    if len(unused_argv) != 1:
        raise Exception("Problem with flags: %s" % unused_argv)

    # Get hyperparameters. We only get a subset of all the hyperparameters;
    # the rest are fed to the Model directly.
    # logging.basicConfig(level=logging.INFO)
    print('Starting Basic model')
    log_root = FLAGS.log_root
    exp_name = FLAGS.exp_name
    data_file_path = FLAGS.data_file_path
    pinyin_dict_path = FLAGS.pinyin_dict_path
    id_data_dir = FLAGS.id_data_dir
    n_epoch = FLAGS.n_epoch
    batch_size = FLAGS.batch_size
    seed_num = FLAGS.seed_num
    max_timesteps = FLAGS.max_timesteps
    vocab_size = FLAGS.vocab_size
    train_size = FLAGS.train_size
    load_data_and_dr = FLAGS.load_data_and_dr
    use_local = FLAGS.use_local

    # Make the directory for logs.
    log_root = os.path.join(log_root, exp_name)
    if not os.path.exists(log_root):
        os.makedirs(log_root)

    if use_local == 1:
        # Load or save the DataReader class from the local dir.
        DR_path = os.path.join(log_root, 'DataReader.pkl')
        # Load or save the id data from the local dir.
        id_data_path = os.path.join(log_root, 'id_data.pkl')
    else:
        # Load or save the DataReader class from the global dir.
        DR_path = os.path.join(id_data_dir, 'DataReader.pkl')
        # Load or save the id data from the global dir.
        id_data_path = os.path.join(id_data_dir, 'id_data.pkl')

    if load_data_and_dr == 1:
        with open(DR_path, 'rb') as f:
            DR = pickle.load(f)
        with open(id_data_path, 'rb') as f1:
            input_pinyin_data = pickle.load(f1)
            input_word_data = pickle.load(f1)
            target_data = pickle.load(f1)
    else:
        # Load and make the data for training.
        DR = DataReader(vocab_size=vocab_size,
                        pinyin_dict_path=pinyin_dict_path)
        # input_data, target_data = DR.make_data_from_scratch(file_path=data_file_path, build_dictionary=True)
        input_pinyin_data, input_word_data, target_data = DR.make_data_from_dataframe(
            file_path=data_file_path, build_dictionary=True, max_rows=train_size)
        # Save the DataReader class to the local dir.
        with open(DR_path, 'wb') as f:
            pickle.dump(DR, f)
        # Save the id data to the local dir.
        with open(id_data_path, 'wb') as f1:
            pickle.dump(input_pinyin_data, f1)
            pickle.dump(input_word_data, f1)
            pickle.dump(target_data, f1)

    # Make the batch generator.
    train_data_full = batch_generator_triple_with_length(
        input_pinyin_data, input_word_data, target_data, batch_size,
        max_timesteps, DR.word2id, DR.pinyin2id)

    # Create the model.
    model = SpellChecker(hps=FLAGS)
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    n_iter_per_epoch = len(input_pinyin_data) // (batch_size * 2)
    epoch = 0.0
    print('number of iterations per epoch: {}'.format(n_iter_per_epoch))
    print('start training...')
    for _ in range(n_epoch * 2):
        epoch += 0.5
        avg_loss = 0.0
        print("----- Epoch {}/{} -----".format(epoch, n_epoch))
        for t in tqdm(range(1, n_iter_per_epoch + 1)):
            batch_full = next(train_data_full)
            src_pinyin_list, src_word_list, src_length_list, tgt_list, tgt_length_list = batch_full
            src_pinyin_list = np.asarray(src_pinyin_list, dtype=np.int32)
            src_word_list = np.asarray(src_word_list, dtype=np.int32)
            src_length_list = np.asarray(src_length_list, dtype=np.int32)
            tgt_list = np.asarray(tgt_list, dtype=np.int32)
            keep_ratio = FLAGS.keep_ratio
            # tgt_length_list = np.asarray(tgt_length_list, dtype=np.int32)
            loss = model.train_one_step(src_pinyin_list, src_word_list,
                                        src_length_list, tgt_list, keep_ratio,
                                        sess)
            avg_loss += loss
        avg_loss /= n_iter_per_epoch
        print('the avg_loss is {}'.format(avg_loss))
        if epoch == 1.5:
            print('Build model for serving...')
            model.build_model_for_serving(sess)
            print('Build model serving done!')
col_idx = (1, 2, 3, 4, 5, 6)
target_col = len(col_name) - 1

# ============================================ #
# Data location
wd = os.path.dirname(os.path.abspath(__file__)) + '/'
data_path = wd + 'data/'
output_path = wd + 'output/'

# ============================================ #
# Read data
data_files = os.listdir(data_path)
for i in range(len(data_files)):
    data_files[i] = data_path + data_files[i]

dr = DataReader(data_files, col_idx)
ds = DataScaler()
dp = DataParser()

print('======== Supplying data ============')
dr.read()
print('======== Extracting data ============')

# ============================================ #
# Split data
X = dr.data[:, :target_col]
y = dr.data[:, target_col]
alias = list(np.unique(y))
y = dp.convertTextTarget(y, alias)
# dump_result(output_path + 'accidents.csv', np.array(alias), ['accident'])
print('Accident types: ', alias)
def train(config):
    """Train a model with a config file."""
    logger = logging.getLogger('')
    data_reader = DataReader(config=config)
    model = eval(config.model)(config=config, num_gpus=config.train.num_gpus)
    model.build_train_model(test=config.train.eval_on_dev)

    train_op, loss_op = model.get_train_op(name=None)
    global_saver = tf.train.Saver()

    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.allow_soft_placement = True

    summary_writer = tf.summary.FileWriter(config.model_dir)

    with tf.Session(config=sess_config) as sess:
        # Initialize all variables.
        sess.run(tf.global_variables_initializer())
        # Reload variables from disk.
        if tf.train.latest_checkpoint(config.model_dir):
            available_vars = available_variables(config.model_dir)
            if available_vars:
                saver = tf.train.Saver(var_list=available_vars)
                saver.restore(sess, tf.train.latest_checkpoint(config.model_dir))
                for v in available_vars:
                    logger.info('Reload {} from disk.'.format(v.name))
            else:
                logger.info('Nothing to reload from disk.')
        else:
            logger.info('Nothing to reload from disk.')

        evaluator = Evaluator()
        evaluator.init_from_existed(model, sess, data_reader)

        global dev_bleu, toleration
        dev_bleu = evaluator.evaluate(**config.dev) if config.train.eval_on_dev else 0
        toleration = config.train.toleration

        def train_one_step(batch, loss_op, train_op):
            feed_dict = expand_feed_dict({model.src_pls: batch[0],
                                          model.dst_pls: batch[1]})
            step, lr, loss, _ = sess.run(
                [model.global_step, model.learning_rate, loss_op, train_op],
                feed_dict=feed_dict)
            if step % config.train.summary_freq == 0:
                summary = sess.run(model.summary_op, feed_dict=feed_dict)
                summary_writer.add_summary(summary, global_step=step)
            return step, lr, loss

        def maybe_save_model():
            global dev_bleu, toleration

            def save():
                mp = config.model_dir + '/model_step_{}'.format(step)
                global_saver.save(sess, mp)
                logger.info('Save model in %s.' % mp)

            if config.train.eval_on_dev:
                new_dev_bleu = evaluator.evaluate(**config.dev)
                summary = tf.Summary(value=[
                    tf.Summary.Value(tag="dev_bleu", simple_value=new_dev_bleu)])
                summary_writer.add_summary(summary, step)
                if config.train.toleration is None:
                    save()
                else:
                    if new_dev_bleu >= dev_bleu:
                        save()
                        toleration = config.train.toleration
                        dev_bleu = new_dev_bleu
                    else:
                        toleration -= 1
            else:
                save()

        try:
            step = 0
            for epoch in range(1, config.train.num_epochs + 1):
                for batch in data_reader.get_training_batches(epoches=1):
                    # Train normal instances.
                    start_time = time.time()
                    step, lr, loss = train_one_step(batch, loss_op, train_op)
                    logger.info(
                        'epoch: {0}\tstep: {1}\tlr: {2:.6f}\tloss: {3:.4f}\ttime: {4:.4f}'.
                        format(epoch, step, lr, loss, time.time() - start_time))
                    # Save model
                    if config.train.save_freq > 0 \
                            and step > 0 \
                            and step % config.train.save_freq == 0:
                        maybe_save_model()
                    if config.train.num_steps is not None and step >= config.train.num_steps:
                        raise BreakLoopException("BreakLoop")
                    if toleration is not None and toleration <= 0:
                        raise BreakLoopException("BreakLoop")
                # Save model per epoch if config.train.save_freq is less than or equal to zero.
                if config.train.save_freq <= 0:
                    maybe_save_model()
        except BreakLoopException as e:
            logger.info(e)

    logger.info("Finish training.")
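
# --- Hypothetical config sketch (assumed, not from this file): train() reads
# nested attributes such as config.train.num_gpus; an AttrDict (which appears
# elsewhere in this collection) built along these lines would satisfy it.
# All values are illustrative placeholders.
config = AttrDict({
    'model': 'Transformer',
    'model_dir': './models/base',
    'train': AttrDict({'num_gpus': 2, 'num_epochs': 20, 'num_steps': None,
                       'eval_on_dev': True, 'summary_freq': 100,
                       'save_freq': 5000, 'toleration': 5}),
    'dev': AttrDict({'src_path': 'data/dev.src',
                     'output_path': 'output/dev.hyp',
                     'batch_size': 32}),
})
train(config)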
class Evaluator(object):
    """Evaluate the model."""

    def __init__(self):
        pass

    def init_from_config(self, config):
        logger = logging.getLogger('')
        self.model = eval(config.model)(config, config.test.num_gpus)
        self.model.build_test_model()

        # Print the number of total parameters
        print_num_of_total_parameters()

        sess_config = tf.ConfigProto()
        sess_config.gpu_options.allow_growth = True
        sess_config.allow_soft_placement = True
        self.sess = tf.Session(config=sess_config, graph=self.model.graph)

        # Restore model.
        self.model.saver.restore(self.sess,
                                 tf.train.latest_checkpoint(config.model_dir))
        self.data_reader = DataReader(config)

    def init_from_existed(self, model, sess, data_reader):
        assert model.graph == sess.graph
        self.sess = sess
        self.model = model
        self.data_reader = data_reader

    def beam_search(self, X):
        return self.sess.run(self.model.prediction,
                             feed_dict=expand_feed_dict({self.model.src_pls: X}))

    def beam_search_label(self, X, Y, Z, X_lens):
        return self.sess.run(
            [self.model.prediction, self.model.prediction_label],
            feed_dict=expand_feed_dict({self.model.src_pls: X,
                                        self.model.dst_pls: Y,
                                        self.model.label_pls: Z,
                                        self.model.src_len_pls: X_lens}))

    def loss(self, X, Y):
        return self.sess.run(self.model.loss_sum,
                             feed_dict=expand_feed_dict({self.model.src_pls: X,
                                                         self.model.dst_pls: Y}))

    def loss_label(self, X, Y, Z):
        return self.sess.run(self.model.loss_sum,
                             feed_dict=expand_feed_dict({self.model.src_pls: X,
                                                         self.model.dst_pls: Y,
                                                         self.model.label_pls: Z}))

    def translate(self, src_path, dst_path, lbl_path, output_path,
                  output_label_path, batch_size):
        logging.info('Translate %s.' % src_path)
        _, tmp = mkstemp()
        fd = codecs.open(tmp, 'w', 'utf8')
        _, tmp_label = mkstemp()
        fd_label = codecs.open(tmp_label, 'w', 'utf8')
        count = 0
        token_count = 0
        start = time.time()
        for X, ref, label, src_lens in self.data_reader.get_test_batches_with_target_with_label(
                src_path, dst_path, lbl_path, batch_size):
            Y, Z = self.beam_search_label(X, ref, label, src_lens)
            sents = self.data_reader.indices_to_words(Y, src_lens)
            assert len(X) == len(sents)
            for sent in sents:
                print(sent, file=fd)
            count += len(X)
            token_count += np.sum(np.not_equal(Y, 3))  # 3: </s>
            time_span = time.time() - start
            logging.info(
                '{0} sentences ({1} tokens) processed in {2:.2f} minutes (speed: {3:.4f} sec/token).'.
                format(count, token_count, time_span / 60, time_span / token_count))
            # Save the prediction of label
            sents_label = self.data_reader.indices_to_words(Z, src_lens, o='lbl')
            assert len(X) == len(sents_label)
            for sent in sents_label:
                print(sent, file=fd_label)
        fd.close()

        # Remove BPE markers, if any.
        os.system("sed -r 's/(@@ )|(@@ ?$)//g' %s > %s" % (tmp, output_path))
        os.remove(tmp)
        logging.info('The result file was saved in %s.' % output_path)

        fd_label.close()
        os.system("sed -r 's/(@@ )|(@@ ?$)//g' %s > %s" % (tmp_label, output_label_path))
        os.remove(tmp_label)
        logging.info('The label file was saved in %s.' % output_label_path)

    def ppl(self, src_path, dst_path, batch_size):
        logging.info('Calculate PPL for %s and %s.' % (src_path, dst_path))
        token_count = 0
        loss_sum = 0
        for batch in self.data_reader.get_test_batches_with_target(
                src_path, dst_path, batch_size):
            X, Y = batch
            loss_sum += self.loss(X, Y)
            token_count += np.sum(np.greater(Y, 0))
        # Compute PPL
        ppl = np.exp(loss_sum / token_count)
        logging.info('PPL: %.4f' % ppl)
        return ppl

    def fscore(self, lbl_path, output_label_path):
        logging.info('Calculate P/R/F for %s and %s.'
                     % (lbl_path, output_label_path))
        ref_file = codecs.open(lbl_path, 'r', 'utf8')
        pred_file = codecs.open(output_label_path, 'r', 'utf8')
        # Counts start at 1 to smooth away zero divisions.
        tp, fp, fn = 1, 1, 1
        err = 0
        # assert len(target) == len(prediction)
        line = 0
        for ref, pred in zip(ref_file, pred_file):
            line += 1
            if len(ref) != len(pred):
                # print(line)
                err += 1
                continue
            for x, y in zip(ref, pred):
                if x == y and x == 'E':
                    tp += 1
                elif y == 'E':
                    fp += 1
                elif x == 'E':
                    fn += 1
                else:
                    pass
        print('tp:{}, fp:{}, fn:{}, err:{}'.format(tp, fp, fn, err))
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        fscore = (2 * precision * recall / (precision + recall))
        ref_file.close()
        pred_file.close()
        logging.info('precision: %.4f' % precision)
        logging.info('recall: %.4f' % recall)
        logging.info('fscore: %.4f' % fscore)
        return precision, recall, fscore

    def evaluate(self, batch_size, **kargs):
        """Evaluate the model on dev set."""
        src_path = kargs['src_path']
        dst_path = kargs['ref_path']
        lbl_path = kargs['label_path']
        output_path = kargs['output_path']
        output_label_path = kargs['output_label_path']
        cmd = kargs['cmd'] if 'cmd' in kargs else \
            "perl multi-bleu.perl {ref} < {output} 2>/dev/null | awk '{{print($3)}}' | awk -F, '{{print $1}}'"
        self.translate(src_path, dst_path, lbl_path, output_path,
                       output_label_path, batch_size)
        if 'dst_path' in kargs:
            self.ppl(src_path, kargs['dst_path'], batch_size)
        # Calculate the F-score of the label result.
        if 'label_path' in kargs:
            precision, recall, f_score = self.fscore(lbl_path, output_label_path)
            return float(f_score)
        return None
def main(unused_argv):
    # prints a message if you've entered flags incorrectly
    if len(unused_argv) != 1:
        raise Exception("Problem with flags: %s" % unused_argv)

    # Get hyperparameters. We only get a subset of all the hyperparameters;
    # the rest are fed to the Model directly.
    # logging.basicConfig(level=logging.INFO)
    print('Starting Basic model')
    log_root = FLAGS.log_root
    exp_name = FLAGS.exp_name
    data_file_path = FLAGS.data_file_path
    pinyin_dict_path = FLAGS.pinyin_dict_path
    id_data_dir = FLAGS.id_data_dir
    n_epoch = FLAGS.n_epoch
    batch_size = FLAGS.batch_size
    seed_num = FLAGS.seed_num
    max_timesteps = FLAGS.max_timesteps
    vocab_size = FLAGS.vocab_size
    train_size = FLAGS.train_size
    load_data_and_dr = FLAGS.load_data_and_dr
    use_local = FLAGS.use_local

    # Make the directory for logs.
    log_root = os.path.join(log_root, exp_name)
    if not os.path.exists(log_root):
        os.makedirs(log_root)

    if use_local == 1:
        # Load or save the DataReader class from the local dir.
        DR_path = os.path.join(log_root, 'DataReader.pkl')
        # Load or save the id data from the local dir.
        id_data_path = os.path.join(log_root, 'id_data.pkl')
    else:
        # Load or save the DataReader class from the global dir.
        DR_path = os.path.join(id_data_dir, 'DataReader.pkl')
        # Load or save the id data from the global dir.
        id_data_path = os.path.join(id_data_dir, 'id_data.pkl')

    if load_data_and_dr == 1:
        with open(DR_path, 'rb') as f:
            DR = pickle.load(f)
        with open(id_data_path, 'rb') as f1:
            input_pinyin_data = pickle.load(f1)
            input_word_data = pickle.load(f1)
            target_data = pickle.load(f1)
    else:
        # Load and make the data for training.
        DR = DataReader(vocab_size=vocab_size,
                        pinyin_dict_path=pinyin_dict_path)
        # input_data, target_data = DR.make_data_from_scratch(file_path=data_file_path, build_dictionary=True)
        input_pinyin_data, input_word_data, target_data = DR.make_data_from_dataframe(
            file_path=data_file_path, build_dictionary=True, max_rows=train_size)
        # Save the DataReader class to the local dir.
        with open(DR_path, 'wb') as f:
            pickle.dump(DR, f)
        # Save the id data to the local dir.
        with open(id_data_path, 'wb') as f1:
            pickle.dump(input_pinyin_data, f1)
            pickle.dump(input_word_data, f1)
            pickle.dump(target_data, f1)

    # Make the batch generator.
    train_data_full = batch_generator_triple_with_length(
        input_pinyin_data, input_word_data, target_data, batch_size,
        max_timesteps, DR.word2id, DR.pinyin2id)

    # Create the model.
    model = SpellChecker(hps=FLAGS)

    # Create the supervisor.
    with model.graph.as_default():
        # Print the trainable variables of the TensorFlow graph.
        print("Number of sets of parameters: {}".format(
            len(tf.trainable_variables())))
        print("Number of parameters: {}".format(
            np.sum([np.prod(v.shape.as_list()) for v in tf.trainable_variables()])))
        for v in tf.trainable_variables():
            print(v)
        sv = tf.train.Supervisor(logdir=log_root,
                                 saver=model.saver,
                                 summary_op=None,  # Do not run the summary service.
                                 save_model_secs=60,
                                 global_step=model.global_step,
                                 init_op=model.init_op)

    # Train the model.
    with sv.managed_session() as sess:
        n_iter_per_epoch = len(input_pinyin_data) // (batch_size * 2)
        epoch = 0.0
        print('number of iterations per epoch: {}'.format(n_iter_per_epoch))
        print('start training...')
        for _ in range(n_epoch * 2):
            epoch += 0.5
            avg_loss = 0.0
            print("----- Epoch {}/{} -----".format(epoch, n_epoch))
            for t in tqdm(range(1, n_iter_per_epoch + 1)):
                batch_full = next(train_data_full)
                src_pinyin_list, src_word_list, src_length_list, tgt_list, tgt_length_list = batch_full
                # if epoch == 0.5:
                #     print(src_list[1])
                #     print(len(src_list[1]))
                #     print(src_length_list[1])
                #     print(tgt_list[1])
                #     print(len(tgt_list[1]))
                #     print(tgt_length_list[1])
                src_pinyin_list = np.asarray(src_pinyin_list, dtype=np.int32)
                src_word_list = np.asarray(src_word_list, dtype=np.int32)
                src_length_list = np.asarray(src_length_list, dtype=np.int32)
                tgt_list = np.asarray(tgt_list, dtype=np.int32)
                # tgt_length_list = np.asarray(tgt_length_list, dtype=np.int32)
                loss = model.train_one_step(src_pinyin_list, src_word_list,
                                            src_length_list, tgt_list, sess)
                avg_loss += loss
            avg_loss /= n_iter_per_epoch
            print('the avg_loss is {}'.format(avg_loss))
class Seq2SeqTrainer:
    def __init__(self, model, output_name=None):
        self.model = model
        self.datareader = DataReader()
        self.metrics = ErrorMetrics()
        self.output_name = output_name

    def train(
        self,
        train_data,
        val_data,
        epochs=EPOCHS,
        patience=PATIENCE,
        pretrain=False,
        minibatch_size=1,
    ):
        trainer = dy.SimpleSGDTrainer(self.model.model)
        logging.info("Training data length: %d" % len(train_data))
        logging.info("Validation data length: %d" % len(val_data))
        # Initialized up front so the patience check below is always defined.
        best_val_epoch = 0
        for e in range(epochs):
            start_time = time.time()
            logging.info("Epoch: %d" % e)
            epoch_loss = 0.0
            random.shuffle(train_data)
            for i in range(0, len(train_data), minibatch_size):
                cur_size = min(minibatch_size, len(train_data) - i)
                losses = []
                dy.renew_cg()
                for (src1, src2, tgt) in train_data[i:i + cur_size]:
                    losses.append(self.model.get_loss(src1, src2, tgt))
                batch_loss = dy.esum(losses)
                batch_loss.backward()
                trainer.update()
                epoch_loss += batch_loss.scalar_value()
            logging.info("Epoch loss: %0.4f" % (epoch_loss / len(train_data)))
            if not pretrain:
                cur_cer, cur_wer = self.metrics.get_average_cer(
                    self.model, val_data, output_file=None, write_pgens=False)
                if cur_cer < self.model.best_val_cer:
                    self.model.save()
                    self.model.best_val_cer = cur_cer
                    best_val_epoch = e
                    logging.info("Model saved at epoch: {}".format(best_val_epoch))
                logging.info("VAL CER: %0.4f" % (cur_cer))
                logging.info("VAL WER: %0.4f" % (cur_wer))
                if cur_cer == 0:
                    logging.info("Validation CER is zero. End training.")
                    break
            logging.info("--- %s seconds ---" % (time.time() - start_time))
            logging.info("\n")
            if not pretrain:
                if e - best_val_epoch > patience:
                    logging.info("Patience reached. End training.")
                    break

    def train_model(self, train_src1, train_src2, train_tgt, val_src1,
                    val_src2, val_tgt):
        train_data = self.datareader.read_parallel_data(
            self.model, train_src1, train_src2, train_tgt)
        val_data = self.datareader.read_parallel_data(self.model, val_src1,
                                                      val_src2, val_tgt)
        self.train(train_data, val_data)
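
# --- Hypothetical usage sketch (assumed, not from this file): end-to-end
# training from parallel files; the paths are placeholders.
trainer = Seq2SeqTrainer(model, output_name="results/seq2seq")
trainer.train_model("data/train.src1", "data/train.src2", "data/train.tgt",
                    "data/val.src1", "data/val.src2", "data/val.tgt")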
def train(config):
    """Train a model with a config file."""
    logger = logging.getLogger('')
    data_reader = DataReader(config=config)
    model = eval(config.model)(config=config, num_gpus=config.train.num_gpus)
    model.build_train_model(test=config.train.eval_on_dev)

    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.allow_soft_placement = True

    summary_writer = tf.summary.FileWriter(config.model_dir, graph=model.graph)

    with tf.Session(config=sess_config, graph=model.graph) as sess:
        # Initialize all variables.
        sess.run(tf.global_variables_initializer())
        # Reload variables from disk.
        if tf.train.latest_checkpoint(config.model_dir):
            available_vars = available_variables(config.model_dir)
            if available_vars:
                saver = tf.train.Saver(var_list=available_vars)
                saver.restore(sess, tf.train.latest_checkpoint(config.model_dir))
                for v in available_vars:
                    logger.info('Reload {} from disk.'.format(v.name))
            else:
                logger.info('Nothing to reload from disk.')
        else:
            logger.info('Nothing to reload from disk.')

        evaluator = Evaluator()
        evaluator.init_from_existed(model, sess, data_reader)

        global dev_bleu, toleration
        dev_bleu = evaluator.evaluate(**config.dev) if config.train.eval_on_dev else 0
        toleration = config.train.toleration

        def train_one_step(batch):
            feat_batch, target_batch, batch_size = batch
            feed_dict = expand_feed_dict({
                model.src_pls: feat_batch,
                model.dst_pls: target_batch
            })
            step, lr, loss, _ = sess.run(
                [model.global_step, model.learning_rate, model.loss, model.train_op],
                feed_dict=feed_dict)
            if step % config.train.summary_freq == 0:
                summary = sess.run(model.summary_op, feed_dict=feed_dict)
                summary_writer.add_summary(summary, global_step=step)
            return step, lr, loss

        def maybe_save_model():
            global dev_bleu, toleration
            new_dev_bleu = evaluator.evaluate(**config.dev) \
                if config.train.eval_on_dev else dev_bleu + 1
            if new_dev_bleu >= dev_bleu:
                mp = config.model_dir + '/model_step_{}'.format(step)
                model.saver.save(sess, mp)
                logger.info('Save model in %s.' % mp)
                toleration = config.train.toleration
                dev_bleu = new_dev_bleu
            else:
                toleration -= 1

        step = 0
        for epoch in range(1, config.train.num_epochs + 1):
            for batch in data_reader.get_training_batches_with_buckets():
                # Train normal instances.
                start_time = time.time()
                step, lr, loss = train_one_step(batch)
                logger.info(
                    'epoch: {0}\tstep: {1}\tlr: {2:.6f}\tloss: {3:.4f}\ttime: {4:.4f}\tbatch_size: {5}'
                    .format(epoch, step, lr, loss, time.time() - start_time, batch[2]))
                # Save model
                if config.train.save_freq > 0 and step % config.train.save_freq == 0:
                    maybe_save_model()
                if config.train.num_steps and step >= config.train.num_steps:
                    break
            # Save model per epoch if config.train.save_freq is less than or equal to zero.
            if config.train.save_freq <= 0:
                maybe_save_model()
            # Early stop
            if toleration <= 0:
                break
    logger.info("Finish training.")
def build_data_loader(args, char_dict, intent_dict):
    """Build train/eval/test data generators for the dataloader.

    Arguments:
        args -- parsed command-line arguments
        char_dict -- character vocabulary
        intent_dict -- intent label vocabulary

    Returns:
        dict -- the requested data generators, keyed by split
    """
    loader_res = {}
    if args.do_train:
        train_processor = DataReader(char_dict, intent_dict, args.max_seq_len)
        train_data_generator = train_processor.prepare_data(
            data_path=args.data_dir + "train.txt",
            batch_size=args.batch_size,
            mode='train')
        loader_res["train_data_generator"] = train_data_generator
        num_train_examples = train_processor._get_num_examples()
        logger.info("Num train examples: %d" % num_train_examples)
        logger.info("Num train steps: %d" %
                    (math.ceil(num_train_examples * 1.0 / args.batch_size) *
                     args.epoch // DEV_COUNT))
        if math.ceil(num_train_examples * 1.0 / args.batch_size) // DEV_COUNT <= 0:
            logger.error("Num of train steps is less than 0 or equals to 0, exit")
            exit(1)
    if args.do_eval:
        eval_processor = DataReader(char_dict, intent_dict, args.max_seq_len)
        eval_data_generator = eval_processor.prepare_data(
            data_path=args.data_dir + "eval.txt",
            batch_size=args.batch_size,
            mode='eval')
        loader_res["eval_data_generator"] = eval_data_generator
        num_eval_examples = eval_processor._get_num_examples()
        logger.info("Num eval examples: %d" % num_eval_examples)
    if args.do_test:
        test_processor = DataReader(char_dict, intent_dict, args.max_seq_len)
        test_data_generator = test_processor.prepare_data(
            data_path=args.data_dir + "test.txt",
            batch_size=args.batch_size,
            mode='test')
        loader_res["test_data_generator"] = test_data_generator
    return loader_res
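
# --- Hypothetical usage sketch (assumed, not from this file): pulling the
# generators out of the loader result; keys mirror the flags checked above.
loaders = build_data_loader(args, char_dict, intent_dict)
train_gen = loaders.get("train_data_generator")  # present only if args.do_train
eval_gen = loaders.get("eval_data_generator")    # present only if args.do_eval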
def __init__(self, model, output_name):
    self.model = model
    self.datareader = DataReader()
    self.metrics = ErrorMetrics()
    self.output_name = output_name
            M, C_pc, C_rpc, p, q, loss_fun='square_loss', alpha=0.2)
        H = -np.sum(dP * np.log(dP + 1e-5))
        # print(k, H)
        P = P + dP
        # print(k, np.argmax(dP, axis=1))
        re_ranking[k] = np.argmax(P, axis=1).tolist()
    return re_ranking


if __name__ == '__main__':
    golden_annotations = defaultdict(list)
    for case in golden_cases:
        file = os.path.join(golden_train_dir, case, "annotation.txt")
        annotation = DataReader.parse_annotation(file)
        for k, v in annotation.items():
            golden_annotations[k].append(np.asarray(v))

    t = tqdm(train_cases)
    for case in t:
        raw_annotation = os.path.join(train_dir, case, "annotation.txt")
        annotation = {k: np.asarray(v)
                      for k, v in DataReader.parse_annotation(raw_annotation).items()}
        re_ranking = align_annotation(
            golden_annotations, annotation, align_method='FGW')
        if re_ranking:
            re_annotation = {k: np.asarray([v[i] for i in re_ranking[k]])
                             for k, v in annotation.items()}
            save_refined_annotation(sorted_train_dir, case, re_annotation)
        else:
            fail_list.write(case + "\n")
class Evaluator(object):
    """Evaluate the model."""

    def __init__(self):
        pass

    def init_from_config(self, config):
        self.model = eval(config.model)(config, config.test.num_gpus)
        self.model.build_test_model()

        sess_config = tf.ConfigProto()
        sess_config.gpu_options.allow_growth = True
        sess_config.allow_soft_placement = True
        self.sess = tf.Session(config=sess_config)

        # Restore model; if the latest checkpoint is unreadable, roll back to
        # the previous version and retry.
        try:
            tf.train.Saver().restore(self.sess,
                                     tf.train.latest_checkpoint(config.model_dir))
        except tf.errors.NotFoundError:
            roll_back_to_previous_version(config)
            tf.train.Saver().restore(self.sess,
                                     tf.train.latest_checkpoint(config.model_dir))
        self.data_reader = DataReader(config)

    def init_from_frozen_graphdef(self, config):
        frozen_graph_path = os.path.join(config.model_dir, 'freeze_graph_test.py')
        # If the file doesn't exist, create it.
        if not os.path.exists(frozen_graph_path):
            logging.warning(
                'The frozen graph does not exist; using \'init_from_config\' instead '
                'and creating a frozen graph for next use.')
            self.init_from_config(config)
            saver = tf.train.Saver()
            save_dir = '/tmp/graph-{}'.format(os.getpid())
            os.mkdir(save_dir)
            save_path = '{}/ckpt'.format(save_dir)
            saver.save(sess=self.sess, save_path=save_path)

            with tf.Session(graph=tf.Graph()) as sess:
                clear_devices = True
                output_node_names = ['loss_sum', 'predictions']
                # We import the meta graph into the current default graph.
                saver = tf.train.import_meta_graph(save_path + '.meta',
                                                   clear_devices=clear_devices)
                # We restore the weights.
                saver.restore(sess, save_path)
                # We use a built-in TF helper to export variables to constants.
                output_graph_def = tf.graph_util.convert_variables_to_constants(
                    sess,  # The session is used to retrieve the weights
                    tf.get_default_graph().as_graph_def(),  # The graph_def is used to retrieve the nodes
                    output_node_names  # The output node names are used to select the useful nodes
                )
                # Finally we serialize and dump the output graph to the filesystem.
                with tf.gfile.GFile(frozen_graph_path, "wb") as f:
                    f.write(output_graph_def.SerializeToString())
                logging.info("%d ops in the final graph." % len(output_graph_def.node))
            # Remove temp files.
            os.system('rm -rf ' + save_dir)
        else:
            sess_config = tf.ConfigProto()
            sess_config.gpu_options.allow_growth = True
            sess_config.allow_soft_placement = True
            self.sess = tf.Session(config=sess_config)
            self.data_reader = DataReader(config)

            # We load the protobuf file from disk and parse it to retrieve the
            # unserialized graph_def.
            with tf.gfile.GFile(frozen_graph_path, "rb") as f:
                graph_def = tf.GraphDef()
                graph_def.ParseFromString(f.read())

            # Import the graph_def into the current default graph.
            tf.import_graph_def(graph_def)
            graph = tf.get_default_graph()
            self.model = AttrDict()

            def collect_placeholders(prefix):
                ret = []
                idx = 0
                while True:
                    try:
                        ret.append(graph.get_tensor_by_name(
                            'import/{}_{}:0'.format(prefix, idx)))
                        idx += 1
                    except KeyError:
                        return tuple(ret)

            self.model['src_pls'] = collect_placeholders('src_pl')
            self.model['dst_pls'] = collect_placeholders('dst_pl')
            self.model['predictions'] = graph.get_tensor_by_name('import/predictions:0')

    def init_from_existed(self, model, sess, data_reader):
        self.sess = sess
        self.model = model
        self.data_reader = data_reader

    def beam_search(self, X):
        return self.sess.run(self.model.predictions,
                             feed_dict=expand_feed_dict({self.model.src_pls: X}))

    def loss(self, X, Y):
        return self.sess.run(self.model.loss_sum,
                             feed_dict=expand_feed_dict({
                                 self.model.src_pls: X,
                                 self.model.dst_pls: Y
                             }))

    def translate(self, src_path, output_path, batch_size):
        logging.info('Translate %s.' % src_path)
        _, tmp = mkstemp()
        fd = codecs.open(tmp, 'w', 'utf8')
        count = 0
        token_count = 0
        epsilon = 1e-6
        start = time.time()
        for X in self.data_reader.get_test_batches(src_path, batch_size):
            Y = self.beam_search(X)
            Y = Y[:len(X)]
            sents = self.data_reader.indices_to_words(Y)
            assert len(X) == len(sents)
            for sent in sents:
                print(sent, file=fd)
            count += len(X)
            token_count += np.sum(np.not_equal(Y, 3))  # 3: </s>
        time_span = time.time() - start
        logging.info(
            '{0} sentences ({1} tokens) processed in {2:.2f} minutes (speed: {3:.4f} sec/token).'
            .format(count, token_count, time_span / 60,
                    time_span / (token_count + epsilon)))
        fd.close()

        # Remove BPE markers, if any.
        os.system("sed -r 's/(@@ )|(@@ ?$)//g' %s > %s" % (tmp, output_path))
        os.remove(tmp)
        logging.info('The result file was saved in %s.' % output_path)

    def ppl(self, src_path, dst_path, batch_size):
        logging.info('Calculate PPL for %s and %s.' % (src_path, dst_path))
        token_count = 0
        loss_sum = 0
        for batch in self.data_reader.get_test_batches_with_target(
                src_path, dst_path, batch_size):
            X, Y = batch
            loss_sum += self.loss(X, Y)
            token_count += np.sum(np.greater(Y, 0))
        # Compute PPL
        ppl = np.exp(loss_sum / token_count)
        logging.info('PPL: %.4f' % ppl)
        return ppl

    def evaluate(self, batch_size, **kargs):
        """Evaluate the model on dev set."""
        src_path = kargs['src_path']
        output_path = kargs['output_path']
        cmd = kargs['cmd'] if 'cmd' in kargs else \
            "perl multi-bleu.perl {ref} < {output} 2>/dev/null | awk '{{print($3)}}' | awk -F, '{{print $1}}'"
        cmd = cmd.strip()
        logging.info('Evaluation command: ' + cmd)
        self.translate(src_path, output_path, batch_size)
        bleu = None
        if 'ref_path' in kargs:
            ref_path = kargs['ref_path']
            try:
                bleu = commands.getoutput(
                    cmd.format(**{'ref': ref_path, 'output': output_path}))
                bleu = float(bleu)
            except ValueError as e:
                logging.warning(
                    'An error was raised when calculating BLEU: {}'.format(e))
                bleu = 0
            logging.info('BLEU: {}'.format(bleu))
        if 'dst_path' in kargs:
            self.ppl(src_path, kargs['dst_path'], batch_size)
        return bleu
def train(self, model, save=False, make_chart=False):
    """
    Trains an input model. Makes calculations, charts, and saves the model
    if necessary.

    Parameters
    ----------
    model: SKLearn Model
        The regression model to use
    save: Boolean
        Whether or not the model should be saved
    make_chart: Boolean
        Whether or not to make/save a chart

    Returns
    -------
    float, float, float:
        The Average CV Mean Squared Error, Mean Absolute Error, and Test MSE
    """
    # Get/split data.
    reader = DataReader()
    df = reader.create_input_data()
    df = self.preprocess(df)
    self.X_train, self.X_test, self.y_train, self.y_test = self.split_data(df)

    # Note: the passed-in `model` is not used; a grid-searched random forest
    # is built in its place.
    parameters = {
        'n_estimators': [1, 5, 10, 20, 30],
        'max_depth': [1, 5, 10]
    }
    rf = RandomForestRegressor()
    self.model = GridSearchCV(rf, parameters, cv=10)

    # Train model.
    self.model.fit(self.X_train, self.y_train)

    # Feature importance.
    importances = self.model.best_estimator_.feature_importances_
    cols = self.X_train.columns
    for i in range(len(importances)):
        print(cols[i], importances[i])

    if save:
        joblib.dump(self.model.best_estimator_,
                    "../models/" + self.name + "_2017.joblib")

    print("------------------------")
    MSEs = cross_val_score(estimator=self.model,
                           X=self.X_train,
                           y=self.y_train,
                           scoring='neg_mean_squared_error',
                           cv=8)
    predicted = self.model.predict(self.X_test)
    print("Average CV Mean Squared Error: ", abs(np.mean(MSEs)))
    print("Testing Mean Absolute Error: ",
          mean_absolute_error(self.y_test, self.model.predict(self.X_test)))
    print("Testing MSE: ", mean_squared_error(self.y_test, predicted))
    # print(self.model.feature_importances_)

    if make_chart:
        print("Generating Chart...")
        plt.style.use('dark_background')
        fig, ax = plt.subplots(nrows=1, ncols=1)
        ax.set_ylabel('HDI')
        ax.set_xlabel("Municipality Codmun ID")
        ax.set_title(self.name + ' Real vs Predicted')
        green, = ax.plot(np.arange(20), self.y_test[0:100:5], 'g', label='True')
        red, = ax.plot(np.arange(20), predicted[0:100:5], 'r', label='Predicted')
        ax.set_xticks(np.arange(20))
        x_labels = self.X_test.iloc[0:100:5]['codmun'].tolist()
        ax.set_xticklabels([str(int(y)) for y in x_labels], rotation='vertical')
        plt.legend(handles=[green, red], labels=["True", "Predicted"])
        plt.tight_layout()
        fig.savefig(self.name + "_real_v_predicted")
        for x in range(0, 100, 5):
            print(predicted[x], x_labels[int(x / 5)])
        print(x_labels, predicted[0:100:5])

    return (np.mean(MSEs),
            mean_absolute_error(self.y_test, self.model.predict(self.X_test)),
            mean_squared_error(self.y_test, predicted))
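
# --- Hypothetical usage sketch (assumed, not from this file): the enclosing
# class is unknown, so `HDIModel` is a placeholder name; `model=None` reflects
# that the method builds its own grid-searched regressor.
predictor = HDIModel(name="brazil_hdi")
cv_mse, test_mae, test_mse = predictor.train(model=None, save=True, make_chart=True)
print(cv_mse, test_mae, test_mse)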