def home():
    """Run every DataReader parse stage in order, then return the
    accumulated result as a JSON response."""
    reader = DataReader()
    # Each stage populates reader.final_data; order matches the original flow.
    for stage in (reader.parse_country, reader.parse_city,
                  reader.parse_features, reader.parse_prediction):
        stage()
    return jsonify(reader.final_data)
def build_data_loader(args, char_dict, intent_dict):
    """Construct batch generators for each enabled dataset split.

    Arguments:
        args: run configuration (data_dir, batch_size, max_seq_len, epoch,
            do_train / do_eval / do_test flags).
        char_dict: character vocabulary.
        intent_dict: intent-label vocabulary.

    Returns:
        dict with a "<split>_data_generator" entry per enabled split.
    """
    loader_res = {}

    if args.do_train:
        train_processor = DataReader(char_dict, intent_dict, args.max_seq_len)
        # NOTE(review): assumes args.data_dir ends with a path separator.
        loader_res["train_data_generator"] = train_processor.prepare_data(
            data_path=args.data_dir + "train.txt",
            batch_size=args.batch_size,
            mode='train')
        num_train_examples = train_processor._get_num_examples()
        # Hoisted: ceil(examples / batch_size) is used twice below.
        steps_per_epoch = math.ceil(num_train_examples * 1.0 / args.batch_size)
        logger.info("Num train examples: %d" % num_train_examples)
        logger.info("Num train steps: %d" %
                    (steps_per_epoch * args.epoch // DEV_COUNT))
        if steps_per_epoch // DEV_COUNT <= 0:
            logger.error("Num of train steps is less than 0 or equals to 0, exit")
            exit(1)

    if args.do_eval:
        eval_processor = DataReader(char_dict, intent_dict, args.max_seq_len)
        loader_res["eval_data_generator"] = eval_processor.prepare_data(
            data_path=args.data_dir + "eval.txt",
            batch_size=args.batch_size,
            mode='eval')
        logger.info("Num eval examples: %d" % eval_processor._get_num_examples())

    if args.do_test:
        test_processor = DataReader(char_dict, intent_dict, args.max_seq_len)
        loader_res["test_data_generator"] = test_processor.prepare_data(
            data_path=args.data_dir + "test.txt",
            batch_size=args.batch_size,
            mode='test')

    return loader_res
def pretrain_model(self, src1_path, src2_path, tgt_path, epochs):
    """Load parallel data from the three paths and run a pre-training pass
    (no validation data) on the seq2seq trainer."""
    reader = DataReader()
    parallel_data = reader.read_parallel_data(self.model, src1_path,
                                              src2_path, tgt_path)
    self.seq2seq_trainer.train(
        train_data=parallel_data,
        val_data=[],
        epochs=epochs,
        pretrain=True,
    )
def init_from_config(self, config):
    """Build the Transformer in test mode, open a GPU-friendly session, and
    restore the latest checkpoint from config.train.logdir."""
    # self.model = Model(config)
    self.model = Transformer(config, config.test.devices)
    self.model.build_test_model()

    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.allow_soft_placement = True
    self.sess = tf.Session(config=sess_config, graph=self.model.graph)

    # Restore the most recent checkpoint into the session.
    latest_ckpt = tf.train.latest_checkpoint(config.train.logdir)
    self.model.saver.restore(self.sess, latest_ckpt)

    self.data_reader = DataReader(config)
def init_from_config(self, config):
    """Instantiate the model class named by config.model in test mode,
    attach a session (optionally wrapped for the TF CLI debugger), and
    restore the newest checkpoint from config.model_dir."""
    # NOTE(review): eval() executes arbitrary text from the config —
    # only trusted configs should reach this code.
    model_cls = eval(config.model)
    self.model = model_cls(config, config.test.num_gpus)
    self.model.build_test_model()

    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.allow_soft_placement = True
    self.sess = tf.Session(config=sess_config, graph=self.model.graph)
    if is_debug:
        # Step the session interactively from the TF CLI debugger.
        self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)

    # Restore model.
    latest_ckpt = tf.train.latest_checkpoint(config.model_dir)
    self.model.saver.restore(self.sess, latest_ckpt)
    self.data_reader = DataReader(config)
def init_from_config(self, config):
    """Build the configured model in test mode, report its parameter count,
    and restore the newest checkpoint from config.model_dir."""
    logger = logging.getLogger('')
    # NOTE(review): eval() executes arbitrary text from the config —
    # only trusted configs should reach this code.
    self.model = eval(config.model)(config, config.test.num_gpus)
    self.model.build_test_model()

    # Print the number of total parameters
    print_num_of_total_parameters()

    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.allow_soft_placement = True
    self.sess = tf.Session(config=sess_config, graph=self.model.graph)

    # Restore model.
    latest_ckpt = tf.train.latest_checkpoint(config.model_dir)
    self.model.saver.restore(self.sess, latest_ckpt)
    self.data_reader = DataReader(config)
def init_from_config(self, config):
    """Build the configured model in test mode and restore its weights,
    rolling back to the previous checkpoint version once if the latest
    checkpoint cannot be found."""
    self.model = eval(config.model)(config, config.test.num_gpus)
    self.model.build_test_model()

    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.allow_soft_placement = True
    self.sess = tf.Session(config=sess_config)

    def _restore():
        # Always restore from the newest checkpoint in model_dir.
        tf.train.Saver().restore(
            self.sess, tf.train.latest_checkpoint(config.model_dir))

    # Restore model; on a missing-checkpoint error, roll back and retry once.
    try:
        _restore()
    except tf.errors.NotFoundError:
        roll_back_to_previous_version(config)
        _restore()

    self.data_reader = DataReader(config)
def __init__(self, options):
    """Set up the redis-backed vocabulary tables, data reader, and model."""
    log.info("Starting AG Chatter Bot.")
    self.options = options

    # Two redis databases on the same host: db 0 maps index -> word,
    # db 1 maps word -> index.
    self.idx2word = Database(host=options.redis_host,
                             pass_=options.redis_pass, db=0)
    self.word2idx = Database(host=options.redis_host,
                             pass_=options.redis_pass, db=1)

    self.dataReader = DataReader(self.options, self.idx2word, self.word2idx)
    self.model = Model(self.options)

    log.debug(options)
    log.info("Init complete.")
def train(config):
    """Train a model with a config file.

    Builds the model named by config.model, restores any compatible
    variables from config.model_dir, then runs the epoch/step loop with
    periodic checkpointing gated on dev BLEU.
    """
    logger = logging.getLogger('')
    # NOTE(review): eval() executes config.model as code — configs must be trusted.
    data_reader = DataReader(config=config)
    model = eval(config.model)(config=config, num_gpus=config.train.num_gpus)
    model.build_train_model(test=config.train.eval_on_dev)

    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.allow_soft_placement = True

    summary_writer = tf.summary.FileWriter(config.model_dir, graph=model.graph)

    with tf.Session(config=sess_config, graph=model.graph) as sess:
        # Initialize all variables.
        sess.run(tf.global_variables_initializer())
        # Reload variables in disk. Only variables that still exist in the
        # checkpoint (per available_variables) are restored.
        if tf.train.latest_checkpoint(config.model_dir):
            available_vars = available_variables(config.model_dir)
            if available_vars:
                saver = tf.train.Saver(var_list=available_vars)
                saver.restore(sess, tf.train.latest_checkpoint(config.model_dir))
                for v in available_vars:
                    logger.info('Reload {} from disk.'.format(v.name))
            else:
                logger.info('Nothing to be reload from disk.')
        else:
            logger.info('Nothing to be reload from disk.')

        evaluator = Evaluator()
        evaluator.init_from_existed(model, sess, data_reader)

        # dev_bleu / toleration are module-level globals shared with
        # maybe_save_model's early-stopping bookkeeping.
        global dev_bleu, toleration
        dev_bleu = evaluator.evaluate(
            **config.dev) if config.train.eval_on_dev else 0
        toleration = config.train.toleration

        def train_one_step(batch):
            # batch_size is unpacked but unused here; batch[2] is logged by
            # the caller instead.
            feat_batch, target_batch, batch_size = batch
            feed_dict = expand_feed_dict({
                model.src_pls: feat_batch,
                model.dst_pls: target_batch
            })
            step, lr, loss, _ = sess.run([
                model.global_step, model.learning_rate, model.loss,
                model.train_op
            ], feed_dict=feed_dict)
            # Emit a summary every summary_freq steps.
            if step % config.train.summary_freq == 0:
                summary = sess.run(model.summary_op, feed_dict=feed_dict)
                summary_writer.add_summary(summary, global_step=step)
            return step, lr, loss

        def maybe_save_model():
            # Save only when dev BLEU did not degrade; otherwise burn one
            # unit of toleration (early-stopping budget).
            global dev_bleu, toleration
            new_dev_bleu = evaluator.evaluate(
                **config.dev) if config.train.eval_on_dev else dev_bleu + 1
            if new_dev_bleu >= dev_bleu:
                mp = config.model_dir + '/model_step_{}'.format(step)
                model.saver.save(sess, mp)
                logger.info('Save model in %s.' % mp)
                toleration = config.train.toleration
                dev_bleu = new_dev_bleu
            else:
                toleration -= 1

        step = 0
        for epoch in range(1, config.train.num_epochs + 1):
            for batch in data_reader.get_training_batches_with_buckets():
                # Train normal instances.
                start_time = time.time()
                step, lr, loss = train_one_step(batch)
                logger.info(
                    'epoch: {0}\tstep: {1}\tlr: {2:.6f}\tloss: {3:.4f}\ttime: {4:.4f}\tbatch_size: {5}'
                    .format(epoch, step, lr, loss,
                            time.time() - start_time, batch[2]))
                # Save model
                if config.train.save_freq > 0 and step % config.train.save_freq == 0:
                    maybe_save_model()
                if config.train.num_steps and step >= config.train.num_steps:
                    break
            # Save model per epoch if config.train.save_freq is less or equal than zero
            if config.train.save_freq <= 0:
                maybe_save_model()
            # Early stop
            if toleration <= 0:
                break
        logger.info("Finish training.")
def main(unused_argv):
    """Entry point: build (or reload) the DataReader and id data, then train
    the SpellChecker model under a tf.train.Supervisor."""
    # prints a message if you've entered flags incorrectly
    if len(unused_argv) != 1:
        raise Exception("Problem with flags: %s" % unused_argv)

    # Get hyperparameters. We only get a subset of all the hyperparameters,
    # others would be feed to Model directly.
    #logging.basicConfig(level=logging.INFO)
    print('Starting Basic model')
    log_root = FLAGS.log_root
    exp_name = FLAGS.exp_name
    data_file_path = FLAGS.data_file_path
    pinyin_dict_path = FLAGS.pinyin_dict_path
    id_data_dir = FLAGS.id_data_dir
    n_epoch = FLAGS.n_epoch
    batch_size = FLAGS.batch_size
    seed_num = FLAGS.seed_num
    max_timesteps = FLAGS.max_timesteps
    vocab_size = FLAGS.vocab_size
    train_size = FLAGS.train_size
    load_data_and_dr = FLAGS.load_data_and_dr
    use_local = FLAGS.use_local

    # make the directory for logs
    log_root = os.path.join(log_root, exp_name)
    if not os.path.exists(log_root):
        os.makedirs(log_root)

    if use_local == 1:
        # load or save the DR class from local dir
        DR_path = os.path.join(log_root, 'DataReader.pkl')
        # load or save the id data from local dir
        id_data_path = os.path.join(log_root, 'id_data.pkl')
    else:
        # load or save the DR class from global dir
        DR_path = os.path.join(id_data_dir, 'DataReader.pkl')
        # load or save the id data from global dir
        id_data_path = os.path.join(id_data_dir, 'id_data.pkl')

    if load_data_and_dr == 1:
        # NOTE(review): pickle.load on these files assumes they were written
        # by the else-branch below; do not point at untrusted files.
        with open(DR_path, 'rb') as f:
            DR = pickle.load(f)
        # The three datasets were dumped sequentially into one file, so they
        # are read back in the same order.
        with open(id_data_path, 'rb') as f1:
            input_pinyin_data = pickle.load(f1)
            input_word_data = pickle.load(f1)
            target_data = pickle.load(f1)
    else:
        # load and make the data for training
        DR = DataReader(vocab_size=vocab_size, pinyin_dict_path=pinyin_dict_path)
        #input_data,target_data = DR.make_data_from_scratch(file_path = data_file_path,build_dictionary=True)
        input_pinyin_data, input_word_data, target_data = DR.make_data_from_dataframe(
            file_path=data_file_path, build_dictionary=True, max_rows=train_size)
        # save the DR class to local dir
        with open(DR_path, 'wb') as f:
            pickle.dump(DR, f)
        # save the ids data to local dir (three sequential dumps, read back
        # in the same order above)
        with open(id_data_path, 'wb') as f1:
            pickle.dump(input_pinyin_data, f1)
            pickle.dump(input_word_data, f1)
            pickle.dump(target_data, f1)

    # make the batch
    train_data_full = batch_generator_triple_with_length(
        input_pinyin_data, input_word_data, target_data, batch_size,
        max_timesteps, DR.word2id, DR.pinyin2id)

    # create the model
    model = SpellChecker(hps=FLAGS)

    # create the supervisor
    with model.graph.as_default():
        # print the variables of tensorflow
        print("Number of sets of parameters: {}".format(
            len(tf.trainable_variables())))
        print("Number of parameters: {}".format(
            np.sum([np.prod(v.shape.as_list())
                    for v in tf.trainable_variables()])))
        for v in tf.trainable_variables():
            print(v)
        sv = tf.train.Supervisor(
            logdir=log_root,
            saver=model.saver,
            summary_op=None,  # Do not run the summary service
            save_model_secs=60,
            global_step=model.global_step,
            init_op=model.init_op)

    # train the model
    with sv.managed_session() as sess:
        # Halved batches per epoch, with the outer loop doubled and epoch
        # advancing by 0.5 — effectively n_epoch full passes split in two.
        n_iter_per_epoch = len(input_pinyin_data) // (batch_size * 2)
        epoch = 0.0
        print('number of iterations per epoch: {}'.format(n_iter_per_epoch))
        print('start training...')
        for _ in range(n_epoch * 2):
            epoch += 0.5
            avg_loss = 0.0
            print("----- Epoch {}/{} -----".format(epoch, n_epoch))
            for t in tqdm(range(1, n_iter_per_epoch + 1)):
                batch_full = next(train_data_full)
                src_pinyin_list, src_word_list, src_length_list, tgt_list, tgt_length_list = batch_full
                #if epoch == 0.5:
                #print(src_list[1])
                #print(len(src_list[1]))
                #print(src_length_list[1])
                #print(tgt_list[1])
                #print(len(tgt_list[1]))
                #print(tgt_length_list[1])
                src_pinyin_list = np.asarray(src_pinyin_list, dtype=np.int32)
                src_word_list = np.asarray(src_word_list, dtype=np.int32)
                src_length_list = np.asarray(src_length_list, dtype=np.int32)
                tgt_list = np.asarray(tgt_list, dtype=np.int32)
                #tgt_length_list = np.asarray(tgt_length_list,dtype = np.int32)
                loss = model.train_one_step(src_pinyin_list, src_word_list,
                                            src_length_list, tgt_list, sess)
                avg_loss += loss
            avg_loss /= n_iter_per_epoch
            print('the avg_loss is {}'.format(avg_loss))
# Script setup: CLI args, model graph, session, summaries, data reader, and
# checkpoint restore.
# NOTE(review): batch_size, train_dataset_dir and inference() come from
# earlier in the file (outside this excerpt) — confirm availability.
model_path = sys.argv[5]
logfile = sys.argv[6]
ckpt_file = "ckpt"
x, y_true, y, gap_w, conv3_pool, train_step, accuracy, saver = \
    inference(batch_size)
sess = tf.InteractiveSession()
# Setup summary
merged = tf.summary.merge_all()
writer = tf.summary.FileWriter(logfile, sess.graph)
# Get data reader
data_reader = DataReader(train_dataset_dir, batch_size=batch_size,
                         file_names=False)
# Expose tensors so a later graph reload can fetch them by collection name.
tf.add_to_collection('x', x)
tf.add_to_collection('y', y)
tf.add_to_collection('gap_w', gap_w)
tf.add_to_collection('conv3', conv3_pool)
# BUGFIX: initialize variables BEFORE restoring. The original ran
# global_variables_initializer() after saver.restore(), which reset the
# restored weights back to their initial values.
sess.run(tf.global_variables_initializer())
ckpt = tf.train.latest_checkpoint(model_path)
if ckpt:
    saver.restore(sess, ckpt)
    print("Model loaded from file: %s" % ckpt)
def train(config):
    """Train a model with a config file.

    Like the bucketed variant elsewhere in this file, but uses
    get_training_batches(epoches=1), a separate global saver, a dev-BLEU
    summary, and a BreakLoopException to exit nested loops.
    """
    logger = logging.getLogger('')
    # NOTE(review): eval() executes config.model as code — configs must be trusted.
    data_reader = DataReader(config=config)
    model = eval(config.model)(config=config, num_gpus=config.train.num_gpus)
    model.build_train_model(test=config.train.eval_on_dev)
    train_op, loss_op = model.get_train_op(name=None)
    global_saver = tf.train.Saver()

    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.allow_soft_placement = True

    summary_writer = tf.summary.FileWriter(config.model_dir)

    with tf.Session(config=sess_config) as sess:
        # Initialize all variables.
        sess.run(tf.global_variables_initializer())
        # Reload variables from disk. Only checkpoint-compatible variables
        # (per available_variables) are restored.
        if tf.train.latest_checkpoint(config.model_dir):
            available_vars = available_variables(config.model_dir)
            if available_vars:
                saver = tf.train.Saver(var_list=available_vars)
                saver.restore(sess, tf.train.latest_checkpoint(config.model_dir))
                for v in available_vars:
                    logger.info('Reload {} from disk.'.format(v.name))
            else:
                logger.info('Nothing to be reload from disk.')
        else:
            logger.info('Nothing to be reload from disk.')

        evaluator = Evaluator()
        evaluator.init_from_existed(model, sess, data_reader)

        # dev_bleu / toleration are module-level globals shared with
        # maybe_save_model's early-stopping bookkeeping.
        global dev_bleu, toleration
        dev_bleu = evaluator.evaluate(**config.dev) if config.train.eval_on_dev else 0
        toleration = config.train.toleration

        def train_one_step(batch, loss_op, train_op):
            feed_dict = expand_feed_dict({model.src_pls: batch[0],
                                          model.dst_pls: batch[1]})
            step, lr, loss, _ = sess.run(
                [model.global_step, model.learning_rate, loss_op, train_op],
                feed_dict=feed_dict)
            # Emit a summary every summary_freq steps.
            if step % config.train.summary_freq == 0:
                summary = sess.run(model.summary_op, feed_dict=feed_dict)
                summary_writer.add_summary(summary, global_step=step)
            return step, lr, loss

        def maybe_save_model():
            global dev_bleu, toleration

            def save():
                mp = config.model_dir + '/model_step_{}'.format(step)
                global_saver.save(sess, mp)
                logger.info('Save model in %s.' % mp)

            if config.train.eval_on_dev:
                new_dev_bleu = evaluator.evaluate(**config.dev)
                # Record dev BLEU so it shows up in TensorBoard.
                summary = tf.Summary(value=[tf.Summary.Value(
                    tag="dev_bleu", simple_value=new_dev_bleu)])
                summary_writer.add_summary(summary, step)
                if config.train.toleration is None:
                    # No early stopping configured: always save.
                    save()
                else:
                    # Save only on non-degrading BLEU; otherwise spend one
                    # unit of toleration.
                    if new_dev_bleu >= dev_bleu:
                        save()
                        toleration = config.train.toleration
                        dev_bleu = new_dev_bleu
                    else:
                        toleration -= 1
            else:
                save()

        try:
            step = 0
            for epoch in range(1, config.train.num_epochs + 1):
                for batch in data_reader.get_training_batches(epoches=1):
                    # Train normal instances.
                    start_time = time.time()
                    step, lr, loss = train_one_step(batch, loss_op, train_op)
                    logger.info(
                        'epoch: {0}\tstep: {1}\tlr: {2:.6f}\tloss: {3:.4f}\ttime: {4:.4f}'.
                        format(epoch, step, lr, loss, time.time() - start_time))

                    # Save model
                    if config.train.save_freq > 0 \
                            and step > 0 \
                            and step % config.train.save_freq == 0:
                        maybe_save_model()

                    # BreakLoopException exits both loops at once.
                    if config.train.num_steps is not None and step >= config.train.num_steps:
                        raise BreakLoopException("BreakLoop")

                    if toleration is not None and toleration <= 0:
                        raise BreakLoopException("BreakLoop")

                # Save model per epoch if config.train.save_freq is less or equal than zero
                if config.train.save_freq <= 0:
                    maybe_save_model()
        except BreakLoopException as e:
            logger.info(e)

        logger.info("Finish training.")
# Script section: locate the data directory, read every file in it through
# DataReader, and split the matrix into features X and text target y.
# NOTE(review): col_name is defined earlier in the file (outside this
# excerpt); the last column is assumed to be the target.
col_idx = (1, 2, 3, 4, 5, 6)
target_col = len(col_name) - 1
# ============================================ #
# Data location
wd = os.path.dirname(os.path.abspath(__file__)) + '/'
data_path = wd + 'data/'
output_path = wd + 'output/'
# ============================================ #
# Read data
data_files = os.listdir(data_path)
for i in range(len(data_files)):
    # Turn bare file names into full paths.
    data_files[i] = data_path + data_files[i]
dr = DataReader(data_files, col_idx)
ds = DataScaler()
dp = DataParser()
print('======== Supplying data ============')
dr.read()
print('======== Extracting data ============')
# ============================================ #
# Split data
X = dr.data[:, :target_col]
y = dr.data[:, target_col]
# Encode the text target as indices into the sorted unique labels.
alias = list(np.unique(y))
y = dp.convertTextTarget(y, alias)
#dump_result(output_path + 'accidents.csv', np.array(alias), ['accident'])
print('Accident types: ', alias)
# Script setup: CLI args, model graph, session, summaries, data reader
# (with 224x224 resizing), and checkpoint restore.
# NOTE(review): batch_size, train_dataset_dir, model_path and inference()
# come from earlier in the file (outside this excerpt) — confirm availability.
logfile = sys.argv[6]
ckpt_file = "ckpt"
x, y_true, y, gap_w, conv3_pool, train_step, accuracy, saver = \
    inference(batch_size)
sess = tf.InteractiveSession()
# Setup summary
merged = tf.summary.merge_all()
writer = tf.summary.FileWriter(logfile, sess.graph)
# Get data reader
data_reader = DataReader(train_dataset_dir, batch_size=batch_size,
                         file_names=False, resize_to=(224, 224))
# Expose tensors so a later graph reload can fetch them by collection name.
tf.add_to_collection('x', x)
tf.add_to_collection('y', y)
tf.add_to_collection('gap_w', gap_w)
tf.add_to_collection('conv3', conv3_pool)
# BUGFIX: initialize variables BEFORE restoring. The original ran
# global_variables_initializer() after saver.restore(), which reset the
# restored weights back to their initial values.
sess.run(tf.global_variables_initializer())
ckpt = tf.train.latest_checkpoint(model_path)
if ckpt:
    saver.restore(sess, ckpt)
    print("Model loaded from file: %s" % ckpt)
def main(unused_argv):
    """Entry point: build (or reload) the DataReader and id data, train the
    SpellChecker in a plain tf.Session, and export a serving model once
    epoch 1.5 is reached."""
    # prints a message if you've entered flags incorrectly
    if len(unused_argv) != 1:
        raise Exception("Problem with flags: %s" % unused_argv)

    # Get hyperparameters. We only get a subset of all the hyperparameters,
    # others would be feed to Model directly.
    #logging.basicConfig(level=logging.INFO)
    print('Starting Basic model')
    log_root = FLAGS.log_root
    exp_name = FLAGS.exp_name
    data_file_path = FLAGS.data_file_path
    pinyin_dict_path = FLAGS.pinyin_dict_path
    id_data_dir = FLAGS.id_data_dir
    n_epoch = FLAGS.n_epoch
    batch_size = FLAGS.batch_size
    seed_num = FLAGS.seed_num
    max_timesteps = FLAGS.max_timesteps
    vocab_size = FLAGS.vocab_size
    train_size = FLAGS.train_size
    load_data_and_dr = FLAGS.load_data_and_dr
    use_local = FLAGS.use_local

    # make the directory for logs
    log_root = os.path.join(log_root, exp_name)
    if not os.path.exists(log_root):
        os.makedirs(log_root)

    if use_local == 1:
        # load or save the DR class from local dir
        DR_path = os.path.join(log_root, 'DataReader.pkl')
        # load or save the id data from local dir
        id_data_path = os.path.join(log_root, 'id_data.pkl')
    else:
        # load or save the DR class from global dir
        DR_path = os.path.join(id_data_dir, 'DataReader.pkl')
        # load or save the id data from global dir
        id_data_path = os.path.join(id_data_dir, 'id_data.pkl')

    if load_data_and_dr == 1:
        # NOTE(review): pickle.load assumes these files were written by the
        # else-branch below; do not point at untrusted files.
        with open(DR_path, 'rb') as f:
            DR = pickle.load(f)
        # Three sequential dumps are read back in the same order.
        with open(id_data_path, 'rb') as f1:
            input_pinyin_data = pickle.load(f1)
            input_word_data = pickle.load(f1)
            target_data = pickle.load(f1)
    else:
        # load and make the data for training
        DR = DataReader(vocab_size=vocab_size, pinyin_dict_path=pinyin_dict_path)
        #input_data,target_data = DR.make_data_from_scratch(file_path = data_file_path,build_dictionary=True)
        input_pinyin_data, input_word_data, target_data = DR.make_data_from_dataframe(
            file_path=data_file_path, build_dictionary=True, max_rows=train_size)
        # save the DR class to local dir
        with open(DR_path, 'wb') as f:
            pickle.dump(DR, f)
        # save the ids data to local dir (three sequential dumps)
        with open(id_data_path, 'wb') as f1:
            pickle.dump(input_pinyin_data, f1)
            pickle.dump(input_word_data, f1)
            pickle.dump(target_data, f1)

    # make the batch
    train_data_full = batch_generator_triple_with_length(
        input_pinyin_data, input_word_data, target_data, batch_size,
        max_timesteps, DR.word2id, DR.pinyin2id)

    # create the model
    model = SpellChecker(hps=FLAGS)
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # Halved batches per epoch with the outer loop doubled and epoch
    # advancing by 0.5 — effectively n_epoch full passes split in two.
    n_iter_per_epoch = len(input_pinyin_data) // (batch_size * 2)
    epoch = 0.0
    print('number of iterations per epoch: {}'.format(n_iter_per_epoch))
    print('start training...')
    for _ in range(n_epoch * 2):
        epoch += 0.5
        avg_loss = 0.0
        print("----- Epoch {}/{} -----".format(epoch, n_epoch))
        for t in tqdm(range(1, n_iter_per_epoch + 1)):
            batch_full = next(train_data_full)
            src_pinyin_list, src_word_list, src_length_list, tgt_list, tgt_length_list = batch_full
            src_pinyin_list = np.asarray(src_pinyin_list, dtype=np.int32)
            src_word_list = np.asarray(src_word_list, dtype=np.int32)
            src_length_list = np.asarray(src_length_list, dtype=np.int32)
            tgt_list = np.asarray(tgt_list, dtype=np.int32)
            keep_ratio = FLAGS.keep_ratio
            #tgt_length_list = np.asarray(tgt_length_list,dtype = np.int32)
            loss = model.train_one_step(src_pinyin_list, src_word_list,
                                        src_length_list, tgt_list,
                                        keep_ratio, sess)
            avg_loss += loss
        avg_loss /= n_iter_per_epoch
        print('the avg_loss is {}'.format(avg_loss))
        # Export a serving copy of the graph partway through training.
        if epoch == 1.5:
            print('Build model for serving...')
            model.build_model_for_serving(sess)
            print('Build model serving done!')
def init_from_frozen_graphdef(self, config):
    """Initialize from a frozen GraphDef in config.model_dir.

    If the frozen file is missing, fall back to init_from_config, then
    freeze the current graph to disk for next time. Otherwise load the
    frozen graph and rebuild self.model as an AttrDict of tensors.
    """
    # NOTE(review): the frozen graph is stored under a '.py' name — looks
    # accidental, but kept for compatibility with existing files on disk.
    frozen_graph_path = os.path.join(config.model_dir, 'freeze_graph_test.py')
    # If the file doesn't existed, create it.
    if not os.path.exists(frozen_graph_path):
        logging.warning(
            'The frozen graph does not existed, use \'init_from_config\' instead'
            'and create a frozen graph for next use.')
        self.init_from_config(config)

        # Snapshot current weights to a temp checkpoint so a fresh session
        # can re-import and freeze them.
        saver = tf.train.Saver()
        save_dir = '/tmp/graph-{}'.format(os.getpid())
        os.mkdir(save_dir)
        save_path = '{}/ckpt'.format(save_dir)
        saver.save(sess=self.sess, save_path=save_path)

        with tf.Session(graph=tf.Graph()) as sess:
            clear_devices = True
            output_node_names = ['loss_sum', 'predictions']

            # We import the meta graph in the current default Graph
            saver = tf.train.import_meta_graph(save_path + '.meta',
                                               clear_devices=clear_devices)

            # We restore the weights
            saver.restore(sess, save_path)

            # We use a built-in TF helper to export variables to constants
            output_graph_def = tf.graph_util.convert_variables_to_constants(
                sess,  # The session is used to retrieve the weights
                tf.get_default_graph().as_graph_def(
                ),  # The graph_def is used to retrieve the nodes
                output_node_names  # The output node names are used to select the useful nodes
            )

            # Finally we serialize and dump the output graph to the filesystem
            with tf.gfile.GFile(frozen_graph_path, "wb") as f:
                f.write(output_graph_def.SerializeToString())
            logging.info("%d ops in the final graph." %
                         len(output_graph_def.node))

        # Remove temp files.
        os.system('rm -rf ' + save_dir)
    else:
        sess_config = tf.ConfigProto()
        sess_config.gpu_options.allow_growth = True
        sess_config.allow_soft_placement = True
        self.sess = tf.Session(config=sess_config)
        self.data_reader = DataReader(config)

        # We load the protobuf file from the disk and parse it to retrieve the
        # unserialized graph_def
        with tf.gfile.GFile(frozen_graph_path, "rb") as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())

        # Import the graph_def into current the default graph.
        tf.import_graph_def(graph_def)
        graph = tf.get_default_graph()
        self.model = AttrDict()

        def collect_placeholders(prefix):
            # Gather 'import/<prefix>_<idx>:0' tensors until the first
            # missing index; the lookup raises KeyError past the end.
            ret = []
            idx = 0
            while True:
                try:
                    ret.append(
                        graph.get_tensor_by_name('import/{}_{}:0'.format(
                            prefix, idx)))
                    idx += 1
                except KeyError:
                    return tuple(ret)

        self.model['src_pls'] = collect_placeholders('src_pl')
        self.model['dst_pls'] = collect_placeholders('dst_pl')
        self.model['predictions'] = graph.get_tensor_by_name(
            'import/predictions:0')
except: print("Please add --train or --test after py Regressor.py") options = None if options == "--train": r = Regressor("Random Forest", load_model=False) mod = Regressor("Random Forest", load_model=False) cv, ma, mse = r.train(mod, save=False, make_chart=False) print(cv, ma, mse) elif options == "--test": model_name = sys.argv[2] + " " + sys.argv[ 3] #Random Forest_2017 or Random Forest_2016 year = int(model_name.split("_")[-1]) r = Regressor(model_name, load_model=True) reader = DataReader() df = reader.create_input_data() predictions = r.predict(df, year) print("Actual || Predicted") for i in range(len(predictions)): print(df.iloc[i]['hdi'], "||", predictions[i]) ######Training Code######### #cv_error = [] #testing_ma_error = [] #testing_mse = [] #mod = RandomForestRegressor(bootstrap=True, criterion='mae', n_estimators=100) #mod = RandomForestRegressor() #r = Regressor("Random Forest_2017", load_model=True) #importances = r.model.feature_importances_
def train(self, model, save=False, make_chart=False):
    """
    Trains an input model. Makes Calculations, Charts, and Saves the model
    if necessary.

    Note: despite the `model` parameter, this implementation builds its own
    GridSearchCV(RandomForestRegressor) and stores it on self.model; the
    argument is currently unused.

    Parameters
    ----------
    model: SKLearn Model
        The regression model to use
    save: Boolean
        Whether or not the model should be saved
    make_chart Boolean
        Whether or not to make/save a chart

    Returns
    -------
    float, float, float:
        The Average CV Mean Squared Error, Mean Absolute Error, and Test MSE
    """
    # get/split data
    reader = DataReader()
    df = reader.create_input_data()
    df = self.preprocess(df)
    self.X_train, self.X_test, self.y_train, self.y_test = self.split_data(
        df)

    # Grid-search a random forest over tree count and depth.
    parameters = {
        'n_estimators': [1, 5, 10, 20, 30],
        'max_depth': [1, 5, 10]
    }
    rf = RandomForestRegressor()
    self.model = GridSearchCV(rf, parameters, cv=10)

    # train model
    self.model.fit(self.X_train, self.y_train)

    # Feature importance
    importances = self.model.best_estimator_.feature_importances_
    cols = self.X_train.columns
    for i in range(len(importances)):
        print(cols[i], importances[i])

    if save:
        joblib.dump(self.model.best_estimator_,
                    "../models/" + self.name + "_2017.joblib")

    print("------------------------")
    # Cross-validated MSE on the training split; sklearn reports it negated.
    MSEs = cross_val_score(estimator=self.model,
                           X=self.X_train,
                           y=self.y_train,
                           scoring='neg_mean_squared_error',
                           cv=8)
    predicted = self.model.predict(self.X_test)
    print("Average CV Mean Squared Error: ", abs(np.mean(MSEs)))
    print(
        "Testing Mean Absolute Error: ",
        mean_absolute_error(self.y_test, self.model.predict(self.X_test)))
    print("Testing MSE: ", mean_squared_error(self.y_test, predicted))
    #print(self.model.feature_importances_)

    if make_chart:
        # Plot every 5th of the first 100 test points: true vs predicted HDI.
        print("Generating Chart...")
        plt.style.use('dark_background')
        fig, ax = plt.subplots(nrows=1, ncols=1)
        ax.set_ylabel('HDI')
        ax.set_xlabel("Municipality Codmun ID")
        ax.set_title(self.name + 'Real vs Predicted')
        green, = ax.plot(np.arange(20), self.y_test[0:100:5], 'g',
                         label='True')
        red, = ax.plot(np.arange(20), predicted[0:100:5], 'r',
                       label='Predicted')
        ax.set_xticks(np.arange(20))
        x_labels = self.X_test.iloc[0:100:5]['codmun'].tolist()
        ax.set_xticklabels([str(int(y)) for y in x_labels],
                           rotation='vertical')
        plt.legend(handles=[green, red], labels=["True", "Predicted"])
        plt.tight_layout()
        fig.savefig(self.name + "_real_v_predicted")
        for x in range(0, 100, 5):
            print(predicted[x], x_labels[int(x / 5)])
        print(x_labels, predicted[0:100:5])

    # NOTE(review): returns the signed CV mean (not abs()) unlike the print
    # above — confirm which one downstream consumers expect.
    return np.mean(MSEs), mean_absolute_error(
        self.y_test, self.model.predict(self.X_test)), mean_squared_error(
            self.y_test, predicted)
"pert_id": ['BRD-U41416256', 'BRD-U60236422'], "pert_type": ["trt_cp"], "cell_id": ['A375', 'HA1E', 'HELA', 'HT29', 'MCF7', 'PC3', 'YAPC'], "pert_idose": ["0.04 um", "0.12 um", "0.37 um", "1.11 um", "3.33 um", "10.0 um"] } # check cuda if torch.cuda.is_available(): device = torch.device("cuda") else: device = torch.device("cpu") print("Use GPU: %s" % torch.cuda.is_available()) data = DataReader(drug_file, gene_file, gene_expression_file_train, gene_expression_file_dev, gene_expression_file_test, filter, device) print('#Train: %d' % len(data.train_feature['drug'])) print('#Dev: %d' % len(data.dev_feature['drug'])) print('#Test: %d' % len(data.test_feature['drug'])) # model creation model = DeepCE(drug_input_dim=drug_input_dim, drug_emb_dim=drug_embed_dim, conv_size=conv_size, degree=degree, gene_input_dim=np.shape(data.gene)[1], gene_emb_dim=gene_embed_dim, num_gene=np.shape(data.gene)[0], hid_dim=hid_dim, dropout=dropout,
def train(args):
    """Train a SkipThought model on real batches plus synthetic batches
    whose center sentence is regenerated from its neighbors.

    Per step: one update on the real batch, then one each on a
    forward-synthesized and backward-synthesized batch; losses are logged
    to TensorBoard and checkpoints saved every args.save_every steps.
    """
    vocab = Vocab.load(args.vocab, max_size=args.vocab_size)
    data_reader = DataReader(data_dir=args.data_dir, shuffle=True)
    preprocessor = Preprocessor(
        predict_prev=args.predict_prev, predict_cur=args.predict_cur,
        predict_next=args.predict_next, vocab=vocab,
        max_length=args.max_length, gpu=args.gpu)
    model = SkipThought(
        rnn_type=args.rnn_type, num_words=len(vocab),
        word_dim=args.word_dim, hidden_dim=args.hidden_dim,
        bidirectional=args.bidirectional, predict_prev=args.predict_prev,
        predict_cur=args.predict_cur, predict_next=args.predict_next)
    print(model)
    if args.pretrained is not None:
        print(f'Loading pretrained model from {args.pretrained}')
        # map_location keeps GPU-saved tensors on CPU until .cuda() below.
        model.load_state_dict(
            torch.load(args.pretrained,
                       map_location=lambda storage, loc: storage))
    if args.gpu > -1:
        model.cuda(args.gpu)
    optimizer = optim.Adam(model.parameters())
    summary_writer = SummaryWriter(os.path.join(args.save_dir, 'log'))

    def add_scalar_summary(name, value, step):
        summary_writer.add_scalar(tag=name, scalar_value=value,
                                  global_step=step)

    def add_text_summary(name, value, step):
        summary_writer.add_text(tag=name, text_string=value,
                                global_step=step)

    def variable(tensor, volatile=False):
        # Legacy (pre-0.4) PyTorch Variable wrapper; volatile=True disables
        # graph construction for inference.
        return Variable(tensor, volatile=volatile)

    def run_train_iter(batch):
        """One optimizer step; returns the scalar loss value."""
        if not model.training:
            model.train()
        src, tgt = preprocessor(batch)
        src = (variable(src[0]), src[1])
        for k in tgt:
            tgt[k] = (variable(tgt[k][0]), tgt[k][1])
        logits = model.forward(src=src, tgt=tgt)
        # Sum sequence cross-entropy over every enabled decoder target,
        # shifting logits/targets by one to predict the next token.
        loss = 0
        for k in tgt:
            logits_k = logits[k]
            tgt_k = tgt[k]
            loss = loss + basic.sequence_cross_entropy(
                logits=logits_k[:-1], targets=tgt_k[0][1:],
                length=tgt_k[1] - 1)
        optimizer.zero_grad()
        loss.backward()
        clip_grad_norm(model.parameters(), max_norm=10)
        optimizer.step()
        return loss.data[0]

    def ids_to_words(ids):
        """Map ids to words, keeping the EOS token and stopping after it."""
        words = []
        eos_id = vocab.stoi(vocab.eos)
        for id_ in ids:
            words.append(vocab.itos(id_))
            if id_ == eos_id:
                break
        return words

    def generate_using_decoder(name, src, max_length):
        """Greedy-decode up to max_length tokens from the named decoder,
        conditioned on the encoding of src."""
        _, encoder_state = model.encoder(words=src[0], length=src[1])
        if isinstance(encoder_state, tuple):  # LSTM
            encoder_state = encoder_state[0]
        context = (encoder_state.transpose(0, 1).contiguous()
                   .view(-1, args.hidden_dim))
        batch_size = src[1].size(0)
        bos_id = vocab.stoi(vocab.bos)
        bos = Variable(src[1].new(1, batch_size).fill_(bos_id))
        decoder = model.get_decoder(name)
        prev_pred = bos
        done = torch.zeros(batch_size).byte()
        hyps = []
        prev_state = context.unsqueeze(0)
        for t in range(max_length):
            # NOTE(review): `done` is never updated in this loop, so
            # decoding always runs the full max_length — confirm intended.
            if done.all():
                break
            decoder_input = prev_pred
            logit, prev_state = decoder(words=decoder_input,
                                        prev_state=prev_state)
            pred = logit.max(2)[1]
            prev_pred = pred
            hyps.append(pred.data)
        # (time, batch) -> per-example id lists.
        hyps = torch.cat(hyps, dim=0).transpose(0, 1).tolist()
        return hyps

    def generate(batch):
        # Greedy search
        src, tgt = preprocessor(batch)
        src = (variable(src[0]), src[1])
        for k in tgt:
            tgt[k] = (variable(tgt[k][0], volatile=True), tgt[k][1])
        batch_size = src[0].size(1)
        max_length = src[0].size(0) * 2
        generated = {}
        for k in tgt:
            generated[k] = generate_using_decoder(
                name=k, src=src, max_length=max_length)
        # Collect human-readable source/target/output strings per example.
        results = []
        for i in range(batch_size):
            res = {'src': ' '.join(ids_to_words(src[0][:src[1][i], i].data)),
                   'tgt': {},
                   'out': {}}
            for k in tgt:
                res['tgt'][k] = ' '.join(ids_to_words(tgt[k][0][1:, i].data))
                res['out'][k] = ' '.join(ids_to_words(generated[k][i]))
            results.append(res)
        return results

    def generate_synthetic_batch(real_batch):
        """Rebuild the center sentence from each neighbor: the 'next'
        decoder generates it from prev, the 'prev' decoder from next.
        Returns the two synthetic batches (prev, cur', next) triples."""
        def sort_by_length(tgt_of_key):
            # Decoders-as-encoders need length-descending input ordering.
            sorted_length, sort_inds = tgt_of_key[1].sort(
                dim=0, descending=True)
            return tgt_of_key[0][:, sort_inds], sorted_length

        # Forward: given prev, generate cur'
        _, tgt = preprocessor(real_batch)
        tgt_prev, tgt_prev_length = sort_by_length(tgt['prev'])
        syn_src_fw = generate_using_decoder(
            name='next',
            src=(variable(tgt_prev[1:], volatile=True), tgt_prev_length - 1),
            max_length=args.max_length)
        # Backward: given next, generate cur''
        tgt_next, tgt_next_length = sort_by_length(tgt['next'])
        syn_src_bw = generate_using_decoder(
            name='prev',
            src=(variable(tgt_next[1:], volatile=True), tgt_next_length - 1),
            max_length=args.max_length)
        syn_batch_fw = []
        syn_batch_bw = []
        for i in range(len(real_batch)):
            syn_src_fw_str = ' '.join(ids_to_words(syn_src_fw[i]))
            syn_src_bw_str = ' '.join(ids_to_words(syn_src_bw[i]))
            syn_batch_fw.append(
                (real_batch[i][0], syn_src_fw_str, real_batch[i][2]))
            syn_batch_bw.append(
                (real_batch[i][0], syn_src_bw_str, real_batch[i][2]))
        return syn_batch_fw, syn_batch_bw

    global_step = 0

    def print_samples():
        """Log a few real and synthetic generation samples to TensorBoard."""
        model.eval()
        num_samples = 2
        samples = data_reader.next_batch(size=num_samples, peek=True)
        syn_samples_fw, syn_samples_bw = generate_synthetic_batch(samples)
        gen_results = generate(samples)
        syn_gen_results_fw = generate(syn_samples_fw)
        syn_gen_results_bw = generate(syn_samples_bw)
        text_val = ''
        for i, res in enumerate(gen_results):
            text_val += f'* sample (real) #{i}\n'
            text_val += f'\t* src: {res["src"]}\n'
            for k in res['tgt']:
                tgt_k = res['tgt'][k]
                out_k = res['out'][k]
                text_val += f'\t* {k} (tgt): {tgt_k}\n'
                text_val += f'\t* {k} (out): {out_k}\n'
        for i, res in enumerate(syn_gen_results_fw):
            text_val += f'* sample (syn_fw) #{i}\n'
            text_val += f'\t* src: {res["src"]}\n'
            for k in res['tgt']:
                tgt_k = res['tgt'][k]
                out_k = res['out'][k]
                text_val += f'\t* {k} (tgt): {tgt_k}\n'
                text_val += f'\t* {k} (out): {out_k}\n'
        for i, res in enumerate(syn_gen_results_bw):
            text_val += f'* sample (syn_bw) #{i}\n'
            text_val += f'\t* src: {res["src"]}\n'
            for k in res['tgt']:
                tgt_k = res['tgt'][k]
                out_k = res['out'][k]
                text_val += f'\t* {k} (tgt): {tgt_k}\n'
                text_val += f'\t* {k} (out): {out_k}\n'
        add_text_summary('Sample', value=text_val, step=global_step)

    for epoch in range(args.max_epoch):
        data_reader.start_epoch()
        for batch in tqdm(data_reader.iterator(args.batch_size),
                          desc=f'Epoch {epoch}'):
            # Train on real batch
            real_loss = run_train_iter(batch)
            # Train on synthetic batches
            syn_batch_fw, syn_batch_bw = generate_synthetic_batch(batch)
            syn_loss_fw = run_train_iter(syn_batch_fw)
            syn_loss_bw = run_train_iter(syn_batch_bw)
            global_step += 1
            add_scalar_summary(name='real_loss', value=real_loss,
                               step=global_step)
            add_scalar_summary(name='syn_loss_fw', value=syn_loss_fw,
                               step=global_step)
            add_scalar_summary(name='syn_loss_bw', value=syn_loss_bw,
                               step=global_step)
            if global_step % args.print_every == 0:
                print_samples()
            if global_step % args.save_every == 0:
                model_filename = f'model-{global_step}.pt'
                model_path = os.path.join(args.save_dir, model_filename)
                torch.save(model.state_dict(), model_path)
                print(f'\nIter #{global_step}: '
                      f'Saved checkpoint to {model_path}')
def __init__(self, model, output_name): self.model = model self.datareader = DataReader() self.metrics = ErrorMetrics() self.output_name = output_name
def train(config, num_epoch, last_pretrain_model_dir, pretrain_model_dir,
          model_dir, block_idx_enc, block_idx_dec):
    """Pretrain a model stage with the given encoder/decoder block depth.

    Builds the model described by `config`, optionally warm-starts from the
    previous stage's checkpoint, runs up to `num_epoch` epochs with
    BLEU-based early stopping, and checkpoints into `pretrain_model_dir`
    (and, if given, into `model_dir` without the global_step variable).

    Arguments:
        config: experiment configuration; num_blocks_enc/num_blocks_dec are
            overwritten here from block_idx_enc/block_idx_dec.
        num_epoch: maximum number of training epochs.
        last_pretrain_model_dir: directory holding the previous stage's
            checkpoint to restore from (may contain no checkpoint).
        pretrain_model_dir: output directory for checkpoints and summaries.
        model_dir: optional extra save location (saved without global step).
        block_idx_enc: number of encoder blocks for this stage.
        block_idx_dec: number of decoder blocks for this stage.
    """
    logger = logging.getLogger('')
    # Progressive training: this stage trains the requested number of blocks.
    config.num_blocks_enc = block_idx_enc
    config.num_blocks_dec = block_idx_dec
    # if block_idx >= 2:
    #     config.train.var_filter = 'encoder/block_' + str(block_idx - 1) + '|' + 'decoder/block_' + str(
    #         block_idx - 1) + '|' + 'encoder/src_embedding' + '|' + 'decoder/dst_embedding'
    # if block_idx >= 2:
    #     config.train.var_filter = 'encoder/block_' + str(block_idx - 1) + '|' + 'decoder/block_' + str(
    #         block_idx - 1)
    logger.info("config.num_blocks_enc=" + str(config.num_blocks_enc) +
                ",config.num_blocks_dec=" + str(config.num_blocks_dec) +
                ',config.train.var_filter=' + str(config.train.var_filter))
    data_reader = DataReader(config=config)
    # NOTE(review): eval() on a config-supplied class name — acceptable only
    # for trusted configs; never feed untrusted input through config.model.
    model = eval(config.model)(config=config, num_gpus=config.train.num_gpus)
    model.build_train_model(test=config.train.eval_on_dev)
    sess_config = tf.ConfigProto()
    sess_config.gpu_options.allow_growth = True
    sess_config.allow_soft_placement = True
    summary_writer = tf.summary.FileWriter(pretrain_model_dir,
                                           graph=model.graph)
    with tf.Session(config=sess_config, graph=model.graph) as sess:
        # Initialize all variables.
        sess.run(tf.global_variables_initializer())
        # Reload variables in disk.
        if tf.train.latest_checkpoint(last_pretrain_model_dir):
            # Restore only the variables available in both checkpoint and
            # current graph, excluding global_step so the new stage restarts
            # its step counter.
            available_vars = available_variables_without_global_step(
                last_pretrain_model_dir)
            # available_vars = available_variables(last_pretrain_model_dir)
            if available_vars:
                saver = tf.train.Saver(var_list=available_vars)
                saver.restore(
                    sess,
                    tf.train.latest_checkpoint(last_pretrain_model_dir))
                for v in available_vars:
                    logger.info('Reload {} from disk.'.format(v.name))
            else:
                logger.info('Nothing to be reload from disk.')
        else:
            logger.info('Nothing to be reload from disk.')
        evaluator = Evaluator()
        evaluator.init_from_existed(model, sess, data_reader)
        # Early-stopping state lives in module-level globals so the nested
        # maybe_save_model() below can update it.
        global dev_bleu, toleration
        dev_bleu = evaluator.evaluate(
            **config.dev) if config.train.eval_on_dev else 0
        toleration = config.train.toleration

        def train_one_step(batch):
            # Run one optimizer step; `batch` is a (features, targets) pair.
            feat_batch, target_batch = batch
            feed_dict = expand_feed_dict({
                model.src_pls: feat_batch,
                model.dst_pls: target_batch
            })
            step, lr, loss, _ = sess.run([
                model.global_step, model.learning_rate, model.loss,
                model.train_op
            ],
                                         feed_dict=feed_dict)
            # Periodically write the summary op to TensorBoard.
            if step % config.train.summary_freq == 0:
                logger.info('pretrain summary_writer...')
                summary = sess.run(model.summary_op, feed_dict=feed_dict)
                summary_writer.add_summary(summary, global_step=step)
                summary_writer.flush()
            return step, lr, loss

        def maybe_save_model(model_dir, is_save_global_step=True):
            # NOTE: this parameter shadows the outer `model_dir` argument;
            # `step` is read from the enclosing scope (last trained step).
            global dev_bleu, toleration
            # Without dev evaluation, dev_bleu + 1 always "improves", so the
            # model is saved unconditionally.
            new_dev_bleu = evaluator.evaluate(
                **config.dev) if config.train.eval_on_dev else dev_bleu + 1
            if new_dev_bleu >= dev_bleu:
                mp = model_dir + '/pretrain_model_step_{}'.format(step)
                # model.saver.save(sess, mp)
                if is_save_global_step:
                    model.saver.save(sess, mp)
                else:
                    # Save a copy without the global_step variable so the
                    # next stage starts counting from zero.
                    variables_without_global_step = global_variables_without_global_step(
                    )
                    saver = tf.train.Saver(
                        var_list=variables_without_global_step,
                        max_to_keep=10)
                    saver.save(sess, mp)
                logger.info('Save model in %s.' % mp)
                # Improvement: reset patience and record the new best score.
                toleration = config.train.toleration
                dev_bleu = new_dev_bleu
            else:
                # No improvement: consume one unit of patience.
                toleration -= 1

        step = 0
        for epoch in range(1, num_epoch + 1):
            for batch in data_reader.get_training_batches_with_buckets():
                # Train normal instances.
                start_time = time.time()
                step, lr, loss = train_one_step(batch)
                logger.info(
                    'epoch: {0}\tstep: {1}\tlr: {2:.6f}\tloss: {3:.4f}\ttime: {4:.4f}'
                    .format(epoch, step, lr, loss,
                            time.time() - start_time))
                # Stop the inner loop once the configured step budget is hit.
                if config.train.num_steps and step >= config.train.num_steps:
                    break
            # Early stop
            if toleration <= 0:
                break
            # Evaluate/save once per epoch (may decrement toleration).
            maybe_save_model(pretrain_model_dir)
        if model_dir:
            maybe_save_model(model_dir, False)
    logger.info("Finish pretrain block_idx_enc=" + str(block_idx_enc) +
                ',block_idx_dec=' + str(block_idx_dec))
def analyze(data, acc): """Analyze the data in `data' and store quantities in the accumulator `acc'""" ##fir = load_amplitude_reco_weights('pulse_weights.pkl') #fir = load_amplitude_reco_weights('computed_weights.pkl') #print(len(fir), fir) signal_processing = cfg.cfg.get('analysis', 'signal_processing', fallback='') if signal_processing == 'butterworth': butterworth = True print("# Using Butterworth filter for signal processing") else: butterworth = False lfreq_default = cfg.cfg.getfloat('analysis', 'lfreq_default', fallback=3) hfreq_default = cfg.cfg.getfloat('analysis', 'hfreq_default', fallback=300) thr_default = cfg.cfg.getfloat('analysis', 'threshold_default', fallback=0.01) win_default = cfg.cfg.getfloat('analysis', 'peak_search_window', fallback=1.e-3) lfreq = [] thr = [] hfreq = [] win = [] gain = [] noise_threshold = [] for i in range(len(data)): lfreq.append( cfg.cfg.getfloat('analysis', 'filter_lfreq_ch%03d' % (i + 1), fallback=lfreq_default)) hfreq.append( cfg.cfg.getfloat('analysis', 'filter_hfreq_ch%03d' % (i + 1), fallback=hfreq_default)) thr.append( cfg.cfg.getfloat('analysis', 'thr_ch%003d' % (i + 1), fallback=thr_default)) win.append( cfg.cfg.getfloat('analysis', 'peak_search_window_ch%003d' % (i + 1), fallback=win_default)) gain.append( cfg.cfg.getfloat('setup', 'gain_ch%03d' % (i + 1), fallback=1000)) noise_threshold.append( cfg.cfg.getfloat('analysis', 'noise_threshold_ch%03d' % (i + 1), fallback=2e-6)) max_samples = cfg.cfg.getint('data', 'max_samples_per_file', fallback=-1) n_max_chunk = cfg.cfg.getint('data', 'n_max_chunk', fallback=-1) ## analyze independent channels tot_samples_read = 0 for i, f in enumerate(data): print('# Processing file', f, '(%d)' % i) # to avoid a too big file loaded in RAM, split the reading in parts # and accumulate the results in acc #d = read_data(f, max_samples) h = DataReader(f, max_samples, n_max_chunk) n_samples_read = 0 for d in h: cfg.params.sampling_freq = h.sampling_freq duration = len(d) / 3.6e3 / 
cfg.params.sampling_freq # skipping runs of less than 28.8 seconds if duration < 0.008: print("# skipping file/chunk %d (%d samples - %f hours)" % (i, len(d), duration)) continue ###print("# processing file %d (%d samples - %f hours)" % (i, len(d), duration)) print("# Progress: %.1f%%" % (h.progress() * 100.)) d = volt(d) / gain[i] suff = '_det%03d' % i det = i + 1 acc.set_sampling_freq(det, h.sampling_freq) #compute_pulse_weights(d, 200) ##for j, s in enumerate(d): ## if j > 50000: ## break ## print(i, j, s) ##print('\n') ##continue #import sys #sys.exit(0) # amplitude spectrum if butterworth: #TODO select freq depending on channel type peaks, peaks_max = filt_ana.find_peaks_2( d, [lfreq[i], hfreq[i]], cfg.params.sampling_freq, win[i], thr[i]) else: peaks, peaks_max = find_peaks(d * 1., fir) peaks = list(np.add(peaks, n_samples_read)) #print(peaks[:10], '...', peaks[-10:]) acc.add(det, 'peak', (peaks, peaks_max)) # store peak positions and amplitudes for # correlation analysis #corr_peaks[1] = (peaks, peaks_max) # baseline vs time base, base_min = baseline(d * 1., 10000) base = list(np.add(base, n_samples_read)) acc.add(det, 'baseline', (base, base_min)) ## normalized pulse shape #shapes = pulse_shapes(d * 1., peaks, 1000) #plot_pulse_shapes(shapes, suff, det) ## power spectral density -- all #f, Pxx_den = signal.welch(d, cfg.params.sampling_freq, nperseg = 25000) # power spectral density -- noise only dn = remove_signals(d, noise_threshold[i], 10000) #print(dn[:10], '...', dn[-10:], len(dn), d[:10], '...', d[-10:], len(d)) f, Pxx_den = signal.welch(dn, cfg.params.sampling_freq, nperseg=min(25000, int(len(dn) / 2))) acc.add(det, 'fft', (f, Pxx_den)) ## rate FFT # FIXME: takes quite a long time #p = [0] * (peaks[len(peaks) - 1] + 1) #for el in peaks: # p[el] = 1. 
##p = np.abs(np.fft.rfft(p[:10000])) ##p = np.abs(np.fft.rfft(rate)) ##f = np.linspace(0, 1/2, len(p)) ##plot_fft_rate(f, p, suff) #from scipy import signal #f, Pxx_den = signal.periodogram(p[:10000], 1) #plot_fft_rate(f, Pxx_den, suff) acc.add_analyzed_samples(det, h.last_chunk_size) n_samples_read += h.last_chunk_size #print('-->', h.last_chunk_size, n_samples_read) tot_samples_read += n_samples_read return tot_samples_read
target_col = len(col_name) - 1 # ============================================ # # Data location wd = os.path.dirname(os.path.abspath(__file__)) + '/' data_path = wd + 'data/' data_path += 'prototype/' output_path = wd + 'output/' # ============================================ # # Read data data_files = os.listdir(data_path) for i in range(len(data_files)): data_files[i] = data_path + data_files[i] dr = DataReader(data_files, col_idx) ds = DataScaler() dp = DataParser() print('======== Supplying data ============') for file_id in range(len(data_files)): dr_tmp = DataReader([data_files[file_id]], col_idx) dr_tmp.read(delimiter='\t') data = dr_tmp.getData() data = parse_data(dp, data, col_name, target_col) dr.append(data) del data del dr_tmp print(file_id + 1, ' - ', data_files[file_id], ': ',