def hyperband_original(self, hyper_params, epochs):
    """Train and evaluate one hyper-parameter configuration (Hyperband leaf).

    Args:
        hyper_params: list [model_arch, batch_size, opt, learning_rate, activation],
            where model_arch is '<type>-<num_layers>'.
        epochs: number of training epochs to run before evaluating.

    Returns:
        Average evaluation accuracy of the trained model.
    """
    train_feature_input, train_label_input = load_train_dataset(self.hp_dataset)
    eval_feature_input, eval_label_input = load_eval_dataset(self.hp_dataset)

    graph = tf.Graph()
    with graph.as_default():
        features = tf.placeholder(tf.float32, [None, self.img_width, self.img_height, self.num_channel])
        labels = tf.placeholder(tf.int64, [None, self.num_class])

        # Random instance name (seeded from the clock) so repeated runs do not
        # collide on variable scopes in the TF graph.
        dt = datetime.now()
        np.random.seed(dt.microsecond)
        net_instnace = np.random.randint(sys.maxsize)

        model_arch = hyper_params[0]
        model_type = model_arch.split('-')[0]
        model_layer = int(model_arch.split('-')[1])
        batch_size = hyper_params[1]
        opt = hyper_params[2]
        learning_rate = hyper_params[3]
        activation = hyper_params[4]

        print("\n** model: {} | batch size: {} | opt: {} | model layer: {} | learn rate: {} | act: {} **"
              .format(model_type, batch_size, opt, model_layer, learning_rate, activation))

        dm = ModelImporter(model_type, str(net_instnace), model_layer, self.img_height,
                           self.img_width, self.num_channel, self.num_class, batch_size,
                           opt, learning_rate, activation, batch_padding=False)
        model_entity = dm.get_model_entity()
        model_logit = model_entity.build(features, is_training=True)
        train_op = model_entity.train(model_logit, labels)
        eval_op = model_entity.evaluate(model_logit, labels)

        if self.hp_dataset == 'imagenet':
            image_list = sorted(os.listdir(train_feature_input))

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True

        with tf.Session(graph=graph, config=config) as sess:
            sess.run(tf.global_variables_initializer())
            num_batch = train_label_input.shape[0] // batch_size
            for e in range(epochs):
                for i in range(num_batch):
                    batch_offset = i * batch_size
                    batch_end = (i + 1) * batch_size
                    if self.hp_dataset == 'imagenet':
                        batch_list = image_list[batch_offset:batch_end]
                        # FIX: pass the image directory (train_feature_input), not the
                        # dataset name, matching the evaluation loop below and the
                        # other training functions in this file.
                        train_feature_batch = load_imagenet_raw(train_feature_input, batch_list,
                                                                self.img_height, self.img_width)
                    else:
                        train_feature_batch = train_feature_input[batch_offset:batch_end]
                    train_label_batch = train_label_input[batch_offset:batch_end]
                    sess.run(train_op, feed_dict={features: train_feature_batch,
                                                  labels: train_label_batch})

            if self.hp_dataset == 'imagenet':
                # ImageNet eval data is too large to feed at once: average the
                # accuracy over fixed-size evaluation mini-batches.
                acc_sum = 0
                imagenet_batch_size_eval = 50
                num_batch_eval = eval_label_input.shape[0] // imagenet_batch_size_eval
                test_image_list = sorted(os.listdir(eval_feature_input))
                for n in range(num_batch_eval):
                    batch_offset = n * imagenet_batch_size_eval
                    batch_end = (n + 1) * imagenet_batch_size_eval
                    test_batch_list = test_image_list[batch_offset:batch_end]
                    test_feature_batch = load_imagenet_raw(eval_feature_input, test_batch_list,
                                                           self.img_height, self.img_width)
                    test_label_batch = eval_label_input[batch_offset:batch_end]
                    acc_batch = sess.run(eval_op, feed_dict={features: test_feature_batch,
                                                             labels: test_label_batch})
                    acc_sum += acc_batch
                acc_avg = acc_sum / num_batch_eval
            else:
                acc_avg = sess.run(eval_op, feed_dict={features: eval_feature_input,
                                                       labels: eval_label_input})

    print(f'Accuracy: {acc_avg}')
    return acc_avg
def train_pack():
    """Train several models packed into one TF graph on shared placeholders.

    Reads the multi-job configuration from cfg_para, builds every model against
    the same feature/label placeholders, and runs all train ops each step so the
    models train concurrently on identical batches. Prints per-step and overall
    timing; returns nothing.
    """
    print('start training pack')

    rand_seed_pack = cfg_para.multi_rand_seed
    model_type_list = cfg_para.multi_model_type
    optimizer_list = cfg_para.multi_opt
    num_layer_list = cfg_para.multi_num_layer
    activation_list = cfg_para.multi_activation
    batch_size_list = cfg_para.multi_batch_size
    learning_rate_list = cfg_para.multi_learning_rate

    # Batch padding is only needed when the packed models use different batch sizes.
    if len(set(batch_size_list)) == 1:
        is_batch_padding = False
    else:
        is_batch_padding = True

    num_epoch = cfg_para.multi_num_epoch
    train_dataset = cfg_para.multi_train_dataset
    # FIX: this is a multi-model job, so read the multi_* timeline flag
    # (was cfg_para.single_use_tb_timeline, inconsistent with train_model()).
    use_tf_timeline = cfg_para.multi_use_tb_timeline

    max_batch_size = max(batch_size_list)

    #################################################
    # load dataset
    #################################################
    img_width, img_height, num_channel, num_class = load_dataset_para(train_dataset)
    train_feature_input, train_label_input = load_train_dataset(train_dataset)

    #########################
    # build packed model
    #########################
    features = tf.placeholder(tf.float32, [None, img_width, img_height, num_channel])
    labels = tf.placeholder(tf.int64, [None, num_class])

    model_name_abbr = np.random.choice(rand_seed_pack, len(model_type_list),
                                       replace=False).tolist()

    train_op_pack = list()
    for midx, mt in enumerate(model_type_list):
        dm = ModelImporter(mt, str(model_name_abbr.pop()), num_layer_list[midx], img_height,
                           img_width, num_channel, num_class, batch_size_list[midx],
                           optimizer_list[midx], learning_rate_list[midx],
                           activation_list[midx], batch_padding=is_batch_padding)
        model_entity = dm.get_model_entity()
        model_logit = model_entity.build(features, is_training=True)
        train_op = model_entity.train(model_logit, labels)
        train_op_pack.append(train_op)

    #########################
    # train packed model
    #########################
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    step_time = 0
    step_count = 0

    if train_dataset == 'imagenet':
        image_list = sorted(os.listdir(train_feature_input))

    overall_time_start = timer()
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        num_batch = train_label_input.shape[0] // max_batch_size
        for e in range(num_epoch):
            for i in range(num_batch):
                print('epoch %d / %d, step %d / %d' % (e + 1, num_epoch, i + 1, num_batch))
                # The first step is excluded from timing (one-off warm-up cost).
                if i != 0:
                    start_time = timer()
                batch_offset = i * max_batch_size
                batch_end = (i + 1) * max_batch_size
                if train_dataset == 'imagenet':
                    batch_list = image_list[batch_offset:batch_end]
                    train_feature_batch = load_imagenet_raw(train_feature_input, batch_list,
                                                            img_height, img_width)
                else:
                    train_feature_batch = train_feature_input[batch_offset:batch_end]
                train_label_batch = train_label_input[batch_offset:batch_end]

                if use_tf_timeline:
                    profile_path = cfg_path.profile_path
                    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                    sess.run(train_op_pack,
                             feed_dict={features: train_feature_batch,
                                        labels: train_label_batch},
                             options=run_options, run_metadata=run_metadata)
                    trace = timeline.Timeline(step_stats=run_metadata.step_stats)
                    # FIX: close the trace file deterministically (was leaked), and
                    # insert the missing '-' between the model count and the batch
                    # sizes so the file name is well-formed.
                    trace_name = (profile_path + '/' +
                                  '-'.join(map(str, set(model_type_list))) + '-' +
                                  str(len(model_type_list)) + '-' +
                                  '-'.join(map(str, set(batch_size_list))) + '-' +
                                  str(i) + '.json')
                    with open(trace_name, 'w') as trace_file:
                        trace_file.write(trace.generate_chrome_trace_format(
                            show_dataflow=True, show_memory=True))
                else:
                    sess.run(train_op_pack,
                             feed_dict={features: train_feature_batch,
                                        labels: train_label_batch})

                if i != 0:
                    end_time = timer()
                    dur_time = end_time - start_time
                    print("step time:", dur_time)
                    step_time += dur_time
                    step_count += 1

    overall_time_end = timer()
    overall_time = overall_time_end - overall_time_start
    print(
        f'overall training time (s):{overall_time}, average step time (ms):{step_time / step_count * 1000}'
    )
def hyperband_pack_knn(self, confs, epochs):
    """Train a pack of Hyperband configurations together in one session.

    All models share the same feature/label placeholders and are stepped with
    batches of the current largest batch size; each model tracks its own step
    count and is dropped from the run list once it completes its epochs.

    Args:
        confs: list of configurations, each [model_arch, batch_size, opt,
            learning_rate, activation].
        epochs: desired number of epochs for every configuration.

    Returns:
        Accuracies of the packed models from evaluate_pack_model.
    """
    train_feature_input, train_label_input = load_train_dataset(self.hp_dataset)

    features = tf.placeholder(tf.float32, [None, self.img_width, self.img_height, self.num_channel])
    labels = tf.placeholder(tf.int64, [None, self.num_class])

    # One random instance name per configuration, seeded from the clock.
    dt = datetime.now()
    np.random.seed(dt.microsecond)
    net_instnace = np.random.randint(sys.maxsize, size=len(confs))

    desire_epochs = epochs

    entity_pack = list()
    train_pack = list()
    eval_pack = list()
    batch_size_set = set()

    for cidx, cf in enumerate(confs):
        model_arch = cf[0]
        model_type = model_arch.split('-')[0]
        model_layer = int(model_arch.split('-')[1])
        batch_size = cf[1]
        batch_size_set.add(batch_size)
        opt = cf[2]
        learning_rate = cf[3]
        activation = cf[4]

        desire_steps = train_label_input.shape[0] // batch_size
        dm = ModelImporter(model_type, str(net_instnace[cidx]), model_layer, self.img_height,
                           self.img_width, self.num_channel, self.num_class, batch_size, opt,
                           learning_rate, activation, batch_padding=True)
        model_entity = dm.get_model_entity()
        model_entity.set_desire_epochs(desire_epochs)
        model_entity.set_desire_steps(desire_steps)
        model_logit = model_entity.build(features, is_training=True)
        train_op = model_entity.train(model_logit, labels)
        eval_op = model_entity.evaluate(model_logit, labels)
        entity_pack.append(model_entity)
        train_pack.append(train_op)
        eval_pack.append(eval_op)

    if self.hp_dataset == 'imagenet':
        image_list = sorted(os.listdir(train_feature_input))

    config = tf.ConfigProto()
    config.allow_soft_placement = True
    config.gpu_options.allow_growth = True

    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        max_bs = max(batch_size_set)

        complete_flag = False

        while len(train_pack) != 0:
            num_steps = train_label_input.shape[0] // max_bs
            for i in range(num_steps):
                print('step %d / %d' % (i + 1, num_steps))
                batch_offset = i * max_bs
                batch_end = (i + 1) * max_bs
                if self.hp_dataset == 'imagenet':
                    batch_list = image_list[batch_offset:batch_end]
                    train_feature_batch = load_imagenet_raw(train_feature_input, batch_list,
                                                            self.img_height, self.img_width)
                else:
                    train_feature_batch = train_feature_input[batch_offset:batch_end]
                train_label_batch = train_label_input[batch_offset:batch_end]
                sess.run(train_pack, feed_dict={features: train_feature_batch,
                                                labels: train_label_batch})

                # Iterate a snapshot so finished models can be removed safely.
                for me in list(entity_pack):
                    me.set_current_step()
                    if me.is_complete_train():
                        print("model has been trained completely:{}".format(me.get_model_instance_name()))
                        sess.run(me.set_batch_size(train_label_input.shape[0]))
                        # FIX: also drop the finished entity; otherwise the next
                        # pass would try to remove its (already removed) train op
                        # again and raise ValueError.
                        entity_pack.remove(me)
                        train_pack.remove(me.get_train_op())
                        complete_flag = True

                if len(train_pack) == 0:
                    break

                if complete_flag:
                    batch_size_set.discard(max_bs)
                    # FIX: guard against an empty set when the model that just
                    # completed was the last one using the largest batch size.
                    if batch_size_set:
                        max_bs = max(batch_size_set)
                    complete_flag = False
                    break

        acc_pack = evaluate_pack_model(sess, features, labels, eval_pack)
        print(f'Accuracy: {acc_pack}')
        return acc_pack
def hyperband_pack_bs(self, batch_size, confs, epochs):
    """Train every configuration in `confs` packed together with one shared
    batch size, then evaluate the whole pack.

    Args:
        batch_size: batch size shared by all packed models.
        confs: list of configurations [model_arch, _, opt, learning_rate,
            activation]; the per-conf batch size (index 1) is ignored.
        epochs: number of training epochs.

    Returns:
        Accuracies of the packed models from evaluate_pack_model.
    """
    train_feature_input, train_label_input = load_train_dataset(self.hp_dataset)

    features = tf.placeholder(tf.float32, [None, self.img_width, self.img_height, self.num_channel])
    labels = tf.placeholder(tf.int64, [None, self.num_class])

    # Fresh random instance names per call, seeded from the clock's microseconds.
    np.random.seed(datetime.now().microsecond)
    instance_ids = np.random.randint(sys.maxsize, size=len(confs))

    train_pack = list()
    eval_pack = list()

    for idx, conf in enumerate(confs):
        arch = conf[0]
        arch_type = arch.split('-')[0]
        arch_layers = int(arch.split('-')[1])
        importer = ModelImporter(arch_type, str(instance_ids[idx]), arch_layers,
                                 self.img_height, self.img_width, self.num_channel,
                                 self.num_class, batch_size, conf[2], conf[3], conf[4],
                                 batch_padding=False)
        entity = importer.get_model_entity()
        logit = entity.build(features, is_training=True)
        train_pack.append(entity.train(logit, labels))
        eval_pack.append(entity.evaluate(logit, labels))

    if self.hp_dataset == 'imagenet':
        image_list = sorted(os.listdir(train_feature_input))

    config = tf.ConfigProto()
    config.allow_soft_placement = True
    config.gpu_options.allow_growth = True

    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        steps_per_epoch = train_label_input.shape[0] // batch_size
        for _ in range(epochs):
            for step in range(steps_per_epoch):
                lo = step * batch_size
                hi = lo + batch_size
                if self.hp_dataset == 'imagenet':
                    feature_batch = load_imagenet_raw(train_feature_input, image_list[lo:hi],
                                                      self.img_height, self.img_width)
                else:
                    feature_batch = train_feature_input[lo:hi]
                sess.run(train_pack, feed_dict={features: feature_batch,
                                                labels: train_label_input[lo:hi]})

        acc_pack = evaluate_pack_model(sess, features, labels, eval_pack)
        print(f'Accuracy: {acc_pack}')
        return acc_pack
def train_model(train_step_arg, batch_size_arg, model_type_arg, tidx_arg, global_args):
    """Run the training loop for one model of a multi-training job.

    Args:
        train_step_arg: the model's train op to run each step.
        batch_size_arg: batch size used to slice the training data.
        model_type_arg: model type name, used to label timeline trace files.
        tidx_arg: index of this model; selects its placeholders in global_args.
        global_args: mapping holding the 'features<idx>'/'labels<idx>' placeholders.
    """
    train_dataset = cfg_para.multi_train_dataset
    num_epoch = cfg_para.multi_num_epoch
    use_tf_timeline = cfg_para.multi_use_tb_timeline
    use_cpu = cfg_para.multi_use_cpu

    if use_cpu:
        train_device = '/cpu:0'
    else:
        train_device = '/gpu:0'

    img_width, img_height, num_channel, num_class = load_dataset_para(train_dataset)
    train_feature_input, train_label_input = load_train_dataset(train_dataset)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    if train_dataset == 'imagenet':
        image_list = sorted(os.listdir(train_feature_input))

    with tf.device(train_device):
        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            num_batch = train_label_input.shape[0] // batch_size_arg
            for e in range(num_epoch):
                for i in range(num_batch):
                    print('epoch %d / %d, step %d / %d' % (e + 1, num_epoch, i + 1, num_batch))
                    batch_offset = i * batch_size_arg
                    batch_end = (i + 1) * batch_size_arg
                    if train_dataset == 'imagenet':
                        batch_list = image_list[batch_offset:batch_end]
                        feature_batch = load_imagenet_raw(train_feature_input, batch_list,
                                                          img_height, img_width)
                    else:
                        feature_batch = train_feature_input[batch_offset:batch_end]
                    label_batch = train_label_input[batch_offset:batch_end]

                    if use_tf_timeline:
                        profile_path = cfg_path.profile_path
                        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                        run_metadata = tf.RunMetadata()
                        sess.run(train_step_arg,
                                 feed_dict={
                                     global_args['features' + str(tidx_arg)]: feature_batch,
                                     global_args['labels' + str(tidx_arg)]: label_batch
                                 },
                                 options=run_options,
                                 run_metadata=run_metadata)
                        trace = timeline.Timeline(step_stats=run_metadata.step_stats)
                        # FIX: use a context manager so the trace file is always
                        # closed (the original leaked the file handle).
                        trace_name = (profile_path + '/' + str(model_type_arg) + '-' +
                                      str(batch_size_arg) + '-' + str(i) + '.json')
                        with open(trace_name, 'w') as trace_file:
                            trace_file.write(trace.generate_chrome_trace_format(
                                show_dataflow=True, show_memory=True))
                    else:
                        sess.run(train_step_arg,
                                 feed_dict={
                                     global_args['features' + str(tidx_arg)]: feature_batch,
                                     global_args['labels' + str(tidx_arg)]: label_batch
                                 })
def train_single():
    """Train one model per the single_* cfg_para settings; print accuracy and timing.

    Builds the configured model, trains it for the configured number of epochs
    (optionally writing Chrome-trace timeline files per step), evaluates it on
    the eval split, and prints step/overall timing. Returns nothing.
    """
    print('start training single')

    rand_seed = cfg_para.single_rand_seed
    num_epoch = cfg_para.single_num_epoch
    model_type = cfg_para.single_model_type
    num_layer = cfg_para.single_num_layer
    learning_rate = cfg_para.single_learning_rate
    activation = cfg_para.single_activation
    batch_size = cfg_para.single_batch_size
    optimizer = cfg_para.single_opt

    train_dataset = cfg_para.single_train_dataset
    use_tf_timeline = cfg_para.single_use_tb_timeline
    use_cpu = cfg_para.single_use_cpu

    if use_cpu:
        train_device = '/cpu:0'
    else:
        train_device = '/gpu:0'

    ##########################################
    # load dataset
    ##########################################
    img_width, img_height, num_channel, num_class = load_dataset_para(train_dataset)
    train_feature_input, train_label_input = load_train_dataset(train_dataset)
    eval_feature_input, eval_label_input = load_eval_dataset(train_dataset)

    ##########################################
    # build model
    ##########################################
    feature_ph = tf.placeholder(tf.float32, [None, img_width, img_height, num_channel])
    label_ph = tf.placeholder(tf.int64, [None, num_class])

    model_name_abbr = np.random.choice(rand_seed, 1, replace=False).tolist()
    dm = ModelImporter(model_type, str(model_name_abbr.pop()), num_layer, img_height,
                       img_width, num_channel, num_class, batch_size, optimizer,
                       learning_rate, activation, batch_padding=False)
    model_entity = dm.get_model_entity()
    model_logit = model_entity.build(feature_ph, is_training=True)
    train_op = model_entity.train(model_logit, label_ph)
    eval_op = model_entity.evaluate(model_logit, label_ph)

    ##########################################
    # train model
    ##########################################
    step_time = 0
    step_count = 0

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    if train_dataset == 'imagenet':
        image_list = sorted(os.listdir(train_feature_input))

    overall_time_start = timer()
    with tf.device(train_device):
        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            num_batch = train_label_input.shape[0] // batch_size
            for e in range(num_epoch):
                for i in range(num_batch):
                    print('epoch %d / %d, step %d / %d' % (e + 1, num_epoch, i + 1, num_batch))
                    # First step is excluded from timing (one-off warm-up cost).
                    if i != 0:
                        start_time = timer()
                    batch_offset = i * batch_size
                    batch_end = (i + 1) * batch_size
                    if train_dataset == 'imagenet':
                        batch_list = image_list[batch_offset:batch_end]
                        train_feature_batch = load_imagenet_raw(train_feature_input, batch_list,
                                                                img_height, img_width)
                    else:
                        train_feature_batch = train_feature_input[batch_offset:batch_end]
                    train_label_batch = train_label_input[batch_offset:batch_end]

                    if use_tf_timeline:
                        profile_path = cfg_path.profile_path
                        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                        run_metadata = tf.RunMetadata()
                        sess.run(train_op,
                                 feed_dict={feature_ph: train_feature_batch,
                                            label_ph: train_label_batch},
                                 options=run_options,
                                 run_metadata=run_metadata)
                        trace = timeline.Timeline(step_stats=run_metadata.step_stats)
                        # FIX: close the trace file deterministically (was leaked).
                        trace_name = (profile_path + '/' + str(model_type) + '-' +
                                      str(batch_size) + '-' + str(i) + '.json')
                        with open(trace_name, 'w') as trace_file:
                            trace_file.write(trace.generate_chrome_trace_format(
                                show_dataflow=True, show_memory=True))
                    else:
                        sess.run(train_op, feed_dict={feature_ph: train_feature_batch,
                                                      label_ph: train_label_batch})

                    if i != 0:
                        end_time = timer()
                        dur_time = end_time - start_time
                        print("step time:", dur_time)
                        step_time += dur_time
                        step_count += 1

            acc_avg = sess.run(eval_op, feed_dict={feature_ph: eval_feature_input,
                                                   label_ph: eval_label_input})
            print('evaluation accuracy:{}'.format(acc_avg))

    overall_time_end = timer()
    overall_time = overall_time_end - overall_time_start
    print(
        f'overall training time (s):{overall_time}, average step time (ms):{step_time / step_count * 1000}'
    )
def train_model(job_id):
    """Train the model selected by job_id from the multi_* configuration lists.

    Args:
        job_id: index into the multi_* cfg_para lists; also used as the TF
            model instance name.

    Returns:
        A formatted string reporting the average step time (ms) of the model.
    """
    model_type_list = cfg_para.multi_model_type
    num_layer_list = cfg_para.multi_num_layer
    activation_list = cfg_para.multi_activation
    batch_size_list = cfg_para.multi_batch_size
    learning_rate_list = cfg_para.multi_learning_rate
    optimizer_list = cfg_para.multi_opt

    model_type = model_type_list[job_id]
    num_layer = num_layer_list[job_id]
    activation = activation_list[job_id]
    batch_size = batch_size_list[job_id]
    learning_rate = learning_rate_list[job_id]
    optimizer = optimizer_list[job_id]

    num_epoch = cfg_para.multi_num_epoch
    train_dataset = cfg_para.multi_train_dataset
    use_tf_timeline = cfg_para.multi_use_tb_timeline
    use_cpu = cfg_para.multi_use_cpu

    if use_cpu:
        train_device = '/cpu:0'
    else:
        train_device = '/gpu:0'

    model_name = '{0}-{1}-{2}-{3}-{4}-{5}-{6}-{7}'.format(
        job_id, model_type, num_layer, batch_size, learning_rate, optimizer,
        num_epoch, train_dataset)

    ##########################################
    # load dataset
    ##########################################
    img_width, img_height, num_channel, num_class = load_dataset_para(train_dataset)
    train_feature_input, train_label_input = load_train_dataset(train_dataset)

    ##########################################
    # build model
    ##########################################
    features = tf.placeholder(tf.float32, [None, img_width, img_height, num_channel])
    labels = tf.placeholder(tf.int64, [None, num_class])

    dm = ModelImporter(model_type, str(job_id), num_layer, img_height, img_width,
                       num_channel, num_class, batch_size, optimizer, learning_rate,
                       activation, batch_padding=False)
    model_entity = dm.get_model_entity()
    model_logit = model_entity.build(features, is_training=True)
    train_op = model_entity.train(model_logit, labels)

    ##########################################
    # train model
    ##########################################
    step_time = 0
    step_count = 0

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    if train_dataset == 'imagenet':
        image_list = sorted(os.listdir(train_feature_input))

    with tf.device(train_device):
        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            num_batch = train_label_input.shape[0] // batch_size
            for e in range(num_epoch):
                for i in range(num_batch):
                    print('epoch %d / %d, step %d / %d' % (e + 1, num_epoch, i + 1, num_batch))
                    # First step is excluded from timing (one-off warm-up cost).
                    if i != 0:
                        start_time = timer()
                    batch_offset = i * batch_size
                    batch_end = (i + 1) * batch_size
                    if train_dataset == 'imagenet':
                        batch_list = image_list[batch_offset:batch_end]
                        train_feature_batch = load_imagenet_raw(train_feature_input, batch_list,
                                                                img_height, img_width)
                    else:
                        train_feature_batch = train_feature_input[batch_offset:batch_end]
                    train_label_batch = train_label_input[batch_offset:batch_end]

                    if use_tf_timeline:
                        profile_path = cfg_path.profile_path
                        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                        run_metadata = tf.RunMetadata()
                        sess.run(train_op,
                                 feed_dict={features: train_feature_batch,
                                            labels: train_label_batch},
                                 options=run_options,
                                 run_metadata=run_metadata)
                        trace = timeline.Timeline(step_stats=run_metadata.step_stats)
                        # FIX: use a context manager so the trace file is always
                        # closed (the original leaked the file handle).
                        trace_name = (profile_path + '/' + str(model_type) + '-' +
                                      str(batch_size) + '-' + str(i) + '.json')
                        with open(trace_name, 'w') as trace_file:
                            trace_file.write(trace.generate_chrome_trace_format(
                                show_dataflow=True, show_memory=True))
                    else:
                        sess.run(train_op, feed_dict={features: train_feature_batch,
                                                      labels: train_label_batch})

                    if i != 0:
                        end_time = timer()
                        dur_time = end_time - start_time
                        print("step time:", dur_time)
                        step_time += dur_time
                        step_count += 1

    step_time_result = f'average step time (ms) of {model_name}: {step_time / step_count * 1000}'
    return step_time_result