def get_test_numpy(self):
    """Materialize the whole test tf.data dataset into numpy, caching the result.

    The dataset is drained only on the first call (while ``self.accum_test_x``
    is empty); later calls return the cached list directly.

    Returns:
        list: accumulated test examples (``self.accum_test_x``). Labels are
        stored on the instance as ``self.accum_test_y`` (converted to a
        np.ndarray after the first full drain); ``self.accm_test_cnt`` tracks
        the number of examples read.
    """
    if self.test_tfds is None:
        error("Error: test_tfds is None.")
        # Bug fix: the original returned a (x, y) tuple here while the normal
        # path below returns only x; keep the return shape consistent.
        return self.accum_test_x
    if len(self.accum_test_x) == 0:
        time_test_np_start = time.time()
        tfds_test_os_iterator = self.test_tfds.make_one_shot_iterator()
        as_timer("tfds_test_ositer")
        tfds_test_iter_next = tfds_test_os_iterator.get_next()
        time_test_os_iterator_end = time.time()
        info(
            "note: now take time_test_os_iterator_end cost_time={}s".format(
                round(time_test_os_iterator_end - time_test_np_start, 3)
            )
        )
        with tf.Session(config=tf.ConfigProto(log_device_placement=False)) as sess:
            if self.domain == "text":
                # Text examples arrive batched and padded; squeeze the dummy
                # spatial axes down to token-id sequences.
                while True:
                    try:
                        example, labels = sess.run(tfds_test_iter_next)
                        example = np.squeeze(example, (2, 3))
                        example = np.squeeze(example, axis=-1)
                        # Bug fix: np.int was removed in NumPy 1.24; the
                        # builtin int yields the same default integer dtype.
                        example = example.astype(int)
                        self.accum_test_x.extend(example)
                        self.accum_test_y.extend(labels)
                        self.accm_test_cnt += example.shape[0]
                    except tf.errors.OutOfRangeError:
                        break
            else:
                # Speech path: one example per run, e.g. shape (86401, 1, 1, 1).
                while True:
                    try:
                        example, labels = sess.run(tfds_test_iter_next)
                        self.accum_test_x.append(example)
                        self.accum_test_y.append(labels)
                        self.accm_test_cnt += 1
                    except tf.errors.OutOfRangeError:
                        as_timer("tfds_test_run_OOR_{}".format(self.accm_test_cnt))
                        break
        time_test_np_end = time.time()
        info(
            "note: now take test accm_test_cnt={}, cost_time={}s".format(
                self.accm_test_cnt, round(time_test_np_end - time_test_np_start, 3)
            )
        )
        self.accum_test_y = np.array(self.accum_test_y)
    return self.accum_test_x
def test(self, dataset, remaining_time_budget=None):
    """Test method of domain-specific model.

    Args:
        dataset: test tf.data.Dataset, handed to the dataset transformer.
        remaining_time_budget: seconds left; forwarded to the inner model.

    Returns:
        Y_pred: predictions from the basic domain model (early loops) or the
        wisdom model (later loops); None when the domain is not speech.
    """
    as_timer("test_start")
    # Init tf_test_dataset for the first time (no-op on later calls).
    self.tf_dataset_trainsformer.init_test_tfds(dataset)
    self.domain_dataset_test, self.X_test = self.tf_dataset_trainsformer.get_speech_test_dataset()
    # The original metadata doesn't contain the number of test examples, so
    # fill it in from the materialized test set.
    if self.domain in ["text", "speech"] and (not self.domain_metadata["test_num"] >= 0):
        self.domain_metadata["test_num"] = len(self.X_test)
    logger.info("Note:test_process test domain metadata is {}".format(self.domain_metadata))
    # Bug fix: Y_pred was unbound on the non-speech branch, so the final
    # `return Y_pred` raised NameError instead of returning after the log.
    Y_pred = None
    # Make predictions
    if self.domain in ["speech"]:
        if (
            self.main_train_loop_num
            <= speech_ms_mlp_conf.midwei_train_start_loop + speech_ms_mlp_conf.midwei_predict_block_loop
        ):
            Y_pred = self.domain_model.test(self.domain_dataset_test, remaining_time_budget=remaining_time_budget)
            logger.info(
                "Note: speech pasa_model, speech_main_train_loop={}, speech_main_test_loop={}".format(
                    self.main_train_loop_num, self.main_test_loop_num
                )
            )
            # Update self.done_training
            self.done_training = self.domain_model.done_training
        else:
            Y_pred = self.speech_widsom_model.test(
                self.domain_dataset_test, remaining_time_budget=remaining_time_budget
            )
            logger.info(
                "Note: speech dw_model, train_loop={}, test_loop={}".format(
                    self.main_train_loop_num, self.main_test_loop_num)
            )
            # Update self.done_training
            self.done_training = self.speech_widsom_model.done_training
        as_timer("test_end")
        logger.info(as_timer)
    else:
        logger.error("Note: Domain is not Speech!")
    self.main_test_loop_num += 1
    return Y_pred
def init_test_tfds(self, test_tfds):
    """Register the test tf.data dataset once; later calls are no-ops.

    Text datasets are padded-batched (20 per batch, pad value -1); other
    domains keep the dataset as-is (an optional cutoff map is disabled).
    """
    if self.test_tfds is not None:
        return
    if self.domain == "text":
        # Pad variable-length token sequences and their labels with -1.
        self.test_tfds = test_tfds.padded_batch(
            20,
            padded_shapes=([None, 1, 1, 1], [None]),
            padding_values=(tf.constant(-1, dtype=tf.float32), tf.constant(-1, dtype=tf.float32)),
        )
    else:
        # config: test_if_map_cutoff — would truncate overly long waveforms.
        tfds_if_test_cutoff = False
        if tfds_if_test_cutoff:
            self.test_tfds = test_tfds.map(lambda x, y: (x[:800000], y), num_parallel_calls=4)
        else:
            self.test_tfds = test_tfds
    as_timer("tfds_cvtr_init_tfds")
def __init__(self, metadata):
    """
    Args:
        metadata: an AutoDLMetadata object. Its definition can be found in
            AutoDL_ingestion_program/dataset.py
    """
    self.done_training = False
    self.metadata = metadata
    # Domain is fixed to speech for this solution; infer_domain is bypassed.
    self.domain = "speech"
    logger.info("Note:The inferred domain of current dataset is: {}.".format(self.domain))

    # Resolve the per-domain model class and its metadata.
    domain_model_cls = meta_domain_2_model(self.domain)
    self.domain_metadata = get_domain_metadata(metadata, self.domain)
    self.class_num = self.domain_metadata["class_num"]
    self.train_num = self.domain_metadata["train_num"]
    logger.info("Note:The domain metadata is {}".format(self.domain_metadata))
    self.domain_model = domain_model_cls(self.domain_metadata)

    # Secondary "wisdom" model, trained in later loops with refreshed data.
    self.speech_widsom_model = ASpeechWidsomModel(self.domain_metadata)
    self.speech_wisdom_dataset_train = None
    logger.info("Note:Init Speech Wisdom solution, is {}".format(self.domain_metadata))

    # Loop counters for the train/test cycle.
    self.main_train_loop_num = 0
    self.main_test_loop_num = 0

    # Incremental dataset-sampling state.
    self.dataset_sample_size = None
    self.dataset_read_num_second = None
    self.data_all_np_x_list = list()
    self.data_all_np_y_array = None
    self.ds_incr_flag = True  # True while examples remain to be sampled incrementally.

    self.domain_dataset_train = None
    self.domain_dataset_test = None
    # Converter between tf.data datasets and numpy arrays.
    self.tf_dataset_trainsformer = TfDatasetTransformer(if_train_shuffle=speech_ds_tds_conf.if_shuffle)
    as_timer("model_speech_init")
def train(self, dataset, remaining_time_budget=None):
    """Train method of domain-specific model."""
    logger.info("Note: speech_train_process model.py starts train")
    as_timer("train_start")
    # Register the raw training tf.data dataset (first call only).
    self.tf_dataset_trainsformer.init_train_tfds(dataset, self.train_num)
    if self.domain not in ["speech"]:
        logger.error("Note: Domain is not Speech!")
        return

    # Phase 1: lightweight model on a growing sample of the training data.
    if self.main_train_loop_num < speech_ms_mlp_conf.lightwei_train_end_loop:
        # Sample size follows the configured per-loop ratio, capped at 50
        # examples per class. fixme: need to be autotuned.
        take_size = min(
            int(self.train_num * speech_ds_tds_conf.sample_ratio[self.main_train_loop_num]),
            self.class_num * 50,
        )
        self.domain_model.train(
            self.tf_dataset_trainsformer.get_speech_train_dataset(take_size),
            remaining_time_budget=remaining_time_budget,
        )
        logger.info(
            "Note: domain={}, main_train_loop_num={}, light_model train finished.".format(
                self.domain, self.main_train_loop_num
            )
        )
        as_timer("speech_model_basic_train")

    # Phase 2: heavier "wisdom" model once enough loops have elapsed.
    if self.main_train_loop_num >= speech_ms_mlp_conf.midwei_train_start_loop:
        self.speech_widsom_model.train(
            self.tf_dataset_trainsformer.get_speech_train_dataset_full(), remaining_time_budget
        )
        logger.info("Note: start wisdom at np, main_train_loop_num={}".format(
            self.main_train_loop_num))
        as_timer("speech_tr34_train")

    logger.info("Note:time_train model.py domain_model train finished.")
    # Update self.done_training from the basic model.
    self.done_training = self.domain_model.done_training
    self.main_train_loop_num += 1
    as_timer("train_end")
    logger.info(as_timer)
def get_train_numpy(self, update_train_num):
    """Incrementally drain up to ``update_train_num`` training examples to numpy.

    A one-shot iterator over ``self.train_tfds`` is created lazily and kept
    on the instance; each call pulls examples until either ``update_train_num``
    new examples were fetched or the full dataset (``self.train_num``) has
    been consumed. Fetched examples are also folded into the cross-call
    accumulators ``self.accum_train_x`` / ``self.accum_train_y``.

    Args:
        update_train_num: target number of new examples for this call.

    Returns:
        tuple: (cur_incre_train_x, labels) — only the examples fetched by
        THIS call, as (list of np.ndarray, np.ndarray). Both are empty when
        the dataset was already fully consumed.
    """
    as_timer("tfdscvtr_get_train_np_start")
    if self.train_tfds is None:
        error("Error: train_tfds is None.")
        return self.accum_train_x, self.accum_train_y
    if self.tfds_train_os_iterator is None:
        time_mosi_start = time.time()
        self.tfds_train_os_iterator = self.train_tfds.make_one_shot_iterator()
        as_timer("tfds_train_os_iterator_make")
        self.tfds_train_iter_next = self.tfds_train_os_iterator.get_next()
        time_mosi_end = time.time()
        info("note: train_os_iterator done, cost_time={}s".format(round(time_mosi_end - time_mosi_start, 3)))

    cur_get_cnt = 0
    cur_data_y = list()
    cur_incre_train_x = list()

    if self.accm_train_cnt < self.train_num:
        time_train_np_start = time.time()
        if self.domain == "text":
            info("note: domain={}".format(self.domain))
            # Bug fix: this counter was re-initialized at the top of every
            # loop iteration, so it never counted past 1; hoist it.
            example_batch_num = 0
            while True:
                try:
                    example, labels = self.tfds_convertor_sess.run(self.tfds_train_iter_next)
                    example = np.squeeze(example, (2, 3))
                    example = np.squeeze(example, axis=-1)
                    # Bug fix: np.int was removed in NumPy 1.24; the builtin
                    # int gives the same platform-default integer dtype.
                    example = example.astype(int)
                    # NOTE: example and labels are batches here (batch_size=20).
                    cur_incre_train_x.extend(example)
                    cur_data_y.extend(labels)
                    cur_get_cnt += example.shape[0]
                    self.accm_train_cnt += example.shape[0]
                    example_batch_num += 1
                    if cur_get_cnt >= update_train_num or self.accm_train_cnt >= self.train_num:
                        time_train_np_end = time.time()
                        info(
                            "note: now text extend batch domain={} take train update={}, accm_train_cnt={}, cost_time={}s".format(
                                self.domain, cur_get_cnt, self.accm_train_cnt,
                                round(time_train_np_end - time_train_np_start, 3)
                            )
                        )
                        break
                except tf.errors.OutOfRangeError:
                    info("train out of range, cur_get_cnt={}".format(cur_get_cnt))
                    break
        else:
            # Speech path: one example per run, e.g. shape (86401, 1, 1, 1).
            while True:
                try:
                    example, labels = self.tfds_convertor_sess.run(self.tfds_train_iter_next)
                    cur_incre_train_x.append(example)
                    cur_data_y.append(labels)
                    cur_get_cnt += 1
                    self.accm_train_cnt += 1
                    if cur_get_cnt >= update_train_num or self.accm_train_cnt >= self.train_num:
                        time_train_np_end = time.time()
                        info(
                            "note: now append domain={} take train update={}, accm_train_cnt={}, train_num={}, cost_time={}s".format(
                                self.domain, cur_get_cnt, self.accm_train_cnt, self.train_num,
                                round(time_train_np_end - time_train_np_start, 3)
                            )
                        )
                        as_timer("tfds_get_train_np_update={}".format(cur_get_cnt))
                        break
                except tf.errors.OutOfRangeError:
                    break

        # Fold this call's increment into the cross-call accumulators.
        self.accum_train_x.extend(cur_incre_train_x)
        as_timer("tfds_get_train_np_accum_train_x_{}".format(len(self.accum_train_x)))
        if self.accum_train_y is None:
            self.accum_train_y = np.array(cur_data_y)
        else:
            self.accum_train_y = np.concatenate((self.accum_train_y, np.array(cur_data_y)))
        info("note: self.accum_train_x num_new={}, incre_train_num={}, self.accum_train_y shape={}, cur_data_y shape={}".format(
            len(self.accum_train_x), len(cur_incre_train_x), self.accum_train_y.shape,
            np.array(cur_data_y).shape
        ))
    else:
        # Dataset fully consumed: release the dedicated TF session.
        self.tfds_convertor_sess.close()

    return cur_incre_train_x, np.array(cur_data_y)