Пример #1
0
    def get_test_np(self):
        """Drain the test TFDS into numpy and return the test examples.

        On the first call the one-shot iterator is run to exhaustion and the
        results are cached in ``self.accum_test_x`` / ``self.accum_test_y``;
        subsequent calls reuse the cache.  Returns a list of per-example
        arrays with size-1 axes squeezed out.

        NOTE(review): the error path (no tfds) returns the cached ``(x, y)``
        tuple instead of a list — inconsistent with the normal return type;
        confirm callers handle it.
        """
        if self.test_tfds is None:
            error("Error: test_tfds is None.")
            return self.accum_test_x, self.accum_test_y

        if len(self.accum_test_x) == 0:
            time_test_np_start = time.time()
            tfds_test_os_iterator = self.test_tfds.make_one_shot_iterator()
            as_timer("tfds_test_ositer")
            tfds_test_iter_next = tfds_test_os_iterator.get_next()

            if self.if_padded_batch:
                # Padded-batch mode: each run() yields a whole batch; drop the
                # singleton axes and accumulate example-wise.
                while True:
                    try:
                        example, labels = self.tfds_convertor_sess.run(
                            tfds_test_iter_next)
                        example = np.squeeze(example, (2, 3))
                        example = np.squeeze(example, axis=-1)
                        # FIX: np.int was deprecated in NumPy 1.20 and removed
                        # in 1.24; the builtin int is the documented equivalent.
                        example = example.astype(int)

                        self.accum_test_x.extend(example)
                        self.accum_test_y.extend(labels)
                        self.accm_test_cnt += example.shape[0]
                    except tf.errors.OutOfRangeError:
                        break
            else:
                # Unbatched mode: one example per session.run().
                while True:
                    try:
                        example, labels = self.tfds_convertor_sess.run(
                            tfds_test_iter_next)
                        self.accum_test_x.append(example)
                        self.accum_test_y.append(labels)
                        self.accm_test_cnt += 1

                    except tf.errors.OutOfRangeError:
                        as_timer("tfds_test_run_OOR_{}".format(
                            self.accm_test_cnt))
                        break

            time_test_np_end = time.time()
            info("note: now take test accm_test_cnt={}, cost_time={}s".format(
                self.accm_test_cnt,
                round(time_test_np_end - time_test_np_start, 3)))
            # Labels become a single ndarray once fully accumulated.
            self.accum_test_y = np.array(self.accum_test_y)

        return [np.squeeze(x) for x in self.accum_test_x]
Пример #2
0
    def init_train_tfds(self, train_tfds, train_num, force_shuffle=False):
        """Cache the train TFDS, optionally shuffled and padded-batched.

        Re-initialization is a no-op when a dataset is already cached with a
        non-zero train count, unless ``force_shuffle`` requests a fresh
        shuffle.  Resets the cached iterator so the next fetch rebuilds it.
        """
        already_cached = (self.train_tfds is not None
                          and self.train_num != 0
                          and force_shuffle is not True)
        if already_cached:
            return

        self.train_num = train_num
        self.train_tfds = train_tfds

        if self.if_train_shuffle or force_shuffle is True:
            # Shuffle buffer is capped both by the configured maximum and by
            # 60% of the dataset size.
            shuffle_buffer = min(self.train_max_shuffle_size,
                                 int(self.train_num * 0.6))
            self.train_tfds = self.train_tfds.shuffle(
                buffer_size=shuffle_buffer)

        if self.if_padded_batch:
            self.train_tfds = self.train_tfds.padded_batch(
                self.padded_batch_size,
                padded_shapes=([None, 1, 1, 1], [None]),
                padding_values=(tf.constant(-1, dtype=tf.float32),
                                tf.constant(-1, dtype=tf.float32)),
            )

        # Invalidate any existing iterator so get_train_np() rebuilds it.
        self.tfds_train_os_iterator = None

        info(
            "note: train_tfds cache, if_train_shuffle={}, force_shuffle={}, reset tfds_train_os_iterator None."
            .format(self.if_train_shuffle, force_shuffle))
Пример #3
0
    def get_train_np(self, take_size):
        """Fetch up to ``take_size`` fresh train examples from the TFDS.

        Newly fetched examples are appended to the accumulated caches
        (``self.accum_train_x`` / ``self.accum_train_y``) and the increment is
        returned as ``{"x": [squeezed np arrays], "y": np.ndarray}``.  Once
        ``self.train_num`` examples have been accumulated the tf session is
        closed and an empty increment is returned.

        NOTE(review): the error path (no tfds) returns a tuple while the
        normal path returns a dict — confirm callers handle both shapes.
        """
        as_timer("tfdscvtr_get_train_np_start")
        if self.train_tfds is None:
            error("Error: train_tfds is None.")
            return self.accum_train_x, self.accum_train_y

        if self.tfds_train_os_iterator is None:
            # init_train_tfds() resets the iterator to None; rebuild lazily.
            self.tfds_train_os_iterator = self.train_tfds.make_one_shot_iterator(
            )
            as_timer("tfds_train_os_iterator_make")
            self.tfds_train_iter_next = self.tfds_train_os_iterator.get_next()

        cur_get_cnt = 0
        cur_data_y = list()
        cur_incre_train_x = list()

        if self.accm_train_cnt < self.train_num:
            time_train_np_start = time.time()
            if self.if_padded_batch:
                info("note: domain={}".format(self.domain))
                # FIX: this counter was re-initialized to 0 on every loop pass,
                # so it could never count past 1; hoisted above the loop so it
                # counts the batches fetched in this call.
                example_batch_num = 0
                while True:
                    try:
                        example, labels = self.tfds_convertor_sess.run(
                            self.tfds_train_iter_next)
                        example = np.squeeze(example, (2, 3))
                        example = np.squeeze(example, axis=-1)
                        # FIX: np.int was deprecated in NumPy 1.20 and removed
                        # in 1.24; the builtin int is the documented equivalent.
                        example = example.astype(int)
                        # NOTE: example/labels are whole batches here
                        # (batch_size == self.padded_batch_size).
                        cur_incre_train_x.extend(example)
                        cur_data_y.extend(labels)
                        cur_get_cnt += example.shape[0]
                        self.accm_train_cnt += example.shape[0]
                        example_batch_num += 1

                        if cur_get_cnt >= take_size or self.accm_train_cnt >= self.train_num:
                            time_train_np_end = time.time()
                            info(
                                "note: now text extend batch domain={} take train update={}, accm_train_cnt={}, cost_time={}s"
                                .format(
                                    self.domain, cur_get_cnt,
                                    self.accm_train_cnt,
                                    round(
                                        time_train_np_end -
                                        time_train_np_start, 3)))
                            break

                    except tf.errors.OutOfRangeError:
                        info("train out of range, cur_get_cnt={}".format(
                            cur_get_cnt))
                        break

            else:
                # Unbatched mode: one example per session.run().
                while True:
                    try:
                        example, labels = self.tfds_convertor_sess.run(
                            self.tfds_train_iter_next)
                        cur_incre_train_x.append(example)
                        cur_data_y.append(labels)
                        cur_get_cnt += 1
                        self.accm_train_cnt += 1
                        if cur_get_cnt >= take_size or self.accm_train_cnt >= self.train_num:
                            time_train_np_end = time.time()
                            info(
                                "note: now append domain={} take train update={}, accm_train_cnt={}, train_num={}, cost_time={}s"
                                .format(
                                    self.domain, cur_get_cnt,
                                    self.accm_train_cnt, self.train_num,
                                    round(
                                        time_train_np_end -
                                        time_train_np_start, 3)))
                            as_timer("tfds_get_train_np_update={}".format(
                                cur_get_cnt))
                            break

                    except tf.errors.OutOfRangeError:
                        break

            self.accum_train_x.extend(cur_incre_train_x)
            as_timer("tfds_get_train_np_accum_train_x_{}".format(
                len(self.accum_train_x)))

            # Labels are concatenated into one growing ndarray.
            if self.accum_train_y is None:
                self.accum_train_y = np.array(cur_data_y)
            else:
                self.accum_train_y = np.concatenate(
                    (self.accum_train_y, np.array(cur_data_y)))

            info(
                "note: self.accum_train_x num_new={}, incre_train_num={}, self.accum_train_y shape={}, cur_data_y shape={}"
                .format(len(self.accum_train_x), len(cur_incre_train_x),
                        self.accum_train_y.shape,
                        np.array(cur_data_y).shape))

        else:
            # Everything already materialized: release the tf session.
            self.tfds_convertor_sess.close()

        return {
            "x": [np.squeeze(x) for x in cur_incre_train_x],
            "y": np.array(cur_data_y)
        }
Пример #4
0
import tensorflow as tf
import numpy as np
import time

from at_toolkit.at_utils import info, error, as_timer
from at_toolkit.interface.adl_tfds_convertor import AbsTfdsConvertor

# Log the TF version once at import time (helps diagnose env mismatches).
info("note, tf version={}".format(tf.__version__))


class TfdsConvertor(AbsTfdsConvertor):
    """Accumulates a tensorflow-datasets pipeline into numpy arrays.

    Train/test data are pulled incrementally from a TFDS one-shot iterator
    and cached on the instance across calls.
    """

    def __init__(self,
                 if_train_shuffle=False,
                 train_shuffle_size=100,
                 if_pad_batch=False,
                 padded_batch_size=20,
                 domain=None):
        # NOTE(review): the constructor arguments are never assigned in this
        # visible span, yet other methods read e.g. self.if_train_shuffle,
        # self.if_padded_batch, self.padded_batch_size, self.domain and
        # self.tfds_convertor_sess — this __init__ is probably truncated
        # here; confirm against the full source.
        self.train_tfds = None
        self.test_tfds = None
        # Totals reported by the caller via init_*_tfds().
        self.train_num = 0
        self.test_num = 0
        # Accumulated examples/labels; x is a list of arrays, train y becomes
        # an ndarray after the first fetch (see get_train_np).
        self.accum_train_x = list()
        self.accum_train_y = None
        self.accm_train_cnt = 0
        self.accum_test_x = list()
        self.accum_test_y = list()
        self.accm_test_cnt = 0

        # Lazily (re)built by get_train_np(); reset by init_train_tfds().
        self.tfds_train_os_iterator = None
        self.tfds_train_iter_next = None
Пример #5
0
    def test_pipeline(self, test_tfds):
        """Run one test pass: tfds→np, featurize, then predict with the
        currently selected classifier.

        Returns an ndarray of class probabilities for the LR classifiers,
        the raw predictions for TR34.  NOTE(review): falls through and
        returns None (implicitly) for any other classifier name — confirm
        callers tolerate that.
        """
        as_timer("test_start")
        # 1. raw data: tfds2np.
        self.tfds_convertor.init_test_tfds(test_tfds)
        if not self.feats_data_db.raw_data_db.if_raw_test_2_np_done:
            raw_test_np = self.tfds_convertor.get_test_np()
            assert isinstance(raw_test_np, list), "raw_test_np is not list"
            info("raw_test_np, len={}, ele={}".format(len(raw_test_np), raw_test_np[0]))
            self.feats_data_db.raw_data_db.put_raw_test_np(raw_test_np)
            as_timer("te_s0_tfds2np_{}".format(len(raw_test_np)))

        if self.cur_cls_name in [CLS_LR_LIBLINEAER, CLS_LR_SAG]:
            # 2. get asso-feats test.
            # Assumes 5-second samples at 16 kHz — TODO confirm against the
            # feature extractor's expectations.
            use_feat_params = {"len_sample": 5, "sr": 16000}
            cur_test_examples_x = self.feats_data_db.get_raw_test_feats(self.cur_feat_name, use_feat_params)

            as_timer("te_s1_examples_{}".format(len(cur_test_examples_x)))

            # 3. cur_cls test and predict.
            assert isinstance(self.cur_cls, AdlClassifier)
            cur_test_preds = self.cur_cls.predict_proba(cur_test_examples_x, predict_prob_params={"if_multilabel": self.is_multilabel})
            self.test_pip_id += 1
            as_timer("test_end")
            info(as_timer)
            return np.array(cur_test_preds)

        if self.cur_cls_name in [CLS_TR34]:
            # tr34 train pipeline warmup: keep training (without new data)
            # until the warmup run count is reached.
            while self.tr34_cls_train_pip_run < self.tr34_trainpip_warmup:
            # while self.tr34_cls_train_pip_run < TR34_TRAINPIP_WARMUP:
                self.train_pipeline(train_tfds=None, update_train_data=False)

            assert isinstance(self.cur_cls, ThinResnet34Classifier), "Error, cur_cls type error."
            # 2. get asso-feats test.
            if_force_test_feats = self.cur_cls.decide_if_renew_testfeats()
            use_feat_params = self.cur_cls.imp_feat_args
            cur_test_examples_x = self.feats_data_db.get_raw_test_feats(
                self.cur_feat_name, use_feat_params, if_force_test_feats
            )
            # need reformat for test
            info("tr34_test, type={}".format(type(cur_test_examples_x)))
            cur_test_examples_x = np.asarray(cur_test_examples_x)

            # Add a trailing channel axis for the CNN input.
            cur_test_examples_x = cur_test_examples_x[:, :, :, np.newaxis]
            info("tr34_test, type={}, shape={}".format(type(cur_test_examples_x), cur_test_examples_x.shape))

            as_timer("te_s1_examples_{}".format(len(cur_test_examples_x)))

            # 3. cur_cls test and predict.
            assert isinstance(self.cur_cls, AdlClassifier)

            cur_test_preds = self.cur_cls.predict_proba(cur_test_examples_x)

            # Free the (large) feature array before returning.
            del cur_test_examples_x

            self.test_pip_id += 1
            as_timer("test_end")
            info(as_timer)
            return cur_test_preds

        as_timer("test_end")
        info(as_timer)
Пример #6
0
    def train_pipeline(self, train_tfds, update_train_data=True):
        """Run one train pass: tfds→np, optional val split, model select,
        sample, featurize, fit, then optionally evaluate on the val split.

        Returns a copy of ``self.cur_train_his_report`` annotated with the
        validation nAUC and the classifier name used this round.
        """
        # Decide how many new raw examples to convert this round; round 1 is
        # floored at 200 so the first real fit has enough data.
        if self.train_pip_id < len(self.tfds2np_take_size_array):
            if self.train_pip_id == 1:
                take_train_size = max(200, int(self.tfds2np_take_size_array[self.train_pip_id] * self.train_num))
            else:
                take_train_size = int(self.tfds2np_take_size_array[self.train_pip_id] * self.train_num)
        else:
            take_train_size = 200
        self.token_train_size += take_train_size
        self.cur_train_his_report = dict()
        as_timer("train_start")

        # 1. raw data: tfds2np.
        self.tfds_convertor.init_train_tfds(train_tfds, self.train_num)
        if update_train_data is True and self.feats_data_db.raw_data_db.raw_train_np_filled_num < self.train_num:
            accm_raw_train_np_dict = self.tfds_convertor.get_train_np_accm(take_train_size)
            self.minis_eda_report = minisamples_edaer(accm_raw_train_np_dict["x"], accm_raw_train_np_dict["y"])
            # decide if re-shuffle train tfds: if the mini-sample covers too
            # few classes, force a reshuffle and re-fetch once.
            if self.minis_eda_report.get("y_cover_rate") <= 0.5:
                info("Warning, old_y_cover_rate={} is too low, need re_shuffle train_tfds.".format(self.minis_eda_report.get("y_cover_rate")))
                self.tfds_convertor.init_train_tfds(train_tfds, self.train_num, force_shuffle=True)
                # renew data, shuffle, get data and get report.
                accm_raw_train_np_dict = self.tfds_convertor.get_train_np_accm(take_train_size)
                self.minis_eda_report = minisamples_edaer(accm_raw_train_np_dict["x"], accm_raw_train_np_dict["y"])
                info("Note, new_y_cover_rate={} ".format(self.minis_eda_report.get("y_cover_rate")))

            # update: meta-is_multilabel
            self.is_multilabel = self.minis_eda_report.get("is_multilabel")
            self.tr34_cls.renew_if_multilabel(self.is_multilabel)

            # One-time: let the decision maker size the tfds2np schedule from
            # the mini-sample EDA report.
            if self.tfds2np_takesize_flag is False:
                self.decision_maker.learn_train_minisamples_report(self.minis_eda_report)
                self.tfds2np_take_size_array = self.decision_maker.decide_tfds2np_array()
                self.tfds2np_takesize_flag = True

            info("Note, mini_eda_report = {}, tfds2np_takesize_array={}".format(self.minis_eda_report, self.tfds2np_take_size_array))
            self.feats_data_db.raw_data_db.put_raw_train_np(accm_raw_train_np_dict["x"], accm_raw_train_np_dict["y"])

        # 1-1: option: split val (once, when enough data has been tokenized).
        if_split_val = self.decision_maker.decide_if_split_val(self.token_train_size)
        info("Val: if_val_on={}, if_split_val={}, len={}".format(IF_VAL_ON, if_split_val, len(self.val_sample_idxs)))
        if IF_VAL_ON and if_split_val and len(self.val_sample_idxs) == 0:
            val_mode = "bal"
            val_num = self.decision_maker.decide_g_valid_num()
            self.val_sample_idxs = self.val_splitor.get_valid_sample_idxs(
                np.stack(self.feats_data_db.raw_data_db.raw_train_y_np_table_filled), val_num=val_num, mode=val_mode
            )
            self.feats_data_db.raw_data_db.put_split_valid_np(self.val_sample_idxs)
            self.cur_val_examples_y = self.feats_data_db.get_raw_train_y(self.val_sample_idxs)
            info(
                "Note, do val_split, mode={}, val_num={}, real_num={}, val_sampld_idxs={}".format(
                    val_mode, val_num, len(self.val_sample_idxs), self.val_sample_idxs
                )
            )

        # 2. model select.
        self.cur_cls_name = self.decision_maker.decide_model_select(self.train_pip_id)
        info("---------Model_Select_CLS={}---------".format(self.cur_cls_name))
        self.cur_cls = self.cur_cls_ins_table.get(self.cur_cls_name)
        self.cur_sampler = self.cur_sampler_table.get(self.cur_cls_name)

        if self.cur_cls_name in [CLS_LR_LIBLINEAER, CLS_LR_SAG]:
            # 3. sample train_idxs.
            # sample-even_sample.
            if self.is_multilabel is False:
                self.lr_sampler.init_train_y(self.feats_data_db.raw_data_db.raw_train_y_np_table_filled)
                class_inverted_index_array = self.lr_sampler.init_each_class_index_by_y(self.lr_sampler.train_y)
                info(
                    "class_inverted_array len={}, some={}".format(
                        len(class_inverted_index_array), class_inverted_index_array[:3]
                    )
                )
                cur_train_sample_idxs = self.lr_sampler.init_even_class_index_by_each(class_inverted_index_array)
                info("cur_train_sample_idxs len={}, some={}".format(len(cur_train_sample_idxs), cur_train_sample_idxs[:3]))
                # Flatten the per-class index lists into one flat index list.
                cur_train_sample_idxs = [item for sublist in cur_train_sample_idxs for item in sublist]
                info("cur_train_sample_idxs len={}, some={}".format(len(cur_train_sample_idxs), cur_train_sample_idxs[:3]))
                # filter val idxs out.
                # NOTE(review): `i not in list` is O(n) per element — a set of
                # val_sample_idxs would make this linear; left as-is.
                cur_train_sample_idxs = [i for i in cur_train_sample_idxs if i not in self.val_sample_idxs]
                info("cur_train_sample_idxs len={}, some={}".format(len(cur_train_sample_idxs), cur_train_sample_idxs[:3]))
                as_timer("t_s3_trainidx_{}".format(len(cur_train_sample_idxs)))

            # config: all put into: nosample
            else:
                cur_train_sample_idxs = range(len(self.feats_data_db.raw_data_db.raw_train_y_np_table_filled))

            # 4. get asso-feats train.
            self.cur_feat_name = CLS_2_FEATNAME_REG_TABLE.get(self.cur_cls_name)
            # Assumes 5-second samples at 16 kHz — TODO confirm against the
            # feature extractor's expectations.
            self.use_feat_params = {"len_sample": 5, "sr": 16000}
            cur_train_examples_x = self.feats_data_db.get_raw_train_feats(
                self.cur_feat_name, cur_train_sample_idxs, self.use_feat_params
            )
            cur_train_examples_y = self.feats_data_db.get_raw_train_y(cur_train_sample_idxs)
            info("cur_train_examples_x, shape={},".format(cur_train_examples_x.shape))
            info("cur_train_examples_y, shape={},".format(cur_train_examples_y.shape))

            train_eda_report = sample_y_edaer(cur_train_examples_y)
            info("Note, train_eda_y_report = {}".format(train_eda_report))

            as_timer("t_s4_texamples_{}".format(len(cur_train_examples_x)))

            # 5. cur_cls train and fit
            if self.cur_cls_name == CLS_LR_LIBLINEAER:
                assert isinstance(self.cur_cls, SLLRLiblinear), "Error cur_cls is not {}".format(SLLRLiblinear.__name__)
                self.cur_cls.offline_fit(cur_train_examples_x, cur_train_examples_y, fit_params={"if_multilabel": self.is_multilabel})
            elif self.cur_cls_name == CLS_LR_SAG:
                assert isinstance(self.cur_cls, SLLRSag), "Error cur_cls is not {}".format(SLLRSag.__name__)
                self.cur_cls.offline_fit(cur_train_examples_x, cur_train_examples_y, fit_params={"if_multilabel": self.is_multilabel})

            as_timer("t_s5_fit_{}".format(len(cur_train_examples_x)))

        elif self.cur_cls_name in [CLS_TR34]:
            assert isinstance(self.cur_cls, ThinResnet34Classifier), "Error, cls select is {}".format(
                type(self.cur_cls)
            )
            # 3. sample train_idxs.
            train_use_y_labels = np.stack(self.feats_data_db.raw_data_db.raw_train_y_np_table_filled)
            info(
                "train_use_y_labels, type={}, shape={}".format(
                    type(train_use_y_labels), train_use_y_labels.shape
                )
            )

            self.tr34_sampler = AutoSpSamplerNew(y_train_labels=train_use_y_labels)

            if self.is_multilabel is False:
                # old: use sampling (class-balanced downsample).
                self.tr34_sampler.set_up()
                cur_train_sample_idxs = self.tr34_sampler.get_downsample_index_list_by_class(
                    per_class_num=Tr34SamplerHpParams.SAMPL_PA_F_PERC_NUM,
                    max_sample_num=self.tr34_hps_sample_dict.get("SAMP_MAX_NUM"),
                    min_sample_num=self.tr34_hps_sample_dict.get("SAMP_MIN_NUM"),
                )
            else:
                # try: use all or random.
                cur_train_sample_idxs = self.tr34_sampler.get_downsample_index_list_by_random(
                    max_sample_num=self.tr34_hps_sample_dict.get("SAMP_MAX_NUM"),
                    min_sample_num=self.tr34_hps_sample_dict.get("SAMP_MIN_NUM"))

            info("cur_train_sample_idxs len={}, some={}".format(len(cur_train_sample_idxs), cur_train_sample_idxs[:3]))
            # filter val idxs out.
            # NOTE(review): same O(n^2) membership pattern as the LR branch.
            cur_train_sample_idxs = [i for i in cur_train_sample_idxs if i not in self.val_sample_idxs]
            info("cur_train_sample_idxs len={}, some={}".format(len(cur_train_sample_idxs), cur_train_sample_idxs[:3]))

            # 4. get asso-feats train.
            self.cur_feat_name = CLS_2_FEATNAME_REG_TABLE.get(self.cur_cls_name)
            if_train_feats_force = self.cur_cls.decide_if_renew_trainfeats()
            self.use_feat_params = self.cur_cls.imp_feat_args
            info("train_feats_params={}".format(self.use_feat_params))
            cur_train_examples_x = self.feats_data_db.get_raw_train_feats(
                self.cur_feat_name, cur_train_sample_idxs, self.use_feat_params, if_train_feats_force
            )
            cur_train_examples_y = self.feats_data_db.get_raw_train_y(cur_train_sample_idxs)
            info("cur_train_examples_x, shape={}".format(cur_train_examples_x.shape))
            info("cur_train_examples_y, shape={}".format(cur_train_examples_y.shape))
            train_eda_report = sample_y_edaer(cur_train_examples_y)
            info("Note, train_eda_y_report = {}".format(train_eda_report))

            as_timer("t_s4_texamples_{}".format(len(cur_train_examples_x)))

            # 5. cur_cls train and fit (online/incremental for TR34).
            self.tr34_cls_train_pip_run += 1
            self.cur_train_his_report = self.cur_cls.online_fit(cur_train_examples_x, cur_train_examples_y, fit_params=self.tr34_hps_epochs_dict)
            info("tr34_cls_train_pip_run={}".format(self.tr34_cls_train_pip_run))
            as_timer("t_s5_fit_{}".format(len(cur_train_examples_x)))

        # 6. option: val — evaluate this round's classifier on the held-out
        # split, if one exists.
        if len(self.val_sample_idxs) > 0:
            info("Note, cur_cls_name={}, get_val_feats done".format(self.cur_cls_name))
            if self.cur_cls_name == CLS_TR34:
                assert isinstance(self.cur_cls, ThinResnet34Classifier)
                if_force_val_feats = self.cur_cls.decide_if_renew_valfeats()
                use_feat_params = self.cur_cls.imp_feat_args
                cur_val_examples_x = self.feats_data_db.get_split_val_feats(
                    self.cur_feat_name, self.val_sample_idxs, use_feat_params, if_force_val_feats
                )
                cur_val_examples_x = np.array(cur_val_examples_x)
                # Add a trailing channel axis for the CNN input.
                cur_val_examples_x = cur_val_examples_x[:, :, :, np.newaxis]
                cur_val_preds = self.cur_cls.predict_val_proba(cur_val_examples_x)
            else:
                cur_val_examples_x = self.feats_data_db.get_split_val_feats(self.cur_feat_name, self.val_sample_idxs, self.use_feat_params)
                cur_val_preds = self.cur_cls.predict_proba(cur_val_examples_x, predict_prob_params={"if_multilabel": self.is_multilabel})

            info("Note, cur_cls_name={}, get_val_preds done".format(self.cur_cls_name))
            self.cur_val_nauc = ATEvaluator.autodl_auc(solution=self.cur_val_examples_y, prediction=cur_val_preds)
            info("Note, cur_cls_name={}, \033[1;31;m cur_val_nauc={}\033[0m".format(self.cur_cls_name, self.cur_val_nauc))
            as_timer("t_s6_val_{}".format(len(self.val_sample_idxs)))
        else:
            # No val split yet: sentinel nAUC.
            self.cur_val_nauc = -1

        self.train_pip_id += 1
        as_timer("train_end")
        info(as_timer)
        self.cur_train_his_report["val_nauc"] = self.cur_val_nauc
        self.cur_train_his_report["cls_name"] = self.cur_cls_name
        return self.cur_train_his_report.copy()