Пример #1
0
    def random_split(x,
                     y,
                     w,
                     e,
                     n_valid=None,
                     n_cv=None,
                     n_era=None,
                     cv_seed=None):
        """Yield train/validation folds from a stratified shuffle split.

        The validation fraction handed to the splitter is ``n_valid / n_era``
        and ``y`` is passed as the splitter's target argument.

        Args:
            x, y, w, e: Index-aligned containers that support indexing with an
                array of positions (assumed to be NumPy arrays -- TODO confirm
                against callers).
            n_valid: Numerator of the validation fraction.
            n_cv: Number of folds to generate.
            n_era: Denominator of the validation fraction.
            cv_seed: Forwarded to the splitter as ``random_state``.

        Yields:
            A 9-tuple ``(x_train, y_train, w_train, e_train, x_valid, y_valid,
            w_valid, e_valid, valid_era)``. ``valid_era`` is always the same
            empty list because this strategy does not select eras.
        """
        splitter = StratifiedShuffleSplit(n_splits=n_cv,
                                          test_size=n_valid / n_era,
                                          random_state=cv_seed)

        # Kept only so the yielded tuple has the same shape as the era-based
        # generators; no era is singled out here.
        valid_era = []

        for fold_no, (train_index, valid_index) in enumerate(
                splitter.split(x, y), start=1):

            train_part = tuple(arr[train_index] for arr in (x, y, w, e))
            valid_part = tuple(arr[valid_index] for arr in (x, y, w, e))

            utils.print_cv_info(fold_no, n_cv)

            yield train_part + valid_part + (valid_era,)
Пример #2
0
    def sk_k_fold(x, y, w, n_splits=None, n_cv=None, cv_seed=None):
        """Yield shuffled train/validation folds from a repeated K-fold split.

        ``n_cv`` folds are produced in total: ``n_splits`` folds per repeat,
        repeated ``n_cv / n_splits`` times.

        Args:
            x, y, w: Index-aligned containers that support indexing with an
                array of positions (assumed NumPy arrays -- TODO confirm).
            n_splits: Number of folds per repeat.
            n_cv: Total number of folds; must be a multiple of ``n_splits``.
            cv_seed: When not None, seeds NumPy's global RNG and is also
                forwarded to the splitter as ``random_state``.

        Yields:
            ``(x_train, y_train, w_train, x_valid, y_valid, w_valid)``.

        Raises:
            ValueError: If ``n_cv`` is not an integer multiple of ``n_splits``.
        """
        if cv_seed is not None:
            np.random.seed(cv_seed)

        if n_cv % n_splits:
            raise ValueError('n_cv must be an integer multiple of n_splits!')

        splitter = RepeatedKFold(n_splits=n_splits,
                                 n_repeats=int(n_cv / n_splits),
                                 random_state=cv_seed)

        for fold_no, (train_index, valid_index) in enumerate(
                splitter.split(x, y), start=1):

            # Randomise the row order inside each side of the fold (in place).
            np.random.shuffle(train_index)
            np.random.shuffle(valid_index)

            train_part = tuple(arr[train_index] for arr in (x, y, w))
            valid_part = tuple(arr[valid_index] for arr in (x, y, w))

            utils.print_cv_info(fold_no, n_cv)

            yield train_part + valid_part
Пример #3
0
    def era_k_fold_split_all_random(e, n_valid=None, n_cv=None, n_era=None, cv_seed=None, era_list=None):
        """Yield index splits whose validation eras are drawn at random.

        For each of the ``n_cv`` folds, ``n_valid`` distinct eras are sampled
        from ``era_list``; a draw is repeated until its era set differs from
        every set already used by this generator.

        Args:
            e: Iterable holding the era label of every sample.
            n_valid: Number of eras placed in each validation fold.
            n_cv: Number of folds to generate.
            n_era: Used only to build the default era pool ``range(0, n_era)``.
            cv_seed: When not None, seeds NumPy's global RNG.
            era_list: Pool of eras to sample from; defaults to
                ``range(0, n_era)``.

        Yields:
            ``(train_index, valid_index)``: two shuffled lists of positions
            into ``e``.
        """
        if cv_seed is not None:
            np.random.seed(cv_seed)

        used_splits = []

        for fold_no in range(1, n_cv + 1):

            if era_list is None:
                era_list = range(0, n_era)

            valid_era = np.random.choice(era_list, n_valid, replace=False)
            # Re-draw until this set of validation eras has not been used yet.
            while set(valid_era) in used_splits:
                print('This CV split has been chosen, choosing another one...')
                valid_era = np.random.choice(era_list, n_valid, replace=False)

            train_index, valid_index = [], []
            for pos, era in enumerate(e):
                bucket = valid_index if era in valid_era else train_index
                bucket.append(pos)

            np.random.shuffle(train_index)
            np.random.shuffle(valid_index)

            used_splits.append(set(valid_era))

            utils.print_cv_info(fold_no, n_cv)

            yield train_index, valid_index
Пример #4
0
 def random_split(x, y, n_splits=None, n_cv=None, cv_seed=None):
     """Yield stratified K-fold splits of ``x`` and ``y``.

     The stratification labels are not taken from ``y``: they are column 2 of
     the data loaded from ``./data/preprocessed_data/x_g_train.p`` (presumably
     a station identifier, judging by the disabled check -- verify). The rows
     of that file are assumed to align with ``x`` and ``y``.

     Args:
         x, y: Index-aligned containers that support indexing with an array
             of positions.
         n_splits: Not used by the active code.
         n_cv: Number of folds.
         cv_seed: Forwarded to the splitter as ``random_state``.

     Yields:
         ``(x_train, y_train, x_valid, y_valid)``.
     """
     strat_labels = np.array(
         utils.load_pkl_to_data('./data/preprocessed_data/x_g_train.p'))[:, 2]

     # NOTE: a disabled check used to sit here; it would have rejected an
     # n_splits larger than the size of the least populated label group.

     splitter = StratifiedKFold(n_splits=n_cv,
                                shuffle=True,
                                random_state=cv_seed)

     for fold_no, (train_index, valid_index) in enumerate(
             splitter.split(strat_labels, strat_labels), start=1):
         x_train, y_train = x[train_index], y[train_index]
         x_valid, y_valid = x[valid_index], y[valid_index]
         utils.print_cv_info(fold_no, n_cv)
         yield x_train, y_train, x_valid, y_valid
Пример #5
0
    def forward_increase(x, y, w, e, n_valid=None, n_cv=None, n_era=None, cv_seed=None, valid_rate=None):
        """Yield forward-chaining folds with a growing training range.

        Fold ``k`` (1-based) trains on eras ``[0, k * step)`` and validates on
        the eras immediately after. The last fold always validates up to
        ``n_era``. ``step`` is ``(n_era - n_valid_last) // n_cv``.

        Args:
            x, y, w, e: Index-aligned arrays; ``e`` holds the era of each
                sample and is assumed to use integer eras ``0 .. n_era - 1``.
            n_valid: Number of validation eras per fold; recomputed per fold
                when ``valid_rate`` is given.
            n_cv: Number of folds.
            n_era: Total number of eras.
            cv_seed: When not None, seeds NumPy's global RNG.
            valid_rate: Optional validation share. When given, each fold's
                validation size is ``ceil(valid_start * valid_rate /
                (1 - valid_rate))``.

        Yields:
            ``(x_train, y_train, w_train, e_train, x_valid, y_valid, w_valid,
            e_valid, valid_era)`` where ``valid_era`` is a list of eras.
        """
        if cv_seed is not None:
            np.random.seed(cv_seed)

        use_rate = valid_rate is not None

        # Size of the final validation block, used to derive the step length.
        n_valid_last = ceil(n_era*valid_rate) if use_rate else n_valid
        step = (n_era-n_valid_last)//n_cv

        for fold_no in range(1, n_cv + 1):

            valid_start = fold_no * step
            if use_rate:
                n_valid = ceil((valid_start*valid_rate)/(1-valid_rate))

            # The last fold absorbs every era that is left.
            valid_stop = n_era if fold_no == n_cv else valid_start + n_valid

            print('======================================================')
            print('Train Era: {}-{}'.format(0, valid_start-1))
            print('Valid Era: {}-{}'.format(valid_start, valid_stop-1))

            train_era = range(0, valid_start)
            valid_era = list(range(valid_start, valid_stop))

            train_index, valid_index = [], []
            for pos, era in enumerate(e):
                if era in train_era:
                    train_index.append(pos)
                elif era in valid_era:
                    valid_index.append(pos)
                # Samples from later eras belong to neither side.

            np.random.shuffle(train_index)
            np.random.shuffle(valid_index)

            train_part = tuple(arr[train_index] for arr in (x, y, w, e))
            valid_part = tuple(arr[valid_index] for arr in (x, y, w, e))

            utils.print_cv_info(fold_no, n_cv)

            yield train_part + valid_part + (valid_era,)
Пример #6
0
    def era_k_fold_all_random(x,
                              y,
                              w,
                              e,
                              n_valid=None,
                              n_cv=None,
                              n_era=None,
                              cv_seed=None,
                              era_list=None):
        """Yield folds whose validation eras are sampled at random.

        Each fold samples ``n_valid`` distinct eras from ``era_list`` and
        re-draws until the era set differs from all sets already produced by
        this generator.

        Args:
            x, y, w, e: Index-aligned arrays; ``e`` holds the era of each
                sample.
            n_valid: Number of validation eras per fold.
            n_cv: Number of folds.
            n_era: Used only for the default era pool ``range(0, n_era)``.
            cv_seed: When not None, seeds NumPy's global RNG.
            era_list: Pool of eras to sample from.

        Yields:
            ``(x_train, y_train, w_train, e_train, x_valid, y_valid, w_valid,
            e_valid, valid_era)`` where ``valid_era`` is the array returned
            by ``np.random.choice``.
        """
        if cv_seed is not None:
            np.random.seed(cv_seed)

        used_splits = []

        for fold_no in range(1, n_cv + 1):

            if era_list is None:
                era_list = range(0, n_era)

            valid_era = np.random.choice(era_list, n_valid, replace=False)
            # Re-draw until this set of validation eras has not been used yet.
            while set(valid_era) in used_splits:
                print('This CV split has been chosen, choosing another one...')
                valid_era = np.random.choice(era_list, n_valid, replace=False)

            train_index, valid_index = [], []
            for pos, era in enumerate(e):
                bucket = valid_index if era in valid_era else train_index
                bucket.append(pos)

            np.random.shuffle(train_index)
            np.random.shuffle(valid_index)

            train_part = tuple(arr[train_index] for arr in (x, y, w, e))
            valid_part = tuple(arr[valid_index] for arr in (x, y, w, e))

            used_splits.append(set(valid_era))

            utils.print_cv_info(fold_no, n_cv)

            yield train_part + valid_part + (valid_era,)
Пример #7
0
    def era_k_fold_balance(x,
                           y,
                           w,
                           e,
                           n_valid=None,
                           n_cv=None,
                           n_era=None,
                           cv_seed=None,
                           era_list=None):
        """Yield folds whose random validation eras pass ``utils.check_bad_cv``.

        Each fold samples ``n_valid`` distinct eras from ``era_list`` and
        re-draws for as long as ``utils.check_bad_cv(previous_sets, draw)``
        is truthy. NOTE(review): the acceptance rule lives in that helper and
        is not visible here -- presumably it balances era usage across folds.

        Args:
            x, y, w, e: Index-aligned arrays; ``e`` holds the era of each
                sample.
            n_valid: Number of validation eras per fold.
            n_cv: Number of folds.
            n_era: Used only for the default era pool ``range(0, n_era)``.
            cv_seed: When not None, seeds NumPy's global RNG.
            era_list: Pool of eras to sample from.

        Yields:
            ``(x_train, y_train, w_train, e_train, x_valid, y_valid, w_valid,
            e_valid, valid_era)``.
        """
        if cv_seed is not None:
            np.random.seed(cv_seed)

        used_splits = []

        for fold_no in range(1, n_cv + 1):

            if era_list is None:
                era_list = range(0, n_era)

            valid_era = np.random.choice(era_list, n_valid, replace=False)
            while utils.check_bad_cv(used_splits, valid_era):
                valid_era = np.random.choice(era_list, n_valid, replace=False)

            train_index, valid_index = [], []
            for pos, era in enumerate(e):
                bucket = valid_index if era in valid_era else train_index
                bucket.append(pos)

            np.random.shuffle(train_index)
            np.random.shuffle(valid_index)

            train_part = tuple(arr[train_index] for arr in (x, y, w, e))
            valid_part = tuple(arr[valid_index] for arr in (x, y, w, e))

            used_splits.append(set(valid_era))

            utils.print_cv_info(fold_no, n_cv)

            yield train_part + valid_part + (valid_era,)
Пример #8
0
    def sk_group_k_fold(x, y, e, n_cv=None):
        """Yield group K-fold splits of ``x`` and ``y`` grouped by ``e``.

        Args:
            x, y: Index-aligned containers that support indexing with an
                array of positions.
            e: Group label of every sample, passed to the splitter as the
                groups argument.
            n_cv: Number of folds.

        Yields:
            ``(x_train, y_train, x_valid, y_valid)``.
        """
        splitter = GroupKFold(n_splits=n_cv)

        for fold_no, (train_index, valid_index) in enumerate(
                splitter.split(x, y, e), start=1):

            x_train, y_train = x[train_index], y[train_index]
            x_valid, y_valid = x[valid_index], y[valid_index]

            utils.print_cv_info(fold_no, n_cv)

            yield x_train, y_train, x_valid, y_valid
Пример #9
0
    def forward_window(x,
                       y,
                       w,
                       e,
                       n_valid=None,
                       n_cv=None,
                       n_era=None,
                       window_size=None,
                       cv_seed=None,
                       valid_rate=None):
        """Yield folds from a fixed-size window sliding forward over the eras.

        Every window spans ``window_size`` eras; its trailing ``n_valid`` eras
        are the validation part and the rest is the training part. The window
        start advances by ``(n_era - window_size) // n_cv`` per fold, and the
        last window is pinned so that it ends exactly at ``n_era``.

        Args:
            x, y, w, e: Index-aligned arrays; ``e`` holds the era of each
                sample and is assumed to use integer eras ``0 .. n_era - 1``.
            n_valid: Validation eras per window; overridden when
                ``valid_rate`` is given.
            n_cv: Number of folds.
            n_era: Total number of eras.
            window_size: Number of eras covered by each window.
            cv_seed: When not None, seeds NumPy's global RNG.
            valid_rate: Optional validation share; when given,
                ``n_valid = ceil(window_size * valid_rate)``.

        Yields:
            ``(x_train, y_train, w_train, e_train, x_valid, y_valid, w_valid,
            e_valid, valid_era)`` where ``valid_era`` is a list of eras.
        """
        if cv_seed is not None:
            np.random.seed(cv_seed)

        n_step = (n_era - window_size) // n_cv
        if valid_rate is not None:
            n_valid = ceil(window_size * valid_rate)

        train_start = 0
        last_fold = n_cv - 1

        for fold in range(n_cv):

            if fold == last_fold:
                # Pin the final window to the end of the era range.
                valid_stop = n_era
                train_start = valid_stop - window_size
            else:
                valid_stop = train_start + window_size
            train_end = valid_stop - n_valid

            print('======================================================')
            print('Train Era: {}-{}'.format(train_start, train_end - 1))
            print('Valid Era: {}-{}'.format(train_end, valid_stop - 1))

            train_era = list(range(train_start, train_end))
            valid_era = list(range(train_end, valid_stop))

            train_index, valid_index = [], []
            for pos, era in enumerate(e):
                if era in train_era:
                    train_index.append(pos)
                elif era in valid_era:
                    valid_index.append(pos)
                # Samples outside the window belong to neither side.

            np.random.shuffle(train_index)
            np.random.shuffle(valid_index)

            train_part = tuple(arr[train_index] for arr in (x, y, w, e))
            valid_part = tuple(arr[valid_index] for arr in (x, y, w, e))

            train_start += n_step
            utils.print_cv_info(fold + 1, n_cv)

            yield train_part + valid_part + (valid_era,)
Пример #10
0
    def era_k_fold_for_stack(self,
                             x,
                             y,
                             w,
                             e,
                             x_g,
                             n_valid=None,
                             n_cv=None,
                             n_era=None,
                             cv_seed=None,
                             return_train_index=False):
        """Yield era-based K-fold splits, remembering used splits on ``self``.

        One pass ("epoch") partitions eras ``0 .. n_era - 1`` into
        ``ceil(n_era / n_valid)`` validation groups of ``n_valid`` eras each.
        When ``n_era`` is not a multiple of ``n_valid``, the final group takes
        all leftover eras and is topped up with eras already validated in the
        same pass. A draw is repeated while its era set equals one recorded in
        ``self.trained_cv``; every yielded set is appended to that list.

        Args:
            x, y, w, e, x_g: Index-aligned arrays supporting list indexing
                (assumed NumPy arrays -- TODO confirm); ``e`` holds the era
                of each sample.
            n_valid: Number of eras per validation group.
            n_cv: Total number of folds; must be a multiple of the number of
                groups per pass.
            n_era: Total number of eras.
            cv_seed: When not None, seeds NumPy's global RNG.
            return_train_index: Selects the shape of the yielded tuple.

        Yields:
            If ``return_train_index`` is true:
            ``(x_train, y_train, w_train, e_train, x_g_train, x_valid,
            y_valid, w_valid, e_valid, x_g_valid, train_index, valid_index,
            valid_era)``; otherwise
            ``(x_train, y_train, w_train, x_g_train, x_valid, y_valid,
            w_valid, x_g_valid, valid_index, valid_era)``.

        Raises:
            ValueError: If ``n_cv`` is not a multiple of the number of groups
                per pass.
        """
        if cv_seed is not None:
            np.random.seed(cv_seed)

        n_traverse = n_era // n_valid
        n_rest = n_era % n_valid

        # A partial group at the end still needs its own fold.
        if n_rest != 0:
            n_traverse += 1

        if n_cv % n_traverse != 0:
            raise ValueError(
                'n_cv ({}) must be an integer multiple of the number of '
                'validation groups per pass ({})'.format(n_cv, n_traverse))

        def _already_used(candidate):
            # True when this era set was yielded before (tracked on self).
            chosen = set(candidate)
            return any(chosen == past for past in self.trained_cv)

        def _draw(pool, size):
            # Sample `size` distinct eras from `pool`.
            return np.random.choice(pool, size, replace=False)

        n_epoch = n_cv // n_traverse
        cv_count = 0

        for _ in range(n_epoch):

            # Eras not yet used for validation in this pass.
            remaining = list(range(0, n_era))

            for i in range(n_traverse):

                is_last = i == n_traverse - 1

                if n_rest != 0 and is_last:
                    # Short final group: keep every leftover era and top up
                    # with eras already validated earlier in this pass.
                    era_idx_else = [
                        t for t in range(0, n_era) if t not in remaining
                    ]
                    valid_era = remaining + list(
                        _draw(era_idx_else, n_valid - n_rest))
                    while _already_used(valid_era):
                        print(
                            'This CV split has been chosen, choosing another one...'
                        )
                        valid_era = remaining + list(
                            _draw(era_idx_else, n_valid - n_rest))
                else:
                    valid_era = _draw(remaining, n_valid)
                    while _already_used(valid_era):
                        print(
                            'This CV split has been chosen, choosing another one...'
                        )
                        if n_rest == 0 and set(valid_era) == set(remaining):
                            # The pool holds only this one combination, so
                            # fall back to sampling from all eras.
                            valid_era = _draw(range(0, n_era), n_valid)
                        else:
                            valid_era = _draw(remaining, n_valid)

                # Shrink the pool for the next group of this pass.
                if not is_last:
                    remaining = [
                        rest for rest in remaining if rest not in valid_era
                    ]

                train_index = []
                valid_index = []

                # Route every sample by the era it belongs to.
                for ii, ele in enumerate(e):
                    if ele in valid_era:
                        valid_index.append(ii)
                    else:
                        train_index.append(ii)

                np.random.shuffle(train_index)
                np.random.shuffle(valid_index)

                # Training data
                x_train = x[train_index]
                y_train = y[train_index]
                w_train = w[train_index]
                e_train = e[train_index]
                x_g_train = x_g[train_index]

                # Validation data
                x_valid = x[valid_index]
                y_valid = y[valid_index]
                w_valid = w[valid_index]
                e_valid = e[valid_index]
                x_g_valid = x_g[valid_index]

                self.trained_cv.append(set(valid_era))

                cv_count += 1
                utils.print_cv_info(cv_count, n_cv)

                if return_train_index:
                    yield x_train, y_train, w_train, e_train, x_g_train, x_valid, \
                          y_valid, w_valid, e_valid, x_g_valid, train_index, valid_index, valid_era
                else:
                    yield x_train, y_train, w_train, x_g_train, x_valid, \
                          y_valid, w_valid, x_g_valid, valid_index, valid_era
Пример #11
0
    def era_k_fold_split(e,
                         n_valid=None,
                         n_cv=None,
                         n_era=None,
                         cv_seed=None,
                         era_list=None):
        """Yield era-based K-fold index splits.

        One pass ("epoch") partitions the eras of ``era_list`` into
        ``ceil(n_era / n_valid)`` validation groups of ``n_valid`` eras each.
        When ``n_era`` is not a multiple of ``n_valid``, the final group takes
        all leftover eras and is topped up with eras of ``era_list`` already
        validated in the same pass. A draw is repeated while its era set
        equals one already yielded by this generator.

        Args:
            e: Iterable holding the era label of every sample.
            n_valid: Number of eras per validation group.
            n_cv: Total number of folds; must be a multiple of the number of
                groups per pass.
            n_era: Total number of eras (expected to match the length of
                ``era_list`` when that is given -- not checked here).
            cv_seed: When not None, seeds NumPy's global RNG.
            era_list: Pool of eras; defaults to ``range(0, n_era)``.

        Yields:
            ``(train_index, valid_index)``: two shuffled lists of positions
            into ``e``.

        Raises:
            ValueError: If ``n_cv`` is not a multiple of the number of groups
                per pass.
        """
        if cv_seed is not None:
            np.random.seed(cv_seed)

        n_traverse = n_era // n_valid
        n_rest = n_era % n_valid

        # A partial group at the end still needs its own fold.
        if n_rest != 0:
            n_traverse += 1

        if n_cv % n_traverse != 0:
            raise ValueError(
                'n_cv ({}) must be an integer multiple of the number of '
                'validation groups per pass ({})'.format(n_cv, n_traverse))

        if era_list is None:
            era_list = range(0, n_era)

        n_epoch = n_cv // n_traverse
        trained_cv = []
        cv_count = 0

        def _already_used(candidate):
            # True when this era set was yielded before by this generator.
            chosen = set(candidate)
            return any(chosen == past for past in trained_cv)

        def _draw(pool, size):
            # Sample `size` distinct eras from `pool`.
            return np.random.choice(pool, size, replace=False)

        for _ in range(n_epoch):

            # Eras not yet used for validation in this pass. Copied to a list
            # so it can be concatenated with the top-up draw below.
            remaining = list(era_list)

            for i in range(n_traverse):

                is_last = i == n_traverse - 1

                if n_rest != 0 and is_last:
                    # Short final group: keep every leftover era and top up
                    # from the eras of era_list already validated in this
                    # pass (not from range(0, n_era), which may contain eras
                    # outside a caller-supplied era_list).
                    era_idx_else = [
                        t for t in era_list if t not in remaining
                    ]
                    valid_era = remaining + list(
                        _draw(era_idx_else, n_valid - n_rest))
                    while _already_used(valid_era):
                        print(
                            'This CV split has been chosen, choosing another one...'
                        )
                        valid_era = remaining + list(
                            _draw(era_idx_else, n_valid - n_rest))
                else:
                    valid_era = _draw(remaining, n_valid)
                    while _already_used(valid_era):
                        print(
                            'This CV split has been chosen, choosing another one...'
                        )
                        if n_rest == 0 and set(valid_era) == set(remaining):
                            # The pool holds only this one combination, so
                            # fall back to sampling from the full era pool.
                            valid_era = _draw(era_list, n_valid)
                        else:
                            valid_era = _draw(remaining, n_valid)

                # Shrink the pool for the next group of this pass.
                if not is_last:
                    remaining = [
                        rest for rest in remaining if rest not in valid_era
                    ]

                train_index = []
                valid_index = []

                # Route every sample by the era it belongs to.
                for ii, ele in enumerate(e):
                    if ele in valid_era:
                        valid_index.append(ii)
                    else:
                        train_index.append(ii)

                np.random.shuffle(train_index)
                np.random.shuffle(valid_index)

                trained_cv.append(set(valid_era))

                cv_count += 1
                utils.print_cv_info(cv_count, n_cv)

                yield train_index, valid_index