Example #1
        img = util.draw_pose(dataset, img, lbls[idx], 2, (0, 255, 0))
        img = cv2.resize(img, (200, 200))
        cv2.imwrite(
            root_dir + 'samples/depth/cropped/' + phase + '_' + str(idx) +
            '.png', img)
########################################################################################################################

############################ Draw predicted pose on depth samples and create sample videos of predictions #################################
### Create sample Depth videos from the predicted poses
### This segment is only for validation (a sketch of the video-assembly step follows this snippet).
###########################################################################################################################################
phase = 'test'
lbls = util.load_labels('fpad', phase)  ### load test/train data
names = util.load_names('fpad', phase)
centers = util.load_centers('fpad', phase).astype(float)
lbls, preds = util.load_logs('fpad', 'fpad_test_b159_lr_1e-2_xyz_.txt',
                             centers)  #### name of test logfile

for idx, name in enumerate(names):
    action = name.split('/')[2]
    subject = name.split('/')[1]
    if action == 'close_juice_bottle' and subject == 'Subject_2':  # use this condition to select a certain action by name
        # if idx in (11000, 13000, 15000, 16000, 18000, 21000, 22000, 23000, 31000):   # or this condition to select specific frames or sequences by ID
        pred, skel_camcoords = util.world2pixel(preds[idx], 'fpad')
        label, skel_camcoords = util.world2pixel(lbls[idx], 'fpad')
        img = util.load_image('fpad', os.path.join(root_dir, name))
        # re-read the raw depth map (flag 2 == cv2.IMREAD_ANYDEPTH keeps the original bit depth)
        img = cv2.imread(os.path.join(root_dir, name), 2)
        img = img.astype(float)
        max_depth = np.max(img)  # maximum raw depth value (the fixed 1160 scale below is used instead)
        # scale depth to 0-255 assuming a fixed 1160 mm depth range
        img /= 1160
        img *= 255
        im = np.zeros((480, 640, 3))
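The banner above also promises sample videos of the predictions, but the snippet is cut off before that step. Below is a minimal, hypothetical sketch of how the written frames could be stitched into a clip with OpenCV's VideoWriter; the output path, codec and frame rate are assumptions, not values from the source.

# Hypothetical video-assembly step (not part of the original snippet).
import glob
import cv2

frame_paths = sorted(
    glob.glob(root_dir + 'samples/depth/cropped/' + phase + '_*.png'),
    key=lambda p: int(p.rsplit('_', 1)[-1].split('.')[0]))  # numeric frame order
if frame_paths:
    h, w = cv2.imread(frame_paths[0]).shape[:2]
    writer = cv2.VideoWriter(root_dir + 'samples/depth/cropped/' + phase + '.avi',
                             cv2.VideoWriter_fourcc(*'XVID'), 15, (w, h))
    for p in frame_paths:
        writer.write(cv2.imread(p))
    writer.release()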
Example #2
def load_train(earlist_base_date=None, depth=1, cache_only=False):
    """
    Load dataset for training and validating.

    *NOTE*  If you need a validating set, you SHOULD split from training set
    by yourself.

    Parameters
    ----------
    earlist_base_date: datetime, None by default
    Base date won't be smaller than earlist_base_date.

    depth: int, 1 by default
    Maximum moves of time window.

    cache_only: bool, False by default
    Cache data of every period, do not return full spanned data.

    Returns
    -------
    X: numpy ndarray, shape: (num_of_enrollments, num_of_features)
    Rows of features. It is the features of all time if cache_only is True.

    y: numpy ndarray, shape: (num_of_enrollments,)
    Vector of labels. It is the labels of all time if cache_only is True.
    """
    logger = logging.getLogger('load_train')

    enroll_ids = np.sort(util.load_enrollment_train()['enrollment_id'])
    log = util.load_logs()[['enrollment_id', 'time']]
    # base_date = log['time'].max().to_datetime()
    base_date = datetime(2014, 8, 1, 22, 0, 47)

    logger.debug('load features before %s', base_date)

    pkl_X_path = util.cache_path('train_X_before_%s' %
                                 base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    pkl_y_path = util.cache_path('train_y_before_%s' %
                                 base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_X_path) and os.path.exists(pkl_y_path):
        logger.debug('fetch cached')
        X = util.fetch(pkl_X_path)
        y = util.fetch(pkl_y_path)
    else:
        X, _ = __load_dataset__(enroll_ids, log, base_date)
        y_with_id = util.load_val_y()
        if not np.all(y_with_id[:, 0] == enroll_ids):
            logger.fatal('something wrong with enroll_ids')
            raise RuntimeError('something wrong with enroll_ids')
        y = y_with_id[:, 1]

        util.dump(X, pkl_X_path)
        util.dump(y, pkl_y_path)

    # base_date = log['time'].max().to_datetime() - timedelta(days=10)
    base_date = datetime(2014, 7, 22, 22, 0, 47)
    Dw = timedelta(days=7)
    enroll_ids = __enroll_ids_with_log__(enroll_ids, log, base_date)
    for _ in range(depth - 1):
        if enroll_ids.size <= 0:
            break
        if earlist_base_date is not None and base_date < earlist_base_date:
            break

        logger.debug('load features before %s', base_date)

        # get instances and labels
        pkl_X_path = util.cache_path('train_X_before_%s' %
                                     base_date.strftime('%Y-%m-%d_%H-%M-%S'))
        pkl_y_path = util.cache_path('train_y_before_%s' %
                                     base_date.strftime('%Y-%m-%d_%H-%M-%S'))
        if os.path.exists(pkl_X_path) and os.path.exists(pkl_y_path):
            logger.debug('fetch cached')
            X_temp = util.fetch(pkl_X_path)
            y_temp = util.fetch(pkl_y_path)
        else:
            X_temp, y_temp = __load_dataset__(enroll_ids, log, base_date)

            util.dump(X_temp, pkl_X_path)
            util.dump(y_temp, pkl_y_path)

        # update instances and labels
        if not cache_only:
            X = np.r_[X, X_temp]
            y = np.append(y, y_temp)

        # update base_date and enroll_ids
        base_date -= Dw
        enroll_ids = __enroll_ids_with_log__(enroll_ids, log, base_date)

    return X, y
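A hedged usage sketch for load_train, assuming the module's helpers (util, __load_dataset__, __enroll_ids_with_log__) are importable and their data files are in place; the depth, cutoff date and split ratio below are illustrative only.

# Hypothetical usage of load_train(); parameter values are illustrative.
from datetime import datetime
from sklearn.model_selection import train_test_split

X, y = load_train(earlist_base_date=datetime(2014, 6, 1), depth=3)
print(X.shape, y.shape)  # (num_of_enrollments, num_of_features), (num_of_enrollments,)

# As the docstring notes, a validation set has to be split off manually,
# e.g. with scikit-learn's train_test_split.
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=0)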
Example #3
                          target=axes[2],
                          label='Unsmoothed',
                          c='r')
    axes[0].set_xlabel('')
    axes[1].set_xlabel('')

    if show:
        plt.show()
    return fig, axes


dates = [
    '20061214', '20010830', '20050831', '20100405', '20110805', '20150316'
]
for date in dates:
    hour_logs = util.load_logs(
        '/Users/sgraf/Desktop/SWMF_analysis/outputs/{}/hour/'.format(date))
    orig_logs = util.load_logs(
        '/Users/sgraf/Desktop/SWMF_analysis/outputs/{}/unsmoothed/'.format(
            date))
    thirty_logs = util.load_logs(
        '/Users/sgraf/Desktop/SWMF_analysis/outputs/{}/30min/'.format(date))
    thirty_geo = util.load_logs(
        '/Users/sgraf/Desktop/SWMF_analysis/outputs/{}/30min'.format(date),
        logtype='geo')
    hour_geo = util.load_logs(
        '/Users/sgraf/Desktop/SWMF_analysis/outputs/{}/hour/'.format(date),
        logtype='geo')
    orig_geo = util.load_logs(
        '/Users/sgraf/Desktop/SWMF_analysis/outputs/{}/unsmoothed/'.format(
            date),
        logtype='geo')
Example #4
def source_event_counter(enrollment_set, base_date):
    """
    Counts the source-event pairs.

    Features
    --------
    """
    X_pkl_path = util.cache_path('source_event_counter_before_%s' %
                                 base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(X_pkl_path):
        return util.fetch(X_pkl_path)

    logger = logging.getLogger('source_event_counter')
    logger.debug('preparing datasets')

    Enroll_all = util.load_enrollments()

    pkl_path = util.cache_path('Log_all_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        Log = util.fetch(pkl_path)
    else:
        Log = util.load_logs()
        Log = Log[Log['time'] <= base_date]
        Log['source_event'] = Log['source'] + '-' + Log['event']
        Log['day_diff'] = (base_date - Log['time']).dt.days
        Log['week_diff'] = Log['day_diff'] // 7
        Log['event_count'] = 1

        util.dump(Log, pkl_path)

    Log_counted = Log.groupby(['enrollment_id', 'source_event', 'week_diff'])\
        .agg({'event_count': np.sum}).reset_index()

    logger.debug('datasets prepared')

    Enroll = Enroll_all.set_index('enrollment_id').ix[enrollment_set]\
        .reset_index()

    n_proc = par.cpu_count()

    pkl_path = util.cache_path('event_count_by_eid_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        event_count_by_eid = util.fetch(pkl_path)
    else:
        params = []
        eids = []
        for eid, df in pd.merge(Enroll_all, Log_counted, on=['enrollment_id'])\
                .groupby(['enrollment_id']):
            params.append(df)
            eids.append(eid)
        pool = par.Pool(processes=min(n_proc, len(params)))
        event_count_by_eid = dict(
            zip(eids, pool.map(__get_counting_feature__, params)))
        pool.close()
        pool.join()

        util.dump(event_count_by_eid, pkl_path)

    X0 = np.array([event_count_by_eid[i] for i in Enroll['enrollment_id']])

    logger.debug('source-event pairs counted, has nan: %s, shape: %s',
                 np.any(np.isnan(X0)), repr(X0.shape))

    pkl_path = util.cache_path('D_full_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        D_full = util.fetch(pkl_path)
    else:
        D_full = pd.merge(Enroll_all, Log, on=['enrollment_id'])

        util.dump(D_full, pkl_path)

    pkl_path = util.cache_path('user_wn_courses_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        user_wn_courses = util.fetch(pkl_path)
    else:
        user_wn_courses = {}
        for u, df in D_full.groupby(['username']):
            x = []
            for wn in __week_span__:
                x.append(len(df[df['week_diff'] == wn]['course_id'].unique()))
            user_wn_courses[u] = x

        util.dump(user_wn_courses, pkl_path)

    X1 = np.array([user_wn_courses[u] for u in Enroll['username']])

    logger.debug('courses by user counted, has nan: %s, shape: %s',
                 np.any(np.isnan(X1)), repr(X1.shape))

    pkl_path = util.cache_path('course_population_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        course_population = util.fetch(pkl_path)
    else:
        course_population = {}
        for c, df in D_full.groupby(['course_id']):
            course_population[c] = len(df['username'].unique())

        util.dump(course_population, pkl_path)

    X2 = np.array([course_population.get(c, 0) for c in Enroll['course_id']])

    logger.debug('course population counted, has nan: %s, shape: %s',
                 np.any(np.isnan(X2)), repr(X2.shape))

    pkl_path = util.cache_path('course_dropout_count_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        course_dropout_count = util.fetch(pkl_path)
    else:
        course_dropout_count = course_population.copy()
        for c, df in D_full[D_full['day_diff'] < 10].groupby(['course_id']):
            course_dropout_count[c] -= len(df['username'].unique())

        util.dump(course_dropout_count, pkl_path)

    X3 = np.array(
        [course_dropout_count.get(c, 0) for c in Enroll['course_id']])

    logger.debug('course dropout counted, has nan: %s, shape: %s',
                 np.any(np.isnan(X3)), repr(X3.shape))

    pkl_path = util.cache_path('user_ops_count_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        user_ops_count = util.fetch(pkl_path)
    else:
        user_ops_on_all_courses = D_full.groupby(
            ['username', 'source_event', 'week_diff'])\
            .agg({'event_count': np.sum}).reset_index()
        params = []
        users = []
        for u, df in user_ops_on_all_courses.groupby(['username']):
            params.append(df)
            users.append(u)
        pool = par.Pool(processes=min(n_proc, len(params)))
        user_ops_count = dict(
            zip(users, pool.map(__get_counting_feature__, params)))
        pool.close()
        pool.join()

        util.dump(user_ops_count, pkl_path)

    X4 = X0 / [user_ops_count[u] for u in Enroll['username']]
    X4[np.isnan(X4)] = 0

    logger.debug('ratio of user ops on all courses, has nan: %s, shape: %s',
                 np.any(np.isnan(X4)), repr(X4.shape))

    pkl_path = util.cache_path('course_ops_count_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        course_ops_count = util.fetch(pkl_path)
    else:
        course_ops_of_all_users = D_full.groupby(
            ['course_id', 'source_event', 'week_diff'])\
            .agg({'event_count': np.sum}).reset_index()
        params = []
        courses = []
        for c, df in course_ops_of_all_users.groupby(['course_id']):
            params.append(df)
            courses.append(c)
        pool = par.Pool(processes=min(n_proc, len(params)))
        course_ops_count = dict(
            zip(courses, pool.map(__get_counting_feature__, params)))
        pool.close()
        pool.join()

        util.dump(course_ops_count, pkl_path)

    X5 = X0 / [course_ops_count[c] for c in Enroll['course_id']]
    X5[np.isnan(X5)] = 0

    logger.debug('ratio of courses ops of all users, has nan: %s, shape: %s',
                 np.any(np.isnan(X5)), repr(X5.shape))

    X6 = np.array([
        course_dropout_count.get(c, 0) / course_population.get(c, 1)
        for c in Enroll['course_id']
    ])

    logger.debug('dropout ratio of courses, has nan: %s, shape: %s',
                 np.any(np.isnan(X6)), repr(X6.shape))

    Obj = util.load_object()
    Obj = Obj[Obj['start'] <= base_date]
    course_time = {}
    for c, df in Obj.groupby(['course_id']):
        start_time = np.min(df['start'])
        update_time = np.max(df['start'])
        course_time[c] = [(base_date - start_time).days,
                          (base_date - update_time).days]

    avg_start_days = np.average([t[0] for _, t in course_time.items()])
    avg_update_days = np.average([t[1] for _, t in course_time.items()])
    default_case = [avg_start_days, avg_update_days]

    X7 = np.array(
        [course_time.get(c, default_case)[0] for c in Enroll['course_id']])

    logger.debug('days from course first update, has nan: %s, shape: %s',
                 np.any(np.isnan(X7)), repr(X7.shape))

    X8 = np.array(
        [course_time.get(c, default_case)[1] for c in Enroll['course_id']])

    logger.debug('days from course last update, has nan: %s, shape: %s',
                 np.any(np.isnan(X8)), repr(X8.shape))

    user_ops_time = pd.merge(Enroll, Log, how='left', on=['enrollment_id'])\
        .groupby(['enrollment_id']).agg({'day_diff': [np.min, np.max]})\
        .fillna(0)
    X9 = np.array(user_ops_time['day_diff']['amin'])

    logger.debug('days from user last op, has nan: %s, shape: %s',
                 np.any(np.isnan(X9)), repr(X9.shape))

    X10 = np.array(user_ops_time['day_diff']['amax'])

    logger.debug('days from user first op, has nan: %s, shape: %s',
                 np.any(np.isnan(X10)), repr(X10.shape))

    X11 = X7 - X10

    logger.debug(
        'days from course first update to user first op, has nan: %s'
        ', shape: %s', np.any(np.isnan(X11)), repr(X11.shape))

    X = np.c_[X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11]
    util.dump(X, X_pkl_path)

    return X
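A minimal usage sketch for source_event_counter, assuming the same util helpers as above; the enrollment set and cutoff date are illustrative only.

# Hypothetical call to source_event_counter(); values are illustrative.
from datetime import datetime

import numpy as np

enroll_ids = np.sort(util.load_enrollment_train()['enrollment_id'])
X_counts = source_event_counter(enroll_ids, datetime(2014, 8, 1, 22, 0, 47))
print(X_counts.shape)  # one row per enrollment; columns are X0..X11 stacked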
Example #5
def dropout_history(enrollment_set, base_date):
    """
    Per-enrollment dropout-history features of the user: dropout_count,
    total_dropout, avg_dropout, drop_courses, course_count and drop_ratio.
    """
    X_pkl_path = util.cache_path('dropout_history_before_%s' %
                                 base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(X_pkl_path):
        return util.fetch(X_pkl_path)

    logger = logging.getLogger('dropout_history')

    n_proc = par.cpu_count()

    pkl_path = util.cache_path('Dropout_count_before_%s' %
                               base_date.strftime('%Y-%m-%d_%H-%M-%S'))
    if os.path.exists(pkl_path):
        logger.debug('load from cache')

        Dropout_count = util.fetch(pkl_path)
    else:
        logger.debug('preparing datasets')

        Enroll_all = util.load_enrollments()

        Log = util.load_logs()
        Log = Log[Log['time'] <= base_date]
        Log_enroll_ids = pd.DataFrame(np.unique(Log['enrollment_id']),
                                      columns=['enrollment_id'])

        logger.debug('datasets prepared')

        params = []
        enroll_ids = []
        for i, df in Log.groupby(['enrollment_id']):
            params.append(df)
            enroll_ids.append(i)
        pool = par.Pool(processes=min(n_proc, len(params)))
        enroll_dropout_count = dict(
            zip(enroll_ids, pool.map(__get_dropout_feature__, params)))
        pool.close()
        pool.join()

        enroll_dropout_count = pd.Series(enroll_dropout_count,
                                         name='dropout_count')
        enroll_dropout_count.index.name = 'enrollment_id'
        enroll_dropout_count = enroll_dropout_count.reset_index()

        Enroll_counted = pd.merge(Enroll_all,
                                  enroll_dropout_count,
                                  how='left',
                                  on=['enrollment_id'])
        Dropout_count = pd.merge(Log_enroll_ids,
                                 Enroll_counted,
                                 how='left',
                                 on=['enrollment_id'])

        util.dump(Dropout_count, pkl_path)

    Dgb = Dropout_count.groupby('username')
    total_dropout = Dgb.agg({
        'dropout_count': np.sum
    }).reset_index().rename(columns={'dropout_count': 'total_dropout'})
    avg_dropout = Dgb.agg({
        'dropout_count': np.average
    }).reset_index().rename(columns={'dropout_count': 'avg_dropout'})
    drop_courses = Dgb.agg(
        {'dropout_count': lambda x: len([i for i in x if i > 0])})\
        .reset_index().rename(columns={'dropout_count': 'drop_courses'})
    course_count = Dgb.agg({
        'dropout_count': len
    }).reset_index().rename(columns={'dropout_count': 'course_count'})

    Dropout_count = pd.merge(Dropout_count,
                             total_dropout,
                             how='left',
                             on=['username'])
    Dropout_count = pd.merge(Dropout_count,
                             avg_dropout,
                             how='left',
                             on=['username'])
    Dropout_count = pd.merge(Dropout_count,
                             drop_courses,
                             how='left',
                             on=['username'])
    Dropout_count = pd.merge(Dropout_count,
                             course_count,
                             how='left',
                             on=['username'])

    Dropout_count['drop_ratio'] = (Dropout_count['drop_courses'] /
                                   Dropout_count['course_count'])

    Enroll = Enroll_all.set_index('enrollment_id').ix[enrollment_set]\
        .reset_index()

    X = pd.merge(Enroll, Dropout_count, how='left', on=['enrollment_id'])\
        .as_matrix(columns=['dropout_count', 'total_dropout', 'avg_dropout',
                            'drop_courses', 'course_count', 'drop_ratio'])

    logger.debug('dropout history, has nan: %s, shape: %s',
                 np.any(np.isnan(X)), repr(X.shape))

    util.dump(X, X_pkl_path)
    return X
        img = util.draw_pose(dataset, img, lbls[idx], 2, (255,0,0))
        img = cv2.resize(img, (200,200))
        cv2.imwrite(root_dir+'samples/rgb/cropped/'+phase+'_'+str(idx)+'.png', img)
########################################################################################################################



############################ Draw predicted pose on RGB samples ###########################################################################
### Create sample RGB videos from the predicted poses
### This segment is only for validation
###########################################################################################################################################
phase = 'test'
lbls = util.load_labels('fpac', phase) ### load test/train data
names = util.load_names('fpac', phase)
centers = util.load_centers('fpac', phase).astype(float)
lbls, preds = util.load_logs('fpac', 'fpac_test_b53_lr_1e-2_xyz_20k_.txt', centers)

for idx, name in enumerate(names):
    action = name.split('/')[2]
    subject = name.split('/')[1]
    if action == 'close_juice_bottle' and subject == 'Subject_2':                  # use this condition to select a certain action by name
    # if idx in (11000, 13000, 15000, 16000, 18000, 21000, 22000, 23000, 31000):   # or this condition to select specific frames or sequences by ID
        pred, skel_camcoords = util.world2pixel(preds[idx], 'fpac')
        label, skel_camcoords = util.world2pixel(lbls[idx], 'fpac')
        img = util.load_image('fpac', os.path.join(root_dir, name))
        # re-read as 3-channel BGR (flag 1 == cv2.IMREAD_COLOR)
        img = cv2.imread(os.path.join('/home/bilbeisi/REN', name), 1)
        img = img.astype(float)

        points = centers[idx]
        img = util.draw_pose('fpac', img, pred, 6, (0,0,255))
        img = cv2.resize(img, (480,270))
        # Use Cython helpers to drastically speed up the per-pixel loop:
        # loop over all RGB-D image pixels and assign each one an RGB value
        # from cimg via the projected coordinates cords_c (a NumPy sketch of
        # this mapping follows the loop).
        img_rgbd = image_loops.color_map(img, cimg, img_rgbd, cords_c)
        img_rgbd = np.asarray(img_rgbd)

        cv2.imwrite('/home/bilbeisi/REN/samples/rgbd/' + str(idx) + '.png',
                    img_rgbd[:, :, :3])
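
# --- Hypothetical NumPy equivalent of the Cython color_map step above. ---
# Assumption: cords_c stores, for every depth pixel, the (row, col) of the
# corresponding pixel in the colour image cimg; this only illustrates the
# mapping and is not the repo's actual image_loops implementation.
import numpy as np


def color_map_numpy(cimg, cords_c, out_shape=(480, 640, 4)):
    """Assign each depth pixel the colour of its projected pixel in cimg."""
    img_rgbd = np.zeros(out_shape)
    rows = np.clip(cords_c[:, :, 0].astype(int), 0, cimg.shape[0] - 1)
    cols = np.clip(cords_c[:, :, 1].astype(int), 0, cimg.shape[1] - 1)
    img_rgbd[:, :, :3] = cimg[rows, cols]
    return img_rgbd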

############################# Draw predicted pose on RGB-D samples #################################
phase = 'test'
lbls = util.load_labels('fpad', phase)  ### load test/train data
names = util.load_names('fpad', phase)
cnames = util.load_names('fpac', phase)
centers = util.load_centers('fpad', phase).astype(float)
lbls, preds = util.load_logs('fpad',
                             'rgbd_test_b159_lr_1e-2_xyz_1_200k_2_20k_.txt',
                             centers)

for idx, name in enumerate(names):
    action = name.split('/')[2]
    subject = name.split('/')[1]
    if action == 'close_juice_bottle' and subject == 'Subject_2':
        # if idx in (11000, 13000, 15000, 16000, 18000, 21000, 22000, 23000, 31000):
        img = util.load_image('fpad', os.path.join(root_dir, name))
        img[img == 0] = 1

        cname = cnames[idx]
        cimg = util.load_image('fpac', os.path.join(root_dir, cname))
        cimg = cimg.astype(float)

        cords_d = np.zeros((480, 640, 3))