# Standard-library and third-party imports required by the functions below.
import datetime
import glob
import os
import pickle
import time

import pandas as pd

# Project-specific helpers (get_dir_main, get_data, transform_data,
# matrix_features, stat_test, get_matrix_pca, get_clusters,
# group_dem_users_cluster, reduce_k_cluster, cluster_kmeans, k_optimal,
# dynamic_train) are assumed to be defined elsewhere in this package.
def export_results(df, date_update_fin, date_fin):
    ext = '.csv'
    date_init_pred = (datetime.datetime.strptime(date_update_fin, '%Y-%m-%d') +
                      datetime.timedelta(days=1)).strftime('%Y-%m-%d')
    dir_save = os.sep.join([
        get_dir_main(), 'forecasting', 'forecast', 'results',
        date_init_pred + '_' + date_fin.strftime('%Y-%m-%d') + ext
    ])
    # 'ansi' is not a valid Python codec name; cp1252 (the Windows ANSI code
    # page for Western locales) is assumed here and wherever 'ansi' appeared.
    df.to_csv(dir_save, encoding='cp1252', index=False)
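
# Minimal sketch of the path export_results builds (hypothetical dates):
#   export_results(df, date_update_fin='2020-01-02',
#                  date_fin=datetime.datetime(2020, 1, 31))
#   -> <main>/forecasting/forecast/results/2020-01-03_2020-01-31.csv
#   (the prediction window starts one day after date_update_fin)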
def save_result_cluster(m_dem_reduce_cluster, op_red, type_day, date_init,
                        date_fin, is_train):
    # Training and forecasting results are written to parallel directory trees.
    stage = 'training' if is_train else 'forecasting'
    dir_save_results = os.sep.join([
        get_dir_main(), stage, 'cluster', 'results',
        date_init + '_' + date_fin
    ])

    if not os.path.exists(dir_save_results):
        os.makedirs(dir_save_results)

    filename_result = 'cluster-data_' + op_red + '_' + type_day + '.csv'
    m_dem_reduce_cluster.to_csv(dir_save_results + os.sep + filename_result,
                                index=True)
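
# E.g. (hypothetical values) op_red='ORD1', type_day='ORD', is_train=True
# writes <main>/training/cluster/results/<date_init>_<date_fin>/
# cluster-data_ORD1_ORD.csv; get_data_train below parses this naming
# convention back into operator-code and day-type columns.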
def forecast_train_process(date_init,
                           date_fin,
                           transform='decompose-Fourier',
                           list_num_decompose=(1, 4),
                           list_num_coeff_fourier=(12, 5),
                           type_decompose='additive'):
    # Accept scalar arguments for convenience and normalize them to lists.
    if isinstance(list_num_coeff_fourier, int):
        list_num_coeff_fourier = [list_num_coeff_fourier]
    if isinstance(list_num_decompose, int):
        list_num_decompose = [list_num_decompose]
    start_time = time.time()
    dir_train_name = os.sep.join([
        get_dir_main(), 'training', 'cluster', 'results',
        date_init + '_' + date_fin
    ])
    list_files_train = glob.glob(dir_train_name + os.sep + '*.csv')

    # date_f_fin extends date_fin to the last hour of that day.
    date_f_fin = datetime.datetime.strptime(date_fin + ' 23:00:00',
                                            '%Y-%m-%d %H:%M:%S')
    date_init = datetime.datetime.strptime(date_init, '%Y-%m-%d')
    df_dir_train = pd.DataFrame(list_files_train, columns=['dir_name_train'])
    df_dir_train['date_init'] = date_init
    df_dir_train['date_fin'] = date_f_fin
    df_dir_train['cod_op_red'] = df_dir_train.dir_name_train.apply(
        lambda x: x.split(os.sep)[-1].split('_')[-2])
    df_dir_train['type_day'] = df_dir_train.dir_name_train.apply(
        lambda x: x.split(os.sep)[-1].split('_')[-1].split('.')[0])
    # Load each training CSV into a nested DataFrame in the 'train' column.
    df_dir_train['train'] = df_dir_train.apply(
        lambda x: pd.read_csv(x.dir_name_train,
                              sep=',',
                              header=0,
                              encoding='cp1252',
                              parse_dates=False),
        axis=1)

    if transform == 'decompose-Fourier':
        # Build the parameter grid as a cross join: every training file is
        # paired with every (num_decompose, num_coeff_fourier) pair.
        df_param_transform = pd.DataFrame({
            'num_decompose': list_num_decompose,
            'num_coeff_fourier': list_num_coeff_fourier
        })
        df_param_transform['key'] = 1
        df_dir_train['key'] = 1
        df_dir_train = df_dir_train.merge(df_param_transform,
                                          how='left',
                                          on='key')
        df_dir_train.drop(columns=['key'], inplace=True)
        df_dir_train['transform_model'] = transform
        df_dir_train['type_decompose'] = type_decompose
        df_dir_train.apply(dynamic_train, axis=1)
    else:
        raise ValueError('invalid variable transform {}.'.format(transform))
    print('total_time_execution_forecast_train_process(sec): ',
          time.time() - start_time)
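
# A minimal sketch of the key=1 cross-join trick used above: tagging both
# frames with a constant key and merging on it pairs every row of one frame
# with every row of the other (pandas >= 1.2 also offers merge(how='cross')).
# The file names and parameter values here are hypothetical.
def _example_param_grid():
    files = pd.DataFrame({'file': ['a.csv', 'b.csv']})
    params = pd.DataFrame({'num_decompose': [1, 4],
                           'num_coeff_fourier': [12, 5]})
    files['key'] = 1
    params['key'] = 1
    # 2 files x 2 parameter rows -> 4 combinations.
    return files.merge(params, on='key').drop(columns=['key'])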
def save_cluster_model_comp_pca(model_cluster, op_red, type_day, date_init,
                                date_fin):

    dir_model_save = os.sep.join([
        get_dir_main(), 'training', 'cluster', 'models',
        date_init + '_' + date_fin
    ])

    if not os.path.exists(dir_model_save):
        os.makedirs(dir_model_save)
    filename_model = op_red + '_' + type_day + '_cluster-model.pkl'
    with open(dir_model_save + os.sep + filename_model, 'wb') as f:
        pickle.dump(model_cluster, f)
def save_model_dir(pipeline, transform, num_cluster, op_red, type_day,
                   type_model, date_init, date_fin, periods_decompose=(),
                   n_decompose='', type_decompose='additive'):
    ext = '.pkl'

    dir_model_save = os.sep.join([
        get_dir_main(), 'training', 'forecast', 'models',
        date_init + '_' + date_fin
    ])
    if not os.path.exists(dir_model_save):
        os.makedirs(dir_model_save)
    if 'decompose' in transform:
        filename = '_'.join([
            op_red, type_day, type_model, type_decompose,
            # map(str, ...) guards against non-string period values.
            'pd-' + '-'.join(map(str, periods_decompose)),
            'nd-' + str(n_decompose), 'cluster-' + str(num_cluster), transform
        ])
    elif 'normal' in transform or 'fourier' in transform:
        filename = '_'.join([
            op_red, type_day, type_model, 'cluster-' + str(num_cluster),
            transform
        ])
    else:
        raise ValueError('invalid variable transform {}.'.format(transform))

    with open(dir_model_save + os.sep + filename + ext, 'wb') as f:
        pickle.dump(pipeline, f)
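
# Sketch of the filenames the branches above produce, using hypothetical
# arguments (op_red='ORD1', type_day='ORD', type_model='svr', num_cluster=2,
# periods_decompose=(24, 168), n_decompose=2):
#   decompose branch:
#     ORD1_ORD_svr_additive_pd-24-168_nd-2_cluster-2_decompose-Fourier.pkl
#   normal/fourier branch (transform='fourier'):
#     ORD1_ORD_svr_cluster-2_fourier.pkl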
def cluster_process(directory_input_data,
                    ops_red,
                    types_days,
                    date_init_train,
                    date_fin_train,
                    date_init,
                    date_fin,
                    is_train=False):
    dir_load_model_cluster = os.sep.join([
        get_dir_main(), 'training', 'cluster', 'models',
        date_init_train + '_' + date_fin_train
    ])
    filename_components = 'n_components_features.csv'
    df_comp = pd.read_csv(dir_load_model_cluster + os.sep +
                          filename_components,
                          sep=',',
                          header=0,
                          encoding='cp1252')
    for op_red in ops_red:
        print('\n\n Executing OR: ', op_red)
        data_op_red = get_data(directory_input_data, op_red, date_init,
                               date_fin)
        for var_type_day in types_days:
            data_op_red_t_day = data_op_red.query('tipodia == @var_type_day')
            print('\t type day: ', var_type_day)
            dem_data, pv_dem_data = transform_data(data_op_red_t_day,
                                                   date_init, date_fin)
            m_features = matrix_features(pv_dem_data, features='fourier')
            stat_test(m_features)
            n_comp = df_comp.query(
                'cod_or == @op_red and type_day == @var_type_day'
            ).n_components.values[0]
            m_pca_features = get_matrix_pca(m_features,
                                            show_plot=False,
                                            dynamic_component=False,
                                            n_comp=n_comp)
            labels = get_clusters(dir_load_model_cluster, m_pca_features,
                                  var_type_day, op_red)
            m_pca_features['labels'] = labels
            m_dem_cluster = group_dem_users_cluster(
                dem_data=dem_data, m_features_labels=m_pca_features)
            m_dem_reduce_cluster = reduce_k_cluster(m_dem_cluster,
                                                    threshold_dem=0.02)
            save_result_cluster(m_dem_reduce_cluster, op_red, var_type_day,
                                date_init, date_fin, is_train)
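
# Hypothetical invocation: score a new window against clusters trained on an
# earlier one (all dates are placeholders):
#   cluster_process(dir_input_data, ['ORD1'], ['ORD'], '2017-01-01',
#                   '2020-01-03', '2020-01-04', '2020-01-10', is_train=False)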
def cluster_train_process(ops_red, types_days, directory_input_data, date_init,
                          date_fin):
    start_time = time.time()
    filename_components = 'n_components_features.csv'
    df_comp = pd.DataFrame()
    for op_red in ops_red:
        print('\n\n Executing OR: ', op_red)
        data_op_red = get_data(directory_input_data, op_red, date_init,
                               date_fin)
        for var_type_day in types_days:
            print('\t type day: ', var_type_day)
            data_op_red_t_day = data_op_red.query('tipodia == @var_type_day')
            dem_data, pv_dem_data = transform_data(data_op_red_t_day,
                                                   date_init, date_fin)
            m_features = matrix_features(pv_dem_data, features='fourier')
            stat_test(m_features)
            m_pca_features = get_matrix_pca(m_features,
                                            show_plot=False,
                                            dynamic_component=True)
            # Two-pass clustering: scan candidate k values first, pick the
            # optimal k, then refit a single k-means model with that k.
            df_k_opt, _, _ = cluster_kmeans(
                x_train=m_pca_features, k_min=2, k_max=10)
            k = k_optimal(df_k_opt)
            _, _, model_cluster = cluster_kmeans(x_train=m_pca_features,
                                                 k_min=k,
                                                 k_max=k + 1)
            save_cluster_model_comp_pca(model_cluster, op_red, var_type_day,
                                        date_init, date_fin)
            dict_comp = {
                'cod_or': [op_red],
                'type_day': [var_type_day],
                'n_components': [m_pca_features.shape[1]]
            }
            comp = pd.DataFrame(dict_comp)
            # DataFrame.append was removed in pandas 2.0; concat instead.
            df_comp = pd.concat([df_comp, comp], ignore_index=True)
    df_comp.to_csv(os.sep.join([
        get_dir_main(), 'training', 'cluster', 'models',
        date_init + '_' + date_fin, filename_components
    ]),
                   sep=',',
                   encoding='cp1252',
                   index=False)
    print('total_time_execution_cluster_train_process(sec): ',
          time.time() - start_time)
def get_train_models(date_train_init, date_train_fin):
    dir_train = os.sep.join([
        get_dir_main(), 'training', 'forecast', 'models',
        date_train_init + '_' + date_train_fin
    ])
    files_train = glob.glob(dir_train + os.sep + '*.pkl')
    df_dir_train = pd.DataFrame(files_train, columns=['dir_model_train'])
    df_dir_train['cod_op_red'] = df_dir_train.dir_model_train.apply(
        lambda x: x.split(os.sep)[-1].split('_')[0])
    df_dir_train['type_day'] = df_dir_train.dir_model_train.apply(
        lambda x: x.split(os.sep)[-1].split('_')[1])
    df_dir_train['t_transform'] = df_dir_train.dir_model_train.apply(
        lambda x: x.split(os.sep)[-1].split('_')[-1].split('.')[0])
    n_transform = df_dir_train.t_transform.drop_duplicates().shape[0]
    if n_transform == 1:
        transform = df_dir_train.t_transform.drop_duplicates().values[0]
        if transform in ('decompose-Fourier', 'decompose'):
            df_dir_train['type_decompose'] = df_dir_train.dir_model_train.apply(
                lambda x: x.split(os.sep)[-1].split('_')[3])
            df_dir_train['num_decompose'] = df_dir_train.dir_model_train.apply(
                lambda x: x.split(os.sep)[-1].split('_')[-3])
            cols = [
                'dir_model_train', 'cod_op_red', 'type_day', 't_transform',
                'type_decompose', 'num_decompose'
            ]
            df_dir_train = df_dir_train[cols]
            # Collapse duplicate configurations: group on the metadata columns
            # and join the matching model paths into a comma-separated string.
            df_train = df_dir_train.groupby(
                by=list(df_dir_train.columns)[1:]).agg(
                    lambda x: ",".join(x)).reset_index()
            return df_train
        else:
            raise ValueError(
                'invalid variable transform {}.'.format(transform))

    else:
        raise ValueError(
            'expected exactly one transform type among the trained models; '
            'found {}.'.format(n_transform))
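
# Sketch of how the parsing above decomposes a model filename (hypothetical
# name produced by save_model_dir):
#   ORD1_ORD_svr_additive_pd-24-168_nd-2_cluster-2_decompose-Fourier.pkl
#   -> cod_op_red='ORD1', type_day='ORD', t_transform='decompose-Fourier',
#      type_decompose='additive', num_decompose='nd-2'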
def get_data_train(date_train_init, date_update_fin):
    dir_train = os.sep.join([
        get_dir_main(), 'forecasting', 'cluster', 'results',
        date_train_init + '_' + date_update_fin
    ])
    files_train = glob.glob(dir_train + os.sep + '*.csv')
    df_dir_train = pd.DataFrame(files_train, columns=['dir_name_train'])
    df_dir_train['date_train_init'] = date_train_init
    df_dir_train['date_update_fin'] = date_update_fin
    df_dir_train['cod_op_red'] = df_dir_train.dir_name_train.apply(
        lambda x: x.split(os.sep)[-1].split('_')[-2])
    df_dir_train['type_day'] = df_dir_train.dir_name_train.apply(
        lambda x: x.split(os.sep)[-1].split('_')[-1].split('.')[0])
    df_dir_train['train'] = df_dir_train.apply(
        lambda x: pd.read_csv(x.dir_name_train,
                              sep=',',
                              header=0,
                              encoding='cp1252',
                              parse_dates=False),
        axis=1)

    return df_dir_train
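
# Minimal usage sketch (hypothetical dates). Each row of the returned frame
# holds a result file path, the parsed operator code and day type, and the
# loaded CSV as a nested DataFrame in the 'train' column:
#   df_train = get_data_train('2017-01-01', '2020-01-02')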


if __name__ == '__main__':
    dir_main = get_dir_main()
    dir_input_data = os.sep.join([dir_main, 'data', 'input'])
    t_days = ['ORD', 'SAB', 'FESTIVO', 'DOM']
    operadores_red = ['ORD1']
    d_i = '2017-01-01'
    d_f = '2020-01-03'
    print(dir_input_data)
    cluster_train_process(operadores_red, t_days, dir_input_data, d_i, d_f)