Exemplo n.º 1
0
def load_datasets(config: configparser.ConfigParser, target_column: str) -> list:
    """
    Load datasets according to info specified in config file

    The raw dataset is read from ``<base_dir>Data/<dataset_raw>.csv`` (semicolon separated, decimal comma,
    first column used as index). Its index is parsed as dates and the frame is converted to daily frequency.
    If the config section of the target column contains ``before_break_date``, a copy of the raw dataset is
    passed to ``PreparationHelper.drop_rows_by_dates`` (start: that date, end: last index entry) and
    appended as a second dataset named ``<dataset_raw>_before_break``.

    :param config: config with dataset specific info
    :param target_column: target_column for prediction, also used as the name of the config section
    :return: list of datasets to use for optimization
    """
    datasets_lst = []
    # load and name raw dataset
    csv_path = config['General']['base_dir'] + 'Data/' + config[target_column]['dataset_raw'] + '.csv'
    dataset_raw = pd.read_csv(csv_path, sep=';', decimal=',', index_col=0)
    try:
        dataset_raw.index = pd.to_datetime(dataset_raw.index, format='%d.%m.%Y')
    except (ValueError, TypeError):
        # only a format mismatch should trigger the fallback; a bare except would also
        # swallow KeyboardInterrupt / SystemExit and hide unrelated errors
        dataset_raw.index = pd.to_datetime(dataset_raw.index, format='%Y-%m-%d')
    dataset_raw = dataset_raw.asfreq('D')
    dataset_raw.name = config[target_column]['dataset_raw']
    datasets_lst.append(dataset_raw)

    # split dataset at before_break_date
    if 'before_break_date' in config[target_column]:
        dataset_before_break = dataset_raw.copy()
        # the name attribute is set explicitly on the copy
        dataset_before_break.name = dataset_raw.name + '_before_break'
        before_break_date = datetime.datetime.strptime(config[target_column]['before_break_date'], '%Y-%m-%d').date()
        PreparationHelper.drop_rows_by_dates(df=dataset_before_break, start=before_break_date,
                                             end=dataset_before_break.index[-1])
        datasets_lst.append(dataset_before_break)

    return datasets_lst
Exemplo n.º 2
0
def load_datasets(config: configparser.ConfigParser, company: str,
                  target_column: str) -> list:
    """
    Load datasets according to info specified in config file

    The raw dataset is read from ``<base_dir>Data/<dataset_raw>.csv`` (semicolon separated, first column
    used as index), first with decimal comma and, if that read fails, with decimal point. A string index
    is parsed as dates (``%d.%m.%Y`` if it contains a dot, ``%Y-%m-%d`` if it contains a dash).
    Optional config keys of the target column section:
    ``raw_cols_to_drop`` (comma separated columns to drop), ``start_date_to_drop`` (rows from that date to
    the last index entry are handed to ``PreparationHelper.drop_rows_by_dates``) and ``before_break_date``
    (adds a second dataset named ``<name>_before_break``).

    :param config: config with dataset specific info
    :param company: name of the company related to the dataset, used as prefix of the dataset name
    :param target_column: target_column for prediction, also used as the name of the config section
    :return: list of datasets to use for optimization
    """
    datasets_lst = []
    # load and name raw dataset
    csv_path = config['General']['base_dir'] + 'Data/' + config[target_column]['dataset_raw'] + '.csv'
    try:
        dataset_raw = pd.read_csv(csv_path, sep=';', decimal=',', index_col=0)
    except Exception:
        # best-effort retry with decimal point; Exception (instead of a bare except) keeps the
        # fallback but no longer swallows KeyboardInterrupt / SystemExit
        dataset_raw = pd.read_csv(csv_path, sep=';', decimal='.', index_col=0)
    # the first index label decides which date format is used for the whole index
    first_label = dataset_raw.index[0]
    if isinstance(first_label, str):
        if '.' in first_label:
            dataset_raw.index = pd.to_datetime(dataset_raw.index,
                                               format='%d.%m.%Y')
        elif '-' in first_label:
            dataset_raw.index = pd.to_datetime(dataset_raw.index,
                                               format='%Y-%m-%d')
    # drop columns from raw dataset if not needed
    if 'raw_cols_to_drop' in config[target_column]:
        PreparationHelper.drop_columns(
            df=dataset_raw,
            columns=config[target_column]['raw_cols_to_drop'].replace(
                " ", "").split(','))
    PreparationHelper.drop_columns(
        df=dataset_raw,
        columns=[col for col in dataset_raw.columns if 'Unnamed' in col])
    # drop samples after start_date_to_drop if target_column is not recorded for whole dataset
    if 'start_date_to_drop' in config[target_column]:
        start_date_to_drop = datetime.datetime.strptime(
            config[target_column]['start_date_to_drop'], '%Y-%m-%d').date()
        PreparationHelper.drop_rows_by_dates(df=dataset_raw,
                                             start=start_date_to_drop,
                                             end=dataset_raw.index[-1])

    if target_column in ('milk', 'beer', 'usdeaths', 'VisitorNights'):
        # object columns still hold decimal-comma strings: convert them to float
        dataset_raw = dataset_raw.apply(lambda x: x.str.replace(
            ',', '.').astype(float) if x.dtype == object else x)
    elif target_column == 'maunaloa_monthly':
        # NOTE(review): the 'M' alias is deprecated in pandas >= 2.2 in favour of 'ME';
        # kept as is to stay compatible with older pandas versions
        dataset_raw = dataset_raw.resample('M').apply(
            lambda x: PreparationHelper.custom_resampler(arraylike=x,
                                                         summation_cols=[]))
    dataset_raw.name = company + config[target_column]['dataset_raw']
    datasets_lst.append(dataset_raw)

    # split dataset at before_break_date
    if 'before_break_date' in config[target_column]:
        dataset_before_break = dataset_raw.copy()
        # the name attribute is set explicitly on the copy
        dataset_before_break.name = dataset_raw.name + '_before_break'
        before_break_date = datetime.datetime.strptime(
            config[target_column]['before_break_date'], '%Y-%m-%d').date()
        PreparationHelper.drop_rows_by_dates(
            df=dataset_before_break,
            start=before_break_date,
            end=dataset_before_break.index[-1])
        datasets_lst.append(dataset_before_break)

    return datasets_lst