Exemplo n.º 1
0
def compute_corr_ver_2_week_1(dep_id, seg_id, season_tf, significance_level,
                              df, year_input, date_ss):
    y_dates = date_ss
    y_col = '_'.join([str(dep_id), str(seg_id)
                      ]) + '_rv'  # Get revenue data name
    df1_full = lib.get_df_by_dates(y_dates, df)
    y_series = df1_full[y_col]
    cols = set(df.columns) - {'date', 'day_of_week'}
    result_df = pd.DataFrame(columns=[
        'col', "cov_col", 'lag', 'cov_value', "conf_interval_0",
        "conf_interval_1", "adj_corr_col"
    ])
    # x is some col with some lag
    index = 0
    dict_df = {}
    dict_df["0_2"] = df1_full[df1_full.columns]
    dict_df["0_1"] = df1_full[df1_full.columns]
    for lag in [1, 7, 365]:
        col_dates = lib.get_dates_lag(y_dates, lag)
        df2 = lib.get_df_by_dates(col_dates, df)
        dict_df[str(lag) + "_2"] = df2[df2.columns]
        dict_df[str(lag) + "_1"] = df1_full[(len(df1_full) - len(df2)):]
    for i, col in enumerate(cols):
        if lib.check_total(y_col, col):
            continue
        for lag in [0, 1, 7, 365]:
            if (col.split('_')[0] != str(dep_id)) and (lag not in [0, 1]):
                continue
            if (col.split('_')[0] == str(dep_id)) and (lag in [0, 1]):
                continue
            df1 = dict_df[str(lag) + "_1"]
            df2 = dict_df[str(lag) + "_2"]
            col_series = df2[col]
            y_series_rm = y_series[-len(col_series):]
            y_series_rm, col_series = lib.remove_outlier(
                y_series_rm, col_series)
            cap_arr = compute_capture_arr_ver_2(y_col, col, 1, 1, df1, df2,
                                                cols)
            cov_val = 1 - lib.compute_error(y_series_rm, col_series,
                                            cap_arr['capture'], cap_arr['ARR'])
            #===========================
            if lag == 365:
                y_365_lag = df2[col]
                date_y_365_lag_not_null = df2[~y_365_lag.isnull()]['date']
                if len(date_y_365_lag_not_null.dt.year.unique()) < 3:
                    cov_val = -99  # explaination is big
            #===========================
            if "rn" in col:
                conf_interval_0, conf_interval_1 = cov_val, cov_val
            else:
                conf_interval_0, conf_interval_1 = lib.compute_interval(
                    cov_val, significance_level)
            result_df.loc[index] = [
                y_col, col, lag, cov_val, conf_interval_0, conf_interval_1,
                col + "_" + str(lag)
            ]
            index += 1
    result_df = result_df.fillna(-99)
    return result_df.sort_values('cov_value', ascending=False)
Exemplo n.º 2
0
def compute_corr_ver_2(dep_id, seg_id, season_tf, num_day, day_of_week,
                       significance_level, df):
    """
    Compute explanation (1 -error) for all pair of deparment_segment in one hotel
    :param dep_id: department id
    :param seg_id: segment id
    :param season_tf: season timeframe
    :param day_of_week:  day of week range 0-6
    :param df: Dataframe contain all data of one hotel
    :param significance_level: significance to compute confidence interval
    :return:
    """

    dates = df['date'].sort_values(ascending=False)
    season_dates = pd.Series(lib.get_season_dates_from_tf(season_tf, dates))
    y_dates = list(season_dates[season_dates.dt.weekday == day_of_week])
    y_col = '_'.join([str(dep_id), str(seg_id)
                      ]) + '_rv'  # Get revenue data name

    cols = set(df.columns) - {'date', 'day_of_week'}
    result_df = pd.DataFrame(columns=[
        'col', "cov_col", 'lag', 'cov_value', "conf_interval_0",
        "conf_interval_1", "adj_corr_col"
    ])
    # x is some col with some lag
    index = 0
    for i, col in enumerate(cols):
        for lag in [0, 1, 7, 14, 21, 30, 365]:
            if (col.split('_')[0] != str(dep_id)) and (lag not in [0, 1]):
                continue
            if (col.split('_')[0] == str(dep_id)) and (lag in [0, 1]):
                continue
            y_series, col_series = get_series_to_compute_corr(
                y_col, col, lag, y_dates, df)

            cap_arr = compute_capture_arr(y_col, col, lag, season_tf, num_day,
                                          day_of_week, df)
            y_series_rm, col_series = lib.remove_outlier(y_series, col_series)
            cov_val = 1 - lib.compute_error(y_series_rm, col_series,
                                            cap_arr['capture'], cap_arr['ARR'])
            if "rn" in col:
                conf_interval_0, conf_interval_1 = cov_val, cov_val
            else:
                conf_interval_0, conf_interval_1 = lib.compute_interval(
                    cov_val, significance_level)
            result_df.loc[index] = [
                y_col, col, lag, cov_val, conf_interval_0, conf_interval_1,
                col + "_" + str(lag)
            ]
            index += 1
    result_df = result_df.fillna(-99)
    return result_df.sort_values('cov_value', ascending=False)
Exemplo n.º 3
0
def compute_corr_ver_2_1_cruise(dep_id, seg_id, season_tf, num_day,
                                day_of_week, significance_level, df, date_ss,
                                cruise):
    """
    Compute explanation (1 -error) for all pair of deparment_segment in one hotel
    :param dep_id: department id
    :param seg_id: segment id
    :param season_tf: season timeframe
    :param day_of_week:  day of week range 0-6
    :param df: Dataframe contain all data of one hotel
    :param significance_level: significance to compute confidence interval
    :return:
    """
    y_weekdays = cruise.cruiseday_list(date_ss)
    y_dates_full = [
        date_ss[i] for i, d in enumerate(y_weekdays) if d == day_of_week
    ]
    y_dates = y_dates_full[-(min(num_day, len(y_dates_full))):]
    y_col = '_'.join([str(dep_id), str(seg_id)
                      ]) + '_rv'  # Get revenue data name
    df1_full = lib.get_df_by_dates(y_dates_full, df)
    y_series = df1_full[y_col]

    df1 = lib.get_df_by_dates(y_dates, df1_full)
    cols = set(df.columns) - {'date', 'day_of_week'}
    result_df = pd.DataFrame(columns=[
        'col', "cov_col", 'lag', 'cov_value', "conf_interval_0",
        "conf_interval_1", "adj_corr_col"
    ])
    # x is some col with some lag
    index = 0
    dict_df = {}
    dict_df[0] = df1[df1.columns]
    dict_df["0_full"] = df1_full[df1_full.columns]
    for lag in [1, 7]:  #remove lag 365
        dates = lib.get_dates_lag(y_dates, lag)
        dates_full = lib.get_dates_lag(y_dates_full, lag)
        df2_full = lib.get_df_by_dates(dates_full, df)
        df2 = lib.get_df_by_dates(dates, df2_full)
        dict_df[lag] = df2
        dict_df[str(lag) + "_full"] = df2_full
    for i, col in enumerate(cols):
        if lib.check_total(y_col, col):
            continue
        for lag in [0, 1, 7]:  #remove lag 365
            if (col.split('_')[0] != str(dep_id)) and (lag not in [0, 1]):
                continue
            if (col.split('_')[0] == str(dep_id)) and (lag in [0, 1]):
                continue
            col_series = dict_df[str(lag) + "_full"][col]
            y_series_r = y_series[-len(col_series):]
            df2 = dict_df[lag]
            y_series_rm, col_series_rm = lib.remove_outlier(
                y_series_r, col_series)
            cap_arr = compute_capture_arr_ver_2(y_col, col, 1, 1, df1, df2,
                                                cols)
            cov_val = 1 - lib.compute_error(y_series_rm, col_series_rm,
                                            cap_arr['capture'], cap_arr['ARR'])
            if y_col == "69_107_rv" and day_of_week == 6 and season_tf == [
                ('01-01', '02-17'), ('12-23', '12-31')
            ] and lag == 0:
                print('test', col)
                with open(
                        'test_caset_69_107_{0}_{1}.txt'.format(season_tf, col),
                        'w') as f:
                    f.write("\ny_series: {0}\ncol_series:{1}\ny_rm:{2}\ncol_rm: {3}\ncap_arr: {4}\cov_val:{5}".format( \
                        ", ".join(map(str,y_series_r)), ", ".join(map(str, col_series)), ", ".join(map(str, y_series_rm)), ", ".join(map(str, col_series_rm)), str(cap_arr), cov_val))
            #===========================
            if lag == 365:
                y_365_lag = df2[col]
                date_y_365_lag_not_null = df2[~y_365_lag.isnull()]['date']
                if len(date_y_365_lag_not_null.dt.year.unique()) < 3:
                    cov_val = -99  # explaination is big
            #===========================
            if "rn" in col:
                conf_interval_0, conf_interval_1 = cov_val, cov_val
            else:
                conf_interval_0, conf_interval_1 = lib.compute_interval(
                    cov_val, significance_level)
            result_df.loc[index] = [
                y_col, col, lag, cov_val, conf_interval_0, conf_interval_1,
                col + "_" + str(lag)
            ]
            index += 1
    result_df = result_df.fillna(-99)
    return result_df.sort_values('cov_value', ascending=False)