예제 #1
0
def compute_corr_ver_2_week_1(dep_id, seg_id, season_tf, significance_level, df, year_input, date_ss):
    """
    Score every candidate column of ``df`` against one revenue column.

    The score stored in ``cov_value`` is ``1 - lib.compute_error(...)``, where the
    capture/ARR inputs come from ``compute_capture_arr_ver_2``.
    :param dep_id: department id; together with ``seg_id`` it names the target
        column ``<dep_id>_<seg_id>_rv``
    :param seg_id: segment id
    :param season_tf: not used by this function
    :param significance_level: forwarded to ``lib.compute_interval``
    :param df: DataFrame holding the target column and the candidate columns; a
        ``date`` column is read for the 365 lag (presumably datetime-like,
        since ``.dt.year`` is applied to it -- confirm with the caller)
    :param year_input: not used by this function
    :param date_ss: dates handed to ``lib.get_df_by_dates`` / ``lib.get_dates_lag``
        to select the target rows and their lagged counterparts
    :return: DataFrame with columns ``col``, ``cov_col``, ``lag``, ``cov_value``,
        ``conf_interval_0``, ``conf_interval_1``, ``adj_corr_col``; NaN is
        replaced by -99 and rows are sorted by ``cov_value`` descending
    """
    target_col = '_'.join([str(dep_id), str(seg_id)]) + '_rv'  # revenue column name
    dep_prefix = str(dep_id)
    target_frame = lib.get_df_by_dates(date_ss, df)
    target_series = target_frame[target_col]
    cols = set(df.columns) - {'date', 'day_of_week'}
    scores = pd.DataFrame(columns=['col', "cov_col", 'lag', 'cov_value',
                                   "conf_interval_0", "conf_interval_1", "adj_corr_col"])

    # Frames are prepared once per lag: the lagged (candidate-side) frame and
    # the target-side frame cut down to the same number of trailing rows.
    lagged_frames = {0: target_frame[target_frame.columns]}
    target_frames = {0: target_frame[target_frame.columns]}
    for lag in (1, 7, 365):
        shifted_dates = lib.get_dates_lag(date_ss, lag)
        shifted = lib.get_df_by_dates(shifted_dates, df)
        lagged_frames[lag] = shifted[shifted.columns]
        target_frames[lag] = target_frame[(len(target_frame) - len(shifted)):]

    next_row = 0
    for col in cols:
        if lib.check_total(target_col, col):
            continue
        # Columns of the same department are evaluated at lags 7 and 365 only;
        # columns of other departments at lags 0 and 1 only.
        same_dep = col.split('_')[0] == dep_prefix
        for lag in ((7, 365) if same_dep else (0, 1)):
            y_frame = target_frames[lag]
            x_frame = lagged_frames[lag]
            x_values = x_frame[col]
            y_values = target_series[-len(x_values):]
            y_values, x_values = lib.remove_outlier(y_values, x_values)
            cap_arr = compute_capture_arr_ver_2(target_col, col, 1, 1, y_frame, x_frame, cols)
            score = 1 - lib.compute_error(y_values, x_values, cap_arr['capture'], cap_arr['ARR'])

            if lag == 365:
                # Fewer than three distinct years of non-null data at this lag:
                # overwrite the score with the -99 sentinel.
                dated = x_frame[~x_frame[col].isnull()]['date']
                if len(dated.dt.year.unique()) < 3:
                    score = -99

            if "rn" in col:
                # "rn" columns get a degenerate interval equal to the score.
                lower, upper = score, score
            else:
                lower, upper = lib.compute_interval(score, significance_level)

            scores.loc[next_row] = [target_col, col, lag, score, lower, upper, col + "_" + str(lag)]
            next_row += 1

    return scores.fillna(-99).sort_values('cov_value', ascending=False)
예제 #2
0
def compute_corr_ver_3_week_1(dep_id, seg_id, season_tf, significance_level, df, date_ss):
    """
    Correlate one revenue column with every candidate column of ``df``.

    The value stored in ``cov_value`` is the off-diagonal entry of
    ``np.corrcoef`` computed after ``lib.remove_outlier`` has been applied.
    :param dep_id: department id; together with ``seg_id`` it names the target
        column ``<dep_id>_<seg_id>_rv``
    :param seg_id: segment id
    :param season_tf: season timeframe (not used by this function)
    :param significance_level: significance to compute confidence interval,
        forwarded to ``lib.compute_interval``
    :param df: DataFrame holding the target column and the candidate columns
    :param date_ss: dates handed to ``lib.get_df_by_dates`` / ``lib.get_dates_lag``
        to select the target rows and their lagged counterparts
    :return: DataFrame with columns ``col``, ``cov_col``, ``lag``, ``cov_value``,
        ``conf_interval_0``, ``conf_interval_1``, ``adj_corr_col``; NaN is
        replaced by -99 and rows are sorted by ``cov_value`` descending
    """
    target_col = '_'.join([str(dep_id), str(seg_id)]) + '_rv'  # revenue column name
    dep_prefix = str(dep_id)
    target_series = lib.get_df_by_dates(date_ss, df)[target_col]
    cols = set(df.columns) - {'date', 'day_of_week'}
    scores = pd.DataFrame(columns=['col', "cov_col", 'lag', 'cov_value',
                                   "conf_interval_0", "conf_interval_1", "adj_corr_col"])

    next_row = 0
    for col in cols:
        if lib.check_total(target_col, col):
            continue
        # Columns of the same department are evaluated at the longer lags only;
        # columns of other departments at lags 0 and 1 only.
        same_dep = col.split('_')[0] == dep_prefix
        for lag in ((7, 14, 21, 30, 365) if same_dep else (0, 1)):
            shifted_dates = lib.get_dates_lag(date_ss, lag)
            x_values = lib.get_df_by_dates(shifted_dates, df)[col]
            y_values, x_values = lib.remove_outlier(target_series, x_values)
            corr = np.corrcoef(y_values, x_values)[1, 0]

            if "rn" in col:
                # "rn" columns get a degenerate interval equal to the value.
                lower, upper = corr, corr
            else:
                lower, upper = lib.compute_interval(corr, significance_level)

            scores.loc[next_row] = [target_col, col, lag, corr, lower, upper, col + "_" + str(lag)]
            next_row += 1

    return scores.fillna(-99).sort_values('cov_value', ascending=False)
예제 #3
0
def compute_corr_ver_2_1_cruise(dep_id, seg_id, season_tf, num_day, day_of_week,
                                significance_level, df, date_ss, cruise):
    """
    Compute explanation (1 - error) for all pairs of department_segment in one hotel.

    Only the dates of ``date_ss`` whose entry in ``cruise.cruiseday_list(date_ss)``
    equals ``day_of_week`` are used. The error is computed on all of those dates,
    while the capture/ARR inputs come from the last ``num_day`` of them.
    :param dep_id: department id
    :param seg_id: segment id
    :param season_tf: season timeframe (only read by the hard-coded debug dump)
    :param num_day: maximum number of most recent matching dates used for the
        capture/ARR computation and for the 365-lag year check
    :param day_of_week: value compared against the entries returned by
        ``cruise.cruiseday_list`` (documented originally as day of week, 0-6)
    :param significance_level: significance to compute confidence interval
    :param df: Dataframe containing all data of one hotel
    :param date_ss: candidate dates; must be indexable by position
    :param cruise: object exposing ``cruiseday_list(date_ss)``, returning one
        entry per date
    :return: DataFrame with columns ``col``, ``cov_col``, ``lag``, ``cov_value``,
        ``conf_interval_0``, ``conf_interval_1``, ``adj_corr_col``; NaN is
        replaced by -99 and rows are sorted by ``cov_value`` descending
    """
    y_weekdays = cruise.cruiseday_list(date_ss)
    y_dates_full = [date_ss[i] for i, d in enumerate(y_weekdays) if d == day_of_week]
    y_dates = y_dates_full[-(min(num_day, len(y_dates_full))):]
    y_col = '_'.join([str(dep_id), str(seg_id)]) + '_rv'  # Get revenue data name
    dep_prefix = str(dep_id)
    df1_full = lib.get_df_by_dates(y_dates_full, df)
    y_series = df1_full[y_col]

    df1 = lib.get_df_by_dates(y_dates, df1_full)
    cols = set(df.columns) - {'date', 'day_of_week'}
    result_df = pd.DataFrame(columns=['col', "cov_col", 'lag', 'cov_value', "conf_interval_0", "conf_interval_1", "adj_corr_col"])
    index = 0

    # Per lag, cache both the frame restricted to the last ``num_day`` dates
    # (integer key) and the frame over every matching date ("<lag>_full" key).
    dict_df = {}
    dict_df[0] = df1[df1.columns]
    dict_df["0_full"] = df1_full[df1_full.columns]
    for lag in [1, 7, 365]:
        dates = lib.get_dates_lag(y_dates, lag)
        dates_full = lib.get_dates_lag(y_dates_full, lag)
        df2_full = lib.get_df_by_dates(dates_full, df)
        df2 = lib.get_df_by_dates(dates, df2_full)
        dict_df[lag] = df2
        dict_df[str(lag) + "_full"] = df2_full

    for col in cols:
        if lib.check_total(y_col, col):
            continue
        same_dep = col.split('_')[0] == dep_prefix
        for lag in [0, 1, 7, 365]:
            # Other departments: lags 0 and 1 only. Same department: lags 7
            # and 365 only.
            if same_dep == (lag in [0, 1]):
                continue
            col_series = dict_df[str(lag) + "_full"][col]
            y_series_r = y_series[-len(col_series):]
            df2 = dict_df[lag]
            y_series_rm, col_series_rm = lib.remove_outlier(y_series_r, col_series)
            cap_arr = compute_capture_arr_ver_2(y_col, col, 1, 1, df1, df2, cols)
            cov_val = 1 - lib.compute_error(y_series_rm, col_series_rm, cap_arr['capture'], cap_arr['ARR'])

            # NOTE(review): hard-coded diagnostic dump for one specific case;
            # it writes a file into the working directory. Kept as-is apart
            # from the format-string fix below.
            if y_col == "69_107_rv" and day_of_week == 6 and season_tf == [('01-01', '02-17'), ('12-23', '12-31')] and lag == 0:
                print('test', col)
                with open('test_caset_69_107_{0}_{1}.txt'.format(season_tf, col), 'w') as f:
                    # The last field used to be introduced by "\c" (an invalid
                    # escape sequence); it now starts on its own line like the
                    # other fields.
                    f.write("\ny_series: {0}\ncol_series:{1}\ny_rm:{2}\ncol_rm: {3}\ncap_arr: {4}\ncov_val:{5}".format(
                        ", ".join(map(str, y_series_r)),
                        ", ".join(map(str, col_series)),
                        ", ".join(map(str, y_series_rm)),
                        ", ".join(map(str, col_series_rm)),
                        str(cap_arr),
                        cov_val))

            if lag == 365:
                # Fewer than three distinct years of non-null data at this lag:
                # overwrite the value with the -99 sentinel.
                y_365_lag = df2[col]
                date_y_365_lag_not_null = df2[~y_365_lag.isnull()]['date']
                if len(date_y_365_lag_not_null.dt.year.unique()) < 3:
                    cov_val = -99

            if "rn" in col:
                # "rn" columns get a degenerate interval equal to the value.
                conf_interval_0, conf_interval_1 = cov_val, cov_val
            else:
                conf_interval_0, conf_interval_1 = lib.compute_interval(cov_val, significance_level)
            result_df.loc[index] = [y_col, col, lag, cov_val, conf_interval_0, conf_interval_1, col + "_" + str(lag)]
            index += 1

    result_df = result_df.fillna(-99)
    return result_df.sort_values('cov_value', ascending=False)