예제 #1
0
def join(u, v, v_name, key, type_):
    def func_rename(x):
        if 'mv' in x[1]:
            return f'{CONSTANT.MULTI_CAT_PREFIX}{x[1]}({x[0]})'
        elif 'cat_union' in x[1]:
            return f'{CONSTANT.MULTI_CAT_PREFIX}{x[1]}({x[0]})'
        elif 'cat_last' in x[1]:
            return f'{CONSTANT.CATEGORY_PREFIX}{x[1]}({x[0]})'
        elif CONSTANT.NUMERICAL_PREFIX in x[0]:
            return f"{CONSTANT.NUMERICAL_PREFIX}{x[1]}({x[0]})"
        elif CONSTANT.CATEGORY_PREFIX in x[0]:
            return f"{CONSTANT.CATEGORY_PREFIX}{x[1]}({x[0]})"

    if type_.split("_")[2] == 'many':
        agg_funcs = {
            col: Config.aggregate_op(col)
            for col in v
            if col != key[0] and not col.startswith(CONSTANT.TIME_PREFIX)
            # and not col.startswith(CONSTANT.MULTI_CAT_PREFIX)
        }
        v = v.groupby(key).agg(agg_funcs)
        v.columns = v.columns.map(func_rename)
        # v.columns = v.columns.map(lambda a:
        #         f"{CONSTANT.NUMERICAL_PREFIX}{a[1].upper()}({a[0]})")
    else:
        v = v.set_index(key)
    v.columns = v.columns.map(lambda a: f"{a.split('_', 1)[0]}_{v_name}.{a}")

    return u.join(v, on=key)
예제 #2
0
def temporal_join(u, v, v_name, key, time_col):
    timer = Timer()
    window_size = CONSTANT.WINDOW_SIZE if len(u) * CONSTANT.WINDOW_RATIO < CONSTANT.WINDOW_SIZE \
        else int(len(u) * CONSTANT.WINDOW_RATIO)
    hash_max = CONSTANT.HASH_MAX if len(u) / CONSTANT.HASH_MAX > CONSTANT.HASH_BIN \
        else int(len(u) / CONSTANT.HASH_BIN)

    # window_size = CONSTANT.WINDOW_SIZE
    # hash_max = CONSTANT.HASH_MAX

    if isinstance(key, list):
        assert len(key) == 1
        key = key[0]

    tmp_u = u[[time_col, key]]
    timer.check("select")

    tmp_u = pd.concat([tmp_u, v], keys=['u', 'v'], sort=False)
    timer.check("concat")

    # rehash_key = f'rehash_{key}'
    # tmp_u[rehash_key] = tmp_u[key].apply(lambda x: hash(x) % CONSTANT.HASH_MAX)
    # timer.check("rehash_key")

    tmp_u.sort_values(time_col, inplace=True)
    timer.check("sort")

    agg_funcs = {
        col: Config.aggregate_op(col)
        for col in v if col != key and not col.startswith(CONSTANT.TIME_PREFIX)
        and not col.startswith(CONSTANT.MULTI_CAT_PREFIX)
    }

    # tmp_u = tmp_u.groupby(rehash_key).rolling(window=CONSTANT.WINDOW_SIZE).agg(agg_funcs)
    tmp_u = tmp_u.rolling(window=window_size).agg(agg_funcs)

    # timer.check("group & rolling & agg")
    #
    # tmp_u.reset_index(0, drop=True, inplace=True)  # drop rehash index
    # timer.check("reset_index")

    tmp_u.columns = tmp_u.columns.map(
        lambda a:
        f"{CONSTANT.NUMERICAL_PREFIX}{a[1].upper()}_ROLLING5({v_name}.{a[0]})")

    if tmp_u.empty:
        log("empty tmp_u, return u")
        return u

    # ret = pd.concat([u, tmp_u3.loc['u']], axis=1, sort=False)
    ret = u.merge(tmp_u.loc['u'],
                  right_index=True,
                  left_index=True,
                  how="outer")
    timer.check("final concat")

    del tmp_u, tmp2_u

    return ret
예제 #3
0
def join(u, v, v_name, key, type_):
    if type_.split("_")[2] == 'many':
        agg_funcs = {
            col: Config.aggregate_op(col)
            for col in v
            if col != key and not col.startswith(CONSTANT.TIME_PREFIX)
            and not col.startswith(CONSTANT.MULTI_CAT_PREFIX)
        }
        v = v.groupby(key).agg(agg_funcs)
        v.columns = v.columns.map(join_name)
    else:
        v = v.set_index(key)
    v.columns = v.columns.map(lambda a: f"{a.split('_', 1)[0]}_{v_name}.{a}")
    return u.join(v, on=key)
예제 #4
0
def temporal_join(u, v, v_name, key, time_col):
    timer = Timer()

    if isinstance(key, list):
        key = key[0]

    tmp_u = u[[time_col, key]]
    timer.check("select")

    tmp_u = pd.concat([tmp_u, v], keys=['u', 'v'], sort=False)
    timer.check("concat")

    rehash_key = f'rehash_{key}'
    tmp_u[rehash_key] = tmp_u[key].apply(lambda x: hash(x) % CONSTANT.HASH_MAX)
    timer.check("rehash_key")

    tmp_u.sort_values(time_col, inplace=True)
    timer.check("sort")

    agg_funcs = {
        col: Config.aggregate_op(col)
        for col in v if col != key and not col.startswith(CONSTANT.TIME_PREFIX)
        and not col.startswith(CONSTANT.MULTI_CAT_PREFIX)
    }

    tmp_u = tmp_u.groupby(rehash_key).rolling(5).agg(agg_funcs)
    timer.check("group & rolling & agg")

    tmp_u.reset_index(0, drop=True, inplace=True)  # drop rehash index
    timer.check("reset_index")

    def rename_func(x):
        dtype = CONSTANT.CATEGORY_PREFIX if x[1].upper() in [
            'MODE'
        ] else CONSTANT.NUMERICAL_PREFIX
        return f"{dtype}{x[1].upper()}_ROLLING10({v_name}.{x[0]})"

    tmp_u.columns = tmp_u.columns.map(rename_func)

    if tmp_u.empty:
        log("empty tmp_u, return u")
        return u

    ret = pd.concat([u, tmp_u.loc['u']], axis=1, sort=False)
    timer.check("final concat")

    del tmp_u

    return ret
예제 #5
0
def temporal_join(u, v, v_name, key, time_col):
    timer = Timer()

    if isinstance(key, list):
        assert len(key) == 1
        key = key[0]

    print ("-----tmp_u--------")
    
    tmp_u = u[[time_col, key]]
    print ("------Number of columns before concatenation---")
    print (len(tmp_u.columns))

    timer.check("select")
     
    tmp_u = pd.concat([tmp_u, v], keys=['u', 'v'], sort=False)


    print (len(tmp_u.columns))
    
    timer.check("concat")

    rehash_key = f'rehash_{key}'
    
    tmp_u[rehash_key] = tmp_u[key].apply(lambda x: hash(x) % CONSTANT.HASH_MAX)
    timer.check("rehash_key")
     
    tmp_u.sort_values(time_col, inplace=True)
    print ("----after sorting----")
    #print (tmp_u)
    print ("----after sorting----")
    timer.check("sort")
    
    agg_funcs = {col: Config.aggregate_op(col) for col in v if col != key
                 and not col.startswith(CONSTANT.TIME_PREFIX)
                 and not col.startswith(CONSTANT.MULTI_CAT_PREFIX)}
    print ("-----after group by operation-----")
    tmp_u = tmp_u.groupby(rehash_key).rolling(5, min_periods=1).agg(agg_funcs)
    #print (tmp_u)
    print ("-----after group by operation------")
    
    print ("-----tmp_u--------")
    timer.check("group & rolling & agg")
    
    tmp_u.reset_index(0, drop=True, inplace=True)  # drop rehash index
    timer.check("reset_index")
    
    tmp_u.columns = tmp_u.columns.map(lambda a:
        f"{CONSTANT.NUMERICAL_PREFIX}{a[1].upper()}_ROLLING5({v_name}.{a[0]})")

    if tmp_u.empty:
        log("empty tmp_u, return u")
        return u
    #print ("----number of rows in u-----")    
    #print (len(u.index))
    #print ("---number of rows in u-----")
    #print ("----number of rows in tmp_u-----")
    #print (len(tmp_u.index))
    #print ("----number of rows in tmp_u---------")

    ret = pd.concat([u, tmp_u.loc['u']], axis=1, sort=False)
    timer.check("final concat")

    del tmp_u

    return ret
예제 #6
0
def temporal_join(u, v, v_name, key, time_col):
    timer = Timer()

    if isinstance(key, list):
        assert len(key) == 1
        key = key[0]

    tmp_u = u[[time_col, key]]
    timer.check("select")
    #tmp_u = pd.concat([tmp_u, v], keys=['u', 'v'], sort=False)
    #timer.check("concat")

    tmp_u = pd.concat([tmp_u, v], keys=['u', 'v'], sort=False)
    timer.check("concat")

    rehash_key = f'rehash_{key}'
    tmp_u[rehash_key] = tmp_u[key].apply(lambda x: hash(x) % CONSTANT.HASH_MAX)
    timer.check("rehash_key")

    tmp_u.sort_values(time_col, inplace=True)
    timer.check("sort")

    agg_funcs = {
        col: Config.aggregate_op(col)
        for col in v
        if col != key and not col.startswith(CONSTANT.TIME_PREFIX) and not col.
        startswith(CONSTANT.MULTI_CAT_PREFIX) and 'mul_feature_' not in col
    }

    tmp_u_2 = tmp_u

    ##---------------FillNA-----------------
    #tmp_u = tmp_u.groupby(rehash_key).rolling(5).agg(agg_funcs)
    # tmp_u_2 = tmp_u

    tmp_u = tmp_u.groupby(key).agg(agg_funcs)

    timer.check("group & rolling & agg")

    # tmp_u.reset_index(0, drop=True, inplace=True)  # drop rehash index
    timer.check("reset_index")

    # tmp_u.columns = tmp_u.columns.map(lambda a:
    #    f"{CONSTANT.NUMERICAL_PREFIX}{a[1].upper()}_ROLLING5({v_name}.{a[0]})")

    if tmp_u.empty:
        log("empty tmp_u, return u")
        return u
    # print(u.shape,tmp_u.loc['u'].shape,tmp_u_2.shape)
    # ret = pd.concat([u, tmp_u_2], axis=1, sort=False)
    # ret = pd.concat([u, tmp_u.loc['u'],tmp_u_2], axis=1, sort=False)

    # ret = pd.concat([u, tmp_u.loc['u']], axis=1, sort=False)
    timer.check("final concat")

    tmp_u.columns = tmp_u.columns.map(lambda a: f"{v_name}.{a})")

    tmpindex = u.index

    u["index"] = list(range(0, len(tmpindex)))

    ret = pd.merge(u, tmp_u, left_index=True, on=[key])

    ret.sort_values("index", inplace=True)

    # ret.index = ret["index"]
    ret.index = tmpindex
    ret.drop("index", axis=1, inplace=True)

    # u[key] = u[key].apply(int)
    # v[key] = v[key].apply(int)
    # #u = u.join(v,on=key)
    # u = u.merge(v)
    # print(u)
    del tmp_u

    return u
예제 #7
0
def temporal_join_jinnian(u, v, v_name, key, time_col, type_):
    if isinstance(key, list):
        assert len(key) == 1
        key = key[0]

    if type_.split("_")[2] == 'many':
        timer = Timer()

        tmp_u = u[[time_col, key]]
        timer.check("select")
        # tmp_u = pd.concat([tmp_u, v], keys=['u', 'v'], sort=False)
        # timer.check("concat")

        # tmp_u = pd.concat([tmp_u, v], keys=['u', 'v'], sort=False)
        for c in v.columns:
            if c != key and c.startswith(CONSTANT.CATEGORY_PREFIX):
                v[c] = v[c].apply(lambda x: int(x))
        tmp_u = pd.concat([tmp_u, v], sort=False)
        #tmp_u = v
        # print(tmp_u.index)
        # input()
        # print(tmp_u[key].nunique())
        # input()
        timer.check("concat")

        rehash_key = f'rehash_{key}'
        # tmp_u[rehash_key] = tmp_u[key].apply(lambda x: hash(x) % CONSTANT.HASH_MAX)
        timer.check("rehash_key")

        # tmp_u.sort_values(time_col, inplace=True)
        timer.check("sort")

        agg_funcs = {
            col: Config.aggregate_op(col)
            for col in v
            if col != key and not col.startswith(CONSTANT.TIME_PREFIX)
            and not col.startswith(CONSTANT.MULTI_CAT_PREFIX)
            and 'mul_feature_' not in col
        }
        tmp_u = tmp_u.fillna(0).groupby(key).agg(agg_funcs)
        '''agg_funcs_num = {col: Config.aggregate_op(col) for col in v if col != key
                     and col.startswith(CONSTANT.NUMERICAL_PREFIX)
                     }
        agg_funcs_cat = {col: Config.aggregate_op(col) for col in v if col != key
                     and col.startswith(CONSTANT.CATEGORY_PREFIX)
                     }
        num_features = [c for c in tmp_u.columns if c.startswith(CONSTANT.NUMERICAL_PREFIX)]
        if key not in num_features:
            num_features.append(key)
        cat_features = [c for c in tmp_u.columns if c.startswith(CONSTANT.CATEGORY_PREFIX)]
        if key not in cat_features:
            cat_features.append(key)
        #print(num_features)
        #print(cat_features)
        #input()
        tmp_u_num = tmp_u[num_features]
        tmp_u_cat = tmp_u[cat_features]'''
        ##---------------FillNA-----------------
        # tmp_u = tmp_u.groupby(rehash_key).rolling(5).agg(agg_funcs)
        ##tmp_u = tmp_u.fillna(0).groupby(rehash_key).rolling(5).agg(agg_funcs)
        '''if len(num_features) > 1:
            tmp_u_cat = tmp_u_cat.groupby(key).agg(agg_funcs_cat)
            tmp_u_cat.reset_index(0, inplace=True)

        if len()


        tmp_u_num = tmp_u_num.fillna(0).groupby(key).agg(agg_funcs_num)
        tmp_u_num.reset_index(0, inplace=True)
        print(tmp_u_cat.index)
        print(tmp_u_cat.columns)
        print(tmp_u_num.index)
        print(tmp_u_num.columns)
        input()
        tmp_u = pd.merge(tmp_u_cat, tmp_u_num, on=[key])'''

        timer.check("group & rolling & agg")

        # tmp_u.reset_index(0, drop=True, inplace=True)  # drop rehash index
        timer.check("reset_index")

        tmp_u.columns = tmp_u.columns.map(
            lambda a:
            f"{CONSTANT.NUMERICAL_PREFIX}{a[1].upper()}_ROLLING5({v_name}.{a[0]})"
        )

        # new_columns = []
        # for a in tmp_u.columns:
        #     if "collect_list" == a[1]:
        #         new_columns.append(f"{'mul_'}{a[1].upper()}_ROLLING5({v_name}.{a[0]})")
        #     else:
        #         new_columns.append(f"{CONSTANT.NUMERICAL_PREFIX}{a[1].upper()}_ROLLING5({v_name}.{a[0]})")
        #
        # tmp_u.columns = new_columns
        # print(tmp_u.columns)

        ##-------------remove duplicated rolling columns---------------
        c_tmp = None
        count = 0
        for c in tmp_u.columns:
            if 'COUNT_ROLLING5' in c:
                if c_tmp is None:
                    c_tmp = tmp_u[c]
                else:
                    tmp_u.drop(c, axis=1, inplace=True)
                    count += 1
        print("There are %d duplicated columns in temporal join!" % count)

        # print(tmp_u.columns)
        # input()

        ##------------check whether all n_COUNT_ROLLING_X are the same---------------
        '''all_columns = tmp_u.columns
        print(all_columns)
        c_tmp = None
        for c in all_columns:
            if 'COUNT_ROLLING5' in c:
                if c_tmp is None:
                    c_tmp = tmp_u[c]
                else:
                    print(c)
                    print([(tmp_u[c]-c_tmp).max(), (tmp_u[c]-c_tmp).min()])
        input()'''

        # print(tmp_u.columns)
        tmp_u.reset_index(0, inplace=True)

        ##-------------check NAN after aggregation functions----------
        '''print(tmp_u.columns)
        #print(tmp_u["n_COUNT_ROLLING5(table_1.c_1)"])
        for c in tmp_u.columns:
            print(c)
            print(tmp_u[c].loc['u'].shape[0])
            print(np.sum(np.isnan(tmp_u[c]).loc['u']))
            print(tmp_u[c].loc['u'])
        #print(tmp_u['n_MEAN_ROLLING5(table_1.n_1)'])
        input()'''

        if tmp_u.empty:
            log("empty tmp_u, return u")
            return u
        # print(u.shape,tmp_u.loc['u'].shape,tmp_u_2.shape)
        # ret = pd.concat([u, tmp_u.loc['u'],#tmp_u_2], axis=1, sort=False)
        # ret = pd.concat([u, tmp_u.loc['u']], axis=1, sort=False)
        index_tmp = u.index
        u["index"] = list(range(0, len(index_tmp)))
        ret = pd.merge(u, tmp_u, on=[key])
        ret.sort_values("index", inplace=True)
        ret.index = index_tmp
        ret.drop("index", axis=1, inplace=True)

        timer.check("final concat")

        del tmp_u

        return ret

    else:
        ###------------ Multi-cat features will be processed in the main function --------##
        for c in [c for c in v if c.startswith(CONSTANT.MULTI_CAT_PREFIX)]:
            v[c].fillna("0", inplace=True)
            v["mul_feature_" + c] = v[c].apply(lambda x: str(x).split(","))
            # v["mul_feature_" + c] = v[c].str.split(",")
            # print(v["mul_feature_" + c])
            # v["mul_feature_" + c] = v[c]
            v[c] = v["mul_feature_" + c].apply(lambda x: int(x[0]))
        '''
        for c in [c for c in v if c.startswith(CONSTANT.MULTI_CAT_PREFIX)]:
            v[c].fillna("0",inplace=True)
            mul_features = get_tfidf_vector(v[c], c)
            v.drop(c, axis=1, inplace=True)
            v = pd.concat([v, mul_features], axis=1)
            '''
        # tmp_u = u[[time_col, key]]
        # tmp_u = pd.concat([tmp_u, v], keys=['u', 'v'], sort=False)
        # print(tmp_u[key].nunique())
        # input()
        # print(u.dtypes)
        # input()
        # u[key] = u[key].astype('int64')
        v = v.set_index(key)
        v.columns = v.columns.map(
            lambda a: f"{a.split('_', 1)[0]}_{v_name}.{a}")

        return u.join(v, on=key)
예제 #8
0
def join(u, v, v_name, key, type_):
    if isinstance(key, list):
        assert len(key) == 1
        key = key[0]

    if type_.split("_")[2] == 'many':
        v_features = v.columns
        for c in v_features:
            if c != key and c.startswith(CONSTANT.CATEGORY_PREFIX):
                v[c] = v[c].apply(lambda x: int(x))
        agg_funcs = {
            col: Config.aggregate_op(col)
            for col in v
            if col != key and not col.startswith(CONSTANT.TIME_PREFIX)
            and not col.startswith(CONSTANT.MULTI_CAT_PREFIX)
            and 'mul_feature_' not in col
        }

        v = v.fillna(0).groupby(key).agg(agg_funcs)
        v.columns = v.columns.map(
            lambda a:
            f"{CONSTANT.NUMERICAL_PREFIX}{a[1].upper()}_ROLLING5({v_name}.{a[0]})"
        )

        ## --------------- remove duplicated rolling columns ------------
        c_tmp = None
        count = 0
        for c in v.columns:
            if 'COUNT_ROLLING5' in c:
                if c_tmp is None:
                    c_tmp = v[c]
                else:
                    v.drop(c, axis=1, inplace=True)
                    count += 1
        print("There are %d duplicated columns in temporal join!" % count)

        v.reset_index(0, inplace=True)
        v = v.set_index(key)
    else:
        # for c in [c for c in v if c.startswith(CONSTANT.CATEGORY_PREFIX) and "c_0"]:
        #
        #     v[c] = v[c].apply(lambda x: int(x))

        ###-------------- Multi-cat features will be processed in the main function--------###
        for c in [c for c in v if c.startswith(CONSTANT.MULTI_CAT_PREFIX)]:
            v[c].fillna("0", inplace=True)
            v["mul_feature_" + c] = v[c].apply(lambda x: str(x).split(","))
            #v["mul_feature_" + c] = v[c].str.split(",")
            #print(v["mul_feature_" + c])
            #v["mul_feature_" + c] = v[c]
            v[c] = v["mul_feature_" + c].apply(lambda x: int(x[0]))
        '''
        for c in [c for c in v if c.startswith(CONSTANT.MULTI_CAT_PREFIX)]:
            v[c].fillna("0",inplace=True)
            mul_features = get_tfidf_vector(v[c], c)
            v.drop(c, axis=1, inplace=True)
            v = pd.concat([v, mul_features], axis=1)
            '''
        #v["mul_feature_" + c] = v[c].parallel_apply(lambda x:str(x).split(","))
        #v["mul_feature_" + c] = v[c].apply(lambda x:str(x).split(","))
        #v["mul_feature_" + c] = v[c].str.split(",")
        #print(v["mul_feature_" + c])
        #v["mul_feature_" + c] = v[c]
        #v[c] = v["mul_feature_" + c].parallel_apply(lambda x: int(x[0]))
        #v[c] = v["mul_feature_" + c].apply(lambda x: int(x[0]))
        #v[key].astype(str, copy=True)
        v = v.set_index(key)
        v.columns = v.columns.map(
            lambda a: f"{a.split('_', 1)[0]}_{v_name}.{a}")
    #u[key].astype(str,copy=True)
    '''print(u.dtypes)
    print("*"*50)
    print(v.dtypes)
    print(v.index)
    input()'''

    return u.join(v, on=key)