Example #1
def extract_feature_hour(data, feature, ans):

    target = data.copy()

    t = feature.groupby(['uid'])['hour'].agg(
        ['mean', 'std']).reset_index().rename(columns={
            'mean': 'uid_invite_hour_mean',
            'std': 'uid_invite_hour_std'
        })
    s_feature = feature[feature['label'] == 1]
    target = pd.merge(target, t, how='left', on='uid')
    t = s_feature.groupby(['uid'])['hour'].agg(
        ['mean', 'std']).reset_index().rename(columns={
            'mean': 'uid_invite_ans_hour_mean',
            'std': 'uid_invite_ans_hour_std'
        })
    target = pd.merge(target, t, how='left', on='uid')

    t = ans.groupby(['uid'])['hour'].agg(
        ['mean', 'std']).reset_index().rename(columns={
            'mean': 'uid_ans_hour_mean',
            'std': 'uid_ans_hour_std'
        })
    target = pd.merge(target, t, how='left', on='uid')

    return target
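A minimal usage sketch (not from the original source): the function assumes pandas imported as pd, a uid column on all three frames, hour and a 0/1 label on feature, and hour on ans. The toy frames below are illustrative only.

import pandas as pd

data = pd.DataFrame({'uid': [1, 2]})
feature = pd.DataFrame({'uid': [1, 1, 2],
                        'hour': [8, 20, 9],
                        'label': [1, 0, 1]})
ans = pd.DataFrame({'uid': [1, 2], 'hour': [10, 23]})

out = extract_feature_hour(data, feature, ans)
# adds uid_invite_hour_mean/std, uid_invite_ans_hour_mean/std and
# uid_ans_hour_mean/std to a copy of data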
Example #2
def extract_uid_seq_more(target):
    target = target.reset_index(drop=True)
    target['ii'] = target.index
    target['i_time'] = target['day'] + 0.04166 * target['hour']

    ds = target.sort_values(by=['uid', 'i_time'], axis=0,
                            ascending=True).reset_index(drop=True)
    ds['slabel'] = ds['m_label'].astype(str)
    t = ds.groupby('uid')['slabel'].apply(lambda x: '-' + ''.join(x.tolist(
    ))).reset_index().rename(columns={'slabel': 'uid_m_seq'})
    t1 = ds.groupby('uid')['label'].apply(
        lambda x: [i for i in range(len(x.tolist()))]).reset_index().rename(
            columns={'label': 'uid_rank'})
    t_rank = pd.DataFrame({
        'uid': t1.uid.repeat(t1.uid_rank.str.len()),
        'uid_rank': np.concatenate(t1.uid_rank.values)
    }).reset_index(drop=True)

    ds = pd.merge(ds, t, how='left', on='uid').reset_index(drop=True)
    ds = pd.concat([ds, t_rank], axis=1)
    # build each row's visit-sequence prefix: the labels of the user's earlier invites
    ds['uid_mm_seq'] = ds.apply(lambda x: '-' +
                                (x['uid_m_seq'])[:x['uid_rank']],
                                axis=1)
    target = pd.merge(target, ds[['ii', 'uid_mm_seq']], how='left', on='ii')

    return target
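Two constructions above have simpler built-in equivalents; a sketch, assuming the same ds and t1 as in the function:

# rank of each invite within its user's history, replacing the
# range-list / repeat / concatenate detour
ds['uid_rank'] = ds.groupby('uid').cumcount()

# the same expansion of t1, available since pandas 0.25
t_rank = t1.explode('uid_rank').reset_index(drop=True)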
Example #3
def extract_feature_ans_dif(data, feature, ans_feature):
    target = data.copy()
    print_time('extract feature ans dif')

    que = pd.read_csv('../datasets/question_info.csv',
                      usecols=['qid', 'topic_id'])
    ans_feature = pd.merge(ans_feature, que, how='left', on='qid').fillna(0)
    target = pd.merge(target, que, how='left', on='qid').fillna(0)
    ans_feature[
        'a_time'] = ans_feature['a_day'] + 0.04166 * ans_feature['a_hour']
    target['i_time'] = target['day'] + 0.04166 * target['hour']

    total_extend = ans_feature['topic_id'].str.split(',', expand=True).stack() \
        .reset_index(level=0).set_index('level_0') \
        .rename(columns={0: 'topic'}).join(ans_feature.drop('topic_id', axis=1)) \
        .reset_index(drop=True)

    t = total_extend.groupby(['uid', 'topic'])['a_time'].agg([
        'max'
    ]).reset_index().rename(columns={'max': 'uid_topic_ans_recent_time'})

    topic_df = target['topic_id'].str.split(',', expand=True)
    topic_df = topic_df.fillna(0)
    target = pd.concat([target, topic_df], axis=1)
    fea_name = 'uid_topic_ans_recent_time'
    tmp_name = []
    result_list = []
    for field in [0, 1, 2, 3, 4, 5]:
        target = pd.merge(target,
                          t,
                          how='left',
                          left_on=['uid', field],
                          right_on=['uid', 'topic'
                                    ]).rename(columns={
                                        fea_name: fea_name + str(field)
                                    }).fillna(1000)
        target['s' +
               str(field)] = target['i_time'] - target[fea_name + str(field)]
        tmp_name.append('s' + str(field))

    # aggregate across the six per-topic gap columns
    target[fea_name + '_mean'] = target[tmp_name].mean(axis=1)
    target[fea_name + '_min'] = target[tmp_name].min(axis=1)
    target[fea_name + '_max'] = target[tmp_name].max(axis=1)
    result_list.append(fea_name + '_min')
    result_list.append(fea_name + '_mean')
    result_list.append(fea_name + '_max')

    return target[result_list]
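The str.split/stack/join block above (it recurs in Examples #11, #13 and #18) predates DataFrame.explode; a sketch of the equivalent since pandas 0.25:

total_extend = (ans_feature
                .assign(topic=ans_feature['topic_id'].str.split(','))
                .drop('topic_id', axis=1)
                .explode('topic')
                .reset_index(drop=True))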
Example #4
def createAuthorsTable(noDuplicates, stars_list, avatars, bios, nFollowers,
                       authors_loves, authors_views, authors_ranking,
                       finalAuthorsName):
    import modin.pandas as pd

    data = {
        'Authors': noDuplicates,
        'Stars': stars_list,
        'Has Avatar': avatars,
        'Has Bio': bios,
        'Followers': nFollowers,
        'Tot loves': authors_loves,
        'Tot views': authors_views,
        'author_ranking': authors_ranking
    }
    dataAuthors = {'Authors': finalAuthorsName}

    dfAuthors = pd.DataFrame(dataAuthors, columns=['Authors'])
    df = pd.DataFrame(data,
                      columns=[
                          'Authors', 'Stars', 'Has Avatar', 'Has Bio',
                          'Followers', 'Tot loves', 'Tot views',
                          'author_ranking'
                      ])
    mergedAuthors = pd.merge(dfAuthors, df, on='Authors')

    return mergedAuthors
Example #5
def createTableWithAuthorsPanels(authorsTable, mergedTable):
    import modin.pandas as pd

    tableWithAuthorsPanels = pd.merge(mergedTable,
                                      authorsTable,
                                      on='panel_author')
    tableWithAuthorsPanels = tableWithAuthorsPanels.drop_duplicates(
        subset="id_panel")
    return tableWithAuthorsPanels
Example #6
def mergeTime(idProjects, mergedTable, time):
    import modin.pandas as pd

    print('mergeTime')
    data = {'id_prog': idProjects, 'time': time}

    df = pd.DataFrame(data, columns=['id_prog', 'time'])
    mergedTable = pd.merge(df, mergedTable, on='id_prog')

    return mergedTable
Example #7
def merge_func(df1, t, fea, fea_name):
    # t, fea and fea_name are free variables in the original snippet; they are
    # made explicit parameters here so the fragment is self-contained
    for field in [0, 1, 2, 3, 4, 5, 6, 7, 8]:
        df1 = pd.merge(df1,
                       t,
                       how='left',
                       left_on=[fea, field],
                       right_on=[
                           fea, 'word'
                       ]).rename(columns={fea_name: fea_name + str(field)})
    return df1
Example #8
def mergePanelsFeature(tableWithAuthorsPanels):
    import modin.pandas as pd

    projectTable = pd.read_excel("..\\data\\TabellaProgettiPanelJam.xlsx")

    Table = pd.merge(tableWithAuthorsPanels,
                     projectTable,
                     left_on='id_prog',
                     right_on='project')
    Table = Table.drop(columns=['project', 'Remixed', 'Time', 'Project depth'])

    return Table
Example #9
def extract_usr_unique(data, feature, ans_feature):

    target = data.copy()
    que = pd.read_csv('../datasets/question_info.csv',
                      usecols=['qid', 'topic_id'])
    ans = ans_feature[['uid', 'qid']]
    ans = pd.merge(ans, que, how='left', on='qid').fillna('0')
    feature = pd.merge(feature, que, how='left', on='qid').fillna('0')
    ## count the distinct topics among the questions a user was invited to
    t = feature.groupby(['uid'])['topic_id'].apply(lambda x: len(
        set(','.join(x.tolist()).split(',')))).reset_index().rename(
            columns={'topic_id': 'sw_uid_invite_topic_unique'})
    target = pd.merge(target, t, how='left', on='uid')

    t = ans.groupby(['uid'])['topic_id'].apply(lambda x: len(
        set(','.join(x.tolist()).split(',')))).reset_index().rename(
            columns={'topic_id': 'sw_uid_ans_topic_unique'})
    target = pd.merge(target, t, how='left', on='uid')

    fealist = ['sw_uid_invite_topic_unique', 'sw_uid_ans_topic_unique']
    return target[fealist]
Example #10
def extract_feature_smilar(data, feature, ans):
    target = data.copy()
    print_time('extract feature smilar')
    uid_qid_list = ans.groupby(['uid'])['qid'] \
        .apply(lambda x: ','.join(x.tolist())).reset_index().rename(columns={'qid': 'uid_qid_list'})

    target = pd.merge(target, uid_qid_list, how='left', on='uid').fillna(0)

    print_time('start')

    # tmp_func and multiprocessing_apply_data_frame are defined elsewhere in the source
    target = multiprocessing_apply_data_frame(tmp_func, target, 10)
    print_time('end')

    return target
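multiprocessing_apply_data_frame is not shown in the source; a sketch of what such a helper commonly looks like, with the signature inferred from the call above (treat it as an assumption):

import multiprocessing

import numpy as np
import pandas as pd


def multiprocessing_apply_data_frame(func, df, n_workers):
    # split the frame into n_workers chunks, process each chunk in its own
    # worker, then stitch the partial results back together in order
    chunks = np.array_split(df, n_workers)
    with multiprocessing.Pool(n_workers) as pool:
        results = pool.map(func, chunks)
    return pd.concat(results).reset_index(drop=True)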
Example #11
def extract_topic_whole_count(data, feature):
    print('extract_feature')

    target = data.copy()

    que = pd.read_csv('../datasets/question_info.csv',
                      usecols=['qid', 'topic_id'])
    target = pd.merge(target, que, how='left', on='qid').fillna('0')
    feature = pd.merge(feature, que, how='left', on='qid').fillna('0')

    print_time('extend')

    total_extend = feature['topic_id'].str.split(',', expand=True).stack() \
        .reset_index(level=0).set_index('level_0') \
        .rename(columns={0: 'topic'}).join(feature.drop('topic_id', axis=1)) \
        .reset_index(drop=True)

    topic_df = target['topic_id'].str.split(',', expand=True)
    target = pd.concat([target, topic_df], axis=1)

    print_time('extend_finish')

    stat_feat = [
        (['topic'], ['label'], ['count']),
        (['topic'], ['qid'], ['nunique']),
        (['topic'], ['uid'], ['nunique']),
        (['topic', 'day', 'hour'], ['uid'], ['nunique']),
    ]
    final_list = []

    for stat in stat_feat:
        fea_name = '_'.join(stat[0]) + '_' + '_'.join(
            stat[1]) + '_' + '_'.join(stat[2])
        print('extract', fea_name)
        t = total_extend.groupby(stat[0])[stat[1][0]].agg(stat[2]).reset_index() \
            .rename(columns={stat[2][0]: fea_name})
        t.loc[t['topic'] == '0', fea_name] = 0
        tmp_name = []
        for field in [0, 1, 2, 3, 4]:
            lefton = []
            for i in stat[0]:
                if i == 'topic':
                    lefton.append(field)
                else:
                    lefton.append(i)
            target = pd.merge(target,
                              t,
                              how='left',
                              left_on=lefton,
                              right_on=stat[0]).rename(
                                  columns={fea_name: fea_name + str(field)})
            tmp_name.append(fea_name + str(field))

        target[fea_name + '_max'] = target[tmp_name].max(axis=1)
        target[fea_name + '_sum'] = target[tmp_name].sum(axis=1)
        final_list.append(fea_name + '_max')
        final_list.append(fea_name + '_sum')

        for field in [0, 1, 2, 3, 4]:
            target = target.drop([fea_name + str(field)], axis=1)

    return target[final_list], final_list
Example #12
        return pd.Series({"x_cross":np.mean(x['x']),'y_cross':np.mean(x['y'])})
    junT1 = junG.groupby("id_jun").apply(clampF).reset_index()
    junT = gpd.GeoDataFrame.from_file(baseDir + "gis/motorway/de_junct_unique.shp")

    junT.to_file(baseDir + "gis/motorway/de_junct_unique.shp")

if False:
    nodB = gpd.GeoDataFrame.from_file(baseDir + "gis/destatis/junct_bundesland.shp")
    nodc = nodB[['id_jun','GEN']].groupby('GEN').agg(len).reset_index()

if False:
    junT = pd.read_csv(baseDir + "gis/motorway/de_junct_unique.csv")
    nodJ = gpd.GeoDataFrame.from_file(baseDir + "gis/motorway/motorway_link_nodes.shp")
    nodJ.columns = ['id_jun', 'y', 'node_id', 'x', 'geometry']
    nodJ = pd.merge(nodJ, junT, on="id_jun", how="left")
    nodJ.to_file(baseDir + "gis/motorway/motorway_link_nodes.shp")
    
if False:
    motL = motG.geometry.unary_union
    if True:
        nodS = nodJ.copy()
    else:
        nodS = nodJ.loc[nodJ['id_jun'] == "A 4-53"]
    print('--------------------projection-on-the-motorway----------------------')
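    # nearest point on the motorway: project() returns the distance along the
    # line and interpolate() maps that distance back to a point geometry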
    neip = nodS.apply(lambda x: motL.interpolate(motL.project(x['geometry'])),axis=1)
    nodS.loc[:,"x_mot"] = [x.xy[0][0] for x in neip]
    nodS.loc[:,"y_mot"] = [x.xy[1][0] for x in neip]
    nodS.loc[:,"m_dist"] = nodS.apply(lambda x: (x['x']-x['x_mot'])**2 + (x['y']-x['y_mot'])**2,axis=1)
    def chirality(x1,y1,x2,y2,xo,yo):
        vp = [x1 - xo,y1 - yo]
Example #13
def extruct_cross_feature_topic(data, feature):
    target = data.copy()

    print_time('extract cross feature topic')
    que = pd.read_csv('../datasets/question_info.csv',
                      usecols=['qid', 'topic_id'])
    member_info = pd.read_csv(
        '../datasets/member_info.csv',
        usecols=['uid', 'sex', 'visit', 'CA', 'CB', 'CC', 'CD', 'CE'])

    feature = pd.merge(feature, que, how='left', on='qid').fillna('0')
    feature = pd.merge(feature, member_info, how='left', on='uid').fillna('0')

    target = pd.merge(target, que, how='left', on='qid').fillna('0')
    target = pd.merge(target, member_info, how='left', on='uid').fillna('0')

    target['flag'] = 1
    feature['flag'] = 1

    total_extend = feature['topic_id'].str.split(',', expand=True).stack() \
        .reset_index(level=0).set_index('level_0') \
        .rename(columns={0: 'topic'}).join(feature.drop('topic_id', axis=1)) \
        .reset_index(drop=True)

    topic_df = target['topic_id'].str.split(',', expand=True)
    target = pd.concat([target, topic_df], axis=1)

    fea_list = ['flag', 'uid', 'sex', 'visit', 'CA', 'CB', 'CC', 'CD', 'CE']

    result_list = []
    for fea in fea_list:
        fea_name = 'topic_' + fea + '_rate'
        print(fea_name)
        t = total_extend.groupby(['topic', fea])['label'].agg(['count','sum']).reset_index() \
            .rename(columns={'count':'count_s','sum':'sum_s'})

        HP = HyperParam(1, 1)
        HP.update_from_data_by_moment(t['count_s'].values,
                                      t['sum_s'].values)  # moment estimation
        t[fea_name] = np.divide(t['sum_s'] + HP.alpha,
                                t['count_s'] + HP.alpha + HP.beta)
        t = t.drop(['count_s', 'sum_s'], axis=1)

        tmp_name = []
        for field in [0, 1, 2, 3, 4, 5]:
            target = pd.merge(
                target,
                t,
                how='left',
                left_on=[fea, field],
                right_on=[fea, 'topic'
                          ]).rename(columns={fea_name: fea_name + str(field)})
            tmp_name.append(fea_name + str(field))

        target[fea_name + '_max'] = target[tmp_name].max(axis=1)
        target[fea_name + '_mean'] = target[tmp_name].mean(axis=1)
        result_list.append(fea_name + '_max')
        result_list.append(fea_name + '_mean')

        for field in [0, 1, 2, 3, 4, 5]:
            target = target.drop([fea_name + str(field)], axis=1)

    return target[result_list]
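HyperParam is external to these snippets; it is the usual Beta-Binomial smoothing helper from CTR feature engineering. A sketch of the moment-matching method the code relies on (an assumption, reconstructed from how alpha and beta are consumed above):

import numpy as np


class HyperParam:
    """Beta(alpha, beta) prior used to smooth success/total ratios."""

    def __init__(self, alpha, beta):
        self.alpha = alpha
        self.beta = beta

    def update_from_data_by_moment(self, tries, success):
        # method of moments: match the Beta mean and variance to the observed
        # per-group ratios success/tries, then back out alpha and beta
        ratios = success / np.maximum(tries, 1)
        mean, var = ratios.mean(), ratios.var()
        if var == 0:
            return
        common = mean * (1 - mean) / var - 1
        self.alpha = mean * common
        self.beta = (1 - mean) * common

The smoothed rate is then (sum + alpha) / (count + alpha + beta), exactly the np.divide expression used after each fit.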
Example #14
def extruct_feature(data, feature, ans_feature):
    target = data.copy()
    print_time('extract feature ')
    # per-uid label statistics
    t = feature.groupby('uid')['label'].agg(['count','sum','mean','std']).reset_index()\
        .rename(columns={'count':'sw_' + 'ulc','sum':'sw_' + 'uls', 'mean':'sw_' + 'ulm','std':'sw_' + 'uld'})

    print_time('smoothing')
    HP = HyperParam(1, 1)
    HP.update_from_data_by_moment(t['sw_ulc'].values,
                                  t['sw_uls'].values)  # moment estimation
    t['sw_uid_rate_hp'] = np.divide(t['sw_uls'] + HP.alpha,
                                    t['sw_ulc'] + HP.alpha + HP.beta)
    print('smoothing', HP.alpha, HP.beta)

    target = pd.merge(target, t, how='left', on='uid')

    # per-qid label statistics
    t = feature.groupby('qid')['label'].agg(['count', 'sum', 'mean', 'std']).reset_index() \
        .rename(columns={'count': 'sw_' + 'qlc', 'sum': 'sw_' + 'qls', 'mean': 'sw_' + 'qlm', 'std': 'sw_' + 'qld'})

    HP = HyperParam(1, 1)
    HP.update_from_data_by_moment(t['sw_qlc'].values,
                                  t['sw_qls'].values)  # moment estimation
    t['sw_qid_rate_hp'] = np.divide(t['sw_qls'] + HP.alpha,
                                    t['sw_qlc'] + HP.alpha + HP.beta)
    print('smoothing', HP.alpha, HP.beta)
    target = pd.merge(target, t, how='left', on='qid')

    # answer-info statistics
    gu = ans_feature.groupby('uid')
    t = gu['qid'].agg([
        'count'
    ]).reset_index().rename(columns={'count': 'sw_' + 'u_ans_q_num'})
    target = pd.merge(target, t, how='left', on='uid')

    for feat in [
            'bit7', 'bit8', 'bit10', 'bit11', 'bit12', 'bit13', 'bit15',
            'bit16', 'bit17'
    ]:
        t = gu[feat].agg(['sum',
                          'mean']).reset_index().rename(columns={
                              'sum': 'sw_uc_' + feat,
                              'mean': 'sw_um_' + feat
                          })
        target = pd.merge(target, t, how='left', on='uid')

    feature['i_time'] = feature['day'] * 24 + feature['hour']
    feature_sorted = feature.sort_values(by=['i_time'], axis=0,
                                         ascending=True).reset_index(drop=True)
    feature_sorted['slabel'] = feature_sorted['label'].astype(str)
    t = feature_sorted.groupby('uid')['slabel'].apply(lambda x: '-' + ''.join(
        x.tolist())).reset_index().rename(columns={'slabel': 'sw_uid_seq'})
    t['sw_uid_seq_5'] = t['sw_uid_seq'].apply(lambda x: x[-5:])
    t['sw_uid_recent_uclick'] = t['sw_uid_seq'].apply(
        lambda x: len(x) - 1 - x.rfind('1') if x.rfind('1') != -1 else len(x))
    target = pd.merge(target, t, how='left', on='uid')

    t = feature_sorted.groupby('qid')['slabel'].apply(lambda x: '-' + ''.join(
        x.tolist())).reset_index().rename(columns={'slabel': 'sw_qid_seq'})
    t['sw_qid_seq_5'] = t['sw_qid_seq'].apply(lambda x: x[-5:])
    t['sw_qid_recent_uclick'] = t['sw_qid_seq'].apply(
        lambda x: len(x) - 1 - x.rfind('1') if x.rfind('1') != -1 else len(x))
    target = pd.merge(target, t, how='left', on='qid')

    ans_feature[
        'ans_time'] = ans_feature['a_day'] + 0.04166 * ans_feature['a_hour']
    question_info = pd.read_csv('../datasets2/question_info.csv',
                                usecols=['qid', 'q_day', 'q_hour'])
    question_info['qus_time'] = question_info[
        'q_day'] + 0.04166 * question_info['q_hour']
    ans_feature = pd.merge(ans_feature, question_info, how='left', on='qid')
    print(ans_feature['qus_time'].isnull().sum(), ans_feature.shape)
    ans_feature['qus_time'] = ans_feature['qus_time'].fillna(3000)
    ans_feature[
        'ans_dif_time'] = ans_feature['ans_time'] - ans_feature['qus_time']
    gu = ans_feature.groupby('uid')['ans_dif_time'].agg(['mean','max','min','std']).reset_index()\
        .rename(columns={'mean': 'at_mean','max':'at_max','min':'at_min','std':'at_std'})
    target = pd.merge(target, gu, how='left', on='uid')

    ans_feature['week'] = ans_feature['a_day'] % 7
    ans_feature['new_hour'] = ans_feature['a_hour'].apply(lambda x: int(x / 6))
    target['week'] = target['day'] % 7
    target['new_hour'] = target['hour'].apply(lambda x: int(x / 6))
    t = ans_feature.groupby(['uid'])['qid'].agg(
        ['count']).reset_index().rename(columns={'count': 'u_t_count'})
    t1 = ans_feature.groupby(['uid', 'week'])['qid'].agg(
        ['count']).reset_index().rename(columns={'count': 'uid_week_count'})
    target = pd.merge(target, t, how='left', on='uid').fillna(0)
    target = pd.merge(target, t1, how='left', on=['uid', 'week']).fillna(0)
    print(target.columns.tolist())

    target['uid_week_ans_rate'] = np.divide(target['uid_week_count'],
                                            target['u_t_count'] + 0.001)
    t1 = ans_feature.groupby(['uid', 'new_hour'])['qid'].agg(
        ['count']).reset_index().rename(columns={'count': 'uid_nhour_count'})
    target = pd.merge(target, t1, how='left', on=['uid', 'new_hour'])
    target['uid_nhour_ans_rate'] = np.divide(target['uid_nhour_count'],
                                             target['u_t_count'] + 0.001)

    return target
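The sw_*_recent_uclick expression above is dense; a worked example with an illustrative sequence string:

x = '-0101'                 # '-' sentinel plus the time-ordered label history
x.rfind('1')                # 4: position of the most recent positive label
len(x) - 1 - x.rfind('1')   # 0: the latest event was an accepted invite
'-000'.rfind('1')           # -1: never accepted, so the feature is len('-000') == 4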
Example #15
def extract_id_whole_count(data, feature):
    target = data.copy()
    fea_list = []
    feaname = 'uid_min_day'
    fea_list.append(feaname)
    print('extract', feaname)
    t = feature.groupby(['uid'])['day'].agg(
        ['min']).reset_index().rename(columns={'min': feaname})
    target = pd.merge(target, t, how='left', on='uid')
    #
    feaname = 'uid_count'
    fea_list.append(feaname)
    print('extract', feaname)
    t = feature.groupby(['uid'])['label'].agg(
        ['count']).reset_index().rename(columns={'count': feaname})
    target = pd.merge(target, t, how='left', on='uid')
    #
    feaname = 'uid_day_count'
    fea_list.append(feaname)
    print('extract', feaname)
    t = feature.groupby(['uid', 'day'])['label'].agg(
        ['count']).reset_index().rename(columns={'count': feaname})
    target = pd.merge(target, t, how='left', on=['uid', 'day'])
    #
    feaname = 'uid_day_hour_count'
    fea_list.append(feaname)
    print('extract', feaname)
    t = feature.groupby(['uid', 'day', 'hour'])['label'].agg(
        ['count']).reset_index().rename(columns={'count': feaname})
    target = pd.merge(target, t, how='left', on=['uid', 'day', 'hour'])
    #
    feaname = 'uid_min_day_count'
    fea_list.append(feaname)
    print('extract', feaname)
    t = feature.groupby(['uid', 'day'])['label'].agg(
        ['count']).reset_index().rename(columns={
            'count': feaname,
            'day': 'uid_min_day'
        })
    target = pd.merge(target, t, how='left', on=['uid', 'uid_min_day'])

    feaname = 'uid_day_nunique'
    fea_list.append(feaname)
    print('extract', feaname)
    t = feature.groupby(['uid'])['day'].agg(
        ['nunique']).reset_index().rename(columns={'nunique': feaname})
    target = pd.merge(target, t, how='left', on=['uid'])

    feaname = 'qid_day_nunique'
    fea_list.append(feaname)
    print('extract', feaname)
    t = feature.groupby(['qid'])['day'].agg(
        ['nunique']).reset_index().rename(columns={'nunique': feaname})
    target = pd.merge(target, t, how='left', on=['qid'])

    print_time(target.columns.tolist())

    feaname = 'qid_min_day'
    fea_list.append(feaname)
    print('extract', feaname)
    t = feature.groupby(['qid'])['day'].agg(
        ['min']).reset_index().rename(columns={'min': feaname})
    target = pd.merge(target, t, how='left', on='qid')

    print_time(target.columns.tolist())

    feaname = 'qid_count'
    fea_list.append(feaname)
    print('extract', feaname)
    t = feature.groupby(['qid'])['label'].agg(
        ['count']).reset_index().rename(columns={'count': feaname})
    target = pd.merge(target, t, how='left', on='qid')

    feaname = 'qid_day_count'
    fea_list.append(feaname)
    print('extract', feaname)
    t = feature.groupby(['qid', 'day'])['label'].agg(
        ['count']).reset_index().rename(columns={'count': feaname})
    target = pd.merge(target, t, how='left', on=['qid', 'day'])

    feaname = 'qid_day_hour_count'
    fea_list.append(feaname)
    print('extract', feaname)
    t = feature.groupby(['qid', 'day', 'hour'])['label'].agg(
        ['count']).reset_index().rename(columns={'count': feaname})
    target = pd.merge(target, t, how='left', on=['qid', 'day', 'hour'])

    feaname = 'qid_min_day_count'
    fea_list.append(feaname)
    print('extract', feaname)
    t = feature.groupby(['qid', 'day'])['label'].agg(
        ['count']).reset_index().rename(columns={
            'count': feaname,
            'day': 'qid_min_day'
        })
    target = pd.merge(target, t, how='left', on=['qid', 'qid_min_day'])

    return target[fea_list], fea_list
Example #16
def test_merge():
    frame_data = {
        "col1": [0, 1, 2, 3],
        "col2": [4, 5, 6, 7],
        "col3": [8, 9, 0, 1],
        "col4": [2, 4, 5, 6],
    }

    modin_df = pd.DataFrame(frame_data)
    pandas_df = pandas.DataFrame(frame_data)

    frame_data2 = {"col1": [0, 1, 2], "col2": [1, 5, 6]}
    modin_df2 = pd.DataFrame(frame_data2)
    pandas_df2 = pandas.DataFrame(frame_data2)

    join_types = ["outer", "inner"]
    for how in join_types:
        ctx = (warns_that_defaulting_to_pandas()
               if how == "outer" else contextlib.nullcontext())
        with ctx:
            modin_result = pd.merge(modin_df, modin_df2, how=how)
        pandas_result = pandas.merge(pandas_df, pandas_df2, how=how)
        df_equals(modin_result, pandas_result)

        # left_on and right_index
        with warns_that_defaulting_to_pandas():
            modin_result = pd.merge(modin_df,
                                    modin_df2,
                                    how=how,
                                    left_on="col1",
                                    right_index=True)
        pandas_result = pandas.merge(pandas_df,
                                     pandas_df2,
                                     how=how,
                                     left_on="col1",
                                     right_index=True)
        df_equals(modin_result, pandas_result)

        # left_index and right_on
        with warns_that_defaulting_to_pandas():
            modin_result = pd.merge(modin_df,
                                    modin_df2,
                                    how=how,
                                    left_index=True,
                                    right_on="col1")
        pandas_result = pandas.merge(pandas_df,
                                     pandas_df2,
                                     how=how,
                                     left_index=True,
                                     right_on="col1")
        df_equals(modin_result, pandas_result)

        # left_on and right_on col1
        if how == "outer":
            warning_catcher = warns_that_defaulting_to_pandas()
        else:
            warning_catcher = contextlib.nullcontext()
        with warning_catcher:
            modin_result = pd.merge(modin_df,
                                    modin_df2,
                                    how=how,
                                    left_on="col1",
                                    right_on="col1")
        pandas_result = pandas.merge(pandas_df,
                                     pandas_df2,
                                     how=how,
                                     left_on="col1",
                                     right_on="col1")
        df_equals(modin_result, pandas_result)

        # left_on and right_on col2
        if how == "outer":
            warning_catcher = warns_that_defaulting_to_pandas()
        else:
            warning_catcher = contextlib.nullcontext()
        with warning_catcher:
            modin_result = pd.merge(modin_df,
                                    modin_df2,
                                    how=how,
                                    left_on="col2",
                                    right_on="col2")
        pandas_result = pandas.merge(pandas_df,
                                     pandas_df2,
                                     how=how,
                                     left_on="col2",
                                     right_on="col2")
        df_equals(modin_result, pandas_result)

        # left_index and right_index
        modin_result = pd.merge(modin_df,
                                modin_df2,
                                how=how,
                                left_index=True,
                                right_index=True)
        pandas_result = pandas.merge(pandas_df,
                                     pandas_df2,
                                     how=how,
                                     left_index=True,
                                     right_index=True)
        df_equals(modin_result, pandas_result)

    s = pd.Series(frame_data.get("col1"))
    with pytest.raises(ValueError):
        pd.merge(s, modin_df2)

    with pytest.raises(TypeError):
        pd.merge("Non-valid type", modin_df2)
Example #17
    if not isVaildDate(str(df_row[field])):
        df_row[field] = np.nan

    return df_row
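# The lines above are the tail of a row-cleaning helper applied row-wise.
# c_sz (chunk size), col_names1, ad_static, save_csv, purge_pat_files and
# isVaildDate are defined earlier in the source; the chunked pipeline below
# also assumes imports along these lines (an assumption, not shown here):
import numpy as np
import pandas as pd
from tqdm import trange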


purge_pat_files('../data', r'^[^_]+_log.csv$')

logs1 = pd.read_csv('../data/log_reduced.csv',
                    encoding='utf-8',
                    chunksize=c_sz)

for df, _ in zip(logs1, trange(1000)):
    df = pd.merge(df,
                  ad_static,
                  left_on='曝光广告id',
                  right_on='广告id',
                  how='inner')
    # 3. drop rows with invalid request times
    df['广告请求时间'] = pd.to_datetime(df['广告请求时间'], unit='s')  # epoch seconds to datetime
    df['广告请求时间_date'] = df['广告请求时间'].apply(lambda x: x.date())
    df = df[col_names1]

    # 1. drop rows with missing values
    df.dropna(axis=0, how='any', inplace=True)

    # split the data: save each row via save_csv
    _ = df.apply(save_csv, axis=1)

print('done')
Example #18
def extract_topic_score(data, ttt, feature):
    print('extract_topic count feature')

    target = data.copy()
    feature['label'] = 1

    que = pd.read_csv('../datasets2/question_info.csv',
                      usecols=['qid', 'topic_id'])
    m_list = ['uid', 'SCORE']
    meb = pd.read_csv('../datasets2/member_info.csv', usecols=m_list)
    target = pd.merge(target, que, how='left', on='qid').fillna('0')
    feature = pd.merge(feature, que, how='left', on='qid').fillna('0')
    target = pd.merge(target, meb, how='left', on='uid').fillna('0')
    feature = pd.merge(feature, meb, how='left', on='uid').fillna('0')


    total_extend = feature['topic_id'].str.split(',', expand=True).stack() \
        .reset_index(level=0).set_index('level_0') \
        .rename(columns={0: 'topic'}).join(feature.drop('topic_id', axis=1)) \
        .reset_index(drop=True)

    topic_df = target['topic_id'].str.split(',', expand=True)
    target = pd.concat([target, topic_df], axis=1)

    fealist = m_list
    final_list = []

    ### total invite volume per topic
    # t1 is not defined in the original fragment; this reconstruction follows
    # the 'topic_count' column and the merge on 'topic' used below
    t1 = total_extend.groupby('topic')['label'].agg(
        ['count']).reset_index().rename(columns={'count': 'topic_count'})

    for stat in fealist:

        s_total_extend = total_extend[['topic', stat, 'label']]
        fea_name = stat + '_ans_topic_count_ratio'
        print('extract', fea_name)
        ### cross count of topic × member attribute
        t = total_extend.groupby(['topic', stat])['label'].agg(
            ['count']).reset_index().rename(columns={'count': 'sum_count'})
        t.loc[t['topic'] == '0', 'sum_count'] = 0
        t = pd.merge(t, t1, how='left', on='topic')

        # Beta-smoothed share
        HP = HyperParam(1, 1)
        HP.update_from_data_by_moment(t['topic_count'].values,
                                      t['sum_count'].values)  # moment estimation
        t[fea_name] = np.divide(t['sum_count'] + HP.alpha,
                                t['topic_count'] + HP.alpha + HP.beta)
        t = t.drop(['topic_count', 'sum_count'], axis=1)
        stat = ['topic', stat]
        tmp_name = []
        for field in [0, 1, 2, 3, 4]:
            lefton = []
            for i in stat:
                if i == 'topic':
                    lefton.append(field)
                else:
                    lefton.append(i)
            target = pd.merge(target,
                              t,
                              how='left',
                              left_on=lefton,
                              right_on=stat).rename(
                                  columns={fea_name: fea_name + str(field)})
            tmp_name.append(fea_name + str(field))

        target[fea_name + '_max'] = target[tmp_name].max(axis=1)
        target[fea_name + '_mean'] = target[tmp_name].mean(axis=1)
        final_list.append(fea_name + '_max')
        final_list.append(fea_name + '_mean')

        for field in [0, 1, 2, 3, 4]:
            target = target.drop([fea_name + str(field)], axis=1)

    return target[final_list]
Example #19
def test_merge():
    frame_data = {
        "col1": [0, 1, 2, 3],
        "col2": [4, 5, 6, 7],
        "col3": [8, 9, 0, 1],
        "col4": [2, 4, 5, 6],
    }

    modin_df = pd.DataFrame(frame_data)
    pandas_df = pandas.DataFrame(frame_data)

    frame_data2 = {"col1": [0, 1, 2], "col2": [1, 5, 6]}
    modin_df2 = pd.DataFrame(frame_data2)
    pandas_df2 = pandas.DataFrame(frame_data2)

    join_types = ["outer", "inner"]
    for how in join_types:
        # Defaults
        modin_result = pd.merge(modin_df, modin_df2, how=how)
        pandas_result = pandas.merge(pandas_df, pandas_df2, how=how)
        df_equals(modin_result, pandas_result)

        # left_on and right_index
        modin_result = pd.merge(
            modin_df, modin_df2, how=how, left_on="col1", right_index=True
        )
        pandas_result = pandas.merge(
            pandas_df, pandas_df2, how=how, left_on="col1", right_index=True
        )
        df_equals(modin_result, pandas_result)

        # left_index and right_on
        modin_result = pd.merge(
            modin_df, modin_df2, how=how, left_index=True, right_on="col1"
        )
        pandas_result = pandas.merge(
            pandas_df, pandas_df2, how=how, left_index=True, right_on="col1"
        )
        df_equals(modin_result, pandas_result)

        # left_on and right_on col1
        modin_result = pd.merge(
            modin_df, modin_df2, how=how, left_on="col1", right_on="col1"
        )
        pandas_result = pandas.merge(
            pandas_df, pandas_df2, how=how, left_on="col1", right_on="col1"
        )
        df_equals(modin_result, pandas_result)

        # left_on and right_on col2
        modin_result = pd.merge(
            modin_df, modin_df2, how=how, left_on="col2", right_on="col2"
        )
        pandas_result = pandas.merge(
            pandas_df, pandas_df2, how=how, left_on="col2", right_on="col2"
        )
        df_equals(modin_result, pandas_result)

        # left_index and right_index
        modin_result = pd.merge(
            modin_df, modin_df2, how=how, left_index=True, right_index=True
        )
        pandas_result = pandas.merge(
            pandas_df, pandas_df2, how=how, left_index=True, right_index=True
        )
        df_equals(modin_result, pandas_result)

    s = pd.Series(frame_data.get("col1"))
    with pytest.raises(ValueError):
        pd.merge(s, modin_df2)

    with pytest.raises(TypeError):
        pd.merge("Non-valid type", modin_df2)
Example #20
df_list = []
# Go through each chromosome and create its own dataframe, with properly labeled columns
for chrom in range(1, 23):
    # Filter for the current chromosome's dataframe
    tmp_df = abs_anttd_seg_df[abs_anttd_seg_df.index.get_level_values(
        'Chromosome') == chrom]
    # Change the column names to identify the chromosome
    tmp_df.columns = [f'{col}_chromosome_{chrom}' for col in tmp_df.columns]
    # Remove now redundant `Chromosome` column
    tmp_df = tmp_df.reset_index().drop(columns='Chromosome', axis=1)
    # Add to the dataframes list
    df_list.append(tmp_df)

df_list[3]

from functools import reduce  # needed for the pairwise merge chain below

abs_anttd_seg_df = reduce(lambda x, y: pd.merge(x, y, on='Sample'), df_list)
abs_anttd_seg_df.head()

abs_anttd_seg_df.Sample.nunique()

len(abs_anttd_seg_df)

# Remove duplicate columns (redundant features that are independent of the chromosome):

unique_features = set(
    [col.split('_chromosome')[0] for col in abs_anttd_seg_df.columns])
unique_features

[col for col in abs_anttd_seg_df.columns if 'Cancer_cell_frac_a1' in col]

# Save the feature names that are redundant (i.e. no difference between chromosomes)
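# The source cuts off here. A sketch of one plausible continuation of the
# stated intent (illustrative, not from the original): a feature is redundant
# when its chromosome-suffixed copies are identical across chromosomes.
redundant_features = []
for feat in unique_features:
    cols = [col for col in abs_anttd_seg_df.columns
            if col.split('_chromosome')[0] == feat]
    if len(cols) > 1 and all(
            abs_anttd_seg_df[c].equals(abs_anttd_seg_df[cols[0]])
            for c in cols[1:]):
        redundant_features.append(feat)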