예제 #1
0
def _ans_topic_feat(train,feature,mode,ans):
    """Build the ``merge_ans_dif_topic_1`` feature for every row of ``train``.

    A per-user history of ``(qid, i_time)`` pairs is assembled from

    * rows of ``feature`` whose ``label`` equals 1,
    * rows of ``train`` whose ``p_score`` (read from
      ``../datasets/feature/<mode>/p_score``) is at least 0.42, and
    * when ``mode == 'test_B'``, rows of ``../datasets/invite_test.csv`` whose
      ``p_score`` (from ``../datasets/feature/test/p_score``) is at least 0.42.

    The history is attached to ``train`` as ``answer_time_list`` and handed to
    ``tmp_func3`` (defined elsewhere), which is expected to add the column
    ``merge_last_ans_time_topic_1b``.

    Args:
        train: DataFrame with at least ``uid``, ``qid``, ``day`` and ``hour``.
            An ``i_time`` column is added to the caller's object in place.
        feature: DataFrame with at least ``uid``, ``qid``, ``day``, ``hour``
            and ``label``. An ``i_time`` column is added in place.
        mode: Name of the sub-directory holding the ``p_score`` file for
            ``train``; the value ``'test_B'`` additionally pulls in the test
            invitations.
        ans: Unused by this function.

    Returns:
        A single-column DataFrame ``merge_ans_dif_topic_1`` holding
        ``i_time - merge_last_ans_time_topic_1b``.
    """
    # 0.04166 is roughly 1/24, so i_time presumably expresses day + hour as
    # fractional days -- TODO confirm the unit of 'hour' with the data owner.
    feature['i_time'] = feature['day'] + 0.04166 * feature['hour']
    train['i_time'] = train['day'] + 0.04166 * train['hour']
    # qus_info is a module-level table defined elsewhere; missing values after
    # the left join are replaced by the string '0'.
    feature = pd.merge(feature,qus_info,how='left',on='qid').fillna('0')
    train = pd.merge(train, qus_info, how='left', on='qid').fillna('0')

    if mode == 'test_B':
        s = pd.read_csv('../datasets/invite_test.csv')
        s_score = pd.read_csv('../datasets/feature/test/p_score')
        s['i_time'] = s['day'] + 0.04166 * s['hour']
        # Scores are attached column-wise, i.e. aligned on the row index.
        s = pd.concat([s, s_score], axis=1)
        s = s[s['p_score'] >= 0.42]
        s = pd.merge(s, qus_info, how='left', on='qid').fillna('0')

    train_score = pd.read_csv('../datasets/feature/' + mode + '/p_score')

    print(train_score.shape)

    s1 = feature[feature['label'] == 1]
    train = pd.concat([train, train_score], axis=1)
    s2 = train[train['p_score'] >= 0.42]
    s1 = s1[['uid', 'qid','i_time']]
    s2 = s2[['uid', 'qid','i_time']]

    # Combine the history sources exactly once. A second, unconditional
    # concat of [s1, s2, s] used to follow this branch and duplicated every
    # s1/s2 row in the per-user lists.
    if mode == 'test_B':
        s = s[['uid', 'qid', 'i_time']]
        s = pd.concat([s1, s2, s], axis=0)
    else:
        s = pd.concat([s1,s2],axis=0)

    # Collect, for each user, the (qid, i_time) pairs ordered by i_time.
    def get_time_list(df):
        pairs = zip(df['qid'].tolist(), df['i_time'].tolist())
        return sorted(pairs, key=lambda x: x[1])

    t = s.groupby(['uid']).apply(get_time_list).reset_index().rename(columns={0: 'answer_time_list'})

    train = pd.merge(train, t, on='uid', how='left')
    # Users without any history get 0 instead of a list.
    train['answer_time_list'] = train['answer_time_list'].fillna(0)

    train = multiprocessing_apply_data_frame(tmp_func3, train, 10)

    train['merge_ans_dif_topic_1'] = train['i_time'] - train['merge_last_ans_time_topic_1b']

    train = train[['merge_ans_dif_topic_1']]

    return train
예제 #2
0
def ans_time_feat(train, feature, mode, ans):
    """Return the ``merge_ans_dif_1`` / ``merge_ans_dif_7`` columns for ``train``.

    Each user's sorted list of ``i_time`` values is gathered from the rows of
    ``feature`` with ``label == 1``, the rows of ``train`` whose ``p_score``
    is at least 0.42 and, for ``mode == 'test_B'``, the test invitations whose
    ``p_score`` is at least 0.42. That list is merged onto ``train`` as
    ``answer_time_list`` and passed through ``tmp_func1`` and ``tmp_func7``
    (defined elsewhere), which are expected to add
    ``merge_last_ans_time_1b`` and ``merge_last_ans_time_7b``.

    Args:
        train: DataFrame with ``uid``, ``day`` and ``hour``; gains an
            ``i_time`` column in place.
        feature: DataFrame with ``uid``, ``day``, ``hour`` and ``label``;
            gains an ``i_time`` column in place.
        mode: Sub-directory name of the ``p_score`` file for ``train``.
        ans: Unused by this function.

    Returns:
        DataFrame with the two columns ``merge_ans_dif_1`` and
        ``merge_ans_dif_7``.
    """
    def _invite_time(frame):
        # 0.04166 is roughly 1/24 -- presumably converts hours to days.
        return frame['day'] + 0.04166 * frame['hour']

    history_cols = ['uid', 'i_time']
    is_test_b = mode == 'test_B'

    feature['i_time'] = _invite_time(feature)
    train['i_time'] = _invite_time(train)

    if is_test_b:
        test_invites = pd.read_csv('../datasets1/invite_test.csv')
        test_scores = pd.read_csv('../datasets3/feature/test/p_score')
        test_invites['i_time'] = _invite_time(test_invites)
        test_invites = pd.concat([test_invites, test_scores], axis=1)
        confident = test_invites['p_score'] >= 0.42
        test_invites = test_invites[confident]

    train_score = pd.read_csv('../datasets3/feature/' + mode + '/p_score')
    print(train_score.shape)

    answered = feature[feature['label'] == 1]
    train = pd.concat([train, train_score], axis=1)
    likely_answered = train[train['p_score'] >= 0.42]

    sources = [answered[history_cols], likely_answered[history_cols]]
    if is_test_b:
        sources.append(test_invites[history_cols])
    history = pd.concat(sources, axis=0)

    # Per user: ascending list of all collected i_time values.
    per_user = history.groupby(['uid']).apply(
        lambda grp: sorted(grp['i_time'].tolist())
    )
    per_user = per_user.reset_index().rename(columns={0: 'answer_time_list'})

    train = pd.merge(train, per_user, on='uid', how='left')
    # Users with no collected history get 0 instead of a list.
    train['answer_time_list'] = train['answer_time_list'].fillna(0)

    for worker in (tmp_func1, tmp_func7):
        train = multiprocessing_apply_data_frame(worker, train, 10)

    train['merge_ans_dif_1'] = train['i_time'] - train['merge_last_ans_time_1b']
    train['merge_ans_dif_7'] = train['i_time'] - train['merge_last_ans_time_7b']

    return train[['merge_ans_dif_1', 'merge_ans_dif_7']]
예제 #3
0
def uid_seq_mlabel(train,feature,mode,ans):
    """Return the ``merge_recent_7_day_click_rate`` column for ``train``.

    Every row of ``train``, ``feature`` and (for ``mode == 'test_B'``) the
    test invitations receives an ``m_label``: the true ``label`` for
    ``feature`` rows, and ``1`` when ``p_score >= 0.42`` else ``0`` for the
    others. Each user's ``(m_label, i_time)`` pairs, ordered by ``i_time``,
    are merged onto ``train`` as ``answer_time_list`` and passed to
    ``tmp_func4`` (defined elsewhere), which is expected to add
    ``merge_recent_7_day_click_rate``.

    Args:
        train: DataFrame with ``uid``, ``qid``, ``day`` and ``hour``; gains an
            ``i_time`` column in place.
        feature: DataFrame with ``uid``, ``qid``, ``day``, ``hour`` and
            ``label``; gains ``i_time`` and ``m_label`` columns in place.
        mode: Sub-directory name of the ``p_score`` file for ``train``.
        ans: Unused by this function.

    Returns:
        Single-column DataFrame ``merge_recent_7_day_click_rate``.
    """
    def _invite_time(frame):
        # 0.04166 is roughly 1/24 -- presumably converts hours to days.
        return frame['day'] + 0.04166 * frame['hour']

    def _pseudo_label(score):
        if score >= 0.42:
            return 1
        return 0

    seq_cols = ['uid', 'qid', 'i_time', 'm_label']
    is_test_b = mode == 'test_B'

    feature['i_time'] = _invite_time(feature)
    train['i_time'] = _invite_time(train)

    if is_test_b:
        test_invites = pd.read_csv('../datasets/invite_test.csv')
        test_scores = pd.read_csv('../datasets/feature/test/p_score')
        test_invites['i_time'] = _invite_time(test_invites)
        test_invites = pd.concat([test_invites, test_scores], axis=1)
        test_invites['m_label'] = test_invites['p_score'].apply(_pseudo_label)

    train_score = pd.read_csv('../datasets/feature/' + mode + '/p_score')
    print(train_score.shape)

    feature['m_label'] = feature['label']
    train = pd.concat([train, train_score], axis=1)
    train['m_label'] = train['p_score'].apply(_pseudo_label)

    train = train[seq_cols]
    feature = feature[seq_cols]

    frames = [train, feature]
    if is_test_b:
        frames.append(test_invites[seq_cols])
    combined = pd.concat(frames, axis=0)

    # Per user: (m_label, i_time) pairs in ascending i_time order.
    def _label_sequence(group):
        pairs = zip(group['m_label'].tolist(), group['i_time'].tolist())
        return sorted(pairs, key=lambda pair: pair[1])

    per_user = combined.groupby(['uid']).apply(_label_sequence)
    per_user = per_user.reset_index().rename(columns={0: 'answer_time_list'})

    train = pd.merge(train, per_user, on='uid', how='left')
    train['answer_time_list'] = train['answer_time_list'].fillna(0)

    train = multiprocessing_apply_data_frame(tmp_func4, train, 10)

    return train[['merge_recent_7_day_click_rate']]
예제 #4
0
def extract_feature_smilar(data, feature, ans):
    """Attach each user's joined ``qid`` list to ``data`` and run ``tmp_func``.

    The ``qid`` values of ``ans`` are grouped by ``uid`` and joined with
    commas into a ``uid_qid_list`` column, which is left-joined onto a copy of
    ``data`` (missing values become 0). The result is processed by
    ``tmp_func`` (defined elsewhere) through
    ``multiprocessing_apply_data_frame``.

    Args:
        data: DataFrame with a ``uid`` column; it is copied, not modified.
        feature: Unused by this function.
        ans: DataFrame with ``uid`` and ``qid`` columns; ``qid`` values must
            be strings for the comma join to succeed.

    Returns:
        Whatever ``multiprocessing_apply_data_frame`` returns for the merged
        frame.
    """
    def _comma_join(qids):
        return ','.join(qids.tolist())

    target = data.copy()
    print_time('extract feature smilar')

    qids_per_user = ans.groupby(['uid'])['qid'].apply(_comma_join)
    uid_qid_list = qids_per_user.reset_index().rename(
        columns={'qid': 'uid_qid_list'}
    )

    merged = pd.merge(target, uid_qid_list, how='left', on='uid')
    target = merged.fillna(0)

    print_time('start')
    target = multiprocessing_apply_data_frame(tmp_func, target, 10)
    print_time('end')

    return target