Example #1
import tensorflow as tf


def eval_input_fn(features, labels, batch_size):
    """An input function for evaluation or prediction (TensorFlow 1.x)."""
    features = dict(features)
    if labels is None:
        # No labels: predict from features alone.
        inputs = features
    else:
        inputs = (features, labels)

    # Convert the inputs to a Dataset.
    dataset = tf.data.Dataset.from_tensor_slices(inputs)

    # Batch the examples.
    assert batch_size is not None, "batch_size must not be None"
    dataset = dataset.batch(batch_size)

    # Return the read end of the pipeline.
    return dataset.make_one_shot_iterator().get_next()
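# A quick sanity check of what the pipeline returns: a minimal sketch with toy
# data (names and shapes assumed), using the function as reconstructed above.
import numpy as np

toy_features = {'x': np.arange(10, dtype=np.float32)}
toy_labels = np.arange(10, dtype=np.int32)

batch = eval_input_fn(toy_features, toy_labels, batch_size=4)
with tf.Session() as sess:
    print(sess.run(batch))  # first (features, labels) batch of size 4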


import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

newuser_dataset = pd.read_excel('/Users/andpay/Documents/job/model/newuser_marketing_credithands/newuser_marketing_dataset_v4_1.xlsx')
var_list = list(newuser_dataset.columns)
model_var_list = remove_list(var_list, ['partyid', 'cate'])  # remove_list/disper_split are project helpers

category_var = ['sex', 'city-id', 'channel', 'brandcode']
continue_var = remove_list(model_var_list, category_var)
newuser_dataset = disper_split(newuser_dataset, category_var)  # encode the categorical variables
newuser_dataset[continue_var] = newuser_dataset[continue_var].fillna(-1)

newuser_dataset = newuser_dataset[model_var_list + ['cate']].apply(pd.to_numeric)

# Standardize the independent variables; this can shorten training time
newuser_dataset[model_var_list] = preprocessing.scale(newuser_dataset[model_var_list])


# Fix the train and test splits
traindata, testdata = train_test_split(newuser_dataset, test_size=0.25, random_state=1)
x_train, y_train = traindata[model_var_list], traindata['cate']
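# Design note: preprocessing.scale above is fitted on the full dataset before
# the split, so test-set statistics leak into training. A leakage-safe sketch
# (an assumed variant, not from the source) using StandardScaler:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(traindata[model_var_list])  # fit on train only
x_test_scaled = scaler.transform(testdata[model_var_list])        # reuse train statistics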
Example #2
import pandas as pd

# Random under-sampling of the good samples
# random_sample = RandomSample(np.array(end_user_info[end_user_info['cate'] == 0]), 0.3).random_under_sample()
# random_sample_df = pd.DataFrame(random_sample, columns=end_user_info.columns)
# end_user_info = pd.concat([end_user_info[end_user_info['cate'] == 1], random_sample_df], axis=0)
# print(end_user_info.shape)
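# RandomSample is a project helper not shown in this listing; a pandas-only
# sketch of the same idea (name and behaviour assumed): keep every bad sample
# and a random fraction of the good ones.
def random_under_sample(df, label_col='cate', keep_ratio=0.3, seed=1):
    bad = df[df[label_col] == 1]
    good = df[df[label_col] == 0].sample(frac=keep_ratio, random_state=seed)
    return pd.concat([bad, good], axis=0)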


'''Missing-value detection'''
unuse_list = check_nullvalue(end_user_info)
end_user_info = end_user_info.drop(unuse_list, axis=1)
new_column = end_user_info.columns
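# check_nullvalue is also project code; a plausible implementation (threshold
# and behaviour assumed): flag columns whose missing-value ratio is too high
# to be useful.
def check_nullvalue_sketch(df, null_ratio_limit=0.9):
    null_ratio = df.isnull().mean()
    return list(null_ratio[null_ratio > null_ratio_limit].index)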


cat_var = ['id_city']
con_var = remove_list(new_column, cat_var + ['over_dueday', 'cate'])
end_user_info = disper_split(end_user_info, cat_var)



'''Correlation check between variables'''
var_relative = regression_analysis(end_user_info, con_var)
relative_df = pd.DataFrame(var_relative, columns=['var1', 'var2', 'p_value', 'relative_coefficient'])
print(relative_df)
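# regression_analysis is not shown either; a minimal sketch (assumed) that
# yields rows in the same ['var1', 'var2', 'p_value', 'relative_coefficient']
# shape via pairwise Pearson correlation.
from itertools import combinations
from scipy.stats import pearsonr

def pairwise_correlation(df, var_list, coef_limit=0.8):
    rows = []
    for v1, v2 in combinations(var_list, 2):
        coef, p_value = pearsonr(df[v1], df[v2])
        if abs(coef) >= coef_limit:  # keep only highly correlated pairs
            rows.append([v1, v2, p_value, coef])
    return rows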


# Chi-square binning of the variables
split_point_chi, chi_df_1 = chi_equalwide(end_user_info, con_var, 'cate', max_interval=5, numOfSplit=300, mont=False, special_list=['age'])
end_col = list(chi_df_1.columns)
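# The merge criterion behind chi-square binning, in isolation: adjacent bins
# with similar good/bad counts give a small chi-square statistic and become
# merge candidates. A toy illustration (made-up counts, not chi_equalwide):
from scipy.stats import chi2_contingency

adjacent_bins = [[120, 8],   # bin 1: good count, bad count
                 [115, 9]]   # bin 2: good count, bad count
chi2, p, dof, expected = chi2_contingency(adjacent_bins)
print(chi2, p)  # small chi2 / large p => similar bad rates, merge the bins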

Example #3
import time

import numpy as np
import pandas as pd
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import make_scorer, fbeta_score, accuracy_score, recall_score
from sklearn import metrics

start_time = time.time()

dataset = pd.read_excel(
    '/Users/andpay/Documents/job/model/behave_model/behave_model_dataset_v1_1.xlsx'
)
dataset.loc[dataset['last_overday'] >= 10, 'cate'] = 1
dataset.loc[dataset['last_overday'] < 10, 'cate'] = 0

var_list = list(dataset.columns)
model_var_list = remove_list(
    var_list,
    ['partyid', 'loanid', 'last_overday', 'cate', 'register_duration'])

category_var = ['sex', 'city_id', 'channel_type', 'brandcode']
continue_var = remove_list(model_var_list,
                           category_var)  # this changes the element count of newvar_list
newuser_dataset = disper_split(dataset, category_var)
newuser_dataset[continue_var] = newuser_dataset[continue_var].fillna(0)

# x_train,x_test,y_train,y_test= train_test_split(,test_size=0.25,random_state=1)
XGC = XGBClassifier(n_estimators=150, max_depth=9, learning_rate=0.03)
XGC.fit(newuser_dataset[continue_var].astype(int), dataset['cate'].astype(int))
xgc_col = list(np.round(XGC.feature_importances_, 3))

# Rank the variables by importance
var_importance = pd.DataFrame({
    'var': continue_var,
    'importance': xgc_col,
}).sort_values(by='importance', ascending=False)
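# make_scorer and fbeta_score are imported at the top but never used in the
# visible code; a sketch of how they could plug in (assumed usage, not from
# the source): a recall-weighted grid search over the same classifier.
from sklearn.model_selection import GridSearchCV

f2_scorer = make_scorer(fbeta_score, beta=2)  # beta > 1 weights recall over precision
grid = GridSearchCV(
    XGBClassifier(),
    param_grid={'max_depth': [6, 9], 'learning_rate': [0.03, 0.1]},
    scoring=f2_scorer,
    cv=3,
)
# grid.fit(newuser_dataset[continue_var].astype(int), dataset['cate'].astype(int))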
Example #4
import time

import pandas as pd

start_time = time.time()

dataset = pd.read_excel(
    '/Users/andpay/Documents/job/model/behave_model/behave_model_dataset_v2.xlsx'
)
dataset.loc[dataset['last_overday'] >= 30, 'cate'] = 1
dataset.loc[dataset['last_overday'] < 30, 'cate'] = 0
print(dataset['partyid'].groupby(dataset['cate']).count())
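# Note: the .loc labelling above leaves cate as NaN wherever last_overday is
# missing. A commented alternative (assumed, not from the source) that forces
# a 0/1 label for every row, since NaN >= 30 evaluates to False:
# dataset['cate'] = np.where(dataset['last_overday'] >= 30, 1, 0)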

test_dataset = pd.read_excel(
    '/Users/andpay/Documents/job/model/behave_model/model_practice/behave_userlist_v2_2.xlsx'
)

var_list = list(dataset.columns)
model_var_list = remove_list(var_list,
                             ['partyid', 'loanid', 'last_overday', 'cate'])

category_var = ['sex', 'city_id', 'channel_type', 'brandcode']
continue_var = remove_list(model_var_list,
                           category_var)  # this changes the element count of newvar_list
newuser_dataset = disper_split(dataset, category_var)  # encode the categorical variables
newuser_dataset[continue_var] = newuser_dataset[continue_var].fillna(0)

test_dataset[continue_var] = test_dataset[continue_var].fillna(0)

'''Correlation check between variables'''
var_relative = regression_analysis(dataset, continue_var, rsquare_limit=0.8)
relative_df = pd.DataFrame(
    var_relative, columns=['var1', 'var2', 'p_value', 'relative_coefficient'])
print(relative_df)

relative_var = [
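# The listing cuts off while building relative_var; a hedged sketch (intent
# assumed) of how such a list could come from var_relative, dropping one
# variable of each highly correlated pair:
# dropped = set()
# for var1, var2, p_value, coef in var_relative:
#     if var1 not in dropped:
#         dropped.add(var2)
# continue_var = [v for v in continue_var if v not in dropped]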
Example #5
            dif = self.samples[nnarray[nn]] - self.samples[i]
            # Draw a random number in [0, 1)
            gap = random.random()
            # Synthesize an artificial sample between the original and its neighbour
            self.synthetic[self.newindex] = self.samples[i] + gap * dif
            self.newindex += 1
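# The interpolation above is the core of SMOTE: each synthetic point lies on
# the segment between a minority sample and one of its nearest neighbours.
# A self-contained toy version of that single step (made-up points):
import random

import numpy as np

sample = np.array([1.0, 2.0])
neighbour = np.array([3.0, 6.0])
gap = random.random()                            # uniform draw in [0, 1)
synthetic = sample + gap * (neighbour - sample)  # lands between the two points
print(synthetic)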



if __name__ == '__main__':
    df = pd.read_excel('/Users/andpay/Documents/job/data/帮还活动/activity_history/marketing_modedata3_14.xlsx')
    df = df[0:100]
    print(df.shape)

    cate_list = ['sex', 'brandcode', 'channel_type', 'marry', 'ccerate']
    df = df.fillna(0)
    var_list = list(df.columns)
    var_list.remove('partyid')
    var_list.remove('name')
    continue_list = remove_list(var_list, cate_list)

    #a=np.array([[1,2,3],[4,5,6],[2,3,1],[2,1,2],[2,3,4],[2,3,4]])
    data = np.array(df[continue_list])
    #print(np.round(data,3))
    s = Smote(data, N=50)

    dataset = s.over_sampling()
    print(dataset.shape)
    print(s.newindex)