#pca_ratio_curve(right_type_dummies, 7, 4)

# Define two new features:
#   right_get     -- number of rights actually granted (FBDATE present)
#   right_applied -- number of rights applied for (RIGHTTYPE present)
# .copy() avoids chained assignment on a slice of X_right
# (SettingWithCopyWarning / silently lost writes).
right_get = X_right[['EID', 'FBDATE']].copy()
right_get['right_get'] = isExist(X_right['FBDATE'])
# Keep only the counter column before summing so the raw FBDATE strings are
# not concatenated (new pandas) or dropped as a nuisance column (old pandas).
right_get = right_get[['EID', 'right_get']].groupby('EID').sum()

right_applied = X_right[['EID', 'RIGHTTYPE']].copy()
right_applied['right_applied'] = isExist(X_right['RIGHTTYPE'])
right_applied = right_applied[['EID', 'right_applied']].groupby('EID').sum()

# Move 'EID' out of the index so it can be used as a merge key later.
right_get.reset_index(inplace=True)
right_applied.reset_index(inplace=True)

# ASKDATE -- split into year and month features, then one-hot encode.
right['right_year'], right['right_month'] = splitDate(right['ASKDATE'])
right_year_dummies = pd.get_dummies(right['right_year'])
right_month_dummies = pd.get_dummies(right['right_month'])
addColumnsPrefix(right_year_dummies, 'right_year')
addColumnsPrefix(right_month_dummies, 'right_month')

# Combine the year and month dummies into one per-EID DataFrame.
right_date_dummies = right_year_dummies.join(right_month_dummies)
right_date_dummies[['EID']] = right[['EID']]
right_date_dummies = right_date_dummies.groupby('EID').sum()
right_date_dummies.reset_index(inplace=True)

# FBDATE -- split into year and month features; FBDATE may be missing,
# so restrict to rows where it is present. notna() replaces the
# hand-rolled str(n) == "nan" element-wise test.
mask = right['FBDATE'].notna()
right_year_get, right_month_get = splitDate(right.loc[mask, 'FBDATE'])
right_year_get_df = pd.DataFrame(columns=['EID', 'right_year_get'])
right_year_get_df['EID'] = right.loc[mask, 'EID']
@author: zeroquest """ import numpy as np import pandas as pd from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler from dankit import pca_ratio_curve, compress, isExist, addColumnsPrefix, splitDate X_train = pd.read_pickle('X_train_with_project_pickle') X_answer = pd.read_pickle('X_answer_with_project_pickle') lawsuit = pd.read_csv('7lawsuit.csv') lawsuit['lawsuit_number'] = 1 lawsuit['lawsuit_year'], lawsuit['lawsuit_month'] = splitDate(lawsuit['LAWDATE']) lawsuit_year_dummies = pd.get_dummies(lawsuit['lawsuit_year']) lawsuit_month_dummies = pd.get_dummies(lawsuit['lawsuit_month']) addColumnsPrefix(lawsuit_year_dummies, 'lawsuit_year') addColumnsPrefix(lawsuit_month_dummies, 'lawsuit_month') #将年份与月份合并为一个DataFrame lawsuit_date_dummies = lawsuit_year_dummies.join(lawsuit_month_dummies) lawsuit_date_dummies[['EID']] = lawsuit[['EID']] lawsuit_date_dummies = lawsuit_date_dummies.groupby('EID').sum() lawsuit_date_dummies.reset_index(inplace=True) lawsuit_number = lawsuit[['EID', 'lawsuit_number']].groupby('EID').sum() lawsuit_number.reset_index(inplace=True)
# One row per project record; summed per EID below to give a project count.
project['project_number'] = 1

# Stack train and answer sets (target dropped) so both get the same features.
X = pd.concat([X_train.drop('TARGET', axis=1), X_answer])
X_project = pd.merge(
    X, project, how='left',
    on='EID').loc[:, ['EID', 'DJDATE', 'project_number',
                      'project_at_home_number']]
X_project.fillna(0, inplace=True)

# Per-EID totals of the two project counters.
X_project_numbers = X_project[[
    'EID', 'project_number', 'project_at_home_number'
]].groupby('EID').sum()
X_project_numbers.reset_index(inplace=True)
#X_train = pd.merge(X_train, X_project_numbers, how='left', on='EID')
#X_answer = pd.merge(X_answer, X_project_numbers, how='left', on='EID')

# DJDATE -- split into year and month features, then one-hot encode.
project['project_year'], project['project_month'] = splitDate(
    project['DJDATE'])
project_year_dummies = pd.get_dummies(project['project_year'])
project_month_dummies = pd.get_dummies(project['project_month'])
addColumnsPrefix(project_year_dummies, 'project_year')
addColumnsPrefix(project_month_dummies, 'project_month')

# Combine the year and month dummies into one per-EID DataFrame.
project_date_dummies = project_year_dummies.join(project_month_dummies)
project_date_dummies[['EID']] = project[['EID']]
project_date_dummies = project_date_dummies.groupby('EID').sum()
project_date_dummies.reset_index(inplace=True)

# Attach the date features to both datasets; EIDs with no project rows
# get all-zero columns.
X_train = pd.merge(X_train, project_date_dummies, how='left', on='EID')
X_answer = pd.merge(X_answer, project_date_dummies, how='left', on='EID')
X_train.fillna(0, inplace=True)
X_answer.fillna(0, inplace=True)
""" import numpy as np import pandas as pd from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler from dankit import isExist, addColumnsPrefix, splitDate, getDateDummies X_train = pd.read_pickle('X_train_with_lawsuit_pickle') X_answer = pd.read_pickle('X_answer_with_lawsuit_pickle') breakfaith = pd.read_csv('8breakfaith.csv') breakfaith['breakfaith_number'] = 1 breakfaith['breakfaith_end_number'] = isExist(breakfaith['SXENDDATE']) breakfaith['breakfaith_year'], breakfaith['breakfaith_month'] = splitDate( breakfaith['FBDATE'], '/') #FBDATE breakfaith_date_dummies = getDateDummies(breakfaith['breakfaith_year'], breakfaith['breakfaith_month'], 'breakfaith_year', 'breakfaith_month', breakfaith['EID']) #SXENDDATE mask = [False if str(n) == "nan" else True for n in breakfaith['SXENDDATE']] breakfaith_end_date = breakfaith.loc[mask, ['EID', 'SXENDDATE']] breakfaith_end_date['breakfaith_end_year'], breakfaith_end_date[ 'breakfaith_end_month'] = splitDate(breakfaith_end_date['SXENDDATE'], '/') breakfaith_end_date_dummies = getDateDummies( breakfaith_end_date['breakfaith_end_year'], breakfaith_end_date['breakfaith_end_month'], 'breakfaith_end_year',
""" import numpy as np import pandas as pd from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler from sklearn.preprocessing import Imputer from dankit import pca_ratio_curve, compress, isExist, addColumnsPrefix, splitDate X_train = pd.read_pickle('X_train_with_breakfaith_pickle') X_answer = pd.read_pickle('X_answer_with_breakfaith_pickle') recruit = pd.read_csv('9recruit.csv') recruit['recruit_times'] = 1 recruit['recruit_year'], recruit['recruit_month'] = splitDate( recruit['RECDATE']) recruit_year_dummies = pd.get_dummies(recruit['recruit_year']) recruit_month_dummies = pd.get_dummies(recruit['recruit_month']) addColumnsPrefix(recruit_year_dummies, 'recruit_year') addColumnsPrefix(recruit_month_dummies, 'recruit_month') #将年份与月份合并为一个DataFrame recruit_date_dummies = recruit_year_dummies.join(recruit_month_dummies) recruit_date_dummies[['EID']] = recruit[['EID']] recruit_date_dummies = recruit_date_dummies.groupby('EID').sum() recruit_date_dummies.reset_index(inplace=True) recruit_website_dummies = pd.get_dummies(recruit['WZCODE']) addColumnsPrefix(recruit_website_dummies, 'recruit_website_') recruit_website_dummies[['EID']] = recruit[['EID']] recruit_website_dummies = recruit_website_dummies.groupby('EID').sum()
X_alter.fillna(0, inplace=True)

# Disabled cleanup for the ALTBE / ALTAF amount columns (strips a 2-char
# suffix and maps 'null' to '0' before float conversion); kept for reference.
#alter['ALTBE'].fillna('0', inplace=True)
#l = np.array([x[:-2] if len(x) > 2 else x for x in alter['ALTBE']])
#alter.loc[np.where(l == 'null')[0],'ALTBE'] = '0'
#alter['ALTBE'] = np.array(
#    [float(x[:-2]) if len(x) > 2 else float(x) for x in alter['ALTBE']])
#alter['ALTAF'].fillna('0', inplace=True)
#l = np.array([x[:-2] if len(x) > 2 else x for x in alter['ALTAF']])
#alter.loc[np.where(l == 'null')[0],'ALTAF'] = '0'
#alter['ALTAF'] = np.array(
#    [float(x[:-2]) if len(x) > 2 else float(x) for x in alter['ALTAF']])

#ALTDATE
# Split into year and month features, then one-hot encode.
alter['alt_year'], alter['alt_month'] = splitDate(alter['ALTDATE'])
alter_year_dummies = pd.get_dummies(alter['alt_year'])
alter_month_dummies = pd.get_dummies(alter['alt_month'])
addColumnsPrefix(alter_year_dummies, 'alter_year')
addColumnsPrefix(alter_month_dummies, 'alter_month')

# Combine the year and month dummies into one per-EID DataFrame.
alter_date_dummies = alter_year_dummies.join(alter_month_dummies)
alter_date_dummies[['EID']] = alter[['EID']]
alter_date_dummies = alter_date_dummies.groupby('EID').sum()
alter_date_dummies.reset_index(inplace=True)

#ALTERNO
# One-hot counts of alteration type per EID; here the prefix is applied
# after the groupby, so 'EID' (the index at that point) is untouched.
alterno_dummies = pd.get_dummies(X_alter['ALTERNO'])
alterno_dummies[['EID']] = X_alter[['EID']]
alterno_dummies = alterno_dummies.groupby('EID').sum()
addColumnsPrefix(alterno_dummies, 'alterno')