예제 #1
0
#pca_ratio_curve(right_type_dummies, 7, 4)

# Two new per-EID attributes.
# right_get: the number of rights obtained (per the original author). It is
# the per-EID sum of the isExist() flag computed on FBDATE -- presumably 1
# where a value is present; confirm against dankit.isExist.
# .copy() makes the column selection an independent frame, so adding a column
# does not write through a selection of X_right (this is the pattern that
# triggers pandas' SettingWithCopyWarning).
right_get = X_right[['EID', 'FBDATE']].copy()
right_get['right_get'] = isExist(X_right['FBDATE'])
right_get = right_get.groupby('EID').sum()
# right_applied: the number of rights applied for (per the original author),
# built the same way from the isExist() flag on RIGHTTYPE.
right_applied = X_right[['EID', 'RIGHTTYPE']].copy()
right_applied['right_applied'] = isExist(X_right['RIGHTTYPE'])
right_applied = right_applied.groupby('EID').sum()
# Turn the 'EID' index back into a regular column so merge can use it.
right_get = right_get.reset_index()
right_applied = right_applied.reset_index()

# ASKDATE -- split into a year attribute and a month attribute
_ask_year, _ask_month = splitDate(right['ASKDATE'])
right['right_year'] = _ask_year
right['right_month'] = _ask_month

# One-hot encode each part; addColumnsPrefix is called only for its effect
# on the frame passed to it (its return value is not used).
right_year_dummies = pd.get_dummies(right['right_year'])
addColumnsPrefix(right_year_dummies, 'right_year')
right_month_dummies = pd.get_dummies(right['right_month'])
addColumnsPrefix(right_month_dummies, 'right_month')

# Combine year and month dummies into one DataFrame, attach EID, and total
# the indicator columns per EID; EID ends up as a regular column again.
right_date_dummies = right_year_dummies.join(right_month_dummies)
right_date_dummies['EID'] = right['EID']
right_date_dummies = right_date_dummies.groupby('EID').sum().reset_index()

# FBDATE -- split into a year attribute and a month attribute.
# Only rows whose FBDATE does not stringify to "nan" are used.
mask = [str(value) != "nan" for value in right['FBDATE']]
right_year_get, right_month_get = splitDate(right.loc[mask, 'FBDATE'])
# Frame pairing EID with the FBDATE year; only the EID column is filled
# within this excerpt.
right_year_get_df = pd.DataFrame(columns=['EID', 'right_year_get'])
right_year_get_df['EID'] = right.loc[mask, 'EID']
예제 #2
0
@author: zeroquest
"""

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from dankit import pca_ratio_curve, compress, isExist, addColumnsPrefix, splitDate

# Inputs: the train/answer feature frames saved by the previous step and the
# raw lawsuit table.
X_train = pd.read_pickle('X_train_with_project_pickle')
X_answer = pd.read_pickle('X_answer_with_project_pickle')
lawsuit = pd.read_csv('7lawsuit.csv')

# Each lawsuit row counts once; LAWDATE is split into year and month parts.
lawsuit['lawsuit_number'] = 1
_law_year, _law_month = splitDate(lawsuit['LAWDATE'])
lawsuit['lawsuit_year'] = _law_year
lawsuit['lawsuit_month'] = _law_month

# One-hot encode each part; addColumnsPrefix is called only for its effect
# on the frame passed to it.
lawsuit_year_dummies = pd.get_dummies(lawsuit['lawsuit_year'])
addColumnsPrefix(lawsuit_year_dummies, 'lawsuit_year')
lawsuit_month_dummies = pd.get_dummies(lawsuit['lawsuit_month'])
addColumnsPrefix(lawsuit_month_dummies, 'lawsuit_month')

# Combine year and month dummies into one DataFrame and total them per EID,
# keeping EID as a regular column.
lawsuit_date_dummies = lawsuit_year_dummies.join(lawsuit_month_dummies)
lawsuit_date_dummies['EID'] = lawsuit['EID']
lawsuit_date_dummies = lawsuit_date_dummies.groupby('EID').sum().reset_index()

# Total of the per-row 'lawsuit_number' values for each EID, with EID kept
# as a regular column.
_lawsuit_rows = lawsuit[['EID', 'lawsuit_number']]
lawsuit_number = _lawsuit_rows.groupby('EID').sum().reset_index()
예제 #3
0
# Every row of `project` counts as one project.
project['project_number'] = 1

# Stack the train rows (without TARGET) on top of the answer rows.
_train_features = X_train.drop('TARGET', axis=1)
X = pd.concat([_train_features, X_answer])

# Left-join the project rows onto every EID and keep only the listed columns;
# EIDs without a project get 0 in place of the missing values.
_project_columns = ['EID', 'DJDATE', 'project_number', 'project_at_home_number']
X_project = X.merge(project, how='left', on='EID').loc[:, _project_columns]
X_project = X_project.fillna(0)

# Per-EID totals of the two count columns, with EID as a regular column.
_count_columns = ['EID', 'project_number', 'project_at_home_number']
X_project_numbers = X_project[_count_columns].groupby('EID').sum().reset_index()

#X_train = pd.merge(X_train, X_project_numbers, how='left', on='EID')
#X_answer = pd.merge(X_answer, X_project_numbers, how='left', on='EID')

# DJDATE -- split into a year attribute and a month attribute
_project_year, _project_month = splitDate(project['DJDATE'])
project['project_year'] = _project_year
project['project_month'] = _project_month

# One-hot encode each part; addColumnsPrefix is called only for its effect
# on the frame passed to it.
project_year_dummies = pd.get_dummies(project['project_year'])
addColumnsPrefix(project_year_dummies, 'project_year')
project_month_dummies = pd.get_dummies(project['project_month'])
addColumnsPrefix(project_month_dummies, 'project_month')

# Combine year and month dummies into one DataFrame and total them per EID,
# keeping EID as a regular column.
project_date_dummies = project_year_dummies.join(project_month_dummies)
project_date_dummies['EID'] = project['EID']
project_date_dummies = project_date_dummies.groupby('EID').sum().reset_index()

# Attach the per-EID project date dummies to both feature frames with a left
# join on EID, then replace the resulting missing values (and any others)
# with 0.
X_train = X_train.merge(project_date_dummies, how='left', on='EID').fillna(0)
X_answer = X_answer.merge(project_date_dummies, how='left', on='EID').fillna(0)
예제 #4
0
"""

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from dankit import isExist, addColumnsPrefix, splitDate, getDateDummies

# Inputs: the train/answer feature frames saved by the previous step and the
# raw breakfaith table.
X_train = pd.read_pickle('X_train_with_lawsuit_pickle')
X_answer = pd.read_pickle('X_answer_with_lawsuit_pickle')
breakfaith = pd.read_csv('8breakfaith.csv')

# Each row counts once; the end-date flag comes from isExist() on SXENDDATE.
breakfaith['breakfaith_number'] = 1
breakfaith['breakfaith_end_number'] = isExist(breakfaith['SXENDDATE'])

# FBDATE is split on '/' into year and month parts.
_fb_year, _fb_month = splitDate(breakfaith['FBDATE'], '/')
breakfaith['breakfaith_year'] = _fb_year
breakfaith['breakfaith_month'] = _fb_month

# FBDATE dummies, built by the getDateDummies helper from the year/month
# columns, the two prefix strings and the EID column.
breakfaith_date_dummies = getDateDummies(
    breakfaith['breakfaith_year'],
    breakfaith['breakfaith_month'],
    'breakfaith_year',
    'breakfaith_month',
    breakfaith['EID'],
)

# SXENDDATE -- only rows whose SXENDDATE does not stringify to "nan" are used.
mask = [str(value) != "nan" for value in breakfaith['SXENDDATE']]
# .copy() makes the row selection an independent frame, so the two new
# columns below are not written through a selection of `breakfaith` (this is
# the pattern that triggers pandas' SettingWithCopyWarning).
breakfaith_end_date = breakfaith.loc[mask, ['EID', 'SXENDDATE']].copy()
# Split the end date on '/' into year and month columns.
breakfaith_end_date['breakfaith_end_year'], breakfaith_end_date[
    'breakfaith_end_month'] = splitDate(breakfaith_end_date['SXENDDATE'], '/')
breakfaith_end_date_dummies = getDateDummies(
    breakfaith_end_date['breakfaith_end_year'],
    breakfaith_end_date['breakfaith_end_month'], 'breakfaith_end_year',
예제 #5
0
"""

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer

from dankit import pca_ratio_curve, compress, isExist, addColumnsPrefix, splitDate

# Inputs: the train/answer feature frames saved by the previous step and the
# raw recruit table.
X_train = pd.read_pickle('X_train_with_breakfaith_pickle')
X_answer = pd.read_pickle('X_answer_with_breakfaith_pickle')
recruit = pd.read_csv('9recruit.csv')

# Each recruit row counts once; RECDATE is split into year and month parts.
recruit['recruit_times'] = 1
_rec_year, _rec_month = splitDate(recruit['RECDATE'])
recruit['recruit_year'] = _rec_year
recruit['recruit_month'] = _rec_month

# One-hot encode each part; addColumnsPrefix is called only for its effect
# on the frame passed to it.
recruit_year_dummies = pd.get_dummies(recruit['recruit_year'])
addColumnsPrefix(recruit_year_dummies, 'recruit_year')
recruit_month_dummies = pd.get_dummies(recruit['recruit_month'])
addColumnsPrefix(recruit_month_dummies, 'recruit_month')

# Combine year and month dummies into one DataFrame and total them per EID,
# keeping EID as a regular column.
recruit_date_dummies = recruit_year_dummies.join(recruit_month_dummies)
recruit_date_dummies['EID'] = recruit['EID']
recruit_date_dummies = recruit_date_dummies.groupby('EID').sum().reset_index()

# WZCODE -- one-hot encode, apply the 'recruit_website_' prefix via
# addColumnsPrefix, then total the indicators per EID (EID stays the index).
recruit_website_dummies = pd.get_dummies(recruit['WZCODE'])
addColumnsPrefix(recruit_website_dummies, 'recruit_website_')
recruit_website_dummies['EID'] = recruit['EID']
_website_by_eid = recruit_website_dummies.groupby('EID')
recruit_website_dummies = _website_by_eid.sum()
예제 #6
0
# Replace every missing value in X_alter with 0, modifying the frame in place.
X_alter.fillna(0, inplace=True)

#alter['ALTBE'].fillna('0', inplace=True)
#l = np.array([x[:-2] if len(x) > 2 else x for x in alter['ALTBE']])
#alter.loc[np.where(l == 'null')[0],'ALTBE'] = '0'
#alter['ALTBE'] = np.array(
#        [float(x[:-2]) if len(x) > 2 else float(x) for x in alter['ALTBE']])

#alter['ALTAF'].fillna('0', inplace=True)
#l = np.array([x[:-2] if len(x) > 2 else x for x in alter['ALTAF']])
#alter.loc[np.where(l == 'null')[0],'ALTAF'] = '0'
#alter['ALTAF'] = np.array(
#        [float(x[:-2]) if len(x) > 2 else float(x) for x in alter['ALTAF']])

# ALTDATE -- split into a year attribute and a month attribute
_alt_year, _alt_month = splitDate(alter['ALTDATE'])
alter['alt_year'] = _alt_year
alter['alt_month'] = _alt_month

# One-hot encode each part; addColumnsPrefix is called only for its effect
# on the frame passed to it.
alter_year_dummies = pd.get_dummies(alter['alt_year'])
addColumnsPrefix(alter_year_dummies, 'alter_year')
alter_month_dummies = pd.get_dummies(alter['alt_month'])
addColumnsPrefix(alter_month_dummies, 'alter_month')

# Combine year and month dummies into one DataFrame and total them per EID,
# keeping EID as a regular column.
alter_date_dummies = alter_year_dummies.join(alter_month_dummies)
alter_date_dummies['EID'] = alter['EID']
alter_date_dummies = alter_date_dummies.groupby('EID').sum().reset_index()

# ALTERNO -- one-hot encode, attach EID, and total the indicators per EID.
# The prefix is applied after grouping, when EID is the index rather than a
# column.
_alterno_flags = pd.get_dummies(X_alter['ALTERNO'])
_alterno_flags['EID'] = X_alter['EID']
alterno_dummies = _alterno_flags.groupby('EID').sum()
addColumnsPrefix(alterno_dummies, 'alterno')