def split_continuum_value_data(data):
    """
    Split the continuous values of every column into intervals, then convert
    the resulting categories into binary variables. The result is cached on
    disk under ROOT + '/data/split_' + str(SPLITCONTINUUM); when that path
    already exists it is loaded with io.grab() and returned without
    recomputing anything.

    @params:
        data: original data (ndarray); wrapped in a pandas DataFrame here
    @return:
        $1: the corresponding data after splitting (values of the final
            frame with the first column removed)
    """
    # split_value() is called with the cell value only, so the per-column
    # range is published through these module-level globals before mapping.
    # NOTE(review): presumably split_value reads min_val / max_val -- confirm.
    global min_val, max_val

    logging.info('begin split_continuum_value_data')
    # Parenthesised form prints the same text on Python 2 and parses on 3.
    print(data.shape)

    cache_path = ROOT + '/data/split_' + str(SPLITCONTINUUM)
    if os.path.exists(cache_path):
        logging.info(cache_path + ' exist!')
        return io.grab(cache_path)

    data = pd.DataFrame(data)
    # Snapshot the original columns: `data` is rebound inside the loop and
    # gains new columns, which must not be visited.
    original_columns = data.columns
    for feature in original_columns:
        min_val = min(data[feature].values)
        max_val = max(data[feature].values)
        data[feature] = data[feature].map(split_value)
        data = convert.binary_feature(data, feature)
        # The source column is dropped once binary_feature has processed it.
        # `inplace` must be a real bool: newer pandas rejects `inplace=1`.
        data.drop(feature, axis=1, inplace=True)

    # NOTE(review): the first column of the resulting values is discarded,
    # exactly as in the original code; the reason is not visible here.
    result = data.values[:, 1:]
    io.store(result, cache_path)
    return result
def feature_handler(data):
    """
    Convert categorical variables into binary variables and standardize the
    dataset.

    The feature types are read from ROOT + '/data/features_type.csv'. The
    first line of that file is skipped (presumably a header -- confirm), and
    every following line is split on ',' with the first two fields taken as
    the feature name and the feature type, each wrapped in one quote
    character on either side.

    @parameters:
        data: original data; must support `.columns` and
              `.drop(name, axis=1, inplace=True)` (e.g. a pandas DataFrame)
    @return:
        $1: the data after handling
    """
    logging.info('begin to handle feature')

    # Read the whole file inside a context manager so the handle is closed
    # even if a later step raises.
    with open(ROOT + '/data/features_type.csv') as type_file:
        type_lines = type_file.readlines()

    for raw_line in type_lines[1:]:
        stripped = raw_line.strip()
        if not stripped:
            # Blank lines carry no feature description; skip them instead of
            # failing with IndexError on the missing second field.
            continue
        fields = stripped.split(',')
        # Remove the surrounding " from the text.
        name = fields[0][1:-1]
        kind = fields[1][1:-1]
        # If the feature is a categorical variable, convert it into binary
        # variables and discard the original column.
        if kind == 'category':
            data = convert.binary_feature(data, name)
            # `inplace` must be a real bool: newer pandas rejects `inplace=1`.
            data.drop(name, axis=1, inplace=True)

    # Standardize all of the features. The column list is captured before
    # the loop because `data` is rebound on every iteration.
    feature_list = data.columns
    for feature in feature_list:
        data = convert.scale_feature(data, feature)
        data.drop(feature, axis=1, inplace=True)

    logging.info('finished hanlering feature')
    return data