示例#1
0
def split_continuum_value_data (data) :
    """
    split the continuum value into some interval with same length
    then convert the category variable into binary variable

    The result is cached on disk at ROOT/data/split_<SPLITCONTINUUM>; when
    that file already exists it is loaded with io.grab and returned without
    recomputing anything.
    @params:
        data: original data (ndarray)
    @return:
        $1: the corresponding data after splitting (the values of the
            transformed frame without its first column), or whatever
            io.grab returns when the cache file exists
    """
    logging.info ('begin split_continuum_value_data')
    # parenthesised form prints the same text on Python 2 and Python 3
    print (data.shape)

    cache_path = ROOT + '/data/split_' + str (SPLITCONTINUUM)
    if os.path.exists (cache_path) :
        logging.info (cache_path + ' exist!')
        return io.grab (cache_path)

    # split_value () only receives the cell value, so the per-column range is
    # published through module-level globals (presumably read by split_value;
    # verify against its definition).
    global min_val, max_val

    data = pd.DataFrame (data)
    # snapshot the labels: `data` is rebound and mutated inside the loop
    feature_list = list (data.columns)
    for feature in feature_list :
        column_values = data[feature].values
        min_val = min (column_values)
        max_val = max (column_values)
        data[feature] = data[feature].map (split_value)
        data = convert.binary_feature (data, feature)
        # pandas requires a real bool for `inplace`
        data.drop (feature, axis = 1, inplace = True)

    # the first column is discarded, exactly as the cached copy stores it
    result = data.values[:,1:]
    io.store (result, cache_path)
    return result
示例#2
0
def feature_handler (data) :
    """
    convert categorical variable into binary variable and standardization dataset

    Feature types are read from ROOT/data/features_type.csv. The first line
    of that file is skipped, and each remaining line is split on ',' with the
    first and last character of the first two fields removed (the
    surrounding quotes).
    @parameters:
        data: original data
    @return:
        $1: the data after handling
    """
    logging.info ('begin to handle feature')
    # read everything up front so the file handle is closed before processing
    with open (ROOT + '/data/features_type.csv') as type_file :
        featuretype = type_file.readlines ()

    # skip the first line (presumably a header row; confirm against the file)
    for raw_line in featuretype[1:] :
        stripped = raw_line.strip ()
        if not stripped :
            # a blank line would otherwise raise IndexError below
            continue
        line = stripped.split (',')
        # remove the " in text
        feature_name = line[0][1:-1]
        feature_kind = line[1][1:-1]
        # if the feature is categorical variable, convert it into binary variable
        if feature_kind == 'category' :
            data = convert.binary_feature (data, feature_name)
            # pandas requires a real bool for `inplace`
            data.drop (feature_name, axis = 1, inplace = True)

    # standardization all of the feature
    # snapshot the labels: `data` is rebound and mutated inside the loop
    featurelist = list (data.columns)
    for feature in featurelist :
        data = convert.scale_feature (data, feature)
        data.drop (feature, axis = 1, inplace = True)
    logging.info ('finished hanlering feature')
    return data