Example #1
def clean_data(usePCA=False):
    """
    Read the raw data, clean it, and cache the cleaned result on disk.
    Optionally apply PCA. Returns the feature matrix together with the
    split sizes, labels and uids.
    """
    logging.info('begin to clean the data')
    if os.path.exists(ROOT + '/data/cleandata.csv'):
        # no need to re-clean the data on every run;
        # to re-clean it, delete the '../data/cleandata.csv' file
        logging.info('the cleaned data already exists')
        data = pd.read_csv(ROOT + '/data/cleandata.csv')
        train_number, val_number, test_number, unlabel_number, label, uid = io.grab(ROOT + '/data/datadescribe')
    else:
        data, train_number, val_number, test_number, unlabel_number, label, uid = read.read_data()
        data = feature_handler(data)
        # cache the cleaned data and the split description
        data.to_csv(ROOT + '/data/cleandata.csv')
        io.store([train_number, val_number, test_number, unlabel_number, label, uid], ROOT + '/data/datadescribe')

    logging.info('finished cleaning the data')

    if usePCA:
        # dimensionality reduction via PCA, cached on disk
        if not os.path.exists(ROOT + '/data/datapca'):
            # no need to re-run this step every time;
            # to re-learn it with new parameters, delete the '../data/datapca' file
            data_values = decomposition.pca_solver(data)
            io.store(data_values, ROOT + '/data/datapca')

        data_values = io.grab(ROOT + '/data/datapca')
    else:
        data_values = data.values[:, 1:]
    return data_values, train_number, val_number, test_number, unlabel_number, label, uid
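
The reduction itself is delegated to decomposition.pca_solver, which is not shown above. A minimal sketch of such a solver, assuming it simply wraps sklearn's PCA and that the component count (N_COMPONENTS below) is a hypothetical setting, not taken from the source:

from sklearn.decomposition import PCA

N_COMPONENTS = 100  # hypothetical setting, not taken from the source

def pca_solver(data):
    # accept either a DataFrame (Example #1) or an ndarray (Example #6)
    values = data.values if hasattr(data, 'values') else data
    # fit PCA on the feature matrix and return the projected values
    return PCA(n_components=N_COMPONENTS).fit_transform(values)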
Example #2
def split_continuum_value_data(data):
    """
    Split each continuous feature into equal-width intervals,
    then convert the resulting categorical variable into binary (one-hot) features.
    @params:
        data: original data (ndarray)
    @return:
        the corresponding data after splitting
    """
    logging.info('begin split_continuum_value_data')
    print(data.shape)
    if os.path.exists(ROOT + '/data/split_' + str(SPLITCONTINUUM)):
        logging.info(ROOT + '/data/split_' + str(SPLITCONTINUUM) + ' exists!')
        return io.grab(ROOT + '/data/split_' + str(SPLITCONTINUUM))
    else:
        data = pd.DataFrame(data)
        feature_list = data.columns
        for feature in feature_list:
            # split_value reads these module-level globals to know the bin range
            global min_val, max_val
            min_val = min(data[feature].values)
            max_val = max(data[feature].values)
            data[feature] = data[feature].map(lambda x: split_value(x))
            data = convert.binary_feature(data, feature)
            data.drop(feature, axis=1, inplace=True)

        io.store(data.values[:, 1:], ROOT + '/data/split_' + str(SPLITCONTINUUM))

    return data.values[:, 1:]
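
The loop above relies on a split_value helper (reading the module-level min_val / max_val globals) and on convert.binary_feature, neither of which is shown. A hedged sketch of split_value, under the assumption that it performs equal-width binning into SPLITCONTINUUM intervals:

def split_value(x):
    # map x into one of SPLITCONTINUUM equal-width bins over [min_val, max_val];
    # min_val / max_val are the globals set per feature in the loop above
    if max_val == min_val:
        return 0  # constant feature: a single bin
    width = (max_val - min_val) / float(SPLITCONTINUUM)
    index = int((x - min_val) / width)
    return min(index, SPLITCONTINUUM - 1)  # clamp the maximum into the last bin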
Example #3
def gbdt_feature_importance(train, label):
    """
    Fit a GBDT on the training data and return feature importances,
    rescaled to percentages of the strongest feature. Cached on disk.
    """
    if os.path.exists(ROOT + "/data/feature_importance"):
        logging.info("feature_importance exists!")
        feature_importance = io.grab(ROOT + "/data/feature_importance")
    else:
        logging.info("feature_importance start!")
        gb = GradientBoostingClassifier(n_estimators=500, learning_rate=0.05, max_depth=3, random_state=1000000007).fit(
            train, label
        )
        feature_importance = gb.feature_importances_
        feature_importance = 100.0 * (feature_importance / feature_importance.max())
        io.store(feature_importance, ROOT + "/data/feature_importance")
    return feature_importance
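
The importances are rescaled to the range 0-100 relative to the strongest feature; Examples #4 and #5 then consume them for top-K selection and threshold filtering. A hypothetical usage on synthetic data (shapes and names are illustrative only, and it assumes ROOT points at a writable directory and io provides the store/grab pair used throughout):

import numpy as np
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, n_features=20, random_state=0)
importance = gbdt_feature_importance(X, y)   # percentages relative to the strongest feature
top10 = np.argsort(importance)[::-1][:10]    # indices of the ten most important features
print(importance[top10])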
Example #4
def gbdt_dimreduce_number(train_data, train_label, validation, test, unlabel, K=GBDTFEATURENUMBER):
    """
    Keep only the K features with the highest GBDT feature importance.
    """
    logging.info("before gbdt dim-reducing : (%d %d)" % (train_data.shape))
    if os.path.exists(ROOT + "/data/gbdt_number_" + str(K)):
        logging.info(ROOT + "/data/gbdt_number_" + str(K) + " exists!")
        sorted_index = io.grab(ROOT + "/data/gbdt_number_" + str(K))
    else:
        feature_importance = gbdt_feature_importance(train_data, train_label)
        sorted_index = np.argsort(feature_importance)[::-1]
        sorted_index = sorted_index[:K]
        io.store(sorted_index, ROOT + "/data/gbdt_number_" + str(K))
    # print 'feature importance :' , feature_importance[sorted_index]

    new_train_data = train_data[:, sorted_index]
    new_val = validation[:, sorted_index]
    new_test = test[:, sorted_index]
    new_unlabel = unlabel[:, sorted_index]
    logging.info("after gbdt dim-reducing : (%d %d)" % (new_train_data.shape))
    return new_train_data, new_val, new_test, new_unlabel
Example #5
def gbdt_dimreduce_threshold(
    train_data, train_label, validation, test, unlabel, feature_threshold=GBDTFEATURETHRESHOLD
):
    """
    Keep only the features whose GBDT importance exceeds feature_threshold,
    ordered by decreasing importance.
    """
    logging.info("begin gbdt_dimreduce_threshold")
    if os.path.exists(ROOT + "/data/gbdt_threshold_" + str(feature_threshold)):
        logging.info(ROOT + "/data/gbdt_threshold_" + str(feature_threshold) + " exists!")
        important_index, sorted_index = io.grab(ROOT + "/data/gbdt_threshold_" + str(feature_threshold))
    else:
        feature_importance = gbdt_feature_importance(train_data, train_label)
        important_index = np.where(feature_importance > feature_threshold)[0]
        sorted_index = np.argsort(feature_importance[important_index])[::-1]
        io.store([important_index, sorted_index], ROOT + "/data/gbdt_threshold_" + str(feature_threshold))

    new_train_data = train_data[:, important_index][:, sorted_index]
    new_val = validation[:, important_index][:, sorted_index]
    new_test = test[:, important_index][:, sorted_index]
    new_unlabel = unlabel[:, important_index][:, sorted_index]
    return new_train_data, new_val, new_test, new_unlabel
Example #6
def mix_pca_gbdt(train_data, train_label, validation, test, unlabel):
    """
    Keep the GBDT-important features as-is and compress the remaining
    features with PCA, then concatenate the two parts for each data split.
    """
    if os.path.exists(ROOT + "/data/mix_pca_gbdt"):
        logging.info(ROOT + "/data/mix_pca_gbdt exists!")
        new_train_data, new_val, new_test, new_unlabel = io.grab(ROOT + "/data/mix_pca_gbdt")
    else:
        logging.info("before mix_pca_gbdt dim-reducing : (%d %d)" % (train_data.shape))
        feature_importance = gbdt_feature_importance(train_data, train_label)
        important_index = np.where(feature_importance > GBDTFEATURETHRESHOLD)[0]
        sorted_index = np.argsort(feature_importance[important_index])[::-1]

        other_index = np.where(feature_importance <= GBDTFEATURETHRESHOLD)[0]
        pca_data = np.vstack(
            (train_data[:, other_index], validation[:, other_index], test[:, other_index], unlabel[:, other_index])
        )
        pca_data = pca_solver(pca_data)

        new_train_data = np.hstack(
            (train_data[:, important_index][:, sorted_index], pca_data[: train_data.shape[0], :])
        )
        new_val = np.hstack(
            (
                validation[:, important_index][:, sorted_index],
                pca_data[train_data.shape[0] : train_data.shape[0] + validation.shape[0], :],
            )
        )
        new_test = np.hstack(
            (
                test[:, important_index][:, sorted_index],
                pca_data[train_data.shape[0] + validation.shape[0] : -unlabel.shape[0], :],
            )
        )
        new_unlabel = np.hstack((unlabel[:, important_index][:, sorted_index], pca_data[-unlabel.shape[0] :, :]))
        logging.info("after mix_pca_gbdt dim-reducing : (%d %d)" % (new_train_data.shape))
        io.store([new_train_data, new_val, new_test, new_unlabel], ROOT + "/data/mix_pca_gbdt")
    return new_train_data, new_val, new_test, new_unlabel
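
The important columns are kept untouched while only the remaining columns go through PCA; because pca_data stacks the train, validation, test and unlabel rows in that order, slicing by the original row counts maps each block back to its split. A small synthetic check of that bookkeeping (shapes are illustrative only, not from the source):

import numpy as np

n_train, n_val, n_test, n_unlabel, dim = 6, 3, 4, 2, 5
pca_data = np.zeros((n_train + n_val + n_test + n_unlabel, dim))

assert pca_data[:n_train].shape == (n_train, dim)
assert pca_data[n_train:n_train + n_val].shape == (n_val, dim)
assert pca_data[n_train + n_val:-n_unlabel].shape == (n_test, dim)
assert pca_data[-n_unlabel:].shape == (n_unlabel, dim)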
Example #7
if __name__ == '__main__':
    data, train_number, val_number, test_number, unlabel_number, label, uid = datahandler.clean_data()
    train = data[:train_number, :]
    validation = data[train_number:train_number + val_number, :]
    test = data[train_number + val_number:-unlabel_number, :]
    unlabel = data[-unlabel_number:, :]

    val_label = pd.read_csv('../../data/val_cv_y.csv').y.values

    io.store([train, label, validation, val_label, test, unlabel], '../../data/data_standard')
    train, validation, test, unlabel = decomposition.gbdt_dimreduce_threshold(train, label, validation, test, unlabel)
    io.store([train, label, validation, val_label, test, unlabel], '../../data/data_standard_decompose')
    # train, validation, test, unlabel = split.split_continuum_value_tvt (train, validation, test, unlabel)
    
    train_data, train_label, validation_data, validation_label, test, unlabel = io.grab('../../data/data_standard')
    print('training set:', train_data.shape)
    print('validation set:', validation_data.shape)
    print('testing set:', test.shape)
    print('unlabel set:', unlabel.shape)

    assert train_data.shape[0] == len(train_label)
    assert validation_data.shape[0] == len(validation_label)



    """
    train: traning set
    validation: validation set, used for testing, the label is in 'val_cv_y.csv'
    test: testing set
    unlabel: unlabel set