Example #1
def write_feature_importance_files(n_est=10, max_depth=None):
    # aggregated-feature files are expected to be named like "<prefix>_<id>"; collect the numeric ids in order
    profile_ids = sorted(map(lambda x: int(x.split("_")[1]), os.listdir(param.FEATURE_AGG_PATH)))
    user_feature_df = loader.read_csv_dataset(profile_ids=profile_ids)
    print "user feature data set is loaded"
    # mod_feature_dfs = {mod: loader.read_csv_dataset(profile_ids=profile_ids, modality=mod)
    #                    for mod in info.FREE_MODE_LIST}
    # print "each modality feature data sets are loaded"
    p_df = loader.load_profile_info()
    print "profile information is loaded"

    target_labels = info.LABEL_LIST
    # target_labels = ['gender', 'age', 'job', 'religion', 'marriage', 'numberOfChildren', 'income', 'education']
    # target_label = 'religion'

    ranking_limits = [-1]
    # min_not_nans = [-1, 10]

    if not os.path.isdir(param.IMPORTANCE_PATH):
        os.makedirs(param.IMPORTANCE_PATH)

    for target_label in target_labels:
        features_powers_lr = fimp.compute_randomized_lr_score(user_feature_df, p_df, target_label)
        # features_powers_mi = fimp.compute_mics(user_feature_df, p_df, target_label)
        # features_powers_fs_10 = fimp.compute_fscore(user_feature_df, p_df, target_label, min_not_nan=10)

        for r_l in ranking_limits:
            # r_l == -1 means "keep every feature"; iloc[:-1] would wrongly drop the last row
            ranked = features_powers_lr if r_l == -1 else features_powers_lr.iloc[:r_l]
            ranked.to_csv(
                    "%s/%s_%s_%s-%s.csv" % (param.IMPORTANCE_PATH, target_label, "withAppFeat",
                                            "LR", r_l if r_l > -1 else "all"))
Example #2
def test_feature_set_performance(target_label=info.LABEL_GEN, base_fset=param.FEATURE_SET_ORIGINAL,
                                 reduced=False, fillna=True, scaling=True,
                                 feat_sel=False, feat_num=None, with_pred=False, clf_name='LR', reg_param=1.0):
    p_df = loader.load_profile_info()
    print "profile information is loaded"

    feature_set_name = base_fset
    feature_set_name += param.REDUCED_SUFFIX if reduced else ""
    feature_set_name += param.FILL_SUFFIX if fillna else ""
    feature_set_name += param.SCALING_SUFFIX if scaling else ""
    print feature_set_name

    user_feature_df = loader.read_csv_feature_set(feature_set_name)
    # column labels are profile ids stored as strings in the CSV
    user_feature_df.columns = map(int, user_feature_df.columns)
    # user_feature_df = loader.read_csv_feature_set(param.FEATURE_SET_EXTENSION_APP,
    #                                               fill_na=True, normalize=False)
    print "user feature data set is loaded"

    if with_pred:
        pred_df = pd.read_csv(param.DATA_PATH + "/predictions/item_preds.csv", index_col='profile_id')
        # pred_df = pd.read_csv(param.DATA_PATH + "/predictions/content_preds.csv", index_col='profile_id')
        for col in pred_df.columns:
            # re-code each prediction column as the rank of its value among the sorted unique values
            uls = sorted(pred_df[col].unique())
            pred_df.loc[:, col] = pred_df[col].apply(lambda x: uls.index(x))

        # keep only the users that have predictions (users are columns in the feature frame)
        p_df = p_df.loc[pred_df.index]
        user_feature_df = user_feature_df[pred_df.index]

        # give the predictions the same 4-level column index as the feature set,
        # then append them to the feature frame as extra feature rows
        pred_df.columns = [[info.APP] * len(pred_df.columns), ['itemBased_prediction'] * len(pred_df.columns),
                           list(pred_df.columns), [feat.NOMINAL_VAL] * len(pred_df.columns)]
        pred_df.columns.names = ['modality', 'field', 'feature', 'type']
        user_feature_df = pd.concat([user_feature_df.T, pred_df], axis=1).T

    # method_types = ["LR", "MI", "MI-min10", "FS", "FS-min10", "RF-100"]
    method_type = "MI" if feat_sel else None
    cv = 10
    repeat_num = 20
    nf = feat_num if feat_sel else None

    if not os.path.isdir(param.EXPERIMENT_PATH):
        os.makedirs(param.EXPERIMENT_PATH)

    print "\nlabel, fillna, scaling, feat_sel, clf_name, reg_param, k-CV, ith-fold, featNum, accuracy"

    for repeat in range(repeat_num):
        temp_score = clf.classify(user_feature_df, p_df, feature_set_name, features=None, label=target_label,
                                  reg_param=reg_param, selection=feat_sel, num_feat=nf,
                                  sel_method=method_type, cv=cv)
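
clf.classify is defined elsewhere in the project; a plausible sketch under the pre-0.18 sklearn.cross_validation API (the same module imported in Example #7 below), ignoring the feature-selection arguments, is:

from sklearn.cross_validation import StratifiedKFold
from sklearn.linear_model import LogisticRegression

def classify(feature_df, profile_df, label, reg_param=1.0, cv=10):
    X = feature_df.T.fillna(0).values              # users x features
    y = profile_df.loc[feature_df.columns, label].values
    fold_scores = []
    for train_idx, test_idx in StratifiedKFold(y, n_folds=cv):
        model = LogisticRegression(C=reg_param)
        model.fit(X[train_idx], y[train_idx])
        fold_scores.append(model.score(X[test_idx], y[test_idx]))
    return sum(fold_scores) / float(len(fold_scores))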
Example #3
File: classifier.py  Project: heevery/ohp
    def __init__(self, exp_name="", base_set=param.FEATURE_SET_ORIGINAL,
                 reduced_set=False, fill=False, scale=True, label_categorize=True):
        self.exp_name = exp_name
        self.feature_set_info = {"base": base_set, "reduced": reduced_set, "fillNa": fill, "scaling": scale}
        self.profile_df = loader.load_profile_info(categorize=label_categorize)
        self.feature_set_df = self.load_feature_set()
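
The load_feature_set method is not included in this excerpt; a hypothetical reconstruction that mirrors the suffix logic of Example #2 would be:

    def load_feature_set(self):
        name = self.feature_set_info["base"]
        name += param.REDUCED_SUFFIX if self.feature_set_info["reduced"] else ""
        name += param.FILL_SUFFIX if self.feature_set_info["fillNa"] else ""
        name += param.SCALING_SUFFIX if self.feature_set_info["scaling"] else ""
        return loader.read_csv_feature_set(name)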
Example #4
        # drop rows that are entirely NaN before the (truncated) computation
        x = df_filtered.dropna(how='all')
        compute(x)

    feature_importances.sort_values('importance', ascending=False, inplace=True, na_position='last')
    return feature_importances


if __name__ == '__main__':
    # quick run: only the first 20 feature files are used
    profile_ids = map(lambda x: int(x.split("_")[1]), os.listdir(param.FEATURE_AGG_PATH))[:20]
    user_feature_df = data_loader.read_csv_dataset(profile_ids=profile_ids)
    # user_feature_df = ida_data_loader.read_csv_dataset(file_name=csv_file)
    print "user feature data set is loaded"
    # mod_feature_dfs = {mod: ida_data_loader.read_csv_dataset(profile_ids=profile_ids, modality=mod)
    #                    for mod in info.FREE_MODE_LIST}
    # print "each modality feature data sets are loaded"
    profile_df = data_loader.load_profile_info()
    print "profile information is loaded"

    # target_labels = info.LABEL_LIST
    target_labels = [info.LABEL_GEN, info.LABEL_AGE]
    # target_labels = ['gender', 'age', 'job', 'religion', 'marriage', 'numberOfChildren', 'income', 'education']

    compute_randomized_lr_score(user_feature_df, profile_df)
    # compute_fscore(user_feature_df, profile_df)
    #
    # ranking_limits = [1, 2, 3, 4, 5, 7, 10, 15, 20, 30, 40, 50, 75, 100]
    # # ranking_limits = [1, 2, 3, 4, 5, 7, 10, 15, 20, 30, 40, 50, 75, 100, 125, 150, 200, 250, 300, 400, 500,
    # #                   750, 1000, 1500, 2000, 3000, 4000, 5000, 7500, 10000, 15000, 20000, 50000, -1]
    #
    # if not os.path.isdir(param.IMPORTANCE_PATH):
    #     os.makedirs(param.IMPORTANCE_PATH)
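
The feature_importances frame sorted in this example could plausibly come from a random-forest ranking (the n_est and max_depth parameters in Example #1 hint at one); a hypothetical sketch:

from pandas import DataFrame
from sklearn.ensemble import RandomForestClassifier

def rf_feature_importances(X, y, feature_names, n_est=10, max_depth=None):
    # one importance value per feature; sorting is left to the caller
    rf = RandomForestClassifier(n_estimators=n_est, max_depth=max_depth, random_state=0)
    rf.fit(X, y)
    return DataFrame({'importance': rf.feature_importances_}, index=feature_names)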
Example #5
def insert_new_features_to_csv(modality, file_name):
    p_df = loader.load_profile_info()
    ids = sorted(p_df.index)
Example #6
def data_set_random_partition_algorithm():
    user_table = loader.load_profile_info()
    train, test, error = train_test_random_partition(user_table)
    write_data_set_partition(train, test, error)
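
train_test_random_partition and write_data_set_partition live elsewhere in the project; a minimal sketch of the partition step, with the third (error) return value assumed to collect unassignable users, might be:

import numpy as np

def train_test_random_partition(user_table, test_fraction=0.2, seed=None):
    ids = np.array(user_table.index)
    np.random.RandomState(seed).shuffle(ids)
    n_test = int(len(ids) * test_fraction)
    # 'error' is assumed to hold users that cannot be partitioned; empty here
    return user_table.loc[ids[n_test:]], user_table.loc[ids[:n_test]], user_table.iloc[0:0]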
Example #7
import os
import sys
import time

import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.cross_validation import StratifiedKFold        # pre-0.18 sklearn module
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# project-local helpers (param, loader, info) are imported in the original source


# chunk directories are named "user_<id>"; the file names inside embed a second
# numeric id just before the extension
uid_list = sorted(map(lambda x: int(x.split("_")[1]), os.listdir(param.CHUNK_PATH)))
user_exp_dict = {uid: sorted(map(lambda x: int(x.split("_")[1].split(".")[0]),
                                 os.listdir("%s/user_%s" % (param.CHUNK_PATH, uid)))) for uid in uid_list}

user_info_df = loader.load_profile_info(categorize=False)
user_info_cat_df = loader.load_profile_info(categorize=True)


def current_timestamp():
    return int(round(time.time() * 1000))


def make_data_set():
    x_df = []                                          # feature rows, one per user
    y_df_dict = {lb: [] for lb in info.LABEL_LIST}     # label values per target

    for uid in uid_list:
        print uid, "done"
        sys.stdout.flush()
        u_info_series = user_info_df.query('profileId == %s' % uid).iloc[0]
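
Example #7 also imports StratifiedShuffleSplit; under the pre-0.18 API the splitter is constructed from the label vector itself, e.g. (illustrative target column assumed):

y = user_info_cat_df[info.LABEL_GEN].values
for train_idx, test_idx in StratifiedShuffleSplit(y, n_iter=10, test_size=0.2, random_state=0):
    pass  # fit and evaluate one model per stratified split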
Example #8
    if info.MOD_FIELD_TYPE[modality][field] in info.VAR_NUMERIC:
        # numeric fields are discretized by frequency-based split points:
        # k split points produce k + 1 bin indices
        bin_split_dic = feat.BIN_INFO['freq']
        categories += range(len(bin_split_dic[(modality, field)]) + 1)
    elif info.MOD_FIELD_TYPE[modality][field] in info.VAR_CATEGORICAL:
        if info.MOD_FIELD_TYPE[modality][field] == info.VAR_BINARY:
            categories += range(2)   # binary fields take the values 0 and 1
        else:
            pass                     # other categorical fields keep their own category set
    return categories
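
# To make the bin arithmetic above concrete: k split points define k + 1 bins.
# Illustrative (assumed) split points, not the project's real BIN_INFO values:
#     bin_splits = [60, 300, 1800]
#     categories = range(len(bin_splits) + 1)   # -> [0, 1, 2, 3]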


if __name__ == '__main__':
    # Python 2 idiom: force utf-8 as the default string encoding
    reload(sys)
    sys.setdefaultencoding('utf-8')

    profile_df = loader.load_profile_info()
    target_tables = info.MOD_LIST
    if param.PERMISSION_FREE:
        # restrict to the modalities available without special permissions
        target_tables = info.FREE_MODE_LIST

    data_set = None
    # file_names = os.listdir('data_set/features/')
    # profile_ids = []
    # for file_name in file_names:
    #     modiff_time = datetime.datetime.strptime(time.ctime(os.path.getmtime('data_set/features/%s' % file_name)),
    #                                              "%a %b %d %H:%M:%S %Y")
    #     if (datetime.datetime.now() - modiff_time).seconds > 3600:
    #         profile_ids.append(file_name)
    # profile_ids = map(lambda x: int(x.split("_")[1]), profile_ids)
    # profile_ids.sort()