Пример #1
0
def main(args):
    """Load the vaccine survey CSVs, split them, and call ``util.preprocess``.

    Reads ``training_set_features.csv`` and ``training_set_labels.csv``
    (relative paths), left-joins the labels onto the features by
    ``respondent_id``, and makes a train/test split stratified on
    ``seasonal_vaccine``.

    Args:
        args: Object providing ``test_size`` and ``seed`` (forwarded to
            ``train_test_split``) and ``processing`` (forwarded to
            ``util.preprocess``).
    """
    # retrieve dataset
    start = time.time()
    feature_df = pd.read_csv('training_set_features.csv')
    label_df = pd.read_csv('training_set_labels.csv')
    data_df = feature_df.merge(label_df, on='respondent_id', how='left')
    print('\ntime to read in data...{:.3f}s'.format(time.time() - start))

    # split data into train and test
    train_df, test_df = train_test_split(data_df,
                                         test_size=args.test_size,
                                         random_state=args.seed,
                                         stratify=data_df['seasonal_vaccine'])

    # remove select columns
    remove_cols = ['respondent_id']
    if remove_cols:
        train_df = train_df.drop(columns=remove_cols)
        test_df = test_df.drop(columns=remove_cols)

    # remaining columns, in their on-disk order
    columns = list(train_df.columns)

    # categorize attributes
    features = {}
    features['label'] = ['seasonal_vaccine']
    features['numeric'] = []

    # Filter in column order rather than via set difference: set iteration
    # order for strings changes between interpreter runs, which would make
    # the categorical feature order non-reproducible.
    non_categorical = set(features['numeric']) | set(features['label'])
    features['categorical'] = [
        col for col in columns if col not in non_categorical
    ]

    util.preprocess(train_df, test_df, features, processing=args.processing)
Пример #2
0
def main(args):
    """Load ``Surgical-deepnet.csv``, split it, and call ``util.preprocess``.

    The train/test split is stratified on the ``complication`` column,
    which is also used as the label.

    Args:
        args: Object providing ``test_size`` and ``seed`` (forwarded to
            ``train_test_split``) and ``processing`` (forwarded to
            ``util.preprocess``).
    """
    # retrieve dataset
    start = time.time()
    data_df = pd.read_csv('Surgical-deepnet.csv')
    print('\ntime to read in data...{:.3f}s'.format(time.time() - start))

    # split data into train and test
    train_df, test_df = train_test_split(data_df,
                                         test_size=args.test_size,
                                         random_state=args.seed,
                                         stratify=data_df['complication'])

    # remove select columns (none for this dataset; kept as a hook)
    remove_cols = []
    if remove_cols:
        train_df = train_df.drop(columns=remove_cols)
        test_df = test_df.drop(columns=remove_cols)

    # remaining columns, in their on-disk order
    columns = list(train_df.columns)

    # categorize attributes
    features = {}
    features['label'] = ['complication']
    features['numeric'] = [
        'bmi', 'Age', 'ccsComplicationRate', 'ccsMort30Rate',
        'complication_rsi', 'hour', 'mortality_rsi'
    ]

    # Filter in column order rather than via set difference: set iteration
    # order for strings changes between interpreter runs, which would make
    # the categorical feature order non-reproducible.
    non_categorical = set(features['numeric']) | set(features['label'])
    features['categorical'] = [
        col for col in columns if col not in non_categorical
    ]

    util.preprocess(train_df, test_df, features, processing=args.processing)
Пример #3
0
def main(args):
    """Load the Telco churn CSV, split it, and call ``util.preprocess``.

    ``TotalCharges`` is coerced to numeric before splitting, so values that
    cannot be parsed become NaN. The split is stratified on ``Churn``,
    which is also used as the label.

    Args:
        args: Object providing ``test_size`` and ``seed`` (forwarded to
            ``train_test_split``) and ``processing`` (forwarded to
            ``util.preprocess``).
    """
    # retrieve dataset
    start = time.time()
    data_df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
    print('\ntime to read in data...{:.3f}s'.format(time.time() - start))

    # fix numeric column containing empty strings (they become NaN)
    data_df['TotalCharges'] = pd.to_numeric(data_df['TotalCharges'],
                                            errors='coerce')

    # split data into train and test
    train_df, test_df = train_test_split(data_df,
                                         test_size=args.test_size,
                                         random_state=args.seed,
                                         stratify=data_df['Churn'])

    # remove select columns
    remove_cols = ['customerID']
    if remove_cols:
        train_df = train_df.drop(columns=remove_cols)
        test_df = test_df.drop(columns=remove_cols)

    # remaining columns, in their on-disk order
    columns = list(train_df.columns)

    # categorize attributes
    features = {}
    features['label'] = ['Churn']
    features['numeric'] = ['tenure', 'MonthlyCharges', 'TotalCharges']

    # Filter in column order rather than via set difference: set iteration
    # order for strings changes between interpreter runs, which would make
    # the categorical feature order non-reproducible.
    non_categorical = set(features['numeric']) | set(features['label'])
    features['categorical'] = [
        col for col in columns if col not in non_categorical
    ]

    util.preprocess(train_df, test_df, features, processing=args.processing)
Пример #4
0
def main(args):
    """Load ``adult.data`` / ``adult.test`` and call ``util.preprocess``.

    Both files are read without a header row and given explicit column
    names. The dataset ships with its own train/test split, so no
    splitting is done here.

    Args:
        args: Object providing ``processing``, which is forwarded to
            ``util.preprocess``.
    """
    # column names, in file order (the files carry no header row)
    columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
               'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
               'hours-per-week', 'native-country', 'label']

    # retrieve dataset; the first line of adult.test is skipped
    # (presumably a non-data line -- confirm against the file)
    start = time.time()
    train_df = pd.read_csv('adult.data', header=None, names=columns)
    test_df = pd.read_csv('adult.test', header=None, names=columns, skiprows=1)
    print('\ntime to read in data...{:.3f}s'.format(time.time() - start))

    # strip '.' characters from the test labels
    # (presumably so they match the training labels -- confirm)
    test_df['label'] = test_df['label'].apply(lambda x: x.replace('.', ''))

    # remove select columns (none for this dataset; kept as a hook)
    remove_cols = []
    if remove_cols:
        train_df = train_df.drop(columns=remove_cols)
        test_df = test_df.drop(columns=remove_cols)
        columns = [x for x in columns if x not in remove_cols]

    # categorize attributes
    features = {}
    features['label'] = ['label']
    features['numeric'] = ['age', 'fnlwgt', 'education-num', 'capital-gain',
                           'capital-loss', 'hours-per-week']

    # Filter in column order rather than via set difference: set iteration
    # order for strings changes between interpreter runs, which would make
    # the categorical feature order non-reproducible.
    non_categorical = set(features['numeric']) | set(features['label'])
    features['categorical'] = [
        col for col in columns if col not in non_categorical
    ]

    util.preprocess(train_df, test_df, features, processing=args.processing)
Пример #5
0
def main(args):
    """Load the bank marketing CSV, split it, and call ``util.preprocess``.

    The file is semicolon-delimited. The train/test split is stratified on
    the ``y`` column, which is also used as the label.

    Args:
        args: Object providing ``test_size`` and ``seed`` (forwarded to
            ``train_test_split``) and ``processing`` (forwarded to
            ``util.preprocess``).
    """
    # retrieve dataset
    start = time.time()
    df = pd.read_csv('bank-additional_bank-additional-full.csv', sep=';')
    print('\ntime to read in data...{:.3f}s'.format(time.time() - start))

    # split data into train and test
    train_df, test_df = train_test_split(df,
                                         test_size=args.test_size,
                                         random_state=args.seed,
                                         stratify=df['y'])

    # remove select columns (none for this dataset; kept as a hook)
    remove_cols = []
    if remove_cols:
        train_df = train_df.drop(columns=remove_cols)
        test_df = test_df.drop(columns=remove_cols)

    # remaining columns, in their on-disk order
    columns = list(train_df.columns)

    # categorize attributes
    features = {}
    features['label'] = ['y']
    features['numeric'] = [
        'age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate',
        'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed'
    ]

    # Filter in column order rather than via set difference: set iteration
    # order for strings changes between interpreter runs, which would make
    # the categorical feature order non-reproducible.
    non_categorical = set(features['numeric']) | set(features['label'])
    features['categorical'] = [
        col for col in columns if col not in non_categorical
    ]

    util.preprocess(train_df, test_df, features, processing=args.processing)
Пример #6
0
def main(args):
    """Load ``train.csv`` / ``test.csv`` and call ``util.preprocess``.

    The data comes pre-split, so no splitting is done here. The label is
    ``ACTION``; every other column of the training frame is treated as
    categorical.

    Args:
        args: Object providing ``processing``, which is forwarded to
            ``util.preprocess``.
    """
    # retrieve dataset
    start = time.time()
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')
    print('\ntime to read in data...{:.3f}s'.format(time.time() - start))

    # remove select columns (none for this dataset; kept as a hook)
    remove_cols = []
    if remove_cols:
        train_df = train_df.drop(columns=remove_cols)
        test_df = test_df.drop(columns=remove_cols)

    # remaining training columns, in their on-disk order
    columns = list(train_df.columns)

    # categorize attributes
    features = {}
    features['label'] = ['ACTION']
    features['numeric'] = []

    # Filter in column order rather than via set difference: set iteration
    # order for strings changes between interpreter runs, which would make
    # the categorical feature order non-reproducible.
    non_categorical = set(features['numeric']) | set(features['label'])
    features['categorical'] = [
        col for col in columns if col not in non_categorical
    ]

    util.preprocess(train_df, test_df, features, processing=args.processing)
Пример #7
0
def main(args):
    """Load ``census-income.data`` / ``.test`` and call ``util.preprocess``.

    Both files are read without a header row and given explicit column
    names. The data comes pre-split, so no splitting is done here.

    Args:
        args: Object providing ``processing``, which is forwarded to
            ``util.preprocess``.
    """
    # column names, in file order (the files carry no header row)
    columns = [
        'age', 'workclass', 'industry_code', 'occupation_code', 'education',
        'wage_per_hour', 'enrolled_in_edu', 'marital_status',
        'major_industry_code', 'major_occupation_code', 'race',
        'hispanic_origin', 'sex', 'union_member', 'unemployment_reason',
        'employment', 'capital_gain', 'capital_loss', 'dividends', 'tax_staus',
        'prev_region', 'prev_state', 'household_stat', 'household_summary',
        'weight', 'migration_msa', 'migration_reg', 'migration_reg_move',
        '1year_house', 'prev_sunbelt', 'n_persons_employer', 'parents',
        'father_birth', 'mother_birth', 'self_birth', 'citizenship', 'income',
        'business', 'taxable_income', 'veterans_admin', 'veterans_benfits',
        'label'
    ]

    # retrieve dataset
    start = time.time()
    train_df = pd.read_csv('census-income.data', header=None, names=columns)
    test_df = pd.read_csv('census-income.test', header=None, names=columns)
    print('\ntime to read in data...{:.3f}s'.format(time.time() - start))

    # remove select columns (none for this dataset; kept as a hook)
    remove_cols = []
    if remove_cols:
        train_df = train_df.drop(columns=remove_cols)
        test_df = test_df.drop(columns=remove_cols)
        columns = [x for x in columns if x not in remove_cols]

    # categorize attributes
    features = {}
    features['label'] = ['label']
    features['numeric'] = [
        'age', 'wage_per_hour', 'capital_gain', 'capital_loss', 'dividends',
        'weight', 'n_persons_employer'
    ]

    # Filter in column order rather than via set difference: set iteration
    # order for strings changes between interpreter runs, which would make
    # the categorical feature order non-reproducible.
    non_categorical = set(features['numeric']) | set(features['label'])
    features['categorical'] = [
        col for col in columns if col not in non_categorical
    ]

    util.preprocess(train_df, test_df, features, processing=args.processing)