def main(args):
    """Load the vaccine survey CSVs, split them, and pass them to util.preprocess.

    Args:
        args: Namespace-like object. This function reads ``args.test_size``,
            ``args.seed`` and ``args.processing``.
    """
    # retrieve dataset
    start = time.time()
    # Plain relative paths: a single-argument os.path.join() returns its
    # argument unchanged, so wrapping the file names in it added nothing.
    feature_df = pd.read_csv('training_set_features.csv')
    label_df = pd.read_csv('training_set_labels.csv')
    # Left join keeps every feature row, matched to labels on respondent_id.
    data_df = feature_df.merge(label_df, on='respondent_id', how='left')
    print('\ntime to read in data...{:.3f}s'.format(time.time() - start))

    # split data into train and test, stratified on the label column
    train_df, test_df = train_test_split(data_df,
                                         test_size=args.test_size,
                                         random_state=args.seed,
                                         stratify=data_df['seasonal_vaccine'])

    # get features
    columns = list(train_df.columns)

    # remove select columns (the join key is not used as a feature)
    remove_cols = ['respondent_id']
    if remove_cols:
        train_df = train_df.drop(columns=remove_cols)
        test_df = test_df.drop(columns=remove_cols)
        columns = [x for x in columns if x not in remove_cols]

    # categorize attributes: every remaining column that is neither numeric
    # nor the label is treated as categorical
    features = {}
    features['label'] = ['seasonal_vaccine']
    features['numeric'] = []
    features['categorical'] = list(
        set(columns) - set(features['numeric']) - set(features['label']))

    util.preprocess(train_df, test_df, features, processing=args.processing)
def main(args):
    """Split the Surgical-deepnet table and pass it to ``util.preprocess``.

    Args:
        args: Namespace-like object. This function reads ``args.test_size``,
            ``args.seed`` and ``args.processing``.
    """
    label_cols = ['complication']
    numeric_cols = [
        'bmi', 'Age', 'ccsComplicationRate', 'ccsMort30Rate',
        'complication_rsi', 'hour', 'mortality_rsi'
    ]
    drop_cols = []  # columns to discard before preprocessing (none here)

    # load the raw table and report how long the read took
    t0 = time.time()
    raw_df = pd.read_csv('Surgical-deepnet.csv')
    elapsed = time.time() - t0
    print('\ntime to read in data...{:.3f}s'.format(elapsed))

    # stratified train/test partition on the label column
    train_df, test_df = train_test_split(
        raw_df,
        test_size=args.test_size,
        random_state=args.seed,
        stratify=raw_df['complication'],
    )

    all_cols = list(train_df.columns)

    if drop_cols:
        train_df = train_df.drop(columns=drop_cols)
        test_df = test_df.drop(columns=drop_cols)
        all_cols = [name for name in all_cols if name not in drop_cols]

    # whatever is neither numeric nor the label is treated as categorical
    remaining = set(all_cols) - set(numeric_cols)
    remaining = remaining - set(label_cols)

    features = {
        'label': label_cols,
        'numeric': numeric_cols,
        'categorical': list(remaining),
    }

    util.preprocess(train_df, test_df, features, processing=args.processing)
def main(args):
    """Split the Telco customer-churn table and pass it to ``util.preprocess``.

    Args:
        args: Namespace-like object. This function reads ``args.test_size``,
            ``args.seed`` and ``args.processing``.
    """
    label_cols = ['Churn']
    numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
    drop_cols = ['customerID']

    # load the raw table and report how long the read took
    t0 = time.time()
    raw_df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')
    elapsed = time.time() - t0
    print('\ntime to read in data...{:.3f}s'.format(elapsed))

    # TotalCharges is converted to numeric; entries that cannot be parsed
    # become NaN because of errors='coerce'
    total_charges = pd.to_numeric(raw_df['TotalCharges'], errors='coerce')
    raw_df['TotalCharges'] = total_charges

    # stratified train/test partition on the label column
    train_df, test_df = train_test_split(
        raw_df,
        test_size=args.test_size,
        random_state=args.seed,
        stratify=raw_df['Churn'],
    )

    all_cols = list(train_df.columns)

    if drop_cols:
        train_df = train_df.drop(columns=drop_cols)
        test_df = test_df.drop(columns=drop_cols)
        all_cols = [name for name in all_cols if name not in drop_cols]

    # whatever is neither numeric nor the label is treated as categorical
    remaining = set(all_cols) - set(numeric_cols)
    remaining = remaining - set(label_cols)

    features = {
        'label': label_cols,
        'numeric': numeric_cols,
        'categorical': list(remaining),
    }

    util.preprocess(train_df, test_df, features, processing=args.processing)
def main(args):
    """Load the adult train/test files and pass them to ``util.preprocess``.

    Args:
        args: Namespace-like object. This function reads ``args.processing``.
    """
    # column names assigned to the header-less data files
    columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
               'marital-status', 'occupation', 'relationship', 'race', 'sex',
               'capital-gain', 'capital-loss', 'hours-per-week',
               'native-country', 'label']
    label_cols = ['label']
    numeric_cols = ['age', 'fnlwgt', 'education-num', 'capital-gain',
                    'capital-loss', 'hours-per-week']
    drop_cols = []  # columns to discard before preprocessing (none here)

    # load both files and report how long the reads took; the first line of
    # the test file is skipped
    t0 = time.time()
    train_df = pd.read_csv('adult.data', header=None, names=columns)
    test_df = pd.read_csv('adult.test', header=None, names=columns,
                          skiprows=1)
    elapsed = time.time() - t0
    print('\ntime to read in data...{:.3f}s'.format(elapsed))

    # strip every '.' from the test labels
    # NOTE(review): presumably so they match the train labels - confirm
    test_df['label'] = test_df['label'].apply(
        lambda value: value.replace('.', ''))

    if drop_cols:
        train_df = train_df.drop(columns=drop_cols)
        test_df = test_df.drop(columns=drop_cols)
        columns = [name for name in columns if name not in drop_cols]

    # whatever is neither numeric nor the label is treated as categorical
    remaining = set(columns) - set(numeric_cols)
    remaining = remaining - set(label_cols)

    features = {
        'label': label_cols,
        'numeric': numeric_cols,
        'categorical': list(remaining),
    }

    util.preprocess(train_df, test_df, features, processing=args.processing)
def main(args):
    """Split the bank-additional table and pass it to ``util.preprocess``.

    Args:
        args: Namespace-like object. This function reads ``args.test_size``,
            ``args.seed`` and ``args.processing``.
    """
    label_cols = ['y']
    numeric_cols = [
        'age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate',
        'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed'
    ]
    drop_cols = []  # columns to discard before preprocessing (none here)

    # load the semicolon-separated table and report how long the read took
    t0 = time.time()
    raw_df = pd.read_csv('bank-additional_bank-additional-full.csv', sep=';')
    elapsed = time.time() - t0
    print('\ntime to read in data...{:.3f}s'.format(elapsed))

    # stratified train/test partition on the label column
    train_df, test_df = train_test_split(
        raw_df,
        test_size=args.test_size,
        random_state=args.seed,
        stratify=raw_df['y'],
    )

    all_cols = list(train_df.columns)

    if drop_cols:
        train_df = train_df.drop(columns=drop_cols)
        test_df = test_df.drop(columns=drop_cols)
        all_cols = [name for name in all_cols if name not in drop_cols]

    # whatever is neither numeric nor the label is treated as categorical
    remaining = set(all_cols) - set(numeric_cols)
    remaining = remaining - set(label_cols)

    features = {
        'label': label_cols,
        'numeric': numeric_cols,
        'categorical': list(remaining),
    }

    util.preprocess(train_df, test_df, features, processing=args.processing)
def main(args):
    """Load pre-split train/test CSVs and pass them to ``util.preprocess``.

    Args:
        args: Namespace-like object. This function reads ``args.processing``.
    """
    label_cols = ['ACTION']
    numeric_cols = []  # no numeric columns declared for this dataset
    drop_cols = []  # columns to discard before preprocessing (none here)

    # load both files and report how long the reads took
    t0 = time.time()
    train_df = pd.read_csv('train.csv')
    test_df = pd.read_csv('test.csv')
    elapsed = time.time() - t0
    print('\ntime to read in data...{:.3f}s'.format(elapsed))

    # the feature inventory is taken from the training frame
    all_cols = list(train_df.columns)

    if drop_cols:
        train_df = train_df.drop(columns=drop_cols)
        test_df = test_df.drop(columns=drop_cols)
        all_cols = [name for name in all_cols if name not in drop_cols]

    # whatever is neither numeric nor the label is treated as categorical
    remaining = set(all_cols) - set(numeric_cols)
    remaining = remaining - set(label_cols)

    features = {
        'label': label_cols,
        'numeric': numeric_cols,
        'categorical': list(remaining),
    }

    util.preprocess(train_df, test_df, features, processing=args.processing)
def main(args):
    """Load the census-income train/test files and pass them to ``util.preprocess``.

    Args:
        args: Namespace-like object. This function reads ``args.processing``.
    """
    # column names assigned to the header-less data files
    columns = [
        'age', 'workclass', 'industry_code', 'occupation_code', 'education',
        'wage_per_hour', 'enrolled_in_edu', 'marital_status',
        'major_industry_code', 'major_occupation_code', 'race',
        'hispanic_origin', 'sex', 'union_member', 'unemployment_reason',
        'employment', 'capital_gain', 'capital_loss', 'dividends',
        'tax_staus', 'prev_region', 'prev_state', 'household_stat',
        'household_summary', 'weight', 'migration_msa', 'migration_reg',
        'migration_reg_move', '1year_house', 'prev_sunbelt',
        'n_persons_employer', 'parents', 'father_birth', 'mother_birth',
        'self_birth', 'citizenship', 'income', 'business', 'taxable_income',
        'veterans_admin', 'veterans_benfits', 'label'
    ]
    label_cols = ['label']
    numeric_cols = [
        'age', 'wage_per_hour', 'capital_gain', 'capital_loss', 'dividends',
        'weight', 'n_persons_employer'
    ]
    drop_cols = []  # columns to discard before preprocessing (none here)

    # load both files and report how long the reads took
    t0 = time.time()
    train_df = pd.read_csv('census-income.data', header=None, names=columns)
    test_df = pd.read_csv('census-income.test', header=None, names=columns)
    elapsed = time.time() - t0
    print('\ntime to read in data...{:.3f}s'.format(elapsed))

    if drop_cols:
        train_df = train_df.drop(columns=drop_cols)
        test_df = test_df.drop(columns=drop_cols)
        columns = [name for name in columns if name not in drop_cols]

    # whatever is neither numeric nor the label is treated as categorical
    remaining = set(columns) - set(numeric_cols)
    remaining = remaining - set(label_cols)

    features = {
        'label': label_cols,
        'numeric': numeric_cols,
        'categorical': list(remaining),
    }

    util.preprocess(train_df, test_df, features, processing=args.processing)