def run(model_fname, out_csv, num_iteration=-1):
    """Predict on the test set with a saved model and write a submission CSV.

    model_fname: path of the saved (LightGBM-style) model to load.
    out_csv: destination path of the submission file.
    num_iteration: iteration to predict with; -1 keeps the model's default.
    """
    test_df = data2.load('test')
    click_ids = data2.load_click_ids()
    m = load_model(model_fname)
    predictors = m.feature_name()
    print('test_df: ', len(test_df))
    print('predictors: ', predictors)
    if num_iteration != -1 and num_iteration != m.best_iteration - 1:
        # BUG FIX: the format string was previously passed to print() as a
        # plain positional argument, so the '{}' placeholders were printed
        # literally; use str.format to interpolate the values.
        print('best iter: {}, specified: {}'.format(m.best_iteration,
                                                    num_iteration))
    preds = m.predict(test_df[predictors], num_iteration=num_iteration)
    # generated using map_clickid.ipynb
    mapping = pd.read_csv('../cache/test_mapping.csv')
    print('len before: ', len(mapping))
    mapping = mapping.drop_duplicates(subset=['click_id'])
    print('len after duplicates removed: ', len(mapping))
    mapping = mapping.set_index(['click_id_v0'])
    print(mapping.head(10))
    # Attach raw predictions to their v0 click ids, then map them onto the
    # official click_id space via the de-duplicated mapping table.
    preds_df = pd.DataFrame(preds, columns=['is_attributed'])
    preds_df['click_id_v0'] = click_ids
    print(preds_df[preds_df.click_id_v0 == 21290878])  # spot-check one id
    preds_df = preds_df.set_index(['click_id_v0'])
    preds_df = mapping.join(preds_df, how='left')
    # Re-align on the official click_id order from the raw test file so the
    # submission covers every required row.
    subm = pd.read_csv('../input/test.csv', usecols=['click_id'])
    preds_df = preds_df.reset_index().set_index(['click_id'])
    subm = subm.set_index(['click_id'])
    subm = subm.join(preds_df, how='left')
    subm = subm.reset_index()
    subm[['click_id', 'is_attributed']].to_csv(out_csv, index=False)
    print('saved ', out_csv)
def run_train(days, iterations, seed):
    """Train on a day/hour-restricted slice of the training data.

    days: iterable of day numbers to keep in the training set.
    iterations: number of boosting iterations, forwarded to run().
    seed: RNG seed, forwarded to run().
    """
    trainval_df = data2.load('train')
    target = 'is_attributed'
    categorical = ['app', 'device', 'os', 'channel', 'hour', 'binip']
    excluded = ['click_time', 'ip', 'day']
    # i'm not convinced yet restricting training days is good
    train_cond = (trainval_df.day.isin(days)) & (trainval_df.hour.isin(
        [4, 5, 9, 10, 13, 14]))
    train_df = trainval_df[train_cond]
    # NOTE: train_df is sliced *before* the excluded columns are dropped, so
    # it still carries them; they are only kept out of `predictors` below.
    for column in excluded:
        del trainval_df[column]
    gc.collect()
    predictors = list(
        sorted([c for c in trainval_df.columns if c not in ['is_attributed']]))
    val_dfs = None
    # BUG FIX: a leftover hard-coded `iterations = 57` silently discarded the
    # caller-supplied `iterations` argument; honor the parameter instead.
    run(train_df, val_dfs, predictors, target, categorical, iterations, seed)
# Experiment bootstrap (fragment — the argparse setup defining `args`, and
# the enclosing function if any, lie outside this chunk).
result = {}
result.update(args.__dict__)
start_time = datetime.now().strftime("%d/%m/%y %H:%M:%S")
result['start_time'] = start_time
if args.verbose:
    # Echo the full experiment configuration before starting.
    for key in result:
        print('# {}: {}'.format(key, result[key]))
    print('# Running in: ' + gethostname())
    print('# Start: ' + start_time)
    sys.stdout.flush()
load_params = {}
if args.data == 'epi_ad':
    # This dataset is re-read from the original files, bypassing the pickle.
    load_params = {'read_original': True, 'skip_pickle': True}
data, factors = load(args.data, data_path=args.data_path, log=result,
                     **load_params)
if args.tissue:
    # Restrict samples to a single source tissue.
    data = data[factors['source tissue'] == args.tissue]
    factors = factors[factors['source tissue'] == args.tissue]
target = factors[args.target]
# NOTE(review): old-style splitter signature (labels + n_iter) — presumably
# sklearn.cross_validation.StratifiedShuffleSplit; confirm pinned version.
split = StratifiedShuffleSplit(target, n_iter=1, test_size=args.test_size)
train, test = next(iter(split))
# Keep only the training portion; held-out indices are recorded in the
# result log so a later stage can evaluate on them.
data = data.iloc[train, :]
target = target.iloc[train]
target = LabelEncoder().fit_transform(target)
result['test_samples'] = test.tolist()
data = ExpressionDiscretizer().fit(data).transform(data)
from __future__ import division, print_function

from data2 import load

if __name__ == '__main__':
    # Smoke-test entry point: just load the 'mdd_raw' dataset.
    data, factors = load('mdd_raw')
def main():
    """Recursive feature elimination (RFE) experiment driver.

    Parses CLI options, loads the dataset, then for each stratified shuffle
    split repeatedly fits a scaler + grid-search pipeline and eliminates the
    lowest-ranked features, dumping all results to a JSON file as it goes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--results-path', default='./bucket/results/')
    parser.add_argument('--data')
    # Strip stray quote characters that may wrap the tissue name.
    parser.add_argument('--tissue',
                        type=lambda x: re.sub(r'[\"\']', '', x)
                        if x is not None else None)
    parser.add_argument('--target')
    parser.add_argument('--data-path', default='./bucket/data/')
    parser.add_argument('--verbose', '-v', action='count')
    parser.add_argument('--test-size', type=float, default=0.1)
    parser.add_argument('--n-iter', type=int, default=1)
    parser.add_argument('--n-folds', default=10, type=n_folds_parser)
    parser.add_argument('--clf')
    args = parser.parse_args()
    result = {}
    result.update(args.__dict__)
    start_time = datetime.now().strftime("%d/%m/%y %H:%M:%S")
    result['start_time'] = start_time
    if args.verbose:
        for key in result:
            print('# {}: {}'.format(key, result[key]))
        print('# Running in: ' + gethostname())
        print('# Start: ' + start_time)
    # Pseudo-unique id for the output file: hash of config + random noise.
    experiment_id = hash(json.dumps(result) + str(np.random.rand(10, 1)))
    result_file = join(args.results_path, 'rfe_{}.json'.format(experiment_id))
    if args.verbose:
        print('Results will be saved to {}'.format(result_file))
    load_params = {}
    if args.data == 'epi_ad':
        # This dataset is re-read from the original files.
        load_params = {'read_original': True, 'skip_pickle': True}
    data, factors = load(args.data, data_path=args.data_path, log=result,
                         **load_params)
    if args.tissue:
        # Restrict samples to a single source tissue.
        data = data[factors['source tissue'] == args.tissue]
        factors = factors[factors['source tissue'] == args.tissue]
    target = factors[args.target]
    clf, param_grid = choose_classifier(args.clf, result, args.verbose)
    # NOTE(review): old-style splitter signature (labels + n_iter) —
    # presumably sklearn.cross_validation; confirm the pinned version.
    split = StratifiedShuffleSplit(target, n_iter=args.n_iter,
                                   test_size=args.test_size)
    n_features = data.shape[1]
    n_features_to_select = 9
    preprocess_steps = [('scaler', StandardScaler())]
    # RFE
    d0 = datetime.now()
    result['experiments'] = []
    for i, (train, test) in enumerate(split):
        if args.verbose:
            print('### ITERATION {}'.format(i))
        result['experiments'].append({
            'iteration': i,
            'train_samples': data.index[train].tolist(),
            'subsets': []
        })
        # NOTE(review): np.bool / np.int aliases were removed in NumPy 1.24;
        # this only runs on the older NumPy the project targets.
        support_ = np.ones(n_features, dtype=np.bool)
        ranking_ = np.ones(n_features, dtype=np.int)
        for step in subset_sizes(n_features, n_features_to_select):
            if args.verbose:
                print('[{}] Evaluating with {} features and selecting {}.'
                      .format(datetime.now() - d0, np.sum(support_),
                              np.sum(support_) - step))
            # Train with current subset
            pipeline = preprocess_steps + [('grid',
                                            GridWithCoef(clf, param_grid,
                                                         cv=args.n_folds))]
            pipeline = Pipeline(pipeline)
            features = np.arange(n_features)[support_]
            pipeline.fit(data.iloc[train, features], target.iloc[train])
            # Save results for current set of features
            grid = pipeline.steps[-1][1]
            result['experiments'][-1]['subsets'].append({
                'n_features': np.sum(support_),
                'features': data.columns[features].tolist(),
                'best_params': grid.best_params_,
                'train': {
                    'y_true': target.iloc[train].tolist(),
                    'y_pred': grid.predict(data.iloc[train, features]).tolist()
                },
                'test': {
                    'y_true': target.iloc[test].tolist(),
                    'y_pred': grid.predict(data.iloc[test, features]).tolist()
                }
            })
            # Select best subset
            coef_ = safe_sqr(grid.coef_)
            if coef_.ndim > 1:
                ranks = np.argsort(coef_.sum(axis=0))
            else:
                ranks = np.argsort(coef_)
            # for sparse case ranks is matrix
            ranks = np.ravel(ranks)
            # Eliminate the worse features
            support_[features[ranks][:step]] = False
            ranking_[np.logical_not(support_)] += 1
            # Store results (rewritten after every step so progress survives
            # interruption)
            with open(result_file, 'w') as f:
                json.dump(result, f, sort_keys=True, indent=2,
                          separators=(',', ': '))
    if args.verbose:
        print('# OK')
# Training-script head (fragment — the chunk appears truncated after the
# last comment; the LightGBM dataset construction is outside this view).
import pandas as pd
import numpy as np
import lightgbm as lgb

import data2
import train2
from dataset import *
from util import info

SEED = 0

from sklearn.preprocessing import RobustScaler

if __name__ == '__main__':
    trainval_df = data2.load('train')
    target = 'is_attributed'
    categorical = ['app', 'device', 'os', 'channel', 'hour', 'binip']
    # faster feedback: restrict training to day 8 and a few fixed hours
    train_cond = (trainval_df.day == 8) & (trainval_df.hour.isin(
        [4, 5, 9, 10, 13, 14]))
    train_df = trainval_df[train_cond]
    #train_df = trainval_df.iloc[:-VALID_ROWS]
    #info('shuffling train')
    #train_df = train_df.iloc[np.random.permutation(len(train_df))]
    # used to save memory only, as when building lgbm dataset we specify
    # columns to be used explicitly
# GBT cross-validation script (fragment — `argparse`, `data`, `np`,
# `GradientBoostingRegressor` and `KFold` are imported outside this view).
from sklearn.metrics import mean_squared_error

parser = argparse.ArgumentParser(description='XGBoost for BNP')
parser.add_argument('-f','--n_features', help='Number of features',
                    type=int, default=1200)
parser.add_argument('-n','--n_rounds', help='Number of Boost iterations',
                    type=int, default=2000)
parser.add_argument('-e','--eta', help='Learning rate',
                    type=float, default=0.01)
parser.add_argument('-r','--r_seed', help='Set random seed',
                    type=int, default=3)
parser.add_argument('-b','--minbin', help='Minimum categorical bin size',
                    type=int, default=1)
parser.add_argument('-ct','--cat_trans', help='Category transformation method',
                    type=str, default='std')
parser.add_argument('-cv','--cv', action='store_true')
parser.add_argument('-codetest','--codetest', action='store_true')
parser.add_argument('-getcached', '--getcached', action='store_true')
m_params = vars(parser.parse_args())

# Load data
X, y, X_sub, ids = data.load(m_params)

print("Two Sigma: GBT Classifier...\n")
clf = GradientBoostingRegressor(learning_rate=0.02, n_estimators=500,
                                subsample=0.6, min_samples_split=5,
                                min_samples_leaf=1, max_depth=4,
                                random_state=1, verbose=1)

# do cross validation scoring
# NOTE(review): KFold(n, n_folds=...) is the old sklearn.cross_validation
# API — confirm the pinned sklearn version.
kf = KFold(X.shape[0], n_folds=5, shuffle=True, random_state=1)
scr = np.zeros([len(kf)])
oob_pred = np.zeros(X.shape[0])
sub_pred = np.zeros(X_sub.shape[0])
for i, (tr_ix, val_ix) in enumerate(kf):
    clf.fit(X[tr_ix], y[tr_ix])
    pred = clf.predict(X[val_ix].toarray())
    oob_pred[val_ix] = np.array(pred)
    # average the submission predictions over the 5 folds
    sub_pred += clf.predict(X_sub.toarray()) / 5
    scr[i] = mean_squared_error(y[val_ix], np.array(pred))
parser.add_argument('--test-size', type=float, default=0.1) # parser.add_argument('--tissue', type=lambda x: re.sub(r'[\"\']', '', x)) parser.add_argument('--clf') parser.add_argument('--n-folds', default=10, type=n_folds_parser) parser.add_argument('--verbose', '-v', action='count') parser.add_argument('--data-path', default='./bucket/data/') args = parser.parse_args() result = {} result.update(args.__dict__) if args.verbose: for key in result: print('# {}: {}'.format(key, result[key])) print('# Start: ' + datetime.now().strftime("%d/%m/%y %H:%M:%S")) data, factors = load(args.data, data_path=args.data_path, log=result) target = factors[args.target] # if args.tissue is not None: # condition = factors['source tissue'] == args.tissue # if condition.sum() == 0: # result['error'] = '{} is not a valid tissue.'.format(args.tissue) # save_experiment(result, folder=args.results_path, filename=None, error=True, # verbose=args.verbose) # sys.exit() # data = data[condition] # factors = factors[condition] split = StratifiedShuffleSplit(target, n_iter=args.n_iter, test_size=args.test_size)
def main():
    """Greedy backward feature-elimination experiment driver.

    Repeatedly fits a grid-searched classifier on shrinking feature subsets,
    dropping the worst-ranked features in roughly power-of-ten steps, and
    appends each selection round (scores + surviving features) to a JSON log.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--results-path', default='./bucket/results/')
    parser.add_argument('--data')
    parser.add_argument('--target')
    parser.add_argument('--data-path', default='./bucket/data/')
    parser.add_argument('--verbose', '-v', action='count')
    parser.add_argument('--test-size', type=float, default=0.1)
    parser.add_argument('--n-iter', type=int, default=1)
    parser.add_argument('--n-folds', default=10, type=n_folds_parser)
    parser.add_argument('--clf')
    args = parser.parse_args()
    result = {}
    result.update(args.__dict__)
    if args.verbose:
        for key in result:
            print('# {}: {}'.format(key, result[key]))
        print('# Start: ' + datetime.now().strftime("%d/%m/%y %H:%M:%S"))
    result['selections'] = []
    # Pseudo-unique id for the output file: hash of config + random noise.
    experiment_id = hash(json.dumps(result) + str(np.random.rand(10)))
    result_file = join(args.results_path, 'fs_{}.json'.format(experiment_id))
    if args.verbose:
        print('Results will be saved to {}'.format(result_file))
    data, factors = load(args.data, data_path=args.data_path, log=result)
    target = factors[args.target]
    clf, param_grid = choose_classifier(args.clf, result, args.verbose)
    feature_names = data.columns
    # NOTE(review): old-style splitter signature (labels + n_iter) —
    # presumably sklearn.cross_validation; confirm the pinned version.
    split = StratifiedShuffleSplit(target, n_iter=args.n_iter,
                                   test_size=args.test_size)
    n_features = data.shape[1]
    n_features_to_select = 1
    # NOTE(review): np.bool / np.int aliases were removed in NumPy 1.24;
    # this only runs on the older NumPy the project targets.
    support_ = np.ones(n_features, dtype=np.bool)
    ranking_ = np.ones(n_features, dtype=np.int)
    # Elimination
    t0 = time()  # wall-clock start (not referenced again in this view)
    d0 = datetime.now()
    while np.sum(support_) > n_features_to_select:
        # step = largest power of ten strictly below the current count …
        step = 10**int(np.log10(np.sum(support_) - 1))
        # … but first trim the remainder so counts land on round multiples.
        odd_step = np.sum(support_) - step * (np.sum(support_) // step)
        if odd_step > 0:
            step = odd_step
        if args.verbose:
            print('[{}] Selecting best {:d} features.'.format(
                datetime.now() - d0, np.sum(support_) - step))
        # Remaining features
        features = np.arange(n_features)[support_]
        coef_ = None
        test_scores = []
        for train, test in split:
            # Rank the remaining features
            if args.n_folds == 'loo':
                cv = LeaveOneOut(len(train))
            else:
                cv = args.n_folds
            estimator = GridWithCoef(clf, param_grid, cv=cv)
            estimator.fit(data.iloc[train, features], target.iloc[train])
            # Accumulate squared coefficients across splits for ranking.
            if coef_ is None:
                coef_ = safe_sqr(estimator.coef_)
            else:
                coef_ += safe_sqr(estimator.coef_)
            test_scores.append(
                estimator.score(data.iloc[test, features], target.iloc[test]))
        if coef_.ndim > 1:
            ranks = np.argsort(coef_.sum(axis=0))
        else:
            ranks = np.argsort(coef_)
        # for sparse case ranks is matrix
        ranks = np.ravel(ranks)
        # Eliminate the worse features
        threshold = min(step, np.sum(support_) - n_features_to_select)
        support_[features[ranks][:threshold]] = False
        ranking_[np.logical_not(support_)] += 1
        result['selections'].append({
            'scores': test_scores,
            'n_features': np.sum(support_),
            'features': feature_names[support_].tolist()
        })
        # Rewrite the log after every round so progress survives interruption.
        with open(result_file, 'w') as f:
            json.dump(result, f, sort_keys=True, indent=2,
                      separators=(',', ': '))
    if args.verbose:
        print('# OK')
# Follow-up stage of an mRMR experiment (fragment — `result`, `mrmr_results`,
# `args` and `start_time` are defined outside this view). Re-loads the same
# dataset/tissue/target recorded by the mRMR run and rebuilds its train/test
# partition from the stored test-sample indices.
result['data'] = mrmr_results['data']
result['tissue'] = mrmr_results['tissue']
result['target'] = mrmr_results['target']
result['test_samples'] = mrmr_results['test_samples']
if args.verbose:
    for key in result:
        print('# {}: {}'.format(key, result[key]))
    print('# Running in: ' + gethostname())
    print('# Start: ' + start_time)
    sys.stdout.flush()
load_params = {}
if result['data'] == 'epi_ad':
    # This dataset is re-read from the original files.
    load_params = {'read_original': True, 'skip_pickle': True}
data, factors = load(result['data'], data_path=args.data_path, log=result,
                     **load_params)
if result['tissue']:
    # Restrict samples to the tissue used by the original run.
    data = data[factors['source tissue'] == result['tissue']]
    factors = factors[factors['source tissue'] == result['tissue']]
target = factors[result['target']]
clf, param_grid = choose_classifier(args.clf, result, args.verbose)
# Boolean mask: everything except the recorded test samples is training data.
# NOTE(review): np.bool alias removed in NumPy 1.24 — old-NumPy code.
train_samples = np.ones(data.shape[0], dtype=np.bool)
train_samples[result['test_samples']] = False
train_data = data.loc[train_samples, :]
train_target = target.loc[train_samples]
test_data = data.iloc[result['test_samples'], :]
test_target = target.iloc[result['test_samples']]
def main():
    """Filter-based feature-selection experiment driver.

    Scores all features with the chosen univariate filter (ANOVA, info-gain
    or chi-squared), then fits a grid-searched classifier on progressively
    smaller top-scored subsets, logging everything to a JSON file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--results-path', default='./bucket/results/')
    parser.add_argument('--data')
    parser.add_argument('--target')
    # Strip stray quote characters that may wrap the tissue name.
    parser.add_argument('--tissue',
                        type=lambda x: re.sub(r'[\"\']', '', x)
                        if x is not None else None)
    parser.add_argument('--data-path', default='./bucket/data/')
    parser.add_argument('--verbose', '-v', action='count')
    parser.add_argument('--test-size', type=float, default=0.1)
    parser.add_argument('--n-iter', type=int, default=1)
    parser.add_argument('--n-folds', default=10, type=n_folds_parser)
    parser.add_argument('--clf')
    parser.add_argument('--filter', default='anova')
    args = parser.parse_args()
    result = {}
    result.update(args.__dict__)
    start_time = datetime.now().strftime("%d/%m/%y %H:%M:%S")
    result['start_time'] = start_time
    if args.verbose:
        for key in result:
            print('# {}: {}'.format(key, result[key]))
        print('# Running in: ' + gethostname())
        print('# Start: ' + start_time)
        sys.stdout.flush()
    load_params = {}
    if args.data == 'epi_ad':
        # This dataset is re-read from the original files.
        load_params = {'read_original': True, 'skip_pickle': True}
    data, factors = load(args.data, data_path=args.data_path, log=result,
                         **load_params)
    if args.tissue:
        # Restrict samples to a single source tissue.
        data = data[factors['source tissue'] == args.tissue]
        factors = factors[factors['source tissue'] == args.tissue]
    target = factors[args.target]
    target_num = LabelEncoder().fit_transform(target)
    clf, param_grid = choose_classifier(args.clf, result, args.verbose)
    # Map the --filter flag to a scoring function, its extra params, and an
    # optional discretizing preprocessor.
    score_params = {}
    preprocessor = None
    if args.filter == 'anova':
        score_features = anova
    elif args.filter == 'infogain_10':
        score_features = relevance
        score_params = {'bins': 10}
    elif args.filter == 'infogain_exp':
        preprocessor = ExpressionDiscretizer()
        score_features = relevance
        score_params = {'bins': 3}
    elif args.filter == 'chi2':
        preprocessor = ExpressionDiscretizer()
        score_features = chi_squared
    else:
        raise ValueError('Filter {} unknown.'.format(args.filter))
    # Pseudo-unique id for the output file: hash of config + random noise.
    experiment_id = hash(json.dumps(result) + str(np.random.rand(10, 1)))
    result_file = join(args.results_path,
                       '{}_{}.json'.format(args.filter, experiment_id))
    if args.verbose:
        print('Results will be saved to {}'.format(result_file))
        sys.stdout.flush()
    # NOTE(review): old-style splitter signature (labels + n_iter) —
    # presumably sklearn.cross_validation; confirm the pinned version.
    split = StratifiedShuffleSplit(target, n_iter=args.n_iter,
                                   test_size=args.test_size)
    n_features = data.shape[1]
    n_features_to_select = 9
    d0 = datetime.now()
    result['experiments'] = []
    for i, (train, test) in enumerate(split):
        if args.verbose:
            print('### ITERATION {}'.format(i))
        if preprocessor:
            # Fit the discretizer on training data only, then apply to both.
            preprocessor.fit(data.iloc[train, :])
            train_data = preprocessor.transform(data.iloc[train, :])
            test_data = preprocessor.transform(data.iloc[test, :])
        else:
            train_data = data.iloc[train, :]
            test_data = data.iloc[test, :]
        scores_ = score_features(train_data, target_num[train], **score_params)
        result['experiments'].append({
            'iteration': i,
            'train_samples_label': data.index[train].tolist(),
            'train_samples_idx': train.tolist(),
            'scores': scores_.tolist()
        })
        if args.verbose:
            print('[{}] Features scored.'.format(datetime.now() - d0))
            sys.stdout.flush()
        result['experiments'][-1]['subsets'] = []
        current_size = n_features
        # argsort ascending: the top-scored features sit at the end.
        sorted_features = np.argsort(scores_)
        for step in subset_sizes(n_features, n_features_to_select):
            if args.verbose:
                print('[{}] Fitting with {} features.'.format(
                    datetime.now() - d0, current_size))
                sys.stdout.flush()
            features = sorted_features[-current_size:]
            grid = GridWithCoef(clf, param_grid, cv=args.n_folds)
            grid.fit(train_data.iloc[:, features], target.iloc[train])
            # Save results for current set of features
            result['experiments'][-1]['subsets'].append({
                'n_features': current_size,
                'features': data.columns[features].tolist(),
                'best_params': grid.best_params_,
                'train': {
                    'y_true': target.iloc[train].tolist(),
                    'y_pred': grid.predict(
                        train_data.iloc[:, features]).tolist()
                },
                'test': {
                    'y_true': target.iloc[test].tolist(),
                    'y_pred': grid.predict(
                        test_data.iloc[:, features]).tolist()
                }
            })
            # Store results (rewritten after every subset so progress
            # survives interruption)
            with open(result_file, 'w') as f:
                json.dump(result, f, sort_keys=True, indent=2,
                          separators=(',', ': '))
            current_size -= step
    if args.verbose:
        print('# OK')
        sys.stdout.flush()
def main():
    """Greedy backward feature-elimination experiment driver.

    Repeatedly fits a grid-searched classifier on shrinking feature subsets,
    dropping the worst-ranked features in roughly power-of-ten steps, and
    appends each selection round (scores + surviving features) to a JSON log.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--results-path', default='./bucket/results/')
    parser.add_argument('--data')
    parser.add_argument('--target')
    parser.add_argument('--data-path', default='./bucket/data/')
    parser.add_argument('--verbose', '-v', action='count')
    parser.add_argument('--test-size', type=float, default=0.1)
    parser.add_argument('--n-iter', type=int, default=1)
    parser.add_argument('--n-folds', default=10, type=n_folds_parser)
    parser.add_argument('--clf')
    args = parser.parse_args()
    result = {}
    result.update(args.__dict__)
    if args.verbose:
        for key in result:
            print('# {}: {}'.format(key, result[key]))
        print('# Start: ' + datetime.now().strftime("%d/%m/%y %H:%M:%S"))
    result['selections'] = []
    # Pseudo-unique id for the output file: hash of config + random noise.
    experiment_id = hash(json.dumps(result) + str(np.random.rand(10)))
    result_file = join(args.results_path, 'fs_{}.json'.format(experiment_id))
    if args.verbose:
        print('Results will be saved to {}'.format(result_file))
    data, factors = load(args.data, data_path=args.data_path, log=result)
    target = factors[args.target]
    clf, param_grid = choose_classifier(args.clf, result, args.verbose)
    feature_names = data.columns
    # NOTE(review): old-style splitter signature (labels + n_iter) —
    # presumably sklearn.cross_validation; confirm the pinned version.
    split = StratifiedShuffleSplit(target, n_iter=args.n_iter,
                                   test_size=args.test_size)
    n_features = data.shape[1]
    n_features_to_select = 1
    # NOTE(review): np.bool / np.int aliases were removed in NumPy 1.24;
    # this only runs on the older NumPy the project targets.
    support_ = np.ones(n_features, dtype=np.bool)
    ranking_ = np.ones(n_features, dtype=np.int)
    # Elimination
    t0 = time()  # wall-clock start (not referenced again in this view)
    d0 = datetime.now()
    while np.sum(support_) > n_features_to_select:
        # step = largest power of ten strictly below the current count …
        step = 10 ** int(np.log10(np.sum(support_) - 1))
        # … but first trim the remainder so counts land on round multiples.
        odd_step = np.sum(support_) - step * (np.sum(support_) // step)
        if odd_step > 0:
            step = odd_step
        if args.verbose:
            print('[{}] Selecting best {:d} features.'
                  .format(datetime.now() - d0, np.sum(support_) - step))
        # Remaining features
        features = np.arange(n_features)[support_]
        coef_ = None
        test_scores = []
        for train, test in split:
            # Rank the remaining features
            if args.n_folds == 'loo':
                cv = LeaveOneOut(len(train))
            else:
                cv = args.n_folds
            estimator = GridWithCoef(clf, param_grid, cv=cv)
            estimator.fit(data.iloc[train, features], target.iloc[train])
            # Accumulate squared coefficients across splits for ranking.
            if coef_ is None:
                coef_ = safe_sqr(estimator.coef_)
            else:
                coef_ += safe_sqr(estimator.coef_)
            test_scores.append(estimator.score(data.iloc[test, features],
                                               target.iloc[test]))
        if coef_.ndim > 1:
            ranks = np.argsort(coef_.sum(axis=0))
        else:
            ranks = np.argsort(coef_)
        # for sparse case ranks is matrix
        ranks = np.ravel(ranks)
        # Eliminate the worse features
        threshold = min(step, np.sum(support_) - n_features_to_select)
        support_[features[ranks][:threshold]] = False
        ranking_[np.logical_not(support_)] += 1
        result['selections'].append({
            'scores': test_scores,
            'n_features': np.sum(support_),
            'features': feature_names[support_].tolist()
        })
        # Rewrite the log after every round so progress survives interruption.
        with open(result_file, 'w') as f:
            json.dump(result, f, sort_keys=True, indent=2,
                      separators=(',', ': '))
    if args.verbose:
        print('# OK')
parser.add_argument('--test-size', type=float, default=0.1) # parser.add_argument('--tissue', type=lambda x: re.sub(r'[\"\']', '', x)) parser.add_argument('--clf') parser.add_argument('--n-folds', default=10, type=n_folds_parser) parser.add_argument('--verbose', '-v', action='count') parser.add_argument('--data-path', default='./bucket/data/') args = parser.parse_args() result = {} result.update(args.__dict__) if args.verbose: for key in result: print('# {}: {}'.format(key, result[key])) print('# Start: ' + datetime.now().strftime("%d/%m/%y %H:%M:%S")) data, factors = load(args.data, data_path=args.data_path, log=result) target = factors[args.target] # if args.tissue is not None: # condition = factors['source tissue'] == args.tissue # if condition.sum() == 0: # result['error'] = '{} is not a valid tissue.'.format(args.tissue) # save_experiment(result, folder=args.results_path, filename=None, error=True, # verbose=args.verbose) # sys.exit() # data = data[condition] # factors = factors[condition] split = StratifiedShuffleSplit(target, n_iter=args.n_iter, test_size=args.test_size) result['split'] = { 'type': 'StratifiedShuffleSplit',
def main():
    """Filter-based feature-selection experiment driver.

    Scores all features with the chosen univariate filter (ANOVA, info-gain
    or chi-squared), then fits a grid-searched classifier on progressively
    smaller top-scored subsets, logging everything to a JSON file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--results-path', default='./bucket/results/')
    parser.add_argument('--data')
    parser.add_argument('--target')
    # Strip stray quote characters that may wrap the tissue name.
    parser.add_argument('--tissue',
                        type=lambda x: re.sub(r'[\"\']', '', x)
                        if x is not None else None)
    parser.add_argument('--data-path', default='./bucket/data/')
    parser.add_argument('--verbose', '-v', action='count')
    parser.add_argument('--test-size', type=float, default=0.1)
    parser.add_argument('--n-iter', type=int, default=1)
    parser.add_argument('--n-folds', default=10, type=n_folds_parser)
    parser.add_argument('--clf')
    parser.add_argument('--filter', default='anova')
    args = parser.parse_args()
    result = {}
    result.update(args.__dict__)
    start_time = datetime.now().strftime("%d/%m/%y %H:%M:%S")
    result['start_time'] = start_time
    if args.verbose:
        for key in result:
            print('# {}: {}'.format(key, result[key]))
        print('# Running in: ' + gethostname())
        print('# Start: ' + start_time)
        sys.stdout.flush()
    load_params = {}
    if args.data == 'epi_ad':
        # This dataset is re-read from the original files.
        load_params = {'read_original': True, 'skip_pickle': True}
    data, factors = load(args.data, data_path=args.data_path, log=result,
                         **load_params)
    if args.tissue:
        # Restrict samples to a single source tissue.
        data = data[factors['source tissue'] == args.tissue]
        factors = factors[factors['source tissue'] == args.tissue]
    target = factors[args.target]
    target_num = LabelEncoder().fit_transform(target)
    clf, param_grid = choose_classifier(args.clf, result, args.verbose)
    # Map the --filter flag to a scoring function, its extra params, and an
    # optional discretizing preprocessor.
    score_params = {}
    preprocessor = None
    if args.filter == 'anova':
        score_features = anova
    elif args.filter == 'infogain_10':
        score_features = relevance
        score_params = {'bins': 10}
    elif args.filter == 'infogain_exp':
        preprocessor = ExpressionDiscretizer()
        score_features = relevance
        score_params = {'bins': 3}
    elif args.filter == 'chi2':
        preprocessor = ExpressionDiscretizer()
        score_features = chi_squared
    else:
        raise ValueError('Filter {} unknown.'.format(args.filter))
    # Pseudo-unique id for the output file: hash of config + random noise.
    experiment_id = hash(json.dumps(result) + str(np.random.rand(10, 1)))
    result_file = join(args.results_path,
                       '{}_{}.json'.format(args.filter, experiment_id))
    if args.verbose:
        print('Results will be saved to {}'.format(result_file))
        sys.stdout.flush()
    # NOTE(review): old-style splitter signature (labels + n_iter) —
    # presumably sklearn.cross_validation; confirm the pinned version.
    split = StratifiedShuffleSplit(target, n_iter=args.n_iter,
                                   test_size=args.test_size)
    n_features = data.shape[1]
    n_features_to_select = 9
    d0 = datetime.now()
    result['experiments'] = []
    for i, (train, test) in enumerate(split):
        if args.verbose:
            print('### ITERATION {}'.format(i))
        if preprocessor:
            # Fit the discretizer on training data only, then apply to both.
            preprocessor.fit(data.iloc[train, :])
            train_data = preprocessor.transform(data.iloc[train, :])
            test_data = preprocessor.transform(data.iloc[test, :])
        else:
            train_data = data.iloc[train, :]
            test_data = data.iloc[test, :]
        scores_ = score_features(train_data, target_num[train], **score_params)
        result['experiments'].append({
            'iteration': i,
            'train_samples_label': data.index[train].tolist(),
            'train_samples_idx': train.tolist(),
            'scores': scores_.tolist()
        })
        if args.verbose:
            print('[{}] Features scored.'.format(datetime.now() - d0))
            sys.stdout.flush()
        result['experiments'][-1]['subsets'] = []
        current_size = n_features
        # argsort ascending: the top-scored features sit at the end.
        sorted_features = np.argsort(scores_)
        for step in subset_sizes(n_features, n_features_to_select):
            if args.verbose:
                print('[{}] Fitting with {} features.'.format(
                    datetime.now() - d0, current_size))
                sys.stdout.flush()
            features = sorted_features[-current_size:]
            grid = GridWithCoef(clf, param_grid, cv=args.n_folds)
            grid.fit(train_data.iloc[:, features], target.iloc[train])
            # Save results for current set of features
            result['experiments'][-1]['subsets'].append({
                'n_features': current_size,
                'features': data.columns[features].tolist(),
                'best_params': grid.best_params_,
                'train': {
                    'y_true': target.iloc[train].tolist(),
                    'y_pred': grid.predict(
                        train_data.iloc[:, features]).tolist()
                },
                'test': {
                    'y_true': target.iloc[test].tolist(),
                    'y_pred': grid.predict(
                        test_data.iloc[:, features]).tolist()
                }
            })
            # Store results (rewritten after every subset so progress
            # survives interruption)
            with open(result_file, 'w') as f:
                json.dump(result, f, sort_keys=True, indent=2,
                          separators=(',', ': '))
            current_size -= step
    if args.verbose:
        print('# OK')
        sys.stdout.flush()
def main():
    """Recursive feature elimination (RFE) experiment driver.

    Parses CLI options, loads the dataset, then for each stratified shuffle
    split repeatedly fits a scaler + grid-search pipeline and eliminates the
    lowest-ranked features, dumping all results to a JSON file as it goes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--results-path', default='./bucket/results/')
    parser.add_argument('--data')
    # Strip stray quote characters that may wrap the tissue name.
    parser.add_argument('--tissue',
                        type=lambda x: re.sub(r'[\"\']', '', x)
                        if x is not None else None)
    parser.add_argument('--target')
    parser.add_argument('--data-path', default='./bucket/data/')
    parser.add_argument('--verbose', '-v', action='count')
    parser.add_argument('--test-size', type=float, default=0.1)
    parser.add_argument('--n-iter', type=int, default=1)
    parser.add_argument('--n-folds', default=10, type=n_folds_parser)
    parser.add_argument('--clf')
    args = parser.parse_args()
    result = {}
    result.update(args.__dict__)
    start_time = datetime.now().strftime("%d/%m/%y %H:%M:%S")
    result['start_time'] = start_time
    if args.verbose:
        for key in result:
            print('# {}: {}'.format(key, result[key]))
        print('# Running in: ' + gethostname())
        print('# Start: ' + start_time)
    # Pseudo-unique id for the output file: hash of config + random noise.
    experiment_id = hash(json.dumps(result) + str(np.random.rand(10, 1)))
    result_file = join(args.results_path, 'rfe_{}.json'.format(experiment_id))
    if args.verbose:
        print('Results will be saved to {}'.format(result_file))
    load_params = {}
    if args.data == 'epi_ad':
        # This dataset is re-read from the original files.
        load_params = {'read_original': True, 'skip_pickle': True}
    data, factors = load(args.data, data_path=args.data_path, log=result,
                         **load_params)
    if args.tissue:
        # Restrict samples to a single source tissue.
        data = data[factors['source tissue'] == args.tissue]
        factors = factors[factors['source tissue'] == args.tissue]
    target = factors[args.target]
    clf, param_grid = choose_classifier(args.clf, result, args.verbose)
    # NOTE(review): old-style splitter signature (labels + n_iter) —
    # presumably sklearn.cross_validation; confirm the pinned version.
    split = StratifiedShuffleSplit(target, n_iter=args.n_iter,
                                   test_size=args.test_size)
    n_features = data.shape[1]
    n_features_to_select = 9
    preprocess_steps = [('scaler', StandardScaler())]
    # RFE
    d0 = datetime.now()
    result['experiments'] = []
    for i, (train, test) in enumerate(split):
        if args.verbose:
            print('### ITERATION {}'.format(i))
        result['experiments'].append({
            'iteration': i,
            'train_samples': data.index[train].tolist(),
            'subsets': []
        })
        # NOTE(review): np.bool / np.int aliases were removed in NumPy 1.24;
        # this only runs on the older NumPy the project targets.
        support_ = np.ones(n_features, dtype=np.bool)
        ranking_ = np.ones(n_features, dtype=np.int)
        for step in subset_sizes(n_features, n_features_to_select):
            if args.verbose:
                print('[{}] Evaluating with {} features and selecting {}.'.
                      format(datetime.now() - d0, np.sum(support_),
                             np.sum(support_) - step))
            # Train with current subset
            pipeline = preprocess_steps + [
                ('grid', GridWithCoef(clf, param_grid, cv=args.n_folds))
            ]
            pipeline = Pipeline(pipeline)
            features = np.arange(n_features)[support_]
            pipeline.fit(data.iloc[train, features], target.iloc[train])
            # Save results for current set of features
            grid = pipeline.steps[-1][1]
            result['experiments'][-1]['subsets'].append({
                'n_features': np.sum(support_),
                'features': data.columns[features].tolist(),
                'best_params': grid.best_params_,
                'train': {
                    'y_true': target.iloc[train].tolist(),
                    'y_pred': grid.predict(data.iloc[train, features]).tolist()
                },
                'test': {
                    'y_true': target.iloc[test].tolist(),
                    'y_pred': grid.predict(data.iloc[test, features]).tolist()
                }
            })
            # Select best subset
            coef_ = safe_sqr(grid.coef_)
            if coef_.ndim > 1:
                ranks = np.argsort(coef_.sum(axis=0))
            else:
                ranks = np.argsort(coef_)
            # for sparse case ranks is matrix
            ranks = np.ravel(ranks)
            # Eliminate the worse features
            support_[features[ranks][:step]] = False
            ranking_[np.logical_not(support_)] += 1
            # Store results (rewritten after every step so progress survives
            # interruption)
            with open(result_file, 'w') as f:
                json.dump(result, f, sort_keys=True, indent=2,
                          separators=(',', ': '))
    if args.verbose:
        print('# OK')
# Model/mode dispatch (fragment — `parser`, `data`, `data2`, `data_ggnn`,
# `model*`, `ggnn` and `chainer` come from outside this view). Each mode
# pairs a dataset loader with a matching NFP/GGNN model and batch converter.
parser.add_argument('--radius', '-r', type=int, default=3,
                    help='Radius parameter of NFP')
parser.add_argument('--hidden-dim', '-H', type=int, default=128,
                    help='Number of hidden units of NFP')
parser.add_argument('--out-dim', '-O', type=int, default=128,
                    help='Number of output units of NFP')
args = parser.parse_args()
if args.mode == 1:
    train, val, max_degree, atom2id, C = data.load()
    nfp = model.NFP(args.hidden_dim, args.out_dim, max_degree, len(atom2id),
                    args.radius, concat_hidden=True)
    converter = data.concat_example
elif args.mode == 2:
    train, val, max_degree, atom2id, C = data2.load()
    nfp = model2.NFP(args.hidden_dim, args.out_dim, max_degree, len(atom2id),
                     args.radius, concat_hidden=True)
    converter = chainer.dataset.concat_examples
elif args.mode == 3:
    train, val, max_degree, atom2id, C = data2.load()
    nfp = model3.NFP(args.hidden_dim, args.out_dim, max_degree, len(atom2id),
                     args.radius, concat_hidden=True)
    converter = chainer.dataset.concat_examples
else:
    # Fallback mode: gated graph neural network variant (no max_degree arg).
    train, val, max_degree, atom2id, C = data_ggnn.load()
    nfp = ggnn.GGNN(args.hidden_dim, args.out_dim, len(atom2id),
                    args.radius, concat_hidden=True)
    converter = chainer.dataset.concat_examples
print('data', max_degree, len(atom2id), C)