def main():
    """Command-line entry point for per-cell-line drug response modelling.

    Parses the command line, resolves the list of cell lines (the literal
    'all' expands to ``NCI60.all_cells()``), and for every cell line loads
    its data frame, summarizes it, and runs each requested model through
    either ``classify`` or ``regress``. Cell lines with an empty data frame,
    or (when classifying) fewer than two usable bins, are skipped.
    """
    parser = get_parser('Build ML models to predict by-cellline drug response.')
    opts = parser.parse_args()

    print('Args:', opts, end='\n\n')
    print('Use percent growth for dose levels in log concentration range: [{}, {}]'.format(
        opts.min_logconc, opts.max_logconc))
    print()

    if 'all' in opts.cells:
        cell_lines = NCI60.all_cells()
    else:
        cell_lines = opts.cells

    rule = '-' * 10
    for cell in cell_lines:
        print(rule, 'Cell line:', cell, rule)

        frame = NCI60.load_by_cell_data(
            cell,
            drug_features=opts.drug_features,
            scaling=opts.scaling,
            min_logconc=opts.min_logconc,
            max_logconc=opts.max_logconc,
            subsample=opts.subsample,
            feature_subsample=opts.feature_subsample,
        )
        if not frame.shape[0]:
            print('No response data found\n')
            continue

        if not opts.classify:
            summarize(frame)
        elif summarize(frame, opts.cutoffs, min_count=opts.cv) < 2:
            # Classification needs at least two bins that pass min_count.
            print('Not enough classes\n')
            continue

        out_prefix = os.path.join(opts.out_dir, cell)
        # Both trainers are invoked with the same keyword arguments here.
        fit = classify if opts.classify else regress
        for model in opts.models:
            fit(model, frame, cv=opts.cv, cutoffs=opts.cutoffs,
                threads=opts.threads, prefix=out_prefix)
def main():
    """Command-line entry point: cross-validate models on a table, keep the best.

    Steps, in order:
      1. Parse arguments and seed via ``set_seed(args.seed)``.
      2. Build the output prefix from ``args.prefix`` (falling back to the
         basename of ``args.data``) joined under ``args.out_dir``.
      3. Read the table (comma-separated when ``args.csv`` is set, otherwise
         tab-separated) and split it with ``split_data``.
      4. Score every model in ``args.models`` with ``classify`` or ``regress``.
      5. Call ``train`` on the highest-scoring model with ``save=True`` and
         print the file name it returns.

    Returns early (printing a message) when classification is requested but
    the target has fewer than two distinct values, or when no model produced
    a score that could be ranked.
    """
    parser = get_parser()
    args = parser.parse_args()
    set_seed(args.seed)

    prefix = args.prefix or os.path.basename(args.data)
    prefix = os.path.join(args.out_dir, prefix)

    df = pd.read_table(args.data, engine='c', sep=',' if args.csv else '\t')
    x, y, splits, features = split_data(
        df,
        ycol=args.ycol,
        classify=args.classify,
        cv=args.cv,
        bins=args.bins,
        cutoffs=args.cutoffs,
        groupcols=args.groupcols,
        ignore_categoricals=args.ignore_categoricals,
        verbose=True,
    )

    if args.classify and len(np.unique(y)) < 2:
        print('Not enough classes\n')
        return

    # np.inf (lowercase) is the portable spelling: the np.Inf alias was
    # removed in NumPy 2.0.
    best_score, best_model = -np.inf, None
    for model in args.models:
        if args.classify:
            class_weight = 'balanced' if args.balanced else None
            score = classify(model, x, y, splits, features,
                             threads=args.threads, prefix=prefix,
                             seed=args.seed, class_weight=class_weight)
        else:
            score = regress(model, x, y, splits, features,
                            threads=args.threads, prefix=prefix,
                            seed=args.seed)
        # '>=' means that on a tie the model listed later wins.
        if score >= best_score:
            best_score = score
            best_model = model

    # best_model stays None when args.models is empty or every score failed
    # the comparison above (e.g. NaN); there is nothing sensible to train.
    if best_model is None:
        print('No model produced a comparable score; nothing to train\n')
        return

    print('Training the best model ({}={:.3g}) on the entire dataset...'.format(best_model, best_score))
    name = 'best.classifier' if args.classify else 'best.regressor'
    fname = train(best_model, x, y, features, classify=args.classify,
                  threads=args.threads, prefix=prefix, name=name, save=True)
    print('Model saved in {}\n'.format(fname))
def main():
    """Command-line entry point for per-drug tumor response modelling.

    Parses the command line and resolves the drug list: the literal 'all'
    expands to ``NCI60.all_drugs()``, and a single argument of A, B or C
    (case-insensitive) expands to ``NCI60.drugs_in_set('Jason:<letter>')``.
    For every drug it loads the data frame, summarizes it, and runs each
    requested model through either ``classify`` or ``regress``. Drugs with an
    empty data frame, or (when classifying) fewer than two usable bins, are
    skipped.
    """
    parser = get_parser('Build ML models to predict by-drug tumor response.')
    opts = parser.parse_args()
    print('Args:', opts, end='\n\n')

    if not opts.use_gi50:
        print('Use percent growth at log concentration: {}'.format(opts.logconc))
    else:
        print('Use NCI GI50 value instead of percent growth')

    drug_ids = opts.drugs
    if 'all' in drug_ids:
        drug_ids = NCI60.all_drugs()
    elif len(drug_ids) == 1 and re.match("^[ABC]$", drug_ids[0].upper()):
        drug_ids = NCI60.drugs_in_set('Jason:' + drug_ids[0].upper())
        print("Drugs in set '{}': {}".format(opts.drugs[0], len(drug_ids)))
    print()

    rule = '-' * 10
    for nsc in drug_ids:
        print(rule, 'Drug NSC:', nsc, rule)

        frame = NCI60.load_by_drug_data(
            nsc,
            cell_features=opts.cell_features,
            scaling=opts.scaling,
            use_gi50=opts.use_gi50,
            logconc=opts.logconc,
            subsample=opts.subsample,
            feature_subsample=opts.feature_subsample,
        )
        if not frame.shape[0]:
            print('No response data found\n')
            continue

        if not opts.classify:
            summarize(frame)
        else:
            # Explicit cutoffs are withheld from summarize when more than
            # one automatic bin is requested.
            if opts.autobins > 1:
                bin_cutoffs = None
            else:
                bin_cutoffs = opts.cutoffs
            usable_bins = summarize(frame, bin_cutoffs,
                                    autobins=opts.autobins, min_count=opts.cv)
            if usable_bins < 2:
                print('Not enough classes\n')
                continue

        out_prefix = os.path.join(opts.out_dir, 'NSC_' + nsc)
        # Keyword arguments common to both trainers.
        shared = dict(cv=opts.cv, cutoffs=opts.cutoffs,
                      threads=opts.threads, prefix=out_prefix)
        for model in opts.models:
            if opts.classify:
                classify(model, frame, autobins=opts.autobins, **shared)
            else:
                regress(model, frame, **shared)
def test1():
    """Smoke run: regress with the 'XGBoost' model name.

    Uses the data frame returned by ``NCI60.load_by_cell_data()`` called with
    no arguments, and leaves every ``regress`` option at its default.
    """
    regress('XGBoost', NCI60.load_by_cell_data())
def test2():
    """Smoke run: regress with a caller-built scikit-learn estimator.

    Passes a 20-tree ``RandomForestRegressor`` instance (instead of a model
    name) to ``regress`` with ``cv=2``, using the data frame returned by
    ``NCI60.load_by_cell_data()`` called with no arguments.
    """
    # Imported locally so scikit-learn is only required when this runs.
    from sklearn.ensemble import RandomForestRegressor

    data = NCI60.load_by_cell_data()
    forest = RandomForestRegressor(n_estimators=20)
    regress(forest, data, cv=2)