Пример #1
0
def main():
    """Run the by-cell-line modeling workflow from the command line.

    For each selected cell line (every line returned by ``NCI60.all_cells()``
    when 'all' is among ``args.cells``), load its response data, print a
    summary, and pass every model in ``args.models`` to ``classify`` or
    ``regress``. A cell line is skipped with a message when its data has no
    rows, or, in classification mode, when ``summarize`` returns less than 2.
    """
    parser = get_parser('Build ML models to predict by-cellline drug response.')
    args = parser.parse_args()

    print('Args:', args, end='\n\n')
    range_note = 'Use percent growth for dose levels in log concentration range: [{}, {}]'
    print(range_note.format(args.min_logconc, args.max_logconc))
    print()

    if 'all' in args.cells:
        cell_lines = NCI60.all_cells()
    else:
        cell_lines = args.cells

    for cell_line in cell_lines:
        print('-' * 10, 'Cell line:', cell_line, '-' * 10)
        data = NCI60.load_by_cell_data(
            cell_line,
            drug_features=args.drug_features,
            scaling=args.scaling,
            min_logconc=args.min_logconc,
            max_logconc=args.max_logconc,
            subsample=args.subsample,
            feature_subsample=args.feature_subsample,
        )
        if not data.shape[0]:
            print('No response data found\n')
            continue

        if not args.classify:
            summarize(data)
        elif summarize(data, args.cutoffs, min_count=args.cv) < 2:
            # Classification needs at least two classes to be meaningful.
            print('Not enough classes\n')
            continue

        # Per-cell-line output prefix shared by all models.
        prefix = os.path.join(args.out_dir, cell_line)
        for model in args.models:
            # Both evaluation routines take the same keyword arguments here.
            evaluate = classify if args.classify else regress
            evaluate(model, data, cv=args.cv, cutoffs=args.cutoffs,
                     threads=args.threads, prefix=prefix)
Пример #2
0
def main():
    """Cross-validate the requested models on a data table and keep the best.

    Parses the command line, seeds the run with ``args.seed``, reads
    ``args.data`` (comma-separated when ``args.csv`` is set, tab-separated
    otherwise) and splits it with ``split_data``. Each model in
    ``args.models`` is scored by ``classify`` or ``regress``; the model with
    the highest score (ties go to the later model) is then passed to
    ``train`` with ``save=True`` and the returned file name is printed.

    Returns early, after printing a message, when classification is requested
    but the target has fewer than two distinct values, or when no model ends
    up selected (no models given, or no score compared >= -inf, e.g. NaN).
    """
    parser = get_parser()
    args = parser.parse_args()
    set_seed(args.seed)

    # Output files are prefixed with --prefix, falling back to the data file's base name.
    prefix = args.prefix or os.path.basename(args.data)
    prefix = os.path.join(args.out_dir, prefix)

    df = pd.read_table(args.data, engine='c', sep=',' if args.csv else '\t')
    x, y, splits, features = split_data(df, ycol=args.ycol, classify=args.classify, cv=args.cv,
                                        bins=args.bins, cutoffs=args.cutoffs, groupcols=args.groupcols,
                                        ignore_categoricals=args.ignore_categoricals, verbose=True)

    if args.classify and len(np.unique(y)) < 2:
        print('Not enough classes\n')
        return

    # Use np.inf: the capitalized np.Inf alias was removed in NumPy 2.0.
    best_score, best_model = -np.inf, None
    for model in args.models:
        if args.classify:
            class_weight = 'balanced' if args.balanced else None
            score = classify(model, x, y, splits, features, threads=args.threads,
                             prefix=prefix, seed=args.seed, class_weight=class_weight)
        else:
            score = regress(model, x, y, splits, features, threads=args.threads,
                            prefix=prefix, seed=args.seed)
        # '>=' means a later model wins a tie with an earlier one.
        if score >= best_score:
            best_score = score
            best_model = model

    if best_model is None:
        # Nothing was selected, so there is no model to hand to train().
        print('No model could be selected; nothing to train\n')
        return

    print('Training the best model ({}={:.3g}) on the entire dataset...'.format(best_model, best_score))
    name = 'best.classifier' if args.classify else 'best.regressor'
    fname = train(best_model, x, y, features, classify=args.classify,
                  threads=args.threads, prefix=prefix, name=name, save=True)
    print('Model saved in {}\n'.format(fname))
Пример #3
0
def main():
    """Run the by-drug modeling workflow from the command line.

    Resolves the drug list from ``args.drugs``: 'all' selects
    ``NCI60.all_drugs()``, a single A/B/C entry (case-insensitive) selects
    the matching 'Jason:' set via ``NCI60.drugs_in_set``, anything else is
    used as given. For each drug, loads its response data, prints a summary,
    and passes every model in ``args.models`` to ``classify`` or ``regress``.
    A drug is skipped with a message when its data has no rows, or, in
    classification mode, when ``summarize`` returns less than 2.
    """
    parser = get_parser('Build ML models to predict by-drug tumor response.')
    args = parser.parse_args()

    print('Args:', args, end='\n\n')
    response_note = ('Use NCI GI50 value instead of percent growth'
                     if args.use_gi50
                     else 'Use percent growth at log concentration: {}'.format(args.logconc))
    print(response_note)

    requested = args.drugs
    if 'all' in requested:
        drugs = NCI60.all_drugs()
    elif len(requested) == 1 and re.match("^[ABC]$", requested[0].upper()):
        set_letter = requested[0].upper()
        drugs = NCI60.drugs_in_set('Jason:' + set_letter)
        print("Drugs in set '{}': {}".format(requested[0], len(drugs)))
    else:
        drugs = requested

    print()
    for drug in drugs:
        print('-' * 10, 'Drug NSC:', drug, '-' * 10)
        data = NCI60.load_by_drug_data(
            drug,
            cell_features=args.cell_features,
            scaling=args.scaling,
            use_gi50=args.use_gi50,
            logconc=args.logconc,
            subsample=args.subsample,
            feature_subsample=args.feature_subsample,
        )
        if not data.shape[0]:
            print('No response data found\n')
            continue

        if not args.classify:
            summarize(data)
        else:
            # Explicit cutoffs are dropped for the summary when more than one auto bin is requested.
            if args.autobins > 1:
                summary_cutoffs = None
            else:
                summary_cutoffs = args.cutoffs
            usable_bins = summarize(data, summary_cutoffs, autobins=args.autobins, min_count=args.cv)
            if usable_bins < 2:
                print('Not enough classes\n')
                continue

        # Per-drug output prefix shared by all models.
        prefix = os.path.join(args.out_dir, 'NSC_' + drug)
        for model in args.models:
            if not args.classify:
                regress(model, data, cv=args.cv, cutoffs=args.cutoffs,
                        threads=args.threads, prefix=prefix)
                continue
            # classify receives the raw args.cutoffs together with autobins.
            classify(model, data, cv=args.cv, cutoffs=args.cutoffs, autobins=args.autobins,
                     threads=args.threads, prefix=prefix)
Пример #4
0
def test1():
    """Smoke test: run ``regress`` with the 'XGBoost' model name on default by-cell data."""
    # load_by_cell_data is called with no arguments, so it relies on its own defaults.
    cell_data = NCI60.load_by_cell_data()
    regress('XGBoost', cell_data)
Пример #5
0
def test2():
    """Smoke test: run ``regress`` with a ready-made scikit-learn estimator.

    Passes a 20-tree ``RandomForestRegressor`` instance instead of a model
    name, using 2-fold cross-validation on default by-cell data.
    """
    # Imported locally so scikit-learn's ensemble module is only needed when this test runs.
    from sklearn.ensemble import RandomForestRegressor

    forest = RandomForestRegressor(n_estimators=20)
    cell_data = NCI60.load_by_cell_data()
    regress(forest, cell_data, cv=2)