def main(args=None):
    """Cross-validate an epitope-site predictor over a feature-count grid.

    Builds a labeled HMMER alignment for the requested antibodies,
    extracts MSA-derived features, and for each feature count in
    ARGS.FEATURE_GRID evaluates a linear SVM (with MRMR or RFE feature
    selection) under stratified k-fold cross-validation, keeping the
    best-scoring Results.  Prints that Results to ARGS.OUTPUT and
    returns it ({} when ARGS.TEST short-circuits the run).
    """
    init_log()
    if args is None:
        args = sys.argv[1:]
    # fail loudly on any numpy floating-point anomaly
    np.seterr(all='raise')
    # do some option parsing
    parser, ns, args = init_args(description="Predict epitope sites.", args=args)
    parser = hmmer_args(parser)
    parser = featsel_args(parser)
    parser = feature_args(parser)
    parser = mrmr_args(parser)
    parser = rfe_args(parser)
    parser = optstat_args(parser)
    parser = filter_args(parser)
    parser = svm_args(parser)
    parser = cv_args(parser)
    parser.add_argument('ANTIBODY', type=AntibodyTypeFactory(ns.DATA), nargs='+')
    ARGS = parse_args(parser, args, namespace=ns)
    # do some argument parsing
    if ARGS.TEST:
        test_discrete(ARGS)
        finalize_args(ARGS)
        return {}
    # maxrel doesn't support similar
    if ARGS.MRMR_METHOD == 'MAXREL':
        ARGS.SIMILAR = 0.0
    antibodies = tuple(ARGS.ANTIBODY)
    # set the util params
    set_util_params(ARGS.REFSEQ.id)
    # grab the relevant antibody from the SQLITE3 data
    # format as SeqRecord so we can output as FASTA
    # and generate an alignment using HMMER if it doesn't already exist
    seqrecords, clonal, antibodies = ARGS.DATA.seqrecords(antibodies, ARGS.CLONAL)
    # if we're doing LOOCV, make sure we set CV_FOLDS appropriately
    if ARGS.LOOCV:
        ARGS.CV_FOLDS = len(seqrecords)
    ab_basename = ''.join((
        '+'.join(antibodies),
        '_dna' if ARGS.ENCODER == DNAEncoder else '_amino',
        '_clonal' if clonal else ''
        ))
    alignment_basename = '_'.join((
        ab_basename,
        ARGS.DATA.basename_root,
        __version__
        ))
    sto_filename = alignment_basename + '.sto'
    # don't capture the second variable, let it be gc'd
    alignment = generate_alignment(seqrecords, sto_filename, is_refseq, ARGS)[0]
    # potential N-linked glycosylation site (PNGS) motif
    re_pngs = re_compile(r'N[^P][TS][^P]', re_I)
    ylabeler = Labeler(
        partial(expression, ARGS.LABEL),
        partial(skipper, is_refseq, ARGS.SUBTYPES)
        )
    alignment, y, threshold = ylabeler(alignment)
    # NOTE: shadows the builtin `filter`; kept as-is for compatibility
    filter = naive_filter(
        max_conservation=ARGS.MAX_CONSERVATION,
        min_conservation=ARGS.MIN_CONSERVATION,
        max_gap_ratio=ARGS.MAX_GAP_RATIO
        )
    extractors = [('site_ident', MSAVectorizer(ARGS.ENCODER, filter))]
    if ARGS.RADIUS:
        extractors.append(('pair_ident', MSAVectorizerPairwise(ARGS.ENCODER, filter, ARGS.RADIUS)))
    if ARGS.PNGS:
        extractors.append(('pngs', MSAVectorizerRegex(re_pngs, 4, name='PNGS')))
    if ARGS.PNGS_PAIRS:
        extractors.append(
            ('pngs_pair', MSAVectorizerRegexPairwise(re_pngs, 4, name='PNGS'))
            )
    extractor = FeatureUnion(extractors, n_jobs=1)  # n_jobs must be 1 for now
    X = extractor.fit_transform(alignment)
    assert y.shape[0] == X.shape[0], \
        "number of classes doesn't match the data: %d vs %d" % (y.shape[0], X.shape[0])
    scorer = Scorer(ARGS.OPTSTAT)
    # do grid-search as part of the svm to avoid
    # performing feature selection on every iteration
    # of the grid search, which naturally takes forever
    svm = GridSearchCV(
        estimator=SVC(kernel='linear', class_weight='auto'),
        param_grid=dict(C=list(C_range(*ARGS.LOG2C))),
        scoring=scorer,
        n_jobs=int(getenv('NCPU', -1)),
        pre_dispatch='3 * n_jobs',
        cv=ARGS.CV_FOLDS - 1
        )
    results = None
    for n_features in ARGS.FEATURE_GRID:
        results_ = Results(extractor.get_feature_names(), scorer, ARGS.SIMILAR)
        for train_idxs, test_idxs in StratifiedKFold(y, ARGS.CV_FOLDS):
            if train_idxs.sum() < 1 or test_idxs.sum() < 1:
                # degenerate fold: record it as a trivially perfect prediction
                # NOTE(review): three args here vs four at the normal call
                # site below -- confirm Results.add defaults its ranks arg
                y_true = y[test_idxs]
                results_.add(y_true, y_true, {})
                continue
            X_train = X[train_idxs]
            y_train = y[train_idxs]
            if ARGS.RFE:
                clf = RFE(
                    estimator=svm,
                    n_features_to_select=n_features,
                    step=ARGS.RFE_STEP
                    )
            else:
                mrmr = MRMR(
                    k=n_features,
                    method=ARGS.MRMR_METHOD,
                    normalize=ARGS.MRMR_NORMALIZE,
                    similar=ARGS.SIMILAR
                    )
                clf = Pipeline([('mrmr', mrmr), ('svm', svm)])
            clf.fit(X_train, y_train)
            X_test = X[test_idxs]
            y_true = y[test_idxs]
            # dig the fitted selector and best SVM out of whichever
            # wrapper (RFE or Pipeline) was used
            if ARGS.RFE:
                selector_ = clf
                svm_ = clf.estimator_.best_estimator_
            else:
                selector_ = clf.named_steps['mrmr']
                svm_ = clf.named_steps['svm'].best_estimator_
            y_pred = clf.predict(X_test)
            coefs, ranks = coefs_ranks(selector_.ranking_, selector_.support_, svm_.coef_)
            results_.add(y_true, y_pred, coefs, ranks)
        # keep the best-scoring grid point
        if results is None or results_ > results:
            results = results_
    # the alignment reflects the number of sequences either naturally
    results.metadata(antibodies, ARGS.LABEL)
    print(results.dumps(), file=ARGS.OUTPUT)
    finalize_args(ARGS)
    return results
def main(args=None):
    """Learn and pickle a model for labeled sequences.

    Builds an HMMER alignment for the requested antibodies, labels it,
    extracts site/motif features, grid-searches an mRMR+SVM pipeline over
    C (and, when ARGS.FEATURE_GRID has several entries, over the feature
    count k via a nested grid search), pickles the fitted model to
    ARGS.MODEL, prints a Results summary to ARGS.OUTPUT, and returns
    ARGS.MODEL ({} when ARGS.TEST short-circuits the run).
    """
    if args is None:
        args = sys.argv[1:]
    # fail loudly on any numpy floating-point anomaly
    np.seterr(all='raise')
    parser, ns, args = init_args(description='learn model for labeled sequences', args=args)
    parser = hmmer_args(parser)
    parser = featsel_args(parser)
    parser = feature_args(parser)
    parser = mrmr_args(parser)
    parser = rfe_args(parser)
    parser = optstat_args(parser)
    parser = filter_args(parser)
    parser = svm_args(parser)
    parser = cv_args(parser)

    def GzipType(string):
        # argparse ``type`` callable: open *string* as a gzip stream for
        # writing.  BUGFIX: the original *returned* the ArgumentTypeError
        # instead of raising it, so argparse handed the exception object
        # to the program as if it were the open file; raise it (chained)
        # so the user gets a proper usage error instead.
        try:
            return gzip_open(string, 'wb')
        except OSError as err:
            raise ArgumentTypeError(
                "cannot open '{0:s}' for writing".format(string)) from err

    parser.add_argument('--tree', dest='TREE')
    parser.add_argument('ANTIBODY', type=AntibodyTypeFactory(ns.DATA), nargs='+')
    parser.add_argument('MODEL', type=GzipType)
    ARGS = parse_args(parser, args, namespace=ns)
    antibodies = tuple(ARGS.ANTIBODY)
    # do some argument parsing
    if ARGS.TEST:
        test_discrete(ARGS)
        finalize_args(ARGS)
        return {}
    # maxrel doesn't support similar
    if ARGS.MRMR_METHOD == 'MAXREL':
        ARGS.SIMILAR = 0.0
    # set the util params
    set_util_params(ARGS.REFSEQ.id)
    # grab the relevant antibody from the SQLITE3 data
    # format as SeqRecord so we can output as FASTA
    # and generate an alignment using HMMER if it doesn't already exist
    seqrecords, clonal, antibodies = ARGS.DATA.seqrecords(antibodies, ARGS.CLONAL)
    ab_basename = ''.join((
        '+'.join(antibodies),
        '_dna' if ARGS.ENCODER == DNAEncoder else '_amino',
        '_clonal' if clonal else ''
        ))
    alignment_basename = '_'.join((
        ab_basename,
        ARGS.DATA.basename_root,
        __version__
        ))
    sto_filename = alignment_basename + '.sto'
    # keep the hmm: it is pickled alongside the model below
    alignment, hmm = generate_alignment(seqrecords, sto_filename, is_refseq, ARGS)
    # potential N-linked glycosylation site (PNGS) motif
    re_pngs = re_compile(r'N[^P][TS][^P]', re_I)
    # compute features
    ylabeler = Labeler(
        partial(expression, ARGS.LABEL),
        partial(skipper, is_refseq, ARGS.SUBTYPES)
        )
    alignment, y, threshold = ylabeler(alignment)
    filter = naive_filter(
        max_conservation=ARGS.MAX_CONSERVATION,
        min_conservation=ARGS.MIN_CONSERVATION,
        max_gap_ratio=ARGS.MAX_GAP_RATIO
        )
    extractors = [('site', SiteVectorizer(ARGS.ENCODER, filter))]
    if ARGS.RADIUS:
        extractors.append(('site_pairs', PairwiseSiteVectorizer(ARGS.ENCODER, filter, ARGS.RADIUS)))
    if ARGS.PNGS:
        extractors.append(('pngs', MotifVectorizer(re_pngs, 4, name='PNGS')))
    if ARGS.PNGS_PAIRS:
        extractors.append(
            ('pngs_pairs', PairwiseMotifVectorizer(re_pngs, 4, name='PNGS'))
            )
    extractor = FeatureUnion(extractors, n_jobs=1)  # n_jobs must be one for now
    X = extractor.fit_transform(alignment)
    Cs = list(C_range(*ARGS.LOG2C))
    scorer = Scorer(ARGS.OPTSTAT)
    # we don't let GridSearchCV do its parallelization over all combinations
    # of grid points, because when the length of FEATURE_GRID is short,
    # it takes way longer than it should
    # usually the # of Cs is larger than the # of ks
    C_jobs = int(getenv('NCPU', -1))
    k_jobs = 1
    # if not, swap the parallelization strategy
    if len(ARGS.FEATURE_GRID) > len(Cs):
        C_jobs, k_jobs = k_jobs, C_jobs
    mrmr = MRMR(
        method=ARGS.MRMR_METHOD,
        normalize=ARGS.MRMR_NORMALIZE,
        similar=ARGS.SIMILAR
        )
    svm = GridSearchCV(
        estimator=SVC(kernel='linear', class_weight='auto'),
        param_grid=dict(C=Cs),
        scoring=scorer,
        n_jobs=C_jobs,
        pre_dispatch='3 * n_jobs'
        )
    pipe = Pipeline([('mrmr', mrmr), ('svm', svm)])
    if len(ARGS.FEATURE_GRID) == 1:
        # single k: no outer grid search needed
        pipe.set_params(mrmr__k=ARGS.FEATURE_GRID[0], svm__cv=ARGS.CV_FOLDS)
        clf = pipe.fit(X, y)
    else:
        # nested CV: inner folds tune C, outer folds pick k
        pipe.set_params(svm__cv=ARGS.CV_FOLDS - 1)
        clf = GridSearchCV(
            estimator=pipe,
            param_grid=dict(mrmr__k=ARGS.FEATURE_GRID),
            scoring=scorer,
            n_jobs=k_jobs,
            pre_dispatch='3 * n_jobs',
            cv=ARGS.CV_FOLDS
            ).fit(X, y).best_estimator_
    pickle_dump((MODEL_VERSION, ARGS.ENCODER, ARGS.LABEL, hmm, extractor, clf), ARGS.MODEL)
    ARGS.MODEL.close()
    mrmr_ = clf.named_steps['mrmr']
    svm_ = clf.named_steps['svm'].best_estimator_
    coefs, ranks = coefs_ranks(mrmr_.ranking_, mrmr_.support_, svm_.coef_)
    # report in-sample performance of the final model
    results = Results(extractor.get_feature_names(), scorer, ARGS.SIMILAR)
    results.add(y, clf.predict(X), coefs, ranks)
    results.metadata(antibodies, ARGS.LABEL)
    print(results.dumps(), file=ARGS.OUTPUT)
    finalize_args(ARGS)
    return ARGS.MODEL
def main(args=None):
    """Predict labels for unlabeled sequences using a pickled model.

    Loads a (MODEL_VERSION, encoder, label, hmm, extractor, clf) tuple
    from ARGS.MODEL, aligns the input SEQUENCES against the stored HMM,
    extracts features, and writes a JSON document of per-sequence
    predictions (with the selected features present in each sequence) to
    ARGS.OUTPUT.  Returns 0.
    """
    if args is None:
        args = sys.argv[1:]
    # fail loudly on any numpy floating-point anomaly
    np.seterr(all='raise')
    parser, ns, args = init_args(description='Predict label for unlabeled sequences', args=args)
    parser = hmmer_args(parser)
    parser.add_argument('MODEL', type=PathType)
    parser.add_argument('SEQUENCES', type=PathType)
    ARGS = parse_args(parser, args, namespace=ns)
    with gzip_open(ARGS.MODEL, 'rb') as fh:
        try:
            # NOTE(review): unpickling executes arbitrary code -- only
            # load models from trusted sources
            model = pickle_load(fh)
            if model[0] != MODEL_VERSION:
                raise ImportError('incompatible model version')
            ARGS.ENCODER, ARGS.LABEL, hmm, extractor, clf = model[1:]
        except ImportError:
            # also triggered when unpickling fails to import model classes
            msg = 'your model is not of the appropriate version, please re-learn your model'
            raise RuntimeError(msg)
    # create a temporary file wherein space characters have been removed
    with open(ARGS.SEQUENCES) as seq_fh:

        def seqrecords():
            # lazily verify the input: DNA models take DNA as-is, amino
            # models translate DNA input, and on verification failure an
            # amino model falls back to reading the amino alphabet
            is_dna = ARGS.ENCODER == DNAEncoder
            seq_fmt = seqfile_format(ARGS.SEQUENCES)
            source = Verifier(SeqIO.parse(seq_fh, seq_fmt), DNAAlphabet)
            try:
                for record in source:
                    yield record if is_dna else translate(record)
            except VerifyError:
                if is_dna:
                    msg = (
                        "your model specifies a DNA encoding "
                        "which is incompatible with protein sequences"
                        )
                    raise RuntimeError(msg)
                source.set_alphabet(AminoAlphabet)
                for record in source:
                    yield record

        # BUGFIX: pre-bind the temp names so the finally block cannot
        # raise NameError (masking the real error) when mkstemp or the
        # alignment step fails before assignment
        tmphmm = tmpaln = None
        try:
            fd, tmphmm = mkstemp()
            close(fd)
            with open(tmphmm, 'wb') as hmm_fh:
                hmm_fh.write(hmm)
            # explicitly gc hmm
            hmm = None
            tmpaln = generate_alignment_(seqrecords(), tmphmm, ARGS)
            alignment = load_stockholm(tmpaln, trim=True)
        finally:
            if tmphmm is not None and exists(tmphmm):
                remove(tmphmm)
            if tmpaln is not None and exists(tmpaln):
                remove(tmpaln)
    X = extractor.transform(alignment)
    y = clf.predict(X)
    feature_names = extractor.get_feature_names()
    support = clf.named_steps['mrmr'].support_
    # quoted labels for the selected features, plus equally-wide blanks
    # so absent features keep the columns aligned
    labels = ['"{0:s}"'.format(feature_names[i]) for i, s in enumerate(support) if s]
    emptys = [' ' * (len(label) + 2) for label in labels]
    idlen = max(len(r.id) for r in alignment) + 3
    print('{{\n "label": "{0:s}",\n "predictions": ['.format(ARGS.LABEL), file=ARGS.OUTPUT)
    for i, r in enumerate(alignment):
        if i > 0:
            # BUGFIX: the separator was print(',') -- it went to stdout
            # instead of ARGS.OUTPUT, corrupting redirected output
            print(',', file=ARGS.OUTPUT)
        features = ['[ ']
        for j, x in enumerate(X[i, support]):
            if x:
                features.append(labels[j])
                features.append(', ')
            else:
                features.append(emptys[j])
        features.append(' ]')
        # drop the trailing comma separator, if any
        idx = None
        for k, f in enumerate(features):
            if f == ', ':
                idx = k
        if idx is None:
            features[0] = features[0].rstrip()
            features[-1] = features[-1].lstrip()
        else:
            features[idx] = ''
        features_ = ''.join(features)
        print(
            ' {{{{ "id": {{0:<{0:d}s}} "value": {{1: d}}, "features": {{2:s}} }}}}'.format(
                idlen).format('"{0:s}",'.format(r.id), y[i], features_),
            file=ARGS.OUTPUT, end='')
    print('\n ]\n}', file=ARGS.OUTPUT)
    finalize_args(ARGS)
    return 0
def main(args=None):
    """Predict labels for unlabeled sequences using a pickled model.

    Loads a (version, encoder, label, hmm, extractor, clf) tuple from
    ARGS.MODEL, aligns the input SEQUENCES against the stored HMM,
    extracts features, and writes a JSON document of per-sequence
    predictions (with the selected features present in each sequence) to
    ARGS.OUTPUT.  Returns 0.
    """
    if args is None:
        args = sys.argv[1:]
    # fail loudly on any numpy floating-point anomaly
    np.seterr(all='raise')
    parser, ns, args = init_args(description='Predict label for unlabeled sequences', args=args)
    parser = hmmer_args(parser)
    parser.add_argument('MODEL', type=PathType)
    parser.add_argument('SEQUENCES', type=PathType)
    ARGS = parse_args(parser, args, namespace=ns)
    with gzip_open(ARGS.MODEL, 'rb') as fh:
        try:
            # NOTE(review): unpickling executes arbitrary code -- only
            # load models from trusted sources
            model = pickle_load(fh)
            # NOTE(review): hard-coded version 4 here, while a sibling
            # copy of this function compares against MODEL_VERSION --
            # confirm they agree before shipping both
            if model[0] != 4:
                raise ImportError('incompatible model version')
            ARGS.ENCODER, ARGS.LABEL, hmm, extractor, clf = model[1:]
        except ImportError:
            # also triggered when unpickling fails to import model classes
            msg = 'your model is not of the appropriate version, please re-learn your model'
            raise RuntimeError(msg)
    # create a temporary file wherein space characters have been removed
    with open(ARGS.SEQUENCES) as seq_fh:

        def seqrecords():
            # lazily verify the input: DNA models take DNA as-is, amino
            # models translate DNA input, and on verification failure an
            # amino model falls back to reading the amino alphabet
            is_dna = ARGS.ENCODER == DNAEncoder
            seq_fmt = seqfile_format(ARGS.SEQUENCES)
            source = Verifier(SeqIO.parse(seq_fh, seq_fmt), DNAAlphabet)
            try:
                for record in source:
                    yield record if is_dna else translate(record)
            except VerifyError:
                if is_dna:
                    msg = (
                        "your model specifies a DNA encoding "
                        "which is incompatible with protein sequences"
                        )
                    raise RuntimeError(msg)
                source.set_alphabet(AminoAlphabet)
                for record in source:
                    yield record

        # BUGFIX: pre-bind the temp names so the finally block cannot
        # raise NameError (masking the real error) when mkstemp or the
        # alignment step fails before assignment
        tmphmm = tmpaln = None
        try:
            fd, tmphmm = mkstemp()
            close(fd)
            with open(tmphmm, 'wb') as hmm_fh:
                hmm_fh.write(hmm)
            # explicitly gc hmm
            hmm = None
            tmpaln = generate_alignment_(seqrecords(), tmphmm, ARGS)
            alignment = load_stockholm(tmpaln, trim=True)
        finally:
            if tmphmm is not None and exists(tmphmm):
                remove(tmphmm)
            if tmpaln is not None and exists(tmpaln):
                remove(tmpaln)
    X = extractor.transform(alignment)
    y = clf.predict(X)
    feature_names = extractor.get_feature_names()
    support = clf.named_steps['mrmr'].support_
    # quoted labels for the selected features, plus equally-wide blanks
    # so absent features keep the columns aligned
    labels = ['"{0:s}"'.format(feature_names[i]) for i, s in enumerate(support) if s]
    emptys = [' ' * (len(label) + 2) for label in labels]
    idlen = max(len(r.id) for r in alignment) + 3
    print('{{\n "label": "{0:s}",\n "predictions": ['.format(ARGS.LABEL), file=ARGS.OUTPUT)
    for i, r in enumerate(alignment):
        if i > 0:
            # BUGFIX: the separator was print(',') -- it went to stdout
            # instead of ARGS.OUTPUT, corrupting redirected output
            print(',', file=ARGS.OUTPUT)
        features = ['[ ']
        for j, x in enumerate(X[i, support]):
            if x:
                features.append(labels[j])
                features.append(', ')
            else:
                features.append(emptys[j])
        features.append(' ]')
        # drop the trailing comma separator, if any
        idx = None
        for k, f in enumerate(features):
            if f == ', ':
                idx = k
        if idx is None:
            features[0] = features[0].rstrip()
            features[-1] = features[-1].lstrip()
        else:
            features[idx] = ''
        features_ = ''.join(features)
        print(
            ' {{{{ "id": {{0:<{0:d}s}} "value": {{1: d}}, "features": {{2:s}} }}}}'.format(
                idlen).format('"{0:s}",'.format(r.id), y[i], features_),
            file=ARGS.OUTPUT, end='')
    print('\n ]\n}', file=ARGS.OUTPUT)
    finalize_args(ARGS)
    return 0
def main(args=None):
    """Cross-validate an epitope-site predictor over a feature-count grid.

    NOTE(review): this appears to be a duplicate of another ``main`` in
    this file; if both live in one module the later definition shadows
    the earlier -- confirm these belong in separate modules.

    Builds a labeled HMMER alignment for the requested antibodies,
    extracts MSA-derived features, and for each feature count in
    ARGS.FEATURE_GRID evaluates a linear SVM (with MRMR or RFE feature
    selection) under stratified k-fold cross-validation, keeping the
    best-scoring Results.  Prints that Results to ARGS.OUTPUT and
    returns it ({} when ARGS.TEST short-circuits the run).
    """
    init_log()
    if args is None:
        args = sys.argv[1:]
    # fail loudly on any numpy floating-point anomaly
    np.seterr(all='raise')
    # do some option parsing
    parser, ns, args = init_args(description="Predict epitope sites.", args=args)
    parser = hmmer_args(parser)
    parser = featsel_args(parser)
    parser = feature_args(parser)
    parser = mrmr_args(parser)
    parser = rfe_args(parser)
    parser = optstat_args(parser)
    parser = filter_args(parser)
    parser = svm_args(parser)
    parser = cv_args(parser)
    parser.add_argument('ANTIBODY', type=AntibodyTypeFactory(ns.DATA), nargs='+')
    ARGS = parse_args(parser, args, namespace=ns)
    # do some argument parsing
    if ARGS.TEST:
        test_discrete(ARGS)
        finalize_args(ARGS)
        return {}
    # maxrel doesn't support similar
    if ARGS.MRMR_METHOD == 'MAXREL':
        ARGS.SIMILAR = 0.0
    antibodies = tuple(ARGS.ANTIBODY)
    # set the util params
    set_util_params(ARGS.REFSEQ.id)
    # grab the relevant antibody from the SQLITE3 data
    # format as SeqRecord so we can output as FASTA
    # and generate an alignment using HMMER if it doesn't already exist
    seqrecords, clonal, antibodies = ARGS.DATA.seqrecords(
        antibodies, ARGS.CLONAL)
    # if we're doing LOOCV, make sure we set CV_FOLDS appropriately
    if ARGS.LOOCV:
        ARGS.CV_FOLDS = len(seqrecords)
    ab_basename = ''.join(('+'.join(antibodies),
                           '_dna' if ARGS.ENCODER == DNAEncoder else '_amino',
                           '_clonal' if clonal else ''))
    alignment_basename = '_'.join(
        (ab_basename, ARGS.DATA.basename_root, __version__))
    sto_filename = alignment_basename + '.sto'
    # don't capture the second variable, let it be gc'd
    alignment = generate_alignment(seqrecords, sto_filename, is_refseq, ARGS)[0]
    # potential N-linked glycosylation site (PNGS) motif
    re_pngs = re_compile(r'N[^P][TS][^P]', re_I)
    ylabeler = Labeler(partial(expression, ARGS.LABEL),
                       partial(skipper, is_refseq, ARGS.SUBTYPES))
    alignment, y, threshold = ylabeler(alignment)
    # NOTE: shadows the builtin `filter`; kept as-is for compatibility
    filter = naive_filter(max_conservation=ARGS.MAX_CONSERVATION,
                          min_conservation=ARGS.MIN_CONSERVATION,
                          max_gap_ratio=ARGS.MAX_GAP_RATIO)
    extractors = [('site_ident', MSAVectorizer(ARGS.ENCODER, filter))]
    if ARGS.RADIUS:
        extractors.append(('pair_ident', MSAVectorizerPairwise(ARGS.ENCODER, filter, ARGS.RADIUS)))
    if ARGS.PNGS:
        extractors.append(('pngs', MSAVectorizerRegex(re_pngs, 4, name='PNGS')))
    if ARGS.PNGS_PAIRS:
        extractors.append(
            ('pngs_pair', MSAVectorizerRegexPairwise(re_pngs, 4, name='PNGS')))
    extractor = FeatureUnion(extractors, n_jobs=1)  # n_jobs must be 1 for now
    X = extractor.fit_transform(alignment)
    assert y.shape[0] == X.shape[0], \
        "number of classes doesn't match the data: %d vs %d" % (y.shape[0], X.shape[0])
    scorer = Scorer(ARGS.OPTSTAT)
    # do grid-search as part of the svm to avoid
    # performing feature selection on every iteration
    # of the grid search, which naturally takes forever
    svm = GridSearchCV(estimator=SVC(kernel='linear', class_weight='auto'),
                       param_grid=dict(C=list(C_range(*ARGS.LOG2C))),
                       scoring=scorer,
                       n_jobs=int(getenv('NCPU', -1)),
                       pre_dispatch='3 * n_jobs',
                       cv=ARGS.CV_FOLDS - 1)
    results = None
    for n_features in ARGS.FEATURE_GRID:
        results_ = Results(extractor.get_feature_names(), scorer, ARGS.SIMILAR)
        for train_idxs, test_idxs in StratifiedKFold(y, ARGS.CV_FOLDS):
            if train_idxs.sum() < 1 or test_idxs.sum() < 1:
                # degenerate fold: record it as a trivially perfect prediction
                # NOTE(review): three args here vs four at the normal call
                # site below -- confirm Results.add defaults its ranks arg
                y_true = y[test_idxs]
                results_.add(y_true, y_true, {})
                continue
            X_train = X[train_idxs]
            y_train = y[train_idxs]
            if ARGS.RFE:
                clf = RFE(estimator=svm,
                          n_features_to_select=n_features,
                          step=ARGS.RFE_STEP)
            else:
                mrmr = MRMR(k=n_features,
                            method=ARGS.MRMR_METHOD,
                            normalize=ARGS.MRMR_NORMALIZE,
                            similar=ARGS.SIMILAR)
                clf = Pipeline([('mrmr', mrmr), ('svm', svm)])
            clf.fit(X_train, y_train)
            X_test = X[test_idxs]
            y_true = y[test_idxs]
            # dig the fitted selector and best SVM out of whichever
            # wrapper (RFE or Pipeline) was used
            if ARGS.RFE:
                selector_ = clf
                svm_ = clf.estimator_.best_estimator_
            else:
                selector_ = clf.named_steps['mrmr']
                svm_ = clf.named_steps['svm'].best_estimator_
            y_pred = clf.predict(X_test)
            coefs, ranks = coefs_ranks(selector_.ranking_, selector_.support_, svm_.coef_)
            results_.add(y_true, y_pred, coefs, ranks)
        # keep the best-scoring grid point
        if results is None or results_ > results:
            results = results_
    # the alignment reflects the number of sequences either naturally
    results.metadata(antibodies, ARGS.LABEL)
    print(results.dumps(), file=ARGS.OUTPUT)
    finalize_args(ARGS)
    return results
def main(args=None):
    """Learn and pickle a model for labeled sequences.

    Builds an HMMER alignment for the requested antibodies, labels it,
    extracts site/motif features, grid-searches an mRMR+SVM pipeline over
    C (and, when ARGS.FEATURE_GRID has several entries, over the feature
    count k via a nested grid search), pickles the fitted model to
    ARGS.MODEL, prints a Results summary to ARGS.OUTPUT, and returns
    ARGS.MODEL ({} when ARGS.TEST short-circuits the run).
    """
    if args is None:
        args = sys.argv[1:]
    # fail loudly on any numpy floating-point anomaly
    np.seterr(all='raise')
    parser, ns, args = init_args(
        description='learn model for labeled sequences', args=args)
    parser = hmmer_args(parser)
    parser = featsel_args(parser)
    parser = feature_args(parser)
    parser = mrmr_args(parser)
    parser = rfe_args(parser)
    parser = optstat_args(parser)
    parser = filter_args(parser)
    parser = svm_args(parser)
    parser = cv_args(parser)

    def GzipType(string):
        # argparse ``type`` callable: open *string* as a gzip stream for
        # writing.  BUGFIX: the original *returned* the ArgumentTypeError
        # instead of raising it, so argparse handed the exception object
        # to the program as if it were the open file; raise it (chained)
        # so the user gets a proper usage error instead.
        try:
            return gzip_open(string, 'wb')
        except OSError as err:
            raise ArgumentTypeError(
                "cannot open '{0:s}' for writing".format(string)) from err

    parser.add_argument('--tree', dest='TREE')
    parser.add_argument('ANTIBODY', type=AntibodyTypeFactory(ns.DATA), nargs='+')
    parser.add_argument('MODEL', type=GzipType)
    ARGS = parse_args(parser, args, namespace=ns)
    antibodies = tuple(ARGS.ANTIBODY)
    # do some argument parsing
    if ARGS.TEST:
        test_discrete(ARGS)
        finalize_args(ARGS)
        return {}
    # maxrel doesn't support similar
    if ARGS.MRMR_METHOD == 'MAXREL':
        ARGS.SIMILAR = 0.0
    # set the util params
    set_util_params(ARGS.REFSEQ.id)
    # grab the relevant antibody from the SQLITE3 data
    # format as SeqRecord so we can output as FASTA
    # and generate an alignment using HMMER if it doesn't already exist
    seqrecords, clonal, antibodies = ARGS.DATA.seqrecords(
        antibodies, ARGS.CLONAL)
    ab_basename = ''.join(('+'.join(antibodies),
                           '_dna' if ARGS.ENCODER == DNAEncoder else '_amino',
                           '_clonal' if clonal else ''))
    alignment_basename = '_'.join(
        (ab_basename, ARGS.DATA.basename_root, __version__))
    sto_filename = alignment_basename + '.sto'
    # keep the hmm: it is pickled alongside the model below
    alignment, hmm = generate_alignment(seqrecords, sto_filename, is_refseq, ARGS)
    # potential N-linked glycosylation site (PNGS) motif
    re_pngs = re_compile(r'N[^P][TS][^P]', re_I)
    # compute features
    ylabeler = Labeler(partial(expression, ARGS.LABEL),
                       partial(skipper, is_refseq, ARGS.SUBTYPES))
    alignment, y, threshold = ylabeler(alignment)
    filter = naive_filter(max_conservation=ARGS.MAX_CONSERVATION,
                          min_conservation=ARGS.MIN_CONSERVATION,
                          max_gap_ratio=ARGS.MAX_GAP_RATIO)
    extractors = [('site', MSAVectorizer(ARGS.ENCODER, filter))]
    if ARGS.RADIUS:
        extractors.append(('site_pairs', MSAVectorizerPairwise(ARGS.ENCODER, filter, ARGS.RADIUS)))
    if ARGS.PNGS:
        extractors.append(('pngs', MSAVectorizerRegex(re_pngs, 4, name='PNGS')))
    if ARGS.PNGS_PAIRS:
        extractors.append(
            ('pngs_pairs', MSAVectorizerRegexPairwise(re_pngs, 4, name='PNGS')))
    extractor = FeatureUnion(extractors, n_jobs=1)  # n_jobs must be one for now
    X = extractor.fit_transform(alignment)
    Cs = list(C_range(*ARGS.LOG2C))
    scorer = Scorer(ARGS.OPTSTAT)
    # we don't let GridSearchCV do its parallelization over all combinations
    # of grid points, because when the length of FEATURE_GRID is short,
    # it takes way longer than it should
    # usually the # of Cs is larger than the # of ks
    C_jobs = int(getenv('NCPU', -1))
    k_jobs = 1
    # if not, swap the parallelization strategy
    if len(ARGS.FEATURE_GRID) > len(Cs):
        C_jobs, k_jobs = k_jobs, C_jobs
    mrmr = MRMR(method=ARGS.MRMR_METHOD,
                normalize=ARGS.MRMR_NORMALIZE,
                similar=ARGS.SIMILAR)
    svm = GridSearchCV(estimator=SVC(kernel='linear', class_weight='auto'),
                       param_grid=dict(C=Cs),
                       scoring=scorer,
                       n_jobs=C_jobs,
                       pre_dispatch='3 * n_jobs')
    pipe = Pipeline([('mrmr', mrmr), ('svm', svm)])
    if len(ARGS.FEATURE_GRID) == 1:
        # single k: no outer grid search needed
        pipe.set_params(mrmr__k=ARGS.FEATURE_GRID[0], svm__cv=ARGS.CV_FOLDS)
        clf = pipe.fit(X, y)
    else:
        # nested CV: inner folds tune C, outer folds pick k
        pipe.set_params(svm__cv=ARGS.CV_FOLDS - 1)
        clf = GridSearchCV(estimator=pipe,
                           param_grid=dict(mrmr__k=ARGS.FEATURE_GRID),
                           scoring=scorer,
                           n_jobs=k_jobs,
                           pre_dispatch='3 * n_jobs',
                           cv=ARGS.CV_FOLDS).fit(X, y).best_estimator_
    # NOTE(review): hard-coded model version 4 here, while a sibling copy
    # of this function writes MODEL_VERSION -- confirm they agree
    pickle_dump((4, ARGS.ENCODER, ARGS.LABEL, hmm, extractor, clf), ARGS.MODEL)
    ARGS.MODEL.close()
    mrmr_ = clf.named_steps['mrmr']
    svm_ = clf.named_steps['svm'].best_estimator_
    coefs, ranks = coefs_ranks(mrmr_.ranking_, mrmr_.support_, svm_.coef_)
    # report in-sample performance of the final model
    results = Results(extractor.get_feature_names(), scorer, ARGS.SIMILAR)
    results.add(y, clf.predict(X), coefs, ranks)
    results.metadata(antibodies, ARGS.LABEL)
    print(results.dumps(), file=ARGS.OUTPUT)
    finalize_args(ARGS)
    return ARGS.MODEL