def by_mrmr(X, y, n_features_to_select=None, only_get_index=True):
    """Select features with mRMR and return either their indices or the data.

    Parameters
    ----------
    X : feature matrix handed to ``MRMR.fit``.
    y : target vector handed to ``MRMR.fit``.
    n_features_to_select : forwarded unchanged to the ``MRMR`` constructor.
    only_get_index : when truthy, return the selected feature indices
        (``get_support(indices=True)``); otherwise return the result of
        ``transform``.

    Returns
    -------
    The selected indices, or the transformed ``X``, depending on
    ``only_get_index``.
    """
    # Imported lazily so the module can be loaded without sklmrmr installed.
    from sklmrmr import MRMR

    mrmr = MRMR(n_features_to_select=n_features_to_select)
    mrmr.fit(X, y)

    if only_get_index:
        # The fitted selector is `mrmr`; the previous code referenced an
        # undefined name here and therefore always raised NameError.
        return mrmr.get_support(indices=True)

    # NOTE(review): `y` is forwarded to transform() exactly as before --
    # confirm the installed sklmrmr version accepts it.
    return mrmr.transform(X, y)
def test_mrmr(self):
    """Check that MRMR(k=1) picks the single column that mirrors the labels.

    Column 0 of ``X`` is 0 for the first five rows and 1 for the last five,
    exactly like ``y``; every other column is all zeros.
    """
    n_rows, n_cols, split = 10, 10, 5

    X = np.zeros((n_rows, n_cols))
    y = np.zeros(n_rows)
    # Make the first column (and only that one) identical to the target.
    X[split:, 0] = 1
    y[split:] = 1

    model = MRMR(k=1)
    model.fit(X, y)

    assert model.selected_[0] == 0
    assert model.n_features_ == 1
def test2():
    """Fit MRMR (5 features, linear SVC estimator) on the digits data set.

    Prints the fitted ``ranking_`` attribute and returns 0.
    """
    from sklearn.svm import SVC
    from sklearn.datasets import load_digits
    from sklmrmr import MRMR

    digits = load_digits()
    images = digits.images
    # Flatten every image into one row of integer pixel values.
    X = images.reshape((len(images), -1)).astype(int)
    y = digits.target

    estimator = SVC(kernel='linear', C=1)
    selector = MRMR(estimator=estimator, n_features_to_select=5)
    selector.fit(X, y)

    print(selector.ranking_)
    return 0
def test_discrete(ARGS):
    """Self-test: vectorize the built-in test alignment and fit MRMR on it.

    Writes ``TEST_AMINO_STO`` to a temporary Stockholm file, reads it back,
    and for each of ``AminoEncoder`` and ``StanfelEncoder`` checks the
    generated feature names, feature matrix and labels against the
    corresponding ``TEST_*`` constants before fitting an ``MRMR`` selector.

    Parameters
    ----------
    ARGS : argument namespace. It is modified in place: NUM_FEATURES,
        MRMR_METHOD, MAX_CONSERVATION, MAX_GAP_RATIO, MIN_CONSERVATION,
        CUTOFF and ENCODER are overwritten; REFSEQ_IDS, MRMR_NORMALIZE and
        SIMILAR are read.

    Raises
    ------
    AssertionError : when any generated value differs from the expectation.
    """
    # set these to this so we don't exclude anything
    # (just testing file generation and parsing)
    ARGS.NUM_FEATURES = 15  # should be enough, the number is known to be 13
    ARGS.MRMR_METHOD = 'MID'
    ARGS.MAX_CONSERVATION = 1.0
    ARGS.MAX_GAP_RATIO = 1.0
    ARGS.MIN_CONSERVATION = 1.0
    ARGS.CUTOFF = 20.

    # if we don't do this, DOOMBUNNIES
    set_util_params(ARGS.REFSEQ_IDS)

    # mkstemp returns an open descriptor; only the path is needed here.
    fd, sto_filename = mkstemp()
    close(fd)

    try:
        # Context manager: the handle is closed even if the write fails.
        with open(sto_filename, 'w') as fh:
            print(TEST_AMINO_STO, file=fh)

        alignment = AlignIO.read(sto_filename, 'stockholm')

        for ARGS.ENCODER in (AminoEncoder, StanfelEncoder):

            if ARGS.ENCODER == StanfelEncoder:
                TEST_NAMES = TEST_STANFEL_NAMES
                TEST_X = TEST_STANFEL_X
            else:
                TEST_NAMES = TEST_AMINO_NAMES
                TEST_X = TEST_AMINO_X

            # test mRMR and LSVM file generation
            ylabeler = Labeler(
                seqrecord_get_values,
                lambda row: is_refseq(row) or False,  # TODO: again filtration function
                lambda x: x > ARGS.CUTOFF,
                False
            )
            alignment, y, ic50 = ylabeler(alignment)

            refidx = reference_index(alignment, is_refseq)
            alignment = LabeledMSA.from_msa_with_ref(alignment, refidx)
            extractor = SiteVectorizer(ARGS.ENCODER)
            x = extractor.fit_transform(alignment)
            colnames = extractor.get_feature_names()

            # test the feature names portion
            assert len(colnames) == len(TEST_NAMES), \
                'gen: %s\ntruth: %s' % (colnames, TEST_NAMES)

            for name in TEST_NAMES:
                assert name in colnames, \
                    'ERROR: \'%s\' not found in %s' % (name, ', '.join(colnames))

            assert np.all(TEST_X == x)
            assert np.all(TEST_Y == y)

            # generate and test the mRMR portion
            mrmr = MRMR(
                estimator=SVC(kernel='linear'),
                n_features_to_select=ARGS.NUM_FEATURES,
                method=ARGS.MRMR_METHOD,
                normalize=ARGS.MRMR_NORMALIZE,
                similar=ARGS.SIMILAR
            )
            mrmr.fit(x, y)

    finally:
        # Always remove the temporary alignment file.
        remove(sto_filename)

    print('ALL TESTS PASS', file=sys.stderr)
' maximum relevance feature selection') parser.add_argument('--method', type=str, default="mid") parser.add_argument('--normalize', action='store_true') parser.add_argument('--n_features', type=int, default=10) parser.add_argument('--file', type=FileType('r'), default=open(DEFAULT_FILE)) parser.add_argument('--digits', action='store_true') ns = parser.parse_args(args) if ns.digits: X, y = get_digits() data_name = 'digits' else: X, y = read_csv(ns.file) data_name = ns.file.name model = MRMR(k=ns.n_features, method=ns.method, normalize=ns.normalize) names = list("feature_{}".format(i) for i in range(X.shape[1])) print('running on {}'.format(data_name)) print('model: {}'.format(model)) t = time() model.fit(X, y) t = time() - t print("time: {:.3f} seconds".format(t)) selected_names = list(names[i] for i in np.argsort(model.ranking_)[:model.k]) print("selected features:\n{}".format(", ".join(selected_names)))
def test_discrete(ARGS):
    """Self-test: vectorize the built-in test alignment and fit MRMR on it.

    Writes ``TEST_AMINO_STO`` to a temporary Stockholm file, reads it back,
    and for each of ``AminoEncoder`` and ``StanfelEncoder`` checks the
    generated feature names, feature matrix and labels against the
    corresponding ``TEST_*`` constants before fitting an ``MRMR`` selector.

    Parameters
    ----------
    ARGS : argument namespace. It is modified in place: NUM_FEATURES,
        MRMR_METHOD, MAX_CONSERVATION, MAX_GAP_RATIO, MIN_CONSERVATION,
        CUTOFF and ENCODER are overwritten; REFSEQ_IDS, MRMR_NORMALIZE and
        SIMILAR are read.

    Raises
    ------
    AssertionError : when any generated value differs from the expectation.
    """
    # set these to this so we don't exclude anything
    # (just testing file generation and parsing)
    ARGS.NUM_FEATURES = 15  # should be enough, the number is known to be 13
    ARGS.MRMR_METHOD = 'MID'
    ARGS.MAX_CONSERVATION = 1.0
    ARGS.MAX_GAP_RATIO = 1.0
    ARGS.MIN_CONSERVATION = 1.0
    ARGS.CUTOFF = 20.

    # if we don't do this, DOOMBUNNIES
    set_util_params(ARGS.REFSEQ_IDS)

    # mkstemp returns an open descriptor; only the path is needed here.
    fd, sto_filename = mkstemp()
    close(fd)

    try:
        # Context manager: the handle is closed even if the write fails.
        with open(sto_filename, 'w') as fh:
            print(TEST_AMINO_STO, file=fh)

        alignment = AlignIO.read(sto_filename, 'stockholm')

        for ARGS.ENCODER in (AminoEncoder, StanfelEncoder):

            if ARGS.ENCODER == StanfelEncoder:
                TEST_NAMES = TEST_STANFEL_NAMES
                TEST_X = TEST_STANFEL_X
            else:
                TEST_NAMES = TEST_AMINO_NAMES
                TEST_X = TEST_AMINO_X

            # test mRMR and LSVM file generation
            ylabeler = Labeler(
                seqrecord_get_values,
                lambda row: is_refseq(row) or False,  # TODO: again filtration function
                lambda x: x > ARGS.CUTOFF,
                False
            )
            alignment, y, ic50 = ylabeler(alignment)

            refidx = reference_index(alignment, is_refseq)
            alignment = LabeledMSA.from_msa_with_ref(alignment, refidx)
            extractor = MSAVectorizer(ARGS.ENCODER)
            x = extractor.fit_transform(alignment)
            colnames = extractor.get_feature_names()

            # test the feature names portion
            assert len(colnames) == len(TEST_NAMES), \
                'gen: %s\ntruth: %s' % (colnames, TEST_NAMES)

            for name in TEST_NAMES:
                assert name in colnames, \
                    'ERROR: \'%s\' not found in %s' % (name, ', '.join(colnames))

            assert np.all(TEST_X == x)
            assert np.all(TEST_Y == y)

            # generate and test the mRMR portion
            mrmr = MRMR(
                estimator=SVC(kernel='linear'),
                n_features_to_select=ARGS.NUM_FEATURES,
                method=ARGS.MRMR_METHOD,
                normalize=ARGS.MRMR_NORMALIZE,
                similar=ARGS.SIMILAR
            )
            mrmr.fit(x, y)

    finally:
        # Always remove the temporary alignment file.
        remove(sto_filename)

    print('ALL TESTS PASS', file=sys.stderr)
def main(args=None):
    """Cross-validate a feature-selection + linear-SVM classifier.

    Parses the command line, obtains the alignment for the requested
    antibodies, vectorizes it, and for every value in ``ARGS.FEATURE_GRID``
    evaluates either RFE or an mRMR->SVM pipeline over stratified folds.
    The ``Results`` object that compares greatest (via its own ``>``) is
    dumped to ``ARGS.OUTPUT`` and returned.

    Parameters
    ----------
    args : list of command-line arguments; defaults to ``sys.argv[1:]``.

    Returns
    -------
    The winning ``Results`` object, or ``{}`` when ``ARGS.TEST`` is set (in
    which case only ``test_discrete`` is run).
    """
    init_log()

    if args is None:
        args = sys.argv[1:]

    np.seterr(all='raise')

    # option parsing: every helper registers its options and returns the parser
    parser, ns, args = init_args(description="Predict epitope sites.", args=args)

    option_groups = (
        hmmer_args,
        featsel_args,
        feature_args,
        mrmr_args,
        rfe_args,
        optstat_args,
        filter_args,
        svm_args,
        cv_args,
    )
    for add_options in option_groups:
        parser = add_options(parser)

    parser.add_argument('ANTIBODY', type=AntibodyTypeFactory(ns.DATA), nargs='+')

    ARGS = parse_args(parser, args, namespace=ns)

    # self-test mode short-circuits everything else
    if ARGS.TEST:
        test_discrete(ARGS)
        finalize_args(ARGS)
        return {}

    # maxrel doesn't support similar
    if ARGS.MRMR_METHOD == 'MAXREL':
        ARGS.SIMILAR = 0.0

    antibodies = tuple(ARGS.ANTIBODY)

    # set the util params
    set_util_params(ARGS.REFSEQ.id)

    # grab the relevant antibody from the SQLITE3 data
    # format as SeqRecord so we can output as FASTA
    # and generate an alignment using HMMER if it doesn't already exist
    seqrecords, clonal, antibodies = ARGS.DATA.seqrecords(antibodies, ARGS.CLONAL)

    # if we're doing LOOCV, make sure we set CV_FOLDS appropriately
    if ARGS.LOOCV:
        ARGS.CV_FOLDS = len(seqrecords)

    encoder_tag = '_dna' if ARGS.ENCODER == DNAEncoder else '_amino'
    clonal_tag = '_clonal' if clonal else ''
    ab_basename = '+'.join(antibodies) + encoder_tag + clonal_tag
    alignment_basename = '_'.join(
        (ab_basename, ARGS.DATA.basename_root, __version__))
    sto_filename = alignment_basename + '.sto'

    # don't capture the second variable, let it be gc'd
    alignment = generate_alignment(seqrecords, sto_filename, is_refseq, ARGS)[0]

    pngs_pattern = re_compile(r'N[^P][TS][^P]', re_I)

    ylabeler = Labeler(
        partial(expression, ARGS.LABEL),
        partial(skipper, is_refseq, ARGS.SUBTYPES))
    alignment, y, threshold = ylabeler(alignment)

    site_filter = naive_filter(
        max_conservation=ARGS.MAX_CONSERVATION,
        min_conservation=ARGS.MIN_CONSERVATION,
        max_gap_ratio=ARGS.MAX_GAP_RATIO)

    # the site vectorizer is always present; the others are opt-in
    extractors = [('site_ident', MSAVectorizer(ARGS.ENCODER, site_filter))]

    if ARGS.RADIUS:
        pairwise = MSAVectorizerPairwise(ARGS.ENCODER, site_filter, ARGS.RADIUS)
        extractors.append(('pair_ident', pairwise))

    if ARGS.PNGS:
        extractors.append(
            ('pngs', MSAVectorizerRegex(pngs_pattern, 4, name='PNGS')))

    if ARGS.PNGS_PAIRS:
        extractors.append(
            ('pngs_pair', MSAVectorizerRegexPairwise(pngs_pattern, 4, name='PNGS')))

    extractor = FeatureUnion(extractors, n_jobs=1)  # n_jobs must be 1 for now
    X = extractor.fit_transform(alignment)

    assert y.shape[0] == X.shape[0], \
        "number of classes doesn't match the data: %d vs %d" % (y.shape[0], X.shape[0])

    scorer = Scorer(ARGS.OPTSTAT)

    # do grid-search as part of the svm to avoid
    # performing feature selection on every iteration
    # of the grid search, which naturally takes forever
    svm = GridSearchCV(
        estimator=SVC(kernel='linear', class_weight='auto'),
        param_grid=dict(C=list(C_range(*ARGS.LOG2C))),
        scoring=scorer,
        n_jobs=int(getenv('NCPU', -1)),
        pre_dispatch='3 * n_jobs',
        cv=ARGS.CV_FOLDS - 1)

    results = None
    for n_features in ARGS.FEATURE_GRID:
        fold_results = Results(extractor.get_feature_names(), scorer, ARGS.SIMILAR)

        for train_idxs, test_idxs in StratifiedKFold(y, ARGS.CV_FOLDS):

            # a fold with an empty side is recorded with the true labels
            # standing in for the predictions, and is not fitted
            if train_idxs.sum() < 1 or test_idxs.sum() < 1:
                y_true = y[test_idxs]
                fold_results.add(y_true, y_true, {})
                continue

            X_train = X[train_idxs]
            y_train = y[train_idxs]

            if ARGS.RFE:
                clf = RFE(
                    estimator=svm,
                    n_features_to_select=n_features,
                    step=ARGS.RFE_STEP)
            else:
                mrmr = MRMR(
                    k=n_features,
                    method=ARGS.MRMR_METHOD,
                    normalize=ARGS.MRMR_NORMALIZE,
                    similar=ARGS.SIMILAR)
                clf = Pipeline([('mrmr', mrmr), ('svm', svm)])

            clf.fit(X_train, y_train)

            X_test = X[test_idxs]
            y_true = y[test_idxs]

            # locate the fitted selector and the best SVM found by the grid
            if ARGS.RFE:
                selector_ = clf
                svm_ = clf.estimator_.best_estimator_
            else:
                selector_ = clf.named_steps['mrmr']
                svm_ = clf.named_steps['svm'].best_estimator_

            y_pred = clf.predict(X_test)

            coefs, ranks = coefs_ranks(
                selector_.ranking_, selector_.support_, svm_.coef_)

            fold_results.add(y_true, y_pred, coefs, ranks)

        # keep whichever feature count produced the better results
        if results is None or fold_results > results:
            results = fold_results

    # the alignment reflects the number of sequences either naturally
    results.metadata(antibodies, ARGS.LABEL)

    print(results.dumps(), file=ARGS.OUTPUT)

    finalize_args(ARGS)

    return results
def main(args=None):
    """Learn a feature-selection + linear-SVM model and pickle it.

    Parses the command line, obtains the alignment (and HMM) for the
    requested antibodies, vectorizes it, fits an mRMR->SVM pipeline (grid
    searching over ``ARGS.FEATURE_GRID`` when it has more than one entry),
    pickles the fitted model to the gzip file given as ``MODEL``, and prints
    the results of predicting on the training data to ``ARGS.OUTPUT``.

    Parameters
    ----------
    args : list of command-line arguments; defaults to ``sys.argv[1:]``.

    Returns
    -------
    ``ARGS.MODEL`` (the gzip file object, already closed), or ``{}`` when
    ``ARGS.TEST`` is set (in which case only ``test_discrete`` is run).
    """
    if args is None:
        args = sys.argv[1:]

    np.seterr(all='raise')

    parser, ns, args = init_args(
        description='learn model for labeled sequences', args=args)

    parser = hmmer_args(parser)
    parser = featsel_args(parser)
    parser = feature_args(parser)
    parser = mrmr_args(parser)
    parser = rfe_args(parser)
    parser = optstat_args(parser)
    parser = filter_args(parser)
    parser = svm_args(parser)
    parser = cv_args(parser)

    def GzipType(string):
        """argparse ``type=`` callable: open ``string`` for gzip writing."""
        try:
            return gzip_open(string, 'wb')
        except (IOError, OSError):
            # argparse only reports the problem if the exception is raised;
            # returning it would hand an exception object on as the "file".
            raise ArgumentTypeError(
                "cannot open '{0:s}' for writing".format(string))

    parser.add_argument('--tree', dest='TREE')
    parser.add_argument('ANTIBODY', type=AntibodyTypeFactory(ns.DATA), nargs='+')
    parser.add_argument('MODEL', type=GzipType)

    ARGS = parse_args(parser, args, namespace=ns)

    antibodies = tuple(ARGS.ANTIBODY)

    # self-test mode short-circuits everything else
    if ARGS.TEST:
        test_discrete(ARGS)
        finalize_args(ARGS)
        return {}

    if ARGS.MRMR_METHOD == 'MAXREL':
        ARGS.SIMILAR = 0.0

    # set the util params
    set_util_params(ARGS.REFSEQ.id)

    # grab the relevant antibody from the SQLITE3 data
    # format as SeqRecord so we can output as FASTA
    # and generate an alignment using HMMER if it doesn't already exist
    seqrecords, clonal, antibodies = ARGS.DATA.seqrecords(
        antibodies, ARGS.CLONAL)

    ab_basename = ''.join((
        '+'.join(antibodies),
        '_dna' if ARGS.ENCODER == DNAEncoder else '_amino',
        '_clonal' if clonal else ''))
    alignment_basename = '_'.join(
        (ab_basename, ARGS.DATA.basename_root, __version__))
    sto_filename = alignment_basename + '.sto'

    alignment, hmm = generate_alignment(seqrecords, sto_filename, is_refseq, ARGS)

    re_pngs = re_compile(r'N[^P][TS][^P]', re_I)

    # compute features
    ylabeler = Labeler(
        partial(expression, ARGS.LABEL),
        partial(skipper, is_refseq, ARGS.SUBTYPES))
    alignment, y, threshold = ylabeler(alignment)

    # local renamed so the builtin `filter` is not shadowed
    site_filter = naive_filter(
        max_conservation=ARGS.MAX_CONSERVATION,
        min_conservation=ARGS.MIN_CONSERVATION,
        max_gap_ratio=ARGS.MAX_GAP_RATIO)

    extractors = [('site', MSAVectorizer(ARGS.ENCODER, site_filter))]

    if ARGS.RADIUS:
        extractors.append(
            ('site_pairs',
             MSAVectorizerPairwise(ARGS.ENCODER, site_filter, ARGS.RADIUS)))

    if ARGS.PNGS:
        extractors.append(('pngs', MSAVectorizerRegex(re_pngs, 4, name='PNGS')))

    if ARGS.PNGS_PAIRS:
        extractors.append(
            ('pngs_pairs', MSAVectorizerRegexPairwise(re_pngs, 4, name='PNGS')))

    extractor = FeatureUnion(extractors, n_jobs=1)  # n_jobs must be one for now
    X = extractor.fit_transform(alignment)

    Cs = list(C_range(*ARGS.LOG2C))
    scorer = Scorer(ARGS.OPTSTAT)

    # we don't let GridSearchCV do its parallelization over all combinations
    # of grid points, because when the length of FEATURE_GRID is short,
    # it takes way longer than it should

    # usually the # of Cs is larger than the # of ks
    C_jobs = int(getenv('NCPU', -1))
    k_jobs = 1

    # if not, swap the parallelization strategy
    if len(ARGS.FEATURE_GRID) > len(Cs):
        C_jobs, k_jobs = k_jobs, C_jobs

    mrmr = MRMR(
        method=ARGS.MRMR_METHOD,
        normalize=ARGS.MRMR_NORMALIZE,
        similar=ARGS.SIMILAR)

    svm = GridSearchCV(
        estimator=SVC(kernel='linear', class_weight='auto'),
        param_grid=dict(C=Cs),
        scoring=scorer,
        n_jobs=C_jobs,
        pre_dispatch='3 * n_jobs')

    pipe = Pipeline([('mrmr', mrmr), ('svm', svm)])

    if len(ARGS.FEATURE_GRID) == 1:
        # a single k: no outer search, the inner search gets all the folds
        pipe.set_params(mrmr__k=ARGS.FEATURE_GRID[0], svm__cv=ARGS.CV_FOLDS)
        clf = pipe.fit(X, y)
    else:
        # several ks: outer search over k, inner search gets one fold fewer
        pipe.set_params(svm__cv=ARGS.CV_FOLDS - 1)
        clf = GridSearchCV(
            estimator=pipe,
            param_grid=dict(mrmr__k=ARGS.FEATURE_GRID),
            scoring=scorer,
            n_jobs=k_jobs,
            pre_dispatch='3 * n_jobs',
            cv=ARGS.CV_FOLDS).fit(X, y).best_estimator_

    # the leading 4 is written as the first element of the pickled tuple
    pickle_dump((4, ARGS.ENCODER, ARGS.LABEL, hmm, extractor, clf), ARGS.MODEL)
    ARGS.MODEL.close()

    mrmr_ = clf.named_steps['mrmr']
    svm_ = clf.named_steps['svm'].best_estimator_

    coefs, ranks = coefs_ranks(mrmr_.ranking_, mrmr_.support_, svm_.coef_)
    results = Results(extractor.get_feature_names(), scorer, ARGS.SIMILAR)

    # predictions are made on the same data the model was fitted on
    results.add(y, clf.predict(X), coefs, ranks)

    results.metadata(antibodies, ARGS.LABEL)

    print(results.dumps(), file=ARGS.OUTPUT)

    finalize_args(ARGS)

    return ARGS.MODEL