def by_mrmr(X, y, n_features_to_select=None, only_get_index=True):
    """Run mRMR feature selection on ``X`` / ``y``.

    An ``sklmrmr.MRMR`` selector is constructed with
    ``n_features_to_select`` and fitted on ``(X, y)``.

    Args:
        X: Feature data handed to ``MRMR.fit``.
        y: Target data handed to ``MRMR.fit``.
        n_features_to_select: Forwarded unchanged to the ``MRMR``
            constructor.
        only_get_index: When truthy (the default), return the indices of
            the selected features; otherwise return the transformed data.

    Returns:
        ``mrmr.get_support(indices=True)`` when ``only_get_index`` is
        truthy, else ``mrmr.transform(X, y)``.
    """
    # Function-scope import, as in the original implementation.
    from sklmrmr import MRMR

    mrmr = MRMR(n_features_to_select=n_features_to_select)
    mrmr.fit(X, y)

    if only_get_index:
        # The support must be read from the fitted selector; the previous
        # code referenced an undefined name (``ret``) and raised NameError.
        return mrmr.get_support(indices=True)

    # NOTE(review): ``y`` is passed to ``transform`` exactly as before --
    # confirm that this MRMR version accepts it.
    return mrmr.transform(X, y)
def test_mrmr(self):
    """Check that MRMR(k=1) picks the single informative column.

    Column 0 of the feature matrix is 1 for the last five rows and 0
    elsewhere, exactly like the labels; every other column is all zeros.
    """
    n_rows, n_cols = 10, 10
    split = 5

    labels = np.zeros(n_rows)
    labels[split:] = 1

    features = np.zeros((n_rows, n_cols))
    features[split:, 0] = 1

    selector = MRMR(k=1)
    selector.fit(features, labels)

    assert selector.selected_[0] == 0
    assert selector.n_features_ == 1
def test_discrete(ARGS):
    """Exercise alignment parsing, vectorization and MRMR fitting.

    The Stockholm fixture ``TEST_AMINO_STO`` is written to a temporary
    file, read back with ``AlignIO``, labelled, vectorized with
    ``SiteVectorizer`` once per encoder (``AminoEncoder`` and
    ``StanfelEncoder``), compared against the expected fixtures, and then
    fed to ``MRMR.fit``.

    Args:
        ARGS: Options object. This function overwrites ``NUM_FEATURES``,
            ``MRMR_METHOD``, ``MAX_CONSERVATION``, ``MAX_GAP_RATIO``,
            ``MIN_CONSERVATION``, ``CUTOFF`` and ``ENCODER`` on it, and
            reads ``REFSEQ_IDS``, ``MRMR_NORMALIZE`` and ``SIMILAR``.

    Raises:
        AssertionError: If the generated feature names, feature matrix or
            labels differ from the expected fixtures.
    """
    # Set these so nothing gets excluded; only file generation and parsing
    # are under test.
    ARGS.NUM_FEATURES = 15  # should be enough, the number is known to be 13
    ARGS.MRMR_METHOD = 'MID'
    ARGS.MAX_CONSERVATION = 1.0
    ARGS.MAX_GAP_RATIO = 1.0
    ARGS.MIN_CONSERVATION = 1.0
    ARGS.CUTOFF = 20.

    # Must run before anything else (original note: "if we don't do this,
    # DOOMBUNNIES").
    set_util_params(ARGS.REFSEQ_IDS)

    fd, sto_filename = mkstemp()
    close(fd)

    try:
        # A context manager guarantees the handle is closed even when the
        # write fails.
        with open(sto_filename, 'w') as fh:
            print(TEST_AMINO_STO, file=fh)

        alignment = AlignIO.read(sto_filename, 'stockholm')

        # ARGS.ENCODER is deliberately the loop target so that the options
        # object always reflects the encoder being tested.
        for ARGS.ENCODER in (AminoEncoder, StanfelEncoder):
            if ARGS.ENCODER == StanfelEncoder:
                expected_names = TEST_STANFEL_NAMES
                expected_x = TEST_STANFEL_X
            else:
                expected_names = TEST_AMINO_NAMES
                expected_x = TEST_AMINO_X

            # test mRMR and LSVM file generation
            ylabeler = Labeler(
                seqrecord_get_values,
                lambda row: is_refseq(row) or False,  # TODO: again filtration function
                lambda x: x > ARGS.CUTOFF,
                False
            )
            # NOTE(review): ``alignment`` is rebound here, so the second
            # encoder pass labels the object produced by the first pass --
            # confirm this is intended.
            alignment, y, ic50 = ylabeler(alignment)

            refidx = reference_index(alignment, is_refseq)
            alignment = LabeledMSA.from_msa_with_ref(alignment, refidx)
            extractor = SiteVectorizer(ARGS.ENCODER)
            x = extractor.fit_transform(alignment)
            colnames = extractor.get_feature_names()

            # Feature names: raise explicitly so the checks survive
            # ``python -O`` and the traceback is not chained.
            if len(colnames) != len(expected_names):
                raise AssertionError('gen: %s\ntruth: %s' % (colnames, expected_names))

            for name in expected_names:
                if name not in colnames:
                    raise AssertionError('ERROR: \'%s\' not found in %s' % (name, ', '.join(colnames)))

            assert np.all(expected_x == x)
            assert np.all(TEST_Y == y)

            # generate and test the mRMR portion
            mrmr = MRMR(
                estimator=SVC(kernel='linear'),
                n_features_to_select=ARGS.NUM_FEATURES,
                method=ARGS.MRMR_METHOD,
                normalize=ARGS.MRMR_NORMALIZE,
                similar=ARGS.SIMILAR
            )
            mrmr.fit(x, y)
    finally:
        # Always remove the temporary fixture file.
        remove(sto_filename)

    print('ALL TESTS PASS', file=sys.stderr)
' maximum relevance feature selection') parser.add_argument('--method', type=str, default="mid") parser.add_argument('--normalize', action='store_true') parser.add_argument('--n_features', type=int, default=10) parser.add_argument('--file', type=FileType('r'), default=open(DEFAULT_FILE)) parser.add_argument('--digits', action='store_true') ns = parser.parse_args(args) if ns.digits: X, y = get_digits() data_name = 'digits' else: X, y = read_csv(ns.file) data_name = ns.file.name model = MRMR(k=ns.n_features, method=ns.method, normalize=ns.normalize) names = list("feature_{}".format(i) for i in range(X.shape[1])) print('running on {}'.format(data_name)) print('model: {}'.format(model)) t = time() model.fit(X, y) t = time() - t print("time: {:.3f} seconds".format(t)) selected_names = list(names[i] for i in np.argsort(model.ranking_)[:model.k]) print("selected features:\n{}".format(", ".join(selected_names)))
def test_discrete(ARGS):
    """Exercise alignment parsing, vectorization and MRMR fitting.

    The Stockholm fixture ``TEST_AMINO_STO`` is written to a temporary
    file, read back with ``AlignIO``, labelled, vectorized with
    ``MSAVectorizer`` once per encoder (``AminoEncoder`` and
    ``StanfelEncoder``), compared against the expected fixtures, and then
    fed to ``MRMR.fit``.

    Args:
        ARGS: Options object. This function overwrites ``NUM_FEATURES``,
            ``MRMR_METHOD``, ``MAX_CONSERVATION``, ``MAX_GAP_RATIO``,
            ``MIN_CONSERVATION``, ``CUTOFF`` and ``ENCODER`` on it, and
            reads ``REFSEQ_IDS``, ``MRMR_NORMALIZE`` and ``SIMILAR``.

    Raises:
        AssertionError: If the generated feature names, feature matrix or
            labels differ from the expected fixtures.
    """
    # Set these so nothing gets excluded; only file generation and parsing
    # are under test.
    ARGS.NUM_FEATURES = 15  # should be enough, the number is known to be 13
    ARGS.MRMR_METHOD = 'MID'
    ARGS.MAX_CONSERVATION = 1.0
    ARGS.MAX_GAP_RATIO = 1.0
    ARGS.MIN_CONSERVATION = 1.0
    ARGS.CUTOFF = 20.

    # Must run before anything else (original note: "if we don't do this,
    # DOOMBUNNIES").
    set_util_params(ARGS.REFSEQ_IDS)

    fd, sto_filename = mkstemp()
    close(fd)

    try:
        # A context manager guarantees the handle is closed even when the
        # write fails.
        with open(sto_filename, 'w') as fh:
            print(TEST_AMINO_STO, file=fh)

        alignment = AlignIO.read(sto_filename, 'stockholm')

        # ARGS.ENCODER is deliberately the loop target so that the options
        # object always reflects the encoder being tested.
        for ARGS.ENCODER in (AminoEncoder, StanfelEncoder):
            if ARGS.ENCODER == StanfelEncoder:
                expected_names = TEST_STANFEL_NAMES
                expected_x = TEST_STANFEL_X
            else:
                expected_names = TEST_AMINO_NAMES
                expected_x = TEST_AMINO_X

            # test mRMR and LSVM file generation
            ylabeler = Labeler(
                seqrecord_get_values,
                lambda row: is_refseq(row) or False,  # TODO: again filtration function
                lambda x: x > ARGS.CUTOFF,
                False
            )
            # NOTE(review): ``alignment`` is rebound here, so the second
            # encoder pass labels the object produced by the first pass --
            # confirm this is intended.
            alignment, y, ic50 = ylabeler(alignment)

            refidx = reference_index(alignment, is_refseq)
            alignment = LabeledMSA.from_msa_with_ref(alignment, refidx)
            extractor = MSAVectorizer(ARGS.ENCODER)
            x = extractor.fit_transform(alignment)
            colnames = extractor.get_feature_names()

            # Feature names: raise explicitly so the checks survive
            # ``python -O`` and the traceback is not chained.
            if len(colnames) != len(expected_names):
                raise AssertionError('gen: %s\ntruth: %s' % (colnames, expected_names))

            for name in expected_names:
                if name not in colnames:
                    raise AssertionError('ERROR: \'%s\' not found in %s' % (name, ', '.join(colnames)))

            assert np.all(expected_x == x)
            assert np.all(TEST_Y == y)

            # generate and test the mRMR portion
            mrmr = MRMR(
                estimator=SVC(kernel='linear'),
                n_features_to_select=ARGS.NUM_FEATURES,
                method=ARGS.MRMR_METHOD,
                normalize=ARGS.MRMR_NORMALIZE,
                similar=ARGS.SIMILAR
            )
            mrmr.fit(x, y)
    finally:
        # Always remove the temporary fixture file.
        remove(sto_filename)

    print('ALL TESTS PASS', file=sys.stderr)