示例#1
0
def by_mrmr(X, y, n_features_to_select=None, only_get_index=True):
    """Select features with mRMR and return either their indices or the reduced data.

    Fits ``sklmrmr.MRMR`` on ``(X, y)``.

    Args:
        X: Feature matrix, passed straight to ``MRMR.fit``.
        y: Target values, passed straight to ``MRMR.fit``.
        n_features_to_select: Forwarded unchanged to the ``MRMR`` constructor.
        only_get_index: When truthy, return the selected feature indices;
            otherwise return the transformed data.

    Returns:
        The result of ``get_support(indices=True)`` on the fitted selector when
        ``only_get_index`` is truthy, else the result of ``transform(X, y)``.
    """
    # Imported lazily so the dependency is only required when this is called.
    from sklmrmr import MRMR

    mrmr = MRMR(n_features_to_select=n_features_to_select)
    mrmr.fit(X, y)

    if only_get_index:
        # The original referenced an undefined name (`ret`) here, which raised
        # NameError on every call; the fitted selector is `mrmr`.
        return mrmr.get_support(indices=True)
    return mrmr.transform(X, y)
示例#2
0
 def test_mrmr(self):
     """Check that MRMR with k=1 picks the single informative column.

     Builds a 10x10 all-zero matrix whose column 0 is set to 1 for the last
     five rows, with a label vector set the same way, then asserts that the
     first selected index is 0 and that exactly one feature is reported.
     """
     n_samples, n_features = 10, 10
     split = 5

     # Column 0 mirrors the labels; every other column stays constant at 0.
     data = np.zeros((n_samples, n_features))
     data[split:, 0] = 1
     labels = np.zeros(n_samples)
     labels[split:] = 1

     selector = MRMR(k=1)
     selector.fit(data, labels)

     assert selector.selected_[0] == 0
     assert selector.n_features_ == 1
示例#3
0
def test_discrete(ARGS):
    """Check discrete-encoder feature extraction against fixtures, then fit mRMR.

    Writes ``TEST_AMINO_STO`` to a temporary file, reads it back as a
    Stockholm alignment, and for each of ``AminoEncoder`` and
    ``StanfelEncoder`` labels the alignment, vectorizes it with
    ``SiteVectorizer``, compares the feature names, feature matrix and labels
    with the module-level fixtures, and finally fits an ``MRMR`` selector.

    Args:
        ARGS: Options object. Several attributes are overwritten here
            (``NUM_FEATURES``, ``MRMR_METHOD``, ``MAX_CONSERVATION``,
            ``MAX_GAP_RATIO``, ``MIN_CONSERVATION``, ``CUTOFF``, ``ENCODER``);
            ``REFSEQ_IDS``, ``MRMR_NORMALIZE`` and ``SIMILAR`` are read.

    Raises:
        AssertionError: If generated names, data or labels differ from the
            expected fixtures.
    """
    # set these so we don't exclude anything (just testing file generation and parsing)
    ARGS.NUM_FEATURES = 15  # should be enough, the number is known to be 13
    ARGS.MRMR_METHOD = 'MID'
    ARGS.MAX_CONSERVATION = 1.0
    ARGS.MAX_GAP_RATIO = 1.0
    ARGS.MIN_CONSERVATION = 1.0
    ARGS.CUTOFF = 20.

    # if we don't do this, DOOMBUNNIES
    set_util_params(ARGS.REFSEQ_IDS)

    # mkstemp hands back an open descriptor; close it and reopen by name below.
    fd, sto_filename = mkstemp()
    close(fd)

    try:
        # Context manager guarantees the handle is closed even if the write fails.
        with open(sto_filename, 'w') as fh:
            print(TEST_AMINO_STO, file=fh)

        alignment = AlignIO.read(sto_filename, 'stockholm')

        # The loop target is an attribute on purpose: ARGS.ENCODER is updated
        # on every pass.
        for ARGS.ENCODER in (AminoEncoder, StanfelEncoder):

            if ARGS.ENCODER == StanfelEncoder:
                TEST_NAMES = TEST_STANFEL_NAMES
                TEST_X = TEST_STANFEL_X
            else:
                TEST_NAMES = TEST_AMINO_NAMES
                TEST_X = TEST_AMINO_X

            # test mRMR and LSVM file generation
            ylabeler = Labeler(
                seqrecord_get_values,
                lambda row: is_refseq(row) or False,  # TODO: again filtration function
                lambda x: x > ARGS.CUTOFF,
                False
            )
            # NOTE(review): `alignment` is rebound here and again below, so the
            # second encoder pass starts from the LabeledMSA produced by the
            # first pass rather than the freshly parsed alignment - confirm
            # this is intended.
            alignment, y, ic50 = ylabeler(alignment)

            refidx = reference_index(alignment, is_refseq)
            alignment = LabeledMSA.from_msa_with_ref(alignment, refidx)
            extractor = SiteVectorizer(ARGS.ENCODER)
            x = extractor.fit_transform(alignment)
            colnames = extractor.get_feature_names()

            # test the feature names portion
            assert len(colnames) == len(TEST_NAMES), (
                'gen:   %s\ntruth: %s' % (colnames, TEST_NAMES)
            )

            for name in TEST_NAMES:
                assert name in colnames, (
                    'ERROR: \'%s\' not found in %s' % (name, ', '.join(colnames))
                )

            assert np.all(TEST_X == x)

            assert np.all(TEST_Y == y)

            # generate and test the mRMR portion
            mrmr = MRMR(
                estimator=SVC(kernel='linear'),
                n_features_to_select=ARGS.NUM_FEATURES,
                method=ARGS.MRMR_METHOD,
                normalize=ARGS.MRMR_NORMALIZE,
                similar=ARGS.SIMILAR
            )

            mrmr.fit(x, y)

    finally:
        # Always clean up the temporary alignment file.
        remove(sto_filename)

    print('ALL TESTS PASS', file=sys.stderr)
示例#4
0
文件: demo.py 项目: kemaleren/sklmrmr
                            ' maximum relevance feature selection')
    parser.add_argument('--method', type=str, default="mid")
    parser.add_argument('--normalize', action='store_true')
    parser.add_argument('--n_features', type=int, default=10)
    parser.add_argument('--file', type=FileType('r'),
                        default=open(DEFAULT_FILE))
    parser.add_argument('--digits', action='store_true')
    ns = parser.parse_args(args)

    if ns.digits:
        X, y = get_digits()
        data_name = 'digits'
    else:
        X, y = read_csv(ns.file)
        data_name = ns.file.name

    model = MRMR(k=ns.n_features, method=ns.method, normalize=ns.normalize)
    names = list("feature_{}".format(i) for i in range(X.shape[1]))

    print('running on {}'.format(data_name))
    print('model: {}'.format(model))

    t = time()
    model.fit(X, y)
    t = time() - t
    print("time: {:.3f} seconds".format(t))

    selected_names = list(names[i]
                          for i in np.argsort(model.ranking_)[:model.k])
    print("selected features:\n{}".format(", ".join(selected_names)))
示例#5
0
def test_discrete(ARGS):
    """Check discrete-encoder feature extraction against fixtures, then fit mRMR.

    Writes ``TEST_AMINO_STO`` to a temporary file, reads it back as a
    Stockholm alignment, and for each of ``AminoEncoder`` and
    ``StanfelEncoder`` labels the alignment, vectorizes it with
    ``MSAVectorizer``, compares the feature names, feature matrix and labels
    with the module-level fixtures, and finally fits an ``MRMR`` selector.

    Args:
        ARGS: Options object. Several attributes are overwritten here
            (``NUM_FEATURES``, ``MRMR_METHOD``, ``MAX_CONSERVATION``,
            ``MAX_GAP_RATIO``, ``MIN_CONSERVATION``, ``CUTOFF``, ``ENCODER``);
            ``REFSEQ_IDS``, ``MRMR_NORMALIZE`` and ``SIMILAR`` are read.

    Raises:
        AssertionError: If generated names, data or labels differ from the
            expected fixtures.
    """
    # set these so we don't exclude anything (just testing file generation and parsing)
    ARGS.NUM_FEATURES = 15  # should be enough, the number is known to be 13
    ARGS.MRMR_METHOD = 'MID'
    ARGS.MAX_CONSERVATION = 1.0
    ARGS.MAX_GAP_RATIO = 1.0
    ARGS.MIN_CONSERVATION = 1.0
    ARGS.CUTOFF = 20.

    # if we don't do this, DOOMBUNNIES
    set_util_params(ARGS.REFSEQ_IDS)

    # mkstemp hands back an open descriptor; close it and reopen by name below.
    fd, sto_filename = mkstemp()
    close(fd)

    try:
        # Context manager guarantees the handle is closed even if the write fails.
        with open(sto_filename, 'w') as fh:
            print(TEST_AMINO_STO, file=fh)

        alignment = AlignIO.read(sto_filename, 'stockholm')

        # The loop target is an attribute on purpose: ARGS.ENCODER is updated
        # on every pass.
        for ARGS.ENCODER in (AminoEncoder, StanfelEncoder):

            if ARGS.ENCODER == StanfelEncoder:
                TEST_NAMES = TEST_STANFEL_NAMES
                TEST_X = TEST_STANFEL_X
            else:
                TEST_NAMES = TEST_AMINO_NAMES
                TEST_X = TEST_AMINO_X

            # test mRMR and LSVM file generation
            ylabeler = Labeler(
                seqrecord_get_values,
                lambda row: is_refseq(row) or False,  # TODO: again filtration function
                lambda x: x > ARGS.CUTOFF,
                False
            )
            # NOTE(review): `alignment` is rebound here and again below, so the
            # second encoder pass starts from the LabeledMSA produced by the
            # first pass rather than the freshly parsed alignment - confirm
            # this is intended.
            alignment, y, ic50 = ylabeler(alignment)

            refidx = reference_index(alignment, is_refseq)
            alignment = LabeledMSA.from_msa_with_ref(alignment, refidx)
            extractor = MSAVectorizer(ARGS.ENCODER)
            x = extractor.fit_transform(alignment)
            colnames = extractor.get_feature_names()

            # test the feature names portion
            assert len(colnames) == len(TEST_NAMES), (
                'gen:   %s\ntruth: %s' % (colnames, TEST_NAMES)
            )

            for name in TEST_NAMES:
                assert name in colnames, (
                    'ERROR: \'%s\' not found in %s' % (name, ', '.join(colnames))
                )

            assert np.all(TEST_X == x)

            assert np.all(TEST_Y == y)

            # generate and test the mRMR portion
            mrmr = MRMR(
                estimator=SVC(kernel='linear'),
                n_features_to_select=ARGS.NUM_FEATURES,
                method=ARGS.MRMR_METHOD,
                normalize=ARGS.MRMR_NORMALIZE,
                similar=ARGS.SIMILAR
            )

            mrmr.fit(x, y)

    finally:
        # Always clean up the temporary alignment file.
        remove(sto_filename)

    print('ALL TESTS PASS', file=sys.stderr)