def main(args=None):
    """Command-line entry point: write a phylogeny built from an alignment.

    Args:
        args: Sequence of argument strings to parse. When ``None``, the
            process arguments ``sys_argv[1:]`` are used instead.

    Returns:
        ``0`` after the output has been written.

    Raises:
        RuntimeError: If ``reference_index`` raises ``IndexError`` for the
            loaded alignment (reported as a missing reference sequence).
    """
    argv = sys_argv[1:] if args is None else args

    cli = ArgumentParser(
        description='Generate a phylogeny from an alignment.')
    # Both positionals share the same converter, so register them in order.
    for positional in ('ALIGNMENT', 'OUTPUT'):
        cli.add_argument(positional, type=PathType)
    opts = cli.parse_args(argv)

    msa = load_stockholm(opts.ALIGNMENT)

    try:
        ref_pos = reference_index(msa, is_refseq)
    except IndexError:
        raise RuntimeError('No reference sequence found!')

    # Feature names are taken from a vectorizer fitted on the whole
    # alignment, before the reference record is dropped.
    vectorizer = MSAVectorizer(AminoEncoder)
    labels = vectorizer.fit(msa).get_feature_names()

    # Every record except the one at the reference position feeds the tree.
    non_reference = [rec for pos, rec in enumerate(msa) if pos != ref_pos]
    tree, alignment = Phylo()(non_reference)

    PhyloGzFile.write(opts.OUTPUT, tree, alignment, labels)
    return 0
def main(args=None):
    """Command-line entry point: write a phylogeny built from an alignment.

    Args:
        args: Sequence of argument strings to parse. When ``None``, the
            process arguments ``sys_argv[1:]`` are used instead.

    Returns:
        ``0`` after the output has been written.

    Raises:
        RuntimeError: If ``reference_index`` raises ``IndexError`` for the
            loaded alignment (reported as a missing reference sequence).
    """
    if args is None:
        argv = sys_argv[1:]
    else:
        argv = args

    cli = ArgumentParser(description='Generate a phylogeny from an alignment.')
    # Both positionals share the same converter, so register them in order.
    for positional in ('ALIGNMENT', 'OUTPUT'):
        cli.add_argument(positional, type=PathType)
    opts = cli.parse_args(argv)

    msa = load_stockholm(opts.ALIGNMENT)

    try:
        ref_pos = reference_index(msa, is_refseq)
    except IndexError:
        raise RuntimeError('No reference sequence found!')

    # Feature names are taken from a vectorizer fitted on the whole
    # alignment, before the reference record is dropped.
    fitted = SiteVectorizer(AminoEncoder).fit(msa)
    labels = fitted.get_feature_names()

    # Every record except the one at the reference position feeds the tree.
    non_reference = [rec for pos, rec in enumerate(msa) if pos != ref_pos]
    tree, alignment = Phylo()(non_reference)

    PhyloGzFile.write(opts.OUTPUT, tree, alignment, labels)
    return 0
def test_discrete(ARGS):
    """Check discrete feature extraction against the bundled test alignment.

    Writes ``TEST_AMINO_STO`` to a temporary file, reads it back as a
    Stockholm alignment and, for ``AminoEncoder`` and then ``StanfelEncoder``,
    labels the records, vectorizes the alignment with ``SiteVectorizer`` and
    compares the feature names, feature matrix and label vector with the
    matching ``TEST_*`` constants. An ``MRMR`` selector is then fitted on the
    result; only that the fit completes is checked.

    Args:
        ARGS: Mutable options object. ``REFSEQ_IDS``, ``MRMR_NORMALIZE`` and
            ``SIMILAR`` are read from it; ``NUM_FEATURES``, ``MRMR_METHOD``,
            ``MAX_CONSERVATION``, ``MAX_GAP_RATIO``, ``MIN_CONSERVATION``,
            ``CUTOFF`` and ``ENCODER`` are overwritten on it.

    Raises:
        AssertionError: If generated names, features or labels differ from
            the expected constants.
    """
    # set these so we don't exclude anything (just testing file generation and parsing)
    ARGS.NUM_FEATURES = 15  # should be enough, the number is known to be 13
    ARGS.MRMR_METHOD = 'MID'
    ARGS.MAX_CONSERVATION = 1.0
    ARGS.MAX_GAP_RATIO = 1.0
    ARGS.MIN_CONSERVATION = 1.0
    ARGS.CUTOFF = 20.

    # if we don't do this, DOOMBUNNIES
    set_util_params(ARGS.REFSEQ_IDS)

    # mkstemp hands back an open descriptor; close it and reopen by name.
    fd, sto_filename = mkstemp()
    close(fd)

    try:
        # The context manager closes the handle even if the write fails.
        with open(sto_filename, 'w') as fh:
            print(TEST_AMINO_STO, file=fh)

        alignment = AlignIO.read(sto_filename, 'stockholm')

        for encoder in (AminoEncoder, StanfelEncoder):
            # The encoder under test is also published on ARGS.
            ARGS.ENCODER = encoder

            if ARGS.ENCODER == StanfelEncoder:
                TEST_NAMES = TEST_STANFEL_NAMES
                TEST_X = TEST_STANFEL_X
            else:
                TEST_NAMES = TEST_AMINO_NAMES
                TEST_X = TEST_AMINO_X

            # test mRMR and LSVM file generation
            ylabeler = Labeler(
                seqrecord_get_values,
                lambda row: is_refseq(row) or False,  # TODO: again filtration function
                lambda x: x > ARGS.CUTOFF,
                False
            )
            # NOTE(review): `alignment` is rebound here and below, so the
            # second encoder pass starts from the first pass's output rather
            # than the freshly read file -- confirm this is intended.
            alignment, y, _ = ylabeler(alignment)

            refidx = reference_index(alignment, is_refseq)
            alignment = LabeledMSA.from_msa_with_ref(alignment, refidx)
            extractor = SiteVectorizer(ARGS.ENCODER)
            x = extractor.fit_transform(alignment)
            colnames = extractor.get_feature_names()

            # test the feature names portion
            assert len(colnames) == len(TEST_NAMES), (
                'gen: %s\ntruth: %s' % (colnames, TEST_NAMES))

            for name in TEST_NAMES:
                assert name in colnames, (
                    'ERROR: \'%s\' not found in %s' % (name, ', '.join(colnames)))

            assert np.all(TEST_X == x)
            assert np.all(TEST_Y == y)

            # generate and test the mRMR portion
            mrmr = MRMR(
                estimator=SVC(kernel='linear'),
                n_features_to_select=ARGS.NUM_FEATURES,
                method=ARGS.MRMR_METHOD,
                normalize=ARGS.MRMR_NORMALIZE,
                similar=ARGS.SIMILAR
            )
            mrmr.fit(x, y)

    finally:
        # Always remove the temporary alignment file.
        remove(sto_filename)

    print('ALL TESTS PASS', file=sys.stderr)
def test_discrete(ARGS):
    """Check discrete feature extraction against the bundled test alignment.

    Writes ``TEST_AMINO_STO`` to a temporary file, reads it back as a
    Stockholm alignment and, for ``AminoEncoder`` and then ``StanfelEncoder``,
    labels the records, vectorizes the alignment with ``MSAVectorizer`` and
    compares the feature names, feature matrix and label vector with the
    matching ``TEST_*`` constants. An ``MRMR`` selector is then fitted on the
    result; only that the fit completes is checked.

    Args:
        ARGS: Mutable options object. ``REFSEQ_IDS``, ``MRMR_NORMALIZE`` and
            ``SIMILAR`` are read from it; ``NUM_FEATURES``, ``MRMR_METHOD``,
            ``MAX_CONSERVATION``, ``MAX_GAP_RATIO``, ``MIN_CONSERVATION``,
            ``CUTOFF`` and ``ENCODER`` are overwritten on it.

    Raises:
        AssertionError: If generated names, features or labels differ from
            the expected constants.
    """
    # set these so we don't exclude anything (just testing file generation and parsing)
    ARGS.NUM_FEATURES = 15  # should be enough, the number is known to be 13
    ARGS.MRMR_METHOD = 'MID'
    ARGS.MAX_CONSERVATION = 1.0
    ARGS.MAX_GAP_RATIO = 1.0
    ARGS.MIN_CONSERVATION = 1.0
    ARGS.CUTOFF = 20.

    # if we don't do this, DOOMBUNNIES
    set_util_params(ARGS.REFSEQ_IDS)

    # mkstemp hands back an open descriptor; close it and reopen by name.
    fd, sto_filename = mkstemp()
    close(fd)

    try:
        # The context manager closes the handle even if the write fails.
        with open(sto_filename, 'w') as fh:
            print(TEST_AMINO_STO, file=fh)

        alignment = AlignIO.read(sto_filename, 'stockholm')

        for encoder in (AminoEncoder, StanfelEncoder):
            # The encoder under test is also published on ARGS.
            ARGS.ENCODER = encoder

            if ARGS.ENCODER == StanfelEncoder:
                TEST_NAMES = TEST_STANFEL_NAMES
                TEST_X = TEST_STANFEL_X
            else:
                TEST_NAMES = TEST_AMINO_NAMES
                TEST_X = TEST_AMINO_X

            # test mRMR and LSVM file generation
            ylabeler = Labeler(
                seqrecord_get_values,
                lambda row: is_refseq(row) or False,  # TODO: again filtration function
                lambda x: x > ARGS.CUTOFF,
                False
            )
            # NOTE(review): `alignment` is rebound here and below, so the
            # second encoder pass starts from the first pass's output rather
            # than the freshly read file -- confirm this is intended.
            alignment, y, _ = ylabeler(alignment)

            refidx = reference_index(alignment, is_refseq)
            alignment = LabeledMSA.from_msa_with_ref(alignment, refidx)
            extractor = MSAVectorizer(ARGS.ENCODER)
            x = extractor.fit_transform(alignment)
            colnames = extractor.get_feature_names()

            # test the feature names portion
            assert len(colnames) == len(TEST_NAMES), (
                'gen: %s\ntruth: %s' % (colnames, TEST_NAMES))

            for name in TEST_NAMES:
                assert name in colnames, (
                    'ERROR: \'%s\' not found in %s' % (name, ', '.join(colnames)))

            assert np.all(TEST_X == x)
            assert np.all(TEST_Y == y)

            # generate and test the mRMR portion
            mrmr = MRMR(
                estimator=SVC(kernel='linear'),
                n_features_to_select=ARGS.NUM_FEATURES,
                method=ARGS.MRMR_METHOD,
                normalize=ARGS.MRMR_NORMALIZE,
                similar=ARGS.SIMILAR
            )
            mrmr.fit(x, y)

    finally:
        # Always remove the temporary alignment file.
        remove(sto_filename)

    print('ALL TESTS PASS', file=sys.stderr)