def generate_alignment(seqrecords, sto_filename, ref_id_func, opts, load=True):
    """Write a Stockholm alignment of *seqrecords* to *sto_filename*.

    When ``opts.SIM`` exists and equals ``Simulation.DUMB`` the records are
    written out directly as Stockholm (they are assumed to be pre-aligned).
    Otherwise an HMM is generated with ``generate_hmm_``, the records are
    aligned against it with ``generate_alignment_``, and the result is copied
    to *sto_filename*; the temporary HMM and alignment files are removed
    afterwards.

    Args:
        seqrecords: sequence records to align / write.
        sto_filename: path of the Stockholm file to produce.
        ref_id_func: callable passed to ``reference_index`` to locate the
            reference sequence in the loaded alignment.
        opts: options object; ``opts.REFSEQ`` is read in the HMM branch and
            ``opts.SIM`` is consulted if present.
        load: when true, read the alignment back and return it trimmed.

    Returns:
        A ``(msa, hmm)`` tuple.  ``msa`` is the labeled alignment trimmed to
        the ranges reported by ``stockholm_rf_ranges`` when *load* is true,
        otherwise ``None``.  ``hmm`` is the raw bytes of the generated HMM
        file, or ``None`` when no HMM was built (the ``DUMB`` branch).
    """
    from ..simulation import Simulation

    log = getLogger(IDEPI_LOGGER)
    hmm = None

    if hasattr(opts, 'SIM') and opts.SIM == Simulation.DUMB:
        # we're assuming pre-aligned because they're all generated from the same refseq
        with open(sto_filename, 'w') as fh:
            SeqIO.write(seqrecords, fh, 'stockholm')
    else:
        # Bind both names up front: if either generator raises, the cleanup
        # below must not fail with an UnboundLocalError that would hide the
        # original exception.
        tmphmm = None
        tmpaln = None
        try:
            tmphmm = generate_hmm_(opts)
            tmpaln = generate_alignment_(seqrecords, tmphmm, opts, refseq=opts.REFSEQ)
            copyfile(tmpaln, sto_filename)
            log.debug('finished alignment, output moved to {0:s}'.format(sto_filename))
            with open(tmphmm, 'rb') as hmm_fh:
                hmm = hmm_fh.read()
        finally:
            if tmphmm is not None and exists(tmphmm):
                remove(tmphmm)
            if tmpaln is not None and exists(tmpaln):
                remove(tmpaln)

    if load:
        with open(sto_filename) as fh:
            msa = AlignIO.read(fh, 'stockholm')
        refidx = reference_index(msa, ref_id_func)
        msa = LabeledMSA.from_msa_with_ref(msa, refidx)
        ranges = stockholm_rf_ranges(sto_filename)
        return trim_msa_to_ranges(msa, ranges), hmm

    return None, hmm
def generate_alignment(seqrecords, sto_filename, ref_id_func, opts, load=True):
    """Produce a Stockholm alignment file and optionally load it back.

    If ``opts`` has a ``SIM`` attribute equal to ``Simulation.DUMB``, the
    records are written straight to *sto_filename* in Stockholm format.
    Otherwise ``generate_hmm_`` and ``generate_alignment_`` build temporary
    HMM and alignment files; the alignment is copied to *sto_filename*, the
    HMM file contents are captured, and both temporaries are deleted.

    Args:
        seqrecords: sequence records to align / write.
        sto_filename: destination path for the Stockholm alignment.
        ref_id_func: callable handed to ``reference_index`` to find the
            reference row of the loaded alignment.
        opts: options object providing ``REFSEQ`` (and optionally ``SIM``).
        load: whether to read the written alignment and return it.

    Returns:
        ``(msa, hmm)``: ``msa`` is the result of ``trim_msa_to_ranges`` on the
        labeled alignment when *load* is true and ``None`` otherwise; ``hmm``
        is the bytes read from the temporary HMM file, or ``None`` if no HMM
        was generated.
    """
    from ..simulation import Simulation

    log = getLogger(IDEPI_LOGGER)
    hmm = None

    if hasattr(opts, 'SIM') and opts.SIM == Simulation.DUMB:
        # we're assuming pre-aligned because they're all generated from the same refseq
        with open(sto_filename, 'w') as fh:
            SeqIO.write(seqrecords, fh, 'stockholm')
    else:
        # Pre-initialise the temp paths so the ``finally`` clause is safe even
        # when HMM or alignment generation fails before assigning them;
        # otherwise an UnboundLocalError would mask the real error.
        tmphmm = None
        tmpaln = None
        try:
            tmphmm = generate_hmm_(opts)
            tmpaln = generate_alignment_(seqrecords, tmphmm, opts, refseq=opts.REFSEQ)
            copyfile(tmpaln, sto_filename)
            log.debug('finished alignment, output moved to {0:s}'.format(
                sto_filename))
            with open(tmphmm, 'rb') as hmm_fh:
                hmm = hmm_fh.read()
        finally:
            if tmphmm is not None and exists(tmphmm):
                remove(tmphmm)
            if tmpaln is not None and exists(tmpaln):
                remove(tmpaln)

    if load:
        with open(sto_filename) as fh:
            msa = AlignIO.read(fh, 'stockholm')
        refidx = reference_index(msa, ref_id_func)
        msa = LabeledMSA.from_msa_with_ref(msa, refidx)
        ranges = stockholm_rf_ranges(sto_filename)
        return trim_msa_to_ranges(msa, ranges), hmm

    return None, hmm
def test_discrete(ARGS):
    """Smoke-test feature extraction and mRMR fitting on the built-in fixture.

    Writes ``TEST_AMINO_STO`` to a temporary Stockholm file, reads it back,
    and for each of ``AminoEncoder`` and ``StanfelEncoder`` checks that the
    vectorizer's feature names, feature matrix and labels match the expected
    fixtures before fitting an ``MRMR`` selector.  The temporary file is
    always removed.

    Args:
        ARGS: options namespace; several attributes are overwritten here
            (``NUM_FEATURES``, ``MRMR_METHOD``, ``MAX_CONSERVATION``,
            ``MAX_GAP_RATIO``, ``MIN_CONSERVATION``, ``CUTOFF``, ``ENCODER``)
            and ``REFSEQ_IDS``, ``MRMR_NORMALIZE`` and ``SIMILAR`` are read.

    Raises:
        AssertionError: if any generated value disagrees with the fixtures.
    """
    # set these to this so we don't exclude anything (just testing file generation and parsing)
    ARGS.NUM_FEATURES = 15  # should be enough, the number is known to be 13
    ARGS.MRMR_METHOD = 'MID'
    ARGS.MAX_CONSERVATION = 1.0
    ARGS.MAX_GAP_RATIO = 1.0
    ARGS.MIN_CONSERVATION = 1.0
    ARGS.CUTOFF = 20.

    # if we don't do this, DOOMBUNNIES
    set_util_params(ARGS.REFSEQ_IDS)

    fd, sto_filename = mkstemp(); close(fd)

    try:
        # Context manager guarantees the handle is closed even if the write fails.
        with open(sto_filename, 'w') as fh:
            print(TEST_AMINO_STO, file=fh)

        alignment = AlignIO.read(sto_filename, 'stockholm')

        # The loop target is an attribute on purpose: ARGS.ENCODER is updated
        # for each encoder under test.
        for ARGS.ENCODER in (AminoEncoder, StanfelEncoder):

            if ARGS.ENCODER == StanfelEncoder:
                TEST_NAMES = TEST_STANFEL_NAMES
                TEST_X = TEST_STANFEL_X
            else:
                TEST_NAMES = TEST_AMINO_NAMES
                TEST_X = TEST_AMINO_X

            # test mRMR and LSVM file generation
            ylabeler = Labeler(
                seqrecord_get_values,
                lambda row: is_refseq(row) or False,  # TODO: again filtration function
                lambda x: x > ARGS.CUTOFF,
                False
            )
            # NOTE(review): `alignment` is rebound here, so the second encoder
            # pass starts from the first pass's labeled alignment -- confirm
            # this is intended.
            alignment, y, _ = ylabeler(alignment)

            refidx = reference_index(alignment, is_refseq)
            alignment = LabeledMSA.from_msa_with_ref(alignment, refidx)
            extractor = SiteVectorizer(ARGS.ENCODER)
            x = extractor.fit_transform(alignment)
            colnames = extractor.get_feature_names()

            # test the feature names portion
            assert len(colnames) == len(TEST_NAMES), \
                'gen: %s\ntruth: %s' % (colnames, TEST_NAMES)

            for name in TEST_NAMES:
                assert name in colnames, \
                    'ERROR: \'%s\' not found in %s' % (name, ', '.join(colnames))

            assert np.all(TEST_X == x)

            assert np.all(TEST_Y == y)

            # generate and test the mRMR portion
            mrmr = MRMR(
                estimator=SVC(kernel='linear'),
                n_features_to_select=ARGS.NUM_FEATURES,
                method=ARGS.MRMR_METHOD,
                normalize=ARGS.MRMR_NORMALIZE,
                similar=ARGS.SIMILAR
            )

            mrmr.fit(x, y)

    finally:
        remove(sto_filename)

    print('ALL TESTS PASS', file=sys.stderr)
def test_discrete(ARGS):
    """Check vectorization and mRMR fitting against the bundled fixtures.

    The Stockholm fixture ``TEST_AMINO_STO`` is written to a temporary file
    and parsed; then, once per encoder (``AminoEncoder``, ``StanfelEncoder``),
    the alignment is labeled, vectorized with ``MSAVectorizer`` and compared
    with the expected feature names, feature matrix and labels, after which
    an ``MRMR`` selector is fitted.  The temporary file is removed in all
    cases.

    Args:
        ARGS: options namespace.  ``NUM_FEATURES``, ``MRMR_METHOD``,
            ``MAX_CONSERVATION``, ``MAX_GAP_RATIO``, ``MIN_CONSERVATION``,
            ``CUTOFF`` and ``ENCODER`` are assigned here; ``REFSEQ_IDS``,
            ``MRMR_NORMALIZE`` and ``SIMILAR`` are read.

    Raises:
        AssertionError: when a generated result differs from the fixtures.
    """
    # set these to this so we don't exclude anything (just testing file generation and parsing)
    ARGS.NUM_FEATURES = 15  # should be enough, the number is known to be 13
    ARGS.MRMR_METHOD = 'MID'
    ARGS.MAX_CONSERVATION = 1.0
    ARGS.MAX_GAP_RATIO = 1.0
    ARGS.MIN_CONSERVATION = 1.0
    ARGS.CUTOFF = 20.

    # if we don't do this, DOOMBUNNIES
    set_util_params(ARGS.REFSEQ_IDS)

    fd, sto_filename = mkstemp(); close(fd)

    try:
        # Use a context manager so the handle cannot leak if writing raises.
        with open(sto_filename, 'w') as fh:
            print(TEST_AMINO_STO, file=fh)

        alignment = AlignIO.read(sto_filename, 'stockholm')

        # Assigning the loop variable to ARGS.ENCODER keeps the options object
        # in sync with the encoder currently being exercised.
        for ARGS.ENCODER in (AminoEncoder, StanfelEncoder):

            if ARGS.ENCODER == StanfelEncoder:
                TEST_NAMES = TEST_STANFEL_NAMES
                TEST_X = TEST_STANFEL_X
            else:
                TEST_NAMES = TEST_AMINO_NAMES
                TEST_X = TEST_AMINO_X

            # test mRMR and LSVM file generation
            ylabeler = Labeler(
                seqrecord_get_values,
                lambda row: is_refseq(row) or False,  # TODO: again filtration function
                lambda x: x > ARGS.CUTOFF,
                False
            )
            # NOTE(review): `alignment` is overwritten each pass, so the
            # second encoder is fed the already-labeled alignment from the
            # first -- verify that is the intended input.
            alignment, y, _ = ylabeler(alignment)

            refidx = reference_index(alignment, is_refseq)
            alignment = LabeledMSA.from_msa_with_ref(alignment, refidx)
            extractor = MSAVectorizer(ARGS.ENCODER)
            x = extractor.fit_transform(alignment)
            colnames = extractor.get_feature_names()

            # test the feature names portion
            assert len(colnames) == len(TEST_NAMES), \
                'gen: %s\ntruth: %s' % (colnames, TEST_NAMES)

            for name in TEST_NAMES:
                assert name in colnames, \
                    'ERROR: \'%s\' not found in %s' % (name, ', '.join(colnames))

            assert np.all(TEST_X == x)

            assert np.all(TEST_Y == y)

            # generate and test the mRMR portion
            mrmr = MRMR(
                estimator=SVC(kernel='linear'),
                n_features_to_select=ARGS.NUM_FEATURES,
                method=ARGS.MRMR_METHOD,
                normalize=ARGS.MRMR_NORMALIZE,
                similar=ARGS.SIMILAR
            )

            mrmr.fit(x, y)

    finally:
        remove(sto_filename)

    print('ALL TESTS PASS', file=sys.stderr)