def score_sequence_set(sequence_set, pssms, p_binding_site):
    """Score a set of sequences against models built from parsed PSSMs.

    One model is built per entry of ``pssms`` via
    ``build_hmm_from_semi_parsed`` using the given ``p_binding_site``; the
    resulting list is then handed, together with ``sequence_set``, to
    ``calculate_model_counts_on_sequences``.

    Args:
        sequence_set: The sequences to score; passed through unchanged.
        pssms: Iterable of semi-parsed PSSMs, one model is built for each.
        p_binding_site: Forwarded as the ``p_binding_site`` keyword when
            building every model.

    Returns:
        Whatever ``calculate_model_counts_on_sequences`` returns for the
        built models and ``sequence_set``.
    """
    built_models = []
    for semi_parsed in pssms:
        built_models.append(
            build_hmm_from_semi_parsed(
                semi_parsed,
                p_binding_site=p_binding_site,
            )
        )
    return calculate_model_counts_on_sequences(built_models, sequence_set)
def get_roc_for_sequences(p_binding_site, positive_sequences, negative_sequences, pssms):
    """Build a ROC calculator from positive and negative sequence sets.

    A model is built for every entry of ``pssms`` (with the given
    ``p_binding_site``), ROC data is generated for those models over the
    positive and negative sequences, and a fresh ``RocCalculator`` is
    updated with that data.

    Args:
        p_binding_site: Forwarded as the ``p_binding_site`` keyword when
            building every model.
        positive_sequences: Passed to ``generate_roc_data`` as the positive
            examples.
        negative_sequences: Passed to ``generate_roc_data`` as the negative
            examples.
        pssms: Iterable of semi-parsed PSSMs, one model is built for each.

    Returns:
        The ``RocCalculator`` instance after ``update_roc`` has been applied.
    """
    built_models = []
    for semi_parsed in pssms:
        built_models.append(
            build_hmm_from_semi_parsed(
                semi_parsed,
                p_binding_site=p_binding_site,
            )
        )

    # The calculator is created before the ROC data is generated, matching
    # the original evaluation order.
    roc = RocCalculator()
    roc_data = generate_roc_data(
        built_models,
        positive_sequences,
        negative_sequences,
    )
    update_roc(roc, roc_data)
    return roc
def score_sequence_set(sequence_set, pssms, p_binding_site):
    """Return the model counts for ``sequence_set`` under the given PSSMs.

    Each semi-parsed PSSM in ``pssms`` is turned into a model with
    ``build_hmm_from_semi_parsed`` (using ``p_binding_site``), and the full
    list of models is evaluated on ``sequence_set`` by
    ``calculate_model_counts_on_sequences``.

    Args:
        sequence_set: The sequences to evaluate; passed through unchanged.
        pssms: Iterable of semi-parsed PSSMs.
        p_binding_site: Keyword argument forwarded to every model build.

    Returns:
        The result of ``calculate_model_counts_on_sequences``.
    """
    def _build(semi_parsed):
        # Single place where the binding-site probability is applied.
        return build_hmm_from_semi_parsed(
            semi_parsed,
            p_binding_site=p_binding_site,
        )

    model_list = [_build(entry) for entry in pssms]
    return calculate_model_counts_on_sequences(model_list, sequence_set)
for fragment, pssm in pssms(): sequence_file = os.path.join(sequence_dir, sequence_filename_fmt % fragment) model_file = os.path.join(model_dir, '%s-%s.pssm' % (fragment, pssm)) logging.info('Loading sequences: %s', sequence_file) sequences = list(sequences_from_fasta(sequence_file)) numpy_seqs = map(seq_to_numpy, sequences) logging.info('Loaded %d sequences', len(sequences)) logging.info('Parsing PSSMs: %s', model_file) pssms = list(parse_models(open(model_file))) logging.info('Building models') models = [ build_hmm_from_semi_parsed(parsed, p_binding_site=p_binding_site) for parsed in pssms ] def nucleotide_dist(): return numpy.zeros(4) + .25 base_dists = DictOf(nucleotide_dist) min_site_length = 20 logging.info('Analysing sequences') for hmm, traits in models: sites = [] for sequence in numpy_seqs: # analyse the sequence for its most likely state sequence
sequence_file = os.path.join(sequence_dir, sequence_filename_fmt % fragment) model_file = os.path.join(model_dir, '%s-%s.pssm' % (fragment, pssm)) logging.info('Loading sequences: %s', sequence_file) sequences = list(sequences_from_fasta(sequence_file)) numpy_seqs = map(seq_to_numpy, sequences) logging.info('Loaded %d sequences', len(sequences)) logging.info('Parsing PSSMs: %s', model_file) pssms = list(parse_models(open(model_file))) logging.info('Building models') models = [ build_hmm_from_semi_parsed(parsed, p_binding_site=p_binding_site) for parsed in pssms ] def nucleotide_dist(): return numpy.zeros(4) + .25 base_dists = DictOf(nucleotide_dist) min_site_length = 20 logging.info('Analysing sequences') for hmm, traits in models: sites = [] for sequence in numpy_seqs: # analyse the sequence for its most likely state sequence LL, states = hmm.viterbi(sequence)
# Find the best binding site in every sequence for every model and record the
# results both in the log and in 'sites.txt'.
#
# NOTE: this file is Python 2 (iteritems, print >>); those constructs are kept
# deliberately so the script keeps running on the interpreter it targets.

# Read the FASTA file into a dict; judging by the loop variable names below the
# keys are sequence descriptions -- TODO confirm against sequences_from_fasta.
logging.info('Loading sequences: %s', options.sequences_file)
sequences = dict(sequences_from_fasta(options.sequences_file))
numpy_seqs = dict((desc, seq_to_numpy(seq)) for desc, seq in sequences.iteritems())
logging.info('Loaded %d sequences', len(sequences))

# Parse the models. The handle is closed as soon as parsing finishes; list()
# forces parse_models to be fully consumed while the file is still open.
logging.info('Parsing PSSMs: %s', options.models_file)
with open(options.models_file) as models_handle:
    pssms = list(parse_models(models_handle))

logging.info('Building models')
models = [
    build_hmm_from_semi_parsed(parsed, p_binding_site=options.p_binding_site)
    for parsed in pssms
]

logging.info('Analysing sequences')
p_binding_sites = list()

# 'with' guarantees sites.txt is flushed and closed even if analysing one of
# the sequences raises part-way through the run.
with open('sites.txt', 'w') as sites_file:
    for hmm, traits in models:
        for desc, sequence in numpy_seqs.iteritems():
            p_binding_site, site_seq = analyse_sequence_for_best_site(sequence, hmm, traits)
            p_binding_sites.append(p_binding_site)
            site = numpy_to_seq(site_seq)
            logging.info('%s: p(binding site)=%12g, sequence=%s', desc, p_binding_site, site)
            print >> sites_file, '%s, %12g, %s' % (desc, p_binding_site, site)