def parse_locus(locus, fh):
    """Read the transfrags of one indexed locus from an open file.

    Args:
        locus: index record. This function reads its ``name``, ``chrom``,
            ``start``, ``end``, ``filepos`` and ``num_lines`` attributes.
        fh: seekable file handle that yields one line per ``next()`` call.
            It is repositioned to ``locus.filepos`` before reading.

    Returns:
        A list holding the result of ``Transfrag.from_bed`` for each of the
        ``locus.num_lines`` lines read, in file order.

    Raises:
        StopIteration: if the file ends before ``locus.num_lines`` lines
            have been read.
    """
    genome_id_str = '%s:%d-%d' % (locus.chrom, locus.start, locus.end)
    # Deferred %-arguments: the message is only built when DEBUG is enabled.
    logging.debug('LocusIndex: %s coords: %s transfrags: %d',
                  locus.name, genome_id_str, locus.num_lines)
    # fast-forward to 'filepos'
    fh.seek(locus.filepos)
    # Parse exactly 'num_lines' lines into Transfrag objects.  The builtin
    # next() and range() are used instead of fh.next() / xrange() so the
    # function runs unchanged on both Python 2 and Python 3.
    return [Transfrag.from_bed(next(fh)) for _ in range(locus.num_lines)]
def main():
    """Tabulate splice-site motifs for the introns described in a BED file.

    Command-line arguments:
        genome_fasta_file: path of the genome FASTA file.
        bed_file: path of the BED file of transfrags.

    Every distinct (chrom, start, end, strand) intron is collected from the
    BED file; records whose chromosome is not in the FASTA file are skipped.
    For each intron the first two and last two bases are fetched and joined
    into a 4-base motif, which is reverse-complemented for ``Strand.NEG``
    introns.  A tab-separated table (``motif``, ``count``, ``frac``) is
    printed to stdout, most frequent motif first.

    Exits through ``parser.error`` when either input path does not exist.
    """
    logging.basicConfig(level=logging.DEBUG)
    parser = argparse.ArgumentParser()
    parser.add_argument('genome_fasta_file')
    parser.add_argument('bed_file')
    args = parser.parse_args()
    # check args
    if not os.path.exists(args.genome_fasta_file):
        parser.error('genome fasta file %s not found' % args.genome_fasta_file)
    if not os.path.exists(args.bed_file):
        parser.error('bed file %s not found' % args.bed_file)
    logging.info('genome fasta file: %s', args.genome_fasta_file)
    logging.info('bed file: %s', args.bed_file)

    splice_juncs = set()
    motif_counter = Counter()
    fasta_fh = FastaFile(args.genome_fasta_file)
    # try/finally guarantees the FASTA handle is released even when parsing
    # the BED file or fetching a sequence raises.
    try:
        # process bed file to get junctions
        logging.info('Reading Junctions')
        with open(args.bed_file) as bed_fh:
            for line in bed_fh:
                t = Transfrag.from_bed(line)
                if t.chrom not in fasta_fh:
                    continue
                # A set de-duplicates introns shared by several transfrags.
                for start, end in t.iterintrons():
                    splice_juncs.add((t.chrom, start, end, t.strand))
        logging.info('Read %d Junctions', len(splice_juncs))

        logging.info('Profiling Splice Motifs')
        for chrom, start, end, strand in splice_juncs:
            # First two intron bases followed by the last two.
            # NOTE(review): sequence case is used as fetched; if the FASTA
            # is soft-masked, lower- and upper-case motifs may be counted
            # separately -- confirm against FastaFile.fetch.
            s = fasta_fh.fetch(chrom, start, start + 2)
            s += fasta_fh.fetch(chrom, end - 2, end)
            if strand == Strand.NEG:
                s = dna_reverse_complement(s)
            motif_counter[s] += 1
    finally:
        fasta_fh.close()

    # report statistics
    total = sum(motif_counter.values())
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print('\t'.join(['motif', 'count', 'frac']))
    # The loop body never runs when no motifs were counted, so the division
    # below cannot see total == 0.
    for motif, count in motif_counter.most_common():
        print('\t'.join([motif, str(count), str(float(count) / total)]))
    logging.info('Done')