import bowtie_parser

def build_ivals(fp, genome_db, reads_db):
    # for each bowtie alignment in 'fp', pull out the matching slice of
    # the reference (via get_src_sequence) and the read sequence itself
    # (via get_read_sequence), and yield them as a pair.
    for row in bowtie_parser.read(fp):
        src_seq = get_src_sequence(genome_db, row)
        read_seq = get_read_sequence(reads_db, row)
        yield src_seq, read_seq
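How the (src_seq, read_seq) pairs get consumed is not shown here; the sketch below is one hypothetical use, assuming that get_src_sequence() returns exactly the reference slice the read aligned to, so the two sequences line up column by column (bowtie alignments are ungapped). None of the names other than build_ivals() come from the original.

def count_mismatch_columns(ival_gen):
    # tally, over all alignments, how many columns disagree between
    # the read and the reference slice it mapped to.
    mismatches = 0
    columns = 0
    for src_seq, read_seq in ival_gen:
        for a, b in zip(src_seq.upper(), read_seq.upper()):
            columns += 1
            if a != b:
                mismatches += 1
    return mismatches, columns

# hypothetical usage, given a bowtie output file and the two databases:
#   fp = open('bowtie.map')
#   n_mm, n_cols = count_mismatch_columns(build_ivals(fp, genome_db, reads_db))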
import bowtie_parser
import sys

files = sys.argv[2:]
dict1 = {}
fp = open(sys.argv[1], 'w')

for file in files:
    dict = {}
    for n, line in enumerate(bowtie_parser.read(open(file))):
        contig_id = line.seqid

        # contig length is taken from the 4th '_'-separated field of the
        # contig name, adjusted by 33 - 1.
        length = int(contig_id.split('_')[3]) + 33 - 1
        start = int(line.start)
        read = line.read

        # first time we see this contig, set up a per-base coverage vector
        if contig_id not in dict:
            count_mapped = [0] * length
            dict[contig_id] = [length, count_mapped]

        # mark every base covered by this read as mapped
        for index in range(start, start + len(read)):
            dict[contig_id][1][index] = 1

    # collapse each coverage vector into (length, mapped bases, % mapped)
    for key in dict.keys():
        mapped_bases = dict[key][1].count(1)
        dict[key][1] = mapped_bases
        mapped_percent = dict[key][1] / float(dict[key][0])
        dict[key].append(mapped_percent)

    group_index = file.find('group')
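Stripped of the file handling, the bookkeeping above is just "mark every base touched by a read, then divide by the contig length". The self-contained sketch below reproduces that core calculation with invented numbers (a 100 bp contig and three 36 bp reads), so the expected result can be checked by hand.

# illustration only: one 100 bp contig, three 36 bp reads at invented positions
length = 100
count_mapped = [0] * length

for start in (0, 20, 40):
    for index in range(start, start + 36):
        count_mapped[index] = 1

mapped_bases = count_mapped.count(1)
mapped_percent = mapped_bases / float(length)
# the three reads cover bases 0..75, so mapped_bases == 76
# and mapped_percent == 0.76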
import sys
import optparse
import bowtie_parser

###

parser = optparse.OptionParser()
parser.add_option('-M', '--max-reads', dest='max_reads', default=0, type=int,
                  help='only use first M reads, then exit')

(options, args) = parser.parse_args()

bowtie_mapping_file, = args
bowtie_fp = open(bowtie_mapping_file)

###

# iterate over the bowtie mapping file (output from bowtie)
for n, line in enumerate(bowtie_parser.read(bowtie_fp)):
    # print out status/progress.
    if n % 10000 == 0:
        print>>sys.stderr, 'scanning reads', n
    if options.max_reads and n > options.max_reads:
        print>>sys.stderr, 'EXITING EARLY; -M specified as %d' % \
              options.max_reads
        break

    # retrieve mismatches from the bowtie mapping
    mismatches = line.mismatches.strip()

    # record the mismatch positions
    if mismatches:
        mismatches = [ x for x in mismatches.split(',') if 'N' not in x ]
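Bowtie's default output reports mismatches as a comma-separated list of descriptors of the form offset:reference-base>read-base, so the filter above simply discards any descriptor that involves an N call. As a small illustration (the descriptor string itself is invented), the surviving offsets can be pulled out like this:

# sketch: extracting integer offsets from bowtie-style mismatch descriptors;
# the input string is made up for the example.
mismatches = '4:A>G,17:C>N,22:T>C'

descriptors = [ x for x in mismatches.split(',') if 'N' not in x ]
positions = [ int(x.split(':')[0]) for x in descriptors ]
# positions == [4, 22]; the descriptor containing an N has been dropped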