Example #1
def get_sorted_aligned_reads(args, header, sequence):
    if args.reference_hash and os.path.exists(args.reference_hash):
        print("Loading index...")
        ref_index = load_hash(args.reference_hash)
    else:
        print("Computing reference index...")
        ref_index = build_hashtable(sequence, args.kmer, args.stride)
        if args.reference_hash:  # only persist the index when a path was supplied
            save_hash(*ref_index, file=args.reference_hash)
    print("Verifying hash...")
    for hash_, offset_ in islice(ref_index[0].iteritems(), 20):  # spot-check 20 entries
        if not verify_hash(sequence, offset_, args.kmer, hash_):
            raise ValueError(
                'Index failed to verify: offset {} has mismatching hashes'.
                format(offset_))
    print("Aligning reads...")
    pair_iterator = read_paired_fasta(args.reads_file)
    sam_iterator = align_pairs(sequence, ref_index, pair_iterator, 'hw2_rg')
    sam_iterator = iter(sorted(sam_iterator, cmp=compsam))
    if args.out_bam:
        outfile = Samfile(args.out_bam,
                          'wb',
                          header=SAM_HEADER(header, sequence))
        for read in sam_iterator:
            outfile.write(read)
        outfile.close()
        infile = Samfile(args.out_bam, 'rb')
        sam_iterator = infile
    return sam_iterator
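
Note: sorted(sam_iterator, cmp=compsam) is Python 2-only. Under Python 3 the same comparator can be reused via functools.cmp_to_key; a minimal sketch, assuming compsam keeps its (a, b) -> -1/0/1 contract:

# Python 3 equivalent of the cmp= sort above (compsam unchanged)
from functools import cmp_to_key

sam_iterator = iter(sorted(sam_iterator, key=cmp_to_key(compsam)))
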
Example #2
def main(args):
    m260b.debug.debug.DEBUG = args.debug
    ref_header, ref_sequence = read_basic_fasta(args.reference_file)
    if args.input_bam:
        reads = Samfile(args.input_bam)
        if args.start and args.stop:
            reads = reads.fetch(ref_header[1:].strip(), args.start, args.stop)
    else:
        reads = get_sorted_aligned_reads(args, ref_header, ref_sequence)
    chrom = ref_header[1:].strip()  # avoid shadowing the builtin chr()
    fail_reasons = Counter()
    haplo_out = None
    if args.haplotype_out:
        haplo_out = Samfile(args.haplotype_out,
                            'wb',
                            header=SAM_HEADER(ref_header, ref_sequence))
    vcf_stream = VCFWriter(open(args.out_vcf, 'wb'),
                           make_vcf_header(args)) if args.out_vcf else None
    for region, region_reads in active_regions(reads,
                                               ref_sequence,
                                               chrom,
                                               start_offset=0,
                                               flank=30,
                                               dfrac=1.0):
        #print('Calling region {}-{}'.format(region.start, region.stop))
        haplotype = build_haplotype(region.reference,
                                    region_reads,
                                    k=11,
                                    min_kmer_count=2)
        if haplotype.fail_reason:
            print('Failure {} at window\n{}'.format(haplotype.fail_reason,
                                                    region))
            continue
        # align the haplotype to the reference sequence
        offset, cigar, score, mismatch = banded_sw(region.reference,
                                                   haplotype.seq)
        haplotype_start = region.start + offset
        _info = AlignmentInfo(haplotype_start, cigar, False, mismatch)
        haplo_seq = SeqRecord(Seq(haplotype.seq, DNA),
                              id='Haplotype{}'.format(region.start))
        # bypass Biopython's restricted-dict validation when attaching qualities
        dict.__setitem__(haplo_seq._per_letter_annotations, 'phred_quality',
                         [40] * len(haplotype.seq))
        haplo_read = alignment_info_to_sam(haplo_seq, _info, 'nomate', None,
                                           'hw2_rg', False)
        if haplo_out:
            haplo_out.write(haplo_read)
        #print(haplotype)
        for variant in vcf_from_haplotype(region, haplotype, SAMPLE_NAME, chrom):
            if vcf_stream:
                vcf_stream.write_record(variant)
            print(vcf2m260(variant))
    if vcf_stream:
        vcf_stream.flush()
        vcf_stream.close()
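
SAM_HEADER is not defined in these snippets. Judging from the inline header dict in Example #6 below, a plausible reconstruction, assumed here rather than taken from the original module, would be:

# Hypothetical helper mirroring the inline header dict in Example #6;
# SAMPLE_NAME comes from module scope in the originals, and the read-group
# ID follows the 'hw2_rg' used in Examples #1 and #2.
def SAM_HEADER(header, sequence):
    return {
        'HD': {'VN': '1.0'},
        'SQ': [{'SN': header[1:].strip(), 'LN': len(sequence)}],
        'RG': [{'ID': 'hw2_rg', 'SM': SAMPLE_NAME,
                'PU': 'Unknown', 'PL': 'Unknown', 'LB': 'Unknown'}],
    }
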
Example #3
def split_samfile(sam_file, splits, prefix='', path=''):
    """Split a SAM/BAM file into `splits` smaller files.

    :sam_file: The SAM/BAM file to split.
    :splits:   The number of files to split it into.
    :path:     Where to put the split files.
    :prefix:   A prefix for the outfile names.
    :returns:  A tuple of the split file paths.
    """
    # Determine how many reads will be in each split sam file.
    num_lines = count_reads(sam_file)
    num_reads = int(num_lines) // splits + 1

    # Get rid of starting path
    sam_name = os.path.basename(sam_file)

    # Subset the SAM file into X number of jobs
    cnt      = 0
    currjob  = 1
    suffix   = '.split_sam_' + str(currjob).zfill(4)
    run_file = os.path.join(path, prefix + sam_name + suffix)
    rmode    = 'rb' if sam_name.split('.')[-1] == 'bam' else 'r'
    wmode    = 'wb'

    # Actually split the file
    outfiles = [run_file]
    with Samfile(sam_file, rmode) as in_sam:
        sam_split = Samfile(run_file, wmode, template=in_sam)
        for line in in_sam:
            cnt += 1
            if cnt < num_reads:
                sam_split.write(line)
            elif cnt == num_reads:
                # Check if the next read is this read's mate. If so, keep the
                # pair together rather than splitting it across files.
                line2     = next(in_sam)
                currjob  += 1
                suffix    = '.split_sam_' + str(currjob).zfill(4)
                run_file  = os.path.join(path, prefix + sam_name + suffix)
                new_sam   = Samfile(run_file, wmode, template=in_sam)
                outfiles.append(run_file)

                if line.qname == line2.qname:
                    sam_split.write(line)
                    sam_split.write(line2)
                    sam_split.close()
                    cnt = 0
                else:
                    sam_split.write(line)
                    sam_split.close()
                    new_sam.write(line2)
                    cnt = 1  # line2 already counts toward the new file
                sam_split = new_sam
        sam_split.close()
    return tuple(outfiles)
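
A minimal usage sketch; the file name and split count are hypothetical:

# Split one BAM into four pieces next to it, keeping mate pairs together.
job_files = split_samfile('sample.bam', 4, prefix='job_')
for job_file in job_files:
    print(job_file)
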
Example #4
def parse_barcode(bamfile):
	"""Parses a sorted and indexed BAM file, removes cases where the RNA hits
	more than one spot in the genome and writes them to a separate file, and
	creates files for mutant and wildtype reads based on their barcodes."""
	samfile = Samfile(bamfile, "rb")
	multi_hit_file = Samfile("MultiHit.bam","wb",template=samfile)
	mutant_one = Samfile("MutantOne.bam","wb",template=samfile)
	wildtype_one = Samfile("WildtypeOne.bam","wb",template=samfile)
	mutant_two = Samfile("MutantTwo.bam","wb",template=samfile)
	wildtype_two = Samfile("WildtypeTwo.bam","wb",template=samfile)
	for line in samfile.fetch():
		#if line.is_secondary:
		## does this hit to more than one spot in genome
		#	multi_hit_file.write(line)
		if "#GAGT"in line.qname: 
		## write to mutant file
			mutant_one.write(line)
		elif "#TTAG" in line.qname:
			mutant_two.write(line)
		elif "#ACCC" in line.qname:
		### write to wildtype file
			wildtype_one.write(line)
		elif "#CGTA" in line.qname:
		### write to wildtype file
			wildtype_two.write(line)

	multi_hit_file.close()
	mutant_one.close()
	wildtype_one.close()
	mutant_two.close()
	wildtype_two.close()
	samfile.close()
Example #5
def subsample(fn, ns=None):
    if ns is None:
        fn, ns = fn
    sample = []
    count = 0
    outdir_base = path.join(path.dirname(fn), 'subset')
    sf = Samfile(fn)
    try:
        # .mapped requires an indexed BAM; without one it raises ValueError
        i_weight = float(sf.mapped) / max(ns)
        print "Read out ", i_weight
    except ValueError:
        i_weight = 0.0
        for read in sf:
            i_weight += 1
        print "Counted ", i_weight
        i_weight /= float(max(ns))
        sf = Samfile(fn)  # reopen: the fallback count exhausted the iterator

    print fn, count, i_weight
    for i, read in enumerate(sf):
        key = random()**i_weight
        if len(sample) < max(ns):
            heappush(sample, (key, read, i + count))
        else:
            heappushpop(sample, (key, read, i + count))

    count += i

    for n in ns:
        if n == min(ns):
            outdir = outdir_base + '_min'
        else:
            outdir = outdir_base + '{:04.1f}M'.format(n / 1e6)
        try:
            makedirs(outdir)
        except OSError:
            pass
        sampN = sorted(sample, reverse=True)[:int(n)]
        print "Kept {: >12,} of {: >12,} reads".format(len(sampN), count)
        print fn, '->', outdir
        stdout.flush()
        of = Samfile(path.join(outdir, 'accepted_hits.bam'),
                     mode='wb',
                     template=sf)
        # sort the kept reads by coordinate before writing them out
        sampN.sort(key=lambda (key, read, pos): (read.tid, read.pos))
        for key, read, pos in sampN:
            of.write(read)
        of.close()
    sf.close()
    return [count for key, read, count in sample]
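
The heappush/heappushpop pattern above is a one-pass top-k reservoir: every read draws a random key and only the max(ns) largest keys survive, which yields a uniform sample of the reads. The same pattern in a minimal, self-contained sketch:

# Keep the k items with the largest random keys: a uniform one-pass sample.
from heapq import heappush, heappushpop
from random import random

def reservoir(iterable, k):
    heap = []
    for i, item in enumerate(iterable):
        entry = (random(), i, item)  # index breaks ties between equal keys
        if len(heap) < k:
            heappush(heap, entry)
        else:
            heappushpop(heap, entry)  # drop the current smallest key
    return [item for _, _, item in heap]
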
Example #6
def get_sorted_aligned_reads(args, header, sequence):
    if args.ref_idx and os.path.exists(args.ref_idx):
        print("Loading index...")
        ref_index = load_hash(args.ref_idx)
    else:
        print("Computing reference index...")
        ref_index = build_hashtable(sequence, args.kmer, args.stride)
    print("Verifying hash...")
    for hash_, offset_ in islice(ref_index[0].iteritems(), 20):  # spot-check 20 entries
        if not verify_hash(sequence, offset_, args.kmer, hash_):
            raise ValueError(
                'Index failed to verify: offset {} has mismatching hashes'.
                format(offset_))
    print("Aligning reads...")
    pair_iterator = read_paired_fasta(args.reads_file)
    sam_iterator = align_pairs(sequence, ref_index, pair_iterator, 'hw1_rg')
    print('Sorting SAMRecords in memory...')
    sam_iterator = iter(sorted(sam_iterator, cmp=compsam))
    if args.out_bam:
        # build the BAM header dict without shadowing the `header` parameter
        bam_header = {
            'HD': {
                'VN': '1.0'
            },
            'SQ': [{
                'SN': header[1:].strip(),
                'LN': len(sequence)
            }],
            'RG': [{
                'ID': 'hw1_rg',
                'SM': SAMPLE_NAME,
                'PU': 'Unknown',
                'PL': 'Unknown',
                'LB': 'Unknown'
            }]
        }
        outfile = Samfile(args.out_bam, 'wb', header=bam_header)
        for read in sam_iterator:
            outfile.write(read)
        outfile.close()
        infile = Samfile(args.out_bam, 'rb')
        sam_iterator = infile
    return sam_iterator
Example #7
def split_reads(reads, ref_reads, alt_reads):
    read_results = Counter()

    ref_bam = Samfile(ref_reads, 'wb', template=reads)
    alt_bam = Samfile(alt_reads, 'wb', template=reads)

    prev_phase = None
    prev_read = None
    prev_qname = None
    test = 0

    for read in reads.fetch(until_eof=True):
        test += 1
        chrom = read.reference_name
        snps_on_chrom = snp_dict[chrom]  # snp_dict and get_phase come from module scope
        phase = get_phase(read, snps_on_chrom)
        read_qname = read.qname
        if read_qname == prev_qname:
            read_results["tot_read"] += 1
            phase_set = {phase, prev_phase}
            phase_set.discard(None)
            if len(phase_set) == 1:
                read_phase = phase_set.pop()
                if read_phase == -1:
                    ref_bam.write(read)
                    ref_bam.write(prev_read)
                    read_results["ref_read"] += 1
                elif read_phase == 1:
                    alt_bam.write(read)
                    alt_bam.write(prev_read)
                    read_results["alt_read"] += 1
                elif read_phase == 0:
                    read_results["misphased_read"] += 1
            elif len(phase_set) == 0:
                read_results["no_snps_read"] += 1
            else:
                read_results["misphased_read"] += 1

        prev_read = read
        prev_phase = phase
        prev_qname = read_qname

    return read_results
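
From the branching above, get_phase evidently returns -1 for reference-phase reads, 1 for alternate, 0 for misphased pairs, and None when no SNPs overlap the read. A hypothetical stub with that contract, useful only for running the snippet standalone:

from collections import defaultdict

# Hypothetical stand-ins; the originals live at module scope.
snp_dict = defaultdict(list)  # chromosome -> SNPs on that chromosome

def get_phase(read, snps_on_chrom):
    # Stub contract: -1 reference, 1 alternate, 0 misphased, None no SNPs.
    return None
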
Example #8
    """
    ret = []
    for i in re.findall("\d+|\^?[ATCGN]+", md):
        if i.startswith('^'):
            ret.extend(list(i[1:]))
        elif i[0] in ["A", "T", "C", "G", "N"]:
            ret.extend(list(i))
        else:
            ret.extend(['-'] * int(i))

    return ret


if __name__ == '__main__':
    f = Samfile(sys.argv[1])
    out = Samfile(sys.argv[1][:-4] + "_realign.bam", 'wb', template=f)
    count = 0.0
    n = 0.05
    for read in f:
        q, t = expandAlign(read)
        query, target = realign(read)
        replace(read, query, target)
        out.write(read)
        count += 1
        if (count / f.mapped) > n:  # .mapped requires an indexed BAM
            n += 0.05
            print "[%s] -- parsed %d of %d reads (%.2f)" % (
                time.asctime(), int(count), f.mapped, count / f.mapped)

    out.close()
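
A small usage sketch (expand_md is the hypothetical name introduced above): the regex splits '10A5^AC6' into ['10', 'A', '5', '^AC', '6'], so

expanded = expand_md('10A5^AC6')
# expanded == ['-'] * 10 + ['A'] + ['-'] * 5 + ['A', 'C'] + ['-'] * 6
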
Example #9
def _sort_regional_reads(regional_reads, regions, output_dir, basename, header,
                         log_output):
    """Sorts reads in dict struct by coordinate and writes to BAM files
  Uses counting sort with the intuition that most positions in a range will
  map to a non-empty bucket, where a bucket holds reads mapped to a certain
  position.
  Args:
    regional_reads: Complicated dict struct
      ex. regional_reads[region][chromosome][position] returns list of reads
          at this position in this chromosome in this region
    regions: Regions dict used to sort reads during alignment. Produced by
      _load_regions()
    output_dir: self-explanatory
    basename: Prefix for output BAM files
    header: Dict representation of SAM header (described in pysam docs)
    log_output: File object for log.
  Returns:
    output_paths: List of output file paths
  """
    output_paths = []
    region_intervals = {}
    sequence_order = [chrom["SN"] for chrom in header["SQ"]]
    region_names = set(["chri"])
    # Get region intervals
    for seq_id in sequence_order:
        for interval in regions[seq_id]:
            region_names.add(interval.name)
            if interval.name in region_intervals:
                region_intervals[interval.name].append(
                    [seq_id, interval.lower_bound, interval.upper_bound])
            else:
                region_intervals[interval.name] = [[
                    seq_id, interval.lower_bound, interval.upper_bound
                ]]
    for region in region_names:
        output_path = "{}/{}/{}.bam".format(output_dir, region, basename)
        output_file = Samfile(output_path, 'wb', header=header)
        if region == "chri":
            for chromosome in sequence_order:
                if chromosome in regional_reads[region]:
                    for position in sorted(
                            regional_reads[region][chromosome].keys()):
                        for read in regional_reads[region][chromosome].pop(
                                position):
                            output_file.write(read)
        elif region in regional_reads:
            for entries in region_intervals[region]:
                chromosome = entries[0]
                low, high = entries[1], entries[2]
                if chromosome in regional_reads[region]:
                    for position in range(low, high + 1):
                        if position in regional_reads[region][chromosome]:
                            for read in regional_reads[region][chromosome].pop(
                                    position):
                                output_file.write(read)
                else:
                    log_output.write("No reads mapping to "
                                     "{} {}\n".format(region, chromosome))
                    log_output.flush()
                    fsync(log_output.fileno())
        output_paths.append(output_path)
        output_file.close()
    return output_paths
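
The counting-sort branch above walks every position in the interval and emits whatever bucket exists there, rather than sorting the keys. In miniature, with toy data:

# Counting sort over a position range: iterate positions, not sorted keys.
buckets = {101: ['read_a', 'read_b'], 105: ['read_c']}
low, high = 100, 110
for position in range(low, high + 1):
    for read in buckets.get(position, ()):
        print(read)  # read_a, read_b, read_c
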
Example #10
def main():
    parser = OptionParser(usage=usage)
    #parser.add_option("-s", action="store_true", dest="sam_input", default=False,
    #help="Input is in SAM format instead of BAM format")
    (options, args) = parser.parse_args()
    if len(args) != 4:
        parser.print_help()
        sys.exit(1)
    psl_filename = args[0]
    ref_filename = args[1]
    contigs_filename = args[2]
    bam_filename = args[3]

    references, ref_chromosomes = read_fasta(ref_filename)
    refname_to_id = dict([(name, i) for i, name in enumerate(ref_chromosomes)])
    print('Read',
          len(ref_chromosomes),
          'reference chromosomes:',
          ','.join(ref_chromosomes),
          file=sys.stderr)
    contigs, contig_names = read_fasta(contigs_filename)
    print('Read', len(contig_names), 'contigs.', file=sys.stderr)
    bam_header = {
        'HD': {
            'VN': '1.0'
        },
        'SQ': [
            dict([('LN', len(references[chromosome])), ('SN', chromosome)])
            for chromosome in ref_chromosomes
        ]
    }
    outfile = Samfile(bam_filename, 'wb', header=bam_header)

    line_nr = 0
    header_read = False
    for line in (s.strip() for s in open(psl_filename)):
        line_nr += 1
        if line.startswith('------'):
            header_read = True
            continue
        if not header_read: continue
        fields = line.split()
        assert len(fields) == 21, \
            'Error reading PSL file, offending line: %d' % line_nr
        sizes = [int(x) for x in fields[18].strip(',').split(',')]
        contig_starts = [int(x) for x in fields[19].strip(',').split(',')]
        ref_starts = [int(x) for x in fields[20].strip(',').split(',')]
        assert 0 < len(sizes) == len(contig_starts) == len(ref_starts)
        strand = fields[8]
        contig_name = fields[9]
        ref_name = fields[13]
        assert strand in ['-', '+']
        assert contig_name in contigs
        assert ref_name in references
        a = AlignedRead()
        a.qname = contig_name
        if strand == '+':
            a.seq = str(contigs[contig_name])
        else:
            a.seq = str(contigs[contig_name].reverse_complement())
        a.flag = (16 if strand == '-' else 0)  # 0x10: stored SEQ is reverse-complemented
        a.rname = refname_to_id[ref_name]
        a.pos = ref_starts[0]
        a.mapq = 255
        qpos = contig_starts[0]
        refpos = ref_starts[0]
        cigar = []
        # soft-clipping at the start?
        if contig_starts[0] > 0:
            cigar.append((4, contig_starts[0]))
        longest_insertion = 0
        longest_deletion = 0
        total_matches = 0
        total_insertion = 0
        total_deletion = 0
        for length, contig_start, ref_start in zip(sizes, contig_starts,
                                                   ref_starts):
            assert contig_start >= qpos
            assert ref_start >= refpos
            # insertion?
            if contig_start > qpos:
                insertion_length = contig_start - qpos
                longest_insertion = max(longest_insertion, insertion_length)
                total_insertion += insertion_length
                append_to_cigar(cigar, 1, insertion_length)
                qpos = contig_start
            # deletion?
            if ref_start > refpos:
                deletion_length = ref_start - refpos
                longest_deletion = max(longest_deletion, deletion_length)
                total_deletion += deletion_length
                append_to_cigar(cigar, 2, deletion_length)
                refpos = ref_start
            # stretch of matches/mismatches
            append_to_cigar(cigar, 0, length)
            refpos += length
            qpos += length
            total_matches += length
        # soft-clipping at the end?
        if len(a.seq) > qpos:
            cigar.append((4, len(a.seq) - qpos))
        a.cigar = tuple(cigar)
        # only use contigs where longest deletion is <= 10000 bp
        if longest_deletion > 10000: continue
        # require at least 200 matching positions
        if total_matches < 200: continue
        # require the matching positions to make up at least 75 percent of the
        # contig (not counting parts of the contig that are insertions);
        # float() guards against integer division under Python 2
        if float(total_matches) / (len(a.seq) - total_insertion) < 0.75: continue
        outfile.write(a)
    outfile.close()
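
append_to_cigar is not shown in the snippet. A plausible implementation, assumed here, merges a new operation into the last CIGAR element when the op codes match, so runs of the same op collapse into one (op, length) pair:

# Hypothetical reconstruction of the append_to_cigar helper used above.
def append_to_cigar(cigar, op, length):
    if cigar and cigar[-1][0] == op:
        cigar[-1] = (op, cigar[-1][1] + length)
    else:
        cigar.append((op, length))
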
Example #11
def subsample(fn, ns=None, paired=False):
    if ns is None:
        fn, ns = fn
    sample = []
    count = 0
    outdir_base = path.join(path.dirname(fn), 'subset')
    sf = Samfile(fn)
    try:
        i_weight = float(sf.mapped) / max(ns)
        print("Read out ", i_weight)
    except ValueError:
        i_weight = 0.0
        for read in sf:
            i_weight += 1
        print("Counted ", i_weight)
        i_weight /= float(max(ns))
        sf = Samfile(fn)

    if paired:
        read_2s = {}
    print(fn, count, i_weight)
    for i, read in enumerate(sf):
        key = random()**i_weight
        if not paired or read.is_read1:
            if len(sample) < max(ns):
                heappush(sample, (key, i + count, read))
            else:
                dropped = heappushpop(sample, (key, i + count, read))
                if paired:
                    read_2s.pop(dropped[-1].qname, None)
        elif paired:
            read_2s[read.qname] = read
        else:
            raise ValueError("I don't know how we got here")

    count += i

    for n in ns:
        outdir = outdir_base + '{:04.1f}M'.format(n / 1e6)
        try:
            makedirs(outdir)
        except OSError:
            pass
        sampN = sorted(sample, reverse=True)[:int(n)]
        print("Kept {: >12,} of {: >12,} reads".format(len(sampN), count))
        print(fn, '->', outdir)
        stdout.flush()
        of = Samfile(path.join(outdir, 'accepted_hits.bam'),
                     mode='wb',
                     template=sf)
        # sort the kept reads by coordinate before writing them out
        sampN.sort(
            key=lambda heap_item: (heap_item[-1].tid, heap_item[-1].pos))
        missing_mates = 0
        for key, pos, read in sampN:
            of.write(read)
            if paired and read.is_proper_pair:
                if read.qname not in read_2s:
                    missing_mates += 1
                    continue
                of.write(read_2s[read.qname])
        of.close()
    sf.close()
    print(missing_mates)
    return [pos for key, pos, read in sample]
Example #12
def print_reads(reads_to_print, ref_name, header):
    # `args` is the module-level argparse namespace in the original script.
    output_name = "{0}_{1}.bam".format(args.output_base, ref_name)
    output_samfile = Samfile(output_name, "wb", header=header)
    for aln in reads_to_print:
        output_samfile.write(aln)
    output_samfile.close()