# Example #1
def main():
    """Count mate-pair read evidence for STR loci.

    Reads a BAM mapped to a reference containing 'STR-<motif>' decoy
    chromosomes, collects the mate positions of reads mapped to each
    decoy, merges overlapping mate intervals, assigns each merged
    interval to the closest STR locus (from the BED file) with a
    matching normalised motif within ``args.dist`` bp, and writes the
    summed read count per STR locus as TSV to ``args.output`` (or
    stdout).

    Exits with an error if every read examined was unpaired.
    """
    # Parse command line arguments
    args = parse_args()
    bamfile = args.bam
    bedfile = args.bed
    outfile = args.output

    if outfile:
        outstream = open(outfile, 'w')
    else:
        outstream = sys.stdout

    max_distance = args.dist

    out_header = '\t'.join(
        ['STR_chr', 'STR_start', 'STR_stop', 'motif', 'reflen', 'count'])
    outstream.write(out_header + '\n')

    STR_bed = bt.BedTool(bedfile)
    readlen = detect_readlen(bamfile)

    # Read bam
    bam = pysam.Samfile(bamfile, 'rb')

    # Decoy chromosomes are named 'STR-<motif>'
    required_chroms = [chrom for chrom in bam.references
                       if chrom.startswith('STR-')]
    unpaired = 0
    total = 0

    for chrom in required_chroms:
        motif = chrom.split('-')[1]
        all_positions = []

        for read in bam.fetch(reference=chrom):
            total += 1
            try:
                mate_chr = read.next_reference_name
            except ValueError:
                # Read has no mapped mate; nothing useful to record.
                unpaired += 1
                continue
            mate_start = read.next_reference_start
            # Mate interval approximated as [start, start + read length)
            all_positions.append([mate_chr, mate_start, mate_start + readlen])

        # Strategy:
        # Merge all overlapping intervals
        # Keep the count of reads corresponding to each merged interval (i.e. 1 for each read contained in it)
        # Assign each interval to the closest STR (within 500 bp? - the insert size) with the correct motif, adding together the count of reads
        # Check motif: == normalise_str()
        # There should be two 1-2 intervals per STR, likely one for each flank.
        # Report the read count for each STR

        if all_positions:
            motif_bed = bt.BedTool(all_positions).sort()
            # Merge all the intervals, then count how many of the original
            # intervals overlap the merged ones (4th column)
            motif_coverage = motif_bed.merge(stream=True).coverage(b=motif_bed,
                                                                   counts=True)

            # Create a temporary file for bedtools to write to and pandas to
            # read, since streams don't seem to work.
            tmp_bed = 'tmp-' + randomletters(8) + '.bed'
            motif_coverage.closest(STR_bed, d=True, stream=True).saveas(tmp_bed)
            colnames = [
                'chr', 'start', 'stop', 'count', 'STR_chr', 'STR_start',
                'STR_stop', 'motif', 'reflen', 'distance'
            ]
            try:
                df = pd.read_csv(tmp_bed, sep='\t', header=None, names=colnames)
            finally:
                os.remove(tmp_bed)  # always delete the temporary file

            # Filter out loci that are too far away
            df = df.loc[df['distance'] <= max_distance, :]
            # Normalise the STR motif to enable comparisons
            df['motif'] = df['motif'].map(normalise_str)
            # Remove STRs that don't match the motif
            df = df.loc[df['motif'] == normalise_str(motif), :]
            df = df.loc[:, [
                'STR_chr', 'STR_start', 'STR_stop', 'motif', 'count', 'reflen'
            ]]
            summed = df.groupby(
                ['STR_chr', 'STR_start', 'STR_stop', 'motif', 'reflen'],
                as_index=False).aggregate(np.sum)
            summed.to_csv(outstream, sep='\t', header=False, index=False)

    # Only close streams we opened ourselves; closing sys.stdout would
    # break any subsequent output from this process.
    if outstream is not sys.stdout:
        outstream.close()

    # Guard total > 0: with no reads at all, unpaired == total (0 == 0)
    # would previously trigger a misleading "all reads unpaired" error.
    if total > 0 and unpaired == total:
        sys.exit(
            'ERROR: all reads tested appear to be unpaired. You may wish to check your bam file is paired end and correctly formed.'
        )
    elif unpaired > 0:
        # BUG FIX: the placeholders were previously never filled in
        # (.format() was missing), printing literal '{}' to stderr.
        sys.stderr.write(
            'WARNING: it appears that {} of {} reads checked were unpaired and so no useful data could be obtained from them.\n'
            .format(unpaired, total))
# Example #2
def locus_counts(bamfiles, bedfile, outfile, max_distance):
    """Count mate-pair read evidence for STR loci across one or more BAMs.

    For each BAM (mapped to a reference with 'STR-<motif>' decoy
    chromosomes), collects the mate positions of reads mapped to each
    decoy, merges overlapping mate intervals, assigns each merged
    interval to the closest STR locus (from ``bedfile``) with a matching
    normalised motif within ``max_distance`` bp, then sums read counts
    per locus across all BAMs and writes a TSV to ``outfile`` (or
    stdout).

    Args:
        bamfiles: list of BAM file paths; filenames must be unique.
        bedfile: BED file of STR loci (chrom, start, stop, motif, reflen).
        outfile: output TSV path, or a falsy value to write to stdout.
        max_distance: maximum bp between a mate interval and an STR locus.

    Raises:
        TypeError: if ``bamfiles`` is not a list.
        SystemExit: on duplicate bamfile names, missing STR decoy
            chromosomes, no overlapping reads, or all reads unpaired.
    """
    # Check bamfiles have unique names
    if not isinstance(bamfiles, list):
        raise TypeError('Expecting a list, got {}'.format(type(bamfiles)))
    if len(set(bamfiles)) < len(bamfiles):
        sys.exit(
            'ERROR: There were multiple bamfiles with the same filename. Please check your input'
        )

    STR_bed = bt.BedTool(bedfile).sort()

    all_results = []
    for bamfile in bamfiles:

        readlen, count_noCIGAR = detect_readlen(bamfile)

        # BUG FIX: previously this warning was emitted once after the loop
        # and therefore reported only the LAST bamfile's count. Warn per
        # file, while the counters still refer to this file.
        if count_noCIGAR > 0:
            sys.stderr.write('WARNING: ' + str(count_noCIGAR) + ' read(s) in ' +
                             bamfile + ' file had no CIGAR string.\n')

        # Read bam
        bam = pysam.Samfile(bamfile, 'rb')

        # Decoy chromosomes are named 'STR-<motif>'
        required_chroms = [chrom for chrom in bam.references
                           if chrom.startswith('STR-')]
        unpaired = 0
        total = 0
        # Check if any STR- chromosomes
        if len(required_chroms) == 0:
            sys.exit(
                'ERROR: There were no reads mapping to chromosomes with names starting with "STR-" in {0}. Are you sure this data is mapped to a reference genome with STR decoy chromosomes?'
                .format(bamfile))

        for chrom in required_chroms:
            motif = chrom.split('-')[1]
            all_positions = []

            for read in bam.fetch(reference=chrom):
                total += 1
                try:
                    mate_chr = read.next_reference_name
                except ValueError:
                    # Read has no mapped mate; nothing useful to record.
                    unpaired += 1
                    continue
                mate_start = read.next_reference_start
                # Mate interval approximated as [start, start + read length)
                all_positions.append(
                    [mate_chr, mate_start, mate_start + readlen])

            # Strategy:
            # Merge all overlapping intervals
            # Keep the count of reads corresponding to each merged interval (i.e. 1 for each read contained in it)
            # Assign each interval to the closest STR (within 500 bp? - the insert size) with the correct motif, adding together the count of reads
            # Check motif: == normalise_str()
            # There should be two 1-2 intervals per STR, likely one for each flank.
            # Report the read count for each STR

            if all_positions:
                motif_bed = bt.BedTool(all_positions).sort()
                # Merge all the intervals, then count how many of the original
                # intervals overlap the merged ones (4th column)
                motif_coverage = motif_bed.merge(stream=True).coverage(
                    b=motif_bed, counts=True, nonamecheck=True)

                # Create a temporary file for bedtools to write to and pandas
                # to read, since streams don't seem to work.
                tmp_bed = 'tmp-' + randomletters(8) + '.bed'
                motif_coverage.closest(
                    STR_bed, d=True, stream=True,
                    nonamecheck=True).saveas(tmp_bed)
                colnames = [
                    'chr', 'start', 'stop', 'count', 'STR_chr', 'STR_start',
                    'STR_stop', 'motif', 'reflen', 'distance'
                ]
                try:
                    df = pd.read_csv(tmp_bed,
                                     sep='\t',
                                     header=None,
                                     names=colnames)
                finally:
                    os.remove(tmp_bed)  # always delete the temporary file

                # Filter out loci that are too far away
                df = df.loc[df['distance'] <= max_distance, :]
                # Normalise the STR motif to enable comparisons
                df['motif'] = df['motif'].map(normalise_str)
                # Remove STRs that don't match the motif
                df = df.loc[df['motif'] == normalise_str(motif), :]
                df = df.loc[:, [
                    'STR_chr', 'STR_start', 'STR_stop', 'motif', 'count',
                    'reflen'
                ]]

                all_results.append(df)

        if total == 0:
            sys.exit(
                'ERROR: there were no reads overlapping the target STR regions. This may indicate a problem with the input file.\n'
            )
        elif unpaired == total:
            sys.exit(
                'ERROR: all {0} reads overlapping the target STR regions appear to be unpaired. You may wish to check your bam file is paired-end and correctly formed.\n'
                .format(total))
        elif unpaired > 0:
            sys.stderr.write(
                'WARNING: it appears that {0} of the {1} reads overlapping the target STR regions were unpaired and so no useful data could be obtained from them.\n'
                .format(unpaired, total))

    # BUG FIX: a second copy of the total/unpaired checks used to run here,
    # referencing only the last bamfile's counters and double-printing the
    # warning for that file; the per-file checks above make it redundant.

    # Guard against no usable intervals from any bamfile (pd.concat would
    # raise on an empty list).
    if not all_results:
        sys.exit(
            'ERROR: there were no reads overlapping the target STR regions. This may indicate a problem with the input file.\n'
        )

    # Sum counts from multiple bam files and multiple rows
    if len(all_results) == 1:
        df_total = all_results[0]
    else:
        df_total = pd.concat(all_results, ignore_index=True)

    summed = df_total.groupby(
        ['STR_chr', 'STR_start', 'STR_stop', 'motif', 'reflen'],
        as_index=False).aggregate(np.sum)

    # Write results
    if outfile:
        outstream = open(outfile, 'w')
    else:
        outstream = sys.stdout

    out_header = '\t'.join(
        ['STR_chr', 'STR_start', 'STR_stop', 'motif', 'reflen', 'count'])
    outstream.write(out_header + '\n')
    outstream.write(summed.to_csv(sep='\t', header=False, index=False))
    # Only close streams we opened ourselves; closing sys.stdout would
    # break any subsequent output from this process.
    if outstream is not sys.stdout:
        outstream.close()