예제 #1
0
    def calculate_all_counts(self, input_files, output_file):
        """
        Compute the ref and alt counts for every breakpoint pair read from the input files

        Args:
            input_files (list): List of mavis formatted files to use as input
            output_file (str): Path to the desired output file

        Returns:
            tuple: a dict of the processed breakpoint pairs keyed by their product_id, and
                the list of breakpoint pairs that were skipped because
                calculate_ref_counts raised a ValueError for them
        """
        counted = {}
        skipped = []

        for pair in read_inputs(input_files, add_default={'stranded': False}):
            try:
                counted[pair.product_id] = self.calculate_ref_counts(pair)
            except ValueError:
                # not an event that a ref/alt count can be determined for
                skipped.append(pair)

        log('filtered {} events'.format(len(skipped)))

        output_tabbed_file(counted.values(), output_file)
        return counted, skipped
예제 #2
0
 def __init__(self, input_bams, reference_genome, max_event_size=6, buffer=1):
     """
     Set up the reference genome, the input bams and an empty result cache

     Args:
         input_bams: passed unchanged to self._load_bams
         reference_genome: either an already loaded reference genome, or a str which
             is handed to load_reference_genome to be loaded
         max_event_size (int): stored on the instance as max_event_size
         buffer (int): stored on the instance as buffer
     """
     genome = reference_genome
     if isinstance(genome, str):
         # a string is not a genome yet: load it before storing
         log('loading:', genome, time_stamp=True)
         genome = load_reference_genome(genome)
     self.reference_genome = genome
     self._load_bams(input_bams)
     self.bpp_cache = {}
     self.max_event_size = max_event_size
     self.buffer = buffer
    def calculate_ref_counts(self, bpp):
        """
        Calculates the ref and alt count for a single BreakPointPair object

        The counts are computed once per breakpoint pair and kept in self.bpp_cache, so
        repeated calls with an equal breakpoint pair reuse the stored values.

        Args:
            bpp: the breakpoint pair to count reads for. Must be hashable, since it is
                used as the cache key, and must expose break1, break2, untemplated_seq
                and a data mapping

        Returns:
            the same bpp object, with '<name>_ref_count', '<name>_alt_count' and
            '<name>_ignored_count' added to bpp.data for each entry of self.input_bams

        Raises:
            ValueError: if the two breakpoints together span more than 2 positions, or
                if the larger of (break2.end - break1.start) and the untemplated sequence
                length exceeds self.max_event_size
        """
        untemplated_length = len(bpp.untemplated_seq) if bpp.untemplated_seq else 0
        event_size = max(bpp.break2.end - bpp.break1.start, untemplated_length)
        if len(bpp.break1) + len(bpp.break2) > 2 or event_size > self.max_event_size:
            raise ValueError(
                "Cannot determine ref and alt count for non precise breakpoint pairs"
            )

        if bpp not in self.bpp_cache:
            log("processing {}".format(bpp))
            data = {}
            for name, read_length, bam in self.input_bams:
                ref, alt, ign, mul, ref_sequence, alt_sequence = calculate_ref_count(
                    bpp, read_length, self.reference_genome, bam, self.buffer)
                log(bpp, name)
                log('Calculated counts: Ref: {}, Alt: {}, Mul: {}, Ignored: {} '
                    .format(len(ref), len(alt), len(mul), len(ign)))
                log('Ref_probe: {}, Alt_probe: {}'.format(
                    ref_sequence, alt_sequence))
                # the multimapped reads are only reported in the log, not stored
                data['{}_ref_count'.format(name)] = len(ref)
                data['{}_alt_count'.format(name)] = len(alt)
                data['{}_ignored_count'.format(name)] = len(ign)
            # Store the entry only after every bam has been processed: this guarantees
            # the entry exists even when there are no input bams, and prevents a failure
            # part way through from leaving an incomplete entry that later calls would
            # treat as final.
            self.bpp_cache[bpp] = data

        for key, value in self.bpp_cache[bpp].items():
            bpp.data[key] = value
        return bpp
예제 #4
0
def _iter_repeat_runs(seq, repseq):
    """
    Yield the bounds of each run of back-to-back copies of repseq found in seq

    The sequence is scanned left to right; each run starts at the next occurrence of
    repseq and is extended for as long as another full copy follows immediately.

    Args:
        seq (str): the sequence to search
        repseq (str): the repeat unit to look for

    Yields:
        tuple: (start, end) of a run, as 1-based inclusive positions in seq
    """
    unit_length = len(repseq)
    if not unit_length:
        # an empty unit matches at every position without advancing the scan
        return
    index = 0
    while index < len(seq):
        run_start = seq.find(repseq, index)
        if run_start < 0:
            break
        index = run_start
        # consume every copy of the unit that directly follows the previous one
        while seq.startswith(repseq, index):
            index += unit_length
        yield run_start + 1, index


def main():
    """
    Find runs of the requested repeat sequences in a reference genome and write them out

    Reads the command line arguments, loads the reference genome given by args.input and
    writes a tab-delimited file to args.output. The file starts with '##' comment lines
    recording the script name and arguments, followed by a 'chr/start/end/name' header
    and one line per repeat region found.

    A region is only reported when its length is at least args.min_length and at least
    twice the length of the repeat unit.
    """
    args = parse_arguments()
    # de-duplicate case-insensitively; empty sequences are dropped since they can
    # never form a repeat and would stall the search
    repeat_sequences = sorted({s.lower() for s in args.repeat_seq if s})
    log('loading:', args.input)
    reference_genome = load_reference_genome(args.input)
    comments = [
        os.path.basename(__file__),
        'input: {}'.format(args.input),
        'min_length: {}'.format(args.min_length),
        'repeat_seq: {}'.format(', '.join(args.repeat_seq)),
    ]
    log('writing:', args.output)
    with open(args.output, 'w') as fh:
        for comment in comments:
            fh.write('## {}\n'.format(comment))
        fh.write('chr\tstart\tend\tname\n')
        visited = set()
        for chrom, seq in sorted(reference_genome.items()):
            if chrom.startswith('chr'):
                chrom = chrom[3:]
            seq = str(seq.seq).lower()
            # identical sequence content is only processed once
            # NOTE(review): presumably the genome can hold the same chromosome under
            # more than one name -- confirm against load_reference_genome
            if seq in visited:
                continue
            visited.add(seq)
            spans = []
            for repseq in repeat_sequences:
                log(
                    'finding {}_repeat (min_length: {}), for chr{} (length: {})'.format(
                        repseq, args.min_length, chrom, len(seq)
                    )
                )
                span_name = 'repeat_{}'.format(repseq)
                for start, end in _iter_repeat_runs(seq, repseq):
                    span = BioInterval(chrom, start, end, name=span_name)
                    # require the minimum length and at least two copies of the unit
                    if len(span) >= args.min_length and len(span) >= 2 * len(repseq):
                        spans.append(span)
            log('found', len(spans), 'spans', time_stamp=False)
            for span in spans:
                fh.write(
                    '{}\t{}\t{}\t{}\n'.format(
                        span.reference_object, span.start, span.end, span.name
                    )
                )