def calculate_all_counts(self, input_files, output_file):
    """
    Calculate the ref and alt counts for every breakpoint pair read from the
    input files and write the successfully processed pairs to a tabbed file

    Args:
        input_files (list): List of mavis formatted files to use as input
        output_file (str): Path to the desired output file

    Returns:
        tuple: ``(processed, filtered)`` where ``processed`` maps each pair's
            ``product_id`` to the value returned by ``calculate_ref_counts``
            and ``filtered`` lists the pairs for which it raised ``ValueError``
    """
    counted = {}
    skipped = []

    for pair in read_inputs(input_files, add_default={'stranded': False}):
        try:
            # calculate_ref_counts raises ValueError for pairs it cannot
            # handle (not precise or larger than the allowed event size)
            counted[pair.product_id] = self.calculate_ref_counts(pair)
        except ValueError:
            skipped.append(pair)

    log('filtered {} events'.format(len(skipped)))
    output_tabbed_file(counted.values(), output_file)
    return counted, skipped
def __init__(self, input_bams, reference_genome, max_event_size=6, buffer=1):
    """
    Store the reference genome, load the input bams and record the
    counting parameters

    Args:
        input_bams: handed unchanged to ``self._load_bams``
        reference_genome: a string is passed to ``load_reference_genome``
            and the result is stored; any other value is stored as given
        max_event_size: stored as ``self.max_event_size`` (default 6)
        buffer: stored as ``self.buffer`` (default 1)
    """
    genome = reference_genome
    if isinstance(genome, str):
        # a string is treated as something to load rather than a loaded genome
        log('loading:', genome, time_stamp=True)
        genome = load_reference_genome(genome)
    self.reference_genome = genome

    self._load_bams(input_bams)

    # per-breakpoint-pair results, filled in by calculate_ref_counts
    self.bpp_cache = {}
    self.max_event_size = max_event_size
    self.buffer = buffer
def calculate_ref_counts(self, bpp):
    """
    Calculates the ref and alt count for a single BreakPointPair object

    The counts are computed once per pair for every entry of
    ``self.input_bams``, stored in ``self.bpp_cache`` and copied into
    ``bpp.data`` under the keys ``<name>_ref_count``, ``<name>_alt_count``
    and ``<name>_ignored_count``.

    Args:
        bpp: the breakpoint pair to annotate

    Returns:
        the same ``bpp`` object, with the counts added to ``bpp.data``

    Raises:
        ValueError: if the two breakpoint lengths sum to more than 2, or if
            the larger of ``break2.end - break1.start`` and the untemplated
            sequence length exceeds ``self.max_event_size``
    """
    error_message = "Cannot determine ref and alt count for non precise breakpoint pairs"

    # both breakpoints must be single positions
    if len(bpp.break1) + len(bpp.break2) > 2:
        raise ValueError(error_message)

    # the event itself (span or inserted sequence) must be small enough
    event_size = max(
        bpp.break2.end - bpp.break1.start,
        len(bpp.untemplated_seq or ''),
    )
    if event_size > self.max_event_size:
        raise ValueError(error_message)

    counts = self.bpp_cache.get(bpp)
    if counts is None:
        log("processing {}".format(bpp))
        counts = {}
        for name, read_length, bam in self.input_bams:
            ref, alt, ign, mul, ref_sequence, alt_sequence = calculate_ref_count(
                bpp, read_length, self.reference_genome, bam, self.buffer
            )
            log(bpp, name)
            log(
                'Calculated counts: Ref: {}, Alt: {}, Mul: {}, Ignored: {} '.format(
                    len(ref), len(alt), len(mul), len(ign)
                )
            )
            log('Ref_probe: {}, Alt_probe: {}'.format(ref_sequence, alt_sequence))
            counts.update(
                {
                    '{}_ref_count'.format(name): len(ref),
                    '{}_alt_count'.format(name): len(alt),
                    '{}_ignored_count'.format(name): len(ign),
                }
            )
        self.bpp_cache[bpp] = counts

    # copy the (possibly cached) counts onto the pair being returned
    for column, count in counts.items():
        bpp.data[column] = count
    return bpp
def main():
    """
    Scan a reference genome for tandem runs of the requested repeat units
    and write them to a tab-delimited file

    The output starts with '##' comment lines (script name, input,
    min_length, repeat_seq) followed by a ``chr start end name`` header and
    one row per reported run. A run is reported when its interval length is
    at least ``args.min_length`` and at least two repeat units. ``start`` is
    the 0-based offset of the run plus one; ``end`` is the 0-based offset
    just past the run.
    """
    args = parse_arguments()
    # unique, lower-cased repeat units in sorted order
    repeat_sequences = sorted({unit.lower() for unit in args.repeat_seq})

    log('loading:', args.input)
    reference_genome = load_reference_genome(args.input)

    header_comments = [
        os.path.basename(__file__),
        'input: {}'.format(args.input),
        'min_length: {}'.format(args.min_length),
        'repeat_seq: {}'.format(', '.join(args.repeat_seq)),
    ]
    row_template = '{}\t{}\t{}\t{}\n'

    log('writing:', args.output)
    with open(args.output, 'w') as fh:
        for text in header_comments:
            fh.write('## {}\n'.format(text))
        fh.write('chr\tstart\tend\tname\n')

        seen_sequences = set()
        for chrom, record in sorted(reference_genome.items()):
            if chrom.startswith('chr'):
                chrom = chrom[3:]
            seq = str(record.seq).lower()

            # identical sequences are only scanned once
            if seq in seen_sequences:
                continue
            seen_sequences.add(seq)

            seq_length = len(seq)
            spans = []
            for repseq in repeat_sequences:
                unit_length = len(repseq)
                log(
                    'finding {}_repeat (min_length: {}), for chr{} (length: {})'.format(
                        repseq, args.min_length, chrom, seq_length
                    )
                )
                cursor = 0
                while cursor < seq_length:
                    run_start = seq.find(repseq, cursor)
                    if run_start < 0:
                        break
                    # consume back-to-back copies of the unit
                    cursor = run_start
                    while seq.startswith(repseq, cursor):
                        cursor += unit_length
                    span = BioInterval(
                        chrom, run_start + 1, cursor, name='repeat_{}'.format(repseq)
                    )
                    span_length = len(span)
                    if span_length >= args.min_length and span_length >= 2 * unit_length:
                        spans.append(span)

            log('found', len(spans), 'spans', time_stamp=False)
            for span in spans:
                fh.write(
                    row_template.format(
                        span.reference_object, span.start, span.end, span.name
                    )
                )