def main():
    """Entry point: map contig alignments against annotated exons and
    report exon mappings, splice junctions and novel splicing events.
    """
    args = parse_args()

    # open alignment / sequence / annotation handles
    bam = create_pysam_bam(args.bam)
    query_fasta = create_pysam_fasta(args.query_fasta)
    genome_fasta = create_pysam_fasta(args.genome_fasta)
    transcripts_dict = Transcript.extract_transcripts(args.gtf)
    annot_tabix = create_pysam_tabix(args.gtf)
    genome_bam = None
    if args.genome_bam:
        genome_bam = create_pysam_bam(args.genome_bam)

    em = ExonMapper(annot_tabix,
                    transcripts_dict,
                    genome_fasta,
                    debug=args.debug)

    # primary GTF plus any supplementary annotation files
    annots = [args.gtf]
    if args.suppl_annot:
        annots.extend(args.suppl_annot)
    accessory_known_features = extract_features(annots)

    mappings, junc_adjs, events = em.map_aligns(
        bam,
        query_fasta,
        genome_fasta,
        accessory_known_features=accessory_known_features,
        max_diff=args.max_diff_splice)

    juncs_merged = Adjacency.merge(junc_adjs)
    events_merged = Adjacency.merge(events)

    # gather read support for junctions and events together
    if args.r2c:
        all_adjs = []
        if juncs_merged:
            all_adjs.extend(juncs_merged)
        if events_merged:
            all_adjs.extend(events_merged)
        if all_adjs:
            find_support(all_adjs,
                         args.r2c,
                         args.query_fasta,
                         num_procs=args.nproc,
                         debug=args.debug)

    # NOTE(review): nesting of the following two ifs is inferred from the
    # flattened source — confirm against the original formatting
    if events_merged:
        filter_events(events_merged, args.min_support)
    if genome_bam:
        corroborate_genome(events_merged, genome_bam)

    # report with run metadata in the header
    cmd = ' '.join(sys.argv)
    time = datetime.datetime.now().strftime("%Y-%m-%d:%H:%M:%S")
    software = '%s %s' % (pv.__name__, pv.__version__)
    em.output_mappings(mappings, '%s/mappings.tsv' % args.outdir)
    em.output_juncs(juncs_merged, '%s/junctions.bed' % args.outdir)
    em.output_events(events_merged,
                     '%s/novel_splicing.bedpe' % args.outdir,
                     header=(software, '%s %s' % (time, cmd)))
def main():
    """Map contig alignments to the exon annotation and write out the
    mappings, merged junctions and merged novel-splicing events.
    """
    args = parse_args()

    # input handles
    bam = create_pysam_bam(args.bam)
    query_fasta = create_pysam_fasta(args.query_fasta)
    genome_fasta = create_pysam_fasta(args.genome_fasta)
    transcripts_dict = Transcript.extract_transcripts(args.gtf)
    annot_tabix = create_pysam_tabix(args.gtf)
    genome_bam = create_pysam_bam(args.genome_bam) if args.genome_bam else None

    em = ExonMapper(annot_tabix, transcripts_dict, genome_fasta,
                    debug=args.debug)

    # collect known features from the main and supplementary annotations
    annots = [args.gtf]
    if args.suppl_annot:
        annots.extend(args.suppl_annot)
    accessory_known_features = extract_features(annots)

    mappings, junc_adjs, events = em.map_aligns(
        bam, query_fasta, genome_fasta,
        accessory_known_features=accessory_known_features,
        max_diff=args.max_diff_splice)

    juncs_merged = Adjacency.merge(junc_adjs)
    events_merged = Adjacency.merge(events)

    if args.r2c:
        # pool junctions and events so read support is computed in one pass
        all_adjs = []
        if juncs_merged:
            all_adjs.extend(juncs_merged)
        if events_merged:
            all_adjs.extend(events_merged)
        if all_adjs:
            find_support(all_adjs, args.r2c, args.query_fasta,
                         num_procs=args.nproc, debug=args.debug)

    # NOTE(review): nesting of these two ifs inferred from flattened source
    if events_merged:
        filter_events(events_merged, args.min_support)
    if genome_bam:
        corroborate_genome(events_merged, genome_bam)

    # run metadata for the output headers
    cmd = ' '.join(sys.argv)
    time = datetime.datetime.now().strftime("%Y-%m-%d:%H:%M:%S")
    software = '%s %s' % (pv.__name__, pv.__version__)

    em.output_mappings(mappings, '%s/mappings.tsv' % args.outdir)
    em.output_juncs(juncs_merged, '%s/junctions.bed' % args.outdir)
    em.output_events(events_merged,
                     '%s/novel_splicing.bedpe' % args.outdir,
                     header=(software, '%s %s' % (time, cmd)))
def main():
    """Recover fusion candidates supported by read pairs whose mates
    failed to align: realign the unmapped mates to the transcriptome,
    then screen for discordant pairs.
    """
    args = parse_args()

    # extract unmapped read pairs
    r2c_bam = pysam.AlignmentFile(args.r2c)
    unmapped_fastas = find_unmapped_mates(r2c_bam, args.outdir)

    # align unmapped reads
    unmapped_bam_file = align_unmapped_mates(unmapped_fastas,
                                             args.transcripts_fasta,
                                             args.nthreads,
                                             args.outdir)

    # extract fusions
    unmapped_bam = pysam.AlignmentFile(unmapped_bam_file)
    transcripts_dict = Transcript.extract_transcripts(args.gtf)
    adjs = find_discordant_pairs(unmapped_bam,
                                 transcripts_dict,
                                 pysam.FastaFile(args.genome_fasta),
                                 min_pairs=args.min_pairs)
    Adjacency.report_events(adjs,
                            '{}/discordant_pairs.bedpe'.format(args.outdir))

    if not args.no_cleanup:
        cleanup(args.outdir)
def main():
    """Find fusion candidates from read pairs with one unmapped mate by
    realigning those mates to the transcriptome and reporting discordant
    pairs.
    """
    args = parse_args()

    # extract unmapped read pairs
    unmapped_fastas = find_unmapped_mates(pysam.AlignmentFile(args.r2c),
                                          args.outdir)

    # align unmapped reads
    unmapped_bam_file = align_unmapped_mates(unmapped_fastas,
                                             args.transcripts_fasta,
                                             args.nthreads,
                                             args.outdir)

    # extract fusions
    transcripts_dict = Transcript.extract_transcripts(args.gtf)
    genome = pysam.FastaFile(args.genome_fasta)
    adjs = find_discordant_pairs(pysam.AlignmentFile(unmapped_bam_file),
                                 transcripts_dict,
                                 genome,
                                 min_pairs=args.min_pairs)
    Adjacency.report_events(adjs, '%s/discordant_pairs.bedpe' % args.outdir)

    if not args.no_cleanup:
        cleanup(args.outdir)
def extract_multiple_contig_events(events_by_contig):
    """Finds multiple contigs that map to same event, for use in same_event()"""
    # pool every contig's events before merging
    pooled = []
    for contig_events in events_by_contig.values():
        pooled.extend(contig_events)

    merged = Adjacency.merge(pooled)

    # a comma-joined seq_id marks an event shared by several contigs
    shared_contigs = set()
    for event in merged:
        if ',' in event.seq_id:
            shared_contigs.update(event.seq_id.split(','))
    return shared_contigs
def extract_multiple_contig_events(events_by_contig):
    """Finds multiple contigs that map to same event, for use in same_event()

    Merges all events across contigs; merged events whose seq_id is a
    comma-joined list came from more than one contig, and those contig
    names are returned as a set.
    """
    events = []
    for contig in events_by_contig.keys():
        events.extend(events_by_contig[contig])
    events_merged = Adjacency.merge(events)
    # BUG FIX: original used `Set()` — the Python 2 `sets.Set` class, which
    # no longer exists and is not imported here, so this raised NameError.
    # Use the builtin set() instead.
    contigs = set()
    for event in events_merged:
        if ',' in event.seq_id:
            for contig in event.seq_id.split(','):
                contigs.add(contig)
    return contigs
def create_adj(event):
    """Build a fusion Adjacency from an event dict.

    The dict supplies the supporting contig/read names ('spanning',
    'flanking'), the upstream/downstream transcripts ('txts'), breakpoints
    ('genome_breaks', 'txt_breaks'), orientations, exons and exon bounds.
    """
    upstream, downstream = event['txts'][0], event['txts'][1]
    chroms = (upstream.chrom, downstream.chrom)
    seq_id = ','.join(event['spanning'] + event['flanking'])

    adj = Adjacency(seq_id,
                    chroms,
                    ('na', 'na'),
                    event['genome_breaks'],
                    )
    adj.event = 'fusion'
    adj.sense_fusion = True
    adj.transcripts = event['txts']
    adj.upstream_transcript = upstream
    adj.downstream_transcript = downstream
    adj.orients = event['orients']
    adj.genome_breaks = event['genome_breaks']
    adj.chroms = chroms
    adj.transcript_breaks = event['txt_breaks']
    adj.exons_oriented = adj.exons = event['exons']
    adj.exon_bounds_oriented = event['exon_bounds']
    adj.exon_bounds = event['exon_bounds']
    # support counts: spanning + flanking evidence
    adj.spanning = len(event['spanning'])
    adj.flanking = len(event['flanking'])
    adj.support = adj.spanning + adj.flanking
    return adj
def main():
    """Detect structural variants from contig alignments to the genome
    and the transcriptome, filter them, and report a BEDPE file.
    """
    args = parse_args()

    # alignment / sequence / annotation handles
    gbam = create_pysam_bam(args.gbam)
    tbam = create_pysam_bam(args.tbam)
    query_fasta = create_pysam_fasta(args.query_fasta)
    genome_fasta = create_pysam_fasta(args.genome_fasta)
    transcripts_fasta = create_pysam_fasta(args.transcripts_fasta)
    transcripts_dict = Transcript.extract_transcripts(args.gtf)
    annot_tabix = create_pysam_tabix(args.gtf)

    sf = SVFinder(genome_fasta,
                  annot_tabix,
                  transcripts_dict,
                  args.outdir,
                  probe_len=args.probe_len,
                  debug=args.debug)

    # events/mappings keyed by which reference the contigs were aligned to
    events = {'via_genome': {}, 'via_transcripts': {}}
    mappings = {'via_genome': {}, 'via_transcripts': {}}
    gene_hits = None

    if gbam and annot_tabix:
        events['via_genome'], mappings['via_genome'] = sf.find_events(
            gbam,
            query_fasta,
            genome_fasta,
            'genome',
            min_indel_size=args.min_indel_size,
            min_dup_size=args.min_dup_size,
            min_indel_flanking=args.min_indel_flanking,
            no_utr=args.no_utr,
            max_novel_len=args.max_novel_len,
            max_homol_len=args.max_homol_len,
            only_sense_fusion=not args.include_nonsense_fusion,
            only_exon_bound_fusion=not args.include_non_exon_bound_fusion,
            only_coding_fusion=not args.include_noncoding_fusion,
            only_fusions=args.only_fusions)

    if tbam:
        events['via_transcripts'], mappings['via_transcripts'] = sf.find_events(
            tbam,
            query_fasta,
            transcripts_fasta,
            'transcripts',
            external_mappings=mappings['via_genome'],
            min_indel_size=args.min_indel_size,
            min_dup_size=args.min_dup_size,
            min_indel_flanking=args.min_indel_flanking,
            no_utr=args.no_utr,
            no_indels=True,
            max_novel_len=args.max_novel_len,
            max_homol_len=args.max_homol_len,
            only_sense_fusion=not args.include_nonsense_fusion,
            only_exon_bound_fusion=not args.include_non_exon_bound_fusion,
            only_coding_fusion=not args.include_noncoding_fusion,
            only_fusions=args.only_fusions)

    # combine events from genome and transcriptome alignments
    events_combined = combine_events(events, mappings)

    # merge identical events from different contigs
    events_merged = Adjacency.merge(events_combined)

    # filter by checking probe and subseq alignments
    if events_merged and args.genome_index and len(args.genome_index) == 2:
        if not args.disable_subseq_filtering:
            sf.filter_subseqs(events_merged,
                              query_fasta,
                              args.genome_index[0],
                              args.genome_index[1],
                              args.outdir,
                              subseq_len=args.subseq_len,
                              debug=args.debug)
        if not args.disable_probe_filtering:
            sf.filter_probes(events_merged,
                             args.genome_index[0],
                             args.genome_index[1],
                             args.outdir,
                             args.probe_len,
                             debug=args.debug)

    # read support
    if args.r2c:
        find_support(events_merged,
                     args.r2c,
                     args.query_fasta,
                     min_overlap=args.min_overhang,
                     num_procs=args.nproc,
                     debug=args.debug)
        events_filtered = [event for event in events_merged
                           if event.spanning >= args.min_support]
    else:
        events_filtered = events_merged

    # determine if events are in- or out-of-frame
    sf.set_frame(events_filtered, query_fasta, genome_fasta)

    # report (with meta data)
    cmd = ' '.join(sys.argv)
    time = datetime.datetime.now().strftime("%Y-%m-%d:%H:%M:%S")
    software = '{} {}'.format(pv.__name__, pv.__version__)
    Adjacency.report_events(events_filtered,
                            '{}/sv.bedpe'.format(args.outdir),
                            sort_by_coord=args.sort_by_coord,
                            header=(software, '{} {}'.format(time, cmd)))
def main():
    """Call structural variants from genome- and transcriptome-aligned
    contigs, apply probe/subseq and read-support filters, and write the
    results to a BEDPE report.
    """
    args = parse_args()

    # open all inputs up front
    gbam = create_pysam_bam(args.gbam)
    tbam = create_pysam_bam(args.tbam)
    query_fasta = create_pysam_fasta(args.query_fasta)
    genome_fasta = create_pysam_fasta(args.genome_fasta)
    transcripts_fasta = create_pysam_fasta(args.transcripts_fasta)
    transcripts_dict = Transcript.extract_transcripts(args.gtf)
    annot_tabix = create_pysam_tabix(args.gtf)

    sf = SVFinder(genome_fasta, annot_tabix, transcripts_dict, args.outdir,
                  probe_len=args.probe_len, debug=args.debug)

    events = {'via_genome': {}, 'via_transcripts': {}}
    mappings = {'via_genome': {}, 'via_transcripts': {}}
    gene_hits = None

    # shared fusion-filtering switches derived from the CLI flags
    if gbam and annot_tabix:
        events['via_genome'], mappings['via_genome'] = sf.find_events(
            gbam, query_fasta, genome_fasta, 'genome',
            min_indel_size=args.min_indel_size,
            min_dup_size=args.min_dup_size,
            min_indel_flanking=args.min_indel_flanking,
            no_utr=args.no_utr,
            max_novel_len=args.max_novel_len,
            max_homol_len=args.max_homol_len,
            only_sense_fusion=not args.include_nonsense_fusion,
            only_exon_bound_fusion=not args.include_non_exon_bound_fusion,
            only_coding_fusion=not args.include_noncoding_fusion,
            only_fusions=args.only_fusions)

    if tbam:
        # transcriptome pass reuses the genome mappings and skips indels
        events['via_transcripts'], mappings['via_transcripts'] = sf.find_events(
            tbam, query_fasta, transcripts_fasta, 'transcripts',
            external_mappings=mappings['via_genome'],
            min_indel_size=args.min_indel_size,
            min_dup_size=args.min_dup_size,
            min_indel_flanking=args.min_indel_flanking,
            no_utr=args.no_utr,
            no_indels=True,
            max_novel_len=args.max_novel_len,
            max_homol_len=args.max_homol_len,
            only_sense_fusion=not args.include_nonsense_fusion,
            only_exon_bound_fusion=not args.include_non_exon_bound_fusion,
            only_coding_fusion=not args.include_noncoding_fusion,
            only_fusions=args.only_fusions)

    # combine events from genome and transcriptome alignments
    events_combined = combine_events(events, mappings)

    # merge identical events from different contigs
    events_merged = Adjacency.merge(events_combined)

    # filter by checking probe and subseq alignments
    if events_merged and args.genome_index and len(args.genome_index) == 2:
        if not args.disable_subseq_filtering:
            sf.filter_subseqs(events_merged, query_fasta,
                              args.genome_index[0], args.genome_index[1],
                              args.outdir,
                              subseq_len=args.subseq_len,
                              debug=args.debug)
        if not args.disable_probe_filtering:
            sf.filter_probes(events_merged,
                             args.genome_index[0], args.genome_index[1],
                             args.outdir, args.probe_len,
                             debug=args.debug)

    # read support
    if args.r2c:
        find_support(events_merged, args.r2c, args.query_fasta,
                     min_overlap=args.min_overhang,
                     num_procs=args.nproc,
                     debug=args.debug)
        events_filtered = [event for event in events_merged
                           if event.spanning >= args.min_support]
    else:
        events_filtered = events_merged

    # determine if events are in- or out-of-frame
    sf.set_frame(events_filtered, query_fasta, genome_fasta)

    # report (with meta data)
    cmd = ' '.join(sys.argv)
    time = datetime.datetime.now().strftime("%Y-%m-%d:%H:%M:%S")
    software = '%s %s' % (pv.__name__, pv.__version__)
    Adjacency.report_events(events_filtered,
                            '%s/sv.bedpe' % args.outdir,
                            sort_by_coord=args.sort_by_coord,
                            header=(software, '%s %s' % (time, cmd)))