def get_transcripts(annot,
                    coding_only,
                    genes=None,
                    only_longest=False,
                    coding_first=False):
    """Return the transcripts extracted from ``annot`` after optional filters.

    Args:
        annot: Annotation handed unchanged to
            ``Transcript.extract_transcripts``.
        coding_only: When true, keep only transcripts whose ``is_coding()``
            returns a true value.
        genes: Optional ``Set`` of genes.  When it is a non-empty ``Set``,
            only transcripts whose ``gene`` attribute is in it are kept and
            a message is printed for every requested gene that has no
            transcript.  Any other value is ignored.
        only_longest: When true, keep one transcript per gene, chosen by
            ``get_longest``.
        coding_first: Forwarded to ``get_longest``; only consulted when
            ``only_longest`` is true.

    Returns:
        A list of transcript objects.
    """
    transcripts_dict = Transcript.extract_transcripts(annot)

    if only_longest:
        # Group by gene, then let get_longest pick one transcript per group.
        by_gene = defaultdict(list)
        for transcript in transcripts_dict.values():
            by_gene[transcript.gene].append(transcript)
        transcripts = [
            get_longest(group, coding_first) for group in by_gene.values()
        ]
    else:
        # Materialize so the return value is a list on every code path.
        transcripts = list(transcripts_dict.values())

    if genes and isinstance(genes, Set):
        transcripts = [t for t in transcripts if t.gene in genes]

        # Missing genes are reported before the coding filter below, so a
        # gene that only has non-coding transcripts is not reported here.
        captured_genes = Set([t.gene for t in transcripts])
        for gene in genes - captured_genes:
            print("can't find %s from gtf" % gene)

    if coding_only:
        transcripts = [t for t in transcripts if t.is_coding()]

    return transcripts
Exemplo n.º 2
0
def main():
    """Command-line entry point: map alignments and write the result files.

    Opens the inputs named on the command line, runs
    ``ExonMapper.map_aligns``, merges the returned junction and event
    adjacencies, optionally gathers read support (``args.r2c``) and
    genome corroboration (``args.genome_bam``), then writes
    ``mappings.tsv``, ``junctions.bed`` and ``novel_splicing.bedpe`` under
    ``args.outdir``.
    """
    args = parse_args()

    # Input handles
    bam = create_pysam_bam(args.bam)
    query_fasta = create_pysam_fasta(args.query_fasta)
    genome_fasta = create_pysam_fasta(args.genome_fasta)
    transcripts_dict = Transcript.extract_transcripts(args.gtf)
    annot_tabix = create_pysam_tabix(args.gtf)
    genome_bam = create_pysam_bam(args.genome_bam) if args.genome_bam else None

    mapper = ExonMapper(annot_tabix,
                        transcripts_dict,
                        genome_fasta,
                        debug=args.debug)

    # The primary GTF always comes first, followed by any supplementary ones.
    annotation_files = [args.gtf]
    if args.suppl_annot:
        annotation_files += list(args.suppl_annot)
    known_features = extract_features(annotation_files)

    mappings, junc_adjs, events = mapper.map_aligns(
        bam,
        query_fasta,
        genome_fasta,
        accessory_known_features=known_features,
        max_diff=args.max_diff_splice)
    juncs_merged = Adjacency.merge(junc_adjs)
    events_merged = Adjacency.merge(events)

    if args.r2c:
        # Pool junctions and events so read support is found in one call.
        pooled = [
            adj
            for group in (juncs_merged, events_merged) if group
            for adj in group
        ]
        if pooled:
            find_support(pooled,
                         args.r2c,
                         args.query_fasta,
                         num_procs=args.nproc,
                         debug=args.debug)
        if events_merged:
            filter_events(events_merged, args.min_support)

    if genome_bam:
        corroborate_genome(events_merged, genome_bam)

    # Header metadata for the events file: software version, time, command.
    command_line = ' '.join(sys.argv)
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d:%H:%M:%S")
    software = '%s %s' % (pv.__name__, pv.__version__)
    run_info = '%s %s' % (timestamp, command_line)

    mapper.output_mappings(mappings, '%s/mappings.tsv' % args.outdir)
    mapper.output_juncs(juncs_merged, '%s/junctions.bed' % args.outdir)
    mapper.output_events(events_merged,
                         '%s/novel_splicing.bedpe' % args.outdir,
                         header=(software, run_info))
Exemplo n.º 3
0
def main():
    """Command-line entry point: map alignments and write the result files.

    Opens the inputs named on the command line, runs
    ``ExonMapper.map_aligns``, merges the returned junction and event
    adjacencies, optionally gathers read support (``args.r2c``) and
    genome corroboration (``args.genome_bam``), then writes
    ``mappings.tsv``, ``junctions.bed`` and ``novel_splicing.bedpe`` under
    ``args.outdir``.
    """
    args = parse_args()
    bam = create_pysam_bam(args.bam)
    query_fasta = create_pysam_fasta(args.query_fasta)
    genome_fasta = create_pysam_fasta(args.genome_fasta)
    transcripts_dict = Transcript.extract_transcripts(args.gtf)
    annot_tabix = create_pysam_tabix(args.gtf)
    genome_bam = None
    if args.genome_bam:
        genome_bam = create_pysam_bam(args.genome_bam)

    em = ExonMapper(annot_tabix,
                    transcripts_dict,
                    genome_fasta,
                    debug=args.debug)

    # The primary GTF always comes first, followed by any supplementary ones.
    annots = [args.gtf]
    if args.suppl_annot:
        annots.extend(args.suppl_annot)
    accessory_known_features = extract_features(annots)

    mappings, junc_adjs, events = em.map_aligns(
        bam,
        query_fasta,
        genome_fasta,
        accessory_known_features=accessory_known_features,
        max_diff=args.max_diff_splice)
    juncs_merged = Adjacency.merge(junc_adjs)

    events_merged = Adjacency.merge(events)

    if args.r2c:
        # Pool junctions and events so read support is found in one call.
        all_adjs = []
        if juncs_merged:
            all_adjs.extend(juncs_merged)
        if events_merged:
            all_adjs.extend(events_merged)
        if all_adjs:
            find_support(all_adjs,
                         args.r2c,
                         args.query_fasta,
                         num_procs=args.nproc,
                         debug=args.debug)
        if events_merged:
            filter_events(events_merged, args.min_support)

    if genome_bam:
        corroborate_genome(events_merged, genome_bam)

    # Header metadata for the events file: software version, time, command.
    cmd = ' '.join(sys.argv)
    time = datetime.datetime.now().strftime("%Y-%m-%d:%H:%M:%S")
    software = '%s %s' % (pv.__name__, pv.__version__)
    em.output_mappings(mappings, '%s/mappings.tsv' % args.outdir)
    em.output_juncs(juncs_merged, '%s/junctions.bed' % args.outdir)
    em.output_events(events_merged,
                     '%s/novel_splicing.bedpe' % args.outdir,
                     header=(software, '%s %s' % (time, cmd)))
Exemplo n.º 4
0
def main():
    """Command-line entry point: report discordant pairs among unmapped mates.

    Extracts unmapped mates from the ``args.r2c`` alignments, aligns them
    against ``args.transcripts_fasta``, passes the resulting alignments to
    ``find_discordant_pairs`` and writes the returned adjacencies to
    ``discordant_pairs.bedpe`` under ``args.outdir``.  Unless
    ``args.no_cleanup`` is set, ``cleanup`` is then called on the output
    directory.
    """
    args = parse_args()

    # extract unmapped read pairs
    r2c_bam = pysam.AlignmentFile(args.r2c)
    unmapped_fastas = find_unmapped_mates(r2c_bam, args.outdir)

    # align unmapped reads
    unmapped_bam_file = align_unmapped_mates(unmapped_fastas,
                                             args.transcripts_fasta,
                                             args.nthreads,
                                             args.outdir)

    # extract fusions
    unmapped_bam = pysam.AlignmentFile(unmapped_bam_file)
    transcripts_dict = Transcript.extract_transcripts(args.gtf)
    adjs = find_discordant_pairs(unmapped_bam,
                                 transcripts_dict,
                                 pysam.FastaFile(args.genome_fasta),
                                 min_pairs=args.min_pairs)
    Adjacency.report_events(adjs,
                            '{}/discordant_pairs.bedpe'.format(args.outdir))

    if not args.no_cleanup:
        cleanup(args.outdir)
Exemplo n.º 5
0
def main():
    """Command-line entry point: report discordant pairs among unmapped mates.

    Extracts unmapped mates from the ``args.r2c`` alignments, aligns them
    against ``args.transcripts_fasta``, passes the resulting alignments to
    ``find_discordant_pairs`` and writes the returned adjacencies to
    ``discordant_pairs.bedpe`` under ``args.outdir``.  Unless
    ``args.no_cleanup`` is set, ``cleanup`` is then called on the output
    directory.
    """
    args = parse_args()

    # extract unmapped read pairs
    r2c_bam = pysam.AlignmentFile(args.r2c)
    unmapped_fastas = find_unmapped_mates(r2c_bam, args.outdir)

    # align unmapped reads
    unmapped_bam_file = align_unmapped_mates(unmapped_fastas,
                                             args.transcripts_fasta,
                                             args.nthreads,
                                             args.outdir)

    # extract fusions
    unmapped_bam = pysam.AlignmentFile(unmapped_bam_file)
    transcripts_dict = Transcript.extract_transcripts(args.gtf)
    adjs = find_discordant_pairs(unmapped_bam,
                                 transcripts_dict,
                                 pysam.FastaFile(args.genome_fasta),
                                 min_pairs=args.min_pairs)
    Adjacency.report_events(adjs, '%s/discordant_pairs.bedpe' % args.outdir)

    if not args.no_cleanup:
        cleanup(args.outdir)
def get_transcripts(annot, coding_only, genes=None, only_longest=False, coding_first=False):
    """Extract transcripts from ``annot`` and apply the requested filters.

    Args:
        annot: Annotation handed unchanged to
            ``Transcript.extract_transcripts``.
        coding_only: When true, drop transcripts whose ``is_coding()``
            returns a false value.
        genes: Optional ``Set`` of genes to restrict the result to.  It is
            only honoured when it is a non-empty ``Set``; a message is
            printed for each requested gene without any transcript.
        only_longest: When true, keep one transcript per gene, chosen by
            ``get_longest``.
        coding_first: Forwarded to ``get_longest``; only consulted when
            ``only_longest`` is true.

    Returns:
        A list of transcript objects.
    """
    transcripts_dict = Transcript.extract_transcripts(annot)

    if only_longest:
        # Bucket transcripts per gene, then keep get_longest's pick of each.
        by_gene = defaultdict(list)
        for transcript in transcripts_dict.values():
            by_gene[transcript.gene].append(transcript)
        transcripts = [
            get_longest(candidates, coding_first)
            for candidates in by_gene.values()
        ]
    else:
        # Copy into a list so callers get the same type on both branches.
        transcripts = list(transcripts_dict.values())

    if genes and isinstance(genes, Set):
        transcripts = [t for t in transcripts if t.gene in genes]

        # Reported before the coding filter: a gene whose transcripts are
        # all non-coding is therefore not listed as missing.
        captured_genes = Set([t.gene for t in transcripts])
        for gene in genes - captured_genes:
            print("can't find %s from gtf" % gene)

    if coding_only:
        transcripts = [t for t in transcripts if t.is_coding()]

    return transcripts
Exemplo n.º 7
0
def main():
    """Command-line entry point: find events via ``SVFinder`` and report them.

    Runs ``SVFinder.find_events`` on the genome alignments (``args.gbam``)
    and the transcript alignments (``args.tbam``), combines and merges the
    results, optionally filters them by sub-sequence/probe alignment and by
    read support, sets the frame on the surviving events, and writes them to
    ``sv.bedpe`` under ``args.outdir``.
    """
    args = parse_args()

    gbam = create_pysam_bam(args.gbam)
    tbam = create_pysam_bam(args.tbam)
    query_fasta = create_pysam_fasta(args.query_fasta)
    genome_fasta = create_pysam_fasta(args.genome_fasta)
    transcripts_fasta = create_pysam_fasta(args.transcripts_fasta)
    transcripts_dict = Transcript.extract_transcripts(args.gtf)
    annot_tabix = create_pysam_tabix(args.gtf)

    sf = SVFinder(genome_fasta,
                  annot_tabix,
                  transcripts_dict,
                  args.outdir,
                  probe_len=args.probe_len,
                  debug=args.debug)
    events = {'via_genome': {}, 'via_transcripts': {}}
    mappings = {'via_genome': {}, 'via_transcripts': {}}

    # Options that both find_events calls receive unchanged; kept in one
    # place so the genome and transcript passes cannot drift apart.
    shared_opts = dict(
        min_indel_size=args.min_indel_size,
        min_dup_size=args.min_dup_size,
        min_indel_flanking=args.min_indel_flanking,
        no_utr=args.no_utr,
        max_novel_len=args.max_novel_len,
        max_homol_len=args.max_homol_len,
        only_sense_fusion=not args.include_nonsense_fusion,
        only_exon_bound_fusion=not args.include_non_exon_bound_fusion,
        only_coding_fusion=not args.include_noncoding_fusion,
        only_fusions=args.only_fusions,
    )

    if gbam and annot_tabix:
        events['via_genome'], mappings['via_genome'] = sf.find_events(
            gbam,
            query_fasta,
            genome_fasta,
            'genome',
            **shared_opts)

    if tbam:
        # The transcript pass is given the genome mappings and has indel
        # detection switched off.
        events['via_transcripts'], mappings['via_transcripts'] = sf.find_events(
            tbam,
            query_fasta,
            transcripts_fasta,
            'transcripts',
            external_mappings=mappings['via_genome'],
            no_indels=True,
            **shared_opts)

    # combine events from genome and transcriptome alignments
    events_combined = combine_events(events, mappings)

    # merge identical events from different contigs
    events_merged = Adjacency.merge(events_combined)

    # filter by checking probe and subseq alignments
    if events_merged and args.genome_index and len(args.genome_index) == 2:
        if not args.disable_subseq_filtering:
            sf.filter_subseqs(events_merged,
                              query_fasta,
                              args.genome_index[0],
                              args.genome_index[1],
                              args.outdir,
                              subseq_len=args.subseq_len,
                              debug=args.debug)
        if not args.disable_probe_filtering:
            sf.filter_probes(events_merged,
                             args.genome_index[0],
                             args.genome_index[1],
                             args.outdir,
                             args.probe_len,
                             debug=args.debug)

    # read support
    if args.r2c:
        find_support(events_merged,
                     args.r2c,
                     args.query_fasta,
                     min_overlap=args.min_overhang,
                     num_procs=args.nproc,
                     debug=args.debug)
        events_filtered = [
            event for event in events_merged
            if event.spanning >= args.min_support
        ]
    else:
        events_filtered = events_merged

    # determine if events are in- or out-of-frame
    sf.set_frame(events_filtered, query_fasta, genome_fasta)

    # report (with meta data)
    cmd = ' '.join(sys.argv)
    time = datetime.datetime.now().strftime("%Y-%m-%d:%H:%M:%S")
    software = '{} {}'.format(pv.__name__, pv.__version__)
    Adjacency.report_events(events_filtered,
                            '{}/sv.bedpe'.format(args.outdir),
                            sort_by_coord=args.sort_by_coord,
                            header=(software, '{} {}'.format(time, cmd)))
Exemplo n.º 8
0
def main():
    """Command-line entry point: find events via ``SVFinder`` and report them.

    Runs ``SVFinder.find_events`` on the genome alignments (``args.gbam``)
    and the transcript alignments (``args.tbam``), combines and merges the
    results, optionally filters them by sub-sequence/probe alignment and by
    read support, sets the frame on the surviving events, and writes them to
    ``sv.bedpe`` under ``args.outdir``.
    """
    args = parse_args()

    gbam = create_pysam_bam(args.gbam)
    tbam = create_pysam_bam(args.tbam)
    query_fasta = create_pysam_fasta(args.query_fasta)
    genome_fasta = create_pysam_fasta(args.genome_fasta)
    transcripts_fasta = create_pysam_fasta(args.transcripts_fasta)
    transcripts_dict = Transcript.extract_transcripts(args.gtf)
    annot_tabix = create_pysam_tabix(args.gtf)

    sf = SVFinder(genome_fasta,
                  annot_tabix,
                  transcripts_dict,
                  args.outdir,
                  probe_len=args.probe_len,
                  debug=args.debug)
    events = {'via_genome': {}, 'via_transcripts': {}}
    mappings = {'via_genome': {}, 'via_transcripts': {}}

    # Keyword options common to the genome and transcript find_events calls,
    # defined once so the two passes always use the same settings.
    common_kwargs = {
        'min_indel_size': args.min_indel_size,
        'min_dup_size': args.min_dup_size,
        'min_indel_flanking': args.min_indel_flanking,
        'no_utr': args.no_utr,
        'max_novel_len': args.max_novel_len,
        'max_homol_len': args.max_homol_len,
        'only_sense_fusion': not args.include_nonsense_fusion,
        'only_exon_bound_fusion': not args.include_non_exon_bound_fusion,
        'only_coding_fusion': not args.include_noncoding_fusion,
        'only_fusions': args.only_fusions,
    }

    if gbam and annot_tabix:
        events['via_genome'], mappings['via_genome'] = sf.find_events(
            gbam,
            query_fasta,
            genome_fasta,
            'genome',
            **common_kwargs)

    if tbam:
        # The transcript pass is given the genome mappings and has indel
        # detection switched off.
        events['via_transcripts'], mappings['via_transcripts'] = sf.find_events(
            tbam,
            query_fasta,
            transcripts_fasta,
            'transcripts',
            external_mappings=mappings['via_genome'],
            no_indels=True,
            **common_kwargs)

    # combine events from genome and transcriptome alignments
    events_combined = combine_events(events, mappings)

    # merge identical events from different contigs
    events_merged = Adjacency.merge(events_combined)

    # filter by checking probe and subseq alignments
    if events_merged and args.genome_index and len(args.genome_index) == 2:
        if not args.disable_subseq_filtering:
            sf.filter_subseqs(events_merged,
                              query_fasta,
                              args.genome_index[0],
                              args.genome_index[1],
                              args.outdir,
                              subseq_len=args.subseq_len,
                              debug=args.debug)
        if not args.disable_probe_filtering:
            sf.filter_probes(events_merged,
                             args.genome_index[0],
                             args.genome_index[1],
                             args.outdir,
                             args.probe_len,
                             debug=args.debug)

    # read support
    if args.r2c:
        find_support(events_merged,
                     args.r2c,
                     args.query_fasta,
                     min_overlap=args.min_overhang,
                     num_procs=args.nproc,
                     debug=args.debug)
        events_filtered = [
            event for event in events_merged
            if event.spanning >= args.min_support
        ]
    else:
        events_filtered = events_merged

    # determine if events are in- or out-of-frame
    sf.set_frame(events_filtered, query_fasta, genome_fasta)

    # report (with meta data)
    cmd = ' '.join(sys.argv)
    time = datetime.datetime.now().strftime("%Y-%m-%d:%H:%M:%S")
    software = '%s %s' % (pv.__name__, pv.__version__)
    Adjacency.report_events(events_filtered,
                            '%s/sv.bedpe' % args.outdir,
                            sort_by_coord=args.sort_by_coord,
                            header=(software, '%s %s' % (time, cmd)))