def nominate_spanning_reads(discordant_reads_fh, chimeras_fh, fastq_fh):
    """Write candidate spanning reads for chimera junctions to ``fastq_fh``.

    The chimera candidates in ``chimeras_fh`` are indexed first: each 5'
    transcript name maps to the list of ``mate5p.end`` positions and each 3'
    transcript name maps to the list of ``mate3p.start`` positions.  Every
    non-genome discordant fragment is then handed to ``check_fragment``
    together with both indexes, and the first non-None read1 / read2 it
    returns for a given query name is written out (read1 before read2).

    Fragments are grouped by comparing ``qname`` with the previous
    fragment only, so records sharing a query name must be adjacent in
    ``discordant_reads_fh``.

    Args:
        discordant_reads_fh: input consumed by ``parse_discordant_reads``.
        chimeras_fh: input consumed by ``Chimera.parse``.
        fastq_fh: writable file-like object; each nominated read is written
            followed by a newline (presumably a FASTQ record -- confirm
            against ``check_fragment``).
    """
    def write_found(*reads):
        # Emit whichever mates were nominated for the current query name.
        # file.write works on both Python 2 and 3, unlike ``print >>fh``.
        for read in reads:
            if read is not None:
                fastq_fh.write("%s\n" % (read,))

    # build index of chimera candidates
    logging.info("Indexing chimera candidates")
    tx5p = collections.defaultdict(list)
    tx3p = collections.defaultdict(list)
    for chimera in Chimera.parse(chimeras_fh):
        tx5p[chimera.mate5p.tx_name].append(chimera.mate5p.end)
        tx3p[chimera.mate3p.tx_name].append(chimera.mate3p.start)
    # parse discordant reads
    logging.info("Nominating spanning reads")
    read1, read2 = None, None
    prev_qname = None
    for frag in parse_discordant_reads(discordant_reads_fh):
        if frag.discordant_type.is_genome:
            continue
        qname = frag.qname
        if prev_qname is not None and qname != prev_qname:
            # query name changed: flush the previous group and start over
            write_found(read1, read2)
            read1, read2 = None, None
        prev_qname = qname
        # skip if reads already found
        if read1 is not None and read2 is not None:
            continue
        # keep the first nominated copy of each mate
        r1, r2 = check_fragment(frag, tx5p, tx3p)
        if read1 is None:
            read1 = r1
        if read2 is None:
            read2 = r2
    # flush the final group
    write_found(read1, read2)
Exemplo n.º 2
0
def filter_encompassing_chimeras(input_file, output_file, gene_file,
                                 max_multimap=1,
                                 multimap_cov_ratio=0.10,
                                 max_isize=1000,
                                 strand_pval=0.01,
                                 keep_overlap=False):
    """Filter chimeras in ``input_file`` and write survivors to ``output_file``.

    Works in two passes.  The first pass applies the basic filters
    (multimapping, insert size, optionally overlap, strand balance) and
    writes the passing chimeras to a temporary ``.bedpe`` file created next
    to ``output_file``.  The second pass re-reads that temporary file,
    stores the values returned by ``calc_permiscuity`` in ``mate5p.frac``
    and ``mate3p.frac`` of each chimera, and writes the result to
    ``output_file`` as tab-separated lines.

    All files are closed and the temporary file is removed even when a
    step raises.

    Args:
        input_file: path of the chimera file read by ``Chimera.parse``.
        output_file: path of the file to write; its directory also hosts
            the temporary file.
        gene_file: path of the file given to ``build_gene_to_genome_map``.
        max_multimap: passed to ``filter_multimapping`` (logged as "must
            have a read with <= N multimaps").
        multimap_cov_ratio: passed to ``filter_multimapping`` (logged as
            "coverage to reads ratio >= X").
        max_isize: passed to ``filter_insert_size`` (logged as
            "insert size < N").
        strand_pval: passed to ``filter_strand_balance`` (logged as
            "strand balance p-value > X").
        keep_overlap: when true, ``filter_overlapping`` is not applied.
    """
    logging.debug("Filtering chimeras")
    logging.debug("Must have a read with <= %d multimaps", max_multimap)
    logging.debug("Coverage to reads ratio >= %f", multimap_cov_ratio)
    logging.debug("Insert size < %d", max_isize)
    logging.debug("Strand balance p-value > %f", strand_pval)
    tmpfile1 = make_temp(base_dir=os.path.dirname(output_file),
                         suffix='.bedpe')
    try:
        # first perform basic filtering
        with open(input_file) as in_fh:
            with open(tmpfile1, "w") as tmp_fh:
                for c in Chimera.parse(in_fh):
                    if _passes_encompassing_filters(c, max_multimap,
                                                    multimap_cov_ratio,
                                                    max_isize, strand_pval,
                                                    keep_overlap):
                        tmp_fh.write('\t'.join(map(str, c.to_list())) + '\n')
        logging.debug("Building gene/genome index")
        with open(gene_file) as gene_fh:
            ggmap = build_gene_to_genome_map(gene_fh)
        logging.debug("Finding junction permiscuity")
        juncmap5p, juncmap3p = collect_permiscuity_stats(tmpfile1, ggmap)
        # second pass: annotate each surviving chimera and write it out
        with open(tmpfile1) as tmp_fh:
            with open(output_file, "w") as out_fh:
                for c in Chimera.parse(tmp_fh):
                    frac5p, frac3p = calc_permiscuity(c, juncmap5p,
                                                      juncmap3p, ggmap)
                    c.mate5p.frac = frac5p
                    c.mate3p.frac = frac3p
                    out_fh.write('\t'.join(map(str, c.to_list())) + '\n')
    finally:
        # delete tmp files, including on failure; the existence check keeps
        # a cleanup error from masking the original exception
        if os.path.exists(tmpfile1):
            os.remove(tmpfile1)


def _passes_encompassing_filters(c, max_multimap, multimap_cov_ratio,
                                 max_isize, strand_pval, keep_overlap):
    """Return a truthy value when chimera ``c`` passes every basic filter.

    Filters run in a fixed order and stop at the first failure, so later
    filters are never called on a chimera that was already rejected.
    """
    if not filter_multimapping(c, max_multimap=max_multimap,
                               multimap_cov_ratio=multimap_cov_ratio):
        return False
    if not filter_insert_size(c, max_isize):
        return False
    if not keep_overlap and not filter_overlapping(c):
        return False
    return filter_strand_balance(c, strand_pval)
def nominate_spanning_reads(discordant_reads_fh,
                            chimeras_fh,
                            fastq_fh):
    """Write candidate spanning reads for chimera junctions to ``fastq_fh``.

    The chimera candidates in ``chimeras_fh`` are indexed first: each 5'
    transcript name maps to the list of ``mate5p.end`` positions and each 3'
    transcript name maps to the list of ``mate3p.start`` positions.  Every
    non-genome discordant fragment is then handed to ``check_fragment``
    together with both indexes, and the first non-None read1 / read2 it
    returns for a given query name is written out (read1 before read2).

    Fragments are grouped by comparing ``qname`` with the previous
    fragment only, so records sharing a query name must be adjacent in
    ``discordant_reads_fh``.

    Args:
        discordant_reads_fh: input consumed by ``parse_discordant_reads``.
        chimeras_fh: input consumed by ``Chimera.parse``.
        fastq_fh: writable file-like object; each nominated read is written
            followed by a newline (presumably a FASTQ record -- confirm
            against ``check_fragment``).
    """
    def write_found(*reads):
        # Emit whichever mates were nominated for the current query name.
        # file.write works on both Python 2 and 3, unlike ``print >>fh``.
        for read in reads:
            if read is not None:
                fastq_fh.write("%s\n" % (read,))

    # build index of chimera candidates
    logging.info("Indexing chimera candidates")
    tx5p = collections.defaultdict(list)
    tx3p = collections.defaultdict(list)
    for chimera in Chimera.parse(chimeras_fh):
        tx5p[chimera.mate5p.tx_name].append(chimera.mate5p.end)
        tx3p[chimera.mate3p.tx_name].append(chimera.mate3p.start)
    # parse discordant reads
    logging.info("Nominating spanning reads")
    read1, read2 = None, None
    prev_qname = None
    for frag in parse_discordant_reads(discordant_reads_fh):
        if frag.discordant_type.is_genome:
            continue
        qname = frag.qname
        if prev_qname is not None and qname != prev_qname:
            # query name changed: flush the previous group and start over
            write_found(read1, read2)
            read1, read2 = None, None
        prev_qname = qname
        # skip if reads already found
        if read1 is not None and read2 is not None:
            continue
        # keep the first nominated copy of each mate
        r1, r2 = check_fragment(frag, tx5p, tx3p)
        if read1 is None:
            read1 = r1
        if read2 is None:
            read2 = r2
    # flush the final group
    write_found(read1, read2)
Exemplo n.º 4
0
def collect_permiscuity_stats(input_file, ggmap):
    """Build the 5' and 3' junction permiscuity maps for a chimera file.

    Opens ``input_file``, parses it with ``Chimera.parse`` and passes the
    parsed chimeras together with ``ggmap`` to
    ``build_junc_permiscuity_map``.  The file is closed before returning.

    Args:
        input_file: path of the chimera file to read.
        ggmap: forwarded unchanged to ``build_junc_permiscuity_map``
            (presumably the gene-to-genome map -- confirm with callers).

    Returns:
        The ``(juncmap5p, juncmap3p)`` pair produced by
        ``build_junc_permiscuity_map``.
    """
    # break name into 5'/3' genes linked in a dictionary
    logging.debug("Building chimera permiscuity map")
    with open(input_file) as chimera_fh:
        juncmap5p, juncmap3p = build_junc_permiscuity_map(
            Chimera.parse(chimera_fh), ggmap)
    return juncmap5p, juncmap3p