Example #1
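All of the examples below are excerpts from a larger module, so they reference imports and module-level helpers defined elsewhere. The following is a minimal sketch of the shared imports they rely on; the cupcake path is taken from the "from cupcake.io.GFF import collapseGFFReader" line that appears later in this collection, and the rest are standard library, Biopython, and bx-python. Helpers such as BioReaders, GTF, get_probe_hit, get_breakpoint_n_seq, and read_demux_fl_count_file are also referenced but not reproduced here.

# Sketch of the shared imports the examples assume; not part of the originals.
import os
import re
import sys
import subprocess
from csv import DictReader, DictWriter
from collections import defaultdict, Counter

from Bio import SeqIO                          # Biopython
from bx.intervals.cluster import ClusterTree   # bx-python
from cupcake.io.GFF import collapseGFFReader, write_collapseGFF_format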
def main(input_prefix):
    input_gff = input_prefix + '.gff'
    if not os.path.exists(input_gff):
        print("Looking for input GFF {0} but not found! Abort!".format(
            input_gff))
        sys.exit(-1)

    f1 = open(input_prefix + '.simple_stats.txt', 'w')
    f1.write("pbid\tlocus\tlength\tnum_exon\n")
    f2 = open(input_prefix + '.exon_stats.txt', 'w')
    f2.write("pbid\texon_index\texon_size\tintron_size\n")
    for r in collapseGFFReader(input_gff):
        f1.write(r.seqid + '\t')
        f1.write(r.seqid.split('.')[1] + '\t')
        sum_len = 0
        for i, e in enumerate(r.ref_exons):
            exon_len = e.end - e.start
            sum_len += exon_len
            f2.write("{0}\t{1}\t{2}\t".format(r.seqid, i + 1, exon_len))
            if i == 0: f2.write("NA\n")
            else: f2.write(str(e.start - r.ref_exons[i - 1].end) + '\n')

        f1.write(str(sum_len) + '\t')
        f1.write(str(len(r.ref_exons)) + '\n')

    f1.close()
    f2.close()
    print("Output written to: {0},{1}\n".format(f1.name, f2.name))
def calc_ontarget_rate(tree,
                       gene_info,
                       input_fasta,
                       is_gtf,
                       sam_or_gtf,
                       output_filename=None):

    # use seq_type rather than shadowing the built-in `type`
    seq_type = 'fasta' if input_fasta.upper().endswith(('.FA', '.FASTA')) else 'fastq'
    query_len_dict = dict(
        (r.id, len(r.seq)) for r in SeqIO.parse(open(input_fasta), seq_type))

    if output_filename is None:
        f = sys.stdout
    else:
        f = open(output_filename, 'w')

    FIELDS = [
        'read_id', 'read_len', 'num_probe', 'num_base_overlap', 'loci', 'genes'
    ]
    writer = DictWriter(f, FIELDS, delimiter='\t')
    writer.writeheader()

    if is_gtf:
        reader = collapseGFFReader(sam_or_gtf)
        for r in reader:
            num_probe, base_hit, genes_seen = get_probe_hit(
                tree, gene_info, r, is_gtf)
            rec = {
                'read_id': r.seqid,
                'read_len': 'NA',
                'num_probe': num_probe,
                'num_base_overlap': base_hit,
                'loci': "{0}:{1}-{2}".format(r.chr, r.start, r.end),
                'genes': ",".join(genes_seen)
            }
            writer.writerow(rec)
    else:
        reader = BioReaders.GMAPSAMReader(sam_or_gtf,
                                          True,
                                          query_len_dict=query_len_dict)
        for r in reader:
            if r.sID == '*': continue
            num_probe, base_hit, genes_seen = get_probe_hit(
                tree, gene_info, r, is_gtf)
            rec = {
                'read_id': r.qID,
                'read_len': r.qLen,
                'num_probe': num_probe,
                'num_base_overlap': base_hit,
                'loci': "{0}:{1}-{2}".format(r.sID, r.sStart, r.sEnd),
                'genes': ",".join(genes_seen)
            }
            writer.writerow(rec)

    if f is not sys.stdout:  # don't close sys.stdout when no output file was given
        f.close()
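calc_ontarget_rate delegates the actual overlap counting to get_probe_hit, which is not part of this excerpt. A minimal sketch of the interface it implies, assuming tree is a per-chromosome dict of probe intervals and gene_info maps probe IDs to gene names (both assumptions, not the original implementation):

# Hypothetical sketch only: the real get_probe_hit lives elsewhere in the
# codebase. Assumes `tree` is {chrom: list of (start, end, probe_id)} and
# `gene_info` is {probe_id: gene_name}.
def get_probe_hit_sketch(tree, gene_info, rec, is_gtf):
    # one genomic span per record; the real version may walk exons instead
    chrom = rec.chr if is_gtf else rec.sID
    lo = rec.start if is_gtf else rec.sStart
    hi = rec.end if is_gtf else rec.sEnd
    probes_hit, base_hit, genes_seen = set(), 0, set()
    for (s, e, probe_id) in tree.get(chrom, []):
        ov = min(hi, e) - max(lo, s)  # overlap in bases, negative if disjoint
        if ov > 0:
            probes_hit.add(probe_id)
            base_hit += ov
            genes_seen.add(gene_info[probe_id])
    return len(probes_hit), base_hit, genes_seen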
Example #3
def read_GFF(gff_filename, logf):
    """
    Read a GFF filename and get the gene regions

    :return: dict of (PB.X) --> LocusInfo
    """
    gff_info = {}  # loci --> LocusInfo
    tmp = {}  # loci PB.X --> list of GFF records for PB.X.Y

    for r in collapseGFFReader(gff_filename):
        m = rex_pbid.match(r.seqid)
        if m is None:
            raise Exception("Expected PBID format PB.X.Y but saw {0}".format(
                r.seqid))
        locus = m.group(1)  # ex: PB.1
        if locus not in tmp:
            tmp[locus] = [r]
            gff_info[locus] = LocusInfo(chrom=r.chr,
                                        strand=r.strand,
                                        regions=None,
                                        isoforms=None)
        else:
            if gff_info[locus].chrom != r.chr:
                logf.write("WARNING: Expected {0} to be on {1} but saw {2}. Could be minimap2 multi-mapping inconsistency for repetitive genes. Check later.\n".format(\
                    r.seqid, gff_info[locus].chrom, r.chr))
            tmp[locus].append(r)

    # now figure out the exonic regions for each gene PB.X
    for locus, records in tmp.items():
        c = ClusterTree(0, 0)
        for r in records:
            for e in r.ref_exons:
                c.insert(max(0, e.start - extra_bp_around_junctions),
                         e.end + extra_bp_around_junctions, 1)

        regions = [(a, b) for (a, b, junk) in c.getregions()]
        regions[0] = (max(0, regions[0][0] - __padding_before_after__),
                      regions[0][1])
        regions[-1] = (max(0, regions[-1][0]),
                       regions[-1][1] + __padding_before_after__)
        gff_info[locus] = LocusInfo(chrom=gff_info[locus].chrom,
                                    strand=gff_info[locus].strand,
                                    regions=regions,
                                    isoforms=[r.seqid for r in records])

    return gff_info
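read_GFF also leans on module-level names outside this excerpt (LocusInfo, rex_pbid, and the two padding constants). A plausible sketch of their definitions; the regex follows the PB.X.Y convention and the locus comment above, while the padding values are illustrative assumptions:

# Assumed module-level definitions for read_GFF (illustrative, not originals).
import re
from collections import namedtuple

LocusInfo = namedtuple('LocusInfo', ['chrom', 'strand', 'regions', 'isoforms'])
rex_pbid = re.compile(r'(PB\.\d+)\.\d+')  # group(1) is the locus, e.g. PB.1
extra_bp_around_junctions = 50            # assumed padding around junctions, in bp
__padding_before_after__ = 100            # assumed padding at locus ends, in bp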
def read_annotation_for_junction_info(gff_filename):
    """
    :param gff_filename: annotation GFF
    :return: dict of (chrom, strand, 'donor' or 'acceptor') --> sorted list of donor or acceptor site. all 0-based.
    """
    d = defaultdict(lambda: set())
    for r in collapseGFFReader(gff_filename):
        if r.strand == '+':
            for i in range(len(r.ref_exons) - 1):
                d[(r.chr, r.strand, 'donor')].add(r.ref_exons[i].end - 1)
                d[(r.chr, r.strand, 'acceptor')].add(r.ref_exons[i + 1].start)
        else:
            for i in range(len(r.ref_exons) - 1):
                d[(r.chr, r.strand, 'acceptor')].add(r.ref_exons[i].end - 1)
                d[(r.chr, r.strand, 'donor')].add(r.ref_exons[i + 1].start)
    for k in d:
        d[k] = list(d[k])
        d[k].sort()
    return d
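Since the returned donor/acceptor lists are sorted, the natural way to consume them is a binary search for the nearest annotated site. A short usage sketch; the function and the 0-based coordinate convention come from above, while the file name and query position are made up:

# Usage sketch: find the annotated donor closest to an observed 0-based donor site.
from bisect import bisect_left

def nearest_site(sorted_sites, pos):
    i = bisect_left(sorted_sites, pos)
    cands = sorted_sites[max(0, i - 1):i + 1]  # neighbors on either side
    return min(cands, key=lambda s: abs(s - pos)) if cands else None

# d = read_annotation_for_junction_info('annotation.gff')  # hypothetical file
# nearest = nearest_site(d[('chr1', '+', 'donor')], 123456)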
Example #6
def read_GFF(gff_filename, logf):
    """
    Read a GFF filename and get the gene regions

    :return: dict of (PB.X) --> LocusInfo
    """
    gff_info = {} # loci --> LocusInfo
    tmp = {} # loci PB.X --> list of GFF records for PB.X.Y

    for r in collapseGFFReader(gff_filename):
        m = rex_pbid.match(r.seqid)
        if m is None:
            raise Exception("Expected PBID format PB.X.Y but saw {0}".format(r.seqid))
        locus = m.group(1) # ex: PB.1
        if locus not in tmp:
            tmp[locus] = [r]
            gff_info[locus] = LocusInfo(chrom=r.chr, strand=r.strand, regions=None, isoforms=None)
        else:
            if gff_info[locus].chrom != r.chr:
                logf.write("WARNING: Expected {0} to be on {1} but saw {2}. Could be minimap2 multi-mapping inconsistency for repetitive genes. Check later.\n".format(
                    r.seqid, gff_info[locus].chrom, r.chr))
            tmp[locus].append(r)


    # now figure out the exonic regions for each gene PB.X
    for locus, records in tmp.items():
        c = ClusterTree(0, 0)
        for r in records:
            for e in r.ref_exons:
                # clamp at 0 so padding cannot produce negative coordinates
                c.insert(max(0, e.start - extra_bp_around_junctions),
                         e.end + extra_bp_around_junctions, 1)

        regions = [(a, b) for (a, b, junk) in c.getregions()]
        regions[0] = (max(0, regions[0][0] - __padding_before_after__),
                      regions[0][1])
        regions[-1] = (regions[-1][0],
                       regions[-1][1] + __padding_before_after__)
        gff_info[locus] = LocusInfo(chrom=gff_info[locus].chrom,
                                    strand=gff_info[locus].strand,
                                    regions=regions,
                                    isoforms=[r.seqid for r in records])

    return gff_info
def collate_info(fusion_prefix,
                 class_filename,
                 genepred_filename,
                 total_fl_count=None,
                 config_filename=None,
                 genome_dict=None,
                 cds_gff_filename=None,
                 min_fl_count=2):

    global_info = {}  # holding information for general information
    if config_filename is not None:
        print("Reading config file {0}...".format(config_filename),
              file=sys.stdout)
        for line in open(config_filename):
            k, v = line.strip().split('=')
            global_info[k] = v

    gene_to_id = {}  # gene name --> ensembl ID
    for line in open(genepred_filename):
        raw = line.strip().split()
        gene_to_id[raw[11]] = raw[0]

    d = defaultdict(
        lambda: {})  # PBfusion.X --> isoform index -> sqanti3 record
    orf_dict = {}
    # read SQANTI3 classification file
    for r in DictReader(open(class_filename), delimiter='\t'):
        m = fusion_pbid.match(r['isoform'])
        if m is None:
            print(
                "ERROR: fusion pbid must follow format `PBfusion.X.Y`. Abort!",
                file=sys.stderr)
            sys.exit(-1)
        gene_index, isoform_index = m.group(1), m.group(2)
        d[gene_index][isoform_index] = r
        orf_dict[r['isoform']] = r['ORF_seq']

    # get sequences
    seq_dict = dict(
        (r.id.split('|')[0], r.seq)
        for r in SeqIO.parse(open(fusion_prefix + '.rep.fa'), 'fasta'))

    # get count information
    count_d = defaultdict(lambda: 'NA')
    count_filename = fusion_prefix + '.abundance.txt'
    if os.path.exists(count_filename):
        for r in DictReader(open(count_filename), delimiter='\t'):
            count_d[r['pbid']] = int(r['count_fl'])

    if total_fl_count is None:
        print(
            "Total FL count not given --- using the sum FL count from fusions only instead.",
            file=sys.stdout)
        total_fl_count = sum(count_d.values())

    # get breakpoint information
    gff_d = defaultdict(
        lambda: {})  # PBfusion.X --> isoform index -> GFF record
    if cds_gff_filename is None:
        gff_filename = fusion_prefix + '.gff'
    else:
        gff_filename = cds_gff_filename

    for r in collapseGFFReader(gff_filename):
        m = fusion_pbid.match(r.seqid)
        if m is None:
            print(
                "ERROR: fusion pbid in {0} must follow format `PBfusion.X.Y`. Abort!"
                .format(gff_filename),
                file=sys.stderr)
            sys.exit(-1)
        gene_index, isoform_index = m.group(1), int(m.group(2))
        gff_d[gene_index][isoform_index] = r
        if r.strand not in ('+', '-'):
            print("ERROR: fusion {0} did not specify strand in {1}! Abort!".
                  format(r.seqid, gff_filename))
            sys.exit(-1)

    fields2 = list(global_info.keys()) + FIELDS
    f = open(fusion_prefix + '.annotated.txt', 'w')
    f_bad = open(fusion_prefix + '.annotated_ignored.txt', 'w')
    writer = DictWriter(f, fields2, delimiter=',')
    writer.writeheader()
    writer_bad = DictWriter(f_bad, fields2, delimiter=',')
    writer_bad.writeheader()

    for gene_index, iso_dict in d.items():
        iso_dict = list(iso_dict.items())  # (isoform index, classification record)
        iso_dict.sort(key=lambda x: int(x[0]))  # numeric sort: '10' must come after '2'
        has_novel = any(r['associated_gene'].startswith('novelGene')
                        or r['associated_gene'] == '' for junk, r in iso_dict)
        pbid = 'PBfusion.' + str(gene_index)

        gff_info = list(gff_d[gene_index].items())
        gff_info.sort(key=lambda x: x[0])

        rec1 = gff_info[0][1]
        rec2 = gff_info[-1][1]
        left_breakpoint, left_seq, right_breakpoint, right_seq = \
            get_breakpoint_n_seq(rec1, rec2, genome_dict)
        left_exon_count = len(rec1.ref_exons)
        right_exon_count = len(rec2.ref_exons)
        gene1 = iso_dict[0][1]['associated_gene']
        gene2 = iso_dict[-1][1]['associated_gene']

        if cds_gff_filename is not None:
            left_cds_exon_count = len(rec1.cds_exons)
            right_cds_exon_count = len(rec2.cds_exons)
        else:
            left_cds_exon_count = 'NA'
            right_cds_exon_count = 'NA'

        left_orf, right_orf = 'NA', 'NA'
        if orf_dict is not None:
            seqid1 = gff_info[0][1].seqid
            seqid2 = gff_info[-1][1].seqid
            left_orf = orf_dict[seqid1]
            right_orf = orf_dict[seqid2]

        info = {
            'UniqueID': pbid,
            'FusionName': "--".join([_r['associated_gene'] for (_index, _r) in iso_dict]),
            'LeftGeneName': gene1,
            'LeftGeneID': gene_to_id[gene1] if gene1 in gene_to_id else 'NA',
            'LeftBreakpoint': left_breakpoint,
            'LeftFlankingSequence': left_seq,
            'RightGeneName': gene2,
            'RightGeneID': gene_to_id[gene2] if gene2 in gene_to_id else 'NA',
            'RightBreakpoint': right_breakpoint,
            'RightFlankingSequence': right_seq,
            'JunctionSupport': 'NA',
            'SpanningReads': count_d[pbid],
            'ReadCountScore': count_d[pbid] * (10**6) / total_fl_count if count_d[pbid] != 'NA' else 'NA',
            'Sequence': seq_dict[pbid],
            'LeftORF': left_orf,
            'RightORF': right_orf,
            'LeftExonCount': left_exon_count,
            'RightExonCount': right_exon_count,
            'LeftCDSExonCount': left_cds_exon_count,
            'RightCDSExonCount': right_cds_exon_count
        }
        info.update(global_info)
        if has_novel or gene1 == gene2 or \
                (info['SpanningReads'] != 'NA' and info['SpanningReads'] < min_fl_count):
            writer_bad.writerow(info)
        else:
            writer.writerow(info)

    f.close()
Example #8
def make_file_for_subsample(input_prefix,
                            output_prefix,
                            demux_file=None,
                            matchAnnot_parsed=None,
                            sqanti_class=None,
                            include_single_exons=False):
    """
    Two files must exist: .abundance.txt and .rep.fq, so we can compute the isoform lengths.
    """
    count_filename = input_prefix + '.abundance.txt'
    fq_filename = input_prefix + '.rep.fq'

    if not include_single_exons:
        from cupcake.io.GFF import collapseGFFReader
        gff_filename = input_prefix + '.gff'
        print("Reading {0} to exclude single exons...".format(gff_filename),
              file=sys.stderr)
        good_ids = set()  # set for O(1) membership tests below
        for r in collapseGFFReader(gff_filename):
            if len(r.ref_exons) >= 2:
                good_ids.add(r.seqid)

    if demux_file is None and not os.path.exists(count_filename):
        print("Cannot find {0}. Abort!".format(count_filename),
              file=sys.stderr)
        sys.exit(-1)

    if not os.path.exists(fq_filename):
        print("Cannot find {0}. Abort!".format(fq_filename), file=sys.stderr)
        sys.exit(-1)

    if matchAnnot_parsed is not None and not os.path.exists(matchAnnot_parsed):
        print("Cannot find {0}. Abort!".format(matchAnnot_parsed),
              file=sys.stderr)
        sys.exit(-1)

    if sqanti_class is not None and not os.path.exists(sqanti_class):
        print("Cannot find {0}. Abort!".format(sqanti_class), file=sys.stderr)
        sys.exit(-1)

    if matchAnnot_parsed is not None:
        match_dict = dict(
            (r['pbid'], r)
            for r in DictReader(open(matchAnnot_parsed), delimiter='\t'))
        for k in match_dict:
            match_dict[k]['category'] = match_dict[k]['score']
    elif sqanti_class is not None:
        print("Reading {0} to get gene/isoform assignment...".format(
            sqanti_class),
              file=sys.stderr)
        match_dict = {}
        for r in DictReader(open(sqanti_class), delimiter='\t'):
            if r['associated_transcript'] == 'novel':
                refisoform = 'novel_' + r['isoform']
            else:
                refisoform = r['associated_transcript']
            match_dict[r['isoform']] = {
                'refgene': r['associated_gene'],
                'refisoform': refisoform,
                'category': r['structural_category']
            }
    else:
        match_dict = None

    seqlen_dict = dict((r.id.split('|')[0], len(r.seq))
                       for r in SeqIO.parse(open(fq_filename), 'fastq'))

    to_write = {}
    if demux_file is None:
        to_write['all'] = {}
        f = open(count_filename)
        while True:
            cur = f.tell()
            if not f.readline().startswith('#'):
                f.seek(cur)
                break
        for r in DictReader(f, delimiter='\t'):
            # check include_single_exons first: good_ids is only defined when it is False
            if include_single_exons or r['pbid'] in good_ids:
                to_write['all'][r['pbid']] = r['count_fl']
    else:
        d, samples = read_demux_fl_count_file(demux_file)
        for s in samples:
            to_write[s] = {}
        for pbid, d2 in d.items():
            for s in samples:
                if include_single_exons or pbid in good_ids:
                    to_write[s][pbid] = d2[s]

    for sample in to_write:
        h = open(output_prefix + '.' + sample + '.txt', 'w')
        if matchAnnot_parsed is None and sqanti_class is None:
            h.write("pbid\tpbgene\tlength\tfl_count\n")
        else:
            h.write(
                "pbid\tpbgene\tlength\trefisoform\trefgene\tcategory\tfl_count\n"
            )
        for pbid in to_write[sample]:
            if matchAnnot_parsed is not None or sqanti_class is not None:
                if pbid not in match_dict:
                    print("Ignoring {0} because it is not in the annotation (SQANTI/MatchAnnot) file.".format(pbid))
                    continue
                m = match_dict[pbid]
                h.write("{0}\t{1}\t{2}\t".format(pbid,
                                                 pbid.split('.')[1],
                                                 seqlen_dict[pbid]))
                h.write("{0}\t{1}\t{2}\t".format(m['refisoform'], m['refgene'],
                                                 m['category']))
            else:
                h.write("{0}\t{1}\t{2}\t".format(pbid,
                                                 pbid.split('.')[1],
                                                 seqlen_dict[pbid]))
            h.write("{0}\n".format(to_write[sample][pbid]))
        h.close()
        print("Output written to {0}.".format(h.name), file=sys.stderr)
Example #9
def sqanti_filter_lite(args):

    fafq_type = 'fasta'
    with open(args.isoforms) as h:
        if h.readline().startswith('@'): fafq_type = 'fastq'

    prefix = args.sqanti_class[:args.sqanti_class.rfind('.')]

    fcsv = open(prefix + '.filtered_lite_reasons.txt', 'w')
    fcsv.write("# classification: {0}\n".format(args.sqanti_class))
    fcsv.write("# isoform: {0}\n".format(args.isoforms))
    fcsv.write("# intrapriming cutoff: {0}\n".format(args.intrapriming))
    fcsv.write("# min_cov cutoff: {0}\n".format(args.min_cov))
    fcsv.write("filtered_isoform,reason\n")

    fout = open(prefix + '.filtered_lite.' + fafq_type, 'w')

    seqids_to_keep = set()
    total_count = 0
    for r in DictReader(open(args.sqanti_class), delimiter='\t'):
        total_count += 1
        filter_flag, filter_msg = False, ""
        percA = float(r['perc_A_downstream_TTS']) / 100
        assert 0 <= percA <= 1
        runA = 0
        while runA < len(r['seq_A_downstream_TTS']):
            if r['seq_A_downstream_TTS'][runA] != 'A':
                break
            runA += 1
        min_cov = float(r['min_cov']) if r['min_cov'] != 'NA' else None
        num_exon = int(r['exons'])
        is_RTS = r['RTS_stage'] == 'TRUE'
        is_canonical = r['all_canonical'] == 'canonical'
        is_monoexonic = (num_exon == 1)

        cat = CATEGORY_DICT[r['structural_category']]

        # the guard and the conversion must test the same field (diff_to_gene_TTS);
        # otherwise int('NA') raises whenever the TSS and TTS fields disagree on being 'NA'
        potential_intrapriming = (percA >= args.intrapriming or runA >= args.runAlength) and \
                                 r['polyA_motif'] == 'NA' and \
                                 (r['diff_to_gene_TTS'] == 'NA' or
                                  abs(int(r['diff_to_gene_TTS'])) > args.max_dist_to_known_end)

        if cat == 'FSM':
            if potential_intrapriming:
                filter_flag, filter_msg = True, "IntraPriming"
            elif args.filter_mono_exonic and is_monoexonic:
                filter_flag, filter_msg = True, "Mono-Exonic"
        else:
            if potential_intrapriming:
                filter_flag, filter_msg = True, "IntraPriming"
            elif args.filter_mono_exonic and is_monoexonic:
                filter_flag, filter_msg = True, "Mono-Exonic"
            elif is_RTS:
                filter_flag, filter_msg = True, "RTSwitching"
            elif (not is_canonical) and (min_cov is None
                                         or min_cov < args.min_cov):
                filter_flag, filter_msg = True, "LowCoverage/Non-Canonical"

        if not filter_flag:
            seqids_to_keep.add(r['isoform'])
        else:
            fcsv.write("{0},{1}\n".format(r['isoform'], filter_msg))

    print("{0} isoforms read from {1}. {2} to be kept.".format(
        total_count, args.sqanti_class, len(seqids_to_keep)),
          file=sys.stdout)

    if not args.skipFaFq:
        for r in SeqIO.parse(open(args.isoforms), fafq_type):
            if r.id in seqids_to_keep:
                SeqIO.write(r, fout, fafq_type)
        fout.close()
        print("Output written to: {0}".format(fout.name), file=sys.stdout)

    # write out a new .classification.txt, .junctions.txt
    outputClassPath = prefix + '.filtered_lite_classification.txt'
    with open(outputClassPath, 'w') as f:
        reader = DictReader(open(args.sqanti_class), delimiter='\t')
        writer = DictWriter(f, reader.fieldnames, delimiter='\t')
        writer.writeheader()
        for r in reader:
            if r['isoform'] in seqids_to_keep:
                writer.writerow(r)
        print("Output written to: {0}".format(f.name), file=sys.stdout)

    if not args.skipJunction:
        outputJuncPath = prefix + '.filtered_lite_junctions.txt'
        with open(outputJuncPath, 'w') as f:
            reader = DictReader(open(
                args.sqanti_class.replace('_classification', '_junctions')),
                                delimiter='\t')
            writer = DictWriter(f, reader.fieldnames, delimiter='\t')
            writer.writeheader()
            for r in reader:
                if r['isoform'] in seqids_to_keep:
                    writer.writerow(r)
            print("Output written to: {0}".format(f.name), file=sys.stdout)

    if not args.skipGTF:
        outputGTF = prefix + '.filtered_lite.gtf'
        with open(outputGTF, 'w') as f:
            for r in collapseGFFReader(args.gtf_file):
                if r.seqid in seqids_to_keep:
                    write_collapseGFF_format(f, r)
            print("Output written to: {0}".format(f.name), file=sys.stdout)

    if args.sam is not None:
        outputSam = prefix + '.filtered_lite.sam'
        with open(outputSam, 'w') as f:
            reader = GMAPSAMReader(args.sam, True)
            f.write(reader.header)
            for r in reader:
                if r.qID in seqids_to_keep:
                    f.write(r.record_line + '\n')
            print("Output written to: {0}".format(f.name), file=sys.stdout)

    if args.faa is not None:
        outputFAA = prefix + '.filtered_lite.faa'
        with open(outputFAA, 'w') as f:
            for r in SeqIO.parse(open(args.faa), 'fasta'):
                if r.id in seqids_to_keep:
                    f.write(">{0}\n{1}\n".format(r.description, r.seq))
        print("Output written to: {0}".format(f.name), file=sys.stdout)

    print("**** Generating SQANTI3 report....", file=sys.stderr)
    # NOTE: outputJuncPath is only defined when --skipJunction is off; the
    # report step implicitly requires the junctions file to have been written
    cmd = RSCRIPTPATH + " {d}/{f} {c} {j} {p} {d}".format(d=utilitiesPath,
                                                          f=RSCRIPT_REPORT,
                                                          c=outputClassPath,
                                                          j=outputJuncPath,
                                                          p="mock")
    # check_call raises CalledProcessError on a non-zero exit; it never
    # returns a failure code, so the error path must catch the exception
    try:
        subprocess.check_call(cmd, shell=True)
    except subprocess.CalledProcessError:
        print("ERROR running command: {0}".format(cmd), file=sys.stderr)
        sys.exit(-1)
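The filter keys off CATEGORY_DICT, which is defined outside this snippet. A plausible sketch mapping SQANTI3 structural-category strings to short labels; only the 'FSM' label is confirmed by the comparison above, so treat the other entries as assumptions:

# Assumed mapping from SQANTI3 classification strings to short labels.
CATEGORY_DICT = {
    'full-splice_match':       'FSM',
    'incomplete-splice_match': 'ISM',
    'novel_in_catalog':        'NIC',
    'novel_not_in_catalog':    'NNC',
    'antisense':               'antisense',
    'intergenic':              'intergenic',
    'genic_intron':            'genic_intron',
    'genic':                   'genic',
    'fusion':                  'fusion',
}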
Example #10
def collate_info(fusion_prefix, class_filename, gtf_filename,
                 total_fl_count=None,
                 config_filename=None,
                 genome_dict=None,
                 cds_gff_filename=None,
                 min_fl_count=2,
                 min_breakpoint_dist_kb=10,
                 include_Mt_genes=False):

    global_info = {}   # holding information for general information
    if config_filename is not None:
        print("Reading config file {0}...".format(config_filename), file=sys.stdout)
        for line in open(config_filename):
            k, v = line.strip().split('=')
            global_info[k] = v

    # in order to get gene name to ensembl gene ID (ENSG), we need the original GTF that was fed to SQANTI3
    gene_to_id = defaultdict(lambda: set()) # gene name --> ensembl ID
    print(f"Reading {gtf_filename} to extract gene name to ENSG ID mapping...")
    gtf_info = GTF(gtf_filename)
    for v in gtf_info.transcript_info.values():
        gene_to_id[v['gname']].add(v['gid'])
    for k in gene_to_id:
        gene_to_id[k] = "_".join(gene_to_id[k])

    d = defaultdict(lambda: {}) # PBfusion.X --> isoform index -> sqanti3 record
    orf_dict = {}
    # read SQANTI3 classification file
    for r in DictReader(open(class_filename), delimiter='\t'):
        m = fusion_pbid.match(r['isoform'])
        if m is None:
            print("ERROR: fusion pbid must follow format `PBfusion.X.Y`. Abort!", file=sys.stderr)
            sys.exit(-1)
        gene_index, isoform_index = m.group(1), m.group(2)
        d[gene_index][isoform_index] = r
        orf_dict[r['isoform']] = r['ORF_seq']

    # get sequences
    seq_dict = dict((r.id.split('|')[0], r.seq) for r in SeqIO.parse(open(fusion_prefix + '.rep.fa'),'fasta'))

    # get count information
    count_d = defaultdict(lambda: 'NA')
    count_filename = fusion_prefix + '.abundance.txt'
    if os.path.exists(count_filename):
        for r in DictReader(open(count_filename), delimiter='\t'):
            count_d[r['pbid']] = int(r['count_fl'])

    if total_fl_count is None:
        print("Total FL count not given --- using the sum FL count from fusions only instead.", file=sys.stdout)
        total_fl_count = sum(count_d.values())

    # get breakpoint information
    gff_d = defaultdict(lambda: {}) # PBfusion.X --> isoform index -> GFF record
    if cds_gff_filename is None:
        gff_filename = fusion_prefix + '.gff'
    else:
        gff_filename = cds_gff_filename

    for r in collapseGFFReader(gff_filename):
        m = fusion_pbid.match(r.seqid)
        if m is None:
            print("ERROR: fusion pbid in {0} must follow format `PBfusion.X.Y`. Abort!".format(gff_filename), file=sys.stderr)
            sys.exit(-1)
        gene_index, isoform_index = m.group(1), int(m.group(2))
        gff_d[gene_index][isoform_index] = r
        if r.strand not in ('+', '-'):
            print("ERROR: fusion {0} did not specify strand in {1}! Abort!".format(r.seqid, gff_filename), file=sys.stderr)
            sys.exit(-1)

    fields2 = list(global_info.keys()) + FIELDS
    f = open(fusion_prefix + '.annotated.txt', 'w')
    f_bad = open(fusion_prefix + '.annotated_ignored.txt', 'w')
    writer = DictWriter(f, fields2, delimiter=',')
    writer.writeheader()
    writer_bad = DictWriter(f_bad, fields2, delimiter=',')
    writer_bad.writeheader()

    for gene_index, iso_dict in d.items():
        iso_dict = list(iso_dict.items())  # (isoform index, classification record)
        iso_dict.sort(key=lambda x: int(x[0]))  # numeric sort: '10' must come after '2'
        has_novel = any(r['associated_gene'].startswith('novelGene') or r['associated_gene']=='' for junk,r in iso_dict)
        pbid = 'PBfusion.' + str(gene_index)

        gff_info = list(gff_d[gene_index].items())
        gff_info.sort(key=lambda x: x[0])

        rec1 = gff_info[0][1]
        rec2 = gff_info[-1][1]
        left_breakpoint, left_seq, right_breakpoint, right_seq = \
            get_breakpoint_n_seq(rec1, rec2, genome_dict)
        left_exon_count = len(rec1.ref_exons)
        right_exon_count = len(rec2.ref_exons)
        gene1 = iso_dict[0][1]['associated_gene']
        gene2 = iso_dict[-1][1]['associated_gene']

        if cds_gff_filename is not None:
            left_cds_exon_count = len(rec1.cds_exons)
            right_cds_exon_count = len(rec2.cds_exons)
        else:
            left_cds_exon_count = 'NA'
            right_cds_exon_count = 'NA'

        left_orf, right_orf = 'NA', 'NA'
        if orf_dict is not None:
            seqid1 = gff_info[0][1].seqid
            seqid2 = gff_info[-1][1].seqid
            left_orf = orf_dict[seqid1]
            right_orf = orf_dict[seqid2]

        info = {'UniqueID': pbid,
                'FusionName': "--".join([_r['associated_gene'] for (_index,_r) in iso_dict]),
                'LeftGeneName': gene1,
                'LeftGeneID': gene_to_id[gene1] if gene1 in gene_to_id else 'NA',
                'LeftBreakpoint': left_breakpoint,
                'LeftFlankingSequence': left_seq,
                'RightGeneName': gene2,
                'RightGeneID': gene_to_id[gene2] if gene2 in gene_to_id else 'NA',
                'RightBreakpoint': right_breakpoint,
                'RightFlankingSequence': right_seq,
                'JunctionSupport': 'NA',
                'SpanningReads': count_d[pbid],
                'ReadCountScore': count_d[pbid]*(10**6)/total_fl_count if count_d[pbid] != 'NA' else 'NA',
                'Sequence': seq_dict[pbid],
                'LeftORF': left_orf,
                'RightORF': right_orf,
                'LeftExonCount': left_exon_count,
                'RightExonCount': right_exon_count,
                'LeftCDSExonCount': left_cds_exon_count,
                'RightCDSExonCount': right_cds_exon_count,
                'Comments': 'PASS'}
        info.update(global_info)

        left_chr, left_break, left_strand = left_breakpoint.split(':')
        right_chr, right_break, right_strand = right_breakpoint.split(':')

        if has_novel:
            info['Comments'] = 'FAIL:NovelGene'
        elif gene1==gene2:
            info['Comments'] = 'FAIL:SameGene'
        elif (info['SpanningReads']!='NA' and info['SpanningReads'] < min_fl_count):
            info['Comments'] = 'FAIL:TooFewFLReads'
        elif (not include_Mt_genes and (gene1.startswith('MT-') or gene2.startswith('MT-'))):
            info['Comments'] = 'FAIL:MtGenes'
        elif (left_chr==right_chr and abs(int(left_break)-int(right_break))/1000<=min_breakpoint_dist_kb):
            info['Comments'] = 'FAIL:BreakpointTooClose'
#        elif (left_exon_count==1 and left_orf=='NA'):
#            info['Comments'] = 'PASS:LeftExonNoORF'
#        elif (right_exon_count==1 and right_orf=='NA'):
#            info['Comments'] = 'PASS:RightExonNoORF'

        if info['Comments'].startswith('FAIL:'):
            writer_bad.writerow(info)
        else:
            writer.writerow(info)

    f.close()
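get_breakpoint_n_seq is external to this snippet, but the code above fixes part of its contract: breakpoints are 'chrom:pos:strand' strings (they are split on ':' right below), plus flanking sequences when genome_dict is supplied. A hedged sketch consistent with that contract; the end-selection rule and flank length are assumptions:

# Hypothetical sketch: the real implementation picks the fusion-junction ends.
# Here we take the 3' end of the left record and the 5' end of the right one
# under a + convention, and pull a few flanking bases if a genome is given
# (assuming genome_dict is a SeqIO.to_dict-style {chrom: SeqRecord}).
def get_breakpoint_n_seq_sketch(rec1, rec2, genome_dict, flank=10):
    left_pos = rec1.end if rec1.strand == '+' else rec1.start
    right_pos = rec2.start if rec2.strand == '+' else rec2.end
    left_bp = "{0}:{1}:{2}".format(rec1.chr, left_pos, rec1.strand)
    right_bp = "{0}:{1}:{2}".format(rec2.chr, right_pos, rec2.strand)
    left_seq = right_seq = 'NA'
    if genome_dict is not None:
        left_seq = str(genome_dict[rec1.chr].seq[left_pos:left_pos + flank])
        right_seq = str(genome_dict[rec2.chr].seq[max(0, right_pos - flank):right_pos])
    return left_bp, left_seq, right_bp, right_seq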
def make_file_for_subsample(input_prefix,
                            output_filename,
                            matchAnnot_parsed=None,
                            sqanti_class=None,
                            include_single_exons=False):
    """
    Two files must exist: .abundance.txt and .rep.fq, so we can compute the isoform lengths.
    """
    count_filename = input_prefix + '.abundance.txt'
    fq_filename = input_prefix + '.rep.fq'

    if not include_single_exons:
        from cupcake.io.GFF import collapseGFFReader
        gff_filename = input_prefix + '.gff'
        print >> sys.stderr, "Reading {0} to exclude single exons...".format(
            gff_filename)
        good_ids = []
        for r in collapseGFFReader(gff_filename):
            if len(r.ref_exons) >= 2:
                good_ids.append(r.seqid)

    if not os.path.exists(count_filename):
        print("Cannot find {0}. Abort!".format(count_filename), file=sys.stderr)
        sys.exit(-1)

    if not os.path.exists(fq_filename):
        print("Cannot find {0}. Abort!".format(fq_filename), file=sys.stderr)
        sys.exit(-1)

    if matchAnnot_parsed is not None and not os.path.exists(matchAnnot_parsed):
        print("Cannot find {0}. Abort!".format(matchAnnot_parsed), file=sys.stderr)
        sys.exit(-1)

    if sqanti_class is not None and not os.path.exists(sqanti_class):
        print("Cannot find {0}. Abort!".format(sqanti_class), file=sys.stderr)
        sys.exit(-1)

    if matchAnnot_parsed is not None:
        match_dict = dict(
            (r['pbid'], r)
            for r in DictReader(open(matchAnnot_parsed), delimiter='\t'))
    elif sqanti_class is not None:
        print("Reading {0} to get gene/isoform assignment...".format(sqanti_class),
              file=sys.stderr)
        match_dict = {}
        for r in DictReader(open(sqanti_class), delimiter='\t'):
            if r['associated_transcript'] == 'novel':
                refisoform = 'novel_' + r['isoform']
            else:
                refisoform = r['associated_transcript']
            match_dict[r['isoform']] = {
                'refgene': r['associated_gene'],
                'refisoform': refisoform
            }
    else:
        match_dict = None

    seqlen_dict = dict((r.id.split('|')[0], len(r.seq))
                       for r in SeqIO.parse(open(fq_filename), 'fastq'))

    h = open(output_filename, 'w')
    if matchAnnot_parsed is None and sqanti_class is None:
        h.write("pbid\tpbgene\tlength\tfl_count\n")
    else:
        h.write("pbid\tpbgene\tlength\trefisoform\trefgene\tfl_count\n")
    f = open(count_filename)
    while True:
        cur = f.tell()
        if not f.readline().startswith('#'):
            f.seek(cur)
            break
    for r in DictReader(f, delimiter='\t'):
        if not include_single_exons and r['pbid'] not in good_ids:
            print("Excluding {0} because it is single-exon.".format(r['pbid']),
                  file=sys.stderr)
            continue
        h.write("{0}\t{1}\t{2}\t".format(r['pbid'], r['pbid'].split('.')[1],
                                         seqlen_dict[r['pbid']]))
        if matchAnnot_parsed is not None or sqanti_class is not None:
            m = match_dict[r['pbid']]
            h.write("{0}\t{1}\t".format(m['refisoform'], m['refgene']))
        h.write("{0}\n".format(r['count_fl']))
    h.close()

    print >> sys.stderr, "Output written to {0}.".format(output_filename)
Example #12
def main(corrected_csv,
         cluster_info,
         output_prefix,
         fasta_file=None,
         gff_file=None,
         faa_file=None):

    # read corrected CSV
    reader = DictReader(open(corrected_csv), delimiter='\t')
    for k in CORRECTED_CSV_FILELDS:
        if k not in reader.fieldnames:
            print("The following fields must exist in {0}!\n{1}".format(
                corrected_csv, "\n".join(CORRECTED_CSV_FILELDS)))
            sys.exit(-1)

    per_unique = {}  # tag -> record
    per_unique_count = Counter()  # tag -> number of duplicates
    # plain dict: entries are always created explicitly below, so the old
    # defaultdict default (a list where a set is needed) never fired
    per_pbid = {}  # pbid --> {'gene': ..., 'transcript': ..., 'clusters': set of clusters it is in}
    for r in reader:
        tag = "{bc}-{umi}-{gene}".format(bc=r['BC_ed'],
                                         umi=r['UMI_ed'],
                                         gene=r['gene'])
        per_unique[tag] = r
        per_unique_count[tag] += 1

    # now link barcode to cell type, also PCR dup counts
    for tag in per_unique:
        c = cluster_info[per_unique[tag]['BC_ed']]
        rec = per_unique[tag]
        rec['cluster'] = c
        rec['num_dups'] = per_unique_count[tag]
        pbid = rec['pbid']
        if pbid in per_pbid:
            per_pbid[pbid]['clusters'].add(c)
        else:
            per_pbid[pbid] = {
                'gene': rec['gene'],
                'transcript': rec['transcript'],
                'clusters': set([c])
            }

    # write out de-dup CSV file
    with open(output_prefix + '.csv', 'w') as f:
        writer = DictWriter(f,
                            CORRECTED_CSV_FILELDS + ['cluster', 'num_dups'],
                            delimiter='\t',
                            extrasaction='ignore')
        writer.writeheader()
        keys = per_unique.keys()
        for k in sorted(keys):
            writer.writerow(per_unique[k])

    if fasta_file is not None:
        f_d = {}  # cluster --> file handle
        # write pbid master file
        with open(output_prefix + '.fasta', 'w') as f:
            for r in SeqIO.parse(open(fasta_file), 'fasta'):
                if r.id in per_pbid:
                    newid = "{pbid}|{gene}|{transcript}|{clusters}".format(\
                            pbid=r.id,
                            gene=per_pbid[r.id]['gene'],
                            transcript=per_pbid[r.id]['transcript'],
                            clusters=";".join(per_pbid[r.id]['clusters']))
                    f.write(">{0}\n{1}\n".format(newid, r.seq))
                    for c in per_pbid[r.id]['clusters']:
                        if c not in f_d:
                            f_d[c] = open(
                                "{o}.{c}.fasta".format(o=output_prefix, c=c),
                                'w')
                        f_d[c].write(">{0}\n{1}\n".format(newid, r.seq))
        for handle in f_d.values():  # close per-cluster handles, as the other branches do
            handle.close()

    if faa_file is not None:
        f_d = {}  # cluster --> file handle
        # write pbid master file
        with open(output_prefix + '.faa', 'w') as f:
            for r in SeqIO.parse(open(faa_file), 'fasta'):
                if r.id in per_pbid:
                    newid = "{pbid}|{gene}|{transcript}|{clusters}".format(\
                            pbid=r.id,
                            gene=per_pbid[r.id]['gene'],
                            transcript=per_pbid[r.id]['transcript'],
                            clusters=";".join(per_pbid[r.id]['clusters']))
                    f.write(">{0}\n{1}\n".format(newid, r.seq))
                    for c in per_pbid[r.id]['clusters']:
                        if c not in f_d:
                            f_d[c] = open(
                                "{o}.{c}.faa".format(o=output_prefix, c=c),
                                'w')
                        f_d[c].write(">{0}\n{1}\n".format(newid, r.seq))
        for handle in f_d.values():
            handle.close()

    if gff_file is not None:
        f_d = {}  # cluster --> file handle
        # write pbid master file
        with open(output_prefix + '.gff', 'w') as f:
            for r in collapseGFFReader(gff_file):
                if r.seqid in per_pbid:
                    newid = "{pbid}|{gene}|{transcript}|{clusters}".format(\
                            pbid=r.seqid,
                            gene=per_pbid[r.seqid]['gene'],
                            transcript=per_pbid[r.seqid]['transcript'],
                            clusters=";".join(per_pbid[r.seqid]['clusters']))
                    write_collapseGFF_format(f, r)
                    for c in per_pbid[r.seqid]['clusters']:
                        if c not in f_d:
                            f_d[c] = open(
                                "{o}.{c}.gff".format(o=output_prefix, c=c),
                                'w')
                        write_collapseGFF_format(f_d[c], r)
        for handle in f_d.values():
            handle.close()
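main() validates the input against CORRECTED_CSV_FILELDS (the misspelling is the constant's actual name and is kept as-is), which is defined elsewhere. From the columns the loop actually reads, the list must at least cover the following; the real constant may list more:

# Minimum columns implied by the code above; an assumption, not the original.
CORRECTED_CSV_FILELDS = ['pbid', 'gene', 'transcript', 'BC_ed', 'UMI_ed']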
def make_file_for_subsample(input_prefix, output_filename,
                            matchAnnot_parsed=None, sqanti_class=None,
                            include_single_exons=False):
    """
    Two files must exist: .abundance.txt and .rep.fq, so we can compute the isoform lengths.
    """
    count_filename = input_prefix + '.abundance.txt'
    fq_filename = input_prefix + '.rep.fq'

    if not include_single_exons:
        from cupcake.io.GFF import collapseGFFReader
        gff_filename = input_prefix + '.gff'
        print >> sys.stderr, "Reading {0} to exclude single exons...".format(gff_filename)
        good_ids = []
        for r in collapseGFFReader(gff_filename):
            if len(r.ref_exons) >= 2:
                good_ids.append(r.seqid)

    if not os.path.exists(count_filename):
        print("Cannot find {0}. Abort!".format(count_filename), file=sys.stderr)
        sys.exit(-1)

    if not os.path.exists(fq_filename):
        print("Cannot find {0}. Abort!".format(fq_filename), file=sys.stderr)
        sys.exit(-1)

    if matchAnnot_parsed is not None and not os.path.exists(matchAnnot_parsed):
        print("Cannot find {0}. Abort!".format(matchAnnot_parsed), file=sys.stderr)
        sys.exit(-1)

    if sqanti_class is not None and not os.path.exists(sqanti_class):
        print("Cannot find {0}. Abort!".format(sqanti_class), file=sys.stderr)
        sys.exit(-1)

    if matchAnnot_parsed is not None:
        match_dict = dict((r['pbid'],r) for r in DictReader(open(matchAnnot_parsed), delimiter='\t'))
    elif sqanti_class is not None:
        print >> sys.stderr, "Reading {0} to get gene/isoform assignment...".format(sqanti_class)
        match_dict = {}
        for r in DictReader(open(sqanti_class), delimiter='\t'):
            if r['associated_transcript'] == 'novel':
                refisoform = 'novel_'+r['isoform']
            else:
                refisoform = r['associated_transcript']
            match_dict[r['isoform']] = {'refgene': r['associated_gene'],
                                        'refisoform': refisoform}
    else:
        match_dict = None

    seqlen_dict = dict((r.id.split('|')[0],len(r.seq)) for r in SeqIO.parse(open(fq_filename),'fastq'))
    
    h = open(output_filename, 'w')
    if matchAnnot_parsed is None and sqanti_class is None:
        h.write("pbid\tpbgene\tlength\tfl_count\n")
    else:
        h.write("pbid\tpbgene\tlength\trefisoform\trefgene\tfl_count\n")
    f = open(count_filename)
    while True:
        cur = f.tell()
        if not f.readline().startswith('#'):
            f.seek(cur)
            break
    for r in DictReader(f, delimiter='\t'):
        if not include_single_exons and r['pbid'] not in good_ids:
            print("Excluding {0} because it is single-exon.".format(r['pbid']),
                  file=sys.stderr)
            continue

        if matchAnnot_parsed is not None or sqanti_class is not None:
            if r['pbid'] not in match_dict:
                print >> sys.stdout, "Ignoring {0} because not on annotation (SQANTI/MatchAnnot) file.".format(r['pbid'])
                continue
            m = match_dict[r['pbid']]
            h.write("{0}\t{1}\t{2}\t".format(r['pbid'], r['pbid'].split('.')[1], seqlen_dict[r['pbid']]))
            h.write("{0}\t{1}\t".format(m['refisoform'], m['refgene']))
        else:
            h.write("{0}\t{1}\t{2}\t".format(r['pbid'], r['pbid'].split('.')[1], seqlen_dict[r['pbid']]))
        h.write("{0}\n".format(r['count_fl']))
    h.close()

    print >> sys.stderr, "Output written to {0}.".format(output_filename)