import re
import sys
from collections import Counter, defaultdict
from csv import DictReader, DictWriter
from pathlib import Path
from typing import List, NamedTuple, Optional, Tuple

from Bio import SeqIO  # Biopython
from bx.intervals.cluster import ClusterTree  # bx-python

# The following come from the surrounding package (cDNA_Cupcake-style layout);
# adjust the module paths to your local checkout. `get_probe_hit`,
# `read_demux_fl_count_file`, `get_breakpoint_n_seq`, and `logger` are assumed
# to be defined elsewhere in the same package.
from cupcake.sequence import BioReaders
from cupcake.sequence.GFF import collapseGFFReader, write_collapseGFF_format


def calc_ontarget_rate(tree,
                       gene_info,
                       input_fasta,
                       is_gtf,
                       sam_or_gtf,
                       output_filename=None):
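    """
    Tally probe hits per read and write a per-read on-target report (TSV).

    Reads either a collapsed GFF (is_gtf=True) or a GMAP/minimap2 SAM
    (is_gtf=False) from `sam_or_gtf`, looks up probe overlaps via
    `get_probe_hit`, and writes one row per aligned read to `output_filename`
    (or stdout if None).
    """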

    feature = ("fasta" if input_fasta.upper().endswith(".FA")
               or input_fasta.upper().endswith(".FASTA") else "fastq")
    query_len_dict = {
        r.id: len(r.seq)
        for r in SeqIO.parse(open(input_fasta), feature)
    }

    if output_filename is None:
        f = sys.stdout
    else:
        f = open(output_filename, "w")

    FIELDS = [
        "read_id", "read_len", "num_probe", "num_base_overlap", "loci", "genes"
    ]
    writer = DictWriter(f, FIELDS, delimiter="\t")
    writer.writeheader()

    if is_gtf:
        reader = collapseGFFReader(sam_or_gtf)
        for r in reader:
            num_probe, base_hit, genes_seen = get_probe_hit(
                tree, gene_info, r, is_gtf)
            rec = {
                "read_id": r.seqid,
                "read_len": "NA",
                "num_probe": num_probe,
                "num_base_overlap": base_hit,
                "loci": f"{r.chr}:{r.start}-{r.end}",
                "genes": ",".join(genes_seen),
            }
            writer.writerow(rec)
    else:
        reader = BioReaders.GMAPSAMReader(sam_or_gtf,
                                          True,
                                          query_len_dict=query_len_dict)
        for r in reader:
            if r.sID == "*":
                continue
            num_probe, base_hit, genes_seen = get_probe_hit(
                tree, gene_info, r, is_gtf)
            rec = {
                "read_id": r.qID,
                "read_len": r.qLen,
                "num_probe": num_probe,
                "num_base_overlap": base_hit,
                "loci": f"{r.sID}:{r.sStart}-{r.sEnd}",
                "genes": ",".join(genes_seen),
            }
            writer.writerow(rec)

    if f is not sys.stdout:
        f.close()
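
# `read_GFF` below depends on a few module-level names from the original
# source. Minimal sketches, assuming cDNA_Cupcake-style conventions; the real
# regex and padding values may differ -- treat these as placeholders.
class LocusInfo(NamedTuple):
    chrom: str
    strand: str
    regions: Optional[List[Tuple[int, int]]]
    isoforms: Optional[List[str]]


rex_pbid = re.compile(r"(PB.\d+).(\d+)")  # assumed PB.X.Y naming
extra_bp_around_junctions = 50  # assumed default padding around junctions
__padding_before_after__ = 10  # assumed default padding at locus ends
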
def read_GFF(gff_filename):
    """
    Read a GFF filename and get the gene regions

    :return: dict of (PB.X) --> LocusInfo
    """
    gff_info = {}  # loci --> LocusInfo
    tmp = {}  # loci PB.X --> list of GFF records for PB.X.Y

    for r in collapseGFFReader(gff_filename):
        m = rex_pbid.match(r.seqid)
        if m is None:
            raise Exception(f"Expected PBID format PB.X.Y but saw {r.seqid}")
        locus = m.group(1)  # ex: PB.1
        if locus not in tmp:
            tmp[locus] = [r]
            gff_info[locus] = LocusInfo(chrom=r.chr,
                                        strand=r.strand,
                                        regions=None,
                                        isoforms=None)
        else:
            if gff_info[locus].chrom != r.chr:
                logger.warning(
                    f"Expected {r.seqid} to be on {gff_info[locus].chrom} but saw {r.chr}. "
                    "Could be minimap2 multi-mapping inconsistency for repetitive genes. Check later."
                )
            tmp[locus].append(r)

    # now figure out the exonic regions for each gene PB.X
    for locus, records in tmp.items():
        c = ClusterTree(0, 0)
        for r in records:
            for e in r.ref_exons:
                c.insert(
                    max(0, e.start - extra_bp_around_junctions),
                    e.end + extra_bp_around_junctions,
                    1,
                )

        regions = [(a, b) for (a, b, junk) in c.getregions()]
        regions[0] = (max(0, regions[0][0] - __padding_before_after__),
                      regions[0][1])
        regions[-1] = (
            max(0, regions[-1][0]),
            regions[-1][1] + __padding_before_after__,
        )
        gff_info[locus] = LocusInfo(
            chrom=gff_info[locus].chrom,
            strand=gff_info[locus].strand,
            regions=regions,
            isoforms=[r.seqid for r in records],
        )

    return gff_info
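
# Example use (hypothetical file name): pull the padded exonic regions of each
# collapsed locus, e.g. to build a BED of target regions.
# gff_info = read_GFF("hq_transcripts.collapsed.gff")
# for locus, info in gff_info.items():
#     for (start, end) in info.regions:
#         print(f"{info.chrom}\t{start}\t{end}\t{locus}\t.\t{info.strand}")
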
def read_annotation_for_junction_info(gff_filename: str) -> defaultdict:
    """
    :param gff_filename: annotation GFF
    :return: dict of (chrom, strand, 'donor' or 'acceptor') --> sorted list of donor or acceptor site. all 0-based.
    """
    d = defaultdict(set)
    for r in collapseGFFReader(gff_filename):
        for i in range(len(r.ref_exons) - 1):
            # on the + strand the upstream exon end is the donor site; on the
            # - strand (genomic coordinates) the labels are swapped
            if r.strand == "+":
                d[(r.chr, r.strand, "donor")].add(r.ref_exons[i].end - 1)
                d[(r.chr, r.strand, "acceptor")].add(r.ref_exons[i + 1].start)
            else:
                d[(r.chr, r.strand, "acceptor")].add(r.ref_exons[i].end - 1)
                d[(r.chr, r.strand, "donor")].add(r.ref_exons[i + 1].start)
    for k in d:
        d[k] = sorted(d[k])
    return d
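
# Example use (hypothetical file name): check whether an observed junction
# matches an annotated donor site on chr1(+). All positions are 0-based.
# junc_info = read_annotation_for_junction_info("gencode.annotation.gff")
# donor_sites = junc_info[("chr1", "+", "donor")]  # sorted list of positions
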
def make_file_for_subsample(
    input_prefix: str,
    output_prefix: str,
    demux_file=None,
    matchAnnot_parsed=None,
    sqanti_class=None,
    include_single_exons=False,
) -> None:
    """
    Two files must exist: .abundance.txt and .rep.fq so we can make the length
    """
    count_filename = f"{input_prefix}.abundance.txt"

    rep_filenames = [
        (f"{input_prefix}.rep.fq", "fastq"),
        (f"{input_prefix}.rep.fastq", "fastq"),
        (f"{input_prefix}.rep.fa", "fasta"),
        (f"{input_prefix}.rep.fasta", "fasta"),
    ]

    rep_filename = None
    rep_type = None
    for x, feature in rep_filenames:
        if Path(x).exists():
            rep_filename = x
            rep_type = feature
            break

    if rep_filename is None:
        logger.error(
            f"Expected to find input fasta or fastq files {input_prefix}.rep.fa or {input_prefix}.rep.fq. Not found. Abort!"
        )
        sys.exit(-1)

    if not include_single_exons:
        from cupcake.sequence.GFF import collapseGFFReader

        gff_filename = f"{input_prefix}.gff"
        logger.info(f"Reading {gff_filename} to exclude single exons...")
        good_ids = [
            r.seqid for r in collapseGFFReader(gff_filename) if len(r.ref_exons) >= 2
        ]
    else:
        good_ids = []

    if demux_file is None and not Path(count_filename).exists():
        logger.error(f"Cannot find {count_filename}. Abort!")
        sys.exit(-1)

    if matchAnnot_parsed is not None and not Path(matchAnnot_parsed).exists():
        logger.error(f"Cannot find {matchAnnot_parsed}. Abort!")
        sys.exit(-1)

    if sqanti_class is not None and not Path(sqanti_class).exists():
        logger.error(f"Cannot find {sqanti_class}. Abort!")
        sys.exit(-1)

    if matchAnnot_parsed is not None:
        with open(matchAnnot_parsed) as ma:
            match_dict = {r["pbid"]: r for r in DictReader(ma, delimiter="\t")}
        for k in match_dict:
            match_dict[k]["category"] = match_dict[k]["score"]
    elif sqanti_class is not None:
        logger.info(f"Reading {sqanti_class} to get gene/isoform assignment...")
        match_dict = {}
        with open(sqanti_class) as sc:
            for r in DictReader(sc, delimiter="\t"):
                if r["associated_transcript"] == "novel":
                    refisoform = f"novel_{r['isoform']}"
                else:
                    refisoform = r["associated_transcript"]
                match_dict[r["isoform"]] = {
                    "refgene": r["associated_gene"],
                    "refisoform": refisoform,
                    "category": r["structural_category"],
                }
    else:
        match_dict = None
    with open(rep_filename) as rf:
        seqlen_dict = {
            r.id.split("|")[0]: len(r.seq) for r in SeqIO.parse(rf, rep_type)
        }

    to_write = {}
    if demux_file is None:
        to_write["all"] = {}
        with open(count_filename) as f:
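            # skip the leading '#' comment lines, then rewind so DictReader
            # starts at the header row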
            while True:
                cur = f.tell()
                if not f.readline().startswith("#"):
                    f.seek(cur)
                    break
            for r in DictReader(f, delimiter="\t"):
                if r["pbid"] in good_ids or include_single_exons:
                    to_write["all"][r["pbid"]] = r["count_fl"]
    else:
        d, samples = read_demux_fl_count_file(demux_file)
        for s in samples:
            to_write[s] = {}
        for pbid, d2 in d.items():
            for s in samples:
                if pbid in good_ids or include_single_exons:
                    to_write[s][pbid] = d2[s]

    for sample in to_write:
        with Path(f"{output_prefix}.{sample}.txt").open("a+") as h:
            if matchAnnot_parsed is None and sqanti_class is None:
                h.write("pbid\tpbgene\tlength\tfl_count\n")
            else:
                h.write(
                    "pbid\tpbgene\tlength\trefisoform\trefgene\tcategory\tfl_count\n"
                )
            for pbid in to_write[sample]:
                if matchAnnot_parsed is not None or sqanti_class is not None:
                    if pbid not in match_dict:
                        logger.warning(
                            f"Ignoring {pbid} because not in annotation (SQANTI/MatchAnnot) file."
                        )
                        continue
                    m = match_dict[pbid]
                    h.write(f"{pbid}\t{pbid.split('.')[1]}\t{seqlen_dict[pbid]}\t")
                    h.write(f'{m["refisoform"]}\t{m["refgene"]}\t{m["category"]}\t')
                else:
                    h.write(f'{pbid}\t{pbid.split(".")[1]}\t{seqlen_dict[pbid]}\t')
                h.write(f"{to_write[sample][pbid]}\n")
            logger.info(
                f"Output written to {Path(f'{output_prefix}.{sample}.txt').resolve()}."
            )
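
# Example use (hypothetical prefixes): build the subsampling table for a
# collapsed run, keeping multi-exon transcripts only.
# make_file_for_subsample("hq_transcripts.collapsed", "subsample_input",
#                         sqanti_class="sqanti_classification.txt")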
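
# `collate_info` below assumes these module-level names from the original
# source. Minimal sketches inferred from how they are used; the real
# definitions (and the `get_breakpoint_n_seq` helper) live elsewhere in the
# package.
fusion_pbid = re.compile(r"PBfusion.(\d+).(\d+)")  # assumed PBfusion.X.Y naming
FIELDS = [
    "UniqueID", "FusionName", "LeftGeneName", "LeftGeneID", "LeftBreakpoint",
    "LeftFlankingSequence", "RightGeneName", "RightGeneID", "RightBreakpoint",
    "RightFlankingSequence", "JunctionSupport", "SpanningReads",
    "ReadCountScore", "Sequence", "LeftORF", "RightORF", "LeftExonCount",
    "RightExonCount", "LeftCDSExonCount", "RightCDSExonCount", "Comments",
]  # assumed; matches the keys written into `info` below
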
def collate_info(
    fusion_prefix: str,
    class_filename: str,
    genepred_filename: str,
    total_fl_count: Optional[int] = None,
    config_filename: Optional[str] = None,
    genome_dict: Optional[dict] = None,
    cds_gff_filename: Optional[str] = None,
    min_fl_count: int = 2,
    min_breakpoint_dist_kb: int = 10,
    include_Mt_genes: bool = False,
) -> None:

    global_info = {}  # holding information for general information
    if config_filename is not None:
        logger.info(f"Reading config file {config_filename}...")
        for line in open(config_filename):
            k, v = line.strip().split("=")
            global_info[k] = v

    gene_to_id = {}  # gene name --> ensembl ID
    for line in open(genepred_filename):
        raw = line.strip().split()
        gene_to_id[raw[11]] = raw[0]

    d = defaultdict(dict)  # PBfusion.X --> isoform index -> sqanti3 record
    orf_dict = {}
    # read SQANTI3 classification file
    for r in DictReader(open(class_filename), delimiter="\t"):
        m = fusion_pbid.match(r["isoform"])
        if m is None:
            logger.error(
                "ERROR: fusion pbid must follow format `PBfusion.X.Y`. Abort!")
            sys.exit(-1)
        # cast the isoform index to int so the later sort is numeric, not
        # lexicographic (matching the GFF pass below)
        gene_index, isoform_index = m.group(1), int(m.group(2))
        d[gene_index][isoform_index] = r
        orf_dict[r["isoform"]] = r["ORF_seq"]

    # get sequences
    seq_dict = {
        r.id.split("|")[0]: r.seq
        for r in SeqIO.parse(open(f"{fusion_prefix}.rep.fa"), "fasta")
    }

    # get count information
    count_d = defaultdict(lambda: "NA")
    count_filename = f"{fusion_prefix}.abundance.txt"
    if Path(count_filename).exists():
        for r in DictReader(open(count_filename), delimiter="\t"):
            count_d[r["pbid"]] = int(r["count_fl"])

    if total_fl_count is None:
        logger.info(
            "Total FL count not given --- using the sum FL count from fusions only instead."
        )
        total_fl_count = sum(count_d.values())

    # get breakpoint information
    gff_d = defaultdict(dict)  # PBfusion.X --> isoform index -> GFF record
    if cds_gff_filename is None:
        gff_filename = f"{fusion_prefix}.gff"
    else:
        gff_filename = cds_gff_filename

    for r in collapseGFFReader(gff_filename):
        m = fusion_pbid.match(r.seqid)
        if m is None:
            logger.error(
                f"ERROR: fusion pbid in {gff_filename} must follow format `PBfusion.X.Y`. Abort!"
            )
            sys.exit(-1)
        gene_index, isoform_index = m.group(1), int(m.group(2))
        gff_d[gene_index][isoform_index] = r
        if r.strand not in ("+", "-"):
            logger.error(
                f"ERROR: fusion {r.seqid} did not specify strand in {gff_filename}! Abort!"
            )
            sys.exit(-1)

    fields2 = list(global_info.keys()) + FIELDS
    with open(f"{fusion_prefix}.annotated.txt",
              "w") as f, open(f"{fusion_prefix}.annotated_ignored.txt",
                              "w") as f_bad:
        writer = DictWriter(f, fields2, delimiter=",")
        writer.writeheader()
        writer_bad = DictWriter(f_bad, fields2, delimiter=",")
        writer_bad.writeheader()

        for gene_index, iso_dict in d.items():
            iso_dict = sorted(
                iso_dict.items())  # (isoform index, classification record)
            has_novel = any(r["associated_gene"].startswith("novelGene")
                            or r["associated_gene"] == ""
                            for junk, r in iso_dict)
            pbid = f"PBfusion.{str(gene_index)}"

            gff_info = sorted(gff_d[gene_index].items())

            rec1 = gff_info[0][1]
            rec2 = gff_info[-1][1]
            (
                left_breakpoint,
                left_seq,
                right_breakpoint,
                right_seq,
            ) = get_breakpoint_n_seq(rec1, rec2, genome_dict)
            left_exon_count = len(rec1.ref_exons)
            right_exon_count = len(rec2.ref_exons)
            gene1 = iso_dict[0][1]["associated_gene"]
            gene2 = iso_dict[-1][1]["associated_gene"]

            if cds_gff_filename is not None:
                left_cds_exon_count = len(rec1.cds_exons)
                right_cds_exon_count = len(rec2.cds_exons)
            else:
                left_cds_exon_count = "NA"
                right_cds_exon_count = "NA"

            left_orf, right_orf = "NA", "NA"
            if orf_dict is not None:
                seqid1 = gff_info[0][1].seqid
                seqid2 = gff_info[-1][1].seqid
                left_orf = orf_dict[seqid1]
                right_orf = orf_dict[seqid2]

            info = {
                "UniqueID": pbid,
                "FusionName": "--".join(
                    [_r["associated_gene"] for (_index, _r) in iso_dict]),
                "LeftGeneName": gene1,
                "LeftGeneID": gene_to_id.get(gene1, "NA"),
                "LeftBreakpoint": left_breakpoint,
                "LeftFlankingSequence": left_seq,
                "RightGeneName": gene2,
                "RightGeneID": gene_to_id.get(gene2, "NA"),
                "RightBreakpoint": right_breakpoint,
                "RightFlankingSequence": right_seq,
                "JunctionSupport": "NA",
                "SpanningReads": count_d[pbid],
                "ReadCountScore": (count_d[pbid] * (10**6) / total_fl_count)
                if count_d[pbid] != "NA" else "NA",
                "Sequence": seq_dict[pbid],
                "LeftORF": left_orf,
                "RightORF": right_orf,
                "LeftExonCount": left_exon_count,
                "RightExonCount": right_exon_count,
                "LeftCDSExonCount": left_cds_exon_count,
                "RightCDSExonCount": right_cds_exon_count,
                "Comments": "PASS",
            }
            info.update(global_info)

            left_chr, left_break, left_strand = left_breakpoint.split(":")
            right_chr, right_break, right_strand = right_breakpoint.split(":")

            if has_novel:
                info["Comments"] = "FAIL:NovelGene"
            elif gene1 == gene2:
                info["Comments"] = "FAIL:SameGene"
            elif info["SpanningReads"] != "NA" and info[
                    "SpanningReads"] < min_fl_count:
                info["Comments"] = "FAIL:TooFewFLReads"
            elif not include_Mt_genes and (gene1.startswith("MT-")
                                           or gene2.startswith("MT-")):
                info["Comments"] = "FAIL:MtGenes"
            elif (left_chr == right_chr
                  and abs(int(left_break) - int(right_break)) / 1000 <=
                  min_breakpoint_dist_kb):
                info["Comments"] = "FAIL:BreakpointTooClose"

            if info["Comments"].startswith("FAIL:"):
                writer_bad.writerow(info)
            else:
                writer.writerow(info)
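
# Example use (hypothetical file names): annotate fusion calls from a
# collapsed fusion run, using a SQANTI3 classification and a genePred table.
# genome_dict = SeqIO.to_dict(SeqIO.parse("genome.fa", "fasta"))
# collate_info("fusion_out", "fusion_out_classification.txt",
#              "gencode.genePred", genome_dict=genome_dict, min_fl_count=2)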
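
# `dedup_FLNC_per_cluster` below assumes a module-level list of required CSV
# columns. A minimal sketch, inferred from the columns actually read; the
# original source may require more fields.
CORRECTED_CSV_FIELDS = ["pbid", "gene", "transcript", "BC_ed", "UMI_ed"]
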
def dedup_FLNC_per_cluster(
    corrected_csv,
    cluster_info,
    output_prefix,
    fasta_file=None,
    gff_file=None,
    faa_file=None,
):

    # read corrected CSV
    reader = DictReader(open(corrected_csv), delimiter="\t")
    for k in CORRECTED_CSV_FIELDS:
        if k not in reader.fieldnames:
            raise RuntimeError(
                f"Field {k} is missing from {corrected_csv}! Expected fields:\n"
                + "\n".join(CORRECTED_CSV_FIELDS))

    per_unique = {}  # tag -> record
    per_unique_count = Counter()  # tag -> number of duplicates
    # pbid --> {"gene", "transcript", "clusters": set}; membership is always
    # checked before access, so a plain dict suffices (the old defaultdict
    # default held a list, which lacks the .add() used below)
    per_pbid = {}
    for r in reader:
        tag = f"{r['BC_ed']}-{r['UMI_ed']}-{r['gene']}"
        per_unique[tag] = r
        per_unique_count[tag] += 1

    # now link barcode to cell type, also PCR dup counts
    for tag in per_unique:
        c = cluster_info[per_unique[tag]["BC_ed"]]
        rec = per_unique[tag]
        rec["cluster"] = c
        rec["num_dups"] = per_unique_count[tag]
        pbid = rec["pbid"]
        if pbid in per_pbid:
            per_pbid[pbid]["clusters"].add(c)
        else:
            per_pbid[pbid] = {
                "gene": rec["gene"],
                "transcript": rec["transcript"],
                "clusters": {c},
            }

    # write out de-dup CSV file
    with open(f"{output_prefix}.csv", "w") as f:
        writer = DictWriter(
            f,
            CORRECTED_CSV_FILELDS + ["cluster", "num_dups"],
            delimiter="\t",
            extrasaction="ignore",
        )
        writer.writeheader()
        for k in sorted(per_unique):
            writer.writerow(per_unique[k])

    if fasta_file is not None:
        f_d = {}  # cluster --> file handle
        # write the pbid master file, plus one fasta per cluster
        with open(f"{output_prefix}.fasta", "w") as f:
            for r in SeqIO.parse(open(fasta_file), "fasta"):
                if r.id in per_pbid:
                    newid = f"{r.id}|{per_pbid[r.id]['gene']}|{per_pbid[r.id]['transcript']}|{';'.join(per_pbid[r.id]['clusters'])}"
                    f.write(f">{newid}\n{r.seq}\n")
                    for c in per_pbid[r.id]["clusters"]:
                        if c not in f_d:
                            f_d[c] = open(f"{output_prefix}.{c}.fasta", "w")
                        f_d[c].write(f">{newid}\n{r.seq}\n")
        for handle in f_d.values():
            handle.close()

    if faa_file is not None:
        f_d = {}  # cluster --> file handle
        # write the pbid master file, plus one faa per cluster
        with open(f"{output_prefix}.faa", "w") as f:
            for r in SeqIO.parse(open(faa_file), "fasta"):
                if r.id in per_pbid:
                    newid = f'{r.id}|{per_pbid[r.id]["gene"]}|{per_pbid[r.id]["transcript"]}|{";".join(per_pbid[r.id]["clusters"])}'
                    f.write(f">{newid}\n{r.seq}\n")
                    for c in per_pbid[r.id]["clusters"]:
                        if c not in f_d:
                            f_d[c] = open(f"{output_prefix}.{c}.faa", "w")
                        f_d[c].write(f">{newid}\n{r.seq}\n")
        for handle in f_d.values():
            handle.close()

    if gff_file is not None:
        f_d = {}  # cluster --> file handle
        # write the pbid master file, plus one gff per cluster
        with open(f"{output_prefix}.gff", "w") as f:
            for r in collapseGFFReader(gff_file):
                if r.seqid in per_pbid:
                    newid = f'{r.seqid}|{per_pbid[r.seqid]["gene"]}|{per_pbid[r.seqid]["transcript"]}|{";".join(per_pbid[r.seqid]["clusters"])}'
                    r.seqid = newid  # rename to match the fasta/faa outputs
                    write_collapseGFF_format(f, r)
                    for c in per_pbid[r.seqid]["clusters"]:
                        if c not in f_d:
                            f_d[c] = open(f"{output_prefix}.{c}.gff", "w")
                        write_collapseGFF_format(f_d[c], r)
        for handle in f_d.values():
            handle.close()
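
# Example use (hypothetical inputs): `cluster_info` maps corrected cell
# barcodes (the CSV's "BC_ed" column) to a cluster label.
# cluster_info = {"AAACCCAAGGTTACGT": "cluster1", "AAACCCAAGGTTTGCA": "cluster2"}
# dedup_FLNC_per_cluster("corrected.csv", cluster_info, "dedup_out",
#                        fasta_file="flnc.fasta", gff_file="flnc.gff")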