def calc_ontarget_rate(tree, gene_info, input_fasta, is_gtf, sam_or_gtf, output_filename=None):
    """
    Write a per-read TSV report of probe on-target hits.

    :param tree: interval structure passed through to ``get_probe_hit``
    :param gene_info: gene annotation passed through to ``get_probe_hit``
    :param input_fasta: reads in FASTA (.fa/.fasta, case-insensitive) or FASTQ format;
        used to look up query lengths for the SAM path
    :param is_gtf: if True, ``sam_or_gtf`` is a collapsed GFF; otherwise a GMAP SAM file
    :param sam_or_gtf: alignment/annotation file to iterate
    :param output_filename: TSV output path; ``None`` writes to stdout
    """
    feature = ("fasta"
               if input_fasta.upper().endswith((".FA", ".FASTA"))
               else "fastq")
    query_len_dict = {
        r.id: len(r.seq)
        for r in SeqIO.parse(open(input_fasta), feature)
    }
    # BUG FIX: the original assigned `f = Path(output_filename)`, but DictWriter
    # requires a writable file handle, not a Path -- writeheader() would fail.
    if output_filename is None:
        f = sys.stdout
    else:
        f = open(output_filename, "w")
    FIELDS = [
        "read_id", "read_len", "num_probe", "num_base_overlap", "loci", "genes"
    ]
    writer = DictWriter(f, FIELDS, delimiter="\t")
    writer.writeheader()
    try:
        if is_gtf:
            reader = collapseGFFReader(sam_or_gtf)
            for r in reader:
                num_probe, base_hit, genes_seen = get_probe_hit(
                    tree, gene_info, r, is_gtf)
                rec = {
                    "read_id": r.seqid,
                    "read_len": "NA",  # GFF records carry no query length
                    "num_probe": num_probe,
                    "num_base_overlap": base_hit,
                    "loci": f"{r.chr}:{r.start}-{r.end}",
                    "genes": ",".join(genes_seen),
                }
                writer.writerow(rec)
        else:
            reader = BioReaders.GMAPSAMReader(sam_or_gtf, True,
                                              query_len_dict=query_len_dict)
            for r in reader:
                if r.sID == "*":  # unmapped read
                    continue
                num_probe, base_hit, genes_seen = get_probe_hit(
                    tree, gene_info, r, is_gtf)
                rec = {
                    "read_id": r.qID,
                    "read_len": r.qLen,
                    "num_probe": num_probe,
                    "num_base_overlap": base_hit,
                    "loci": f"{r.sID}:{r.sStart}-{r.sEnd}",
                    "genes": ",".join(genes_seen),
                }
                writer.writerow(rec)
    finally:
        # close only files we opened; never close stdout
        if f is not sys.stdout:
            f.close()
def read_GFF(gff_filename):
    """
    Read a collapsed GFF file and compute the exonic regions of each gene.

    Groups isoforms PB.X.Y under their locus PB.X, merges their exons
    (padded by ``extra_bp_around_junctions`` on each side) with a ClusterTree,
    then pads the outermost region edges by ``__padding_before_after__``.

    :param gff_filename: collapsed GFF file with PB.X.Y seqids
    :return: dict of locus (PB.X) --> LocusInfo
    :raises Exception: if a record's seqid does not match the PB.X.Y pattern
    """
    gff_info = {}  # loci --> LocusInfo
    tmp = {}  # loci PB.X --> list of GFF records for PB.X.Y
    for r in collapseGFFReader(gff_filename):
        m = rex_pbid.match(r.seqid)
        if m is None:
            raise Exception(f"Expected PBID format PB.X.Y but saw {r.seqid}")
        locus = m.group(1)  # ex: PB.1
        if locus not in tmp:
            # first isoform seen for this locus; regions/isoforms filled in later
            tmp[locus] = [r]
            gff_info[locus] = LocusInfo(chrom=r.chr,
                                        strand=r.strand,
                                        regions=None,
                                        isoforms=None)
        else:
            # isoforms of a locus should share a chromosome; warn (but keep the
            # record) if minimap2 multi-mapping put one elsewhere
            if gff_info[locus].chrom != r.chr:
                logger.warning(
                    f"WARNING: Expected {r.seqid} to be on {gff_info[locus].chrom} but saw {r.chr}. Could be minimap2 multi-mapping inconsistency for repetitive genes. Check later.\n"
                )
            tmp[locus].append(r)
    # now figure out the exonic regions for each gene PB.X
    for locus, records in tmp.items():
        c = ClusterTree(0, 0)
        for r in records:
            for e in r.ref_exons:
                # widen every exon by extra_bp_around_junctions (clamped at 0)
                # so nearby junctions merge into one region
                c.insert(
                    max(0, e.start - extra_bp_around_junctions),
                    e.end + extra_bp_around_junctions,
                    1,
                )
        regions = [(a, b) for (a, b, junk) in c.getregions()]
        # extend the 5'-most start and 3'-most end by the flat padding;
        # with a single region both adjustments apply to the same tuple
        regions[0] = (max(0, regions[0][0] - __padding_before_after__),
                      regions[0][1])
        regions[-1] = (
            max(0, regions[-1][0]),
            regions[-1][1] + __padding_before_after__,
        )
        gff_info[locus] = LocusInfo(
            chrom=gff_info[locus].chrom,
            strand=gff_info[locus].strand,
            regions=regions,
            isoforms=[r.seqid for r in records],
        )
    return gff_info
def read_annotation_for_junction_info(gff_filename: str) -> defaultdict:
    """
    Collect donor/acceptor splice-site positions from an annotation GFF.

    :param gff_filename: annotation GFF
    :return: dict of (chrom, strand, 'donor' or 'acceptor') --> sorted list of
        donor or acceptor sites. All 0-based.
    """
    sites = defaultdict(set)
    for rec in collapseGFFReader(gff_filename):
        # For each junction: the upstream exon's last base pairs with the
        # downstream exon's first base. On '+' the upstream side is the donor;
        # in genomic coordinates on '-' the roles are swapped.
        if rec.strand == "+":
            left_kind, right_kind = "donor", "acceptor"
        else:
            left_kind, right_kind = "acceptor", "donor"
        for upstream, downstream in zip(rec.ref_exons[:-1], rec.ref_exons[1:]):
            sites[(rec.chr, rec.strand, left_kind)].add(upstream.end - 1)
            sites[(rec.chr, rec.strand, right_kind)].add(downstream.start)
    # convert each set to a sorted list, in place
    for key in sites:
        sites[key] = sorted(sites[key])
    return sites
def make_file_for_subsample(
    input_prefix: str,
    output_prefix: str,
    demux_file=None,
    matchAnnot_parsed=None,
    sqanti_class=None,
    include_single_exons=False,
) -> None:
    """
    Prepare per-sample <pbid, gene, length, [annotation,] fl_count> tables for subsampling.

    Two files must exist: ``<input_prefix>.abundance.txt`` (or a demux count file)
    and ``<input_prefix>.rep.fq/.fastq/.fa/.fasta`` so we can compute lengths.

    :param input_prefix: prefix of the collapse output files
    :param output_prefix: prefix for the ``<output_prefix>.<sample>.txt`` outputs
    :param demux_file: optional demultiplexed FL count file; when given, one
        output file is written per sample instead of a single "all" file
    :param matchAnnot_parsed: optional parsed MatchAnnot file for isoform annotation
    :param sqanti_class: optional SQANTI classification file for isoform annotation
    :param include_single_exons: if False, isoforms with fewer than 2 exons are dropped
    """
    count_filename = f"{input_prefix}.abundance.txt"
    rep_filenames = [
        (f"{input_prefix}.rep.fq", "fastq"),
        (f"{input_prefix}.rep.fastq", "fastq"),
        (f"{input_prefix}.rep.fa", "fasta"),
        (f"{input_prefix}.rep.fasta", "fasta"),
    ]
    rep_filename = None
    rep_type = None
    # NOTE: no break on purpose -- preserves original "last existing file wins" behavior
    for x, feature in rep_filenames:
        if Path(x).exists():
            rep_filename = x
            rep_type = feature
    if rep_filename is None:
        # BUG FIX: the original string was missing the f-prefix, so the literal
        # "{input_prefix}" was logged instead of the actual prefix.
        logger.error(
            f"Expected to find input fasta or fastq files {input_prefix}.rep.fa or {input_prefix}.rep.fq. Not found. Abort!"
        )
        sys.exit(-1)

    if not include_single_exons:
        from cupcake.sequence.GFF import collapseGFFReader

        gff_filename = f"{input_prefix}.gff"
        logger.info(f"Reading {gff_filename} to exclude single exons...")
        # set: membership is tested once per count row below
        good_ids = {
            r.seqid
            for r in collapseGFFReader(gff_filename) if len(r.ref_exons) >= 2
        }
    else:
        good_ids = set()

    if demux_file is None and not Path(count_filename).exists():
        logger.error(f"Cannot find {count_filename}. Abort!")
        sys.exit(-1)
    if matchAnnot_parsed is not None and not Path(matchAnnot_parsed).exists():
        logger.error(f"Cannot find {matchAnnot_parsed}. Abort!")
        sys.exit(-1)
    if sqanti_class is not None and not Path(sqanti_class).exists():
        logger.error(f"Cannot find {sqanti_class}. Abort!")
        sys.exit(-1)

    if matchAnnot_parsed is not None:
        with open(matchAnnot_parsed) as ma:
            match_dict = {r["pbid"]: r for r in DictReader(ma, delimiter="\t")}
        # MatchAnnot has no structural category; reuse its score field
        for k in match_dict:
            match_dict[k]["category"] = match_dict[k]["score"]
    elif sqanti_class is not None:
        logger.info(f"Reading {sqanti_class} to get gene/isoform assignment...")
        match_dict = {}
        with open(sqanti_class) as sc:
            for r in DictReader(sc, delimiter="\t"):
                if r["associated_transcript"] == "novel":
                    refisoform = f"novel_{r['isoform']}"
                else:
                    refisoform = r["associated_transcript"]
                match_dict[r["isoform"]] = {
                    "refgene": r["associated_gene"],
                    "refisoform": refisoform,
                    "category": r["structural_category"],
                }
    else:
        match_dict = None

    with open(rep_filename) as rf:
        seqlen_dict = {
            r.id.split("|")[0]: len(r.seq)
            for r in SeqIO.parse(rf, rep_type)
        }

    to_write = {}
    if demux_file is None:
        to_write["all"] = {}
        with open(count_filename) as f:
            # skip the leading '#' comment header, then rewind to the TSV header
            while True:
                cur = f.tell()
                if not f.readline().startswith("#"):
                    f.seek(cur)
                    break
            for r in DictReader(f, delimiter="\t"):
                if r["pbid"] in good_ids or include_single_exons:
                    to_write["all"][r["pbid"]] = r["count_fl"]
    else:
        d, samples = read_demux_fl_count_file(demux_file)
        for s in samples:
            to_write[s] = {}
        for pbid, d2 in d.items():
            for s in samples:
                if pbid in good_ids or include_single_exons:
                    to_write[s][pbid] = d2[s]

    for sample in to_write:
        # NOTE(review): "a+" appends, so re-running adds a second header --
        # presumably intentional for accumulation; confirm with callers.
        with Path(f"{output_prefix}.{sample}.txt").open("a+") as h:
            if matchAnnot_parsed is None and sqanti_class is None:
                h.write("pbid\tpbgene\tlength\tfl_count\n")
            else:
                h.write(
                    "pbid\tpbgene\tlength\trefisoform\trefgene\tcategory\tfl_count\n"
                )
            for pbid in to_write[sample]:
                if matchAnnot_parsed is not None or sqanti_class is not None:
                    if pbid not in match_dict:
                        logger.warning(
                            f"Ignoring {pbid} because not in annotation (SQANTI/MatchAnnot) file."
                        )
                        continue
                    m = match_dict[pbid]
                    h.write(f"{pbid}\t{pbid.split('.')[1]}\t{seqlen_dict[pbid]}\t")
                    h.write(f'{m["refisoform"]}\t{m["refgene"]}\t{m["category"]}\t')
                else:
                    h.write(f'{pbid}\t{pbid.split(".")[1]}\t{seqlen_dict[pbid]}\t')
                h.write(f"{to_write[sample][pbid]}\n")
        logger.info(
            f"Output written to {Path(f'{output_prefix}.{sample}.txt').resolve()}."
        )
def collate_info(
    fusion_prefix: str,
    class_filename: str,
    genepred_filename: str,
    total_fl_count: Optional[int] = None,
    config_filename: Optional[str] = None,
    genome_dict: Optional[dict] = None,
    cds_gff_filename: Optional[str] = None,
    min_fl_count: int = 2,
    min_breakpoint_dist_kb: int = 10,
    include_Mt_genes: bool = False,
) -> None:
    """
    Collate fusion annotation info into <prefix>.annotated.txt (passing) and
    <prefix>.annotated_ignored.txt (records failing a filter).

    For each fusion gene PBfusion.X, the lowest- and highest-numbered isoform
    records define the left/right partners and breakpoints.

    :param fusion_prefix: prefix of the fusion .rep.fa/.gff/.abundance.txt files
    :param class_filename: SQANTI3 classification file
    :param genepred_filename: genePred file mapping gene name --> gene ID
    :param total_fl_count: total FL count for ReadCountScore normalization;
        if None, the sum of fusion FL counts is used
    :param config_filename: optional key=value file of extra columns to prepend
    :param genome_dict: genome sequences for breakpoint flanking sequence lookup
    :param cds_gff_filename: optional CDS GFF to use instead of <prefix>.gff
    :param min_fl_count: minimum spanning FL reads for a PASS
    :param min_breakpoint_dist_kb: same-chromosome breakpoints closer than this fail
    :param include_Mt_genes: keep fusions involving MT- genes if True
    """
    global_info = {}  # holding information for general information
    if config_filename is not None:
        logger.info(f"Reading config file {config_filename}...")
        for line in open(config_filename):
            k, v = line.strip().split("=")
            global_info[k] = v

    gene_to_id = {}  # gene name --> ensembl ID
    for line in open(genepred_filename):
        raw = line.strip().split()
        gene_to_id[raw[11]] = raw[0]

    d = defaultdict(
        lambda: {})  # PBfusion.X --> isoform index -> sqanti3 record
    orf_dict = {}
    # read SQANTI3 classification file
    for r in DictReader(open(class_filename), delimiter="\t"):
        m = fusion_pbid.match(r["isoform"])
        if m is None:
            logger.error(
                "ERROR: fusion pbid must follow format `PBfusion.X.Y`. Abort!")
            sys.exit(-1)
        # BUG FIX: the isoform index was kept as a string here (but int on the
        # GFF side), so sorting was lexicographic ("10" < "2") and could
        # mis-pair left/right genes with breakpoints for >=10 isoforms.
        gene_index, isoform_index = m.group(1), int(m.group(2))
        d[gene_index][isoform_index] = r
        orf_dict[r["isoform"]] = r["ORF_seq"]

    # get sequences
    seq_dict = {
        r.id.split("|")[0]: r.seq
        for r in SeqIO.parse(open(f"{fusion_prefix}.rep.fa"), "fasta")
    }

    # get count information
    count_d = defaultdict(lambda: "NA")
    count_filename = f"{fusion_prefix}.abundance.txt"
    if Path(count_filename).exists():
        for r in DictReader(open(count_filename), delimiter="\t"):
            count_d[r["pbid"]] = int(r["count_fl"])
        if total_fl_count is None:
            logger.info(
                "Total FL count not given --- using the sum FL count from fusions only instead."
            )
            total_fl_count = sum(count_d.values())

    # get breakpoint information
    gff_d = defaultdict(
        lambda: {})  # PBfusion.X --> isoform index -> GFF record
    if cds_gff_filename is None:
        gff_filename = f"{fusion_prefix}.gff"
    else:
        gff_filename = cds_gff_filename
    for r in collapseGFFReader(gff_filename):
        m = fusion_pbid.match(r.seqid)
        if m is None:
            logger.error(
                f"ERROR: fusion pbid in {gff_filename} must follow format `PBfusion.X.Y`. Abort!"
            )
            sys.exit(-1)
        gene_index, isoform_index = m.group(1), int(m.group(2))
        gff_d[gene_index][isoform_index] = r
        if r.strand not in ("+", "-"):
            logger.error(
                f"ERROR: fusion {r.seqid} did not specify strand in {gff_filename}! Abort!"
            )
            sys.exit(-1)

    fields2 = list(global_info.keys()) + FIELDS
    with open(f"{fusion_prefix}.annotated.txt", "w") as f, \
            open(f"{fusion_prefix}.annotated_ignored.txt", "w") as f_bad:
        writer = DictWriter(f, fields2, delimiter=",")
        writer.writeheader()
        writer_bad = DictWriter(f_bad, fields2, delimiter=",")
        writer_bad.writeheader()
        for gene_index, iso_dict in d.items():
            # (isoform index, classification record), numeric order now that
            # indices are ints -- consistent with gff_info below
            iso_dict = sorted(iso_dict.items(), key=lambda x: x[0])
            has_novel = any(r["associated_gene"].startswith("novelGene")
                            or r["associated_gene"] == ""
                            for junk, r in iso_dict)
            pbid = f"PBfusion.{str(gene_index)}"
            gff_info = sorted(gff_d[gene_index].items(), key=lambda x: x[0])
            # first/last isoform define the left/right fusion partners
            rec1 = gff_info[0][1]
            rec2 = gff_info[-1][1]
            (
                left_breakpoint,
                left_seq,
                right_breakpoint,
                right_seq,
            ) = get_breakpoint_n_seq(rec1, rec2, genome_dict)
            left_exon_count = len(rec1.ref_exons)
            right_exon_count = len(rec2.ref_exons)
            gene1 = iso_dict[0][1]["associated_gene"]
            gene2 = iso_dict[-1][1]["associated_gene"]
            if cds_gff_filename is not None:
                left_cds_exon_count = len(rec1.cds_exons)
                right_cds_exon_count = len(rec2.cds_exons)
            else:
                left_cds_exon_count = "NA"
                right_cds_exon_count = "NA"
            left_orf, right_orf = "NA", "NA"
            if orf_dict is not None:
                seqid1 = gff_info[0][1].seqid
                seqid2 = gff_info[-1][1].seqid
                left_orf = orf_dict[seqid1]
                right_orf = orf_dict[seqid2]
            info = {
                "UniqueID": pbid,
                "FusionName": "--".join(
                    [_r["associated_gene"] for (_index, _r) in iso_dict]),
                "LeftGeneName": gene1,
                "LeftGeneID": gene_to_id[gene1] if gene1 in gene_to_id else "NA",
                "LeftBreakpoint": left_breakpoint,
                "LeftFlankingSequence": left_seq,
                "RightGeneName": gene2,
                "RightGeneID": gene_to_id[gene2] if gene2 in gene_to_id else "NA",
                "RightBreakpoint": right_breakpoint,
                "RightFlankingSequence": right_seq,
                "JunctionSupport": "NA",
                "SpanningReads": count_d[pbid],
                "ReadCountScore": (count_d[pbid] * (10**6) / total_fl_count)
                if count_d[pbid] != "NA" else "NA",
                "Sequence": seq_dict[pbid],
                "LeftORF": left_orf,
                "RightORF": right_orf,
                "LeftExonCount": left_exon_count,
                "RightExonCount": right_exon_count,
                "LeftCDSExonCount": left_cds_exon_count,
                "RightCDSExonCount": right_cds_exon_count,
                "Comments": "PASS",
            }
            info.update(global_info)
            left_chr, left_break, left_strand = left_breakpoint.split(":")
            right_chr, right_break, right_strand = right_breakpoint.split(":")
            # filter cascade: first failing rule wins
            if has_novel:
                info["Comments"] = "FAIL:NovelGene"
            elif gene1 == gene2:
                info["Comments"] = "FAIL:SameGene"
            elif info["SpanningReads"] != "NA" and info[
                    "SpanningReads"] < min_fl_count:
                info["Comments"] = "FAIL:TooFewFLReads"
            elif not include_Mt_genes and (gene1.startswith("MT-")
                                           or gene2.startswith("MT-")):
                info["Comments"] = "FAIL:MtGenes"
            elif (left_chr == right_chr
                  and abs(int(left_break) - int(right_break)) / 1000
                  <= min_breakpoint_dist_kb):
                info["Comments"] = "FAIL:BreakpointTooClose"
            if info["Comments"].startswith("FAIL:"):
                writer_bad.writerow(info)
            else:
                writer.writerow(info)
def dedup_FLNC_per_cluster(
    corrected_csv,
    cluster_info,
    output_prefix,
    fasta_file=None,
    gff_file=None,
    faa_file=None,
):
    """
    De-duplicate FLNC records by (BC, UMI, gene) tag and annotate each with its
    cell cluster and duplicate count; optionally split fasta/faa/gff output per cluster.

    :param corrected_csv: TSV containing at least CORRECTED_CSV_FILELDS columns
    :param cluster_info: dict of corrected barcode (BC_ed) --> cluster label
    :param output_prefix: prefix for <prefix>.csv / .fasta / .faa / .gff outputs
    :param fasta_file: optional FASTA of isoform sequences keyed by pbid
    :param gff_file: optional collapsed GFF keyed by pbid
    :param faa_file: optional protein FASTA keyed by pbid
    :raises RuntimeError: if a required column is missing from corrected_csv
    """
    # read corrected CSV
    reader = DictReader(open(corrected_csv), delimiter="\t")
    for k in CORRECTED_CSV_FILELDS:
        if k not in reader.fieldnames:
            raise RuntimeError(
                "The following fields must exist in {}!\n{}".format(
                    corrected_csv, "\n".join(CORRECTED_CSV_FILELDS)))

    per_unique = {}  # tag -> record (last record wins for duplicate tags)
    per_unique_count = Counter()  # tag -> number of duplicates
    # FIX: was a defaultdict whose factory made "clusters" a *list*, clashing
    # with the .add() calls below; the factory was never triggered (membership
    # is always checked first), so a plain dict is both safe and honest.
    per_pbid = {}  # pbid --> {"gene", "transcript", "clusters": set}
    for r in reader:
        tag = f"{r['BC_ed']}-{r['UMI_ed']}-{r['gene']}"
        per_unique[tag] = r
        per_unique_count[tag] += 1

    # now link barcode to cell type, also PCR dup counts
    for tag in per_unique:
        c = cluster_info[per_unique[tag]["BC_ed"]]
        rec = per_unique[tag]
        rec["cluster"] = c
        rec["num_dups"] = per_unique_count[tag]
        pbid = rec["pbid"]
        if pbid in per_pbid:
            per_pbid[pbid]["clusters"].add(c)
        else:
            per_pbid[pbid] = {
                "gene": rec["gene"],
                "transcript": rec["transcript"],
                "clusters": {c},
            }

    # write out de-dup CSV file
    with open(f"{output_prefix}.csv", "w") as f:
        writer = DictWriter(
            f,
            CORRECTED_CSV_FILELDS + ["cluster", "num_dups"],
            delimiter="\t",
            extrasaction="ignore",
        )
        writer.writeheader()
        keys = per_unique.keys()
        for k in sorted(keys):
            writer.writerow(per_unique[k])

    if fasta_file is not None:
        f_d = {}  # cluster --> file handle
        # write pbid master file, plus one fasta per cluster
        with open(f"{output_prefix}.fasta", "w") as f:
            for r in SeqIO.parse(open(fasta_file), "fasta"):
                if r.id in per_pbid:
                    newid = f"{r.id}|{per_pbid[r.id]['gene']}|{per_pbid[r.id]['transcript']}|{';'.join(per_pbid[r.id]['clusters'])}"
                    f.write(f">{newid}\n{r.seq}\n")
                    for c in per_pbid[r.id]["clusters"]:
                        if c not in f_d:
                            f_d[c] = open(f"{output_prefix}.{c}.fasta", "w")
                        f_d[c].write(f">{newid}\n{r.seq}\n")
        # BUG FIX: the per-cluster fasta handles were never closed (the faa and
        # gff branches below do close theirs) -- leaked handles / unflushed data.
        for handle in f_d.values():
            handle.close()

    if faa_file is not None:
        f_d = {}  # cluster --> file handle
        # write pbid master file, plus one faa per cluster
        with open(f"{output_prefix}.faa", "w") as f:
            for r in SeqIO.parse(open(faa_file), "fasta"):
                if r.id in per_pbid:
                    newid = f'{r.id}|{per_pbid[r.id]["gene"]}|{per_pbid[r.id]["transcript"]}|{";".join(per_pbid[r.id]["clusters"])}'
                    f.write(f">{newid}\n{r.seq}\n")
                    for c in per_pbid[r.id]["clusters"]:
                        if c not in f_d:
                            f_d[c] = open(f"{output_prefix}.{c}.faa", "w")
                        f_d[c].write(f">{newid}\n{r.seq}\n")
        for handle in f_d.values():
            handle.close()

    if gff_file is not None:
        f_d = {}  # cluster --> file handle
        # write pbid master file, plus one gff per cluster
        with open(f"{output_prefix}.gff", "w") as f:
            for r in collapseGFFReader(gff_file):
                if r.seqid in per_pbid:
                    # NOTE(review): newid is computed but never applied to the
                    # record -- the fasta/faa branches rename, this one doesn't.
                    # Possibly intentional (GFF seqids unchanged); confirm.
                    newid = f'{r.seqid}|{per_pbid[r.seqid]["gene"]}|{per_pbid[r.seqid]["transcript"]}|{";".join(per_pbid[r.seqid]["clusters"])}'
                    write_collapseGFF_format(f, r)
                    for c in per_pbid[r.seqid]["clusters"]:
                        if c not in f_d:
                            f_d[c] = open(f"{output_prefix}.{c}.gff", "w")
                        write_collapseGFF_format(f_d[c], r)
        for handle in f_d.values():
            handle.close()