def make_fake_genome(genome_d, gff_info, locus, output_prefix, output_name):
    chrom = gff_info[locus].chrom
    regions = gff_info[locus].regions

    with open(f"{output_prefix}.fasta", "w") as f:
        f.write(">" + output_name + "\n")
        for s, e in regions:
            f.write(str(genome_d[chrom][s:e].seq))
        f.write("\n")

    # for mapping, write <0-based index on fake genome>, <ref chrom>, <0-based index on ref genome>
    with open(f"{output_prefix}.mapping.txt", "w") as f:
        i = 0
        for s, e in regions:
            for j in range(s, e):
                f.write(f"{i},{chrom},{j}\n")
                i += 1

    with open(f"{output_prefix}.pbids.txt", "w") as f:
        f.write("\n".join(gff_info[locus].isoforms) + "\n")

    logger.info(
        f"Output written to {output_prefix}.fasta, {output_prefix}.mapping.txt, {output_prefix}.pbids.txt."
    )
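# Usage sketch (hypothetical inputs): `genome_d` is a dict from SeqIO.to_dict()
# and `gff_info[locus]` carries .chrom, .regions (0-based [start, end) tuples)
# and .isoforms. A call such as
#   make_fake_genome(genome_d, gff_info, "PB.45", "by_loci/PB.45/fake", "fake_PB.45")
# produces a mapping.txt where a line "0,chr1,10500" means that position 0 of
# the fake genome corresponds to chr1:10500 (0-based) on the reference.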
def get_seq_stats(filename, binwidth):
    print("file type is:", type_fa_or_fq(filename))
    with open(f"{filename.name}.seqlengths.txt", "w") as f:
        lens = []
        for r in SeqIO.parse(open(filename), type_fa_or_fq(filename)):
            f.write(f"{r.id} {len(r.seq)}\n")
            lens.append(len(r.seq))
    logger.info(f"{len(lens)} sequences")
    logger.info(f"min: {min(lens)}")
    logger.info(f"max: {max(lens)}")
    logger.info(f"avg: {sum(lens) / len(lens)}")

    # print by 1 kb bins
    logger.info("Length Breakdown by kb range:")
    _max = (max(lens) // binwidth) + 1
    bins = [0] * _max
    for x in lens:
        bins[x // binwidth] += 1
    for i in range(_max):
        if binwidth == 1000:
            print(f"{i}-{i + 1} kb: {bins[i]}")
        else:
            print(f"{i * binwidth}-{(i + 1) * binwidth}: {bins[i]}")
    print("5-95% percentile:", np.percentile(lens, 5), np.percentile(lens, 95))
def error_correct_haplotypes(hap_obj, isoform_tally, diff_arr, hap_count_ordered):
    # create new hap_obj and old_to_new_map dict
    new_hap_obj = Haplotypes(
        hap_obj.hap_var_positions, hap_obj.ref_at_pos, hap_obj.count_of_vars_by_pos
    )
    old_to_new_map = {}
    for i, j in enumerate(diff_arr.argmin(axis=0)):
        # haplotype i maps to haplotype hap_count_ordered[j][0]
        k = hap_count_ordered[j][0]
        new_hap_index, msg = new_hap_obj.match_or_add_haplotype(hap_obj.haplotypes[k])
        old_to_new_map[i] = new_hap_index

    # now create a new isoform_tally
    new_isoform_tally = {}
    for k, v in isoform_tally.items():
        new_isoform_tally[k] = Counter()
        for old_hap_index, count in v.items():
            if old_hap_index not in old_to_new_map:
                logger.info(f"Discarding: {hap_obj.haplotypes[old_hap_index]}")
                continue
            new_hap_index = old_to_new_map[old_hap_index]
            new_isoform_tally[k][new_hap_index] += count
    return old_to_new_map, new_hap_obj, new_isoform_tally
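# Usage sketch (assumed array layout): diff_arr[j][i] is the distance from
# observed haplotype i to the j-th most abundant haplotype, so
# diff_arr.argmin(axis=0)[i] picks the nearest abundant haplotype that i is
# error-corrected into, e.g.
#   old_map, new_haps, new_tally = error_correct_haplotypes(
#       hap_obj, isoform_tally, diff_arr, hap_count_ordered)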
def main(
    sam_filename: str = typer.Argument(...),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
) -> None:
    sam_filename = Path(sam_filename)
    if sam_filename.suffix != ".sam":
        raise RuntimeError("Only accepts files ending in .sam. Abort!")
    prefix = sam_filename.stem
    output_gff = f"{prefix}.collapsed.gff"
    with open(output_gff, "w") as f:
        reader = GMAPSAMReader(str(sam_filename), True)
        for r in reader:
            if r.sID == "*":
                continue
            r.strand = r.flag.strand
            r.geneid = r.qID
            r.seqid = r.qID
            r.chr = r.sID
            r.ref_exons = r.segments
            r.start = r.sStart
            r.end = r.sEnd
            r.cds_exons = None
            write_collapseGFF_format(f, r)
    logger.info(f"Output written to {output_gff}.")
def readGTF(self, filename):
    """
    .coords files
    (0) gene name
    (1) chr
    (2) number of exons
    (3) strand
    (4) list of space-separated 1-based start, 1-based end
    """
    for line in open(filename):
        raw = line.strip().split()
        tID = raw[0]
        seqname = raw[1]
        if tID in self.transcript:
            logger.info(f"duplicate tID {tID} seen, ignore!")
            continue
        self.transcript_info[tID] = {"chr": seqname}
        ith = 0
        for i in range(4, len(raw), 2):
            start0 = int(raw[i]) - 1
            end1 = int(raw[i + 1])
            self.genome[seqname].insert(start0, end1, tID)
            self.transcript[tID].insert(start0, end1, {"ith": ith, "chr": seqname})
            self.exon[(start0, end1)].append((tID, ith, seqname))
            ith += 1
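# Example .coords line (hypothetical IDs): a 2-exon transcript on chr1(+)
# would read
#   ENST0001 chr1 2 + 1000 1200 1500 1700
# where the 1-based starts/ends in columns 4+ are converted above to 0-based
# starts (999, 1499) with 1-based ends kept as-is.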
def parse_matchAnnot(fa_or_fq, filename, not_pbid=False, parse_FL_coverage=False):
    pbids = []
    fl_cov = {}  # only used if parse_FL_coverage is True
    for r in SeqIO.parse(open(fa_or_fq), type_fa_or_fq(fa_or_fq)):
        _id = r.id if not_pbid else r.id.split("|")[0]
        pbids.append(_id)
        if parse_FL_coverage:
            try:
                cov = int(r.description.split("full_length_coverage=")[1].split(";")[0])
                fl_cov[_id] = cov
            except (IndexError, ValueError):
                logger.error(
                    f"WARNING: Unable to extract `full_length_coverage=` from {r.description}. Mark as NA."
                )
                fl_cov[_id] = "NA"

    match = defaultdict(lambda: (None, None, 0))  # ex: PB.1.1 -> (NOC2L, NOC2L-001, 5)
    for line in open(filename):
        i = line.find("result:")
        if i >= 0:
            raw = line[i:].strip().split()
            if len(raw) < 8:  # need at least raw[7] (the score)
                continue
            pbid = raw[1] if not_pbid else raw[1].split("|")[0]
            gene = raw[2]
            isoform = raw[3]
            score = int(raw[7])
            if score > match[pbid][2]:  # index 2 holds the best score seen so far
                match[pbid] = (gene, isoform, score)

    with open(f"{filename}.parsed.txt", "w") as f:
        f.write("pbid\tpbgene\trefisoform\trefgene\tscore")
        if parse_FL_coverage:
            f.write("\tcount_fl")
        f.write("\n")
        for pbid in pbids:
            pbpre = pbid if not_pbid else pbid.split(".")[1]
            _cov_text = f"\t{fl_cov[pbid]}" if parse_FL_coverage else ""
            if pbid not in match:
                f.write(f"{pbid}\t{pbpre}\tNA\tNA\tNA{_cov_text}\n")
            else:
                gene, isoform, score = match[pbid]
                if gene is None:
                    f.write(f"{pbid}\t{pbpre}\tNA\tNA\tNA{_cov_text}\n")
                else:
                    f.write(f"{pbid}\t{pbpre}\t{isoform}\t{gene}\t{score}{_cov_text}\n")
        logger.info(f"Output written to: {f.name}")
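# The parser keys off lines containing "result:"; after splitting, only
# raw[1] (pbid), raw[2] (gene), raw[3] (isoform) and raw[7] (score, 0-5) are
# used, e.g. (hypothetical values, other columns ignored):
#   result: PB.1.1 NOC2L NOC2L-001 1 1 1605 13 5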
def get_abundance_post_collapse(
    group_file: Path,
    cluster_report_csv: Path,
    output_prefix: str,
    restricted_movies: Optional[List[str]] = None,
):
    """
    :param group_file: collapse .group.txt file
    :param cluster_report_csv: cluster_report.csv from IsoSeq
    :param output_prefix: output prefix for the read stat and abundance files
    :param restricted_movies: optional list of movies to restrict the counts to
    """
    if not group_file.exists():
        logger.error(f"File {group_file.name} does not exist. Abort!")
        sys.exit(-1)
    if not cluster_report_csv.exists():
        logger.error(f"File {cluster_report_csv.name} does not exist. Abort!")
        sys.exit(-1)
    cid_info = read_group_filename(group_file, is_cid=True)
    output_read_count_IsoSeq_csv(
        cid_info, cluster_report_csv, f"{output_prefix}.read_stat.txt"
    )
    logger.info(f"Read stat file written to {output_prefix}.read_stat.txt")
    make_abundance_file(
        f"{output_prefix}.read_stat.txt",
        f"{output_prefix}.abundance.txt",
        restricted_movies=restricted_movies,
    )
    logger.info(f"Abundance file written to {output_prefix}.abundance.txt")
def main(
    fasta_filename: str = typer.Argument(
        ..., help="Fasta file from which to simulate phasing data."
    ),
    ploidity: int = typer.Option(2, "-p"),
    err_sub: float = typer.Option(...),
    copies: str = typer.Option(...),
    write_fastq: bool = typer.Option(False),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
) -> None:
    assert 2 <= ploidity <= 6
    copies = list(map(int, copies.split(",")))
    assert len(copies) == ploidity
    for r in SeqIO.parse(open(fasta_filename), "fasta"):
        d2 = r.id.split("|")[0]
        logger.info(f"making {d2}")
        Path(d2).mkdir(parents=True, exist_ok=True)
        simulate_phasing_data(
            seq0=str(r.seq),  # Seq.tostring() was removed in modern Biopython
            err_sub=err_sub,
            ploidity=ploidity,
            copies=copies,
            write_fastq=write_fastq,
            working_dir=d2,
        )
def scrub_sample_GFFs(
    sample_dirs: Dict[str, str],
    gff_filename: Union[str, Path],
    count_filename: Union[str, Path],
    group_filename: Union[str, Path],
    fastq_filename: Union[str, Path],
    output_prefix: str,
    tree: IntervalTree,
) -> None:
    for _, d in sample_dirs.items():
        with Path(d, f"{output_prefix}.gff.tmp").open("w") as outf:
            for r in GFF.collapseGFFReader(Path(d, gff_filename)):
                n = len(r.ref_exons)
                if n == 1:
                    # single-exon records have no junctions to scrub
                    GFF.write_collapseGFF_format(outf, r)
                    continue
                new_ref_exons = scrub_ref_exons(r, tree)
                if new_ref_exons is None:
                    logger.info(f"No changes made due to error: {r.seqid}")
                else:
                    r.ref_exons = new_ref_exons
                    GFF.write_collapseGFF_format(outf, r)
        cleanup_scrubbed_files_redundancy(
            outf.name,
            Path(d, group_filename),
            Path(d, count_filename),
            Path(d, fastq_filename) if fastq_filename is not None else None,
            Path(d, output_prefix),
        )
def trim5p3p_multithreaded(
    fastq_filename: Union[str, Path], output_prefix: str, chunks: int
) -> None:
    # first figure out how many records there are
    num_lines = 0
    for _ in open(fastq_filename, "r"):
        num_lines += 1
    num_records = num_lines // 4
    # ceiling division so the last chunk absorbs the remainder
    chunk_size = (num_records // chunks) + (num_records % chunks > 0)
    logger.info(f"{fastq_filename} has {num_records} records, {chunk_size} per chunk")

    pools = []
    records = []
    count = 0
    i = 1
    for r in SeqIO.parse(open(fastq_filename), "fastq"):
        count += 1
        records.append(r)
        if count >= chunk_size:
            p = Process(target=trim5p3p, args=(records, f"{output_prefix}.{i}"))
            p.start()
            logger.info(f"Starting worker {i}...")
            pools.append(p)
            records = []
            count = 0
            i += 1
    if records:  # flush the final, possibly smaller, chunk
        p = Process(target=trim5p3p, args=(records, f"{output_prefix}.{i}"))
        p.start()
        logger.info(f"Starting worker {i}...")
        pools.append(p)
    for p in pools:
        p.join()
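# Chunking example (hypothetical sizes): with num_records=10_001 and chunks=4,
# chunk_size = 2500 + 1 = 2501, so the workers receive 2501/2501/2501/2498
# records; the `if records:` guard avoids spawning an empty trailing worker
# when the record count divides evenly.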
def link_files(src_dir: str, out_dir=Path.cwd()) -> Tuple[Path, Path, Path, Path]:
    """
    :param src_dir: job directory
    Locate mapped.fastq, read-stat, and classify report; link to the current directory.
    """
    src_dir = Path(src_dir)
    # location for mapped fastq in IsoSeq3 (pre-SL8)
    mapped_fastq = src_dir.joinpath("outputs", "collapse_isoforms.fastq")
    # SL8+ outputs fasta only
    mapped_fasta = src_dir.joinpath("outputs", "collapse_isoforms.fasta")
    read_stat = src_dir.joinpath("outputs", "collapse_isoforms.read_stat.txt")
    primer_csv = src_dir.joinpath("outputs", "flnc.report.csv")

    if mapped_fastq.exists():
        logger.info("Detecting IsoSeq task directories...")
        return out_dir, mapped_fastq, read_stat, primer_csv
    elif mapped_fasta.exists():
        logger.info("Detecting IsoSeq task directories...")
        return out_dir, mapped_fasta, read_stat, primer_csv
    else:
        raise FileNotFoundError(
            "Cannot find expected files (ex: collapse_isoforms.fastq) in job directory! "
            "Does not look like an Iso-Seq job!"
        )
def brangus(vcf_filename, out_filename, unzip_snps=None):
    if unzip_snps is None:
        unzip_snps = defaultdict(lambda: {})
        for r in vcfpy.Reader.from_path(vcf_filename):
            unzip_snps[r.CHROM][r.POS] = r
    logger.info(f"Finished reading {vcf_filename}")

    with open(out_filename, "w") as out_f:
        FIELDS = [
            "dir",
            "chrom",
            "pos",
            "strand",
            "ref",
            "alt_Short",
            "alt_PB",
            "in_Short",
            "in_PB",
            "cov_Short",
            "cov_PB",
            "genomic_HP",
        ]
        writer = DictWriter(out_f, FIELDS, delimiter="\t")
        writer.writeheader()
        dirs = glob.glob("by_loci/*size*/")
        for d1 in dirs:
            mpileup = Path(d1, "ccs.mpileup")
            mapfile = Path(d1, "fake.mapping.txt")
            vcffile = Path(d1, "phased.partial.vcf")
            config = Path(d1, "config")
            nosnp = Path(d1, "phased.partial.NO_SNPS_FOUND")
            if not vcffile.exists():
                if nosnp.exists():
                    logger.info(f"Skipping {d1} because no SNPs found.")
                else:
                    logger.error(
                        f"Skipping {d1}: neither {vcffile.name} nor {nosnp.name} exists."
                    )
                continue
            logger.info(f"Evaluating {d1}.")
            strand = "NA"
            if config.exists():
                # find the strand this gene family is on
                for line in open(config):
                    if line.startswith("ref_strand="):
                        strand = line.strip().split("=")[1]
            good_positions, cov_at_pos = get_positions_to_recover(
                mapfile, mpileup, unzip_snps, min_cov=30
            )
            name = d1.split("/")[1]
            eval_isophase(
                vcffile,
                unzip_snps,
                good_positions,
                cov_at_pos,
                {},
                {},
                writer,
                name,
                strand,
            )
def convert_sam_rec_to_gff3_rec(r, source, qid_index_dict=None):
    """
    :param r: GMAPSAMRecord record
    :param qid_index_dict: dict of qID -> number of times seen so far -- if a
        qID is redundant, a unique `_dup<n>` suffix is appended
    :return SeqRecord ready to be written as GFF3
    """
    if r.sID == "*":
        logger.info(f"Skipping {r.qID} because unmapped.")
        return None
    t_len = sum(e.end - e.start for e in r.segments)
    seq = Seq("A" * t_len)  # DO NOT CARE since sequence is not written in GFF3
    rec = SeqRecord(seq, r.sID)
    strand = 1 if r.flag.strand == "+" else -1

    if qid_index_dict is not None:  # expected to be a defaultdict(int)
        if r.qID in qid_index_dict:
            qid_index_dict[r.qID] += 1
            r.qID += f"_dup{qid_index_dict[r.qID]}"
        else:
            qid_index_dict[r.qID] += 1

    gene_qualifiers = {"source": source, "ID": r.qID, "Name": r.qID}  # for gene record
    # gene line, one per record
    top_feature = SeqFeature(
        FeatureLocation(r.sStart, r.sEnd),
        type="gene",
        strand=strand,
        qualifiers=gene_qualifiers,
    )
    # exon lines, as many as there are exons per record
    top_feature.sub_features = []
    for i, e in enumerate(r.segments):
        _id = f"{r.qID}.exon{i + 1}"
        exon_qual = {"source": source, "ID": _id, "Name": _id}
        top_feature.sub_features.append(
            SeqFeature(
                FeatureLocation(e.start, e.end),
                type="exon",
                strand=strand,
                qualifiers=exon_qual,
            )
        )
    rec.features = [top_feature]
    return rec
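# Usage sketch (assumes the bcbio-gff package, whose GFF3 writer consumes the
# gene/exon sub_features layout built above; names here are illustrative):
#   from BCBio import GFF as BCBio_GFF
#   recs = filter(None, (convert_sam_rec_to_gff3_rec(r, "gmap", defaultdict(int))
#                        for r in GMAPSAMReader(sam_file, True)))
#   with open("out.gff3", "w") as f:
#       BCBio_GFF.write(recs, f)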
def shaded_bed12_post_sqanti(
    sqanti_class_filename: Union[str, Path],
    input_bed12: Union[str, Path],
    output_prefix: str,
    FL_fieldnames: List[str] = ["FL"],
    ok_to_ignore: bool = False,
) -> None:
    # read input BED12 file into dict
    bed_info = {}  # isoform --> bed record
    for line in open(input_bed12):
        raw = line.strip().split()
        bed_info[raw[3]] = raw

    CPM_fieldnames = {}  # CPM field name -> FL field name
    for k in FL_fieldnames:
        assert k.startswith("FL")
        if k == "FL":
            CPM_fieldnames["CPM"] = "FL"
        else:
            assert k.startswith("FL.")
            CPM_fieldnames["CPM." + k[3:]] = k

    # group SQANTI3 classification file by `associated_gene`
    records_by_gene = defaultdict(lambda: [])
    total_fl_count_dict = Counter()
    for r in DictReader(open(sqanti_class_filename), delimiter="\t"):
        records_by_gene[r["associated_gene"]].append(r)
        for cpm_k, fl_k in CPM_fieldnames.items():
            total_fl_count_dict[cpm_k] += int(r[fl_k]) if r[fl_k] != "NA" else 0

    for cpm_k in total_fl_count_dict:
        if total_fl_count_dict[cpm_k] == 0:
            raise RuntimeError(
                f"No counts observed in column `{CPM_fieldnames[cpm_k]}`. Abort!"
            )

    logger.info(f"Generating count RGB for columns: {', '.join(CPM_fieldnames.keys())}")
    bed_writers = {}
    for cpm_k in CPM_fieldnames:
        outfile = f"{output_prefix}.{cpm_k}.bed12"
        logger.info(f"Writing output to {outfile}....")
        # keep the handles open; shade_isoforms_for_gene_group() writes to them below
        bed_writers[cpm_k] = open(outfile, "w")
        bed_writers[cpm_k].write("track name=PacBioColored itemRgb=On\n")

    # calculate FL CPM per gene group, then shade that group's isoforms
    for _, records in records_by_gene.items():
        for r in records:
            for cpm_k, fl_k in CPM_fieldnames.items():
                r[cpm_k] = (
                    (int(r[fl_k]) if r[fl_k] != "NA" else 0)
                    * (10 ** 6)
                    / total_fl_count_dict[cpm_k]
                )
        shade_isoforms_for_gene_group(records, bed_info, bed_writers, ok_to_ignore)

    for handle in bed_writers.values():
        handle.close()
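# CPM here is full-length reads per million: CPM = FL * 1e6 / total_FL. A tiny
# self-check with hypothetical numbers:
def _example_fl_cpm() -> None:
    fl, total_fl = 25, 500_000  # hypothetical isoform FL count and library total
    assert fl * (10 ** 6) / total_fl == 50.0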
def fq2fa(input_file):
    if not (input_file.lower().endswith(".fastq") or input_file.lower().endswith(".fq")):
        raise AssertionError(f"Input {input_file} does not end with .fastq or .fq! Abort")
    output = Path(input_file).with_suffix(".fasta")
    # write through a single open handle; calling SeqIO.write() with a path
    # would reopen (and truncate) the file for every record
    with open(output, "w") as f:
        for r in SeqIO.parse(open(input_file), "fastq"):
            SeqIO.write(r, f, "fasta")
    logger.info(f"Output written to {output}")
def collect_all_vcf(
    dirs: str,
    vcf_filename: str = "phased.partial.vcf",
    output: str = "IsoSeq_IsoPhase.vcf",
) -> None:
    snps_by_chrom = defaultdict(lambda: [])
    reader = None
    for d in dirs:
        filename = Path(d, vcf_filename)
        if not filename.exists():
            no_snp_found_filename = Path(d, f"{Path(vcf_filename).stem}.NO_SNPS_FOUND")
            if not no_snp_found_filename.exists():
                logger.info(f"VCF file {filename} does not exist. Skipping.")
            continue
        reader = vcfpy.Reader.from_path(filename)
        for r in reader:
            c = Counter()  # genotype -> count
            for x in r.calls:
                if x.data["GT"].count("|") == 0:
                    c[x.data["GT"]] += x.data["HQ"]
                else:
                    for i, gt in enumerate(x.data["GT"].split("|")):
                        c[gt] += x.data["HQ"][i]
            c_keys = c.keys()
            genotype = "|".join(str(k) for k in c_keys)
            counts = ",".join(str(c[k]) for k in c_keys)
            # collapse all calls into a single synthetic SAMPLE call
            r.calls = [vcfpy.Call("SAMPLE", {"GT": genotype, "HQ": counts})]
            snps_by_chrom[r.CHROM].append((r.POS, r))

    keys = sorted(snps_by_chrom.keys())
    if reader is not None:
        reader.header.samples.names = ["SAMPLE"]
        writer = vcfpy.Writer.from_path(output, reader.header)
        for k in keys:
            v = snps_by_chrom[k]
            v.sort(key=lambda x: x[0])
            for _, rec in v:
                writer.write_record(rec)
        writer.close()
    logger.info(f"Output written to: {output}")
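# Merging sketch (hypothetical data): a phased call with GT "0|1" and
# HQ [20, 30] contributes c = {"0": 20, "1": 30}, which is re-emitted as a
# single SAMPLE call with GT "0|1" and HQ "20,30"; an unphased GT ("0") adds
# its scalar HQ to a single bucket.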
def main(
    sample_config: str = typer.Argument(...),
    summary_report: str = typer.Argument(...),
    output_prefix: str = typer.Argument(...),
    min_sample: int = typer.Option(
        1, "-S", help="Minimum number of samples as evidence (default: 1)"
    ),
    min_transcript: int = typer.Option(
        2, "-T", help="Minimum number of transcripts as evidence (default: 2)"
    ),
    scrubbed_junction_file: Optional[Union[str, Path]] = typer.Option(
        None, help="Scrubbed junction bed --- if given, directly use it to scrub GFFs."
    ),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
):
    (
        sample_dirs,
        sample_names,
        group_filename,
        gff_filename,
        count_filename,
        fastq_filename,
    ) = sp.read_config(sample_config)
    report_filename = summary_report
    if scrubbed_junction_file is None:
        output_filename = f"{output_prefix}.scrubbed.junction.bed"
        tree = scrub_junctions(
            report_filename, output_filename, min_sample, min_transcript, True
        )
        logger.info(f"Scrubbed junction written to: {output_filename}")
    else:
        output_filename = scrubbed_junction_file
        logger.info(f"Reading scrubbed junction file: {output_filename}")
        tree = read_scrubbed_junction_to_tree(output_filename)
    scrub_sample_GFFs(
        sample_dirs,
        gff_filename,
        count_filename,
        group_filename,
        fastq_filename,
        output_prefix,
        tree,
    )
def regroup_gff(pooled_gff, demux_count_file, output_prefix, out_group_dict, in_fafq=None):
    """
    :param pooled_gff: GFF file of the pooled transcripts
    :param demux_count_file: comma-delimited per-barcode count file
    :param output_prefix: output prefix for GFF
    :param out_group_dict: dict of barcode name --> group it belongs to (ex: {'EM1':'EM', 'EM2':'EM'})
    :param in_fafq: optional fasta/fastq that was input to the SAM
    """
    if in_fafq is not None:
        type_fafq = get_type_fafq(in_fafq)
    in_tissue = defaultdict(lambda: set())  # pbid --> set of tissues it is in (EM, END, R)
    for r in DictReader(open(demux_count_file), delimiter=","):
        for k, v in r.items():
            if k != "id" and int(v) > 0:
                in_tissue[r["id"]].add(k)

    handles = {}
    handles_fafq = {}
    for g in out_group_dict.values():
        handles[g] = open(f"{output_prefix}_{g}_only.gff", "w")
        if in_fafq is not None:
            handles_fafq[g] = open(f"{output_prefix}_{g}_only.{type_fafq}", "w")

    if in_fafq is not None:
        fafq_dict = SeqIO.to_dict(SeqIO.parse(open(in_fafq), type_fafq))
        # also index by the bare PB id in case the sequence IDs carry suffixes
        fafq_dict_keys = list(fafq_dict.keys())
        for k in fafq_dict_keys:
            m = rex_pbid.match(k)
            if m is not None:
                fafq_dict[m.group(1)] = fafq_dict[k]

    reader = GFF.collapseGFFReader(pooled_gff)
    for r in reader:
        pbid = r.seqid
        if pbid not in in_tissue:
            logger.info(
                f"WARNING: {pbid} does not belong to any group indicated by out_group_dict"
            )
            continue
        groups_to_write_in = set()
        for tissue in in_tissue[pbid]:
            groups_to_write_in.add(out_group_dict[tissue])
        for g in groups_to_write_in:
            GFF.write_collapseGFF_format(handles[g], r)
            if in_fafq is not None:
                SeqIO.write(fafq_dict[pbid], handles_fafq[g], type_fafq)

    for h in list(handles.values()) + list(handles_fafq.values()):
        h.close()
def link_files(src_dir, out_dir=Path.cwd()):
    """
    :param src_dir: job directory
    Locate HQ isoform, (cluster) report.csv, (classify) file.csv and link them
    into the current directory.
    """
    src_dir = Path(src_dir)
    # location for HQ fastq in IsoSeq1
    hq_fastq = src_dir.joinpath(
        "tasks", "pbtranscript.tasks.combine_cluster_bins-0", "hq_isoforms.fastq"
    )
    # location for HQ fastq in IsoSeq2
    hq_fastq2 = src_dir.joinpath(
        "tasks", "pbtranscript2tools.tasks.collect_polish-0", "all_arrowed_hq.fastq"
    )
    # location for cluster report in IsoSeq1
    cluster_csv = src_dir.joinpath(
        "tasks", "pbtranscript.tasks.combine_cluster_bins-0", "cluster_report.csv"
    )
    # location for cluster report in IsoSeq2
    cluster_csv2 = src_dir.joinpath(
        "tasks", "pbtranscript2tools.tasks.collect_polish-0", "report.csv"
    )
    # location for classify report in IsoSeq1 and 2
    primer_csv = src_dir.joinpath("tasks", "pbcoretools.tasks.gather_csv-1", "file.csv")

    # note: Path.symlink_to() creates the link AT the calling path, so the new
    # link in out_dir must be the caller and the job file the target
    if hq_fastq.exists():
        logger.info("Detecting IsoSeq1 task directories...")
        out_dir.joinpath("hq_isoforms.fastq").symlink_to(hq_fastq)
        out_dir.joinpath("cluster_report.csv").symlink_to(cluster_csv)
        out_dir.joinpath("classify_report.csv").symlink_to(primer_csv)
        isoseq_version = "1"
    else:
        logger.info("Detecting IsoSeq2 task directories...")
        out_dir.joinpath("hq_isoforms.fastq").symlink_to(hq_fastq2)
        out_dir.joinpath("cluster_report.csv").symlink_to(cluster_csv2)
        out_dir.joinpath("classify_report.csv").symlink_to(primer_csv)
        isoseq_version = "2"
    return (
        out_dir,
        "hq_isoforms.fastq",
        "cluster_report.csv",
        "classify_report.csv",
        isoseq_version,
    )
def main(gtf):
    transcript_tally = {}
    for tID in gtf.transcript:
        transcript_tally[tID] = [0] * len(gtf.get_exons(tID))
    for r in btabBlockReader("sim_gencode_20x_first1000_test2.gmap.tophits.btab"):
        path = btab_reclist_to_interval_list_0basedStart(r)
        info = match_transcript(gtf, r[0]["chr"], path)
        if info["matchedExons"] is None:
            logger.info(f"Did not find a match for {r[0]['seqid']}!")
            continue
        for i, _ in info["matchedExons"]:
            transcript_tally[info["tID"]][i] += 1
    return transcript_tally
def err_correct(
    genome_file: Path,
    sam_file: Path,
    output_err_corrected_fasta: Path,
    genome_dict: Optional[Dict[str, SeqIO.SeqRecord]] = None,
) -> None:
    if genome_dict is None:
        genome_dict = {}
        logger.info(f"Loading {genome_file.name}")
        for r in tqdm(SeqIO.parse(OpenFile(genome_file, "r"), "fasta")):
            genome_dict[r.name] = r
        logger.info(f"Finished reading {genome_file}")
    with open(output_err_corrected_fasta, "w") as f:
        reader = BioReaders.GMAPSAMReader(str(sam_file), True)
        for r in tqdm(reader):
            if r.sID == "*":
                continue
            seq = consistute_genome_seq_from_exons(
                genome_dict, r.sID, r.segments, r.flag.strand
            )
            f.write(f">{r.qID}\n{seq}\n")
    logger.info(f"Output written to {output_err_corrected_fasta}")
def get_roi_len(seqid: str):
    # pre-isoseq3: <movie>/<zmw>/<start>_<end>_CCS
    # isoseq3: <movie>/<zmw>/ccs
    if seqid.endswith("/ccs"):
        logger.info(
            "WARNING: isoseq3 format detected. Output `length` column will be `NA`."
        )
        return "NA"
    elif not seqid.endswith("_CCS"):
        logger.error(
            "Sequence ID format must be <movie>/<zmw>/<start>_<end>_CCS or <movie>/<zmw>/ccs! Abort!"
        )
        sys.exit(-1)
    s, e, _ = seqid.split("/")[2].split("_")
    return abs(int(s) - int(e))
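# Tiny self-check with hypothetical read IDs covering both supported formats:
def _example_get_roi_len() -> None:
    assert get_roi_len("m54006_180101_000000/42/ccs") == "NA"  # isoseq3
    assert get_roi_len("m140121_000000/42/3985_27_CCS") == 3958  # pre-isoseq3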
def main(
    genome_fasta: str = typer.Argument(..., help="Reference genome fasta"),
    flnc_filename: str = typer.Argument(..., help="FLNC fastq file"),
    gff_filename: str = typer.Argument(
        ..., help="GFF file of transcripts, IDs must be PB.X.Y"
    ),
    stat_filename: str = typer.Argument(
        ..., help="Tab-delimited read stat file linking FLNC to PB.X.Y"
    ),
    coverage: int = typer.Option(
        40, "--coverage", "-c", help="Minimum FLNC coverage required (default: 40)"
    ),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
) -> None:
    if Path("by_loci").is_dir():
        logger.error("Directory by_loci/ already exists. Delete before running!")
        sys.exit(-1)
    if not Path(genome_fasta).exists():
        logger.error(f"Cannot find genome FASTA {genome_fasta}. Abort!")
        sys.exit(-1)
    if not Path(flnc_filename).exists():
        logger.error(f"Cannot find FLNC file {flnc_filename}. Abort!")
        sys.exit(-1)
    if not Path(gff_filename).exists():
        logger.error(f"Cannot find GFF file {gff_filename}. Abort!")
        sys.exit(-1)
    if not Path(stat_filename).exists():
        logger.error(f"Cannot find Stat file {stat_filename}. Abort!")
        sys.exit(-1)
    logger.info(f"Reading genome fasta {genome_fasta}...")
    genome_d = SeqIO.to_dict(SeqIO.parse(open(genome_fasta), "fasta"))
    select_loci_to_phase(genome_d, gff_filename, stat_filename, flnc_filename, coverage)
def fa2fq(input_file):
    if not (input_file.lower().endswith(".fasta") or input_file.lower().endswith(".fa")):
        raise AssertionError(f"Input {input_file} does not end with .fasta or .fa! Abort")
    output = Path(input_file).with_suffix(".fastq")
    with open(output, "w") as f:
        for r in SeqIO.parse(open(input_file), "fasta"):
            # assign a uniform placeholder quality of 60 to every base
            r.letter_annotations["phred_quality"] = [60] * len(r.seq)
            SeqIO.write(r, f, "fastq")
    logger.info(f"Output written to {f.name}")
    return f.name
def main_pasa(gtf):
    pasa_tally = {}
    for tID in gtf.transcript:
        pasa_tally[tID] = [0] * len(gtf.get_exons(tID))
    pasa = GTF(
        "sim_gencode_20x_first1000_test2.pasa_assemblies.denovo_transcript_isoforms.gtf"
    )
    for tID in pasa.transcript:
        path = pasa.get_exons(tID)
        seqname = pasa.exon[(path[0].start, path[0].end)][0][2]
        info = match_transcript(gtf, seqname, path)
        if info["matchedExons"] is None:
            logger.info(f"Did not find a match for {tID}!")
            continue
        for i, _ in info["matchedExons"]:
            pasa_tally[info["tID"]][i] += 1
    return pasa_tally
def scrub_ref_exons(r: Dict[str, Any], tree: IntervalTree) -> Optional[List[Interval]]:
    n = len(r.ref_exons)
    new_ref_exons = []
    cur_start = r.ref_exons[0].start
    for i in range(n - 1):
        donor = r.ref_exons[i].end - 1  # make it 0-based
        accep = r.ref_exons[i + 1].start  # start is already 0-based
        match = find_best_match_junction(tree[r.chr, r.strand], donor, accep)
        if match is None:
            logger.info(
                f"donor-acceptor site {r.chr},{r.strand},{donor}-{accep} has no hit in tree!"
            )
            return None
        new_ref_exons.append(Interval(cur_start, match.start + 1))
        cur_start = match.end
    new_ref_exons.append(Interval(cur_start, r.ref_exons[-1].end))
    return new_ref_exons
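# Worked example (hypothetical coordinates): for exons [(100, 200), (300, 400)],
# the junction is donor=199 (0-based), acceptor=300. If the tree's best match
# is Interval(204, 297), the scrubbed chain becomes [(100, 205), (297, 400)]:
# junctions snap to their best-matching scrubbed junction while the outer exon
# boundaries are preserved.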
def sep_by_primer(filename, output_prefix, sample_size):
    filetype = type_fa_or_fq(filename)
    ids = [r.id for r in SeqIO.parse(open(filename), filetype)]
    n = len(ids)
    if sample_size > n:
        logger.warning(
            f"WARNING: {filename} contains only {n} sequences but subsample size is {sample_size}! Simply output whole file."
        )
    # use a set for O(1) membership tests during the second pass
    chosen_ids = set(random.sample(ids, min(n, sample_size)))
    with open(f"{output_prefix}.random{sample_size}.{filetype}", "w") as f:
        for r in SeqIO.parse(open(filename), filetype):
            if r.id in chosen_ids:
                SeqIO.write(r, f, filetype)
    logger.info(f"Randomly selected sequences written to {f.name}.")
def main(
    snps_filename: str = typer.Argument(
        ..., help="Filename containing the list of .snps files to process."
    ),
    genome_filename: str = typer.Argument(
        ..., help="Genome fasta. Chromosome IDs must agree with the .snps files!"
    ),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
):
    snps_filename = Path(snps_filename)
    genome_filename = Path(genome_filename)
    snps_files = []
    # sanity checking of input files
    for line in open(snps_filename):
        filename = Path(line.strip())
        if filename.suffix != ".snps":
            raise FileNotFoundError(
                f"Input files listed in {snps_filename} must end with .snps!"
            )
        if not filename.exists():
            raise FileNotFoundError(f"{filename} does not exist! Abort.")
        snps_files.append(filename)

    if not genome_filename.exists():
        raise FileNotFoundError(f"Genome file {genome_filename} does not exist!")

    logger.info(f"Reading genome file {genome_filename}....")
    genome_d = LazyFastaReader(genome_filename)

    # if the genome chromosome IDs carry a |arrow|arrow style suffix, also
    # index them under the stripped name
    keys = list(genome_d.keys())
    for k in keys:
        k2 = k.split("|")[0]
        if k2 != k and k2 not in keys:
            genome_d.d[k2] = genome_d.d[k]
            logger.info(f"Detected | string in chromosome ID, stripping {k} to {k2}....")
    logger.info("Finished reading genome.")

    for snp_file in snps_files:
        assert snp_file.suffix == ".snps"
        vcf_file = snp_file.with_suffix(".vcf")
        logger.info(f"Processing {snp_file} --> {vcf_file}")
        write_snp_to_vcf(snp_file, vcf_file, genome_filename, genome_d)
def demux_isoseq2_no_genome(
    job_dir: Optional[Path] = None,
    hq_fastq: Optional[Path] = None,
    cluster_csv: Optional[Path] = None,
    classify_csv: Optional[Path] = None,
    output_filename=sys.stdout,
):
    if job_dir is not None:
        (
            out_dir_ignore,
            hq_fastq,
            cluster_csv,
            classify_csv,
            isoseq_version,
        ) = link_files(job_dir)
        assert isoseq_version in ("1", "2")
    else:
        for _ in (hq_fastq, cluster_csv, classify_csv):
            if not _.exists():
                raise FileNotFoundError(f"{_.name} was not found!")
        # no job directory to infer the version from; assume IsoSeq2 here
        # (IsoSeq1 inputs should go through job_dir / link_files)
        isoseq_version = "2"

    # info: dict of hq_isoform --> primer --> FL count
    logger.info(f"Reading {classify_csv}...")
    max_primer, classify_info = read_classify_csv(classify_csv)
    logger.info(f"Reading {cluster_csv}...")
    info = read_cluster_csv(cluster_csv, classify_info, isoseq_version)

    # output_filename may be a path or an already-open handle (e.g. sys.stdout)
    f = (
        output_filename
        if hasattr(output_filename, "write")
        else open(output_filename, "w")
    )
    f.write(f"id,{','.join('primer' + str(i) for i in range(max_primer + 1))}\n")
    logger.info(f"Reading {hq_fastq}...")
    for r in SeqIO.parse(open(hq_fastq), "fastq"):
        m = (hq1_id_rex if isoseq_version == "1" else hq2_id_rex).match(r.id)
        if m is None:
            raise RuntimeError(f"Unexpected HQ isoform ID format: {r.id}! Abort.")
        cid = m.group(1)
        f.write(r.id)
        for p in range(max_primer + 1):
            f.write(f",{info[cid][p]}")
        f.write("\n")
    if f is not sys.stdout:
        f.close()
    logger.info(f"Count file written to {f.name}.")
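# The demuxed count CSV produced above looks like (hypothetical IDs and counts):
#   id,primer0,primer1
#   HQ_sample1|cb101_c22/f3p0/6585,12,0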
def main(
    input_file: str = typer.Option(..., "--input", "-i", help="Input fasta or fastq."),
    sam_filename: str = typer.Option(..., "--sam_filename", "-s", help="Aligned SAM filename."),
    genome_filename: str = typer.Option(..., "--genome_filename", "-g", help="Genome fasta."),
    output_prefix: str = typer.Option(..., "--output_prefix", "-o", help="Output prefix."),
    gff: Optional[str] = typer.Option(None, "--gff", help="Annotation GFF."),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
):
    # read genome
    logger.info(f"Reading genome {genome_filename}...")
    genome_d = SeqIO.to_dict(SeqIO.parse(open(genome_filename), "fasta"))
    # read gff
    if gff is not None:
        logger.info(f"Reading annotation {gff}...")
        junction_info = read_annotation_for_junction_info(gff)
    else:
        junction_info = None
    evaluate_alignment_sam(input_file, sam_filename, genome_d, output_prefix, junction_info)