def write_fasta(self, fout, ids=None, seqs=None, write_all=False, fasta=None) -> None:
    """
    Write gRNA sequences to FASTA file.

    Arguments:
        fout (str): required, path to output file
        ids (list): list of IDs (str) of gRNA to write
        seqs (list): list of sequences (str) of gRNA to write, overrides ``ids``
        write_all (bool): write all gRNA sequences, overrides ``seqs`` and ``ids``
        fasta (str): optional, path to FASTA file, used for renaming gRNA
    """
    ## bug fix: use None sentinels instead of mutable default arguments
    ## (default lists are shared across calls); empty list behaves identically
    ids = [] if ids is None else ids
    seqs = [] if seqs is None else seqs
    ## get relevant gRNA sequences
    if write_all:
        gRNA_seqs = self.flatten_gRNAseqs()
    elif seqs:
        gRNA_seqs = self.get_gRNAseqs_by_seq(*seqs)
    elif ids:
        gRNA_seqs = self.get_gRNAseqs_by_id(*ids)
    else:
        print("Either 'ids' OR 'seqs' OR 'write_all' is required. Writing empty file.")
        ## use context manager so the handle is closed (was a bare open().write())
        with open(fout, "w+") as f:
            f.write('')
        return
    ## rename sequences per fasta file (if fasta file provided):
    ## map sequence string -> seqid in the provided FASTA
    fasta_inv = ({} if not fasta
                 else {str(seq): k for k, seq in fasta_to_dict(fasta).items()})
    self.assign_seqid(assign_all=False)
    to_write = {fasta_inv.get(gRNA_seq.seq, gRNA_seq.id): gRNA_seq.seq
                for gRNA_seq in gRNA_seqs}
    if to_write:
        dict_to_fasta(to_write, fout)
    else:
        with open(fout, "w+") as f:
            f.write('')
    return
def extend_reference(
        feature: list,
        subfeature: list,
        fout_fasta,
        fout_gff,
        mafft="mafft",
        feature_type="mRNA",
        subfeature_type="CDS",  ## xx_type are currently unused
        thread: int = 1,
        directory=None,
        tmp=True,
        logger=None):
    """
    Build an extended reference: group subfeature (e.g. CDS) sequences with
    their parent feature (e.g. genomic/mRNA) sequences, align each group with
    MAFFT, convert the alignments to GFF annotations, and write the combined
    feature sequences to FASTA.

    Arguments:
        feature (list or str): path(s) to FASTA file(s) of feature sequences
        subfeature (list or str): path(s) to FASTA file(s) of subfeature sequences
        fout_fasta (str): path to output FASTA file of combined feature sequences
        fout_gff (str): path to output GFF file of inferred annotations
        mafft (str): MAFFT executable; default 'mafft'
        feature_type (str): currently unused
        subfeature_type (str): currently unused
        thread (int): number of threads for MAFFT
        directory (str): working directory; a temporary one is created if None
        tmp (bool): remove working directory when done
        logger: optional logger forwarded to group_by_gene

    Raises:
        InvalidPath: if any input FASTA path does not exist
    """
    feature = feature if non_string_iter(feature) else [feature]
    subfeature = subfeature if non_string_iter(subfeature) else [subfeature]
    valid_feature = [x for x in feature if os.path.exists(x)]
    valid_subfeature = [x for x in subfeature if os.path.exists(x)]
    ## raise Error for inaccessible files
    invalid_feature = [x for x in feature if x not in valid_feature]
    invalid_subfeature = [x for x in subfeature if x not in valid_subfeature]
    invalid = invalid_feature + invalid_subfeature
    if invalid:
        ## bug fix: the exception was instantiated but never raised,
        ## so invalid paths were silently ignored
        raise InvalidPath(','.join(invalid))
    if valid_feature and valid_subfeature:
        ## create working directory if none provided; mark it for cleanup
        if directory is None:
            directory = tempfile.mkdtemp()
            tmp = True
        group_by_gene(subfeature, feature, directory, sep='.', verbose=True,
                      logger=logger)
        ## align each per-gene FASTA with MAFFT, then delete the unaligned file
        ## (raw strings for regex patterns to avoid invalid-escape warnings)
        for fasta in [f for f in os.listdir(directory) if re.search(r"\.fasta$", f)]:
            feature_name = re.search(r"^.+(?=\.fasta$)",
                                     os.path.basename(fasta)).group(0)
            with open(os.path.join(directory, f"{feature_name}_aln.fa"), "w+") as f:
                stdout, stderr = MafftCommandline(
                    mafft, input=os.path.join(directory, fasta),
                    quiet=True, thread=thread)()
                f.write(stdout)
            os.remove(os.path.join(directory, fasta))
        ## convert alignment to gff
        aln_to_annotation(directory, fout=fout_gff, sep='.', outfmt="gff")
        ## combine genomic files
        seqs_feature = dict(
            itertools.chain(*[fasta_to_dict(fa).items() for fa in feature]))
        dict_to_fasta(seqs_feature, fout_fasta)
        ## remove temporary directories
        if tmp:
            import shutil
            shutil.rmtree(directory)
    return
def get_merged_seqs(merged_f, fasta, fout, header=None, indv_i=1):
    """
    Extract sequences at the ranges listed in a tab-separated file and write
    them to FASTA.

    Arguments:
        merged_f (str): path to tab-separated file of merged ranges; first line
            is a header containing at least 'molecule', 'start', and 'end'
        fasta (str): path to FASTA file to extract sequences from
        fout (str): path to output FASTA file
        header (list): currently unused; kept for backward compatibility
        indv_i (int): individual index used as prefix in output seqids
    """
    ## bug fix: avoid a mutable default argument (header is currently unused)
    header = [] if header is None else header
    ## get domain ranges
    dat = [x.split('\t') for x in splitlines(merged_f)]
    get = make_custom_get(dat[0])
    dat = dat[1:]
    ## parse fasta file
    seqs = fasta_to_dict(fasta)
    ## get sequences; seqid format: <indv>|<molecule>|<entry #>|<1-based start>-<end>
    output = {}
    for i, entry in enumerate(dat):
        seq = seqs[get(entry, "molecule")][get(entry, "start"):get(entry, "end")]
        key = '|'.join([str(x) for x in
                        ([indv_i, get(entry, "molecule"), i + 1,
                          f"{get(entry, 'start') + 1}-{get(entry, 'end')}"])])
        output[key] = seq
    dict_to_fasta(output, fout)
    return
def group_by_gene(fa_cds, fa_genomic, directory, sep='.', verbose=True, logger=None):
    """
    Group CDS sequences with their parent genomic (gene) sequences and write
    one FASTA file per gene into ``directory``.

    CDS seqids are expected to be of the form ``<gene><sep><suffix>``; the gene
    ID is recovered by stripping the final ``<sep>``-delimited field.

    Arguments:
        fa_cds (list): paths to FASTA file(s) of CDS sequences
        fa_genomic (list): paths to FASTA file(s) of genomic (gene) sequences
        directory (str): output directory for per-gene FASTA files
        sep (str): separator between gene ID and CDS suffix in CDS seqids
        verbose (bool): print/log orphan sequences
        logger: optional logger object with ``plain``/``fplain`` methods
    """
    seqs_cds = {seqid: seq
                for fname in fa_cds
                for seqid, seq in fasta_to_dict(fname).items()}
    seqs_genomic = {seqid: seq
                    for fname in fa_genomic
                    for seqid, seq in fasta_to_dict(fname).items()}
    genes = {seqid: [] for seqid in seqs_genomic}
    orphan_cds = []
    ## map CDS to GENOMIC: strip the last <sep>-delimited field to get the gene ID
    ## (escape sep inside the character class too, in case it is a regex metachar)
    gene_pattern = re.compile(f"^.+?(?={re.escape(sep)}[^{re.escape(sep)}]+$)")
    for seqid in seqs_cds.keys():
        mtch = gene_pattern.match(seqid)
        ## bug fix: a seqid without <sep> previously crashed with AttributeError
        ## on .group(0); treat it as an orphan instead
        if mtch is None or mtch.group(0) not in genes:
            orphan_cds.append(seqid)
        else:
            genes[mtch.group(0)].append(seqid)
    ## print orphans
    orphan_genes = sorted(gene for gene, cds in genes.items() if not cds)
    def log(msg):
        ## prefer logger output; fall back to print only if verbose
        if verbose and logger:
            logger.plain(msg)
        elif logger:
            logger.fplain(msg)
        elif verbose:
            print(msg)
    if orphan_genes:
        log(f"GENOMIC sequences without CDS: {','.join(orphan_genes)}")
    if orphan_cds:
        log(f"CDS sequences without GENOMIC: {','.join(orphan_cds)}")
    ## write one file per gene: genomic sequence first, then its CDS
    for gene, cds in genes.items():
        to_write = {gene: seqs_genomic[gene],
                    **{seqid_cds: seqs_cds[seqid_cds] for seqid_cds in cds}}
        dict_to_fasta(to_write, f"{directory}/{gene}.fasta")
    return
def mask_identical(to_mask_fname, fasta_fname, fout_fname, **kwargs):
    """
    Mask regions of ``fasta_fname`` sequences that are identical to sequences
    in ``to_mask_fname``.

    Sequences with ambiguous bases (typically runs of 'N' in scaffold-level
    assemblies) are not compatible with BLAST and are handled by exact-match
    search instead.

    Arguments:
        to_mask_fname (str): path to FASTA file of sequences to mask
        fasta_fname (str): path to FASTA file of sequences to search
        fout_fname (str): path to output file (forwarded to blast_mask)
        **kwargs: passed through to blast_mask

    Returns:
        list: Masked objects describing masked regions
    """
    seqs_to_mask = fasta_to_dict(to_mask_fname)
    ## separate sequences with ambiguous bases (not compatible with BLAST) and those without
    standard_bases = {'A', 'T', 'G', 'C', 'U', 'a', 't', 'g', 'c', 'u'}
    unambig_to_mask = {seqid: seq for seqid, seq in seqs_to_mask.items()
                       if set(seq).issubset(standard_bases)}
    ambig_to_mask = {seqid: seq for seqid, seq in seqs_to_mask.items()
                     if seqid not in unambig_to_mask}
    ## if some (but not all) sequences have ambiguous bases, write only the
    ## BLAST-compatible ones to a temporary file for the BLAST step
    if (unambig_to_mask and ambig_to_mask):
        import tempfile
        ## bug fix: mkstemp returns (fd, path); close the OS-level fd so it
        ## is not leaked (previously only [1] was taken)
        fd, tmp_to_mask_fname = tempfile.mkstemp(suffix=".fasta")
        os.close(fd)
        dict_to_fasta(unambig_to_mask, tmp_to_mask_fname)
    else:
        tmp_to_mask_fname = None
    ## start masking
    masked = []
    ## if at least one sequence doesn't have ambiguous bases: BLAST-based masking
    if unambig_to_mask:
        masked.extend(
            blast_mask((tmp_to_mask_fname if tmp_to_mask_fname is not None
                        else to_mask_fname),
                       fasta_fname, fout_fname, **kwargs))
    ## if at least one sequence has ambiguous bases: exact-match masking
    if ambig_to_mask:
        masked.extend([Masked(hsp)
                       for query_result in find_identical_in_fasta(ambig_to_mask,
                                                                   fasta_fname)
                       for hit in query_result
                       for hsp in hit.hsps])
    ## remove temporary file if created
    if tmp_to_mask_fname is not None:
        os.remove(tmp_to_mask_fname)
    return masked
def collapse_query(self, fout_fasta=None, fout_map=None):
    """
    Collapse identical query sequences into a non-redundant (nr) FASTA file
    plus a tab-separated mapping of each nr seqid to all identical seqids.

    Sets ``self.query_nr`` (nr FASTA path) and ``self.query_nr_map`` (mapping
    path).

    Arguments:
        fout_fasta (str): optional output path for nr FASTA; a temporary file
            in ``self.directory`` is created if None
        fout_map (str): optional output path for the mapping; a temporary file
            in ``self.directory`` is created if None
    """
    import os
    import tempfile
    ## bug fix: mkstemp returns (fd, path); close the OS-level fds so they
    ## are not leaked (previously only [1] was taken)
    if fout_fasta is not None:
        self.query_nr = fout_fasta
    else:
        fd, self.query_nr = tempfile.mkstemp(dir=self.directory)
        os.close(fd)
    if fout_map is not None:
        self.query_nr_map = fout_map
    else:
        fd, self.query_nr_map = tempfile.mkstemp(dir=self.directory)
        os.close(fd)
    dat = fasta_to_dict(self.query)
    ## group seqids by identical sequence string
    identicals = {k: set(seqid for seqid, seq in dat.items() if str(seq) == str(v))
                  for k, v in dat.items()}
    identical_sets = set(map(lambda x: tuple(sorted(x)), identicals.values()))
    ## write nr sequences (one representative per identical set)
    dict_to_fasta({seqids[0]: dat[seqids[0]] for seqids in identical_sets},
                  self.query_nr)
    ## write nr mapping (one tab-separated group per line)
    with open(self.query_nr_map, 'w') as f:
        f.write('\n'.join(['\t'.join(seqids) for seqids in identical_sets]))
    return
def remove_non_max_bitscore(fasta, bedtool, genes, relax=False, lvl=0, quiet=True,
                            colnames_blast=[
                                "chrom", "start", "end", "candidate", "cstart", "cend"
                            ],
                            blast_metrics=["bitscore"],
                            colnames_bed=[
                                "bed_chrom", "bed_start", "bed_end", "id", "score",
                                "strand", "source", "feature", "phase", "attributes",
                                "overlap"
                            ],
                            colnames_gff=[
                                "bed_chrom", "source", "feature", "bed_start",
                                "bed_end", "score", "strand", "phase", "attributes",
                                "overlap"
                            ],
                            bedtools='', attribute_mod={}) -> None:
    """
    Remove query sequences for which the subject feature in the query-subject hit
    with the max bitscore is not a target gene/feature. This occurs in-place
    (``fasta`` is overwritten with the filtered sequences).

    Arguments:
        fasta (str): path to FASTA file of query sequences to reduce
        bedtool (:class:`BedTool`): BedTool object where BLAST hits have been
            intersected with subject GFF3 files
        genes (list): gene/feature IDs of targets
        relax (bool): retain query sequences even if max bitscore hit overlaps
            with non-target feature so long as it also overlaps with a target
            feature
        lvl (int): printing indentation
        quiet (bool): silence non-essential messages
        colnames_blast (list): column names of BLAST output
        blast_metrics (list): additional column names of metrics in BLAST output
        colnames_bed (list): column names if annotation intersected with is BED
            format
        colnames_gff (list): column names if annotation intersected with is GFF3
            format
        bedtools (str): path to directory containing BEDTools executables if
            bedtool is not in command-search path
        attribute_mod (dict): optional, required only if non-standard attribute
            field names are present in GFF3 files. Dictionary describing
            attribute modification.
    """
    import itertools
    pybedtools.helpers.set_bedtools_path(path=bedtools)
    printi = make_local_print(quiet=quiet,
                              printf=make_print_preindent(initial_lvl=lvl))
    genes = set(genes)
    ## full column name sets for BED-intersected vs GFF-intersected rows
    cols_bed = colnames_blast + blast_metrics + colnames_bed
    cols_gff = colnames_blast + blast_metrics + colnames_gff
    ## make get function
    get_bed = make_custom_get(cols_bed)
    get_gff = make_custom_get(cols_gff)
    def get(data, *args, **kwargs):
        ## dispatch to the BED or GFF getter based on entry length;
        ## entries of unknown length fall back to a dummy lookup
        if not data:
            helper_get = get_bed
        else:
            entry_len = (len(data) if not isinstance(data[0], (list, tuple))
                         else len(data[0]))
            if entry_len == len(cols_bed):
                helper_get = get_bed
            elif entry_len == len(cols_gff):
                helper_get = get_gff
            else:
                return get_bed([], "dummy", suppress_print=True)
        return helper_get(data, *args, **kwargs)
    ## collect per-candidate annotations + bitscores for gene/pseudogene rows
    ## ('.' = no overlapping feature)
    data = {}
    for entry in (str(bedtool).split('\n') if isinstance(bedtool, BedTool)
                  else tuple(itertools.chain(
                      *[str(bedtool_obj).split('\n')
                        for bedtool_obj in bedtool]))):
        entry = entry.split('\t')
        if get(entry, "feature") in ("gene", "pseudogene", '.'):
            data[get(entry, "candidate")] = (
                data.get(get(entry, "candidate"), [])
                + [{"ann": Annotation(get(entry, *colnames_gff), None,
                                      attr_mod=attribute_mod),
                    "bitscore": get(entry, "bitscore")}])
    ## get largest bitscore for each candidate target
    max_bitscore = {candidate: max([entry["bitscore"]
                                    for entry in data[candidate]])
                    for candidate in data}
    ## identify sequences to discard and print warnings if candidate has max
    ## bitscore with target and non-target
    ## note that due to the algorithm, candidates that don't overlap with ANY
    ## features will also be kept
    throw = []
    for candidate in data:
        ## IDs of the features overlapped by this candidate's max-bitscore hit(s)
        max_bitscore_genes = set(
            entry["ann"].get_attr("ID", fmt=str)
            for entry in data[candidate]
            if entry["bitscore"] == max_bitscore[candidate])
        if max_bitscore_genes.issubset(
                genes):  ## if max score genes are subset of target genes
            continue
        else:
            if max_bitscore_genes.isdisjoint(
                    genes):  ## if no target genes have max score
                throw.append(candidate)
            else:  ## if overlapping but not subset
                if relax:
                    printi((
                        f"Warning: candidate target '{candidate}' has hit(s) with bitscore"
                        f" {max_bitscore[candidate]} that overlap(s) with target gene(s)"
                        f" {genes & max_bitscore_genes} and non-target gene(s)"
                        f" {max_bitscore_genes - genes}."
                        " This sequence will be retained as 'relax' has been set to True."
                    ))
                else:
                    throw.append(candidate)
                    printi((
                        f"Warning: candidate target '{candidate}' has hit(s) with bitscore"
                        f" {max_bitscore[candidate]} that overlap(s) with target gene(s)"
                        f" {genes & max_bitscore_genes} and non-target gene(s)"
                        f" {max_bitscore_genes - genes}."
                        " This sequence will be removed from the list of candidate targets"
                        " as 'relax' has been set to False."))
    ## read original candidate targets, filter, and write
    seqs = fasta_to_dict(fasta)
    dict_to_fasta(
        {seq_id: seq for seq_id, seq in seqs.items() if seq_id not in throw},
        fasta)
    return