def prepare_denovo_input_bed(inputfile, params, outdir):
    """Prepare a BED file for de novo motif prediction.

    All regions are set to the same size, the result is split into a
    prediction and a validation set, and both sets are converted to FASTA.
    A third FASTA file, built from the validation set, is created for
    location plots.

    Parameters
    ----------
    inputfile : str
        BED file with input regions.

    params : dict
        Dictionary with parameters. The keys read here are ``width``,
        ``abs_max``, ``fraction``, ``genome``, ``lwidth`` and
        ``use_strand``.

    outdir : str
        Output directory to save files.
    """
    logger.info("preparing input (BED)")

    # Create BED file with regions of equal size
    width = int(params["width"])
    bedfile = os.path.join(outdir, "input.bed")
    write_equalwidth_bedfile(inputfile, width, bedfile)

    abs_max = int(params["abs_max"])
    fraction = float(params["fraction"])
    pred_bedfile = os.path.join(outdir, "prediction.bed")
    val_bedfile = os.path.join(outdir, "validation.bed")

    # Split input into prediction and validation set
    logger.debug(
        "Splitting %s into prediction set (%s) and validation set (%s)",
        bedfile,
        pred_bedfile,
        val_bedfile,
    )
    divide_file(bedfile, pred_bedfile, val_bedfile, fraction, abs_max)

    # NOTE(review): the returned object was never used in this function.
    # The call is kept in case constructing MotifConfig has side effects;
    # confirm and remove if it does not.
    MotifConfig()

    genome = Genome(params["genome"])
    for infile in (pred_bedfile, val_bedfile):
        # Swap only the file extension. A plain str.replace(".bed", ".fa")
        # would also rewrite any ".bed" occurring in the directory part of
        # the path and send the output to the wrong location.
        fasta_file = os.path.splitext(infile)[0] + ".fa"
        genome.track2fasta(infile, fasta_file)

    # Create file for location plots; the validation regions are extended
    # on both sides so that they span lwidth instead of width.
    lwidth = int(params["lwidth"])
    extend = (lwidth - width) // 2

    genome.track2fasta(
        val_bedfile,
        os.path.join(outdir, "localization.fa"),
        extend_up=extend,
        extend_down=extend,
        stranded=params["use_strand"],
    )
def as_fasta(seqs, genome=None):
    """Return `seqs` as a Fasta object, converting it when necessary.

    Parameters
    ----------
    seqs : object
        Input sequences. How they are handled depends on the type reported
        by ``get_seqs_type``: ``"fasta"`` is returned unchanged,
        ``"fastafile"`` is loaded with ``Fasta``, and anything else is
        converted with ``Genome.track2fasta``.

    genome : str or Genome, optional
        Genome used for the conversion. A string is turned into a
        ``Genome`` instance. Only required when `seqs` is neither a Fasta
        object nor a FASTA file.

    Returns
    -------
    Fasta
        The input itself, or a new Fasta object.

    Raises
    ------
    ValueError
        If a conversion is needed but no genome was supplied.
    """
    ftype = get_seqs_type(seqs)
    if ftype == "fasta":
        return seqs
    if ftype == "fastafile":
        return Fasta(seqs)

    if genome is None:
        raise ValueError("need genome to convert to FASTA")

    tmpfa = NamedTemporaryFile()
    # isinstance also accepts str subclasses, which the former
    # type(genome) == type("") comparison rejected.
    if isinstance(genome, str):
        genome = Genome(genome)
    genome.track2fasta(seqs, tmpfa.name)
    return Fasta(tmpfa.name)
def _as_seqdict_genome_regions(regions, minsize=None):
    """Convert regions that carry their own genome to a sequence dict.

    Accepts list of regions where the genome is encoded in the region,
    using the genome@chrom:start-end format.

    Parameters
    ----------
    regions : list of str
        Regions in ``genome@chrom:start-end`` format.

    minsize : int, optional
        Passed on to ``_check_minsize``.

    Returns
    -------
    The result of ``_check_minsize`` applied to a dict that maps every
    input region to its sequence, in the order of `regions`.
    """
    import os

    # Group the regions per genome. Every genome is instantiated once, up
    # front, so a problem with a genome surfaces before any sequence is
    # extracted; the instance is reused below instead of being rebuilt.
    genomes = {}
    genomic_regions = {}
    for region in regions:
        genome_name, coords = region.split("@")
        if genome_name not in genomic_regions:
            genomes[genome_name] = Genome(genome_name)
            genomic_regions[genome_name] = []
        genomic_regions[genome_name].append(coords)

    tmpfa = NamedTemporaryFile(mode="w", delete=False)
    try:
        # Leaving the with-block flushes and closes the handle, so the
        # file is complete on disk before it is read back by name.
        with tmpfa:
            for genome_name, g_regions in genomic_regions.items():
                for seq in genomes[genome_name].track2fasta(g_regions):
                    seq.name = f"{genome_name}@{seq.name}"
                    print(repr(seq), file=tmpfa)

        # Open tempfile and restore original sequence order
        fa = as_seqdict(tmpfa.name)
        fa = {region: fa[region] for region in regions}
    finally:
        # delete=False means nothing else cleans this file up.
        # NOTE(review): assumes as_seqdict has read all data at this point.
        os.remove(tmpfa.name)

    return _check_minsize(fa, minsize)
def _genomepy_convert(to_convert, genome, minsize=None): """ Convert a variety of inputs using track2fasta(). """ if genome is None: raise ValueError("input file is not a FASTA file, need a genome!") if isinstance(genome, Genome): g = genome else: g = Genome(genome) tmpfile = NamedTemporaryFile() g.track2fasta(to_convert, tmpfile.name) fa = as_seqdict(tmpfile.name) return _check_minsize(fa, minsize)
def as_fasta(seqs, genome=None):
    """Return `seqs` as a Fasta object, converting it when necessary.

    Parameters
    ----------
    seqs : object
        Input sequences. How they are handled depends on the type reported
        by ``get_seqs_type``: ``"fasta"`` is returned unchanged,
        ``"fastafile"`` is loaded with ``Fasta``, and anything else is
        converted with ``Genome.track2fasta``. A numpy array is turned
        into a list before conversion.

    genome : str or Genome, optional
        Genome used for the conversion. A string is turned into a
        ``Genome`` instance. Only required when `seqs` is neither a Fasta
        object nor a FASTA file.

    Returns
    -------
    Fasta
        The input itself, or a new Fasta object.

    Raises
    ------
    ValueError
        If a conversion is needed but no genome was supplied.
    """
    kind = get_seqs_type(seqs)
    if kind == "fasta":
        return seqs
    if kind == "fastafile":
        return Fasta(seqs)

    # Everything else has to be extracted from a genome.
    if genome is None:
        raise ValueError("need genome to convert to FASTA")

    scratch = NamedTemporaryFile()
    genome_obj = Genome(genome) if isinstance(genome, str) else genome
    regions = list(seqs) if isinstance(seqs, np.ndarray) else seqs
    genome_obj.track2fasta(regions, scratch.name)
    return Fasta(scratch.name)