def prepare_denovo_input_bed(inputfile, params, outdir):
    """Prepare a BED file for de novo motif prediction.

    All regions to same size; split in test and validation set;
    converted to FASTA.

    The following files are written to ``outdir``: ``input.bed``,
    ``prediction.bed``, ``validation.bed``, ``prediction.fa``,
    ``validation.fa`` and ``localization.fa``.

    Parameters
    ----------
    inputfile : str
        BED file with input regions.

    params : dict
        Dictionary with parameters. The keys read here are ``"width"``,
        ``"abs_max"``, ``"fraction"``, ``"genome"``, ``"lwidth"`` and
        ``"use_strand"``.

    outdir : str
        Output directory to save files.
    """
    logger.info("preparing input (BED)")

    # Create BED file with regions of equal size
    width = int(params["width"])
    bedfile = os.path.join(outdir, "input.bed")
    write_equalwidth_bedfile(inputfile, width, bedfile)

    abs_max = int(params["abs_max"])
    fraction = float(params["fraction"])
    pred_bedfile = os.path.join(outdir, "prediction.bed")
    val_bedfile = os.path.join(outdir, "validation.bed")

    # Split input into prediction and validation set
    logger.debug(
        "Splitting %s into prediction set (%s) and validation set (%s)",
        bedfile,
        pred_bedfile,
        val_bedfile,
    )
    divide_file(bedfile, pred_bedfile, val_bedfile, fraction, abs_max)

    # NOTE(review): the instance is never used in this function; the call is
    # kept in case constructing it has side effects -- confirm and remove.
    MotifConfig()

    genome = Genome(params["genome"])
    for infile in [pred_bedfile, val_bedfile]:
        # Swap only the trailing extension. str.replace(".bed", ".fa") would
        # rewrite every ".bed" in the path, including one inside outdir.
        fasta_file = os.path.splitext(infile)[0] + ".fa"
        genome.track2fasta(
            infile,
            fasta_file,
        )

    # Create file for location plots: the validation regions extended
    # on both sides by half the difference between lwidth and width.
    lwidth = int(params["lwidth"])
    extend = (lwidth - width) // 2

    genome.track2fasta(
        val_bedfile,
        os.path.join(outdir, "localization.fa"),
        extend_up=extend,
        extend_down=extend,
        stranded=params["use_strand"],
    )
def prepare_input_bed(self, inputfile, organism="hg18", width=200, fraction=0.2, abs_max=1000, use_strand=False):
    """Create the BED and FASTA files used for motif prediction and validation.

    Steps, in order:

    1. ``inputfile`` is stored on the instance and rewritten to
       ``self.input_bed`` by ``write_equalwidth_bedfile`` using ``width``.
    2. ``self.input_bed`` is split into ``self.prediction_bed`` and
       ``self.validation_bed`` by ``divide_file``; the pair it returns is
       stored as ``self.prediction_num`` and ``self.validation_num``.
    3. Both BED files are converted to ``self.prediction_fa`` and
       ``self.validation_fa`` with ``genome_index.track2fasta``, using the
       index directory of ``organism``.

    ``width`` and ``abs_max`` are coerced to ``int``, ``fraction`` to
    ``float`` and ``use_strand`` to ``bool`` before use.
    """
    self.inputfile = inputfile

    # Normalise argument types up front.
    peak_width = int(width)
    split_fraction = float(fraction)
    split_max = int(abs_max)
    stranded = bool(use_strand)

    log = self.logger
    log.info("preparing input (BED)")

    # Step 1: every peak gets the same width.
    log.debug("Creating inputfile %s, width %s", self.input_bed, peak_width)
    write_equalwidth_bedfile(inputfile, peak_width, self.input_bed)

    # Step 2: divide the equal-width regions over both sets.
    log.debug(
        "Splitting %s into prediction set (%s) and validation set (%s)",
        self.input_bed,
        self.prediction_bed,
        self.validation_bed,
    )
    counts = divide_file(
        self.input_bed,
        self.prediction_bed,
        self.validation_bed,
        split_fraction,
        split_max,
    )
    self.prediction_num, self.validation_num = counts

    # Step 3: FASTA files for both sets.
    index_dir = os.path.join(self.config.get_index_dir(), organism)

    def _bed_to_fasta(bed_path, fasta_path):
        # Log the target, then convert one BED file to FASTA.
        log.debug("Creating %s", fasta_path)
        genome_index.track2fasta(
            index_dir,
            bed_path,
            fasta_path,
            use_strand=stranded,
            ignore_missing=True,
        )

    _bed_to_fasta(self.prediction_bed, self.prediction_fa)
    _bed_to_fasta(self.validation_bed, self.validation_fa)
def prepare_input_bed(self, inputfile, organism="hg18", width=200, fraction=0.2, abs_max=1000, use_strand=False):
    """Build every BED and FASTA file needed for motif prediction and validation.

    The input regions are written at a fixed ``width`` to ``self.input_bed``,
    divided into ``self.prediction_bed`` and ``self.validation_bed``, and each
    of those is exported to FASTA (``self.prediction_fa`` and
    ``self.validation_fa``) from the genome index of ``organism``.

    Side effects on the instance: ``self.inputfile`` is set to ``inputfile``,
    and ``self.prediction_num`` / ``self.validation_num`` receive the two
    values returned by ``divide_file``.

    ``fraction`` and ``abs_max`` are passed through to ``divide_file`` after
    conversion to ``float`` and ``int``; ``use_strand`` is converted to
    ``bool`` and passed to ``genome_index.track2fasta``.
    """
    self.inputfile = inputfile

    # Coerce the arguments before anything is logged or written.
    width, fraction, abs_max = int(width), float(fraction), int(abs_max)
    use_strand = bool(use_strand)

    self.logger.info("preparing input (BED)")

    # Equal-width version of the input regions
    self.logger.debug(
        "Creating inputfile %s, width %s", self.input_bed, width
    )
    write_equalwidth_bedfile(inputfile, width, self.input_bed)

    # Prediction / validation split
    self.logger.debug(
        "Splitting %s into prediction set (%s) and validation set (%s)",
        self.input_bed,
        self.prediction_bed,
        self.validation_bed,
    )
    split_counts = divide_file(
        self.input_bed,
        self.prediction_bed,
        self.validation_bed,
        fraction,
        abs_max,
    )
    self.prediction_num, self.validation_num = split_counts

    # FASTA export, prediction set first and validation set second
    index_dir = os.path.join(self.config.get_index_dir(), organism)
    for bed_attr, fasta_attr in (
        ("prediction_bed", "prediction_fa"),
        ("validation_bed", "validation_fa"),
    ):
        fasta_path = getattr(self, fasta_attr)
        self.logger.debug("Creating %s", fasta_path)
        genome_index.track2fasta(
            index_dir,
            getattr(self, bed_attr),
            fasta_path,
            use_strand=use_strand,
            ignore_missing=True,
        )