Exemplo n.º 1
0
def prepare_denovo_input_bed(inputfile, params, outdir):
    """Prepare a BED file for de novo motif prediction.

    All regions are set to the same size, the result is split into a
    prediction set and a validation set, and both sets are converted to
    FASTA. An additional FASTA file with extended validation regions is
    created for the location plots.

    Parameters
    ----------
    inputfile : str
        BED file with input regions.

    params : dict
        Dictionary with parameters. The keys read here are ``width``,
        ``abs_max``, ``fraction``, ``genome``, ``lwidth`` and
        ``use_strand``.

    outdir : str
        Output directory to save files. The paths used are
        ``input.bed``, ``prediction.bed``, ``validation.bed``,
        ``prediction.fa``, ``validation.fa`` and ``localization.fa``.
    """
    logger.info("preparing input (BED)")

    # Create BED file with regions of equal size
    width = int(params["width"])
    bedfile = os.path.join(outdir, "input.bed")
    write_equalwidth_bedfile(inputfile, width, bedfile)

    abs_max = int(params["abs_max"])
    fraction = float(params["fraction"])
    pred_bedfile = os.path.join(outdir, "prediction.bed")
    val_bedfile = os.path.join(outdir, "validation.bed")
    # Split input into prediction and validation set
    logger.debug(
        "Splitting %s into prediction set (%s) and validation set (%s)",
        bedfile, pred_bedfile, val_bedfile)
    divide_file(bedfile, pred_bedfile, val_bedfile, fraction, abs_max)

    # NOTE(review): this instance is never used below; it is kept in case
    # constructing MotifConfig has side effects -- confirm and remove.
    config = MotifConfig()

    genome = Genome(params["genome"])
    for infile in [pred_bedfile, val_bedfile]:
        # Swap only the file extension. A plain str.replace(".bed", ".fa")
        # would also rewrite any ".bed" that occurs in the directory part
        # of the path and send the FASTA file to the wrong location.
        fasta_file = os.path.splitext(infile)[0] + ".fa"
        genome.track2fasta(infile, fasta_file)

    # Create file for location plots
    lwidth = int(params["lwidth"])
    # Total extension needed to go from `width` to `lwidth`, shared
    # between both sides (floor division).
    extend = (lwidth - width) // 2

    genome.track2fasta(
        val_bedfile,
        os.path.join(outdir, "localization.fa"),
        extend_up=extend,
        extend_down=extend,
        stranded=params["use_strand"],
    )
Exemplo n.º 2
0
def prepare_denovo_input_bed(inputfile, params, outdir):
    """Prepare a BED file for de novo motif prediction.

    Regions are brought to one common size, divided into a prediction
    set and a validation set, and each set is converted to FASTA. The
    validation regions are additionally written, extended on both sides,
    to a FASTA file used for the location plots.

    Parameters
    ----------
    inputfile : str
        BED file with input regions.

    params : dict
        Dictionary with parameters. This function reads the keys
        ``width``, ``abs_max``, ``fraction``, ``genome``, ``lwidth`` and
        ``use_strand``.

    outdir : str
        Output directory to save files (``input.bed``,
        ``prediction.bed``/``.fa``, ``validation.bed``/``.fa`` and
        ``localization.fa``).
    """
    logger.info("preparing input (BED)")

    # Create BED file with regions of equal size
    width = int(params["width"])
    bedfile = os.path.join(outdir, "input.bed")
    write_equalwidth_bedfile(inputfile, width, bedfile)

    abs_max = int(params["abs_max"])
    fraction = float(params["fraction"])
    pred_bedfile = os.path.join(outdir, "prediction.bed")
    val_bedfile = os.path.join(outdir, "validation.bed")
    # Split input into prediction and validation set
    logger.debug(
        "Splitting %s into prediction set (%s) and validation set (%s)",
        bedfile, pred_bedfile, val_bedfile)
    divide_file(bedfile, pred_bedfile, val_bedfile, fraction, abs_max)

    # NOTE(review): `config` is not referenced afterwards; retained because
    # MotifConfig() may do work on construction -- verify before deleting.
    config = MotifConfig()

    genome = Genome(params["genome"])
    for infile in [pred_bedfile, val_bedfile]:
        # Derive the FASTA name from the extension only: replacing the
        # substring ".bed" anywhere in the path would corrupt an output
        # directory whose name happens to contain ".bed".
        root, _ext = os.path.splitext(infile)
        genome.track2fasta(infile, root + ".fa")

    # Create file for location plots
    lwidth = int(params["lwidth"])
    # Half of the width difference is added on each side (floor division).
    extend = (lwidth - width) // 2

    genome.track2fasta(
        val_bedfile,
        os.path.join(outdir, "localization.fa"),
        extend_up=extend,
        extend_down=extend,
        stranded=params["use_strand"],
    )
Exemplo n.º 3
0
    def prepare_input_bed(
        self,
        inputfile,
        organism="hg18",
        width=200,
        fraction=0.2,
        abs_max=1000,
        use_strand=False,
    ):
        """Build the BED and FASTA files used for motif prediction and validation.

        The input is written to ``self.input_bed`` with equal-width
        regions, divided over ``self.prediction_bed`` and
        ``self.validation_bed``, and both of those are converted to FASTA
        (``self.prediction_fa`` and ``self.validation_fa``).

        Parameters
        ----------
        inputfile : str
            BED file with input regions; also stored as ``self.inputfile``.
        organism : str
            Name joined onto the configured index directory to locate the
            genome index.
        width : int
            Region width passed to ``write_equalwidth_bedfile``.
        fraction : float
            Passed to ``divide_file`` together with ``abs_max``.
        abs_max : int
            Passed to ``divide_file`` together with ``fraction``.
        use_strand : bool
            Forwarded to ``genome_index.track2fasta`` as ``use_strand``.

        The two values returned by ``divide_file`` are stored in
        ``self.prediction_num`` and ``self.validation_num``.
        """
        self.inputfile = inputfile

        # Normalise argument types (values may arrive as strings).
        width, fraction, abs_max, use_strand = (
            int(width),
            float(fraction),
            int(abs_max),
            bool(use_strand),
        )

        log = self.logger
        log.info("preparing input (BED)")

        # Equal-width copy of the input regions
        log.debug("Creating inputfile %s, width %s", self.input_bed, width)
        write_equalwidth_bedfile(inputfile, width, self.input_bed)

        # Divide the equal-width regions over prediction and validation sets
        log.debug(
            "Splitting %s into prediction set (%s) and validation set (%s)",
            self.input_bed,
            self.prediction_bed,
            self.validation_bed,
        )
        set_sizes = divide_file(
            self.input_bed,
            self.prediction_bed,
            self.validation_bed,
            fraction,
            abs_max,
        )
        self.prediction_num, self.validation_num = set_sizes

        # FASTA conversion of both sets
        index_dir = os.path.join(self.config.get_index_dir(), organism)

        def bed_to_fasta(bed_path, fasta_path):
            # Log the target, then extract the sequences for one BED file.
            log.debug("Creating %s", fasta_path)
            genome_index.track2fasta(
                index_dir,
                bed_path,
                fasta_path,
                use_strand=use_strand,
                ignore_missing=True,
            )

        bed_to_fasta(self.prediction_bed, self.prediction_fa)
        bed_to_fasta(self.validation_bed, self.validation_fa)
Exemplo n.º 4
0
    def prepare_input_bed(
            self, inputfile, organism="hg18", width=200, fraction=0.2,
            abs_max=1000, use_strand=False):
        """Create the BED and FASTA files needed for motif prediction and validation.

        Steps, in order: write an equal-width version of ``inputfile`` to
        ``self.input_bed``; split it into ``self.prediction_bed`` and
        ``self.validation_bed``; convert each of those to FASTA
        (``self.prediction_fa``, ``self.validation_fa``) using the genome
        index found under ``<index dir>/<organism>``.

        ``width``, ``fraction``, ``abs_max`` and ``use_strand`` are coerced
        to ``int``, ``float``, ``int`` and ``bool`` respectively before
        use. The pair returned by ``divide_file`` is stored as
        ``self.prediction_num`` and ``self.validation_num``.
        """
        self.inputfile = inputfile

        # Coerce parameters to the types the helpers are called with.
        width = int(width)
        fraction = float(fraction)
        abs_max = int(abs_max)
        use_strand = bool(use_strand)

        self.logger.info("preparing input (BED)")

        # Give every peak the requested width
        equal_width_bed = self.input_bed
        self.logger.debug("Creating inputfile %s, width %s", equal_width_bed, width)
        write_equalwidth_bedfile(inputfile, width, equal_width_bed)

        # Partition into prediction and validation sets
        self.logger.debug(
            "Splitting %s into prediction set (%s) and validation set (%s)",
            self.input_bed, self.prediction_bed, self.validation_bed,
        )
        n_prediction, n_validation = divide_file(
            self.input_bed, self.prediction_bed, self.validation_bed,
            fraction, abs_max,
        )
        self.prediction_num = n_prediction
        self.validation_num = n_validation

        # Extract sequences for both sets
        index_dir = os.path.join(self.config.get_index_dir(), organism)
        bed_fasta_pairs = (
            (self.prediction_bed, self.prediction_fa),
            (self.validation_bed, self.validation_fa),
        )
        for bed_path, fasta_path in bed_fasta_pairs:
            self.logger.debug("Creating %s", fasta_path)
            genome_index.track2fasta(
                index_dir, bed_path, fasta_path,
                use_strand=use_strand, ignore_missing=True,
            )