Exemplo n.º 1
0
    def _create_background(self, bg_type, bedfile, fafile, outfile, organism="hg18", width=200, nr_times=10):
        """Write a background FASTA file of the requested type to ``outfile``.

        Recognised values of ``bg_type``:

        * ``"random"``   -- ``MarkovFasta`` built from the sequences in
          ``fafile`` with ``k=int(self.markov_model)``.
        * ``"genomic"``  -- ``RandomGenomicFasta`` of ``width``-sized
          sequences from the index directory of ``organism``.
        * ``"gc"``       -- ``MatchedGcFasta`` (GC matched background) for
          ``fafile`` and ``organism``.
        * ``"promoter"`` -- ``PromoterFasta`` using the gene BED file and
          index directory of ``organism``.
        * ``"user"``     -- the file named by
          ``self.params["user_background"]`` is re-written to ``outfile``.

        For every generated type the requested number of sequences is
        ``nr_times * len(Fasta(fafile))``.

        Args:
            bg_type: Background type, one of the strings listed above.
            bedfile: Not used by this implementation; kept for interface
                compatibility.
            fafile: FASTA file with the input (foreground) sequences.
            outfile: Path the background FASTA is written to.
            organism: Name used to locate the index directory / gene file.
            width: Sequence width passed to the genomic and promoter
                generators; also the reference length for the sanity check
                on user-supplied backgrounds.
            nr_times: Multiplier applied to the number of input sequences.

        Returns:
            ``len()`` of the background object that was written, or ``None``
            (after logging an error) when ``bg_type`` is not recognised.

        Raises:
            SystemExit: via ``sys.exit(1)`` when the user-specified
                background file does not exist.
        """
        if bg_type == "user":
            # Handled first: this is the only type that does not need the
            # input FASTA, so we avoid parsing it for nothing.
            bg_file = self.params["user_background"]
            if not os.path.exists(bg_file):
                self.logger.error(
                        "User-specified background file %s does not exist!",
                        bg_file)
                sys.exit(1)

            self.logger.info("Copying user-specified background file %s to %s.",
                    bg_file, outfile)
            fa = Fasta(bg_file)
            median_length = median([len(seq) for seq in fa.seqs])
            # Warn when the median length deviates more than 5% from width.
            if median_length < width * 0.95 or median_length > width * 1.05:
                self.logger.warning(
                        "The user-specified background file %s contains sequences "
                        "with a median length of %s, while GimmeMotifs predicts "
                        "motifs in sequences of length %s. This will influence the "
                        "statistics! It is recommended to use background sequences "
                        "of the same length.",
                        bg_file, median_length, width)
            fa.writefasta(outfile)
            return len(fa)

        if bg_type not in ("random", "genomic", "gc", "promoter"):
            # Previously this fell off the end of the method without a trace.
            self.logger.error("Unknown background type: %s", bg_type)
            return None

        fg = Fasta(fafile)
        nr_sequences = nr_times * len(fg)

        if bg_type == "random":
            order = int(self.markov_model)
            if order >= 6:
                self.logger.warning("Are you sure about the Markov model? It seems too high!")
            else:
                # .get() with a fallback: a log message must never raise
                # KeyError for orders outside 1-5 (e.g. 0).
                ordinals = {1: "1st", 2: "2nd", 3: "3rd", 4: "4th", 5: "5th"}
                self.logger.debug(
                        "Creating random background (%s order Markov)",
                        ordinals.get(order, "%sth" % order))

            m = MarkovFasta(fg, k=order, n=nr_sequences)
            m.writefasta(outfile)
            self.logger.debug("Random background: %s", outfile)
            # return the number of random sequences created
            return len(m)

        if bg_type == "genomic":
            self.logger.debug("Creating genomic background")
            index_dir = os.path.join(self.config.get_index_dir(), organism)
            f = RandomGenomicFasta(index_dir, width, nr_sequences)
            f.writefasta(outfile)
            return len(f)

        if bg_type == "gc":
            self.logger.debug("Creating GC matched background")
            f = MatchedGcFasta(fafile, organism, nr_sequences)
            f.writefasta(outfile)
            self.logger.debug("GC matched background: %s", outfile)
            return len(f)

        # Only "promoter" is left at this point.
        gene_file = os.path.join(self.config.get_gene_dir(), "%s.bed" % organism)
        index_dir = os.path.join(self.config.get_index_dir(), organism)
        self.logger.info(
                "Creating random promoter background (%s, using genes in %s)",
                organism, gene_file)
        f = PromoterFasta(gene_file, index_dir, width, nr_sequences)
        f.writefasta(outfile)
        self.logger.debug("Random promoter background: %s", outfile)
        return len(f)
Exemplo n.º 2
0
	def _create_background(self, bg_type, bedfile, fafile, outfile, organism="hg18", width=200, nr_times=10):
		"""Write a background FASTA file of the requested type to ``outfile``.

		Recognised values of ``bg_type``:

		* ``"random"``          -- ``MarkovFasta`` built from the sequences
		  in ``fafile`` with ``k=int(self.markov_model)``; ``nr_times`` is
		  not used for this type.
		* ``"genomic_matched"`` -- ``MatchedGenomicFasta`` (matched genomic
		  background) built from ``bedfile``, the gene BED file and index
		  directory of ``organism``, ``width`` and ``nr_times``.
		* ``"promoter"``        -- ``PromoterFasta`` using the gene BED file
		  and index directory of ``organism``; requests
		  ``nr_times * len(Fasta(fafile))`` sequences of ``width``.
		* ``"user"``            -- the file named by
		  ``self.params["user_background"]`` is re-written to ``outfile``.

		Args:
			bg_type: Background type, one of the strings listed above.
			bedfile: BED file handed to ``MatchedGenomicFasta``.
			fafile: FASTA file with the input (foreground) sequences.
			outfile: Path the background FASTA is written to.
			organism: Name used to locate the index directory / gene file.
			width: Sequence width passed to the generators; also the
				reference length for the sanity check on user-supplied
				backgrounds.
			nr_times: Multiplier passed on to the background generators.

		Returns:
			``len()`` of the background object that was written, or ``None``
			(after logging an error) when ``bg_type`` is not recognised.

		Raises:
			SystemExit: via ``sys.exit(1)`` when the user-specified
				background file does not exist.
		"""
		if bg_type == "random":
			order = int(self.markov_model)
			if order >= 6:
				self.logger.warning("Are you sure about the Markov model? It seems too high!")
			else:
				# .get() with a fallback: a log message must never raise
				# KeyError for orders outside 1-5 (e.g. 0).
				ordinals = {1: "1st", 2: "2nd", 3: "3rd", 4: "4th", 5: "5th"}
				self.logger.info(
					"Creating random background (%s order Markov)",
					ordinals.get(order, "%sth" % order))

			f = Fasta(fafile)
			m = MarkovFasta(f, k=order)
			m.writefasta(outfile)
			self.logger.debug("Random background: %s", outfile)
			# return the number of random sequences created
			return len(m)
		elif bg_type == "genomic_matched":
			gene_file = os.path.join(self.config.get_gene_dir(), "%s.bed" % organism)
			index_dir = os.path.join(self.config.get_index_dir(), organism)
			self.logger.info(
				"Creating matched genomic background (%s, using genes in %s)",
				organism, gene_file)

			f = MatchedGenomicFasta(bedfile, gene_file, index_dir, width, nr_times)
			f.writefasta(outfile)
			self.logger.debug("Matched genomic background: %s", outfile)
			return len(f)
		elif bg_type == "promoter":
			gene_file = os.path.join(self.config.get_gene_dir(), "%s.bed" % organism)
			index_dir = os.path.join(self.config.get_index_dir(), organism)

			self.logger.info(
				"Creating random promoter background (%s, using genes in %s)",
				organism, gene_file)
			fg = Fasta(fafile)
			f = PromoterFasta(gene_file, index_dir, width, nr_times * len(fg))
			f.writefasta(outfile)
			self.logger.debug("Random promoter background: %s", outfile)
			return len(f)
		elif bg_type == "user":
			bg_file = self.params["user_background"]
			if not os.path.exists(bg_file):
				self.logger.error("User-specified background file %s does not exist!", bg_file)
				sys.exit(1)

			self.logger.info("Copying user-specified background file %s to %s.", bg_file, outfile)
			fa = Fasta(bg_file)
			median_length = median([len(seq) for seq in fa.seqs])
			# Warn when the median length deviates more than 5% from width.
			if median_length < width * 0.95 or median_length > width * 1.05:
				self.logger.warning(
					"The user-specified background file %s contains sequences "
					"with a median length of %s, while GimmeMotifs predicts "
					"motifs in sequences of length %s. This will influence the "
					"statistics! It is recommended to use background sequences "
					"of the same length.",
					bg_file, median_length, width)
			fa.writefasta(outfile)
			return len(fa)
		else:
			# Previously this fell off the end of the method without a trace.
			self.logger.error("Unknown background type: %s", bg_type)
			return None