示例#1
0
def as_fasta(seqs, index_dir=None):
    """Return `seqs` as a Fasta object, converting from BED/regions if needed.

    Parameters
    ----------
    seqs : object
        Input whose kind is determined by ``get_seqs_type``. The kinds
        handled here are "fasta" (returned unchanged), "fastafile",
        "bedfile", "regionfile" and, as the fallback, an iterable of region
        strings.
    index_dir : str, optional
        Passed as first argument to ``track2fasta``. Required for every
        input kind other than "fasta" and "fastafile".

    Returns
    -------
    Fasta
        Either `seqs` itself or a new Fasta instance.

    Raises
    ------
    ValueError
        If a conversion is needed but `index_dir` is None.
    """
    ftype = get_seqs_type(seqs)
    if ftype == "fasta":
        return seqs
    if ftype == "fastafile":
        return Fasta(seqs)

    if index_dir is None:
        raise ValueError("need index_dir / genome to convert to FASTA")

    tmpfa = NamedTemporaryFile()

    if ftype == "bedfile":
        track2fasta(index_dir, seqs, tmpfa.name)
    else:
        if ftype == "regionfile":
            # Close the region file as soon as it has been read.
            with open(seqs) as handle:
                seqs = [line.strip() for line in handle]

        # Text mode: the default binary mode rejects str on Python 3.
        tmpbed = NamedTemporaryFile(mode="w")
        for seq in seqs:
            # Regions are split on ":" and "-" into three tab-separated columns.
            vals = re.split(r'[:-]', seq)
            tmpbed.write("{}\t{}\t{}\n".format(*vals))
        # Make sure the contents are on disk before track2fasta reads them.
        tmpbed.flush()
        track2fasta(index_dir, tmpbed.name, tmpfa.name)
    return Fasta(tmpfa.name)
示例#2
0
def location(args):
    """
    Create a histogram of motif locations for every selected motif.

    Parameters
    ----------
    args : argparse object
        Command line arguments. The attributes read here are
        ``fastafile``, ``pwmfile``, ``width``, ``ids`` and ``cutoff``.
    """
    fastafile, pwmfile = args.fastafile, args.pwmfile

    # Without an explicit width, fall back to the length of the first sequence.
    lwidth = args.width
    if not lwidth:
        lwidth = len(Fasta(fastafile).items()[0][1])

    motifs = pwmfile_to_motifs(pwmfile)

    # Default to all motif ids; a comma-separated --ids value overrides that.
    ids = [motif.id for motif in motifs]
    if args.ids:
        ids = args.ids.split(",")

    jobs = []
    for motif in motifs:
        if motif.id not in ids:
            continue
        outfile = os.path.join("%s_histogram" % motif.id)
        job_args = (fastafile, motif, lwidth, outfile, args.cutoff)
        jobs.append(pool.apply_async(motif_localization, job_args))

    # Block until every submitted job has finished.
    for job in jobs:
        job.get()
示例#3
0
    def __init__(self,
                 outfile,
                 fg_file=None,
                 background=None,
                 do_counter=True,
                 job_server=None):
        """Set up bookkeeping state and, optionally, the data used for stats.

        `outfile` is created (or truncated). Statistics are enabled only when
        both `fg_file` and `background` are given; `background` is then
        expected to map a background name to a FASTA file name.
        """
        self.lock = thread.allocate_lock()
        self.motifs = []
        self.finished = []
        self.stats = {}
        self.stat_jobs = []
        self.outfile = outfile
        # Use the supplied job server, otherwise a private pool of 2 workers.
        self.job_server = job_server if job_server else Pool(2)
        self.counter = 0
        self.do_counter = do_counter

        # Create the output file, or empty it if it already exists.
        open(outfile, "w").close()

        if not (fg_file and background):
            self.do_stats = False
            return

        self.fg_fa = Fasta(fg_file)
        self.background = {
            bg: Fasta(fname) for bg, fname in background.items()
        }
        self.do_stats = True
示例#4
0
	def calculate_enrichment(self, motif_file, fg, bg):
		"""Scan foreground and background FASTA files, then compute enrichment.

		fg: [sample_fa, sample_gff]
		bg: [[bg1_fa, bg1_gff, bg1_enrichment], [bg2_fa, bg2_gff, bg2_enrichment], .. etc]

		Every FASTA file is scanned with the motifs in motif_file using
		self.SCAN_THRESHOLD; results go to the matching gff file. When
		self.parallel is set the scans are submitted to self.job_server(),
		otherwise they run inline. The process exits with status 1 if a
		submitted job reports an error.
		"""

		self.logger.info("Scanning background sequences with motifs")
		scan_cmd = scan_fasta_file_with_motifs
		jobs = []
		if self.parallel:
			jobs.append(self.job_server().submit(scan_cmd, (fg[0], motif_file, self.SCAN_THRESHOLD, fg[1],), (),()))
		else:
			# Fixed attribute name: this branch used to read the misspelled
			# SCAN_THRESHOD and failed with an AttributeError.
			scan_cmd(fg[0], motif_file, self.SCAN_THRESHOLD, fg[1])

		for fasta_file, gff_file in [x[:2] for x in bg]:
			if self.parallel:
				jobs.append(self.job_server().submit(scan_cmd, (fasta_file, motif_file, self.SCAN_THRESHOLD, gff_file,), (),()))
			else:
				scan_cmd(fasta_file, motif_file, self.SCAN_THRESHOLD, gff_file)

		# Calling a job returns its result; a truthy result is treated as an error.
		for job in jobs:
			error = job()
			if error:
				self.logger.error("Error in thread: %s" % error)
				sys.exit(1)

		self.logger.info("Calculating enrichment")
		enrichment_cmd = gff_enrichment
		num_sample = len(Fasta(fg[0]).items())
		for fasta_file, gff_file, out_file in bg:
			num_bg = len(Fasta(fasta_file).items())
			enrichment_cmd(fg[1], gff_file, num_sample, num_bg, out_file)
示例#5
0
def peak2fasta(peak_ids, ref_genome):
    '''
    Convert one or more peak ids into a fasta object.

    Args:
        peak_ids (str or list of str): Peak id, e.g. "chr5_0930303_9499409",
            or a list of peak ids,
            e.g. ["chr5_0930303_9499409", "chr11_123445555_123445577"]

        ref_genome (str): Reference genome name.   e.g. "mm9", "mm10", "hg19" etc

    Returns:
        gimmemotifs fasta object: one entry per peak id, named
            "<name>_<start>_<end>" after the retrieved genomic interval.

    '''
    genome_data = Genome(ref_genome)

    def peak2seq(peak_id):
        # Split the id into chromosome and coordinates, then slice the genome.
        chromosome_name, start, end = decompose_chrstr(peak_id)
        region = genome_data[chromosome_name][int(start):int(end)]
        return f"{region.name}_{region.start}_{region.end}", region.seq

    # Accept a single id (including str subclasses) as a one-element list.
    if isinstance(peak_ids, str):
        peak_ids = [peak_ids]

    fasta = Fasta()
    for peak_id in peak_ids:
        name, seq = peak2seq(peak_id)
        fasta.add(name, seq)

    return fasta
    def __init__(self, fasta, size=None, n=None, k=1, matrix_only=False):
        """Fill this Fasta with sequences generated from a Markov model.

        The transition matrices of order `k` are initialised from
        `fasta.seqs`. Unless `matrix_only` is set, sequences named
        "random_Markov<k>_<counter>" are added until `n` sequences exist
        (default: as many as in `fasta`). Each one has length `size` when
        given, otherwise the length of a randomly chosen input sequence.
        """
        self.k = k

        # Start out as an empty Fasta, then set up the transition matrices.
        Fasta.__init__(self)
        self._initialize_matrices(fasta.seqs, k=k)

        if matrix_only:
            return

        if not n:
            n = len(fasta)

        counter = 0
        while len(self) < n:
            # A template is drawn on every iteration, even when `size` is set.
            template = choice(fasta.seqs)
            target_len = size if size else len(template)
            self.add(
                "random_Markov%s_%s" % (k, counter),
                self._generate_sequence(target_len),
            )
            counter += 1
示例#7
0
def location(args):
    """Submit one motif-localization job per selected motif and wait for all.

    Reads ``fastafile``, ``pwmfile``, ``width``, ``ids`` and ``cutoff`` from
    `args`. If no width is given, the length of the first sequence in the
    FASTA file is used.
    """
    fastafile = args.fastafile
    pwmfile = args.pwmfile

    lwidth = args.width
    if not lwidth:
        first_name, first_seq = Fasta(fastafile).items()[0]
        lwidth = len(first_seq)

    motifs = pwmfile_to_motifs(pwmfile)

    # All motif ids by default; a comma-separated list in args.ids wins.
    ids = [motif.id for motif in motifs]
    if args.ids:
        ids = args.ids.split(",")

    jobs = [
        pool.apply_async(
            motif_localization,
            (
                fastafile,
                motif,
                lwidth,
                os.path.join("%s_histogram" % motif.id),
                args.cutoff,
            ),
        )
        for motif in motifs
        if motif.id in ids
    ]

    # Wait for every job to complete.
    for job in jobs:
        job.get()
示例#8
0
    def _run_program(self, bin, fastafile, savedir, params=None):
        """Run the external tool on a re-headered copy of `fastafile`.

        A copy of the input is written to ``munk_in.fa`` in ``self.tmpdir``
        with every header replaced by a space-separated list of weights that
        rises towards the middle of the sequence. The command is executed
        from ``self.dir()`` and its redirected output, if present, is handed
        to ``self.parse``.

        Parameters
        ----------
        bin : str
            Executable (or command prefix) to run.
        fastafile : str
            Input FASTA file.
        savedir : str
            Not used by this method beyond path normalisation.
        params : dict
            Must contain "width"; it is passed twice on the command line.

        Returns
        -------
        tuple
            (motifs, stdout, stderr); motifs is an empty list when the tool
            produced no output file.
        """
        fastafile = os.path.abspath(fastafile)
        savedir = os.path.abspath(savedir)

        new_file = os.path.join(self.tmpdir, "munk_in.fa")
        with open(new_file, "w") as out:
            for name, seq in Fasta(fastafile).items():
                # Floor division and explicit lists keep this working on
                # both Python 2 and Python 3.
                half = len(seq) // 2
                weights = list(range(half)) + list(range(half, 0, -1))
                header = " ".join(["%0.1f" % x for x in weights])
                out.write(">%s\n" % header)
                out.write("%s\n" % seq)

        fastafile = new_file
        outfile = fastafile + ".out"

        cmd = "%s %s %s yes 1.0 p:%s > %s" % (bin, params["width"], params["width"], fastafile, outfile)

        current_path = os.getcwd()
        os.chdir(self.dir())
        try:
            p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
            stdout, stderr = p.communicate()

            motifs = []
            if os.path.exists(outfile):
                with open(outfile) as handle:
                    motifs = self.parse(handle)
        finally:
            # Always return to the caller's working directory, even on error.
            os.chdir(current_path)

        return motifs, stdout, stderr
示例#9
0
def scan_it_moods(infile,
                  motifs,
                  cutoff,
                  bgfile,
                  nreport=1,
                  scan_rc=True,
                  pvalue=None,
                  count=False):
    """Scan the sequences in `infile` with `motifs` using MOODS.

    Parameters
    ----------
    infile : str
        FASTA file with the sequences to scan.
    motifs : list
        Motif objects; each needs an ``id`` and a ``pwm``.
    cutoff : float
        Used to derive per-motif thresholds when `pvalue` is None.
    bgfile : str
        FASTA file from which the background distribution is estimated.
    nreport : int, optional
        Passed through to the scanning function.
    scan_rc : bool, optional
        Passed through to the scanning function.
    pvalue : float, optional
        If given, thresholds are derived from this p-value instead of
        `cutoff`.
    count : bool, optional
        Use the counting variant of the scanning function.

    Yields
    ------
    object
        The individual results of every chunk, in submission order.
    """
    import shutil

    pseudocount = 1e-3
    # sys.stderr.write("bgfile: {}\n".format(bgfile))
    bg = MOODS.tools.bg_from_sequence_dna("".join(Fasta(bgfile).seqs), 1)

    matrices = []
    tmpdir = mkdtemp()
    try:
        for motif in motifs:
            pfmname = os.path.join(tmpdir, "{}.pfm".format(motif.id))
            with open(pfmname, "w") as f:
                matrix = np.array(motif.pwm).transpose()
                for row in matrix:
                    f.write("{}\n".format(" ".join([str(x) for x in row])))

            matrices.append(
                MOODS.parsers.pfm_log_odds(pfmname, bg, pseudocount))
    finally:
        # The PFM files are only an exchange format for the parser above.
        # NOTE(review): assumes pfm_log_odds reads the file eagerly - confirm.
        shutil.rmtree(tmpdir, ignore_errors=True)

    if pvalue is not None:
        thresholds = [
            MOODS.tools.threshold_from_p(m, bg, float(pvalue))
            for m in matrices
        ]
        # sys.stderr.write("{}\n".format(thresholds))
    else:
        thresholds = [calc_threshold_moods(m, float(cutoff)) for m in matrices]

    scanner = MOODS.scan.Scanner(7)
    scanner.set_motifs(matrices, bg, thresholds)

    config = MotifConfig()
    ncpus = int(config.get_default_params()["ncpus"])
    fa = Fasta(infile)
    chunk = 500
    if (len(fa) // chunk) < ncpus:
        # Integer chunk size of at least 1: a float or zero step would make
        # range() below raise.
        chunk = max(1, len(fa) // (ncpus + 1))

    func = scan_fa_with_motif_moods
    if count:
        func = scan_fa_with_motif_moods_count

    pool = mp.Pool()
    jobs = []
    for i in range(0, len(fa), chunk):
        jobs.append(
            pool.apply_async(
                func,
                (fa[i:i + chunk], motifs, matrices, bg, thresholds, nreport,
                 scan_rc),
            ))
    # No further tasks will be submitted; lets the workers exit when done.
    pool.close()

    for job in jobs:
        for ret in job.get():
            yield ret

    pool.join()
示例#10
0
    def __init__(self, fasta, length=None, number=None, k=1, matrix_only=False):
        """Fill this Fasta with sequences generated from a Markov model.

        Transition matrices of order `k` are initialised from `fasta.seqs`.
        Unless `matrix_only` is set, sequences named
        "random_Markov<k>_<counter>" are added until `number` sequences
        exist (default: as many as in `fasta`). Each has length `length`
        when given, otherwise the length of a randomly drawn input sequence.
        """
        self.k = k

        # Behave like an empty Fasta before anything is added.
        Fasta.__init__(self)
        self._initialize_matrices(fasta.seqs, k=k)

        if matrix_only:
            return

        if not number:
            number = len(fasta)

        counter = 0
        while len(self) < number:
            # A template is drawn every round, even if `length` is fixed.
            template = choice(fasta.seqs)
            seq_name = "random_Markov%s_%s" % (k,counter)
            wanted = length if length else len(template)
            self.add(seq_name, self._generate_sequence(wanted))
            counter += 1
示例#11
0
 def setUp(self):
     """Load the TATA motif and set up the fixture paths used by the tests."""
     data_dir = "test/data/pwmscan"
     self.data_dir = data_dir

     # First motif found in the PWM file.
     pwm_path = os.path.join(data_dir, "TATA.pwm")
     self.motif = read_motifs(open(pwm_path), fmt="pwm")[0]

     # Promoter sequences and the matching reference GFF.
     self.prom = Fasta(os.path.join(data_dir, "promoters.fa"))
     self.prom_gff = os.path.join(data_dir, "promoters_result.gff")

     # Random sequences and the matching GFF path.
     self.random = Fasta(os.path.join(data_dir, "random_sequences.fa"))
     self.random_gff = os.path.join(data_dir, "random_result.gff")

     self.enrichment = os.path.join(data_dir, "enrichment.txt")
     # Only the name of the temporary file is kept.
     self.tmp = NamedTemporaryFile().name
示例#12
0
def remove_zero_seq(fasta_object):
    """
    Return a new Fasta holding only the non-empty sequences.

    Names are taken from ``fasta_object.ids`` at the position of each
    sequence in ``fasta_object.seqs``; the input object is not modified here.
    """
    kept = Fasta()
    for position, sequence in enumerate(fasta_object.seqs):
        if not sequence:
            # Skip zero-length entries.
            continue
        kept.add(fasta_object.ids[position], sequence)
    return kept
示例#13
0
def download_genome(genomebuild, genome_dir):
    """Download a genome, unpack it and split it into one file per sequence.

    Every URL template in ``UCSC_GENOME_URLS`` is tried in turn until a
    download passes ``check_genome_file``. The archive is unpacked inside
    `genome_dir`; if that leaves exactly one ``.fa`` file, it is split into
    one ``<name>.fa`` file per sequence and then removed.

    Parameters
    ----------
    genomebuild : str
        Value substituted into each URL template.
    genome_dir : str
        Directory that receives the download and the unpacked files.

    Notes
    -----
    Exits the process with status 1 when no URL yields a valid genome file.
    """
    # download genome based on URL + genomebuild
    sys.stderr.write("Downloading {} genome\n".format(genomebuild))

    genome_fa = None
    for genome_url in UCSC_GENOME_URLS:
        remote = genome_url.format(genomebuild)
        candidate = os.path.join(genome_dir, os.path.split(remote)[-1])

        sys.stderr.write("Trying to download {}\n".format(remote))

        try:
            urlretrieve(remote, candidate)
            if check_genome_file(candidate):
                genome_fa = candidate
                break
            # Not a usable genome file: discard it and try the next URL.
            os.unlink(candidate)
        except Exception as err:
            # Best effort: report the failure and move on to the next URL.
            sys.stderr.write(
                "Could not use {}: {}\n".format(remote, err))

    if genome_fa is None:
        sys.stderr.write("Failed to download genome\n")
        sys.exit(1)

    sys.stderr.write("Unpacking\n")
    genome_fa = os.path.basename(genome_fa)
    # NOTE(review): the commands run with cwd=genome_dir, so "tar -C" with a
    # relative genome_dir would resolve relative to genome_dir - confirm that
    # callers always pass an absolute path.
    if genome_fa.endswith("tar.gz"):
        cmd = "tar -C {0} -xvzf {1} && rm {1}".format(genome_dir, genome_fa)
    elif genome_fa.endswith(".zip"):
        cmd = "unzip {0}".format(genome_fa)
    else:
        cmd = "gunzip {0}".format(genome_fa)

    sp.call(cmd, shell=True, cwd=genome_dir)

    fa_files = glob("{}/*.fa".format(genome_dir))
    if len(fa_files) == 1:
        # A single multi-FASTA file: write every sequence to its own file.
        for name, seq in Fasta(fa_files[0]).items():
            with open("{}/{}.fa".format(genome_dir, name), "w") as out:
                out.write(">{}\n{}\n".format(name, seq))

        os.unlink(fa_files[0])

    # Remove the downloaded archive if the unpack step left it behind.
    genome_fa = os.path.join(genome_dir, genome_fa)
    if os.path.exists(genome_fa):
        os.unlink(genome_fa)
示例#14
0
    def _create_background(self, bg_type, bedfile, fafile, outfile, organism="hg18", width=200, nr_times=10):
        """Write a background FASTA file of the requested type to `outfile`.

        Parameters
        ----------
        bg_type : str
            One of "random", "genomic", "gc", "promoter" or "user".
        bedfile : str
            Not used by any of the branches below.
        fafile : str
            Foreground FASTA file; its sequence count scales the background.
        outfile : str
            Destination FASTA file.
        organism : str, optional
            Selects the index directory / gene file.
        width : int, optional
            Sequence width for the genomic and promoter backgrounds, and the
            reference length for the user-background sanity check.
        nr_times : int, optional
            The generated backgrounds request `nr_times` times the number of
            foreground sequences.

        Returns
        -------
        int or None
            Number of background sequences written. Any other `bg_type`
            falls through and returns None.
        """
        fg = Fasta(fafile)
        if bg_type == "random":
            if int(self.markov_model) >= 6:
                self.logger.warning("Are you sure about the Markov model? It seems too high!")
            else:
                order = {"1":"1st","2":"2nd", "3":"3rd", "4":"4th", "5":"5th"}[str(self.markov_model)]
                self.logger.debug("Creating random background (%s order Markov)" % order)

            m = MarkovFasta(fg, k=int(self.markov_model), n=nr_times * len(fg))
            m.writefasta(outfile)
            self.logger.debug("Random background: %s", outfile)
            # return the number of random sequences created
            return len(m)
        elif bg_type == "genomic":
            self.logger.debug("Creating genomic background")
            index_dir = os.path.join(self.config.get_index_dir(), organism)
            f = RandomGenomicFasta(index_dir, width, nr_times * len(fg))
            f.writefasta(outfile)
            return len(f)
        elif bg_type == "gc":
            self.logger.debug("Creating GC matched background")

            f = MatchedGcFasta(fafile, organism, nr_times * len(fg))
            f.writefasta(outfile)
            self.logger.debug("GC matched background: %s", outfile)
            return len(f)
        elif bg_type == "promoter":
            gene_file = os.path.join(self.config.get_gene_dir(), "%s.bed" % organism)
            index_dir = os.path.join(self.config.get_index_dir(), organism)

            self.logger.info(
                    "Creating random promoter background (%s, using genes in %s)", 
                    organism, gene_file)
            f = PromoterFasta(gene_file, index_dir, width, nr_times * len(fg))
            f.writefasta(outfile)
            self.logger.debug("Random promoter background: %s", outfile)
            return len(f)
        elif bg_type == "user":
            bg_file = self.params["user_background"]
            if not os.path.exists(bg_file):
                self.logger.error(
                        "User-specified background file %s does not exist!", 
                        bg_file)
                sys.exit(1)
            else:
                self.logger.info("Copying user-specified background file %s to %s.",
                        bg_file, outfile)
                fa = Fasta(bg_file)
                median_len = median([len(seq) for seq in fa.seqs])
                # Warn when the median length deviates more than 5% from width.
                if median_len < width * 0.95 or median_len > width * 1.05:
                    self.logger.warning("The user-specified background file %s contains sequences with a median length of %s, while GimmeMotifs predicts motifs in sequences of length %s. This will influence the statistics! It is recommended to use background sequences of the same length.", bg_file, median_len, width)
                fa.writefasta(outfile)
                return len(fa)
示例#15
0
def get_roc_values(motif, fg_file, bg_file):
    """Compute ROC values for `motif` from a foreground and background file.

    Both FASTA files are scanned with ``motif.pwm_scan_score`` (cutoff 0.0,
    one reported score per sequence) and the highest score per sequence is
    passed to ``ROC_values``.

    Returns
    -------
    tuple
        ``(None, x, y)`` on success, or ``(error, [], [])`` when any
        exception was raised, with `error` being that exception.
    """
    try:
        fg_result = motif.pwm_scan_score(Fasta(fg_file), cutoff=0.0, nreport=1)
        fg_vals = [sorted(x)[-1] for x in fg_result.values()]

        bg_result = motif.pwm_scan_score(Fasta(bg_file), cutoff=0.0, nreport=1)
        bg_vals = [sorted(x)[-1] for x in bg_result.values()]

        (x, y) = ROC_values(fg_vals, bg_vals)
        return None, x, y
    except Exception as e:
        # Errors are returned rather than raised so the caller (which may
        # run this in a worker) can inspect them.
        return e, [], []
示例#16
0
def as_fasta(seqs, genome=None):
    """Return `seqs` as a Fasta object, converting via a genome if needed.

    Parameters
    ----------
    seqs : object
        Input whose kind is determined by ``get_seqs_type``. A "fasta"
        input is returned unchanged and a "fastafile" is loaded; anything
        else is handed to ``genome.track2fasta``.
    genome : str or Genome, optional
        Genome name or Genome instance; a string is turned into a Genome.

    Returns
    -------
    Fasta

    Raises
    ------
    ValueError
        If a conversion is needed but `genome` is None.
    """
    ftype = get_seqs_type(seqs)
    if ftype == "fasta":
        return seqs
    if ftype == "fastafile":
        return Fasta(seqs)

    if genome is None:
        raise ValueError("need genome to convert to FASTA")

    tmpfa = NamedTemporaryFile()
    # isinstance also accepts str subclasses, unlike an exact type match.
    if isinstance(genome, str):
        genome = Genome(genome)
    genome.track2fasta(seqs, tmpfa.name)
    return Fasta(tmpfa.name)
示例#17
0
class TestMotifPwm(unittest.TestCase):
    """ A test class to test Motif pwmscan functionality and related things """
    def setUp(self):
        """Load the TATA motif and the FASTA / GFF fixtures."""
        self.data_dir = "test/data/pwmscan"

        self.motif = pwmfile_to_motifs(os.path.join(self.data_dir,
                                                    "TATA.pwm"))[0]
        self.prom = Fasta(os.path.join(self.data_dir, "promoters.fa"))
        self.prom_gff = os.path.join(self.data_dir, "promoters_result.gff")
        self.random = Fasta(os.path.join(self.data_dir, "random_sequences.fa"))
        self.random_gff = os.path.join(self.data_dir, "random_result.gff")
        self.enrichment = os.path.join(self.data_dir, "enrichment.txt")
        # Only the name of the temporary file is kept as the output path.
        self.tmp = NamedTemporaryFile().name

    def test1_pwm_scan(self):
        """ Scan a FASTA file with PWM of motif """
        result = self.motif.pwm_scan(self.prom, nreport=1)

        # Every sequence should have a TATA match
        self.assertEqual(len(result.keys()), len(self.prom.items()))

    def test2_pwm_scan_to_gff(self):
        """ Scan a FASTA file with PWM of motif, and produce GFF """

        self.motif.pwm_scan_to_gff(self.prom, self.tmp)
        with open(self.prom_gff) as expected, open(self.tmp) as observed:
            self.assertEqual(expected.read(), observed.read())

    def test3_gff_enrichment(self):
        """ Test gff_enrichment """
        self.motif.pwm_scan_to_gff(self.random, self.random_gff)
        gff_enrichment(self.prom_gff, self.random_gff, 316, 3160, self.tmp)
        with open(self.enrichment) as expected, open(self.tmp) as observed:
            self.assertEqual(expected.read(), observed.read())

    def tearDown(self):
        pass
示例#18
0
def check_denovo_input(inputfile, params):
    """Determine the input type and filter the requested backgrounds.

    Parameters
    ----------
    inputfile : str
        Input file; treated as FASTA if ``Fasta`` can load it, as BED
        otherwise.
    params : dict
        Must contain "genome" and "background" (an iterable of background
        type names).

    Returns
    -------
    tuple
        ``(input_type, background)`` where `input_type` is "FASTA" or "BED"
        and `background` is the list of requested types valid for it.

    Notes
    -----
    Exits the process with status 1 when no valid background remains.
    """
    genome = params["genome"]
    background = params["background"]

    input_type = "BED"
    # If we can load it as fasta then it is a fasta, yeh?
    try:
        Fasta(inputfile)
        logger.debug("Inputfile is a FASTA file")
        input_type = "FASTA"
    except Exception:
        # Leave it to BED
        pass

    if input_type == "FASTA":
        valid_bg = FA_VALID_BGS
    elif input_type == "BED":
        valid_bg = BED_VALID_BGS
        if "genomic" in background:
            # NOTE(review): result is discarded; presumably this validates
            # that the genome can be loaded - confirm.
            Genome(genome)
        # is it a valid bed-file etc.
        check_bed_file(inputfile)    # bed-specific

    # Report every ignored background, then filter once.
    for bg in background:
        if bg not in valid_bg:
            logger.info("Input type is %s, ignoring background type '%s'", 
                            input_type, bg)
    background = [bg for bg in background if bg in valid_bg]

    if len(background) == 0:
        logger.error("No valid backgrounds specified!")
        sys.exit(1)

    return input_type, background
示例#19
0
class TestMotifPwm(unittest.TestCase):
	""" A test class to test Motif pwmscan functionality and related things """

	def setUp(self):
		"""Load the TATA motif and the FASTA / GFF fixtures."""
		self.data_dir = "test/data/pwmscan"
		
		self.motif = pwmfile_to_motifs(os.path.join(self.data_dir, "TATA.pwm"))[0]
		self.prom = Fasta(os.path.join(self.data_dir, "promoters.fa"))
		self.prom_gff = os.path.join(self.data_dir, "promoters_result.gff")
		self.random = Fasta(os.path.join(self.data_dir, "random_sequences.fa"))
		self.random_gff = os.path.join(self.data_dir, "random_result.gff")
		self.enrichment = os.path.join(self.data_dir, "enrichment.txt")
		# Only the name of the temporary file is kept as the output path.
		self.tmp = NamedTemporaryFile().name
	
	def test1_pwm_scan(self):
		""" Scan a FASTA file with PWM of motif """
		result = self.motif.pwm_scan(self.prom, nreport=1)

		# Every sequence should have a TATA match
		self.assertEqual(len(result.keys()), len(self.prom.items()))

	def test2_pwm_scan_to_gff(self):
		""" Scan a FASTA file with PWM of motif, and produce GFF """
		
		self.motif.pwm_scan_to_gff(self.prom, self.tmp)
		with open(self.prom_gff) as expected, open(self.tmp) as observed:
			self.assertEqual(expected.read(), observed.read())

	def test3_gff_enrichment(self):
		""" Test gff_enrichment """
		self.motif.pwm_scan_to_gff(self.random, self.random_gff)
		gff_enrichment(self.prom_gff, self.random_gff, 316, 3160, self.tmp)
		with open(self.enrichment) as expected, open(self.tmp) as observed:
			self.assertEqual(expected.read(), observed.read())

	def tearDown(self):
		pass
示例#20
0
def prepare_denovo_input_fa(inputfile, params, outdir):
    """Create all the FASTA files for de novo motif prediction and validation.

    Writes ``prediction.fa``, ``validation.fa`` and ``localization.fa`` (a
    copy of the validation set) into `outdir`, and warns when the sequences
    used for localization differ in length.

    Parameters
    ----------
    inputfile : str
        FASTA file to split.
    params : dict
        Must contain "fraction" (converted to float) and "abs_max"
        (converted to int); both are passed to ``divide_fa_file``.
    outdir : str
        Directory that receives the three output files.
    """
    fraction = float(params["fraction"])
    abs_max = int(params["abs_max"])

    logger.info("preparing input (FASTA)")

    pred_fa = os.path.join(outdir, "prediction.fa")
    val_fa = os.path.join(outdir, "validation.fa")
    loc_fa = os.path.join(outdir, "localization.fa")

    # Split inputfile in prediction and validation set
    logger.debug(
        "Splitting %s into prediction set (%s) and validation set (%s)",
        inputfile,
        pred_fa,
        val_fa,
    )

    divide_fa_file(inputfile, pred_fa, val_fa, fraction, abs_max)

    # File for location plots
    shutil.copy(val_fa, loc_fa)
    seqs = Fasta(loc_fa).seqs
    # More than one distinct length means mixed sizes; an empty set of
    # sequences yields no lengths and therefore no warning (and no crash).
    distinct_sizes = {len(seq) for seq in seqs}
    if len(distinct_sizes) > 1:
        logger.warning(
            "PLEASE NOTE: FASTA file contains sequences of different sizes. "
            "Positional preference plots might be incorrect!")
示例#21
0
 def test_track2fasta_exons(self):
     """ track2fasta should convert bed12 to fasta"""
     from gimmemotifs.fasta import Fasta

     bed_path = os.path.join(self.fasta_dir, "genes.bed")
     expected_path = os.path.join(self.fasta_dir, "genes.out")

     # Build the index, then convert the BED file using strand information.
     self.g.create_index(self.fasta_dir, self.index_dir)
     track2fasta(self.index_dir, bed_path, self.temp_file, use_strand=True)

     expected = Fasta(expected_path)
     observed = Fasta(self.temp_file)
     for seq_id in observed.ids:
         # The expected file is keyed by the last space-separated token.
         short_name = seq_id.split(" ")[-1]
         self.assertEqual(len(observed[seq_id]), len(expected[short_name]))
         self.assertEqual(observed[seq_id].upper(), expected[short_name].upper())
示例#22
0
    def test1_scan_sequences(self):
        """ Scanner """
        # (threshold, nreport, scan_rc, expected number of matches per sequence)
        cases = [
            (0.0, 1, False, [1, 1, 1]),
            (0.99, 1, False, [0, 1, 1]),
            (0.99, 10, False, [0, 1, 2]),
            (0.99, 10, True, [0, 2, 4]),
        ]

        for ncpus in [1, 2, 3]:
            scanner = Scanner(ncpus=ncpus)
            scanner.set_motifs(self.motifs)

            fa = Fasta(self.fa)

            for threshold, nreport, scan_rc, expected in cases:
                # The threshold is (re)set before every scan.
                scanner.set_threshold(threshold=threshold)
                hits = scanner._scan_sequences(fa.seqs, nreport, scan_rc)
                nmatches = [len(m[0]) for m in hits]
                self.assertEqual(expected, nmatches)
示例#23
0
    def __init__(self, matchfile, genome="hg19", number=None, size=None):
        """Load GC%-matched genomic sequences into this Fasta object.

        A BED file is produced by ``matched_gc_bedfile`` from `matchfile`,
        `genome`, `number` and `size`, converted to FASTA through
        ``Genome(genome).track2fasta`` and loaded with ``Fasta.__init__``.
        Both intermediate files are removed afterwards, also when one of
        the steps raises.
        """
        # Only the generated names are kept; the files are (re)created by
        # the calls below.
        tmpbed = NamedTemporaryFile(dir=mytmpdir()).name
        tmpfasta = NamedTemporaryFile(dir=mytmpdir()).name

        try:
            # Create bed-file with coordinates of random sequences
            matched_gc_bedfile(tmpbed, matchfile, genome, number, size=size)

            # Convert track to fasta
            Genome(genome).track2fasta(tmpbed, fastafile=tmpfasta)

            # Initialize super Fasta object
            Fasta.__init__(self, tmpfasta)
        finally:
            # Delete the temporary files, whatever happened above.
            for path in (tmpbed, tmpfasta):
                if os.path.exists(path):
                    os.remove(path)
示例#24
0
def get_roc_values(motif, fg_file, bg_file):
	"""Compute ROC values for motif from a foreground and background file.

	Both FASTA files are scanned with motif.pwm_scan_score (cutoff 0.0, one
	score per sequence) and the highest score per sequence is passed to
	ROC_values. Any exception is caught and stored in the local `error`.

	NOTE(review): no return statement is visible for this function, so as
	written it always returns None - confirm whether
	`return error, x, y` was intended.
	"""
	error = None
	x = []
	y = []
	try:
		# Imported here rather than at module level; presumably so the
		# function is self-contained when shipped to a worker - confirm.
		from gimmemotifs.fasta import Fasta
		from gimmemotifs.rocmetrics import ROC_values,ROC_AUC,MNCP,max_fmeasure
	
		fg_result = motif.pwm_scan_score(Fasta(fg_file), cutoff=0.0, nreport=1)
		fg_vals = [sorted(x)[-1] for x in fg_result.values()]
	
		bg_result = motif.pwm_scan_score(Fasta(bg_file), cutoff=0.0, nreport=1)
		bg_vals = [sorted(x)[-1] for x in bg_result.values()]
	
		(x, y) = ROC_values(fg_vals, bg_vals)
	except Exception as e:
		error = e
示例#25
0
def as_fasta(seqs, genome=None):
    """Return `seqs` as a Fasta object, converting via a genome if needed.

    A "fasta" input (as classified by ``get_seqs_type``) is returned
    unchanged and a "fastafile" is loaded directly. Every other kind is
    written to a temporary FASTA file by ``track2fasta`` of `genome`, which
    may be given as a name or as a Genome instance.

    Raises ValueError when a conversion is needed but `genome` is None.
    """
    kind = get_seqs_type(seqs)
    if kind == "fasta":
        return seqs
    if kind == "fastafile":
        return Fasta(seqs)
    if genome is None:
        raise ValueError("need genome to convert to FASTA")

    tmpfa = NamedTemporaryFile()
    genome_obj = Genome(genome) if isinstance(genome, str) else genome
    # numpy arrays are handed over as plain lists.
    regions = list(seqs) if isinstance(seqs, np.ndarray) else seqs
    genome_obj.track2fasta(regions, tmpfa.name)
    return Fasta(tmpfa.name)
示例#26
0
	def __init__(self, fasta, length=None, multiply=10):
		"""Add `multiply` generated sequences for every sequence in `fasta`.

		The transition matrices are initialised from fasta.seqs. Each
		generated sequence is named "random_1st_order_<counter>" and has
		length `length` when given, otherwise the length of the input
		sequence it was generated for.
		"""
		# Start as an empty Fasta, then set up the transition matrices.
		Fasta.__init__(self)
		self._initialize_matrices(fasta.seqs)

		counter = 0
		for template in fasta.seqs:
			for _ in range(multiply):
				wanted = length if length else len(template)
				self.add("random_1st_order_%s" % (counter), self._generate_sequence(wanted))
				counter += 1
示例#27
0
    def __init__(self, index="/usr/share/gimmemotifs/genome_index/hg18", length=None, n=None):
        """Load random genomic sequences into this Fasta object.

        A BED file is produced by ``create_random_genomic_bedfile`` from
        `index`, `length` and `n`, converted to FASTA with ``track2fasta``
        (strand-aware) and loaded with ``Fasta.__init__``. Both intermediate
        files are removed afterwards, also when one of the steps raises.

        `length` is converted with ``int()``, so it must be supplied even
        though its default is None.
        """
        length = int(length)

        # Only the generated names are kept; the files are (re)created by
        # the calls below.
        tmpbed = NamedTemporaryFile(dir=mytmpdir()).name
        tmpfasta = NamedTemporaryFile(dir=mytmpdir()).name

        try:
            # Create bed-file with coordinates of random sequences
            create_random_genomic_bedfile(tmpbed, index, length, n)

            # Convert track to fasta
            track2fasta(index, tmpbed, tmpfasta, use_strand=True)

            # Initialize super Fasta object
            Fasta.__init__(self, tmpfasta)
        finally:
            # Delete the temporary files, whatever happened above.
            for path in (tmpbed, tmpfasta):
                if os.path.exists(path):
                    os.remove(path)
示例#28
0
    def __init__(
        self,
        outfile,
        genome=None,
        fg_file=None,
        background=None,
        gc=False,
        do_counter=True,
        job_server=None,
    ):
        """Set up bookkeeping state and, optionally, the data used for stats.

        `outfile` is created (or truncated). Statistics are enabled only when
        both `fg_file` and `background` are given; `background` is then
        expected to map a background name to a FASTA file name. With `gc`
        set, a `genome` is mandatory and a ValueError is raised without one.
        """
        self.lock = thread.allocate_lock()
        self.motifs = []
        self.finished = []
        self.stats = {}
        self.stat_jobs = []
        self.outfile = outfile
        self.genome = genome
        # Use the supplied job server, otherwise a private pool of 2 workers.
        self.job_server = job_server if job_server else Pool(2)
        self.counter = 0
        self.do_counter = do_counter

        # Create the output file, or empty it if it already exists.
        open(outfile, "w").close()

        if not (fg_file and background):
            self.do_stats = False
            return

        self.fg_fa = Fasta(fg_file)
        self.background = {
            bg: Fasta(fname) for bg, fname in background.items()
        }
        self.do_stats = True
        self.gc = gc
        self.zscore = self.gc
        if not self.gc:
            return

        if genome is None:
            raise ValueError(
                "Need a genome when calculating GC% zscores for motif statistics"
            )
        self.genome = genome
示例#29
0
	def __init__(self, bedfile, genefile, index="/usr/share/gimmemotifs/genome_index/hg18", length=None, multiply=10, match_chromosome=True):
		"""Load sequences for the coordinates written by self._create_bedfile.

		The coordinates are derived from bedfile, genefile, length and
		multiply, converted to FASTA with track2fasta using `index`, and
		loaded through Fasta.__init__. The two intermediate files are
		removed at the end.
		"""
		self.match_chromosome = match_chromosome

		# Scratch paths; only the generated names are kept.
		bed_path = NamedTemporaryFile().name
		fasta_path = NamedTemporaryFile().name

		# Coordinates -> BED -> FASTA -> this object.
		self._create_bedfile(bed_path, bedfile, genefile, length, multiply)
		track2fasta(index, bed_path, fasta_path)
		Fasta.__init__(self, fasta_path)

		# Clean up the scratch files.
		for scratch in (bed_path, fasta_path):
			os.remove(scratch)
示例#30
0
	def __init__(self, genefile, index="/usr/share/gimmemotifs/genome_index/hg18", length=None, n=None):
		"""Load promoter sequences into this Fasta object.

		self._create_promoter_bedfile writes coordinates based on genefile,
		length and n; these are converted to FASTA with track2fasta
		(strand-aware) using `index` and loaded through Fasta.__init__.
		The two intermediate files are removed at the end. `length` is
		converted with int(), so it must be supplied.
		"""
		length = int(length)

		# Scratch paths; only the generated names are kept.
		bed_path = NamedTemporaryFile().name
		fasta_path = NamedTemporaryFile().name

		# Coordinates -> BED -> FASTA -> this object.
		self._create_promoter_bedfile(bed_path, genefile, length, n)
		track2fasta(index, bed_path, fasta_path, use_strand=True)
		Fasta.__init__(self, fasta_path)

		# Clean up the scratch files.
		for scratch in (bed_path, fasta_path):
			os.remove(scratch)
示例#31
0
    def __init__(self, genome, size=None, n=None):
        """Load random genomic sequences into this Fasta object.

        A BED file is produced by ``create_random_genomic_bedfile`` from
        `genome`, `size` and `n`, converted to FASTA through
        ``Genome(genome).track2fasta`` (stranded) and loaded with
        ``Fasta.__init__``. Both intermediate files are removed afterwards,
        also when one of the steps raises.

        `size` is converted with ``int()``, so it must be supplied even
        though its default is None.
        """
        size = int(size)

        # Only the generated names are kept; the files are (re)created by
        # the calls below.
        tmpbed = NamedTemporaryFile(dir=mytmpdir()).name
        tmpfasta = NamedTemporaryFile(dir=mytmpdir()).name

        try:
            # Create bed-file with coordinates of random sequences
            create_random_genomic_bedfile(tmpbed, genome, size, n)

            # Convert track to fasta
            Genome(genome).track2fasta(tmpbed, fastafile=tmpfasta, stranded=True)

            # Initialize super Fasta object
            Fasta.__init__(self, tmpfasta)
        finally:
            # Delete the temporary files, whatever happened above.
            for path in (tmpbed, tmpfasta):
                if os.path.exists(path):
                    os.remove(path)
示例#32
0
 def setUp(self):
     """Load the TATA motif and set up the fixture paths used by the tests."""
     data_dir = "test/data/pwmscan"
     self.data_dir = data_dir

     # First motif found in the PWM file.
     self.motif = pwmfile_to_motifs(os.path.join(data_dir, "TATA.pwm"))[0]

     # Promoter sequences and the matching reference GFF.
     self.prom = Fasta(os.path.join(data_dir, "promoters.fa"))
     self.prom_gff = os.path.join(data_dir, "promoters_result.gff")

     # Random sequences and the matching GFF path.
     self.random = Fasta(os.path.join(data_dir, "random_sequences.fa"))
     self.random_gff = os.path.join(data_dir, "random_result.gff")

     self.enrichment = os.path.join(data_dir, "enrichment.txt")
     # Only the name of the temporary file is kept.
     self.tmp = NamedTemporaryFile().name
示例#33
0
	def __init__(self, bedfile, genefile, index="/usr/share/gimmemotifs/genome_index/hg18", length=None, multiply=10, match_chromosome=True):
		"""Load sequences for the coordinates written by self._create_bedfile.

		The coordinates are derived from bedfile, genefile, length and
		multiply, converted to FASTA with track2fasta using `index`, and
		loaded through Fasta.__init__. The two intermediate files are
		removed at the end. `length` is converted with int(), so it must
		be supplied.
		"""
		self.match_chromosome = match_chromosome
		length = int(length)

		# Scratch paths; only the generated names are kept.
		bed_path = NamedTemporaryFile().name
		fasta_path = NamedTemporaryFile().name

		# Coordinates -> BED -> FASTA -> this object.
		self._create_bedfile(bed_path, bedfile, genefile, length, multiply)
		track2fasta(index, bed_path, fasta_path)
		Fasta.__init__(self, fasta_path)

		# Clean up the scratch files.
		for scratch in (bed_path, fasta_path):
			os.remove(scratch)
示例#34
0
def get_scores(motif, fg_file, bg_file):
    """Compute ROC AUC, MNCP and maximum f-measure for `motif`.

    Both FASTA files are scanned with ``motif.pwm_scan_score`` (cutoff 0.0,
    one score per sequence); the highest score per sequence feeds
    ``ROC_values``, ``ROC_AUC``, ``MNCP`` and ``max_fmeasure``. Any
    exception is caught and stored in the local `error`.

    NOTE(review): no return statement is visible for this function, so as
    written it always returns None - confirm which of
    error / auc / mncp / max_f / y were meant to be returned.
    """
    error = None
    auc = None
    mncp = None
    max_f = None
    y = None
    try:
        fg_result = motif.pwm_scan_score(Fasta(fg_file), cutoff=0.0, nreport=1)
        fg_vals = [sorted(x)[-1] for x in fg_result.values()]

        bg_result = motif.pwm_scan_score(Fasta(bg_file), cutoff=0.0, nreport=1)
        bg_vals = [sorted(x)[-1] for x in bg_result.values()]

        (x, y) = ROC_values(fg_vals, bg_vals)
        auc = ROC_AUC(fg_vals, bg_vals)
        mncp = MNCP(fg_vals, bg_vals)
        # `y` is rebound to the second value returned by max_fmeasure.
        max_f, y = max_fmeasure(x, y)

    except Exception as e:
        error = e
示例#35
0
def scan_fasta_file_with_motifs(fastafile, motiffile, threshold, gfffile, scan_rc=True):
	"""Scan a FASTA file with every motif of a PWM file, appending to a GFF file.

	Each motif is scanned with ``pwm_scan_to_gff`` (one reported match per
	sequence, ``append=True``). Any exception, including a failing import,
	is captured in ``error`` instead of being propagated.

	NOTE(review): as visible here nothing is returned, so ``error`` is
	discarded; the snippet looks truncated -- confirm against the full source.

	Parameters
	----------
	fastafile : str
		Name of the FASTA file to scan.
	motiffile : str
		Name of the file with PWMs.
	threshold : float or str
		Scan cutoff; converted with ``float()``.
	gfffile : str
		Name of the GFF output file.
	scan_rc : bool, optional
		Passed through to ``pwm_scan_to_gff``.
	"""
	error = None
	try:
		from gimmemotifs.fasta import Fasta
		from gimmemotifs.motif import pwmfile_to_motifs
		motifs = pwmfile_to_motifs(motiffile)
		fa = Fasta(fastafile)
		for motif in motifs:
			motif.pwm_scan_to_gff(fa, gfffile, nreport=1, cutoff=float(threshold), scan_rc=scan_rc, append=True)
	# "except ... as" is valid from Python 2.6 on and in Python 3
	except Exception as e:
		error = e
示例#36
0
    def set_background(self, fname=None, genome=None, length=200, nseq=10000):
        """Set the background to use for FPR and z-score calculations.

        Background can be specified either as a genome name or as the
        name of a FASTA file.

        Parameters
        ----------
        fname : str, optional
            Name of FASTA file to use as background.

        genome : str, optional
            Name of genome to use to retrieve random sequences.

        length : int, optional
            Length of genomic sequences to retrieve. The default
            is 200.

        nseq : int, optional
            Number of genomic sequences to retrieve.

        Raises
        ------
        ValueError
            If both `fname` and `genome` are given, or if neither is given
            and ``self.genome`` is not set.

        IOError
            If `fname` does not exist.
        """
        length = int(length)

        if genome and fname:
            raise ValueError("Need either genome or filename for background.")

        if fname:
            if not os.path.exists(fname):
                raise IOError(
                    "Background file {} does not exist!".format(fname))

            self.background = Fasta(fname)
            self.background_hash = file_checksum(fname)
            return

        if not genome:
            if self.genome:
                genome = self.genome
                logger.info(
                    "Using default background: genome {} with length {}".
                    format(genome, length))
            else:
                raise ValueError(
                    "Need either genome or filename for background.")

        logger.info("Using background: genome {} with length {}".format(
            genome, length))
        with Cache(CACHE_DIR) as cache:
            # Cache key is "<genome>\<length>" with a literal backslash; it is
            # escaped explicitly because "\{" is not a valid escape sequence.
            self.background_hash = "{}\\{}".format(genome, int(length))
            fa = cache.get(self.background_hash)
            if not fa:
                # Not cached yet: generate the random sequences and store them
                fa = RandomGenomicFasta(genome, length, nseq)
                cache.set(self.background_hash, fa)
        self.background = fa
示例#37
0
    def __init__(self, matchfile, genome="hg19", number=None):
        """Populate this Fasta object with sequences from ``matched_gc_bedfile``.

        A BED file is written by ``matched_gc_bedfile`` for ``matchfile``,
        ``genome`` and ``number``, converted to FASTA with ``track2fasta``
        using the genome's index directory, and loaded via ``Fasta.__init__``.
        """
        index_dir = os.path.join(MotifConfig().get_index_dir(), genome)

        # Scratch paths inside the project temp dir; only the names are kept
        bed_path = NamedTemporaryFile(dir=mytmpdir()).name
        fasta_path = NamedTemporaryFile(dir=mytmpdir()).name

        # BED coordinates -> FASTA sequences -> this object
        matched_gc_bedfile(bed_path, matchfile, genome, number)
        track2fasta(index_dir, bed_path, fasta_path)
        Fasta.__init__(self, fasta_path)

        # The scratch files are no longer needed once the sequences are loaded
        for scratch in (bed_path, fasta_path):
            os.remove(scratch)
示例#38
0
def divide_fa_file(fname, sample, rest, fraction, abs_max):
    """Randomly split a FASTA file into a sample file and a rest file.

    Parameters
    ----------
    fname : str
        Name of the input FASTA file.
    sample : str
        Name of the output file receiving the sampled sequences.
    rest : str
        Name of the output file receiving all other sequences.
    fraction : float
        Fraction of the sequences to put in the sample.
    abs_max : int
        Upper limit on the number of sampled sequences.

    Returns
    -------
    tuple
        Number of sampled sequences and number of remaining sequences.
    """
    fa = Fasta(fname)
    ids = fa.ids[:]

    x = min(int(fraction * len(ids)), abs_max)

    # A set gives constant-time membership tests in the loop below
    sample_seqs = set(random.sample(ids, x))

    # The context manager closes both files even if a write fails
    with open(sample, "w") as f_sample, open(rest, "w") as f_rest:
        for name, seq in fa.items():
            out = f_sample if name in sample_seqs else f_rest
            out.write(">%s\n%s\n" % (name, seq))

    return x, len(ids) - x
示例#39
0
def divide_fa_file(fname, sample, rest, fraction, abs_max):
    """Randomly split a FASTA file into a sample file and a rest file.

    Parameters
    ----------
    fname : str
        Name of the input FASTA file.
    sample : str
        Name of the output file receiving the sampled sequences.
    rest : str
        Name of the output file receiving all other sequences.
    fraction : float
        Fraction of the sequences to put in the sample.
    abs_max : int
        Upper limit on the number of sampled sequences.

    Returns
    -------
    tuple
        Number of sampled sequences and number of remaining sequences.
    """
    fa = Fasta(fname)
    ids = fa.ids[:]

    x = min(int(fraction * len(ids)), abs_max)

    # A set gives constant-time membership tests in the loop below
    sample_seqs = set(random.sample(ids, x))

    # The context manager closes both files even if a write fails
    with open(sample, "w") as f_sample, open(rest, "w") as f_rest:
        for name, seq in fa.items():
            out = f_sample if name in sample_seqs else f_rest
            out.write(">%s\n%s\n" % (name, seq))

    return x, len(ids) - x
示例#40
0
def motif_localization(fastafile, motif, width, outfile, cutoff=0.9):
    """Plot a histogram of motif match positions and return its p-value.

    The FASTA file is scanned with ``motif.pwm_scan`` (at most 100 matches
    reported per sequence). All match positions are pooled, passed to
    ``ks_pvalue`` and plotted with ``plot_histogram`` to ``outfile``.

    Returns
    -------
    tuple
        ``(motif.id, p)``, or ``(motif.id, 1.0)`` when there are no matches.
    """
    max_reported = 100

    hits = motif.pwm_scan(Fasta(fastafile), cutoff=cutoff, nreport=max_reported)
    if len(hits) == 0:
        return motif.id, 1.0

    # Pool the match positions of all sequences into one array
    pooled = []
    for per_sequence in hits.values():
        pooled += per_sequence
    positions = np.array(pooled)

    p = ks_pvalue(positions, width - len(motif))

    # Positions are shifted before plotting; the arithmetic is kept exactly
    # as written because its result depends on the division semantics.
    shifted = positions - width / 2 + len(motif) / 2
    plot_histogram(
        shifted,
        outfile,
        xrange=(-width / 2, width / 2),
        breaks=21,
        title="%s (p=%0.2e)" % (motif.id, p),
        xlabel="Position",
    )
    return motif.id, p
示例#41
0
def location(args):
    """
    Creates histogram of motif location.

    One ``motif_localization`` job per selected motif is submitted to a
    process pool; the function waits for all of them to finish.

    Parameters
    ----------
    args : argparse object
        Command line arguments. Reads ``fastafile``, ``pwmfile``, ``width``,
        ``ids`` and ``cutoff``.
    """
    fastafile = args.fastafile
    pwmfile = args.pwmfile

    lwidth = args.width
    if not lwidth:
        # No width given: use the length of the first sequence
        f = Fasta(fastafile)
        lwidth = len(f.items()[0][1])
        f = None

    jobs = []
    motifs = pwmfile_to_motifs(pwmfile)
    ids = [motif.id for motif in motifs]
    if args.ids:
        ids = args.ids.split(",")

    n_cpus = int(MotifConfig().get_default_params()["ncpus"])
    pool = Pool(processes=n_cpus, maxtasksperchild=1000)
    try:
        for motif in motifs:
            if motif.id in ids:
                outfile = "%s_histogram" % motif.id
                jobs.append(
                    pool.apply_async(
                        motif_localization,
                        (fastafile, motif, lwidth, outfile, args.cutoff)))

        # get() blocks until the job is done and re-raises worker exceptions
        for job in jobs:
            job.get()
    finally:
        # Release the worker processes instead of leaking them
        pool.close()
        pool.join()
示例#42
0
	def __init__(self, fasta, length=None, multiply=10, k=1, matrix_only=False):
		"""Fill this Fasta object with sequences from ``self._generate_sequence``.

		The transition matrices are initialized from ``fasta.seqs`` with
		order ``k``. Unless ``matrix_only`` is set, ``multiply`` sequences
		are generated for every input sequence, each of ``length`` when
		that is given and otherwise as long as the input sequence.
		"""
		self.k = k

		# Initialize super Fasta object
		Fasta.__init__(self)

		# Initialize Markov transition matrix
		self._initialize_matrices(fasta.seqs, k=k)

		if matrix_only:
			return

		counter = 0
		for template in fasta.seqs:
			for _ in range(multiply):
				seq_name = "random_Markov%s_%s" % (k, counter)
				target_len = length if length else len(template)
				self.add(seq_name, self._generate_sequence(target_len))
				counter += 1
示例#43
0
class TestMotifPwm(unittest.TestCase):
    """ A test class to test Motif pwmscan functionality and related things """

    def setUp(self):
        """ Load the motif and FASTA fixtures from test/data/pwmscan """
        self.data_dir = "test/data/pwmscan"

        self.motif = pwmfile_to_motifs(os.path.join(self.data_dir, "TATA.pwm"))[0]
        self.prom = Fasta(os.path.join(self.data_dir, "promoters.fa"))
        self.prom_gff = os.path.join(self.data_dir, "promoters_result.gff")
        self.random = Fasta(os.path.join(self.data_dir, "random_sequences.fa"))
        self.random_gff = os.path.join(self.data_dir, "random_result.gff")
        self.enrichment = os.path.join(self.data_dir, "enrichment.txt")
        # Only the path is kept; the tests write their output to it
        self.tmp = NamedTemporaryFile().name

    def test1_pwm_scan(self):
        """ Scan a FASTA file with PWM of motif """
        result = self.motif.pwm_scan(self.prom, nreport=1)

        # Every sequence should have a TATA match
        self.assertEqual(len(result.keys()), len(self.prom.items()))

    def test2_pwm_scan_to_gff(self):
        """ Scan a FASTA file with PWM of motif, and produce GFF """

        self.motif.pwm_scan_to_gff(self.prom, self.tmp)
        with open(self.tmp) as gff:
            for line in gff:
                vals = line.strip().split("\t")
                self.assertEqual(9, len(vals))
                self.assertTrue(int(vals[3]) > 0)
                self.assertTrue(int(vals[4]) > 0)
                self.assertTrue(float(vals[5]) > 5.25)
                self.assertTrue(float(vals[5]) < 9.06)
                self.assertIn(vals[6], ["+", "-"])

    def test3_gff_enrichment(self):
        """ Test gff_enrichment """
        self.motif.pwm_scan_to_gff(self.random, self.random_gff)
        gff_enrichment(self.prom_gff, self.random_gff, 316, 3160, self.tmp)
        with open(self.tmp) as f:
            f.readline() # Header
            vals = f.readline().strip().split("\t")
        self.assertEqual(vals[0], "TATA-box")
        self.assertLess(float(vals[2]), 1e-60)
        self.assertGreater(float(vals[5]), 1.5)

    def tearDown(self):
        """ Remove the scratch output file if a test created it """
        if os.path.exists(self.tmp):
            os.remove(self.tmp)
# Command line script: for every motif, report the score and the relative
# cutoff that correspond to the requested false positive rate on the
# background sequences.
parser = OptionParser()
parser.add_option("-p", "--pwmfile", dest="pwmfile", help="File with pwms", metavar="FILE")
parser.add_option("-i", "--inputfile", dest="inputfile", help="FASTA file with background sequences", metavar="FILE")
parser.add_option("-f", "--fpr", dest="fpr", help="Desired fpr", type="float", metavar="FLOAT")

(options, args) = parser.parse_args()

# Compare fpr against None: 0.0 is inside the accepted range checked below
# and must not be treated as a missing option.
if not options.pwmfile or not options.inputfile or options.fpr is None:
	parser.print_help()
	sys.exit()

if options.fpr < 0 or options.fpr > 1:
	print("Please specify a FPR between 0 and 1")
	sys.exit()

f = Fasta(options.inputfile)
motifs = pwmfile_to_motifs(options.pwmfile)

print("Motif\tScore\tCutoff")
for motif in motifs:
	pwm = motif.pwm
	scores = []
	min_score = motif.pwm_min_score()
	for name,seq in f.items():
		# First element of the first reported match is used as the score
		result = pwmscan(seq.upper(), pwm, min_score, 1, True)
		score = result[0][0]
		scores.append(score)
	# Score at the (1 - fpr) percentile of the background scores
	opt_score = scoreatpercentile(scores, 100 - (100 * options.fpr))
	# Express the score as a fraction of the motif's score range
	cutoff = (opt_score - min_score) / (motif.pwm_max_score() - min_score)
	print("%s\t%s\t%s" % (motif.id, opt_score , cutoff))
示例#45
0
def nmer_predict(fastafile):
	"""Predict motifs from n-mers whose positions are enriched centrally.

	Positions of all 6-, 8-, 10- and 12-mers in ``fastafile`` are collected.
	Frequent n-mers whose position histogram is concentrated in the central
	bins are written out as count matrices, clustered with ``cluster_motifs``
	and refined twice by scanning the input with the ``pwmscan.py`` command.

	Returns a tuple ``(refined_motifs, "", "")``.

	NOTE(review): ``string.maketrans`` is imported, which exists in Python 2
	only.
	"""
	from tempfile import NamedTemporaryFile,mkdtemp
	from gimmemotifs.fasta import Fasta
	from numpy import sum,histogram
	from subprocess import Popen,PIPE
	from gimmemotifs.motif import Motif,motif_from_align
	from gimmemotifs.cluster import cluster_motifs 
	from string import maketrans

	def rc(seq):
		"""Return the reverse complement of an upper-case A/C/G/T sequence."""
		t = maketrans("ATCG", "TAGC")
		return seq[::-1].translate(t)
	
	f = Fasta(fastafile)
	# n-mer (upper case) -> list of start positions over all sequences
	nmer = {}
	# n-mer length -> factor applied to the flanking histogram counts below
	N = {6:4, 8:3,10:2,12:1}
	tmp = NamedTemporaryFile()
	# An n-mer must occur more often than 2% of the number of sequences
	abs_cutoff = len(f.items()) / 100.0 * 2 
	for check_n,cutoff in N.items():
		for id,seq in f.items():
			# NOTE(review): this range stops before the window that starts at
			# len(seq) - check_n, so the last n-mer is not counted -- confirm.
			for i in range(len(seq) - check_n):
				n = seq[i: i + check_n]
				nmer.setdefault(n.upper(), []).append(i)

	for n,pos in nmer.items():
		if len(pos) > abs_cutoff:
			# Histogram of start positions: 9 bins over the range 0-200
			hist = histogram(pos, bins=9, range=(0,200))[0]	
			# Keep n-mers whose three central bins outweigh both flanks.
			# NOTE(review): the left flank uses bins 0-2 but the right flank
			# uses hist[7:] (bins 7-8), so bin 6 is ignored -- confirm.
			if sum(hist[3:6]) > sum(hist[0:3] * N[len(n)]) and  sum(hist[3:6]) > sum(hist[7:]) *  N[len(n)]:
				tmp.write(">%s\n" % n)
				# One row per position: the occurrence count in the column
				# of the observed nucleotide (order A, C, G, T), 0 elsewhere
				for char in n:
					w = []
					for x in  ["A", "C", "G", "T"]:
						if x == char:
							w.append(len(pos))
						else:
							w.append(0)

					tmp.write("\t".join([str(x) for x in w]) + "\n")
	
	
	# Flush so that cluster_motifs can read the matrices by file name
	tmp.flush()
	tmpname = tmp.name
	
	tree = cluster_motifs(tmpname, "subtotal", "ed", "mean", False, threshold=-0.1, include_bg=False)	
	clusters = tree.getResult()

	def refine_by_scanning(motifs, fastafile):
		"""Rebuild motifs from the matches found by scanning ``fastafile``.

		The motifs are written to a temporary file and scanned with the
		``pwmscan.py`` command (``-c 0.8``). The matched instances are
		grouped per motif, and a new motif is built with
		``motif_from_align`` for every group of more than 10 instances.
		"""
		
		tmp_gff = NamedTemporaryFile()
		file_in = NamedTemporaryFile()
		for m in motifs:
			file_in.write("%s\n" % m.to_pfm())
		file_in.flush()
		
		# The command's output is redirected into tmp_gff by the shell
		cmd = "pwmscan.py -i %s -p %s -c 0.8 > %s" % (fastafile, file_in.name, tmp_gff.name)
		p = Popen(cmd, shell=True)
		stdout,stderr = p.communicate()

		# motif name -> list of matched instances, all on the "+" strand
		aligns = {}
		for line in open(tmp_gff.name):	
			vals = line.strip().split("\t")
			# Column 9 must hold exactly two " ; "-separated attributes; the
			# second space-separated token of each, without quotes, is used
			motif,instance = [x.split(" ")[1].replace('"', "") for x in vals[8].split(" ; ")]
		
			if vals[6] == "+":
				aligns.setdefault(motif,[]).append(instance.upper())
			else:
				# Match on the other strand: store the reverse complement
				aligns.setdefault(motif,[]).append(rc(instance.upper()))

		# NOTE(review): tmp_out is never used in this function
		tmp_out = NamedTemporaryFile()
		
		refined_motifs = []
		for id,align in aligns.items():
			if len(align) > 10:
				motif = motif_from_align(align)
				refined_motifs.append(motif)
		
		return refined_motifs
	
	# First refinement: scan with the first element of every cluster
	motifs = refine_by_scanning([x[0] for x in clusters], fastafile)
	tmp4 = NamedTemporaryFile()
	for m in motifs:
		tmp4.write("%s\n" % m.to_pfm())
	tmp4.flush()


	# Cluster the refined motifs again and give the clusters sequential ids
	motifs = []
	tree = cluster_motifs(tmp4.name, "total", "wic", "mean", True, threshold=0.95, include_bg=True)	
	clusters = tree.getResult()
	for i, (cluster,members) in enumerate(clusters):
		cluster.id = "Nmer_%s" % (i + 1)
		motifs.append(cluster)
	
	# Second refinement; the final motifs get sequential ids
	refined_motifs = refine_by_scanning(motifs, fastafile)
	for i,m in enumerate(refined_motifs):
		m.id = "WannaMotif_%s" % (i + 1)
	
	return refined_motifs, "", ""	
        )
        return motif.id, p
    else:
        return motif.id, 1.0


# Both input files are used below, so show the help text when either one is
# missing (the guard previously fired only when both were absent).
if not options.fastafile or not options.pwmfile:
    parser.print_help()
    sys.exit()

fastafile = options.fastafile
pwmfile = options.pwmfile

lwidth = options.width
if not lwidth:
    # No width given: use the length of the first sequence
    f = Fasta(fastafile)
    lwidth = len(f.items()[0][1])
    f = None

# NOTE(review): the job server secret is hard-coded here
job_server = pp.Server(secret="pumpkinrisotto")
jobs = []
motifs = pwmfile_to_motifs(pwmfile)
ids = [motif.id for motif in motifs]
if options.ids:
    ids = options.ids.split(",")

# Submit one localization job per selected motif
for motif in motifs:
    if motif.id in ids:
        outfile = "%s_histogram" % motif.id
        jobs.append(job_server.submit(motif_localization, (fastafile, motif, lwidth, outfile, options.cutoff), (), ()))
示例#47
0
# Command line script: scan every sequence of the input FASTA file with
# every motif and print the matches.
# An input file plus either a PWM file or an mdmodule file is required.
if not (options.inputfile and (options.pwmfile or options.mdmodulefile)):
	parser.print_help()
	sys.exit(0)

inputfile = options.inputfile

# NOTE(review): nreport is only bound here when the option is truthy; unless
# it is defined elsewhere, the pwmscan call below raises NameError -- confirm.
if options.nreport:
	nreport = int(options.nreport)

cutoff = float(options.cutoff)

# NOTE(review): options.pwmfile is read even when only the mdmodule file was
# supplied -- confirm.
motifs = pwmfile_to_motifs(options.pwmfile)

bed = options.bed

f = Fasta(inputfile)
# NOTE(review): strandmap is not used in the visible part of the script
strandmap = {-1:"-",1:"+"}
for (id,seq) in f.items():
	for motif in motifs:
		pwm = motif.pwm
		# Turn the relative cutoff (0-1) into an absolute score threshold
		c =  motif.pwm_min_score() + (motif.pwm_max_score() - motif.pwm_min_score()) * cutoff 
		result = pwmscan(seq.upper(), pwm, c, nreport)
		for (score, pos, strand) in result:
			# NOTE(review): nothing is printed when bed is false; the snippet
			# appears to be truncated -- confirm.
			if bed:
				# The first space-separated token of the id must contain
				# exactly one ":", otherwise the unpacking below raises
				# ValueError
				first = id.split(" ")[0]	
				(chr,loc) = first.split(":")
				if loc:
					# Offset the match position by the start of the location
					(start, end) = map(int, loc.split("-"))
					print "%s\t%s\t%s\t%s" % (chr, start + pos, start + pos + len(pwm) , score)
				else:
					print "%s\t%s\t%s\t%s" % (id, pos, pos +  len(pwm), score)
示例#48
0
def genome(args):
    """
    Download a UCSC genome with gene annotation and create its index.

    The gene annotation is converted to BED with ``genePredToBed``, the
    genome archive is downloaded and unpacked into ``<fastadir>/<build>``,
    a single multi-sequence FASTA file is split into one file per sequence,
    and ``GenomeIndex.create_index`` is run on the result.

    The process exits with status 1 when a required directory or the
    ``genePredToBed`` executable is missing, when the genome directory
    cannot be created, or when the genome download fails.

    Parameters
    ----------
    args : argparse object
        Command line arguments. Reads ``indexdir``, ``fastadir`` and
        ``genomebuild``.
    """
    config = MotifConfig()

    if not os.path.exists(args.indexdir):
        print("Index_dir %s does not exist!" % (args.indexdir))
        sys.exit(1)

    if not os.path.exists(args.fastadir):
        print("FASTA dir %s does not exist!" % (args.fastadir))
        sys.exit(1)

    pred_bin = "genePredToBed"
    pred = find_executable(pred_bin)
    if not pred:
        sys.stderr.write("{} not found in path!\n".format(pred_bin))
        sys.exit(1)

    fastadir = args.fastadir
    genomebuild = args.genomebuild
    genome_dir = os.path.join(fastadir, genomebuild)
    index_dir = os.path.join(args.indexdir, args.genomebuild)

    # Check for rights to write to directory
    if not os.path.exists(genome_dir):
        try:
            os.mkdir(genome_dir)
        except OSError:
            sys.stderr.write("Could not create genome dir {}\n".format(genome_dir))
            sys.exit(1)

    # Download gene file based on URL + genomebuild
    gene_file = os.path.join(config.get_gene_dir(), "%s.bed" % genomebuild)
    tmp = NamedTemporaryFile(delete=False, suffix=".gz")
    # Only the reserved path is needed; urlretrieve writes to it by name
    tmp.close()
    try:
        # Collect the annotation files offered in the remote listing
        anno = []
        listing = urllib2.urlopen(UCSC_GENE_URL.format(genomebuild))
        p = re.compile(r'\w+.Gene.txt.gz')
        for line in listing.readlines():
            m = p.search(line)
            if m:
                anno.append(m.group(0))

        sys.stderr.write("Retrieving gene annotation for {}\n".format(genomebuild))
        # Use the first entry of ANNOS that is available
        url = ""
        for a in ANNOS:
            if a in anno:
                url = UCSC_GENE_URL.format(genomebuild) + a
                break
        if url:
            urllib.urlretrieve(url, tmp.name)
            sp.call("zcat {} | cut -f2-11 | {} /dev/stdin {}".format(tmp.name, pred, gene_file), shell=True)
        else:
            sys.stderr.write("No annotation found!\n")
    finally:
        # Created with delete=False, so it has to be removed explicitly
        os.unlink(tmp.name)

    # download genome based on URL + genomebuild
    sys.stderr.write("Downloading {} genome\n".format(genomebuild))
    for genome_url in [UCSC_GENOME_URL, ALT_UCSC_GENOME_URL]:
        remote = genome_url.format(genomebuild)
        genome_fa = os.path.join(genome_dir, os.path.split(remote)[-1])

        sys.stderr.write("Trying to download {}\n".format(remote))
        urllib.urlretrieve(remote, genome_fa)

        if check_genome_file(genome_fa):
            break
    else:
        # None of the URLs yielded a file that passes check_genome_file
        sys.stderr.write("Failed to download genome\n")
        sys.exit(1)

    sys.stderr.write("Unpacking\n")
    if genome_fa.endswith("tar.gz"):
        cmd = "tar -C {0} -xvzf {1} && rm {1}".format(genome_dir, genome_fa)
    else:
        cmd = "gunzip {0} && rm {0}".format(genome_fa)

    sp.call(cmd, shell=True, cwd=genome_dir)

    fa_files = glob("{}/*.fa".format(genome_dir))
    if len(fa_files) == 1:
        # A single FASTA file: split it into one file per sequence
        f = Fasta(fa_files[0])
        for n, s in f.items():
            out_name = os.path.join(genome_dir, "{}.fa".format(n))
            # Open for writing and include the ">" FASTA header line
            with open(out_name, "w") as out:
                out.write(">{}\n{}\n".format(n, s))

        os.unlink(fa_files[0])

    sys.stderr.write("Creating index\n")
    g = GenomeIndex()
    g = g.create_index(genome_dir, index_dir)