def index(args):
    """Create a genome index plus a concatenated FASTA file for bedtools.

    Reads three attributes from ``args``:

    * ``indexdir``  -- existing parent directory that holds all indexes
    * ``indexname`` -- name of the sub-directory to create the index in
    * ``fastadir``  -- directory containing the FASTA files to index

    Exits with status 1 when ``args.indexdir`` does not exist.
    Re-raises ``pybedtools.helpers.BEDToolsError`` unless its message
    contains the word "generating".
    """
    if not os.path.exists(args.indexdir):
        # Parenthesised single-argument form works on Python 2 and 3.
        print("Index_dir %s does not exist!" % (args.indexdir))
        sys.exit(1)

    fasta_dir = args.fastadir
    index_dir = os.path.join(args.indexdir, args.indexname)

    g = GenomeIndex()
    g.create_index(fasta_dir, index_dir)

    # Create genome FASTA file for use with bedtools
    genome_fa = os.path.join(index_dir, "genome.fa")
    with open(genome_fa, "w") as out:
        for fname in find_by_ext(fasta_dir, FASTA_EXT):
            # Close every input file as soon as it has been copied.
            with open(fname) as f:
                for line in f:
                    out.write(line)

    # Remove an index left over from a previous genome.fa; a stale .fai no
    # longer matches the file that was just rewritten.
    fai = genome_fa + ".fai"
    if os.path.exists(fai):
        os.unlink(fai)

    # Run bedtools once on a 1-bp interval of the first chromosome.
    # NOTE(review): presumably this is done so that bedtools generates its
    # FASTA index right away -- confirm.
    test_chr = g.get_chromosomes()[0]
    # Text mode is required to write str on Python 3; the context manager
    # keeps the temporary file alive exactly as long as bedtools needs it.
    with NamedTemporaryFile(mode="w") as tmp:
        tmp.write("{}\t1\t2\n".format(test_chr))
        tmp.flush()
        b = pybedtools.BedTool(tmp.name)
        try:
            b.nucleotide_content(fi=genome_fa)
        except pybedtools.helpers.BEDToolsError as e:
            # Errors mentioning "generating" are tolerated; anything else
            # is a real failure.
            if "generating" not in str(e):
                raise
def create_bedtools_fa(index_dir, fasta_dir):
    """Write ``genome.fa`` into index_dir and run bedtools on it once.

    Every file that ``find_by_ext(fasta_dir, FASTA_EXT)`` returns is
    appended to ``<index_dir>/genome.fa``.  An existing ``genome.fa.fai``
    is deleted, after which ``nucleotide_content`` is invoked on a single
    1-bp interval of the first chromosome known to the index.

    A ``pybedtools.helpers.BEDToolsError`` is re-raised unless its message
    contains "generating".
    """
    g = GenomeIndex(index_dir)
    genome_fa = os.path.join(index_dir, "genome.fa")
    fai_path = genome_fa + ".fai"

    # Concatenate all FASTA files into the single file bedtools will read.
    with open(genome_fa, 'w') as merged:
        for fasta_path in find_by_ext(fasta_dir, FASTA_EXT):
            with open(fasta_path) as handle:
                merged.writelines(handle)

    # Delete old bedtools index if it exists, otherwise bedtools will
    # give an error.
    if os.path.exists(fai_path):
        os.unlink(fai_path)

    first_chrom = g.get_chromosomes()[0]

    # One-line BED file covering a single base of the first chromosome.
    bed_tmp = NamedTemporaryFile(mode="w")
    bed_tmp.write("{}\t1\t2\n".format(first_chrom))
    bed_tmp.flush()

    probe = pybedtools.BedTool(bed_tmp.name)
    try:
        # pylint: disable=unexpected-keyword-arg
        probe.nucleotide_content(fi=genome_fa)
    except pybedtools.helpers.BEDToolsError as err:
        # Messages mentioning "generating" are tolerated.
        if "generating" in str(err):
            return
        raise
def create_bedtools_fa(index_dir, fasta_dir):
    """Create ``<index_dir>/genome.fa`` for use with bedtools.

    All files returned by ``find_by_ext(fasta_dir, FASTA_EXT)`` are
    concatenated into ``genome.fa``.  bedtools is then run once on a 1-bp
    interval of the first chromosome of the index in ``index_dir``.

    Re-raises ``pybedtools.helpers.BEDToolsError`` unless its message
    contains the word "generating".
    """
    g = GenomeIndex(index_dir)
    genome_fa = os.path.join(index_dir, "genome.fa")

    # Create genome FASTA file for use with bedtools
    with open(genome_fa, "w") as out:
        for fname in find_by_ext(fasta_dir, FASTA_EXT):
            # Close each input file instead of leaking the handle.
            with open(fname) as f:
                for line in f:
                    out.write(line)

    # Remove an index left over from a previous genome.fa; a stale .fai no
    # longer matches the file that was just rewritten.
    fai = genome_fa + ".fai"
    if os.path.exists(fai):
        os.unlink(fai)

    test_chr = g.get_chromosomes()[0]
    # Text mode so that writing str works on Python 3 as well; the context
    # manager removes the temporary file deterministically.
    with NamedTemporaryFile(mode="w") as tmp:
        tmp.write("{}\t1\t2\n".format(test_chr))
        tmp.flush()
        b = pybedtools.BedTool(tmp.name)
        try:
            b.nucleotide_content(fi=genome_fa)
        except pybedtools.helpers.BEDToolsError as e:
            # Errors mentioning "generating" are tolerated; anything else
            # is a real failure.
            if "generating" not in str(e):
                raise
class GenomeIndex:
    """Index FASTA-formatted files for faster retrieval of sequences.

    Typical use:

        # Make index
        g = GenomeIndex()
        g.create_index("/usr/share/genomes/hg18", "/usr/share/genome_index/hg18")

        # Retrieve sequence
        g = GenomeIndex("/usr/share/genome_index/hg18")
        seq = g.get_sequence("chr17", "7520037", "7531588")

        # Batch bed-file to fasta-file
        track2fasta("/usr/share/genome_index/hg18", "p53_targets.bed", "p53_targets.fa")
    """

    def __init__(self, index_dir=None):
        """Initialize GenomeIndex with index_dir as optional argument.

        When index_dir is given and already contains the parameter file,
        the existing index is loaded via ``_read_index_file``.
        """
        self.param_file = "index.params"
        self.size_file = "genome.size"
        self.index_dir = index_dir
        self.fasta_dir = None
        # NOTE(review): these dicts are presumably filled per sequence by
        # _read_index_file, which is not visible here -- confirm.
        self.size = {}
        self.fasta_file = {}
        self.index_file = {}
        self.line_size = {}
        # struct format character used for every offset in an index file.
        self.pack_char = "L"

        if self.index_dir:
            if os.path.exists(os.path.join(self.index_dir, self.param_file)):
                self._read_index_file()

    def _check_dir(self, dir):
        """Check if dir exists; if not, print a message and exit (status 1)."""
        if not os.path.exists(dir):
            print("Directory %s does not exist!" % dir)
            sys.exit(1)

    def _make_index(self, fasta, index):
        """Index a single, one-sequence FASTA file.

        For every line after the header line, the file position of the
        start of that line is packed with ``self.pack_char`` and appended
        to ``index``.
        """
        # The index holds packed binary data, so it must be opened in
        # binary mode; both handles are closed even if an error occurs.
        with open(index, "wb") as out, open(fasta) as f:
            # Skip first line of fasta-file
            f.readline()
            offset = f.tell()
            line = f.readline()
            while line:
                out.write(pack(self.pack_char, offset))
                offset = f.tell()
                line = f.readline()

    def create_index(self, fasta_dir=None, index_dir=None):
        """Index all fasta-files in fasta_dir (one sequence per file!) and
        store the results in index_dir.

        Falls back to ``self.fasta_dir`` / ``self.index_dir`` for arguments
        that are not supplied.  Exits with status 1 when a directory is
        undefined, missing, or not writable.  Raises ``IOError`` when no
        FASTA files are found or the parameter file cannot be opened.
        """
        import errno

        # Use default directories if they are not supplied
        if not fasta_dir:
            fasta_dir = self.fasta_dir

        if not index_dir:
            index_dir = self.index_dir

        # Can't continue if we still don't have an index_dir or fasta_dir
        if not fasta_dir:
            print("fasta_dir not defined!")
            sys.exit(1)

        if not index_dir:
            print("index_dir not defined!")
            sys.exit(1)

        index_dir = os.path.abspath(index_dir)
        fasta_dir = os.path.abspath(fasta_dir)

        self.index_dir = index_dir

        # Prepare index directory
        if not os.path.exists(index_dir):
            try:
                os.mkdir(index_dir)
            except OSError as e:
                if e.errno == errno.EACCES:
                    sys.stderr.write(
                        "No permission to create index directory. Superuser access needed?\n"
                    )
                    # Non-zero status: this is a failure.
                    sys.exit(1)
                else:
                    # stderr.write needs a string, not the exception object.
                    # The _check_dir call below aborts if the directory is
                    # still missing.
                    sys.stderr.write("{}\n".format(e))

        # Directories need to exist
        self._check_dir(fasta_dir)
        self._check_dir(index_dir)

        # Get all fasta-files
        fastafiles = find_by_ext(fasta_dir, FASTA_EXT)
        if not fastafiles:
            raise IOError(
                "No fastafiles found in {} with extension in {}".format(
                    fasta_dir, ",".join(FASTA_EXT)))

        # param_file will hold all the information about the location of
        # the fasta-files, indices and length of the sequences
        param_file = os.path.join(index_dir, self.param_file)
        size_file = os.path.join(index_dir, self.size_file)

        try:
            out = open(param_file, "w")
        except IOError as e:
            if e.errno == errno.EACCES:
                sys.stderr.write(
                    "No permission to create files in index directory. Superuser access needed?\n"
                )
                sys.exit(1)
            else:
                sys.stderr.write("{}\n".format(e))
                # Without the parameter file there is nothing to continue
                # with; propagate the original error.
                raise
        # NOTE(review): the visible source of this method stops here, with
        # `out` still open and `size_file` unused. The remainder has not
        # been reconstructed -- confirm against the complete file.
def create_index(self, fasta_dir=None, index_dir=None):
    """Index all fasta-files in fasta_dir (one sequence per file!) and
    store the results in index_dir.

    Falls back to ``self.fasta_dir`` / ``self.index_dir`` for arguments
    that are not supplied.  For every FASTA file one line is written to
    the parameter file (sequence name, FASTA path, index path, line size,
    total size) and one to the size file (sequence name, total size), and
    a per-sequence index is built with ``self._make_index``.  Finally the
    new index is loaded with ``self._read_index_file``.

    Exits with status 1 when a directory is undefined, missing or not
    writable, or when a FASTA file is invalid or holds more than one
    sequence.  Raises ``IOError`` when no FASTA files are found or the
    parameter file cannot be opened.
    """
    import errno

    # Use default directories if they are not supplied
    if not fasta_dir:
        fasta_dir = self.fasta_dir

    if not index_dir:
        index_dir = self.index_dir

    # Can't continue if we still don't have an index_dir or fasta_dir
    if not fasta_dir:
        print("fasta_dir not defined!")
        sys.exit(1)

    if not index_dir:
        print("index_dir not defined!")
        sys.exit(1)

    index_dir = os.path.abspath(index_dir)
    fasta_dir = os.path.abspath(fasta_dir)

    self.index_dir = index_dir

    # Prepare index directory
    if not os.path.exists(index_dir):
        try:
            os.mkdir(index_dir)
        except OSError as e:
            if e.errno == errno.EACCES:
                sys.stderr.write("No permission to create index directory. Superuser access needed?\n")
                # Non-zero status: this is a failure.
                sys.exit(1)
            else:
                # stderr.write needs a string, not the exception object.
                # The _check_dir call below aborts if the directory is
                # still missing.
                sys.stderr.write("{}\n".format(e))

    # Directories need to exist
    self._check_dir(fasta_dir)
    self._check_dir(index_dir)

    # Get all fasta-files
    fastafiles = find_by_ext(fasta_dir, FASTA_EXT)
    if not fastafiles:
        msg = "No fastafiles found in {} with extension in {}".format(
            fasta_dir, ",".join(FASTA_EXT))
        raise IOError(msg)

    # param_file will hold all the information about the location of the
    # fasta-files, indices and length of the sequences
    param_file = os.path.join(index_dir, self.param_file)
    size_file = os.path.join(index_dir, self.size_file)

    try:
        out = open(param_file, "w")
    except IOError as e:
        if e.errno == errno.EACCES:
            sys.stderr.write("No permission to create files in index directory. Superuser access needed?\n")
            sys.exit(1)
        else:
            sys.stderr.write("{}\n".format(e))
            # Nothing can be written without the parameter file;
            # propagate the original error instead of failing later on
            # an unbound name.
            raise

    # Context managers close both output files on every exit path,
    # including the sys.exit calls below.
    with out, open(size_file, "w") as s_out:
        for fasta_file in fastafiles:
            with open(fasta_file) as f:
                line = f.readline()
                if not line.startswith(">"):
                    sys.stderr.write("%s is not a valid FASTA file, expected > at first line\n" % fasta_file)
                    sys.exit(1)

                seqname = line.strip().replace(">", "")
                line = f.readline()
                # Width of the first sequence line, stored in the
                # parameter file.
                line_size = len(line.strip())

                total_size = 0
                while line:
                    line = line.strip()
                    if line.startswith(">"):
                        sys.stderr.write("Sorry, can only index genomes with "
                                         "one sequence per FASTA file\n%s contains multiple "
                                         "sequences\n" % fasta_file)
                        sys.exit(1)
                    total_size += len(line)
                    line = f.readline()

            index_file = os.path.join(index_dir, "%s.index" % seqname)

            out.write("{}\t{}\t{}\t{}\t{}\n".format(
                seqname, fasta_file, index_file, line_size, total_size))
            s_out.write("{}\t{}\n".format(seqname, total_size))

            self._make_index(fasta_file, index_file)

    # Read the index we just made so we can immediately use it
    self._read_index_file()