import gzip
import os
import re
import string
import sys
import zlib


def createDatabase(db, iterator,
                   force=False,
                   synonyms=None,
                   compression=None,
                   random_access_points=None,
                   regex_identifier=None,
                   clean_sequence=False,
                   ignore_duplicates=False,
                   allow_duplicates=False,
                   translator=None):
    """index sequences from *iterator* to create a database.

    Two new files are created - db.fasta and db_name.idx

    If compression is enabled, provide random access points
    every # bytes.

    Dictzip is treated as an uncompressed file.

    regex_identifier: pattern to extract identifier from description line.
    If None, the part until the first white-space character is used.

    translator: specify a translator
    """

    if db.endswith(".fasta"):
        db = db[:-len(".fasta")]

    if compression:
        if compression == "lzo":
            import lzo

            def lzo_mangler(s):
                return lzo.compress(s, 9)
            mangler = lzo_mangler
            db_name = db + ".lzo"
            write_chunks = True
        elif compression == "zlib":
            def zlib_mangler(s):
                return zlib.compress(s, 9)
            mangler = zlib_mangler
            db_name = db + ".zlib"
            write_chunks = True
        elif compression == "gzip":
            # gzip_mangler is assumed to be defined elsewhere in this module
            mangler = gzip_mangler
            db_name = db + ".gz"
            write_chunks = True
        elif compression == "dictzip":
            import dictzip

            def mangler(x):
                return x
            db_name = db + ".dz"
            write_chunks = False
        elif compression == "bzip2":
            import bz2

            def bzip_mangler(x):
                return bz2.compress(x, 9)
            mangler = bzip_mangler
            db_name = db + ".bz2"
            write_chunks = True
        elif compression == "debug":
            def mangler(x):
                return x
            db_name = db + ".debug"
            write_chunks = True
        elif compression == "rle":
            import RLE
            mangler = RLE.compress
            db_name = db + ".rle"
            write_chunks = True
        else:
            raise ValueError("unknown compression library: %s" % compression)

        index_name = db + ".cdx"

        # chunked output requires a positive chunk size
        if write_chunks and (random_access_points is None or
                             random_access_points <= 0):
            raise ValueError("specify chunksize in --random-access-points")

    else:
        def mangler(x):
            return x
        db_name = db + ".fasta"
        write_chunks = False
        index_name = db + ".idx"

    if os.path.exists(db_name) and not force:
        raise ValueError("database %s already exists." % db_name)

    if os.path.exists(index_name) and not force:
        raise ValueError("database index %s already exists." % index_name)

    outfile_index = open(index_name, "w")

    if compression == "dictzip":
        if random_access_points is None or random_access_points <= 0:
            raise ValueError(
                "specify dictzip chunksize in --random-access-points")
        outfile_fasta = dictzip.open(db_name, "wb",
                                     buffersize=1000000,
                                     chunksize=random_access_points)
        compression = None
    else:
        outfile_fasta = open(db_name, "wb")

    identifiers = {}
    lsequence = 0
    identifier_pos, sequence_pos = 0, 0

    translation = string.maketrans("xX", "nN")

    fragments = []
    lfragment = 0
    last_identifier = None

    while 1:
        try:
            result = iterator.next()
        except StopIteration:
            break

        if not result:
            break

        is_new, identifier, fragment = result

        if is_new:
            # check for duplicate identifiers
            if identifier in identifiers:
                if ignore_duplicates:
                    raise ValueError("ignoring duplicates not implemented")
                elif allow_duplicates:
                    # the current implementation will fail if the same
                    # identifiers are directly succeeding each other
                    # better: add return to iterator that indicates a new
                    # identifier
                    out_identifier = identifier + \
                        "_%i" % (identifiers[identifier])
                    identifiers[identifier] += 1
                    identifiers[out_identifier] = 1
                else:
                    raise ValueError(
                        "%s occurs more than once" % (identifier,))
            else:
                identifiers[identifier] = 1
                out_identifier = identifier

            if last_identifier:
                if write_chunks:
                    writeFragments(outfile_fasta, outfile_index,
                                   fragments, mangler,
                                   size=random_access_points,
                                   write_all=True)
                    fragments = []
                    lfragment = 0
                else:
                    outfile_fasta.write("\n")

                outfile_index.write("\t%i\n" % lsequence)

            # write identifier
            identifier_pos = outfile_fasta.tell()
            outfile_fasta.write(mangler(">%s\n" % out_identifier))
            sequence_pos = outfile_fasta.tell()

            outfile_index.write("%s\t%i" % (out_identifier, identifier_pos))
            if write_chunks:
                outfile_index.write("\t%i" % random_access_points)
            else:
                outfile_index.write("\t%i" % sequence_pos)

            fragments = []
            lsequence = 0
            last_identifier = identifier

        if translator:
            s = translator(fragment)
        else:
            s = re.sub(r"\s", "", fragment.strip())
            if clean_sequence:
                s = s.translate(translation)

        lsequence += len(s)

        if write_chunks:
            fragments.append(s)
            lfragment += len(s)
            if lfragment > random_access_points:
                rest = writeFragments(outfile_fasta,
                                      outfile_index,
                                      fragments,
                                      mangler,
                                      size=random_access_points,
                                      write_all=False)
                fragments = [rest]
                lfragment = len(rest)
        else:
            outfile_fasta.write(mangler(s))

    if write_chunks:
        writeFragments(outfile_fasta, outfile_index, fragments, mangler,
                       size=random_access_points, write_all=True)
    else:
        outfile_fasta.write("\n")
    outfile_index.write("\t%i\n" % lsequence)

    # add synonyms for the table
    if synonyms:
        for key, vals in synonyms.items():
            for val in vals:
                outfile_index.write("%s\t%s\n" % (key, val))
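
# The iterator-based createDatabase above consumes (is_new, identifier,
# fragment) tuples rather than reading files itself. The generator below
# is a minimal sketch of such an iterator, reconstructed from the call
# site; it is not part of the original module and its name is
# hypothetical. In Python 2, a generator provides the .next() method
# that createDatabase calls.
def iterate_fasta_fragments(infile, regex_identifier=None):
    """yield (is_new, identifier, fragment) tuples from a FASTA stream.

    is_new is True exactly once per record, on the description line,
    with an empty fragment; subsequent lines of the record are yielded
    with is_new set to False.
    """
    identifier = None
    for line in infile:
        if line.startswith("#"):
            continue
        if line.startswith(">"):
            if regex_identifier:
                identifier = re.search(regex_identifier,
                                       line[1:-1]).groups()[0]
            else:
                identifier = re.split(r"\s", line[1:-1])[0]
            yield True, identifier, ""
        else:
            yield False, identifier, line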
def createDatabase(db, filenames,
                   force=False,
                   synonyms=None,
                   compression=None,
                   random_access_points=None,
                   regex_identifier=None,
                   clean_sequence=False):
    """index files in filenames to create database.

    Two new files are created - db.fasta and db_name.idx

    If compression is enabled, provide random access points
    every # bytes.

    Dictzip is treated as an uncompressed file.

    regex_identifier: pattern to extract identifier from description line.
    If None, the part until the first white-space character is used.
    """

    if compression:
        if compression == "lzo":
            import lzo

            def lzo_mangler(s):
                return lzo.compress(s, 9)
            mangler = lzo_mangler
            db_name = db + ".lzo"
            write_chunks = True
        elif compression == "zlib":
            def zlib_mangler(s):
                return zlib.compress(s, 9)
            mangler = zlib_mangler
            db_name = db + ".zlib"
            write_chunks = True
        elif compression == "gzip":
            # gzip_mangler is assumed to be defined elsewhere in this module
            mangler = gzip_mangler
            db_name = db + ".gz"
            write_chunks = True
        elif compression == "dictzip":
            import dictzip

            def mangler(x):
                return x
            db_name = db + ".dz"
            write_chunks = False
        elif compression == "debug":
            def mangler(x):
                return x
            db_name = db + ".debug"
            write_chunks = True
        else:
            raise ValueError("unknown compression library: %s" % compression)
    else:
        def mangler(x):
            return x
        db_name = db + ".fasta"
        write_chunks = False

    index_name = db + ".idx"

    if db in filenames:
        raise ValueError("database (%s) is part of input set." % db_name)

    if os.path.exists(db_name) and not force:
        raise ValueError("database %s already exists." % db_name)

    if os.path.exists(index_name) and not force:
        raise ValueError("database index %s already exists." % index_name)

    outfile_index = open(index_name, "w")

    if compression == "dictzip":
        if random_access_points is None or random_access_points <= 0:
            raise ValueError(
                "specify dictzip chunksize in --random-access-points")
        outfile_fasta = dictzip.open(db_name, "wb",
                                     buffersize=1000000,
                                     chunksize=random_access_points)
        compression = None
    else:
        outfile_fasta = open(db_name, "wb")

    if isinstance(filenames, str):
        filenames = [filenames]

    identifiers = {}
    lsequence = 0
    identifier_pos, sequence_pos = 0, 0

    translation = string.maketrans("xX", "nN")

    for filename in filenames:

        if filename == "-":
            infile = sys.stdin
        elif filename[-3:] == ".gz":
            infile = gzip.open(filename, "r")
        else:
            infile = open(filename, "r")

        fragments = []
        lfragment = 0
        first = True

        for line in infile:

            if line[0] == "#":
                continue

            if line[0] == ">":

                if not first:
                    if write_chunks:
                        writeFragments(outfile_fasta, outfile_index,
                                       fragments, mangler,
                                       random_access_points, True)
                        fragments = []
                        lfragment = 0
                    else:
                        outfile_fasta.write("\n")

                    outfile_index.write("\t%i\n" % lsequence)

                first = False

                if regex_identifier:
                    try:
                        identifier = re.search(regex_identifier,
                                               line[1:-1]).groups()[0]
                    except AttributeError:
                        raise ValueError(
                            "could not parse identifier from line %s" %
                            line[1:-1])
                else:
                    identifier = re.split(r"\s", line[1:-1])[0]

                # check for duplicate identifiers
                if identifier in identifiers:
                    raise ValueError(
                        "%s occurs more than once in %s and %s: line=%s" %
                        (identifier, identifiers[identifier],
                         filename, line[1:-1]))
                identifiers[identifier] = filename

                # write identifier, the identifier includes a new-line
                identifier_pos = outfile_fasta.tell()
                outfile_fasta.write("%s" % mangler(line))
                sequence_pos = outfile_fasta.tell()

                outfile_index.write("%s\t%i" % (identifier, identifier_pos))
                if write_chunks:
                    outfile_index.write("\t%i" % random_access_points)
                else:
                    outfile_index.write("\t%i" % sequence_pos)

                lsequence = 0

            else:
                s = re.sub(r"\s", "", line.strip())

                if clean_sequence:
                    s = s.translate(translation)

                lsequence += len(s)

                if write_chunks:
                    fragments.append(s)
                    lfragment += len(s)
                    if lfragment > random_access_points:
                        rest = writeFragments(outfile_fasta, outfile_index,
                                              fragments, mangler,
                                              random_access_points, False)
                        fragments = [rest]
                        lfragment = len(rest)
                else:
                    outfile_fasta.write(mangler(s))

    if write_chunks:
        writeFragments(outfile_fasta, outfile_index, fragments, mangler,
                       random_access_points, True)
    else:
        outfile_fasta.write("\n")
    outfile_index.write("\t%i\n" % lsequence)

    # add synonyms for the table
    if synonyms:
        for key, vals in synonyms.items():
            for val in vals:
                outfile_index.write("%s\t%s\n" % (key, val))
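
# Both versions of createDatabase delegate chunked output to a
# writeFragments helper that is not defined in this excerpt. The sketch
# below reconstructs its behaviour from the call sites and is an
# assumption, not the original implementation: it joins the buffered
# fragments, writes mangled chunks of exactly `size` characters while
# recording each chunk's start offset in the index, and, unless
# write_all is set, returns the trailing partial chunk for the caller
# to carry over into the next buffer.
def writeFragments(outfile_fasta, outfile_index, fragments, mangler,
                   size, write_all=False):
    """write `fragments` as mangled chunks of `size` characters.

    Returns the unwritten remainder ("" if write_all is True).
    """
    s = "".join(fragments)
    while len(s) >= size:
        outfile_index.write("\t%i" % outfile_fasta.tell())
        outfile_fasta.write(mangler(s[:size]))
        s = s[size:]
    if write_all and s:
        outfile_index.write("\t%i" % outfile_fasta.tell())
        outfile_fasta.write(mangler(s))
        s = ""
    return s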
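
# A minimal usage sketch; the database name and input file are
# hypothetical. This indexes genome.fasta into mydb.zlib / mydb.idx
# with zlib-compressed chunks and a random access point every
# megabyte of sequence.
if __name__ == "__main__":
    createDatabase("mydb",
                   ["genome.fasta"],
                   force=True,
                   compression="zlib",
                   random_access_points=1000000)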