def __init__(self, path, fail_msg="cannot open 2bit file", debug=False):
    """Open the file at *path* and parse its header and sequence tables.

    path     -- location of the file, passed to OpenFile() in "rb" mode
    fail_msg -- message handed to OpenFile() for use if the open fails
    debug    -- stored on the instance as self.debug
    """
    self.debug = debug
    self.file = OpenFile(path, "rb", fail_msg)
    # The parse stages must run in exactly this order: file header first,
    # then the sequence index, then the per-sequence headers.
    parse_stages = (
        self.ReadHeader,
        self.ReadSequenceIndices,
        self.ReadSequenceHeaders,
    )
    for parse_stage in parse_stages:
        parse_stage()
class TwoBitFileCls:
    """Reader for a 2bit sequence file.

    On construction the file is opened and its header, sequence index and
    per-sequence headers are parsed.  Sequence data can then be retrieved
    with GetSequence().

    Attributes set while parsing:
      debug           -- when true, progress messages are sent to ErrMsg()
      file            -- the open file object returned by OpenFile()
      byte_swap       -- whether values must be byte-swapped when decoded
      num_seqs        -- number of sequences announced by the file header
      chrom_sequences -- dict mapping sequence name -> TwoBitSequenceCls
    """

    def __init__(self, path, fail_msg="cannot open 2bit file", debug=False):
        """Open *path* in binary mode and parse the header and index tables.

        path     -- location of the 2bit file
        fail_msg -- message passed to OpenFile() for use if opening fails
        debug    -- enable debugging output via ErrMsg()
        """
        self.debug = debug
        self.file = OpenFile(path, "rb", fail_msg)
        self.ReadHeader()
        self.ReadSequenceIndices()
        self.ReadSequenceHeaders()
    # end def

    def __del__(self):
        """Close the underlying file, if one was ever opened."""
        # __del__ also runs when __init__ failed before self.file was
        # assigned, so the attribute must be looked up defensively.
        file_obj = getattr(self, "file", None)
        if (file_obj is not None):
            file_obj.close()
        # end if
    # end def

    def ResetFile(self):
        """Move the file position back to the start of the file."""
        self.file.seek(0)
    # end def

    def ReadField(self, field_type=TWO_BIT_DEFAULT_TYPE):
        """Read one value of type *field_type* from the current position.

        field_type -- a key of ELEMENT_LENGTHS, which gives the size in
                      bytes of one element of that type

        Returns the first value of the tuple produced by
        ConvertBytesToNumbers().
        Raises TwoBitError if *field_type* is not a key of ELEMENT_LENGTHS.
        """
        if (field_type not in ELEMENT_LENGTHS):
            raise TwoBitError(
                "invalid number type: %s. Valid types: %s" %
                (field_type, ", ".join(sorted(ELEMENT_LENGTHS.keys()))))
        # end if
        # read the number of bytes needed for one element of the type
        num_bytes = ELEMENT_LENGTHS[field_type]
        # bytes_str is a string of chars (each char is one byte)
        bytes_str = self.file.read(num_bytes)
        # convert to a tuple of numbers
        bytes_tup = ConvertBytesToNumbers(bytes_str, self.byte_swap,
                                          field_type)
        # return the first value of the tuple
        return bytes_tup[0]
    # end def

    def ReadChars(self, num_chars):
        """Read and return *num_chars* bytes (1 byte per char), undecoded."""
        return self.file.read(num_chars)
    # end def

    def ReadArray(self, array_size, element_type=TWO_BIT_DEFAULT_TYPE):
        """Read *array_size* consecutive values of type *element_type*.

        array_size   -- number of elements to read
        element_type -- a key of ELEMENT_LENGTHS

        Returns the tuple produced by ConvertBytesToNumbers().
        Raises TwoBitError if *element_type* is not a key of
        ELEMENT_LENGTHS.
        """
        if (element_type not in ELEMENT_LENGTHS):
            # report the offending element_type (the parameter of this
            # method), not the parameter name used by ReadField()
            raise TwoBitError(
                "invalid number type: %s. Valid types: %s" %
                (element_type, ", ".join(sorted(ELEMENT_LENGTHS.keys()))))
        # end if
        # read enough bytes for the specified number of elements
        # of the specified type
        num_bytes = array_size * ELEMENT_LENGTHS[element_type]
        # bytes_str is a string of chars (each char is one byte)
        bytes_str = self.file.read(num_bytes)
        # convert to a tuple of numbers and return the whole tuple
        return ConvertBytesToNumbers(bytes_str, self.byte_swap, element_type)
    # end def

    def ReadHeader(self):
        """Parse the file header: signature, version, sequence count."""
        # read the signature and check whether we need to byte-swap
        self.ReadSignature()
        # read the version number and check that it is valid
        self.ReadVersion()
        # read the number of sequences
        self.ReadSequenceCount()
        # skip the reserved field
        self.ReadField()
    # end def

    def ReadSignature(self):
        """Read the signature field and decide whether to byte-swap.

        The signature is first read without byte-swapping.  If it does not
        equal TWO_BIT_SIG the file is rewound and the field is read again
        with byte-swapping enabled.

        Raises TwoBitError if neither reading matches TWO_BIT_SIG.
        """
        # assume we do not need to byte-swap
        self.byte_swap = False
        # get the signature
        sig = self.ReadField()
        if (self.debug):
            ErrMsg("signature: %i" % TWO_BIT_SIG)
            ErrMsg("sig read: %i" % sig)
        # end if
        # check the signature (to see if we need to byte-swap)
        if (TWO_BIT_SIG != sig):
            self.ResetFile()
            self.byte_swap = True
            sig = self.ReadField()
            if (self.debug):
                ErrMsg("enabling byte-swapping")
                ErrMsg("sig read: %i" % sig)
            # end if
            if (TWO_BIT_SIG != sig):
                raise TwoBitError("invalid signature: %i. Should be %i." %
                                  (sig, TWO_BIT_SIG))
            # end if
        # end if
    # end def

    def ReadVersion(self):
        """Read the version field; raise TwoBitError unless TWO_BIT_VER."""
        version = self.ReadField()
        if (TWO_BIT_VER != version):
            raise TwoBitError("invalid two-bit version: %i. Should be %i." %
                              (version, TWO_BIT_VER))
        # end if
    # end def

    def ReadSequenceCount(self):
        """Read the number of sequences in the file into self.num_seqs."""
        self.num_seqs = self.ReadField()
        if (self.debug):
            ErrMsg("Number of sequences in file: %i" % self.num_seqs)
        # end if
    # end def

    def ReadSequenceIndices(self):
        """Read self.num_seqs index entries into self.chrom_sequences."""
        self.chrom_sequences = dict()
        # range() rather than xrange() so the loop runs on Python 2 and 3
        for _ in range(self.num_seqs):
            self.ReadSequenceIndex()
        # end for
    # end def

    def ReadSequenceIndex(self):
        """Read one index entry and register the sequence it describes.

        An entry consists of a one-byte name length, the name itself and
        the offset of the sequence's header.
        """
        name_length = self.ReadField('char')  # a single byte
        sequence_name = self.ReadChars(name_length)
        header_start = self.ReadField()
        if (self.debug):
            ErrMsg("Name length: %i" % name_length)
            ErrMsg("Name: %s" % sequence_name)
            ErrMsg("Offset: %i" % header_start)
        # end if
        new_sequence = \
            TwoBitSequenceCls(sequence_name, header_start, debug=self.debug)
        self.chrom_sequences[sequence_name] = new_sequence
    # end def

    def ReadSequenceHeaders(self):
        """Read the header of every sequence in self.chrom_sequences."""
        # values() rather than itervalues() so this runs on Python 2 and 3
        for sequence in self.chrom_sequences.values():
            self.ReadSequenceHeader(sequence)
        # end for
    # end def

    def ReadSequenceHeader(self, sequence):
        """Read the header of *sequence* and store the results on it.

        Sets num_bases, the N-block and mask-block starts/sizes, seq_start
        and header_read on *sequence*.
        """
        if (self.debug):
            ErrMsg("Reading sequence header for %s..." % sequence.name)
        # end if
        # seek to the start of the header for the current sequence
        self.file.seek(sequence.header_start)
        # get the number of bases in the sequence
        sequence.num_bases = self.ReadField()
        # read the N-block information for the sequence
        self.ReadNBlockInfo(sequence)
        # read the mask-block information for the sequence
        self.ReadMaskBlockInfo(sequence)
        # skip the reserved field
        self.ReadField()
        # store the position in bytes from the beginning of the file of
        # the first base in the sequence
        sequence.seq_start = self.file.tell()
        # mark that the header for the sequence has been successfully read
        sequence.header_read = True
    # end def

    def ReadNBlockInfo(self, sequence):
        """Read the N-block count, starts and sizes for *sequence*."""
        # get the number of N-blocks in the sequence
        num_blocks = self.ReadField()
        if (self.debug):
            ErrMsg("Number of N-blocks: %i" % num_blocks)
        # end if
        # read the block starts, then the block sizes
        sequence.n_block_starts = self.ReadArray(num_blocks)
        sequence.n_block_sizes = self.ReadArray(num_blocks)
    # end def

    def ReadMaskBlockInfo(self, sequence):
        """Read the mask-block count, starts and sizes for *sequence*."""
        # get the number of mask-blocks in the sequence
        num_blocks = self.ReadField()
        if (self.debug):
            ErrMsg("Number of mask-blocks: %i" % num_blocks)
        # end if
        # read the block starts, then the block sizes
        sequence.mask_block_starts = self.ReadArray(num_blocks)
        sequence.mask_block_sizes = self.ReadArray(num_blocks)
    # end def

    def GetSequence(self, chromosome, start=None, end=None, apply_mask=False):
        """Return the bases of *chromosome* between *start* and *end*.

        chromosome -- sequence name; "chr" is prepended if that is what it
                      takes to find the name in the index
        start      -- 1-based first base; None means 1
        end        -- 1-based last base; None means the sequence length
        apply_mask -- when true, the result is also passed through the
                      sequence's ApplyMaskBlocks()

        Raises TwoBitError if the query region is invalid.
        """
        # check the query region, and select the appropriate chromosome
        (chrom_seq, start, end) = \
            self.CheckQueryRegion(chromosome, start, end)
        # get the starting byte position (convert 1-based to 0-based)
        (start_byte, extra_start_bases) = chrom_seq.GetStartByte(start-1)
        # get the ending byte position (convert 1-based to 0-based)
        (end_byte, extra_end_bases) = chrom_seq.GetEndByte(end-1)
        if (self.debug):
            ErrMsg("Start byte: %i" % start_byte)
            ErrMsg("Extra start bases: %i" % extra_start_bases)
            ErrMsg("End byte: %i" % end_byte)
            ErrMsg("Extra end bases: %i" % extra_end_bases)
        # end if
        # get the sequence fragment
        seq_frag = self.GetSequenceFragment(start_byte, end_byte)
        # trim the sequence fragment as required
        seq_frag = self.TrimSequence(seq_frag, extra_start_bases,
                                     extra_end_bases)
        seq_frag = chrom_seq.ApplyNBlocks(seq_frag, start-1, end-1)
        if (apply_mask):
            seq_frag = chrom_seq.ApplyMaskBlocks(seq_frag, start-1, end-1)
        # end if
        return seq_frag
    # end def

    def CheckQueryRegion(self, chromosome, start, end):
        """Validate a query region and fill in default coordinates.

        Returns a tuple (chrom_seq, start, end) where chrom_seq is the
        TwoBitSequenceCls for the chromosome, start defaults to 1 and end
        defaults to the number of bases in the sequence.

        Raises TwoBitError listing every problem found: unknown
        chromosome, start below 1, end beyond the sequence, end before
        start.
        """
        errors = list()
        chrom_seq = None
        # check that chromosome is a valid sequence identifier
        chrom_valid = True
        if (chromosome not in self.chrom_sequences):
            # if it does not already start with "chr", try adding "chr"
            prefixed = "chr%s" % chromosome
            if (not chromosome.startswith("chr") and
                    prefixed in self.chrom_sequences):
                chromosome = prefixed
            else:
                chrom_valid = False
                errors.append("invalid chromosome: %s." % chromosome)
            # end if
        # end if
        # check that start and end are valid base coordinates
        if (start is None):
            start = 1
        elif (1 > start):
            errors.append("invalid start coordinate: "
                          "%i. Must be 1 or greater." % start)
        # end if
        if (chrom_valid):
            chrom_seq = self.chrom_sequences[chromosome]
            # check that sequence has had its header read
            if (not chrom_seq.header_read):
                self.ReadSequenceHeader(chrom_seq)
            # end if
            if (end is None):
                end = chrom_seq.num_bases
            elif (chrom_seq.num_bases < end):
                errors.append(
                    "invalid end coordinate: %i. Must be %i or less." %
                    (end, chrom_seq.num_bases))
            # end if
        # end if
        # end is still None when the chromosome was invalid and no end was
        # given; comparing or formatting it would raise TypeError and hide
        # the real error, so only compare a known end coordinate
        if (end is not None and end < start):
            errors.append(
                "invalid coordinates: "
                "start (%i) must be less than or equal to end (%i)." %
                (start, end))
        # end if
        if (errors):
            raise TwoBitError("Invalid sequence region query:\n"
                              "%s" % "\n".join(errors))
        # end if
        return (chrom_seq, start, end)
    # end def

    def GetSequenceFragment(self, start_byte, end_byte):
        """Read bytes start_byte..end_byte (inclusive) and decode to bases.

        Returns the result of passing the bytes through BytesToBitString()
        and then BitStringToBaseString().
        """
        # move to the first byte
        self.file.seek(start_byte)
        # read the correct number of bytes (both ends are inclusive)
        num_bytes = end_byte - start_byte + 1
        raw_bytes = self.ReadChars(num_bytes)
        # convert the bytes to a bit string
        bits = BytesToBitString(raw_bytes)
        # convert the bit string to bases
        bases = BitStringToBaseString(bits)
        if (self.debug):
            ErrMsg("Full sequence fragment:\n%s" % bases)
        # end if
        # return the bases
        return bases
    # end def

    def TrimSequence(self, seq, start_trim, end_trim):
        """Drop *start_trim* leading and *end_trim* trailing items of *seq*.

        Returns the trimmed sequence.
        """
        # seq[start_trim:-0] would be empty, so a zero end_trim needs an
        # open-ended slice
        if (0 == end_trim):
            trimmed_seq = seq[start_trim:]
        else:
            trimmed_seq = seq[start_trim:-end_trim]
        # end if
        if (self.debug):
            ErrMsg("Trimmed sequence:\n%s" % trimmed_seq)
        # end if
        return trimmed_seq
    # end def