def create_cache(self, state_count):
    """
    Create a new zipped sequence optimized for the specified number of
    states in the hidden Markov model and cache the new sequence in the
    directory managed by this instance.

    state_count -- The optimal number of states in the hidden Markov model;
                   must be positive.

    Returns the newly created zipped sequence after it has been saved.

    Raises AssertionError if state_count is not positive, or if a
    substitution reported by the new zipped sequence contradicts an entry
    already recorded in the substitution table of this instance.
    """
    assert state_count > 0
    #
    # Create the directory of zipped sequences if it does not yet exist.
    #
    if not os.path.isdir(self.__state_count_to_sequence_path):
        os.mkdir(self.__state_count_to_sequence_path)
    #
    # Reread the original sequence, and zip it, optimized for the specified
    # state count.
    #
    seq = Sequence.from_file(
        self.__original_sequence_path, self.__unzipped_alphabet_size)
    x_seq = ZipSequence.from_sequence(seq, state_count)
    #
    # Loop over the symbols introduced by zipping, i.e. those at or above
    # the original alphabet size; symbols below it belong to the original
    # alphabet and have no substitution.  If the symbol is already in the
    # substitution table, ensure the recorded pair matches; otherwise put
    # the new symbol into the substitution table of this instance.
    #
    # range() is used instead of xrange() so the loop runs unchanged on
    # both Python 2 and Python 3.
    #
    for symbol in range(self.__unzipped_alphabet_size, x_seq.alphabet_size):
        new_a, new_b = x_seq.get_substitute(symbol)
        if symbol in self.__substitutions:
            old_a, old_b = self.__substitutions[symbol]
            assert old_a == new_a
            assert old_b == new_b
            continue
        self.__substitutions[symbol] = (new_a, new_b)
    #
    # Record the number of symbols used for this state count, then save the
    # zipped sequence BEFORE saving the data structure: if writing the
    # sequence fails, the data structure on disk must not already list a
    # cached sequence for this state count.
    #
    self.__zipped_alphabet_sizes[state_count] = x_seq.alphabet_size
    x_seq_path = self.__resolve_zipped_sequence_path(state_count)
    x_seq.save(x_seq_path)
    self.__save_data_structure()
    return x_seq
def create_cache(self, state_count):
    """
    Create a new zipped sequence optimized for the specified number of
    states in the hidden Markov model and cache the new sequence in the
    directory managed by this instance.

    state_count -- The optimal number of states in the hidden Markov model;
                   must be positive.

    Returns the newly created zipped sequence after it has been saved.

    Raises AssertionError if state_count is not positive, or if a
    substitution reported by the new zipped sequence contradicts an entry
    already recorded in the substitution table of this instance.
    """
    assert state_count > 0
    #
    # Create the directory of zipped sequences if it does not yet exist.
    #
    if not os.path.isdir(self.__state_count_to_sequence_path):
        os.mkdir(self.__state_count_to_sequence_path)
    #
    # Reread the original sequence, and zip it, optimized for the specified
    # state count.
    #
    seq = Sequence.from_file(
        self.__original_sequence_path, self.__unzipped_alphabet_size)
    x_seq = ZipSequence.from_sequence(seq, state_count)
    #
    # Loop over the symbols introduced by zipping, i.e. those at or above
    # the original alphabet size; symbols below it belong to the original
    # alphabet and have no substitution.  If the symbol is already in the
    # substitution table, ensure the recorded pair matches; otherwise put
    # the new symbol into the substitution table of this instance.
    #
    # range() is used instead of xrange() so the loop runs unchanged on
    # both Python 2 and Python 3.
    #
    for symbol in range(self.__unzipped_alphabet_size, x_seq.alphabet_size):
        new_a, new_b = x_seq.get_substitute(symbol)
        if symbol in self.__substitutions:
            old_a, old_b = self.__substitutions[symbol]
            assert old_a == new_a
            assert old_b == new_b
            continue
        self.__substitutions[symbol] = (new_a, new_b)
    #
    # Record the number of symbols used for this state count, then save the
    # zipped sequence BEFORE saving the data structure: if writing the
    # sequence fails, the data structure on disk must not already list a
    # cached sequence for this state count.
    #
    self.__zipped_alphabet_sizes[state_count] = x_seq.alphabet_size
    x_seq_path = self.__resolve_zipped_sequence_path(state_count)
    x_seq.save(x_seq_path)
    self.__save_data_structure()
    return x_seq
def __init__(self, directory):
    """
    Initialize a new instance of the ZipDirectory class based on the
    specified directory.  Require that the directory contains at least the
    text file, 'original_sequence', which contains the symbols of the
    original sequence in order and separated by whitespace.

    If the directory has no 'data_structure' file yet, the original
    sequence is read and an initial data structure is saved; otherwise the
    existing data structure is parsed and validated.

    directory -- The directory to manage.

    Raises AssertionError if the directory or the original sequence file is
    missing, or if the stored data structure fails any validation below.
    """
    assert os.path.isdir(directory)
    #
    # All ZipHMM directories must contain the 'original_sequence' file.
    #
    self.__path = directory
    assert os.path.isfile(self.__original_sequence_path)
    #
    # If the 'data_structure' file does not exist, this is an uninitialized
    # directory; read the original sequence and initially store the data
    # structure.
    #
    if not os.path.isfile(self.__data_structure_path):
        seq = Sequence.from_file(self.__original_sequence_path)
        self.__unzipped_alphabet_size = seq.alphabet_size
        self.__unzipped_sequence_length = len(seq)
        self.__zipped_alphabet_sizes = dict()
        self.__substitutions = dict()
        self.__save_data_structure()
        return
    #
    # Read the original alphabet size; only 16-bit values are allowed, and
    # there must be at least one symbol in the alphabet.
    #
    scanner = TokenScanner(self.__data_structure_path)
    scanner.require('orig_alphabet_size')
    self.__unzipped_alphabet_size = scanner.read_int()
    assert 1 <= self.__unzipped_alphabet_size < 65536
    #
    # Read the original sequence length and assume this still matches the
    # data in the original sequence itself.
    #
    scanner.require('orig_seq_length')
    self.__unzipped_sequence_length = scanner.read_int()
    assert 1 <= self.__unzipped_sequence_length
    #
    # Read the number of symbols compressed for various state counts; this
    # will also define the maximum symbol value.  The section ends at the
    # 'symbol2pair' keyword, or when peek() yields None.
    #
    scanner.require('nStates2alphabet_size')
    self.__zipped_alphabet_sizes = dict()
    max_alphabet_size = 0
    while True:
        token = scanner.peek()
        if token is None or token == 'symbol2pair':
            break
        #
        # Read the state count; duplicate entries are not allowed.
        #
        state_count = scanner.read_int()
        assert state_count > 0
        assert state_count not in self.__zipped_alphabet_sizes
        #
        # Read the alphabet size.
        #
        alphabet_size = scanner.read_int()
        assert alphabet_size > 0
        max_alphabet_size = max(alphabet_size, max_alphabet_size)
        #
        # Store the state count and alphabet size into the table used to
        # decompress zipped sequences.
        #
        self.__zipped_alphabet_sizes[state_count] = alphabet_size
    #
    # Read the substitution table.
    #
    self.__substitutions = dict()
    scanner.require('symbol2pair')
    while scanner.peek() is not None:
        #
        # Read the new symbol; disallow duplicate entries for new symbols,
        # ensure the new symbol is not in the original alphabet, and ensure
        # the new symbol is smaller than the largest recorded alphabet
        # size from the previous section.
        #
        new_symbol = scanner.read_int()
        assert new_symbol not in self.__substitutions
        assert new_symbol >= self.__unzipped_alphabet_size
        assert new_symbol < max_alphabet_size
        #
        # Read the two symbols; both must be smaller than the largest
        # recorded alphabet size.
        #
        a = scanner.read_int()
        b = scanner.read_int()
        assert a < max_alphabet_size
        assert b < max_alphabet_size
        #
        # Record the entry in the substitution table, new_symbol => (a, b).
        #
        self.__substitutions[new_symbol] = (a, b)
    #
    # All (a, b) symbols in the substitution table must either be from the
    # original alphabet or be present as a new symbol in the substitution
    # table.  A generator expression is used (rather than a lambda with
    # tuple parameters and iteritems()) so the check is valid syntax on
    # both Python 2 and Python 3 and stops at the first violation.
    #
    original_size = self.__unzipped_alphabet_size
    table = self.__substitutions
    assert all(
        (a_ < original_size or a_ in table) and
        (b_ < original_size or b_ in table)
        for a_, b_ in table.values())
def __init__(self, directory):
    """
    Initialize a new instance of the ZipDirectory class based on the
    specified directory.  Require that the directory contains at least the
    text file, 'original_sequence', which contains the symbols of the
    original sequence in order and separated by whitespace.

    If the directory has no 'data_structure' file yet, the original
    sequence is read and an initial data structure is saved; otherwise the
    existing data structure is parsed and validated.

    directory -- The directory to manage.

    Raises AssertionError if the directory or the original sequence file is
    missing, or if the stored data structure fails any validation below.
    """
    assert os.path.isdir(directory)
    #
    # All ZipHMM directories must contain the 'original_sequence' file.
    #
    self.__path = directory
    assert os.path.isfile(self.__original_sequence_path)
    #
    # If the 'data_structure' file does not exist, this is an uninitialized
    # directory; read the original sequence and initially store the data
    # structure.
    #
    if not os.path.isfile(self.__data_structure_path):
        seq = Sequence.from_file(self.__original_sequence_path)
        self.__unzipped_alphabet_size = seq.alphabet_size
        self.__unzipped_sequence_length = len(seq)
        self.__zipped_alphabet_sizes = dict()
        self.__substitutions = dict()
        self.__save_data_structure()
        return
    #
    # Read the original alphabet size; only 16-bit values are allowed, and
    # there must be at least one symbol in the alphabet.
    #
    scanner = TokenScanner(self.__data_structure_path)
    scanner.require('orig_alphabet_size')
    self.__unzipped_alphabet_size = scanner.read_int()
    assert 1 <= self.__unzipped_alphabet_size < 65536
    #
    # Read the original sequence length and assume this still matches the
    # data in the original sequence itself.
    #
    scanner.require('orig_seq_length')
    self.__unzipped_sequence_length = scanner.read_int()
    assert 1 <= self.__unzipped_sequence_length
    #
    # Read the number of symbols compressed for various state counts; this
    # will also define the maximum symbol value.  The section ends at the
    # 'symbol2pair' keyword, or when peek() yields None.
    #
    scanner.require('nStates2alphabet_size')
    self.__zipped_alphabet_sizes = dict()
    max_alphabet_size = 0
    while True:
        token = scanner.peek()
        if token is None or token == 'symbol2pair':
            break
        #
        # Read the state count; duplicate entries are not allowed.
        #
        state_count = scanner.read_int()
        assert state_count > 0
        assert state_count not in self.__zipped_alphabet_sizes
        #
        # Read the alphabet size.
        #
        alphabet_size = scanner.read_int()
        assert alphabet_size > 0
        max_alphabet_size = max(alphabet_size, max_alphabet_size)
        #
        # Store the state count and alphabet size into the table used to
        # decompress zipped sequences.
        #
        self.__zipped_alphabet_sizes[state_count] = alphabet_size
    #
    # Read the substitution table.
    #
    self.__substitutions = dict()
    scanner.require('symbol2pair')
    while scanner.peek() is not None:
        #
        # Read the new symbol; disallow duplicate entries for new symbols,
        # ensure the new symbol is not in the original alphabet, and ensure
        # the new symbol is smaller than the largest recorded alphabet
        # size from the previous section.
        #
        new_symbol = scanner.read_int()
        assert new_symbol not in self.__substitutions
        assert new_symbol >= self.__unzipped_alphabet_size
        assert new_symbol < max_alphabet_size
        #
        # Read the two symbols; both must be smaller than the largest
        # recorded alphabet size.
        #
        a = scanner.read_int()
        b = scanner.read_int()
        assert a < max_alphabet_size
        assert b < max_alphabet_size
        #
        # Record the entry in the substitution table, new_symbol => (a, b).
        #
        self.__substitutions[new_symbol] = (a, b)
    #
    # All (a, b) symbols in the substitution table must either be from the
    # original alphabet or be present as a new symbol in the substitution
    # table.  A generator expression is used (rather than a lambda with
    # tuple parameters and iteritems()) so the check is valid syntax on
    # both Python 2 and Python 3 and stops at the first violation.
    #
    original_size = self.__unzipped_alphabet_size
    table = self.__substitutions
    assert all(
        (a_ < original_size or a_ in table) and
        (b_ < original_size or b_ in table)
        for a_, b_ in table.values())