예제 #1
0
    def create_cache(self, state_count):
        """
        Build, cache, and return the zipped sequence for one state count.

        The original sequence is reread and zipped for the requested number of
        hidden Markov model states.  Substitutions used by the zipped sequence
        are merged into the substitution table of this instance, the zipped
        alphabet size is recorded for the state count, and both the data
        structure and the zipped sequence are saved.

        state_count -- The optimal number of states in the hidden Markov model;
                       must be positive.

        Returns the newly created zipped sequence.
        """

        assert state_count > 0

        #
        # The zipped sequences are kept in their own directory; create it the
        # first time it is needed.
        #
        cache_directory = self.__state_count_to_sequence_path
        if not os.path.isdir(cache_directory):
            os.mkdir(cache_directory)

        #
        # Load the original sequence again and zip it for the requested state
        # count.
        #
        original = Sequence.from_file(self.__original_sequence_path,
                                      self.__unzipped_alphabet_size)
        zipped = ZipSequence.from_sequence(original, state_count)

        #
        # Symbols below the original alphabet size are skipped, so start the
        # walk at the first symbol above them.  A symbol already present in
        # the substitution table must map to the same pair; any other symbol
        # is added to the table.
        #
        first_new_symbol = self.__unzipped_alphabet_size
        for symbol in xrange(first_new_symbol, zipped.alphabet_size):
            new_a, new_b = zipped.get_substitute(symbol)
            if symbol in self.__substitutions:
                known_a, known_b = self.__substitutions[symbol]
                assert known_a == new_a
                assert known_b == new_b
            else:
                self.__substitutions[symbol] = (new_a, new_b)

        #
        # Remember how many symbols this state count uses, persist the data
        # structure, and then persist the zipped sequence itself.
        #
        self.__zipped_alphabet_sizes[state_count] = zipped.alphabet_size
        self.__save_data_structure()
        zipped.save(self.__resolve_zipped_sequence_path(state_count))
        return zipped
예제 #2
0
    def create_cache(self, state_count):
        """
        Zip the original sequence for the given number of hidden Markov model
        states, store the result in the directory managed by this instance,
        and return it.

        Along the way, the substitutions of the zipped sequence are merged
        into the substitution table of this instance and the zipped alphabet
        size is recorded under the state count.

        state_count -- The optimal number of states in the hidden Markov model;
                       must be positive.

        Returns the zipped sequence that was created and saved.
        """

        assert state_count > 0

        #
        # Make sure the directory holding the zipped sequences exists.
        #
        sequence_directory = self.__state_count_to_sequence_path
        if not os.path.isdir(sequence_directory):
            os.mkdir(sequence_directory)

        #
        # Read the original sequence from disk and zip it for this state
        # count.
        #
        unzipped = Sequence.from_file(
            self.__original_sequence_path,
            self.__unzipped_alphabet_size)
        result = ZipSequence.from_sequence(unzipped, state_count)

        #
        # Only symbols at or above the original alphabet size are examined.
        # Each one either agrees with the pair already recorded for it in the
        # substitution table or is recorded now.
        #
        lowest_new = self.__unzipped_alphabet_size
        for symbol in xrange(lowest_new, result.alphabet_size):
            new_a, new_b = result.get_substitute(symbol)
            if symbol not in self.__substitutions:
                self.__substitutions[symbol] = (new_a, new_b)
            else:
                recorded_a, recorded_b = self.__substitutions[symbol]
                assert recorded_a == new_a
                assert recorded_b == new_b

        #
        # Record the zipped alphabet size for this state count, write the
        # updated data structure, then write the zipped sequence.
        #
        self.__zipped_alphabet_sizes[state_count] = result.alphabet_size
        self.__save_data_structure()
        target_path = self.__resolve_zipped_sequence_path(state_count)
        result.save(target_path)
        return result
예제 #3
0
    def __init__(self, directory):
        """
        Initialize a new instance of the ZipDirectory class based on the
        specified directory.  Require that the directory contains at least the
        text file, 'original_sequence', which contains the symbols of the
        original sequence in order and separated by whitespace.

        If the 'data_structure' file does not exist yet, the original sequence
        is read, empty tables are created, and the data structure is saved.
        Otherwise the data structure file is parsed and validated.

        directory -- The directory to manage.

        Raises AssertionError (when assertions are enabled) if the directory,
        the original sequence file, or the contents of the data structure file
        fail validation.
        """

        assert os.path.isdir(directory)

        #
        # All ZipHMM directories must contain the 'original_sequence' file.
        #
        self.__path = directory
        assert os.path.isfile(self.__original_sequence_path)

        #
        # If the 'data_structure' file does not exist, this is an uninitialized
        # directory; read the original sequence and initially store the data
        # structure.
        #
        if not os.path.isfile(self.__data_structure_path):
            seq = Sequence.from_file(self.__original_sequence_path)
            self.__unzipped_alphabet_size = seq.alphabet_size
            self.__unzipped_sequence_length = len(seq)
            self.__zipped_alphabet_sizes = dict()
            self.__substitutions = dict()
            self.__save_data_structure()
            return

        #
        # Read the original alphabet size; only 16-bit values are allowed, and
        # there must be at least one symbol in the alphabet.
        #
        scanner = TokenScanner(self.__data_structure_path)
        scanner.require('orig_alphabet_size')
        self.__unzipped_alphabet_size = scanner.read_int()
        assert 1 <= self.__unzipped_alphabet_size < 65536

        #
        # Read the original sequence length and assume this still matches the
        # data in the original sequence itself.
        #
        scanner.require('orig_seq_length')
        self.__unzipped_sequence_length = scanner.read_int()
        assert 1 <= self.__unzipped_sequence_length

        #
        # Read the number of symbols compressed for various state counts; this
        # will also define the maximum symbol value.
        #
        scanner.require('nStates2alphabet_size')
        self.__zipped_alphabet_sizes = dict()
        max_alphabet_size = 0
        while True:
            token = scanner.peek()
            if token is None or token == 'symbol2pair':
                break

            #
            # Read the state count; duplicate entries are not allowed.
            #
            state_count = scanner.read_int()
            assert state_count > 0
            assert state_count not in self.__zipped_alphabet_sizes

            #
            # Read the alphabet size.
            #
            alphabet_size = scanner.read_int()
            assert alphabet_size > 0
            max_alphabet_size = max(alphabet_size, max_alphabet_size)

            #
            # Store the state count and alphabet size into the table used to
            # decompress zipped sequences.
            #
            self.__zipped_alphabet_sizes[state_count] = alphabet_size

        #
        # Read the substitution table.
        #
        self.__substitutions = dict()
        scanner.require('symbol2pair')
        while scanner.peek() is not None:
            #
            # Read the new symbol; disallow duplicate entries for new symbols,
            # ensure the new symbol is not in the original alphabet, and ensure
            # the new symbol is smaller than the largest recorded alphabet
            # size from the previous section.
            #
            new_symbol = scanner.read_int()
            assert new_symbol not in self.__substitutions
            assert new_symbol >= self.__unzipped_alphabet_size
            assert new_symbol < max_alphabet_size

            #
            # Read the two symbols; each must be smaller than the largest
            # recorded alphabet size.  Negative values are rejected as well;
            # without the lower bound they would pass this check and the final
            # consistency check below.
            #
            a = scanner.read_int()
            b = scanner.read_int()
            assert 0 <= a < max_alphabet_size
            assert 0 <= b < max_alphabet_size

            #
            # Record the entry in the substitution table, new_symbol => (a, b).
            #
            self.__substitutions[new_symbol] = (a, b)

        #
        # Every symbol of every (a, b) pair in the substitution table must
        # either be from the original alphabet or be present as a new symbol
        # in the substitution table.  A generator expression is used instead
        # of a tuple-unpacking lambda so the check stops at the first bad pair
        # and the syntax is valid on both Python 2 and Python 3.
        #
        original_size = self.__unzipped_alphabet_size
        table = self.__substitutions
        assert all(
            (a_ < original_size or a_ in table) and
            (b_ < original_size or b_ in table)
            for a_, b_ in table.values())
예제 #4
0
    def __init__(self, directory):
        """
        Initialize a new instance of the ZipDirectory class based on the
        specified directory.  Require that the directory contains at least the
        text file, 'original_sequence', which contains the symbols of the
        original sequence in order and separated by whitespace.

        If the 'data_structure' file does not exist yet, the original sequence
        is read, empty tables are created, and the data structure is saved.
        Otherwise the data structure file is parsed and validated.

        directory -- The directory to manage.

        Raises AssertionError (when assertions are enabled) if the directory,
        the original sequence file, or the contents of the data structure file
        fail validation.
        """

        assert os.path.isdir(directory)

        #
        # All ZipHMM directories must contain the 'original_sequence' file.
        #
        self.__path = directory
        assert os.path.isfile(self.__original_sequence_path)

        #
        # If the 'data_structure' file does not exist, this is an uninitialized
        # directory; read the original sequence and initially store the data
        # structure.
        #
        if not os.path.isfile(self.__data_structure_path):
            seq = Sequence.from_file(self.__original_sequence_path)
            self.__unzipped_alphabet_size = seq.alphabet_size
            self.__unzipped_sequence_length = len(seq)
            self.__zipped_alphabet_sizes = dict()
            self.__substitutions = dict()
            self.__save_data_structure()
            return

        #
        # Read the original alphabet size; only 16-bit values are allowed, and
        # there must be at least one symbol in the alphabet.
        #
        scanner = TokenScanner(self.__data_structure_path)
        scanner.require('orig_alphabet_size')
        self.__unzipped_alphabet_size = scanner.read_int()
        assert 1 <= self.__unzipped_alphabet_size < 65536

        #
        # Read the original sequence length and assume this still matches the
        # data in the original sequence itself.
        #
        scanner.require('orig_seq_length')
        self.__unzipped_sequence_length = scanner.read_int()
        assert 1 <= self.__unzipped_sequence_length

        #
        # Read the number of symbols compressed for various state counts; this
        # will also define the maximum symbol value.
        #
        scanner.require('nStates2alphabet_size')
        self.__zipped_alphabet_sizes = dict()
        max_alphabet_size = 0
        while True:
            token = scanner.peek()
            if token is None or token == 'symbol2pair':
                break

            #
            # Read the state count; duplicate entries are not allowed.
            #
            state_count = scanner.read_int()
            assert state_count > 0
            assert state_count not in self.__zipped_alphabet_sizes

            #
            # Read the alphabet size.
            #
            alphabet_size = scanner.read_int()
            assert alphabet_size > 0
            max_alphabet_size = max(alphabet_size, max_alphabet_size)

            #
            # Store the state count and alphabet size into the table used to
            # decompress zipped sequences.
            #
            self.__zipped_alphabet_sizes[state_count] = alphabet_size

        #
        # Read the substitution table.
        #
        self.__substitutions = dict()
        scanner.require('symbol2pair')
        while scanner.peek() is not None:
            #
            # Read the new symbol; disallow duplicate entries for new symbols,
            # ensure the new symbol is not in the original alphabet, and ensure
            # the new symbol is smaller than the largest recorded alphabet
            # size from the previous section.
            #
            new_symbol = scanner.read_int()
            assert new_symbol not in self.__substitutions
            assert new_symbol >= self.__unzipped_alphabet_size
            assert new_symbol < max_alphabet_size

            #
            # Read the two symbols; each must be smaller than the largest
            # recorded alphabet size.  Negative values are rejected as well;
            # without the lower bound they would pass this check and the final
            # consistency check below.
            #
            a = scanner.read_int()
            b = scanner.read_int()
            assert 0 <= a < max_alphabet_size
            assert 0 <= b < max_alphabet_size

            #
            # Record the entry in the substitution table, new_symbol => (a, b).
            #
            self.__substitutions[new_symbol] = (a, b)

        #
        # Every symbol of every (a, b) pair in the substitution table must
        # either be from the original alphabet or be present as a new symbol
        # in the substitution table.  A generator expression is used instead
        # of a tuple-unpacking lambda so the check stops at the first bad pair
        # and the syntax is valid on both Python 2 and Python 3.
        #
        original_size = self.__unzipped_alphabet_size
        table = self.__substitutions
        assert all(
            (a_ < original_size or a_ in table) and
            (b_ < original_size or b_ in table)
            for a_, b_ in table.values())