def train(self, train_data, test_data):
    """Build a per-position symbol frequency table from all given sequences.

    ``train_data`` and ``test_data`` are concatenated, and every sequence in
    the combined list contributes to the counts (not only the test data).

    The result is stored in ``self.table`` as a list of rows, one row per
    position up to the longest sequence length and one column per symbol:
    ``self.table[position][symbol]`` is a ``Decimal`` holding the fraction
    of observations at ``position`` that were ``symbol``.  A position with
    no observations at all is given a uniform distribution over the
    symbols.

    Symbols are used directly as column indices, so they are assumed to be
    integers in ``range(num_symbols)`` -- TODO confirm against the callers.

    Args:
        train_data: list of training sequences.
        test_data: list of test sequences; counted together with
            ``train_data``.
    """
    sequences = train_data + test_data
    # Number of distinct symbols, excluding the stop symbol.
    num_symbols = count_unique_symbols(sequences)
    longest_sequence_len = longest_sequence_length(sequences)

    # Occurrence counters indexed as [position][symbol].
    table = [[0] * num_symbols for _ in range(longest_sequence_len)]
    self.table = table

    # Count every symbol at every position, across all sequences.
    for sequence in sequences:
        for position, symbol in enumerate(sequence):
            table[position][symbol] += 1
    # NOTE(review): the table has no column for the stop symbol and nothing
    # here counts it; an earlier comment hinted that it should be counted.
    # Confirm whether that is intentional.

    # Normalise each row in place so that it holds relative frequencies.
    for row in table:
        total = sum(row)
        if total != 0:
            denominator = Decimal(total)
            row[:] = [Decimal(count) / denominator for count in row]
        elif row:
            # No observations at this position: fall back to uniform.
            uniform = Decimal(1) / Decimal(num_symbols)
            row[:] = [uniform] * len(row)
def train(self, train_data, test_data):
    """Fit ``self.model`` by repeating EM steps until the score stops improving.

    ``train_data`` and ``test_data`` are concatenated and used together.
    The model starts from ``self.randommodel(self.num_states,
    self.num_symbols)`` and is then repeatedly replaced by the result of
    ``self.iterateEM``.  After each step the model is scored with
    ``self.computeprobabilities`` followed by ``self.loglikelihood``.

    At least two EM steps are always performed (the first one has no
    previous score to compare with).  From then on, iteration continues
    only while the score gained by the latest step exceeds ``ll_bound``.

    Side effects: sets ``self.num_symbols`` and ``self.model``; reads
    ``self.num_states``.

    Args:
        train_data: list of training sequences.
        test_data: list of test sequences; used together with
            ``train_data``.
    """
    sequences = train_data + test_data
    self.num_symbols = count_unique_symbols(sequences)

    # Minimum score improvement between consecutive EM steps required to
    # keep iterating.
    ll_bound = 10.0

    self.model = self.randommodel(self.num_states, self.num_symbols)

    # ``None`` marks "no score yet".  A numeric sentinel such as -1.0 could
    # coincide with a genuine score and force a spurious extra iteration.
    prev = None
    ll = None
    while prev is None or ll - prev > ll_bound:
        prev = ll
        self.model = self.iterateEM(self.model, sequences)
        probs = self.computeprobabilities(self.model, sequences)
        ll = self.loglikelihood(probs)
def train(self, train_data, test_data):
    """Store the training sequences and initialise this object's state.

    No fitting is done here.  The method keeps a reference to
    ``train_data`` in ``self.train_data``, records the result of
    ``utilities.count_unique_symbols(train_data)`` in ``self.num_symbols``
    and sets ``self.i`` to 0.

    Args:
        train_data: training sequences; stored as-is.
        test_data: accepted for interface compatibility; not used.
    """
    self.train_data = train_data
    symbol_count = utilities.count_unique_symbols(train_data)
    self.num_symbols = symbol_count
    # NOTE(review): presumably a cursor/counter used by other methods -- confirm.
    self.i = 0