Пример #1
0
class MutualBootStrapper:
    """Grow a seed lexicon and a set of extraction patterns by bootstrapping.

    A mutual-bootstrapping round collects context patterns around the words
    currently in the lexicon, counts which words every known pattern
    extracts, and promotes the best-scoring pattern.  A meta-bootstrapping
    round then moves the five best-scoring candidate words into the
    permanent lexicon and starts over with a fresh temporary lexicon.
    """

    def __init__(self, data, seeds, patterns=None, processing=1):
        """Set up the lexicon, the pattern alphabet and the data view.

        Args:
            data: raw text entries when ``processing == 0``; entries that are
                already split into sentences of tokens/chunks when
                ``processing == 1``.
            seeds: iterable of initial lexicon words.
            patterns: optional iterable of patterns to pre-load into the
                pattern alphabet.
            processing: 0 to tokenize and POS-tag ``data`` here, 1 to use
                ``data`` as pre-chunked input.

        Raises:
            ValueError: if ``processing`` is neither 0 nor 1 (previously the
                object was left without ``find_patterns``/``find_seeds``).
        """
        if processing == 0:
            tokenized = self.tokenize(data)
            self.pos_tagged_data = self.pos_tag(tokenized)
            self.find_patterns = self.find_patterns_tagged
            self.find_seeds = self.find_seeds_tagged
        elif processing == 1:
            self.chunked_data = data
            self.find_patterns = self.find_patterns_chunked
            self.find_seeds = self.find_seeds_chunked
        else:
            raise ValueError(
                "processing must be 0 (raw text) or 1 (chunked data), got %r"
                % (processing,))

        self.permanent_lexicon = set(seeds)
        # Built from the permanent lexicon so a one-shot iterable of seeds
        # is only consumed once.
        self._reset_temporary_lexicon()
        self.best_extraction_patterns = set()
        self.pattern_alphabet = Alphabet()
        if patterns is not None:
            for pattern in patterns:
                self.pattern_alphabet.add(pattern)
        self.n_counter_sets = None  # important for getting candidate seeds
        self.f_counter_sets = None
        self.n_pattern_array = None
        self.f_pattern_array = None
        self.first_pattern_words = set()

    # ------------------------------------------------------------------
    # small private helpers
    # ------------------------------------------------------------------
    def _reset_temporary_lexicon(self):
        """Map every permanent lexicon word to a fresh, empty pattern set."""
        self.temporary_lexicon = defaultdict(set)
        for word in self.permanent_lexicon:
            self.temporary_lexicon[word] = set()

    def _announce(self, message):
        """Write *message* and a trailing space to stdout, without newline."""
        import sys  # local import keeps this block independent of the header
        sys.stdout.write(message + " ")

    def _context_windows(self, tokens, index, size):
        """Yield each window of up to ``size + 1`` tokens covering ``index``.

        Windows that would start before the beginning of ``tokens`` are
        skipped explicitly instead of relying on negative slice indices;
        windows running past the end are truncated by slicing.
        """
        for start in range(max(index - size, 0), index + 1):
            yield tuple(tokens[start:start + size + 1])

    def _mask_and_flatten(self, sentence, index):
        """Replace ``sentence[index]`` by "<x>" and flatten all chunks.

        Returns:
            ``(tokens, position)``: the flat token list and the position of
            the first "<x>" in it.
        """
        masked = list(sentence)
        masked[index] = ("<x>",)
        tokens = self._flatten_chunks(masked)
        return tokens, tokens.index("<x>")

    def _record_match(self, pattern, candidate_seed):
        """Count ``candidate_seed`` as an extraction of ``pattern``."""
        pattern_index = self.pattern_alphabet.get_index(pattern)
        self.n_counter_sets[pattern_index].add(candidate_seed)
        if candidate_seed not in self.temporary_lexicon:
            self.f_counter_sets[pattern_index].add(candidate_seed)

    # ------------------------------------------------------------------
    # preprocessing
    # ------------------------------------------------------------------
    def tokenize(self, text):
        """Tokenize every entry of ``text``; returns one nested list per entry."""
        self._announce("tokenizing...")
        all_entries = [self._nested_tokenize(entry) for entry in text]
        print("[DONE]")
        return all_entries

    def _nested_tokenize(self, untokenized_sentences):
        """Split one entry into sentences, then words, and normalise them."""
        sentences = nltk.sent_tokenize(untokenized_sentences)
        tokenized_words = [nltk.word_tokenize(sent) for sent in sentences]
        self._postprocess_tokenized_text(tokenized_words)
        return tokenized_words

    def _postprocess_tokenized_text(self, tokenized):
        """Lower-case every word in place and pad each "/" with spaces.

        ``tokenized`` (a list of word lists) is mutated; nothing is returned.
        The slash padding is applied to the lower-cased word, so tokens such
        as "A/B" become "a / b" rather than keeping their original case.
        """
        for sentence in tokenized:
            for position, word in enumerate(sentence):
                sentence[position] = word.lower().replace("/", " / ")

    def pos_tag(self, tokenized_data):
        """POS-tag every sentence and prepend a ("<START>", "<START>") token."""
        self._announce("POS tagging... ")
        pos_tagged_data = [
            [[("<START>", "<START>")] + list(nltk.pos_tag(sentence))
             for sentence in entry]
            for entry in tokenized_data
        ]
        print("[DONE]")
        return pos_tagged_data

    # ------------------------------------------------------------------
    # pattern discovery
    # ------------------------------------------------------------------
    def build_patterns_tagged(self, sentence, index, size):
        """Add the context patterns around ``sentence[index]`` to the alphabet.

        The word at ``index`` is replaced by "<x>".  Every window with more
        than one token is registered as a pattern, and its first non-"<x>"
        token among the first two is remembered in ``first_pattern_words``.
        """
        words = [token[0] for token in sentence]
        words[index] = "<x>"
        for candidate in self._context_windows(words, index, size):
            if len(candidate) > 1:
                self.pattern_alphabet.add(candidate)
                if candidate[0] != "<x>":
                    self.first_pattern_words.add(candidate[0])
                else:
                    self.first_pattern_words.add(candidate[1])

    def find_patterns_tagged(self):
        """Build patterns around every lexicon word in the POS-tagged data."""
        for entry in self.pos_tagged_data:
            for sentence in entry:
                for i, (word, _tag) in enumerate(sentence):
                    if word in self.temporary_lexicon:
                        self.build_patterns_tagged(sentence, i, 2)
                        self.build_patterns_tagged(sentence, i, 1)

    def find_patterns_chunked(self):
        """Build patterns around every chunk whose head is in the lexicon."""
        for entry in self.chunked_data:
            for sentence in entry:
                for i, constituent in enumerate(sentence):
                    if (isinstance(constituent, Chunk)
                            and constituent.head in self.temporary_lexicon):
                        self.build_patterns_chunked(sentence, i, 2)
                        self.build_patterns_chunked(sentence, i, 1)

    def build_patterns_chunked(self, sentence, index, size):
        """Add the context patterns around the chunk at ``index``."""
        tokens, position = self._mask_and_flatten(sentence, index)
        for candidate in self._context_windows(tokens, position, size):
            if len(candidate) > 1:
                self.pattern_alphabet.add(candidate)

    def _flatten_chunks(self, sentence):
        """Return a flat token list: chunk tokens, else item[0] of each item."""
        flattened_sentence = []
        for constituent in sentence:
            if isinstance(constituent, Chunk):
                flattened_sentence.extend(constituent.tokens)
            else:
                flattened_sentence.append(constituent[0])
        return flattened_sentence

    # ------------------------------------------------------------------
    # seed discovery
    # ------------------------------------------------------------------
    def set_counter_arrays(self):
        """Create one empty N set and one empty F set per known pattern."""
        n_patterns = self.pattern_alphabet.size()
        # Independent set objects per slot -- never share one instance.
        self.n_counter_sets = [set() for _ in range(n_patterns)]
        self.f_counter_sets = [set() for _ in range(n_patterns)]

    def find_seeds_chunked(self):
        """Match the known patterns around every chunk in the chunked data."""
        for entry in self.chunked_data:
            for sentence in entry:
                for i, constituent in enumerate(sentence):
                    if isinstance(constituent, Chunk):
                        self.match_pattern_chunked(sentence, i, 2)
                        self.match_pattern_chunked(sentence, i, 1)

    def match_pattern_chunked(self, sentence, index, size):
        """Record the head of the chunk at ``index`` for each matching pattern.

        Heads of two characters or fewer are never recorded.
        """
        candidate_seed = sentence[index].head
        tokens, position = self._mask_and_flatten(sentence, index)
        for pattern in self._context_windows(tokens, position, size):
            if (len(pattern) > 1
                    and self.pattern_alphabet.has_label(pattern)
                    and len(candidate_seed) > 2):
                self._record_match(pattern, candidate_seed)

    def find_seeds_tagged(self):
        """Match patterns around every word that can start a known pattern."""
        for entry in self.pos_tagged_data:
            for sentence in entry:
                for i, token in enumerate(sentence):
                    if token[0] in self.first_pattern_words:
                        self.match_pattern_tagged(sentence, i, 3)
                        self.match_pattern_tagged(sentence, i, 2)

    def match_pattern_tagged(self, sentence, index, size):
        """Try each position of the window starting at ``index - 1`` as "<x>".

        A candidate is recorded when the masked window is a known pattern,
        the candidate's tag starts with "NN" and the word is longer than two
        characters.
        """
        window_start = index - 1
        if window_start < 0:
            # NOTE(review): for index 0 the original negative slice start
            # never produced a usable window either, so patterns anchored on
            # the first token are never matched -- confirm this is intended.
            return
        window = sentence[window_start:index + size - 1]
        words = [token[0] for token in window]
        for position, (word, tag) in enumerate(window):
            pattern = tuple(words[:position] + ["<x>"] + words[position + 1:])
            if (len(pattern) > 1
                    and self.pattern_alphabet.has_label(pattern)
                    and tag.startswith("NN")
                    and len(word) > 2):
                self._record_match(pattern, word)

    # ------------------------------------------------------------------
    # scoring
    # ------------------------------------------------------------------
    def calculate_pattern_scores(self):
        """Score every pattern as ``(F / N) * log2(F)`` with add-one counts.

        N is the number of distinct words a pattern extracted, F the number
        of those that are not in the temporary lexicon; both get ``+ 1``.
        """
        self.n_pattern_array = numpy.array(
            [len(words) for words in self.n_counter_sets], dtype=float) + 1.
        self.f_pattern_array = numpy.array(
            [len(words) for words in self.f_counter_sets], dtype=float) + 1.

        ratio = self.f_pattern_array / self.n_pattern_array
        self.pattern_scores = numpy.nan_to_num(
            ratio * numpy.log2(self.f_pattern_array))

    def calculate_seed_scores(self):
        """Score each temporary-lexicon word from the patterns that found it.

        The score is the sum of ``1 + 0.01 * pattern_score`` over the word's
        matched patterns; words without patterns score 0.
        """
        self.candidate_seed_scores = {}
        for candidate_seed, matched in self.temporary_lexicon.items():
            matched_patterns = list(matched)
            score = numpy.sum(
                (self.pattern_scores[matched_patterns] * 0.01) + 1)
            self.candidate_seed_scores[candidate_seed] = score

    def cull_candidates(self):
        """Return the (up to) five best-scoring candidate words.

        Returns:
            A tuple of words ordered by descending score, or ``[]`` when
            there are no candidates at all.
        """
        self.calculate_pattern_scores()
        self.calculate_seed_scores()
        ranked = sorted(
            ((score, word)
             for word, score in self.candidate_seed_scores.items()),
            reverse=True)
        if not ranked:
            return []
        return tuple(word for _score, word in ranked[:5])

    # ------------------------------------------------------------------
    # bootstrapping loops
    # ------------------------------------------------------------------
    def run_mutual_bootstrapping(self):
        """Repeatedly promote the best not-yet-selected pattern.

        Each round re-collects patterns and counts, picks the highest-scoring
        pattern that has not been selected yet and adds everything it
        extracted to the temporary lexicon.  The loop ends when the best
        remaining score drops below 0.7, when no patterns remain, or once at
        least 10 patterns were added and the last best score is <= 1.8.
        """
        added_patterns = 0
        best_score = 5
        while added_patterns < 10 or best_score > 1.8:
            self.find_patterns()
            self.set_counter_arrays()
            self.find_seeds()
            self.calculate_pattern_scores()

            if self.pattern_scores.size == 0:
                # nanargmax would raise on an empty score array
                return

            # Mask every already-selected pattern up front.  If all patterns
            # are selected the best score is the mask value and the check
            # below ends the loop instead of spinning forever.
            for chosen in self.best_extraction_patterns:
                self.pattern_scores[chosen] = -10000000.
            best_pattern_index = int(numpy.nanargmax(self.pattern_scores))

            best_score = self.pattern_scores[best_pattern_index]
            if best_score < 0.7:
                return

            self.best_extraction_patterns.add(best_pattern_index)
            for seed in self.n_counter_sets[best_pattern_index]:
                self.temporary_lexicon[seed].add(best_pattern_index)
            added_patterns += 1

    def run_meta_bootstrapping(self):
        """Promote the best candidates and reset the temporary lexicon."""
        best_five = self.cull_candidates()
        self.permanent_lexicon.update(best_five)
        self._reset_temporary_lexicon()

    def run(self, num_iterations=50):
        """Run ``num_iterations`` mutual + meta bootstrapping rounds."""
        for i in range(num_iterations):
            print("Iteration: {:d}".format(i + 1))
            print("running mutual bootstrapping...")
            self.run_mutual_bootstrapping()
            print("[DONE]")
            self._announce("running meta bootstrapping...")
            self.run_meta_bootstrapping()
            print("[DONE]")
            print("number of seed terms: {:d}".format(
                len(self.permanent_lexicon)))
            print("number of total patterns: {:d}".format(
                self.pattern_alphabet.size()))
            print("\n")

    # ------------------------------------------------------------------
    # persistence
    # ------------------------------------------------------------------
    def save_seeds(self, outfile):
        """Write the permanent lexicon to ``outfile``, one word per line."""
        with open(outfile, "w") as f_out:
            f_out.write("\n".join(
                s.encode("utf-8") for s in self.permanent_lexicon))

    def save_patterns(self, outfile):
        """Write the selected patterns to ``outfile``, tokens space-joined."""
        with open(outfile, "w") as f_out:
            patterns = [
                " ".join(self.pattern_alphabet.get_label(pattern_index))
                for pattern_index in self.best_extraction_patterns
            ]
            f_out.write("\n".join(s.encode("utf-8") for s in patterns))
Пример #2
0
class HMM(BaseClassifier):
    """Hidden Markov model with count-based and Baum-Welch training.

    ``transition_matrix[i, j]`` is the probability of moving from state ``i``
    to state ``j`` and ``emission_matrix[i, k]`` the probability that state
    ``i`` emits observation ``k``; both are row-normalised.
    """

    def __init__(self):
        """Create an untrained model with empty alphabets."""
        self.label_alphabet = Alphabet()
        self.feature_alphabet = Alphabet()
        self.transition_matrix = None
        self.emission_matrix = None
        self.initial_probability = None

    @property
    def num_states(self):
        """Number of distinct labels (hidden states) seen so far."""
        return self.label_alphabet.size()

    @property
    def num_observations(self):
        """Number of distinct observations seen so far."""
        return self.feature_alphabet.size()

    @staticmethod
    def _normalize_rows(matrix):
        """Return ``matrix`` with every row divided by its own sum.

        Rows that sum to zero are returned unchanged (all zeros) instead of
        turning into NaN.
        """
        totals = matrix.sum(axis=1)
        safe_totals = numpy.where(totals == 0, 1.0, totals)
        return matrix / safe_totals[:, numpy.newaxis]

    def _mutate_data(self, instance):
        """Replace ``instance.data`` by feature indices, once per instance.

        The original data is kept in ``instance.old_data``; its presence
        marks the instance as already converted.
        """
        try:
            instance.old_data
        except AttributeError:
            instance.old_data = instance.data
            instance.data = self.feature_alphabet.get_indices(instance.data)

    def _mutate_label(self, instance):
        """Replace ``instance.label`` by label indices, once per instance."""
        try:
            instance.old_label
        except AttributeError:
            instance.old_label = instance.label
            instance.label = self.label_alphabet.get_indices(instance.label)

    def populate_alphabets(self, instance_list):
        """Register all labels/observations and index-encode the instances.

        Every instance has its label sequence and observation sequence
        converted to alphabet indices in place.  The three parameter arrays
        are then (re)allocated as zeros of the matching shapes, which is what
        unsupervised training and tests start from.
        """
        for instance in instance_list:
            for label in instance.label:
                self.label_alphabet.add(label)
            for observation in instance.data:
                self.feature_alphabet.add(observation)

            self._mutate_data(instance)
            self._mutate_label(instance)

        self.transition_matrix = numpy.zeros(
            (self.num_states, self.num_states))
        self.emission_matrix = numpy.zeros(
            (self.num_states, self.num_observations))
        self.initial_probability = numpy.zeros(self.num_states)

    def collect_counts(self, instance_list):
        """Collect the counts needed to fit the HMM parameters.

        Returns:
            ``(transition_counts, initial_state_counts, observation_counts)``
            where ``transition_counts[i, j]`` counts state ``i`` directly
            followed by state ``j``, ``observation_counts[i, k]`` counts
            state ``i`` aligned with observation ``k`` and
            ``initial_state_counts[i]`` counts sequences starting in ``i``.
            Empty sequences contribute nothing.
        """
        transition_counts = numpy.zeros((self.num_states, self.num_states))
        initial_state_counts = numpy.zeros(self.num_states)
        observation_counts = numpy.zeros(
            (self.num_states, self.num_observations))
        for instance in instance_list:
            labels = instance.label
            if len(labels) == 0:
                continue
            for previous, current in zip(labels[:-1], labels[1:]):
                transition_counts[previous, current] += 1
            for state, observation in zip(labels, instance.data):
                observation_counts[state, observation] += 1
            initial_state_counts[labels[0]] += 1

        return (transition_counts, initial_state_counts, observation_counts)

    def train(self, instance_list):
        """Fit the parameters from labelled sequences (no smoothing).

        Each row of the transition and emission count matrices is divided by
        its own total; the initial distribution is the normalised count of
        sequence-initial states.
        """
        self.populate_alphabets(instance_list)
        transition_counts, initial_state_counts, observation_counts = \
            self.collect_counts(instance_list)

        # Row i must be divided by the total of row i.  A plain
        # ``counts / counts.sum(1)`` broadcasts the totals across columns.
        self.transition_matrix = self._normalize_rows(transition_counts)
        # The row total of the observation counts is the number of
        # occurrences of the state.
        self.emission_matrix = self._normalize_rows(observation_counts)
        self.initial_probability = \
            initial_state_counts / initial_state_counts.sum()

    def forward_algorithm(self, instance):
        """Return the forward table ``alpha`` (num_states x sequence length).

        ``alpha[j, t]`` is the joint probability of the first ``t + 1``
        observations and being in state ``j`` at time ``t``.  Assumes
        ``instance.data`` already holds feature indices.
        """
        sequence_length = len(instance.data)
        alpha = numpy.zeros((self.num_states, sequence_length))

        # initialization
        alpha[:, 0] = self.initial_probability * \
            self.emission_matrix[:, instance.data[0]]

        # recursion: sum over previous states, then weight by the emission
        # probability of the *current* state
        for t in range(1, sequence_length):
            reached = numpy.dot(alpha[:, t - 1], self.transition_matrix)
            alpha[:, t] = reached * self.emission_matrix[:, instance.data[t]]

        return alpha

    def backward_algorithm(self, instance):
        """Return the backward table ``beta`` (num_states x sequence length).

        ``beta[i, t]`` is the probability of the observations after time
        ``t`` given state ``i`` at time ``t``; the last column is all ones.
        """
        sequence_length = len(instance.data)
        beta = numpy.zeros((self.num_states, sequence_length))

        # initialization
        beta[:, -1] += 1

        # recursion
        for t in reversed(range(sequence_length - 1)):
            beta[:, t] = numpy.sum(
                self.transition_matrix *
                self.emission_matrix[:, instance.data[t + 1]] *
                beta[:, t + 1], 1)

        return beta

    def compute_likelihood(self, alpha):
        """Return P(O1:T): the sum of the last column of ``alpha``."""
        return numpy.sum(alpha, 0)[-1]

    def compute_expected_counts(self, instance):
        """E-step of EM: expected transition and observation counts.

        Returns:
            ``(expected_transition_counts, expected_observation_counts,
            likelihood)`` for the single sequence in ``instance``.
        """
        alpha = self.forward_algorithm(instance)
        beta = self.backward_algorithm(instance)
        sequence_length = len(instance.data)
        likelihood = self.compute_likelihood(alpha)

        gamma = alpha * beta / likelihood
        expected_observation_counts = numpy.zeros(
            (self.num_states, self.num_observations))
        for t in range(sequence_length):
            feature_index = instance.data[t]
            expected_observation_counts[:, feature_index] += gamma[:, t]

        expected_transition_counts = numpy.zeros(
            (self.num_states, self.num_states))
        for t in range(sequence_length - 1):
            next_emission = self.emission_matrix[:, instance.data[t + 1]]
            # xi[i, j] = alpha[i, t] * T[i, j] * B[j, o(t+1)] * beta[j, t+1]
            xi = numpy.outer(alpha[:, t], beta[:, t + 1] * next_emission) \
                * self.transition_matrix / likelihood
            expected_transition_counts += xi
        return (expected_transition_counts, expected_observation_counts,
                likelihood)

    def _be_prepared_for_baum_welch(self, training_set, mode = 'uniform', inf = None):
        """Initialise the three parameter arrays for Baum-Welch.

        @param training_set: the training data
        @param mode: 'uniform' (all entries of a row equal), 'random' (one
            randomly chosen entry per row is made much larger) or 'sneaky'
            (the dominant entry per row is taken from a saved model)
        @param inf: used in sneaky mode; an open file containing a pickled
            dictionary serialization of an HMM (see ``to_dict``)
        @raise ValueError: if ``mode`` is not one of the three names
        """
        if mode not in ('uniform', 'random', 'sneaky'):
            raise ValueError("unknown initialisation mode: %r" % (mode,))

        self.populate_alphabets(training_set)

        HVAL = 100  # added to the dominant position of every row
        LVAL = 1    # added everywhere so that no entry is zero

        n_states = numpy.size(self.transition_matrix, 0)
        n_observations = numpy.size(self.emission_matrix, 1)

        if mode == 'uniform':
            self.transition_matrix += 1.0 / n_states
            self.emission_matrix += 1.0 / n_observations
            self.initial_probability += 1.0 / n_states
            return

        if mode == 'random':
            random.seed()
            trans = [random.randrange(n_states) for _ in range(n_states)]
            emits = [random.randrange(n_observations)
                     for _ in range(n_states)]
            init = random.randrange(n_states)
        else:  # 'sneaky'
            # SECURITY: unpickling can execute arbitrary code; only pass
            # files from a trusted source.
            model_dict = cPickle.load(inf)
            trans = numpy.argmax(
                numpy.array(model_dict['transition_matrix']), 1)
            emits = numpy.argmax(
                numpy.array(model_dict['emission_matrix']), 1)
            init = numpy.argmax(
                numpy.array(model_dict['initial_probability']))

        # make sure no element is zero and the selected one clearly dominates
        rows = numpy.arange(n_states)
        self.transition_matrix[rows, trans] += HVAL
        self.transition_matrix += LVAL
        self.emission_matrix[rows, emits] += HVAL
        self.emission_matrix += LVAL
        self.initial_probability[init] += HVAL
        self.initial_probability += LVAL

        # normalize
        self.transition_matrix = self._normalize_rows(self.transition_matrix)
        self.emission_matrix = self._normalize_rows(self.emission_matrix)
        self.initial_probability /= self.initial_probability.sum()

    def baum_welch_train(self, instance_list):
        """Baum-Welch unsupervised training (at most 30 EM iterations).

        The alphabets must be populated and the transition/emission matrices
        initialised beforehand.  Iteration stops early as soon as the total
        log-likelihood decreases.  The initial probability is set to uniform
        at the end.
        """
        num_states = self.label_alphabet.size()
        num_features = self.feature_alphabet.size()
        old_total_loglikelihood = -numpy.inf
        for i in range(30):
            expected_observation_counts = numpy.zeros(
                (num_states, num_features))
            expected_transition_counts = numpy.zeros(
                (num_states, num_states))
            total_log_likelihood = 0
            # E-Step
            for instance in instance_list:
                transition_counts, obs_counts, likelihood = \
                    self.compute_expected_counts(instance)
                expected_observation_counts += obs_counts
                expected_transition_counts += transition_counts
                total_log_likelihood += numpy.log(likelihood)
            # M-Step
            self.transition_matrix = self._normalize_rows(
                expected_transition_counts)
            self.emission_matrix = self._normalize_rows(
                expected_observation_counts)
            print('Iteration %s : %s ' % (i, total_log_likelihood))
            if total_log_likelihood < old_total_loglikelihood:
                break
            old_total_loglikelihood = total_log_likelihood
        self.initial_probability = numpy.zeros(num_states) + 1.0 / num_states

    def classify_instance(self, instance):
        """Viterbi decoding.

        Returns a list of label strings e.g. ['Hot', 'Cold', 'Cold'], or an
        empty list for an empty observation sequence.
        """
        self._mutate_data(instance)  # make sure data holds feature indices

        slength = len(instance.data)
        if slength == 0:
            return []

        n_states = self.num_states
        states = numpy.arange(n_states)
        v = numpy.zeros((n_states, slength))
        # Column 0 receives the final scores; column ``slength - t`` receives
        # the back-pointers of step t, i.e. the pointers are stored reversed.
        backtrace = numpy.zeros((n_states, slength))

        # initialization
        v[:, 0] = self.initial_probability * \
            self.emission_matrix[:, instance.data[0]]

        # recursion
        for t in range(1, slength):
            # scores[j, i] = v[i, t-1] * P(i -> j)
            scores = v[:, t - 1] * self.transition_matrix.T
            best_previous = numpy.argmax(scores, axis=1)
            backtrace[:, slength - t] = best_previous
            v[:, t] = scores[states, best_previous] * \
                self.emission_matrix[:, instance.data[t]]

        # termination
        backtrace[:, 0] = v[:, -1]

        return self._run_backtrace(backtrace)

    def _run_backtrace(self, back_mat):
        """Follow the back-pointers and return the decoded label sequence.

        @param back_mat: array of shape (num_states, length); column 0 holds
            the final Viterbi scores and column ``k`` (k >= 1) the
            back-pointers of time step ``length - k``.
        """
        state = int(numpy.argmax(back_mat[:, 0]))
        path = [state]
        for column in range(1, numpy.size(back_mat, 1)):
            # pointers live in a float array; indices must be real ints
            state = int(back_mat[state, column])
            path.append(state)
        path.reverse()  # collected last-to-first
        return [self.label_alphabet.get_label(s) for s in path]

    def print_parameters(self):
        """Print the emission matrix and the transition matrix."""
        state_header = [str(self.label_alphabet.get_label(i))
                        for i in range(self.label_alphabet.size())]
        obs_header = [str(self.feature_alphabet.get_label(i))
                      for i in range(self.feature_alphabet.size())]
        print(matrix_to_string(self.emission_matrix, state_header, obs_header))
        print(matrix_to_string(self.transition_matrix, state_header,
                               state_header))

    def to_dict(self):
        """Convert the model into a dictionary of plain Python containers.

        Must stay in sync with ``from_dict``.
        """
        model_dict = {
            'label_alphabet': self.label_alphabet.to_dict(),
            'feature_alphabet': self.feature_alphabet.to_dict(),
            'transition_matrix': self.transition_matrix.tolist(),
            'emission_matrix': self.emission_matrix.tolist(),
            'initial_probability': self.initial_probability.tolist()
        }
        return model_dict

    @classmethod
    def from_dict(cls, model_dict):
        """Build a model from a dictionary produced by ``to_dict``."""
        hmm = cls()
        hmm.label_alphabet = Alphabet.from_dict(model_dict['label_alphabet'])
        hmm.feature_alphabet = Alphabet.from_dict(
            model_dict['feature_alphabet'])
        hmm.transition_matrix = numpy.array(model_dict['transition_matrix'])
        hmm.emission_matrix = numpy.array(model_dict['emission_matrix'])
        hmm.initial_probability = numpy.array(
            model_dict['initial_probability'])
        return hmm
Пример #3
0
class Naive_Bayes(object):
    """Naive Bayes classifier over binary (present/absent) word features."""

    # Additive smoothing constant used when estimating P(x | y).
    SMOOTHING = 0.2

    def __init__(self, data, feature_function):
        """
        Takes a dictionary mapping labels to lists of strings with that label, and a function which
        produces the training instances from that data (it is called as
        ``feature_function(data, label_codebook, feature_codebook, theta)``).
        """
        self.data = data
        self.feature_codebook = Alphabet()
        self.label_codebook = Alphabet()
        self.feature_function = feature_function

    def extract_feature(self, string):
        """Return the binary feature vector of ``string``.

        The string is split into ``\\w+`` tokens; the entry of every token
        known to the feature codebook is set to 1.0, all others stay 0.0.
        """
        vector = np.zeros(self.feature_codebook.size())
        for word in set(nltk.regexp_tokenize(string, pattern=r"\w+")):
            if self.feature_codebook.has_label(word):
                vector[self.feature_codebook.get_index(word)] = 1.0
        return vector

    def _collect_counts(self):
        """Count documents per label and feature occurrences per label.

        Fills ``count_y_table[label]`` with the number of vectors of each
        label and ``count_table[:, label]`` with the sum of those vectors.
        """
        self.count_table = np.zeros(
            (self.feature_codebook.size(), self.label_codebook.size()))
        self.count_y_table = np.zeros(self.label_codebook.size())
        for label, docs in self.instance_list.items():
            y_index = self.label_codebook.get_index(label)
            for vector in docs:
                self.count_y_table[y_index] += 1.0
                self.count_table[:, y_index] += vector

    def train(self, theta):
        """Build the instances via the feature function and fit the tables.

        ``theta`` is passed through to the feature function unchanged.
        """
        self.instance_list = self.feature_function(
            self.data, self.label_codebook, self.feature_codebook, theta)
        self._collect_counts()

        # NOTE(review): the denominator adds SMOOTHING once per feature; a
        # two-outcome (present/absent) model would usually add it twice --
        # confirm which smoothing is intended before changing it.
        smoothing_mass = self.feature_codebook.size() * self.SMOOTHING
        self.p_x_given_y_table = (self.count_table + self.SMOOTHING) / \
            (self.count_y_table + smoothing_mass)
        self.p_y_table = self.count_y_table / self.count_y_table.sum()

    def compute_log_unnormalized_score(self, feature_vector):
        """Compute log P(X|Y) + log P(Y) for all values of Y

        Present features contribute ``log p`` and absent ones
        ``log(1 - p)``.  The number of labels and features is taken from the
        trained probability tables.

        Returns a vector of loglikelihood.
            loglikelihood_vector[0] = log P(X|Y=0) + log P(Y=0)
        """
        features = np.asarray(feature_vector, dtype=float)
        log_present = np.log(self.p_x_given_y_table)
        log_absent = np.log(1.0 - self.p_x_given_y_table)
        return (np.log(self.p_y_table)
                + features.dot(log_present)
                + (1.0 - features).dot(log_absent))

    def classify(self, string):
        """
        Classifies a string according to the feature function and training data
        provided at initialization.

        return the predicted label for the input document
        """
        feature_vector = self.extract_feature(string)
        logvector = self.compute_log_unnormalized_score(feature_vector)
        pre_label_index = np.argmax(logvector)
        return self.label_codebook.get_label(pre_label_index)
Пример #4
0
class MaxEnt(BaseClassifier):

	def __init__(self, gaussian_prior_variance = 1):
		"""Create an untrained model.

		label_alphabet, feature_alphabet and parameters must stay
		consistent with each other for the model to work; all three start
		out empty and are filled in during training.

		Arg:
			gaussian_prior_variance: divisor of the squared-weight penalty
				used by objective_function and gradient_function.
		"""
		super(MaxEnt, self).__init__()
		self.gaussian_prior_variance = gaussian_prior_variance
		# None marks "observed counts not computed yet"
		self.feature_counts = None
		self.parameters = numpy.array([])
		self.label_alphabet, self.feature_alphabet = Alphabet(), Alphabet()

	def get_parameter_indices(self, feature_indices, label_index):
		"""Get the indices on the parameter vector

		The parameter vector is laid out as one block per label; each block
		holds an intercept slot followed by one slot per feature.

		Args:
			feature_indices: sequence of feature indices (may be empty)
			label_index: index of the label whose block is addressed

		Returns:
			integer numpy array: the intercept slot of the label's block
			followed by the slots of the given features.
		"""
		# dtype=int keeps an empty feature list from turning into a float
		# array, which NumPy refuses to use as an index.
		shifted = numpy.asarray(feature_indices, dtype=int) + 1
		# Both pieces are 1-D, so they are joined along the default axis 0
		# (axis 1 is out of range for 1-D input).
		within_block = numpy.concatenate((numpy.array([0]), shifted))
		block_size = self.feature_alphabet.size() + 1
		return within_block + label_index * block_size

	def compute_observed_counts(self, instance_list):
		"""Compute observed feature counts

		Done only once because the counts are parameter-independent; the
		result is cached in self.feature_counts and returned unchanged on
		later calls. The model is fit with intercept terms, so the count
		of an intercept slot is the count of that class.

		Side effects on the first call:
			1) both alphabets are populated from instance_list
			2) every instance not yet marked converted gets instance.data
			   replaced by a sorted, duplicate-free array of feature
			   indices, and instance.converted set to True

		Returns:
			the observed count vector (same layout as self.parameters)
		"""
		# Cached already? `not array` is ambiguous for arrays with more
		# than one element, so the test has to be an explicit None check.
		if self.feature_counts is not None:
			return self.feature_counts

		# populate alphabets here
		for instance in instance_list:
			self.label_alphabet.add(instance.label)
			for datum in instance.data:
				self.feature_alphabet.add(datum)
		num_parameters = (self.feature_alphabet.size() + 1) * self.label_alphabet.size()
		self.feature_counts = numpy.zeros(num_parameters)

		# compute the feature counts here
		for instance in instance_list:
			newinds = self.feature_alphabet.get_indices(instance.data)
			sparse_vector = self.get_parameter_indices(
				newinds, self.label_alphabet.get_index(instance.label))
			# NOTE(review): with fancy indexing a slot that appears twice in
			# sparse_vector is still incremented only once per instance.
			self.feature_counts[sparse_vector] += 1
			if not instance.converted:
				# dtype=int so a feature-less instance still yields an
				# index-compatible array
				instance.data = numpy.array(sorted(set(newinds)), dtype=int)
				instance.converted = True
		return self.feature_counts
		
	def compute_label_unnormalized_loglikelihood_vector(self, sparse_feature_vector):
		"""Compute unnormalized log score from log-linear model

		With a sparse (index-list) feature representation the dot product
		of feature vector and parameter vector reduces to summing the
		parameters at the active slots, intercept included.

		Returns:
			numpy vector with one score per label index
		"""
		scores = numpy.zeros(self.label_alphabet.size())
		for label_index, _ in self.label_alphabet:
			active_slots = self.get_parameter_indices(sparse_feature_vector, label_index)
			scores[label_index] = sum(self.parameters[active_slots])
		return scores

	def compute_posterior_distribution(self, instance):
		"""Compute P(Y|X)

		Arg:
			instance: its .data must already be a sparse vector of feature
				indices

		Returns:
			a vector of the same size as the label_alphabet that sums to 1
		"""
		unnorm = self.compute_label_unnormalized_loglikelihood_vector(instance.data)
		if DEBUG_2:
			print(unnorm)
		if len(unnorm) == 0:
			# no labels known yet: nothing to normalize
			return unnorm
		# Shift by the largest score before exponentiating. The shift
		# cancels in the ratio but keeps exp() from overflowing to inf,
		# which would turn the whole distribution into NaN.
		weights = numpy.exp(unnorm - numpy.max(unnorm))
		return weights / numpy.sum(weights)
		
	def _argmax(self, func, *args):
		"""Not needed because numpy's is better"""
		res = [func(arg) for arg in args]
		m = max(res)
		for arg in args:
			if func(arg) == m:
				return arg

	def compute_expected_feature_counts(self, instance_list):
		"""Compute expected feature counts

		For every instance and every label, the posterior probability of
		that label is added to the label's intercept slot and to the slots
		of the instance's active features. Only needed for training.

		Returns:
			expected count vector with the same layout as self.parameters
		"""
		num_parameters = (self.feature_alphabet.size() + 1) * self.label_alphabet.size()
		expected = numpy.zeros(num_parameters)
		for instance in instance_list:
			posterior = self.compute_posterior_distribution(instance)
			for label_index, _ in self.label_alphabet:
				slots = self.get_parameter_indices(instance.data, label_index)
				expected[slots] += posterior[label_index]
		return expected

	def classify_instance(self, instance):
		"""Apply the model to a new instance

		If the instance has not been converted yet, instance.data is
		replaced by the feature indices returned by the feature alphabet
		and the instance is marked converted. The label with the highest
		posterior probability is returned.
		"""
		if DEBUG_2:
			print(instance.data)
		if not instance.converted:
			# get_indices eliminates any heretofore unseen features
			instance.data = self.feature_alphabet.get_indices(instance.data)
			instance.converted = True
		if DEBUG_2:
			print(instance.data)
			print(self.compute_posterior_distribution(instance))
		posterior = self.compute_posterior_distribution(instance)
		best_index = numpy.argmax(posterior)
		return self.label_alphabet.get_label(best_index)

	def objective_function(self, parameters):
		"""Compute negative (log P(Y|X,lambdas) + log P(lambdas))

		The function that fmin_l_bfgs_b minimizes. As a side effect the
		model's parameters are set to the point being evaluated, so that
		inference uses the same weights.

		Arg:
			parameters: numpy vector of weights to evaluate

		Returns:
			negated sum over the training data of the log posterior of the
			gold label, plus sum(parameters**2) / gaussian_prior_variance
		"""
		self.parameters = parameters
		# penalty term; its derivative is the 2*lambda/variance term used
		# in gradient_function
		penalty = numpy.sum(parameters * parameters) / self.gaussian_prior_variance
		total_loglikelihood = -penalty
		for instance in self.training_data:
			gold_index = self.label_alphabet.get_index(instance.label)
			posterior = self.compute_posterior_distribution(instance)
			# The objective is a sum of LOG probabilities; summing the raw
			# probabilities would not match gradient_function.
			total_loglikelihood += numpy.log(posterior[gold_index])
		return - total_loglikelihood


	def gradient_function(self, parameters):
		"""Compute gradient of negative (log P(Y|X,lambdas) + log P(lambdas)) wrt lambdas

		gradient wrt lambda i =
			observed_count of feature i
			- expected_count of feature i
			- 2 * lambda i / gaussian_prior_variance
		The first term is computed before running the optimization and is
		a constant. The second term needs inference to get P(Y|X, lambdas)
		and is a bit expensive. The third term is the derivative of the
		penalty used in objective_function.

		Returns:
			the negated gradient vector (the optimizer minimizes)
		"""
		# The expected counts are computed from self.parameters, so make
		# sure they are the weights we were asked about instead of relying
		# on objective_function having been called first.
		self.parameters = parameters
		expected = self.compute_expected_feature_counts(self.training_data)
		penalty_gradient = 2 * (parameters) / self.gaussian_prior_variance
		gradient_vector = self.feature_counts - expected - penalty_gradient
		if DEBUG_1:
			print(gradient_vector)
		return - gradient_vector


	def train(self, instance_list):
		"""Find the optimal parameters for maximum entropy classifier

		The observed counts (and with them both alphabets) are computed
		first; the search itself is delegated to fmin_l_bfgs_b, started
		from the all-zero weight vector, with objective_function and
		gradient_function as callbacks.

		Arg:
			instance_list: each instance.data should be a string feature vector
		"""
		self.training_data = instance_list
		self.compute_observed_counts(instance_list)
		dimension = self.label_alphabet.size() * (self.feature_alphabet.size() + 1)
		start = numpy.zeros(dimension)
		best_point, _value, _info = fmin_l_bfgs_b(
			self.objective_function, start, fprime=self.gradient_function)
		self.parameters = best_point


	def to_dict(self):
		"""Convert MaxEnt into a dictionary so that save() will work

		Keys: 'labalph' and 'feaalph' (the two alphabets as dictionaries),
		'gpv' (gaussian prior variance) and 'param' (the parameter vector,
		stored as is).
		"""
		return {
			'labalph': self.label_alphabet.to_dict(),
			'feaalph': self.feature_alphabet.to_dict(),
			'gpv': self.gaussian_prior_variance,
			'param': self.parameters,
		}


	@classmethod
	def from_dict(cls, model_dictionary):
		"""Return an instance of MaxEnt based on the dictionary created by to_dict

		Reads the keys 'labalph', 'feaalph', 'gpv' and 'param'.
		"""
		model = MaxEnt()
		model.label_alphabet = Alphabet.from_dict(model_dictionary['labalph'])
		model.feature_alphabet = Alphabet.from_dict(model_dictionary['feaalph'])
		model.gaussian_prior_variance = model_dictionary['gpv']
		model.parameters = model_dictionary['param']
		return model
Пример #5
0
class TranslationModel:

    def __init__(self, aligned_sentences, max_iterations=-1, eta=None):
        """
        Set up an untrained translation model.

        :param aligned_sentences: a list of tuples of aligned sentences
        :param max_iterations: the number of iterations to run EM; -1 means
            "run until converged"
        :param eta: the value that the delta of the EM probabilities must
            fall below to be considered converged; defaults to
            len(aligned_sentences)/100
        """
        self.aligned_sentences = aligned_sentences
        self.e_alphabet = Alphabet()
        self.f_alphabet = Alphabet()
        # very simple heuristic when no threshold is given
        self.eta = len(aligned_sentences)/100. if eta is None else eta
        self.max_iterations = max_iterations
        # do_more is the stop test polled by the EM loop
        self.do_more = self.has_converged if max_iterations == -1 else self.stop_iterations


    def convert_to_vector(self, raw_data, lang="e", training=True):
        """
        Map a tokenized sentence to the integer indices of its words.

        :param raw_data: a tokenized sentence
        :param lang: "e" selects the e alphabet, anything else the f alphabet
        :param training: whether this is during training or testing
        :return: numpy int array with one entry per token

        During training every lookup is expected to succeed. Otherwise a
        token whose lookup raises KeyError leaves its slot at the initial
        value 0.
        NOTE(review): that makes an unknown word indistinguishable from the
        word with index 0 -- confirm this is intended.
        """
        alphabet = self.e_alphabet if lang == "e" else self.f_alphabet

        if training:
            return numpy.array([alphabet.get_index(token) for token in raw_data], dtype=int)

        indices = numpy.zeros(len(raw_data), dtype=int)
        for position, token in enumerate(raw_data):
            try:
                indices[position] = alphabet.get_index(token)
            except KeyError:
                # OOV word: keep the zero already stored in this slot
                pass
        return indices

    def populate_alphabets(self):
        """
        Register every token of every sentence pair in the alphabet of its
        language and store the integer form of each sentence on the
        instance as .data (the raw tokens stay in .raw_data).

        """
        for e_instance, f_instance in self.aligned_sentences:
            sides = (
                (e_instance, self.e_alphabet, "e"),
                (f_instance, self.f_alphabet, "f"),
            )
            for instance, alphabet, lang in sides:
                for token in instance.raw_data:
                    alphabet.add(token)
                instance.data = self.convert_to_vector(instance.raw_data, lang)

    def init_translation_table(self):
        """
        Create the translation table and the cache of the previous table
        that is used to compute the first delta.

        Both have one row per e word and one column per f word. The table
        starts with 0.25 everywhere, the cache with zeros.

        """
        shape = [self.e_alphabet.size(), self.f_alphabet.size()]
        self.previous_t_table = numpy.zeros(shape)
        table = numpy.zeros(shape)
        table[...] = .25
        self.t_table = table

    def expectation_maximization(self):
        """
        Run EM over the aligned sentences to estimate self.t_table.

        The loop continues until self.do_more(i) returns True, i.e. the
        callable acts as a stop test (has_converged or stop_iterations,
        as chosen in __init__). Every pass rebuilds self.counts and
        self.total from the current table and then sets
        self.t_table = counts / total. The iteration number and the time
        each pass took are printed to stdout.

        """
        i = 0
        while not self.do_more(i):
            time1 = time.time()
            print "iteration {:d}".format(i+1),
            # reset the accumulators for this pass:
            # total is indexed by f word, counts by (e word, f word)
            self.total = numpy.zeros(self.f_alphabet.size())
            self.counts = numpy.zeros([self.e_alphabet.size(), self.f_alphabet.size()])

            for e_instance,f_instance in self.aligned_sentences:
                self.s_total = numpy.zeros(self.e_alphabet.size())

                # compute normalization: for each e word, the summed table
                # entries against all f words of the paired sentence
                for e_word in e_instance.data:
                    self.s_total[e_word] += numpy.sum(self.t_table[e_word, f_instance.data])

                # collect counts
                # NOTE(review): f_instance.data is used as a fancy index; if
                # a word occurs twice in the sentence, the in-place += below
                # is applied only once for that index -- confirm intended.
                for e in e_instance.data:
                    tmp = self.t_table[e,f_instance.data]/self.s_total[e]
                    self.counts[e,f_instance.data] += tmp
                    self.total[f_instance.data] += tmp

            # estimate probabilities: column f of counts is divided by total[f]
            self.t_table = self.counts/self.total

            i += 1

            print "\t{:.3f} seconds".format(time.time()-time1),

    def has_converged(self, i):
        """
        Stop test based on how much the translation table changed.

        The delta is the summed absolute difference between the current
        table and the cached previous one; the cache is then refreshed
        with a copy of the current table. The delta is printed for every
        call except the one with i == 0.

        @param i: only used so this method can have the same signature as stop_iterations
        @return: True when delta is below eta (EM should stop), else False
        """
        change = numpy.abs(self.t_table - self.previous_t_table)
        delta = numpy.sum(change)
        self.previous_t_table = numpy.copy(self.t_table)
        if i != 0:
            print("\tdelta: {:.3f}".format(delta))
        return True if delta < self.eta else False

    def stop_iterations(self, i):
        """
        Stop test used when a fixed number of EM iterations was requested.

        @param i: current iteration number
        @return: boolean whether EM needs to stop iterating
        """
        # bare Python 2 print statement: ends the current output line
        print
        return i >= self.max_iterations

    def train(self):
        """
        Does all tasks necessary to train the model, in this order:
        populate the alphabets, initialize the translation table, run EM,
        and build the language model.

        """
        self.populate_alphabets()
        self.init_translation_table()
        self.expectation_maximization()
        self.build_language_model()

    def evaluate(self, candidate_data):
        """
        For every source sentence, score each candidate translation with
        translation_log_prob and print the source sentence followed by the
        highest-scoring candidate.

        :param candidate_data: a list of ((source, tokenized source),
            candidates) entries, where each candidate is a
            (sentence, tokenized sentence) pair
        """
        for (source, source_tokens), candidates in candidate_data:
            scores = numpy.zeros(len(candidates))
            for position, (_, candidate_tokens) in enumerate(candidates):
                scores[position] = self.translation_log_prob(candidate_tokens, source_tokens)
            print(u"source sentence: {:s}".format(source))
            best_sentence = candidates[numpy.argmax(scores)][0]
            print(u"best translation: {:s}".format(best_sentence))
            print('')

    def t_table_log_prob(self, e_sentence, f_sentence):
        """
        gives the log(p(s|t)) according to the translation table:

            sum over e of log(sum over f of t_table[e, f])
                - len(e) * log(len(f))

        The computation stays in log space. Multiplying the per-word sums
        first underflows to 0 for long sentences, and len(f)**len(e) can
        grow beyond what a float can hold.

        :param e_sentence: tokenized target sentence
        :param f_sentence: tokenized candidate sentence
        :return: the log probability of a sentence translating to a
            candidate; 0.0 for an empty e sentence, -inf when only the f
            sentence is empty
        """
        e_indices = self.convert_to_vector(e_sentence, lang="e", training=False)
        f_indices = self.convert_to_vector(f_sentence, lang="f", training=False)
        if len(e_indices) == 0:
            # empty product: log(1)
            return 0.0
        if len(f_indices) == 0:
            # nothing to align with: probability 0
            return -numpy.inf
        per_word = numpy.log([numpy.sum(self.t_table[e, f_indices]) for e in e_indices])
        return numpy.sum(per_word) - len(e_indices) * numpy.log(len(f_indices))

    def build_language_model(self):
        """
        Build a BigramModel from the integer form (.data) of all e-side
        sentences, train it, and keep it in self.language_model.

        """
        target_sentences = [e_sentence.data for e_sentence, _ in self.aligned_sentences]
        self.language_model = BigramModel(target_sentences, self.e_alphabet)
        self.language_model.train()

    def translation_log_prob(self, e_sentence, f_sentence):
        """
        Combined score of a candidate: translation-table log probability
        plus the language model's log probability of the e sentence.

        :param e_sentence: tokenized target sentence
        :param f_sentence: tokenized source sentence
        :return: the sum of the two log probabilities
        """
        table_score = self.t_table_log_prob(e_sentence, f_sentence)
        fluency_score = self.language_model.log_prob(e_sentence)
        return table_score + fluency_score

    def save(self):
        """Not implemented yet; always raises NotImplementedError."""
        raise NotImplementedError

    def load(self, data):
        """Not implemented yet; always raises NotImplementedError."""
        raise NotImplementedError
Пример #6
0
class NaiveBayes(BaseClassifier):

	def __init__(self):
		"""Constructor

		Creates the two empty codebooks: one for labels and one for
		features. The count and probability tables are not created here;
		they are set by _collect_counts and train.
		"""

		self.label_codebook = Alphabet()
		self.feature_codebook = Alphabet()

	def _collect_counts(self, instance_list):
		"""Collect feature and label counts from the dataset

		First indexes all labels and features in the two codebooks, then
		goes through the data again and fills
			self.count_x_y_table
			self.count_y_table
		For example,
			self.count_x_y_table[12, 0] = Count of feature 12 co-occurring with label 0
			self.count_y_table[1] = Count of label 1
			To find out what feature 12 is, look it up by
				self.feature_codebook.get_label(12)

		Prints a progress line per instance; with DEBUG set it also dumps
		the finished count table.
		"""
		
		# Labels go through a set first, so the order in which they receive
		# their indices follows the set's iteration order.
		for gen in set(map(lambda x: x.label, instance_list)):
			self.label_codebook.add(gen)
		for vector in map(lambda x: x.data, instance_list):
			for feat in vector:
				self.feature_codebook.add(feat)
		# one row per feature, one column per label
		self.count_x_y_table = numpy.zeros(map(len, [self.feature_codebook, self.label_codebook]))
		self.count_y_table = numpy.zeros(len(self.label_codebook))
		for i, instance in enumerate(instance_list):
			print "Training on instance %d of %d." % (i, len(instance_list))
			label = self.label_codebook.get_index(instance.label)
			self.count_y_table[label] += 1
			# Every known feature is tested against the instance, so a
			# feature counts at most once per instance.
			# NOTE(review): membership is tested on the instance object
			# itself, not on instance.data -- confirm the instance type
			# supports `in`.
			for index, feature in self.feature_codebook:
				self.count_x_y_table[index,label] += int(feature in instance)
		if DEBUG:
			for index, label in self.label_codebook:
				print label,
			print ''
			# each row is unpacked into two values, so this dump only works
			# with exactly two labels
			for i, (e1, e2) in enumerate(self.count_x_y_table):
				print '%s: %d, %d' % (self.feature_codebook.get_label(i), e1, e2)
		
		
	def train(self, instance_list, smoothmode = 'laplace'):
		"""Fit model parameters based on the dataset

		Collects the counts, then fills the probability tables with their
		relative-frequency estimates:
			self.p_x_given_y_table[10, 1] = count of feature 10 with label 1
				/ count of label 1
			self.p_y_table[1] = count of label 1 / count of all labels

		Smoothing is delegated to smooth_table: mode 'add-one' is applied
		to the counts before the tables are computed, mode 'laplace' to
		the finished probability table. Any other mode means no smoothing.
		"""
		self._collect_counts(instance_list)
		if smoothmode == 'add-one':
			self.smooth_table(smoothmode)

		label_counts = numpy.asarray(self.count_y_table, dtype=float)
		feature_counts = numpy.asarray(self.count_x_y_table, dtype=float)
		# column j of the count table is divided by the count of label j
		self.p_x_given_y_table = feature_counts / label_counts
		self.p_y_table = label_counts / sum(self.count_y_table)

		if smoothmode == 'laplace':
			self.smooth_table(smoothmode)
		
	def smooth_table(self, mode):
		"""
			Implements smoothing algorithms for the tables.

			mode 'laplace' works on self.p_x_given_y_table, one label
			column at a time: the smallest nonzero probability of the
			column is divided by the number of entries that are either
			zero or equal to that minimum, and all of those entries are
			set to the quotient.

			mode 'add-one' adds one to every entry of both count tables.

			Any other mode does nothing.
		"""
		if mode == 'laplace':

			#get lists of p(x|y) values for each y
			#(size / number of rows = number of label columns)
			newtabs = [self.p_x_given_y_table[0:,i] \
				for i in range(self.p_x_given_y_table.size / \
				len(self.p_x_given_y_table))]

			#descry lowest-frequency nonzero elements in p(x|y) table
			#NOTE(review): min() raises if a column has no nonzero entry
			mincounts = map(lambda x: min([i for i in x if i]), newtabs)

			#get indices of minimal and zero values
			inds = [[j for j, elem in enumerate(li) \
				if elem in [0, mincounts[i]]] \
				for i, li in enumerate(newtabs)]

			#average probability mass of minimal elements over zero elements
			newvals = [float(mincounts[i])/len(inds[i]) \
				for i in range(len(inds))]
			#reassign minimal and zero elements
			for i, li in enumerate(inds):
				for ind in li:
					self.p_x_given_y_table[ind,i] = newvals[i]

		elif mode == 'add-one':
			#add one to all counts in count tables
			self.count_x_y_table += 1
			self.count_y_table += 1

	def compute_log_unnormalized_score(self, instance):
		"""Compute log P(X|Y) + log P(Y) for all values of Y

		Every feature in the codebook contributes: log P(X_k|Y) when the
		feature is found in the instance, log(1 - P(X_k|Y)) when it is
		not. Probabilities are clamped to machine epsilon before the
		logarithm so that a zero cannot produce -inf.

		Returns a numpy vector of loglikelihood.
		The vector indices are consistent with the label codebook:
			loglikelihood_vector[0] = log P(X|Y=0) + log P(Y=0)
		"""
		tiny = numpy.finfo(float).eps
		# membership does not depend on the label, so test it only once
		present = [(index, feature in instance) for index, feature in self.feature_codebook]
		loglikelihood_vector = numpy.zeros(self.label_codebook.size())
		for col in range(len(loglikelihood_vector)):
			# The prior has to be written into the vector itself; adding it
			# to a loop variable would silently drop it.
			score = numpy.log(self.p_y_table[col])
			for index, is_present in present:
				p = self.p_x_given_y_table[index, col]
				# clamp the probability, not its logarithm
				score += numpy.log(max(tiny, p if is_present else 1 - p))
			loglikelihood_vector[col] = score
		return loglikelihood_vector

	def classify_instance(self, instance):
		"""Predict the label of the given instance

		Scores every label with compute_log_unnormalized_score and returns
		the label with the highest score (the first one on ties).

		Returns:
			the predicted label, or None if there are no labels to score
		"""
		clus = self.compute_log_unnormalized_score(instance)
		if DEBUG:
			for index, label in self.label_codebook:
				print('%s %s' % (label, clus[index]))
		if len(clus) == 0:
			return None
		# argmax scans the scores once and returns the first maximum
		return self.label_codebook.get_label(numpy.argmax(clus))

	def to_dict(self):
		"""Convert NaiveBayes instance into a dictionary representation

		Must stay in sync with from_dict, which reads the same keys:
		'label_alphabet', 'feature_alphabet', '#x&y', '#y', 'p_x|y_table'
		and 'p_y_table'. The tables are stored as they are.

		Returns:
			the dictionary representation of the model
		"""
		model_dict = {
			'label_alphabet': self.label_codebook.to_dict(),
			'feature_alphabet': self.feature_codebook.to_dict(),
			'#x&y' : self.count_x_y_table,
			'#y' : self.count_y_table,
			'p_x|y_table' : self.p_x_given_y_table,
			'p_y_table' : self.p_y_table,
		}
		return model_dict

	@classmethod
	def from_dict(cls, model_dict):
		"""Convert a dictionary into NaiveBayes instance

		Must stay in sync with to_dict: reads the keys 'label_alphabet',
		'feature_alphabet', '#x&y', '#y', 'p_x|y_table' and 'p_y_table'.

		Raises:
			KeyError: if one of those keys is missing
		"""
		res = NaiveBayes()
		res.label_codebook = Alphabet.from_dict(model_dict['label_alphabet'])
		res.feature_codebook = Alphabet.from_dict(model_dict['feature_alphabet'])
		res.count_x_y_table = model_dict['#x&y']
		res.count_y_table = model_dict['#y']
		# key spelled exactly as to_dict writes it
		res.p_x_given_y_table = model_dict['p_x|y_table']
		res.p_y_table = model_dict['p_y_table']
		return res
Пример #7
0
class MaxEnt(BaseClassifier):

	def __init__(self):
		"""Initialize the model

		label_codebook, feature_codebook, parameters must be
		assigned properly in order for the model to work.

		parameters and codebooks will be handled in the train function
		"""
		super(MaxEnt, self).__init__()
		self.label_codebook = Alphabet()
		self.feature_codebook = Alphabet()
		# placeholder; train replaces it with a numpy vector
		self.parameters = []	
		# squared again where it is used (objective and gradient)
		self.gaussian_prior_variance = 1.0

	def compute_observed_counts(self, instance_list):
		"""Compute observed feature counts

		Parameter-independent, so it only has to run once. The result is
		stored in self.feature_counts, laid out as one block per label:
		an intercept slot followed by one slot per feature. The intercept
		slot therefore holds the number of instances with that label.

		Expects instance.label to be a label index and instance.data a
		numpy array of feature indices.
		"""
		stride = self.feature_codebook.size() + 1
		counts = numpy.zeros(stride * self.label_codebook.size())
		self.feature_counts = counts
		for instance in instance_list:
			intercept = stride * instance.label
			counts[intercept] += 1
			# instance.data is a numpy array, so this addresses all active
			# feature slots of the label's block at once
			counts[intercept + instance.data + 1] += 1

	def compute_expected_feature_counts(self,instance_list):
		"""Compute expected feature counts

		E(feature|X) = sum over i,y E(feature(Xi,yi)|Xi)
					 = sum over i,y feature(Xi,yi) P(Y=yi|Xi)

		For every instance the label scores are normalized into a
		posterior, and each label's probability is added to that label's
		intercept slot and to the slots of the instance's features.

		Returns:
			expected count vector of the same length as self.parameters
		"""
		stride = self.feature_codebook.size() + 1
		expected = numpy.zeros(len(self.parameters))
		for instance in instance_list:
			scores = self.compute_label_unnormalized_loglikelihood_vector(instance.data)
			posterior = numpy.exp(scores - logsumexp(scores))
			for label in range(self.label_codebook.size()):
				intercept = label * stride
				expected[intercept] += posterior[label]
				expected[intercept + instance.data + 1] += posterior[label]
		return expected

	def classify_instance(self, instance):
		"""Applying the model to a new instance

		Returns:
		       index of the label with the largest entry in the vector
		       returned by compute_posterior_distribution
		"""
		return numpy.argmax(self.compute_posterior_distribution(instance))

	def compute_posterior_distribution(self, instance):
		"""Compute P(Y|X)

		Features of the instance that are not in the feature codebook are
		ignored.

		Returns:
			a vector of the same size as the label_codebook, normalized to
			sum to 1
		"""
		known = [self.feature_codebook.get_index(i) for i in instance.data if self.feature_codebook.has_label(i)]
		# dtype=int keeps the index array integral even when it is empty
		sparse_vector = numpy.array(known, dtype=int)
		scores = self.compute_label_unnormalized_loglikelihood_vector(sparse_vector)
		if len(scores) == 0:
			return scores
		# Subtract the largest score before exponentiating. Large scores
		# would otherwise overflow to inf for several labels at once and
		# make the argmax meaningless.
		weights = numpy.exp(scores - numpy.max(scores))
		return weights / numpy.sum(weights)

	def compute_label_unnormalized_loglikelihood_vector(self,sparse_feature_vector):
		"""Compute unnormalized log score from log-linear model

		log P(Y|X) is proportional to feature vector * parameter vector.
		With a sparse vector of feature indices this is the label's
		intercept parameter plus the sum of the parameters at the feature
		slots of the label's block.

		Returns:
		       a vector with one score per label index
		"""
		stride = self.feature_codebook.size() + 1
		scores = numpy.zeros(self.label_codebook.size())
		for label in range(self.label_codebook.size()):
			intercept = label * stride
			slots = intercept + sparse_feature_vector + 1
			score = self.parameters[intercept]
			# an empty feature vector contributes the intercept only
			if len(slots) != 0:
				score = score + sum(self.parameters[slots])
			scores[label] = score
		return scores


	def objective_function(self, parameters):
		"""Compute negative (log P(Y|X,lambdas) + log P(lambdas))

		The function that the optimizer minimizes. The log likelihood is
		the sum over the training data of (score of the gold label minus
		logsumexp of all label scores); the prior term subtracts
		lambda**2 / (2 * gaussian_prior_variance**2) for every parameter.
		The model's parameters are set to the evaluated point, and the
		resulting value is printed.

		Args:
		     parameters: weight vector proposed by the training procedure
		Returns:
		     negative total log likelihood
		"""
		scale = 2*self.gaussian_prior_variance**2
		prior = sum(i**2/scale for i in parameters)
		self.parameters=numpy.array(parameters)
		stride = self.feature_codebook.size() + 1
		numerator = 0.0
		denominator = 0.0
		for instance in self.training_data:
			intercept = instance.label * stride
			slots = intercept + instance.data + 1
			numerator += (parameters[intercept] + sum(parameters[slots]))
			label_scores = self.compute_label_unnormalized_loglikelihood_vector(instance.data)
			denominator += logsumexp(label_scores)
		negative_loglikelihood = -(numerator - denominator - prior)
		print(negative_loglikelihood)
		return negative_loglikelihood


	def gradient_function(self, parameters):
		"""Compute gradient of negative (log P(Y|X,lambdas) + log P(lambdas)) wrt lambdas

		gradient wrt lambda i = observed_count of feature i
			- expected_count of feature i
			- lambda i / gaussian_prior_variance^2
		The observed counts are precomputed constants, the expected counts
		need inference with the current parameters, and the last term is
		the derivative of the log gaussian prior. The model's parameters
		are set to the evaluated point before inference.

		Returns:
			the negated gradient vector
		"""
		self.parameters = numpy.array(parameters)
		expected = self.compute_expected_feature_counts(self.training_data)
		prior_slope = self.parameters / self.gaussian_prior_variance**2
		return -(self.feature_counts - expected - prior_slope)


	def train(self, instance_list):
		"""Find the optimal parameters for maximum entropy classifier

		Steps:
			1) register the two labels 'neg' and 'pos'
			2) add every feature to the feature codebook and replace each
			   instance.data (in place) by a numpy array of feature indices
			3) compute the observed counts
			4) let fmin_l_bfgs_b search for the parameters, starting from
			   the all-zero vector
		The optimal parameters are printed and stored in self.parameters.

		Arg:
			instance_list: non-empty list; each instance.data should be a
				string feature vector

		Returns None
		"""
		assert(len(instance_list) > 0)
		self.label_codebook.add('neg')
		self.label_codebook.add('pos')
		for instance in instance_list:
			feature_ids = []
			for feature in instance.data:
				if not self.feature_codebook.has_label(feature):
					self.feature_codebook.add(feature)
				feature_ids.append(self.feature_codebook.get_index(feature))
			# Collect into a list and convert once instead of growing the
			# array with numpy.append. Plain int is used as dtype because
			# the numpy.int alias no longer exists in current NumPy.
			instance.data = numpy.array(feature_ids, dtype=int)

		num_labels = self.label_codebook.size()
		num_features = self.feature_codebook.size()
		self.parameters = numpy.zeros((num_features + 1) * num_labels)
		self.training_data = instance_list
		self.compute_observed_counts(instance_list)
		init_point = numpy.zeros(num_labels * (num_features + 1))
		optimal_parameters, _, _ = fmin_l_bfgs_b(self.objective_function, init_point, fprime=self.gradient_function)
		print(optimal_parameters)
		self.parameters = optimal_parameters

	def to_dict(self):
		"""Convert the model into a dictionary.

		Keys: 'label_alphabet' and 'feature_alphabet' (the codebooks as
		dictionaries) and 'parameters' (the weight vector as a plain list).
		"""
		return dict(
			label_alphabet=self.label_codebook.to_dict(),
			feature_alphabet=self.feature_codebook.to_dict(),
			parameters=self.parameters.tolist(),
		)

	@classmethod
	def from_dict(cls, model_dictionary):
		"""Rebuild a MaxEnt model from the dictionary created by to_dict.

		Reads the keys 'label_alphabet', 'feature_alphabet' and
		'parameters'; the parameter list is turned back into a numpy
		array and stored where the inference code reads it
		(self.parameters).

		Raises:
			KeyError: if one of those keys is missing
		"""
		model_instance = MaxEnt()
		model_instance.label_codebook = Alphabet.from_dict(model_dictionary['label_alphabet'])
		model_instance.feature_codebook = Alphabet.from_dict(model_dictionary['feature_alphabet'])
		model_instance.parameters = numpy.array(model_dictionary['parameters'])

		return model_instance
Пример #8
0
Файл: hmm.py Проект: sdiao/CS134
class HMM(BaseClassifier):

	def __init__(self):
		"""Create an untrained model with empty label and feature codebooks."""
		self.label_codebook = Alphabet()
		self.feature_codebook = Alphabet()
		# feature-selection switches, read by dynamic_programming_on_trellis
		self.filter_feature1, self.filter_feature2 = True, False
		
	def _collect_counts(self, instance_list):
		"""Collect counts necessary for fitting parameters

		This function should update self.transtion_count_table
		and self.feature_count_table based on this new given instance
		
		Add your docstring here explaining how you implement this function

		Returns None
		"""
		#0B, 1I, 2O
		#self.transition_count_table
		#self.feature_count_table
		for instance in instance_list:
			#label[],data[]
			# for transition_count_table, we read label[], m*m
			# for feature_count_table, we read both label and data, to see how an observation is emitted from a certain state, p*m
			for i in range(len(instance.label)):
				self.feature_count_table[instance.data[i][0]][instance.label[i]] += 1
				self.feature_count_table[instance.data[i][1]][instance.label[i]] += 1
				if i == 0:
					self.initial_state_count_table[instance.label[i]] += 1
				elif i== len(instance.label)-1:
					self.termination_state_count_table[instance.label[i]] += 1
				else:
					self.transition_count_table[instance.label[i]][instance.label[i-1]] += 1  # easy for matrix multiplication




	def train(self, instance_list):
		"""Fit parameters for hidden markov model

		Update codebooks from the given data to be consistent with
		the probability tables 

		Transition matrix and emission probability matrix
		will then be populated with the maximum likelihood estimate 
		of the appropriate parameters

		Add your docstring here explaining how you implement this function

		Returns None
		"""
		# m states, q features

		self.transition_matrix = numpy.zeros((1,1))
		self.emission_matrix = numpy.zeros((1,1))
		# m*m
		self.transition_count_table = numpy.zeros((self.label_codebook.size(),self.label_codebook.size()))
		# q*m
		self.feature_count_table = numpy.zeros((self.feature_codebook.size(),self.label_codebook.size()))
		#a table to store each state at the begining of a sequence.it is used for calculating the initial states
		self.initial_state_count_table = numpy.zeros(self.label_codebook.size())
		self.termination_state_count_table = numpy.zeros(self.label_codebook.size())
		self._collect_counts(instance_list)
		#TODO: estimate the parameters from the count tables

		#Convert count tables into probability tables
		#SMOOTHING here
		self.initial_state_count_table=(self.initial_state_count_table+1)/(numpy.sum(self.initial_state_count_table)+3)
		self.termination_state_count_table=(self.termination_state_count_table+1)/(numpy.sum(self.termination_state_count_table)+3)
		# sum of each column, each column identifies the precious state, from previous state to current state
		self.transition_matrix = (self.transition_count_table+1)/(numpy.sum((self.transition_count_table+1),0)+3)
		# sum of each column, each column identifies the state, emit from state to observation
		self.emission_matrix = (self.feature_count_table+1)/(numpy.sum((self.feature_count_table+1),0)+3)




	def classify_instance(self, instance):
		"""Viterbi decoding algorithm

		Wrapper for running the Viterbi algorithm
		We can then obtain the best sequence of labels from the backtrace pointers matrix

		Add your docstring here explaining how you implement this function

		Returns a list of label indices e.g. [0, 1, 0, 3, 4]
		"""
		instance_size = len(instance.label)
		trellis, backtrace_pointers = self.dynamic_programming_on_trellis(instance, False)
		best_sequence = numpy.zeros(instance_size)
		best_sequence[-1] = numpy.argmax(trellis[:,-1])
		for i in range(instance_size-2,0,-1):
			best_sequence[i]=backtrace_pointers[best_sequence[i+1]][i+1]

		return best_sequence

	def compute_observation_loglikelihood(self, instance):
		"""Compute and return log P(X|parameters) = loglikelihood of observations"""
		trellis = self.dynamic_programming_on_trellis(instance, True)
		loglikelihood = numpy.log10(numpy.sum(trellis[:,-1]))
		return loglikelihood

	def dynamic_programming_on_trellis(self, instance, run_forward_alg=True):
		"""Run Forward algorithm or Viterbi algorithm

		This function uses the trellis to implement dynamic
		programming algorithm for obtaining the best sequence
		of labels given the observations

		Add your docstring here explaining how you implement this function

		Returns trellis filled up with the forward probabilities 
		and backtrace pointers for finding the best sequence
		"""
		#TODO:Initialize trellis and backtrace pointers 
		# trellis, m*t, m states, t sequence length, trellis[j][0] is the first element in the sequence. the index is tricky
		instance_size = len(instance.label)
		label_size = self.label_codebook.size()
		trellis = numpy.zeros((label_size,instance_size))# 3*t, if this instance's sequence is 10
		backtrace_pointers = numpy.zeros((label_size,instance_size))#3*t
		
		# Traversing the trellis from left to right
		# Initialization, fill in the first column, t=1, index 0
		if self.filter_feature1:
			trellis[:,0] = self.initial_state_count_table*self.emission_matrix[instance.data[0][1]]
		elif self.filter_feature2:
			trellis[:,0] = self.initial_state_count_table*self.emission_matrix[instance.data[0][0]]
		else:		
			trellis[:,0] = self.initial_state_count_table*self.emission_matrix[instance.data[0][0]]*self.emission_matrix[instance.data[0][1]]
		
		# Recursion
		for t in range(1, instance_size):
			if run_forward_alg:
				for i in range(label_size):
					trellis[:,t] += trellis[i][t-1]*self.transition_matrix[:,i]
			else:
				for j in range(label_size):
					candidate_pre_state = trellis[:,t-1]*self.transition_matrix[j]
					trellis[j][t] = numpy.max(candidate_pre_state)
					backtrace_pointers[j][t] = numpy.argmax(candidate_pre_state)

			# times the observation, 2 features, using the emission matrix
			trellis[:,t]=trellis[:,t]*self.emission_matrix[instance.data[t][0]]*self.emission_matrix[instance.data[t][1]]
		# Termination ????
		#alpha_F = numpy.argmax(self.termination_state_count_table)
		#P_O_Lambda = trellis[i][-1]*self.transition_matrix[i][alpha_F]

		return (trellis, backtrace_pointers)

	def train_semisupervised(self, unlabeled_instance_list, labeled_instance_list=None):
		"""Baum-Welch algorithm for fitting HMM from unlabeled data (EXTRA CREDIT)

		The algorithm first initializes the model with the labeled data if given.
		The model is initialized randomly otherwise. Then it runs 
		Baum-Welch algorithm to enhance the model with more data.

		Add your docstring here explaining how you implement this function

		Returns None
		"""
		if labeled_instance_list is not None:
			self.train(labeled_instance_list)
		else:
			#TODO: initialize the model randomly
			pass
		while True:
			#E-Step
			self.expected_transition_counts = numpy.zeros((1,1))
			self.expected_feature_counts = numpy.zeros((1,1))
			for instance in instance_list:
				(alpha_table, beta_table) = self._run_forward_backward(instance)
				#TODO: update the expected count tables based on alphas and betas
				#also combine the expected count with the observed counts from the labeled data
			#M-Step
			#TODO: reestimate the parameters
			if self._has_converged(old_likelihood, likelihood):
				break

	def _has_converged(self, old_likelihood, likelihood):
		"""Determine whether the parameters have converged or not (EXTRA CREDIT)

		Placeholder implementation: both arguments are ignored and the
		method always returns True.

		Args:
			old_likelihood: unused; presumably the likelihood before the
				latest iteration - TODO confirm once implemented
			likelihood: unused; presumably the likelihood after the latest
				iteration - TODO confirm once implemented

		Returns True if the parameters have converged.	
		"""
		return True

	def _run_forward_backward(self, instance):
		"""Forward-backward algorithm for HMM using trellis (EXTRA CREDIT)
	
		Fill up the alpha and beta trellises (the same notation as 
		presented in the lecture and Martin and Jurafsky)
		You can reuse your forward algorithm here

		return a tuple of tables consisting of alpha and beta tables
		"""
		alpha_table = numpy.zeros((1,1))
		beta_table = numpy.zeros((1,1))
		#TODO: implement forward backward algorithm right here

		return (alpha_table, beta_table)

	def to_dict(self):
		"""Convert HMM instance into a dictionary representation

		The implementation of this should be in sync with from_dict function.
		You should be able to use these two functions to convert the model into
		either representation (object or dictionary)
		"""
		model_dict = {
			'label_alphabet': label_codebook.to_dict(),
			'feature_alphabet': feature_codebook.to_dict()
		}
		return model_dict


	def test_hmm(self):
		"""Manual smoke test: preprocess the dataset, train and print the tables.

		Builds a PreProcessor with its default dataset, takes over its
		codebooks, trains on all instances, prints the count and probability
		tables and finally classifies up to the first ten instances.
		"""
		pp = PreProcessor()
		pp.test_preprocess()
		instance_list = pp.get_instance_list()
		self.label_codebook = pp.get_label_codebook()
		self.feature_codebook = pp.get_feature_codebook()
		self.train(instance_list)
		print("\ntransition_count_table--------------------")
		print(self.transition_count_table)
		print("\ntransition_matrix-------------------------")
		print(self.transition_matrix)
		print("\ninitial_state_count_table------------------")
		print(self.initial_state_count_table)
		print("\ntermination_state_count_table------------------")
		print(self.termination_state_count_table)
		print("\nemission matrix----------------------------")
		print(self.emission_matrix)

		# a slice instead of range(10): no IndexError on fewer than ten instances
		for instance in instance_list[:10]:
			self.test_classify_instance(instance)

		


		
		

		


	def test_forward(self, instance):
		print "run forward algorithm, print trellis----------"
		print "instance used for test: "
		print instance.label
		trellis = self.dynamic_programming_on_trellis(instance,True)
		print trellis
		print "------forward done---------------------------"

	def test_viterbi(self, instance):
		print "run vertibi algorithm, print trellis----------"
		print "instance used for test: "
		print instance.label
		trellis,backtrace_pointers =self.dynamic_programming_on_trellis(instance,False)
		print trellis
		print backtrace_pointers
		print "------vertibi done---------------------------"



	def test_classify_instance(self,instance):
		print "test classify instance ----------------------"
		print "instance used for test: "
		print instance.label
		best_sequence = self.classify_instance(instance)
		print "best sequence:"
		print best_sequence
		
		counter = 0
		for i in range(len(instance.label)-1):
			if best_sequence[i]==instance.label[i]:
				counter +=1

		print "single run accurac: "+str(float(counter)/float(len(instance.label)-1))
		print "classify instance done ----------------------------"



	@classmethod
	def from_dict(cls, model_dict):
		"""Convert a dictionary into HMM instance

		The implementation of this should be in sync with to_dict function.

		Args:
			model_dict: dictionary representation of a model. The codebooks
				are read from 'label_alphabet' and 'feature_alphabet'; the
				tables 'transition_matrix', 'emission_matrix',
				'initial_state_count_table' and
				'termination_state_count_table' are restored as numpy
				arrays. Every key is optional; missing entries keep the
				state of a freshly constructed model.

		Returns:
			A new model instance.
		"""
		model_instance = cls()
		if 'label_alphabet' in model_dict:
			model_instance.label_codebook = Alphabet.from_dict(model_dict['label_alphabet'])
		if 'feature_alphabet' in model_dict:
			model_instance.feature_codebook = Alphabet.from_dict(model_dict['feature_alphabet'])
		table_names = (
			'transition_matrix',
			'emission_matrix',
			'initial_state_count_table',
			'termination_state_count_table',
		)
		for table_name in table_names:
			if table_name in model_dict:
				setattr(model_instance, table_name, numpy.array(model_dict[table_name]))
		return model_instance
Пример #9
0
class PreProcessor(object):
    #np_chunking_wsj_15_18_train
    def __init__(self, dataset = "C:\\Users\\DIAOSHUO\\Dropbox\\SNLP\\cs134assn2\\np_chunking_wsj_15_18_train"):
        """Set up an empty preprocessor for one dataset file.

        Args:
            dataset: path of the dataset file; stored as self.dataset_path.
        """
        self.dataset_path = dataset
        # both codebooks start empty and are filled on first request
        self.label_codebook = Alphabet()
        self.feature_codebook = Alphabet()
        self.instance_list = list()

    
    def get_label_codebook(self):
        """Return the self.label_codebook"""
        if self.label_codebook.size()==0:
            self._make_label_codebook()
        
        return self.label_codebook

    def _make_label_codebook(self):
        """
        populate the label_codebook according to the dataset.
        For PA2, we populate it manually since there are only 3 labels.
        For more dataset with more labels, we should populate label_codebook via reading all data points.
        """
        self.label_codebook.add("B")
        self.label_codebook.add("I")
        self.label_codebook.add("O")

    def _make_feature_codebook(self):
        """
        Parse the training data set and build the feature_codebook
        Dataset: a single file, with a HUGE number of lines, each line is a hidden state and an observation. The observation has two features
        """
        instance_counter = 0
        dataset = self.dataset_path
        f = open(dataset,'r')
        #reading file line by line in this way is memory efficient
        for line in f:
            if line == "\n":
                instance_counter+=1
            else:
                parser = line.split()
                #parser has 3 element: label, feature1, feature2
                self.feature_codebook.add(parser[1])
                self.feature_codebook.add(parser[2])

        f.close()
        print "total num of sentences: "+str(instance_counter)




    def get_feature_codebook(self):
        if self.feature_codebook.size()==0:
            self._make_feature_codebook()

        return self.feature_codebook

    def _make_instance_list(self, dataset):
        """
        Convert a raw dataset file into instances and append them to self.instance_list.

        Both codebooks are requested first, so they are built if still empty.
        Every non-blank line is split into label, feature1 and feature2 and
        stored as codebook indices. A blank (or whitespace-only) line ends
        the current sentence. A sentence that is not followed by a blank
        line at the end of the file is stored as well, and consecutive blank
        lines do not produce empty instances.

        Args:
            dataset: path of the dataset file.
        Return:
            None. Each stored Instance receives a running number starting at
            1, the list of label indices, the list of [feature1, feature2]
            index pairs and the raw text of its lines.
        Raises:
            IOError: if the dataset file cannot be opened.
            IndexError: if a non-blank line has fewer than three fields.
        """
        # ensure feature_codebook and label_codebook already exist
        self.get_feature_codebook()
        self.get_label_codebook()

        label = []
        data = []
        raw_lines = []
        instance_counter = 0
        # the with-block closes the file even when a line fails to parse
        with open(dataset, 'r') as f:
            for line in f:
                parser = line.split()
                if parser:
                    # part of the current sentence; indices follow the codebooks
                    label.append(self.label_codebook.get_index(parser[0]))
                    data.append([
                        self.feature_codebook.get_index(parser[1]),
                        self.feature_codebook.get_index(parser[2]),
                    ])
                    raw_lines.append(line)
                elif label:
                    # end of a sentence: store it and start fresh containers
                    instance_counter += 1
                    self.instance_list.append(Instance(instance_counter, label, data, "".join(raw_lines)))
                    label, data, raw_lines = [], [], []

        if label:
            # last sentence of a file that does not end with a blank line
            instance_counter += 1
            self.instance_list.append(Instance(instance_counter, label, data, "".join(raw_lines)))


    def get_instance_list(self, dataset = "C:\\Users\\DIAOSHUO\\Dropbox\\SNLP\\cs134assn2\\np_chunking_wsj_15_18_train"):
        if len(self.instance_list) ==0:
            self._make_instance_list(dataset)

        
        return self.instance_list


        

        

        

    def test_preprocess(self):
        self.make_label_codebook()
        print "label_codebook size: "+str(self.label_codebook.size())
        self.make_feature_codebook()
        print "feature_codebook size: "+str(self.feature_codebook.size())
        self.make_instance_list()
        print "instance_list size: "+str(len(self.instance_list))