def from_dict(cls, model_dictionary):
    """Rebuild a MaxEnt model from its dictionary representation.

    Args:
        model_dictionary: mapping that provides the entries
            'label_alphabet', 'feature_alphabet' and 'parameters'.

    Returns:
        A new MaxEnt instance populated from ``model_dictionary``.

    Raises:
        KeyError: if one of the three entries is missing.
    """
    # The original body read from an undefined name (``model_dict``) instead
    # of the actual parameter, which raised NameError on every call.
    model_instance = MaxEnt()
    model_instance.label_codebook = Alphabet.from_dict(
        model_dictionary['label_alphabet'])
    model_instance.feature_codebook = Alphabet.from_dict(
        model_dictionary['feature_alphabet'])
    # NOTE(review): the weights are stored under ``p_x_given_y_table``; a
    # MaxEnt model would normally be expected to expose ``parameters`` --
    # confirm which attribute the rest of the class reads.
    model_instance.p_x_given_y_table = numpy.array(
        model_dictionary['parameters'])
    return model_instance
def from_dict(cls, model_dict):
    """Create a MaxEnt model from a plain dictionary.

    The "label_alphabet" and "feature_alphabet" entries are turned back into
    Alphabet objects and the "parameters" entry becomes a numpy array.
    """
    restored = MaxEnt()
    label_book = Alphabet.from_dict(model_dict["label_alphabet"])
    restored.label_codebook = label_book
    feature_book = Alphabet.from_dict(model_dict["feature_alphabet"])
    restored.feature_codebook = feature_book
    weights = numpy.array(model_dict["parameters"])
    restored.parameters = weights
    return restored
def bag_of_words(data, label_codebook, feature_codebook, theta):
    """Turn labelled documents into binary bag-of-words vectors.

    Args:
        data: dict mapping each label to a list of document strings.
        label_codebook: Alphabet that receives every label (mutated).
        feature_codebook: Alphabet that receives the selected words (mutated).
        theta: number of leading entries of the vocabulary frequency table
            to discard before the remaining words become features.

    Returns:
        dict mapping each label to a list of numpy vectors, one per
        document, with 1.0 at the index of every feature word that occurs
        in the document.
    """
    token_pattern = r"\w+"  # raw string: "\w" is an invalid str escape
    stopset = set(stopwords.words('english'))

    # First pass: register the labels and collect the non-stopword vocabulary.
    word_dict = Alphabet()
    for label, documents in data.items():
        label_codebook.add(label)
        for doc in documents:
            for word in set(nltk.regexp_tokenize(doc, pattern=token_pattern)):
                if word not in stopset:
                    word_dict.add(word)

    all_words = list(word_dict._label_to_index.keys())
    # NOTE(review): the frequency table is built from the de-duplicated
    # vocabulary, so every word has count 1 and ``theta`` merely drops the
    # first ``theta`` keys rather than filtering by corpus frequency --
    # confirm whether real counts were intended.
    fdict = FreqDist(all_words)
    # list(...) keeps the slice valid when keys() is a view; the set makes
    # the membership test below O(1) instead of O(len(vocabulary)).
    word_feature = set(list(fdict.keys())[theta:])
    for word in all_words:
        if word in word_feature:
            feature_codebook.add(word)

    # Second pass: one binary vector per document.
    instance_list = {}
    for label, documents in data.items():
        vectors = []
        for document in documents:
            vector = np.zeros(feature_codebook.size())
            tokens = set(nltk.regexp_tokenize(document, pattern=token_pattern))
            for word in tokens:
                if feature_codebook.has_label(word):
                    vector[feature_codebook.get_index(word)] = 1.
            vectors.append(vector)
        instance_list[label] = vectors
    return instance_list
def __init__(self, dataset = "C:\\Users\\DIAOSHUO\\Dropbox\\SNLP\\cs134assn2\\np_chunking_wsj_15_18_train"):
    """Set up an empty reader for the given dataset.

    ``dataset`` is stored as ``dataset_path``; both codebooks start as
    empty Alphabet objects and no instances are loaded yet.
    """
    self.instance_list = []
    self.feature_codebook = Alphabet()
    self.label_codebook = Alphabet()
    self.dataset_path = dataset
def deserialize(self, fname):
    """Restore the model state that was pickled into ``fname``.

    The pickled dictionary supplies the weights, both alphabets (rebuilt
    through ``Alphabet.from_dict``), the feature generator list and the
    decay flag.
    """
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with open(fname, 'rb') as handle:
        state = cPickle.load(handle)
    self.weights = state['weights']
    feature_alphabet = Alphabet.from_dict(state['feat_alph'])
    self.feature_alphabet = feature_alphabet
    label_alphabet = Alphabet.from_dict(state['label_alph'])
    self.label_alphabet = label_alphabet
    self.feature_generator_list = state['features']
    self.decay = state['decay']
def __init__(self, original_data, tree_functions, features, no_tag=False):
    """Store the raw data and the functions used to featurize it.

    Args:
        original_data: sized collection of items to featurize.
        tree_functions: callables stored as ``self.tree_functions``.
        features: feature callables; they are numbered from 1.
        no_tag: stored as ``self.no_tag``.
    """
    self.tree_functions = tree_functions
    # A real list of (1-based id, function) pairs.  A bare zip() is a
    # one-shot, always-truthy iterator on Python 3, which would leave the
    # pairs empty after the first pass over them.
    self.feature_functions = list(enumerate(features, 1))
    self.no_tag = no_tag
    self.original_data = original_data
    self.value_alphabet = Alphabet()
    # SVMlight doesn't like 0 value for features, so index 0 is reserved.
    self.value_alphabet.add("__NULL__")
    self.percent_buffer = int(len(self.original_data) * .18)
def __init__(self, feature_generator_list, decay = False):
    """Create an untrained model around the given feature generators.

    Weights start as None; the learning rate (0.0001) and the number of
    iterations (10) are fixed here.
    """
    # hyper-parameters
    self.learning_rate = 0.0001
    self.num_iterations = 10
    self.decay = decay

    # model state
    self.feature_generator_list = feature_generator_list
    self.feature_alphabet = Alphabet()
    self.label_alphabet = Alphabet()  # you will need this if you use labeled arc
    self.weights = None
    self.caches = {}
def __init__(self):
    """Create the classifier with empty label and feature codebooks."""
    label_book, feature_book = Alphabet(), Alphabet()
    self.label_codebook = label_book
    self.feature_codebook = feature_book
def __init__(self, data, feature_function):
    """Remember the labelled data and the feature extractor.

    ``data`` is a dictionary mapping labels to lists of strings with that
    label; ``feature_function`` produces a list of feature values from a
    string.  Both codebooks start out empty.
    """
    self.feature_function = feature_function
    self.data = data
    feature_book = Alphabet()
    self.feature_codebook = feature_book
    label_book = Alphabet()
    self.label_codebook = label_book
def from_dict(cls, model_dictionary):
    """Build a MaxEnt model out of the dictionary produced by to_dict.

    Reads the 'labalph', 'feaalph', 'gpv' and 'param' entries.
    """
    model = MaxEnt()
    label_alphabet = Alphabet.from_dict(model_dictionary['labalph'])
    model.label_alphabet = label_alphabet
    feature_alphabet = Alphabet.from_dict(model_dictionary['feaalph'])
    model.feature_alphabet = feature_alphabet
    model.gaussian_prior_variance = model_dictionary['gpv']
    model.parameters = model_dictionary['param']
    return model
def from_dict(cls, model_dict):
    """Rebuild a NaiveBayes model from its dictionary form.

    Must stay in sync with to_dict: the alphabets come from
    'label_alphabet' / 'feature_alphabet' and the two probability tables
    are converted to numpy arrays.
    """
    classifier = NaiveBayes()
    label_book = Alphabet.from_dict(model_dict['label_alphabet'])
    classifier.label_codebook = label_book
    feature_book = Alphabet.from_dict(model_dict['feature_alphabet'])
    classifier.feature_codebook = feature_book
    likelihoods = numpy.array(model_dict['p_x_given_y_table'])
    classifier.p_x_given_y_table = likelihoods
    priors = numpy.array(model_dict['p_y_table'])
    classifier.p_y_table = priors
    return classifier
def __init__(self):
    """Create an untrained MaxEnt model.

    The codebooks start empty and the parameter list is empty; both are
    filled in by the train function.  The Gaussian prior variance is 1.0.
    """
    super(MaxEnt, self).__init__()
    label_book = Alphabet()
    self.label_codebook = label_book
    feature_book = Alphabet()
    self.feature_codebook = feature_book
    self.parameters = list()
    self.gaussian_prior_variance = 1.0
def from_dict(model_dict):
    """Rebuild an HMM from its dictionary form (counterpart of to_dict).

    The alphabets are restored with Alphabet.from_dict; the transition
    matrix, emission matrix and initial probabilities become numpy arrays.
    """
    model = HMM()
    label_alphabet = Alphabet.from_dict(model_dict['label_alphabet'])
    model.label_alphabet = label_alphabet
    feature_alphabet = Alphabet.from_dict(model_dict['feature_alphabet'])
    model.feature_alphabet = feature_alphabet
    transitions = numpy.array(model_dict['transition_matrix'])
    model.transition_matrix = transitions
    emissions = numpy.array(model_dict['emission_matrix'])
    model.emission_matrix = emissions
    initial = numpy.array(model_dict['initial_probability'])
    model.initial_probability = initial
    return model
def from_dict(cls, model_dict):
    """Rebuild a NaiveBayes model from the dictionary written by to_dict.

    Restores both codebooks plus the count tables ('#x&y', '#y') and the
    probability tables ('_x|y_table', 'p_y_table') exactly as stored.
    """
    classifier = NaiveBayes()
    label_book = Alphabet.from_dict(model_dict['label_alphabet'])
    classifier.label_codebook = label_book
    feature_book = Alphabet.from_dict(model_dict['feature_alphabet'])
    classifier.feature_codebook = feature_book
    # count tables
    classifier.count_x_y_table = model_dict['#x&y']
    classifier.count_y_table = model_dict['#y']
    # probability tables
    classifier.p_x_given_y_table = model_dict['_x|y_table']
    classifier.p_y_table = model_dict['p_y_table']
    return classifier
def __init__(self, gaussian_prior_variance = 1):
    """Create an untrained MaxEnt model.

    The label and feature alphabets start empty, the parameter vector is
    an empty numpy array and no observed feature counts are cached yet.
    """
    super(MaxEnt, self).__init__()
    self.gaussian_prior_variance = gaussian_prior_variance
    label_alphabet = Alphabet()
    self.label_alphabet = label_alphabet
    feature_alphabet = Alphabet()
    self.feature_alphabet = feature_alphabet
    empty_parameters = numpy.array([])
    self.parameters = empty_parameters
    self.feature_counts = None
def __init__(self, original_data, tree_functions, features, no_tag=False):
    """Store the raw data and the functions used to featurize it.

    Args:
        original_data: sized collection of items to featurize.
        tree_functions: callables stored as ``self.tree_functions``.
        features: feature callables; they are numbered from 1.
        no_tag: stored as ``self.no_tag``.
    """
    self.tree_functions = tree_functions
    # A real list of (1-based id, function) pairs.  A bare zip() is a
    # one-shot, always-truthy iterator on Python 3, which would leave the
    # pairs empty after the first pass over them.
    self.feature_functions = list(enumerate(features, 1))
    self.no_tag = no_tag
    self.original_data = original_data
    self.value_alphabet = Alphabet()
    # SVMlight doesn't like 0 value for features, so index 0 is reserved.
    self.value_alphabet.add("__NULL__")
    self.percent_buffer = int(len(self.original_data)*.18)
def __init__(self, aligned_sentences, max_iterations=-1, eta=None):
    """
    :param aligned_sentences: a list of tuples of aligned sentences
    :param max_iterations: the number of iterations to run EM; -1 means
        "run until converged"
    :param eta: the value that the delta of the EM probabilities must fall
        below to be considered converged; defaults to a simple heuristic
        of one hundredth of the number of sentence pairs
    """
    self.aligned_sentences = aligned_sentences
    self.e_alphabet = Alphabet()
    self.f_alphabet = Alphabet()
    self.eta = len(aligned_sentences)/100. if eta is None else eta
    self.max_iterations = max_iterations
    # Pick the loop-control predicate once, up front.
    run_until_converged = max_iterations == -1
    self.do_more = (self.has_converged if run_until_converged
                    else self.stop_iterations)
def bag_of_words_withTrigram(data, label_codebook, feature_codebook, theta):
    """Build sparse (index-list) vectors over unigram and trigram features.

    Args:
        data: dict mapping each label to a list of document strings.
        label_codebook: Alphabet that receives every label (mutated).
        feature_codebook: Alphabet that receives every vocabulary word and
            every trigram of three consecutive vocabulary entries (mutated).
        theta: unused; kept for signature compatibility with bag_of_words.

    Returns:
        dict mapping each label to a list of integer numpy arrays, one per
        document, holding the codebook indices of the features that fired.
    """
    token_pattern = r"\w+"  # raw string: "\w" is an invalid str escape
    stopset = set(stopwords.words('english'))

    word_dict = Alphabet()
    for label, documents in data.items():
        label_codebook.add(label)
        for doc in documents:
            for word in set(nltk.regexp_tokenize(doc, pattern=token_pattern)):
                if word not in stopset:
                    word_dict.add(word)

    # Materialise the keys: the code below indexes into the vocabulary.
    all_words = list(word_dict._label_to_index.keys())
    # NOTE(review): trigrams are formed from neighbouring *vocabulary*
    # entries and later matched against an unordered token set, not against
    # the running text -- confirm this is the intended feature definition.
    for i, word in enumerate(all_words):
        feature_codebook.add(word)
        if i + 2 < len(all_words):
            # join() instead of str(a + " " + b + " " + c): str() raised on
            # non-ASCII unicode tokens under Python 2.
            feature_codebook.add(" ".join(all_words[i:i + 3]))

    instance_list = {}
    for label, documents in data.items():
        vectors = []
        for document in documents:
            tokens = list(set(nltk.regexp_tokenize(document,
                                                   pattern=token_pattern)))
            # A plain list: the original called .append on a numpy array,
            # which raises AttributeError as soon as one feature matches.
            indices = []
            for i, word in enumerate(tokens):
                if feature_codebook.has_label(word):
                    indices.append(feature_codebook.get_index(word))
                if i + 2 < len(tokens):
                    trigram = " ".join(tokens[i:i + 3])
                    if feature_codebook.has_label(trigram):
                        # Look up the trigram that was just tested; the
                        # original rebuilt it from all_words[i + 1] and
                        # all_words[i + 2], i.e. a different string.
                        indices.append(feature_codebook.get_index(trigram))
            vectors.append(np.array(indices, dtype=int))
        instance_list[label] = vectors
    return instance_list
def __init__(self, data, seeds, patterns=None, processing=1):
    """Prepare the bootstrapper for raw (processing=0) or chunked
    (processing=1) input.

    Raw input is tokenized and POS-tagged here; chunked input is stored as
    is.  In both cases the matching find_patterns / find_seeds
    implementations are bound to the instance.  The seeds initialise the
    permanent lexicon and, each with an empty pattern set, the temporary
    lexicon.  Any ``patterns`` given are registered in the pattern
    alphabet.
    """
    if processing == 0:
        self.pos_tagged_data = self.pos_tag(self.tokenize(data))
        self.find_patterns = self.find_patterns_tagged
        self.find_seeds = self.find_seeds_tagged
    elif processing == 1:
        self.chunked_data = data
        self.find_patterns = self.find_patterns_chunked
        self.find_seeds = self.find_seeds_chunked

    self.permanent_lexicon = set(seeds)
    self.temporary_lexicon = defaultdict(set)
    for seed in seeds:
        self.temporary_lexicon[seed] = set()

    self.best_extraction_patterns = set()
    self.pattern_alphabet = Alphabet()
    if patterns is not None:
        register = self.pattern_alphabet.add
        for pattern in patterns:
            register(pattern)

    # Counters and score arrays are created later, once patterns are known.
    self.n_counter_sets = None  # important for getting candidate seeds
    self.f_counter_sets = None
    self.n_pattern_array = None
    self.f_pattern_array = None
    self.first_pattern_words = set()
def __init__(self):
    """Create empty codebooks and set the feature-selection switches."""
    label_book, feature_book = Alphabet(), Alphabet()
    self.label_codebook = label_book
    self.feature_codebook = feature_book
    # feature-selection flags: the first filter is on, the second is off
    self.filter_feature1, self.filter_feature2 = True, False
class Featurizer:
    """Turn relation instances into feature rows and write them to disk.

    Rows are built either as plain string features (``build_mallet_features``)
    or as tree/vector rows delimited by ``|BT|``, ``|ET|`` and ``|EV|``
    markers (``build_features``), which can then be split into one binary
    training set per relation class.
    """

    RELATION_CLASSES = {
        "PHYS", "PER-SOC", "OTHER-AFF", "GPE-AFF", "DISC", "ART", "EMP-ORG",
        "no_rel"
    }

    def __init__(self, original_data, tree_functions, features, no_tag=False):
        """Store the raw data and the functions used to featurize it.

        Args:
            original_data: sized collection of items to featurize.
            tree_functions: callables whose result offers ``_pprint_flat``.
            features: feature callables; they are numbered from 1.
            no_tag: when true, rows do not start with the relation type.
        """
        self.tree_functions = tree_functions
        # A real list of (1-based id, function) pairs: it is iterated once
        # per data item and truth-tested in build_features, neither of which
        # works with a one-shot zip() iterator on Python 3.
        self.feature_functions = list(enumerate(features, 1))
        self.no_tag = no_tag
        self.original_data = original_data
        self.value_alphabet = Alphabet()
        # SVMlight doesn't like 0 value for features, so index 0 is reserved.
        self.value_alphabet.add("__NULL__")
        self.percent_buffer = int(len(self.original_data) * .18)

    def build_mallet_features(self):
        """Build one row of whitespace-free feature strings per data item."""
        self.new_features = []
        for feats in self.original_data:
            # Whitespace is stripped from every value, just to be sure.
            new_row = [re.sub(r"\s", "", func(feats))
                       for _, func in self.feature_functions]
            self.new_features.append(new_row)

    def build_features(self):
        """Build one tree/vector row per data item into ``self.new_features``.

        Each row is ``[relation_type] (|BT| tree)* |ET| (id:value)* |EV|``,
        encoded to UTF-8.  Feature cells are ``name=value`` strings; cells
        whose value is "False" are left out.
        """
        self.new_features = []
        for feats in self.original_data:
            new_row = []
            if not self.no_tag:
                new_row.append(feats.relation_type)
            for func in self.tree_functions:
                new_row.append("|BT|")
                new_row.append(func(feats)._pprint_flat('', '()', False))
            # NOTE(review): the source had lost its indentation; |ET| is
            # placed once after all trees and |EV| only when vector features
            # exist -- confirm against the expected input format.
            new_row.append("|ET|")
            if self.feature_functions:
                for i, func in self.feature_functions:
                    value = func(feats).split("=")[1]
                    if value == "False":
                        continue
                    try:
                        value_index = self.value_alphabet.get_index(value)
                    except KeyError:
                        # First time this value is seen: register it.
                        self.value_alphabet.add(value)
                        value_index = self.value_alphabet.get_index(value)
                    new_row.append("{:d}:{:d}".format(i, value_index))
                new_row.append("|EV|")
            new_row = [s.encode("utf-8") for s in new_row]
            self.new_features.append(new_row)

    def build_relation_class_vectors(self):
        """Create a one-vs-rest training set for every relation class.

        For each class, rows whose first cell starts with the class name
        become "+1" examples.  Other rows become "-1" examples until more
        than ``percent_buffer`` negatives have been emitted; later negatives
        are skipped.  Results go to ``self.all_vectors[relation_class]``.
        """
        self.all_vectors = defaultdict(list)
        for relation_class in self.RELATION_CLASSES:
            no_rel_seen = 0
            vector_append = self.all_vectors[relation_class].append
            for row in self.new_features:
                if row[0].startswith(relation_class):
                    vector_append(["+1"] + row[1:])
                elif no_rel_seen <= self.percent_buffer:
                    vector_append(["-1"] + row[1:])
                    no_rel_seen += 1
                # Otherwise: negative quota used up, skip the row.  The
                # previous code fell through here and re-appended the row
                # left over from an earlier iteration.

    def write_multiple_vectors(self, basedir, file_suffix):
        """Write each class's vectors to ``<basedir>/<relation>-<suffix>``."""
        for relation, feature_vectors in self.all_vectors.items():
            out_path = os.path.join(basedir,
                                    "{}-{}".format(relation, file_suffix))
            with open(out_path, "w") as f_out:
                for row in feature_vectors:
                    f_out.write("{}\n".format(" ".join(row)))

    def write_no_tag(self, basedir, file_suffix):
        """Write ``self.new_features`` to ``<basedir>/<file_suffix>``."""
        with open(os.path.join(basedir, file_suffix), "w") as f_out:
            for row in self.new_features:
                f_out.write("{}\n".format(" ".join(row)))
class MutualBootStrapper:
    """Grow a lexicon and a set of extraction patterns from seed words.

    The inner loop (``run_mutual_bootstrapping``) alternates between
    collecting context patterns around known lexicon entries and collecting
    the words those patterns extract.  The outer step
    (``run_meta_bootstrapping``) promotes the five best-scoring candidates
    to the permanent lexicon and resets the temporary one.

    NOTE(review): the source of this class had lost its indentation; the
    nesting below is the reading that keeps every statement reachable and
    should be confirmed against the original file.
    """

    def __init__(self, data, seeds, patterns=None, processing=1):
        """Set up the data view and the lexicons.

        With ``processing == 0`` the data is tokenized and POS-tagged here
        and the ``*_tagged`` methods are bound as ``find_patterns`` /
        ``find_seeds``; with ``processing == 1`` the data is stored as
        chunked data and the ``*_chunked`` methods are bound.  Any other
        value binds neither.
        """
        if processing == 0:
            tokenized = self.tokenize(data)
            self.pos_tagged_data = self.pos_tag(tokenized)
            self.find_patterns = self.find_patterns_tagged
            self.find_seeds = self.find_seeds_tagged
        elif processing == 1:
            self.chunked_data = data
            self.find_patterns = self.find_patterns_chunked
            self.find_seeds = self.find_seeds_chunked
        self.permanent_lexicon = set(seeds)
        # Maps each lexicon entry to the set of pattern indices that
        # extracted it.
        self.temporary_lexicon = defaultdict(set)
        for s in seeds:
            self.temporary_lexicon[s] = set()
        # Indices (into pattern_alphabet) of the patterns selected so far.
        self.best_extraction_patterns = set()
        self.pattern_alphabet = Alphabet()
        if patterns is not None:
            for p in patterns:
                self.pattern_alphabet.add(p)
        self.n_counter_sets = None # important for getting candidate seeds
        self.f_counter_sets = None
        self.n_pattern_array = None
        self.f_pattern_array = None
        self.first_pattern_words = set()

    def tokenize(self, text):
        """Tokenize every entry of ``text`` into sentences of words."""
        print "tokenizing...",
        all_entries = []
        for entry in text:
            tokenized_entry = self._nested_tokenize(entry)
            all_entries.append(tokenized_entry)
        print "[DONE]"
        return all_entries

    def _nested_tokenize(self, untokenized_sentences):
        """Return a list of sentences, each a list of post-processed words."""
        tokenized_sents = nltk.sent_tokenize(untokenized_sentences)
        tokenized_words = [nltk.word_tokenize(sent) for sent in tokenized_sents]
        self._postprocess_tokenized_text(tokenized_words)
        return tokenized_words

    def _postprocess_tokenized_text(self, tokenized):
        """Lower-case every word in place and pad slashes with spaces."""
        for i,sent in enumerate(tokenized):
            for j,word in enumerate(sent):
                tokenized[i][j] = word.lower()
                if "/" in word:
                    # NOTE(review): this substitutes on ``word`` (the
                    # original casing), so words containing "/" end up not
                    # lower-cased -- confirm whether that is intended.
                    tokenized[i][j] = re.sub(r"/", r" / ", word) #mutating the list

    def pos_tag(self, tokenized_data):
        """POS-tag every sentence, prefixing each with a <START> marker."""
        print "POS tagging... ",
        pos_tagged_data = []
        for entry in tokenized_data:
            new_entry = []
            for sentence in entry:
                tagged = [("<START>", "<START>")]
                tagged.extend(nltk.pos_tag(sentence))
                new_entry.append(tagged)
            pos_tagged_data.append(new_entry)
        print "[DONE]"
        return pos_tagged_data

    def build_patterns_tagged(self, sentence, index, size):
        """Register the word windows around ``sentence[index]`` as patterns.

        The word at ``index`` is replaced by the placeholder "<x>" and a
        window of ``size + 1`` tokens is slid across it; every window longer
        than one token is added to the pattern alphabet, and its first
        non-placeholder word is remembered in ``first_pattern_words``.
        """
        window_start = index-size
        window_end = index+1
        sentence_copy = list(sentence)
        # The trailing comma makes this a 1-tuple, so the zip(*...) below can
        # treat it like the (word, tag) pairs around it.
        sentence_copy[index] = "<x>",
        while window_start <= index:
            # this isn't quite right
            # NOTE(review): window_start can be negative here, in which case
            # the slice is taken relative to the end of the sentence.
            try:
                candidate = zip(*sentence_copy[window_start:window_end])[0]
            except IndexError:
                # Empty window: zip(*[]) is empty and has no element 0.
                candidate = []
            if len(candidate) > 1:
                self.pattern_alphabet.add(tuple(candidate))
                if candidate[0] != "<x>":
                    self.first_pattern_words.add(candidate[0])
                else:
                    self.first_pattern_words.add(candidate[1])
            window_start += 1
            window_end += 1

    def find_patterns_tagged(self):
        """Collect patterns around every lexicon word in the tagged data."""
        for entry in self.pos_tagged_data:
            for sentence in entry:
                for i,(word,tag) in enumerate(sentence):
                    if word in self.temporary_lexicon:
                        self.build_patterns_tagged(sentence, i, 2)
                        self.build_patterns_tagged(sentence, i, 1)

    def find_patterns_chunked(self):
        """Collect patterns around every chunk whose head is in the lexicon."""
        for entry in self.chunked_data:
            for sentence in entry:
                for i,word in enumerate(sentence):
                    if isinstance(word, Chunk) and word.head in self.temporary_lexicon:
                        self.build_patterns_chunked(sentence, i, 2)
                        self.build_patterns_chunked(sentence, i, 1)

    def build_patterns_chunked(self, sentence, index, size):
        """Register the token windows around the chunk at ``index``.

        The chunk is replaced by "<x>", the remaining chunks are flattened
        to their tokens, and every window longer than one token is added to
        the pattern alphabet.
        """
        sentence_copy = list(sentence)
        # 1-tuple, so _flatten_chunks can take element [0] of it.
        sentence_copy[index] = "<x>",
        sentence_copy = self._flatten_chunks(sentence_copy)
        # Flattening shifts positions, so locate the placeholder again.
        index = sentence_copy.index("<x>")
        window_start = index-size
        window_end = index+1
        while window_start <= index:
            candidate = sentence_copy[window_start:window_end]
            if len(candidate) > 1:
                self.pattern_alphabet.add(tuple(candidate))
            window_start += 1
            window_end += 1

    def _flatten_chunks(self, sentence):
        """Return the sentence as a flat list of tokens.

        Chunk constituents contribute all their tokens; any other
        constituent contributes its first element.
        """
        flattened_sentence = []
        for constituent in sentence:
            if isinstance(constituent, Chunk):
                flattened_sentence.extend(constituent.tokens)
            else:
                flattened_sentence.append(constituent[0])
        return flattened_sentence

    def set_counter_arrays(self):
        """Create one empty N set and one empty F set per known pattern."""
        # The inner lists are shared references, but map(set, ...) builds a
        # distinct set from each, so the counters are independent.
        tmp_lst = [[]] * self.pattern_alphabet.size() # must be careful about pointers here
        self.n_counter_sets = map(set, tmp_lst)
        self.f_counter_sets = map(set, tmp_lst)

    def find_seeds_chunked(self):
        """Match the known patterns against every chunk in the data."""
        for entry in self.chunked_data:
            for sentence in entry:
                for i in range(len(sentence)):
                    if isinstance(sentence[i], Chunk):
                        self.match_pattern_chunked(sentence, i, 2)
                        self.match_pattern_chunked(sentence, i, 1)

    def match_pattern_chunked(self, sentence, index, size):
        """Record the head of the chunk at ``index`` for each matching pattern.

        Every window around the chunk that is a known pattern adds the head
        to that pattern's N set, and also to its F set when the head is not
        yet in the temporary lexicon.  Heads of two characters or fewer are
        ignored.
        """
        candidate_seed = sentence[index].head
        sentence_copy = list(sentence)
        sentence_copy[index] = "<x>",
        sentence_copy = self._flatten_chunks(sentence_copy)
        index = sentence_copy.index("<x>")
        window_start = index-size
        window_end = index+1
        while window_start <= index:
            window = sentence_copy[window_start:window_end]
            pattern = tuple(window)
            if len(pattern) > 1 and \
               self.pattern_alphabet.has_label(pattern) and \
               len(candidate_seed) > 2:
                pattern_index = self.pattern_alphabet.get_index(pattern)
                # increment our counters
                self.n_counter_sets[pattern_index].add(candidate_seed)
                if candidate_seed not in self.temporary_lexicon:
                    self.f_counter_sets[pattern_index].add(candidate_seed)
            window_start += 1
            window_end += 1

    def find_seeds_tagged(self):
        """Match the known patterns wherever a pattern-initial word occurs."""
        for entry in self.pos_tagged_data:
            for sentence in entry:
                for i in range(len(sentence)):
                    if sentence[i][0] in self.first_pattern_words:
                        self.match_pattern_tagged(sentence, i, 3)
                        self.match_pattern_tagged(sentence, i, 2)

    def match_pattern_tagged(self, sentence, index, size):
        """Try each position of a window as the placeholder of a pattern.

        The window starts one token before ``index``.  A position counts as
        a candidate seed when the resulting pattern is known, its tag starts
        with "NN" and the word is longer than two characters.
        """
        window_start = index-1
        window_end = index+size-1
        window = sentence[window_start:window_end]
        for seed_candidate_index in range(len(window)):
            window_copy = list(window)
            _,pos = window_copy[seed_candidate_index]
            window_copy[seed_candidate_index] = ("<x>", pos)
            pattern = tuple(zip(*window_copy)[0])
            if len(pattern) > 1 and \
               self.pattern_alphabet.has_label(pattern) and \
               window[seed_candidate_index][1].startswith("NN") and \
               len(window[seed_candidate_index][0]) > 2:
                candidate_seed = window[seed_candidate_index][0]
                pattern_index = self.pattern_alphabet.get_index(pattern)
                # increment our counters
                self.n_counter_sets[pattern_index].add(candidate_seed)
                if candidate_seed not in self.temporary_lexicon:
                    self.f_counter_sets[pattern_index].add(candidate_seed)

    def calculate_pattern_scores(self):
        """Score every pattern as (F / N) * log2(F).

        N and F are the sizes of the pattern's counter sets, each
        incremented by one; NaN results are replaced via nan_to_num.
        """
        self.n_pattern_array = numpy.array(map(len, self.n_counter_sets), dtype=float) + 1.
        self.f_pattern_array = numpy.array(map(len, self.f_counter_sets), dtype=float) + 1.
        self.pattern_scores = numpy.nan_to_num((self.f_pattern_array/self.n_pattern_array)*numpy.log2(self.f_pattern_array))

    def calculate_seed_scores(self):
        """Score each temporary-lexicon entry from its matched patterns.

        The score is the sum over matched patterns of
        ``1 + 0.01 * pattern_score``.
        """
        self.candidate_seed_scores = {}
        for candidate_seed,matched_patterns_set in self.temporary_lexicon.iteritems():
            matched_patterns = list(matched_patterns_set)
            score = numpy.sum((self.pattern_scores[matched_patterns] * 0.01) + 1)
            #print score
            self.candidate_seed_scores[candidate_seed] = score

    def cull_candidates(self):
        """Return up to five highest-scoring entries of the temporary lexicon."""
        self.calculate_pattern_scores()
        self.calculate_seed_scores()
        sorted_candidates = sorted([(v,k) for k,v in self.candidate_seed_scores.iteritems()], reverse=True)
        #print sorted_candidates
        try:
            return zip(*sorted_candidates)[1][:5]
        except IndexError:
            # No candidates at all.
            return []

    def run_mutual_bootstrapping(self):
        """Repeatedly select the best new pattern and absorb what it extracts.

        Loops while fewer than 10 patterns were added or the last best score
        exceeds 1.8, and returns early once the best remaining pattern
        scores below 0.7.
        """
        added_patterns = 0
        best_score = 5
        while added_patterns < 10 or best_score > 1.8:
            self.find_patterns()
            self.set_counter_arrays()
            self.find_seeds()
            self.calculate_pattern_scores()
            best_pattern_index = numpy.nanargmax(self.pattern_scores)
            # Skip patterns that were already selected by pushing their
            # score far below every real score.
            while best_pattern_index in self.best_extraction_patterns:
                self.pattern_scores[best_pattern_index] = -10000000.
                best_pattern_index = numpy.nanargmax(self.pattern_scores)
            # NOTE(review): this check is placed after the skip loop; if
            # every pattern is already selected, the loop above would not
            # terminate -- confirm the original nesting.
            if self.pattern_scores[best_pattern_index] < 0.7:
                return
            best_score = self.pattern_scores[best_pattern_index]
            #print best_score, self.pattern_alphabet.get_label(best_pattern_index)
            self.best_extraction_patterns.add(best_pattern_index)
            for seed in self.n_counter_sets[best_pattern_index]:
                self.temporary_lexicon[seed].add(best_pattern_index)
            added_patterns += 1

    def run_meta_bootstrapping(self):
        """Promote the best candidates and reset the temporary lexicon."""
        best_five = self.cull_candidates()
        self.permanent_lexicon.update(best_five)
        self.temporary_lexicon = defaultdict(set)
        for s in self.permanent_lexicon:
            self.temporary_lexicon[s] = set()

    def run(self, num_iterations=50):
        """Run ``num_iterations`` rounds of mutual + meta bootstrapping."""
        for i in range(num_iterations):
            print "Iteration: {:d}".format(i+1)
            print "running mutual bootstrapping..."
            self.run_mutual_bootstrapping()
            print "[DONE]"
            print "running meta bootstrapping...",
            self.run_meta_bootstrapping()
            print "[DONE]"
            print "number of seed terms: {:d}".format(len(self.permanent_lexicon))
            print "number of total patterns: {:d}".format(self.pattern_alphabet.size())
            print "\n"

    def save_seeds(self, outfile):
        """Write the permanent lexicon to ``outfile``, one seed per line."""
        with open(outfile, "w") as f_out:
            f_out.write("\n".join(s.encode("utf-8") for s in self.permanent_lexicon))

    def save_patterns(self, outfile):
        """Write the selected patterns to ``outfile``, one per line."""
        with open(outfile, "w") as f_out:
            patterns = []
            for pattern_index in self.best_extraction_patterns:
                patterns.append(" ".join(self.pattern_alphabet.get_label(pattern_index)))
            f_out.write("\n".join(s.encode("utf-8") for s in patterns))
def _collect_counts(self,obs):
    """Return a new Alphabet containing every observation in ``obs``."""
    collected = Alphabet()
    register = collected.add
    for observation in obs:
        register(observation)
    return collected
class MaxEnt(BaseClassifier):
    """Maximum-entropy (multinomial logistic regression) classifier.

    The parameters live in one flat vector that holds, for every label, an
    intercept followed by one weight per feature.  Training minimises the
    negative penalised log-likelihood with L-BFGS.
    """

    def __init__(self, gaussian_prior_variance = 1):
        """Create an untrained model.

        Args:
            gaussian_prior_variance: divisor of the squared-parameter
                penalty used in the objective and the gradient.
        """
        super(MaxEnt, self).__init__()
        self.label_alphabet = Alphabet()
        self.feature_alphabet = Alphabet()
        self.gaussian_prior_variance = gaussian_prior_variance
        self.parameters = numpy.array([])
        # Cache of the observed counts; filled by compute_observed_counts.
        self.feature_counts = None

    def get_parameter_indices(self, feature_indices, label_index):
        """Map feature indices to positions in the flat parameter vector.

        Args:
            feature_indices: sequence of feature indices (may be empty).
            label_index: index of the label whose block is addressed.

        Returns:
            Integer numpy array holding the position of the label's
            intercept followed by the positions of the given features.
        """
        # dtype=int keeps an empty feature list usable as an index array
        # (numpy would otherwise infer float64 for []).
        indices = numpy.asarray(feature_indices, dtype=int) + 1
        intercept = numpy.array([0])
        # Both arrays are 1-D, so they are joined along axis 0; asking for
        # axis 1, as the code used to, is rejected by current numpy.
        indices = numpy.concatenate((intercept, indices))
        indices = indices + (label_index * (self.feature_alphabet.size() + 1))
        return indices

    def compute_observed_counts(self, instance_list):
        """Compute (once) and cache the observed feature counts.

        On the first call this populates both alphabets, counts for every
        instance its label's intercept and features, and converts each
        unconverted ``instance.data`` into a sorted array of unique feature
        indices.  Later calls return the cached vector unchanged.

        Returns:
            The observed-count vector, shaped like the parameter vector.
        """
        # ``not some_array`` raises ValueError for arrays with more than one
        # element, so the cache is tested against None explicitly.
        if self.feature_counts is not None:
            return self.feature_counts

        # populate alphabets
        for instance in instance_list:
            self.label_alphabet.add(instance.label)
            for datum in instance.data:
                self.feature_alphabet.add(datum)
        self.feature_counts = numpy.zeros(
            (self.feature_alphabet.size() + 1) * self.label_alphabet.size())

        # compute the feature counts
        for instance in instance_list:
            newinds = self.feature_alphabet.get_indices(instance.data)
            sparse_vector = self.get_parameter_indices(
                newinds, self.label_alphabet.get_index(instance.label))
            # Fancy-index += adds 1 once per distinct position, so a feature
            # repeated inside one instance is counted once -- consistent
            # with the de-duplicated instance.data stored below.
            self.feature_counts[sparse_vector] += 1
            if not instance.converted:
                # remove duplicates; mark so the data is not converted twice
                instance.data = numpy.array(sorted(set(newinds)), dtype=int)
                instance.converted = True
        return self.feature_counts

    def compute_label_unnormalized_loglikelihood_vector(self, sparse_feature_vector):
        """Compute the unnormalized log score of every label.

        With a sparse binary feature vector, the dot product of features and
        parameters is the sum of the parameters at the active positions.

        Returns:
            numpy vector with one score per label.
        """
        loglikelihood_score_vector = numpy.zeros(self.label_alphabet.size())
        for index, label in self.label_alphabet:
            positions = self.get_parameter_indices(sparse_feature_vector, index)
            loglikelihood_score_vector[index] = numpy.sum(
                self.parameters[positions])
        return loglikelihood_score_vector

    @staticmethod
    def _log_normalize(scores):
        """Return log-softmax of ``scores`` using the log-sum-exp shift."""
        if scores.size == 0:
            return scores
        peak = numpy.max(scores)
        return scores - (peak + numpy.log(numpy.sum(numpy.exp(scores - peak))))

    def compute_posterior_distribution(self, instance):
        """Compute P(Y|X) for ``instance``.

        Returns:
            numpy vector of the same size as the label alphabet.
        """
        unnorm = self.compute_label_unnormalized_loglikelihood_vector(
            instance.data)
        if DEBUG_2:
            print(unnorm)
        # Normalising in log space avoids overflow in exp() for large scores.
        return numpy.exp(self._log_normalize(unnorm))

    def _argmax(self, func, *args):
        """Return the first argument that maximises ``func``.

        Not needed because numpy's is better.
        """
        res = [func(arg) for arg in args]
        return args[res.index(max(res))]

    def compute_expected_feature_counts(self, instance_list):
        """Compute the feature counts expected under the current parameters.

        Every instance adds its posterior probability of each label to that
        label's intercept and to the instance's active features.

        Returns:
            numpy vector shaped like the parameter vector.
        """
        expected_feature_counts = numpy.zeros(
            (self.feature_alphabet.size() + 1) * self.label_alphabet.size())
        for instance in instance_list:
            post_dist = self.compute_posterior_distribution(instance)
            for jndex, label in self.label_alphabet:
                indices = self.get_parameter_indices(instance.data, jndex)
                expected_feature_counts[indices] += post_dist[jndex]
        return expected_feature_counts

    def classify_instance(self, instance):
        """Return the most probable label for ``instance``.

        An unconverted instance first has its data replaced by feature
        indices (``instance.data`` and ``instance.converted`` are mutated).
        """
        if DEBUG_2:
            print(instance.data)
        if not instance.converted:
            # get_indices eliminates any heretofore unseen features
            instance.data = self.feature_alphabet.get_indices(instance.data)
            instance.converted = True
        posterior = self.compute_posterior_distribution(instance)
        if DEBUG_2:
            print(instance.data)
            print(posterior)
        return self.label_alphabet.get_label(numpy.argmax(posterior))

    def objective_function(self, parameters):
        """Compute negative (log P(Y|X,lambdas) + log P(lambdas)).

        Called by fmin_l_bfgs_b; stores ``parameters`` on the model so that
        the posterior computations use them.

        Returns:
            The value to minimise, as a float.
        """
        self.parameters = parameters
        # penalty term from the Gaussian prior
        total_loglikelihood = -numpy.sum(parameters * parameters) / \
            self.gaussian_prior_variance
        for instance in self.training_data:
            log_posterior = self._log_normalize(
                self.compute_label_unnormalized_loglikelihood_vector(
                    instance.data))
            # Sum LOG probabilities of the gold labels.  The previous code
            # summed raw probabilities, which does not match the gradient
            # (observed - expected counts) handed to the optimiser.
            total_loglikelihood += log_posterior[
                self.label_alphabet.get_index(instance.label)]
        return - total_loglikelihood

    def gradient_function(self, parameters):
        """Compute the gradient of the objective wrt the parameters.

        gradient_i = -(observed_i - expected_i - 2 * lambda_i / variance).
        The observed counts are constant; the expected counts need
        inference under the current parameters.

        Returns:
            numpy vector of the same length as ``parameters``.
        """
        # Make sure inference below uses the point the optimiser asks about.
        self.parameters = parameters
        gradient_vector = self.feature_counts - \
            self.compute_expected_feature_counts(self.training_data) - \
            2 * (parameters) / self.gaussian_prior_variance
        if DEBUG_1:
            print(gradient_vector)
        return - gradient_vector

    def train(self, instance_list):
        """Find the optimal parameters for the maximum entropy classifier.

        Arg:
            instance_list: each instance.data should be a string feature
                vector; instances are converted to index vectors in place.
        """
        self.training_data = instance_list
        self.compute_observed_counts(instance_list)
        num_labels = self.label_alphabet.size()
        num_features = self.feature_alphabet.size()
        init_point = numpy.zeros(num_labels * (num_features + 1))
        optimal_parameters, _, _ = fmin_l_bfgs_b(self.objective_function,
                                                 init_point,
                                                 fprime=self.gradient_function)
        self.parameters = optimal_parameters

    def to_dict(self):
        """Convert the model into a dictionary so that save() will work."""
        res = {}
        res['labalph'] = self.label_alphabet.to_dict()
        res['feaalph'] = self.feature_alphabet.to_dict()
        res['gpv'] = self.gaussian_prior_variance
        res['param'] = self.parameters
        return res

    @classmethod
    def from_dict(cls, model_dictionary):
        """Return a model rebuilt from the dictionary created by to_dict."""
        res = cls()
        res.label_alphabet = Alphabet.from_dict(model_dictionary['labalph'])
        res.feature_alphabet = Alphabet.from_dict(model_dictionary['feaalph'])
        res.gaussian_prior_variance = model_dictionary['gpv']
        # The scoring code indexes the parameters with index arrays, which
        # needs a numpy array even if the stored value came back as a list.
        res.parameters = numpy.asarray(model_dictionary['param'])
        return res
class NaiveBayes(BaseClassifier):
    """Bernoulli Naive Bayes classifier over binary feature presence.

    Labels and features are mapped to integer indices through two
    ``Alphabet`` codebooks.  After ``train`` the model holds

    * ``count_x_y_table[f, y]`` -- number of training instances with label
      ``y`` that contain feature ``f``
    * ``count_y_table[y]`` -- number of training instances with label ``y``
    * ``p_x_given_y_table[f, y]`` -- estimate of P(X_f = 1 | Y = y)
    * ``p_y_table[y]`` -- estimate of P(Y = y)
    """

    def __init__(self):
        """Create empty label and feature codebooks."""
        self.label_codebook = Alphabet()
        self.feature_codebook = Alphabet()

    def _collect_counts(self, instance_list):
        """Index all labels/features, then fill the two count tables.

        Args:
            instance_list: sequence of instances exposing ``label``, an
                iterable ``data`` of features, and ``feature in instance``
                membership tests.

        For example, ``self.count_x_y_table[12, 0]`` is the count of feature
        12 co-occurring with label 0 and ``self.count_y_table[1]`` is the
        count of label 1.  ``self.feature_codebook.get_label(12)`` tells
        which feature index 12 stands for.
        """
        # Index in encounter order (rather than via set()) so that the
        # label -> index assignment is reproducible between runs.
        for instance in instance_list:
            self.label_codebook.add(instance.label)
            for feat in instance.data:
                self.feature_codebook.add(feat)

        num_features = self.feature_codebook.size()
        num_labels = self.label_codebook.size()
        self.count_x_y_table = numpy.zeros((num_features, num_labels))
        self.count_y_table = numpy.zeros(num_labels)

        total = len(instance_list)
        for i, instance in enumerate(instance_list):
            print("Training on instance %d of %d." % (i, total))
            label = self.label_codebook.get_index(instance.label)
            self.count_y_table[label] += 1
            # Iterating the codebook yields (index, feature) pairs.
            for index, feature in self.feature_codebook:
                self.count_x_y_table[index, label] += int(feature in instance)

        if DEBUG:
            print(' '.join(str(label) for _, label in self.label_codebook))
            for i, row in enumerate(self.count_x_y_table):
                # One count per label, however many labels there are.
                print('%s: %s' % (self.feature_codebook.get_label(i),
                                  ', '.join('%d' % count for count in row)))

    def train(self, instance_list, smoothmode='laplace'):
        """Fit ``p_x_given_y_table`` and ``p_y_table`` from the dataset.

        Args:
            instance_list: training instances (see ``_collect_counts``).
            smoothmode: ``'add-one'`` smooths the count tables before the
                probabilities are estimated; ``'laplace'`` redistributes
                mass inside the probability table afterwards (see
                ``smooth_table``).  Any other value disables smoothing.

        For example, ``self.p_x_given_y_table[10, 1]`` = P(X10 = 1 | Y = 1)
        and ``self.p_y_table[1]`` = P(Y = 1).
        """
        self._collect_counts(instance_list)
        if smoothmode == 'add-one':
            self.smooth_table(smoothmode)

        # Broadcasting divides every column y by count_y_table[y].
        self.p_x_given_y_table = self.count_x_y_table / self.count_y_table
        self.p_y_table = self.count_y_table / self.count_y_table.sum()

        if smoothmode == 'laplace':
            self.smooth_table(smoothmode)

    def smooth_table(self, mode):
        """Apply one of the smoothing schemes in place.

        Args:
            mode: ``'laplace'`` -- for every label column of
                ``p_x_given_y_table`` find the smallest non-zero value ``m``
                and overwrite every entry equal to ``0`` or ``m`` with
                ``m / k``, where ``k`` is the number of such entries.
                ``'add-one'`` -- add one to every entry of both count
                tables.  Other values are ignored.
        """
        if mode == 'laplace':
            table = self.p_x_given_y_table
            for col in range(table.shape[1]):
                column = table[:, col]  # a view: writes go to the table
                nonzero = column[column != 0]
                if nonzero.size == 0:
                    # Nothing to redistribute in an all-zero column.
                    continue
                floor = nonzero.min()
                mask = (column == 0) | (column == floor)
                column[mask] = floor / mask.sum()
        elif mode == 'add-one':
            # NOTE(review): this can yield P(x|y) == 1 for a feature that
            # occurs in every instance of a label; scoring clamps 1 - p.
            self.count_x_y_table += 1
            self.count_y_table += 1

    def compute_log_unnormalized_score(self, instance):
        """Compute log P(X|Y) + log P(Y) for all values of Y.

        Every feature in the codebook contributes ``log P(X_f=1|Y)`` when it
        is present in the instance and ``log (1 - P(X_f=1|Y))`` otherwise.
        Probabilities are clamped to machine epsilon before the logarithm so
        that a probability of exactly 0 or 1 cannot produce ``-inf``.

        Returns:
            numpy vector indexed consistently with ``label_codebook``, e.g.
            ``vector[0] = log P(X|Y=0) + log P(Y=0)``.
        """
        table = self.p_x_given_y_table
        present = numpy.zeros(table.shape[0], dtype=bool)
        for index, feature in self.feature_codebook:
            present[index] = feature in instance

        eps = numpy.finfo(float).eps
        log_present = numpy.log(numpy.maximum(table, eps))
        log_absent = numpy.log(numpy.maximum(1.0 - table, eps))
        return (numpy.log(self.p_y_table)
                + log_present[present].sum(axis=0)
                + log_absent[~present].sum(axis=0))

    def classify_instance(self, instance):
        """Return the label with the highest unnormalized log score."""
        scores = self.compute_log_unnormalized_score(instance)
        if DEBUG:
            for index, label in self.label_codebook:
                print('%s %s' % (label, scores[index]))
        # argmax returns the first maximum, i.e. the lowest label index wins
        # a tie.
        return self.label_codebook.get_label(int(numpy.argmax(scores)))

    def to_dict(self):
        """Convert the model into a dictionary representation.

        The keys written here are exactly the keys ``from_dict`` reads, so
        the two functions convert between the object and dictionary forms.
        """
        model_dict = {
            'label_alphabet': self.label_codebook.to_dict(),
            'feature_alphabet': self.feature_codebook.to_dict(),
            '#x&y': self.count_x_y_table,
            '#y': self.count_y_table,
            'p_x|y_table': self.p_x_given_y_table,
            'p_y_table': self.p_y_table,
        }
        return model_dict

    @classmethod
    def from_dict(cls, model_dict):
        """Rebuild a model from a dictionary produced by ``to_dict``."""
        res = cls()
        res.label_codebook = Alphabet.from_dict(model_dict['label_alphabet'])
        res.feature_codebook = Alphabet.from_dict(model_dict['feature_alphabet'])
        res.count_x_y_table = model_dict['#x&y']
        res.count_y_table = model_dict['#y']
        res.p_x_given_y_table = model_dict['p_x|y_table']
        res.p_y_table = model_dict['p_y_table']
        return res
class HMM(BaseClassifier):
    """Hidden Markov model whose time steps carry two observed features.

    Conventions used by every method of this class:

    * ``instance.label`` is a sequence of label indices and
      ``instance.data[t]`` holds two feature indices for time step ``t``.
    * ``transition_matrix[current, previous]`` -- each column is the
      distribution over the next state given the previous one.
    * ``emission_matrix[feature, state]`` -- each column is a distribution
      over features given the state.
    * After ``train`` the attributes ``initial_state_count_table`` and
      ``termination_state_count_table`` hold smoothed probabilities, not
      raw counts (the names are kept for compatibility).
    """

    def __init__(self):
        """Create empty codebooks and the default feature-selection flags."""
        self.label_codebook = Alphabet()
        self.feature_codebook = Alphabet()
        # these two flags are for feature selection: flag 1 scores a time
        # step with its second feature only, flag 2 with its first feature
        # only; with both off the two emission probabilities are multiplied.
        self.filter_feature1 = True
        self.filter_feature2 = False

    def _collect_counts(self, instance_list):
        """Accumulate the counts needed for fitting the parameters.

        Updates ``feature_count_table``, ``initial_state_count_table``,
        ``termination_state_count_table`` and ``transition_count_table``,
        all of which must already be allocated (``train`` does that).

        Every position after the first contributes one transition
        ``[current][previous]``; the first position contributes an initial
        state and the last one a termination state.

        Returns None
        """
        for instance in instance_list:
            labels = instance.label
            last = len(labels) - 1
            for i, label in enumerate(labels):
                # Both features of the time step count as emitted by label.
                self.feature_count_table[instance.data[i][0]][label] += 1
                self.feature_count_table[instance.data[i][1]][label] += 1
                if i == 0:
                    self.initial_state_count_table[label] += 1
                else:
                    # [current][previous]: easy for matrix multiplication
                    self.transition_count_table[label][labels[i - 1]] += 1
                if i == last:
                    self.termination_state_count_table[label] += 1

    def train(self, instance_list):
        """Fit the model with add-one smoothed relative frequencies.

        The codebooks must already be populated; their sizes determine the
        table shapes (m states, q features).

        Returns None
        """
        num_labels = self.label_codebook.size()
        num_features = self.feature_codebook.size()

        # m*m, indexed [current][previous]
        self.transition_count_table = numpy.zeros((num_labels, num_labels))
        # q*m, indexed [feature][state]
        self.feature_count_table = numpy.zeros((num_features, num_labels))
        self.initial_state_count_table = numpy.zeros(num_labels)
        self.termination_state_count_table = numpy.zeros(num_labels)

        self._collect_counts(instance_list)

        # Add-one smoothing: one pseudo count per outcome, so the
        # denominator grows by the number of outcomes.
        self.initial_state_count_table = (
            (self.initial_state_count_table + 1)
            / (numpy.sum(self.initial_state_count_table) + num_labels))
        self.termination_state_count_table = (
            (self.termination_state_count_table + 1)
            / (numpy.sum(self.termination_state_count_table) + num_labels))

        # Column sums of the smoothed counts already include the pseudo
        # counts, so every column below sums to exactly one.
        smoothed_transitions = self.transition_count_table + 1
        self.transition_matrix = (smoothed_transitions
                                  / numpy.sum(smoothed_transitions, 0))
        smoothed_emissions = self.feature_count_table + 1
        self.emission_matrix = (smoothed_emissions
                                / numpy.sum(smoothed_emissions, 0))

    def _emission_scores(self, observation):
        """Return the per-state emission score of one time step.

        ``observation`` is the feature-index pair of that time step; which
        of the two features is used depends on the filter flags.
        """
        if self.filter_feature1:
            return self.emission_matrix[observation[1]]
        if self.filter_feature2:
            return self.emission_matrix[observation[0]]
        return (self.emission_matrix[observation[0]]
                * self.emission_matrix[observation[1]])

    def classify_instance(self, instance):
        """Viterbi decoding.

        Runs the Viterbi pass and follows the backtrace pointers from the
        best final state back to the first time step.

        Returns an integer numpy array of label indices,
        e.g. [0, 1, 0, 3, 4]
        """
        # NOTE(review): the sequence length is taken from instance.label,
        # so the instance needs a label sequence of the right length even
        # at prediction time -- confirm against callers.
        instance_size = len(instance.label)
        best_sequence = numpy.zeros(instance_size, dtype=int)
        if instance_size == 0:
            return best_sequence

        trellis, backtrace_pointers = \
            self.dynamic_programming_on_trellis(instance, False)
        best_sequence[-1] = numpy.argmax(trellis[:, -1])
        # Walk back to and including time step 0.
        for t in range(instance_size - 2, -1, -1):
            best_sequence[t] = backtrace_pointers[best_sequence[t + 1], t + 1]
        return best_sequence

    def compute_observation_loglikelihood(self, instance):
        """Return log10 P(X|parameters), the loglikelihood of observations."""
        trellis, _ = self.dynamic_programming_on_trellis(instance, True)
        loglikelihood = numpy.log10(numpy.sum(trellis[:, -1]))
        return loglikelihood

    def dynamic_programming_on_trellis(self, instance, run_forward_alg=True):
        """Run the Forward algorithm or the Viterbi algorithm.

        Args:
            instance: sequence to process.
            run_forward_alg: True sums over previous states (Forward),
                False maximizes over them and records the argmax (Viterbi).

        Returns:
            ``(trellis, backtrace_pointers)``, both of shape
            (number of states, sequence length).  ``trellis[j, t]`` is the
            forward/Viterbi score of state ``j`` at time ``t``; the
            backtrace pointers stay zero when the Forward algorithm is run.

        Scores are plain probabilities (not logs).
        """
        instance_size = len(instance.label)
        label_size = self.label_codebook.size()
        trellis = numpy.zeros((label_size, instance_size))
        backtrace_pointers = numpy.zeros((label_size, instance_size),
                                         dtype=int)
        if instance_size == 0:
            return (trellis, backtrace_pointers)

        # Initialization: first column.
        trellis[:, 0] = (self.initial_state_count_table
                         * self._emission_scores(instance.data[0]))

        # Recursion, left to right.
        for t in range(1, instance_size):
            if run_forward_alg:
                trellis[:, t] = self.transition_matrix.dot(trellis[:, t - 1])
            else:
                # candidates[j, i]: best path ending in i, then moving to j.
                candidates = self.transition_matrix * trellis[:, t - 1]
                trellis[:, t] = candidates.max(axis=1)
                backtrace_pointers[:, t] = candidates.argmax(axis=1)
            # Same feature selection as in the initialization step.
            trellis[:, t] *= self._emission_scores(instance.data[t])

        return (trellis, backtrace_pointers)

    def train_semisupervised(self, unlabeled_instance_list,
                             labeled_instance_list=None):
        """Baum-Welch skeleton for fitting from unlabeled data (EXTRA CREDIT).

        The model is first fitted on the labeled data when given.  The
        E- and M-steps are still TODO: the loop only calls the (stub)
        forward-backward pass and stops as soon as ``_has_converged``
        says so.

        Returns None
        """
        if labeled_instance_list is not None:
            self.train(labeled_instance_list)
        else:
            #TODO: initialize the model randomly
            pass

        old_likelihood = None
        while True:
            likelihood = 0.0
            #E-Step
            self.expected_transition_counts = numpy.zeros((1, 1))
            self.expected_feature_counts = numpy.zeros((1, 1))
            for instance in unlabeled_instance_list:
                (alpha_table, beta_table) = self._run_forward_backward(instance)
                #TODO: update the expected count tables based on alphas and betas
                #also combine the expected count with the observed counts from the labeled data
            #M-Step
            #TODO: reestimate the parameters
            if self._has_converged(old_likelihood, likelihood):
                break
            old_likelihood = likelihood

    def _has_converged(self, old_likelihood, likelihood):
        """Determine whether the parameters have converged (EXTRA CREDIT).

        Stub: always returns True.
        """
        return True

    def _run_forward_backward(self, instance):
        """Forward-backward algorithm for HMM using trellis (EXTRA CREDIT).

        Stub: returns a tuple of two 1x1 zero tables (alpha, beta).
        """
        alpha_table = numpy.zeros((1, 1))
        beta_table = numpy.zeros((1, 1))
        #TODO: implement forward backward algorithm right here
        return (alpha_table, beta_table)

    def to_dict(self):
        """Convert the HMM into a dictionary representation.

        Only the two codebooks are stored; ``from_dict`` reads the same
        keys.
        """
        model_dict = {
            'label_alphabet': self.label_codebook.to_dict(),
            'feature_alphabet': self.feature_codebook.to_dict()
        }
        return model_dict

    def test_hmm(self):
        """Train on the PreProcessor test data and print the tables."""
        pp = PreProcessor()
        pp.test_preprocess()
        instance_list = pp.get_instance_list()
        self.label_codebook = pp.get_label_codebook()
        self.feature_codebook = pp.get_feature_codebook()
        self.train(instance_list)
        print("\ntransition_count_table--------------------")
        print(self.transition_count_table)
        print("\ntransition_matrix-------------------------")
        print(self.transition_matrix)
        print("\ninitial_state_count_table------------------")
        print(self.initial_state_count_table)
        print("\ntermination_state_count_table------------------")
        print(self.termination_state_count_table)
        print("\nemission matrix----------------------------")
        print(self.emission_matrix)
        # Slicing keeps this safe when fewer than ten instances exist.
        for instance in instance_list[:10]:
            self.test_classify_instance(instance)

    def test_forward(self, instance):
        """Print the forward trellis of one instance."""
        print("run forward algorithm, print trellis----------")
        print("instance used for test: ")
        print(instance.label)
        trellis, _ = self.dynamic_programming_on_trellis(instance, True)
        print(trellis)
        print("------forward done---------------------------")

    def test_viterbi(self, instance):
        """Print the Viterbi trellis and backtrace pointers of one instance."""
        print("run vertibi algorithm, print trellis----------")
        print("instance used for test: ")
        print(instance.label)
        trellis, backtrace_pointers = \
            self.dynamic_programming_on_trellis(instance, False)
        print(trellis)
        print(backtrace_pointers)
        print("------vertibi done---------------------------")

    def test_classify_instance(self, instance):
        """Decode one instance and print the per-token accuracy."""
        print("test classify instance ----------------------")
        print("instance used for test: ")
        print(instance.label)
        best_sequence = self.classify_instance(instance)
        print("best sequence:")
        print(best_sequence)
        total = len(instance.label)
        counter = 0
        for predicted, gold in zip(best_sequence, instance.label):
            if predicted == gold:
                counter += 1
        # An empty sequence has no tokens to score; report 0.0 instead of
        # dividing by zero.
        accuracy = float(counter) / float(total) if total else 0.0
        print("single run accurac: " + str(accuracy))
        print("classify instance done ----------------------------")

    @classmethod
    def from_dict(cls, model_dict):
        """Convert a dictionary into an HMM instance.

        Restores the codebooks written by ``to_dict`` when their keys are
        present; the probability tables are not part of the dictionary and
        have to be re-estimated with ``train``.
        """
        model = cls()
        if 'label_alphabet' in model_dict:
            model.label_codebook = Alphabet.from_dict(
                model_dict['label_alphabet'])
        if 'feature_alphabet' in model_dict:
            model.feature_codebook = Alphabet.from_dict(
                model_dict['feature_alphabet'])
        return model
class Naive_Bayes(object):
    """Bernoulli Naive Bayes text classifier over binary word features.

    The feature function supplied at construction builds the training
    vectors and fills the label and feature codebooks; ``classify`` then
    maps a raw string to the most probable label.
    """

    def __init__(self, data, feature_function):
        """
        Takes a dictionary mapping labels to lists of strings with that label,
        and a function which produces a list of feature values from a string.

        ``feature_function`` is called by ``train`` as
        ``feature_function(data, label_codebook, feature_codebook, theta)``
        and must return a dictionary mapping each label to a list of
        feature vectors.
        """
        self.data = data
        self.feature_codebook = Alphabet()
        self.label_codebook = Alphabet()
        self.feature_function = feature_function

    def extract_feature(self, string):
        """Return the binary feature vector of ``string``.

        The vector has one slot per entry of the feature codebook; a slot
        is 1.0 when the corresponding word occurs in the string.  Words
        unknown to the codebook are ignored.
        """
        vector = np.zeros(self.feature_codebook.size())
        for word in set(nltk.regexp_tokenize(string, pattern=r"\w+")):
            if self.feature_codebook.has_label(word):
                vector[self.feature_codebook.get_index(word)] = 1.0
        return vector

    def _collect_counts(self):
        """Fill ``count_table`` (features x labels) and ``count_y_table``.

        ``count_table[:, y]`` is the sum of the feature vectors of all
        documents with label ``y``; ``count_y_table[y]`` is the number of
        such documents.
        """
        self.count_table = np.zeros((self.feature_codebook.size(),
                                     self.label_codebook.size()))
        self.count_y_table = np.zeros(self.label_codebook.size())
        for label, docs in self.instance_list.items():
            Y_index = self.label_codebook.get_index(label)
            for vector in docs:
                self.count_y_table[Y_index] += 1.0
                self.count_table[:, Y_index] += vector

    def train(self, theta, alpha=0.2):
        """Build the training vectors and estimate the probability tables.

        Args:
            theta: passed through unchanged to the feature function.
            alpha: additive smoothing constant; the default reproduces the
                previously hard-coded value.
        """
        self.instance_list = self.feature_function(
            self.data, self.label_codebook, self.feature_codebook, theta)
        self._collect_counts()
        # NOTE(review): the denominator adds alpha once per feature in the
        # codebook; a Bernoulli model would usually add 2 * alpha -- kept
        # as is to preserve existing model outputs.
        self.p_x_given_y_table = (
            (self.count_table + alpha)
            / (self.count_y_table + self.feature_codebook.size() * alpha))
        self.p_y_table = self.count_y_table / self.count_y_table.sum()

    def compute_log_unnormalized_score(self, feature_vector):
        """Compute log P(X|Y) + log P(Y) for all values of Y

        Each feature contributes ``x * log p + (1 - x) * log (1 - p)``
        where ``p = P(X_f = 1 | Y)``.  A probability of exactly 0 or 1
        yields ``-inf`` for the affected label.

        Returns a vector of loglikelihood.
        loglikelihood_vector[0] = log P(X|Y=0) + log P(Y=0)
        """
        features = np.asarray(feature_vector, dtype=float)
        # log(0) is a legitimate -inf here, not worth a warning.
        with np.errstate(divide='ignore'):
            log_prior = np.log(self.p_y_table)
            log_on = np.log(self.p_x_given_y_table)
            log_off = np.log(1.0 - self.p_x_given_y_table)
        return log_prior + features.dot(log_on) + (1.0 - features).dot(log_off)

    def classify(self, string):
        """
        Classifies a string according to the feature function and training
        data provided at initialization.

        Returns the predicted label for the input document.
        """
        feature_vector = self.extract_feature(string)
        logvector = self.compute_log_unnormalized_score(feature_vector)
        pre_label_index = np.argmax(logvector)
        return self.label_codebook.get_label(pre_label_index)
class MaxEnt(BaseClassifier):
    """Maximum entropy (multinomial logistic regression) classifier.

    The weights live in one flat vector ``parameters``.  Each label owns a
    contiguous block of ``feature_codebook.size() + 1`` entries: the first
    entry of the block is the intercept of that label, entry ``1 + f`` is
    the weight of feature index ``f``.  Instance labels are used directly
    as block numbers, i.e. ``instance.label`` must be an integer index.
    """

    def __init__(self):
        """Initialize the model

        label_codebook, feature_codebook, parameters must be
        assigned properly in order for the model to work.

        parameters and codebooks will be handled in the train function
        """
        super(MaxEnt, self).__init__()
        self.label_codebook = Alphabet()
        self.feature_codebook = Alphabet()
        # An empty array (rather than a list) keeps to_dict usable before
        # training.
        self.parameters = numpy.zeros(0)
        self.gaussian_prior_variance = 1.0

    @staticmethod
    def _is_index_vector(data):
        """Return True when ``data`` is an integer numpy array.

        ``train`` replaces ``instance.data`` by such arrays, so this
        identifies instances that were already converted.
        """
        return isinstance(data, numpy.ndarray) and data.dtype.kind in 'iu'

    def compute_observed_counts(self, instance_list):
        """Compute observed feature counts

        It should only be done once because it's parameter-independent.
        The result is stored in ``self.feature_counts`` using the same
        layout as ``parameters``; the intercept slot of a label counts the
        instances of that label.

        ``instance.data`` must be an integer index array and
        ``instance.label`` an integer label index.
        """
        block = self.feature_codebook.size() + 1
        self.feature_counts = numpy.zeros(block * self.label_codebook.size())
        for instance in instance_list:
            Y_index = block * instance.label
            self.feature_counts[Y_index] += 1
            # add.at accumulates repeated indices, so a feature occurring
            # k times counts k times -- exactly as it does in the score.
            numpy.add.at(self.feature_counts, Y_index + instance.data + 1, 1)

    def compute_expected_feature_counts(self, instance_list):
        """Compute expected feature counts

        E(feature|X) = sum over i,y E(feature(Xi,yi)|Xi)
                     = sum over i,y feature(Xi,yi) P(Y=yi|Xi)

        The posterior of every instance is obtained from the current
        ``parameters``.

        Returns:
            vector of expected counts laid out like ``parameters``
        """
        expected_feature_counts = numpy.zeros(len(self.parameters))
        block = self.feature_codebook.size() + 1
        for instance in instance_list:
            scores = self.compute_label_unnormalized_loglikelihood_vector(
                instance.data)
            posterior = numpy.exp(scores - logsumexp(scores))
            for label in range(0, self.label_codebook.size()):
                Y_index = label * block
                expected_feature_counts[Y_index] += posterior[label]
                # Repeated features accumulate, matching observed counts.
                numpy.add.at(expected_feature_counts,
                             Y_index + instance.data + 1, posterior[label])
        return expected_feature_counts

    def classify_instance(self, instance):
        """Applying the model to a new instance

        Returns:
            index of the label with the maximum probability
        """
        vector = self.compute_posterior_distribution(instance)
        pre_label_index = numpy.argmax(vector)
        return pre_label_index

    def compute_posterior_distribution(self, instance):
        """Compute P(Y|X)

        Features missing from the feature codebook are ignored.  Data that
        is already an integer index array (as left behind by ``train``) is
        used as is.

        Returns:
            vector of the same size as the label_codebook holding the
            normalized posterior; normalizing in log space avoids overflow
            in ``exp`` and does not change the argmax.
        """
        if self._is_index_vector(instance.data):
            sparse_vector = instance.data
        else:
            sparse_vector = numpy.array(
                [self.feature_codebook.get_index(i) for i in instance.data
                 if self.feature_codebook.has_label(i)], dtype=int)
        scores = self.compute_label_unnormalized_loglikelihood_vector(
            sparse_vector)
        if scores.size == 0:
            return scores
        return numpy.exp(scores - logsumexp(scores))

    def compute_label_unnormalized_loglikelihood_vector(self, sparse_feature_vector):
        """Compute unnormalized log score from log-linear model

        log P(Y|X) is proportional to feature vector * parameter vector.
        With the sparse representation the score of a label is its
        intercept plus the sum of the weights of the listed feature
        indices (a repeated index is added repeatedly).

        Returns:
            a vector of scores, one per label
        """
        block = self.feature_codebook.size() + 1
        parameters = numpy.asarray(self.parameters)
        sparse = numpy.asarray(sparse_feature_vector, dtype=int)
        # Position of every label's intercept inside the flat vector.
        offsets = block * numpy.arange(self.label_codebook.size())
        loglikelihood_score_vector = numpy.array(parameters[offsets],
                                                 dtype=float)
        if sparse.size != 0:
            loglikelihood_score_vector += \
                parameters[offsets[:, None] + sparse + 1].sum(axis=1)
        return loglikelihood_score_vector

    def objective_function(self, parameters):
        """Compute negative (log P(Y|X,lambdas) + log P(lambdas))

        The function that we want to optimize over.  The prior term is
        sum(lambda^2) / (2 * gaussian_prior_variance^2).

        Args:
            parameters updated by the training procedure

        Returns:
            negative total likelihood
        """
        self.parameters = numpy.array(parameters)
        prior = (numpy.sum(self.parameters ** 2)
                 / (2 * self.gaussian_prior_variance ** 2))

        numerator = 0.0
        denominator = 0.0
        for instance in self.training_data:
            score_vector = self.compute_label_unnormalized_loglikelihood_vector(
                instance.data)
            # The score of the gold label is the numerator of its
            # log-probability; the log partition function the denominator.
            numerator += score_vector[instance.label]
            denominator += logsumexp(score_vector)

        total_loglikelihood = numerator - denominator - prior
        print(-total_loglikelihood)
        return -total_loglikelihood

    def gradient_function(self, parameters):
        """Compute gradient of negative (log P(Y|X,lambdas) + log P(lambdas))
        wrt lambdas

        gradient wrt lambda i = observed_count of feature i
                                - expected_count of feature i
                                - lambda i / gaussian_prior_variance^2

        The first term is computed before running the optimization function
        and is a constant.  The second term needs inference to get
        P(Y|X, lambdas) and is a bit expensive.  The third term is from
        taking the derivative of log gaussian prior.

        Returns:
            a vector of gradient (negated, matching the objective)
        """
        self.parameters = numpy.array(parameters)
        observed_count_vector = self.feature_counts
        expected_count_vector = self.compute_expected_feature_counts(
            self.training_data)
        dprior = self.parameters / self.gaussian_prior_variance ** 2
        gradient_vector = (observed_count_vector - expected_count_vector
                           - dprior)
        return -gradient_vector

    def train(self, instance_list):
        """Find the optimal parameters for maximum entropy classifier

        Populates the codebooks, replaces every ``instance.data`` IN PLACE
        by an integer array of feature indices, computes the observed
        counts and hands objective and gradient to ``fmin_l_bfgs_b``.

        Arg:
            instance_list: each instance.data should be a string feature
                vector (instances already converted by an earlier call are
                left untouched); each instance.label an integer label
                index.

        The fitted weights are stored in ``self.parameters``; nothing is
        returned.
        """
        assert(len(instance_list) > 0)

        # NOTE(review): the label set is fixed to these two entries, so
        # instance.label is expected to be 0 or 1 -- confirm with the data
        # preparation code.
        self.label_codebook.add('neg')
        self.label_codebook.add('pos')

        for instance in instance_list:
            if self._is_index_vector(instance.data):
                continue
            indices = []
            for feature in instance.data:
                if not self.feature_codebook.has_label(feature):
                    self.feature_codebook.add(feature)
                indices.append(self.feature_codebook.get_index(feature))
            instance.data = numpy.array(indices, dtype=int)

        num_labels = self.label_codebook.size()
        num_features = self.feature_codebook.size()
        self.parameters = numpy.zeros(num_labels * (num_features + 1))
        self.training_data = instance_list
        self.compute_observed_counts(instance_list)

        init_point = numpy.zeros(num_labels * (num_features + 1))
        optimal_parameters, _, _ = fmin_l_bfgs_b(
            self.objective_function, init_point,
            fprime=self.gradient_function)
        print(optimal_parameters)
        self.parameters = optimal_parameters

    def to_dict(self):
        """Convert the model into a dictionary of plain Python objects."""
        model_dict = {
            'label_alphabet': self.label_codebook.to_dict(),
            'feature_alphabet': self.feature_codebook.to_dict(),
            'parameters': numpy.asarray(self.parameters).tolist(),
        }
        return model_dict

    @classmethod
    def from_dict(cls, model_dictionary):
        """Return a MaxEnt instance rebuilt from the output of ``to_dict``."""
        model_instance = cls()
        model_instance.label_codebook = Alphabet.from_dict(
            model_dictionary['label_alphabet'])
        model_instance.feature_codebook = Alphabet.from_dict(
            model_dictionary['feature_alphabet'])
        model_instance.parameters = numpy.array(
            model_dictionary['parameters'])
        return model_instance
class HMM(BaseClassifier): def __init__(self): self.label_alphabet = Alphabet() self.feature_alphabet = Alphabet() self.transition_matrix = None self.emission_matrix = None self.initial_probability = None @property def num_states(self): return self.label_alphabet.size() @property def num_observations(self): return self.feature_alphabet.size() def _mutate_data(self, instance): try: _ = instance.old_data except: instance.old_data = instance.data instance.data = self.feature_alphabet.get_indices(instance.data) def _mutate_label(self, instance): try: _ = instance.old_label except: instance.old_label = instance.label instance.label = self.label_alphabet.get_indices(instance.label) def populate_alphabets(self, instance_list): """Populate alphabets You guys have done this twice already. So I'm doing it for you this time. But a few things to note the labels get converted to label indices the feature vectors get converted to sparse vector each time step contains exactly one feature (observation) Feel free to edit/modify/tear apart this function """ for instance in instance_list: for label in instance.label: self.label_alphabet.add(label) for observation in instance.data: self.feature_alphabet.add(observation) self._mutate_data(instance) self._mutate_label(instance) #for test cases and unsupervised training self.transition_matrix = numpy.zeros((self.num_states, self.num_states)) self.emission_matrix = numpy.zeros((self.num_states, self.num_observations)) self.initial_probability = numpy.zeros(self.num_states) def collect_counts(self, instance_list): """Collect counts for fitting HMM parameters Very similar to Naive Bayes, we have to collect counts for estimating parameters: transition_counts[i,j] = the number of occurrences that state i comes before state j observation_counts[i,j] = the number of occurrences that state i is aligned with observation j initial_state_counts[i] = the number of occurrences that state i is at the beginning of the sequence Add your implementation 
""" transition_counts = numpy.zeros((self.num_states, self.num_states)) initial_state_counts = numpy.zeros(self.num_states) observation_counts = numpy.zeros((self.num_states, self.num_observations)) for instance in instance_list: trans = zip(instance.label[:-1], instance.label[1:]) transition_counts[instance.label[:-1], instance.label[1:]] += \ map(trans.count, trans) #quirky workaround; commented code above doesn't work obs = zip(instance.label, instance.data) observation_counts[instance.label, instance.data] += \ map(obs.count, obs) initial_state_counts[instance.label[0]] += 1 #increment initial state return (transition_counts, initial_state_counts, observation_counts) def train(self, instance_list): """Train the HMM Collect counts and find the best parameters for transition matrix, emission matrix, and initial probability DO NOT smooth the counts Add your implementation """ self.populate_alphabets(instance_list) transition_counts, initial_state_counts, observation_counts = self.collect_counts(instance_list) #fill in these matrices #availing of columnar summation of numpy arrays self.transition_matrix = transition_counts / numpy.sum(transition_counts, 1) self.emission_matrix = (observation_counts.T / (numpy.sum(transition_counts, 0) + \ initial_state_counts)).T #p(Y1|X0) + p(Y1|X1) + ... 
= p(Y1) self.initial_probability = initial_state_counts / sum(initial_state_counts) #p(X|Start) def forward_algorithm(self, instance): """Run forward algorithm Add your implementation """ sequence_length = len(instance.data) alpha = numpy.zeros((self.num_states, sequence_length)) #initialization alpha[:, 0] = self.initial_probability * self.emission_matrix[:,instance.data[0]] #recursion for t in range(1, sequence_length): alpha[:, t] = numpy.sum(alpha[:, t-1] * self.transition_matrix.T * \ self.emission_matrix[:, instance.data[t]], 1) return alpha def backward_algorithm(self, instance): """Run backward algorithm Add your implementation """ sequence_length = len(instance.data) beta = numpy.zeros((self.num_states, sequence_length)) #initialization beta[:, -1] += 1 #recursion for t in reversed(xrange(sequence_length - 1)): beta[:, t] = numpy.sum(self.transition_matrix * \ self.emission_matrix[:, instance.data[t + 1]] * \ beta[:, t + 1], 1) return beta def compute_likelihood(self, alpha): """Compute likelihood P(O1:T) given forward values This function is necessary for computing expected counts. It should assume that alpha (forward) values are computed correctly. 
This function should be just one line Add your implementation """ #return sum(alpha)[-1] #return sum(alpha[:, -1]) return numpy.sum(alpha, 0)[-1] def compute_expected_counts(self, instance): """E-step for EM Algorithm for learning HMM parameters This function is fully implemented for you """ alpha = self.forward_algorithm(instance) beta = self.backward_algorithm(instance) sequence_length = len(instance.data) likelihood = self.compute_likelihood(alpha) gamma = alpha * beta / likelihood expected_observation_counts = numpy.zeros((self.num_states, self.num_observations)) for t in xrange(sequence_length): feature_index = instance.data[t] expected_observation_counts[:, feature_index] += gamma[:, t] expected_transition_counts = numpy.zeros((self.num_states, self.num_states)) for t in xrange(sequence_length-1): feature_index = instance.data[t+1] obs = self.emission_matrix[:, feature_index] m1 = numpy.matrix(alpha[:, t]) m2 = numpy.matrix(beta[:, t+1] * obs) xi = numpy.multiply(m1.transpose().dot(m2), self.transition_matrix) / likelihood expected_transition_counts += xi return (expected_transition_counts, expected_observation_counts, likelihood) def _be_prepared_for_baum_welch(self, training_set, mode = 'uniform', inf = None): """Initialize transition_matrix, emission_matrix, and initial_probability for Baum-Welch. 
@param training_set: the training data @param mode: can be 'uniform', 'random', or 'sneaky' @inf: used in sneaky mode; this is a file containing a dictionary serialization of an HMM object """ self.populate_alphabets(training_set) HVAL = 100 #added to high positions in sparse rows for weak training LVAL = 1 #added to low positions in sparse rows for weak training if mode == 'uniform': #all elements in a row are equal self.transition_matrix += (1.0 / numpy.size(self.transition_matrix, 1)) self.emission_matrix += (1.0 / numpy.size(self.emission_matrix, 1)) self.initial_probability += (1.0 / numpy.size(self.initial_probability)) else: #elements will be unequal #choose one element per row per matrix to be #much higher than its dear siblings if mode == 'random': #high element is selected randomly random.seed() trans = [random.choice(range(numpy.size(self.transition_matrix, 1))) \ for i in range(numpy.size(self.transition_matrix, 0))] emits = [random.choice(range(numpy.size(self.emission_matrix, 1))) \ for i in range(numpy.size(self.emission_matrix, 0))] init = random.choice(range(len(self.initial_probability))) elif mode == 'sneaky': #use some information from the data, but don't tell anyone! 
tempdict = HMM.from_dict(cPickle.load(inf)) tcounts, icounts, ocounts = [tempdict[i] for i in 'transition_matrix', \ 'initial_probability', 'emission_matrix'] trans = numpy.argmax(tcounts, 1) emits = numpy.argmax(ocounts, 1) init = numpy.argmax(icounts) #ensure that no element is zero and that the selected element is substantially higher self.transition_matrix[range(numpy.size(self.transition_matrix, 0)), trans] += HVAL self.transition_matrix += LVAL self.emission_matrix[range(numpy.size(self.emission_matrix, 0)), emits] += HVAL self.emission_matrix += LVAL self.initial_probability[init] += HVAL self.initial_probability += LVAL #normalize self.transition_matrix = (self.transition_matrix.T / numpy.sum(\ self.transition_matrix, 1)).T self.emission_matrix = (self.emission_matrix.T / numpy.sum(\ self.emission_matrix, 1)).T self.initial_probability /= sum(self.initial_probability) def baum_welch_train(self, instance_list): """Baum-Welch unsupervised training Before calling this function, you have to call self.populate_alphabets(instance_list) and then initialize transition matrix, observation matrix, and initial probability. It's ok to fix initial probability to 1 / self.num_states (Uniform) This function is not so optimized, so it can't turn the crank on too large a dataset. 
""" num_states = self.label_alphabet.size() num_features = self.feature_alphabet.size() old_total_loglikelihood = - numpy.Infinity for i in xrange(30): expected_observation_counts = numpy.zeros((num_states, num_features)) expected_transition_counts = numpy.zeros((num_states, num_states)) total_log_likelihood = 0 #E-Step for instance in instance_list: transition_counts, obs_counts, likelihood = self.compute_expected_counts(instance) expected_observation_counts += obs_counts expected_transition_counts += transition_counts total_log_likelihood += numpy.log(likelihood) #M-Step self.transition_matrix = (expected_transition_counts.transpose() / numpy.sum(expected_transition_counts, 1)).transpose() self.emission_matrix = (expected_observation_counts.transpose() / numpy.sum(expected_observation_counts, 1)).transpose() print 'Iteration %s : %s ' % (i, total_log_likelihood) if total_log_likelihood < old_total_loglikelihood: break old_total_loglikelihood = total_log_likelihood self.initial_probability = numpy.zeros(num_states) + 1.0/num_states def classify_instance(self, instance): """Viterbi decoding algorithm Returns a list of label strings e.g. 
['Hot', 'Cold', 'Cold'] Add your implementation """ self._mutate_data(instance) #just in case #initialization slength = len(instance.data) v = numpy.zeros((self.num_states, slength)) backtrace = numpy.zeros((self.num_states, slength)) v[:, 0] = self.initial_probability * self.emission_matrix[:, \ instance.data[0]] #recursion for t in range(1, slength): tempmat = v[:, t-1] * self.transition_matrix.T maxis = numpy.argmax(tempmat, axis = 1) backtrace[:, slength - t] = maxis #facilitates reversal later v[:, t] = v[maxis, t-1] * self.transition_matrix[maxis, \ xrange(numpy.size(self.transition_matrix, 1))] * \ self.emission_matrix[:, instance.data[t]] #termination backtrace[:, 0] = v[:, -1] return self._run_backtrace(backtrace) def _run_backtrace(self, back_mat): """ Helper function for extracting @param back_mat: a deque """ stack = [numpy.argmax(back_mat[:, 0])] for ind in xrange(1, numpy.size(back_mat, 1)): stack.append(back_mat[stack[-1], ind]) res = [] while stack: res.append(self.label_alphabet.get_label(stack.pop())) return res def print_parameters(self): """Print the two parameter matrices You should take advantage of this function in debugging and inspecting the resulting parameters. This function is implemented for you. """ state_header = map(str, [self.label_alphabet.get_label(i) \ for i in xrange(self.label_alphabet.size())]) obs_header = map(str, [self.feature_alphabet.get_label(i) \ for i in xrange(self.feature_alphabet.size())]) print matrix_to_string(self.emission_matrix, state_header, obs_header) print matrix_to_string(self.transition_matrix, state_header, state_header) def to_dict(self): """Convert HMM instance into a dictionary representation The implementation of this should be in sync with from_dict function. You should be able to use these two functions to convert the model into either representation (object or dictionary) We have enough of this. This is fully implemented for you. 
""" model_dict = { 'label_alphabet': self.label_alphabet.to_dict(), 'feature_alphabet': self.feature_alphabet.to_dict(), 'transition_matrix': self.transition_matrix.to_list(), 'emission_matrix': self.emission_matrix.to_list(), 'initial_probability': self.initial_probability.to_list() } return model_dict @classmethod def from_dict(model_dict): """Convert a dictionary into HMM instance The implementation of this should be in sync with to_dict function. This is fully implemented for you. """ hmm = HMM() hmm.label_alphabet = Alphabet.from_dict(model_dict['label_alphabet']) hmm.feature_alphabet = Alphabet.from_dict(model_dict['feature_alphabet']) hmm.transition_matrix = numpy.array(model_dict['transition_matrix']) hmm.emission_matrix = numpy.array(model_dict['emission_matrix']) hmm.initial_probability = numpy.array(model_dict['initial_probability']) return hmm
class Parser:
    """Graph-based dependency parser with perceptron-trained arc weights.

    Each candidate arc src->dst is described by a sparse feature vector
    (see `featurize`).  A sentence is turned into a fully connected graph
    of such vectors, the vectors are scored against `self.weights`, and the
    module-level `mst` function picks a tree.

    Output uses the single-argument ``print(...)`` form, which behaves the
    same as a Python 2 print statement and as a Python 3 function call.
    """

    def __init__(self, feature_generator_list, decay = False):
        """Set up empty alphabets, caches and default hyper-parameters.

        Arg:
            feature_generator_list: callables invoked as f(src, dst, sentence),
                each returning (feature, value) pairs.
            decay: when true, `train` shrinks the learning rate as it goes.
        """
        self.feature_generator_list = feature_generator_list
        self.feature_alphabet = Alphabet()
        self.label_alphabet = Alphabet() #you will need this if you use labeled arc
        self.weights = None
        self.learning_rate = 0.0001
        self.num_iterations = 10
        self.caches = {}
        self.decay = decay

    def featurize(self, src, dst, sentence, grow_alphabets=True):
        """Generate feature indices for an arc from src->dst

        Arg:
            Arc from src(index)->dst(index)
            sentence is a dictionary in which you can put whatever in.
            grow_alphabets: when true, unseen features (and the arc labels in
                sentence['arcs']) are added to the alphabets first.

        Returns:
            (list of feature indices, numpy array of the matching values).
            Features missing from the feature alphabet are dropped.
        """
        feature_list = []
        for feature_generator in self.feature_generator_list:
            feature_list.extend(feature_generator(src, dst, sentence))
        if grow_alphabets: #set to false when running this function on dev/test set
            for feature, _ in feature_list:
                self.feature_alphabet.add(feature)
            # distinct loop names so the src/dst parameters are not clobbered
            for _head, _dep, label in sentence['arcs']:
                self.label_alphabet.add(label)
        indices = []
        values = []
        for feature, feature_value in feature_list:
            if self.feature_alphabet.has_label(feature):
                indices.append(self.feature_alphabet.get_index(feature))
                values.append(feature_value)
        return (indices, numpy.array(values))

    def make_fully_connected_graph(self, sentence):
        """Make a graph to make an MST from

        If G is such graph, then G[i][j] holds the `featurize` result for the
        arc from token i to token j, for every pair i != j.  The alphabets
        are never grown here.

        Arg:
            sentence is a dictionary in which you can put whatever in;
            sentence['tokens'] determines the node indices.
        """
        indices = range(len(sentence['tokens']))
        G = {}
        for i in indices:
            G[i] = {}
            for j in indices:
                if i != j:
                    G[i][j] = self.featurize(int(i), int(j), sentence, False)
        return G

    ###########################
    #Actual training function!#
    ###########################
    def train(self, training_sentences, dev_sentences=None, prealloc=False):
        """Perceptron algorithm for learning edge weights

        If a dev set is provided, the parser is evaluated on it before every
        second pass so progress can be followed.

        Arg:
            training_sentences: a list of sentence dictionaries.
            dev_sentences: optional list of sentence dictionaries.
            prealloc: when true, the alphabets and self.weights are assumed to
                be populated already and are left untouched.
        """
        #cache training sentences; populate alphabets
        print("Populating features and caching training sentences ...")
        self._add_to_caches(training_sentences, 'training', not prealloc)
        print("Done!")
        #cache dev sentences
        # One truthiness test decides both caching and evaluation, so an empty
        # dev list can no longer reach evaluate() without a 'dev' cache.
        use_dev = bool(dev_sentences)
        if use_dev:
            print("Caching dev sentences ... ")
            self._add_to_caches(dev_sentences, 'dev', False)
            print("Done!")
        #initialize weight vector
        if not prealloc: #don't touch this business if it's preallocated
            print("Initializing weight vector ...")
            # small constant start value for every weight
            self.weights = numpy.zeros(len(self.feature_alphabet)) + .00001
            print("Done!")
        #okay, start training, bro
        n_train = len(training_sentences)
        for i in range(self.num_iterations):
            print("Pass %d:\n" % (i + 1))
            if use_dev and i % 2 == 0: # tracking progress
                print("Current UAS: %f" % self.evaluate(dev_sentences, 'dev'))
            for j, _ in enumerate(training_sentences):
                if not j % 1000:
                    print("Training on sentences %d to %d of %d ..." % \
                        (j, min(j+999, n_train), n_train))
                fcg = self.caches['training']['fcgs'][j]
                graph = self._featurized_to_weighted(fcg)
                max_spanning_tree = mst(0, graph)
                gold = self.caches['training']['counts'][j]
                hypo = self._get_counts(self._fcg_to_featurized(\
                    fcg, max_spanning_tree))
                self._mutate_weights(gold, hypo)
            # NOTE(review): the original layout was flattened, so it is not
            # certain whether decay ran per pass or per sentence; per pass is
            # assumed here -- confirm.
            if self.decay:
                self.learning_rate *= 0.9

    def evaluate(self, sentences, key):
        """Compute evaluation metrics

        Computes the Unlabeled Arc Score (UAS): shared arcs / gold arcs over
        the sentences cached under `key`.  Sentences for which `mst` raises
        are skipped (a '.' is written) and do not count towards the total.
        Returns 0.0 when no gold arc was scored.
        """
        import sys
        good = 0
        total = 0
        for pos, sent in enumerate(sentences):
            fcg = self.caches[key]['fcgs'][pos]
            graph = self._featurized_to_weighted(fcg)
            try:
                hypo = self._arcset(mst(0, graph))
            except Exception: #debug; best effort, but let KeyboardInterrupt through
                sys.stdout.write('. ')
                continue
            # separate names: in Python 2 comprehension variables leak and
            # would overwrite the sentence index
            gold = set([(int(head), int(dep)) for head, dep, _lab in sent['arcs']])
            good += len(hypo.intersection(gold))
            total += len(gold)
        if not total:
            return 0.0
        return float(good)/ total

    def serialize(self, fname):
        """Convert to dictionary representation and serialize."""
        d = {
            'weights': self.weights,
            'feat_alph': self.feature_alphabet.to_dict(),
            'label_alph': self.label_alphabet.to_dict(),
            'features': self.feature_generator_list,
            'decay': self.decay,
        }
        with open(fname, 'wb') as outf:
            cPickle.dump(d, outf)

    def deserialize(self, fname):
        """Retrieve from serialization; keep defaults where possible."""
        # SECURITY: unpickling executes arbitrary code; only load files that
        # were written by serialize() from a trusted source.
        with open(fname, 'rb') as inf:
            d = cPickle.load(inf)
        self.weights = d['weights']
        self.feature_alphabet = Alphabet.from_dict(d['feat_alph'])
        self.label_alphabet = Alphabet.from_dict(d['label_alph'])
        self.feature_generator_list = d['features']
        self.decay = d['decay']

    def try_parse(self, inp):
        """Determine whether provided input is a file or a string.

        If `inp` can be opened as a file its contents are parsed, otherwise
        `inp` itself is treated as the text.  Each sentence found by
        nltk.sent_tokenize is passed to `parse`.
        """
        import nltk
        #was it a text file?
        try:
            with open(inp, 'rb') as handle: # closed even if read() fails
                inp = handle.read()
        #nope!
        except IOError:
            pass
        for sentence in nltk.sent_tokenize(inp):
            self.parse(sentence)

    def parse(self, sentence_string):
        """Extra credit : parse an arbitrary string

        Tokenizes and POS-tags the string with nltk, prepends a ROOT token,
        builds and scores the fully connected graph, runs `mst` from node 0,
        prints every head->dependent index pair and draws the tree.
        Labeled arcs are not supported.
        """
        #draw a tree
        from nltk.draw.tree import draw_trees
        from nltk.tree import Tree
        import nltk
        sentence = {'tokens': ['ROOT'], 'arcs': [], 'pos':['ROOT']}
        for word, pos in nltk.pos_tag(nltk.word_tokenize(sentence_string)):
            sentence['tokens'].append(word)
            sentence['pos'].append(pos)
        fcg = self.make_fully_connected_graph(sentence)
        weighted = self._featurized_to_weighted(fcg)
        max_spanning_tree = mst(0, weighted)
        #print the dependencies
        for head in max_spanning_tree:
            for dep in max_spanning_tree[head]:
                print("%s->%s" % (head, dep))
        bracketed = self._build_tree(max_spanning_tree, 0, sentence['tokens'])
        # Newer NLTK parses bracketed strings via Tree.fromstring; older
        # releases accept the string in the constructor, as the original did.
        make_tree = getattr(Tree, 'fromstring', Tree)
        draw_trees(make_tree(bracketed))

    ###################################
    #A whole bunch of helper functions#
    ###################################
    def _build_tree(self, G, root, wlist):
        """Return the subtree under `root` as a bracketed string.

        A node with dependents becomes '(word' + space-joined child strings
        + ')'; a node without an entry in G becomes '(word)'.
        """
        if root in G:
            children = [self._build_tree(G, ind, wlist) for ind in G[root]]
            return '(' + str(wlist[root]) + ' '.join(children) + ')'
        return '(%s)' % str(wlist[root])

    def _featurized_to_weighted(self, graph):
        """Converts a fully-connected graph to one with arc weights

        The weight of i->j is minus the dot product of the arc's feature
        values with the matching entries of self.weights (presumably because
        `mst` minimises -- TODO confirm).  A weight of exactly zero is
        replaced by 1, as in the original code.
        """
        wG = {}
        nodes = list(graph.keys())
        for i in nodes:
            # every other node is a target: the graph is expected to be
            # fully connected
            for j in nodes:
                if i == j:
                    continue
                feat_indices, feat_values = graph[i][j]
                arclength = -(numpy.sum(self.weights[feat_indices] * feat_values))
                if not arclength:
                    arclength = 1
                wG.setdefault(i, {})[j] = arclength
        return wG

    def _add_to_caches(self, sentence_set, key, grow_alph):
        """Add to the stored caches under a given key

        Stores, per sentence, the gold feature counts ('counts') and the
        fully connected feature graph ('fcgs').
        """
        cache = {'fcgs': [], 'counts': []}
        self.caches[key] = cache
        # First pass: gold arcs only; this is where the alphabets may grow.
        for sentence in sentence_set:
            cache['counts'].append(\
                self._get_counts(self._sentence_to_featurized(\
                    sentence, grow_alph)))
        # Second pass: built only after every sentence has had the chance to
        # extend the alphabet, because featurize() drops unknown features.
        for sentence in sentence_set:
            cache['fcgs'].append(self.make_fully_connected_graph(sentence))

    def _get_counts(self, graph):
        """Convert a graph into a dictionary of arc counts

        Returns {feature index: summed feature value} over all arcs.
        """
        counts = {}
        for i in graph:
            for j in graph[i]:
                for feat, weight in zip(*graph[i][j]):
                    # dict lookup instead of scanning counts.keys()
                    if feat in counts:
                        counts[feat] += weight
                    else:
                        counts[feat] = weight
        return counts

    def _fcg_to_featurized(self, fully_connected, spanning_tree):
        """
        Given a maximum spanning tree, retrieve the appropriate
        features from the fcg.
        """
        feat_tree = {}
        for head in spanning_tree:
            for dep in spanning_tree[head]:
                feat_tree.setdefault(head, {})[dep] = fully_connected[head][dep]
        return feat_tree

    def _sentence_to_featurized(self, sentence, grow = True):
        """
        Create a graph dictionary with feature vectors for the gold arcs
        listed in sentence['arcs'].
        """
        G = {}
        #featurize all arcs
        for src, dst, label in sentence['arcs']:
            features = self.featurize(int(src), int(dst), sentence, grow)
            # setdefault replaces the former bare try/except
            G.setdefault(int(src), {})[int(dst)] = features
        return G

    def _mutate_weights(self, gold, hypo):
        """
        Change the weights by comparing a hypothesis with the gold standard.

        Each feature's weight moves by (gold count - hypothesis count) *
        learning rate.
        """
        #get set of all features involved; aggregate counts
        feats = list(set(gold.keys()).union(set(hypo.keys())))
        if not feats:
            return
        deltas = numpy.array([gold.get(f, 0) - hypo.get(f, 0) for f in feats])
        #adjust weights (explicit lists: dict views cannot index an array)
        self.weights[feats] += deltas * self.learning_rate

    def _arcset(self, G):
        """Return the set of (head, dependent) pairs in graph G."""
        return set([(i, j) for i in G for j in G[i]])
def __init__(self):
    """Create an instance with two empty alphabets and unset parameters."""
    # Same construction order as before: label alphabet, then feature alphabet.
    self.label_alphabet, self.feature_alphabet = Alphabet(), Alphabet()
    # The three parameter tables all start out as None.
    self.transition_matrix = self.emission_matrix = self.initial_probability = None
class TranslationModel:
    """Word-translation table trained with EM, combined with a bigram LM.

    `t_table[e, f]` is re-estimated by `expectation_maximization` from the
    aligned sentence pairs; `translation_log_prob` adds the table score and
    the language-model score of a candidate.

    Output uses the single-argument ``print(...)`` form (or sys.stdout for
    partial lines) so it behaves the same under Python 2 and Python 3.
    """

    def __init__(self, aligned_sentences, max_iterations=-1, eta=None):
        """
        :param aligned_sentences: a list of tuples of aligned sentences
        :param max_iterations: the number of iterations to run EM; -1 means
                                run until `has_converged` says stop
        :param eta: the value that the delta of the EM probabilities
                    must fall below to be considered converged
        """
        self.aligned_sentences = aligned_sentences
        self.e_alphabet = Alphabet()
        self.f_alphabet = Alphabet()
        if eta is None:
            # very simple heuristic
            self.eta = len(aligned_sentences)/100.
        else:
            self.eta = eta
        self.max_iterations = max_iterations
        # NOTE: despite its name, do_more(i) returns True when EM must STOP.
        if max_iterations == -1:
            self.do_more = self.has_converged
        else:
            self.do_more = self.stop_iterations

    def convert_to_vector(self, raw_data, lang="e", training=True):
        """
        :param raw_data: a tokenized sentence
        :param lang: whether it's source or target ("e" selects e_alphabet,
                     anything else f_alphabet)
        :param training: whether this is during training or testing
        :return: numpy array of the integers corresponding to words; when
                 not training, words whose lookup raises KeyError are left
                 out, so the result may be shorter than raw_data
        """
        alphabet = self.e_alphabet if lang == "e" else self.f_alphabet
        if training:
            # list comprehension: map() is lazy on Python 3
            return numpy.array([alphabet.get_index(word) for word in raw_data],
                               dtype=int)
        known = []
        for word in raw_data:
            try:
                known.append(alphabet.get_index(word))
            except KeyError:
                # ignoring OOV words -- really dropped now, instead of being
                # silently mapped to index 0 (a real vocabulary entry)
                continue
        return numpy.array(known, dtype=int)

    def populate_alphabets(self):
        """
        Populates the alphabets so that the tokens can have an integer
        representation. Also converts the sentences into this format
        (stored on each instance as `.data`).
        """
        for e_instance, f_instance in self.aligned_sentences:
            for token in e_instance.raw_data:
                self.e_alphabet.add(token)
            for token in f_instance.raw_data:
                self.f_alphabet.add(token)
            e_instance.data = self.convert_to_vector(e_instance.raw_data, "e")
            f_instance.data = self.convert_to_vector(f_instance.raw_data, "f")

    def init_translation_table(self):
        """
        Sets up the class field of the translation table and the cache of
        the previous table in order to do the initial delta.
        Initializes the probability of everything at 0.25
        """
        shape = [self.e_alphabet.size(), self.f_alphabet.size()]
        self.t_table = numpy.zeros(shape)
        self.previous_t_table = numpy.zeros(shape)
        self.t_table.fill(.25)

    def _write_inline(self, text):
        """Write `text` plus a trailing space to stdout without a newline."""
        import sys
        sys.stdout.write(text + " ")

    def expectation_maximization(self):
        """
        runs the EM algorithm for a specific number of iterations or until
        it has converged.

        After each iteration `self.counts` holds the expected counts,
        `self.total` their per-source-word sums and `self.t_table` the
        re-normalised table counts / total.
        """
        n_e = self.e_alphabet.size()
        n_f = self.f_alphabet.size()
        i = 0
        while not self.do_more(i):
            time1 = time.time()
            self._write_inline("iteration {:d}".format(i+1))
            # initialize
            self.total = numpy.zeros(n_f)
            self.counts = numpy.zeros([n_e, n_f])
            for e_instance, f_instance in self.aligned_sentences:
                f_data = f_instance.data
                for e in e_instance.data:
                    # compute normalization for this target token
                    row = self.t_table[e, f_data]
                    share = row / numpy.sum(row)
                    # collect counts; add.at accumulates repeated source
                    # indices, which `arr[idx] += x` would apply only once
                    numpy.add.at(self.counts, (e, f_data), share)
                    numpy.add.at(self.total, f_data, share)
            # estimate probabilities
            self.t_table = self.counts/self.total
            i += 1
            self._write_inline("\t{:.3f} seconds".format(time.time()-time1))

    def has_converged(self, i):
        """
        calculates the delta, sees if it is lower than eta

        @param i: iteration number; on the first call (i == 0) nothing is
                  printed and convergence is never reported
        @return: a boolean whether the EM iterations need to stop
        """
        delta = numpy.sum(numpy.abs(self.t_table - self.previous_t_table))
        self.previous_t_table = numpy.copy(self.t_table)
        # NOTE(review): the original layout was flattened; the eta test is
        # assumed to sit inside this branch so that at least one EM iteration
        # always runs -- confirm.
        if i != 0:
            print("\tdelta: {:.3f}".format(delta))
            if delta < self.eta:
                return True
        return False

    def stop_iterations(self, i):
        """
        @param i: current iteration number
        @return: boolean whether EM need to stop iterating
        """
        print("") # ends the progress line of the previous iteration
        return i >= self.max_iterations

    def train(self):
        """
        does all tasks necessary to train our model
        """
        self.populate_alphabets()
        self.init_translation_table()
        self.expectation_maximization()
        self.build_language_model()

    def evaluate(self, candidate_data):
        """
        given candidate translations, this will select the best
        according to our translation table and language model.
        prints the source sentence and best candidate, which is argmax(p(t|s))

        :param candidate_data: a list of ((source, tokenized source),
                               [(candidate, tokenized candidate), ...])
        """
        for (source, source_sent_tokenized), candidates in candidate_data:
            candidate_scores = numpy.zeros(len(candidates))
            for i, (c_sent, c) in enumerate(candidates):
                candidate_scores[i] = self.translation_log_prob(c, source_sent_tokenized)
            best = candidates[numpy.argmax(candidate_scores)][0]
            print(u"source sentence: {:s}".format(source))
            print(u"best translation: {:s}".format(best))
            print("")

    def t_table_log_prob(self, e_sentence, f_sentence):
        """
        gives the log(p(s|t))

        Computed in log space as
            sum_e log(sum_f t_table[e, f]) - len(e) * log(len(f))
        which equals the log of prod_e(sum_f t_table[e, f]) / len(f)**len(e)
        without underflow or overflow.  Lengths are those of the
        sentences after OOV words were dropped.

        :param e_sentence: tokenized target sentence
        :param f_sentence: tokenized candidate sentence
        :return: the log probability of a sentence translating to a candidate;
                 0.0 if no target word is known, -inf if target words are
                 known but no source word is
        """
        e_vec = self.convert_to_vector(e_sentence, lang="e", training=False)
        f_vec = self.convert_to_vector(f_sentence, lang="f", training=False)
        if len(e_vec) == 0:
            return 0.0 # empty product
        if len(f_vec) == 0:
            return -numpy.inf # nothing could have generated the target words
        row_sums = self.t_table[numpy.ix_(e_vec, f_vec)].sum(axis=1)
        return numpy.sum(numpy.log(row_sums)) - len(e_vec) * numpy.log(len(f_vec))

    def build_language_model(self):
        """
        creates the language model for our target language and saves it in a class field
        """
        all_data = [e_sentence.data for e_sentence, _ in self.aligned_sentences]
        self.language_model = BigramModel(all_data, self.e_alphabet)
        self.language_model.train()

    def translation_log_prob(self, e_sentence, f_sentence):
        """
        :param e_sentence: tokenized target sentence
        :param f_sentence: tokenized source sentence
        :return: t_table_log_prob(e, f) plus the language-model log
                 probability of e_sentence
        """
        return self.t_table_log_prob(e_sentence, f_sentence) + self.language_model.log_prob(e_sentence)

    def save(self):
        """Not implemented."""
        raise NotImplementedError

    def load(self, data):
        """Not implemented."""
        raise NotImplementedError