def test_add_one_word(self):
    """Assert that adding a single word to a newly constructed Vocabulary works as expected"""
    v = Vocabulary()
    i = v.get_int("hello")
    self.assertEqual(i, 0)
    self.assertEqual(v.size(), 1)
    self.assertEqual(v.words(), ["hello"])
    self.assertEqual(v.get_int("hello"), i)
    self.assertEqual(v.get_word(0), "hello")
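# The Vocabulary class itself is not shown in this section. The sketch below is a
# minimal implementation inferred from the assertions in these tests (get_int
# assigns ids in first-seen order starting at 0; size, words, and get_word behave
# as the tests expect). The course's actual class may differ in details.
class Vocabulary:
    def __init__(self):
        self.word2int = dict()   # word -> int
        self.int2word = list()   # int -> word, in insertion order

    def get_int(self, word):
        # Assign the next unused id the first time a word is seen
        if word not in self.word2int:
            self.word2int[word] = len(self.int2word)
            self.int2word.append(word)
        return self.word2int[word]

    def get_word(self, i):
        return self.int2word[i]

    def size(self):
        return len(self.int2word)

    def words(self):
        return list(self.int2word)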
def add(self, e, f):
    words_e = nltk.word_tokenize(e)
    words_f = nltk.word_tokenize(f)
    sent_e, sent_f = list(), list()
    for single_e in words_e:
        sent_e.append(self.e_vocab.get_int(single_e))
    for single_f in words_f:
        sent_f.append(self.f_vocab.get_int(single_f))
    self.e.append(sent_e)
    self.f.append(sent_f)
def test_adding_words(self):
    """Assert that words are properly added to the vocabulary"""
    v = Vocabulary()
    tokens = "Four score and seven years ago".split()
    ints = list()
    for token in tokens:
        ints.append(v.get_int(token))
    self.assertEqual(v.words(), tokens)
    for token in tokens:
        i = v.get_int(token)
        self.assertNotEqual(i, None)
        t = v.get_word(i)
        self.assertEqual(t, token)
    for i in range(len(tokens)):
        t = v.get_word(i)
        self.assertEqual(t, tokens[i])
def add(self, e, f):
    # Tokenize the two sentences
    english_words = nltk.word_tokenize(e)
    foreign_words = nltk.word_tokenize(f)
    # Lists that will hold the integer representation of each sentence
    english_list = list()
    foreign_list = list()
    # Look up the int for each word and append it to the sentence's list
    for english_word in english_words:
        english_list.append(self.e_vocab.get_int(english_word))
    for foreign_word in foreign_words:
        foreign_list.append(self.f_vocab.get_int(foreign_word))
    # Append each list of ints to the corresponding corpus list
    self.e.append(english_list)
    self.f.append(foreign_list)
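# The two add implementations above are equivalent. As a quick illustration
# (the sentence pair here is made up, and the exact tokens depend on nltk's
# tokenizer), ids are assigned in first-seen order per vocabulary:
#
#   corpus = ParallelCorpus()
#   corpus.add("the house", "das Haus")
#   corpus.get_e(0)   # [0, 1]  ("the" -> 0, "house" -> 1 in e_vocab)
#   corpus.get_f(0)   # [0, 1]  ("das" -> 0, "Haus" -> 1 in f_vocab)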
def create_vocab(words):
    """Build a Vocabulary from an iterable of words; get_int adds each word as a side effect."""
    v = Vocabulary()
    for word in words:
        v.get_int(word)
    return v
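# A small usage sketch for create_vocab, relying on the Vocabulary behavior shown
# in the tests above: repeated words keep their first id, so duplicates do not
# grow the vocabulary.
v = create_vocab("to be or not to be".split())
assert v.size() == 4
assert v.words() == ["to", "be", "or", "not"]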
import math

import nltk


class ParallelCorpus:
    # Define a constructor
    def __init__(self):
        # List of English sentences. Each sentence will be represented as a list of ints.
        self.e = list()
        # List of foreign sentences. Each sentence will be represented as a list of ints.
        self.f = list()
        # Initially empty vocabularies
        self.e_vocab = Vocabulary()
        self.f_vocab = Vocabulary()

    # Returns the number of sentence pairs that have been added to this parallel corpus
    def size(self):
        return len(self.e)

    # Returns the list of integers corresponding to the English sentence at the specified sentence index
    def get_e(self, sentence_index):
        return self.e[sentence_index]

    # Returns the list of integers corresponding to the foreign sentence at the specified sentence index
    def get_f(self, sentence_index):
        return self.f[sentence_index]

    # Given a string representing an English sentence
    # and a string representing a foreign sentence,
    # tokenize each string using nltk.word_tokenize,
    # and use the appropriate vocabulary to convert each token to an int.
    #
    # Append the list of integers (corresponding to the English sentence) to self.e
    # Append the list of integers (corresponding to the foreign sentence) to self.f
    def add(self, e, f):
        wordlist = []  # list of ints for the English sentence
        words = nltk.word_tokenize(e)
        for word in words:
            wordlist.append(self.e_vocab.get_int(word))
        self.e.append(wordlist)

        wordlist = []  # list of ints for the foreign sentence
        words = nltk.word_tokenize(f)
        for word in words:
            wordlist.append(self.f_vocab.get_int(word))
        self.f.append(wordlist)

    # Construct a conditional distribution with the given name.
    #
    # Use the formula given in the supplementary instructions
    def create_uniform_distribution(self, name):
        # Uniform initial probability over the foreign vocabulary
        # (e.g. 0.25 if the foreign vocabulary contains four words)
        initial_prob = 1.0 / self.f_vocab.size()
        return Conditional(name, self.e_vocab, self.f_vocab, initial_prob)

    # Given a sentence index, a scaling factor epsilon, and a conditional distribution,
    # calculate the conditional probability
    # of the English sentence (at that sentence index)
    # given the foreign sentence (at that sentence index)
    #
    # Use the formula given in the supplementary instructions
    def conditional_probability(self, sentence_index, epsilon, conditional):
        e_sent = self.e[sentence_index]
        f_sent = self.f[sentence_index]
        le = len(e_sent)
        lf = len(f_sent)
        # IBM Model 1: the product over English positions of the sum over
        # foreign positions of t(e_j | f_i), where t is looked up by word id
        # (the ints stored in e_sent / f_sent), scaled by epsilon / lf^le
        product = 1.0
        for j in range(le):
            tsum = 0.0
            for i in range(lf):
                tsum += conditional.get(e_sent[j], f_sent[i])
            product *= tsum
        return epsilon / math.pow(lf, le) * product

    # Given a conditional distribution and a scaling factor epsilon,
    # calculate the perplexity of this parallel corpus.
    #
    # Use the formula given in the supplementary instructions
    def perplexity(self, epsilon, conditional):
        pp = 0.0
        for i in range(self.size()):
            p = self.conditional_probability(i, epsilon, conditional)
            pp += math.log(p, 2)
        return -pp
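# For reference, the quantities computed above are the standard IBM Model 1
# sentence likelihood and the corresponding corpus perplexity (the course's
# supplementary instructions are not reproduced here, so this is the usual
# textbook form):
#
#   P(e | f) = \frac{\epsilon}{l_f^{\, l_e}} \prod_{j=1}^{l_e} \sum_{i=1}^{l_f} t(e_j \mid f_i)
#
#   PP = -\sum_{s=1}^{S} \log_2 P(e^{(s)} \mid f^{(s)})
#
# A small end-to-end usage sketch follows. It assumes the Vocabulary sketch given
# earlier and stubs out Conditional with a hypothetical minimal class (uniform t,
# looked up by word id), because the real Conditional class is not shown in this
# section. nltk.word_tokenize requires the "punkt" tokenizer models
# (nltk.download("punkt")).

class Conditional:
    """Hypothetical stand-in for the course's Conditional class: uniform t(e|f)."""
    def __init__(self, name, e_vocab, f_vocab, initial_prob):
        self.name = name
        self.prob = initial_prob

    def get(self, e_int, f_int):
        # Every (English word, foreign word) pair gets the same initial probability
        return self.prob

corpus = ParallelCorpus()
corpus.add("the house is small", "das Haus ist klein")
corpus.add("the house is big", "das Haus ist gross")
t = corpus.create_uniform_distribution("uniform")
print(corpus.conditional_probability(0, 1.0, t))  # 0.0016 with this toy corpus
print(corpus.perplexity(1.0, t))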