Example #1
    def get(self, e_i, f_i):
        e_word = Vocabulary.get_word(self.e_vocab, e_i)
        f_word = Vocabulary.get_word(self.f_vocab, f_i)
        # look up the value stored for this (e_word, f_word) pair in the nested dict
        ret_val = self.dict_of_dict[e_word][f_word]

        return ret_val
Example #2
    def __str__(self):
        ret_str = ""
        for word_e in Vocabulary.words(self.e_vocab):
            for word_f in Vocabulary.words(self.f_vocab):
                ret_str += self.name + "[" + word_e + " | " + word_f + "] = " + str(
                    self.double_dict[word_e][word_f]) + "\n"
        return ret_str
Example #3
    def __str__(self):
        ret_string = ""

        for e in Vocabulary.words(self.e_vocab):
            for f in Vocabulary.words(self.f_vocab):
                ret_string = ret_string + self.name + "[" + e + " | " + f + "]" + " = " + str(
                    self.dict_of_dict[e][f]) + '\n'
        return ret_string
Example #4
    def add(self, e, f):
        words_e = nltk.word_tokenize(e)
        words_f = nltk.word_tokenize(f)
        sent_e, sent_f = list(), list()
        for single_e in words_e:
            sent_e.append(Vocabulary.get_int(self.e_vocab, single_e))
        for single_f in words_f:
            sent_f.append(Vocabulary.get_int(self.f_vocab, single_f))
        self.e.append(sent_e)
        self.f.append(sent_f)
Example #5
    def __init__(self, name, e_vocab, f_vocab, initial_value):
        self.name = name
        self.e_vocab = e_vocab
        self.f_vocab = f_vocab
        # nested mapping: double_dict[e_word][f_word] -> value
        # (requires: from collections import defaultdict)
        self.double_dict = defaultdict(dict)

        for word_e in Vocabulary.words(e_vocab):
            for word_f in Vocabulary.words(f_vocab):
                self.double_dict[word_e][word_f] = initial_value
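
As a standalone illustration of the nested-dict layout used above (hypothetical words and values, not part of the assignment code):

from collections import defaultdict

# table[e_word][f_word] -> value, initialized uniformly over the foreign words
table = defaultdict(dict)
for e_word in ["the", "house"]:
    for f_word in ["das", "Haus"]:
        table[e_word][f_word] = 1 / 2  # 1 / (size of the foreign vocabulary)

print(table["house"]["Haus"])  # 0.5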
Example #6
    def __init__(self, name, e_vocab, f_vocab, initial_value):
        self.name = name
        self.e_vocab = e_vocab
        self.f_vocab = f_vocab
        # nested mapping: dict_of_dict[e_word][f_word] -> value
        # (requires: from collections import defaultdict)
        self.dict_of_dict = defaultdict(dict)

        # store the initial value for every (e_word, f_word) pair in the dictionary of dictionaries
        for e_word in Vocabulary.words(e_vocab):
            for f_word in Vocabulary.words(f_vocab):
                self.dict_of_dict[e_word][f_word] = initial_value
Example #7
    def __init__(self):

        # List of English sentences. Each sentence will be represented as a list of ints.
        self.e = list() 

        # List of foreign sentences. Each sentence will be represented as a list of ints.
        self.f = list() 

        # Initially empty vocabularies
        self.e_vocab = Vocabulary()
        self.f_vocab = Vocabulary()
Example #8
    def test_add_one_word(self):
        """Assert that adding a single word to a newly constructed Vocabulary works as expected"""
        v = Vocabulary()
        i = v.get_int("hello")
        self.assertEqual(i, 0)
        self.assertEqual(v.size(), 1)
        self.assertEqual(v.words(), ["hello"])
        self.assertEqual(v.get_int("hello"), i)
        self.assertEqual(v.get_word(0), "hello")
Example #9
    def add(self, e, f):
        #tokenizing the sentences
        english_words = nltk.word_tokenize(e)
        foreign_words = nltk.word_tokenize(f)

        #creating lists to store variables
        english_list = list()
        foreign_list = list()

        #going through each word of sentence to find the corresponding int and append to list
        for english_word in english_words:
            cur_int = Vocabulary.get_int(self.e_vocab, english_word)
            english_list.append(cur_int)

        for foreign_word in foreign_words:
            cur_int = Vocabulary.get_int(self.f_vocab, foreign_word)
            foreign_list.append(cur_int)

        # append the lists of ints to self.e and self.f
        self.e.append(english_list)
        self.f.append(foreign_list)
Example #10
    def test_adding_words(self):
        """Assert that words are properly added to the vocabulary"""
        v = Vocabulary()
        tokens = "Four score and seven years ago".split()
        ints = list()
        for token in tokens:
            ints.append(v.get_int(token))

        self.assertEqual(v.words(), tokens)

        for token in tokens:
            i = v.get_int(token)
            self.assertNotEqual(i, None)
            t = v.get_word(i)
            self.assertEqual(t, token)

        for i in range(0, len(tokens)):
            t = v.get_word(i)
            self.assertNotEqual(t, None)
            self.assertEqual(t, tokens[i])
Example #11
    def test_empty(self):
        """Assert that a newly constructed vocabulary has size zero"""
        v = Vocabulary()
        self.assertEqual(v.size(), 0)
Example #12
import math
import nltk

# Vocabulary and Conditional are companion classes from the same assignment,
# assumed to be defined or imported alongside this class.
class ParallelCorpus:

    # Define a constructor
    def __init__(self):

        # List of English sentences. Each sentence will be represented as a list of ints.
        self.e = list() 

        # List of foreign sentences. Each sentence will be represented as a list of ints.
        self.f = list() 

        # Initially empty vocabularies
        self.e_vocab = Vocabulary()
        self.f_vocab = Vocabulary()


    # Returns the number of sentence pairs that have been added to this parallel corpus
    def size(self):
        return len(self.e)

    # Returns the list of integers corresponding to the English sentence at the specified sentence index
    def get_e(self, sentence_index):
        return self.e[sentence_index]

    # Returns the list of integers corresponding to the foreign sentence at the specified sentence index
    def get_f(self, sentence_index):
        return self.f[sentence_index]


    # Given a string representing an English sentence
    #   and a string representing a foreign sentence,
    #   tokenize each string using nltk.word_tokenize,
    #   and use the appropriate vocabulary to convert each token to an int.
    #   
    # Append the list of integers (corresponding to the English sentence) to self.e
    # Append the list of integers (corresponding to the foreign sentence) to self.f
    def add(self, e, f):
        #tokenizing the sentences
        english_words = nltk.word_tokenize(e)
        foreign_words = nltk.word_tokenize(f)

        #creating lists to store variables
        english_list = list()
        foreign_list = list()

        #going through each word of sentence to find the corresponding int and append to list
        for english_word in english_words:
            cur_int = Vocabulary.get_int(self.e_vocab, english_word)
            english_list.append(cur_int)

        for foreign_word in foreign_words:
            cur_int = Vocabulary.get_int(self.f_vocab, foreign_word)
            foreign_list.append(cur_int)

        # append the lists of ints to self.e and self.f
        self.e.append(english_list)
        self.f.append(foreign_list)


    # Construct a conditional distribution with the given name.
    #
    # Use the formula given in the supplementary instructions
    def create_uniform_distribution(self, name):
        # uniform initial value: 1 / (size of the foreign vocabulary)
        init_val = 1 / self.f_vocab.size()
        return Conditional(name, self.e_vocab, self.f_vocab, init_val)

    # Given a sentence index, a scaling factor epsilon, and a conditional distribution,
    #    calculate the conditional probability 
    #    of the English sentence (at that sentence index) 
    #    given the foreign sentence (at that sentence index)
    #
    # Use the formula given in the supplementary instructions
    def conditional_probability(self, sentence_index, epsilon, conditional):
        # get the int-encoded sentences at sentence_index
        english = self.get_e(sentence_index)
        foreign = self.get_f(sentence_index)

        # operands of the probability, following the formula from the PDF
        factor = epsilon / (len(english) * len(foreign))
        sum_of_sum = 0

        # accumulate the conditional value for each (English, foreign) word pair
        for j in range(0, len(english)):
            for i in range(0, len(foreign)):
                sum_of_sum = sum_of_sum + conditional.get(english[j], foreign[i])

        cond_prob = factor * sum_of_sum
        return cond_prob


    # Given a conditional distribution and a scaling factor epsilon,
    #    calculate the perplexity of this parallel corpus.
    #
    # Use the formula given in the supplementary instructions
    def perplexity(self, epsilon, conditional):
        sum_perx = 0
        # following the formula from the PDF: sum the log2 of each sentence pair's probability
        for s in range(0, self.size()):
            prob = self.conditional_probability(s, epsilon, conditional)
            sum_perx = sum_perx + math.log2(prob)

        PP = -1 * sum_perx
        return PP
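
A minimal usage sketch for the class above (the sentences are hypothetical; this assumes nltk's tokenizer data is installed and that the Vocabulary and Conditional classes from the same assignment are available):

corpus = ParallelCorpus()
corpus.add("the house is small", "das Haus ist klein")
corpus.add("the house", "das Haus")
print(corpus.size())    # 2
print(corpus.get_e(1))  # ints assigned by e_vocab, e.g. [0, 1]

t = corpus.create_uniform_distribution("t")
print(corpus.conditional_probability(0, 1.0, t))
print(corpus.perplexity(1.0, t))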
Example #13
import math
import nltk

# Vocabulary and Conditional are companion classes from the same assignment,
# assumed to be defined or imported alongside this class.
class ParallelCorpus:

    # Define a constructor
    def __init__(self):

        # List of English sentences. Each sentence will be represented as a list of ints.
        self.e = list()

        # List of foreign sentences. Each sentence will be represented as a list of ints.
        self.f = list()

        # Initially empty vocabularies
        self.e_vocab = Vocabulary()
        self.f_vocab = Vocabulary()

    # Returns the number of sentence pairs that have been added to this parallel corpus
    def size(self):
        return len(self.e)

    # Returns the list of integers corresponding to the English sentence at the specified sentence index
    def get_e(self, sentence_index):
        return self.e[sentence_index]

    # Returns the list of integers corresponding to the foreign sentence at the specified sentence index
    def get_f(self, sentence_index):
        return self.f[sentence_index]

    # Given a string representing an English sentence
    #   and a string representing a foreign sentence,
    #   tokenize each string using nltk.word_tokenize,
    #   and use the appropriate vocabulary to convert each token to an int.
    #
    # Append the list of integers (corresponding to the English sentence) to self.e
    # Append the list of integers (corresponding to the foreign sentence) to self.f
    def add(self, e, f):
        wordlist = []  # collect the English sentence as a list of ints
        words = nltk.word_tokenize(e)
        for word in words:
            wordlist.append(self.e_vocab.get_int(word))
        self.e.append(wordlist)
        wordlist = []  # collect the foreign sentence as a list of ints
        words = nltk.word_tokenize(f)
        for word in words:
            wordlist.append(self.f_vocab.get_int(word))
        self.f.append(wordlist)

    # Construct a conditional distribution with the given name.
    #
    # Use the formula given in the supplementary instructions
    def create_uniform_distribution(self, name):
        initial_prob = 1.0 / self.f_vocab.size()  # uniform, e.g. 0.25 for a 4-word foreign vocabulary
        return Conditional(name, self.e_vocab, self.f_vocab, initial_prob)

    # Given a sentence index, a scaling factor epsilon, and a conditional distribution,
    #    calculate the conditional probability
    #    of the English sentence (at that sentence index)
    #    given the foreign sentence (at that sentence index)
    #
    # Use the formula given in the supplementary instructions
    def conditional_probability(self, sentence_index, epsilon, conditional):
        sent_e = self.e[sentence_index]
        sent_f = self.f[sentence_index]
        le = len(sent_e)
        lf = len(sent_f)
        tsum = 0
        for j in range(le):
            for i in range(lf):
                # look up the word-int pair, not the positional indices j and i
                tsum = tsum + conditional.get(sent_e[j], sent_f[i])
        lftole = math.pow(lf, le)
        p = epsilon / lftole * tsum
        return p

    # Given a conditional distribution and a scaling factor epsilon,
    #    calculate the perplexity of this parallel corpus.
    #
    # Use the formula given in the supplementary instructions
    def perplexity(self, epsilon, conditional):
        s = self.size()
        pp = 0
        for i in range(s):
            p = self.conditional_probability(i, epsilon, conditional)
            pp = pp + math.log(p, 2)
        pp = -pp
        return pp
Example #14
    def set(self, e_i, f_i, value):
        word_e = Vocabulary.get_word(self.e_vocab, e_i)
        word_f = Vocabulary.get_word(self.f_vocab, f_i)
        self.double_dict[word_e][word_f] = value
Example #15
    def test_empty_list(self):
        """Assert that a newly constructed vocabulary contains an empty list"""
        v = Vocabulary()
        self.assertEqual(v.words(), list())
Example #16
    def test_empty_word_index(self):
        """Assert that a newly constructed vocabulary does not associate any string with index zero"""
        v = Vocabulary()
        self.assertEqual(v.get_word(0), None)
Example #17
    def test_negative_indices(self):
        """Assert that a newly constructed vocabulary returns None for negative numbers"""
        v = Vocabulary()
        for i in range(-1000, -1):
            self.assertEqual(v.get_word(i), None)
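
Taken together, the tests above pin down the Vocabulary API: get_int assigns consecutive indices starting at zero, get_word returns None for any unknown index (including negatives), size counts distinct words, and words() returns them in insertion order. A minimal sketch consistent with those tests (an assumption, not the course's reference implementation):

class Vocabulary:
    def __init__(self):
        self.word_to_int = {}  # word -> index
        self.int_to_word = []  # index -> word

    def get_int(self, word):
        # assign the next free index the first time a word is seen
        if word not in self.word_to_int:
            self.word_to_int[word] = len(self.int_to_word)
            self.int_to_word.append(word)
        return self.word_to_int[word]

    def get_word(self, i):
        # any out-of-range index, including negatives, maps to None
        if 0 <= i < len(self.int_to_word):
            return self.int_to_word[i]
        return None

    def size(self):
        return len(self.int_to_word)

    def words(self):
        return list(self.int_to_word)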
Example #18
    def set(self, e_i, f_i, value):
        e_word = Vocabulary.get_word(self.e_vocab, e_i)
        f_word = Vocabulary.get_word(self.f_vocab, f_i)

        self.dict_of_dict[e_word][f_word] = value
Example #19
    def get(self, e_i, f_i):
        word_e = Vocabulary.get_word(self.e_vocab, e_i)
        word_f = Vocabulary.get_word(self.f_vocab, f_i)
        return self.double_dict[word_e][word_f]
Example #20
def create_vocab(words):
    v = Vocabulary()
    for word in words:
        v.get_int(word)
    return v
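
Hypothetical usage of this helper, assuming a Vocabulary like the sketch after Example #17:

v = create_vocab("the house is small".split())
print(v.size())          # 4
print(v.get_word(1))     # house
print(v.get_int("the"))  # 0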
Example #21
import math
import nltk

# Vocabulary and Conditional are companion classes from the same assignment,
# assumed to be defined or imported alongside this class.
class ParallelCorpus:

    # Define a constructor
    def __init__(self):

        # List of English sentences. Each sentence will be represented as a list of ints.
        self.e = list()

        # List of foreign sentences. Each sentence will be represented as a list of ints.
        self.f = list()

        # Initially empty vocabularies
        self.e_vocab = Vocabulary()
        self.f_vocab = Vocabulary()

    # Returns the number of sentence pairs that have been added to this parallel corpus
    def size(self):
        return len(self.e)

    # Returns the list of integers corresponding to the English sentence at the specified sentence index
    def get_e(self, sentence_index):
        return self.e[sentence_index]

    # Returns the list of integers corresponding to the foreign sentence at the specified sentence index
    def get_f(self, sentence_index):
        return self.f[sentence_index]

    # Given a string representing an English sentence
    #   and a string representing a foreign sentence,
    #   tokenize each string using nltk.word_tokenize,
    #   and use the appropriate vocabulary to convert each token to an int.
    #
    # Append the list of integers (corresponding to the English sentence) to self.e
    # Append the list of integers (corresponding to the foreign sentence) to self.f
    def add(self, e, f):
        words_e = nltk.word_tokenize(e)
        words_f = nltk.word_tokenize(f)
        sent_e, sent_f = list(), list()
        for single_e in words_e:
            sent_e.append(Vocabulary.get_int(self.e_vocab, single_e))
        for single_f in words_f:
            sent_f.append(Vocabulary.get_int(self.f_vocab, single_f))
        self.e.append(sent_e)
        self.f.append(sent_f)

    # Construct a conditional distribution with the given name.
    #
    # Use the formula given in the supplementary instructions
    def create_uniform_distribution(self, name):
        return Conditional(name, self.e_vocab, self.f_vocab,
                           1 / self.f_vocab.size())

    # Given a sentence index, a scaling factor epsilon, and a conditional distribution,
    #    calculate the conditional probability
    #    of the English sentence (at that sentence index)
    #    given the foreign sentence (at that sentence index)
    #
    # Use the formula given in the supplementary instructions
    def conditional_probability(self, sentence_index, epsilon, conditional):
        sent_e = self.get_e(sentence_index)
        sent_f = self.get_f(sentence_index)
        frac = epsilon / (len(sent_f)**len(sent_e))
        sum_total = 0
        for i in range(0, len(sent_e)):
            for j in range(0, len(sent_f)):
                sum_total += conditional.get(sent_e[i], sent_f[j])
        return frac * sum_total

    # Given a conditional distribution and a scaling factor epsilon,
    #    calculate the perplexity of this parallel corpus.
    #
    # Use the formula given in the supplementary instructions
    def perplexity(self, epsilon, conditional):
        sum_total = 0
        for s in range(0, self.size()):
            sum_total += math.log2(
                self.conditional_probability(s, epsilon, conditional))
        return -1 * sum_total
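
The supplementary PDF these solutions reference is not reproduced here, but reading the code, Examples #13 and #21 appear to implement (our reconstruction):

P(e \mid f) = \frac{\epsilon}{l_f^{\,l_e}} \sum_{j=1}^{l_e} \sum_{i=1}^{l_f} t(e_j \mid f_i)
\qquad
PP = -\sum_{s=1}^{S} \log_2 P(e^{(s)} \mid f^{(s)})

where l_e and l_f are the English and foreign sentence lengths, t(e_j | f_i) is the value stored in the Conditional table, and S is the number of sentence pairs. Example #12 instead normalizes by l_e \cdot l_f; which denominator matches the assignment depends on the PDF.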