Example #1
def test_add_one_word(self):
    """Assert that adding a single word to a newly constructed Vocabulary works as expected."""
    v = Vocabulary()
    i = v.get_int("hello")
    self.assertEqual(i, 0)
    self.assertEqual(v.size(), 1)
    self.assertEqual(v.words(), ["hello"])
    self.assertEqual(v.get_int("hello"), i)
    self.assertEqual(v.get_word(0), "hello")
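
This test pins down the Vocabulary contract: get_int assigns 0-based ints in first-seen order and returns the same int on repeat lookups, while size, words, and get_word expose the mapping in both directions. The class itself is not shown on this page; a minimal sketch that satisfies the test above (hypothetical, the assignment's actual implementation may differ) could look like this:

class Vocabulary:
    """Hypothetical minimal bidirectional word <-> int mapping."""

    def __init__(self):
        self._word2int = dict()
        self._words = list()

    def get_int(self, word):
        # Assign the next unused int the first time a word is seen.
        if word not in self._word2int:
            self._word2int[word] = len(self._words)
            self._words.append(word)
        return self._word2int[word]

    def get_word(self, i):
        return self._words[i]

    def size(self):
        return len(self._words)

    def words(self):
        return list(self._words)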
Example #2
import math

import nltk


class ParallelCorpus:

    # Define a constructor
    def __init__(self):

        # List of English sentences. Each sentence will be represented as a list of ints.
        self.e = list() 

        # List of foreign sentences. Each sentence will be represented as a list of ints.
        self.f = list() 

        # Initially empty vocabularies
        self.e_vocab = Vocabulary()
        self.f_vocab = Vocabulary()


    # Returns the number of sentence pairs that have been added to this parallel corpus
    def size(self):
        return len(self.e)

    # Returns the list of integers corresponding to the English sentence at the specified sentence index
    def get_e(self, sentence_index):
        return self.e[sentence_index]

    # Returns the list of integers corresponding to the foreign sentence at the specified sentence index
    def get_f(self, sentence_index):
        return self.f[sentence_index]


    # Given a string representing an English sentence
    #   and a string representing a foreign sentence,
    #   tokenize each string using nltk.word_tokenize,
    #   and use the appropriate vocabulary to convert each token to an int.
    #   
    # Append the list of integers (corresponding to the English sentence) to self.e
    # Append the list of integers (corresponding to the foreign sentence) to self.f
    def add(self, e, f):
        # tokenize both sentences
        english_words = nltk.word_tokenize(e)
        foreign_words = nltk.word_tokenize(f)

        # lists that will hold the token ints
        english_list = list()
        foreign_list = list()

        # map each token to its int via the matching vocabulary
        for english_word in english_words:
            english_list.append(self.e_vocab.get_int(english_word))

        for foreign_word in foreign_words:
            foreign_list.append(self.f_vocab.get_int(foreign_word))

        # append the int-encoded sentences to self.e and self.f
        self.e.append(english_list)
        self.f.append(foreign_list)


    # Construct a conditional distribution with the given name.
    #
    # Use the formula given in the supplementary instructions
    def create_uniform_distribution(self, name):
        # uniform initial value: 1 / (size of the foreign vocabulary)
        init_val = 1 / self.f_vocab.size()
        return Conditional(name, self.e_vocab, self.f_vocab, init_val)

    # Given a sentence index, a scaling factor epsilon, and a conditional distribution,
    #    calculate the conditional probability 
    #    of the English sentence (at that sentence index) 
    #    given the foreign sentence (at that sentence index)
    #
    # Use the formula given in the supplementary instructions
    def conditional_probability(self, sentence_index, epsilon, conditional):
        # get the int-encoded sentences at sentence_index
        english = self.get_e(sentence_index)
        foreign = self.get_f(sentence_index)

        # scaling factor from the formula: epsilon / (l_f ** l_e)
        factor = epsilon / (len(foreign) ** len(english))
        sum_of_sum = 0

        # accumulate t(e_j | f_i) over every English/foreign word pair
        for j in range(len(english)):
            for i in range(len(foreign)):
                sum_of_sum += conditional.get(english[j], foreign[i])

        return factor * sum_of_sum


    # Given a conditional distribution and a scaling factor epsilon,
    #    calculate the perplexity of this parallel corpus.
    #
    # Use the formula given in the supplementary instructions
    def perplexity(self, epsilon, conditional):
        # following the formula: PP = -(sum over all sentence pairs of log2 P(e | f))
        sum_perx = 0
        for s in range(self.size()):
            prob = self.conditional_probability(s, epsilon, conditional)
            sum_perx += math.log2(prob)

        return -sum_perx
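
A hypothetical end-to-end sketch of how this class is driven (the page does not show the assignment's Conditional class, so a uniform stub stands in for it here; nltk.word_tokenize also needs NLTK's punkt models installed):

class Conditional:
    # Stub: every t(e | f) is the same initial value, which is all
    # create_uniform_distribution above requires.
    def __init__(self, name, e_vocab, f_vocab, initial_value):
        self.name = name
        self._value = initial_value

    def get(self, e_int, f_int):
        return self._value


corpus = ParallelCorpus()
corpus.add("the house", "das Haus")
corpus.add("the book", "das Buch")

t = corpus.create_uniform_distribution("t")
print(corpus.conditional_probability(0, 1.0, t))  # P(e_0 | f_0)
print(corpus.perplexity(1.0, t))                  # -sum over s of log2 P(e_s | f_s)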
Example #3
def test_empty(self):
    """Assert that a newly constructed vocabulary has size zero."""
    v = Vocabulary()
    self.assertEqual(v.size(), 0)
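
These test methods are evidently excerpted from a unittest.TestCase subclass; a hypothetical harness for running them would look like:

import unittest

class VocabularyTest(unittest.TestCase):
    # test_empty, test_add_one_word, etc. go here, indented one level
    pass

if __name__ == "__main__":
    unittest.main()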
Example #4
import math

import nltk


class ParallelCorpus:

    # Define a constructor
    def __init__(self):

        # List of English sentences. Each sentence will be represented as a list of ints.
        self.e = list()

        # List of foreign sentences. Each sentence will be represented as a list of ints.
        self.f = list()

        # Initially empty vocabularies
        self.e_vocab = Vocabulary()
        self.f_vocab = Vocabulary()

    # Returns the number of sentence pairs that have been added to this parallel corpus
    def size(self):
        return len(self.e)

    # Returns the list of integers corresponding to the English sentence at the specified sentence index
    def get_e(self, sentence_index):
        return self.e[sentence_index]

    # Returns the list of integers corresponding to the foreign sentence at the specified sentence index
    def get_f(self, sentence_index):
        return self.f[sentence_index]

    # Given a string representing an English sentence
    #   and a string representing a foreign sentence,
    #   tokenize each string using nltk.word_tokenize,
    #   and use the appropriate vocabulary to convert each token to an int.
    #
    # Append the list of integers (corresponding to the English sentence) to self.e
    # Append the list of integers (corresponding to the foreign sentence) to self.f
    def add(self, e, f):
        # build the list of ints for the English sentence
        wordlist = []
        words = nltk.word_tokenize(e)
        for word in words:
            wordlist.append(self.e_vocab.get_int(word))
        self.e.append(wordlist)

        # build the list of ints for the foreign sentence
        wordlist = []
        words = nltk.word_tokenize(f)
        for word in words:
            wordlist.append(self.f_vocab.get_int(word))
        self.f.append(wordlist)

    # Construct a conditional distribution with the given name.
    #
    # Use the formula given in the supplementary instructions
    def create_uniform_distribution(self, name):
        initial_prob = 1.0 / self.f_vocab.size()  # uniform: 1 / (foreign vocabulary size)
        return Conditional(name, self.e_vocab, self.f_vocab, initial_prob)

    # Given a sentence index, a scaling factor epsilon, and a conditional distribution,
    #    calculate the conditional probability
    #    of the English sentence (at that sentence index)
    #    given the foreign sentence (at that sentence index)
    #
    # Use the formula given in the supplementary instructions
    def conditional_probability(self, sentence_index, epsilon, conditional):
        sent_e = self.e[sentence_index]
        sent_f = self.f[sentence_index]
        le = len(sent_e)
        lf = len(sent_f)
        tsum = 0
        for j in range(le):
            for i in range(lf):
                # look up t(e_j | f_i) by word int, not by loop index
                tsum = tsum + conditional.get(sent_e[j], sent_f[i])
        lftole = math.pow(lf, le)
        p = epsilon / lftole * tsum
        return p

    # Given a conditional distribution and a scaling factor epsilon,
    #    calculate the perplexity of this parallel corpus.
    #
    # Use the formula given in the supplementary instructions
    def perplexity(self, epsilon, conditional):
        s = self.size()
        pp = 0
        for i in range(s):
            p = self.conditional_probability(i, epsilon, conditional)
            pp = pp + math.log(p, 2)
        pp = -pp
        return pp
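
The supplementary instructions referenced in the comments are not reproduced on this page. Reading this example (and Example #5 below) back as math, and assuming it matches the handout, the two methods compute

P(e \mid f) = \frac{\epsilon}{l_f^{\,l_e}} \sum_{j=1}^{l_e} \sum_{i=1}^{l_f} t(e_j \mid f_i),
\qquad
PP = -\sum_{s=1}^{S} \log_2 P\big(e^{(s)} \mid f^{(s)}\big),

where l_e and l_f are the lengths of the English and foreign sentences, t(e_j | f_i) is the value returned by conditional.get, and S is the number of sentence pairs in the corpus.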
Example #5
import math

import nltk


class ParallelCorpus:

    # Define a constructor
    def __init__(self):

        # List of English sentences. Each sentence will be represented as a list of ints.
        self.e = list()

        # List of foreign sentences. Each sentence will be represented as a list of ints.
        self.f = list()

        # Initially empty vocabularies
        self.e_vocab = Vocabulary()
        self.f_vocab = Vocabulary()

    # Returns the number of sentence pairs that have been added to this parallel corpus
    def size(self):
        return len(self.e)

    # Returns the list of integers corresponding to the English sentence at the specified sentence index
    def get_e(self, sentence_index):
        return self.e[sentence_index]

    # Returns the list of integers corresponding to the foreign sentence at the specified sentence index
    def get_f(self, sentence_index):
        return self.f[sentence_index]

    # Given a string representing an English sentence
    #   and a string representing a foreign sentence,
    #   tokenize each string using nltk.word_tokenize,
    #   and use the appropriate vocabulary to convert each token to an int.
    #
    # Append the list of integers (corresponding to the English sentence) to self.e
    # Append the list of integers (corresponding to the foreign sentence) to self.f
    def add(self, e, f):
        words_e = nltk.word_tokenize(e)
        words_f = nltk.word_tokenize(f)
        sent_e, sent_f = list(), list()
        for single_e in words_e:
            sent_e.append(self.e_vocab.get_int(single_e))
        for single_f in words_f:
            sent_f.append(self.f_vocab.get_int(single_f))
        self.e.append(sent_e)
        self.f.append(sent_f)

    # Construct a conditional distribution with the given name.
    #
    # Use the formula given in the supplementary instructions
    def create_uniform_distribution(self, name):
        return Conditional(name, self.e_vocab, self.f_vocab,
                           1 / self.f_vocab.size())

    # Given a sentence index, a scaling factor epsilon, and a conditional distribution,
    #    calculate the conditional probability
    #    of the English sentence (at that sentence index)
    #    given the foreign sentence (at that sentence index)
    #
    # Use the formula given in the supplementary instructions
    def conditional_probability(self, sentence_index, epsilon, conditional):
        sent_e = self.get_e(sentence_index)
        sent_f = self.get_f(sentence_index)
        frac = epsilon / (len(sent_f)**len(sent_e))
        sum_total = 0
        for i in range(0, len(sent_e)):
            for j in range(0, len(sent_f)):
                sum_total += conditional.get(sent_e[i], sent_f[j])
        return frac * sum_total

    # Given a conditional distribution and a scaling factor epsilon,
    #    calculate the perplexity of this parallel corpus.
    #
    # Use the formula given in the supplementary instructions
    def perplexity(self, epsilon, conditional):
        sum_total = 0
        for s in range(0, self.size()):
            sum_total += math.log2(
                self.conditional_probability(s, epsilon, conditional))
        return -1 * sum_total
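
As a quick hand check of these formulas (a hypothetical example, not from the page): for a single sentence pair in which both sentences have two tokens and the foreign vocabulary has two words, create_uniform_distribution gives t = 1/2 everywhere. With epsilon = 1, conditional_probability returns 1 / 2**2 * (0.5 + 0.5 + 0.5 + 0.5) = 0.5, and perplexity returns -log2(0.5) = 1.0.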