def __init__(self, lambda_1=0.67):
    self.unigram = Unigram()
    self.bigram = Bigram()
    # just needed for languageModel.py to work
    self.word_dict = self.bigram.word_dict
    self.lambda_1 = lambda_1
    self.lambda_2 = 1 - lambda_1
Example No. 2
class BigramInterpolation(LanguageModel):
    def __init__(self):
        self.unigram = Unigram()
        self.bigram = Bigram()

    def train(self, trainingSentences):
        self.unigram.train(trainingSentences)
        self.bigram.train(trainingSentences)

    def getWordProbability(self, sentence, index):
        # placeholder: the interpolated probability is left unimplemented here
        return 0

    def getVocabulary(self, context):
        # placeholder: should return the vocabulary for the given context
        return []

    def generateWord(self, context):
        # placeholder: always emits the same token
        return 'bunny'

    def generateSentence(self):
        result = []
        # limit sentence length to 20
        for i in range(20):
            word = LanguageModel.UNK
            while word == LanguageModel.UNK:
                # make sure word != UNK
                word = self.generateWord(result)
            result.append(word)
            if word == LanguageModel.STOP:
                break
        return result
Example No. 3
def __init__(self):
    self.unigram_model = Unigram()
    self.bigram_model = BigramAddOneSmooth()
    self.trigram_model = TrigramAddOneSmooth()
    self.unigram_lambda = 0.25
    self.bigram_lambda = 0.25
    self.trigram_lambda = 0.5
Example No. 4
class BigramInterpolation(LanguageModel):
    def __init__(self):
        self.unigram = Unigram()
        self.bigram = Bigram()
        # just needed for languageModel.py to work
        self.word_dict = self.bigram.word_dict
        self.lambda_1 = 0.5
        self.lambda_2 = 0.5

    def train(self, trainingSentences):
        '''
        Trains a bigram-interpolation language model on a training set.
        '''
        self.unigram.train(trainingSentences)
        self.bigram.train(trainingSentences)

    def getWordProbability(self, sentence, index):
        '''
        Returns the probability of the word at index, according to the model,
        within the specified sentence.
        '''
        return (
            self.lambda_1 * self.bigram.getWordProbability(sentence, index) +
            self.lambda_2 * self.unigram.getWordProbability(sentence, index))

    def generateWord(self, context):
        '''
        Returns, for a given context, a random word, according to the
        probabilities in the model.
        '''
        return 'bunny'  # placeholder: a full implementation appears below
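
A quick usage sketch of the class above (the training set is hypothetical; throughout these examples a sentence is a list of word strings):

model = BigramInterpolation()
model.train([['the', 'bunny', 'hops'], ['the', 'bunny', 'sleeps']])
# interpolated P('bunny' | 'the'): 0.5 * bigram estimate + 0.5 * unigram estimate
p = model.getWordProbability(['the', 'bunny', 'hops'], 1)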
Example No. 5
def main():
    raw_data = get_data()
    # Unigram
    uni = Unigram(raw_data)
    uni.main()
    # Bigram
    bi = Bigram(raw_data)
    bi.main()
Example No. 6
import numpy as np

class BigramInterpolation(LanguageModel):

    def __init__(self, lambda_1=0.67):
        self.unigram = Unigram()
        self.bigram = Bigram()
        # just needed for languageModel.py to work
        self.word_dict = self.bigram.word_dict
        self.lambda_1 = lambda_1
        self.lambda_2 = 1 - lambda_1
    
    def train(self, trainingSentences):
        '''
        Trains a bigram-interpolation language model on a training set.
        '''
        self.unigram.train(trainingSentences)
        self.bigram.train(trainingSentences)
    
    def getWordProbability(self, sentence, index):
        '''
        Returns the probability of the word at index, according to the model,
        within the specified sentence.
        '''
        return (self.lambda_1 * self.bigram.getWordProbability(sentence, index)
                + self.lambda_2 * self.unigram.getWordProbability(sentence, index))

    def generateWord(self, context):
        '''
        Returns, for a given context, a random word, according to the
        probabilities in the model.
        '''
        if context:
            previous_word = context[-1]
        else:
            previous_word = LanguageModel.START

        if (previous_word not in self.word_dict) and (previous_word != LanguageModel.START):
            previous_word = LanguageModel.UNK

        if previous_word == LanguageModel.START:
            previous_word_index = 0
        else:
            previous_word_index = self.word_dict[previous_word]

        probs_bigram = self.bigram.prob_counter[previous_word_index].toarray().ravel()
        probs_unigram = self.unigram.prob_counter[0].toarray().ravel()

        # The unigram and bigram models give STOP different word indices, so realign
        stop_index = self.unigram.word_dict[LanguageModel.STOP]
        # move STOP probability to the first element of probs_unigram and leave the others unchanged
        stop_prob = probs_unigram[stop_index]
        probs_unigram = np.append(stop_prob, np.delete(probs_unigram, stop_index))
        probs = self.lambda_1*probs_bigram + self.lambda_2*probs_unigram  # Get the interpolation probability

        word_list = sorted(self.word_dict.items(), key=lambda item: item[1])
        word_list = [k[0] for k in word_list]

        return np.random.choice(word_list, p=probs)
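
The STOP realignment above is the subtle step; here is a toy demonstration of the same np.delete/np.append move, independent of the models:

import numpy as np

# suppose STOP sits at index 2 in the unigram ordering
probs = np.array([0.1, 0.2, 0.3, 0.4])
stop_index = 2
realigned = np.append(probs[stop_index], np.delete(probs, stop_index))
# realigned is [0.3, 0.1, 0.2, 0.4]: STOP moves to index 0, the rest keep their order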
Example No. 7
import random

class Interpolation(LanguageModel):
    def __init__(self):
        self.unigram_model = Unigram()
        self.bigram_model = BigramAddOneSmooth()
        self.trigram_model = TrigramAddOneSmooth()
        self.unigram_lambda = .25
        self.bigram_lambda = .25
        self.trigram_lambda = .5

    def train(self, trainingSentences):
        self.unigram_model.train(trainingSentences)
        self.bigram_model.train(trainingSentences)
        self.trigram_model.train(trainingSentences)

    #Arbitrary lambdas.
    def getWordProbability(self, sentence, index):
        return (self.trigram_lambda * self.trigram_model.getWordProbability(sentence, index)) \
               + (self.bigram_lambda * self.bigram_model.getWordProbability(sentence, index)) \
               + (self.unigram_lambda * self.unigram_model.getWordProbability(sentence, index))

    # It doesn't matter which model we use here; the vocabulary is the same.
    def getVocabulary(self, context):
        return self.trigram_model.getVocabulary(context)

    # There is no single obvious way to generate a sentence from an interpolated
    # model; here we pick the trigram, bigram, or unigram model at random for
    # each word, weighted by the same lambdas used in getWordProbability.
    def generateSentence(self):
        sentence = []
        prev_previous = LanguageModel.START
        previous = random.choice(list(self.trigram_model.word_count.keys()))
        for i in range(20):
            model_choice = random.random()
            if model_choice <= self.trigram_lambda:
                word = self.trigram_model.generateWord(prev_previous, previous)
            elif model_choice <= self.trigram_lambda + self.bigram_lambda:
                word = self.bigram_model.generateWord(previous)
            else:
                word = self.unigram_model.generateWord()
            sentence.append(word)
            prev_previous = previous
            previous = word
            if word == LanguageModel.STOP:
                break
        return sentence
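
The lambdas above are fixed by hand; a common refinement is to tune them on held-out data. A minimal sketch, assuming a trained Interpolation instance named model and a heldOutSentences list (both names are hypothetical), using perplexity as the criterion:

import math

def perplexity(model, sentences):
    # exp of the average negative log-probability per word (STOP included)
    log_prob, n_words = 0.0, 0
    for sentence in sentences:
        for i in range(len(sentence) + 1):
            log_prob += math.log(model.getWordProbability(sentence, i))
            n_words += 1
    return math.exp(-log_prob / n_words)

best_ppl, best_lambdas = float('inf'), None
for l3 in (0.4, 0.5, 0.6):
    for l2 in (0.2, 0.25, 0.3):
        l1 = 1 - l3 - l2
        model.trigram_lambda, model.bigram_lambda, model.unigram_lambda = l3, l2, l1
        ppl = perplexity(model, heldOutSentences)
        if ppl < best_ppl:
            best_ppl, best_lambdas = ppl, (l3, l2, l1)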
Example No. 8
from time import time

from unigram import Unigram
from bigram import Bigram
from trigram import Trigram

inputs = read('input.txt')[0].strip().split(" ")
V, N, S_FACTOR, TRAINING_FILE, TEST_FILE = (int(inputs[0]), int(inputs[1]),
                                            float(inputs[2]), inputs[3],
                                            inputs[4])
OUTPUT_FILE_NAME = f"./results/trace_{V}_{N}_{S_FACTOR}.txt"

t1 = time()
if V == 3:
    print(f"BYOM: V = {V} n = 3 d = {S_FACTOR}")
    # lowercase the instance name so it doesn't shadow the BYOM class
    byom = BYOM(V, S_FACTOR, TRAINING_FILE, TEST_FILE, OUTPUT_FILE_NAME)
    byom.execute()
elif N == 1:
    print(f"unigram: V = {V} d = {S_FACTOR}")
    unigram = Unigram(V, S_FACTOR, TRAINING_FILE, TEST_FILE, OUTPUT_FILE_NAME)
    unigram.execute()
elif N == 2:
    print(f"bigram: V = {V} d = {S_FACTOR}")
    bigram = Bigram(V, S_FACTOR, TRAINING_FILE, TEST_FILE, OUTPUT_FILE_NAME)
    bigram.execute()
elif N == 3:
    print(f"trigram: V = {V} d = {S_FACTOR}")
    trigram = Trigram(V, S_FACTOR, TRAINING_FILE, TEST_FILE, OUTPUT_FILE_NAME)
    trigram.execute()
t2 = time()

print(f"execution time: {t2 - t1}s")
Example No. 9
def __init__(self):
    self.unigram = Unigram()
    self.bigram = Bigram()
Example No. 10
def __init__(self):
    self.unigram = Unigram()
    self.bigram = Bigram()
    self.coef = 0.5
    print("W(bigram) : W(unigram) weight ratio is 1 :", self.coef)
class BigramInterpolation(LanguageModel):
    def __init__(self):
        self.unigram = Unigram()
        self.bigram = Bigram()
        self.coef = 0.5
        print("W(bigram):W(unigram) coefficient is 1 :", self.coef)

    def train(self, trainingSentences):
        self.unigram.train(trainingSentences)
        self.bigram.train(trainingSentences)

    def getWordProbability(self, sentence, index):
        coef = self.coef
        # normalizer: bigram weight x plus unigram weight x * coef sums to 1
        x = 1 / (1 + coef)

        if index == len(sentence):
            word = LanguageModel.STOP
            prev_word = sentence[-1]
        elif index == 0:
            word = sentence[0]
            prev_word = LanguageModel.START
        else:
            word = sentence[index]
            prev_word = sentence[index - 1]

        if prev_word not in self.bigram.probCounter:
            prev_word = LanguageModel.UNK

        if self.bigram.probCounter[prev_word][word] == 0:
            # back off to the unigram model when the bigram was never observed
            return x * coef * self.unigram.getWordProbability(sentence, index)
        else:
            return (x * self.bigram.getWordProbability(sentence, index)
                    + x * coef * self.unigram.getWordProbability(sentence, index))

    def getVocabulary(self, context):
        next_posb_word = []
        # collect every word in self.bigram.total except START
        for next_word in self.bigram.total:
            if next_word != LanguageModel.START:
                next_posb_word.append(next_word)
        # append STOP manually, since self.bigram.total does not contain it
        next_posb_word.append(LanguageModel.STOP)

        return next_posb_word

    def generateWord(self, context):
        return self.bigram.generateWord(context)

    def generateSentence(self):
        result = []
        # limit sentence length to 20
        for i in range(20):
            word = LanguageModel.UNK
            while word == LanguageModel.UNK:
                # make sure word != UNK
                word = self.generateWord(result)
            result.append(word)
            if word == LanguageModel.STOP:
                break
        return result
Example No. 12
    # print("Starting with tfidf...")
    # tfidf_perceptron = Tfidf(train_ratio=0.8)
    # print("Tfidf accuracy", tfidf_perceptron.accuracy)

    # # Bigrams
    # print("Starting with bigrams...")
    # bigram_perceptron = Bigram(train_ratio=0.8)
    # print("Bigram accuracy", bigram_perceptron.accuracy)

    # PART C: Compare the data representations
    ratios = np.arange(0.05, 1.05, 0.05)
    unigram_accuracies = []
    tfidf_accuracies = []
    bigram_accuracies = []
    for r in ratios:
        unigram_perceptron = Unigram(train_ratio=r)
        unigram_accuracy = unigram_perceptron.accuracy
        unigram_accuracies.append(unigram_accuracy)
        print(r, "unigram_perceptron", unigram_accuracy)

        tfidf_perceptron = Tfidf(train_ratio=r)
        tfidf_accuracy = tfidf_perceptron.accuracy
        tfidf_accuracies.append(tfidf_accuracy)
        print(r, "tfidf_perceptron", tfidf_accuracy)

        bigram_perceptron = Bigram(train_ratio=r)
        bigram_accuracy = bigram_perceptron.accuracy
        bigram_accuracies.append(bigram_accuracy)
        print(r, "bigram_perceptron", bigram_accuracy)

    pickle.dump(unigram_accuracies, open("unigram_accuracies.pkl", "wb"))
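
The fragment pickles only the unigram curve; the other two presumably follow the same pattern. To compare the three representations visually, a minimal matplotlib sketch (matplotlib, like the numpy and pickle imports this fragment relies on, is assumed to be available):

import matplotlib.pyplot as plt

plt.plot(ratios, unigram_accuracies, label='unigram')
plt.plot(ratios, tfidf_accuracies, label='tf-idf')
plt.plot(ratios, bigram_accuracies, label='bigram')
plt.xlabel('training ratio')
plt.ylabel('accuracy')
plt.legend()
plt.title('Perceptron accuracy by data representation')
plt.show()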