class BigramInterpolation(LanguageModel):

    def __init__(self):
        self.unigram = Unigram()
        self.bigram = Bigram()

    def train(self, trainingSentences):
        self.unigram.train(trainingSentences)
        self.bigram.train(trainingSentences)

    def getWordProbability(self, sentence, index):
        # stub: interpolated probability not yet implemented
        return 0

    def getVocabulary(self, context):
        # stub: should return the candidate words for this context
        return []

    def generateWord(self, context):
        # stub: placeholder word
        return 'bunny'

    def generateSentence(self):
        result = []
        # limit sentence length to 20
        for i in range(20):
            word = LanguageModel.UNK
            while word == LanguageModel.UNK:
                # make sure word != UNK
                word = self.generateWord(result)
            result.append(word)
            if word == LanguageModel.STOP:
                break
        return result
class BigramInterpolation(LanguageModel):

    def __init__(self):
        self.unigram = Unigram()
        self.bigram = Bigram()
        # just needed for languageModel.py to work
        self.word_dict = self.bigram.word_dict
        self.lambda_1 = 0.5
        self.lambda_2 = 0.5

    '''
    Trains a bigram-interpolation language model on a training set.
    '''
    def train(self, trainingSentences):
        self.unigram.train(trainingSentences)
        self.bigram.train(trainingSentences)

    '''
    Returns the probability of the word at index, according to the model,
    within the specified sentence.
    '''
    def getWordProbability(self, sentence, index):
        return (self.lambda_1 * self.bigram.getWordProbability(sentence, index)
                + self.lambda_2 * self.unigram.getWordProbability(sentence, index))

    '''
    Returns, for a given context, a random word, according to the
    probabilities in the model.
    '''
    def generateWord(self, context):
        return 'bunny'
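# --- Added usage sketch (not part of the original snippet). A quick smoke
# test for the BigramInterpolation class above: trainingSentences is assumed
# to be a list of token lists, and this tiny corpus is made up.
model = BigramInterpolation()
model.train([['the', 'cat', 'sat'], ['the', 'dog', 'sat']])

# Interpolated probability: lambda_1 * P_bigram + lambda_2 * P_unigram,
# with lambda_1 + lambda_2 = 1.
print(model.getWordProbability(['the', 'cat', 'sat'], 1))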
def main():
    raw_data = get_data()

    # Unigram
    uni = Unigram(raw_data)
    uni.main()

    # Bigram
    bi = Bigram(raw_data)
    bi.main()
import numpy as np


class BigramInterpolation(LanguageModel):

    def __init__(self, lambda_1=0.67):
        self.unigram = Unigram()
        self.bigram = Bigram()
        # just needed for languageModel.py to work
        self.word_dict = self.bigram.word_dict
        self.lambda_1 = lambda_1
        self.lambda_2 = 1 - lambda_1

    '''
    Trains a bigram-interpolation language model on a training set.
    '''
    def train(self, trainingSentences):
        self.unigram.train(trainingSentences)
        self.bigram.train(trainingSentences)

    '''
    Returns the probability of the word at index, according to the model,
    within the specified sentence.
    '''
    def getWordProbability(self, sentence, index):
        return (self.lambda_1 * self.bigram.getWordProbability(sentence, index)
                + self.lambda_2 * self.unigram.getWordProbability(sentence, index))

    '''
    Returns, for a given context, a random word, according to the
    probabilities in the model.
    '''
    def generateWord(self, context):
        if context:
            previous_word = context[-1]
        else:
            previous_word = LanguageModel.START
        if (previous_word not in self.word_dict) and (previous_word != LanguageModel.START):
            previous_word = LanguageModel.UNK
        if previous_word == LanguageModel.START:
            previous_word_index = 0
        else:
            previous_word_index = self.word_dict[previous_word]
        probs_bigram = self.bigram.prob_counter[previous_word_index].toarray().ravel()
        probs_unigram = self.unigram.prob_counter[0].toarray().ravel()
        # Because the unigram model and the bigram model index STOP differently,
        # move the STOP probability to the first element of probs_unigram and
        # leave the others unchanged.
        stop_index = self.unigram.word_dict[LanguageModel.STOP]
        stop_prob = probs_unigram[stop_index]
        probs_unigram = np.append(stop_prob, np.delete(probs_unigram, stop_index))
        # Get the interpolated distribution over the vocabulary
        probs = self.lambda_1 * probs_bigram + self.lambda_2 * probs_unigram
        word_list = sorted(self.word_dict.items(), key=lambda item: item[1])
        word_list = [k[0] for k in word_list]
        return np.random.choice(word_list, p=probs)
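# --- Added sketch (not part of the original snippet): the default
# lambda_1 = 0.67 is a fixed guess; an alternative is a grid search on
# held-out perplexity. Assumes trainingSentences and heldOutSentences are in
# scope, that getWordProbability is defined for indices 0..len(sentence)
# (the last index scoring STOP), and that every held-out token gets nonzero
# probability (interpolation with a unigram model generally ensures this).
import math

def perplexity(model, sentences):
    log_prob, tokens = 0.0, 0
    for sentence in sentences:
        for i in range(len(sentence) + 1):
            log_prob += math.log(model.getWordProbability(sentence, i))
            tokens += 1
    return math.exp(-log_prob / tokens)

best_lambda, best_ppl = None, float('inf')
for lam in [0.1, 0.3, 0.5, 0.67, 0.9]:
    candidate = BigramInterpolation(lambda_1=lam)
    candidate.train(trainingSentences)
    ppl = perplexity(candidate, heldOutSentences)
    if ppl < best_ppl:
        best_lambda, best_ppl = lam, ppl
print("best lambda_1:", best_lambda, "perplexity:", best_ppl)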
import random


class Interpolation(LanguageModel):

    def __init__(self):
        self.unigram_model = Unigram()
        self.bigram_model = BigramAddOneSmooth()
        self.trigram_model = TrigramAddOneSmooth()
        self.unigram_lambda = .25
        self.bigram_lambda = .25
        self.trigram_lambda = .5

    def train(self, trainingSentences):
        self.unigram_model.train(trainingSentences)
        self.bigram_model.train(trainingSentences)
        self.trigram_model.train(trainingSentences)

    # Arbitrary lambdas.
    def getWordProbability(self, sentence, index):
        return (self.trigram_lambda * self.trigram_model.getWordProbability(sentence, index)) \
            + (self.bigram_lambda * self.bigram_model.getWordProbability(sentence, index)) \
            + (self.unigram_lambda * self.unigram_model.getWordProbability(sentence, index))

    # Doesn't matter which model we use here - the vocabulary is the same.
    def getVocabulary(self, context):
        return self.trigram_model.getVocabulary(context)

    # What does generating a sentence in an interpolation model look like?
    # Here, each word is drawn from the trigram, bigram, or unigram model at
    # random, using the same weights as in getWordProbability.
    def generateSentence(self):
        sentence = []
        prev_previous = LanguageModel.START
        previous = random.choice(list(self.trigram_model.word_count.keys()))
        for i in range(20):
            model_choice = random.random()
            if model_choice <= self.trigram_lambda:
                word = self.trigram_model.generateWord(prev_previous, previous)
            elif model_choice <= self.trigram_lambda + self.bigram_lambda:
                word = self.bigram_model.generate_word(previous)
            else:
                word = self.unigram_model.generateWord()
            sentence.append(word)
            prev_previous = previous
            previous = word
            if word == LanguageModel.STOP:
                break
        return sentence
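# --- Added note (not part of the original snippet): picking a component
# model with probability lambda_k and then sampling from it is exactly a
# draw from the mixture sum_k lambda_k * P_k, so the generation scheme above
# is consistent with getWordProbability. A toy check with made-up
# two-word distributions:
import random
from collections import Counter

p_uni = {'a': 0.5, 'b': 0.5}
p_bi = {'a': 0.9, 'b': 0.1}
p_tri = {'a': 0.2, 'b': 0.8}
components = [(0.25, p_uni), (0.25, p_bi), (0.5, p_tri)]

draws = Counter()
for _ in range(100000):
    # choose a component with probability lambda_k, then sample from it
    r, acc = random.random(), 0.0
    for lam, dist in components:
        acc += lam
        if r <= acc:
            draws[random.choices(list(dist), weights=list(dist.values()))[0]] += 1
            break

mixture_a = sum(lam * dist['a'] for lam, dist in components)  # 0.45
print(draws['a'] / 100000, "vs", mixture_a)  # the two should be close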
from time import time

from unigram import Unigram
from bigram import Bigram
from trigram import Trigram

# read() and the BYOM class are assumed to come from this project's other modules.

inputs = read('input.txt')[0].strip().split(" ")
V, N, S_FACTOR, TRAINING_FILE, TEST_FILE = (int(inputs[0]),
                                            int(inputs[1]),
                                            float(inputs[2]),
                                            inputs[3],
                                            inputs[4])
OUTPUT_FILE_NAME = f"./results/trace_{V}_{N}_{S_FACTOR}.txt"

t1 = time()
if V == 3:
    print(f"BYOM: V = {V} n = 3 d = {S_FACTOR}")
    # lowercase instance name so the BYOM class itself is not shadowed
    byom = BYOM(V, S_FACTOR, TRAINING_FILE, TEST_FILE, OUTPUT_FILE_NAME)
    byom.execute()
elif N == 1:
    print(f"unigram: V = {V} d = {S_FACTOR}")
    unigram = Unigram(V, S_FACTOR, TRAINING_FILE, TEST_FILE, OUTPUT_FILE_NAME)
    unigram.execute()
elif N == 2:
    print(f"bigram: V = {V} d = {S_FACTOR}")
    bigram = Bigram(V, S_FACTOR, TRAINING_FILE, TEST_FILE, OUTPUT_FILE_NAME)
    bigram.execute()
elif N == 3:
    print(f"trigram: V = {V} d = {S_FACTOR}")
    trigram = Trigram(V, S_FACTOR, TRAINING_FILE, TEST_FILE, OUTPUT_FILE_NAME)
    trigram.execute()
t2 = time()
print(f"execution time: {t2 - t1}s")
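# --- Added note (not part of the original snippet): the driver above expects
# five whitespace-separated fields on the first line of input.txt, in the
# order unpacked above. A hypothetical example (file names made up):
#
#   V  N  S_FACTOR  TRAINING_FILE     TEST_FILE
#   1  2  0.5       training-set.txt  test-set.txt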
class BigramInterpolation(LanguageModel):

    def __init__(self):
        self.unigram = Unigram()
        self.bigram = Bigram()
        self.coef = 0.5
        print("W(bigram):W(unigram) coefficient is 1 :", self.coef)

    def train(self, trainingSentences):
        self.unigram.train(trainingSentences)
        self.bigram.train(trainingSentences)

    def getWordProbability(self, sentence, index):
        coef = self.coef
        # normalizer so the bigram weight x and unigram weight x * coef sum to 1
        x = 1 / (1 + coef)
        if index == len(sentence):
            word = LanguageModel.STOP
            prev_word = sentence[-1]
        elif index == 0:
            word = sentence[0]
            prev_word = LanguageModel.START
        else:
            word = sentence[index]
            prev_word = sentence[index - 1]
        if prev_word not in self.bigram.probCounter:
            prev_word = LanguageModel.UNK
        if self.bigram.probCounter[prev_word][word] == 0:
            # unseen bigram: fall back to the weighted unigram probability
            return x * coef * self.unigram.getWordProbability(sentence, index)
        return (x * self.bigram.getWordProbability(sentence, index)
                + x * coef * self.unigram.getWordProbability(sentence, index))

    def getVocabulary(self, context):
        next_posb_word = []
        # append every possible word except START in self.total
        for next_word in self.bigram.total:
            if next_word != LanguageModel.START:
                next_posb_word.append(next_word)
        # append STOP manually since there is no STOP in self.total
        next_posb_word.append(LanguageModel.STOP)
        return next_posb_word

    def generateWord(self, context):
        return self.bigram.generateWord(context)

    def generateSentence(self):
        result = []
        # limit sentence length to 20
        for i in range(20):
            word = LanguageModel.UNK
            while word == LanguageModel.UNK:
                # make sure word != UNK
                word = self.generateWord(result)
            result.append(word)
            if word == LanguageModel.STOP:
                break
        return result
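# --- Added check (not part of the original snippet): with coef = 0.5 the
# effective weights are x = 2/3 on the bigram and x * coef = 1/3 on the
# unigram, matching the printed 1 : 0.5 ratio. Note that the unseen-bigram
# branch above returns only the unigram share, so those probabilities are
# not renormalized.
coef = 0.5
x = 1 / (1 + coef)
print(x, x * coef, x + x * coef)  # 0.666..., 0.333..., 1.0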
# print("Starting with tfidf...") # tfidf_perceptron = Tfidf(train_ratio=0.8) # print("Tfidf accuracy", tfidf_perceptron.accuracy) # # Bigrams # print("Starting with bigrams...") # bigram_perceptron = Bigram(train_ratio=0.8) # print("Bigram accuracy", bigram_perceptron.accuracy) # PART C: Compare the data representations ratios = np.arange(0.05, 1.05, 0.05) unigram_accuracies = [] tfidf_accuracies = [] bigram_accuracies = [] for r in ratios: unigram_perceptron = Unigram(train_ratio=r) unigram_accuracy = unigram_perceptron.accuracy unigram_accuracies.append(unigram_accuracy) print(r, "unigram_perceptron", unigram_accuracy) tfidf_perceptron = Tfidf(train_ratio=r) tfidf_accuracy = tfidf_perceptron.accuracy tfidf_accuracies.append(tfidf_accuracy) print(r, "tfidf_perceptron", tfidf_accuracy) bigram_perceptron = Bigram(train_ratio=r) bigram_accuracy = bigram_perceptron.accuracy bigram_accuracies.append(bigram_accuracy) print(r, "bigram_perceptron", bigram_accuracy) pickle.dump(unigram_accuracies, open("unigram_accuracies.pkl", "wb"))