def generateUnigramModels():
    '''
    Controller for the generation of the unigram model.

    Chains the unigram helpers in order: collect the training-set
    frequencies, derive the model features from them, obtain the
    unknown-word sampling probabilities, build the model and hand it to
    serializeModelToDisk under the name 'Unigram'.

    Returns the object produced by createUnigramModel.
    '''
    # Raw unigram frequencies for the training set.
    frequencies = getUnigramFrequenciesforTrainingSet()

    # Features derived from the frequencies; the printed label describes
    # them as (word types, word tokens).
    features = getUnigramModelFeatures(frequencies)
    summary = "\n Unigram Features (Word Types, Work Tokens) {0} \n".format(features)
    print(summary)

    # Unknown words are handled through sampling probabilities. The earlier
    # handleUnknownWords replacement step and the Good-Turing smoothing step
    # (applyGoodTuringUnigramSmoothing) are intentionally not applied here;
    # the original author noted smoothing is not needed for unigrams.
    unknown_probs = getUnknownWordSamplingProbs(frequencies)

    # Build the model from the three inputs, then persist it.
    model = createUnigramModel(frequencies, features, unknown_probs)
    serializeModelToDisk(model, 'Unigram')

    return model
def generateBigramModels(random_sentence=False):
    '''
    Controller for the generation of the bigram model.

    For every entry in the module-level `genres`, reads the bigrams and the
    sentence-start successors from `training_path + genre`, then builds the
    bigram frequencies across all genres. The model is created from those
    frequencies and passed to serializeModelToDisk.

    random_sentence -- when true, the frequencies are first extended via
                       getStartCharBigramFrequencies and the model is stored
                       as 'BigramSentenceModel'; otherwise the plain
                       frequencies are used and it is stored as 'Bigram'.

    Returns the object produced by createBigramModel.
    '''
    genre_bigrams = {}
    genre_start_successors = {}

    # Collect, per genre, the bigrams and the successors of the start marker.
    for genre in genres:
        print("\nReading files for genre {0}".format(genre))
        genre_path = training_path + genre
        found_bigrams, found_successors = getBigramsForGenre(genre_path)
        genre_bigrams[genre] = found_bigrams
        genre_start_successors[genre] = found_successors

    # Frequency table over the bigrams of all genres.
    frequencies = getBigramFrequencies(genre_bigrams)

    # Pick the frequency table and the storage name for the requested mode;
    # the sentence mode additionally includes the start-character bigrams.
    if random_sentence:
        model_input = getStartCharBigramFrequencies(frequencies, genre_start_successors)
        model_name = 'BigramSentenceModel'
    else:
        model_input = frequencies
        model_name = 'Bigram'

    model = createBigramModel(model_input)
    serializeModelToDisk(model, model_name)

    return model