def getBigramPerplexity():
    '''
    Controller method to find the bigram perplexity of every test book
    against every genre model.

    Returns a defaultdict(dict) mapping book file name -> genre -> the value
    returned by computeBigramPerplexity for that book/genre pairing.
    '''
    bigram_model = loadBigramModels()

    # Reads in the unigrams one file at a time and stores them under their book name
    book_tokens = {}
    for genre in genres:
        print("\nReading test files for genre {0}".format(genre))
        genre_dir = test_path + genre
        for path in os.listdir(genre_dir):
            # Skip hidden entries (e.g. .DS_Store) the same way the
            # training-set readers do, so they are never scored as books
            if path.startswith('.'):
                continue
            book_tokens[path] = getTokensForFile(genre_dir + '/' + path)

    # Predict Bigram Perplexity
    book_perplexity = defaultdict(dict)
    # .items() instead of .iteritems() so the loop runs on Python 2 and 3
    for book, tokens in book_tokens.items():
        # Construct Bigrams from Unigrams: pair every token with its successor
        bigrams = list(zip(tokens, tokens[1:]))
        for genre in genres:
            perplexity = computeBigramPerplexity(bigram_model[genre], bigrams)
            book_perplexity[book][genre] = perplexity
            print("Perplexity of '{0}' book on {1} genre model: {2}".format(book, genre, perplexity))

    return book_perplexity
def getBigramsForGenre(dir_path, unknown_words = True):
    '''
    Reads through the contents of a complete directory path and finds the
    bigrams present in a genre level corpus.

    dir_path      -- directory holding the genre's files; entries whose name
                     starts with '.' are ignored
    unknown_words -- when True (default) the tokens are passed through
                     insertUnknownWords before the bigrams are built; when
                     False the raw tokens are used as they are

    Returns a tuple (genre_bigram, genre_startchar_successors): the list of
    (token, next_token) pairs and the list produced by
    getStartCharSuccessorsForGenre for the genre's tokens.
    '''
    # Reading each file's contents and collecting the tokens
    genre_tokens = []
    for path in os.listdir(dir_path):
        if not path.startswith('.'):
            genre_tokens.extend(getTokensForFile(dir_path + '/' + path))

    # Finding the list of words that are sentence starters in the current corpus.
    # Done once over the complete token list so that no file contributes twice.
    genre_startchar_successors = list(getStartCharSuccessorsForGenre(genre_tokens))

    # Honour the unknown_words flag instead of rewriting the tokens unconditionally
    if unknown_words:
        mod_tokens = insertUnknownWords(genre_tokens)
    else:
        mod_tokens = genre_tokens

    # Create a list of bigrams from the tokens.
    # Will include the bigrams spanning the end of one file to the beginning
    # of the next, but that doesn't really matter (~5 bigrams out of 10s of thousands)
    genre_bigram = list(zip(mod_tokens, mod_tokens[1:]))

    return genre_bigram, genre_startchar_successors
def getUnigramFrequenciesforTrainingSet():
    '''
    Wrapper method that builds one unigram frequency table per genre.

    Returns a dict mapping each genre to a Counter of the tokens found in
    that genre's training files (entries starting with '.' are skipped).
    '''
    unigram_frequencies = {}

    for genre in genres:
        print("\nReading files for genre {0}".format(genre))
        genre_dir = training_path + genre

        # Only the visible files of the genre directory take part
        visible_files = [name for name in os.listdir(genre_dir) if not name.startswith('.')]

        # Gather the tokens one file at a time
        genre_words = []
        for name in visible_files:
            genre_words += getTokensForFile(genre_dir + '/' + name)

        # Frequencies are counted at the genre level
        unigram_frequencies[genre] = Counter(genre_words)

    return unigram_frequencies
def getUnigramPerplexity():
    '''
    Controller method to find the unigram perplexity of every test book
    against every genre model.

    Returns a defaultdict(dict) mapping book file name -> genre -> the value
    returned by computeUnigramPerplexity for that book/genre pairing.
    '''
    unigram_model = loadUnigramModels()

    # Reads in the unigrams one file at a time and stores them under their book name
    book_tokens = {}
    for genre in genres:
        print("\nReading test files for genre {0}".format(genre))
        genre_dir = test_path + genre
        for path in os.listdir(genre_dir):
            # Skip hidden entries (e.g. .DS_Store) the same way the
            # training-set readers do, so they are never scored as books
            if path.startswith('.'):
                continue
            book_tokens[path] = getTokensForFile(genre_dir + '/' + path)

    # Computes the perplexity of every test corpus against each of the unigram models created
    book_perplexity = defaultdict(dict)
    # .items() instead of .iteritems() so the loops run on Python 2 and 3
    for book, unigrams in book_tokens.items():
        print('')
        for genre, model in unigram_model.items():
            perplexity = computeUnigramPerplexity(model, unigrams)
            book_perplexity[book][genre] = perplexity
            print("Perplexity of '{0}' book on {1} genre model: {2}".format(book, genre, perplexity))

    return book_perplexity