def __train_models(self):
    """Train the TF-IDF and n-gram perplexity models on the domain corpus.

    Creates fresh ``NGramPerplexity`` and ``TFIDF`` instances on
    ``self.ngp`` and ``self.tfidf``, then opens every entry listed in
    ``self.input_dir`` as UTF-8 text and reads it line by line.  Each line
    is passed through ``WordExtractor.get_words``; lines that yield no
    words are skipped, and every other word list is appended to
    ``self.sentences`` and fed to both models.
    """
    self.ngp = NGramPerplexity()
    self.tfidf = TFIDF()

    print("Training models from specific corpora")
    for filename in os.listdir(self.input_dir):
        print("Training models from specific corpora: " + filename)
        # os.path.join handles a trailing separator and path-like
        # ``input_dir`` values, unlike manual "/" concatenation.
        corpus_path = os.path.join(self.input_dir, filename)
        with open(corpus_path, encoding="utf-8") as corpus_file:
            for line in corpus_file:
                words = WordExtractor.get_words(line)
                if not words:
                    # Nothing to learn from a line without any words.
                    continue
                self.sentences.append(words)
                self.ngp.train_from_text(words)
                self.tfidf.train_from_text(words)
from ngramperplexity import NGramPerplexity
from wordextractor import WordExtractor

# Set the class-level n-gram size before the model instance is created.
NGramPerplexity.ngram_size = 3
ngp = NGramPerplexity()

# Sentences fed to the model, listed in the order they are trained on.
_TRAINING_SENTENCES = (
    "There are so many people at the beach",
    "The beach is so crowded with all these people, I wish they would just go to another beach",
    "It is summer and a great day to go to the beach.",
    "Let's go to the beach and enjoy the great weather we've got today.",
    "I think the first thing I will do at the beach is to buy an ice cream.",
    "There's many people at the beach today, I think they are enjoying their holidays.",
    "I think something is going on at the beach right now, there are literally people everywhere.",
)

for _sentence in _TRAINING_SENTENCES:
    ngp.train_from_text(WordExtractor.get_words(_sentence))

# sentences with great similarity