def predict_project_terms_in_mongo(self, project_name, output):
    """Predict similar words for every keyword stored for *project_name*.

    Looks up the project's keyword list in the KeywordFinder.project_terms
    collection, runs WordPredictor.find_similar_word on each keyword, and
    emits the result according to *output*:
      - 'mongo': insert a dated document into KeywordFinder.prediction
      - 'print': print the prediction dict
      - 'json' : dump it to <config path>/project_terms_prediction.json

    Raises ValueError if no project_terms document exists for the project
    (previously this crashed with a bare IndexError on results_list[0]).
    """
    # SECURITY: credentials are hard-coded in source control — move them to
    # config / environment variables.
    connection = MongoClient('localhost', 27017)
    try:
        admin = connection['admin']
        admin.authenticate('rrpdream', 'rrpdream9453')
        db = connection['KeywordFinder']
        search_results = db['project_terms'].find(
            {'project_name': project_name}, {'keyword': True})
        results_list = list(search_results)
        if not results_list:
            # Fail loudly with context instead of IndexError on [0].
            raise ValueError(
                'no project_terms document found for project %r' % project_name)
        keyword_list = results_list[0]['keyword']

        word_predictor = WordPredictor(self.config)
        prediction_dict = {
            keyword: word_predictor.find_similar_word(
                keyword, self.config['n_predict'])
            for keyword in keyword_list
        }

        if output == 'mongo':
            time = str(datetime.now().date())
            document = {'time': time, 'prediction': prediction_dict}
            result = db['prediction'].insert_one(document)
            print(result)
        elif output == 'print':
            print(prediction_dict)
        elif output == 'json':
            with open(self.config['path'] + 'project_terms_prediction.json', 'w') as f:
                json.dump(prediction_dict, f)
    finally:
        # Fix: the client connection was never closed.
        connection.close()
def __init__(self):
    """Train a WordPredictor on the full NLTK Gutenberg corpus and
    persist the trained object to the file 'WordPredictionFile'."""
    self.wp = WordPredictor()
    # Feed every Gutenberg text into the predictor, one corpus at a time.
    for fileid in nltk.corpus.gutenberg.fileids():
        self.wp.learn_from_text(nltk.corpus.gutenberg.raw(fileid))
    # Snapshot the trained model to disk for later reuse.
    with open("WordPredictionFile", 'wb') as out_file:
        out_file.write(pickle.dumps(self.wp))
    print("Word prediction training ready")
def find_similar_word_for_cluster(self, cluster_no, n_newword=15):
    """Return a dict of similar words for the keywords of cluster *cluster_no*.

    n_newword: how many similar words to request per keyword.
    """
    # NOTE(review): this reads module-level `config`, not `self.config`,
    # unlike sibling methods — confirm that is intentional.
    cluster_keywords = self.get_cluster()[cluster_no]
    predictor = WordPredictor(config, term_similarity_matrix=True)
    return predictor.find_similar_word(cluster_keywords, n_target=n_newword)
class Train_Word_Predictor:
    """Trains a WordPredictor on the NLTK Gutenberg corpus, pickles it to
    disk, and serves next-word predictions from memory or from the pickle."""

    def __init__(self):
        """Train on every Gutenberg text and save the model to 'WordPredictionFile'."""
        self.wp = WordPredictor()
        for corpus in nltk.corpus.gutenberg.fileids():
            self.wp.learn_from_text(nltk.corpus.gutenberg.raw(corpus))
        with open("WordPredictionFile", 'wb') as f:
            f.write(pickle.dumps(self.wp))
        print("Word prediction training ready")

    @staticmethod
    def _top_alpha_words(words, limit=4):
        """Return up to *limit* alphabetic terms from *words*.

        *words* is a sequence of (term, score) pairs as produced by
        .predict(text).terms(); non-alphabetic terms are skipped.
        Extracted to remove the loop duplicated verbatim in predict_words
        and predict_fromDisk.
        """
        final_words = []
        for word in words:
            if len(final_words) == limit:
                break
            if word[0].isalpha():
                final_words.append(word[0])
        return final_words

    def predict_words(self, text):
        """Predict the next words for *text* using the in-memory model;
        prints and returns up to 4 alphabetic candidates."""
        words = self.wp.predict(text).terms()[0:30]
        final_words = self._top_alpha_words(words)
        print(final_words)
        return final_words

    def predict_fromDisk(self, text):
        """Predict the next words for *text* using the pickled model on disk;
        returns up to 4 alphabetic candidates."""
        # NOTE: pickle.loads is unsafe on untrusted data — acceptable here
        # only because this process wrote the file itself in __init__.
        with open('WordPredictionFile', 'rb') as wf:
            word_object = pickle.loads(wf.read())
        words = word_object.predict(text).terms()[0:30]
        return self._top_alpha_words(words)
def find_new_word_for_cluster(self, cluster_no, n_newword=15):
    """Return, per keyword of cluster *cluster_no*, the predicted similar
    words that are NOT already known prior words.

    n_newword: number of similar words requested per keyword.
    """
    # Build the known-word collection once as a set: the original did an
    # O(n) list scan per predicted word.
    # ("in" word and "not_in" word would be different)
    known_words = set(self.get_all_prior_word())
    clusters = self.get_cluster()
    keyword = clusters[cluster_no]
    # NOTE(review): reads module-level `config`, not `self.config`,
    # unlike sibling methods — confirm that is intentional.
    word_predictor = WordPredictor(config, term_similarity_matrix=True)
    result_dict = word_predictor.find_similar_word(keyword, n_target=n_newword)
    # Keep only predictions that are genuinely new, preserving order.
    return {
        key: [word for word in value if word not in known_words]
        for key, value in result_dict.items()
    }
def build_and_evaluate(self):
    """Run the full pipeline once: load the term-document matrix, fit and
    save an LDA model with the configured alpha/beta/n_topics, derive the
    term-similarity matrix from p(z|w), then score the resulting
    WordPredictor against ground truth."""
    # Load the prebuilt term-document matrix.
    doc_matrix = TermDocumentMatrix(self.config)
    loaded = doc_matrix.load()

    # Fit LDA using hyper-parameters from config and persist the model.
    lda = LdaModel(self.config)
    lda.build(loaded, self.config['alpha'], self.config['beta'],
              self.config['n_topics'], save_model=True)
    topic_word = lda.get_p_zw()

    # Derive and persist the term-similarity matrix from the topic model.
    similarity = TermSimilarityMatrix(self.config)
    similarity.create(topic_word, save=True)

    # Evaluate a predictor built on the fresh similarity matrix.
    predictor = WordPredictor(self.config)
    evaluator = Evaluate(self.config)
    evaluator.ground_truth(predictor)
def run(self, alpha, beta):
    """Fit one LDA model with the given *alpha*/*beta*, rebuild the
    term-similarity matrix, and record results on self:

    - self.maxlike: the model's maximum likelihood
    - self.score:   the predictor's ground-truth evaluation score
    """
    # Load the term-document matrix.
    doc_matrix = TermDocumentMatrix(self.config)
    loaded = doc_matrix.load()

    # Fit LDA with the caller-supplied hyper-parameters (model not saved).
    lda = LdaModel(self.config)
    lda.build(loaded, alpha, beta, self.config['n_topics'])
    topic_word = lda.get_p_zw()
    self.maxlike = lda.get_maxlike()

    # Persist the similarity matrix derived from this fit.
    similarity = TermSimilarityMatrix(self.config)
    similarity.create(topic_word, save=True)

    # Score the resulting predictor and keep the score on the instance.
    predictor = WordPredictor(self.config)
    evaluator = Evaluate(self.config)
    self.score = evaluator.ground_truth(predictor)
def _report_best(wp, prefix, fmt_hit, fmt_miss):
    """Print one prefix lookup result using the given hit/miss formats.

    Calls wp.get_best exactly once — the original repeated the lookup
    three times per prefix in four copy-pasted loops.
    """
    best = wp.get_best(prefix)
    if best:
        print(fmt_hit % (prefix, best.get_word(), best.get_prob()))
    else:
        print(fmt_miss % (prefix, "None", "None"))


def main():
    """Exercise WordPredictor end to end: train on fragments of Moby Dick,
    probe prefix lookups before/after build(), add individual words, then
    time full-book training and a randomized lookup load test."""
    # train a model on the first bit of Moby Dick
    wp = WordPredictor()
    print("bad1 = %s" % wp.get_best("the"))
    wp.train("moby_start.txt")
    print("training words = %d" % (wp.get_training_count()))

    # try and crash things on bad input
    print("bad2 = %s" % wp.get_best("the"))
    wp.train("thisfiledoesnotexist.txt")
    print("training words = %d\n" % (wp.get_training_count()))

    words = ["the", "me", "zebra", "ishmael", "savage"]
    for word in words:
        print("count, %s = %d" % (word, wp.get_word_count(word)))
    wp.train("moby_end.txt")
    print()

    # check the counts again after training on the end of the book
    for word in words:
        print("count, %s = %d" % (word, wp.get_word_count(word)))
    print()

    # Get the object ready to start looking things up
    wp.build()

    # do some prefix loopups
    test = ["a", "ab", "b", "be", "t", "th", "archang"]
    for prefix in test:
        _report_best(wp, prefix, "%s -> %s\t\t\t%.6f", "%s -> %s\t\t\t%s")
    print("training words = %d\n" % (wp.get_training_count()))

    # add two individual words to the training data
    wp.train_word("beefeater")
    wp.train_word("BEEFEATER!")
    wp.train_word("BEEFEATER")
    wp.train_word("Pneumonoultramicroscopicsilicovolcanoconiosis")
    print("added additional words")
    print("training words = %d\n" % (wp.get_training_count()))

    # The change should have no effect for prefix lookup until we build()
    test_2 = ['b', 'pn']
    for prefix in test_2:
        _report_best(wp, prefix,
                     "before, %s -> %s\t\t%.6f", "before, %s -> %s\t\t%s")
    wp.build()
    for prefix in test_2:
        _report_best(wp, prefix,
                     "after, %s -> %s\t\t%.6f", "after, %s -> %s\t\t%s")
    print("training words = %d\n" % (wp.get_training_count()))

    # test out training on a big file, timing the training as well
    start = timeit.default_timer()
    wp.train("mobydick.txt")
    wp.build()
    for prefix in test:
        _report_best(wp, prefix, "%s -> %s\t\t\t%.6f", "%s -> %s\t\t\t%s")
    print("training words = %d\n" % (wp.get_training_count()))
    stop = timeit.default_timer()
    elapsed = (stop - start)
    print("elapsed (s): %.6f" % elapsed)

    # test lookup using random prefixes between 1-6 characters
    start = timeit.default_timer()
    random_load_test(wp)
    stop = timeit.default_timer()
    elapsed = (stop - start)
    print("elapsed (s): %.6f" % elapsed)
from word_predictor import WordPredictor
import nltk

# Train one shared predictor at import time on the full Gutenberg corpus so
# every PredictNext call reuses the same model.
wp = WordPredictor()
for corpus in nltk.corpus.gutenberg.fileids():
    wp.learn_from_text(nltk.corpus.gutenberg.raw(corpus))


def PredictNext(phrase, n=3):
    """Return the top *n* predictions for the next word after *phrase*.

    Each element is whatever wp.predict(...).terms() yields (term/score
    pairs). *n* generalizes the previously hard-coded top-3; the default
    keeps the old behavior.
    """
    return wp.predict(phrase).terms()[0:n]
from flask import Flask, request, render_template
import cPickle as pickle  # NOTE(review): cPickle exists only in Python 2 — this module targets Py2
from word_predictor import WordPredictor

# Fit one module-level predictor at import time so every request reuses it.
WP = WordPredictor()
WP.fit()

app = Flask(__name__)

# load model
# with open('data/vectorizer.pkl') as f:
#     vectorizer = pickle.load(f)
# with open('data/model.pkl') as f:
#     model = pickle.load(f)
#
# home page


@app.route('/')
def index():
    # Serve the landing page template.
    return render_template('index.html')


@app.route('/predict', methods=['POST'])
def predict():
    # Read the user's preceding words from the POST form and run the predictor.
    text = str(request.form['precede_words'])
    prediction = WP.predict(text)
    # print 'prediction type:' ,type(prediction)
    # NOTE(review): no return statement visible — this function appears to
    # continue past the end of this chunk; confirm against the full file.