Example #1
0
    def predict_project_terms_in_mongo(self, project_name, output):
        """Predict similar words for every keyword of a stored project.

        Fetches the project's keyword list from the KeywordFinder Mongo
        database, runs WordPredictor over each keyword, and emits the
        result to Mongo, stdout, or a JSON file depending on *output*.

        :param project_name: name used to look up the project's terms
        :param output: one of 'mongo', 'print', 'json'
        """
        # NOTE(review): credentials are hard-coded — consider moving them
        # to configuration or environment variables.
        connection = MongoClient('localhost', 27017)
        connection['admin'].authenticate('rrpdream', 'rrpdream9453')

        db = connection['KeywordFinder']
        cursor = db['project_terms'].find({'project_name': project_name},
                                          {'keyword': True})
        # Only the first matching document's keyword list is used.
        keyword_list = list(cursor)[0]['keyword']

        predictor = WordPredictor(self.config)
        prediction_dict = {kw: predictor.find_similar_word(kw, self.config['n_predict'])
                           for kw in keyword_list}

        if output == 'mongo':
            document = {'time': str(datetime.now().date()),
                        'prediction': prediction_dict}
            print(db['prediction'].insert_one(document))
        elif output == 'print':
            print(prediction_dict)
        elif output == 'json':
            with open(self.config['path'] + 'project_terms_prediction.json', 'w') as f:
                json.dump(prediction_dict, f)
Example #2
0
    def __init__(self):
        """Train a WordPredictor on the full NLTK Gutenberg corpus and
        persist the trained model to the file ``WordPredictionFile``."""
        self.wp = WordPredictor()
        for fileid in nltk.corpus.gutenberg.fileids():
            self.wp.learn_from_text(nltk.corpus.gutenberg.raw(fileid))

        # pickle.dump writes the same bytes as f.write(pickle.dumps(...)).
        with open("WordPredictionFile", 'wb') as f:
            pickle.dump(self.wp, f)

        print("Word prediction training ready")
Example #3
0
    def find_similar_word_for_cluster(self, cluster_no, n_newword=15):
        """Return words similar to the keywords of cluster *cluster_no*.

        :param cluster_no: index into the cluster mapping from get_cluster()
        :param n_newword: number of similar words to request per keyword
        :returns: whatever WordPredictor.find_similar_word returns (a dict)
        """
        keyword = self.get_cluster()[cluster_no]

        # NOTE(review): this reads the module-level `config`, not
        # `self.config` — confirm that is intentional.
        predictor = WordPredictor(config, term_similarity_matrix=True)
        return predictor.find_similar_word(keyword, n_target=n_newword)
Example #4
0
class Train_Word_Predictor:
    """Train a WordPredictor on the NLTK Gutenberg corpus and serve word
    predictions either from the in-memory model or from a pickled copy."""

    def __init__(self):
        self.wp = WordPredictor()
        for corpus in nltk.corpus.gutenberg.fileids():
            self.wp.learn_from_text(nltk.corpus.gutenberg.raw(corpus))

        # Persist the trained model so predict_fromDisk() can reload it.
        with open("WordPredictionFile", 'wb') as f:
            f.write(pickle.dumps(self.wp))

        print("Word prediction training ready")

    @staticmethod
    def _top_alpha_words(predictions, limit=4):
        """From (word, score) pairs, keep the first *limit* purely-alphabetic
        words.  Shared by predict_words() and predict_fromDisk(), which
        previously duplicated this loop verbatim."""
        selected = []
        for item in predictions:
            if len(selected) == limit:
                break
            if item[0].isalpha():
                selected.append(item[0])
        return selected

    def predict_words(self, text):
        """Predict up to four alphabetic next words for *text* using the
        in-memory model; prints and returns the list."""
        # Only the 30 highest-ranked candidates are considered.
        finalWords = self._top_alpha_words(self.wp.predict(text).terms()[0:30])
        print(finalWords)
        return finalWords

    def predict_fromDisk(self, text):
        """Like predict_words(), but reloads the pickled model from disk
        and does not print.

        NOTE(review): pickle.loads on an externally writable file is
        unsafe — ensure only this application produces the file.
        """
        with open('WordPredictionFile', 'rb') as wf:
            wordObject = pickle.loads(wf.read())

        return self._top_alpha_words(wordObject.predict(text).terms()[0:30])
Example #5
0
    def find_new_word_for_cluster(self, cluster_no, n_newword=15):
        """Predict words similar to a cluster's keywords, excluding words
        already known to the model.

        :param cluster_no: index into the cluster mapping from get_cluster()
        :param n_newword: number of similar words to request per keyword
        :returns: dict mapping each keyword to its list of genuinely new
                  predicted words (per-key order preserved)
        """
        # A set gives O(1) membership tests instead of scanning the list
        # once per predicted word.
        # ("in" word and "not_in" word would be different.)
        known_words = set(self.get_all_prior_word())

        keyword = self.get_cluster()[cluster_no]

        # NOTE(review): this reads the module-level `config`, not
        # `self.config` — confirm that is intentional.
        word_predictor = WordPredictor(config, term_similarity_matrix=True)
        result_dict = word_predictor.find_similar_word(keyword,
                                                       n_target=n_newword)

        return {key: [word for word in value if word not in known_words]
                for key, value in result_dict.items()}
Example #6
0
    def build_and_evaluate(self):
        """Run the full pipeline: load the term-document matrix, build and
        save the LDA model, derive the term-similarity matrix, and evaluate
        the resulting word predictor against ground truth."""
        td_matrix = TermDocumentMatrix(self.config).load()

        lda = LdaModel(self.config)
        lda.build(td_matrix, self.config['alpha'], self.config['beta'],
                  self.config['n_topics'], save_model=True)

        similarity = TermSimilarityMatrix(self.config)
        similarity.create(lda.get_p_zw(), save=True)

        predictor = WordPredictor(self.config)
        scorer = Evaluate(self.config)
        scorer.ground_truth(predictor)
Example #7
0
    def run(self, alpha, beta):
        """Run one LDA build/evaluate pass with the given hyperparameters,
        storing the model's max-likelihood in self.maxlike and the
        ground-truth evaluation score in self.score."""
        td_matrix = TermDocumentMatrix(self.config).load()

        lda = LdaModel(self.config)
        lda.build(td_matrix, alpha, beta, self.config['n_topics'])
        p_zw = lda.get_p_zw()
        self.maxlike = lda.get_maxlike()

        similarity = TermSimilarityMatrix(self.config)
        similarity.create(p_zw, save=True)

        predictor = WordPredictor(self.config)
        scorer = Evaluate(self.config)
        self.score = scorer.ground_truth(predictor)
Example #8
0
def main():
    # train a model on the first bit of Moby Dick
    wp = WordPredictor()
    print("bad1 = %s" % wp.get_best("the"))
    wp.train("moby_start.txt")
    print("training words = %d" % (wp.get_training_count()))

    # try and crash things on bad input
    print("bad2 = %s" % wp.get_best("the"))
    wp.train("thisfiledoesnotexist.txt")
    print("training words = %d\n" % (wp.get_training_count()))

    words = ["the", "me", "zebra", "ishmael", "savage"]
    for word in words:
        print("count, %s = %d" % (word, wp.get_word_count(word)))
    wp.train("moby_end.txt")
    print()
    # check the counts again after training on the end of the book
    for word in words:
        print("count, %s = %d" % (word, wp.get_word_count(word)))
    print()

    # Get the object ready to start looking things up
    wp.build()

    # do some prefix loopups
    test = ["a", "ab", "b", "be", "t", "th", "archang"]
    for prefix in test:
        if wp.get_best(prefix):
            print(
                "%s -> %s\t\t\t%.6f" % (prefix, wp.get_best(prefix).get_word(),
                                        wp.get_best(prefix).get_prob()))
        else:
            print("%s -> %s\t\t\t%s" % (prefix, "None", "None"))
    print("training words = %d\n" % (wp.get_training_count()))

    # add two individual words to the training data
    wp.train_word("beefeater")
    wp.train_word("BEEFEATER!")
    wp.train_word("BEEFEATER")
    wp.train_word("Pneumonoultramicroscopicsilicovolcanoconiosis")
    print("added additional words")
    print("training words = %d\n" % (wp.get_training_count()))

    # The change should have no effect for prefix lookup until we build()
    test_2 = ['b', 'pn']
    for prefix in test_2:
        if wp.get_best(prefix):
            print("before, %s -> %s\t\t%.6f" %
                  (prefix, wp.get_best(prefix).get_word(),
                   wp.get_best(prefix).get_prob()))
        else:
            print("before, %s -> %s\t\t%s" % (prefix, "None", "None"))
    wp.build()
    for prefix in test_2:
        if wp.get_best(prefix):
            print("after, %s -> %s\t\t%.6f" %
                  (prefix, wp.get_best(prefix).get_word(),
                   wp.get_best(prefix).get_prob()))
        else:
            print("after, %s -> %s\t\t%s" % (prefix, "None", "None"))
    print("training words = %d\n" % (wp.get_training_count()))

    # test out training on a big file, timing the training as well
    start = timeit.default_timer()
    wp.train("mobydick.txt")
    wp.build()
    for prefix in test:
        if wp.get_best(prefix):
            print(
                "%s -> %s\t\t\t%.6f" % (prefix, wp.get_best(prefix).get_word(),
                                        wp.get_best(prefix).get_prob()))
        else:
            print("%s -> %s\t\t\t%s" % (prefix, "None", "None"))
    print("training words = %d\n" % (wp.get_training_count()))
    stop = timeit.default_timer()
    elapsed = (stop - start)
    print("elapsed (s): %.6f" % elapsed)
    # test lookup using random prefixes between 1-6 characters
    start = timeit.default_timer()
    random_load_test(wp)
    stop = timeit.default_timer()
    elapsed = (stop - start)
    print("elapsed (s): %.6f" % elapsed)
Example #9
0
from word_predictor import WordPredictor
import nltk

# Train the shared predictor once, at import time, on the Gutenberg corpus.
wp = WordPredictor()
for fileid in nltk.corpus.gutenberg.fileids():
    wp.learn_from_text(nltk.corpus.gutenberg.raw(fileid))


def PredictNext(phrase):
    """Return the top three (word, probability) predictions after *phrase*,
    e.g. [('The', 0.123), ('Me', 0.213), ('Hi', 0.123)]."""
    return wp.predict(phrase).terms()[:3]
Example #10
0
from flask import Flask, request, render_template
import cPickle as pickle
from word_predictor import WordPredictor


# Build and fit the predictor once at import time so all requests reuse it.
# NOTE(review): fit() is called with no arguments — presumably it trains on a
# default corpus; confirm against the WordPredictor implementation.
WP = WordPredictor()
WP.fit()

app = Flask(__name__)
# load model
# with open('data/vectorizer.pkl') as f:
#     vectorizer = pickle.load(f)
# with open('data/model.pkl') as f:
#     model = pickle.load(f)
#

# home page
@app.route('/')
def index():
    """Serve the application's landing page."""
    return render_template('index.html')

@app.route('/predict', methods=['POST'])
def predict():

    text = str(request.form['precede_words'])

    prediction = WP.predict(text)

    # print 'prediction type:' ,type(prediction)