-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.py
113 lines (91 loc) · 2.86 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""The main application."""
import json
import os
import requests
import operator
import re
import nltk
from flask import Flask, render_template, request, jsonify, url_for
from flask_sqlalchemy import SQLAlchemy
from stop_words import stops
from collections import Counter
from bs4 import BeautifulSoup
from rq import Queue
from rq.job import Job
from worker import conn
# Create the Flask application and load its settings from the config object
# named by the APP_SETTINGS environment variable. Indexing os.environ
# directly means a KeyError is raised at import time if it is not set.
app = Flask(__name__)
app.config.from_object(os.environ['APP_SETTINGS'])
# Switch off the SQLALCHEMY_TRACK_MODIFICATIONS setting.
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
# Database handle bound to this application.
db = SQLAlchemy(app)
# Job queue that uses the connection exported by the worker module.
q = Queue(connection=conn)
# NOTE(review): imported here rather than with the other imports, presumably
# because models needs `db` to exist first - confirm against models.py.
from models import Result
def count_and_save_words(url):
    """Fetch a page, count its words, and save the counts to the database.

    The page text is tokenized with NLTK, tokens without any ASCII letter
    are dropped, and two tallies are stored: one over all remaining words
    and one that also excludes the words listed in ``stops``.

    Args:
        url: Address of the page to download and analyse.

    Returns:
        The ``id`` of the newly saved ``Result`` row on success, or a dict
        of the form ``{"error": [message, ...]}`` when the page could not
        be fetched or the row could not be saved.
    """
    # Seconds to wait for the remote host before giving up.
    fetch_timeout = 30
    nltk_data_dir = './nltk_data/'

    errors = []

    try:
        # Without a timeout, requests waits indefinitely, so a single
        # unresponsive host could block the worker forever.
        r = requests.get(url, timeout=fetch_timeout)
    except Exception:
        errors.append(
            'Unable to get URL. Please make sure it is valid and try again.'
        )
        return {"error": errors}

    raw = BeautifulSoup(r.text).get_text()

    # Register the bundled NLTK data directory once; appending on every
    # call would grow the search path for as long as the process lives.
    if nltk_data_dir not in nltk.data.path:
        nltk.data.path.append(nltk_data_dir)
    tokens = nltk.word_tokenize(raw)
    text = nltk.Text(tokens)

    # remove punctuation, count raw words
    non_punct = re.compile('.*[A-Za-z].*')
    raw_words = [w for w in text if non_punct.match(w)]
    raw_word_count = Counter(raw_words)

    # stop words
    no_stop_words = [w for w in raw_words if w.lower() not in stops]
    no_stop_words_count = Counter(no_stop_words)

    # save the results
    try:
        result = Result(
            url=url,
            result_all=raw_word_count,
            result_no_stop_words=no_stop_words_count
        )
        db.session.add(result)
        db.session.commit()
        return result.id
    except Exception:
        # Discard the failed transaction so the session is usable again
        # by whatever runs next in this process.
        db.session.rollback()
        errors.append('Unable to add item to database.')
        return {"error": errors}
@app.route('/', methods=['GET', 'POST'])
def index():
    """Serve the landing page for both GET and POST requests."""
    page = render_template('index.html')
    return page
@app.route('/start', methods=['POST'])
def get_counts():
    """Queue a word-count job for the URL given in the request body.

    The body must be a JSON object with a non-empty string under the
    ``"url"`` key. If the URL has no ``http://`` or ``https://`` prefix,
    ``http://`` is prepended before the job is queued.

    Returns:
        The id of the queued job as the response body, or an error
        message with status 400 when the body is not valid JSON or does
        not carry a usable ``"url"``.
    """
    # this import solves a rq bug which currently exists
    from app import count_and_save_words

    # get url; JSONDecodeError and UnicodeDecodeError are both ValueErrors
    try:
        data = json.loads(request.data.decode())
    except ValueError:
        return "Request body must be valid JSON.", 400

    url = data.get("url") if isinstance(data, dict) else None
    if not isinstance(url, str) or not url.strip():
        return 'A non-empty "url" string is required.', 400

    url = url.strip()
    # URL schemes are case-insensitive, so compare in lower case to avoid
    # turning e.g. "HTTP://example.com" into "http://HTTP://example.com".
    if not url.lower().startswith(('https://', 'http://')):
        url = 'http://' + url

    # start job
    job = q.enqueue_call(
        func=count_and_save_words, args=(url,), result_ttl=5000
    )

    # return created job id
    return job.get_id()
@app.route('/results/<job_key>', methods=['GET'])
def get_results(job_key):
    """Report the outcome of a previously queued word-count job.

    Args:
        job_key: Id of the job, as returned when the job was queued.

    Returns:
        - 200 with a JSON list of the ten most frequent non-stop-words as
          ``[word, count]`` pairs, highest count first, when the job has
          finished and its result row exists.
        - 202 with a plain-text message while the job is still pending.
        - 404 when the job id is unknown or the result row is missing.
        - 500 when the job raised, or finished with an error payload
          (the ``{"error": [...]}`` dict is returned as JSON).
    """
    # Imported locally so this function carries its own dependency.
    from rq.exceptions import NoSuchJobError

    try:
        job = Job.fetch(job_key, connection=conn)
    except NoSuchJobError:
        return "No job found for the given id.", 404

    # A failed job never becomes "finished"; say so instead of telling
    # the client to keep waiting.
    if job.is_failed:
        return "The job failed.", 500

    if not job.is_finished:
        return "Your results are not ready yet!", 202

    outcome = job.result
    if isinstance(outcome, dict):
        # count_and_save_words returns {"error": [...]} instead of a row
        # id when it could not fetch the page or save the counts.
        return jsonify(outcome), 500

    result = Result.query.filter_by(id=outcome).first()
    if result is None:
        return "The stored result could not be found.", 404

    results = sorted(
        result.result_no_stop_words.items(),
        key=operator.itemgetter(1),
        reverse=True
    )[:10]
    return jsonify(results)
if __name__ == "__main__":
app.run()