/
app.py
268 lines (240 loc) · 9.61 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
import sys
import pandas as pd
import requests
import os
import re
import time
from bs4 import BeautifulSoup
from gensim.summarization import summarize
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from datetime import datetime
from flask import Flask, render_template, request, send_file
from flask_mail import Mail, Message
""" --------------------Main Script-------------------------- """
#Readability
def prGreen(skk):
    """Print ``skk`` to stdout wrapped in the ANSI colour code 92 (green) and a reset."""
    coloured = "\033[92m {}\033[00m".format(skk)
    print(coloured)
def prCyan(skk):
    """Print ``skk`` to stdout wrapped in the ANSI colour code 96 (cyan) and a reset."""
    coloured = "\033[96m {}\033[00m".format(skk)
    print(coloured)
"""Global Variables"""
# Translation table mapping every code point above U+FFFF to U+FFFD (the
# replacement character); pullContent() passes it to .translate().
non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
# Rows collected by scrape(); each row is a list laid out like final_header.
final_output = []
# Column headers of the exported "Results" sheet.
final_header = ['Serial No',"Search Query","URL Link","Title of Article","Text Summary"]
# NOTE(review): not referenced anywhere else in the visible code.
abstract_Txt = ""
# Browser and driver locations come from the environment; os.environ.get()
# yields None when a variable is unset.
google_chrome_bin = os.environ.get('GOOGLE_CHROME_BIN')
chromedriver_path = os.environ.get('CHROMEDRIVER_PATH')
"""Set up Selenium driver"""
# Chrome is started headless with the flags below.
chrome_options = Options()
chrome_options.binary_location = google_chrome_bin
chrome_options.add_argument("--headless")
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("-incognito")
chrome_options.add_argument("--disable-popup-blocking")
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--no-sandbox')
# The driver is created once, at import time, and this single instance is
# shared by scrape() and get_content().
# NOTE(review): because it is shared, quitting it anywhere makes every later
# use fail until the process is restarted.
sel_driver = webdriver.Chrome(executable_path=chromedriver_path,chrome_options=chrome_options)
# Alternative driver for local Windows testing (kept for reference):
# sel_driver = webdriver.Chrome(executable_path='./static/ChromeDriverWin32/chromedriver.exe',chrome_options=chrome_options) #Local host Test
def scrape(lst_query, filename):
    """Search Google for each query, summarise the result pages and export to Excel.

    For every query the shared Selenium driver opens Google, types the query
    into the search box and the distinct ``https`` result links are collected
    from the rendered page.  Each link is handed to ``get_content``; links
    that yield a non-empty summary become one row (serial no, query, URL,
    title, summary) in ``final_output``.  All rows are then written to
    ``./static/user_pulls/Output_<filename>.xlsx``.

    Args:
        lst_query: Iterable of search strings.
        filename: Suffix used to build the output workbook name.
    """
    # Start from a clean slate so rows gathered by an earlier call do not
    # leak into this call's workbook.
    final_output.clear()

    for query in lst_query:
        sel_driver.implicitly_wait(1)  # reduce error
        sel_driver.get(r"https://www.google.com.sg/")

        # Type the query into the Google search box and submit it.
        search_box = sel_driver.find_element_by_class_name("gLFyf")
        search_box.click()
        search_box.send_keys(query)
        search_box.submit()

        # Collect the distinct https links of this query, in page order.
        webresults = BeautifulSoup(sel_driver.page_source, "html.parser")
        list_of_links = []
        seen_links = set()
        for info in webresults.find_all("div", {"class": "g"}):
            anchor = info.find("a")
            href = anchor.get('href') if anchor is not None else None
            # Result blocks without an anchor or href are skipped instead of
            # aborting the whole scrape.
            if not href or "https" not in href or href in seen_links:
                continue
            seen_links.add(href)
            list_of_links.append(href)

        # Serial numbers restart at 1 for each query and only advance for
        # links that actually produced a summary.
        counter = 1
        for url in list_of_links:
            print(url)
            try:
                title, summary = get_content(url)[:2]
            except Exception as exc:
                # Best effort: one unreadable page must not stop the others.
                print("Skipping {}: {}".format(url, exc))
                continue
            if summary != "":
                final_output.append([counter, query, url, title, summary])
                counter += 1

        prCyan("sleeping")
        time.sleep(2)  # sleep so that it will simulate actual human activity
        prCyan("Resuming")

    # Output to Excel File
    df_results = pd.DataFrame(final_output, columns=final_header)
    output_path = './static/user_pulls/Output_' + filename + '.xlsx'
    # The context manager saves and closes the workbook exactly once.
    with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
        df_results.to_excel(writer, sheet_name='Results', header=final_header, index=False)

        # Top-aligned, wrapped, bordered cells for every column.
        workbook = writer.book
        worksheet = writer.sheets['Results']
        cell_format = workbook.add_format()
        cell_format.set_align('top')
        cell_format.set_text_wrap()
        cell_format.set_border()

        column_widths = (('A:A', 10), ('B:B', 20), ('C:C', 40), ('D:D', 20), ('E:E', 70))
        for column_range, width in column_widths:
            worksheet.set_column(column_range, width, cell_format)

    # The shared driver is deliberately left running: it is created once at
    # import time and is reused by get_content() and by every later call, so
    # quitting it here would break all requests after the first one.
def pullContent(soup):
    """Return the text of a parsed page as one space-separated string.

    Text is taken from every ``<p>`` element.  When the page has none, the
    nodes immediately before and after each ``<br>`` are used instead; a node
    sitting between two ``<br>`` tags is collected only once.

    Args:
        soup: Parsed document exposing ``select`` and ``find_all``
            (a ``BeautifulSoup`` object in this file).

    Returns:
        The stripped text of each node joined by single spaces, with leading
        and trailing whitespace removed.  Empty when nothing was found.
    """
    print("Pulling")
    nodes = list(soup.select("p"))
    if not nodes:
        # Track nodes by identity: the next sibling of one <br> is the
        # previous sibling of the following <br>, and must not be added twice.
        seen_ids = set()
        for br_tag in soup.find_all('br'):
            for sibling in (br_tag.previous_sibling, br_tag.next_sibling):
                if sibling is None or id(sibling) in seen_ids:
                    continue
                seen_ids.add(id(sibling))
                nodes.append(sibling)

    fragments = []
    for node in nodes:
        try:
            fragment = node.text.strip()
        except UnicodeDecodeError:
            fragment = node.translate(non_bmp_map).strip()
        except TypeError:
            continue
        except Exception:
            # e.g. a node type without a usable ``.text``: fall back to its
            # string form rather than dropping it.
            fragment = str(node).strip()
        fragments.append(fragment)
    return " ".join(fragments).strip()
# Main content Generator with BS4 and Selenium if BS4 fails to scrape
def get_content(url, max_retries=3, timeout=10):
    """Fetch a page, extract its text and return ``[title, summary]``.

    The page is first downloaded with ``requests`` and parsed with
    BeautifulSoup.  If that yields fewer than 300 words (including when every
    download attempt failed), the page is loaded again through the shared
    Selenium driver and the text is re-extracted from the rendered source.
    The text is then stripped of ``{...}`` blocks, hashtags and e-mail
    addresses and passed to ``summarize``.

    Args:
        url: Address of the page to summarise.
        max_retries: Maximum number of ``requests`` download attempts.
        timeout: Per-attempt timeout, in seconds, for the download.

    Returns:
        A two-item list: the text of the first ``<h1>`` (or a placeholder
        when it is missing or looks like an error page) and the summary.

    Raises:
        Exceptions raised by the Selenium driver or by ``summarize`` are not
        handled here and propagate to the caller.
    """
    prCyan('BS4 Pull Request...')
    request_headers = requests.utils.default_headers()
    request_headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    })

    # Bounded retries: an unreachable URL must not block the caller forever.
    raw_html = ""
    for attempt in range(1, max_retries + 1):
        try:
            raw_html = requests.get(url, headers=request_headers, timeout=timeout).content
            break
        except requests.exceptions.RequestException:
            print("Connection refused by the server..")
            if attempt == max_retries:
                # Give up on requests; the empty text below triggers the
                # Selenium fallback.
                break
            print("Let me sleep for 5 seconds")
            print("ZZzzzz...")
            time.sleep(5)
            print("Was a nice sleep, now let me continue...")

    soup = BeautifulSoup(raw_html, 'html.parser')
    results = pullContent(soup)
    prGreen('BS4 Original Content:')
    print(results)

    # The title always comes from the requests/BS4 parse, never from Selenium.
    h1_tags = soup.select("h1")
    header = h1_tags[0].text.strip() if h1_tags else ""
    blocked_markers = ("Forbidden", "Access Denied", "400 Bad Request", "Error")
    if header == "" or any(marker in header for marker in blocked_markers):
        header = "Title Could not be Retrieved due to Webpage Restrictions"
    prCyan('Title:')
    print(header)

    # Check if content can be pulled with BS4 (word count minimum).
    valid_threshold = 300
    if len(results.split()) < valid_threshold:
        # Selenium Pull
        sel_driver.implicitly_wait(1)  # reduce error
        sel_driver.get(url)
        soup = BeautifulSoup(sel_driver.page_source, "html.parser")
        results = pullContent(soup)
        prCyan('Selenium Original Content:')
        print(results)

    # Final regex clean up before summarising.
    results = re.sub(r"\{(.*?)\}+", '', results)  # removes anything enclosed in {}
    results = re.sub(r"(#[A-Za-z]+)", '', results)  # removes hashtags
    # Remove individual e-mail addresses only (not the surrounding text).
    results = re.sub(r"[\w.+-]+@[\w-]+(?:\.[\w-]+)+", '', results)
    prCyan('After Regex...')
    print(results)

    final_results = summarize(results)
    prCyan('With text summary:')
    print(final_results)
    return [header, final_results]
"""-------------------------------FLASK APPLICATION------------------------------------"""
# The Flask application object; the routes below are registered on it.
app = Flask(__name__)
# SMTP password is read from the environment (None when EMAIL_PW is unset).
email_pw = os.environ.get('EMAIL_PW') #fetch from environment credentials
"""Flask Mail Configuration"""
app.config['TESTING'] = False
# Gmail SMTP over implicit SSL on port 465, so TLS is off and SSL is on.
app.config['MAIL_SERVER'] = 'smtp.gmail.com'
app.config['MAIL_PORT'] = 465
app.config['MAIL_USE_TLS'] = False
app.config['MAIL_USE_SSL'] = True
app.config['MAIL_DEBUG'] = False # Flask-Mail would otherwise follow the app's debug flag
app.config['MAIL_USERNAME'] = 'textcruncher@gmail.com'
app.config['MAIL_PASSWORD'] = email_pw
# No default sender: send_mail() passes the sender explicitly.
app.config['MAIL_DEFAULT_SENDER'] = None
app.config['MAIL_MAX_EMAILS'] = None
app.config['MAIL_SUPPRESS_SEND'] = False # Flask-Mail would otherwise follow the app's TESTING flag
app.config['MAIL_ASCII_ATTACHMENTS'] = False # do not force attachment file names to ASCII
# Max-age of 0 for files served by Flask; presumably so regenerated workbooks
# are never served from a browser cache.
app.config['SEND_FILE_MAX_AGE_DEFAULT'] = 0
"""Flask Mail Sending"""
# Mail extension bound to the app; used by send_mail().
mail= Mail(app)
@app.route('/send-mail/', methods=['POST'])
def send_mail():
    """E-mail a previously generated workbook to the addresses in the form.

    Form fields read:
        email_address: Comma-separated recipient addresses.
        msg_txt: Plain-text body of the message.
        fileName: Suffix identifying ``Output_<fileName>.xlsx`` in
            ``static/user_pulls``.

    Returns:
        The rendered ``downloads.html`` page on success; a plain-text 400
        response for an invalid file name or an empty recipient list; a
        plain-text 404 response when the workbook does not exist.
    """
    filename = request.form['fileName']
    # fileName is user-supplied and spliced into a filesystem path, so only
    # plain name characters are accepted (no separators or dots).
    if not re.fullmatch(r"[A-Za-z0-9_-]+", filename):
        return "Invalid file name", 400

    # Tolerate spaces around the commas and ignore empty entries.
    receiver = [
        address.strip()
        for address in request.form['email_address'].split(',')
        if address.strip()
    ]
    if not receiver:
        return "At least one recipient email address is required", 400

    text = request.form['msg_txt']  # receives from html form as String

    attachment_name = 'Output_' + filename + '.xlsx'
    try:
        with app.open_resource('./static/user_pulls/' + attachment_name) as fp:
            attachment_data = fp.read()
    except FileNotFoundError:
        return "Requested file does not exist", 404

    msg = Message(
        'Below is an Attached File of your Query Results',
        sender='textcruncher@gmail.com',
        recipients=receiver,
    )
    # Registered MIME type for .xlsx workbooks.
    msg.attach(
        attachment_name,
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        attachment_data,
    )
    msg.body = text
    mail.send(msg)
    return render_template('downloads.html', filename=filename)
@app.route('/')
def home():
    """Serve the landing page rendered from ``index.html``."""
    landing_page = render_template('index.html')
    return landing_page
@app.route('/', methods=['POST'])
def scrape_now():
    """Run a scrape for the submitted queries and show the download page.

    The ``queries`` form field is a comma-separated string.  Surrounding
    whitespace is removed from each query and blank entries are dropped.  The
    output workbook is named after the current local timestamp.

    Returns:
        The rendered ``downloads.html`` page with the timestamp as
        ``filename``, or the ``index.html`` form again when no usable query
        was submitted.
    """
    queries = request.form['queries']  # receives from html form as String
    lst_queries = [query.strip() for query in queries.split(',') if query.strip()]
    if not lst_queries:
        # Nothing to search for: show the form again instead of sending an
        # empty search to the scraper.
        return render_template('index.html')

    current_timestamp = datetime.now().strftime('%m%d%Y%H%M%S')
    scrape(lst_queries, current_timestamp)
    return render_template('downloads.html', filename=current_timestamp)
@app.route('/return-file/<filename>')
def return_file(filename):
    """Send the ``Output_<filename>.xlsx`` workbook stored under ``static/user_pulls``."""
    workbook_path = './static/user_pulls/Output_' + filename + '.xlsx'
    send_options = {'attachment_filename': 'Output.xlsx', 'cache_timeout': 0}
    return send_file(workbook_path, **send_options)
@app.route('/about')
def about():
    """Serve the about page rendered from ``about.html``."""
    about_page = render_template('about.html')
    return about_page
# Start Flask's built-in server when this file is executed directly.
# It listens on all interfaces, on the port given by the PORT environment
# variable (4000 when unset). Debug mode is NOT enabled by this call.
if __name__ == "__main__":
    port = int(os.environ.get('PORT', 4000))
    app.run(host='0.0.0.0', port=port)
    # Debug-mode alternative for local development:
    # app.run(debug=True)