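"""
scraper.py

Commander/worker threading module: a commander thread scans a url
for links, then "grunt" worker threads download and save any images
found on those links.
"""
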
import threading
import queue
import functools
import os
from io import BytesIO
from PIL import Image, UnidentifiedImageError
from http.cookiejar import CookieJar
import string
from urllib.request import url2pathname
from dataclasses import dataclass
from global_props import Settings
from debug import Debug
import web


@dataclass
class Message:
    """
    Message passed to and from threads.

    type   - the type of message
    thread - the thread name
    id     - the thread index
    status - the message status
    data   - extra data; depends on the message type
    """
    type: str
    thread: str
    id: int = 0
    status: str = ""
    data: dict = None
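

# Example (illustrative): the shape of the message a grunt sends
# when an image has been saved:
#   notify_commander(Message(thread="grunt", type="image", id=0,
#                            status="ok", data={"pathname": "path/to/image.jpg"}))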


class Threads:
    """
    Static class holding globally scoped thread state.
    """
    # commander thread reference
    commander = None
    # global list containing the running threads
    grunts = []
    # commander thread messaging queue
    commander_queue = queue.Queue()
    # lock for the print function; used for debugging
    # and will be removed later
    stdout_lock = threading.Lock()
    # global semaphore, sized from the max_connections
    # value found in the settings file
    semaphore = threading.Semaphore(10)
    # global event to cancel the currently running task
    cancel = threading.Event()
    # guards folder creation during downloads
    new_folder_lock = threading.Lock()


class Urls:
    """
    Thread-safe container class for storing global links.
    Checking for duplicate links here avoids scraping and
    downloading the same url twice.
    """
    links = []
    lock = threading.Lock()

    @staticmethod
    def clear():
        with Urls.lock:
            Urls.links.clear()

    @staticmethod
    def add_url(url):
        with Urls.lock:
            Urls.links.append(url)

    @staticmethod
    def url_exists(url):
        with Urls.lock:
            return url in Urls.links
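

# Example (illustrative): the check-then-add pattern Grunt.run uses
# to avoid downloading the same url twice:
#   if not Urls.url_exists(imgurl):
#       Urls.add_url(imgurl)
#       # ...request and download imgurl...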


class ImageFile:
    """
    Thread-safe writer. Takes the bytes read from a request
    and saves them to disk.
    """
    file_lock = threading.Lock()

    @staticmethod
    def write_to_file(path, filename, bytes_stream):
        with ImageFile.file_lock:
            if not os.path.exists(path):
                os.mkdir(path)
            full_path = os.path.join(path, filename)
            with open(full_path, "wb") as fp:
                fp.write(bytes_stream.getbuffer())
            notify_commander(Message(thread="grunt", type="image",
                                     status="ok", data={"pathname": full_path}))


def request_from_url(url, settings):
    """
    request_from_url(str, dict)
    requests the url and returns the response object,
    or None if the request failed
    """
    cookies = settings["cookies"]
    if cookies["firefox"]:
        cj = web.browser_cookie3.firefox()
    elif cookies["chrome"]:
        cj = web.browser_cookie3.chrome()
    elif cookies["opera"]:
        cj = web.browser_cookie3.opera()
    elif cookies["edge"]:
        cj = web.browser_cookie3.edge()
    else:
        cj = CookieJar()
    try:
        r = web.requests.get(url,
                             cookies=cj,
                             headers={"User-Agent": web.FIREFOX_USER_AGENT},
                             timeout=settings["connection_timeout"])
    # catch all requests errors, not just read timeouts, so a failed
    # connection cannot kill the calling thread
    except web.requests.RequestException as err:
        Debug.log_file("ConnectionError", "request_from_url",
                       f"Connection failed on {url}: {err}")
        r = None
    return r


def log_thread_safe(message):
    """
    synchronized print. will be removed soon
    """
    with Threads.stdout_lock:
        print(message)


def create_commander(callback):
    """
    create the main handler thread.
    this thread keeps iterating for the
    remainder of the program's life cycle
    """
    Threads.commander = threading.Thread(
        target=commander_thread, kwargs={"callback": callback})
    return Threads.commander


def download_image(filename, response, settings):
    """
    download_image(str, object, dict)
    filename is the name of the file to save.
    response is the response returned from requests.get.
    settings supplies the save path; os.path.join is used
    to append the filename to that path.
    """
    # read from the socket and store in memory;
    # images shouldn't be too large
    byte_stream = BytesIO()
    for buff in response.iter_content(1000):
        byte_stream.write(buff)
    # load the image from the buffered bytes
    try:
        image = Image.open(byte_stream)
    except UnidentifiedImageError as err:
        image = None
        Debug.log("IMAGE_OPEN_ERROR", err, url=response.url, error=str(err))
        Debug.log_file("ImageOpenError", "download_image",
                       f"Error opening image from {response.url}")
    if image:
        width, height = image.size
        # if the image meets the size requirement then save it
        if width > 200 and height > 200:
            # check that the directory exists; only one thread
            # may create folders at a time
            Threads.new_folder_lock.acquire()
            if not os.path.exists(settings["save_path"]):
                os.mkdir(settings["save_path"])
            if settings["unique_pathname"]["enabled"]:
                path = os.path.join(settings["save_path"],
                                    settings["unique_pathname"]["name"])
                if not os.path.exists(path):
                    os.mkdir(path)
            else:
                path = settings["save_path"]
            Threads.new_folder_lock.release()
            ImageFile.write_to_file(path, filename, byte_stream)
        image.close()
    byte_stream.close()


def _assign_unique_name(url, html_doc):
    """
    uses the title tag in the html document as a folder name;
    if no title is found, the url is used instead
    """
    Threads.new_folder_lock.acquire()
    title = web.get_title_from_html(html_doc)
    if title:
        unique_name = url2pathname(title.text)
    else:
        unique_name = url2pathname(url)
    # remove any illegal characters
    # (format_filename was taken from stackoverflow)
    # and assign to the global unique_pathname setting
    settings = Settings.load()
    settings["unique_pathname"]["name"] = format_filename(unique_name)
    Settings.save(settings)
    Threads.new_folder_lock.release()


def format_filename(s):
    """Take a string and return a valid filename constructed from the string.
    Uses a whitelist approach: any characters not present in valid_chars are
    removed. Also spaces are replaced with underscores.

    Note: this method may produce invalid filenames such as ``, `.` or `..`
    When I use this method I prepend a date string like '2009_01_15_19_46_32_'
    and append a file extension like '.txt', so I avoid the potential of using
    an invalid filename.
    """
    valid_chars = f"-_.() {string.ascii_letters}{string.digits}"
    filename = ''.join(c for c in s if c in valid_chars)
    filename = filename.replace(' ', '_')  # I don't like spaces in filenames.
    return filename
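

# Example (illustrative):
#   format_filename("My Page: Photos!")  ->  "My_Page_Photos"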


class Grunt(threading.Thread):
    """
    Worker thread which searches for images on the url passed into __init__
    """

    def __init__(self, thread_index, url, settings, **kwargs):
        """
        __init__(int, str, dict, **kwargs)
        thread_index should be a unique number; it can be used to
        create a unique filename and also identifies the thread.
        The first thread is index 0.
        url is the universal resource locator to search and parse.
        settings is the settings dict in use for this scan.
        """
        super().__init__(**kwargs)
        self.thread_index = thread_index
        self.url = url
        self.settings = settings

    def run(self):
        # partial function to avoid repetitive typing
        GruntMessage = functools.partial(Message, id=self.thread_index, thread="grunt")
        Threads.semaphore.acquire()
        # make sure the semaphore is released even if the scan raises
        try:
            if not Threads.cancel.is_set():
                notify_commander(GruntMessage(status="ok", type="scanning"))
                # request the url
                r = request_from_url(self.url, self.settings)
                if r:
                    ext = web.is_valid_content_type(self.url,
                                                    r.headers.get("Content-Type"),
                                                    self.settings["images_to_search"])
                    if ext == ".html":
                        imgs = []
                        # parse the document and search for images only
                        if web.parse_html(self.url, r.text, imgs,
                                          images_only=True, thumbnails_only=False) > 0:
                            r.close()
                            for index, imgurl in enumerate(imgs):
                                # skip urls already in the global list
                                if not Urls.url_exists(imgurl):
                                    Urls.add_url(imgurl)
                                    # download the image and save it
                                    imgresp = request_from_url(imgurl, self.settings)
                                    if imgresp:
                                        # check the content-type matches an image
                                        ext = web.is_valid_content_type(
                                            imgurl,
                                            imgresp.headers.get("Content-Type"),
                                            self.settings["images_to_search"])
                                        if ext in web.IMAGE_EXTS:
                                            # build a filename; download_image checks
                                            # the image resolution and saves to file
                                            # if the requirements are met
                                            if self.settings["generate_filenames"]["enabled"]:
                                                filename = f'{self.settings["generate_filenames"]["name"]}{self.thread_index}{ext}'
                                            else:
                                                filename = f"test{self.thread_index}{ext}"
                                            download_image(filename, imgresp, self.settings)
                                        # close the image request handle
                                        imgresp.close()
                    else:
                        if ext in web.IMAGE_EXTS:
                            if not Urls.url_exists(self.url):
                                Urls.add_url(self.url)
                                # build a filename; download_image checks
                                # the image resolution and saves to file
                                # if the requirements are met
                                if self.settings["generate_filenames"]["enabled"]:
                                    filename = f'{self.settings["generate_filenames"]["name"]}{self.thread_index}{ext}'
                                else:
                                    filename = f"test{self.thread_index}{ext}"
                                download_image(filename, r, self.settings)
                        r.close()
        finally:
            Threads.semaphore.release()
        if Threads.cancel.is_set():
            notify_commander(GruntMessage(status="cancelled", type="finished"))
        else:
            notify_commander(GruntMessage(status="complete", type="finished"))


def commander_thread(callback):
    """
    Main handler thread. Level 1 parser and image finder:
    takes a url from the main thread, scans it for links,
    and creates Grunt threads for any links found.
    """
    quit = False
    grunts = []
    _task_running = False
    callback(Message(thread="commander", type="message",
                     data={"message": "Commander thread has loaded. Waiting to scan"}))
    # keeps the code below from getting too verbose
    MessageMain = functools.partial(Message, thread="commander", type="message")
    # the settings dict will hold the settings at the start of scraping
    settings = {}
    scanned_urls = []
    while not quit:
        try:
            # get the next message from the global queue
            r = Threads.commander_queue.get(timeout=0.5)
            if r.thread == "main":
                if r.type == "quit":
                    Threads.cancel.set()
                    callback(Message(thread="commander", type="quit"))
                    quit = True
                elif r.type == "start":
                    if not _task_running:
                        grunts = []
                        _task_running = True
                        # load the settings from file into a new dict
                        # in memory; we don't want these values to change
                        # while downloading and saving to file
                        settings = dict(Settings.load())
                        # set the max connections
                        max_connections = round(int(settings["max_connections"]))
                        Threads.semaphore = threading.Semaphore(max_connections)
                        Debug.log_file("SETTINGS", "commander.run",
                                       f"Max Connections set to {max_connections}")
                        callback(MessageMain(data={"message": "Starting Threads..."}))
                        for thread_index, url in enumerate(scanned_urls):
                            grunts.append(Grunt(thread_index, url, settings))
                        for _grunt in grunts:
                            _grunt.start()
                elif r.type == "fetch":
                    if not _task_running:
                        # load the settings
                        callback(Message(thread="commander", type="fetch", status="started"))
                        settings = Settings.load()
                        callback(MessageMain(data={"message": "Initializing the global search filter..."}))
                        # compile the global filter so only matching links are added
                        web.compile_regex_global_filter()
                        # get the document from the URL
                        callback(MessageMain(data={"message": f"Connecting to {r.data['url']}"}))
                        webreq = request_from_url(r.data["url"], settings)
                        if webreq:
                            # make sure it is a text document we can parse
                            ext = web.is_valid_content_type(r.data["url"],
                                                            webreq.headers["Content-type"],
                                                            settings["images_to_search"])
                            if ext == ".html":
                                html_doc = webreq.text
                                # get the url title
                                _assign_unique_name(r.data["url"], html_doc)
                                callback(MessageMain(data={"message": "Parsing HTML Document..."}))
                                # scrape links and images from the document
                                scanned_urls = []
                                if web.parse_html(url=r.data["url"],
                                                  html=html_doc,
                                                  urls=scanned_urls,
                                                  images_only=False,
                                                  thumbnails_only=True) > 0:
                                    # send the scanned urls to the main thread for processing
                                    callback(MessageMain(data={"message": f"Parsing successful. Found {len(scanned_urls)} links"}))
                                    data = {"urls": scanned_urls}
                                    reqmsg = Message(thread="commander", type="fetch",
                                                     status="finished", data=data)
                                    callback(reqmsg)
                                else:
                                    # nothing found; notify the main thread
                                    callback(MessageMain(data={"message": "No links found :("}))
                            webreq.close()
                    else:
                        callback(MessageMain(data={"message": "Still scanning for images. Press cancel to start a new scan"}))
                elif r.type == "cancel":
                    Threads.cancel.set()
            elif r.thread == "grunt":
                callback(r)
            elif r.thread == "settings":
                callback(MessageMain(data=r.data))
        except queue.Empty:
            # no message arrived within the timeout; fall through
            # to the housekeeping check below
            pass
        finally:
            if _task_running:
                # check whether all grunts have finished; if so, clean up
                # and notify the main thread
                if len(grunts_alive(grunts)) == 0:
                    Threads.cancel.clear()
                    grunts = []
                    _task_running = False
                    Urls.clear()
                    callback(Message(thread="commander", type="complete"))


def grunts_alive(grunts):
    """
    returns a list of grunt threads that are still alive
    """
    return list(filter(lambda grunt: grunt.is_alive(), grunts))


def _simulate_grunts(grunts):
    # for debugging the thread messaging system and synchronization.
    # Grunt requires a url and a settings dict; a blank url is passed
    # here so each grunt starts, fails its request and reports back
    settings = dict(Settings.load())
    for x in range(50):
        grunt = Grunt(x, "", settings)
        grunts.append(grunt)
        grunt.start()


def notify_commander(r):
    """
    notify_commander(Message)
    puts a message on the commander's FIFO queue without blocking
    r - the Message to send
    """
    Threads.commander_queue.put_nowait(r)
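

# A minimal smoke test; an illustrative sketch, not part of the original
# module. It starts the commander thread, then immediately asks it to quit.
# Assumes global_props, debug and web are importable from this directory.
if __name__ == "__main__":
    def _print_callback(msg):
        # messages from the commander arrive here
        log_thread_safe(f"{msg.thread}/{msg.type}: {msg.data}")

    commander = create_commander(_print_callback)
    commander.start()
    # ask the commander to shut down and wait for it to exit
    notify_commander(Message(thread="main", type="quit"))
    commander.join()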