def _search_html(self, html_doc: str, url: str, cancel_flag: mp.Event):
    """Scan an HTML document for urls and report progress to the main process.

    Every truthy item yielded by ``parsing.sort_soup`` is stored in
    ``self.scanned_urls`` under the next free integer index and announced
    with ``fetch_update_message``.  When the scan ends a completion message
    is sent, and either the download tasks are started (if the
    ``auto-download`` setting is truthy) or a "no links" notice is shown.

    Args:
        html_doc: the raw HTML text to parse
        url: the address the document belongs to
        cancel_flag: event polled after each yielded item; once set the
            scan loop stops early
    """
    soup = parsing.parse_html(html_doc)
    title_tag = soup.find("title")
    html_title = getattr(title_tag, "text", "")
    options.assign_unique_name("", html_title)

    # Compile the search filters used for forms, links and images
    self.filters = parsing.compile_filter_list(
        self.settings["filter-search"])

    # Let the main process know the sort is about to begin
    self.fetch_start_message(html_title, url)

    matches = parsing.sort_soup(
        url=url,
        soup=soup,
        include_forms=False,
        images_only=False,
        thumbnails_only=self.settings.get("thumbnails_only", True),
        filters=self.filters,
        img_exts=self.settings["images_to_search"])
    for url_data in matches:
        if url_data:
            next_index = len(self.scanned_urls)
            self.scanned_urls[next_index] = url_data
            self.fetch_update_message(next_index, url_data)
        if cancel_flag.is_set():
            break

    self.fetch_complete_message(len(self.scanned_urls), html_title)
    # Tell the commander the fetch is over (original note: this resets
    # its fetch_running state)
    self.queue.put(Message(const.THREAD_MAIN, const.EVENT_FETCH_COMPLETE))

    if not self.scanned_urls:
        self.message_main("No Links Found :(")
    elif self.settings["auto-download"]:
        # Message ourselves so the download tasks start right away
        self.queue.put_nowait(
            Message(thread=const.THREAD_MAIN, event=const.EVENT_START))
def stream_to_file(path: str, bytes_stream: BytesIO) -> Message:
    """
    Writes the buffer to file

    Opening the file happens inside the guarded section, so a path that
    cannot be opened is reported as STATUS_ERROR instead of raising.

    Args:
        path: the file path to save to
        bytes_stream: the stream saving from

    Returns:
        Message object containing either STATUS_OK or STATUS_ERROR
        if could not save to file. ``data["message"]`` holds
        "image saved" or the error text; ``data["path"]`` holds *path*.
    """
    status = const.STATUS_OK
    text = "image saved"
    try:
        with open(path, "wb") as fp:
            fp.write(bytes_stream.getbuffer())
    except Exception as err:
        # Deliberately broad: callers expect a Message for any failure
        # (unopenable path, closed stream, disk error, ...).
        status = const.STATUS_ERROR
        text = str(err)
    return Message(thread=const.THREAD_TASK,
                   event=const.EVENT_DOWNLOAD_IMAGE,
                   status=status,
                   id=0,
                   data={
                       "message": text,
                       "path": path
                   })
def _on_debug(self, evt):
    """Queue a fetch of the local test page on the commander queue.

    Args:
        evt: the triggering event object (unused)
    """
    debug_fetch = Message(
        thread=const.THREAD_MAIN,
        event=const.EVENT_FETCH,
        id=0,
        status=const.STATUS_OK,
        data={"url": "http://localhost:5000/setup_test"})
    self.app.commander.queue.put_nowait(debug_fetch)
def run(self):
    """Follow the task url up to three levels deep, then report the outcome.

    Unless the cancel flag is already set, cookies are loaded, a scanning
    message is posted, and each url accepted by ``add_url`` is followed
    with ``_follow_url``; the urls it returns are followed in turn, for
    three levels in total.  Finally ``notify_finished`` is called with
    STATUS_ERROR when the cancel flag is set, otherwise STATUS_OK.
    """
    if not self.cancel.is_set():
        cookie_jar = load_cookies(self.settings)
        scanning = Message(thread=const.THREAD_TASK,
                           id=self.task_index,
                           status=const.STATUS_OK,
                           event=const.EVENT_SCANNING,
                           data={"url": self.url_data.url})
        self.comm_queue.put_nowait(scanning)

        # Each level parses a page and yields the links for the next one
        if self.add_url(self.url_data):
            # Level 1
            first_hops = self._follow_url(self.url_data, cookie_jar)
            for first_hop in first_hops.values():
                if not self.add_url(first_hop):
                    continue
                # Level 2
                second_hops = self._follow_url(first_hop, cookie_jar)
                for second_hop in second_hops.values():
                    if self.add_url(second_hop):
                        # Level 3
                        self._follow_url(second_hop, cookie_jar)

    if self.cancel.is_set():
        self.notify_finished(const.STATUS_ERROR, "Task has cancelled")
    else:
        self.notify_finished(const.STATUS_OK, "Task has completed")
def fetch_link(self):
    """Queue a fetch for the address bar's url; do nothing if it is empty."""
    address = self.addressbar.txt_address.GetValue()
    if not address:
        return
    request = Message(thread=const.THREAD_MAIN,
                      event=const.EVENT_FETCH,
                      data={"url": address})
    self.app.commander.queue.put_nowait(request)
def fetch_update_message(self, index: int, url_data: UrlData):
    """Post an EVENT_FETCH message for one scanned url onto the main queue.

    Args:
        index: position of the url, sent as ``data["index"]``
        url_data: the url object, sent as ``data["url_data"]``
    """
    payload = {"url_data": url_data, "index": index}
    update = Message(thread=const.THREAD_COMMANDER,
                     event=const.EVENT_FETCH,
                     data=payload)
    self.main_queue.put_nowait(update)
def fetch_start_message(self, title: str, url: str):
    """Post an EVENT_FETCH_START message onto the main queue.

    Args:
        title: sent as ``data["title"]``
        url: sent as ``data["url"]``
    """
    payload = {"title": title, "url": url}
    started = Message(thread=const.THREAD_COMMANDER,
                      event=const.EVENT_FETCH_START,
                      data=payload)
    self.main_queue.put_nowait(started)
def fetch_complete_message(self, length: int, title: str):
    """Post an EVENT_FETCH_COMPLETE message with STATUS_OK onto the main queue.

    Args:
        length: sent as ``data["length"]``
        title: sent as ``data["title"]``
    """
    payload = {"length": length, "title": title}
    completed = Message(thread=const.THREAD_COMMANDER,
                        event=const.EVENT_FETCH_COMPLETE,
                        status=const.STATUS_OK,
                        data=payload)
    self.main_queue.put_nowait(completed)
def fetch_error_message(self, err: str, url: str):
    """Post an EVENT_FETCH_COMPLETE message with STATUS_ERROR onto the main queue.

    Args:
        err: error text, sent as ``data["message"]``
        url: sent as ``data["url"]``
    """
    payload = {"message": err, "url": url}
    failure = Message(thread=const.THREAD_COMMANDER,
                      event=const.EVENT_FETCH_COMPLETE,
                      status=const.STATUS_ERROR,
                      data=payload)
    self.main_queue.put_nowait(failure)
def ignored_error_message(self, url: str, message: str):
    """Post an EVENT_FETCH_COMPLETE message with STATUS_IGNORED onto the main queue.

    Args:
        url: sent as ``data["url"]``
        message: sent as ``data["message"]``
    """
    payload = {"message": message, "url": url}
    ignored = Message(thread=const.THREAD_COMMANDER,
                      event=const.EVENT_FETCH_COMPLETE,
                      status=const.STATUS_IGNORED,
                      data=payload)
    self.main_queue.put_nowait(ignored)
def on_close_window(self, evt):
    """Shut down on window close.

    Sets ``timer_quit``, destroys the detached frame, queues an
    EVENT_QUIT message for the commander, joins the commander and then
    lets the event continue propagating via ``evt.Skip()``.

    Args:
        evt: the close event
    """
    timer_quit.set()
    self.detached_frame.Destroy()

    commander = self.app.commander
    quit_request = Message(thread=const.THREAD_MAIN,
                           event=const.EVENT_QUIT,
                           id=0,
                           data={},
                           status=const.STATUS_OK)
    commander.queue.put(quit_request)
    commander.join()

    evt.Skip()
def notify_finished(self, status: int, message: str):
    """Post an EVENT_FINISHED message for this task onto the comm queue.

    If the pause flag is set, ``self.wait()`` is called before posting.

    Args:
        status: status code to attach to the message
        message: text sent as ``data["message"]``
    """
    if self.pause.is_set():
        self.wait()

    payload = {"message": message, "url": self.url_data.url}
    finished = Message(thread=const.THREAD_TASK,
                       status=status,
                       event=const.EVENT_FINISHED,
                       id=self.task_index,
                       data=payload)
    self.comm_queue.put_nowait(finished)
def _check_blacklist(self, url_data: UrlData, task: Task):
    """
    Answer a Task asking whether a url has already been searched
    by another Task.

    The url is keyed by ``repr(url_data.__dict__)``.  An unseen key is
    recorded in ``self.blacklist``; the reply placed on ``task.msg_box``
    carries ``data["added"]``, True only when the key was new.

    Args:
        url_data: the url to check
        task: the Task whose ``msg_box`` receives the reply
    """
    key = repr(url_data.__dict__)
    already_listed = key in self.blacklist
    if not already_listed:
        self.blacklist[key] = 1

    # "added" is the inverse of "was already present"
    reply = Message(thread=const.THREAD_COMMANDER,
                    event=const.EVENT_BLACKLIST,
                    data={"added": not already_listed})
    task.msg_box.put(reply)
def do_POST(self):
    """Handle POST requests; only ``/set-html`` is served, the rest get 404.

    For ``/set-html`` the request body is read using the Content-Length
    header, decoded with ``_decode_base64``, parsed as JSON and turned
    into a document by ``_create_document``.  The document and its url
    are queued as an EVENT_SERVER_READY message before replying 200.
    """
    if self.path != "/set-html":
        self.send_response(404)
        self.end_headers()
        return

    # NOTE(review): _set_headers() is defined elsewhere; if it already
    # sends a status line, the send_response(200) below repeats it - confirm.
    self._set_headers()
    body_length = int(self.headers["Content-Length"])
    raw_body = self.rfile.read(body_length)
    todo = json.loads(_decode_base64(raw_body))
    html = _create_document(todo)
    url = f"http://{_ServerHandler.host}:{_ServerHandler.port}/set-html"

    ready = Message(thread=const.THREAD_SERVER,
                    event=const.EVENT_SERVER_READY,
                    status=const.STATUS_OK,
                    data={"html": html, "url": url})
    _ServerHandler.queue.put_nowait(ready)

    self.send_response(200)
    self.end_headers()
def add_url(self, url_data: UrlData) -> bool:
    """
    Ask the Parent Process whether this url is already known.

    An EVENT_BLACKLIST message is put on ``comm_queue`` and the reply is
    then read from ``self.msg_box``.

    Args:
        url_data: the url object to query

    Returns:
        the ``"added"`` value of the reply; per the original note this is
        True if no entry was found and the parent added the url
    """
    query = Message(thread=const.THREAD_TASK,
                    event=const.EVENT_BLACKLIST,
                    data={
                        "index": self.task_index,
                        "urldata": url_data,
                        "url": url_data.url
                    },
                    id=self.task_index,
                    status=const.STATUS_OK)
    self.comm_queue.put(query)

    answer = self.msg_box.get()
    return answer.data["added"]
def _follow_url(self, url_data: UrlData, cookie_jar: CookieJar) -> dict:
    """
    Request a url and search the response for further urls.

    Any ``Exception`` raised while requesting or searching (including the
    cancellation signalled below) is reported to ``comm_queue`` as an
    EVENT_DOWNLOAD_IMAGE message with STATUS_ERROR rather than raised.

    Args:
        url_data: UrlData object
        cookie_jar: CookieJar object

    Returns:
        the dict produced by ``search_response``, or an empty dict when
        the request failed, the search failed or the task was cancelled
    """
    # Hang the thread if Pause flag is set
    if self.pause.is_set():
        self.wait()

    urls = {}
    try:
        self._network_io.set()
        try:
            response = request_from_url(url_data, cookie_jar, self.settings)
        finally:
            # Clear the flag even when the request raises, so it never
            # stays set after a failed request.
            self._network_io.clear()

        try:
            if self.cancel.is_set():
                raise Exception("Task has been Cancelled")
            urls = self.search_response(
                response, self.settings["form_search"]["enabled"])
        finally:
            # Always release the connection, also when the search fails
            response.close()
    except Exception as err:
        self.comm_queue.put_nowait(
            Message(thread=const.THREAD_TASK,
                    event=const.EVENT_DOWNLOAD_IMAGE,
                    data={
                        "url": url_data.url,
                        "message": str(err)
                    },
                    id=self.task_index,
                    status=const.STATUS_ERROR))
    # Returned outside any `finally` so that KeyboardInterrupt/SystemExit
    # are no longer silently discarded.
    return urls
def message_start(self):
    """Post an EVENT_START message onto the main queue."""
    start = Message(thread=const.THREAD_COMMANDER, event=const.EVENT_START)
    self.main_queue.put_nowait(start)
def message_quit(self):
    """Post an EVENT_QUIT message onto the main queue using ``put``."""
    quit_notice = Message(thread=const.THREAD_COMMANDER,
                          event=const.EVENT_QUIT)
    self.main_queue.put(quit_notice)
def message_main(self, message: str):
    """Post an EVENT_MESSAGE with STATUS_OK onto the main queue.

    Args:
        message: text sent as ``data["message"]``
    """
    notice = Message(thread=const.THREAD_COMMANDER,
                     event=const.EVENT_MESSAGE,
                     status=const.STATUS_OK,
                     data={"message": message})
    self.main_queue.put_nowait(notice)
def stop_tasks(self):
    """Queue EVENT_CANCEL followed by EVENT_FETCH_CANCEL for the commander."""
    commander_queue = self.app.commander.queue
    for event in (const.EVENT_CANCEL, const.EVENT_FETCH_CANCEL):
        commander_queue.put_nowait(
            Message(thread=const.THREAD_MAIN, event=event))
def download_image(filename: str, response: Response,
                   settings: dict) -> Message:
    """
    Validate a downloaded image and save it to disk.

    The image is ignored (STATUS_IGNORED) when it is not larger than the
    configured minimum resolution, when ``options.image_exists`` reports a
    byte duplicate in the save path, or when the file name already exists
    and ``settings["file_exists"]`` is "skip".  With "rename" a new path
    is obtained from ``options.rename_file``; any other value writes to
    the existing path.

    Args:
        filename: the name of the file to be saved to. This may change
            depending on same filename
        response: Response object returned from requests
        settings: Settings json object used to find unique save path and
            what to do if same filename found

    Returns:
        message (object): Message object containing information about
        what happened. This should be passed onto the Commander thread
        and handled

    Raises:
        Whatever ``Image.open`` raises for unreadable image data; the
        stream is closed before the exception propagates.
    """
    message = Message(thread=const.THREAD_TASK,
                      id=0,
                      status=const.STATUS_IGNORED,
                      event=const.EVENT_DOWNLOAD_IMAGE,
                      data={
                          "message": "unknown",
                          "url": response.url
                      })
    byte_stream = _response_to_stream(response)
    try:
        # check the image size is within our bounds
        minsize = settings["minimum_image_resolution"]
        image = Image.open(byte_stream)
        width, height = image.size
        if not (width > minsize["width"] and height > minsize["height"]):
            # add the URL to the cache
            if not cache.query_ignore(response.url):
                cache.add_ignore(response.url, "small-image", width, height)
            message.data["message"] = f"Image too small ({width}x{height})"
            return message

        # create a new save path
        path = create_save_path(settings)
        full_path = os.path.join(path, filename)

        # check byte duplicate
        _duplicate = options.image_exists(path, response.content)
        if _duplicate:
            message.data[
                "message"] = f"Bytes duplicate found locally with url {response.url} and {_duplicate}"
            return message

        # check filename duplicate and apply the user's setting
        if os.path.exists(full_path):
            on_existing = settings["file_exists"]
            if on_existing == "rename":
                full_path = options.rename_file(full_path)
            elif on_existing == "skip":
                # Return before writing: falling through to the write
                # would truncate the existing file and report an error.
                message.data["message"] = "Skipped file"
                message.data["path"] = full_path
                return message

        # everything ok. write image to disk
        return stream_to_file(full_path, byte_stream)
    finally:
        # close the file handle on every path, including exceptions
        byte_stream.close()
def notify_cancel(self):
    """Post an EVENT_CANCEL message onto the main queue."""
    cancel_notice = Message(thread=const.THREAD_COMMANDER,
                            event=const.EVENT_CANCEL)
    self.main_queue.put_nowait(cancel_notice)
def message_pause(self, pause: bool):
    """Post an EVENT_PAUSE message onto the main queue.

    Args:
        pause: flag sent as ``data["pause"]``
    """
    pause_notice = Message(thread=const.THREAD_COMMANDER,
                           event=const.EVENT_PAUSE,
                           data={"pause": pause})
    self.main_queue.put_nowait(pause_notice)
def message_complete(self):
    """Post an EVENT_COMPLETE message onto the main queue."""
    complete_notice = Message(thread=const.THREAD_COMMANDER,
                              event=const.EVENT_COMPLETE)
    self.main_queue.put_nowait(complete_notice)
def start_tasks(self):
    """Queue an EVENT_START message for the commander."""
    start_request = Message(thread=const.THREAD_MAIN,
                            event=const.EVENT_START)
    self.app.commander.queue.put_nowait(start_request)
def pause_tasks(self):
    """Queue an EVENT_PAUSE message for the commander."""
    pause_request = Message(thread=const.THREAD_MAIN,
                            event=const.EVENT_PAUSE)
    self.app.commander.queue.put_nowait(pause_request)
def search_response(self, response: Response, include_forms: bool) -> dict:
    """
    Inspect a response: parse HTML for further urls, or save an image.

    The content type is classified with ``mime.is_valid_content_type``.
    HTML is parsed and searched with ``parsing.sort_soup``; an image is
    handed to ``download_image`` and the resulting message is forwarded
    to ``comm_queue``; anything else is recorded in the ignore cache and
    reported as "Unknown File Type".

    Args:
        response: the response to inspect
        include_forms: passed through to ``parsing.sort_soup``

    Returns:
        dict mapping each truthy result's position in the ``sort_soup``
        output to that result; empty for non-HTML responses
    """
    urls = {}
    self.comm_queue.put_nowait(
        Message(thread=const.THREAD_TASK,
                event=const.EVENT_SEARCHING,
                id=self.task_index,
                status=const.STATUS_OK,
                data={}))

    # classify the response by url / Content-Type
    content_type = response.headers.get("Content-Type", "")
    ext = mime.is_valid_content_type(response.url, content_type,
                                     self.settings["images_to_search"])

    if mime.EXT_HTML == ext:
        # html document: parse the text and collect the links in the soup
        soup = parsing.parse_html(response.text)
        found = parsing.sort_soup(
            url=response.url,
            soup=soup,
            include_forms=include_forms,
            images_only=True,
            thumbnails_only=False,
            filters=self.filters,
            img_exts=self.settings["images_to_search"])
        urls = {
            position: link
            for position, link in enumerate(found) if link
        }
    elif ext in mime.IMAGE_EXTS:
        if self.settings["generate_filenames"]["enabled"]:
            # task index + file index make the generated name unique
            file_index = f"{self.task_index}_{self.file_index}{ext}"
            # advance the counter for the next image found
            self.file_index += 1
            filename = f'{self.settings["generate_filenames"]["name"]}{file_index}'
        else:
            # derive the filename from the url
            filename = options.url_to_filename(response.url, ext)

        # validate the image and save it
        try:
            msg = download_image(filename, response, self.settings)
            msg.data["url"] = response.url
            msg.id = self.task_index
            self.comm_queue.put_nowait(msg)
        except UnidentifiedImageError as err:
            # the image could not be loaded from the stream
            failure = Message(thread=const.THREAD_TASK,
                              id=self.task_index,
                              data={
                                  "url": response.url,
                                  "message": str(err)
                              },
                              event=const.EVENT_DOWNLOAD_IMAGE,
                              status=const.STATUS_ERROR)
            self.comm_queue.put_nowait(failure)
            return {}
    else:
        if not cache.query_ignore(response.url):
            cache.add_ignore(response.url, "unknown-file-type", 0, 0)
        unknown = Message(thread=const.THREAD_TASK,
                          id=self.task_index,
                          data={
                              "url": response.url,
                              "message": "Unknown File Type"
                          },
                          event=const.EVENT_DOWNLOAD_IMAGE,
                          status=const.STATUS_IGNORED)
        self.comm_queue.put_nowait(unknown)

    return urls