Пример #1
0
 def _search_html(self, html_doc: str, url: str, cancel_flag: mp.Event):
     """
     Parse an html document, record every url found in it and report progress.

     The document title is passed to ``options.assign_unique_name`` and the
     search filters are rebuilt from ``self.settings["filter-search"]``
     before the document is scanned. Each truthy result yielded by
     ``parsing.sort_soup`` is stored in ``self.scanned_urls`` under the
     next free integer index and announced with ``fetch_update_message``.

     Args:
         html_doc: the html text to parse
         url: the address the document came from
         cancel_flag: event checked after every result; when set the scan
             stops early (the completion messages are still sent)
     """
     document = parsing.parse_html(html_doc)
     page_title = getattr(document.find("title"), "text", "")
     options.assign_unique_name("", page_title)
     # Compile the search filters used to match forms, links and images
     self.filters = parsing.compile_filter_list(
         self.settings["filter-search"])
     # Notify the Main Process we're starting the sort
     self.fetch_start_message(page_title, url)
     matches = parsing.sort_soup(
         url=url,
         soup=document,
         include_forms=False,
         images_only=False,
         thumbnails_only=self.settings.get("thumbnails_only", True),
         filters=self.filters,
         img_exts=self.settings["images_to_search"])
     for match in matches:
         if match:
             # the current size of the mapping doubles as the next index
             slot = len(self.scanned_urls)
             self.scanned_urls[slot] = match
             self.fetch_update_message(slot, match)
         if cancel_flag.is_set():
             break
     self.fetch_complete_message(len(self.scanned_urls), page_title)
     # tell the commander process the fetch is over (original note: this
     # sets its fetch_running state to False)
     self.queue.put(Message(const.THREAD_MAIN, const.EVENT_FETCH_COMPLETE))
     if not self.scanned_urls:
         self.message_main("No Links Found :(")
     elif self.settings["auto-download"]:
         # Message ourselves and start the tasks
         self.queue.put_nowait(
             Message(thread=const.THREAD_MAIN, event=const.EVENT_START))
Пример #2
0
def stream_to_file(path: str, bytes_stream: BytesIO) -> Message:
    """
    Write the contents of an in-memory stream to a file.

    Args:
        path: the file path to save to
        bytes_stream: the stream whose buffer is written out

    Returns:
        Message object with STATUS_OK and the text "image saved" when the
        write succeeded, or STATUS_ERROR carrying the exception text when the
        file could not be opened or written. Both carry the target path.
    """
    try:
        # open() lives inside the try block so that a failure to create the
        # file (missing directory, permissions, ...) is reported as a
        # STATUS_ERROR message instead of escaping to the caller.
        with open(path, "wb") as fp:
            fp.write(bytes_stream.getbuffer())
    except Exception as err:
        # Deliberately broad: any failure is turned into an error message
        # for the caller to forward rather than an exception.
        return Message(thread=const.THREAD_TASK,
                       event=const.EVENT_DOWNLOAD_IMAGE,
                       status=const.STATUS_ERROR,
                       id=0,
                       data={
                           "message": str(err),
                           "path": path
                       })
    return Message(thread=const.THREAD_TASK,
                   event=const.EVENT_DOWNLOAD_IMAGE,
                   status=const.STATUS_OK,
                   id=0,
                   data={
                       "message": "image saved",
                       "path": path
                   })
Пример #3
0
 def _on_debug(self, evt):
     """
     Queue a fetch of the hard-coded local test page on the commander queue.

     Args:
         evt: the triggering event object (not used here)
     """
     enqueue = self.app.commander.queue.put_nowait
     debug_request = Message(
         thread=const.THREAD_MAIN,
         event=const.EVENT_FETCH,
         id=0,
         status=const.STATUS_OK,
         data={"url": "http://localhost:5000/setup_test"})
     enqueue(debug_request)
Пример #4
0
    def run(self):
        """
        Follow the task's url up to three levels deep, then report the outcome.

        Nothing is scanned when the cancel flag is already set. Every url is
        first offered to ``add_url``; only urls it accepts are followed. The
        final ``notify_finished`` call is always made: STATUS_ERROR when the
        cancel flag is set at that point, STATUS_OK otherwise.
        """
        if not self.cancel.is_set():
            jar = load_cookies(self.settings)
            scanning = Message(thread=const.THREAD_TASK,
                               id=self.task_index,
                               status=const.STATUS_OK,
                               event=const.EVENT_SCANNING,
                               data={"url": self.url_data.url})
            self.comm_queue.put_nowait(scanning)
            # Each level requests a url and collects the urls found in the
            # response; those feed the next level.
            if self.add_url(self.url_data):
                # Level 1
                first_level = self._follow_url(self.url_data, jar)
                for first_hit in first_level.values():
                    if not self.add_url(first_hit):
                        continue
                    # Level 2
                    second_level = self._follow_url(first_hit, jar)
                    for second_hit in second_level.values():
                        if not self.add_url(second_hit):
                            continue
                        # Level 3 - results are not followed any further
                        self._follow_url(second_hit, jar)

        was_cancelled = self.cancel.is_set()
        self.notify_finished(
            const.STATUS_ERROR if was_cancelled else const.STATUS_OK,
            "Task has cancelled" if was_cancelled else "Task has completed")
Пример #5
0
 def fetch_link(self):
     """
     Ask the commander to fetch the url currently typed in the address bar.

     Does nothing when the address field is empty.
     """
     # Read the control once so the value that is checked is exactly the
     # value that gets sent.
     address = self.addressbar.txt_address.GetValue()
     if not address:
         return
     self.app.commander.queue.put_nowait(
         Message(thread=const.THREAD_MAIN,
                 event=const.EVENT_FETCH,
                 data={"url": address}))
Пример #6
0
 def fetch_update_message(self, index: int, url_data: UrlData):
     """
     Post an EVENT_FETCH message for one scanned url on the main queue.

     Args:
         index: sent under the "index" key
         url_data: sent under the "url_data" key
     """
     payload = {"url_data": url_data, "index": index}
     update = Message(thread=const.THREAD_COMMANDER,
                      event=const.EVENT_FETCH,
                      data=payload)
     self.main_queue.put_nowait(update)
Пример #7
0
 def fetch_start_message(self, title: str, url: str):
     """
     Post an EVENT_FETCH_START message on the main queue.

     Args:
         title: sent under the "title" key
         url: sent under the "url" key
     """
     payload = {"title": title, "url": url}
     started = Message(thread=const.THREAD_COMMANDER,
                       event=const.EVENT_FETCH_START,
                       data=payload)
     self.main_queue.put_nowait(started)
Пример #8
0
 def fetch_complete_message(self, length: int, title: str):
     """
     Post an EVENT_FETCH_COMPLETE message with STATUS_OK on the main queue.

     Args:
         length: sent under the "length" key
         title: sent under the "title" key
     """
     payload = {"length": length, "title": title}
     finished = Message(thread=const.THREAD_COMMANDER,
                        event=const.EVENT_FETCH_COMPLETE,
                        status=const.STATUS_OK,
                        data=payload)
     self.main_queue.put_nowait(finished)
Пример #9
0
 def fetch_error_message(self, err: str, url: str):
     """
     Post an EVENT_FETCH_COMPLETE message with STATUS_ERROR on the main queue.

     Args:
         err: the error text, sent under the "message" key
         url: sent under the "url" key
     """
     payload = {"message": err, "url": url}
     failure = Message(thread=const.THREAD_COMMANDER,
                       event=const.EVENT_FETCH_COMPLETE,
                       status=const.STATUS_ERROR,
                       data=payload)
     self.main_queue.put_nowait(failure)
Пример #10
0
 def ignored_error_message(self, url: str, message: str):
     """
     Post an EVENT_FETCH_COMPLETE message with STATUS_IGNORED on the main queue.

     Args:
         url: sent under the "url" key
         message: sent under the "message" key
     """
     payload = {"message": message, "url": url}
     ignored = Message(thread=const.THREAD_COMMANDER,
                       event=const.EVENT_FETCH_COMPLETE,
                       status=const.STATUS_IGNORED,
                       data=payload)
     self.main_queue.put_nowait(ignored)
Пример #11
0
 def on_close_window(self, evt):
     """
     Shut down when the window closes.

     Sets the ``timer_quit`` flag, destroys the detached frame, sends
     EVENT_QUIT to the commander and waits for the commander to finish
     before letting the event continue via ``evt.Skip()``.

     Args:
         evt: the close event
     """
     timer_quit.set()
     self.detached_frame.Destroy()
     commander = self.app.commander
     quit_request = Message(thread=const.THREAD_MAIN,
                            event=const.EVENT_QUIT,
                            id=0,
                            data={},
                            status=const.STATUS_OK)
     # blocking put: the quit request has to reach the queue before joining
     commander.queue.put(quit_request)
     commander.join()
     evt.Skip()
Пример #12
0
 def notify_finished(self, status: int, message: str):
     """
     Post an EVENT_FINISHED message for this task on the comm queue.

     When the pause flag is set, ``self.wait()`` is called first.

     Args:
         status: status value placed on the message
         message: text sent under the "message" key
     """
     if self.pause.is_set():
         self.wait()
     payload = {"message": message, "url": self.url_data.url}
     finished = Message(thread=const.THREAD_TASK,
                        status=status,
                        event=const.EVENT_FINISHED,
                        id=self.task_index,
                        data=payload)
     self.comm_queue.put_nowait(finished)
Пример #13
0
 def _check_blacklist(self, url_data: UrlData, task: Task):
     """
     Answer a Task asking whether a url has already been searched by
     another Task, registering the url when it has not been seen before.

     The url is identified by the ``repr`` of its attribute dictionary.

     Args:
         url_data: the url to check
         task: the requesting Task; the reply is put into its ``msg_box``
             with ``data["added"]`` set to True when the url was new and
             has just been added, False when it was already listed
     """
     # Build the lookup key once and reuse it for the test and the insert.
     key = repr(url_data.__dict__)
     already_listed = key in self.blacklist
     if not already_listed:
         self.blacklist[key] = 1
     task.msg_box.put(
         Message(thread=const.THREAD_COMMANDER,
                 event=const.EVENT_BLACKLIST,
                 data={"added": not already_listed}))
Пример #14
0
 def do_POST(self):
     """
     Handle POST requests; only the "/set-html" path is served.

     For "/set-html" the request body is decoded with ``_decode_base64``,
     parsed as JSON and turned into an html document by
     ``_create_document``. The document and the endpoint url are posted on
     the handler's queue as an EVENT_SERVER_READY message and the request
     is answered with 200. Any other path is answered with 404.
     """
     if self.path != "/set-html":
         self.send_response(404)
         self.end_headers()
         return

     # NOTE(review): _set_headers() is not visible here; if it already
     # writes a status line, the send_response(200) below repeats it - confirm.
     self._set_headers()
     body_size = int(self.headers["Content-Length"])
     raw_body = self.rfile.read(body_size)
     payload = json.loads(_decode_base64(raw_body))
     document = _create_document(payload)
     endpoint = f"http://{_ServerHandler.host}:{_ServerHandler.port}/set-html"
     ready = Message(thread=const.THREAD_SERVER,
                     event=const.EVENT_SERVER_READY,
                     status=const.STATUS_OK,
                     data={"html": document, "url": endpoint})
     _ServerHandler.queue.put_nowait(ready)
     self.send_response(200)
     self.end_headers()
Пример #15
0
 def add_url(self, url_data: UrlData) -> bool:
     """
     Ask the parent process whether this url is already on its blacklist.

     An EVENT_BLACKLIST request is put on the comm queue and the call then
     blocks until a reply arrives in ``self.msg_box``.

     Args:
         url_data: the url to query

     Returns:
         The "added" value of the reply (per the original note: True when
         no entry was found and the parent added the url).
     """
     query = {
         "index": self.task_index,
         "urldata": url_data,
         "url": url_data.url
     }
     request = Message(thread=const.THREAD_TASK,
                       event=const.EVENT_BLACKLIST,
                       data=query,
                       id=self.task_index,
                       status=const.STATUS_OK)
     self.comm_queue.put(request)
     answer = self.msg_box.get()
     return answer.data["added"]
Пример #16
0
    def _follow_url(self, url_data: UrlData, cookie_jar: CookieJar) -> dict:
        """
        Request a url and search the response for further urls.

        Any ``Exception`` raised while requesting or searching (including the
        one raised here when the task has been cancelled) is reported on the
        comm queue as an EVENT_DOWNLOAD_IMAGE message with STATUS_ERROR and
        is not re-raised.

        Args:
            url_data: UrlData object describing the url to request
            cookie_jar: CookieJar object passed on to the request

        Returns:
            The dictionary returned by ``search_response``, or an empty
            dictionary when the request failed or the task was cancelled.
        """
        # Hang the thread if Pause flag is set
        if self.pause.is_set():
            self.wait()
        urls = {}
        try:
            self._network_io.set()
            try:
                response = request_from_url(url_data, cookie_jar,
                                            self.settings)
            finally:
                # always drop the network flag, even when the request raises
                self._network_io.clear()
            try:
                if self.cancel.is_set():
                    raise Exception("Task has been Cancelled")
                urls = self.search_response(
                    response, self.settings["form_search"]["enabled"])
            finally:
                # release the connection on every path, including errors
                response.close()
        except Exception as err:
            self.comm_queue.put_nowait(
                Message(thread=const.THREAD_TASK,
                        event=const.EVENT_DOWNLOAD_IMAGE,
                        data={
                            "url": url_data.url,
                            "message": str(err)
                        },
                        id=self.task_index,
                        status=const.STATUS_ERROR))
        # Returned after the try statement rather than from a ``finally``
        # clause: a return inside ``finally`` would silently discard
        # KeyboardInterrupt/SystemExit and any error raised by the handler.
        return urls
Пример #17
0
 def message_start(self):
     """Post an EVENT_START message on the main queue."""
     started = Message(thread=const.THREAD_COMMANDER,
                       event=const.EVENT_START)
     self.main_queue.put_nowait(started)
Пример #18
0
 def message_quit(self):
     """Post an EVENT_QUIT message on the main queue using a blocking put."""
     quit_notice = Message(thread=const.THREAD_COMMANDER,
                           event=const.EVENT_QUIT)
     self.main_queue.put(quit_notice)
Пример #19
0
 def message_main(self, message: str):
     """
     Post a text message (EVENT_MESSAGE, STATUS_OK) on the main queue.

     Args:
         message: the text, sent under the "message" key
     """
     note = Message(thread=const.THREAD_COMMANDER,
                    event=const.EVENT_MESSAGE,
                    status=const.STATUS_OK,
                    data={"message": message})
     self.main_queue.put_nowait(note)
Пример #20
0
 def stop_tasks(self):
     """
     Send EVENT_CANCEL followed by EVENT_FETCH_CANCEL to the commander queue.
     """
     for cancel_event in (const.EVENT_CANCEL, const.EVENT_FETCH_CANCEL):
         self.app.commander.queue.put_nowait(
             Message(thread=const.THREAD_MAIN, event=cancel_event))
Пример #21
0
def download_image(filename: str, response: Response,
                   settings: dict) -> Message:
    """
    Validate a downloaded image and save it to disk when it qualifies.

    The image is written only when both of its dimensions are strictly
    greater than ``settings["minimum_image_resolution"]`` and
    ``options.image_exists`` reports no byte duplicate in the save path. When
    a file with the same name already exists, ``settings["file_exists"]``
    decides what happens: "rename" saves under the path returned by
    ``options.rename_file``, "skip" leaves the existing file untouched, and
    any other value writes to the existing path.

    Args:
        filename: the name of the file to be saved to. This may change
            depending on same filename
        response: Response object returned from requests
        settings: Settings json object used to find unique save path and
            what to do if same filename found

    Returns:
        message (object): Message object containing information about what
        happened. This should be passed onto the Commander thread and
        handled. It is the result of ``stream_to_file`` when a write was
        attempted, otherwise a STATUS_IGNORED message explaining why
        nothing was written.

    Raises:
        Whatever ``Image.open`` raises for data it cannot read; the stream
        is closed before the exception propagates.
    """
    message = Message(thread=const.THREAD_TASK,
                      id=0,
                      status=const.STATUS_IGNORED,
                      event=const.EVENT_DOWNLOAD_IMAGE,
                      data={
                          "message": "unknown",
                          "url": response.url
                      })

    byte_stream = _response_to_stream(response)
    try:
        # check the image size is within our bounds
        minsize = settings["minimum_image_resolution"]
        image = Image.open(byte_stream)
        width, height = image.size
        if width > minsize["width"] and height > minsize["height"]:
            # create a new save path
            path = create_save_path(settings)
            full_path = os.path.join(path, filename)
            # check byte duplicate
            _duplicate = options.image_exists(path, response.content)
            if _duplicate:
                message.data[
                    "message"] = f"Bytes duplicate found locally with url {response.url} and {_duplicate}"
            elif not os.path.exists(full_path):
                # no name clash. write image to disk
                message = stream_to_file(full_path, byte_stream)
            elif settings["file_exists"] == "skip":
                # Leave the existing file alone: do NOT fall through to
                # stream_to_file, which opens the path for writing.
                message.data["message"] = "Skipped file"
                message.data["path"] = full_path
            else:
                if settings["file_exists"] == "rename":
                    # save under a fresh name; the message returned by
                    # stream_to_file carries the renamed path
                    full_path = options.rename_file(full_path)
                message = stream_to_file(full_path, byte_stream)
        else:
            # add the URl to the cache
            if not cache.query_ignore(response.url):
                cache.add_ignore(response.url, "small-image", width, height)
            message.data["message"] = f"Image too small ({width}x{height})"
    finally:
        # close the stream on every path, including when Image.open raises
        byte_stream.close()

    return message
Пример #22
0
 def notify_cancel(self):
     """Post an EVENT_CANCEL message on the main queue."""
     cancelled = Message(thread=const.THREAD_COMMANDER,
                         event=const.EVENT_CANCEL)
     self.main_queue.put_nowait(cancelled)
Пример #23
0
 def message_pause(self, pause: bool):
     """
     Post an EVENT_PAUSE message on the main queue.

     Args:
         pause: sent under the "pause" key
     """
     pause_notice = Message(thread=const.THREAD_COMMANDER,
                            event=const.EVENT_PAUSE,
                            data={"pause": pause})
     self.main_queue.put_nowait(pause_notice)
Пример #24
0
 def message_complete(self):
     """Post an EVENT_COMPLETE message on the main queue."""
     completed = Message(thread=const.THREAD_COMMANDER,
                         event=const.EVENT_COMPLETE)
     self.main_queue.put_nowait(completed)
Пример #25
0
 def start_tasks(self):
     """Send an EVENT_START request to the commander queue."""
     enqueue = self.app.commander.queue.put_nowait
     enqueue(Message(thread=const.THREAD_MAIN, event=const.EVENT_START))
Пример #26
0
 def pause_tasks(self):
     """Send an EVENT_PAUSE request to the commander queue."""
     enqueue = self.app.commander.queue.put_nowait
     enqueue(Message(thread=const.THREAD_MAIN, event=const.EVENT_PAUSE))
Пример #27
0
 def search_response(self, response: Response, include_forms: bool) -> dict:
     """
     Inspect a response and either collect its links or save it as an image.

     An EVENT_SEARCHING message is posted first. What happens next depends
     on the type reported by ``mime.is_valid_content_type``:

     * html - the text is parsed and every truthy entry yielded by
       ``parsing.sort_soup`` is returned, keyed by its position
     * image - a filename is chosen (generated from the task and file
       counters when ``settings["generate_filenames"]`` is enabled,
       otherwise derived from the url) and the image is handed to
       ``download_image``; the resulting message is posted
     * anything else - the url is added to the ignore cache and reported
       once as "Unknown File Type"

     Args:
         response: the Response to inspect
         include_forms: forwarded to ``parsing.sort_soup``

     Returns:
         The urls found in an html response, otherwise an empty dict.
     """
     self.comm_queue.put_nowait(
         Message(thread=const.THREAD_TASK,
                 event=const.EVENT_SEARCHING,
                 id=self.task_index,
                 status=const.STATUS_OK,
                 data={}))
     # work out what kind of content came back
     ext = mime.is_valid_content_type(
         response.url, response.headers.get("Content-Type", ""),
         self.settings["images_to_search"])

     if mime.EXT_HTML == ext:
         # html document: parse the text and gather the links inside it
         soup = parsing.parse_html(response.text)
         found = parsing.sort_soup(
             url=response.url,
             soup=soup,
             include_forms=include_forms,
             images_only=True,
             thumbnails_only=False,
             filters=self.filters,
             img_exts=self.settings["images_to_search"])
         # positions of falsy entries are skipped, so keys may have gaps
         return {
             position: link
             for position, link in enumerate(found) if link
         }

     if ext in mime.IMAGE_EXTS:
         naming = self.settings["generate_filenames"]
         if naming["enabled"]:
             # task index plus a running file index make the name unique
             suffix = f"{self.task_index}_{self.file_index}{ext}"
             # advance the counter for the next image found
             self.file_index += 1
             filename = f'{naming["name"]}{suffix}'
         else:
             # derive the filename from the url
             filename = options.url_to_filename(response.url, ext)
         # check the validity of the image and save
         try:
             result = download_image(filename, response, self.settings)
             result.data["url"] = response.url
             result.id = self.task_index
             self.comm_queue.put_nowait(result)
         except UnidentifiedImageError as err:
             # Couldn't load the Image from Stream
             self.comm_queue.put_nowait(
                 Message(thread=const.THREAD_TASK,
                         id=self.task_index,
                         data={
                             "url": response.url,
                             "message": str(err)
                         },
                         event=const.EVENT_DOWNLOAD_IMAGE,
                         status=const.STATUS_ERROR))
         return {}

     # neither html nor a known image type: remember and report it once
     if not cache.query_ignore(response.url):
         cache.add_ignore(response.url, "unknown-file-type", 0, 0)
         self.comm_queue.put_nowait(
             Message(thread=const.THREAD_TASK,
                     id=self.task_index,
                     data={
                         "url": response.url,
                         "message": "Unknown File Type"
                     },
                     event=const.EVENT_DOWNLOAD_IMAGE,
                     status=const.STATUS_IGNORED))
     return {}