Example 1
    def check_fetch_settings(self) -> bool:
        self.get_and_correct_spin_thread_num()

        if validate_url(self.manga_url.get()) is not True:
            self.log_and_show_error(tr('The URL in the Manga URL field is invalid.'))
            return False

        if not Path(self.cookie_path.get()).exists():
            self.log_and_show_error(tr('The cookies.txt does not exist at the given path.'))
            return False

        if self.proxy_state.get() != 'disabled':
            if self.proxy_host.get().strip() == '':
                self.log_and_show_error(tr('Invalid proxy host name.'))
                return False

            try:
                proxy_port = int(self.proxy_port.get())
                if proxy_port > 65535 or proxy_port < 1:
                    raise ValueError()
            except ValueError:
                self.log_and_show_error(tr('Invalid proxy port.'))
                return False

        return True
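
Every example in this listing relies on a project-level validate_url helper that is not shown. A minimal sketch of what such a helper could look like, assuming a URL only needs a scheme and a host to count as valid (the actual projects may use a stricter check):

from urllib.parse import urlparse

def validate_url(url):
    # Hypothetical stand-in: treat a string as valid when it parses with both a scheme and a host.
    try:
        parsed = urlparse(url)
    except ValueError:
        return False
    return bool(parsed.scheme and parsed.netloc)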
Example 2
def valid_download_html(url, savepath='/Volumes/Mac/GoGuardianHTMLS', ext='txt', validate=False):
    """Validates a url prior to attempting to download
    Parameters:
        url (str): the url name.
    Return:
        tupple: Url (url), downloaded (bool)"""

    # determine if the url is valid.
    old_url = url  # keep the original url for the filename and the return value

    # If validate is true, validate the url and add a protocol if needed.
    if validate:
        valid = validate_url(url)
        # if not valid, prepend an http:// scheme.
        if not valid:
            url = ''.join(['http://', url])

    # try to download it.
    try:
        response = requests.get(url, timeout=(10, 10)).content  # 10-second connect and read timeouts.
    except (Timeout, ReadTimeout, TooManyRedirects, ConnectionError):  # on failure, return the tuple unchanged
        return old_url, False

    # Build the file name

    filename = rename_url(old_url, suffix='')
    filename = '.'.join([filename, ext])
    filename = join(savepath, filename)

    # save the file to disk; response is bytes, so open in binary mode
    with open(filename, 'wb') as f:
        f.write(response)
    return old_url, True
Example 3
    def is_same_site(self, url: str):
        '''Checks whether the domain of the given site matches the domain of this site.
        Returns True or False.
        '''
        if not validate_url(url):
            return False
        if self.domain != urlparse(url).netloc:
            return False
        return True
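
The check above reduces to comparing urlparse netlocs. A self-contained illustration of that comparison (example.com and other.org are placeholder hosts):

from urllib.parse import urlparse

stored_domain = urlparse("http://example.com/docs").netloc          # 'example.com'
print(stored_domain == urlparse("http://example.com/blog").netloc)  # True: same site
print(stored_domain == urlparse("http://other.org/blog").netloc)    # False: different site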
Example 4
def make_url_valid(url):
    """Validates a url, and add an http protocol if not valid.
    Parameters:
        url (str): a url string.
    Return:
        new url(str)
    """
    valid = validate_url(url)
    # if not valid, prepend an http:// scheme.
    if not valid:
        url = ''.join(['http://', url])
    return url
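
A quick usage sketch, assuming the hypothetical validate_url from Example 1 (the exact behaviour depends on what the real helper accepts):

print(make_url_valid("example.com"))          # 'http://example.com'
print(make_url_valid("https://example.com"))  # unchanged, already valid

Note that the helper prepends 'http://' unconditionally when validation fails, so a string that is not a url at all still comes back prefixed and should be re-validated by the caller.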
Example 5
def download_multiple_html_with_pycurl(urls, savepath='/Volumes/Mac/Insight/GoGuardianHTMLS-9-17-2015-b', ext='txt'):
    """Takes a list of urls and downloads and writes a save path.
    Parameters:
        urls (list): list of url strings
        savepath (str): path to save location
        ext (str) extension to the string
    Returns None
    """

    # set up a curl object and set up some options
    curl = pycurl.Curl()
    curl.setopt(pycurl.FOLLOWLOCATION, 1)
    curl.setopt(pycurl.MAXREDIRS, 5)

    # use a for loop to get the urls and download
    for url in urls:
        # place holder for the old url
        old_url = url

        # validate the url and prepend http:// if needed
        valid = validate_url(url)

        # if not valid, prepend an http:// scheme.
        if not valid:
            url = ''.join(['http://', url])

        # set the url
        curl.setopt(pycurl.URL, url)

        # try to download it and save it.
        try:
            # create a new string buffer and point curl's write callback at it.
            b = StringIO.StringIO()
            curl.setopt(pycurl.WRITEFUNCTION, b.write)

            # perform the getting and streaming
            curl.perform()
            response = b.getvalue()
            b.close()

            # handle download
            # Build the file name
            filename = rename_url(old_url, suffix='')
            filename = '.'.join([filename, ext])
            filename = join(savepath, filename)

            # save the file
            with open(filename, 'w') as f:
                f.write(response)
        except Exception:
            # skip urls that fail to download or save
            pass

    curl.close()
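
The loop above uses the Python 2 StringIO module; under Python 3 the same pattern would use io.BytesIO and a binary file mode. A minimal sketch for a single url with the same pycurl options (the function name and structure are illustrative, not from the original project):

import pycurl
from io import BytesIO

def fetch_html_py3(url):
    # Fetch one url with pycurl and return the raw response body as bytes.
    buf = BytesIO()
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, url)
    curl.setopt(pycurl.FOLLOWLOCATION, 1)
    curl.setopt(pycurl.MAXREDIRS, 5)
    curl.setopt(pycurl.WRITEFUNCTION, buf.write)
    curl.perform()
    curl.close()
    return buf.getvalue()  # write to disk with open(path, 'wb')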
Example 6
def _save_url(url, path_name=None):
    '''Stores a validated url on an existing Website resource with a matching domain,
    or on a new Website, then writes the resource back to the resource handler.'''
    if not validate_url(url):
        return
    ws = None
    for _, site in resource_handler.get_resources(section=_WEBSITE_SECTION):
        if site.is_same_site(url=url):
            ws = site
            break
    if not ws:
        ws = Website()
    ws.store_location(url=url, path_name=path_name)
    resources = {ws.domain: ws}
    resource_handler.store_resources(section=_WEBSITE_SECTION, resources=resources)
Example 7
async def parse(msg):
    url = pyperclip.paste()
    print("url = ", url)
    if not validate_url(url):
        return False
    parsed_url = urlparse(url)
    print(parsed_url)
    print()
    ws = Website(url=url, path_name="test")
    print("Stored domain", '\n', ws.domain)
    print("Stored URL", '\n', ws.get_url(path_name="test"))
    search_url = ws.get_query_url()
    if search_url:
        print("Stored search URL", '\n', search_url)

    return True
Example 8
def save_site(msg):
    '''If the clipboard contains a url, it will be saved under the name given in msg.data.
    The name in msg.data is parsed as anything given after the word "as".
    If no name is given, the url is parsed and saved according to its domain name.
    '''
    url = pyperclip.paste()
    print("url = ", url)
    if not validate_url(url):
        return False
    path_name = None
    data = distill_msg(msg=msg, sediment="save").data.strip()
    if data:
        try:
            path_name = data.split("as")[1].strip()
        except IndexError:
            # no 'as' found, so path_name cannot be extracted from the given data
            pass
    _save_url(url=url, path_name=path_name)
    return True
Example 9
def valid_download_urls(urls, savepath='/Volumes/Mac/GoGuardianHTMLS', ext='txt', validate=False):
    """Validates a url prior to attempting to download
    Parameters:
        urls (list): the url names.
    Return:
        urls (list): the url names."""

    #need old url to match the data row
    old_url = []
    count = 0
    for url in urls:

        old_url.append(url)

        if validate:
            valid = validate_url(url)
            # if not valid, prepend an http:// scheme.
            if not valid:
                urls[count] = ''.join(['http://', url])

        count += 1

    responses = (grequests.get(url, timeout=2) for url in urls)

    #reset array index
    count = 0
    for response in grequests.map(responses):
        if response is not None:
            filename = rename_url(old_url[count], suffix='')
            filename = '.'.join([filename, ext])
            filename = join(savepath, filename)

            # save the file to disk; response.content is bytes, so open in binary mode
            with open(filename, 'wb') as f:
                f.write(response.content)

        count += 1

    return None
Example 10
    def store_location(self, url: str, path_name: str = None):
        '''Stores the given URL under the given name.
        If no name is given, only the domain is stored, along with the query pattern if it is present in the url.
        '''
        if not validate_url(url):
            feedback("Url fails validation")
            raise ValueError
        parsed_url = urlparse(url)
        # store the base (scheme://netloc)
        if not self._base:
            self._base = '://'.join((parsed_url.scheme, parsed_url.netloc))
        # store domain
        if not self.domain:
            self._domain = parsed_url.netloc
        if self.domain not in parsed_url.netloc:
            feedback("Non-matching domain")
            raise ValueError
        # store query pattern
        self._store_query_pattern(parsed_url=parsed_url)
        # store path
        if not path_name:
            return
        self._paths[path_name] = parsed_url.path
Example 11
    def validates_link(self, key, url):
        is_valid = validate_url(url)
        if is_valid is True:
            return url
        raise ValueError('Image URL needs to be a valid URL.')
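
The (self, key, url) signature and the ValueError raised on failure match the shape of an SQLAlchemy @validates hook. A hedged sketch of how such a validator is typically wired; the Image model, its columns, and the inline validate_url stand-in are assumptions, not taken from the original project:

from urllib.parse import urlparse

from sqlalchemy import Column, Integer, String
from sqlalchemy.orm import declarative_base, validates

Base = declarative_base()

def validate_url(url):
    # stand-in for the project's helper (see the sketch under Example 1)
    parsed = urlparse(url)
    return bool(parsed.scheme and parsed.netloc)

class Image(Base):
    # hypothetical model; the class the original method belongs to is not shown
    __tablename__ = 'images'
    id = Column(Integer, primary_key=True)
    link = Column(String)

    @validates('link')
    def validates_link(self, key, url):
        is_valid = validate_url(url)
        if is_valid is True:
            return url
        raise ValueError('Image URL needs to be a valid URL.')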