def check_fetch_settings(self) -> bool:
    self.get_and_correct_spin_thread_num()
    if validate_url(self.manga_url.get()) is not True:
        self.log_and_show_error(tr('The URL in the Manga URL field is invalid.'))
        return False
    if not Path(self.cookie_path.get()).exists():
        self.log_and_show_error(tr('The cookies.txt does not exist at the given path.'))
        return False
    if self.proxy_state.get() != 'disabled':
        if self.proxy_host.get().strip() == '':
            self.log_and_show_error(tr('Invalid proxy host name.'))
            return False
        try:
            proxy_port = int(self.proxy_port.get())
            if proxy_port > 65535 or proxy_port < 1:
                raise ValueError()
        except ValueError:
            self.log_and_show_error(tr('Invalid proxy port.'))
            return False
    return True
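# A minimal sketch of the proxy-port check above, pulled out as a standalone helper so it
# can be unit-tested; the name is_valid_port is hypothetical and not part of the original code.
def is_valid_port(value: str) -> bool:
    """Return True if value parses as a TCP port in the range 1-65535."""
    try:
        port = int(value)
    except ValueError:
        return False
    return 1 <= port <= 65535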
def valid_download_html(url, savepath='/Volumes/Mac/GoGuardianHTMLS', ext='txt', validate=False):
    """Validates a URL prior to attempting to download it.

    Parameters:
        url (str): the URL name.
        savepath (str): directory to save the downloaded file.
        ext (str): extension for the saved file.
        validate (bool): if True, validate the URL and prepend a protocol if needed.

    Returns:
        tuple: (url, downloaded) where downloaded is a bool.
    """
    old_url = url  # keep the original URL so the caller can match it back
    # If validate is True, validate the URL and add a protocol if needed.
    if validate:
        valid = validate_url(url)
        if not valid:
            url = ''.join(['http://', url])
    # Try to download it; allow 10 seconds each to connect and to read.
    try:
        response = requests.get(url, timeout=(10, 10)).content
    except (Timeout, ReadTimeout, TooManyRedirects, ConnectionError):
        # On failure, return the tuple with downloaded=False.
        return old_url, False
    # Build the file name.
    filename = rename_url(old_url, suffix='')
    filename = '.'.join([filename, ext])
    filename = join(savepath, filename)
    # Save the file to disk; .content is bytes, so write in binary mode.
    with open(filename, 'wb') as f:
        f.write(response)
    return old_url, True
def is_same_site(self, url: str):
    '''Checks whether the domain of the given URL matches the domain of this site.

    Returns True or False.
    '''
    if not validate_url(url):
        return False
    if self.domain != urlparse(url).netloc:
        return False
    return True
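# Hypothetical usage sketch, assuming a Website instance whose stored domain is 'example.com'
# (the constructor arguments shown are illustrative, not confirmed by the original code):
#
#     ws = Website(url='https://example.com/docs', path_name='docs')
#     ws.is_same_site('https://example.com/blog/post')   # True  (same netloc)
#     ws.is_same_site('https://other.org/blog/post')     # False (different netloc)
#     ws.is_same_site('not a url')                       # False (fails validate_url)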
def make_url_valid(url):
    """Validates a URL and prepends an http protocol if it is not valid.

    Parameters:
        url (str): a URL string.

    Returns:
        str: the (possibly corrected) URL.
    """
    valid = validate_url(url)
    # If not valid, prepend an http scheme.
    if not valid:
        url = ''.join(['http://', url])
    return url
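# A minimal usage sketch. The real validate_url is defined elsewhere in this codebase; the
# stand-in below (scheme + netloc present) is an assumption used only to make the example run.
from urllib.parse import urlparse

def validate_url(url):
    # Illustrative stand-in: treat a URL as valid if it has both a scheme and a netloc.
    parts = urlparse(url)
    return bool(parts.scheme and parts.netloc)

print(make_url_valid('example.com/page'))      # 'http://example.com/page'
print(make_url_valid('https://example.com'))   # unchanged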
def download_multiple_html_with_pycurl(urls, savepath='/Volumes/Mac/Insight/GoGuardianHTMLS-9-17-2015-b', ext='txt'):
    """Takes a list of URLs, downloads each one, and writes it to the save path.

    Parameters:
        urls (list): list of URL strings.
        savepath (str): path to the save location.
        ext (str): extension for the saved files.

    Returns:
        None
    """
    # Set up a curl object and configure redirect handling.
    curl = pycurl.Curl()
    curl.setopt(pycurl.FOLLOWLOCATION, 1)
    curl.setopt(pycurl.MAXREDIRS, 5)
    # Download each URL in turn.
    for url in urls:
        old_url = url  # keep the original URL for the file name
        # Validate the URL and prepend http:// if needed.
        valid = validate_url(url)
        if not valid:
            url = ''.join(['http://', url])
        curl.setopt(pycurl.URL, url)
        # Try to download and save it.
        try:
            # Create a new buffer and stream the response into it.
            b = StringIO.StringIO()
            curl.setopt(pycurl.WRITEFUNCTION, b.write)
            curl.perform()
            response = b.getvalue()
            b.close()
            # Build the file name.
            filename = rename_url(old_url, suffix='')
            filename = '.'.join([filename, ext])
            filename = join(savepath, filename)
            # Save the file.
            with open(filename, 'w') as f:
                f.write(response)
        except Exception:
            # Skip URLs that fail to download or save.
            pass
    curl.close()
def _save_url(url, path_name=None):
    if not validate_url(url):
        return
    # Reuse an existing Website resource for this domain if one is already stored.
    ws = None
    for _, site in resource_handler.get_resources(section=_WEBSITE_SECTION):
        if site.is_same_site(url=url):
            ws = site
            break
    if not ws:
        ws = Website()
    ws.store_location(url=url, path_name=path_name)
    resources = {ws.domain: ws}
    resource_handler.store_resources(section=_WEBSITE_SECTION, resources=resources)
async def parse(msg):
    url = pyperclip.paste()
    print("url = ", url)
    if not validate_url(url):
        return False
    parsed_url = urlparse(url)
    print(parsed_url)
    print()
    ws = Website(url=url, path_name="test")
    print("Stored domain", '\n', ws.domain)
    print("Stored URL", '\n', ws.get_url(path_name="test"))
    search_url = ws.get_query_url()
    if search_url:
        print("Stored search URL", '\n', search_url)
    return True
def save_site(msg):
    '''If the clipboard contains a URL, it is saved under the name given in msg.data.

    The name in msg.data is parsed as anything following the word "as". If no name
    is given, the URL is parsed and saved under its domain name.
    '''
    url = pyperclip.paste()
    print("url = ", url)
    if not validate_url(url):
        return False
    path_name = None
    data = distill_msg(msg=msg, sediment="save").data.strip()
    if data:
        try:
            path_name = data.split("as")[1].strip()
        except IndexError:
            # No 'as' found, so path_name cannot be extracted from the given data.
            pass
    _save_url(url=url, path_name=path_name)
    return True
def valid_download_urls(urls, savepath='/Volumes/Mac/GoGuardianHTMLS', ext='txt', validate=False):
    """Validates URLs prior to attempting to download them.

    Parameters:
        urls (list): the URL names.
        savepath (str): directory to save the downloaded files.
        ext (str): extension for the saved files.
        validate (bool): if True, validate each URL and prepend a protocol if needed.

    Returns:
        None
    """
    # Keep the original URLs so each response can be matched back to its data row.
    old_url = []
    count = 0
    for url in urls:
        old_url.append(url)
        if validate:
            valid = validate_url(url)
            # If not valid, prepend an http scheme.
            if not valid:
                urls[count] = ''.join(['http://', url])
        count += 1
    responses = (grequests.get(url, timeout=2) for url in urls)
    # Reset the array index; grequests.map preserves request order, so count must
    # advance on every iteration to stay aligned with old_url.
    count = 0
    for response in grequests.map(responses):
        if response is not None:
            # Build the file name from the original URL.
            filename = rename_url(old_url[count], suffix='')
            filename = '.'.join([filename, ext])
            filename = join(savepath, filename)
            # Save the file to disk; .content is bytes, so write in binary mode.
            with open(filename, 'wb') as f:
                f.write(response.content)
        count += 1
    return None
def store_location(self, url: str, path_name: str = None):
    '''Stores the given URL under the given name.

    If no name is given, only the domain is stored, along with the query
    pattern if one is present in the URL.
    '''
    if not validate_url(url):
        feedback("Url fails validation")
        raise ValueError
    parsed_url = urlparse(url)
    # Store the scheme and base address.
    if not self._base:
        self._base = '://'.join((parsed_url.scheme, parsed_url.netloc))
    # Store the domain.
    if not self.domain:
        self._domain = parsed_url.netloc
    if self.domain not in parsed_url.netloc:
        feedback("Non-matching domain")
        raise ValueError
    # Store the query pattern.
    self._store_query_pattern(parsed_url=parsed_url)
    # Store the path.
    if not path_name:
        return
    self._paths[path_name] = parsed_url.path
def validates_link(self, key, url):
    is_valid = validate_url(url)
    if is_valid is True:
        return url
    raise ValueError('Image URL needs to be a valid URL.')
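# The (self, key, url) signature matches SQLAlchemy's @validates hook. If that is the context,
# a minimal sketch of how this validator might sit on a model looks like the following; the
# Image model and its link column are assumptions, and validate_url is assumed to be imported
# from elsewhere in the codebase.
from sqlalchemy import Column, Integer, String
from sqlalchemy.orm import declarative_base, validates

Base = declarative_base()

class Image(Base):
    __tablename__ = 'images'

    id = Column(Integer, primary_key=True)
    link = Column(String, nullable=False)

    @validates('link')
    def validates_link(self, key, url):
        # Reject any value that does not pass URL validation before it reaches the database.
        if validate_url(url) is True:
            return url
        raise ValueError('Image URL needs to be a valid URL.')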