def config(fname):
    print "Trying to load configuration from %s" % fname
    cp = ConfigParser.SafeConfigParser()
    try:
        with open(fname, "r") as fp:
            cp.readfp(fp)
    except IOError as ie:
        if ie.errno == errno.ENOENT:
            return
        raise
    for section in cp.sections():
        if not section.lower().startswith("issuer "):
            continue
        if 'issuer' not in cp.options(section):
            print "Ignoring section %s as it has no `issuer` option set." % section
            continue
        if 'base_path' not in cp.options(section):
            print "Ignoring section %s as it has no `base_path` option set." % section
            continue
        issuer = cp.get(section, 'issuer')
        base_path = cp.get(section, 'base_path')
        base_path = urltools.normalize(base_path)
        issuer_info = g_authorized_issuers.setdefault(issuer, {})
        issuer_info['base_path'] = base_path
        if 'map_subject' in cp.options(section):
            issuer_info['map_subject'] = cp.getboolean(section, 'map_subject')
        print "Configured token access for %s (issuer %s): %s" % (
            section, issuer, str(issuer_info))
def parseLink(url):  # be aware example.com is malformed
    arr = []
    baseUrl = base(url)
    page = getPage(url)
    if (page is not None):
        soup = BeautifulSoup(page, 'html.parser')
        for x in soup.find_all('a'):
            link = x.get('href')
            if (link is not None and link[0:4] == "http"):
                arr.append(link)
            elif (link is not None and len(link) >= 1 and link[0] == "/"):
                arr.append(baseUrl + link)
            elif (link is not None and link[0:4] == "www."):
                arr.append("http://" + link)
        arr2 = [urltools.normalize(x) for x in arr]
        arr3 = [transform(x) for x in arr2 if checkLinkStr(x)]
        terms = clearHtml(page)
        if (terms == None):
            return None
        return {
            'url': url,
            'html': page,
            'links': arr3,
            'terms': terms,
            'title': soup.title.text if (soup.title is not None) else ""
        }
    return None
def normalize(url, parent):
    try:
        if '#' in url:
            url = url.split('#')[0]
        if url.startswith('http') and '//' not in url:
            url = url.lstrip('http:')
        if url:
            url = urlparse.urljoin(parent, url)
            url = urltools.normalize(url)
            if url.startswith('https'):
                url = url.replace('https', 'http')
            if re.match(r'http:/', url):
                url = url.replace('https', 'http')
            if url.endswith('jpg') or url.endswith('png') or url.endswith('jpeg'):
                url = None
            url = url.rstrip('/')
    except:
        url = None
    return url
def urlchecker(self, url):
    if url is None:
        return False
    normalized_url = urltools.normalize(url)
    robotparser = urllib.robotparser.RobotFileParser()
    try:
        url_comp = urlparse(normalized_url)
        base_url = url_comp.scheme + "://" + url_comp.netloc + "/"
    except:
        self.logger.error("Cannot parse: " + url)
    try:
        robotparser.set_url(base_url + "robots.txt")
        robotparser.read()
        if not robotparser.can_fetch("*", normalized_url):
            self.logger.error(url + " is excluded due to protocol")
            return False
    except:
        self.logger.error("Cannot determine robots exclusion protocol: " + url)
    if normalized_url in self.visited_urls:
        self.logger.debug(url + " Has been visited before! ")
        return False
    elif base_url in self.sites_times and self.sites_times[base_url] > int(self.limit):
        # self.logger.debug(url + " Times visiting this site have reach the limit ")
        return False
    elif 'cgi' in normalized_url:
        return False
    else:
        return True
def build_url(link):
    """
    Create valid URL from link.

    :param link: `str` relative url.
    :returns: `str` absolute url.
    """
    return urltools.normalize(''.join((constants.HOST, link)))
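# Illustrative usage of build_url, assuming (hypothetically) that
# constants.HOST is "http://example.com"; dot-segment handling follows the
# urltools.normalize behaviour exercised in the tests further down.
print(build_url("/a/../b"))   # expected: http://example.com/b
print(build_url("/foo/"))     # expected: http://example.com/foo/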
def save(self, *args, **kwargs):
    """
    Normalises the URL and generates the correct unique id

    :type args: []
    :type kwargs: {}
    """
    self.url = urltools.normalize(self.url)
    super(ShortUrl, self).save(*args, **kwargs)
def canonicalize_url(url):
    # Canonicalize URL
    url = urltools.normalize(url)
    # Remove fragment
    url = urllib.parse.urldefrag(url).url
    return url
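# Illustrative call, assuming urltools lower-cases the host, resolves
# dot-segments and sorts query keys as in the test cases below; urldefrag
# then drops the fragment.
print(canonicalize_url("http://ExAMPLe.COM/a/../b?y=2&x=1#frag"))
# expected: http://example.com/b?x=1&y=2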
def validate_path(self, values):
    if isinstance(values, str) or isinstance(values, unicode):
        values = [values]
    for value in values:
        if not value.startswith("/"):
            return False
        self.paths.add(urltools.normalize(value))
    return True
def scrape_url(root_url, url):
    url = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(url, 'html.parser')
    link_tags = soup.find_all('a')
    result = deque()
    for link in link_tags:
        current_link = link['href']
        if current_link.startswith(root_url):
            result.append(normalize(current_link))
    return result
def _standardize_url(url):
    """Takes in a url and returns a clean, consistent format. For example:
    example.com, http://example.com, example.com/ all are http://example.com/
    Returns None if the url is somehow invalid."""
    parts = parse.urlparse(url, "http")  # default scheme is http if omitted
    standard = parts.geturl()
    standard = urltools.normalize(standard)
    if not url_regex.match(standard):
        return None
    return standard
def urlchecker(url, limit):
    normalized_url = urltools.normalize(url)
    url_comp = urlparse(url)
    if visited_urls.has_key(url):
        return False
    elif times_visiting_site[url_comp.netloc] > limit:
        visited_urls[url] = True
        return False
    else:
        return True
def urlnorm(url):
    u = urllib.parse.urlparse(urltools.normalize(url))
    path = u.path
    if len(path) > 0:
        if path[-1] == '/':
            path = path[:-1]
    v = (u.scheme, u.netloc, path, u.params, '', '')
    return urllib.parse.urlunparse(v)
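# Illustrative call: urltools.normalize cleans the URL, one trailing slash is
# stripped from the path, and urlunparse rebuilds the result with the query
# string and fragment deliberately dropped (exact output depends on the
# installed urltools version).
print(urlnorm("HTTP://Example.com/foo/?x=1#frag"))
# expected: http://example.com/foo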
def extract_urls(self, r):
    urls = set()
    tree = lxml.html.fromstring(r.text)
    for element, attribute, link, pos in tree.iterlinks():
        url = urltools.normalize(urljoin(r.url, link))
        urls.add(url)
    # self.stats['urls'] += len(urls)
    self.stats['processed'] += 1
    return urls
def __init__(self, website, site_ctx=None, debug=False):
    self.uri = urltools.normalize(website)
    self.parsed = urltools.parse(website)
    self.domain = ".".join(self.parsed[4:6]).lstrip("www.")
    self.robots = None
    self.sitemap = None  # list of documents
    self.error = {}
    self.debug = debug
    self.__session = None
    self.load_domain_state()
def pick():
    global timeout
    pair = sortedList.pop()
    if doLiveCheck:
        while (time.time() < 3 + timeout):  # This should be moved to fetch
            pass
        timeout = time.time()
        links = parseLink(wikiPrefix + pair[1])
        for link in links:
            if pair[2] in urltools.normalize(link):
                return pick()
    return pair
def _normalize_url(url):
    if url.startswith("//"):
        url = "http:" + url
    parsed_url = urlparse.urlparse(url)
    url_path = parsed_url.path
    url_path = urllib.quote(url_path, safe="%/:=&?~#+!$,;'@()*[]")
    url = urlparse.urlunparse(
        (parsed_url.scheme, parsed_url.netloc, url_path, "", "", ""))
    url = urltools.normalize(url)
    return url
def fetch(self, method, endpoint, params):
    api_endpoint = normalize(self.api_base + endpoint)
    if method.lower() in ["get", "delete"]:
        content = self.oauth.request(
            method,
            api_endpoint,
            params=params,
            headers={"User-Agent": "Semantics3 Python Lib/0.2"}
        )
    else:
        content = self.oauth.request(
            method,
            api_endpoint,
            data=json.dumps(params),
            headers={"User-Agent": "Semantics3 Python Lib/0.2",
                     "Content-Type": "application/json"},
        )
    return content
def crawl(self):
    try:
        harvest_rate_accum = 0
        while self.webpages_crawled < int(self.webpages_limit):
            print(self.webpages_crawled)
            try:
                url = self.priority_queue.pop()
            except Exception:
                print("cannot pop")
                print(url)
            if self.urlchecker(url):
                try:
                    content = self.downloader.download(url).decode('utf-8')
                    if content is not None:
                        self.webpages_crawled += 1
                        rel = self.relevance.relevance(content, self.query)
                        harvest_rate_accum += rel
                        self.crawled_log(" Harvest rate: " +
                                         str(harvest_rate_accum / self.webpages_crawled))
                except:
                    print("Failed in downloading")
                normalized_url = urltools.normalize(url)
                try:
                    url_comp = urlparse(normalized_url)
                    base_url = url_comp.scheme + "://" + url_comp.netloc + "/"
                except:
                    self.logger.error("Cannot parse: " + url)
                if base_url in self.sites_times:
                    self.sites_times[base_url] += 1
                else:
                    self.sites_times[base_url] = 1
                self.visited_urls.add(normalized_url)
                if rel < 0.2:
                    continue
                for link in self.parser.extract_all_links(content):
                    full_link = self.parser.parse_links(url, link)
                    if full_link is not None:
                        link_promise = self.calculator.link_promise(full_link) + rel
                        try:
                            self.priority_queue.additem(full_link, link_promise)
                        except:
                            pass
    except KeyError:
        print("Queue is empty now")
def register_url():
    url_param = request.args.get('url')
    if not url_param:
        return make_response("url param is missing", 400)  # bad request

    # TODO: advanced input validation
    # https://validators.readthedocs.io/en/latest/#module-validators.url
    # https://github.com/django/django/blob/master/django/core/validators.py#L74
    clean_url = urltools.normalize(url_param)

    # create page
    page = store.create_page(clean_url)
    if page:
        return make_response(str(page.id), 201)  # Created
    return jsonify({'status': 'Url already exist'})
def parseLink(url):  # be aware example.com is malformed
    arr = []
    baseUrl = base(url)
    page = getPage(url)
    if (page is not None):
        soup = BeautifulSoup(page, 'html.parser')
        for x in soup.find_all('a'):
            link = x.get('href')
            if (link is not None and link[0:4] == "http"):
                arr.append(link)
            elif (link is not None and len(link) >= 1 and link[0] == "/"):
                arr.append(baseUrl + link)
            elif (link is not None and link[0:4] == "www."):
                arr.append("http://" + link)
        arr2 = [urltools.normalize(x) for x in arr]
        arr3 = [transform(x) for x in arr2]
        return arr3
    return None
def canonicalize_url(url, keep_params=False, keep_fragments=False):
    """Canonicalize the given url by applying the following procedures:

    - sort query arguments, first by key, then by value
    - percent encode paths and query arguments. non-ASCII characters are
      percent-encoded using UTF-8 (RFC-3986)
    - normalize all spaces (in query arguments) '+' (plus symbol)
    - normalize percent encodings case (%2f -> %2F)
    - remove query arguments with blank values (unless site in NONCANONIC_SITES)
    - remove fragments (unless #!)
    - remove username/password at front of domain
    - remove port if 80, keep if not
    - remove query arguments (unless site in USEFUL_QUERY_KEYS)

    The url passed can be a str or unicode, while the url returned is always a str.
    """
    if keep_params:
        # Preserve all query params
        parsed = extract(norm(url))
    else:
        # Remove unwanted params
        parsed = extract(url_query_cleaner(normalize(url),
                                           parameterlist=config.USEFUL_QUERY_KEYS))
    # Sort params, remove blank if not wanted
    query = urllib.urlencode(sorted(urlparse.parse_qsl(parsed.query,
                                                       keep_blank_values=keep_params)))
    fragment = getFragment(url, keep_fragments)
    # The following is to remove orphaned '=' from query string params with no values
    query = re.sub(r"=$", "", query.replace("=&", "&"))
    # Reconstruct URL, escaping apart from safe chars
    # See http://stackoverflow.com/questions/2849756/list-of-valid-characters-for-the-fragment-identifier-in-an-url
    # http://stackoverflow.com/questions/4669692/valid-characters-for-directory-part-of-a-url-for-short-links
    safe = "/.-_~!$&'()*+,;=:@"
    newurl = construct(URL(parsed.scheme, '', '', parsed.subdomain, parsed.domain,
                           parsed.tld, parsed.port, quote(parsed.path, safe=safe),
                           query, quote(fragment, safe=safe), ''))
    return newurl.rstrip('/')
def _main(args):
    synsets = {}
    lines_read = 0
    urldict = {}
    dup_count = 0

    #urllist_file = codecs.open('../fall11_urls.txt',
    #                           errors='ignore',
    #                           encoding='utf-8')
    urllist_file = open(args.url_file, 'r', encoding="latin-1")
    for line in urllist_file:
        #line = repr(line)
        lines_read += 1
        wnid, url = re.split('\s+', line, maxsplit=1)
        url = url.strip()
        url = url.strip('\n')
        url_norm = ut.normalize(url)
        if args.normalized and (url != url_norm):
            print('NORMALIZED URL:')
            print('  original:  ', url)
            print('  normalized:', url_norm)
        if url_norm not in urldict:
            urldict[url_norm] = line
        else:
            dup_count += 1
            print('DUPLICATE URL:')
            print('  ', urldict[url_norm])
            print('  ', line)
    print(dup_count, 'duplicate URLs found')
    exit()
def test_normalize__malformed():
    assert normalize('http://example.com/?foo') == 'http://example.com/'
    assert normalize('http://example.com?foo') == 'http://example.com/'
    assert normalize('http://example.com/foo//bar') == 'http://example.com/foo/bar'
    assert normalize('http://example.com?') == 'http://example.com/'
    assert normalize('http://example.com/?') == 'http://example.com/'
    assert normalize('http://example.com//?') == 'http://example.com/'
    assert normalize('http://example.com/foo/?http://example.com/bar/?x=http://examle.com/y/z') == 'http://example.com/foo/?http://example.com/bar/?x=http://examle.com/y/z'
    assert normalize('http://example.com/#foo?bar') == 'http://example.com/#foo?bar'
    assert normalize('http://example.com/#foo/bar/blub.html?x=1') == 'http://example.com/#foo/bar/blub.html?x=1'
    assert normalize('http://example.com/foo#?=bar') == 'http://example.com/foo#?=bar'
    assert normalize('http://example.com/foo/bar/http://example.com') == 'http://example.com/foo/bar/http:/example.com'
def test_normalize__ip6():
    assert normalize('[::1]') == '[::1]'
    assert normalize('http://[::1]') == 'http://[::1]/'
    assert normalize('[::1]:8080') == '[::1]:8080'
    assert normalize('http://[::1]:8080') == 'http://[::1]:8080/'
def test_normalize(): assert normalize("http://example.com") == "http://example.com/" assert normalize("http://example.com/") == "http://example.com/" assert normalize("https://example.com/") == "https://example.com/" assert normalize("hTTp://example.com/") == "http://example.com/" assert normalize("http://ExAMPLe.COM/") == "http://example.com/" assert normalize("http://example.com./") == "http://example.com/" assert normalize("http://example.com:80/") == "http://example.com/" assert normalize("http://example.com:/") == "http://example.com/" assert normalize("http://example.com/#") == "http://example.com/" assert normalize("http://example.com:8080/") == "http://example.com:8080/" assert normalize("http://www.example.com/") == "http://www.example.com/" assert normalize("http://www.example.com") == "http://www.example.com/" assert normalize("http://foo.bar.example.com/") == "http://foo.bar.example.com/" assert normalize("http://example.com/a") == "http://example.com/a" assert normalize("http://example.com/a/b/c") == "http://example.com/a/b/c" assert normalize("http://example.com/?x=1") == "http://example.com/?x=1" assert normalize("http://example.com/a?x=1") == "http://example.com/a?x=1" assert normalize("http://example.com/a?x=1&y=2") == "http://example.com/a?x=1&y=2" assert normalize("http://example.com/#abc") == "http://example.com/#abc" assert normalize("http://example.com/a/b/c#abc") == "http://example.com/a/b/c#abc" assert normalize("http://example.com/a/b/c?x=1#abc") == "http://example.com/a/b/c?x=1#abc" assert normalize("http://example.com/a/./b/././c") == "http://example.com/a/b/c" assert normalize("http://example.com/a/../b") == "http://example.com/b" assert normalize("eXAmplE.com") == "example.com" assert normalize("example.com/a/../b") == "example.com/b" assert normalize("http://www.example.com") == "http://www.example.com/" assert normalize("www.example.com") == "www.example.com"
if len(sys.argv) <= 3:
    print "arguments illegal: " + str(sys.argv)
    sys.exit(1)

html_file = sys.argv[1]
url = sys.argv[2]
suffix = sys.argv[3]
html_path = url.rsplit('/', 1)[0] + '/'
html_text = open(html_file, 'r').read()
soup = BeautifulSoup(html_text, "html.parser")
contents = set()
for link in soup.findAll('a'):
    content = link.get('href')
    if content is None:
        continue
    if suffix not in content:
        continue
    if not is_absolute(content):
        content = html_path + content
    content = urltools.normalize(content)
    if content in contents:
        continue
    print content
    contents.add(content)
def test_normalize__ip4():
    assert normalize('http://192.168.1.1/') == 'http://192.168.1.1/'
    assert normalize('http://192.168.1.1:8088/foo?x=1') == 'http://192.168.1.1:8088/foo?x=1'
    assert normalize('192.168.1.1') == '192.168.1.1'
    assert normalize('192.168.1.1:8080/foo/bar') == '192.168.1.1:8080/foo/bar'
def url(self, page):
    url_pv = pv.PV(self.prefix + ":URL:" + page)
    url = url_pv.get()
    url_pv.disconnect()
    return urltools.normalize(str(url))
def test_normalize__no_scheme():
    assert normalize('eXAmplE.com') == 'example.com'
    assert normalize('example.com/a/../b') == 'example.com/b'
    assert normalize('www.example.com') == 'www.example.com'
def test_normalize():
    assert normalize('') == ''
    assert normalize('http://example.com') == 'http://example.com/'
    assert normalize('http://example.com/') == 'http://example.com/'
    assert normalize(' http://example.com/ ') == 'http://example.com/'
    assert normalize('https://example.com/') == 'https://example.com/'
    assert normalize('hTTp://example.com/') == 'http://example.com/'
    assert normalize('http://ExAMPLe.COM/') == 'http://example.com/'
    assert normalize('http://example.com./') == 'http://example.com/'
    assert normalize('http://example.com:/') == 'http://example.com/'
    assert normalize('http://example.com/#') == 'http://example.com/'

    # subdomain
    assert normalize('http://www.example.com/') == 'http://www.example.com/'
    assert normalize('http://www.example.com') == 'http://www.example.com/'
    assert normalize('http://foo.bar.example.com/') == 'http://foo.bar.example.com/'

    # port
    assert normalize('http://example.com:80/') == 'http://example.com/'
    assert normalize('https://example.com:443/') == 'https://example.com/'
    assert normalize('ws://example.com:80/') == 'ws://example.com/'
    assert normalize('http://example.com:8080/') == 'http://example.com:8080/'

    # path
    assert normalize('http://example.com/a') == 'http://example.com/a'
    assert normalize('http://example.com/a/b/c') == 'http://example.com/a/b/c'
    assert normalize('http://example.com/foo/') == 'http://example.com/foo/'
    assert normalize('http://example.com/a/./b/././c') == 'http://example.com/a/b/c'
    assert normalize('http://example.com/a/../b') == 'http://example.com/b'
    assert normalize('http://example.com/./b') == 'http://example.com/b'
    assert normalize('http://example.com/../b') == 'http://example.com/b'
    assert normalize('http://example.com/////////foo') == 'http://example.com/foo'
    assert normalize('http://example.com/foo/.../bar') == 'http://example.com/foo/.../bar'
    assert normalize('http://example.com/foo+bar') == 'http://example.com/foo+bar'
    assert normalize('http://example.com/.') == 'http://example.com/'
    assert normalize('http://example.com/..') == 'http://example.com/'
    assert normalize('http://example.com/./') == 'http://example.com/'
    assert normalize('http://example.com/../') == 'http://example.com/'
    assert normalize('http://example.com/a/..') == 'http://example.com/'
    assert normalize('http://example.com/a/../') == 'http://example.com/'

    # encoded path
    assert normalize('http://example.com/%25%32%35') == 'http://example.com/%25'
    assert normalize('http://example.com/foo%25%32%35bar') == 'http://example.com/foo%25bar'
    assert normalize('http://example.com/foo/%25%32%35/bar') == 'http://example.com/foo/%25/bar'
    assert normalize('http://example.com/%7Efoo') == 'http://example.com/~foo'
    assert normalize('http://example.com/foo%23bar') == 'http://example.com/foo%23bar'  # %23 = #

    # query
    assert normalize('http://example.com/?x=1') == 'http://example.com/?x=1'
    assert normalize('http://example.com?x=1') == 'http://example.com/?x=1'
    assert normalize('http://example.com/a?x=1') == 'http://example.com/a?x=1'
    assert normalize('http://example.com/a/?x=1') == 'http://example.com/a/?x=1'
    assert normalize('http://example.com/a?x=1&y=2') == 'http://example.com/a?x=1&y=2'
    assert normalize('http://example.com/a?y=2&x=1') == 'http://example.com/a?x=1&y=2'
    assert normalize('http://example.com/a?x=&y=2') == 'http://example.com/a?y=2'

    # fragment
    assert normalize('http://example.com/#abc') == 'http://example.com/#abc'
    assert normalize('http://example.com/a/b/c#abc') == 'http://example.com/a/b/c#abc'
    assert normalize('http://example.com/a/b/c?x=1#abc') == 'http://example.com/a/b/c?x=1#abc'

    # username/password
    assert normalize('http://*****:*****@example.com') == 'http://*****:*****@example.com/'
    assert normalize('http://*****:*****@exaMPLE.COM/') == 'http://*****:*****@example.com/'

    # scheme without //
    assert normalize('mailto:[email protected]') == 'mailto:[email protected]'
    assert normalize('mailto:[email protected]') == 'mailto:[email protected]'
def test_normalize__idn():
    assert normalize('http://xn--e1afmkfd.xn--p1ai/') == u'http://пример.рф/'
def current_url(self):
    return urltools.normalize(self.browser.current_url)
def _normalize_url(self, url):
    return urltools.normalize(url)
def crawl(original_url):
    tic = time.time()
    parent_list = [original_url]
    url_to_check = get_base_url(original_url)
    print(url_to_check)
    url_to_check = str(url_to_check)
    layer_stop = 1
    layer = 0

    # initializing all the required lists
    visited_all = [original_url]
    visited_current_layer = []
    child_list = []
    child_list_filtered = []
    #columns = ['Link','Parent Link', 'Layer']
    df = pd.DataFrame()

    # Main execution of scrapper
    # looping through layers
    while layer < layer_stop:
        # looping through URLs in parent-list
        for url in parent_list:
            # scraping the children from the parent url
            if href_scrapper(url) != 0:
                child_list = href_scrapper(url)
            for child in child_list:
                if child != None:
                    # if child link is of the form "index.php/blahblah" and parent ends with '/'
                    # ---> "parentlink/index.php/blahblah"
                    if child.startswith('/'):
                        child = str(url) + str(child)
                    if url.endswith('/') and url_to_check not in child:
                        child = str(url) + str(child)
                    # normalize the child links
                    child = urltools.normalize(child)
                    # filtering out based on 1) External 2) Repeating 3) Invalid links
                    if url_to_check in child and child not in visited_all and does_page_exist(child) == 1:
                        child_list_filtered.append(child)
                    # adding everything to visited all
                    if child not in visited_all:
                        child_slash = child + '/'
                        visited_all.append(child)
                        visited_all.append(child_slash)
            # adding the visited and filtered children into the "current visited layer"
            for child_filtered in child_list_filtered:
                visited_current_layer.append(child_filtered)
            # creating a Pandas dataframe to store everything for download
            layer_number = [layer + 1] * len(child_list_filtered)
            parent_of_child = [url] * len(child_list_filtered)
            df_child = pd.DataFrame(child_list_filtered)
            df_parent = pd.DataFrame(parent_of_child)
            df_layer = pd.DataFrame(layer_number)
            df_to_be_added = pd.concat([df_child, df_parent, df_layer], axis=1)
            df = pd.concat([df, df_to_be_added], ignore_index=True, axis=0)
            # emptying the child lists
            child_list = []
            child_list_filtered = []

        # condition to stop filtering
        if not visited_current_layer:
            layer_stop = layer_stop
        else:
            layer_stop += 1

        # child layer is now parent layer
        parent_list = []
        # we only dont add .png, .jpg, .pdf to the new parent layer
        for visited_current in visited_current_layer:
            print(visited_current)
            if (not visited_current.endswith('.png')
                    and not visited_current.endswith('.jpg')
                    and not visited_current.endswith('.pdf')):
                parent_list.append(visited_current)

        # displaying the links in different layers
        #print("Links in LAYER:" + str(layer+1))
        print("No of links = " + str(len(visited_current_layer)))
        #print(visited_current_layer)
        print("\n")
        visited_current_layer = []
        # updating the layer number
        layer += 1

    return df
def _sanitize(url):
    ret = url
    ret = _urlsplit(ret)
    ret = urlunsplit(ret)
    ret = urltools.normalize(ret)
    return ret
def get_urls(self):
    return {
        normalize(get_absolute_url(self.url, link.get("href")))
        for link in self._page.find_all(name="a")
    }
def _main(args):
    ## print(args.image_dir, args.url_file, args.shopping_file, args.dryrun)

    # First step is to read the "shopping list". This is the list of synsets we
    # want to download images for. By convention, if this list is empty we will
    # download all synsets.
    #
    # This file contains one synset per line but the synsets can be in either
    # of two formats:
    #   1) A "synset name" such as "benthos.n.02" or,
    #   2) A "wordnet ID" or "offset" such as "n00004475"
    # These two formats are interchangeable. For every synset name there is an
    # offset and vice versa.
    #
    # The software below figures out which form is used in the files (forms can
    # be mixed within a file).

    # Dictionary of acceptable image file extensions and what we will use as
    # the extension when we save the file locally
    file_ext_whitelist = {
        'jpg': 'jpg',
        'png': 'png',
        'jpeg': 'jpg',
        'JPG': 'jpg',
        'PNG': 'png',
        'JPEG': 'jpg'
    }
    file_ext_gif = {'gif': 'gif', 'GIF': 'gif'}

    synsetdict = {}
    lines = 0
    shoppinglist_file = open(args.shopping_file, 'r', encoding="utf-8")
    for line in shoppinglist_file:
        lines += 1
        line = line.strip()
        line = line.strip('\n')
        if line[0] == 'n' and line[1:2].isnumeric():
            # We have a wordnet ID
            wnid = line
            pos = line[0]
            offset = int(line[1:])
            ss = wn.synset_from_pos_and_offset(pos, offset)
            synsetdict[offset] = ss
        elif line[0:2].isalpha:
            # We have a synset name
            ss = wn.synset(line)
            offset = int(ss.offset())
            synsetdict[offset] = ss
        else:
            # We can't figure out what is in the file
            print('ERROR shoppinglist.txt, line', lines, 'unrecognised format', line)
            exit()
    if args.verbose:
        print('INFO: Processing URLs from the following shopping list', synsetdict)

    # Make sure we have a directory for every synset, these may already exist or not
    for offset in synsetdict:
        ssstr = str(synsetdict[offset])[8:-2]
        path = args.image_dir + ssstr
        if not os.path.exists(path):
            os.makedirs(path)

    # if we are going to allow GIF files, append to the whitelist
    if args.gif_ok:
        file_ext_whitelist.update(file_ext_gif)
        if args.verbose:
            print('INFO: allowing gif files')

    # read the URL list file end to end and process only those lines that
    # match synsets in our shopping list
    lines_read = 0
    files_downloaded = 0
    files_existing = 0
    dup_count = 0
    urldict = {}
    urllist_file = open(args.url_file, 'r', encoding="latin-1")
    for line in urllist_file:
        lines_read += 1
        wnid, url = re.split('\s+', line, maxsplit=1)

        # Normalize the URL
        url = url.strip()
        url = url.strip('\n')
        url = ut.normalize(url)

        pos_offset, serial = wnid.split('_')
        pos = pos_offset[0]
        offset = int(pos_offset[1:])
        ss = wn.synset_from_pos_and_offset(pos, offset)
        ssstr = str(ss)[8:-2]

        # If synset is not on our shopping list we don't want it
        if offset not in synsetdict:
            continue

        # Attempt to find the file extension. If we can't find it, skip the URL;
        # if we do find it, normalise the extension to lower case and three characters
        urlparts = urlparse(url)
        urlpath = urlparts.path
        try:
            _f, urlextension = urlpath.rsplit(sep='.', maxsplit=1)
        except (ValueError):
            print('WARNING No file extension, URL skipped:', line)
            continue
        if urlextension not in file_ext_whitelist:
            # did not find filename extension in path, perhaps it is a parameter
            for ext in file_ext_whitelist:
                dotext = '.' + ext
                if (dotext in urlparts.params) or (dotext in urlparts.query):
                    file_extension = file_ext_whitelist[ext]
                    break
                else:
                    file_extension = ''
                    print('WARNING No file extension found, URL skipped:', line)
                    break
            if '' == file_extension:
                continue
        else:
            file_extension = file_ext_whitelist[urlextension]

        # Have we already downloaded this URL? Don't waste time doing it again.
        if url not in urldict:
            urldict[url] = line
        else:
            dup_count += 1
            print('WARNING DUPLICATE URL this jpg file will NOT be downloaded again:')
            print('  ', urldict[url])
            print('  ', line)
            continue

        # create the file name
        image_filename = args.image_dir + ssstr + '/' + ssstr + '-' + serial + '.' + file_extension

        # If we already have this file, we don't need to get it
        if Path(image_filename).is_file():
            files_existing += 1
            if args.verbose:
                print('INFO: File exists, not downloading again', image_filename)
            continue

        try:
            response = urllib.request.urlopen(url)
            imagedata = response.read()
        except urllib.error.URLError as e:
            print(e.reason, wnid, ssstr, ' at line', lines_read, url)
            continue
        except:
            print('WARNING unknown error while downloading data at line', lines_read, url)
            continue

        ext_by_magic = check_magic(imagedata)
        if ext_by_magic not in file_ext_whitelist:
            print('WARNING Downloaded file signature is wrong, not saved', line)
            continue
        if ext_by_magic != file_extension:
            print("WARNING Downloaded file signature", ext_by_magic, "does not match URL", line)
            continue

        newfile = open(image_filename, 'wb')
        newfile.write(imagedata)
        newfile.close()
        files_downloaded += 1

        # Crude progress bar
        print('.', end='')

    # after loop end, print a summary of what was done then exit
    print('downloaded', files_downloaded, 'skipped', files_existing, 'existing files',
          'did not download', dup_count, 'duplicate URLs')
    exit()
def normalize(url):
    return urltools.normalize(url).rstrip('/')
def test_normalize():
    assert normalize("") == ""
    assert normalize("http://example.com") == "http://example.com/"
    assert normalize("http://example.com/") == "http://example.com/"
    assert normalize(" http://example.com/ ") == "http://example.com/"
    assert normalize("https://example.com/") == "https://example.com/"
    assert normalize("hTTp://example.com/") == "http://example.com/"
    assert normalize("http://ExAMPLe.COM/") == "http://example.com/"
    assert normalize("http://example.com./") == "http://example.com/"
    assert normalize("http://example.com:/") == "http://example.com/"
    assert normalize("http://example.com/#") == "http://example.com/"

    # port
    assert normalize("http://example.com:80/") == "http://example.com/"
    assert normalize("https://example.com:443/") == "https://example.com/"
    assert normalize("ws://example.com:80/") == "ws://example.com/"
    assert normalize("http://example.com:8080/") == "http://example.com:8080/"

    # subdomain
    assert normalize("http://www.example.com/") == "http://www.example.com/"
    assert normalize("http://www.example.com") == "http://www.example.com/"
    assert normalize("http://foo.bar.example.com/") == "http://foo.bar.example.com/"

    # ip
    assert normalize("http://192.168.1.1/") == "http://192.168.1.1/"
    assert normalize("http://192.168.1.1:8088/foo?x=1") == "http://192.168.1.1:8088/foo?x=1"
    assert normalize("192.168.1.1") == "192.168.1.1"
    assert normalize("192.168.1.1:8080/foo/bar") == "192.168.1.1:8080/foo/bar"

    # ip6
    assert normalize("[::1]") == "[::1]"
    assert normalize("http://[::1]") == "http://[::1]/"
    assert normalize("[::1]:8080") == "[::1]:8080"
    assert normalize("http://[::1]:8080") == "http://[::1]:8080/"

    # path
    assert normalize("http://example.com/a") == "http://example.com/a"
    assert normalize("http://example.com/a/b/c") == "http://example.com/a/b/c"
    assert normalize("http://example.com/foo/") == "http://example.com/foo/"
    assert normalize("http://example.com/a/./b/././c") == "http://example.com/a/b/c"
    assert normalize("http://example.com/a/../b") == "http://example.com/b"
    assert normalize("http://example.com/./b") == "http://example.com/b"
    assert normalize("http://example.com/../b") == "http://example.com/b"
    assert normalize("http://example.com/////////foo") == "http://example.com/foo"
    assert normalize("http://example.com/foo/.../bar") == "http://example.com/foo/.../bar"
    assert normalize("http://example.com/foo+bar") == "http://example.com/foo+bar"
    assert normalize("http://example.com/.") == "http://example.com/"
    assert normalize("http://example.com/..") == "http://example.com/"
    assert normalize("http://example.com/./") == "http://example.com/"
    assert normalize("http://example.com/../") == "http://example.com/"
    assert normalize("http://example.com/a/..") == "http://example.com/"
    assert normalize("http://example.com/a/../") == "http://example.com/"

    # encoded path
    assert normalize("http://example.com/%25%32%35") == "http://example.com/%25"
    assert normalize("http://example.com/foo%25%32%35bar") == "http://example.com/foo%25bar"
    assert normalize("http://example.com/foo/%25%32%35/bar") == "http://example.com/foo/%25/bar"
    assert normalize("http://example.com/%7Efoo") == "http://example.com/~foo"
    assert normalize("http://example.com/foo%23bar") == "http://example.com/foo%23bar"  # %23 = #

    # query
    assert normalize("http://example.com/?x=1") == "http://example.com/?x=1"
    assert normalize("http://example.com?x=1") == "http://example.com/?x=1"
    assert normalize("http://example.com/a?x=1") == "http://example.com/a?x=1"
    assert normalize("http://example.com/a/?x=1") == "http://example.com/a/?x=1"
    assert normalize("http://example.com/a?x=1&y=2") == "http://example.com/a?x=1&y=2"
    assert normalize("http://example.com/a?y=2&x=1") == "http://example.com/a?x=1&y=2"
    assert normalize("http://example.com/a?x=&y=2") == "http://example.com/a?y=2"

    # fragment
    assert normalize("http://example.com/#abc") == "http://example.com/#abc"
    assert normalize("http://example.com/a/b/c#abc") == "http://example.com/a/b/c#abc"
    assert normalize("http://example.com/a/b/c?x=1#abc") == "http://example.com/a/b/c?x=1#abc"

    # no scheme
    assert normalize("eXAmplE.com") == "example.com"
    assert normalize("example.com/a/../b") == "example.com/b"
    assert normalize("www.example.com") == "www.example.com"

    # username/password
    assert normalize("http://*****:*****@example.com") == "http://*****:*****@example.com/"
    assert normalize("http://*****:*****@exaMPLE.COM/") == "http://*****:*****@example.com/"

    # scheme without //
    assert normalize("mailto:[email protected]") == "mailto:[email protected]"
    assert normalize("mailto:[email protected]") == "mailto:[email protected]"

    # IDN
    assert normalize("http://xn--e1afmkfd.xn--p1ai/") == "http://пример.рф/"

    # malformed urls
    assert normalize("http://example.com/?foo") == "http://example.com/"
    assert normalize("http://example.com?foo") == "http://example.com/"
    assert normalize("http://example.com/foo//bar") == "http://example.com/foo/bar"
    assert normalize("http://example.com?") == "http://example.com/"
    assert normalize("http://example.com/?") == "http://example.com/"
    assert normalize("http://example.com//?") == "http://example.com/"
    assert normalize("http://example.com/foo/?http://example.com/bar/?x=http://examle.com/y/z") == "http://example.com/foo/?http://example.com/bar/?x=http://examle.com/y/z"
    assert normalize("http://example.com/#foo?bar") == "http://example.com/#foo?bar"
    assert normalize("http://example.com/#foo/bar/blub.html?x=1") == "http://example.com/#foo/bar/blub.html?x=1"
    assert normalize("http://example.com/foo#?=bar") == "http://example.com/foo#?=bar"
    assert normalize("http://example.com/foo/bar/http://example.com") == "http://example.com/foo/bar/http:/example.com"
def normalize(url, strip=False):
    "RFC3986 normalize URL & Optionally removing url-query/fragment string"
    if strip:
        p = _urltools.parse(url)
        url = p.scheme + '://' + p.subdomain + p.domain + p.path
    return _urltools.normalize(url)
def urls_equal(url1, url2):
    return urltools.normalize(url1) == urltools.normalize(url2)
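# Illustrative comparisons, based on the normalization behaviour shown in the
# tests above (default port dropped, host case-folded, dot-segments resolved).
print(urls_equal("http://EXAMPLE.com:80/a/../b", "http://example.com/b"))  # expected: True
print(urls_equal("http://example.com/b", "http://example.com/b/"))         # expected: False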
def crawl(original_url, num_Id, output_file):
    # Accepting input of URL and depth
    tic = time.time()
    parent_list = [original_url]
    url_to_check = get_base_url(original_url)
    print(url_to_check)
    url_to_check = str(url_to_check)
    layer_stop = 1
    layer = 0

    # initializing all the required lists
    visited_all = [original_url]
    visited_current_layer = []
    child_list = []
    child_list_filtered = [original_url]
    #columns = ['Link','Parent Link', 'Layer']
    df = pd.DataFrame()
    Di = {}

    # Main execution of scrapper
    # looping through layers
    while layer < layer_stop:
        # looping through URLs in parent-list
        for url in parent_list:
            # scraping the children from the parent url
            href_url = href_scrapper(url)
            if href_url != 0:
                child_list = href_url
            for child in child_list:
                if child != None:
                    ch = child
                    # if child link is of the form "index.php/blahblah" and parent ends with '/'
                    # ---> "parentlink/index.php/blahblah"
                    #if child.startswith('/'):
                    #    child = str(url) + str(child)
                    #if url.endswith('/') and url_to_check not in child:
                    #    child = str(url) + str(child)
                    child = urljoin(url, child)
                    # normalize the child links
                    child = urltools.normalize(child)
                    social_media = ['facebook.com', 'google.com', 'reddit.com', 'linkedin.com',
                                    'github.com', 'twitter.com', 'digg.com', '.png', '.jpg',
                                    '.jpeg', '.pdf', '.css']
                    # filtering out based on 1) External 2) Repeating 3) Invalid 4) social media + pdf + css
                    #if url_to_check in child and child not in visited_all and does_page_exist(child)==1 and ch not in Di:
                    if url_to_check in child and child not in visited_all and ch not in Di and all(social not in child for social in social_media):
                        child_list_filtered.append(child)
                        Di[ch] = 1
                    # adding everything to visited all
                    if child not in visited_all:
                        child_slash = child + '/'
                        visited_all.append(child)
                        visited_all.append(child_slash)
                    # sleep
                    time.sleep(0.250)
            # adding the visited and filtered children into the "current visited layer"
            for child_filtered in child_list_filtered:
                visited_current_layer.append(child_filtered)
            # creating a Pandas dataframe to store everything for download
            layer_number = [layer + 1] * len(child_list_filtered)
            parent_of_child = [url] * len(child_list_filtered)
            df_child = pd.DataFrame(child_list_filtered)
            df_parent = pd.DataFrame(parent_of_child)
            df_layer = pd.DataFrame(layer_number)
            df_to_be_added = pd.concat([df_child, df_parent, df_layer], axis=1)
            df = pd.concat([df, df_to_be_added], ignore_index=True, axis=0)
            # emptying the child lists
            child_list = []
            child_list_filtered = []

        # condition to stop filtering
        if not visited_current_layer:
            layer_stop = layer_stop
        else:
            layer_stop += 1

        # child layer is now parent layer
        parent_list = []
        for visited_current in visited_current_layer:
            print(visited_current)
            #if(not visited_current.endswith(unwanted_extensions)):
            parent_list.append(visited_current)

        # displaying the links in different layers
        #print("Links in LAYER:" + str(layer+1))
        print("No of links = " + str(len(visited_current_layer)))
        #print(visited_current_layer)
        print("\n")
        visited_current_layer = []
        # updating the layer number
        layer += 1

    df.to_csv(output_file + '/' + str(num_Id) + '_' + str(url_to_check) + '.csv',
              sep=',', encoding='utf-8')
    return df, num_Id