import re
import requests
import lxml.html
from urllib.parse import urlparse
from flask import request  # assumes this handler runs inside a Flask request context


def get_page():
    # request the page
    r = requests.get(request.args['url'])
    # parse the DOM into Python objects
    html = lxml.html.document_fromstring(r.content)
    # parse the requested url so we can form the base href
    url = urlparse(request.args['url'])
    # create the base url dom fragment
    base_url = lxml.html.fromstring(
        "<base href='%s://%s'>" % (url.scheme, url.hostname)).find('.//base')
    # find the head element
    head = html.find(".//head")
    # insert the base href in the last place of the head elements
    head.insert(-1, base_url)
    # rewrite urls to be absolute
    html.resolve_base_href()
    # rewrite links to load through this proxy
    for element, attribute, link, pos in html.iterlinks():
        if element.tag == "a" and attribute == "href":
            link = "http://localhost:8888/translate_url?url=%s" % (link)
            element.set("href", link)
            element.set("target", "_parent")
    # translate through DOM traversal
    # html = translate_dom_string(html, lxml.html.tostring(html))
    # translate through HTML regex string replacement
    html = translate_html(html, lxml.html.tostring(html))
    # dump the html string for debugging
    # with open('html_dump', 'w') as f:
    #     f.write(lxml.html.tostring(html))
    # a little regex to remove any script tags
    return re.subn(r'(?s)<(script).*?</\1>', '', lxml.html.tostring(html))[0]
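# A minimal usage sketch for get_page() above, assuming it runs inside a Flask
# app (request.args and the rewritten http://localhost:8888/translate_url links
# suggest one). The app object and route registration are illustrative
# assumptions, not part of the original code; translate_html() is still
# expected to be defined elsewhere.
from flask import Flask

app = Flask(__name__)

# serve the proxied, link-rewritten page at the URL the handler itself
# rewrites anchor links to point at
app.add_url_rule('/translate_url', 'translate_url', get_page)

if __name__ == '__main__':
    app.run(port=8888)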
def fetch_links_from_web_page(self, page):
    log.debug('')
    try:
        # [ NOTE ]: Pull out all links after resolving them using any
        # <base> tags found in the document.
        links = [
            link for element, attribute, link, pos in iterlinks(
                resolve_base_href(page.content))
        ]
    except etree.ParseError:
        # [ NOTE ]: If the document is not HTML content this will return
        # an empty list.
        links = []
    return list(set(links))
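# A small self-contained sketch of the lxml.html helpers the method above
# relies on: resolve_base_href() applies any <base href> to the document's
# links (and drops the tag), and iterlinks() yields
# (element, attribute, link, pos) tuples. The sample markup is illustrative.
from lxml.html import iterlinks, resolve_base_href

sample = """
<html>
  <head><base href="http://example.com/docs/"></head>
  <body><a href="page.html">page</a> <img src="/logo.png"></body>
</html>
"""

# the <base> href is folded into every link before extraction
resolved = resolve_base_href(sample)
links = sorted({link for element, attribute, link, pos in iterlinks(resolved)})
print(links)
# expected: ['http://example.com/docs/page.html', 'http://example.com/logo.png']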
def crawl(url, thread_id=0):
    global WORDS, OVERRIDE_SIZE, HEADER, SAVE_PAGES, SAVE_WORDS
    if not OVERRIDE_SIZE:
        try:
            # Attempt to get the size in bytes of the document
            length = int(
                requests.head(url, headers=HEADER).headers['Content-Length'])
        except KeyError:
            # Sometimes no Content-Length header is returned...
            length = 1
        if length > 524288000:  # If the page is larger than 500 MB
            raise SizeError
    # If the SizeError is raised it will be caught in the except block in the
    # run section, and the following code will not be run.
    page = requests.get(url, headers=HEADER)  # Get page
    word_list = []
    doctype = get_mime_type(page)
    if doctype.find('image') < 0 and doctype.find('video') < 0:
        if SAVE_WORDS:
            word_list = make_words(page)
            for word in word_list:
                WORDS.put(word)
        try:
            # Pull out all links after resolving them using any <base> tags
            # found in the document.
            links = [
                link for element, attribute, link, pos in iterlinks(
                    resolve_base_href(make_links_absolute(page.content, url)))
            ]
        except etree.ParseError:
            # If the document is not HTML content this will return an empty list.
            links = []
        links = list(set(links))
    else:
        links = []
    if SAVE_PAGES:
        save_page(url, page)
    if SAVE_WORDS:
        # Announce which link was crawled
        write_log('CRAWL',
                  'Found {0} links and {1} words on {2}'.format(
                      len(links), len(word_list), url),
                  worker=thread_id)
    else:
        # Announce which link was crawled
        write_log('CRAWL', 'Found {0} links on {1}'.format(len(links), url),
                  worker=thread_id)
    return links
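# A short sketch of the make_links_absolute() step used above: relative links
# in the fetched markup are rewritten against the page's own URL before
# extraction, so the crawler only ever queues absolute URLs. The markup and
# base URL here are illustrative.
from lxml.html import iterlinks, make_links_absolute

body = '<html><body><a href="/about">about</a> <a href="news/1">news</a></body></html>'
absolute = make_links_absolute(body, 'http://example.com/index.html')
print(sorted({link for _, _, link, _ in iterlinks(absolute)}))
# expected: ['http://example.com/about', 'http://example.com/news/1']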
def crawl(url):
    global TODO
    if not OVERRIDE_SIZE:
        try:
            # Attempt to get the size in bytes of the document
            length = int(
                requests.head(url, headers=HEADER).headers['Content-Length'])
        except KeyError:
            # Sometimes no Content-Length header is returned...
            length = 1
        if length > 524288000:  # If the page is larger than 500 MB
            raise SizeError
    # If the SizeError is raised it will be caught in the except block in the
    # run section, and the following code will not be run.
    page = requests.get(url, headers=HEADER)  # Get page
    word_list = []
    if SAVE_WORDS:
        word_list = make_words(page)
        WORDS.update(word_list)
    try:
        # Pull out all links after resolving them using any <base> tags found
        # in the document.
        links = [
            link for element, attribute, link, pos in iterlinks(
                resolve_base_href(page.content))
        ]
    except etree.ParseError:
        # If the document is not HTML content this will return an empty list.
        links = []
    links = list(set(links))
    TODO += links
    DONE.append(url)
    if SAVE_PAGES:
        save_page(url, page)
    if SAVE_WORDS:
        # Announce which link was crawled
        write_log('[CRAWL]: Found {0} links and {1} words on {2}'.format(
            len(links), len(word_list), url))
    else:
        # Announce which link was crawled
        write_log('[CRAWL]: Found {0} links on {1}'.format(len(links), url))
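# A sketch of the Content-Length guard both crawl() variants above start with:
# a HEAD request is used to skip documents larger than 500 MB before they are
# downloaded. The check_size() helper and the local SizeError class are
# illustrative stand-ins for the crawler's own names.
import requests


class SizeError(Exception):
    """Stand-in for the crawler's own SizeError exception."""


def check_size(url, headers=None, max_bytes=524288000):
    # HEAD the document first so oversized pages are rejected before download
    try:
        length = int(requests.head(url, headers=headers).headers['Content-Length'])
    except KeyError:
        # No Content-Length header was returned; assume the page is small enough
        length = 1
    if length > max_bytes:
        raise SizeError('{0} reports {1} bytes'.format(url, length))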
########
VI = j['GroupItem'][1]  # VI
print(VI['GroupItemName'])
########
DIAG = j['GroupItem'][2]  # diagnostics
print(DIAG['GroupItemName'])
########
DRVR = j['GroupItem'][3]  # sound
print(DRVR['GroupItemName'])
########
UTIL = j['GroupItem'][10]  # removable drives
print(UTIL['GroupItemName'])

# for x in iter_:
#     print(x.get

for x in range(11):
    print(j['GroupItem'][x]['GroupItemName'])

links = html.resolve_base_href(r.text)
# http://downloads.dell.com/comm/R85670.EXE
# 'R80894.EXE'
# "input#tagDrivers"

for x in iter_:
    try:
        print(x[0].attrib['href'])  # , "----", x[1], "---", x[2]
        if 'http://downloads' in x[0].attrib['href']:
            # 'http://downloads.dell.com/comm/R85670.EXE':
            print()
    except Exception:
        pass
print()
###################################################################
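# A cleaner sketch of what the scratch block above is probing for: parse the
# fetched Dell page and keep only anchors whose href points at
# http://downloads.dell.com. The sample markup below is an illustrative
# stand-in for the real response (r.text).
import lxml.html

sample = (
    '<div id="tagDrivers">'
    '<a href="http://downloads.dell.com/comm/R85670.EXE">driver</a>'
    '<a href="http://www.dell.com/support">support</a>'
    '</div>'
)

doc = lxml.html.fromstring(sample)
download_links = [
    href for href in doc.xpath('//a/@href')
    if href.startswith('http://downloads.dell.com')
]
print(download_links)  # expected: ['http://downloads.dell.com/comm/R85670.EXE']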