def acquire_chapter(self, url, story=None):
    """Download and scrape a single chapter from a story.

    @param url: The URL of the chapter to download.
    @param story: The Story object provided by a previous run.

    @type url: str
    @type story: L{Story}

    @return: A tuple consisting of the newly-created L{Chapter} object and
        the L{Story} object which is either newly created or was provided
        as an argument.
    @rtype: (L{Chapter}, L{Story})
    """
    # Verify the URL's syntactical validity before wasting bandwidth.
    if not self.story_url_re.match(url):
        prnt("Not a %s story URL: %s" % (self.site_name, url))
        return None

    # Retrieve the raw chapter (don't keep the un-parsed HTML wasting memory)
    # .parse(handle) for proper encoding detection.
    # .urlopen for customizing the User-Agent header.
    dom = self.http.get_dom(url)
    html.make_links_absolute(dom, copy=False)

    chapter_select = dom.find(self.chapter_select_xpath)
    chapter_content = dom.find(self.chapter_content_xpath)

    if not story:
        author = ''
        for elem in dom.iterfind('.//a[@href]'):
            if self.author_url_fragment in elem.get('href'):
                author = elem.text
                break

        story = Story(self.get_story_title(dom), author)
        story.site_name = self.site_name
        story.category = self.get_story_category(dom)

        if chapter_select is not None:
            options = chapter_select.findall(".//option")
            if options[0].text.strip().lower() in self.not_chapters:
                options = options[1:]
            story.chapter_urls = [self.resolve_chapter_url(x.get('value'), url, dom)
                                  for x in options]
        else:
            story.chapter_urls = [url]

    cleaned = self.custom_content_cleaning(chapter_content)
    if cleaned is not None:
        chapter_content = cleaned

    # Extract metadata from the chapter selector (or recognize its absence)
    if chapter_select is not None:
        chapter_title_str = chapter_select.find(".//option[@selected]").text
        chapter_title_obj = self.chapter_title_re.match(chapter_title_str)
        chapter_title = chapter_title_obj.group('name')
        chapter_number = int(chapter_title_obj.group('num'))
        chapter = Chapter(chapter_number, chapter_title, chapter_content)
    else:
        chapter = Chapter(1, '', chapter_content)

    return chapter, story
def grab_urls(content, url):
    urls = {}
    domain = urlparse(url).netloc
    html = document_fromstring(content)
    html.make_links_absolute(url, resolve_base_href=True)
    for element, attribute, link, pos in html.iterlinks():
        if attribute != "href":
            continue
        # skip if not on our domain
        if urlparse(link).netloc != domain and urlparse(link).netloc != "www." + domain:
            continue
        # skip if self referential
        if (url.split("//")[1] + "#") in link:
            continue
        text = element.text_content() if len(element) == 0 else element[0].text_content()
        text = text.lstrip() if text is not None else ""
        # compute relevancy here
        relevance[link] = relevancy(link, text, url)
        urls[link] = 1
        if text != "":
            print text
            print link
            print
    return urls.keys()
def get_realtime_title(pages=5):
    """
    Get ALL Category Realtime news from libertytimes

    The realtime URL may change or become invalid when it is no longer **realtime**.

    get_realtime_title(pages=5, encoding="UTF-8")

    *pages*: get page 1 to pages, default is 5 pages

    return: dict{time, title, url}
    """
    result_list = []
    for page in xrange(1, pages + 1):
        response, content = h.request("%s&ipage=%d" % (news_list_url, page))
        html = lxml.html.fromstring(content.decode("utf-8", "ignore"))
        html.make_links_absolute(base_url)
        # Get news-list section
        div = html.findall("*div")[0]
        # Get all title-info to list
        tr = list(div.iterdescendants("tr"))[1:-1]
        for title_info in tr:
            news_url = list(title_info.iterlinks())[1][2]
            info_list = map(lambda x: x.text_content(), list(title_info))
            try:
                info_dict = {"title": info_list[0].strip("\r\n "),
                             "time": info_list[1],
                             "url": news_url}
            except IndexError, error_infomation:
                pass
            result_list.append(info_dict)
    return result_list
def scrape(html: str, base_url: str) -> List[dict]:
    """
    Extract book information from the HTML given by the `html` parameter.

    `base_url` specifies the URL used as the base when converting relative
    hrefs to absolute URLs.

    Returns: the list of books (dict).
    """
    books = []
    html = lxml.html.fromstring(html)
    html.make_links_absolute(base_url)  # convert all hrefs in <a> elements to absolute URLs

    # cssselect() returns all <a> elements matching the selector; process each one.
    # Selector meaning: an <a> element with itemprop="url", a direct child of an
    # <li> element that is itself a direct child of the element with id="listBook".
    for a in html.cssselect('#listBook > li > a[itemprop="url"]'):
        # Get the URL from the href attribute of the <a> element.
        url = a.get('href')

        # Get the book title from the <p> element with itemprop="name".
        p = a.cssselect('p[itemprop="name"]')[0]
        # The <p> contains a <wbr> element, so use text_content() rather than the text attribute.
        title = p.text_content()

        books.append({'url': url, 'title': title})

    return books
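# A minimal driver sketch for the scrape() function above. Everything here is
# an assumption for illustration: the requests library and the listing URL do
# not appear in the original snippet.
import requests

LIST_URL = 'https://example.com/listBook.html'  # hypothetical catalogue page

response = requests.get(LIST_URL)
# Pass the final URL (after redirects) as base_url so relative hrefs resolve correctly.
for book in scrape(response.text, response.url):
    print(book['title'], book['url'])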
def make_html_obj_from_link(link, mode="desktop"):
    import urlparse
    import requests
    from lxml.html import make_links_absolute

    if mode == "mobile":
        try:
            response = requests.get(
                link,
                headers={"User-Agent": "blahblah iPhone",
                         "Accept-Language": "ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3"},
            )
        except requests.exceptions.ConnectionError:
            print "link " + link + " unreachable"
            return
        domain = "{uri.scheme}://{uri.netloc}/".format(uri=urlparse.urlparse(response.url))[:-1]
        body = make_links_absolute(response.content, domain)
    else:
        try:
            response = requests.get(
                link,
                headers={"Accept-Language": "ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3"},
            )
        except requests.exceptions.ConnectionError:
            print "link " + link + " unreachable"
            return
        domain = "{uri.scheme}://{uri.netloc}/".format(uri=urlparse.urlparse(response.url))[:-1]
        body = make_links_absolute(response.content.decode("cp1251"), domain)

    from lxml import html
    parsed_body = html.fromstring(body)
    return parsed_body
def html_malware_scan(url, topic):
    """
    Crawls a page to depth 1, counting occurrences of `topic` on the main page
    and on each linked page, and returns a boolean based on the ratio of the
    main-page count to the combined child-page count.
    """
    response = requests.get(url, timeout=10.0)
    html = lxml.html.fromstring(response.content, base_url=response.url)
    html.make_links_absolute(resolve_base_href=True)
    childs_topic_cnt = 0
    main_page_topic_cnt = response.text.lower().count(topic)
    for url in Bar().iter({link for element, attribute, link, pos in html.iterlinks()}):
        #for url in {link for element, attribute, link, pos in html.iterlinks()}:
        childs_topic_cnt += check_content(url, topic)
    if (float(main_page_topic_cnt) / (float(childs_topic_cnt) + 1.0) >= 1.0
            or float(main_page_topic_cnt) / (float(childs_topic_cnt) + 1.0) == 0):
        return True
    else:
        return False
def scrape_restaurant_data(self, example):
    # get this from yelp
    html = obtain_html(example["url"])
    html.make_links_absolute(example["url"])
    title = html.cssselect("h1.biz-page-title")[0].text.strip()
    review_highlights = html.cssselect("ul.review-highlights-list")
    if len(review_highlights) > 0:
        description = tree_to_str(clean_up_highlights(review_highlights[0]))
    else:
        description = create_description_highlights(html)
    images = html.cssselect("img.photo-box-img")
    image_url = None
    if len(images) > 0:
        image_url = images[0].attrib["src"]
    return {
        "title": title,
        "description": description,
        "categories": example["categories"],
        "image_url": image_url,
        "rating": rating_to_string(example["rating"]),
        "price": example["price"],
    }
def _read_url(url):
    log.info('Reading url %s' % url)
    u = urllib.urlopen(url)
    data = u.read()
    html = lxml.html.fromstring(data)
    html.make_links_absolute(url)
    return data, html
def scrape_sub(self, chamber, term, district, sub_url):
    "Scrape basic info for a legislator's substitute."
    with self.urlopen(sub_url) as page:
        html = lxml.html.fromstring(page)
        html.make_links_absolute(sub_url)
        # substitute info div#MAINS35
        div = html.xpath('//div[contains(@id, "MAINS")]')[0]
        leg = {}
        leg['img_url'] = div[0][0].get('src')
        subfor = div[1][0].text.replace(u'\xa0', ' ').replace(': ', '')
        full_name = div[1][2].text.replace(u'\xa0', ' ')
        party = _PARTY[div[1][2].tail.strip()]
        leg['contact_form'] = div[1][3].xpath('string(a/@href)')
        leg = Legislator(term, chamber, district.strip(), full_name, party, **leg)
        leg['roles'][0] = {
            'chamber': chamber,
            'state': self.state,
            'term': term,
            'role': 'substitute',
            'legislator': subfor[subfor.rindex('for'):],
            'district': district.replace('District', '').strip(),
            'party': party,
            'start_date': None,
            'end_date': None,
        }
        leg.add_source(sub_url)
        self.save_legislator(leg)
def get_realtime_title():
    """Get ALL Category and Source Realtime news from chinatimes

    The realtime URL may change or become invalid when it is no longer *realtime*.

    return: dict{category, source, time, title, url}
    """
    response, content = h.request(news_list_url)
    html = lxml.html.fromstring(content.decode('big5', 'ignore'))
    html.make_links_absolute(base_url)
    # Get news-list section
    div = html.findall("*div")[1]
    # Get all title-info to list
    tr = list(div.iterdescendants("tr"))[1:]
    result_list = []
    for title_info in tr:
        news_url = list(title_info.iterlinks())[0][2]
        info_list = map(lambda x: x.text_content(), list(title_info))
        info_dict = {"title": info_list[0].strip("\r\n "),
                     "time": info_list[1],
                     "category": info_list[2],
                     "source": info_list[3],
                     "url": news_url}
        result_list.append(info_dict)
    return result_list
def scrape_list_page(response: requests.Response) -> Iterator[str]:
    html = lxml.html.fromstring(response.text)
    html.make_links_absolute(response.url)
    for a in html.cssselect('#listBook > li > a[itemprop="url"]'):
        url = a.get('href')
        yield url
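# A possible way to drive the generator above (sketch; the session setup and
# the list-page URL are assumptions, not part of the original code).
import requests

session = requests.Session()
response = session.get('https://example.com/books/')  # hypothetical list page
for detail_url in scrape_list_page(response):
    # Each yielded URL is already absolute thanks to make_links_absolute().
    detail = session.get(detail_url)
    print(detail_url, detail.status_code)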
def scrape_list_page(response: requests.Response) -> Iterator[str]:
    html = lxml.html.fromstring(response.text)
    html.make_links_absolute(response.url)
    for a in html.cssselect('table.cassetteitem_other > tbody > tr > td:nth-child(9) > a'):
        url = a.get('href')
        yield url
def get_oracle(url):
    source = util.get_source(url)
    html = lxml.html.document_fromstring(source)
    html.make_links_absolute(url, resolve_base_href=True)
    util.save_file(lxml.html.tostring(html), 'oracle.html')
    util.screenshot('oracle.html', 'oracle.png')
    return html
def scrape(self, chamber, term):
    """
    Scrapes legislators for the current term only
    """
    self.validate_term(term, latest_only=True)
    url = _BASE_URL % _CHAMBERS[chamber].lower()
    with self.urlopen(url) as index:
        html = lxml.html.fromstring(index)
        html.make_links_absolute(url)

        base_table = html.xpath('body/table/tr/td[2]/table[2]')
        district = None  # keep track of district for substitutes

        for row in base_table[0].xpath('tr'):
            img_url = row.xpath('string(.//img/@src)')
            contact_form, additional_info_url = row.xpath('.//a/@href')

            if "Substitute" in row.text_content():
                # it seems like the sub always follows the person who he/she
                # is filling in for.
                # most sub info is provided at the additional info url
                self.scrape_sub(chamber, term, district, additional_info_url)
                continue
            else:
                full_name = " ".join(row[1][0].text_content().replace(u'\xa0', ' ').split())
                party = _PARTY[row[1][0].tail.strip()]

            pieces = [x.strip() for x in row.itertext() if x][6:]

            # the first index will either be a role or the district
            role = None
            if 'District' in pieces[0]:
                district = pieces.pop(0)
            else:
                role = pieces.pop(0)
                district = pieces.pop(0)

            leg = Legislator(term, chamber,
                             district.replace('District', '').strip(),
                             full_name, party=party)
            leg.add_source(url)
            leg['photo_url'] = img_url
            leg['contact_form'] = contact_form
            leg['url'] = additional_info_url
            leg['address'] = pieces.pop(0)

            # at this point 'pieces' still contains phone numbers and profession
            # and committee membership

            # skip committee membership, pick 'em up in IDCommitteeScraper
            end = -1
            if 'Committees:' in pieces:
                end = pieces.index('Committees:')

            for prop in pieces[:end]:
                # phone numbers
                if prop.lower()[0:3] in _PHONE_NUMBERS:
                    leg[_PHONE_NUMBERS[prop.lower()[0:3]]] = prop
                # profession
                else:
                    leg['profession'] = prop

            self.save_legislator(leg)
def resp2html(resp, encoding='utf-8'):
    """Convert the response returned by requests into an lxml-parsed document."""
    text = get_resp_text(resp, encoding=encoding)
    html = lxml.html.fromstring(text)
    html.make_links_absolute(resp.url, resolve_base_href=True)
    # html = cleaner.clean_html(html)
    # lxml.html.open_in_browser(html, encoding=encoding)  # for development and debugging
    return html
def scraperwiki():
    for page_number in itertools.count(1):
        url = 'https://classic.scraperwiki.com/profiles/tlevine/?page=%d' % page_number
        response = get(url)
        html = lxml.html.fromstring(response.text)
        html.make_links_absolute(url)
        for href in html.xpath('//li[@class="code_object_line"]/descendant::h3/a[position()=2]/@href'):
            yield re.sub(r'index.html$', '', str(href))
def scrape_joint_committees(self):
    page = self.get(_JOINT_URL).text
    html = lxml.html.fromstring(page)
    html.make_links_absolute(_JOINT_URL)
    joint_li = html.xpath('//div[contains(h2, "Joint")]/ul/li')
    for li in joint_li:
        name, url = li[0].text, li[0].get('href')
        self.get_joint_committees_data(name, url)
def get_oracle(url):
    source = util.get_source(url)
    parser = lxml.etree.HTMLParser()
    etree = lxml.etree.parse(StringIO(source), parser)
    html = lxml.html.document_fromstring(source)
    html.make_links_absolute(url, resolve_base_href=True)
    html.doctype = etree.docinfo.doctype
    return html
def _load_response(response):
    '''
    In: python-requests Response
    Out: lxml HTML tree
    '''
    html = lxml.html.fromstring(response.text)
    html.make_links_absolute(response.url)
    return html
def europeana():
    url = request.args['url']
    hl = request.args.get('hl', '')
    words = re.findall(r'\w+', hl.lower())
    html = lxml.html.parse(url).getroot()
    html.make_links_absolute()
    highlight_tree(html, words)
    return lxml.html.tostring(html)
def get_page(url):
    global fetched_count
    fetched_count += 1
    print 'Fetching... %s %s' % (fetched_count, url)
    doc = lxml.html.parse(urllib.urlopen(url))
    html = doc.getroot()
    html.make_links_absolute(url)
    return html
def get_links(html, base_url, tags=[]):
    links = []
    html = lxml.html.document_fromstring(html)
    html.make_links_absolute(base_url)
    links_html = html.iterlinks()
    links = [x[2] for x in links_html if x[0].tag in tags]
    return links
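# A short, self-contained usage sketch for get_links(); the inline HTML and
# base URL are made up for illustration.
page = '''
<html><body>
  <a href="/about">About</a>
  <img src="logo.png">
</body></html>
'''
# Only links originating from <a> tags are kept, so the <img> src is filtered out.
print(get_links(page, 'https://example.com/', tags=['a']))
# ['https://example.com/about']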
def make_tree(url, data, encoding=None):
    """Build lxml tree."""
    s = StringIO(data)
    parser = lxml.html.HTMLParser(encoding=encoding)
    doc = lxml.html.parse(s, parser=parser)
    html = doc.getroot()
    html.make_links_absolute(url)
    return html
def scrape_chamber(self, chamber):
    """
    Scrapes legislators for the current term only
    """
    # self.validate_term(term, latest_only=True)
    url = BASE_URL % CHAMBERS[chamber].lower()
    index = self.get(url).text
    html = lxml.html.fromstring(index)
    html.make_links_absolute(url)

    rows = html.xpath('//div[contains(@class, "row-equal-height")]')

    for row in rows:
        img_url = row.xpath('.//img/@src')[0]

        inner = row.xpath('.//div[@class="vc-column-innner-wrapper"]')[1]
        inner_text = inner.text_content()
        if 'Resigned' in inner_text or 'Substitute' in inner_text:
            continue

        name = inner.xpath('p/strong')[0].text.replace(u'\xa0', ' ').strip()
        name = re.sub(r'\s+', ' ', name)
        party = PARTY[inner.xpath('p/strong')[0].tail.strip()]
        email = inner.xpath('p/strong/a')[0].text
        district = inner.xpath('p/a')[0].text.replace('District ', '')

        person_url = inner.xpath('p/a/@href')[0]
        # skip roles for now
        role = ''
        # for com in inner.xpath('p/a[contains(@href, "committees")]'):
        #     role = com.tail.strip()

        person = Person(name=name, district=district, party=party,
                        primary_org=chamber, image=img_url, role=role)

        phones = get_phones(inner)
        phone = phones.get('home') or phones.get('business')
        office_phone = phones.get('office')
        address = get_address(inner)
        fax = get_fax(inner)

        if address:
            person.add_contact_detail(type='address', value=address,
                                      note='District Office')
        if phone:
            person.add_contact_detail(type='voice', value=phone,
                                      note='District Office')
        if fax:
            person.add_contact_detail(type='fax', value=fax,
                                      note='District Office')
        if email:
            person.add_contact_detail(type='email', value=email,
                                      note='District Office')
        if office_phone:
            person.add_contact_detail(type='voice', value=office_phone,
                                      note='Capitol Office')

        person.add_source(url)
        person.add_link(person_url)

        yield person
def scrape_chamber(self, chamber):
    """
    Scrapes legislators for the current term only
    """
    # self.validate_term(term, latest_only=True)
    url = BASE_URL % CHAMBERS[chamber].lower()
    index = self.get(url, verify=False).text
    html = lxml.html.fromstring(index)
    html.make_links_absolute(url)

    rows = html.xpath('//div[contains(@class, "row-equal-height")]')

    for row in rows:
        img_url = row.xpath('.//img/@src')[0]

        inner = row.xpath('.//div[@class="vc-column-innner-wrapper"]')[1]
        inner_text = inner.text_content()
        if 'Resigned' in inner_text or 'Substitute' in inner_text:
            continue

        name = inner.xpath('p/strong')[0].text.replace(u'\xa0', ' ').strip()
        name = re.sub(r'\s+', ' ', name)
        party = PARTY[inner.xpath('p/strong')[0].tail.strip()]
        email = inner.xpath('p/strong/a')[0].text
        district = inner.xpath('p/a')[0].text.replace('District ', '')

        person_url = inner.xpath('p/a/@href')[0]
        # skip roles for now
        role = ''
        # for com in inner.xpath('p/a[contains(@href, "committees")]'):
        #     role = com.tail.strip()

        person = Person(name=name, district=district, party=party,
                        primary_org=chamber, image=img_url, role=role)

        phones = get_phones(inner)
        phone = phones.get('home') or phones.get('business')
        office_phone = phones.get('office')
        address = get_address(inner)
        fax = get_fax(inner)

        if address:
            person.add_contact_detail(type='address', value=address,
                                      note='District Office')
        if phone:
            person.add_contact_detail(type='voice', value=phone,
                                      note='District Office')
        if fax:
            person.add_contact_detail(type='fax', value=fax,
                                      note='District Office')
        if email:
            person.add_contact_detail(type='email', value=email,
                                      note='District Office')
        if office_phone:
            person.add_contact_detail(type='voice', value=office_phone,
                                      note='Capitol Office')

        person.add_source(url)
        person.add_link(person_url)

        yield person
def create_html_parser(row):
    content = get_content(row.value)
    parser = lxml.html.HTMLParser(encoding='utf-8')
    html = lxml.html.document_fromstring(content, parser=parser)
    if 'url' in row.value:
        html.make_links_absolute(row.value['url'])
    elif row.key:
        html.make_links_absolute(row.key)
    return html
def parse_html(self, url=None, response=None, *a, **ka):
    if lxml is None:
        raise ImportError('No module named \'lxml\'')
    if url:
        response = self.get(url, *a, **ka)
    elif response is None:
        raise ValueError('either url or response must be provided')
    html = lxml.html.fromstring(response.text)
    html.make_links_absolute(response.url)
    return html
def thomaslevine():
    url = 'http://thomaslevine.com/!/'
    try:
        response = get(url)
    except:
        return []
    else:
        html = lxml.html.fromstring(response.text)
        html.make_links_absolute(url)
        return ((unicode(a.attrib['href']), a.text_content())
                for a in html.xpath('//a')
                if a.attrib['href'].startswith(url))
def get_pages():
    response_text = requests.get(BASE_URL).content
    html = lxml.html.fromstring(response_text)
    html.make_links_absolute(BASE_URL)
    links = []
    i = 0
    for link in html.iterlinks():
        links.append(link[2])
    return links
def get_html(url):
    """Fetch the given page with a GET request and return the lxml-parsed document."""
    resp = request_get(url)
    text = get_resp_text(resp, encoding='utf-8')
    html = lxml.html.fromstring(text)
    html.make_links_absolute(url, resolve_base_href=True)
    # The cleaning step should only be enabled when a page needs debugging (e.g. prestige);
    # otherwise it can itself get in the way of debugging (e.g. JavBus).
    # html = cleaner.clean_html(html)
    # lxml.html.open_in_browser(html, encoding=encoding)  # for development and debugging
    return html
def scrape_list_page(response: requests.Response):
    """
    Generator function that extracts detail-page URLs from the Response of a list page.
    """
    html = lxml.html.fromstring(response.text)
    html.make_links_absolute(response.url)
    for a in html.cssselect('#listBook > li > a[itemprop="url"]'):
        url = a.get('href')
        yield url
def scrape(html: str, base_url: str) -> List[dict]:
    books = []
    html = lxml.html.fromstring(html)
    html.make_links_absolute(base_url)
    for a in html.cssselect('#listBook > li > a[itemprop="url"]'):
        url = a.get('href')
        p = a.cssselect('p[itemprop="name"]')[0]
        title = p.text_content()
        books.append({'url': url, 'title': title})
    return books
def addImagePaths(self, body, baseurl):
    """Turn relative paths into absolute paths in section body"""
    # This is a convenience method for use in the referencemanual_macros
    # section_collation macro.
    # TODO: when we no longer need 2.1 compatibility, this belongs in a view.
    html = lxml.html.fromstring(safe_unicode(body))
    html.make_links_absolute(baseurl, resolve_base_href=False)
    return lxml.html.tostring(html, encoding='utf-8')
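# The effect of addImagePaths() can be reproduced with the same lxml calls in
# isolation (standalone sketch; the body markup and base URL are invented).
import lxml.html

body = '<p><img src="figures/diagram.png"/> See <a href="../appendix">appendix</a>.</p>'
html = lxml.html.fromstring(body)
html.make_links_absolute('https://example.org/manual/section1/', resolve_base_href=False)
# Both the img src and the a href are now absolute URLs under https://example.org/manual/
print(lxml.html.tostring(html, encoding='utf-8'))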
def gethtml(url):
    raw = get(url).content.decode('utf-8')
    if raw == '':
        return None
    try:
        html = lxml.html.fromstring(raw)
    except Exception as e:
        print(url)
        raise e
    html.make_links_absolute(url)
    return html
def useWebsiteSearch(self, url, target):
    try:
        page = urllib2.urlopen(url)
    except (urllib2.URLError, ValueError):
        return None
    s = page.read()
    html = lxml.html.fromstring(s)
    html.make_links_absolute(baseURL)
    html.forms[0].inputs['query'].value = target
    new_page = lxml.html.submit_form(html.forms[0])
    new_s = new_page.read()
    return new_s
def parse_foi(response):
    html = lxml.html.fromstring(response.text)
    html.make_links_absolute(response.url)
    messages = ['\n\n'.join(div.xpath('strong/text()') + div.xpath('p/text()'))
                for div in html.xpath('id("tabs-request")/div')]
    downloads = list(map(str, html.xpath('id("tabs")//a[text()="Download"]/@href')))
    return OrderedDict([
        ('messages', messages),
        ('downloads', downloads),
    ])
def scrape_list_page(response):
    """
    Generator function that extracts detail-page URLs from the Response of a list page.
    """
    html = lxml.html.fromstring(response.text)
    # Convert links to absolute URLs.
    html.make_links_absolute(response.url)
    for a in html.cssselect('#listBook > li > a[itemprop="url"]'):
        url = a.get('href')
        # Return each element of the generator iterator with a yield statement.
        yield url
def _extract_links(self, response_text, response_url):
    html = lxml.html.fromstring(response_text)
    html.make_links_absolute(response_url)
    for e, a, l, p in html.iterlinks():
        if self.scan_tag(e.tag):
            if self.scan_attr(a):
                link = Link(self.process_attr(l), text=e.text)
                self.links.append(link)
    links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links
    return links
def download_councillors():
    with open(WEBPAGESTXT, 'r') as txtfile:
        urls = txtfile.readlines()
    urls = [url.strip() for url in urls]

    session = http.client.HTTPSConnection('www.berlin.de', timeout=10)

    councillors = {}
    for url in urls:
        if councillors:
            time.sleep(2)
        bezirk = bezirk_from_url(url)

        headers = {'Accept-Encoding': 'gzip', 'Connection': 'keep-alive'}
        session.request('GET', url, headers=headers)
        response = session.getresponse()
        response = response.read()
        response = zlib.decompress(response, 47)
        try:
            response = response.decode('latin-1', 'strict')
        except UnicodeDecodeError:
            response = response.decode('windows-1252', 'strict')

        html = lxml.html.fromstring(response)
        html.make_links_absolute(url)

        tablerows = html.cssselect('.zl12')
        tablerows += html.cssselect('.zl11')

        number = html.cssselect('table.tk1:nth-child(8)')[0]
        number = number.text_content()
        _, number = number.split(':')
        number = number.strip()
        if number.isdigit():
            number = int(number)
        if not number == len(tablerows):
            print('%s:' % bezirk,
                  '%s councillors were found.' % len(tablerows),
                  'Should be %s councillors.' % number)

        for row in tablerows:
            councillor = extract_councillor(row)
            councillor['BEZIRK'] = bezirk
            identifier = normalized_name(councillor['ANZEIGENAME'])
            try:
                councillors[bezirk][identifier] = councillor
            except KeyError:
                councillors[bezirk] = {identifier: councillor}

    session.close()
    return councillors
def npm():
    url = 'https://www.npmjs.org/~tlevine'
    try:
        response = get(url)
    except:
        sys.stderr.write('Error loading NPM packages\n')
    else:
        html = lxml.html.fromstring(response.text)
        html.make_links_absolute(url)
        for li in html.xpath('id("profile")/ul/li'):
            href = li.xpath('a/@href')[0]
            text = ''.join(li.xpath('text()')).strip()
            yield unicode(href), unicode(text)
def getCandidats(self, s):
    html = lxml.html.fromstring(s)
    html.make_links_absolute(baseURL)
    # lxml.html.open_in_browser(html)
    names = list()
    links = list()
    for name in html.xpath('//@alt'):
        names.append(name)
    for a in html.xpath('//a'):
        if a.attrib.get('title') in names:
            links.append(a.attrib.get('href'))
    return (names, links)
def extract_links_from_html(base, body):
    try:
        html = lxml.html.fromstring(body)
        html.make_links_absolute(base)
        for element, attribute, link, pos in html.iterlinks():
            if isinstance(link, str):
                link = link.encode('utf-8', 'ignore')
            yield link
    except Exception:
        logging.warning("(lxml) html parse error")
        import traceback; traceback.print_exc()
def bootstrap_data():
    data = urllib.urlopen(DATA_URL).read()
    #open("localcache", 'wb').write(data)
    print '=' * 20
    #data = open("localcache").read()
    html = lxml.html.fromstring(data)
    html.make_links_absolute(DATA_URL)
    if len(html.xpath('//td[@class="maintext"]/ul/li/a')) != 15:
        logging.error('Something changed/new? %s elements on first page.'
                      % len(html.xpath('//td[@class="maintext"]/ul/li/a')))
    for a in html.xpath('//td[@class="maintext"]/ul/li/a'):
        if handlers.get(a.text):
            handlers.get(a.text)(a.attrib['href'])
        else:
            print [a.attrib['href'], a.text]
def _extract_links(self, response_text, response_url):
    html = lxml.html.fromstring(response_text)
    html.make_links_absolute(response_url)
    sel = pyquery.PyQuery(html)
    evt_links = sel('.news > li:not(.more) > a')
    ann_links = sel('.announcement > li:not(.more) > a')
    all_links = [
        Link(elem.attrib['href'], text=elem.text)
        for elem in itertools.chain(evt_links, ann_links)
    ]
    return unique(all_links, key=lambda link: link.url)
def scraperwiki(url='https://classic.scraperwiki.com/profiles/tlevine/index.html'):
    try:
        response = get(url)
    except:
        pass
    else:
        html = lxml.html.fromstring(response.text)
        html.make_links_absolute(url)
        for href in html.xpath('//li[@class="code_object_line"]/descendant::h3/a[position()=2]/@href'):
            yield re.sub(r'index.html$', '', unicode(href)), 'A web scraper'
        nexts = html.xpath(u'//a[text()="Next »"]/@href')
        if nexts != []:
            for scraper in scraperwiki(nexts[0]):
                yield scraper
def get_uri(self, url, html):
    if url is not None and html is not None:
        print(url)
        parsed_uri = urlparse(url)
        domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
        html.make_links_absolute(url)
        for l in html.iterlinks():
            parsed_uri = urlparse(l[2])
            curr_domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
            if curr_domain == domain:
                if l[2] not in self.urls:
                    self.pool.put(l[2])
                    self.urls.add(l[2])