Example #1
    def acquire_chapter(self, url, story=None):
        """Download and scrape a single chapter from a story.
        @param url: The URL of the chapter to download.
        @param story: The Story object provided by a previous run.
        @type url: str
        @type story: L{Story}

        @return: A tuple consisting of the newly-created L{Chapter} object and
            the L{Story} object which is either newly created or was provided
            as an argument.
        @rtype: (L{Chapter}, L{Story})
        """
        # Verify the URL's syntactical validity before wasting bandwidth.
        if not self.story_url_re.match(url):
            prnt("Not a %s story URL: %s" % (self.site_name, url))
            return None

        # Retrieve the raw chapter (don't keep the un-parsed HTML wasting memory)
        # .parse(handle) for proper encoding detection.
        # .urlopen for customizing the User-Agent header.
        dom = self.http.get_dom(url)
        html.make_links_absolute(dom, copy=False)

        chapter_select  = dom.find(self.chapter_select_xpath)
        chapter_content = dom.find(self.chapter_content_xpath)

        if not story:
            author = ''
            for elem in dom.iterfind('.//a[@href]'):
                if self.author_url_fragment in elem.get('href'):
                    author = elem.text
                    break
            story = Story(self.get_story_title(dom), author)
            story.site_name = self.site_name
            story.category  = self.get_story_category(dom)
            if chapter_select is not None:
                options = chapter_select.findall(".//option")
                if options[0].text.strip().lower() in self.not_chapters:
                    options = options[1:]
                story.chapter_urls = [self.resolve_chapter_url(x.get('value'), url, dom) for x in options]
            else:
                story.chapter_urls = [url]

        cleaned = self.custom_content_cleaning(chapter_content)
        if cleaned is not None:
            chapter_content = cleaned

        # Extract metadata from the chapter selector (or recognize its absence)
        if chapter_select is not None:
            chapter_title_str = chapter_select.find(".//option[@selected]").text
            chapter_title_obj = self.chapter_title_re.match(chapter_title_str)

            chapter_title  = chapter_title_obj.group('name')
            chapter_number = int(chapter_title_obj.group('num'))

            chapter = Chapter(chapter_number, chapter_title, chapter_content)
        else:
            chapter = Chapter(1, '', chapter_content)

        return chapter, story
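A minimal usage sketch for acquire_chapter(); the scraper class name and the chapter URL below are illustrative assumptions, not part of the original source.

# Hypothetical usage (FanfictionScraper and the URL are placeholders):
scraper = FanfictionScraper()
result = scraper.acquire_chapter("https://example.com/story/1234/1")
if result is not None:  # acquire_chapter() returns None when the URL does not match story_url_re
    chapter, story = result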
Example #2
def grab_urls(content, url):
    urls = {}
    domain = urlparse(url).netloc
    html = document_fromstring(content)
    html.make_links_absolute(url, resolve_base_href=True)

    for element, attribute, link, pos in html.iterlinks():
        if attribute != "href":
            continue

        # skip if not on our domain
        if urlparse(link).netloc not in (domain, "www." + domain):
            continue

        # skip if self referential
        if (url.split("//")[1] + "#") in link:
            continue

        text = element.text_content() if len(element) == 0 else element[0].text_content()
        text = text.lstrip() if text is not None else ""
        # compute relevancy here

        relevance[link] = relevancy(link, text, url)
        urls[link] = 1

        if text != "":
            print text
        print link
        print

    return urls.keys()
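A sketch of how grab_urls() might be driven (Python 2, matching the print statements above); the relevance dict and relevancy() scorer referenced inside the function are assumed to be defined at module level, and the start URL is a placeholder.

# Illustrative driver for grab_urls(); the URL is a placeholder.
import urllib2

start_url = "http://example.com/"
content = urllib2.urlopen(start_url).read()
for found_url in grab_urls(content, start_url):
    print found_url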
Example #3
def get_realtime_title(pages=5):
    """
    Get ALL-category realtime news from libertytimes.
    The realtime URL may change or become invalid once the news is no longer **realtime**.

    get_realtime_title(pages=5)

    *pages*: fetch pages 1 through *pages*; the default is 5

    return: list of dict{time, title, url}
    """

    result_list = []

    for page in xrange(1, pages + 1):
        response, content = h.request("%s&ipage=%d" % (news_list_url, page))
        html = lxml.html.fromstring(content.decode("utf-8", "ignore"))
        html.make_links_absolute(base_url)

        # Get news-list section
        div = html.findall("*div")[0]

        # Get all title-info to list
        tr = list(div.iterdescendants("tr"))[1:-1]

        for title_info in tr:
            news_url = list(title_info.iterlinks())[1][2]
            info_list = map(lambda x: x.text_content(), list(title_info))

            try:
                info_dict = {"title": info_list[0].strip("\r\n "), "time": info_list[1], "url": news_url}
            except IndexError:
                continue

            result_list.append(info_dict)

    return result_list
Example #4
def scrape(html: str, base_url: str) -> List[dict]:
    """
    Extract book information from the HTML passed in the html parameter.
    The base_url parameter specifies the URL used as the base when
    converting links to absolute URLs.
    returns: a list of books (dict)
    """
    books = []
    html = lxml.html.fromstring(html)
    html.make_links_absolute(base_url)  # convert all hrefs in <a> elements to absolute URLs

    # use the cssselect() method to get all <a> elements matching the selector and process each one
    # selector meaning: an <a> element with itemprop="url" that is a direct child of an <li> element under #listBook

    for a in html.cssselect('#listBook > li > a[itemprop="url"]'):
        # get the URL from the href attribute of the <a> element
        url = a.get('href')

        # get the book title from the <p> element with itemprop="name"
        p = a.cssselect('p[itemprop="name"]')[0]
        title = p.text_content()  # the element contains a <wbr> tag, so use text_content() rather than .text

        books.append({'url': url, 'title': title})

    return books
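A possible way to call scrape() above, pairing it with requests to fetch the page; the URL is a placeholder and the page is assumed to carry the #listBook markup the selector expects.

# Illustrative call of scrape(); the URL is a placeholder.
import requests

response = requests.get('https://example.com/books')
for book in scrape(response.text, response.url):
    print(book['title'], book['url'])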
Example #5
def make_html_obj_from_link(link, mode="desktop"):
    import urlparse
    import requests
    from lxml.html import make_links_absolute

    if mode == "mobile":
        try:
            response = requests.get(
                link,
                headers={"User-Agent": "blahblah iPhone", "Accept-Language": "ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3"},
            )
        except requests.exceptions.ConnectionError:
            print "link " + link + " unreachable"
            return
        domain = "{uri.scheme}://{uri.netloc}/".format(uri=urlparse.urlparse(response.url))[:-1]
        body = make_links_absolute(response.content, domain)
    else:
        try:
            response = requests.get(link, headers={"Accept-Language": "ru-RU,ru;q=0.8,en-US;q=0.5,en;q=0.3"})
        except requests.exceptions.ConnectionError:
            print "link " + link + " unreachable"
            return
        domain = "{uri.scheme}://{uri.netloc}/".format(uri=urlparse.urlparse(response.url))[:-1]
        body = make_links_absolute(response.content.decode("cp1251"), domain)

    from lxml import html

    parsed_body = html.fromstring(body)
    return parsed_body
Example #6
def html_malware_scan(url, topic):
    """
    Crawls a page to depth 1 and compares how often `topic` occurs on the
    main page with how often it occurs on the pages it links to.
    """

    response = requests.get(url, timeout=10.0)
    html = lxml.html.fromstring(
        response.content,
        base_url=response.url
    )
    html.make_links_absolute(resolve_base_href=True)

    childs_topic_cnt = 0
    main_page_topic_cnt = response.text.lower().count(topic)


    for url in Bar().iter({link for element, attribute, link, pos in html.iterlinks()}):
        # for url in {link for element, attribute, link, pos in html.iterlinks()}:
        childs_topic_cnt += check_content(url, topic)

    if (float(main_page_topic_cnt)/(float(childs_topic_cnt)+1.0) >= 1.0 or
        float(main_page_topic_cnt)/(float(childs_topic_cnt)+1.0) == 0):
        return True
    else:
        return False
Example #7
 def scrape_restaurant_data(self, example):
     # get this from yelp
     
     html = obtain_html(example["url"])
     
     html.make_links_absolute(example["url"])
     
     title = html.cssselect("h1.biz-page-title")[0].text.strip()
     
     review_highlights = html.cssselect("ul.review-highlights-list")
     if len(review_highlights) > 0:
         description = tree_to_str(clean_up_highlights(review_highlights[0]))
     else:
         description = create_description_highlights(html)
     
     images = html.cssselect("img.photo-box-img")
     image_url = None
     if len(images) > 0:
         image_url   = images[0].attrib["src"]
     
     return {
     "title": title,
     "description": description,
     "categories": example["categories"],
     "image_url" : image_url,
     "rating": rating_to_string(example["rating"]),
     "price": example["price"]
     }
Example #8
def _read_url(url):
	log.info('Reading url %s' % url)
	u = urllib.urlopen(url)
	data = u.read()
	html = lxml.html.fromstring(data)
	html.make_links_absolute(url)
	return data, html
Example #9
def html_malware_scan(url, topic):
    """
    Crawls a page to depth 1 and compares how often `topic` occurs on the
    main page with how often it occurs on the pages it links to.
    """

    response = requests.get(url, timeout=10.0)
    html = lxml.html.fromstring(response.content, base_url=response.url)
    html.make_links_absolute(resolve_base_href=True)

    childs_topic_cnt = 0
    main_page_topic_cnt = response.text.lower().count(topic)

    for url in Bar().iter(
        {link
         for element, attribute, link, pos in html.iterlinks()}):
        #for url in {link for element, attribute, link, pos in html.iterlinks()}:
        childs_topic_cnt += check_content(url, topic)

    if (float(main_page_topic_cnt) / (float(childs_topic_cnt) + 1.0) >= 1.0
            or float(main_page_topic_cnt) /
        (float(childs_topic_cnt) + 1.0) == 0):
        return True
    else:
        return False
Example #10
 def scrape_sub(self, chamber, term, district, sub_url):
     "Scrape basic info for a legislator's substitute."
     with self.urlopen(sub_url) as page:
         html = lxml.html.fromstring(page)
         html.make_links_absolute(sub_url)
         # substitute info div#MAINS35
         div = html.xpath('//div[contains(@id, "MAINS")]')[0]
         leg = {}
         leg['img_url'] = div[0][0].get('src')
         subfor = div[1][0].text.replace(u'\xa0', ' ').replace(': ', '')
         full_name = div[1][2].text.replace(u'\xa0', ' ')
         party = _PARTY[div[1][2].tail.strip()]
         leg['contact_form'] = div[1][3].xpath('string(a/@href)')
         leg = Legislator(term, chamber, district.strip(), full_name, party,
                          **leg)
         leg['roles'][0] = {
             'chamber': chamber,
             'state': self.state,
             'term': term,
             'role': 'substitute',
             'legislator': subfor[subfor.rindex('for'):],
             'district': district.replace('District', '').strip(),
             'party': party,
             'start_date': None,
             'end_date': None
         }
         leg.add_source(sub_url)
         self.save_legislator(leg)
Example #11
def get_realtime_title():
    """Get ALL Category and Source Realtime news from chinatimes
    realtime url may change or invaild when it is not *realtime*
            
    return: dict{category, source, time, title, url}
    """
    
    response, content = h.request(news_list_url)

    html = lxml.html.fromstring(content.decode('big5', 'ignore'))
    html.make_links_absolute(base_url)

    # Get news-list section
    div = html.findall("*div")[1]

    # Get all title-info to list
    tr = list(div.iterdescendants("tr"))[1:]

    result_list = []
    for title_info in tr:
        news_url = list(title_info.iterlinks())[0][2]
        info_list = map(lambda x: x.text_content(), list(title_info))

        info_dict = {"title": info_list[0].strip("\r\n "), "time": info_list[1],
                     "category": info_list[2], "source": info_list[3],
                     "url": news_url}
    
        result_list.append(info_dict)
        
    return result_list
Example #12
def scrape_list_page(response: requests.Response) -> Iterator[str]:

    html = lxml.html.fromstring(response.text)
    html.make_links_absolute(response.url)
    for a in html.cssselect('#listBook > li > a[itemprop="url"]'):
        url = a.get('href')
        yield url
Example #13
def scrape_list_page(response: requests.Response) -> Iterator[str]:
    html = lxml.html.fromstring(response.text)
    html.make_links_absolute(response.url)

    for a in html.cssselect('table.cassetteitem_other > tbody > tr > td:nth-child(9) > a'):
        url = a.get('href')
        yield url
Example #14
def get_oracle(url):
  source = util.get_source(url)
  html = lxml.html.document_fromstring(source)
  html.make_links_absolute(url, resolve_base_href=True)
  util.save_file(lxml.html.tostring(html), 'oracle.html')
  util.screenshot('oracle.html', 'oracle.png')
  return html
Example #15
    def scrape(self, chamber, term):
        """
        Scrapes legislators for the current term only
        """
        self.validate_term(term, latest_only=True)
        url = _BASE_URL % _CHAMBERS[chamber].lower()
        with self.urlopen(url) as index:
            html = lxml.html.fromstring(index)
            html.make_links_absolute(url)
            base_table = html.xpath('body/table/tr/td[2]/table[2]')
            district = None # keep track of district for substitutes
            for row in base_table[0].xpath('tr'):
                img_url = row.xpath('string(.//img/@src)')
                contact_form, additional_info_url = row.xpath('.//a/@href')
                if "Substitute" in row.text_content():
                    # it seems like the sub always follows the person who he/she
                    # is filling in for.
                    # most sub info is provided at the additional info url
                    self.scrape_sub(chamber, term, district, additional_info_url)
                    continue
                else:
                    full_name = " ".join(row[1][0].text_content().replace(u'\xa0', ' ').split())
                    party = _PARTY[row[1][0].tail.strip()]

                pieces = [ x.strip() for x in row.itertext() if x ][6:]

                # the first index will either be a role or the district
                role = None
                if 'District' in pieces[0]:
                    district = pieces.pop(0)
                else:
                    role = pieces.pop(0)
                    district = pieces.pop(0)

                leg = Legislator(term, chamber,
                                 district.replace('District', '').strip(),
                                 full_name,
                                 party=party)
                leg.add_source(url)
                leg['photo_url'] = img_url
                leg['contact_form'] = contact_form
                leg['url'] = additional_info_url
                leg['address'] = pieces.pop(0)

                # at this point 'pieces' still contains phone numbers and profession
                # and committee membership
                # skip committee membership, pick 'em up in IDCommitteeScraper
                end = -1
                if 'Committees:' in pieces:
                    end = pieces.index('Committees:')
                for prop in pieces[:end]:
                    # phone numbers
                    if prop.lower()[0:3] in _PHONE_NUMBERS:
                        leg[ _PHONE_NUMBERS[ prop.lower()[0:3] ] ] = prop
                    # profession
                    else:
                        leg['profession'] = prop

                self.save_legislator(leg)
Example #16
def resp2html(resp, encoding='utf-8'):
    """将request返回的response转换为经lxml解析后的document"""
    text = get_resp_text(resp, encoding=encoding)
    html = lxml.html.fromstring(text)
    html.make_links_absolute(resp.url, resolve_base_href=True)
    # html = cleaner.clean_html(html)
    # lxml.html.open_in_browser(html, encoding=encoding)  # for develop and debug
    return html
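resp2html() pairs naturally with requests; a minimal sketch assuming get_resp_text() simply decodes resp.content with the given encoding, and with a placeholder URL.

# Illustrative use of resp2html(); the URL is a placeholder.
import requests

resp = requests.get('https://example.com/')
doc = resp2html(resp)  # all links in doc are now absolute
links = [a.get('href') for a in doc.cssselect('a[href]')]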
Example #17
def scraperwiki():
    for page_number in itertools.count(1):
        url = 'https://classic.scraperwiki.com/profiles/tlevine/?page=%d' % page_number
        response = get(url)
        html = lxml.html.fromstring(response.text)
        html.make_links_absolute(url)
        for href in html.xpath('//li[@class="code_object_line"]/descendant::h3/a[position()=2]/@href'):
            yield re.sub(r'index.html$', '', str(href))
Example #18
 def scrape_joint_committees(self):
     page = self.get(_JOINT_URL).text
     html = lxml.html.fromstring(page)
     html.make_links_absolute(_JOINT_URL)
     joint_li = html.xpath('//div[contains(h2, "Joint")]/ul/li')
     for li in joint_li:
         name, url = li[0].text, li[0].get('href')
         self.get_joint_committees_data(name, url)
Example #19
def get_oracle(url):
  source = util.get_source(url)
  parser = lxml.etree.HTMLParser()
  etree = lxml.etree.parse(StringIO(source), parser)
  html = lxml.html.document_fromstring(source)
  html.make_links_absolute(url, resolve_base_href=True)
  html.doctype = etree.docinfo.doctype
  return html
Example #20
def get_oracle(url):
    source = util.get_source(url)
    parser = lxml.etree.HTMLParser()
    etree = lxml.etree.parse(StringIO(source), parser)
    html = lxml.html.document_fromstring(source)
    html.make_links_absolute(url, resolve_base_href=True)
    html.doctype = etree.docinfo.doctype
    return html
Example #21
def _load_response(response):
    '''
    In:  python-requests Response
    Out: lxml HTML tree
    '''
    html = lxml.html.fromstring(response.text)
    html.make_links_absolute(response.url)
    return html
Example #22
def europeana():
    url = request.args['url']
    hl = request.args.get('hl', '')
    words = re.findall(r'\w+', hl.lower())
    html = lxml.html.parse(url).getroot()
    html.make_links_absolute()
    highlight_tree(html, words)
    return lxml.html.tostring(html)
Example #23
def get_page(url):
    global fetched_count
    fetched_count += 1
    print 'Fetching... %s %s' % (fetched_count, url)
    doc = lxml.html.parse(urllib.urlopen(url))
    html = doc.getroot()
    html.make_links_absolute(url)
    return html
Example #24
	def get_links(html, base_url, tags = []):
		links = []
		tags = tags
		html = lxml.html.document_fromstring(html)
		html.make_links_absolute(base_url)
		links_html = html.iterlinks()
		links = [ x[2] for x in links_html if x[0].tag in tags ]
		return links
Example #25
 def scrape_joint_committees(self):
     page = self.get(_JOINT_URL).text
     html = lxml.html.fromstring(page)
     html.make_links_absolute(_JOINT_URL)
     joint_li = html.xpath('//div[contains(h2, "Joint")]/ul/li')
     for li in joint_li:
         name, url = li[0].text, li[0].get('href')
         self.get_joint_committees_data(name, url)
Example #26
 def get_links(html, base_url, tags=[]):
     links = []
     tags = tags
     html = lxml.html.document_fromstring(html)
     html.make_links_absolute(base_url)
     links_html = html.iterlinks()
     links = [x[2] for x in links_html if x[0].tag in tags]
     return links
Example #27
def make_tree(url, data, encoding=None):
    """Build lxml tree."""
    s = StringIO(data)
    parser = lxml.html.HTMLParser(encoding=encoding)
    doc = lxml.html.parse(s, parser=parser)
    html = doc.getroot()
    html.make_links_absolute(url)
    return html
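A minimal sketch of driving make_tree() (Python 2 style, matching the StringIO usage above); the URL is a placeholder.

# Illustrative use of make_tree(); the URL is a placeholder.
import urllib2

url = 'http://example.com/'
data = urllib2.urlopen(url).read()
tree = make_tree(url, data, encoding='utf-8')
print len(tree.cssselect('a'))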
Example #28
    def scrape_chamber(self, chamber):
        """
        Scrapes legislators for the current term only
        """
        # self.validate_term(term, latest_only=True)
        url = BASE_URL % CHAMBERS[chamber].lower()
        index = self.get(url).text
        html = lxml.html.fromstring(index)
        html.make_links_absolute(url)

        rows = html.xpath('//div[contains(@class, "row-equal-height")]')

        for row in rows:
            img_url = row.xpath('.//img/@src')[0]

            inner = row.xpath('.//div[@class="vc-column-innner-wrapper"]')[1]
            inner_text = inner.text_content()
            if 'Resigned' in inner_text or 'Substitute' in inner_text:
                continue

            name = inner.xpath('p/strong')[0].text.replace(u'\xa0', ' ').strip()
            name = re.sub(r'\s+', ' ', name)
            party = PARTY[inner.xpath('p/strong')[0].tail.strip()]
            email = inner.xpath('p/strong/a')[0].text
            district = inner.xpath('p/a')[0].text.replace('District ', '')

            person_url = inner.xpath('p/a/@href')[0]
            # skip roles for now
            role = ''
            # for com in inner.xpath('p/a[contains(@href, "committees")]'):
            #     role = com.tail.strip()

            person = Person(name=name, district=district,
                            party=party, primary_org=chamber,
                            image=img_url, role=role)
            phones = get_phones(inner)
            phone = phones.get('home') or phones.get('business')
            office_phone = phones.get('office')
            address = get_address(inner)
            fax = get_fax(inner)
            if address:
                person.add_contact_detail(type='address', value=address,
                                          note='District Office')
            if phone:
                person.add_contact_detail(type='voice', value=phone,
                                          note='District Office')
            if fax:
                person.add_contact_detail(type='fax', value=fax,
                                          note='District Office')
            if email:
                person.add_contact_detail(type='email', value=email,
                                          note='District Office')
            if office_phone:
                person.add_contact_detail(type='voice', value=office_phone,
                                          note='Capitol Office')
            person.add_source(url)
            person.add_link(person_url)
            yield person
Example #29
    def scrape_chamber(self, chamber):
        """
        Scrapes legislators for the current term only
        """
        # self.validate_term(term, latest_only=True)
        url = BASE_URL % CHAMBERS[chamber].lower()
        index = self.get(url, verify=False).text
        html = lxml.html.fromstring(index)
        html.make_links_absolute(url)

        rows = html.xpath('//div[contains(@class, "row-equal-height")]')

        for row in rows:
            img_url = row.xpath('.//img/@src')[0]

            inner = row.xpath('.//div[@class="vc-column-innner-wrapper"]')[1]
            inner_text = inner.text_content()
            if 'Resigned' in inner_text or 'Substitute' in inner_text:
                continue

            name = inner.xpath('p/strong')[0].text.replace(u'\xa0', ' ').strip()
            name = re.sub(r'\s+', ' ', name)
            party = PARTY[inner.xpath('p/strong')[0].tail.strip()]
            email = inner.xpath('p/strong/a')[0].text
            district = inner.xpath('p/a')[0].text.replace('District ', '')

            person_url = inner.xpath('p/a/@href')[0]
            # skip roles for now
            role = ''
            # for com in inner.xpath('p/a[contains(@href, "committees")]'):
            #     role = com.tail.strip()

            person = Person(name=name, district=district,
                            party=party, primary_org=chamber,
                            image=img_url, role=role)
            phones = get_phones(inner)
            phone = phones.get('home') or phones.get('business')
            office_phone = phones.get('office')
            address = get_address(inner)
            fax = get_fax(inner)
            if address:
                person.add_contact_detail(type='address', value=address,
                                          note='District Office')
            if phone:
                person.add_contact_detail(type='voice', value=phone,
                                          note='District Office')
            if fax:
                person.add_contact_detail(type='fax', value=fax,
                                          note='District Office')
            if email:
                person.add_contact_detail(type='email', value=email,
                                          note='District Office')
            if office_phone:
                person.add_contact_detail(type='voice', value=office_phone,
                                          note='Capitol Office')
            person.add_source(url)
            person.add_link(person_url)
            yield person
Example #30
def create_html_parser(row):
    content = get_content(row.value)
    parser = lxml.html.HTMLParser(encoding='utf-8')
    html = lxml.html.document_fromstring(content, parser=parser)
    if 'url' in row.value:
        html.make_links_absolute(row.value['url'])
    elif row.key:
        html.make_links_absolute(row.key)
    return html
Example #31
 def parse_html(self, url=None, response=None, *a, **ka):
     if lxml is None:
         raise ImportError('No module named \'lxml\'')
     if url:
         response = self.get(url, *a, **ka)
     elif response is None:
         raise ValueError('either url or response must be provided')
     html = lxml.html.fromstring(response.text)
     html.make_links_absolute(response.url)
     return html
Example #32
def thomaslevine():
    url = 'http://thomaslevine.com/!/'
    try:
        response = get(url)
    except:
        return []
    else:
        html = lxml.html.fromstring(response.text)
        html.make_links_absolute(url)
        return ((unicode(a.attrib['href']), a.text_content()) for a in html.xpath('//a') if a.attrib['href'].startswith(url))
Example #33
def get_pages():
    response_text = requests.get(BASE_URL).content
    html = lxml.html.fromstring(response_text)
    html.make_links_absolute(BASE_URL)
    links = []
    i = 0
    for link in html.iterlinks():
        links.append(link[2])

    return links
Example #34
def get_html(url):
    """使用get方法访问指定网页并返回经lxml解析后的document"""
    resp = request_get(url)
    text = get_resp_text(resp, encoding='utf-8')
    html = lxml.html.fromstring(text)
    html.make_links_absolute(url, resolve_base_href=True)
    # the cleaner should only be used when a page needs it for debugging (e.g. prestige); otherwise it may itself interfere with debugging (e.g. JavBus)
    # html = cleaner.clean_html(html)
    # lxml.html.open_in_browser(html, encoding=encoding)  # for develop and debug
    return html
Example #35
def scrape_list_page(response: requests.Response):
    """
    Generator function that extracts detail-page URLs from the Response of a list page
    """
    html = lxml.html.fromstring(response.text)
    html.make_links_absolute(response.url)

    for a in html.cssselect('#listBook > li > a[itemprop="url"]'):
        url = a.get('href')
        yield url
Example #36
def scrape(html: str, base_url: str) -> List[dict]:
    books = []
    html = lxml.html.fromstring(html)
    html.make_links_absolute(base_url)
    for a in html.cssselect('#listBook > li > a[itemprop="url"]'):
        url = a.get('href')
        p = a.cssselect('p[itemprop="name"]')[0]
        title = p.text_content()

        books.append({'url': url, 'title': title})
    return books
Example #37
    def addImagePaths(self, body, baseurl):
        """Turn relative paths into absolute paths in section body"""

        # This is a convenience method for use in referencemanual_macros
        # section_collation macro.
        # TODO: when we not longer need 2.1 compatibility, this belongs in
        # a view.

        html = lxml.html.fromstring(safe_unicode(body))
        html.make_links_absolute(baseurl, resolve_base_href=False)
        return lxml.html.tostring(html, encoding='utf-8')
Example #39
def gethtml(url):
    raw = get(url).content.decode('utf-8')
    if raw == '':
        return None
    try:
        html = lxml.html.fromstring(raw)
    except Exception as e:
        print(url)
        raise e
    html.make_links_absolute(url)
    return html
Example #40
def thomaslevine():
    url = 'http://thomaslevine.com/!/'
    try:
        response = get(url)
    except:
        return []
    else:
        html = lxml.html.fromstring(response.text)
        html.make_links_absolute(url)
        return ((unicode(a.attrib['href']), a.text_content())
                for a in html.xpath('//a') if a.attrib['href'].startswith(url))
Example #41
 def useWebsiteSearch (self, url, target):
     try:
         page = urllib2.urlopen (url)
     except (urllib2.URLError, ValueError):
         return None
     s = page.read()
     html = lxml.html.fromstring (s)
     html.make_links_absolute (baseURL)
     html.forms[0].inputs['query'].value = target
     new_page = lxml.html.submit_form(html.forms[0])
     new_s = new_page.read()
     return new_s
Example #42
 def useWebsiteSearch(self, url, target):
     try:
         page = urllib2.urlopen(url)
     except (urllib2.URLError, ValueError):
         return None
     s = page.read()
     html = lxml.html.fromstring(s)
     html.make_links_absolute(baseURL)
     html.forms[0].inputs['query'].value = target
     new_page = lxml.html.submit_form(html.forms[0])
     new_s = new_page.read()
     return new_s
Example #43
def parse_foi(response):
    html = lxml.html.fromstring(response.text)
    html.make_links_absolute(response.url)

    messages = ['\n\n'.join(div.xpath('strong/text()') + div.xpath('p/text()')) \
                for div in html.xpath('id("tabs-request")/div')]
    downloads = list(map(str, html.xpath('id("tabs")//a[text()="Download"]/@href')))

    return OrderedDict([
        ('messages', messages),
        ('downloads', downloads),
    ])
Example #44
def scrape_list_page(response):
    """
    Generator function that extracts detail-page URLs from the Response of a list page
    """
    html = lxml.html.fromstring(response.text)
    # convert links to absolute URLs
    html.make_links_absolute(response.url)

    for a in html.cssselect('#listBook > li > a[itemprop="url"]'):
        url = a.get('href')
        # return each element of the generator iterator with a yield statement
        yield url
Example #45
    def _extract_links(self, response_text, response_url):
        html = lxml.html.fromstring(response_text)
        html.make_links_absolute(response_url)
        for e, a, l, p in html.iterlinks():
            if self.scan_tag(e.tag):
                if self.scan_attr(a):
                    link = Link(self.process_attr(l), text=e.text)
                    self.links.append(link)

        links = unique_list(self.links, key=lambda link: link.url) if self.unique else self.links

        return links
Example #46
def download_councillors():
    with open(WEBPAGESTXT, 'r') as txtfile:
        urls = txtfile.readlines()
    urls = [url.strip() for url in urls]

    session = http.client.HTTPSConnection('www.berlin.de', timeout=10)
    councillors = {}
    for url in urls:
        if councillors:
            time.sleep(2)

        bezirk = bezirk_from_url(url)

        headers = {'Accept-Encoding': 'gzip', 'Connection': 'keep-alive'}
        session.request('GET', url, headers=headers)
        response = session.getresponse()

        response = response.read()
        response = zlib.decompress(response, 47)

        try:
            response = response.decode('latin-1', 'strict')
        except UnicodeDecodeError:
            response = response.decode('windows-1252', 'strict')

        html = lxml.html.fromstring(response)
        html.make_links_absolute(url)

        tablerows = html.cssselect('.zl12')
        tablerows += html.cssselect('.zl11')

        number = html.cssselect('table.tk1:nth-child(8)')[0]
        number = number.text_content()
        _, number = number.split(':')
        number = number.strip()
        if number.isdigit():
            number = int(number)
            if not number == len(tablerows):
                print('%s:' % bezirk,
                      '%s councillors were found.' % len(tablerows),
                      'Should be %s councillors.' % number)

        for row in tablerows:
            councillor = extract_councillor(row)
            councillor['BEZIRK'] = bezirk
            identifier = normalized_name(councillor['ANZEIGENAME'])
            try:
                councillors[bezirk][identifier] = councillor
            except KeyError:
                councillors[bezirk] = {identifier: councillor}
    session.close()
    return councillors
Example #47
def npm():
    url = 'https://www.npmjs.org/~tlevine'
    try:
        response = get(url)
    except:
        sys.stderr.write('Error loading NPM packages\n')
    else:
        html = lxml.html.fromstring(response.text)
        html.make_links_absolute(url)
        for li in html.xpath('id("profile")/ul/li'):
            href = li.xpath('a/@href')[0]
            text = ''.join(li.xpath('text()')).strip()
            yield unicode(href), unicode(text)
Example #48
 def getCandidats (self, s): 
     html = lxml.html.fromstring (s)
     html.make_links_absolute (baseURL)
     #lxml.html.open_in_browser(html)
     names = list()
     links = list()
     for name in html.xpath('//@alt'):
         names.append(name)
         
     for a in html.xpath('//a'):
         if a.attrib.get('title') in names:
             links.append (a.attrib.get('href'))
     return (names, links)
Example #49
def npm():
    url = 'https://www.npmjs.org/~tlevine'
    try:
        response = get(url)
    except:
        sys.stderr.write('Error loading NPM packages\n')
    else:
        html = lxml.html.fromstring(response.text)
        html.make_links_absolute(url)
        for li in html.xpath('id("profile")/ul/li'):
            href = li.xpath('a/@href')[0]
            text = ''.join(li.xpath('text()')).strip()
            yield unicode(href), unicode(text)
Example #50
    def extract_links_from_html(base, body):
        try:
            html = lxml.html.fromstring(body)
            html.make_links_absolute(base)

            for element, attribute, link, pos in html.iterlinks():
                if isinstance(link, str):
                    link = link.encode('utf-8', 'ignore')
                yield link

        except Exception:
            logging.warning("(lxml) html parse error")
            import traceback; traceback.print_exc()
Example #51
def bootstrap_data():
	data = urllib.urlopen(DATA_URL).read()
	#open("localcache", 'wb').write(data)
	print '='*20
	#data = open("localcache").read()
	html = lxml.html.fromstring(data)
	html.make_links_absolute(DATA_URL)
	if len(html.xpath('//td[@class="maintext"]/ul/li/a')) != 15:
		logging.error('Something changed/new? %s elements on first page.' % len(html.xpath('//td[@class="maintext"]/ul/li/a')))
	for a in html.xpath('//td[@class="maintext"]/ul/li/a'):
		if handlers.get(a.text):
			handlers.get(a.text)(a.attrib['href'])
		else:
			print [a.attrib['href'], a.text]
Example #52
    def _extract_links(self, response_text, response_url):
        html = lxml.html.fromstring(response_text)
        html.make_links_absolute(response_url)
        sel = pyquery.PyQuery(html)

        evt_links = sel('.news > li:not(.more) > a')
        ann_links = sel('.announcement > li:not(.more) > a')

        all_links = [
                Link(elem.attrib['href'], text=elem.text)
                for elem in itertools.chain(evt_links, ann_links)
                ]

        return unique(all_links, key=lambda link: link.url)
Example #53
def scraperwiki(url = 'https://classic.scraperwiki.com/profiles/tlevine/index.html'):
    try:
        response = get(url)
    except:
        pass
    else:
        html = lxml.html.fromstring(response.text)
        html.make_links_absolute(url)
        for href in html.xpath('//li[@class="code_object_line"]/descendant::h3/a[position()=2]/@href'):
            yield re.sub(r'index.html$', '', unicode(href)), 'A web scraper'
        nexts = html.xpath(u'//a[text()="Next »"]/@href')
        if nexts != []:
            for scraper in scraperwiki(nexts[0]):
                yield scraper
Example #54
    def get_uri(self, url, html):
        if url is not None and html is not None:
            print(url)
            parsed_uri = urlparse(url)
            domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
            html.make_links_absolute(url)
            for l in html.iterlinks():
                parsed_uri = urlparse(l[2])
                curr_domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
                if curr_domain == domain:
                    if l[2] not in self.urls:
                        self.pool.put(l[2])

                    self.urls.add(l[2])