Exemplo n.º 1
0
def busca_link():
    """Fetch www.google.com and print the href of every anchor tag found."""
    response = requests.get("http://www.google.com")
    # The original also parsed response.text a second time into an unused
    # `html` variable; one BeautifulSoup parse is sufficient.
    soup = BeautifulSoup(response.text, "lxml")

    print("Links")
    for link in soup.find_all("a"):
        # link.get("href") is None for anchors without an href attribute.
        print(link.get("href"))
Exemplo n.º 2
0
    def process(self, document_field, extract_dict, **kwargs):
        """Apply XPath-based extraction to this document.

        Parameters
        ----------
        document_field : str
            Raw document text to be parsed.
        extract_dict : dict or str
            Mapping of field names to XPath expressions. A string
            representation of such a dict is converted first.

        Returns
        -------
        dict
            The fields extracted by ``parse_dict``.

        Raises
        ------
        Exception
            Whatever ``parser`` raised when the document cannot be parsed
            even after utf-8 re-encoding.
        """
        # extract_dict may arrive serialized as a string; normalize it.
        if isinstance(extract_dict, str):
            extract_dict = self._parse_strings_that_contain_dicts(extract_dict)
        try:
            parsed_document = parser(document_field)
        except Exception:
            # Some documents only parse after an explicit utf-8 re-encode.
            logger.info("attempting to process after utf-8 encoding")
            try:
                parsed_document = parser(
                    document_field.encode("utf-8", "replace"))
            except Exception:
                logger.warning(
                    "failed to parse document! using xpath_parser, dict={extract_dict}"
                    .format(**locals()))
                # The original fell through here with parsed_document unbound
                # and crashed with a misleading NameError below; surface the
                # real parser error instead.
                raise
        parsed_fields = parse_dict(parsed_document, extract_dict)

        return parsed_fields
Exemplo n.º 3
0
    def handle_page_unit (self, unit):
        """Fetch and parse the page for *unit*; return its extracted elements.

        Returns an empty tuple when the page is unavailable (HTTPError).
        """
        url = self.page_counter.construct_url(unit)

        try:
            return self.get_elements(parser( self.get_page_content(url) ))
        except HTTPError as e:
            stderr.write(
                "!!! Page '{0}' is unavailable [{1.code}]".format(url, e)
            )
        # NOTE(review): the original checked `if page is None` here, but
        # `page` was never assigned, so every HTTPError path died with a
        # NameError before reaching the fallback return. The unavailable-page
        # message above already reports the failure.
        return tuple()
Exemplo n.º 4
0
 def parse_page (self, url, results):
     """Fetch *url*, parse it, and append the result to *results*.

     Retries up to ``self.tries`` times; the for/else message fires only
     when every attempt failed.
     """
     for attempt in xrange(self.tries):
         try:
             results.append(self.start_parse_page(
                 url,
                 parser(self.get_page_content(url))
             ))
             # Success: stop retrying. Without this break the same page was
             # parsed self.tries times and the for/else clause below printed
             # the failure message even on success.
             break
         except ValueError: # if self.get_page_content returns None and parser dies
             pass
         except (HTTP404, IndexError): # IndexError raised by lxml.etree in order if parsing was unsuccessful
             print('Some problems with {0}. Please check'.format(url))
     else:
         # Reached only when no attempt broke out of the loop, i.e. every
         # one of self.tries attempts failed.
         print('Some problems with {0}. Please check'.format(url))