Example #1
 def test_get_node_text(self):
     elem = self.lxml_tree.xpath('//div[@id="bee"]')[0]
     self.assertEqual(get_node_text(elem),
                      u'пчела mozilla = 777; body { color: green; }')
     self.assertEqual(get_node_text(elem, smart=True), u'пче ла')
     elem = self.lxml_tree.xpath('//div[@id="fly"]')[0]
     self.assertEqual(get_node_text(elem), u'му ха')
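The assertions above show the two modes of get_node_text: the default mode keeps all descendant text, including the contents of script and style tags, while smart=True drops script/style text and separates text at tag boundaries with spaces. A minimal sketch of the same behavior, assuming get_node_text is importable from weblib.etree (the module shown in example #10); the HTML fixture here is a guess reconstructed from the assertions:

from lxml.html import fromstring

from weblib.etree import get_node_text  # assumed import path

html = ('<div id="bee">пче<br/>ла'
        '<script>mozilla = 777;</script>'
        '<style>body { color: green; }</style></div>')
elem = fromstring(html)

# default mode: every text node, script/style included
print(get_node_text(elem))              # per the test: пчела mozilla = 777; ...
# smart mode: script/style dropped, space inserted at tag boundaries
print(get_node_text(elem, smart=True))  # per the test: пче ла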
Example #2
File: deprecated.py Project: abaelhe/grab
    def strip_tags(self, content, smart=False):
        """
        Strip tags from the HTML content.
        """
        from lxml.html import fromstring

        return get_node_text(fromstring(content), smart=smart)
Example #3
File: deprecated.py Project: abael/grab
    def strip_tags(self, content, smart=False):
        """
        Strip tags from the HTML content.
        """
        from lxml.html import fromstring

        return get_node_text(fromstring(content), smart=smart)
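Examples #2 and #3 are the same deprecated Grab mixin method, taken from two forks of the project. Since the method only delegates to get_node_text, its behavior can be sketched standalone, again assuming the weblib.etree import path:

from lxml.html import fromstring

from weblib.etree import get_node_text  # assumed import path


def strip_tags(content, smart=False):
    # standalone equivalent of the deprecated mixin method above
    return get_node_text(fromstring(content), smart=smart)


print(strip_tags('<p>Hello <b>world</b>!</p>'))  # expected: Hello world!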
Example #4
 def text(self, smart=False, normalize_space=True):
     if self.is_text_node():
         if normalize_space:
             return normalize_space_func(self.node())
         else:
             return self.node()
     else:
         return get_node_text(self.node(), smart=smart,
                              normalize_space=normalize_space)
Example #5
 def text(self, smart=False, normalize_space=True):
     if self.is_text_node():
         if normalize_space:
             return normalize_space_func(self.node())
         else:
             return self.node()
     else:
         return get_node_text(self.node(),
                              smart=smart,
                              normalize_space=normalize_space)
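Examples #4 and #5 are the same selector method, differing only in line wrapping: a bare text node is returned as-is (optionally whitespace-normalized), while an element node is handed off to get_node_text. This is the method behind calls like grab.doc('//title').text() in example #11. A hypothetical session, assuming the grab library is installed and the URL is reachable:

from grab import Grab

g = Grab()
g.go('http://example.com/')                          # any HTML page

title = g.doc('//title').text()                      # normalized by default
raw = g.doc('//title').text(normalize_space=False)   # keep raw whitespace
body = g.doc('//body').text(smart=True)              # smart extraction
print(title, len(body))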
Example #6
File: deprecated.py Project: abael/grab
    def css_text(self, path, default=NULL, smart=False, normalize_space=True):
        """
        Get normalized text of node which matches the css path.
        """

        try:
            return get_node_text(self.css_one(path), smart=smart,
                                 normalize_space=normalize_space)
        except IndexError:
            if default is NULL:
                raise
            else:
                return default
Example #7
    def css_text(self, path, default=NULL, smart=False, normalize_space=True):
        """
        Get normalized text of node which matches the css path.
        """

        try:
            return get_node_text(self.css_one(path), smart=smart,
                                 normalize_space=normalize_space)
        except IndexError:
            if default is NULL:
                raise
            else:
                return default
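The NULL default here is a sentinel object rather than None, so callers can legitimately pass default=None and still get it back; only the no-default case re-raises the IndexError from css_one. A standalone sketch of this sentinel pattern (the names below are illustrative, not from the library):

NULL = object()  # unique marker: distinguishes "no default" from default=None


def first_or_default(items, default=NULL):
    try:
        return items[0]
    except IndexError:
        if default is NULL:
            raise        # no default supplied: propagate the error
        return default


print(first_or_default(['a']))           # a
print(first_or_default([], default=''))  # '' (empty string default)
first_or_default([])                     # raises IndexError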
Example #8
def parse_search_results(grab, parse_index_size=False, strict_query=False):
    """
    Parse yandex search results page content.
    """

    if is_banned(grab):
        raise CaptchaError('Captcha found')

    elif grab.xpath_exists('//div[contains(@class, "b-error")]'):
        err_msg = grab.xpath_text('//div[contains(@class, "b-error")]')
        logging.debug('Found error message: %s' % err_msg)
        return []
    elif grab.xpath_exists('//ol[contains(@class, "b-serp-list")]'):
        # TODO: when strict_query is enabled, detect the "no results"
        # message (grab.search(u'Нет результатов для') or
        # grab.search(u'No results found for')) and log that the query
        # was modified
        # TODO: parse_index_size
        results = []

        page_num = int(
            grab.xpath_text('//b[contains(@class, "b-pager__current")]'))

        for elem in grab.xpath_list('//li[contains(@class, "b-serp-item")]'):
            try:
                try:
                    title_elem = elem.xpath('.//h2/a')[0]
                    snippet = get_node_text(elem.xpath(
                        './/div[contains(@class, "b-serp-item__text")]')[0])
                except IndexError:
                    # this is video item or something like that
                    pass
                else:
                    item = {
                        'page': page_num,
                    }

                    # url
                    item['url'] = title_elem.get('href')
                    # if url.startswith('/url?'):
                    #     url = url.split('?q=')[1].split('&')[0]
                    #     url = unquote_plus(url)

                    item['position'] = int(
                        elem.xpath(
                            './/h2/b[contains(@class, "b-serp-item__number")]/text()'
                        )[0])

                    # title
                    item['title'] = get_node_text(title_elem)

                    item['snippet'] = snippet

                    results.append(item)
            except Exception as ex:
                logging.error('Failed to parse serp item', exc_info=ex)

        return results
    else:
        raise ParsingError('Could not identify yandex page format')
Example #9
def parse_search_results(grab, parse_index_size=False, strict_query=False):
    """
    Parse google search results page content.
    """

    #elif grab.search(u'please type the characters below'):
    if grab.response.code == 403:
        raise AccessDenied('Access denied (HTTP 403)')
    elif grab.search(u'src="/sorry/image'):
        # Captcha!!!
        raise CaptchaFound('Captcha found')

    elif grab.css_exists('#ires'):
        if strict_query and (grab.search(u'Нет результатов для') or
                             grab.search(u'No results found for')):
            logging.debug('Query modified')
        else:
            if len(grab.css_list('#ires h3')):

                # Something was found
                if parse_index_size:
                    # note: this only works if the caller passed a callable
                    # as `parse_index_size`; a bare boolean True would fail
                    # here, since the flag shadows any same-named helper
                    index_size = parse_index_size(grab)
                else:
                    index_size = None

                # Yield found results
                results = []

                for elem in grab.xpath_list('//*[h3[@class="r"]/a]'):
                    title_elem = elem.xpath('h3/a')[0]

                    # url
                    url = title_elem.get('href')
                    if url.startswith('/url?'):
                        url = url.split('?q=')[1].split('&')[0]
                        url = unquote_plus(url)

                    # title
                    title = get_node_text(title_elem)

                    # snippet
                    # Google could offer two type of snippet format: simple and extended
                    # It depends on user agent
                    # For <IE8, Opera, <FF3 you probably get simple format
                    try:
                        snippet_node = elem.xpath('div[@class="s"]')[0]
                    except IndexError:
                        # Probably it is video or some other result
                        # Such result type is not supported yet
                        continue

                    try:
                        subnode = snippet_node.xpath('span[@class="st"]')[0]
                        snippet = get_node_text(subnode, smart=False)
                        extended_result = True
                    except IndexError:
                        drop_node(snippet_node, 'div')
                        drop_node(snippet_node, 'span[@class="f"]')
                        snippet = get_node_text(snippet_node, smart=False)
                        extended_result = False

                    # filetype
                    try:
                        filetype = elem.xpath(
                            './/span[contains(@class, "xsm")]/text()'
                        )[0].lower().strip('[]')
                    except IndexError:
                        filetype = None

                    #if 'File Format':
                    if url:
                        results.append({
                            'url': url,
                            'title': title,
                            'snippet': snippet,
                            'filetype': filetype,
                            'index_size': index_size,
                            'extended': extended_result,
                        })
                return results
            else:
                pass
                #return []
    elif grab.css_exists('#res'):
        # Could be search results here?
        # or just message "nothing was found"?
        pass
    else:
        raise ParsingError('Could not identify google page format')
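The '/url?...' branch above unwraps Google's redirect links by slicing out the q= parameter and percent-decoding it. Isolated, the transformation is plain standard-library Python (urllib.parse on Python 3; on Python 2 unquote_plus lives in urllib):

from urllib.parse import unquote_plus

url = '/url?q=http%3A%2F%2Fexample.com%2Fpage&sa=U&ved=abc'
if url.startswith('/url?'):
    url = url.split('?q=')[1].split('&')[0]  # keep only the q= payload
    url = unquote_plus(url)                  # decode %-escapes and '+'
print(url)  # http://example.com/page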
Example #10
File: etree.py Project: abael/weblib
 def test_get_node_text(self):
     elem = self.lxml_tree.xpath('//div[@id="bee"]')[0]
     self.assertEqual(get_node_text(elem), u'пчела mozilla = 777; body { color: green; }')
     self.assertEqual(get_node_text(elem, smart=True), u'пче ла')
     elem = self.lxml_tree.xpath('//div[@id="fly"]')[0]
     self.assertEqual(get_node_text(elem), u'му ха')
Example #11
    def task_html(self, grab, task):
        """ Process index page for each blog to extract
        valuable data

        """
        logger.info("Get blog: {}".format(grab.response.url))
        # find nodes with non-empty text; materialize the list so that
        # len() keeps working under Python 3, where filter()/map()
        # return lazy iterators
        nodes_with_text = [
            text.strip()
            for text in grab.doc('//body//*').text_list()
            if text.strip()
        ]

        # extract text from the document
        # excluding script and style tags
        doc_text = get_node_text(
            grab.doc('//body').node(),
            smart=True
        ).encode('utf-8', 'ignore')

        page_params = {
            'doc_size_bytes': sys.getsizeof(grab.response.body),
            'doc_size_chars': len(grab.response.body),
            'nodes_with_text': len(nodes_with_text),
            'content_text_chars': len(doc_text),
            'content_text_bytes': sys.getsizeof(doc_text),
        }

        # extract links to scripts and css
        # used on page
        scripts = grab.doc(
            '//script[@src]').attr_list('src')
        css = grab.doc(
            '//link[@rel="stylesheet"]').attr_list('href')

        page_imports = {
            'scripts': [src for src in scripts if src],
            'css': [href for href in css if href],
        }

        # extract meta attributes
        page_meta = {
            'title': grab.doc('//title').text().lower(),
            'description': grab.doc(
                '//meta[contains(@name, "description")]'
            ).attr('content', default=u"").lower(),
            'keywords': grab.doc(
                '//meta[contains(@name, "keywords")]'
            ).attr('content', default=u"").lower(),
            'charset': grab.doc('//meta[@charset]').attr(
                'charset', default=u"").lower()
        }

        data = {
            'blog': task.blog,
            'source_url': grab.response.url,
            'params': page_params,
            'imports': page_imports,
            'meta': page_meta,
        }
        self.save_blog(data)

        # get the rss feed for this blog
        # we clone the grab object so the request looks as if the user
        # had opened the blog page and clicked the rss link
        g = grab.clone()
        g.setup(url=task.blog['rss'])
        yield Task('rss', grab=g, data=data)
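A note on the page_params fields above: sys.getsizeof() reports the size of the Python object itself (payload plus object header), not the payload length, so doc_size_bytes is always somewhat larger than len(grab.response.body). A quick runnable check:

import sys

body = b'<html></html>'
print(len(body))            # 13: payload length in bytes
print(sys.getsizeof(body))  # larger: includes the bytes object header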
Example #12
def parse_search_results(grab, parse_index_size=False, strict_query=False):
    """
    Parse yandex search results page content.
    """

    if is_banned(grab):
        raise CaptchaError('Captcha found')

    elif grab.xpath_exists('//div[contains(@class, "b-error")]'):
        err_msg = grab.xpath_text('//div[contains(@class, "b-error")]')
        logging.debug('Found error message: %s' % err_msg)
        return []
    elif grab.xpath_exists('//ol[contains(@class, "b-serp-list")]'):
        # TODO: when strict_query is enabled, detect the "no results"
        # message (grab.search(u'Нет результатов для') or
        # grab.search(u'No results found for')) and log that the query
        # was modified
        # TODO: parse_index_size
        results = []

        page_num = int(grab.xpath_text('//b[contains(@class, "b-pager__current")]'))

        for elem in grab.xpath_list('//li[contains(@class, "b-serp-item")]'):
            try:
                try:
                    title_elem = elem.xpath('.//h2/a')[0]
                    snippet = get_node_text(
                        elem.xpath('.//div[contains(@class, "b-serp-item__text")]')[0])
                except IndexError:
                    # this is video item or something like that
                    pass
                else:
                    item = {
                        'page': page_num,
                    }

                    # url
                    item['url'] = title_elem.get('href')
                    # if url.startswith('/url?'):
                    #     url = url.split('?q=')[1].split('&')[0]
                    #     url = unquote_plus(url)

                    item['position'] = int(elem.xpath(
                        './/h2/b[contains(@class, "b-serp-item__number")]/text()')[0])

                    # title
                    item['title'] = get_node_text(title_elem)

                    item['snippet'] = snippet

                    results.append(item)
            except Exception as ex:
                logging.error('Failed to parse serp item', exc_info=ex)

        return results
    else:
        raise ParsingError('Could not identify yandex page format')