Пример #1
0
 def test_get_node_text(self):
     # Default extraction keeps script/style payload; smart mode strips it
     # and inserts spaces at tag boundaries.
     bee = self.lxml_tree.xpath('//div[@id="bee"]')[0]
     self.assertEqual(get_node_text(bee),
                      u'пчела mozilla = 777; body { color: green; }')
     self.assertEqual(get_node_text(bee, smart=True), u'пче ла')
     fly = self.lxml_tree.xpath('//div[@id="fly"]')[0]
     self.assertEqual(get_node_text(fly), u'му ха')
Пример #2
0
	def task_yarpage(self, grab, task):
		"""Parse one Yandex realty SERP page and save every result row to CSV.

		Bug fix: the save call was previously outside the item loop, so only
		the LAST row of each page was ever written; it now runs once per row.
		The dead trailing `grab.url.split('=page')` (result discarded) was
		removed.
		"""
		css_path = "div.b-serp-item__inner"
		for elem in grab.css_list(css_path):
			row = {}
			for child in elem.iterchildren():
				# .get() instead of ['class']: a classless child no longer
				# raises KeyError and aborts the whole task.
				css_class = child.attrib.get('class')
				if css_class == 'b-serp-item__price':
					row['price'] = find_node_number(child, ignore_spaces=True)
				elif css_class == 'b-serp-item__header':
					row['header'] = get_node_text(child)
					# first child of the header is the <a> with the item link
					row['link'] = list(child.iterchildren())[0].get('href')
				elif css_class == 'b-serp-item__about':
					row['about'] = get_node_text(child)
				elif css_class == 'b-serp-item__address':
					# second child holds the address text; the subway name is
					# nested in its first child element
					addr_node = list(child.iterchildren())[1]
					row['adress'] = addr_node.text
					row['subway'] = list(addr_node.iterchildren())[0].text
				elif css_class == 'b-serp-item__owner':
					row['owner'] = get_node_text(child)
			row['time'] = int(time.time())
			self.csvfilesaver.save(listrow(row))
Пример #3
0
def parse_search_results(grab, parse_index_size=False, strict_query=False):
    """
    Parse yandex search results page content.

    Returns a list of result dicts (page, url, position, title, snippet),
    an empty list when yandex served an error block, or None when the
    page layout is not recognized.

    Raises:
        CaptchaError: when the page is a captcha challenge.
    """
    if is_banned(grab):
        raise CaptchaError('Captcha found')

    elif grab.xpath_exists('//div[contains(@class, "b-error")]'):
        err_msg = grab.xpath_text('//div[contains(@class, "b-error")]')
        logging.debug('Found error message: %s' % err_msg)
        return []
    elif grab.xpath_exists('//ol[contains(@class, "b-serp-list")]'):
        # TODO: handle strict_query ("no results for ..." pages)
        # TODO: parse_index_size
        results = []

        page_num = int(grab.xpath_text('//b[contains(@class, "b-pager__current")]'))

        for elem in grab.xpath_list('//li[contains(@class, "b-serp-item")]'):
            try:
                try:
                    title_elem = elem.xpath('.//h2/a')[0]
                    snippet = get_node_text(
                        elem.xpath('.//div[contains(@class, "b-serp-item__text")]')[0])
                except IndexError:
                    # this is video item or something like that
                    pass
                else:
                    item = {
                        'page': page_num,
                        'url': title_elem.get('href'),
                        'position': int(elem.xpath(
                            './/h2/b[contains(@class, "b-serp-item__number")]/text()')[0]),
                        'title': get_node_text(title_elem),
                        'snippet': snippet,
                    }
                    results.append(item)
            # was the Python-2-only "except Exception, ex" form
            except Exception as ex:
                logging.error('', exc_info=ex)

        return results
Пример #4
0
    def strip_tags(self, content, smart=False):
        """
        Strip tags from the HTML content.
        """
        from lxml.html import fromstring

        tree = fromstring(content)
        return get_node_text(tree, smart=smart)
Пример #5
0
    def strip_tags(self, content, smart=False):
        """
        Return the text of *content* with all HTML markup removed.
        """
        # local import keeps lxml an optional dependency of this module
        from lxml.html import fromstring

        root = fromstring(content)
        return get_node_text(root, smart=smart)
Пример #6
0
 def text(self, smart=False, normalize_space=True):
     """Return the node's text, whitespace-normalized unless disabled."""
     node = self.node
     # Element nodes delegate to get_node_text; bare strings are handled here.
     if not isinstance(node, basestring):
         return get_node_text(node, smart=smart,
                              normalize_space=normalize_space)
     return normalize_space_func(node) if normalize_space else node
Пример #7
0
 def text(self, smart=False, normalize_space=True):
     """Extract text from the wrapped node (or return the raw string)."""
     node = self.node
     if isinstance(node, basestring):
         # plain string payload: optionally collapse whitespace
         if not normalize_space:
             return node
         return normalize_space_func(node)
     return get_node_text(node, smart=smart, normalize_space=normalize_space)
Пример #8
0
    def css_text(self, path, default=NULL, smart=False, normalize_space=True):
        """
        Get normalized text of node which matches the css path.
        """
        try:
            node = self.css_one(path)
            return get_node_text(node, smart=smart,
                                 normalize_space=normalize_space)
        except IndexError:
            # no match: propagate unless a fallback was supplied
            if default is NULL:
                raise
            return default
Пример #9
0
    def css_text(self, path, default=NULL, smart=False, normalize_space=True):
        """
        Return whitespace-normalized text of the first node matching *path*,
        falling back to *default* (when given) if nothing matches.
        """
        try:
            return get_node_text(
                self.css_one(path),
                smart=smart,
                normalize_space=normalize_space,
            )
        except IndexError:
            if default is not NULL:
                return default
            raise
Пример #10
0
def parse_search_results(grab, parse_index_size=False, strict_query=False):
    """
    Parse yandex search results page content.

    Returns a list of result dicts (page, url, position, title, snippet)
    or an empty list when yandex served an error block.

    Raises:
        CaptchaError: when the page is a captcha challenge.
        ParsingError: when the page layout is not recognized.
    """
    if is_banned(grab):
        raise CaptchaError('Captcha found')

    elif grab.xpath_exists('//div[contains(@class, "b-error")]'):
        err_msg = grab.xpath_text('//div[contains(@class, "b-error")]')
        logging.debug('Found error message: %s' % err_msg)
        return []
    elif grab.xpath_exists('//ol[contains(@class, "b-serp-list")]'):
        # TODO: handle strict_query ("no results for ..." pages)
        # TODO: parse_index_size
        # (the duplicate "results = []" assignment was removed)
        results = []

        page_num = int(
            grab.xpath_text('//b[contains(@class, "b-pager__current")]'))

        for elem in grab.xpath_list('//li[contains(@class, "b-serp-item")]'):
            try:
                try:
                    title_elem = elem.xpath('.//h2/a')[0]
                    snippet = get_node_text(
                        elem.xpath(
                            './/div[contains(@class, "b-serp-item__text")]')
                        [0])
                except IndexError:
                    # this is video item or something like that
                    pass
                else:
                    item = {
                        'page': page_num,
                        'url': title_elem.get('href'),
                        'position': int(
                            elem.xpath(
                                './/h2/b[contains(@class, "b-serp-item__number")]/text()'
                            )[0]),
                        'title': get_node_text(title_elem),
                        'snippet': snippet,
                    }
                    results.append(item)
            except Exception as ex:
                # keep scraping the remaining items even if one is malformed
                logging.error('', exc_info=ex)

        return results
    else:
        print('parsing error')
        raise ParsingError('Could not identify yandex page format')
Пример #11
0
def parse_search_results(grab, parse_index_size=False, strict_query=False):
    """
    Parse google search results page content.

    Returns a list of result dicts (url, title, snippet, filetype,
    index_size, extended), or None when the page carried no parseable
    results block.

    Raises:
        AccessDenied: on HTTP 403.
        CaptchaFound: when google served a captcha challenge.
        ParsingError: when the page layout is not recognized.
    """
    if grab.response.code == 403:
        raise AccessDenied('Access denied (HTTP 403)')
    elif grab.search(u'src="/sorry/image'):
        # Captcha!!!
        raise CaptchaFound('Captcha found')

    elif grab.css_exists('#ires'):
        # Bug fix: the condition used to parse as
        # "(strict_query and RU) or EN", so a NON-strict query whose page
        # matched the English message was wrongly treated as modified.
        if strict_query and (grab.search(u'Нет результатов для')
                             or grab.search(u'No results found for')):
            logging.debug('Query modified')
        else:
            if len(grab.css_list('#ires h3')):
                # Something was found
                # NOTE(review): parse_index_size is a boolean parameter that
                # shadows a parser function of the same name; when truthy this
                # call raises TypeError — confirm the intended function.
                if parse_index_size:
                    index_size = parse_index_size(grab)
                else:
                    index_size = None

                # Yield found results
                results = []

                for elem in grab.xpath_list('//*[h3[@class="r"]/a]'):
                    title_elem = elem.xpath('h3/a')[0]

                    # url: unwrap google's /url? redirect wrapper
                    url = title_elem.get('href')
                    if url.startswith('/url?'):
                        url = url.split('?q=')[1].split('&')[0]
                        url = unquote_plus(url)

                    # title
                    title = get_node_text(title_elem)

                    # snippet
                    # Google could offer two type of snippet format: simple and extended
                    # It depends on user agent
                    # For <IE8, Opera, <FF3 you probably get simple format
                    try:
                        snippet_node = elem.xpath('div[@class="s"]')[0]
                    except IndexError:
                        # Probably it is video or some other result
                        # Such result type is not supported yet
                        continue

                    try:
                        subnode = snippet_node.xpath('span[@class="st"]')[0]
                        snippet = get_node_text(subnode, smart=False)
                        extended_result = True
                    except IndexError:
                        drop_node(snippet_node, 'div')
                        drop_node(snippet_node, 'span[@class="f"]')
                        snippet = get_node_text(snippet_node, smart=False)
                        extended_result = False

                    # filetype marker like "[PDF]"
                    try:
                        filetype = elem.xpath('.//span[contains(@class, "xsm")]'
                                              '/text()')[0].lower().strip('[]')
                    except IndexError:
                        filetype = None

                    if url:
                        results.append({
                            'url': url,
                            'title': title,
                            'snippet': snippet,
                            'filetype': filetype,
                            'index_size': index_size,
                            'extended': extended_result,
                        })
                return results
    elif grab.css_exists('#res'):
        # Could be search results here?
        # or just message "nothing was found"?
        pass
    else:
        raise ParsingError('Could not identify google page format')
Пример #12
0
query='habrahabr'
#file_out='out1.html'

go_url = golib._url(query,page=1)
g = Grab()
#g.go(go_url, log_file=file_out)
g.go(go_url)

print g.css_text('#resultStats')
print '---'
f = open('data_file.txt','a')
for elem in g.xpath_list('//div [@id="ires"]'):
    for i in range(len(elem.xpath('//span [@class="f nsa"]'))):
        title_elem = elem.xpath('//span [@class="f nsa"]')[i]
        data = get_node_text(title_elem, smart=False)
        #if (data[:1]!='1'):
        print 'out->',data
        f.write(data+'\n')

f.close()

print '---'
#print g.xpath_list('//span [@class="f nsa"]')




#print g.css_text('#search')
#print g.xpath_text('//h2[@class="hd"]')
#print g.xpath_text('//*[h3[@class="r"]/a]')
Пример #13
0
def parse_search_results(grab, parse_index_size=False, strict_query=False):
    """
    Parse google search results page content.

    Returns a list of result dicts (url, title, snippet, filetype,
    index_size, extended), or None when the page carried no parseable
    results block.

    Raises:
        AccessDenied: on HTTP 403.
        CaptchaFound: when google served a captcha challenge.
        ParsingError: when the page layout is not recognized.
    """
    if grab.response.code == 403:
        raise AccessDenied('Access denied (HTTP 403)')
    elif grab.search(u'src="/sorry/image'):
        # Captcha!!!
        raise CaptchaFound('Captcha found')

    elif grab.css_exists('#ires'):
        # Bug fix: the condition used to parse as
        # "(strict_query and RU) or EN", so a NON-strict query whose page
        # matched the English message was wrongly treated as modified.
        if strict_query and (grab.search(u'Нет результатов для')
                             or grab.search(u'No results found for')):
            logging.debug('Query modified')
        else:
            if len(grab.css_list('#ires h3')):
                # Something was found
                # NOTE(review): parse_index_size is a boolean parameter that
                # shadows a parser function of the same name; when truthy this
                # call raises TypeError — confirm the intended function.
                if parse_index_size:
                    index_size = parse_index_size(grab)
                else:
                    index_size = None

                # Yield found results
                results = []

                for elem in grab.xpath_list('//*[h3[@class="r"]/a]'):
                    title_elem = elem.xpath('h3/a')[0]

                    # url: unwrap google's /url? redirect wrapper
                    url = title_elem.get('href')
                    if url.startswith('/url?'):
                        url = url.split('?q=')[1].split('&')[0]
                        url = unquote_plus(url)

                    # title
                    title = get_node_text(title_elem)

                    # snippet
                    # Google could offer two type of snippet format: simple and extended
                    # It depends on user agent
                    # For <IE8, Opera, <FF3 you probably get simple format
                    try:
                        snippet_node = elem.xpath('div[@class="s"]')[0]
                    except IndexError:
                        # Probably it is video or some other result
                        # Such result type is not supported yet
                        continue

                    try:
                        subnode = snippet_node.xpath('span[@class="st"]')[0]
                        snippet = get_node_text(subnode, smart=False)
                        extended_result = True
                    except IndexError:
                        drop_node(snippet_node, 'div')
                        drop_node(snippet_node, 'span[@class="f"]')
                        snippet = get_node_text(snippet_node, smart=False)
                        extended_result = False

                    # filetype marker like "[PDF]"
                    try:
                        filetype = elem.xpath('.//span[contains(@class, "xsm")]'
                                              '/text()')[0].lower().strip('[]')
                    except IndexError:
                        filetype = None

                    if url:
                        results.append({
                            'url': url,
                            'title': title,
                            'snippet': snippet,
                            'filetype': filetype,
                            'index_size': index_size,
                            'extended': extended_result,
                        })
                return results
    elif grab.css_exists('#res'):
        # Could be search results here?
        # or just message "nothing was found"?
        pass
    else:
        raise ParsingError('Could not identify google page format')
Пример #14
0
def parse_search_results(grab, parse_index_size=False, strict_query=False):
    """
    Parse google search results page content.

    Returns a list of result dicts (url, title, snippet, filetype,
    index_size, extended), or None when the page carried no parseable
    results block.

    Raises:
        CaptchaError: when google served a captcha challenge.
        ParsingError: when the page layout is not recognized.
    """
    if grab.search(u'src="/sorry/image'):

        # Captcha!!!
        raise CaptchaError("Captcha found")

    elif grab.css_exists("#ires"):
        if strict_query and (grab.search(u"Нет результатов для") or grab.search(u"No results found for")):
            # (dead "pass" removed — the debug line always ran anyway)
            logging.debug("Query modified")
        else:
            if len(grab.css_list("#ires h3")):

                # Something was found
                # NOTE(review): parse_index_size is a boolean parameter that
                # shadows a parser function of the same name; when truthy this
                # call raises TypeError — confirm the intended function.
                if parse_index_size:
                    index_size = parse_index_size(grab)
                else:
                    index_size = None

                # Yield found results
                results = []

                for elem in grab.xpath_list('//*[h3[@class="r"]/a]'):
                    title_elem = elem.xpath("h3/a")[0]

                    # url: unwrap google's /url? redirect wrapper
                    url = title_elem.get("href")
                    if url.startswith("/url?"):
                        url = url.split("?q=")[1].split("&")[0]
                        url = unquote_plus(url)

                    # title
                    title = get_node_text(title_elem)

                    # snippet
                    # Google could offer two type of snippet format: simple and extended
                    # It depends on user agent
                    # For <IE8, Opera, <FF3 you probably get simple format
                    try:
                        snippet_node = elem.xpath('div[@class="s"]')[0]
                    except IndexError:
                        # Probably it is video or some other result
                        # Such result type is not supported yet
                        continue

                    try:
                        subnode = snippet_node.xpath('span[@class="st"]')[0]
                        snippet = get_node_text(subnode, smart=False)
                        extended_result = True
                    except IndexError:
                        drop_node(snippet_node, "div")
                        drop_node(snippet_node, 'span[@class="f"]')
                        snippet = get_node_text(snippet_node, smart=False)
                        extended_result = False

                    # filetype marker like "[PDF]"
                    try:
                        filetype = elem.xpath('.//span[contains(@class, "xsm")]' "/text()")[0].lower().strip("[]")
                    except IndexError:
                        filetype = None

                    if url:
                        results.append(
                            {
                                "url": url,
                                "title": title,
                                "snippet": snippet,
                                "filetype": filetype,
                                "index_size": index_size,
                                "extended": extended_result,
                            }
                        )
                return results
            else:
                pass
                # return []
    elif grab.css_exists("#res"):
        # Could be search results here?
        # or just message "nothing was found"?
        pass
    else:
        raise ParsingError("Could not identify google page format")
Пример #15
0
 def test_get_node_text(self):
     # Plain extraction includes the script/style text; smart extraction
     # drops it and separates text at tag boundaries.
     node = self.lxml_tree.xpath('//div[@id="bee"]')[0]
     self.assertEqual(
         get_node_text(node),
         u'пчела mozilla = 777; body { color: green; }')
     self.assertEqual(get_node_text(node, smart=True), u'пче ла')
     node = self.lxml_tree.xpath('//div[@id="fly"]')[0]
     self.assertEqual(get_node_text(node), u'му ха')