Пример #1
0
 def get_item(self, content_type=None):
     """Build a Player item from the XML fixture.

     When `content_type` is given it is passed to `Grab.setup` before the
     fake response is loaded; otherwise the Grab defaults are used.
     """
     g = Grab(transport=GRAB_TRANSPORT)
     if content_type is not None:
         g.setup(content_type=content_type)
     g.fake_response(XML)
     return Player(g.tree)
Пример #2
0
 def get_item(self, content_type=None):
     """Build a Player item from the XML fixture.

     When `content_type` is given it is passed to `Grab.setup` before the
     fake response is loaded; otherwise the Grab defaults are used.
     """
     g = Grab(transport=GRAB_TRANSPORT)
     if content_type is not None:
         g.setup(content_type=content_type)
     g.fake_response(XML)
     return Player(g.tree)
Пример #3
0
    def test_stringfield_multiple(self):
        """A StringField declared with multiple=True yields a list of values."""

        class GameItem(Item):
            names = StringField('//game/@name', multiple=True)

        g = Grab(transport=GRAB_TRANSPORT)
        g.fake_response(XML)

        item = GameItem(g.tree)
        expected_names = ['quake1', 'quake2']
        self.assertEqual(expected_names, item.names)
Пример #4
0
    def test_stringfield_multiple(self):
        """A StringField declared with multiple=True yields a list of values."""

        class GameItem(Item):
            names = StringField('//game/@name', multiple=True)

        g = Grab(transport=GRAB_TRANSPORT)
        g.fake_response(XML)

        item = GameItem(g.tree)
        expected_names = ['quake1', 'quake2']
        self.assertEqual(expected_names, item.names)
Пример #5
0
class TestUploadContent(TestCase):
    """Exercise UploadContent values together with Grab form inputs."""

    def setUp(self):
        """Reset the test server and load the FORMS fixture as a fake response."""
        SERVER.reset()
        grab = Grab(transport=GRAB_TRANSPORT)
        grab.fake_response(FORMS, charset='utf-8')
        self.g = grab

    def test(self):
        """Compare an UploadContent value with 'xxx' and pass it to set_input."""
        content = UploadContent('a')
        self.assertEqual(content, 'xxx')
        self.g.set_input('image', content)
Пример #6
0
class TestUploadContent(TestCase):
    """Exercise UploadContent values together with Grab form inputs."""

    def setUp(self):
        """Reset the test server and load the FORMS fixture as a fake response."""
        SERVER.reset()
        grab = Grab(transport=GRAB_TRANSPORT)
        grab.fake_response(FORMS, charset='utf-8')
        self.g = grab

    def test(self):
        """Compare an UploadContent value with 'xxx' and pass it to set_input."""
        content = UploadContent('a')
        self.assertEqual(content, 'xxx')
        self.g.set_input('image', content)
Пример #7
0
    def test_cdata_issue(self):
        """Compare the text of <weight> under the default and the XML DOM builders."""
        default_grab = Grab(transport=GRAB_TRANSPORT)
        default_grab.fake_response(XML)

        # With default settings the HTML DOM builder is used and the
        # CDATA content of the node is not exposed as text
        weight_node = default_grab.xpath_one('//weight')
        self.assertEqual(None, weight_node.text)
        tree_nodes = default_grab.tree.xpath('//weight')
        self.assertEqual(None, tree_nodes[0].text)

        # The explicitly XML-built tree of the same response gives the value
        xml_tree_nodes = default_grab.xml_tree.xpath('//weight')
        self.assertEqual('30', xml_tree_nodes[0].text)

        # The `content_type` option switches the default DOM builder
        xml_grab = Grab(transport=GRAB_TRANSPORT)
        xml_grab.fake_response(XML)
        xml_grab.setup(content_type='xml')

        weight_node = xml_grab.xpath_one('//weight')
        self.assertEqual('30', weight_node.text)
        tree_nodes = xml_grab.tree.xpath('//weight')
        self.assertEqual('30', tree_nodes[0].text)
Пример #8
0
    def test_cdata_issue(self):
        """Compare the text of <weight> under the default and the XML DOM builders."""
        default_grab = Grab(transport=GRAB_TRANSPORT)
        default_grab.fake_response(XML)

        # With default settings the HTML DOM builder is used and the
        # CDATA content of the node is not exposed as text
        weight_node = default_grab.xpath_one('//weight')
        self.assertEqual(None, weight_node.text)
        tree_nodes = default_grab.tree.xpath('//weight')
        self.assertEqual(None, tree_nodes[0].text)

        # The explicitly XML-built tree of the same response gives the value
        xml_tree_nodes = default_grab.xml_tree.xpath('//weight')
        self.assertEqual('30', xml_tree_nodes[0].text)

        # The `content_type` option switches the default DOM builder
        xml_grab = Grab(transport=GRAB_TRANSPORT)
        xml_grab.fake_response(XML)
        xml_grab.setup(content_type='xml')

        weight_node = xml_grab.xpath_one('//weight')
        self.assertEqual('30', weight_node.text)
        tree_nodes = xml_grab.tree.xpath('//weight')
        self.assertEqual('30', tree_nodes[0].text)
Пример #9
0
    def test_item_interface(self):
        """Check field access, caching, assignment and DOM builder choice on Player.

        Uses `assertEqual` throughout: `assertEquals` is a deprecated alias
        that newer unittest versions no longer provide.
        """
        grab = Grab(transport=GRAB_TRANSPORT)
        grab.fake_response(XML)

        player = Player(grab.tree)

        self.assertEqual(26982032, player.id)
        self.assertEqual('Ardeshir', player.first_name)
        self.assertEqual('2012-09-11 07:38:44', str(player.retrieved))
        self.assertEqual('75-zoo-1', player.calculated)
        # The second access should be served from the item's cache
        self.assertEqual('75-zoo-1', player.calculated)

        # An assigned value replaces the calculated one
        player.calculated = 'baz'
        self.assertEqual('baz', player.calculated)

        # FuncField: first access, then the access expected to hit the cache
        self.assertEqual('75-zoo2-1', player.calculated2)
        self.assertEqual('75-zoo2-1', player.calculated2)

        # With the default (HTML) DOM builder the comment_cdata attribute
        # is an empty string
        self.assertEqual('abc', player.comment)
        self.assertEqual('', player.comment_cdata)

        # The content_type option selects the DOM builder; with 'xml'
        # the CDATA content becomes available
        grab = Grab(transport=GRAB_TRANSPORT)
        grab.fake_response(XML)
        grab.setup(content_type='xml')
        player = Player(grab.tree)
        self.assertEqual('abc', player.comment)
        self.assertEqual('abc', player.comment_cdata)

        # The lambda defers the attribute access so assertRaises can catch it
        self.assertRaises(DataNotFound, lambda: player.data_not_found)
Пример #10
0
class TestHtmlForms(TestCase):
    """Tests for Grab's HTML form helpers: form selection, inputs and submit."""

    def setUp(self):
        """Reset the test server and load the FORMS fixture as a fake response."""
        SERVER.reset()

        # Create fake grab instance with fake response
        self.g = Grab(transport=GRAB_TRANSPORT)
        self.g.fake_response(FORMS)

    def test_choose_form(self):
        """
        Test ``choose_form`` method
        """

        # Invalid selectors must raise
        self.assertRaises(DataNotFound, lambda: self.g.choose_form(10))
        self.assertRaises(DataNotFound,
                          lambda: self.g.choose_form(id='bad_id'))
        self.assertRaises(DataNotFound,
                          lambda: self.g.choose_form(id='fake_form'))
        self.assertRaises(GrabMisuseError, lambda: self.g.choose_form())

        # Select by number
        self.g.choose_form(0)
        self.assertEqual('form', self.g._lxml_form.tag)
        self.assertEqual('search_form', self.g._lxml_form.get('id'))

        # Reset current form before each following selection
        self.g._lxml_form = None

        # Select by id
        self.g.choose_form(id='common_form')
        self.assertEqual('form', self.g._lxml_form.tag)
        self.assertEqual('common_form', self.g._lxml_form.get('id'))

        self.g._lxml_form = None

        # Select by name
        self.g.choose_form(name='dummy')
        self.assertEqual('form', self.g._lxml_form.tag)
        self.assertEqual('dummy', self.g._lxml_form.get('name'))

        self.g._lxml_form = None

        # Select by xpath
        self.g.choose_form(xpath='//form[contains(@action, "/dummy")]')
        self.assertEqual('form', self.g._lxml_form.tag)
        self.assertEqual('dummy', self.g._lxml_form.get('name'))

    def assertEqualQueryString(self, qs1, qs2):
        """Assert that two query strings carry the same name/value pairs.

        The order of the pairs is ignored.
        """
        # parse_qsl already returns (name, value) tuples. Indexing the value
        # (as with parse_qs lists) would compare only its first character,
        # so e.g. "submit1" and "submit2" would look equal.
        args1 = set(parse_qsl(qs1))
        args2 = set(parse_qsl(qs2))
        self.assertEqual(args1, args2)

    def test_submit(self):
        """Check the POST data produced by ``submit`` for several forms."""
        g = Grab(transport=GRAB_TRANSPORT)
        SERVER.RESPONSE['get'] = POST_FORM
        g.go(SERVER.BASE_URL)
        g.set_input('name', 'Alex')
        g.submit()
        self.assertEqualQueryString(SERVER.REQUEST['post'],
                                    'name=Alex&secret=123')

        # Default submit control
        SERVER.RESPONSE['get'] = MULTIPLE_SUBMIT_FORM
        g.go(SERVER.BASE_URL)
        g.submit()
        self.assertEqualQueryString(SERVER.REQUEST['post'],
                                    'secret=123&submit1=submit1')

        # Selected submit control
        SERVER.RESPONSE['get'] = MULTIPLE_SUBMIT_FORM
        g.go(SERVER.BASE_URL)
        g.submit(submit_name='submit2')
        self.assertEqualQueryString(SERVER.REQUEST['post'],
                                    'secret=123&submit2=submit2')

        # Default submit control if submit control name is invalid
        SERVER.RESPONSE['get'] = MULTIPLE_SUBMIT_FORM
        g.go(SERVER.BASE_URL)
        g.submit(submit_name='submit3')
        self.assertEqualQueryString(SERVER.REQUEST['post'],
                                    'secret=123&submit1=submit1')

    def test_set_methods(self):
        """Check that the ``set_input*`` helpers pick the form owning the input."""
        g = Grab(transport=GRAB_TRANSPORT)
        SERVER.RESPONSE['get'] = FORMS
        g.go(SERVER.BASE_URL)

        self.assertEqual(g._lxml_form, None)

        g.set_input('gender', '1')
        self.assertEqual('common_form', g._lxml_form.get('id'))

        # 'query' is not an input of the currently selected form
        self.assertRaises(KeyError, lambda: g.set_input('query', 'asdf'))

        g._lxml_form = None
        g.set_input_by_id('search_box', 'asdf')
        self.assertEqual('search_form', g._lxml_form.get('id'))

        g.choose_form(xpath='//form[@id="common_form"]')
        g.set_input_by_number(0, 'asdf')

        g._lxml_form = None
        g.set_input_by_xpath('//*[@name="gender"]', '2')
        self.assertEqual('common_form', g._lxml_form.get('id'))

    def test_html_without_forms(self):
        """Accessing ``form`` on a page without forms raises DataNotFound."""
        g = Grab(transport=GRAB_TRANSPORT)
        SERVER.RESPONSE['get'] = NO_FORM_HTML
        g.go(SERVER.BASE_URL)
        self.assertRaises(DataNotFound, lambda: g.form)

    def test_disabled_radio(self):
        """
        Bug #57
        """

        g = Grab(transport=GRAB_TRANSPORT)
        SERVER.RESPONSE['get'] = DISABLED_RADIO_HTML
        g.go(SERVER.BASE_URL)
        # Only builds the request; passing means no exception was raised
        g.submit(make_request=False)
Пример #11
0
    def fetch(self):
        """
        Download task URLs via pycurl's multi interface.

        This is a generator that never finishes on its own; the consumer
        decides when to stop iterating. It yields:

        * ``None`` when every curl handle is free and either the request
          limit has been reached or no task arrived from the queue within
          the 0.1 second wait;
        * a dict with the keys ``ok``, ``grab``, ``grab_original``, ``task``,
          ``ecode`` and ``emsg`` for a response served from the cache;
        * the value returned by ``process_multicurl_response`` for every
          transfer reported by the multi handle.
        """
        m = pycurl.CurlMulti()
        m.handles = []

        # Create one curl handle per configured thread
        for x in xrange(self.thread_number):
            curl = pycurl.Curl()
            m.handles.append(curl)

        # Handles that are currently not attached to the multi handle
        freelist = m.handles[:]

        # Infinite loop: it can only be stopped by the outside code
        # which iterates over the result of this method
        while True:

            cached_request = None

            # Scheduling phase: attach tasks to free handles
            while len(freelist):

                # Do not schedule new tasks once the request limit is reached
                if (self.request_limit is not None and
                    self.counters['request'] >= self.request_limit):
                    logging.debug('Request limit is reached: %s' %\
                                  self.request_limit)
                    if len(freelist) == self.thread_number:
                        yield None
                    else:
                        break
                else:
                    try:
                        priority, task = self.taskq.get(True, 0.1)
                    except Empty:
                        # If all handles are free and there are no tasks
                        # in the queue, yield the None signal
                        if len(freelist) == self.thread_number:
                            yield None
                        else:
                            break
                    else:
                        # Tasks rejected by the preprocessing hook are dropped
                        if not self._preprocess_task(task):
                            continue

                        task.network_try_count += 1
                        if task.task_try_count == 0:
                            task.task_try_count = 1

                        if task.task_try_count > self.task_try_limit:
                            logging.debug('Task tries ended: %s / %s' % (
                                          task.name, task.url))
                            self.add_item('too-many-task-tries', task.url)
                            continue

                        if task.network_try_count > self.network_try_limit:
                            logging.debug('Network tries ended: %s / %s' % (
                                          task.name, task.url))
                            self.add_item('too-many-network-tries', task.url)
                            continue

                        if task.grab:
                            grab = task.grab
                        else:
                            # Set up curl instance via Grab interface
                            grab = Grab(**self.grab_config)
                            grab.setup(url=task.url)

                        # Cache lookup is done only for GET requests
                        if self.use_cache and not task.get('disable_cache'):
                            if grab.detect_request_method() == 'GET':
                                url = grab.config['url']
                                utf_url = url.encode('utf-8') if isinstance(url, unicode) else url
                                # The cache key is either the SHA1 hex digest
                                # of the UTF-8 URL or the URL itself
                                if self.cache_key_hash:
                                    url_hash = sha1(utf_url).hexdigest()
                                else:
                                    url_hash = url
                                cache_item = self.cache.find_one({'_id': url_hash})
                                if cache_item:
                                    logging.debug('From cache: %s' % url)
                                    # grab.clone() is taken before
                                    # prepare_request() is called
                                    cached_request = (grab, grab.clone(),
                                                      task, cache_item)
                                    grab.prepare_request()
                                    self.inc_count('request-cache')

                                    # Leave the scheduling loop and go to
                                    # the response-processing code
                                    break

                        self.inc_count('request-network')
                        if self.proxylist_config:
                            args, kwargs = self.proxylist_config
                            grab.setup_proxylist(*args, **kwargs)

                        curl = freelist.pop()
                        curl.grab = grab
                        curl.grab.curl = curl
                        curl.grab_original = grab.clone()
                        curl.grab.prepare_request()
                        curl.task = task
                        # Add configured curl instance to multi-curl processor
                        m.add_handle(curl)


            # If at least one handle is busy, drive the transfers
            if len(freelist) != self.thread_number:
                while True:
                    status, active_objects = m.perform()
                    if status != pycurl.E_CALL_MULTI_PERFORM:
                        break

            if cached_request:
                grab, grab_original, task, cache_item = cached_request
                url = task.url  # NOTE(review): `url` is not read again below
                # NOTE(review): fake_response receives the stored body before
                # any decompression; the response fields are overwritten in
                # custom_prepare_response below - confirm this is intended
                grab.fake_response(cache_item['body'])

                if self.use_cache_compression:
                    body = zlib.decompress(cache_item['body']) 
                else:
                    body = cache_item['body'].encode('utf-8')
                # Fill the response object from the cached document
                def custom_prepare_response(g):
                    g.response.head = cache_item['head'].encode('utf-8')
                    g.response.body = body
                    g.response.code = cache_item['response_code']
                    g.response.time = 0
                    g.response.url = cache_item['url']
                    g.response.parse('utf-8')
                    g.response.cookies = g.extract_cookies()

                grab.process_request_result(custom_prepare_response)

                yield {'ok': True, 'grab': grab, 'grab_original': grab_original,
                       'task': task, 'ecode': None, 'emsg': None}
                # The counter is increased only after the consumer resumes
                # the generator
                self.inc_count('request')

            # Collect finished transfers and return their handles to freelist
            while True:
                queued_messages, ok_list, fail_list = m.info_read()

                results = []
                for curl in ok_list:
                    results.append((True, curl, None, None))
                for curl, ecode, emsg in fail_list:
                    results.append((False, curl, ecode, emsg))

                for ok, curl, ecode, emsg in results:
                    res = self.process_multicurl_response(ok, curl,
                                                          ecode, emsg)
                    m.remove_handle(curl)
                    freelist.append(curl)
                    yield res
                    self.inc_count('request')

                if not queued_messages:
                    break

            # Wait up to 0.5 seconds before the next pass
            m.select(0.5)
Пример #12
0
 def test_find(self):
     """GameItem.find yields one item per game found in the document."""
     g = Grab(transport=GRAB_TRANSPORT)
     g.fake_response(XML)
     found = list(GameItem.find(g.doc))
     found_names = [game.name for game in found]
     self.assertEqual(['quake1', 'quake2'], found_names)
Пример #13
0
    def fetch(self):
        """
        Download task URLs via pycurl's multi interface.

        This is a generator that never finishes on its own; the consumer
        decides when to stop iterating. It yields:

        * ``None`` when every curl handle is free and either the request
          limit has been reached or no task arrived from the queue within
          the 0.1 second wait;
        * a dict with the keys ``ok``, ``grab``, ``grab_original``, ``task``,
          ``ecode`` and ``emsg`` for a response served from the cache;
        * the value returned by ``process_multicurl_response`` for every
          transfer reported by the multi handle.
        """
        m = pycurl.CurlMulti()
        m.handles = []

        # Create one curl handle per configured thread
        for x in xrange(self.thread_number):
            curl = pycurl.Curl()
            m.handles.append(curl)

        # Handles that are currently not attached to the multi handle
        freelist = m.handles[:]

        # Infinite loop: it can only be stopped by the outside code
        # which iterates over the result of this method
        while True:

            cached_request = None

            # Scheduling phase: attach tasks to free handles
            while len(freelist):

                # Do not schedule new tasks once the request limit is reached
                if (self.request_limit is not None
                        and self.counters['request'] >= self.request_limit):
                    logging.debug('Request limit is reached: %s' %\
                                  self.request_limit)
                    if len(freelist) == self.thread_number:
                        yield None
                    else:
                        break
                else:
                    try:
                        priority, task = self.taskq.get(True, 0.1)
                    except Empty:
                        # If all handles are free and there are no tasks
                        # in the queue, yield the None signal
                        if len(freelist) == self.thread_number:
                            yield None
                        else:
                            break
                    else:
                        # Tasks rejected by the preprocessing hook are dropped
                        if not self._preprocess_task(task):
                            continue

                        task.network_try_count += 1
                        if task.task_try_count == 0:
                            task.task_try_count = 1

                        if task.task_try_count > self.task_try_limit:
                            logging.debug('Task tries ended: %s / %s' %
                                          (task.name, task.url))
                            self.add_item('too-many-task-tries', task.url)
                            continue

                        if task.network_try_count > self.network_try_limit:
                            logging.debug('Network tries ended: %s / %s' %
                                          (task.name, task.url))
                            self.add_item('too-many-network-tries', task.url)
                            continue

                        if task.grab:
                            grab = task.grab
                        else:
                            # Set up curl instance via Grab interface
                            grab = Grab(**self.grab_config)
                            grab.setup(url=task.url)

                        # Cache lookup is done only for GET requests and
                        # uses the request URL itself as the `_id` key
                        if self.use_cache and not task.get('disable_cache'):
                            if grab.detect_request_method() == 'GET':
                                url = grab.config['url']
                                cache_item = self.cache.find_one({'_id': url})
                                if cache_item:
                                    # grab.clone() is taken before
                                    # prepare_request() is called
                                    cached_request = (grab, grab.clone(), task,
                                                      cache_item)
                                    grab.prepare_request()
                                    self.inc_count('request-cache')

                                    # Leave the scheduling loop and go to
                                    # the response-processing code
                                    break

                        self.inc_count('request-network')
                        if self.proxylist_config:
                            args, kwargs = self.proxylist_config
                            grab.setup_proxylist(*args, **kwargs)

                        curl = freelist.pop()
                        curl.grab = grab
                        curl.grab.curl = curl
                        curl.grab_original = grab.clone()
                        curl.grab.prepare_request()
                        curl.task = task
                        # Add configured curl instance to multi-curl processor
                        m.add_handle(curl)

            # If at least one handle is busy, drive the transfers
            if len(freelist) != self.thread_number:
                while True:
                    status, active_objects = m.perform()
                    if status != pycurl.E_CALL_MULTI_PERFORM:
                        break

            if cached_request:
                grab, grab_original, task, cache_item = cached_request
                url = task.url  # NOTE(review): `url` is not read again below
                grab.fake_response(cache_item['body'])

                # Fill the response object from the cached document
                def custom_prepare_response(g):
                    g.response.head = cache_item['head'].encode('utf-8')
                    g.response.body = cache_item['body'].encode('utf-8')
                    g.response.code = cache_item['response_code']
                    g.response.time = 0
                    g.response.url = cache_item['url']
                    g.response.parse('utf-8')
                    g.response.cookies = g.extract_cookies()

                grab.process_request_result(custom_prepare_response)

                yield {
                    'ok': True,
                    'grab': grab,
                    'grab_original': grab_original,
                    'task': task,
                    'ecode': None,
                    'emsg': None
                }
                # The counter is increased only after the consumer resumes
                # the generator
                self.inc_count('request')

            # Collect finished transfers and return their handles to freelist
            while True:
                queued_messages, ok_list, fail_list = m.info_read()

                results = []
                for curl in ok_list:
                    results.append((True, curl, None, None))
                for curl, ecode, emsg in fail_list:
                    results.append((False, curl, ecode, emsg))

                for ok, curl, ecode, emsg in results:
                    res = self.process_multicurl_response(
                        ok, curl, ecode, emsg)
                    m.remove_handle(curl)
                    freelist.append(curl)
                    yield res
                    self.inc_count('request')

                if not queued_messages:
                    break

            # Wait up to 0.5 seconds before the next pass
            m.select(0.5)
Пример #14
0
class TextExtensionTest(TestCase):
    """Tests for Grab's text helpers against a cp1251 fake response."""

    def setUp(self):
        """Reset the test server and load the HTML fixture as cp1251."""
        SERVER.reset()

        # Fake grab instance backed by a fake response
        grab = Grab(transport=GRAB_TRANSPORT)
        grab.fake_response(HTML, charset='cp1251')
        self.g = grab

    def test_search(self):
        """``search`` finds byte and unicode substrings of the body."""
        encoded = u'фыва'.encode('cp1251')
        self.assertTrue(self.g.search(encoded, byte=True))
        self.assertTrue(self.g.search(u'фыва'))
        self.assertFalse(self.g.search(u'фыва2'))

    def test_search_usage_errors(self):
        """Mixing up unicode and byte arguments is reported as misuse."""
        self.assertRaises(GrabMisuseError, self.g.search, u'фыва', byte=True)
        self.assertRaises(GrabMisuseError, self.g.search, 'фыва')

    def test_rex(self):
        """``rex`` matches patterns whose type agrees with the searched body."""
        # Default case: unicode pattern against the unicode body
        pattern = re.compile(u'(фыва)', re.U)
        found = self.g.rex(pattern)
        self.assertEqual(u'фыва', found.group(1))

        # Byte-string pattern against the byte-string body
        pattern = re.compile(u'(фыва)'.encode('cp1251'))
        found = self.g.rex(pattern, byte=True)
        self.assertEqual(u'фыва'.encode('cp1251'), found.group(1))

        # A non-unicode pattern must not match the unicode body
        pattern = re.compile('(фыва)')
        self.assertRaises(DataNotFound, self.g.rex, pattern)

        # A unicode pattern must not match the byte-string body
        pattern = re.compile(u'фыва', re.U)
        self.assertRaises(DataNotFound, self.g.rex, pattern, byte=True)

        # A fragment that does not exist in the body
        pattern = re.compile(u'(фыва2)', re.U)
        self.assertRaises(DataNotFound, self.g.rex, pattern)

    def test_assert_substring(self):
        """``assert_substring`` passes for present text, raises otherwise."""
        self.g.assert_substring(u'фыва')
        encoded = u'фыва'.encode('cp1251')
        self.g.assert_substring(encoded, byte=True)
        self.assertRaises(DataNotFound, self.g.assert_substring, u'фыва2')

    def test_assert_substrings(self):
        """``assert_substrings`` passes when one of the candidates is present."""
        self.g.assert_substrings((u'фыва',))
        self.g.assert_substrings((u'фывы нет', u'фыва'))
        byte_candidates = (u'фыва'.encode('cp1251'), 'где ты фыва?')
        self.g.assert_substrings(byte_candidates, byte=True)
        missing = (u'фыва, вернись', u'фыва-а-а-а')
        self.assertRaises(DataNotFound, self.g.assert_substrings, missing)

    def test_assert_rex(self):
        """``assert_rex`` passes for matching patterns, raises otherwise."""
        self.g.assert_rex(re.compile(u'фыва'))
        byte_pattern = re.compile(u'фыва'.encode('cp1251'))
        self.g.assert_rex(byte_pattern, byte=True)
        self.assertRaises(DataNotFound,
                          self.g.assert_rex, re.compile(u'фыва2'))

    def test_assert_rex_text(self):
        """``rex_text`` returns the text captured by the first group."""
        captured = self.g.rex_text('<em id="fly-em">([^<]+)')
        self.assertEqual(u'ха', captured)
Пример #15
0
class Yandex(object):
    """Scraper for Yandex search result pages (SERP) built on top of Grab.

    The public entry point is `get_serp`; the remaining methods build the
    request URL and the XPath expressions used to read a result page.
    """

    def __init__(self):
        self._g = Grab()
        self._url = 'http://yandex.ru/yandsearch'
        # Separator between the URL and the query part, and between parameters
        self._sep_params = '?'
        self._sep_prm = '&'
        # CSS class names of the result block and of a single result item
        self._serp_block_class = 'serp-block'
        self._serp_item_class = 'serp-item'
        # Number of result items read from one (full) page
        self.MAX_ITEMS_TO_PAGE = 10

    def create_request(self, query, region_id, page=0):
        """Return the search URL for `query` in region `region_id`.

        `region_id` and `page` may be strings or integers.
        """
        # NOTE(review): `query` is inserted as is, without URL-encoding -
        # confirm that callers pass an already safe value.
        params = [
            'lr=%s' % region_id,
            'text=' + query,
            'p=' + str(page),
        ]
        return self._url + self._sep_params + self._sep_prm.join(params)

    # for testing
    def fetch_data_local(self):
        """Load the local file `2.html` as a fake response."""
        # Close the file deterministically instead of leaking the handle
        with open('2.html') as page:
            self._g.fake_response(page.read())

    def fetch_data(self, req):
        """Request the URL `req` with the internal Grab instance."""
        self._g.go(req)

    def get_serp(self, query, deep, region_id):
        """Collect up to `deep` results for `query` and return them as a dict.

        The returned dict has the keys `positions` (a list of dicts with
        `positions`, `url`, `title`, `mimeType` and `snippet`),
        `totalResults` and `reask_phrase`.
        """
        items = self._get_items(deep, query, region_id)

        positions = []
        for en, item in enumerate(items):
            positions.append({
                'positions': en,
                'url': item.url,
                'title': item.title,
                'mimeType': item.mime,
                'snippet': item.snippet,
            })

        result = {'positions': positions, 'totalResults': '',
                  'reask_phrase': ''}
        result['totalResults'] = self._get_total()
        result['reask_phrase'] = self._get_reask()
        return result

    def _get_item(self, item):
        """Read result number `item` (1-based) from the current page.

        A missing snippet becomes '' and a missing mime icon becomes 'html'.
        """
        full_path_url_title = self.__query_path_url_title(item)
        full_path_snippet = self.__query_path_snippet(item)
        full_path_mime = self.__query_path_mime(item)

        urltitle = self._g.doc.select(full_path_url_title)

        url = urltitle.attr('href')
        title = urltitle.text()
        try:
            snippet = self._g.doc.select(full_path_snippet).text()
        except DataNotFound:
            snippet = ''

        try:
            mime = self._g.doc.select(full_path_mime).attr('alt')
        except DataNotFound:
            mime = 'html'

        return ItemSerp(url=url, title=title, snippet=snippet, mime=mime)

    def _page_item_count(self, deep, page, count_page):
        """Return how many items to read from page `page` (0-based)."""
        # NOTE(review): the first page is always read in full, even when
        # `deep` is smaller than a page; kept as in the original code.
        if page == count_page - 1 and page != 0:
            # A remainder of 0 means the last page is completely used;
            # without this fallback no item of that page would be read.
            return deep % self.MAX_ITEMS_TO_PAGE or self.MAX_ITEMS_TO_PAGE
        return self.MAX_ITEMS_TO_PAGE

    def _get_items(self, deep, query, region):
        """Fetch as many pages as `deep` requires and return their items."""
        result = []
        if deep != 0:
            count_page = int(math.ceil(float(deep) / self.MAX_ITEMS_TO_PAGE))
            for page in xrange(count_page):
                req = self.create_request(query, region, page)
                self.fetch_data(req)
                count_items = self._page_item_count(deep, page, count_page)
                # Item positions inside a page are 1-based
                for item in xrange(1, count_items + 1):
                    result.append(self._get_item(item))
        return result

    def _get_total(self):
        """Return the text of the `input__found` div without its first two characters."""
        query_count = self.__full_xpath([
            self.__factory_xpath('div', 'input__found')
        ])
        return self._g.doc.select(query_count).text()[2:]

    def _get_reask(self):
        """Return the text of the misspell message, or '' when it is absent."""
        query_misspell = self.__full_xpath([
            self.__factory_xpath('div', 'misspell'),
            self.__factory_xpath('div', 'message'),
            self.__factory_xpath('div', 'misspell__message')
        ])
        try:
            result = self._g.doc.select(query_misspell).text()
        except DataNotFound:
            result = ''
        return result

    def __query_to_serp_item_wrap(self, item):
        """Return the relative XPath of the wrapper of result number `item`."""
        query_serp_block = self.__factory_xpath('div', self._serp_block_class)
        query_serp_block_item = self.__factory_xpath(
            'div', self._serp_item_class, item)
        query_serp_item_wrap = self.__factory_xpath('div', 'serp-item__wrap')
        return '/'.join([
            query_serp_block,
            query_serp_block_item,
            query_serp_item_wrap
        ])

    def __query_path_url_title(self, item):
        """Return the XPath of the title link of result number `item`."""
        query_serp_item_wrap = self.__query_to_serp_item_wrap(item)
        query_serp_title = '/'.join([
            self.__factory_xpath('h2', 'serp-item__title'),
            self.__factory_xpath('a', 'b-link serp-item__title-link')])

        return self.__full_xpath([
            query_serp_item_wrap,
            query_serp_title
        ])

    def __query_path_snippet(self, item):
        """Return the XPath of the snippet text of result number `item`."""
        query_serp_item_wrap = self.__query_to_serp_item_wrap(item)
        query_serp_snippet = self.__factory_xpath('div', 'serp-item__text')

        return self.__full_xpath([
            query_serp_item_wrap,
            query_serp_snippet
        ])

    def __query_path_mime(self, item):
        """Return the XPath of the mime icon image of result number `item`."""
        query_serp_item_wrap = self.__query_to_serp_item_wrap(item)
        query_serp_item_extra_mine = '/'.join([
            self.__factory_xpath('div', 'serp-item__extra-wrap'),
            self.__factory_xpath('div', 'serp-item__extra'),
            self.__factory_xpath('a', 'b-link'),
            self.__factory_xpath('div', 'serp-item__mime'),
            self.__factory_xpath('img', 'serp-item__mime-icon'),
        ])

        return self.__full_xpath([
            query_serp_item_wrap, query_serp_item_extra_mine
        ])

    @classmethod
    def __full_xpath(cls, xpaths):
        """Join `xpaths` with '/' and anchor the first one with '//'."""
        first = xpaths[0]
        return '/'.join(['//' + first] + xpaths[1:])

    @classmethod
    def __factory_xpath(cls, tag, clas, index=None):
        """Return an XPath step for `tag` whose class attribute contains `clas`.

        A truthy `index` is added as a positional predicate before the
        class predicate.
        """
        index = '[' + str(index) + ']' if index else ''
        return '{0}{1}[contains(@class, "{2}")]'.format(tag, index, clas)
Пример #16
0
 def test_find(self):
     """GameItem.find yields one item per game found in the document."""
     g = Grab(transport=GRAB_TRANSPORT)
     g.fake_response(XML)
     found = list(GameItem.find(g.doc))
     found_names = [game.name for game in found]
     self.assertEqual(['quake1', 'quake2'], found_names)
Пример #17
0
class LXMLExtensionTest(TestCase):
    """Tests for the lxml-backed query helpers exposed by ``Grab``."""

    def setUp(self):
        """Reset the test server and load the HTML fixture into ``self.g``."""
        SERVER.reset()

        # Create fake grab instance with fake response
        self.g = Grab(transport=GRAB_TRANSPORT)
        self.g.fake_response(HTML, charset='cp1251')

        # Parse the same response body with plain lxml for comparison.
        from lxml.html import fromstring
        self.lxml_tree = fromstring(self.g.response.body)

    def test_lxml_text_content_fail(self):
        """Pin the raw lxml ``text_content()`` output for the fixture."""
        # lxml node text_content() method does not put spaces between text
        # content of adjacent XML nodes
        bee = self.lxml_tree.xpath('//div[@id="bee"]/div')[0]
        self.assertEqual(bee.text_content().strip(), u'пчела')
        fly = self.lxml_tree.xpath('//div[@id="fly"]')[0]
        self.assertEqual(fly.text_content().strip(), u'му\nха')

    def test_lxml_xpath(self):
        """Descendant tags of ``#bee``, with and without script/style."""
        names = {x.tag for x in self.lxml_tree.xpath('//div[@id="bee"]//*')}
        self.assertEqual({'em', 'div', 'strong', 'style', 'script'}, names)
        names = {x.tag for x in self.lxml_tree.xpath(
            '//div[@id="bee"]//*[name() != "script" and name() != "style"]')}
        self.assertEqual({'em', 'div', 'strong'}, names)

    def test_xpath(self):
        """``xpath_one`` returns a node, raises, or falls back to default."""
        self.assertEqual('bee-em', self.g.xpath_one('//em').get('id'))
        self.assertEqual(
            'num-2',
            self.g.xpath_one(u'//*[text() = "item #2"]').get('id'))
        self.assertRaises(DataNotFound, self.g.xpath_one, '//em[@id="baz"]')
        self.assertIsNone(self.g.xpath_one('//zzz', default=None))
        self.assertEqual('foo', self.g.xpath_one('//zzz', default='foo'))

    def test_xpath_text(self):
        """``xpath_text`` in smart and plain modes, plus missing nodes."""
        self.assertEqual(u'пче ла',
                         self.g.xpath_text('//*[@id="bee"]', smart=True))
        self.assertEqual(u'пчела mozilla = 777; body { color: green; }',
                         self.g.xpath_text('//*[@id="bee"]', smart=False))
        self.assertEqual(u'пче ла му ха item #100 2 item #2',
                         self.g.xpath_text('/html/body', smart=True))
        self.assertRaises(DataNotFound, self.g.xpath_text, '//code')
        self.assertEqual(u'bee', self.g.xpath_one('//*[@id="bee"]/@id'))
        self.assertRaises(DataNotFound,
                          self.g.xpath_text, '//*[@id="bee2"]/@id')

    def test_xpath_number(self):
        """``xpath_number`` with its ``make_int``/``ignore_spaces`` flags."""
        self.assertEqual(100, self.g.xpath_number('//li'))
        self.assertEqual(100, self.g.xpath_number('//li', make_int=True))
        self.assertEqual('100', self.g.xpath_number('//li', make_int=False))
        self.assertEqual(1002, self.g.xpath_number('//li', ignore_spaces=True))
        self.assertEqual(
            '1002',
            self.g.xpath_number('//li', ignore_spaces=True, make_int=False))
        self.assertRaises(DataNotFound, self.g.xpath_number, '//liza')
        self.assertEqual('foo', self.g.xpath_number('//zzz', default='foo'))

    def test_xpath_list(self):
        """``xpath_list`` returns every matching node in document order."""
        self.assertEqual(['num-1', 'num-2'],
                         [x.get('id') for x in self.g.xpath_list('//li')])

    def test_css(self):
        """``css_one`` returns a node, raises, or falls back to default."""
        self.assertEqual('bee-em', self.g.css_one('em').get('id'))
        self.assertEqual('num-2', self.g.css_one('#num-2').get('id'))
        self.assertRaises(DataNotFound, self.g.css_one, 'em#baz')
        self.assertEqual('foo', self.g.css_one('zzz', default='foo'))

    def test_css_text(self):
        """``css_text`` in smart mode, plus missing nodes and defaults."""
        self.assertEqual(u'пче ла', self.g.css_text('#bee', smart=True))
        self.assertEqual(u'пче ла му ха item #100 2 item #2',
                         self.g.css_text('html body', smart=True))
        self.assertRaises(DataNotFound, self.g.css_text, 'code')
        self.assertEqual('foo', self.g.css_text('zzz', default='foo'))

    def test_css_number(self):
        """``css_number`` with its ``make_int``/``ignore_spaces`` flags."""
        self.assertEqual(100, self.g.css_number('li'))
        self.assertEqual('100', self.g.css_number('li', make_int=False))
        self.assertEqual(1002, self.g.css_number('li', ignore_spaces=True))
        self.assertRaises(DataNotFound, self.g.css_number, 'liza')
        self.assertEqual('foo', self.g.css_number('zzz', default='foo'))

    def test_css_list(self):
        """``css_list`` returns every matching node in document order."""
        self.assertEqual(['num-1', 'num-2'],
                         [x.get('id') for x in self.g.css_list('li')])

    def test_strip_tags(self):
        """``strip_tags`` removes markup; ``smart`` separates adjacent text."""
        self.assertEqual('foo', self.g.strip_tags('<b>foo</b>'))
        self.assertEqual('foo bar', self.g.strip_tags('<b>foo</b> <i>bar'))
        self.assertEqual('foobar', self.g.strip_tags('<b>foo</b><i>bar'))
        self.assertEqual('foo bar',
                         self.g.strip_tags('<b>foo</b><i>bar', smart=True))
        self.assertEqual('', self.g.strip_tags('<b> <div>'))

    def test_css_exists(self):
        """``css_exists`` is true only for selectors that match a node."""
        self.assertTrue(self.g.css_exists('li#num-1'))
        self.assertFalse(self.g.css_exists('li#num-3'))

    def test_xpath_exists(self):
        """``xpath_exists`` is true only for queries that match a node."""
        self.assertTrue(self.g.xpath_exists('//li[@id="num-1"]'))
        self.assertFalse(self.g.xpath_exists('//li[@id="num-3"]'))

    def test_cdata_issue(self):
        """CDATA text is lost by the HTML builder but kept by the XML one."""
        g = Grab(transport=GRAB_TRANSPORT)
        g.fake_response(XML)

        # By default HTML DOM builder is used
        # It handles CDATA incorrectly
        self.assertIsNone(g.xpath_one('//weight').text)
        self.assertIsNone(g.tree.xpath('//weight')[0].text)

        # But XML DOM builder produces valid result
        self.assertEqual('30', g.xml_tree.xpath('//weight')[0].text)

        # Use `content_type` option to change default DOM builder
        g = Grab(transport=GRAB_TRANSPORT)
        g.fake_response(XML)
        g.setup(content_type='xml')

        self.assertEqual('30', g.xpath_one('//weight').text)
        self.assertEqual('30', g.tree.xpath('//weight')[0].text)

    def test_xml_declaration(self):
        """
        HTML with XML declaration should be processed without errors.
        """
        SERVER.RESPONSE['get'] = """<?xml version="1.0" encoding="UTF-8"?>
        <html><body><h1>test</h1></body></html>
        """
        # Use the configured transport, like every other test in this class.
        g = Grab(transport=GRAB_TRANSPORT)
        g.go(SERVER.BASE_URL)
        self.assertEqual('test', g.xpath_text('//h1'))

    def test_empty_document(self):
        """Querying a body without usable markup must not raise."""
        for body in ('oops', '<frameset></frameset>'):
            SERVER.RESPONSE['get'] = body
            # Use the configured transport, like the other tests here.
            g = Grab(transport=GRAB_TRANSPORT)
            g.go(SERVER.BASE_URL)
            # Only the absence of an exception is checked; the result is
            # deliberately ignored.
            g.xpath_exists('//anytag')
Пример #18
0
class LXMLExtensionTest(TestCase):
    """Checks Grab's XPath/CSS/text helpers against an HTML fixture."""

    def setUp(self):
        """Reset the test server and load the HTML fixture into ``self.g``."""
        SERVER.reset()

        # Create fake grab instance with fake response
        self.g = Grab(transport=GRAB_TRANSPORT)
        self.g.fake_response(HTML, charset='cp1251')

        # Build a plain lxml tree from the same body for comparison.
        from lxml.html import fromstring
        self.lxml_tree = fromstring(self.g.response.body)

    def test_lxml_text_content_fail(self):
        """Pin the raw lxml ``text_content()`` output for the fixture."""
        # lxml node text_content() method does not put spaces between text
        # content of adjacent XML nodes
        bee_node = self.lxml_tree.xpath('//div[@id="bee"]/div')[0]
        self.assertEqual(bee_node.text_content().strip(), u'пчела')
        fly_node = self.lxml_tree.xpath('//div[@id="fly"]')[0]
        self.assertEqual(fly_node.text_content().strip(), u'му\nха')

    def test_lxml_xpath(self):
        """Descendant tags of ``#bee``, with and without script/style."""
        all_nodes = self.lxml_tree.xpath('//div[@id="bee"]//*')
        self.assertEqual({'em', 'div', 'strong', 'style', 'script'},
                         {node.tag for node in all_nodes})
        visible_nodes = self.lxml_tree.xpath(
            '//div[@id="bee"]//*[name() != "script" and name() != "style"]')
        self.assertEqual({'em', 'div', 'strong'},
                         {node.tag for node in visible_nodes})

    def test_xpath(self):
        """``xpath_one`` returns a node, raises, or falls back to default."""
        self.assertEqual('bee-em', self.g.xpath_one('//em').get('id'))
        self.assertEqual(
            'num-2',
            self.g.xpath_one(u'//*[text() = "item #2"]').get('id'))
        with self.assertRaises(DataNotFound):
            self.g.xpath_one('//em[@id="baz"]')
        self.assertIsNone(self.g.xpath_one('//zzz', default=None))
        self.assertEqual('foo', self.g.xpath_one('//zzz', default='foo'))

    def test_xpath_text(self):
        """``xpath_text`` in smart and plain modes, plus missing nodes."""
        self.assertEqual(u'пче ла',
                         self.g.xpath_text('//*[@id="bee"]', smart=True))
        self.assertEqual(u'пчела mozilla = 777; body { color: green; }',
                         self.g.xpath_text('//*[@id="bee"]', smart=False))
        self.assertEqual(u'пче ла му ха item #100 2 item #2',
                         self.g.xpath_text('/html/body', smart=True))
        with self.assertRaises(DataNotFound):
            self.g.xpath_text('//code')
        self.assertEqual(u'bee', self.g.xpath_one('//*[@id="bee"]/@id'))
        with self.assertRaises(DataNotFound):
            self.g.xpath_text('//*[@id="bee2"]/@id')

    def test_xpath_number(self):
        """``xpath_number`` with its ``make_int``/``ignore_spaces`` flags."""
        self.assertEqual(100, self.g.xpath_number('//li'))
        self.assertEqual(100, self.g.xpath_number('//li', make_int=True))
        self.assertEqual('100', self.g.xpath_number('//li', make_int=False))
        self.assertEqual(1002, self.g.xpath_number('//li', ignore_spaces=True))
        self.assertEqual(
            '1002',
            self.g.xpath_number('//li', ignore_spaces=True, make_int=False))
        with self.assertRaises(DataNotFound):
            self.g.xpath_number('//liza')
        self.assertEqual('foo', self.g.xpath_number('//zzz', default='foo'))

    def test_xpath_list(self):
        """``xpath_list`` returns every matching node in document order."""
        found_ids = [x.get('id') for x in self.g.xpath_list('//li')]
        self.assertEqual(['num-1', 'num-2'], found_ids)

    def test_css(self):
        """``css_one`` returns a node, raises, or falls back to default."""
        self.assertEqual('bee-em', self.g.css_one('em').get('id'))
        self.assertEqual('num-2', self.g.css_one('#num-2').get('id'))
        with self.assertRaises(DataNotFound):
            self.g.css_one('em#baz')
        self.assertEqual('foo', self.g.css_one('zzz', default='foo'))

    def test_css_text(self):
        """``css_text`` in smart mode, plus missing nodes and defaults."""
        self.assertEqual(u'пче ла', self.g.css_text('#bee', smart=True))
        self.assertEqual(u'пче ла му ха item #100 2 item #2',
                         self.g.css_text('html body', smart=True))
        with self.assertRaises(DataNotFound):
            self.g.css_text('code')
        self.assertEqual('foo', self.g.css_text('zzz', default='foo'))

    def test_css_number(self):
        """``css_number`` with its ``make_int``/``ignore_spaces`` flags."""
        self.assertEqual(100, self.g.css_number('li'))
        self.assertEqual('100', self.g.css_number('li', make_int=False))
        self.assertEqual(1002, self.g.css_number('li', ignore_spaces=True))
        with self.assertRaises(DataNotFound):
            self.g.css_number('liza')
        self.assertEqual('foo', self.g.css_number('zzz', default='foo'))

    def test_css_list(self):
        """``css_list`` returns every matching node in document order."""
        found_ids = [x.get('id') for x in self.g.css_list('li')]
        self.assertEqual(['num-1', 'num-2'], found_ids)

    def test_strip_tags(self):
        """``strip_tags`` removes markup; ``smart`` separates adjacent text."""
        self.assertEqual('foo', self.g.strip_tags('<b>foo</b>'))
        self.assertEqual('foo bar', self.g.strip_tags('<b>foo</b> <i>bar'))
        self.assertEqual('foobar', self.g.strip_tags('<b>foo</b><i>bar'))
        self.assertEqual('foo bar',
                         self.g.strip_tags('<b>foo</b><i>bar', smart=True))
        self.assertEqual('', self.g.strip_tags('<b> <div>'))

    def test_css_exists(self):
        """``css_exists`` is true only for selectors that match a node."""
        self.assertTrue(self.g.css_exists('li#num-1'))
        self.assertFalse(self.g.css_exists('li#num-3'))

    def test_xpath_exists(self):
        """``xpath_exists`` is true only for queries that match a node."""
        self.assertTrue(self.g.xpath_exists('//li[@id="num-1"]'))
        self.assertFalse(self.g.xpath_exists('//li[@id="num-3"]'))

    def test_cdata_issue(self):
        """CDATA text is lost by the HTML builder but kept by the XML one."""
        g = Grab(transport=GRAB_TRANSPORT)
        g.fake_response(XML)

        # By default HTML DOM builder is used
        # It handles CDATA incorrectly
        self.assertIsNone(g.xpath_one('//weight').text)
        self.assertIsNone(g.tree.xpath('//weight')[0].text)

        # But XML DOM builder produces valid result
        self.assertEqual('30', g.xml_tree.xpath('//weight')[0].text)

        # Use `content_type` option to change default DOM builder
        g = Grab(transport=GRAB_TRANSPORT)
        g.fake_response(XML)
        g.setup(content_type='xml')

        self.assertEqual('30', g.xpath_one('//weight').text)
        self.assertEqual('30', g.tree.xpath('//weight')[0].text)

    def test_xml_declaration(self):
        """
        HTML with XML declaration should be processed without errors.
        """
        SERVER.RESPONSE['get'] = """<?xml version="1.0" encoding="UTF-8"?>
        <html><body><h1>test</h1></body></html>
        """
        # Use the configured transport, like every other test in this class.
        g = Grab(transport=GRAB_TRANSPORT)
        g.go(SERVER.BASE_URL)
        self.assertEqual('test', g.xpath_text('//h1'))

    def test_empty_document(self):
        """Querying a body without usable markup must not raise."""
        for body in ('oops', '<frameset></frameset>'):
            SERVER.RESPONSE['get'] = body
            # Use the configured transport, like the other tests here.
            g = Grab(transport=GRAB_TRANSPORT)
            g.go(SERVER.BASE_URL)
            # Only the absence of an exception is checked; the result is
            # deliberately ignored.
            g.xpath_exists('//anytag')
Пример #19
0
class TestHtmlForms(TestCase):
    def setUp(self):
        """Reset the test server and load the FORMS fixture into ``self.g``."""
        SERVER.reset()

        # Build the Grab instance first, feed it the fake response, then
        # publish it on the test case.
        grab = Grab(transport=GRAB_TRANSPORT)
        grab.fake_response(FORMS)
        self.g = grab

    def test_choose_form(self):
        """
        Test ``choose_form`` method: selection by number, id, name and
        xpath, plus the errors raised for unusable selectors.
        """
        grab = self.g

        # Selectors expected to fail, and a call without any selector
        self.assertRaises(DataNotFound, grab.choose_form, 10)
        self.assertRaises(DataNotFound, grab.choose_form, id='bad_id')
        self.assertRaises(DataNotFound, grab.choose_form, id='fake_form')
        self.assertRaises(GrabMisuseError, grab.choose_form)

        # (positional args, keyword args, attribute to read, expected value)
        cases = [
            ((0,), {}, 'id', 'search_form'),
            ((), {'id': 'common_form'}, 'id', 'common_form'),
            ((), {'name': 'dummy'}, 'name', 'dummy'),
            ((), {'xpath': '//form[contains(@action, "/dummy")]'},
             'name', 'dummy'),
        ]
        for position, (args, kwargs, attr, expected) in enumerate(cases):
            if position:
                # reset current form between selections
                grab._lxml_form = None
            grab.choose_form(*args, **kwargs)
            self.assertEqual('form', grab._lxml_form.tag)
            self.assertEqual(expected, grab._lxml_form.get(attr))

    def assertEqualQueryString(self, qs1, qs2):
        """
        Assert that two query strings carry the same ``(name, value)`` pairs.

        Both strings are parsed with ``parse_qsl`` and compared as sets, so
        parameter order and repeated identical pairs are ignored.

        ``parse_qsl`` yields ``(name, value)`` tuples whose value is a plain
        string, so the whole value is compared. Indexing the value with
        ``[0]`` would compare only its first character and let different
        values with the same first letter pass as equal.
        """
        args1 = set(parse_qsl(qs1))
        args2 = set(parse_qsl(qs2))
        self.assertEqual(args1, args2)

    def test_submit(self):
        """
        ``submit`` posts the form fields, using the default submit control
        unless a valid ``submit_name`` is given.
        """
        grab = Grab(transport=GRAB_TRANSPORT)

        # Single-submit form with one field filled in by the test
        SERVER.RESPONSE['get'] = POST_FORM
        grab.go(SERVER.BASE_URL)
        grab.set_input('name', 'Alex')
        grab.submit()
        self.assertEqualQueryString(SERVER.REQUEST['post'],
                                    'name=Alex&secret=123')

        # (keyword arguments for submit, expected POST body)
        scenarios = (
            # Default submit control
            ({}, 'secret=123&submit1=submit1'),
            # Selected submit control
            ({'submit_name': 'submit2'}, 'secret=123&submit2=submit2'),
            # Default submit control if submit control name is invalid
            ({'submit_name': 'submit3'}, 'secret=123&submit1=submit1'),
        )
        for submit_kwargs, expected_post in scenarios:
            SERVER.RESPONSE['get'] = MULTIPLE_SUBMIT_FORM
            grab.go(SERVER.BASE_URL)
            grab.submit(**submit_kwargs)
            self.assertEqualQueryString(SERVER.REQUEST['post'],
                                        expected_post)

    def test_set_methods(self):
        """
        The ``set_input*`` helpers fill a field and leave the form that
        owns it selected as the current form.
        """
        grab = Grab(transport=GRAB_TRANSPORT)
        SERVER.RESPONSE['get'] = FORMS
        grab.go(SERVER.BASE_URL)

        # No form is selected right after the page is loaded
        self.assertEqual(grab._lxml_form, None)

        # Setting a field by name is expected to select ``common_form``
        grab.set_input('gender', '1')
        self.assertEqual('common_form', grab._lxml_form.get('id'))

        # ``query`` is not expected to be found in the selected form
        self.assertRaises(KeyError, grab.set_input, 'query', 'asdf')

        grab._lxml_form = None
        grab.set_input_by_id('search_box', 'asdf')
        self.assertEqual('search_form', grab._lxml_form.get('id'))

        grab.choose_form(xpath='//form[@id="common_form"]')
        grab.set_input_by_number(0, 'asdf')

        grab._lxml_form = None
        grab.set_input_by_xpath('//*[@name="gender"]', '2')
        self.assertEqual('common_form', grab._lxml_form.get('id'))

    def test_html_without_forms(self):
        """Accessing ``form`` on the NO_FORM_HTML page raises DataNotFound."""
        grab = Grab(transport=GRAB_TRANSPORT)
        SERVER.RESPONSE['get'] = NO_FORM_HTML
        grab.go(SERVER.BASE_URL)
        # ``form`` is read through getattr so that the attribute access
        # itself happens inside assertRaises.
        self.assertRaises(DataNotFound, getattr, grab, 'form')

    def test_disabled_radio(self):
        """
        Regression test for bug #57.

        Loads the DISABLED_RADIO_HTML page and builds a form submission
        without sending it (``make_request=False``). Nothing is asserted
        in the lines visible here, so the check is that no exception is
        raised.
        """

        g = Grab(transport=GRAB_TRANSPORT)
        SERVER.RESPONSE['get'] = DISABLED_RADIO_HTML
        g.go(SERVER.BASE_URL)
        g.submit(make_request=False)