Exemplo n.º 1
0
    def test_session(self):
        """Exercise cookie round-tripping for both ``reuse_cookies`` modes."""
        # reuse_cookies=True: the cookie received on the first response is
        # expected in the Cookie header of each following request.
        grab = Grab(transport=GRAB_TRANSPORT)
        grab.setup(reuse_cookies=True)
        SERVER.RESPONSE['cookies'] = {'foo': 'bar'}
        grab.go(SERVER.BASE_URL)
        self.assertEqual(grab.response.cookies['foo'], 'bar')
        for _ in range(2):
            grab.go(SERVER.BASE_URL)
            self.assertEqual(SERVER.REQUEST['headers']['Cookie'], 'foo=bar')

        # reuse_cookies=False: the response cookie is still readable, but the
        # next request must not carry a Cookie header.
        grab = Grab(transport=GRAB_TRANSPORT)
        grab.setup(reuse_cookies=False)
        SERVER.RESPONSE['cookies'] = {'foo': 'baz'}
        grab.go(SERVER.BASE_URL)
        self.assertEqual(grab.response.cookies['foo'], 'baz')
        grab.go(SERVER.BASE_URL)
        self.assertFalse('Cookie' in SERVER.REQUEST['headers'])

        # clear_cookies(): after an explicit clear, the next request must not
        # carry a Cookie header even though reuse_cookies is enabled.
        grab = Grab(transport=GRAB_TRANSPORT)
        grab.setup(reuse_cookies=True)
        SERVER.RESPONSE['cookies'] = {'foo': 'bar'}
        grab.go(SERVER.BASE_URL)
        self.assertEqual(grab.response.cookies['foo'], 'bar')
        grab.clear_cookies()
        grab.go(SERVER.BASE_URL)
        self.assertFalse('Cookie' in SERVER.REQUEST['headers'])
Exemplo n.º 2
0
    def test_session(self):
        """Verify when Grab sends previously received cookies back."""
        # --- Case 1: reuse_cookies enabled -------------------------------
        # The server hands out foo=bar; both follow-up requests are expected
        # to present it in their Cookie header.
        client = Grab(transport=GRAB_TRANSPORT)
        client.setup(reuse_cookies=True)
        SERVER.RESPONSE['cookies'] = {'foo': 'bar'}
        client.go(SERVER.BASE_URL)
        self.assertEqual(client.response.cookies['foo'], 'bar')

        client.go(SERVER.BASE_URL)
        sent_headers = SERVER.REQUEST['headers']
        self.assertEqual(sent_headers['Cookie'], 'foo=bar')

        client.go(SERVER.BASE_URL)
        sent_headers = SERVER.REQUEST['headers']
        self.assertEqual(sent_headers['Cookie'], 'foo=bar')

        # --- Case 2: reuse_cookies disabled ------------------------------
        # The cookie is visible on the response but must not be replayed.
        client = Grab(transport=GRAB_TRANSPORT)
        client.setup(reuse_cookies=False)
        SERVER.RESPONSE['cookies'] = {'foo': 'baz'}
        client.go(SERVER.BASE_URL)
        self.assertEqual(client.response.cookies['foo'], 'baz')

        client.go(SERVER.BASE_URL)
        sent_headers = SERVER.REQUEST['headers']
        self.assertFalse('Cookie' in sent_headers)

        # --- Case 3: cookies cleared explicitly --------------------------
        # clear_cookies() is called between the two requests, so the second
        # one must not carry a Cookie header.
        client = Grab(transport=GRAB_TRANSPORT)
        client.setup(reuse_cookies=True)
        SERVER.RESPONSE['cookies'] = {'foo': 'bar'}
        client.go(SERVER.BASE_URL)
        self.assertEqual(client.response.cookies['foo'], 'bar')

        client.clear_cookies()
        client.go(SERVER.BASE_URL)
        sent_headers = SERVER.REQUEST['headers']
        self.assertFalse('Cookie' in sent_headers)
Exemplo n.º 3
0
    def test_session(self):
        """Check cookie reuse, disabled reuse, and explicit cookie clearing."""
        # Cookies handed out by the server must be sent back on every
        # subsequent request while reuse_cookies is enabled.
        grab = Grab(transport=GRAB_TRANSPORT)
        grab.setup(reuse_cookies=True)
        SERVER.RESPONSE['cookies'] = {'foo': 'bar'}
        grab.go(SERVER.BASE_URL)
        self.assertEqual(grab.response.cookies['foo'], 'bar')
        for _ in range(2):
            grab.go(SERVER.BASE_URL)
            self.assertEqual(SERVER.REQUEST['headers']['Cookie'], 'foo=bar')

        # With reuse_cookies=False the server must receive no cookies on the
        # follow-up request.
        grab = Grab(transport=GRAB_TRANSPORT)
        grab.setup(reuse_cookies=False)
        SERVER.RESPONSE['cookies'] = {'foo': 'baz'}
        grab.go(SERVER.BASE_URL)
        self.assertEqual(grab.response.cookies['foo'], 'baz')
        grab.go(SERVER.BASE_URL)
        self.assertEqual(len(SERVER.REQUEST['cookies']), 0)

        # After clear_cookies() the server must receive no cookies either,
        # even though reuse_cookies is enabled.
        grab = Grab(transport=GRAB_TRANSPORT)
        grab.setup(reuse_cookies=True)
        SERVER.RESPONSE['cookies'] = {'foo': 'bar'}
        grab.go(SERVER.BASE_URL)
        self.assertEqual(grab.response.cookies['foo'], 'bar')
        grab.clear_cookies()
        grab.go(SERVER.BASE_URL)
        self.assertEqual(len(SERVER.REQUEST['cookies']), 0)
Exemplo n.º 4
0
    def task_generator(self):
        """Yield one ``'init'`` Task per selected row of ``directories.csv``.

        For each selected row a fresh Grab object is configured from the
        module-level ``config`` mapping plus an ``Accept`` header, warmed up
        by fetching a test URL (retried once per second until it succeeds),
        and handed to the task together with ``{'slug': <first CSV column>}``.
        The generator pauses five seconds after each yielded task.

        Yields:
            Task: a task named ``'init'`` carrying ``grab`` and ``data``.
        """
        logging.debug("*****execute******")
        # Binary mode is what the Python 2 csv module expects.
        with open('directories.csv', 'rb') as f:
            directories = list(csv.reader(f))

        logging.debug("*****{}******".format(len(directories)))

        # NOTE(review): only rows 100 and 101 are processed; this hard-coded
        # window looks like a debugging leftover -- confirm before widening.
        start = 100
        # Clamp to the real row count so a short CSV ends the generator
        # instead of raising IndexError on directories[i].
        stop = min(102, len(directories))

        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        }

        test_url = 'https://www.google.com'

        i = start
        while True:
            logging.debug("Index: {}".format(i))
            if i >= stop:
                break
            g = Grab()
            g.clear_cookies()
            g.setup(**config)
            g.setup(headers=headers)
            logging.debug("CONFIG : {}".format(g.config))
            data = dict(slug=directories[i][0])
            logging.info(data)

            # Warm-up request: keep retrying until the test URL loads.
            # NOTE(review): there is no retry cap, so a permanently failing
            # connection blocks the generator forever.
            while True:
                try:
                    print("------------------------")
                    g.go(test_url)
                    print(g.doc.body)
                    print("++++++++++++++++++++++++")
                    break
                except Exception as e:
                    print("************************")
                    logging.debug(e)
                    time.sleep(1)

            yield Task('init', grab=g, data=data)
            time.sleep(5)
            i += 1
Exemplo n.º 5
0
def search(query, grab=None, limit=None, per_page=None):
    """Yield search results for ``query``, walking result pages in order.

    Args:
        query: search query, forwarded to ``build_search_url``.
        grab: optional Grab instance used for the requests; a new one is
            created when omitted. Its cookies are cleared and, if it has a
            proxy list, its proxy is changed before the first request.
        limit: optional cap on the total number of results yielded across
            all pages. ``None`` means no cap; a value <= 0 yields nothing.
        per_page: forwarded to ``build_search_url``.

    Yields:
        The items produced by ``parse_search_results`` for each fetched page
        (per the original note: ``{url, title, index_size}``).

    Paging stops when a page has no results, when ``is_last_page`` reports
    the last page, or when ``limit`` results have been yielded.
    """
    if not grab:
        grab = Grab()

    grab.clear_cookies()
    if grab.proxylist:
        grab.change_proxy()

    if limit is not None and limit <= 0:
        return

    total = 0
    for page in xrange(1, 9999):
        url = build_search_url(query, page, per_page=per_page)
        # Do not rebind ``grab`` to the return value of go(): the Grab
        # object itself is needed for parsing, sleeping and the next request.
        grab.go(url)

        page_count = 0
        for item in parse_search_results(grab):
            yield item
            page_count += 1
            total += 1
            # Enforce the cap on the running total, not per page, and stop
            # before yielding anything beyond it.
            if limit is not None and total >= limit:
                logging.debug('Limit %d reached' % limit)
                return

        if not page_count:
            break

        if is_last_page(grab):
            logging.debug('Last page found')
            break

        # Pause only when another page is going to be requested.
        grab.sleep(3, 5)
Exemplo n.º 6
0
class ParserWithProxy(Spider):
    """Base spider class for parsers that send their requests through proxies.

    A single template Grab object is kept in ``self.grab``; callers obtain
    clones of it through ``get_grab()``. Once the template has been handed
    out more than ``config.PROXY_USE_LIMIT`` times it is re-initialised with
    the next unused proxy.
    """

    # When False, get_next_proxy() returns an empty settings dict and no
    # proxy list is fetched.
    USE_PROXY = True

    def __init__(self, country_code, *args, **kwargs):
        """Set up proxy bookkeeping, the template Grab, the queue and the cache.

        Args:
            country_code: alpha-2 country code, resolved through
                ``countries.get`` and later used to request proxies.
            *args, **kwargs: forwarded unchanged to ``Spider.__init__``.
        """
        super(ParserWithProxy, self).__init__(*args, **kwargs)

        self.country = countries.get(alpha2=country_code)
        self.proxies = []            # fetched proxies not handed out yet
        self.used_proxies = set()    # identity keys of proxies already used

        self.grab = None
        self.grab_use_count = None

        self.reinit_grab()

        self.setup_queue(getattr(config, 'QUEUE_BACKEND', 'memory'))
        if getattr(config, 'CACHE_ENABLED', False):
            self.setup_cache('mongo', getattr(config, 'CACHE_DATABASE', 'cache'))

    def check_grab(self, grab):
        """Tell whether ``grab`` (with its current proxy) is usable.

        The base implementation accepts everything; presumably subclasses
        override it with a real probe.
        """
        return True

    def reinit_grab(self):
        """Reset the template Grab and attach the next proxy passing check_grab."""
        if not self.grab:
            self.grab = Grab()

        self.grab_use_count = 0

        while True:
            self.grab.clear_cookies()
            self.grab.setup(**self.get_next_proxy())
            if self.check_grab(self.grab):
                break
            # Log message: "Bad proxy. Switching..."
            logger.info(u'Плохая прокси. Смена...')

    def get_grab(self):
        """Return a clone of the template Grab, rotating the proxy when due."""
        self.grab_use_count += 1

        if self.grab_use_count > config.PROXY_USE_LIMIT:
            self.reinit_grab()

        return self.grab.clone()

    @staticmethod
    def _proxy_key(proxy):
        """Return a hashable identity for a proxy settings dict.

        Sorted ``(key, value)`` pairs are used so the identity depends on
        neither dict ordering nor on values alone.
        """
        return tuple(sorted(proxy.items()))

    def get_next_proxy(self):
        """Return the next proxy settings dict that has not been used yet.

        Returns an empty dict when ``USE_PROXY`` is False. Otherwise blocks,
        polling ``get_proxy_list`` every 10 seconds, until at least one
        unused proxy is available.
        """
        if not self.USE_PROXY:
            return {}

        while not self.proxies:
            # Fetch a batch of proxies and keep only the unused ones. A real
            # list is built (not a lazy filter object) because it is
            # truth-tested and indexed.
            candidates = get_proxy_list(self.country.alpha2, 100)
            self.proxies = [
                proxy for proxy in candidates
                if self._proxy_key(proxy) not in self.used_proxies
            ]
            if not self.proxies:
                # Log message: "Out of proxies, waiting for new ones"
                logger.info(u'Кончились прокси, ожидание новых')
                sleep(10)

        # Hand out the first available proxy and remember it as used.
        proxy = self.proxies.pop(0)
        self.used_proxies.add(self._proxy_key(proxy))
        return proxy
Exemplo n.º 7
0
def id_for_answer(answer):
    """Return the identifier of the poll answer option labelled ``answer``.

    The body of the current ``G`` response is searched for ``answer``; the
    identifier is the text following the ``PDI_answer`` marker within the
    (up to) 50 characters preceding it, minus the last two characters of
    that window (presumably the markup closing the attribute -- confirm
    against the page source).

    Args:
        answer: answer text to look for in the response body.

    Returns:
        The identifier as a substring of the body (expected to be numeric).

    Raises:
        ValueError: if ``answer`` or the ``PDI_answer`` marker before it
            cannot be found.
    """
    marker = 'PDI_answer'
    body = G.response.body

    answer_pos = body.find(answer)
    if answer_pos < 0:
        # find() returns -1 when missing; slicing with it would silently
        # return unrelated text from the end of the body.
        raise ValueError('answer %r not found in the response body' % (answer,))

    # Clamp the window start: a negative start would wrap around to the end
    # of the body and produce a wrong (usually empty) fragment.
    fragment = body[max(answer_pos - 50, 0):answer_pos]

    marker_pos = fragment.find(marker)
    if marker_pos < 0:
        raise ValueError('%s marker not found before answer %r' % (marker, answer))

    return fragment[marker_pos + len(marker):-2]


def vote(c, l, j, g, o, p, k, m, h, f):
    """голосуем..."""
    d = id_for_answer(POSITION)
    url = 'http://polldaddy.com/vote.php?va={}&pt={}&r={}&p={}&a={}&o=&t={}&token={}'.format(o, g, j, c, d, p, f)
    G.go(url)
    try:
        print G.css_text('.poll-msg'),
    except:
        print 'No msg',
    print G.css_list('.votes')[POSITION_POS - 1].text

# Run the voting loop: VOTES rounds, each starting with cleared cookies.
for i in range(VOTES):
    G.clear_cookies()
    G.go('http://polldaddy.com/poll/6061575/')
    # The vote button's onclick holds a JS call; its arguments are reused
    # as the positional arguments of vote().
    vote_call = G.css_list('.button-lrg')[0].attrib['onclick']
    vote_call_args = vote_call[vote_call.find('(') + 1:vote_call.find(')')]
    # Strip each argument individually so whitespace after the commas does
    # not leak into the vote URL.
    vote_call_args = [
        arg.strip() for arg in vote_call_args.replace("'", '').split(',')
    ]
    vote(*vote_call_args)
    sleep(2)