Exemplo n.º 1
0
 def task_generator(self):
     """Yield a 'link_on_page' Task for every page of every VOCABULARY entry.

     Each request is configured with a freshly rotated proxy and a random
     user agent; cookies are not reused between requests.
     """
     agent = UserAgent()
     grab = Grab()
     grab.load_proxylist(PROXY_PATH,
                         'text_file',
                         proxy_type='http',
                         auto_init=False,
                         auto_change=True)
     for entry in VOCABULARY:
         page_url = entry['url']
         cat = entry['cat']
         for page in xrange(1, entry['pages']):
             # Rotate to the next proxy before every page request.
             grab.change_proxy()
             grab.setup(url=page_url % page,
                        proxy_userpwd=CREDENTIALS,
                        hammer_mode=True,
                        hammer_timeouts=HTM,
                        user_agent=agent.random,
                        reuse_cookies=False)
             yield Task('link_on_page', grab=grab, cat=cat)
Exemplo n.º 2
0
    def task_generator(self):
        """Yield a 'link_on_page' Task for each listing page of each category.

        URLs are built from VOCABULARY entries (url template, page count,
        category ids) combined with the locality selected by self.city_pars.
        Every request goes out through a freshly rotated proxy with a random
        user agent and no cookie reuse.
        """
        ua = UserAgent()
        grab = Grab()
        grab.load_proxylist(self.PROXY_PATH,
                            'text_file',
                            proxy_type='http',
                            auto_init=False,
                            auto_change=True)

        for link in VOCABULARY:
            url = link['url']
            # assumes LOCALITY[self.city_pars] is a pair whose [0] holds the
            # three URL path parts and whose [1] is a City id -- TODO confirm
            # against the LOCALITY definition.
            part_url = LOCALITY[self.city_pars]
            pages = xrange(1, link['pages'])
            # link['cat'] packs (moroz flag, Category id) -- presumably; the
            # Category/City lookups hit the database once per VOCABULARY entry.
            cat = Category.objects.get(id=link['cat'][1])
            moroz = link['cat'][0]
            city = City.objects.get(id=LOCALITY[self.city_pars][1])
            for page in pages:
                print 'number_of_pages=', page
                # New proxy for every page request.
                grab.change_proxy()
                grab.setup(
                    url=url %
                    (part_url[0][0], part_url[0][1], part_url[0][2], page),
                    proxy_userpwd=self.CREDENTIALS,
                    hammer_mode=True,
                    hammer_timeouts=HTM,
                    user_agent=ua.random,
                    reuse_cookies=False)
                # check_proxies_for_slando(self, grab=grab, ua=ua.random, url=url % (LOCALITY[self.city_pars][0], page))
                print 'proxy before go of page list ', grab.config['proxy']
                yield Task('link_on_page',
                           delay=4,
                           grab=grab,
                           cat=cat,
                           city=city,
                           moroz=moroz)
Exemplo n.º 3
0
 def test_deprecated_setup_proxylist(self):
     """load_proxylist() with the deprecated 'text_file' source still works."""
     g = Grab(transport=GRAB_TRANSPORT)
     # Use a context manager so the proxy file is flushed and closed before
     # Grab reads it; the original `open(...).write(...)` leaked the handle
     # and relied on refcounting GC to flush the data in time.
     with open(TMP_FILE, 'w') as proxy_file:
         proxy_file.write(PROXY1)
     g.load_proxylist(TMP_FILE, 'text_file')
     SERVER.RESPONSE['get'] = '123'
     g.change_proxy()
     g.go('http://yandex.ru')
     self.assertEqual('123', g.response.body)
     self.assertEqual('yandex.ru', SERVER.REQUEST['headers']['host'])
Exemplo n.º 4
0
 def test_deprecated_setup_proxylist(self):
     """load_proxylist() with the deprecated 'text_file' source still works."""
     g = Grab(transport=GRAB_TRANSPORT)
     # Use a context manager so the proxy file is flushed and closed before
     # Grab reads it; the original `open(...).write(...)` leaked the handle
     # and relied on refcounting GC to flush the data in time.
     with open('/tmp/__proxy.txt', 'w') as proxy_file:
         proxy_file.write(PROXY1)
     g.load_proxylist('/tmp/__proxy.txt', 'text_file')
     SERVER.RESPONSE['get'] = '123'
     g.change_proxy()
     g.go('http://yandex.ru')
     self.assertEqual('123', g.response.body)
     self.assertEqual('yandex.ru', SERVER.REQUEST['headers']['host'])
Exemplo n.º 5
0
 def task_generator(self):
     """Yield a 'link_on_page' Task per page for every VOCABULARY entry.

     Requests use a rotating authenticated proxy list, a random user agent
     and hammer-mode retries; cookies are reused between requests.
     """
     agent = UserAgent()
     grab = Grab(timeout=30)
     grab.load_proxylist('proxy_http_auth.txt',
                         'text_file',
                         proxy_type='http',
                         auto_init=False,
                         auto_change=True)
     # grab.config["thread_number"] = 40
     for entry in VOCABULARY:
         base_url = entry['url']
         cat = entry['cat']
         for page in xrange(1, entry['pages']):
             # Fresh proxy for each page request.
             grab.change_proxy()
             grab.setup(
                 url=base_url % page,
                 proxy_userpwd=CREDENTIALS,
                 hammer_mode=True,
                 hammer_timeouts=((2, 5), (10, 15), (20, 30)),
                 user_agent=agent.random,
                 reuse_cookies=True,
             )
             yield Task('link_on_page', grab=grab, cat=cat)
Exemplo n.º 6
0
 def task_generator(self):
     """Endlessly yield 'link_on_page' Tasks for random 3-digit searches.

     Each iteration picks a random number in [111, 999], rotates the proxy
     and issues the catalog search with a random user agent.
     """
     agent = UserAgent()
     grab = Grab(timeout=30)
     grab.load_proxylist('proxy_http_auth.txt',
                         'text_file',
                         proxy_type='http',
                         auto_init=False,
                         auto_change=True)
     while True:
         query = random.randint(111, 999)
         grab.change_proxy()
         grab.setup(
             url='http://zipexpert.com.ua/catalog/?q=%s&s=' % query,
             # url='http://good-service.com.ua/content/zapchasti-dlya-stiralnykh-mashin-v-kharkove-i-po-vsei-ukraine-optom-i-v-roznitsu',
             proxy_userpwd=CREDENTIALS,
             hammer_mode=True,
             hammer_timeouts=((2, 5), (10, 15), (20, 30)),
             user_agent=agent.random,
             reuse_cookies=False)
         yield Task('link_on_page', grab=grab)
Exemplo n.º 7
0
def search(query, grab=None, limit=None, per_page=None):
    """Yield search results for *query*, fetching result pages one by one.

    Items come from parse_search_results(). Iteration stops after a page
    that yields no items, after the last results page, or after a page on
    which at least *limit* items were seen (the limit is checked per page,
    not cumulatively). Sleeps 3-5 seconds between page fetches.
    """
    if not grab:
        grab = Grab()
    stop = False
    count = 0

    # Start from a clean session; rotate the proxy if a list is loaded.
    grab.clear_cookies()
    if grab.proxylist:
        grab.change_proxy()

    for page in xrange(1, 9999):
        if stop:
            break
        url = build_search_url(query, page, per_page=per_page)
        index_size = None
        # NOTE(review): the return value of grab.go() is rebound to `grab`
        # and then used as a Grab instance below (grab.sleep, is_last_page)
        # -- confirm that go() returns self/a Grab in this Grab version.
        grab = grab.go(url)
        #grab = google_request(url, grab=grab)

        count = 0
        for item in parse_search_results(grab):
            yield item # {url, title, index_size}
            count += 1

        # Empty page: no more results.
        if not count:
            stop = True

        if is_last_page(grab):
            logging.debug('Last page found')
            stop = True

        if limit is not None and count >= limit:
            logging.debug('Limit %d reached' % limit)
            stop = True

        grab.sleep(3, 5)
Exemplo n.º 8
0
class Avito():
    """Scraper for Avito ads: listing links, item details, photos, phone.

    Uses one shared Grab client with hammer-mode retries, an optional proxy
    list, PyV8 to run Avito's phone-deobfuscation JavaScript, and Tesseract
    to OCR the phone number image.
    """

    # Max retry attempts per fetch (overridable via CFG key 'pagetry').
    PAGETRY = 3
    # Seconds to sleep between retries (overridable via CFG key 'sleep').
    SLEEP = 1

    def __init__(self, proxy=None):
        """Configure the Grab client, optional proxy list and the OCR engine."""
        self.PAGETRY, rc = startup.CFG.value('pagetry', Avito.PAGETRY).toInt()
        self.SLEEP, rc = startup.CFG.value('sleep', Avito.SLEEP).toInt()

        self.g = Grab()
        self.g.setup(hammer_mode=True, hammer_timeouts=((2, 5), (5, 10)))
        if __name__ == '__main__':
            # Dump request/response traffic when run as a standalone script.
            self.g.setup(log_dir='dump')
        if proxy:
            self.g.load_proxylist(proxy,
                                  'text_file',
                                  'http',
                                  auto_init=True,
                                  auto_change=False)
        # Tesseract reads the phone number that Avito serves as an image.
        self.tess = Tesseract()
        self.tess.set_pagemode(TessPageSegMode.PSM_SINGLE_LINE)

    def _go3(self, url, tag):
        """Fetch *url*, retrying up to PAGETRY times with a new proxy each try.

        A fetch counts as successful when the page contains Avito's site
        verification token. Raises Exception('<tag> error') when every
        attempt fails (the while/else fires only if the loop never breaks).
        """
        c = self.PAGETRY
        while c:
            try:
                self.g.change_proxy()
                self.g.go(url)
                self.g.assert_substring(u'content="51a6d66a02fb23c7"')
                break
            # NOTE(review): bare except also swallows KeyboardInterrupt and
            # SystemExit -- presumably intentional retry-on-anything; verify.
            except:
                log.exception('%s left %i', tag, c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            raise Exception('%s error' % tag)

    def get_links(self, url):
        """Yield absolute ad URLs from the listing at *url*.

        Follows "next" page links until none remain; hard-capped at 998
        links via the countdown. Raises Exception if a page has no links.
        """
        self._go3(url, 'start page')
        c = 999
        while True:
            links = self.g.doc.select('//h3[@class="title"]/a/@href')
            if not links:
                raise Exception('no links')
            for link in links:
                c -= 1
                if not c:
                    return
                yield urljoin(url, link.text())
            next = self.g.doc.select('//a[@class="next"]/@href')
            if not next:
                log.debug('last page?')
                break
            QtCore.QThread.sleep(self.SLEEP)
            nurl = urljoin(url, next.text())
            log.debug('open next page %s', nurl)
            self._go3(nurl, 'next page')

    def get_photos(self, photos):
        """Download each photo URL and return a list of raw image bodies.

        Each download is validated by the presence of a JFIF marker and
        retried up to PAGETRY times with a proxy change between attempts;
        raises Exception when a photo cannot be fetched at all.
        """
        # Clone so photo requests don't disturb the shared client's state.
        g = self.g.clone()
        datas = []
        for url in photos:
            if not url.startswith('http:'):
                # Protocol-relative URL ("//...") -- prepend the scheme.
                url = 'http:' + url
            c = self.PAGETRY
            while c:
                try:
                    rc = g.go(url)
                    g.assert_substring('JFIF', byte=True)
                    datas.append(rc.body)
                    break
                # NOTE(review): bare except -- retries on any failure.
                except:
                    log.exception('get_item left %i', c)
                    c -= 1
                    g.change_proxy()
                    QtCore.QThread.sleep(self.SLEEP)
            else:
                raise Exception('get photo error')
        return datas

    def get_item(self, url):
        """Scrape a single ad page and return its fields as a dict.

        Returns dict(item, title, photos, price, name, town, desc, phone).
        The phone number is reconstructed by running Avito's phoneDemixer
        JavaScript under PyV8 to build the phone-image URL, then OCR-ing
        that image with Tesseract; phone stays '' if that fails.
        """
        g = self.g.clone()
        c = self.PAGETRY
        while c:
            try:
                g.change_proxy()
                g.go(url)
                g.assert_substring(
                    u'content="499bdc75d3636c55"')  # avitos ya id
                break
            # NOTE(review): bare except -- retries on any failure.
            except:
                log.exception('get_item left %i', c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            raise Exception('get item error')
        doc = g.doc
        title = doc.select('//h1[@itemprop="name"]').text()
        gallery = doc.select('//div[@class="gallery-item"]')
        photos = [
            s.text()
            for s in gallery.select('.//a[@class="gallery-link"]/@href')
        ]
        if not photos:
            # Single-photo pages have no gallery -- fall back to the zoom image.
            egg = doc.select(
                '//img[contains(@class,"j-zoom-gallery-init")]/@src')
            if egg:
                photos.append(egg.text())
        item = doc.select('//div[@itemprop="offers"]')
        #price = item.select('.//strong[@itemprop="price"]').text()
        price = item.select('.//span[@itemprop="price"]').text()
        name = item.select('.//strong[@itemprop="name"]').text()

        try:
            town = item.select(
                './/span[@id="toggle_map"]/span[@itemprop="name"]').text()
        # NOTE(review): bare except used as a markup-variant fallback.
        except:
            log.warning('xpath town not found, try another way')
            town = item.select(
                './/div[@id="map"]/span[@itemprop="name"]').text()
        #desc = item.select('.//div[@id="desc_text"]').text()
        desc = doc.select(
            "//div[contains(@class,\"description-text\")]").text()
        #<span id="item_id">348967351</span>
        item_id = doc.select('//span[@id="item_id"]').text()
        # Pull the obfuscated url/phone tokens out of inline JavaScript.
        _url = g.rex_text("avito.item.url = '([^>]+?)'")
        _phone = g.rex_text("avito.item.phone = '([^>]+?)'")

        loc = {'item_phone': _phone, 'item': {'id': item_id, 'url': _url}}
        log.debug('jslock enter <--')
        # Run Avito's own phoneDemixer JS to derive the phone-image key.
        with PyV8.JSContext(loc) as ctx:
            ctx.enter()
            ctx.eval('''function phoneDemixer(key){var
    pre=key.match(/[0-9a-f]+/g),mixed=(item.id%2===0?pre.reverse():pre).join(''),s=mixed.length,r='',k;for(k=0;k<s;++k){if(k%3===0){r+=mixed.substring(k,k+1);}}
    return r;}''')

            #http://www.avito.ru/items/phone/348967351?pkey=6fec07a9bb9902ad4ccbd87c240cfdc4
            egg = ctx.eval(
                "'/items/phone/'+item.id+'?pkey='+phoneDemixer(item_phone)")
            #egg = ctx.eval("'/items/phone/'+item.url+'?pkey='+phoneDemixer(item_phone)")
            log.debug('js rc %s', egg)
            ctx.leave()
        log.debug('jslock leave -->')
        phone = ''
        c = self.PAGETRY
        while c:
            log.debug('read phone image')
            try:
                rc = g.go(urljoin(url, egg))
                img = Image.open(StringIO(rc.body))
                phone = self.tess.from_image(img,
                                             basewidth=300,
                                             whitelist='0123456789-')
                break
            # NOTE(review): bare except -- retries on any failure.
            except:
                g.change_proxy()
                log.exception('get_phone left %i', c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            # Phone is best-effort: log and return with phone == ''.
            log.debug('get phone error')

        return dict(item=item_id,
                    title=title,
                    photos=photos,
                    price=price,
                    name=name,
                    town=town,
                    desc=desc,
                    phone=phone)
Exemplo n.º 9
0
class Avito:
    """Scraper for Avito ads: listing links, item details, photos, phone.

    Uses one shared Grab client with hammer-mode retries, an optional proxy
    list, PyV8 to run Avito's phone-deobfuscation JavaScript, and Tesseract
    to OCR the phone number image.
    """

    # Max retry attempts per fetch (overridable via CFG key "pagetry").
    PAGETRY = 3
    # Seconds to sleep between retries (overridable via CFG key "sleep").
    SLEEP = 1

    def __init__(self, proxy=None):
        """Configure the Grab client, optional proxy list and the OCR engine."""
        self.PAGETRY, rc = startup.CFG.value("pagetry", Avito.PAGETRY).toInt()
        self.SLEEP, rc = startup.CFG.value("sleep", Avito.SLEEP).toInt()

        self.g = Grab()
        self.g.setup(hammer_mode=True, hammer_timeouts=((2, 5), (5, 10)))
        if __name__ == "__main__":
            # Dump request/response traffic when run as a standalone script.
            self.g.setup(log_dir="dump")
        if proxy:
            self.g.load_proxylist(proxy, "text_file", "http", auto_init=True, auto_change=False)
        # Tesseract reads the phone number that Avito serves as an image.
        self.tess = Tesseract()
        self.tess.set_pagemode(TessPageSegMode.PSM_SINGLE_LINE)

    def _go3(self, url, tag):
        """Fetch *url*, retrying up to PAGETRY times with a new proxy each try.

        A fetch counts as successful when the page contains Avito's site
        verification token. Raises Exception('<tag> error') when every
        attempt fails (the while/else fires only if the loop never breaks).
        """
        c = self.PAGETRY
        while c:
            try:
                self.g.change_proxy()
                self.g.go(url)
                self.g.assert_substring(u'content="51a6d66a02fb23c7"')
                break
            # NOTE(review): bare except also swallows KeyboardInterrupt and
            # SystemExit -- presumably intentional retry-on-anything; verify.
            except:
                log.exception("%s left %i", tag, c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            raise Exception("%s error" % tag)

    def get_links(self, url):
        """Yield absolute ad URLs from the listing at *url*.

        Follows "next" page links until none remain; hard-capped at 998
        links via the countdown. Raises Exception if a page has no links.
        """
        self._go3(url, "start page")
        c = 999
        while True:
            links = self.g.doc.select('//h3[@class="title"]/a/@href')
            if not links:
                raise Exception("no links")
            for link in links:
                c -= 1
                if not c:
                    return
                yield urljoin(url, link.text())
            next = self.g.doc.select('//a[@class="next"]/@href')
            if not next:
                log.debug("last page?")
                break
            QtCore.QThread.sleep(self.SLEEP)
            nurl = urljoin(url, next.text())
            log.debug("open next page %s", nurl)
            self._go3(nurl, "next page")

    def get_photos(self, photos):
        """Download each photo URL and return a list of raw image bodies.

        Each download is validated by the presence of a JFIF marker and
        retried up to PAGETRY times with a proxy change between attempts;
        raises Exception when a photo cannot be fetched at all.
        """
        # Clone so photo requests don't disturb the shared client's state.
        g = self.g.clone()
        datas = []
        for url in photos:
            if not url.startswith("http:"):
                # Protocol-relative URL ("//...") -- prepend the scheme.
                url = "http:" + url
            c = self.PAGETRY
            while c:
                try:
                    rc = g.go(url)
                    g.assert_substring("JFIF", byte=True)
                    datas.append(rc.body)
                    break
                # NOTE(review): bare except -- retries on any failure.
                except:
                    log.exception("get_item left %i", c)
                    c -= 1
                    g.change_proxy()
                    QtCore.QThread.sleep(self.SLEEP)
            else:
                raise Exception("get photo error")
        return datas

    def get_item(self, url):
        """Scrape a single ad page and return its fields as a dict.

        Returns dict(item, title, photos, price, name, town, desc, phone).
        The phone number is reconstructed by running Avito's phoneDemixer
        JavaScript under PyV8 to build the phone-image URL, then OCR-ing
        that image with Tesseract; phone stays "" if that fails.
        """
        g = self.g.clone()
        c = self.PAGETRY
        while c:
            try:
                g.change_proxy()
                g.go(url)
                g.assert_substring(u'content="499bdc75d3636c55"')  # avitos ya id
                break
            # NOTE(review): bare except -- retries on any failure.
            except:
                log.exception("get_item left %i", c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            raise Exception("get item error")
        doc = g.doc
        title = doc.select('//h1[@itemprop="name"]').text()
        gallery = doc.select('//div[@class="gallery-item"]')
        photos = [s.text() for s in gallery.select('.//a[@class="gallery-link"]/@href')]
        if not photos:
            # Single-photo pages have no gallery -- fall back to the zoom image.
            egg = doc.select('//img[contains(@class,"j-zoom-gallery-init")]/@src')
            if egg:
                photos.append(egg.text())
        item = doc.select('//div[@itemprop="offers"]')
        # price = item.select('.//strong[@itemprop="price"]').text()
        price = item.select('.//span[@itemprop="price"]').text()
        name = item.select('.//strong[@itemprop="name"]').text()

        try:
            town = item.select('.//span[@id="toggle_map"]/span[@itemprop="name"]').text()
        # NOTE(review): bare except used as a markup-variant fallback.
        except:
            log.warning("xpath town not found, try another way")
            town = item.select('.//div[@id="map"]/span[@itemprop="name"]').text()
        # desc = item.select('.//div[@id="desc_text"]').text()
        desc = doc.select('//div[contains(@class,"description-text")]').text()
        # <span id="item_id">348967351</span>
        item_id = doc.select('//span[@id="item_id"]').text()
        # Pull the obfuscated url/phone tokens out of inline JavaScript.
        _url = g.rex_text("avito.item.url = '([^>]+?)'")
        _phone = g.rex_text("avito.item.phone = '([^>]+?)'")

        loc = {"item_phone": _phone, "item": {"id": item_id, "url": _url}}
        log.debug("jslock enter <--")
        # Run Avito's own phoneDemixer JS to derive the phone-image key.
        with PyV8.JSContext(loc) as ctx:
            ctx.enter()
            ctx.eval(
                """function phoneDemixer(key){var
    pre=key.match(/[0-9a-f]+/g),mixed=(item.id%2===0?pre.reverse():pre).join(''),s=mixed.length,r='',k;for(k=0;k<s;++k){if(k%3===0){r+=mixed.substring(k,k+1);}}
    return r;}"""
            )

            # http://www.avito.ru/items/phone/348967351?pkey=6fec07a9bb9902ad4ccbd87c240cfdc4
            egg = ctx.eval("'/items/phone/'+item.id+'?pkey='+phoneDemixer(item_phone)")
            # egg = ctx.eval("'/items/phone/'+item.url+'?pkey='+phoneDemixer(item_phone)")
            log.debug("js rc %s", egg)
            ctx.leave()
        log.debug("jslock leave -->")
        phone = ""
        c = self.PAGETRY
        while c:
            log.debug("read phone image")
            try:
                rc = g.go(urljoin(url, egg))
                img = Image.open(StringIO(rc.body))
                phone = self.tess.from_image(img, basewidth=300, whitelist="0123456789-")
                break
            # NOTE(review): bare except -- retries on any failure.
            except:
                g.change_proxy()
                log.exception("get_phone left %i", c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            # Phone is best-effort: log and return with phone == "".
            log.debug("get phone error")

        return dict(item=item_id, title=title, photos=photos, price=price, name=name, town=town, desc=desc, phone=phone)