Example #1
def getproduct(href, item, scan, oneposition):
    pg = Grab(log_dir="/tmp", timeout=30)

    # proxy = Proxy.objects.filter(active=True).order_by('?')[0]

    # # proxyaddrlist = proxy.name.split(':')[0:1]
    # # proxyuserlist = proxy.name.split(':')[2:3]

    # proxyaddr = proxy.name.split(':')[0] + ':' + proxy.name.split(':')[1]
    # proxyuser = proxy.name.split(':')[2] + ':' + proxy.name.split(':')[3]

    # print proxyaddr

    # # assert False, proxyuser

    # pg.setup(proxy=proxyaddr, proxy_userpwd=proxyuser, proxy_type="http")

    pg.load_proxylist(os.path.dirname(os.path.realpath(__file__)) +
                      "/proxy1.txt",
                      source_type='text_file',
                      proxy_type='http',
                      auto_change=True)

    # print pg.config['proxy']

    # pg.load_proxylist(os.path.dirname(os.path.realpath(__file__)) + "/proxy.txt", source_type='text_file', proxy_type='http', auto_change=True)
    try:
        purl = "http://hotline.ua" + href
        pg.go(purl)
        # pass
    except Exception:
        print "Error: " + purl

        return
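
All of the snippets on this page revolve around the same call. Distilled to its common case, it looks like the sketch below — a minimal, self-contained version assuming the old Grab API used throughout these examples and a hypothetical proxies.txt with one host:port entry per line (credentials, where needed, are passed separately via proxy_userpwd, as the examples do):

from grab import Grab

g = Grab(timeout=30)
g.load_proxylist('proxies.txt',        # hypothetical proxy file, one host:port per line
                 source_type='text_file',
                 proxy_type='http',
                 auto_change=True)     # pick a fresh proxy before every request
g.go('http://example.com/')
print(g.config['proxy'])               # the proxy that served the request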
Example #2
    def task_generator(self):
        ua = UserAgent()
        grab = Grab()
        grab.load_proxylist(self.PROXY_PATH,
                            'text_file',
                            proxy_type='http',
                            auto_init=False,
                            auto_change=True)

        for link in VOCABULARY:
            url = link['url']
            part_url = LOCALITY[self.city_pars]
            pages = xrange(1, link['pages'])
            cat = Category.objects.get(id=link['cat'][1])
            moroz = link['cat'][0]
            city = City.objects.get(id=LOCALITY[self.city_pars][1])
            for page in pages:
                print 'number_of_pages=', page
                grab.change_proxy()
                grab.setup(
                    url=url % (part_url[0][0], part_url[0][1],
                               part_url[0][2], page),
                    proxy_userpwd=self.CREDENTIALS,
                    hammer_mode=True,
                    hammer_timeouts=HTM,
                    user_agent=ua.random,
                    reuse_cookies=False)
                # check_proxies_for_slando(self, grab=grab, ua=ua.random, url=url % (LOCALITY[self.city_pars][0], page))
                print 'proxy before go of page list ', grab.config['proxy']
                yield Task('link_on_page',
                           delay=4,
                           grab=grab,
                           cat=cat,
                           city=city,
                           moroz=moroz)
Example #3
 def task_generator(self):
     ua = UserAgent()
     grab = Grab()
     grab.load_proxylist(
         PROXY_PATH,
         'text_file',
         proxy_type='http',
         auto_init=False,
         auto_change=True
     )
     for link in VOCABULARY:
         url = link['url']
         pages = xrange(1, link['pages'])
         cat = link['cat']
         for page in pages:
             grab.change_proxy()
             grab.setup(
                 url=url % page,
                 proxy_userpwd=CREDENTIALS,
                 hammer_mode=True,
                 hammer_timeouts=HTM,
                 user_agent=ua.random,
                 reuse_cookies=False
             )
             yield Task('link_on_page', grab=grab, cat=cat)
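
Examples #2, #3, #8, #10 and #11 all use the same idiom inside a Spider's task_generator: load the list with auto_init=False, then call change_proxy() by hand before configuring each request. A sketch of that rotation loop outside of grab.spider, with a hypothetical URL and credentials:

from grab import Grab

grab = Grab()
grab.load_proxylist('proxies.txt', 'text_file',
                    proxy_type='http',
                    auto_init=False,    # do not pick a proxy at load time
                    auto_change=True)
for page in range(1, 4):
    grab.change_proxy()                        # rotate explicitly per request
    grab.setup(proxy_userpwd='user:password',  # hypothetical credentials
               reuse_cookies=False)
    grab.go('http://example.com/?page=%d' % page)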
Example #4
def get_mag(link):
    idmag = link.split('/')

    firm = FirmHotline.objects.filter(itemid=int(idmag[2]))
    if firm:

        print("Firm exists:" + firm[0].name)
        return firm[0]
    else:
        fg = Grab(log_dir="/tmp", timeout=300)

        fg.load_proxylist(os.path.dirname(os.path.realpath(__file__)) +
                          "/proxy1.txt",
                          source_type='text_file',
                          proxy_type='http',
                          auto_change=True)
        fg.go('http://hotline.ua' + link)
        body = fg.response.body
        pyquery = pq(body)
        name = pyquery('ul.shop-title > li > h1').text()
        try:
            link = pyquery('ul.info-shops > li > p > a')[0].attrib['href']
        except IndexError:  # selector matched no shop link
            link = ""

        firm = FirmHotline(itemid=int(idmag[2]), name=name, url=link)
        firm.save()

        print("New Firm:" + firm.name)
        return firm
Example #5
 def test_deprecated_setup_proxylist(self):
     g = Grab(transport=GRAB_TRANSPORT)
     open('/tmp/__proxy.txt', 'w').write(PROXY1)
     g.load_proxylist('/tmp/__proxy.txt', 'text_file')
     SERVER.RESPONSE['get'] = '123'
     g.change_proxy()
     g.go('http://yandex.ru')
     self.assertEqual('123', g.response.body)
     self.assertEqual('yandex.ru', SERVER.REQUEST['headers']['host'])
Example #6
 def test_deprecated_setup_proxylist(self):
     g = Grab(transport=GRAB_TRANSPORT)
     open(TMP_FILE, 'w').write(PROXY1)
     g.load_proxylist(TMP_FILE, 'text_file')
     SERVER.RESPONSE['get'] = '123'
     g.change_proxy()
     g.go('http://yandex.ru')
     self.assertEqual('123', g.response.body)
     self.assertEqual('yandex.ru', SERVER.REQUEST['headers']['host'])
Example #7
    def handle(self, *args, **options):
        settings = {
            'price__gt': 0,
            'href': '',
        }
        if args:
            category = Category.objects.get(pk=args[0])
            # assert False, category
            settings['product__category'] = category

        # assert False, settings
        # apply the filters built above (including the optional category)
        items = ColorProduct.objects.filter(**settings)
        prefix = 'https://www.google.com.ua/search?q=site:hotline.ua+'

        # assert False, items
        for pitem in items:
            if pitem.productname:
                goog = Grab(log_file='/tmp/log.html')
                # goog.proxylist.set_source('file', load_file=)

                goog.load_proxylist(
                    os.path.dirname(os.path.realpath(__file__)) + "/proxy.txt",
                    source_type='text_file',
                    proxy_type='http',
                    auto_change=True)
                self.stdout.write(str(pitem.pk))
                words = pitem.productname.split(' ')
                searchphrase = '+'.join(words)
                gurl = prefix + searchphrase
                goog.go(gurl)

                # file = codecs.open(PROJECT_ROOT + '/static/test2.txt', "w", "utf-8")
                # file.write(goog.response.body)
                # file.close()

                # assert False, goog.response.body

                if goog.doc.select('//h3[@class="r"]/a').exists():

                    sresult = goog.doc.select('//h3[@class="r"]/a')[0].attr(
                        'href')

                    self.stdout.write(sresult)
                    if "?q=" in sresult:
                        sresult = sresult.split('?q=')[1].split('&')[0]

                    sresult = sresult.replace('http://hotline.ua', '')

                    pitem.href = sresult
                    pitem.hrefok = True
                    pitem.save()

                    self.stdout.write(sresult)
                else:
                    self.stdout.write('net')
Example #8
 def task_generator(self):
     ua = UserAgent()
     grab = Grab(timeout=30)
     grab.load_proxylist('proxy_http_auth.txt', 'text_file',
                         proxy_type='http', auto_init=False, auto_change=True)
     # grab.config["thread_number"] = 40
     for link in VOCABULARY:
         url = link['url']
         pages = xrange(1, link['pages'])
         cat = link['cat']
         for page in pages:
             grab.change_proxy()
             grab.setup(url=url % page, proxy_userpwd=CREDENTIALS,
                        hammer_mode=True, hammer_timeouts=((2, 5), (10, 15), (20, 30)), user_agent=ua.random, reuse_cookies=True)
             yield Task('link_on_page', grab=grab, cat=cat)
Example #9
def Grab(**kwargs):
    grb = GrabLib()
    default_settings = {
        'user_agent_file': USER_AGENT_FILE,
        'connect_timeout': defaults.GRABBER_CONNECT_TIMEOUT,
        'timeout': defaults.GRABBER_TIMEOUT,
        'hammer_mode': True,
        'hammer_timeouts': defaults.GRABBER_HAMMER_TIMEOUTS,
        'headers': defaults.GRABBER_HEADERS
    }
    default_settings.update(kwargs)
    grb.setup(**default_settings)
    grb.load_proxylist(
        source=get_proxies(),
        source_type='list',
        auto_init=True,
        auto_change=True
    )
    return grb
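
Example #9 is the only snippet here that feeds load_proxylist an in-memory list instead of a file; note it aliases the real class as GrabLib because it shadows the name Grab with a factory function. The list form in isolation, with hypothetical addresses standing in for that project's get_proxies() helper:

from grab import Grab

proxies = ['10.0.0.1:8080', '10.0.0.2:8080']  # hypothetical host:port entries

g = Grab()
g.load_proxylist(source=proxies,
                 source_type='list',
                 auto_init=True,     # pick a proxy immediately
                 auto_change=True)   # and rotate on every request
g.go('http://example.com/')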
Example #10
 def task_generator(self):
     ua = UserAgent()
     grab = Grab(timeout=30)
     grab.load_proxylist('proxy_http_auth.txt',
                         'text_file',
                         proxy_type='http',
                         auto_init=False,
                         auto_change=True)
     while True:
         dig = random.randint(111, 999)
         grab.change_proxy()
         grab.setup(
             url='http://zipexpert.com.ua/catalog/?q=%s&s=' % dig,
             # url='http://good-service.com.ua/content/zapchasti-dlya-stiralnykh-mashin-v-kharkove-i-po-vsei-ukraine-optom-i-v-roznitsu',
             proxy_userpwd=CREDENTIALS,
             hammer_mode=True,
             hammer_timeouts=((2, 5), (10, 15), (20, 30)),
             user_agent=ua.random,
             reuse_cookies=False)
         yield Task('link_on_page', grab=grab)
Example #11
 def task_generator(self):
     ua = UserAgent()
     grab = Grab()
     grab.load_proxylist(PROXY_PATH_fine,
                         'text_file',
                         proxy_type='http',
                         auto_init=False,
                         auto_change=True)
     g = Grab()
     g.go(u'http://dom.ria.com/ru/Каталог/Продажа-аренда/')
     print 'catalog links found:', g.doc.select(
         '//a[@class="photo photo-185x120"]')
     for item in g.doc.select('//a[@class="photo photo-185x120"]'):
         print u'http://dom.ria.com%s' % item.attr('href')
         grab.setup(url=u'http://dom.ria.com%s' % item.attr('href'),
                    proxy_userpwd=CREDENTIALS_fine,
                    hammer_mode=True,
                    hammer_timeouts=HTM,
                    user_agent=ua.random,
                    reuse_cookies=False)
         yield Task('link_on_page', grab=grab)
Example #12
    def test_load_proxylist(self):
        content = '%s\n%s\n%s' % (PROXY1, PROXY2, PROXY3)
        open(TMP_FILE, 'w').write(content)

        # By default auto_change is True
        g = Grab(transport=GRAB_TRANSPORT)
        g.load_proxylist(TMP_FILE, 'text_file')
        self.assertEqual(g.config['proxy_auto_change'], True)
        servers = set()
        for x in xrange(10):
            g.go('http://yandex.ru')
            servers.add(g.config['proxy'])

        self.assertTrue(len(servers) > 1)

        # Disable auto_change
        # By default auto_init is True
        g = Grab(transport=GRAB_TRANSPORT)
        g.load_proxylist(TMP_FILE, 'text_file', auto_change=False)
        self.assertEqual(g.config['proxy_auto_change'], False)
        servers = set()
        for x in xrange(10):
            g.go('http://yandex.ru')
            servers.add(g.config['proxy'])
        self.assertEqual(len(servers), 1)

        # Disable auto_change
        # Disable auto_init
        # Proxylist will not be used by default
        g = Grab(transport=GRAB_TRANSPORT)
        g.load_proxylist(TMP_FILE, 'text_file', auto_change=False,
                         auto_init=False)
        self.assertEqual(g.config['proxy_auto_change'], False)
        g.go('http://yandex.ru')
        self.assertEqual(g.config['proxy'], None)
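
The test above is the clearest statement of what the two flags do. Restated as a sketch (same assumed API, hypothetical proxies.txt):

from grab import Grab

g = Grab()

# Defaults (auto_init=True, auto_change=True): a proxy is chosen when the
# list loads and replaced before every request, so g.config['proxy'] varies.
g.load_proxylist('proxies.txt', 'text_file')

# auto_change=False: the initially chosen proxy is kept for all requests.
g.load_proxylist('proxies.txt', 'text_file', auto_change=False)

# auto_init=False as well: the list is loaded but no proxy is applied until
# change_proxy() is called, so g.config['proxy'] stays None.
g.load_proxylist('proxies.txt', 'text_file',
                 auto_change=False, auto_init=False)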
Example #13
    def handle(self, *args, **options):
        msg = ""

        goog = Grab(log_file='/tmp/log.html')
        goog.load_proxylist(os.path.dirname(os.path.realpath(__file__)) +
                            "/proxy1.txt",
                            source_type='text_file',
                            proxy_type='http',
                            auto_change=True)
        goog = scanhot("http://hotline.ua/catalog/")
        pquery = pq(goog.response.body)

        cells = pquery('.all-cat.m_t-30 li > a')

        # assert False, len(cells)

        # assert False, cells[0].outerHtml()

        # items = []

        for col in cells:
            msg += "python manage.py scanhotall " + col.attrib['href'] + "\n"

        import codecs
        with codecs.open(PROJECT_PATH + '/../bigname.txt', "w", encoding="utf-8") as f:
            f.write(unicode(msg))
Example #14
    def test_change_proxy(self):
        g = Grab(transport=GRAB_TRANSPORT)
        with open(TMP_FILE, 'w') as out:
            for x in xrange(10):
                out.write('server-%d:777\n' % x)

        g.load_proxylist(TMP_FILE, 'text_file', auto_init=False, auto_change=False)
        self.assertEqual(g.config['proxy'], None)

        g.load_proxylist(TMP_FILE, 'text_file', auto_init=False, auto_change=True)
        self.assertEqual(g.config['proxy'], None)

        g.load_proxylist(TMP_FILE, 'text_file', auto_init=True, auto_change=False)
        self.assertTrue('server-' in g.config['proxy'])
Example #15
    def test_change_proxy(self):
        g = Grab(transport=GRAB_TRANSPORT)
        with open('/tmp/__proxy.txt', 'w') as out:
            for x in xrange(10):
                out.write('server-%d:777\n' % x)

        g.load_proxylist('/tmp/__proxy.txt',
                         'text_file',
                         auto_init=False,
                         auto_change=False)
        self.assertEqual(g.config['proxy'], None)

        g.load_proxylist('/tmp/__proxy.txt',
                         'text_file',
                         auto_init=False,
                         auto_change=True)
        self.assertEqual(g.config['proxy'], None)

        g.load_proxylist('/tmp/__proxy.txt',
                         'text_file',
                         auto_init=True,
                         auto_change=False)
        self.assertTrue('server-' in g.config['proxy'])
Example #16
    def test_load_proxylist(self):
        content = '%s\n%s\n%s' % (PROXY1, PROXY2, PROXY3)
        open('/tmp/__proxy.txt', 'w').write(content)

        # By default auto_change is True
        g = Grab(transport=GRAB_TRANSPORT)
        g.load_proxylist('/tmp/__proxy.txt', 'text_file')
        self.assertEqual(g.config['proxy_auto_change'], True)
        servers = set()
        for x in xrange(10):
            g.go('http://yandex.ru')
            servers.add(g.config['proxy'])

        self.assertTrue(len(servers) > 1)

        # Disable auto_change
        # By default auto_init is True
        g = Grab(transport=GRAB_TRANSPORT)
        g.load_proxylist('/tmp/__proxy.txt', 'text_file', auto_change=False)
        self.assertEqual(g.config['proxy_auto_change'], False)
        servers = set()
        for x in xrange(10):
            g.go('http://yandex.ru')
            servers.add(g.config['proxy'])
        self.assertEqual(len(servers), 1)

        # Disable auto_change
        # Disable auto_init
        # Proxylist will not be used by default
        g = Grab(transport=GRAB_TRANSPORT)
        g.load_proxylist('/tmp/__proxy.txt',
                         'text_file',
                         auto_change=False,
                         auto_init=False)
        self.assertEqual(g.config['proxy_auto_change'], False)
        g.go('http://yandex.ru')
        self.assertEqual(g.config['proxy'], None)
Example #17
class Avito():
    PAGETRY = 3
    SLEEP = 1

    def __init__(self, proxy=None):
        self.PAGETRY, rc = startup.CFG.value('pagetry', Avito.PAGETRY).toInt()
        self.SLEEP, rc = startup.CFG.value('sleep', Avito.SLEEP).toInt()

        self.g = Grab()
        self.g.setup(hammer_mode=True, hammer_timeouts=((2, 5), (5, 10)))
        if __name__ == '__main__':
            self.g.setup(log_dir='dump')
        if proxy:
            self.g.load_proxylist(proxy,
                                  'text_file',
                                  'http',
                                  auto_init=True,
                                  auto_change=False)
        self.tess = Tesseract()
        self.tess.set_pagemode(TessPageSegMode.PSM_SINGLE_LINE)

    def _go3(self, url, tag):
        c = self.PAGETRY
        while c:
            try:
                self.g.change_proxy()
                self.g.go(url)
                self.g.assert_substring(u'content="51a6d66a02fb23c7"')
                break
            except:
                log.exception('%s left %i', tag, c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            raise Exception('%s error' % tag)

    def get_links(self, url):
        self._go3(url, 'start page')
        c = 999
        while True:
            links = self.g.doc.select('//h3[@class="title"]/a/@href')
            if not links:
                raise Exception('no links')
            for link in links:
                c -= 1
                if not c:
                    return
                yield urljoin(url, link.text())
            next = self.g.doc.select('//a[@class="next"]/@href')
            if not next:
                log.debug('last page?')
                break
            QtCore.QThread.sleep(self.SLEEP)
            nurl = urljoin(url, next.text())
            log.debug('open next page %s', nurl)
            self._go3(nurl, 'next page')

    def get_photos(self, photos):
        g = self.g.clone()
        datas = []
        for url in photos:
            if not url.startswith('http:'):
                url = 'http:' + url
            c = self.PAGETRY
            while c:
                try:
                    rc = g.go(url)
                    g.assert_substring('JFIF', byte=True)
                    datas.append(rc.body)
                    break
                except:
                    log.exception('get_item left %i', c)
                    c -= 1
                    g.change_proxy()
                    QtCore.QThread.sleep(self.SLEEP)
            else:
                raise Exception('get photo error')
        return datas

    def get_item(self, url):
        g = self.g.clone()
        c = self.PAGETRY
        while c:
            try:
                g.change_proxy()
                g.go(url)
                g.assert_substring(
                    u'content="499bdc75d3636c55"')  # avitos ya id
                break
            except:
                log.exception('get_item left %i', c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            raise Exception('get item error')
        doc = g.doc
        title = doc.select('//h1[@itemprop="name"]').text()
        gallery = doc.select('//div[@class="gallery-item"]')
        photos = [
            s.text()
            for s in gallery.select('.//a[@class="gallery-link"]/@href')
        ]
        if not photos:
            egg = doc.select(
                '//img[contains(@class,"j-zoom-gallery-init")]/@src')
            if egg:
                photos.append(egg.text())
        item = doc.select('//div[@itemprop="offers"]')
        #price = item.select('.//strong[@itemprop="price"]').text()
        price = item.select('.//span[@itemprop="price"]').text()
        name = item.select('.//strong[@itemprop="name"]').text()

        try:
            town = item.select(
                './/span[@id="toggle_map"]/span[@itemprop="name"]').text()
        except:
            log.warning('xpath town not found, try another way')
            town = item.select(
                './/div[@id="map"]/span[@itemprop="name"]').text()
        #desc = item.select('.//div[@id="desc_text"]').text()
        desc = doc.select(
            "//div[contains(@class,\"description-text\")]").text()
        #<span id="item_id">348967351</span>
        item_id = doc.select('//span[@id="item_id"]').text()
        _url = g.rex_text("avito.item.url = '([^>]+?)'")
        _phone = g.rex_text("avito.item.phone = '([^>]+?)'")

        loc = {'item_phone': _phone, 'item': {'id': item_id, 'url': _url}}
        log.debug('jslock enter <--')
        with PyV8.JSContext(loc) as ctx:
            ctx.enter()
            ctx.eval('''function phoneDemixer(key){var
    pre=key.match(/[0-9a-f]+/g),mixed=(item.id%2===0?pre.reverse():pre).join(''),s=mixed.length,r='',k;for(k=0;k<s;++k){if(k%3===0){r+=mixed.substring(k,k+1);}}
    return r;}''')

            #http://www.avito.ru/items/phone/348967351?pkey=6fec07a9bb9902ad4ccbd87c240cfdc4
            egg = ctx.eval(
                "'/items/phone/'+item.id+'?pkey='+phoneDemixer(item_phone)")
            #egg = ctx.eval("'/items/phone/'+item.url+'?pkey='+phoneDemixer(item_phone)")
            log.debug('js rc %s', egg)
            ctx.leave()
        log.debug('jslock leave -->')
        phone = ''
        c = self.PAGETRY
        while c:
            log.debug('read phone image')
            try:
                rc = g.go(urljoin(url, egg))
                img = Image.open(StringIO(rc.body))
                phone = self.tess.from_image(img,
                                             basewidth=300,
                                             whitelist='0123456789-')
                break
            except:
                g.change_proxy()
                log.exception('get_phone left %i', c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            log.debug('get phone error')

        return dict(item=item_id,
                    title=title,
                    photos=photos,
                    price=price,
                    name=name,
                    town=town,
                    desc=desc,
                    phone=phone)
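
_go3(), get_photos() and get_item() above all wrap the fetch in the same retry shape: rotate the proxy, try, and let Python's while/else fire the else clause only when the counter runs out without a break. The skeleton on its own, with a hypothetical URL:

from grab import Grab

g = Grab()
g.load_proxylist('proxies.txt', 'text_file', proxy_type='http',
                 auto_init=True, auto_change=False)

attempts = 3
while attempts:
    try:
        g.change_proxy()                 # try the next proxy on this attempt
        g.go('http://example.com/')
        break                            # success: leave the retry loop
    except Exception:
        attempts -= 1                    # failure: burn one attempt and retry
else:                                    # runs only if attempts were exhausted
    raise Exception('page fetch error')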
Example #18
    def handle(self, *args, **options):
        filterdb = db.hotfilters
        filteritemdb = db.hotfiltersitems
        category = db.hotcategory
        product = db.hotproduct

        url = "/mobile/umnye-chasy-smartwatch/"
        msg = ""

        itemsname = []
        itemsvalues = []

        goog = Grab(log_file='/tmp/log.html')
        goog.load_proxylist(os.path.dirname(os.path.realpath(__file__)) +
                            "/proxy1.txt",
                            source_type='text_file',
                            proxy_type='http',
                            auto_change=True)
        goog.go("http://hotline.ua" + url)

        pquery = pq(goog.response.body)

        categoryname = pquery('h1.title-24.p_b-10')[0].text
        itemscount = float(
            pquery('h2.selected-filtrs.grey-6')[0].text.split(':')[1].strip())
        pages = int(math.ceil(itemscount / 24))

        catdata = category.find_one({'url': url})

        if catdata:
            cat_id = catdata['_id']
            msg += u"Category used: " + categoryname + u" .Used proxy: " + goog.config[
                'proxy'] + '\n'

        else:
            cat_id = category.insert_one({
                'name': categoryname,
                'count': itemscount,
                'pages': pages,
                'url': url
            }).inserted_id
            msg += u"Category created: " + categoryname + u" .Used proxy: " + goog.config[
                'proxy'] + '\n'

        # assert False, itemscount

        names = pquery('#filters > .cell.f-title')

        i = 0
        for name in names:
            fitemstr = pquery('#filters > .cell.full-list').eq(i)

            if name.text:
                nametext = name.text.strip()
                if nametext not in itemsname:
                    filteritem = filterdb.find_one({
                        'name': nametext,
                        'category': ObjectId(cat_id)
                    })
                    if filteritem:
                        filterinserted_id = filteritem['_id']
                        msg += u"Filter used: " + nametext + " .Used proxy: " + goog.config[
                            'proxy'] + '\n'

                    else:
                        fi = {'name': nametext, 'category': ObjectId(cat_id)}
                        msg += u"Filter created: " + nametext + " .Used proxy: " + goog.config[
                            'proxy'] + '\n'
                        filterinserted_id = filterdb.insert_one(fi).inserted_id

                    for fitem in pq(fitemstr)('.f-item > a'):
                        fitext = fitem.text.strip()
                        href = fitem.attrib['href']
                        fid = int(href.split('/')[3])
                        filteritem = filteritemdb.find_one({'href': href})
                        if not filteritem:
                            filteriteminserted = filteritemdb.insert_one({
                                'name': fitext,
                                'href': href,
                                'hid': fid,
                                'filter': ObjectId(filterinserted_id),
                                'finished': False
                            })
            i += 1

        for i in range(0, pages):
            self.stdout.write('Page now: ' + str(i))

            goog.load_proxylist(os.path.dirname(os.path.realpath(__file__)) +
                                "/proxy1.txt",
                                source_type='text_file',
                                proxy_type='http',
                                auto_change=True)
            goog.go("http://hotline.ua" + url + "?p=" + str(i))

            productlinks = pquery(
                goog.response.body)('.cell.gd b.m_r-10 > a.g_statistic')

            for productlink in productlinks:
                producturl = productlink.attrib['href']
                tempproduct = product.find_one({'url': producturl})
                if tempproduct:
                    msg += u"Product used: " + tempproduct[
                        'name'] + " .Used proxy: " + goog.config['proxy'] + '\n'
                else:

                    self.stdout.write(producturl)
                    productitem = get_hotline_data(
                        'http://hotline.ua' + producturl, False, False)
                    msg += u"Product created: " + productitem[
                        'name'] + " .Used proxy: " + goog.config['proxy'] + '\n'
                    productitem['category'] = ObjectId(cat_id)
                    product.insert_one(productitem)

        self.stdout.write(msg)
        send_mail('Scan category:' + url,
                  msg,
                  '*****@*****.**', ['*****@*****.**'],
                  fail_silently=False)
Example #19
    def handle(self, *args, **options):
        def getproductsurl(pageurl):

            self.stdout.write("Get pages of product. Page: " +
                              str(pageurls.index(pageurl)) + ', From:' +
                              str(allpagesurls))

            allproductslinks = []
            goog = scanhot(pageurl)
            productlinks = pq(
                goog.response.body)('.cell.gd b.m_r-10 > a.g_statistic')
            for productlink in productlinks:
                producturl = productlink.attrib['href']
                tempproduct = product.find_one({'url': producturl})
                # tempproduct = False
                if not tempproduct:
                    # msg += u"Product used: " + tempproduct['name'] + " .Used proxy: " + goog.config['proxy'] + '\n'
                    allproductslinks.append('http://hotline.ua' + producturl)
                # else:
            return allproductslinks

        def addfilterstoproduct(onef):

            toend = int(db.hotsettings.find_one({"name": "toend"})['value'])

            # assert False, toend
            toend -= 1
            db.hotsettings.update({"name": "toend"}, {
                'name': 'toend',
                'value': toend
            })

            # filterspagedokonca -= 1
            # filterspagecount = len()
            self.stdout.write("To end:" + str(toend))
            goog = scanhot(onef['href'])
            # itemscount = float(goog.doc.pyquery('h2.selected-filtrs.grey-6')[0].text.split(' ')[1].strip())
            # pages = int(math.ceil(itemscount/24))
            productlinks = goog.doc.pyquery(
                '.cell.gd b.m_r-10 > a.g_statistic')

            # if goog.doc.pyquery('.g-recaptcha').eq(0):
            #     finished = False

            for productlink in productlinks:
                producturl = productlink.attrib['href']
                # self.stdout.write(producturl)

                item = product.find_one({'url': producturl})
                if item:
                if 'filters' not in item:
                    item['filters'] = []

                if onef['filter'] not in item['filters']:
                    item['filters'].append(onef['filter'])

                    product.update({'url': producturl}, item)
                # else:
                #     error += "Not Found: " + producturl + ", " + onef['href'] + "\n"

        # with Profiler() as p:

        #     urls = [
        #         'http://www.python.org',
        #         'http://www.python.org/about/',
        #         'http://www.onlamp.com/pub/a/python/2003/04/17/metaclasses.html',
        #         'http://www.python.org/doc/',
        #         'http://www.python.org/download/',
        #         'http://www.python.org/getit/',
        #         'http://www.python.org/community/',
        #         'https://wiki.python.org/moin/',
        #         'http://planet.python.org/',
        #         'https://wiki.python.org/moin/LocalUserGroups',
        #         'http://www.python.org/psf/',
        #         'http://docs.python.org/devguide/',
        #         'http://www.python.org/community/awards/'
        #         # etc..
        #         ]

        #     # Make the Pool of workers
        #     pool = ThreadPool(1)

        #     # Open the urls in their own threads
        #     # and return the results
        #     results = pool.map(urllib2.urlopen, urls)

        #     #close the pool and wait for the work to finish
        #     pool.close()
        #     pool.join()

        allproductslinks = []
        filterspagedokonca = 0
        filterdb = db.hotfilters
        filteritemdb = db.hotfiltersitems
        category = db.hotcategory
        product = db.hotproduct
        propertydb = db.hotproperty
        settingdb = db.hotsettings
        allproductslinks = []

        if settingdb.find_one({'name': 'status'}):
            settingdb.update({'name': "status"}, {
                "value": "started",
                "name": "status"
            })
        else:
            settingdb.insert_one({"value": "started", "name": "status"})

        product.remove({"name": ""})

        self.stdout.write(args[0])

        # assert False, args[0]
        url = args[0]
        if len(args) > 1:
            direction = args[1]
        else:
            direction = 0
        msg = ""
        error = ""

        itemsname = []
        itemsvalues = []
        filterpages = []

        goog = Grab(log_file='/tmp/log.html')
        goog.load_proxylist(os.path.dirname(os.path.realpath(__file__)) +
                            "/proxy1.txt",
                            source_type='text_file',
                            proxy_type='http',
                            auto_change=True)
        goog = scanhot("http://hotline.ua" + url)

        pquery = pq(goog.response.body)

        categoryname = pquery('h1.title-24.p_b-10')[0].text
        itemscount = float(
            pquery('h2.selected-filtrs.grey-6')[0].text.split(':')[1].strip())
        pages = int(math.ceil(itemscount / 24))

        catdata = category.find_one({'url': url})

        if catdata:
            cat_id = catdata['_id']
            msg += u"Category used: " + categoryname + u" .Used proxy: " + goog.config[
                'proxy'] + '\n'

        else:
            cat_id = category.insert_one({
                'name': categoryname,
                'count': itemscount,
                'pages': pages,
                'url': url
            }).inserted_id
            msg += u"Category created: " + categoryname + u" .Used proxy: " + goog.config[
                'proxy'] + '\n'

        # assert False, itemscount

        settingdb.insert_one({"value": categoryname, "name": "status"})

        names = pquery('#filters > .cell.f-title')

        i = 0
        for name in names:

            # if direction:
            index = i - int(direction)

            if index >= 0:

                fitemstr = pquery('#filters > .cell.full-list').eq(index)

                if name.text:
                    nametext = name.text.strip()
                    if nametext not in itemsname:
                        filteritem = filterdb.find_one({
                            'name': nametext,
                            'category': ObjectId(cat_id)
                        })
                        if filteritem:
                            filterinserted_id = filteritem['_id']
                            msg += u"Filter used: " + nametext + " .Used proxy: " + goog.config[
                                'proxy'] + '\n'

                        else:
                            fi = {
                                'name': nametext,
                                'category': ObjectId(cat_id)
                            }
                            msg += u"Filter created: " + nametext + " .Used proxy: " + goog.config[
                                'proxy'] + '\n'
                            filterinserted_id = filterdb.insert_one(
                                fi).inserted_id

                        for fitem in pq(fitemstr)('.f-item > a'):

                            try:
                                string = tostring(fitem)
                                m = re.search(r"\(([0-9_]+)\)", string)
                                fitems = float(m.group(1))
                                # itemscount = float(goog.doc.pyquery('h2.selected-filtrs.grey-6')[0].text.split(' ')[1].strip())
                                fpages = int(math.ceil(fitems / 24))
                            except Exception, e:
                                fpages = 1
                                fitems = 1

                                # raise e

                            fitext = fitem.text.strip()
                            href = fitem.attrib['href']
                            fid = int(href.split('/')[3])
                            filteritem = filteritemdb.remove({'href': href})
                            # if not filteritem:
                            filteriteminserted = filteritemdb.insert_one({
                                'name': fitext,
                                'href': href,
                                'hid': fid,
                                'filter': ObjectId(filterinserted_id),
                                'items': fitems,
                                'pages': fpages,
                                'finished': False
                            })

                            for fi in range(0, fpages):
                                filterpages.append({
                                    "href": ("http://hotline.ua" + href +
                                             "?p=" + str(fi)),
                                    "filter": fid
                                })
            i += 1
Example #20
    def handle(self, *args, **options):
        def decode_captcha(newcaptcha, pitem):
            goog.go("https://ipv4.google.com/sorry/CaptchaRedirect?continue=" +
                    newcaptcha.continueurl + '&id=' + newcaptcha.hiddenid +
                    '&captcha=' + newcaptcha.response)
            if goog.doc.select('//h3[@class="r"]/a').exists():
                # assert False, 'we are here'
                sresult = goog.doc.select('//h3[@class="r"]/a')[0].attr('href')
                get_hotline_data(sresult, pitem)
            else:
                pass

        def wait_enter_captcha(captcha, timer, pitem):
            self.stdout.write('captcha continueurl:' + continueurl)
            newcaptcha = EcomerceCaptcha.objects.filter(pk=captcha.pk).get()
            if newcaptcha.response:
                decode_captcha(newcaptcha, pitem)
            else:
                self.stdout.write('Pause: ' + str(timer))
                time.sleep(timer)
                timer = timer + 5
                wait_enter_captcha(captcha, timer, pitem)

        prefix = 'https://www.google.com.ua/search?q=site:hotline.ua+'

        items = PriceString.objects.filter(checked=False)

        for pitem in items:
            if pitem.name:
                goog = Grab(log_file='/tmp/log.html')
                pitem.checked = True
                pitem.save()
                # goog.proxylist.set_source('file', load_file=)

                goog.load_proxylist(
                    os.path.dirname(os.path.realpath(__file__)) + "/proxy.txt",
                    source_type='text_file',
                    proxy_type='http',
                    auto_change=True)
                self.stdout.write(str(pitem.pk))
                words = pitem.name.split(' ')
                searchphrase = '+'.join(words)
                gurl = prefix + searchphrase
                goog.go(gurl)

                # file = codecs.open(PROJECT_ROOT + '/static/test2.txt', "w", "utf-8")
                # file.write(goog.response.body)
                # file.close()

                # assert False, goog.response.body

                if goog.doc.select('//h3[@class="r"]/a').exists():

                    sresult = goog.doc.select('//h3[@class="r"]/a')[0].attr(
                        'href')

                    self.stdout.write(sresult)
                    if "?q=" in sresult:
                        sresult = sresult.split('?q=')[1].split('&')[0]
                    self.stdout.write(sresult)
                    get_hotline_data(sresult, pitem)

                else:

                    if (goog.doc.select('//input[@name="id"]').exists()):

                        fname = random.randint(1, 110000000)

                        hiddenid = goog.doc.select(
                            '//input[@name="id"]')[0].attr('value')
                        continueurl = goog.doc.select(
                            '//input[@name="continue"]')[0].attr('value')

                        goog.go("https://ipv4.google.com" +
                                goog.doc.select('//img')[0].attr('src'))

                        f = open(
                            PROJECT_ROOT + '/media/captcha/' + str(fname) +
                            '.jpg', 'w+')
                        f.write(goog.response.body)
                        f.close()

                        captcha = EcomerceCaptcha(request=fname,
                                                  hiddenid=hiddenid,
                                                  continueurl=continueurl)
                        captcha.save()

                        wait_enter_captcha(captcha, 5, pitem)
Example #21
class Avito:
    PAGETRY = 3
    SLEEP = 1

    def __init__(self, proxy=None):
        self.PAGETRY, rc = startup.CFG.value("pagetry", Avito.PAGETRY).toInt()
        self.SLEEP, rc = startup.CFG.value("sleep", Avito.SLEEP).toInt()

        self.g = Grab()
        self.g.setup(hammer_mode=True, hammer_timeouts=((2, 5), (5, 10)))
        if __name__ == "__main__":
            self.g.setup(log_dir="dump")
        if proxy:
            self.g.load_proxylist(proxy, "text_file", "http", auto_init=True, auto_change=False)
        self.tess = Tesseract()
        self.tess.set_pagemode(TessPageSegMode.PSM_SINGLE_LINE)

    def _go3(self, url, tag):
        c = self.PAGETRY
        while c:
            try:
                self.g.change_proxy()
                self.g.go(url)
                self.g.assert_substring(u'content="51a6d66a02fb23c7"')
                break
            except:
                log.exception("%s left %i", tag, c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            raise Exception("%s error" % tag)

    def get_links(self, url):
        self._go3(url, "start page")
        c = 999
        while True:
            links = self.g.doc.select('//h3[@class="title"]/a/@href')
            if not links:
                raise Exception("no links")
            for link in links:
                c -= 1
                if not c:
                    return
                yield urljoin(url, link.text())
            next = self.g.doc.select('//a[@class="next"]/@href')
            if not next:
                log.debug("last page?")
                break
            QtCore.QThread.sleep(self.SLEEP)
            nurl = urljoin(url, next.text())
            log.debug("open next page %s", nurl)
            self._go3(nurl, "next page")

    def get_photos(self, photos):
        g = self.g.clone()
        datas = []
        for url in photos:
            if not url.startswith("http:"):
                url = "http:" + url
            c = self.PAGETRY
            while c:
                try:
                    rc = g.go(url)
                    g.assert_substring("JFIF", byte=True)
                    datas.append(rc.body)
                    break
                except:
                    log.exception("get_item left %i", c)
                    c -= 1
                    g.change_proxy()
                    QtCore.QThread.sleep(self.SLEEP)
            else:
                raise Exception("get photo error")
        return datas

    def get_item(self, url):
        g = self.g.clone()
        c = self.PAGETRY
        while c:
            try:
                g.change_proxy()
                g.go(url)
                g.assert_substring(u'content="499bdc75d3636c55"')  # avitos ya id
                break
            except:
                log.exception("get_item left %i", c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            raise Exception("get item error")
        doc = g.doc
        title = doc.select('//h1[@itemprop="name"]').text()
        gallery = doc.select('//div[@class="gallery-item"]')
        photos = [s.text() for s in gallery.select('.//a[@class="gallery-link"]/@href')]
        if not photos:
            egg = doc.select('//img[contains(@class,"j-zoom-gallery-init")]/@src')
            if egg:
                photos.append(egg.text())
        item = doc.select('//div[@itemprop="offers"]')
        # price = item.select('.//strong[@itemprop="price"]').text()
        price = item.select('.//span[@itemprop="price"]').text()
        name = item.select('.//strong[@itemprop="name"]').text()

        try:
            town = item.select('.//span[@id="toggle_map"]/span[@itemprop="name"]').text()
        except:
            log.warning("xpath town not found, try another way")
            town = item.select('.//div[@id="map"]/span[@itemprop="name"]').text()
        # desc = item.select('.//div[@id="desc_text"]').text()
        desc = doc.select('//div[contains(@class,"description-text")]').text()
        # <span id="item_id">348967351</span>
        item_id = doc.select('//span[@id="item_id"]').text()
        _url = g.rex_text("avito.item.url = '([^>]+?)'")
        _phone = g.rex_text("avito.item.phone = '([^>]+?)'")

        loc = {"item_phone": _phone, "item": {"id": item_id, "url": _url}}
        log.debug("jslock enter <--")
        with PyV8.JSContext(loc) as ctx:
            ctx.enter()
            ctx.eval(
                """function phoneDemixer(key){var
    pre=key.match(/[0-9a-f]+/g),mixed=(item.id%2===0?pre.reverse():pre).join(''),s=mixed.length,r='',k;for(k=0;k<s;++k){if(k%3===0){r+=mixed.substring(k,k+1);}}
    return r;}"""
            )

            # http://www.avito.ru/items/phone/348967351?pkey=6fec07a9bb9902ad4ccbd87c240cfdc4
            egg = ctx.eval("'/items/phone/'+item.id+'?pkey='+phoneDemixer(item_phone)")
            # egg = ctx.eval("'/items/phone/'+item.url+'?pkey='+phoneDemixer(item_phone)")
            log.debug("js rc %s", egg)
            ctx.leave()
        log.debug("jslock leave -->")
        phone = ""
        c = self.PAGETRY
        while c:
            log.debug("read phone image")
            try:
                rc = g.go(urljoin(url, egg))
                img = Image.open(StringIO(rc.body))
                phone = self.tess.from_image(img, basewidth=300, whitelist="0123456789-")
                break
            except:
                g.change_proxy()
                log.exception("get_phone left %i", c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            log.debug("get phone error")

        return dict(item=item_id, title=title, photos=photos, price=price, name=name, town=town, desc=desc, phone=phone)
Example #22
def scancategory(scan, now=0):

    # scan = ScanHotline(pub_date=datetime.datetime.now(), category=category)
    # scan.save()
    # get all active products
    items = ColorProduct.objects.filter(price__gt=0,
                                        product__category=scan.category,
                                        product__published=True)

    scan.started = True
    scan.items = len(items)
    scan.nowitems = now
    scan.save()

    # assert False, len(items)

    g = Grab(log_file='/tmp/log.html', timeout=300)

    g.load_proxylist(os.path.dirname(os.path.realpath(__file__)) +
                     "/proxy1.txt",
                     source_type='text_file',
                     proxy_type='http',
                     auto_change=True)
    # g.load_proxylist(proxy_file='/tmp/proxy.txt', proxy_type='http')

    prefix = 'http://hotline.ua/sr/?tab=pr&sort=1&q='

    myfirmid = 22242

    if now:
        items = items[:now - 1]

    # titems = []
    # import re
    for item in items:

        scan.nowitems += 1
        scan.lastitems = 0
        scan.pause = 0
        scan.save()
        # if item.name:
        # item.name = 'Alpine SWG-1244 12" (30 см)'
        result = re.sub("[^A-Za-z0-9 ()-]", "", item.productname)
        # titems.append(result)
        # assert False, item.name
        words = result.split(' ')
        searchphrase = '+'.join(words)
        gurl = prefix + searchphrase
        # assert False, item.id
        # assert False, searchphrase

        oneposition = OneHotline(product=item, scan=scan)
        oneposition.save()
        print(123)
        if item.hrefok:
            getproduct(item.href, item, scan, oneposition)
        else:
            print("net")

    scan.finished = True
    scan.pub_date = datetime.datetime.now()
    scan.save()
Example #23
    def handle(self, *args, **options):
        filterdb = db.hotfilters
        filteritemdb = db.hotfiltersitems
        category = db.hotcategory
        product = db.hotproduct

        url = "/mobile/umnye-chasy-smartwatch/"

        cat_id = category.find_one({'url': url})['_id']

        mainfilters = list(filterdb.find({'category': ObjectId(cat_id)}))

        for mainfilter in mainfilters:

            onefilters = filteritemdb.find({
                'filter': ObjectId(mainfilter['_id']),
                'finished': False
            })

            for onef in onefilters:
                finished = True

                self.stdout.write(onef['href'] + ": " + str(onef['hid']))

                goog = Grab(log_file='/tmp/log.html')
                goog.load_proxylist(
                    os.path.dirname(os.path.realpath(__file__)) +
                    "/proxy1.txt",
                    source_type='text_file',
                    proxy_type='http',
                    auto_change=True)
                goog.go("http://hotline.ua" + onef['href'])

                itemscount = float(
                    goog.doc.pyquery('h2.selected-filtrs.grey-6')
                    [0].text.split(' ')[1].strip())
                pages = int(math.ceil(itemscount / 24))

                self.stdout.write('Url: ' + onef['href'] + ', Pages:' +
                                  str(pages))

                productlinks = goog.doc.pyquery(
                    '.cell.gd b.m_r-10 > a.g_statistic')
                if goog.doc.pyquery('.g-recaptcha').eq(0):
                    finished = False
                    # print('captcha error')
                    # # proxy.active=False
                    # # proxy.save()
                    # send_mail(pg.config['proxy'], body, '*****@*****.**',['*****@*****.**'], fail_silently=False)
                    # return

                for productlink in productlinks:
                    producturl = productlink.attrib['href']
                    self.stdout.write(producturl)

                    item = product.find_one({'url': producturl})
                    if 'filters' not in item:
                        item['filters'] = []
                    item['filters'].append(onef['hid'])

                    product.update({'url': producturl}, item)

                for i in range(1, pages):
                    goog.load_proxylist(
                        os.path.dirname(os.path.realpath(__file__)) +
                        "/proxy1.txt",
                        source_type='text_file',
                        proxy_type='http',
                        auto_change=True)
                    goog.go("http://hotline.ua" + onef['href'] + '?p=' +
                            str(i))

                    self.stdout.write("http://hotline.ua" + onef['href'] +
                                      '?p=' + str(i))

                    # itemscount = float(goog.doc.pyquery('h2.selected-filtrs.grey-6')[0].text.split(' ')[1].strip())
                    # pages = int(math.ceil(itemscount/24))
                    productlinks = goog.doc.pyquery(
                        '.cell.gd b.m_r-10 > a.g_statistic')

                    if goog.doc.pyquery('.g-recaptcha').eq(0):
                        finished = False

                    for productlink in productlinks:
                        producturl = productlink.attrib['href']
                        self.stdout.write(producturl)

                        item = product.find_one({'url': producturl})
                        if 'filters' not in item:
                            item['filters'] = []

                        if onef['hid'] not in item['filters']:
                            item['filters'].append(onef['hid'])

                        product.update({'url': producturl}, item)

                if onef['finished'] != finished:
                    onef['finished'] = finished
                    # persist the flag on the filter item itself, not the product collection
                    filteritemdb.update({'href': onef['href']}, onef)
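
A detail worth noticing in Examples #18, #19 and #23: the proxy file is re-loaded inside the page loop rather than rotated with change_proxy(). Because auto_init defaults to True (see the test in Example #12), each reload also picks a fresh proxy, so the reload doubles as rotation; the lighter, roughly equivalent call would be:

goog.change_proxy()  # switch proxies without re-reading the file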