def getproduct(href, item, scan, oneposition):
    pg = Grab(log_dir="/tmp", timeout=30)
    # Earlier variant pulled a random proxy from the DB instead of a file:
    # proxy = Proxy.objects.filter(active=True).order_by('?')[0]
    # proxyaddr = proxy.name.split(':')[0] + ':' + proxy.name.split(':')[1]
    # proxyuser = proxy.name.split(':')[2] + ':' + proxy.name.split(':')[3]
    # pg.setup(proxy=proxyaddr, proxy_userpwd=proxyuser, proxy_type="http")
    pg.load_proxylist(
        os.path.dirname(os.path.realpath(__file__)) + "/proxy1.txt",
        source_type='text_file', proxy_type='http', auto_change=True)
    try:
        purl = "http://hotline.ua" + href
        pg.go(purl)
    except Exception:
        print "Error: " + purl
        return
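# A plausible sketch of the proxy1.txt file loaded above: Grab's 'text_file'
# proxy source reads one proxy per line, either host:port or
# host:port:username:password (the four-part form matches the commented-out
# splitting logic above). All addresses and credentials here are invented.
#
#   192.0.2.10:3128
#   192.0.2.11:3128:scanuser:secret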
def task_generator(self):
    ua = UserAgent()
    grab = Grab()
    grab.load_proxylist(self.PROXY_PATH, 'text_file', proxy_type='http',
                        auto_init=False, auto_change=True)
    for link in VOCABULARY:
        url = link['url']
        part_url = LOCALITY[self.city_pars]
        pages = xrange(1, link['pages'])
        cat = Category.objects.get(id=link['cat'][1])
        moroz = link['cat'][0]
        city = City.objects.get(id=LOCALITY[self.city_pars][1])
        for page in pages:
            print 'number_of_pages=', page
            grab.change_proxy()
            grab.setup(
                url=url % (part_url[0][0], part_url[0][1], part_url[0][2], page),
                proxy_userpwd=self.CREDENTIALS,
                hammer_mode=True,
                hammer_timeouts=HTM,
                user_agent=ua.random,
                reuse_cookies=False)
            # check_proxies_for_slando(self, grab=grab, ua=ua.random,
            #                          url=url % (LOCALITY[self.city_pars][0], page))
            print 'proxy before go of page list ', grab.config['proxy']
            yield Task('link_on_page', delay=4, grab=grab,
                       cat=cat, city=city, moroz=moroz)
def task_generator(self):
    ua = UserAgent()
    grab = Grab()
    grab.load_proxylist(PROXY_PATH, 'text_file', proxy_type='http',
                        auto_init=False, auto_change=True)
    for link in VOCABULARY:
        url = link['url']
        pages = xrange(1, link['pages'])
        cat = link['cat']
        for page in pages:
            grab.change_proxy()
            grab.setup(
                url=url % page,
                proxy_userpwd=CREDENTIALS,
                hammer_mode=True,
                hammer_timeouts=HTM,
                user_agent=ua.random,
                reuse_cookies=False)
            yield Task('link_on_page', grab=grab, cat=cat)
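# The generators above only schedule work: each yielded Task('link_on_page', ...)
# is dispatched to a handler named task_link_on_page on the same Spider
# subclass (Grab's task_<name> naming convention). A minimal hedged sketch of
# such a handler; the XPath and the downstream task name are assumptions, not
# taken from the source:
def task_link_on_page(self, grab, task):
    for elem in grab.doc.select('//a[@class="link"]'):  # assumed selector
        # Queue a follow-up task for every link found on the listing page.
        yield Task('item_page',
                   url=grab.make_url_absolute(elem.attr('href')),
                   cat=task.cat)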
def get_mag(link):
    idmag = link.split('/')
    firm = FirmHotline.objects.filter(itemid=int(idmag[2]))
    if firm:
        print("Firm exists:" + firm[0].name)
        return firm[0]
    else:
        fg = Grab(log_dir="/tmp", timeout=300)
        fg.load_proxylist(
            os.path.dirname(os.path.realpath(__file__)) + "/proxy1.txt",
            source_type='text_file', proxy_type='http', auto_change=True)
        fg.go('http://hotline.ua' + link)
        body = fg.response.body
        pyquery = pq(body)
        name = pyquery('ul.shop-title > li > h1').text()
        try:
            link = pyquery('ul.info-shops > li > p > a')[0].attrib['href']
        except (IndexError, KeyError):
            # Shop page has no outbound link.
            link = ""
        firm = FirmHotline(itemid=int(idmag[2]), name=name, url=link)
        firm.save()
        print("New Firm:" + firm.name)
        return firm
def test_deprecated_setup_proxylist(self):
    g = Grab(transport=GRAB_TRANSPORT)
    open(TMP_FILE, 'w').write(PROXY1)
    g.load_proxylist(TMP_FILE, 'text_file')
    SERVER.RESPONSE['get'] = '123'
    g.change_proxy()
    g.go('http://yandex.ru')
    self.assertEqual('123', g.response.body)
    self.assertEqual('yandex.ru', SERVER.REQUEST['headers']['host'])
def handle(self, *args, **options):
    settings = {
        'price__gt': 0,
        'href': '',
    }
    if args:
        category = Category.objects.get(pk=args[0])
        settings['product__category'] = category
    items = ColorProduct.objects.filter(**settings)
    prefix = 'https://www.google.com.ua/search?q=site:hotline.ua+'
    for pitem in items:
        if pitem.productname:
            goog = Grab(log_file='/tmp/log.html')
            goog.load_proxylist(
                os.path.dirname(os.path.realpath(__file__)) + "/proxy.txt",
                source_type='text_file', proxy_type='http', auto_change=True)
            self.stdout.write(str(pitem.pk))
            searchphrase = '+'.join(pitem.productname.split(' '))
            goog.go(prefix + searchphrase)
            if goog.doc.select('//h3[@class="r"]/a').exists():
                sresult = goog.doc.select('//h3[@class="r"]/a')[0].attr('href')
                self.stdout.write(sresult)
                # Google wraps results in a redirect URL; unwrap it.
                if "?q=" in sresult:
                    sresult = sresult.split('?q=')[1].split('&')[0]
                sresult = sresult.replace('http://hotline.ua', '')
                pitem.href = sresult
                pitem.hrefok = True
                pitem.save()
                self.stdout.write(sresult)
            else:
                self.stdout.write('net')
def task_generator(self):
    ua = UserAgent()
    grab = Grab(timeout=30)
    grab.load_proxylist('proxy_http_auth.txt', 'text_file', proxy_type='http',
                        auto_init=False, auto_change=True)
    # grab.config["thread_number"] = 40
    for link in VOCABULARY:
        url = link['url']
        pages = xrange(1, link['pages'])
        cat = link['cat']
        for page in pages:
            grab.change_proxy()
            grab.setup(url=url % page,
                       proxy_userpwd=CREDENTIALS,
                       hammer_mode=True,
                       hammer_timeouts=((2, 5), (10, 15), (20, 30)),
                       user_agent=ua.random,
                       reuse_cookies=True)
            yield Task('link_on_page', grab=grab, cat=cat)
def Grab(**kwargs):
    grb = GrabLib()
    default_settings = {
        'user_agent_file': USER_AGENT_FILE,
        'connect_timeout': defaults.GRABBER_CONNECT_TIMEOUT,
        'timeout': defaults.GRABBER_TIMEOUT,
        'hammer_mode': True,
        'hammer_timeouts': defaults.GRABBER_HAMMER_TIMEOUTS,
        'headers': defaults.GRABBER_HEADERS,
    }
    default_settings.update(kwargs)
    grb.setup(**default_settings)
    grb.load_proxylist(
        source=get_proxies(),
        source_type='list',
        auto_init=True,
        auto_change=True,
    )
    return grb
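# A minimal usage sketch of the factory above: any keyword argument overrides
# one of the defaults before setup() is called. The URL is a placeholder.
g = Grab(timeout=90)           # overrides defaults.GRABBER_TIMEOUT
g.go('http://example.com/')    # placeholder URL
print g.response.code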
def task_generator(self):
    ua = UserAgent()
    grab = Grab(timeout=30)
    grab.load_proxylist('proxy_http_auth.txt', 'text_file', proxy_type='http',
                        auto_init=False, auto_change=True)
    while True:
        dig = random.randint(111, 999)
        grab.change_proxy()
        grab.setup(
            url='http://zipexpert.com.ua/catalog/?q=%s&s=' % dig,
            proxy_userpwd=CREDENTIALS,
            hammer_mode=True,
            hammer_timeouts=((2, 5), (10, 15), (20, 30)),
            user_agent=ua.random,
            reuse_cookies=False)
        yield Task('link_on_page', grab=grab)
def task_generator(self):
    ua = UserAgent()
    grab = Grab()
    grab.load_proxylist(PROXY_PATH_fine, 'text_file', proxy_type='http',
                        auto_init=False, auto_change=True)
    g = Grab()
    g.go(u'http://dom.ria.com/ru/Каталог/Продажа-аренда/')
    print 'catalog links:', g.doc.select('//a[@class="photo photo-185x120"]')
    for item in g.doc.select('//a[@class="photo photo-185x120"]'):
        print u'http://dom.ria.com%s' % item.attr('href')
        grab.setup(url=u'http://dom.ria.com%s' % item.attr('href'),
                   proxy_userpwd=CREDENTIALS_fine,
                   hammer_mode=True,
                   hammer_timeouts=HTM,
                   user_agent=ua.random,
                   reuse_cookies=False)
        yield Task('link_on_page', grab=grab)
def test_load_proxylist(self):
    content = '%s\n%s\n%s' % (PROXY1, PROXY2, PROXY3)
    open(TMP_FILE, 'w').write(content)

    # By default auto_change is True
    g = Grab(transport=GRAB_TRANSPORT)
    g.load_proxylist(TMP_FILE, 'text_file')
    self.assertEqual(g.config['proxy_auto_change'], True)
    servers = set()
    for x in xrange(10):
        g.go('http://yandex.ru')
        servers.add(g.config['proxy'])
    self.assertTrue(len(servers) > 1)

    # Disable auto_change; by default auto_init is True
    g = Grab(transport=GRAB_TRANSPORT)
    g.load_proxylist(TMP_FILE, 'text_file', auto_change=False)
    self.assertEqual(g.config['proxy_auto_change'], False)
    servers = set()
    for x in xrange(10):
        g.go('http://yandex.ru')
        servers.add(g.config['proxy'])
    self.assertEqual(len(servers), 1)

    # Disable both auto_change and auto_init:
    # the proxy list is not used by default
    g = Grab(transport=GRAB_TRANSPORT)
    g.load_proxylist(TMP_FILE, 'text_file',
                     auto_change=False, auto_init=False)
    self.assertEqual(g.config['proxy_auto_change'], False)
    g.go('http://yandex.ru')
    self.assertEqual(g.config['proxy'], None)
def handle(self, *args, **options):
    msg = ""
    goog = Grab(log_file='/tmp/log.html')
    goog.load_proxylist(
        os.path.dirname(os.path.realpath(__file__)) + "/proxy1.txt",
        source_type='text_file', proxy_type='http', auto_change=True)
    goog = scanhot("http://hotline.ua/catalog/")
    pquery = pq(goog.response.body)
    cells = pquery('.all-cat.m_t-30 li > a')
    # Emit one management-command invocation per top-level category link.
    for col in cells:
        msg += "python manage.py scanhotall " + col.attrib['href'] + "\n"
    import codecs
    with codecs.open(PROJECT_PATH + '/../bigname.txt', "w",
                     encoding="utf-8") as f:
        f.write(unicode(msg))
def test_change_proxy(self):
    g = Grab(transport=GRAB_TRANSPORT)
    with open(TMP_FILE, 'w') as out:
        for x in xrange(10):
            out.write('server-%d:777\n' % x)

    g.load_proxylist(TMP_FILE, 'text_file',
                     auto_init=False, auto_change=False)
    self.assertEqual(g.config['proxy'], None)

    g.load_proxylist(TMP_FILE, 'text_file',
                     auto_init=False, auto_change=True)
    self.assertEqual(g.config['proxy'], None)

    g.load_proxylist(TMP_FILE, 'text_file',
                     auto_init=True, auto_change=False)
    self.assertTrue('server-' in g.config['proxy'])
class Avito(object):
    PAGETRY = 3
    SLEEP = 1

    def __init__(self, proxy=None):
        self.PAGETRY, rc = startup.CFG.value('pagetry', Avito.PAGETRY).toInt()
        self.SLEEP, rc = startup.CFG.value('sleep', Avito.SLEEP).toInt()
        self.g = Grab()
        self.g.setup(hammer_mode=True, hammer_timeouts=((2, 5), (5, 10)))
        if __name__ == '__main__':
            # Dump requests/responses when run as a standalone script.
            self.g.setup(log_dir='dump')
        if proxy:
            self.g.load_proxylist(proxy, 'text_file', 'http',
                                  auto_init=True, auto_change=False)
        self.tess = Tesseract()
        self.tess.set_pagemode(TessPageSegMode.PSM_SINGLE_LINE)

    def _go3(self, url, tag):
        # Retry the request up to PAGETRY times, rotating the proxy each try.
        c = self.PAGETRY
        while c:
            try:
                self.g.change_proxy()
                self.g.go(url)
                self.g.assert_substring(u'content="51a6d66a02fb23c7"')
                break
            except Exception:
                log.exception('%s left %i', tag, c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            raise Exception('%s error' % tag)

    def get_links(self, url):
        self._go3(url, 'start page')
        c = 999  # hard cap on the number of links yielded
        while True:
            links = self.g.doc.select('//h3[@class="title"]/a/@href')
            if not links:
                raise Exception('no links')
            for link in links:
                c -= 1
                if not c:
                    return
                yield urljoin(url, link.text())
            next_link = self.g.doc.select('//a[@class="next"]/@href')
            if not next_link:
                log.debug('last page?')
                break
            QtCore.QThread.sleep(self.SLEEP)
            nurl = urljoin(url, next_link.text())
            log.debug('open next page %s', nurl)
            self._go3(nurl, 'next page')

    def get_photos(self, photos):
        g = self.g.clone()
        datas = []
        for url in photos:
            if not url.startswith('http:'):
                url = 'http:' + url
            c = self.PAGETRY
            while c:
                try:
                    rc = g.go(url)
                    g.assert_substring('JFIF', byte=True)
                    datas.append(rc.body)
                    break
                except Exception:
                    log.exception('get_photos left %i', c)
                    c -= 1
                    g.change_proxy()
                    QtCore.QThread.sleep(self.SLEEP)
            else:
                raise Exception('get photo error')
        return datas

    def get_item(self, url):
        g = self.g.clone()
        c = self.PAGETRY
        while c:
            try:
                g.change_proxy()
                g.go(url)
                g.assert_substring(u'content="499bdc75d3636c55"')  # avito's ya id
                break
            except Exception:
                log.exception('get_item left %i', c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            raise Exception('get item error')
        doc = g.doc
        title = doc.select('//h1[@itemprop="name"]').text()
        gallery = doc.select('//div[@class="gallery-item"]')
        photos = [s.text()
                  for s in gallery.select('.//a[@class="gallery-link"]/@href')]
        if not photos:
            egg = doc.select('//img[contains(@class,"j-zoom-gallery-init")]/@src')
            if egg:
                photos.append(egg.text())
        item = doc.select('//div[@itemprop="offers"]')
        price = item.select('.//span[@itemprop="price"]').text()
        name = item.select('.//strong[@itemprop="name"]').text()
        try:
            town = item.select(
                './/span[@id="toggle_map"]/span[@itemprop="name"]').text()
        except Exception:
            log.warning('xpath town not found, try another way')
            town = item.select('.//div[@id="map"]/span[@itemprop="name"]').text()
        desc = doc.select('//div[contains(@class,"description-text")]').text()
        # <span id="item_id">348967351</span>
        item_id = doc.select('//span[@id="item_id"]').text()
        _url = g.rex_text("avito.item.url = '([^>]+?)'")
        _phone = g.rex_text("avito.item.phone = '([^>]+?)'")
        loc = {'item_phone': _phone, 'item': {'id': item_id, 'url': _url}}
        log.debug('jslock enter <--')
        with PyV8.JSContext(loc) as ctx:
            ctx.enter()
            # De-obfuscate the phone key with Avito's own JS routine.
            ctx.eval('''function phoneDemixer(key){var pre=key.match(/[0-9a-f]+/g),mixed=(item.id%2===0?pre.reverse():pre).join(''),s=mixed.length,r='',k;for(k=0;k<s;++k){if(k%3===0){r+=mixed.substring(k,k+1);}} return r;}''')
            # e.g. http://www.avito.ru/items/phone/348967351?pkey=6fec07a9bb9902ad4ccbd87c240cfdc4
            egg = ctx.eval(
                "'/items/phone/'+item.id+'?pkey='+phoneDemixer(item_phone)")
            log.debug('js rc %s', egg)
            ctx.leave()
        log.debug('jslock leave -->')
        phone = ''
        c = self.PAGETRY
        while c:
            log.debug('read phone image')
            try:
                # The phone number is served as an image; OCR it with Tesseract.
                rc = g.go(urljoin(url, egg))
                img = Image.open(StringIO(rc.body))
                phone = self.tess.from_image(img, basewidth=300,
                                             whitelist='0123456789-')
                break
            except Exception:
                g.change_proxy()
                log.exception('get_phone left %i', c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            log.debug('get phone error')
        return dict(item=item_id, title=title, photos=photos, price=price,
                    name=name, town=town, desc=desc, phone=phone)
def handle(self, *args, **options):
    filterdb = db.hotfilters
    filteritemdb = db.hotfiltersitems
    category = db.hotcategory
    product = db.hotproduct
    url = "/mobile/umnye-chasy-smartwatch/"
    msg = ""
    itemsname = []
    itemsvalues = []
    goog = Grab(log_file='/tmp/log.html')
    goog.load_proxylist(
        os.path.dirname(os.path.realpath(__file__)) + "/proxy1.txt",
        source_type='text_file', proxy_type='http', auto_change=True)
    goog.go("http://hotline.ua" + url)
    pquery = pq(goog.response.body)
    categoryname = pquery('h1.title-24.p_b-10')[0].text
    itemscount = float(
        pquery('h2.selected-filtrs.grey-6')[0].text.split(':')[1].strip())
    pages = int(math.ceil(itemscount / 24))  # 24 products per listing page
    catdata = category.find_one({'url': url})
    if catdata:
        cat_id = catdata['_id']
        msg += (u"Category used: " + categoryname +
                u". Used proxy: " + goog.config['proxy'] + '\n')
    else:
        cat_id = category.insert_one({
            'name': categoryname,
            'count': itemscount,
            'pages': pages,
            'url': url,
        }).inserted_id
        msg += (u"Category created: " + categoryname +
                u". Used proxy: " + goog.config['proxy'] + '\n')
    names = pquery('#filters > .cell.f-title')
    i = 0
    for name in names:
        fitemstr = pquery('#filters > .cell.full-list').eq(i)
        if name.text:
            nametext = name.text.strip()
            if nametext not in itemsname:
                filteritem = filterdb.find_one({
                    'name': nametext,
                    'category': ObjectId(cat_id),
                })
                if filteritem:
                    filterinserted_id = filteritem['_id']
                    msg += (u"Filter used: " + nametext +
                            u". Used proxy: " + goog.config['proxy'] + '\n')
                else:
                    fi = {'name': nametext, 'category': ObjectId(cat_id)}
                    msg += (u"Filter created: " + nametext +
                            u". Used proxy: " + goog.config['proxy'] + '\n')
                    filterinserted_id = filterdb.insert_one(fi).inserted_id
                for fitem in pq(fitemstr)('.f-item > a'):
                    fitext = fitem.text.strip()
                    href = fitem.attrib['href']
                    fid = int(href.split('/')[3])
                    filteritem = filteritemdb.find_one({'href': href})
                    if not filteritem:
                        filteritemdb.insert_one({
                            'name': fitext,
                            'href': href,
                            'hid': fid,
                            'filter': ObjectId(filterinserted_id),
                            'finished': False,
                        })
        i += 1
    for i in range(0, pages):
        self.stdout.write('Page now: ' + str(i))
        goog.load_proxylist(
            os.path.dirname(os.path.realpath(__file__)) + "/proxy1.txt",
            source_type='text_file', proxy_type='http', auto_change=True)
        goog.go("http://hotline.ua" + url + "?p=" + str(i))
        productlinks = pq(goog.response.body)(
            '.cell.gd b.m_r-10 > a.g_statistic')
        for productlink in productlinks:
            producturl = productlink.attrib['href']
            tempproduct = product.find_one({'url': producturl})
            if tempproduct:
                msg += (u"Product used: " + tempproduct['name'] +
                        u". Used proxy: " + goog.config['proxy'] + '\n')
            else:
                self.stdout.write(producturl)
                productitem = get_hotline_data(
                    'http://hotline.ua' + producturl, False, False)
                msg += (u"Product created: " + productitem['name'] +
                        u". Used proxy: " + goog.config['proxy'] + '\n')
                productitem['category'] = ObjectId(cat_id)
                product.insert_one(productitem)
    self.stdout.write(msg)
    send_mail('Scan category:' + url, msg,
              '*****@*****.**', ['*****@*****.**'],
              fail_silently=False)
def handle(self, *args, **options):

    def getproductsurl(pageurl):
        self.stdout.write("Get pages of product. Page: " +
                          str(pageurls.index(pageurl)) +
                          ', From:' + str(allpagesurls))
        allproductslinks = []
        goog = scanhot(pageurl)
        productlinks = pq(goog.response.body)(
            '.cell.gd b.m_r-10 > a.g_statistic')
        for productlink in productlinks:
            producturl = productlink.attrib['href']
            tempproduct = product.find_one({'url': producturl})
            if not tempproduct:
                allproductslinks.append('http://hotline.ua' + producturl)
        return allproductslinks

    def addfilterstoproduct(onef):
        # Decrement the shared "toend" counter so progress is visible.
        toend = int(db.hotsettings.find_one({"name": "toend"})['value'])
        toend -= 1
        db.hotsettings.update({"name": "toend"},
                              {'name': 'toend', 'value': toend})
        self.stdout.write("To end:" + str(toend))
        goog = scanhot(onef['href'])
        productlinks = goog.doc.pyquery('.cell.gd b.m_r-10 > a.g_statistic')
        for productlink in productlinks:
            producturl = productlink.attrib['href']
            item = product.find_one({'url': producturl})
            if item:
                if 'filters' not in item:
                    item['filters'] = []
                if onef['filter'] not in item['filters']:
                    item['filters'].append(onef['filter'])
                    product.update({'url': producturl}, item)

    allproductslinks = []
    filterspagedokonca = 0
    filterdb = db.hotfilters
    filteritemdb = db.hotfiltersitems
    category = db.hotcategory
    product = db.hotproduct
    propertydb = db.hotproperty
    settingdb = db.hotsettings
    if settingdb.find_one({'name': 'status'}):
        settingdb.update({'name': "status"},
                         {"value": "started", "name": "status"})
    else:
        settingdb.insert_one({"value": "started", "name": "status"})
    product.remove({"name": ""})
    self.stdout.write(args[0])
    url = args[0]
    if len(args) > 1:
        direction = args[1]
    else:
        direction = 0
    msg = ""
    error = ""
    itemsname = []
    itemsvalues = []
    filterpages = []
    goog = Grab(log_file='/tmp/log.html')
    goog.load_proxylist(
        os.path.dirname(os.path.realpath(__file__)) + "/proxy1.txt",
        source_type='text_file', proxy_type='http', auto_change=True)
    goog = scanhot("http://hotline.ua" + url)
    pquery = pq(goog.response.body)
    categoryname = pquery('h1.title-24.p_b-10')[0].text
    itemscount = float(
        pquery('h2.selected-filtrs.grey-6')[0].text.split(':')[1].strip())
    pages = int(math.ceil(itemscount / 24))
    catdata = category.find_one({'url': url})
    if catdata:
        cat_id = catdata['_id']
        msg += (u"Category used: " + categoryname +
                u". Used proxy: " + goog.config['proxy'] + '\n')
    else:
        cat_id = category.insert_one({
            'name': categoryname,
            'count': itemscount,
            'pages': pages,
            'url': url,
        }).inserted_id
        msg += (u"Category created: " + categoryname +
                u". Used proxy: " + goog.config['proxy'] + '\n')
    settingdb.insert_one({"value": categoryname, "name": "status"})
    names = pquery('#filters > .cell.f-title')
    i = 0
    for name in names:
        index = i - int(direction)
        if index >= 0:
            fitemstr = pquery('#filters > .cell.full-list').eq(index)
            if name.text:
                nametext = name.text.strip()
                if nametext not in itemsname:
                    filteritem = filterdb.find_one({
                        'name': nametext,
                        'category': ObjectId(cat_id),
                    })
                    if filteritem:
                        filterinserted_id = filteritem['_id']
                        msg += (u"Filter used: " + nametext +
                                u". Used proxy: " + goog.config['proxy'] + '\n')
                    else:
                        fi = {'name': nametext, 'category': ObjectId(cat_id)}
                        msg += (u"Filter created: " + nametext +
                                u". Used proxy: " + goog.config['proxy'] + '\n')
                        filterinserted_id = filterdb.insert_one(fi).inserted_id
                    for fitem in pq(fitemstr)('.f-item > a'):
                        try:
                            # The item count is rendered as "(N)" in the link text.
                            string = tostring(fitem)
                            m = re.search(r"\(([0-9_]+)\)", string)
                            fitems = float(m.group(1))
                            fpages = int(math.ceil(fitems / 24))
                        except Exception:
                            fpages = 1
                            fitems = 1
                        fitext = fitem.text.strip()
                        href = fitem.attrib['href']
                        fid = int(href.split('/')[3])
                        # Re-insert the filter item from scratch.
                        filteritemdb.remove({'href': href})
                        filteritemdb.insert_one({
                            'name': fitext,
                            'href': href,
                            'hid': fid,
                            'filter': ObjectId(filterinserted_id),
                            'items': fitems,
                            'pages': fpages,
                            'finished': False,
                        })
                        for fp in range(0, fpages):
                            filterpages.append({
                                "href": ("http://hotline.ua" + href +
                                         "?p=" + str(fp)),
                                "filter": fid,
                            })
        i += 1
def handle(self, *args, **options):

    def decode_captcha(newcaptcha, pitem):
        goog.go("https://ipv4.google.com/sorry/CaptchaRedirect?continue=" +
                newcaptcha.continueurl + '&id=' + newcaptcha.hiddenid +
                '&captcha=' + newcaptcha.response)
        if goog.doc.select('//h3[@class="r"]/a').exists():
            sresult = goog.doc.select('//h3[@class="r"]/a')[0].attr('href')
            get_hotline_data(sresult, pitem)

    def wait_enter_captcha(captcha, timer, pitem):
        # Poll the DB until an operator has typed in the captcha response.
        self.stdout.write('captcha continueurl:' + captcha.continueurl)
        newcaptcha = EcomerceCaptcha.objects.filter(pk=captcha.pk).get()
        if newcaptcha.response:
            decode_captcha(newcaptcha, pitem)
        else:
            self.stdout.write('Pause: ' + str(timer))
            time.sleep(timer)
            wait_enter_captcha(captcha, timer + 5, pitem)

    prefix = 'https://www.google.com.ua/search?q=site:hotline.ua+'
    items = PriceString.objects.filter(checked=False)
    for pitem in items:
        if pitem.name:
            goog = Grab(log_file='/tmp/log.html')
            pitem.checked = True
            pitem.save()
            goog.load_proxylist(
                os.path.dirname(os.path.realpath(__file__)) + "/proxy.txt",
                source_type='text_file', proxy_type='http', auto_change=True)
            self.stdout.write(str(pitem.pk))
            searchphrase = '+'.join(pitem.name.split(' '))
            goog.go(prefix + searchphrase)
            if goog.doc.select('//h3[@class="r"]/a').exists():
                sresult = goog.doc.select('//h3[@class="r"]/a')[0].attr('href')
                self.stdout.write(sresult)
                if "?q=" in sresult:
                    sresult = sresult.split('?q=')[1].split('&')[0]
                self.stdout.write(sresult)
                get_hotline_data(sresult, pitem)
            elif goog.doc.select('//input[@name="id"]').exists():
                # Google served its captcha page: save the image and wait
                # for a human to solve it.
                fname = random.randint(1, 110000000)
                hiddenid = goog.doc.select('//input[@name="id"]')[0].attr('value')
                continueurl = goog.doc.select(
                    '//input[@name="continue"]')[0].attr('value')
                goog.go("https://ipv4.google.com" +
                        goog.doc.select('//img')[0].attr('src'))
                with open(PROJECT_ROOT + '/media/captcha/' + str(fname) + '.jpg',
                          'w+') as f:
                    f.write(goog.response.body)
                captcha = EcomerceCaptcha(request=fname, hiddenid=hiddenid,
                                          continueurl=continueurl)
                captcha.save()
                wait_enter_captcha(captcha, 5, pitem)
def scancategory(scan, now=0):
    # Collect all active products for the category being scanned.
    items = ColorProduct.objects.filter(price__gt=0,
                                        product__category=scan.category,
                                        product__published=True)
    scan.started = True
    scan.items = len(items)
    scan.nowitems = now
    scan.save()
    g = Grab(log_file='/tmp/log.html', timeout=300)
    g.load_proxylist(
        os.path.dirname(os.path.realpath(__file__)) + "/proxy1.txt",
        source_type='text_file', proxy_type='http', auto_change=True)
    prefix = 'http://hotline.ua/sr/?tab=pr&sort=1&q='
    myfirmid = 22242
    if now:
        items = items[:now - 1]
    for item in items:
        scan.nowitems += 1
        scan.lastitems = 0
        scan.pause = 0
        scan.save()
        # Keep only latin letters, digits, spaces and ()- for the search query,
        # e.g. item.name = 'Alpine SWG-1244 12" (30 см)'
        result = re.sub("[^A-Za-z0-9 ()-]", "", item.productname)
        words = result.split(' ')
        searchphrase = '+'.join(words)
        gurl = prefix + searchphrase
        oneposition = OneHotline(product=item, scan=scan)
        oneposition.save()
        if item.hrefok:
            getproduct(item.href, item, scan, oneposition)
        else:
            print("net")
    scan.finished = True
    scan.pub_date = datetime.datetime.now()
    scan.save()
def handle(self, *args, **options):
    filterdb = db.hotfilters
    filteritemdb = db.hotfiltersitems
    category = db.hotcategory
    product = db.hotproduct
    url = "/mobile/umnye-chasy-smartwatch/"
    cat_id = category.find_one({'url': url})['_id']
    mainfilters = list(filterdb.find({'category': ObjectId(cat_id)}))
    for mainfilter in mainfilters:
        onefilters = filteritemdb.find({
            'filter': ObjectId(mainfilter['_id']),
            'finished': False,
        })
        for onef in onefilters:
            finished = True
            self.stdout.write(onef['href'] + ": " + str(onef['hid']))
            goog = Grab(log_file='/tmp/log.html')
            goog.load_proxylist(
                os.path.dirname(os.path.realpath(__file__)) + "/proxy1.txt",
                source_type='text_file', proxy_type='http', auto_change=True)
            goog.go("http://hotline.ua" + onef['href'])
            itemscount = float(
                goog.doc.pyquery('h2.selected-filtrs.grey-6')[0]
                .text.split(' ')[1].strip())
            pages = int(math.ceil(itemscount / 24))
            self.stdout.write('Url: ' + onef['href'] + ', Pages:' + str(pages))
            productlinks = goog.doc.pyquery('.cell.gd b.m_r-10 > a.g_statistic')
            if goog.doc.pyquery('.g-recaptcha').eq(0):
                # Captcha page detected: keep this filter marked unfinished.
                finished = False
            for productlink in productlinks:
                producturl = productlink.attrib['href']
                self.stdout.write(producturl)
                item = product.find_one({'url': producturl})
                if item:
                    if 'filters' not in item:
                        item['filters'] = []
                    item['filters'].append(onef['hid'])
                    product.update({'url': producturl}, item)
            for i in range(1, pages):
                goog.load_proxylist(
                    os.path.dirname(os.path.realpath(__file__)) + "/proxy1.txt",
                    source_type='text_file', proxy_type='http',
                    auto_change=True)
                goog.go("http://hotline.ua" + onef['href'] + '?p=' + str(i))
                self.stdout.write("http://hotline.ua" + onef['href'] +
                                  '?p=' + str(i))
                productlinks = goog.doc.pyquery(
                    '.cell.gd b.m_r-10 > a.g_statistic')
                if goog.doc.pyquery('.g-recaptcha').eq(0):
                    finished = False
                for productlink in productlinks:
                    producturl = productlink.attrib['href']
                    self.stdout.write(producturl)
                    item = product.find_one({'url': producturl})
                    if item:
                        if 'filters' not in item:
                            item['filters'] = []
                        if onef['hid'] not in item['filters']:
                            item['filters'].append(onef['hid'])
                        product.update({'url': producturl}, item)
            if onef['finished'] != finished:
                onef['finished'] = finished
                filteritemdb.update({'href': onef['href']}, onef)