def transport_option_logic(self, curl_transport, fake_transport):
    g = Grab(transport=curl_transport)
    g.go(self.server.get_url())
    self.assertEqual(g.response.body, b'XYZ')
    g2 = g.clone()
    g.go(self.server.get_url())
    self.assertEqual(g.response.body, b'XYZ')
    g2_data = pickle.dumps(g2, pickle.HIGHEST_PROTOCOL)
    g3 = pickle.loads(g2_data)
    g3.go(self.server.get_url())
    self.assertEqual(g3.response.body, b'XYZ')

    g = Grab(transport=fake_transport)
    g.go(self.server.get_url())
    self.assertEqual(g.response.body, b'Faked XYZ')
    g2 = g.clone()
    g.go(self.server.get_url())
    self.assertEqual(g.response.body, b'Faked XYZ')
    g2_data = pickle.dumps(g2, pickle.HIGHEST_PROTOCOL)
    g3 = pickle.loads(g2_data)
    g3.go(self.server.get_url())
    self.assertEqual(g3.response.body, b'Faked XYZ')
def transport_option_logic(self, curl_transport, fake_transport):
    g = Grab(transport=curl_transport)
    g.go(SERVER.BASE_URL)
    self.assertEqual(g.response.body, 'XYZ')
    g2 = g.clone()
    g.go(SERVER.BASE_URL)
    self.assertEqual(g.response.body, 'XYZ')
    g2_data = pickle.dumps(g2)
    g3 = pickle.loads(g2_data)
    g3.go(SERVER.BASE_URL)
    self.assertEqual(g3.response.body, 'XYZ')

    g = Grab(transport=fake_transport)
    g.go(SERVER.BASE_URL)
    self.assertEqual(g.response.body, 'Faked XYZ')
    g2 = g.clone()
    g.go(SERVER.BASE_URL)
    self.assertEqual(g.response.body, 'Faked XYZ')
    g2_data = pickle.dumps(g2)
    g3 = pickle.loads(g2_data)
    g3.go(SERVER.BASE_URL)
    self.assertEqual(g3.response.body, 'Faked XYZ')
def transport_option_logic(self, curl_transport, fake_transport):
    g = Grab(transport=curl_transport)
    g.go(SERVER.BASE_URL)
    self.assertEqual(g.response.body, "XYZ")
    g2 = g.clone()
    g.go(SERVER.BASE_URL)
    self.assertEqual(g.response.body, "XYZ")
    g2_data = pickle.dumps(g2, pickle.HIGHEST_PROTOCOL)
    g3 = pickle.loads(g2_data)
    g3.go(SERVER.BASE_URL)
    self.assertEqual(g3.response.body, "XYZ")

    g = Grab(transport=fake_transport)
    g.go(SERVER.BASE_URL)
    self.assertEqual(g.response.body, "Faked XYZ")
    g2 = g.clone()
    g.go(SERVER.BASE_URL)
    self.assertEqual(g.response.body, "Faked XYZ")
    g2_data = pickle.dumps(g2, pickle.HIGHEST_PROTOCOL)
    g3 = pickle.loads(g2_data)
    g3.go(SERVER.BASE_URL)
    self.assertEqual(g3.response.body, "Faked XYZ")
def assert_transport_pickle(self, transport, response):
    grab = Grab(transport=transport)
    grab2 = grab.clone()
    grab2_data = pickle.dumps(grab2, pickle.HIGHEST_PROTOCOL)
    grab3 = pickle.loads(grab2_data)
    grab3.go(self.server.get_url())
    self.assertEqual(grab3.doc.body, response)
def test_clone(self):
    g = Grab(transport=GRAB_TRANSPORT)
    SERVER.RESPONSE['get'] = 'Moon'
    g.go(SERVER.BASE_URL)
    self.assertTrue('Moon' in g.response.body)
    g2 = Grab(transport=GRAB_TRANSPORT)
    self.assertEqual(g2.response, None)
    g2 = g.clone()
    # The clone must inherit the parent's response (the original test
    # re-checked g.response here, which tests nothing new)
    self.assertTrue('Moon' in g2.response.body)
def assert_transport_response(self, transport, response):
    self.server.response['get.data'] = response
    grab = Grab(transport=transport)
    grab.go(self.server.get_url())
    self.assertEqual(grab.doc.body, response)
    grab2 = grab.clone()
    grab2.go(self.server.get_url())
    self.assertEqual(grab2.doc.body, response)
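# The tests above all exercise the same pattern: a cloned Grab object is
# independent of its parent and survives pickling. A minimal sketch of that
# round trip outside the test harness (the URL is a placeholder):
import pickle

from grab import Grab

g = Grab()
g.go('http://example.com/')
g2 = g.clone()                      # copy of the config and state
data = pickle.dumps(g2, pickle.HIGHEST_PROTOCOL)
g3 = pickle.loads(data)             # the restored clone
g3.go('http://example.com/')        # usable exactly like the original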
def yandex_request(query_url):
    g = Grab(cookiefile='var/yandex.cookies')
    g.go(query_url)
    # Keep solving captchas while the block-page title is shown
    while g.search(u'<title>Ой!') is not None:
        url = g.css('.b-captcha__image').get('src')
        solution = solve_captcha(g.clone(), url)
        g.set_input('rep', solution)
        g.submit()
        if g.search(u'<title>Ой!') is None:
            g.go(query_url)
    return g
def google_request(query_url):
    g = Grab(cookiefile='var/google.cookies', log_dir='log')
    g.go(query_url)
    # Keep solving captchas while the "sorry" page is shown
    while g.search('please type the characters below') is not None:
        url = g.get_xpath('//img',
                          lambda x: 'sorry' in x.get('src')).get('src')
        solution = solve_captcha(g.clone(), url)
        g.set_input('captcha', solution)
        g.submit()
        if g.search('please type the characters below') is None:
            g.go(query_url)
    return g
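# Both functions above rely on an external solve_captcha helper that is not
# shown. A minimal manual-entry sketch; the file path and the prompt are
# assumptions (a real implementation would likely call a solving service):
def solve_captcha(grab, image_url):
    # Fetch the captcha image with the cloned Grab object
    grab.go(image_url)
    with open('var/captcha.jpg', 'wb') as out:
        out.write(grab.response.body)
    # Ask the operator to read the saved image
    return raw_input('Enter captcha from var/captcha.jpg: ').strip()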
def test_empty_clone(self):
    g = Grab()
    g.clone()
def fetch(self):
    """
    Download urls via multicurl.

    Get new tasks from queue.
    """
    m = pycurl.CurlMulti()
    m.handles = []

    # Create curl instances
    for x in xrange(self.thread_number):
        curl = pycurl.Curl()
        m.handles.append(curl)
    freelist = m.handles[:]

    # This is an infinite cycle.
    # You can break it only from outside code which
    # iterates over the result of this method.
    while True:
        cached_request = None

        while len(freelist):
            # Check the request limit before scheduling anything new
            if (self.request_limit is not None and
                    self.counters['request'] >= self.request_limit):
                logging.debug('Request limit is reached: %s' %
                              self.request_limit)
                if len(freelist) == self.thread_number:
                    yield None
                else:
                    break
            else:
                try:
                    priority, task = self.taskq.get(True, 0.1)
                except Empty:
                    # If all handlers are free and there are no tasks
                    # in the queue, yield the None signal
                    if len(freelist) == self.thread_number:
                        yield None
                    else:
                        break
                else:
                    if not self._preprocess_task(task):
                        continue

                    task.network_try_count += 1
                    if task.task_try_count == 0:
                        task.task_try_count = 1

                    if task.task_try_count > self.task_try_limit:
                        logging.debug('Task tries ended: %s / %s' % (
                            task.name, task.url))
                        self.add_item('too-many-task-tries', task.url)
                        continue

                    if task.network_try_count > self.network_try_limit:
                        logging.debug('Network tries ended: %s / %s' % (
                            task.name, task.url))
                        self.add_item('too-many-network-tries', task.url)
                        continue

                    if task.grab:
                        grab = task.grab
                    else:
                        # Set up curl instance via Grab interface
                        grab = Grab(**self.grab_config)
                        grab.setup(url=task.url)

                    if self.use_cache and not task.get('disable_cache'):
                        if grab.detect_request_method() == 'GET':
                            url = grab.config['url']
                            cache_item = self.cache.find_one({'_id': url})
                            if cache_item:
                                cached_request = (grab, grab.clone(),
                                                  task, cache_item)
                                grab.prepare_request()
                                self.inc_count('request-cache')
                                # Break from the pre-request cycle
                                # and go to the process-response code
                                break

                    self.inc_count('request-network')
                    if self.proxylist_config:
                        args, kwargs = self.proxylist_config
                        grab.setup_proxylist(*args, **kwargs)

                    curl = freelist.pop()
                    curl.grab = grab
                    curl.grab.curl = curl
                    curl.grab_original = grab.clone()
                    curl.grab.prepare_request()
                    curl.task = task
                    # Add configured curl instance to multi-curl processor
                    m.add_handle(curl)

        # If any network requests were scheduled, drive them
        if len(freelist) != self.thread_number:
            while True:
                status, active_objects = m.perform()
                if status != pycurl.E_CALL_MULTI_PERFORM:
                    break

        if cached_request:
            grab, grab_original, task, cache_item = cached_request
            url = task.url  # or grab.config['url']
            grab.fake_response(cache_item['body'])

            def custom_prepare_response(g):
                g.response.head = cache_item['head'].encode('utf-8')
                g.response.body = cache_item['body'].encode('utf-8')
                g.response.code = cache_item['response_code']
                g.response.time = 0
                g.response.url = cache_item['url']
                g.response.parse('utf-8')
                g.response.cookies = g.extract_cookies()

            grab.process_request_result(custom_prepare_response)

            yield {'ok': True, 'grab': grab, 'grab_original': grab_original,
                   'task': task, 'ecode': None, 'emsg': None}
            self.inc_count('request')

        while True:
            queued_messages, ok_list, fail_list = m.info_read()

            results = []
            for curl in ok_list:
                results.append((True, curl, None, None))
            for curl, ecode, emsg in fail_list:
                results.append((False, curl, ecode, emsg))

            for ok, curl, ecode, emsg in results:
                res = self.process_multicurl_response(ok, curl, ecode, emsg)
                m.remove_handle(curl)
                freelist.append(curl)
                yield res
                self.inc_count('request')

            if not queued_messages:
                break

        m.select(0.5)
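# A sketch of how outside code might iterate over fetch(); the spider class
# name and the handler are hypothetical, but the None idle signal and the
# result dict keys match what fetch() yields above:
bot = SomeSpider(thread_number=10)  # hypothetical Spider subclass
for res in bot.fetch():
    if res is None:
        # All curl handles are free and the task queue is empty;
        # a real driver might stop here or keep waiting for tasks
        break
    if res['ok']:
        handle_response(res['grab'], res['task'])  # hypothetical handler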
def task_collect_adv_data(self, grab, task):
    self.stats['taken'] += 1
    one = []
    url_adv = []
    for i in grab.doc.select('//@href'):
        one.append(i.text())
    # Keep only links to advert pages of the current city
    for item in one:
        if DOMEN[self.city][29:] == 'kharkov':
            if item.startswith('/nedvizhimost/xarkov-'):
                url_adv.append(item)
        else:
            if item.startswith('/nedvizhimost/%s-' % DOMEN[self.city][29:]):
                url_adv.append(item)
    for one_adv in url_adv:
        self.stats['processed'] += 1
        addition = task.get('addition')
        g = Grab()
        g.go(DOMEN[self.city][:15] + one_adv)
        advert = Advert()
        advert.category_id = CATEGORIES[addition['category']]
        advert.city_id = self.city_id
        advert.author_id = self.author_id
        advert.link = DOMEN[self.city][:15] + one_adv
        categories = g.doc.select(
            '//div[@id="content_objectTabWidgetinfo_tab"]').text()
        if g.doc.select('//h1').text():
            advert.title = g.doc.select('//h1').text()
        if g.doc.select('//p[@itemprop="average"]').text():
            numlist = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
            price = ""
            for i in g.doc.select('//p[@itemprop="average"]').text():
                if i in numlist:
                    price += i
            advert.price_uah = int(price)
        if g.doc.select('//div[@class="objava_define"]').text():
            advert.main_text = g.doc.select(
                '//div[@class="objava_define"]').text()
        if g.doc.select('//p[@class="tel_user_obj tel"]').text():
            advert.raw_phones = g.doc.select(
                '//p[@class="tel_user_obj tel"]').text()
        if g.doc.select('//a[@class="ceeboxAuto"]').text():
            advert.contact_name = g.doc.select(
                '//a[@class="ceeboxAuto"]').text()
        # Stays None when the category matches no branch below; the original
        # referenced an undefined name in that case
        extra_object = None
        if advert.category_id in [21, 11, 24, 27, 17]:
            extra_object = ExtraFlat()
            if u"Этаж" in categories:
                separator1 = g.doc.select(
                    "//div[@class='kratkost']/p[contains(.,'%s')]"
                    % u'Этаж').text().find(":")
                both = g.doc.select(
                    "//div[@class='kratkost']/p[contains(.,'%s')]"
                    % u'Этаж').text()[separator1 + 2:]
                separator2 = both.find("/")
                extra_object.floors = both[separator2 + 1:]
                extra_object.floor = both[:separator2]
            if u"Комнат" in categories:
                separator = g.doc.select(
                    "//div[@class='kratkost']/p[contains(.,'%s')]"
                    % u'Комнат').text().find(":")
                extra_object.rooms_number = g.doc.select(
                    "//div[@class='kratkost']/p[contains(.,'%s')]"
                    % u'Комнат').text()[separator + 2:separator + 3]
            if u"Общая" in categories:
                separator1 = g.doc.select(
                    "//div[@class='kratkost']/p[contains(.,'%s')]"
                    % u'Общая').text().find(":")
                full_area = g.doc.select(
                    "//div[@class='kratkost']/p[contains(.,'%s')]"
                    % u'Общая').text()[separator1 + 2:]
                separator2 = full_area.find(" ")
                extra_object.total_area = full_area[:separator2]
        if advert.category_id in [14]:
            extra_object = ExtraHouse()
            if u"Этажей" in categories:
                extra_object.floors = g.doc.select(
                    "//div[@class='kratkost']/p[contains(.,'%s')]"
                    % u'Этажей').text()[8:]
            if u"Общая" in categories:
                separator1 = g.doc.select(
                    "//div[@class='kratkost']/p[contains(.,'%s')]"
                    % u'Общая').text().find(":")
                full_area = g.doc.select(
                    "//div[@class='kratkost']/p[contains(.,'%s')]"
                    % u'Общая').text()[separator1 + 2:]
                separator2 = full_area.find(" ")
                extra_object.total_area = full_area[:separator2]
        if advert.category_id in [16, 26]:
            extra_object = ExtraLot()
            if u"Площадь" in categories:
                separator1 = g.doc.select(
                    "//div[@class='kratkost']/p[contains(.,'%s')]"
                    % u'Площадь').text().find(":")
                extra_object.total_area = g.doc.select(
                    "//div[@class='kratkost']/p[contains(.,'%s')]"
                    % u'Площадь').text()[separator1 + 2:]
            # The original condition `u"..." or u"..." in categories` was
            # always true; both substrings are checked explicitly here
            if (u"Под ком. заст." in categories
                    or u"Под жил. заст." in categories):
                extra_object.intended_purpose = g.doc.select(
                    "//div[@class='kratkost']/p[contains(.,'%s')]"
                    % u'Под').text()
        if u"Метро" in categories:
            separator = g.doc.select(
                "//div[@class='kratkost']/p[contains(.,'%s')]"
                % u'Метро').text().find("-")
            metro = g.doc.select(
                "//div[@class='kratkost']/p[contains(.,'%s')]"
                % u'Метро').text()[7:separator - 2]
            if metro == u'Дворец Спорта':
                advert.metro_id = 76
            elif metro == u'Дружбы Народов':
                advert.metro_id = 79
            elif metro == u'Красный Хутор':
                advert.metro_id = 87
            elif metro == u'Демеевская':
                advert.metro_id = 66
            elif metro == u'Советской армии':
                advert.metro_id = 21
            elif metro == u'Маршала Жукова':
                advert.metro_id = 12
            elif metro == u'Метростроителей им. Ващенко':
                advert.metro_id = 13
            elif metro == u'им. А.С. Масельского':
                advert.metro_id = 9
            else:
                advert.metro_id = METRO_CIUA[metro]
        if g.doc.select("//div[@class='kratkost']").text():
            separator1 = g.doc.select(
                "//div[@class='kratkost']").text().find(",")
            if DOMEN[self.city][29:] == 'kharkov':
                fulladress = g.doc.select(
                    "//div[@class='kratkost']").text()[separator1 + 11:]
            else:
                fulladress = g.doc.select(
                    "//div[@class='kratkost']").text()[separator1 + 2:]
            separator2 = fulladress.find(",")
            subloc = fulladress[:separator2]
            advert.sublocality_id = SUB_CIUA[subloc]
        same_adv = Advert.objects.filter(
            category_id=CATEGORIES[addition['category']],
            author_id=self.author_id,
            city_id=self.city_id,
            link=advert.link,
        ).first()
        if same_adv:
            self.stats['omited'] += 1
            if same_adv.date_of_update < (timezone.now() -
                                          datetime.timedelta(hours=20)):
                same_adv.date_of_update = timezone.now()
                same_adv.save()
                self.stats['date_of_update'] += 1
            continue
        advert.save()
        photo_grab = g.clone()
        photo_grab.setup(proxy_auto_change=False, reuse_referer=False)
        img = []
        for i in g.doc.select('//@href'):
            if i.text().startswith("/pic/objects/"):
                img.append(i.text())
        for photo in img:
            photo_name_except = photo[22:54]
            photo_link = '%s%s' % (DOMEN[self.city][:15], photo)
            photos = []
            photo_links2 = []
            sleep(0.2)
            try:
                photo_grab.go(photo_link)
                if photo_grab.response.code == 200 and \
                        re.match('image/',
                                 photo_grab.response.headers['Content-Type']):
                    photos.append({
                        'body': photo_grab.response.body,
                        'extention': RE_EXTENTION.search(
                            photo_grab.config['url']).group()
                    })
            except GrabNetworkError:
                photo_links2.append(photo_link)
            if not photos:
                # Nothing was downloaded; the original crashed here
                continue
            photo = Photo(advert_id=advert.id)
            try:
                file_name = '%s.%s' % (hashlib.md5(
                    photo_grab.config['url']).hexdigest(),
                    photos[0]['extention'])
            except UnicodeEncodeError:
                file_name = '%s.%s' % (hashlib.md5(
                    photo_name_except).hexdigest(),
                    photos[0]['extention'])
            photo.photo.save(file_name, ContentFile(photos[0]['body']))
        if extra_object:
            extra_object.advert = advert
            extra_object.save()
        self.stats['saved'] += 1
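# This and the following scrapers reference module-level names that the
# snippets do not define (DOMEN, CATEGORIES, RE_EXTENTION, ...). As one
# example, a plausible RE_EXTENTION that extracts the file extension from a
# photo URL; this is an assumption, the real pattern may differ:
import re

RE_EXTENTION = re.compile(r'(?<=\.)(?:jpg|jpeg|png|gif)\Z', re.IGNORECASE)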
def fetch(self):
    """
    Download urls via multicurl.

    Get new tasks from queue.
    """
    m = pycurl.CurlMulti()
    m.handles = []

    # Create curl instances
    for x in xrange(self.thread_number):
        curl = pycurl.Curl()
        m.handles.append(curl)
    freelist = m.handles[:]

    # This is an infinite cycle.
    # You can break it only from outside code which
    # iterates over the result of this method.
    while True:
        cached_request = None

        while len(freelist):
            # Check the request limit before scheduling anything new
            if (self.request_limit is not None and
                    self.counters['request'] >= self.request_limit):
                logging.debug('Request limit is reached: %s' %
                              self.request_limit)
                if len(freelist) == self.thread_number:
                    yield None
                else:
                    break
            else:
                try:
                    priority, task = self.taskq.get(True, 0.1)
                except Empty:
                    # If all handlers are free and there are no tasks
                    # in the queue, yield the None signal
                    if len(freelist) == self.thread_number:
                        yield None
                    else:
                        break
                else:
                    if not self._preprocess_task(task):
                        continue

                    task.network_try_count += 1
                    if task.task_try_count == 0:
                        task.task_try_count = 1

                    if task.task_try_count > self.task_try_limit:
                        logging.debug('Task tries ended: %s / %s' % (
                            task.name, task.url))
                        self.add_item('too-many-task-tries', task.url)
                        continue

                    if task.network_try_count > self.network_try_limit:
                        logging.debug('Network tries ended: %s / %s' % (
                            task.name, task.url))
                        self.add_item('too-many-network-tries', task.url)
                        continue

                    if task.grab:
                        grab = task.grab
                    else:
                        # Set up curl instance via Grab interface
                        grab = Grab(**self.grab_config)
                        grab.setup(url=task.url)

                    if self.use_cache and not task.get('disable_cache'):
                        if grab.detect_request_method() == 'GET':
                            url = grab.config['url']
                            utf_url = (url.encode('utf-8')
                                       if isinstance(url, unicode) else url)
                            if self.cache_key_hash:
                                url_hash = sha1(utf_url).hexdigest()
                            else:
                                url_hash = url
                            cache_item = self.cache.find_one(
                                {'_id': url_hash})
                            if cache_item:
                                logging.debug('From cache: %s' % url)
                                cached_request = (grab, grab.clone(),
                                                  task, cache_item)
                                grab.prepare_request()
                                self.inc_count('request-cache')
                                # Break from the pre-request cycle
                                # and go to the process-response code
                                break

                    self.inc_count('request-network')
                    if self.proxylist_config:
                        args, kwargs = self.proxylist_config
                        grab.setup_proxylist(*args, **kwargs)

                    curl = freelist.pop()
                    curl.grab = grab
                    curl.grab.curl = curl
                    curl.grab_original = grab.clone()
                    curl.grab.prepare_request()
                    curl.task = task
                    # Add configured curl instance to multi-curl processor
                    m.add_handle(curl)

        # If any network requests were scheduled, drive them
        if len(freelist) != self.thread_number:
            while True:
                status, active_objects = m.perform()
                if status != pycurl.E_CALL_MULTI_PERFORM:
                    break

        if cached_request:
            grab, grab_original, task, cache_item = cached_request
            url = task.url  # or grab.config['url']
            grab.fake_response(cache_item['body'])

            if self.use_cache_compression:
                body = zlib.decompress(cache_item['body'])
            else:
                body = cache_item['body'].encode('utf-8')

            def custom_prepare_response(g):
                g.response.head = cache_item['head'].encode('utf-8')
                g.response.body = body
                g.response.code = cache_item['response_code']
                g.response.time = 0
                g.response.url = cache_item['url']
                g.response.parse('utf-8')
                g.response.cookies = g.extract_cookies()

            grab.process_request_result(custom_prepare_response)

            yield {'ok': True, 'grab': grab, 'grab_original': grab_original,
                   'task': task, 'ecode': None, 'emsg': None}
            self.inc_count('request')

        while True:
            queued_messages, ok_list, fail_list = m.info_read()

            results = []
            for curl in ok_list:
                results.append((True, curl, None, None))
            for curl, ecode, emsg in fail_list:
                results.append((False, curl, ecode, emsg))

            for ok, curl, ecode, emsg in results:
                res = self.process_multicurl_response(ok, curl, ecode, emsg)
                m.remove_handle(curl)
                freelist.append(curl)
                yield res
                self.inc_count('request')

            if not queued_messages:
                break

        m.select(0.5)
def task_collect_adv_data(self, grab, task):
    self.stats['taken'] += 1
    one = []
    for i in grab.doc.select('//@href'):
        one.append(i.text())
    url_adv = re.findall(r'adv-\d+\.\w+', ','.join(one))[0::3]
    for one_adv in url_adv:
        self.stats['processed'] += 1
        addition = task.get('addition')
        g = Grab()
        g.go(DOMEN + one_adv)
        advert = Advert()
        advert.category_id = CATEGORIES[addition['category']]
        advert.city_id = self.city_id
        advert.author_id = self.author_id
        advert.link = DOMEN + one_adv
        categories = g.doc.select(
            '//table[@class="adv_info_table"]').text()
        if g.doc.select('//h2[@class="pagetitle"]').text():
            advert.title = g.doc.select('//h2[@class="pagetitle"]').text()
        price = g.doc.select('//td[@class="adv-price"]').text()
        if price:
            price_search = re.findall(r'\d+', price)
            price_one = ""
            for i in price_search:
                price_one += i
            advert.price_uah = int(price_one)
        else:
            advert.price_uah = 1
        if u"Описание:" in categories:
            text = g.doc.select(
                '//td[@style="border-bottom:none;"][@colspan="2"]').text()
            if text:
                advert.main_text = text
        if u"Телефон:" in categories:
            phones = re.sub(
                u'Телефон:', "", g.doc.select(
                    "//table[@class='adv_info_table']/tr[contains(.,'%s')]"
                    % u'Телефон').text())
            if phones:
                advert.raw_phones = re.sub(r'\-', "", phones)
        if u"Имя, фамилия:" in categories:
            contact = re.sub(
                u'Имя, фамилия:', "", g.doc.select(
                    "//table[@class='adv_info_table']/tr[contains(.,'%s')]"
                    % u'Имя').text())
            if contact:
                advert.contact_name = contact
        # Stays None when the category matches no branch below
        extra_object = None
        if advert.category_id in [21, 11, 12]:
            extra_object = ExtraFlat()
            if u"Этажность" in categories:
                floors = re.sub(
                    u'Этажность ', "", g.doc.select(
                        "//table[@class='adv_info_table']/tr[contains(.,'%s')]"
                        % u'Этажность').text())
                if floors:
                    extra_object.floors = floors
            if u"Этаж" in categories:
                floor = re.sub(
                    u'Этаж ', "", g.doc.select(
                        "//table[@class='adv_info_table']/tr[contains(.,'%s')]"
                        % u'Этаж').text())
                if floor:
                    extra_object.floor = floor
            if u"Количество комнат" in categories:
                rooms_number = re.sub(
                    u'Количество комнат ', "", g.doc.select(
                        "//table[@class='adv_info_table']/tr[contains(.,'%s')]"
                        % u'Количество комнат').text())
                if rooms_number:
                    extra_object.rooms_number = rooms_number
            if u"Общая площадь" in categories:
                area = re.sub(
                    u'Общая площадь ', "", g.doc.select(
                        "//table[@class='adv_info_table']/tr[contains(.,'%s')]"
                        % u'Общая площадь').text())
                area_search = re.search(r'\d+', area)
                if area_search:
                    extra_object.total_area = area_search.group()
        if advert.category_id in [14]:
            extra_object = ExtraHouse()
            if u"Этажность" in categories:
                floors = re.sub(
                    u'Этажность ', "", g.doc.select(
                        "//table[@class='adv_info_table']/tr[contains(.,'%s')]"
                        % u'Этажность').text())
                if floors:
                    extra_object.floors = floors
            if u"Общая площадь" in categories:
                area = re.sub(
                    u'Общая площадь ', "", g.doc.select(
                        "//table[@class='adv_info_table']/tr[contains(.,'%s')]"
                        % u'Общая площадь').text())
                area_search = re.search(r'\d+', area)
                if area_search:
                    extra_object.total_area = area_search.group()
        if advert.category_id in [16]:
            extra_object = ExtraLot()
            if u"Общая площадь" in categories:
                area = re.sub(
                    u'Общая площадь ', "", g.doc.select(
                        "//table[@class='adv_info_table']/tr[contains(.,'%s')]"
                        % u'Общая площадь').text())
                area_search = re.search(r'\d+', area)
                if area_search:
                    extra_object.total_area = area_search.group()
        if self.metro_marker:
            metroc = advert.detect_metro_id(self.metro_marker)
            if metroc:
                advert.metro_id = metroc
            if u"Метро" in categories:
                metro = re.sub(
                    u'Метро ', "", g.doc.select(
                        "//table[@class='adv_info_table']/tr[contains(.,'%s')]"
                        % u'Метро').text())
                if metro:
                    advert.metro_id = METRO_PREM[metro]
        if self.sublocality_marker:
            subloc = advert.detect_sublocality_id(self.sublocality_marker)
            if subloc:
                advert.sublocality_id = subloc
            if u"Район" in categories:
                subloc = re.sub(
                    u'Район ', "", g.doc.select(
                        "//table[@class='adv_info_table']/tr[contains(.,'%s')]"
                        % u'Район').text())
                if subloc:
                    advert.sublocality_id = SUB_PREM[subloc]
        same_adv = Advert.objects.filter(
            category_id=CATEGORIES[addition['category']],
            author_id=self.author_id,
            city_id=self.city_id,
            link=advert.link,
        ).first()
        if same_adv:
            self.stats['omited'] += 1
            if same_adv.date_of_update < (timezone.now() -
                                          datetime.timedelta(hours=20)):
                same_adv.date_of_update = timezone.now()
                same_adv.save()
                self.stats['date_of_update'] += 1
            continue
        advert.save()
        photo_grab = g.clone()
        photo_grab.setup(proxy_auto_change=False, reuse_referer=False)
        img = []
        for i in g.doc.select(
                '//a[@data-lightbox="advertisement-images"]/@href'):
            img.append(i.text())
        for photo in img:
            photo_name_except = re.search(r'\d{8}', photo).group()
            photo_link = '%s%s' % (DOMEN2, photo)
            photos = []
            photo_links2 = []  # was referenced below without being defined
            sleep(0.2)
            try:
                photo_grab.go(photo_link)
                if photo_grab.response.code == 200 and \
                        re.match('image/',
                                 photo_grab.response.headers['Content-Type']):
                    photos.append({
                        'body': photo_grab.response.body,
                        'extention': RE_EXTENTION.search(
                            photo_grab.config['url']).group()
                    })
            except GrabNetworkError:
                photo_links2.append(photo_link)
            if not photos:
                # Nothing was downloaded; the original crashed here
                continue
            photo = Photo(advert_id=advert.id)
            try:
                file_name = '%s.%s' % (hashlib.md5(
                    photo_grab.config['url']).hexdigest(),
                    photos[0]['extention'])
            except UnicodeEncodeError:
                file_name = '%s.%s' % (hashlib.md5(
                    photo_name_except).hexdigest(),
                    photos[0]['extention'])
            photo.photo.save(file_name, ContentFile(photos[0]['body']))
        if extra_object:
            extra_object.advert = advert
            extra_object.save()
        self.stats['saved'] += 1
class ParserWithProxy(Spider):
    u"""Base parser class for working through proxies."""

    USE_PROXY = True

    def __init__(self, country_code, *args, **kwargs):
        super(ParserWithProxy, self).__init__(*args, **kwargs)
        self.country = countries.get(alpha2=country_code)
        self.proxies = []
        self.used_proxies = set()
        self.grab = None
        self.grab_use_count = None
        self.reinit_grab()
        self.setup_queue(getattr(config, 'QUEUE_BACKEND', 'memory'))
        if getattr(config, 'CACHE_ENABLED', False):
            self.setup_cache('mongo',
                             getattr(config, 'CACHE_DATABASE', 'cache'))

    def check_grab(self, grab):
        return True

    def reinit_grab(self):
        if not self.grab:
            self.grab = Grab()
        self.grab_use_count = 0
        while True:
            self.grab.clear_cookies()
            self.grab.setup(**self.get_next_proxy())
            if self.check_grab(self.grab):
                break
            logger.info(u'Bad proxy. Switching...')

    def get_grab(self):
        self.grab_use_count += 1
        if self.grab_use_count > config.PROXY_USE_LIMIT:
            self.reinit_grab()
        return self.grab.clone()

    def get_next_proxy(self):
        u"""Get the next unused proxy."""
        if not self.USE_PROXY:
            return {}
        while not self.proxies:
            # Fetch proxies and filter out the ones already used
            self.proxies = get_proxy_list(self.country.alpha2, 100)
            self.proxies = filter(
                lambda proxy: tuple(proxy.values()) not in self.used_proxies,
                self.proxies
            )
            if not self.proxies:
                logger.info(u'Ran out of proxies, waiting for new ones')
                sleep(10)
            else:
                break
        # Return the first proxy
        proxy = self.proxies[0]
        self.used_proxies.add(tuple(proxy.values()))
        del self.proxies[0]
        return proxy
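# A minimal sketch of using ParserWithProxy; the subclass name, the probe
# URL, and the check logic are assumptions for illustration:
class MySpider(ParserWithProxy):
    def check_grab(self, grab):
        # Reject proxies that cannot fetch a known page
        try:
            grab.go('http://example.com/')
            return grab.response.code == 200
        except GrabNetworkError:
            return False

bot = MySpider('US')
g = bot.get_grab()  # a clone; the proxy rotates after PROXY_USE_LIMIT uses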
def task_collect_adv_data(self, grab, task):
    self.stats['taken'] += 1
    one = []
    url_adv = []
    for i in grab.doc.select('//@href'):
        one.append(i.text())
    for item in one:
        if item.startswith('view.php?ad_id=') and item not in url_adv:
            url_adv.append(item)
    for one_adv in url_adv:
        self.stats['processed'] += 1
        addition = task.get('addition')
        advert = Advert()
        if self.city == 'kharkov':
            advert.category_id = CATEGORIES_khar[addition['category']]
        else:
            advert.category_id = CATEGORIES_kiev[addition['category']]
        advert.city_id = self.city_id
        advert.author_id = self.author_id
        advert.link = DOMEN[:13] + one_adv
        g = Grab()
        # The printable version of the page exposes the phone number
        g.go(DOMEN[:13] + "print_" + one_adv)
        if g.doc.select("//td/p[contains(.,'%s')]" % u'Тел:').text():
            advert.raw_phones = g.doc.select(
                "//td/p[contains(.,'%s')]" % u'Тел:').text()[5:]
        g.go(DOMEN[:13] + one_adv)
        categories = g.doc.select(
            '//div[@style="font-size: 11px;"]').text()
        if g.doc.select('//h1').text():
            advert.title = g.doc.select('//h1').text()
        if g.doc.select('//p[@class="ad-price"]').text():
            numlist = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
            price = ""
            for i in g.doc.select('//p[@class="ad-price"]').text():
                if i in numlist:
                    price += i
            if u'грн' in g.doc.select('//p[@class="ad-price"]').text():
                if price:
                    advert.price_uah = int(price)
            else:
                if price:
                    advert.price_usd = int(price)
        if g.doc.select('//p[@class="ad-desc"]').text():
            advert.main_text = g.doc.select('//p[@class="ad-desc"]').text()
        # Stays None when the category matches no branch below
        extra_object = None
        if advert.category_id in [21, 11, 27, 17]:
            extra_object = ExtraFlat()
            if u"Этаж" in categories:
                separator1 = g.doc.select(
                    "//div[@style='font-size: 11px;']/p[contains(.,'%s')]"
                    % u'Этаж').text().find(":")
                both = g.doc.select(
                    "//div[@style='font-size: 11px;']/p[contains(.,'%s')]"
                    % u'Этаж').text()[separator1 + 2:]
                separator2 = both.find("/")
                extra_object.floors = both[separator2 + 2:]
                extra_object.floor = both[:separator2 - 1]
            if u"Комнат" in categories:
                extra_object.rooms_number = g.doc.select(
                    '//p[@class="ad-contacts"]').text()[-1]
            if u"Общая" in categories:
                separator = g.doc.select(
                    "//div[@style='font-size: 11px;']/p[contains(.,'%s')]"
                    % u'Общая площадь').text().find(":")
                extra_object.total_area = g.doc.select(
                    "//div[@style='font-size: 11px;']/p[contains(.,'%s')]"
                    % u'Общая площадь').text()[separator + 2:-6]
        if advert.category_id in [14, 24]:
            extra_object = ExtraHouse()
            if u"Этажность" in categories:
                extra_object.floors = g.doc.select(
                    "//div[@style='font-size: 11px;']/p[contains(.,'%s')]"
                    % u'Этаж').text()[-1]
            if u"Площадь дома" in categories:
                separator = g.doc.select(
                    "//div[@style='font-size: 11px;']/p[contains(.,'%s')]"
                    % u'Площадь дома').text().find(":")
                extra_object.total_area = g.doc.select(
                    "//div[@style='font-size: 11px;']/p[contains(.,'%s')]"
                    % u'Площадь дома').text()[separator + 2:-6]
            if u"Площадь участка" in categories:
                separator = g.doc.select(
                    "//div[@style='font-size: 11px;']/p[contains(.,'%s')]"
                    % u'Площадь участка').text().find(":")
                extra_object.lot_area = g.doc.select(
                    "//div[@style='font-size: 11px;']/p[contains(.,'%s')]"
                    % u'Площадь участка').text()[separator + 2:]
        if advert.category_id in [16]:
            extra_object = ExtraLot()
            if u"Площадь участка" in categories:
                separator1 = g.doc.select(
                    "//div[@style='font-size: 11px;']/p[contains(.,'%s')]"
                    % u'Площадь участка').text().find(":")
                extra_object.total_area = g.doc.select(
                    "//div[@style='font-size: 11px;']/p[contains(.,'%s')]"
                    % u'Площадь участка').text()[separator1 + 2:]
            if u"Под строительство" in categories:
                extra_object.intended_purpose = g.doc.select(
                    "//ul[@style='list-style-type: none']/li[contains(.,'%s')]"
                    % u'Под').text()
        if g.doc.select("//h3").text():
            if "," in g.doc.select("//h3").text():
                subloc = g.doc.select("//h3").text()
            else:
                separator = g.doc.select("//h3").text().find(" ")
                subloc = g.doc.select("//h3").text()[:separator]
            advert.sublocality_id = SUB_FN[subloc]
        advert.metro_id = advert.detect_metro_id(self.metro_marker)
        same_adv = Advert.objects.filter(
            category_id=advert.category_id,
            author_id=self.author_id,
            city_id=self.city_id,
            link=advert.link,
        ).first()
        if same_adv:
            self.stats['omited'] += 1
            if same_adv.date_of_update < (timezone.now() -
                                          datetime.timedelta(hours=20)):
                same_adv.date_of_update = timezone.now()
                same_adv.save()
                self.stats['date_of_update'] += 1
            continue
        advert.save()
        photo_grab = g.clone()
        photo_grab.setup(proxy_auto_change=False, reuse_referer=False)
        img = []
        for i in g.doc.select('//@href'):
            if i.text().startswith("./upload/pics/"):
                img.append(i.text())
        for photo in img:
            photo_name_except = photo[14:-4]
            photo_link = '%s%s' % (DOMEN[:13], photo)
            photos = []
            photo_links2 = []
            sleep(0.2)
            try:
                photo_grab.go(photo_link)
                if photo_grab.response.code == 200 and \
                        re.match('image/',
                                 photo_grab.response.headers['Content-Type']):
                    photos.append({
                        'body': photo_grab.response.body,
                        'extention': RE_EXTENTION.search(
                            photo_grab.config['url']).group()
                    })
            except GrabNetworkError:
                photo_links2.append(photo_link)
            if not photos:
                # Nothing was downloaded; the original crashed here
                continue
            photo = Photo(advert_id=advert.id)
            try:
                file_name = '%s.%s' % (hashlib.md5(
                    photo_grab.config['url']).hexdigest(),
                    photos[0]['extention'])
            except UnicodeEncodeError:
                file_name = '%s.%s' % (hashlib.md5(
                    photo_name_except).hexdigest(),
                    photos[0]['extention'])
            photo.photo.save(file_name, ContentFile(photos[0]['body']))
        if extra_object:
            extra_object.advert = advert
            extra_object.save()
        self.stats['saved'] += 1
def create_advert(self, raw):
    grab = Grab()
    self.stats['taken'] += 1
    self.stats['processed'] += 1
    # general fields
    extra_object = None
    adv = Advert()
    adv.city_id = self.city_id
    adv.author_id = self.author_id
    if 'priceArr' in raw:
        if raw['priceArr']['3']:
            price = re.sub(r'\s', '', raw['priceArr']['3'])
            adv.price_uah = int(price)
        else:
            adv.price_uah = 1
    if 'description' in raw:
        adv.main_text = raw['description']
    if 'user' in raw:
        adv.contact_name = raw['user']['name']
    if 'user_id' in raw:
        us_id = raw['user_id']
        p = Grab()
        p.go(URLS_ID % us_id)
        main = p.doc.select("//li[@class='fieldWrap']").text()
        adv.raw_phones = re.sub(r'\s|\(|\)|\-', '', main)
    if 'street_name' in raw:
        adv.street = raw['street_name']
    if 'beautiful_url' in raw:
        url_domria = raw['beautiful_url']
        adv.link = (LINK_DOM % url_domria)
    if raw['advert_type_id'] in [1]:
        if raw['realty_type_name'] in [u'квартира', u'Квартира']:
            extra_object = ExtraFlat()
            adv.category_id = CATEGORIES['prodazha-kvartir']
            titles = u'Продажа квартиры %s'
            if 'floors_count' in raw:
                extra_object.floors = raw['floors_count']
            if 'floor' in raw:
                extra_object.floor = raw['floor']
            if 'rooms_count' in raw:
                extra_object.rooms_number = raw['rooms_count']
            if 'total_square_meters' in raw:
                extra_object.total_area = raw['total_square_meters']
        if raw['realty_type_name'] in [u'дом', u'Дом']:
            extra_object = ExtraHouse()
            adv.category_id = CATEGORIES['prodazha-domov']
            titles = u'Продажа дома %s'
            if 'total_square_meters' in raw:
                extra_object.total_area = raw['total_square_meters']
    if raw['advert_type_id'] in [3, 4]:
        if raw['realty_type_name'] in [u'квартира', u'Квартира']:
            extra_object = ExtraFlat()
            adv.category_id = CATEGORIES['arenda-kvartir']
            titles = u'Аренда квартиры %s'
            if 'floors_count' in raw:
                extra_object.floors = raw['floors_count']
            if 'floor' in raw:
                extra_object.floor = raw['floor']
            if 'rooms_count' in raw:
                extra_object.rooms_number = raw['rooms_count']
            if 'total_square_meters' in raw:
                extra_object.total_area = raw['total_square_meters']
        if raw['realty_type_name'] in [u'дом', u'Дом']:
            extra_object = ExtraHouse()
            adv.category_id = CATEGORIES['arenda-domov']
            titles = u'Аренда дома %s'
            if 'total_square_meters' in raw:
                extra_object.total_area = raw['total_square_meters']
    if 'district_name' in raw:
        sudlo_name = int(SUB_CIUA[raw['district_name']])
        adv.sublocality_id = sudlo_name
        adv.title = (titles % adv.sublocality.name)
    if 'district_name' not in raw:
        if 'street_name' in raw:
            adv.title = (titles % raw['street_name'])
        else:
            adv.title = (titles % self.city)
    if self.metro_marker:
        if 'metro_station_name' in raw:
            metro_station = METRO_CIUA[raw['metro_station_name']]
            adv.metro_id = metro_station
    adv.save()
    photo_grab = grab.clone()
    photo_grab.setup(proxy_auto_change=False, reuse_referer=False)
    photo_links2 = []  # was referenced below without being defined
    for key in raw['photos']:
        key_photo = raw['photos'][key]['file']
        photo_link = (PHOTO_URL % (key_photo.replace('.', 'f.')))
        photos = []
        sleep(0.2)
        try:
            photo_grab.go(photo_link)
            if photo_grab.response.code == 200 and \
                    re.match('image/',
                             photo_grab.response.headers['Content-Type']):
                photos.append({
                    'body': photo_grab.response.body,
                    'extention': RE_EXTENTION.search(
                        photo_grab.config['url']).group()
                })
        except GrabNetworkError:
            photo_links2.append(photo_link)
        photo = Photo(advert_id=adv.id)
        try:
            file_name = '%s.%s' % (hashlib.md5(
                photo_grab.config['url']).hexdigest(),
                photos[0]['extention'])
            photo.photo.save(file_name, ContentFile(photos[0]['body']))
        except IndexError:
            pass
    if extra_object:
        extra_object.advert = adv
        extra_object.save()
    self.stats['saved'] += 1
class Avito(object):
    PAGETRY = 3
    SLEEP = 1

    def __init__(self, proxy=None):
        self.PAGETRY, rc = startup.CFG.value('pagetry', Avito.PAGETRY).toInt()
        self.SLEEP, rc = startup.CFG.value('sleep', Avito.SLEEP).toInt()
        self.g = Grab()
        self.g.setup(hammer_mode=True, hammer_timeouts=((2, 5), (5, 10)))
        if __name__ == '__main__':
            self.g.setup(log_dir='dump')
        if proxy:
            self.g.load_proxylist(proxy, 'text_file', 'http',
                                  auto_init=True, auto_change=False)
        self.tess = Tesseract()
        self.tess.set_pagemode(TessPageSegMode.PSM_SINGLE_LINE)

    def _go3(self, url, tag):
        c = self.PAGETRY
        while c:
            try:
                self.g.change_proxy()
                self.g.go(url)
                self.g.assert_substring(u'content="51a6d66a02fb23c7"')
                break
            except Exception:
                log.exception('%s left %i', tag, c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            raise Exception('%s error' % tag)

    def get_links(self, url):
        self._go3(url, 'start page')
        c = 999
        while True:
            links = self.g.doc.select('//h3[@class="title"]/a/@href')
            if not links:
                raise Exception('no links')
            for link in links:
                c -= 1
                if not c:
                    return
                yield urljoin(url, link.text())
            next = self.g.doc.select('//a[@class="next"]/@href')
            if not next:
                log.debug('last page?')
                break
            QtCore.QThread.sleep(self.SLEEP)
            nurl = urljoin(url, next.text())
            log.debug('open next page %s', nurl)
            self._go3(nurl, 'next page')

    def get_photos(self, photos):
        g = self.g.clone()
        datas = []
        for url in photos:
            if not url.startswith('http:'):
                url = 'http:' + url
            c = self.PAGETRY
            while c:
                try:
                    rc = g.go(url)
                    g.assert_substring('JFIF', byte=True)
                    datas.append(rc.body)
                    break
                except Exception:
                    log.exception('get_item left %i', c)
                    c -= 1
                    g.change_proxy()
                    QtCore.QThread.sleep(self.SLEEP)
            else:
                raise Exception('get photo error')
        return datas

    def get_item(self, url):
        g = self.g.clone()
        c = self.PAGETRY
        while c:
            try:
                g.change_proxy()
                g.go(url)
                g.assert_substring(u'content="499bdc75d3636c55"')  # avito's ya id
                break
            except Exception:
                log.exception('get_item left %i', c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            raise Exception('get item error')

        doc = g.doc
        title = doc.select('//h1[@itemprop="name"]').text()
        gallery = doc.select('//div[@class="gallery-item"]')
        photos = [s.text()
                  for s in gallery.select('.//a[@class="gallery-link"]/@href')]
        if not photos:
            egg = doc.select(
                '//img[contains(@class,"j-zoom-gallery-init")]/@src')
            if egg:
                photos.append(egg.text())
        item = doc.select('//div[@itemprop="offers"]')
        price = item.select('.//span[@itemprop="price"]').text()
        name = item.select('.//strong[@itemprop="name"]').text()
        try:
            town = item.select(
                './/span[@id="toggle_map"]/span[@itemprop="name"]').text()
        except Exception:
            log.warning('xpath town not found, try another way')
            town = item.select(
                './/div[@id="map"]/span[@itemprop="name"]').text()
        desc = doc.select('//div[contains(@class,"description-text")]').text()
        # <span id="item_id">348967351</span>
        item_id = doc.select('//span[@id="item_id"]').text()
        _url = g.rex_text("avito.item.url = '([^>]+?)'")
        _phone = g.rex_text("avito.item.phone = '([^>]+?)'")
        loc = {'item_phone': _phone, 'item': {'id': item_id, 'url': _url}}
        log.debug('jslock enter <--')
        with PyV8.JSContext(loc) as ctx:
            ctx.enter()
            ctx.eval('''function phoneDemixer(key){var pre=key.match(/[0-9a-f]+/g),mixed=(item.id%2===0?pre.reverse():pre).join(''),s=mixed.length,r='',k;for(k=0;k<s;++k){if(k%3===0){r+=mixed.substring(k,k+1);}} return r;}''')
            # http://www.avito.ru/items/phone/348967351?pkey=6fec07a9bb9902ad4ccbd87c240cfdc4
            egg = ctx.eval(
                "'/items/phone/'+item.id+'?pkey='+phoneDemixer(item_phone)")
            log.debug('js rc %s', egg)
            ctx.leave()
        log.debug('jslock leave -->')
        phone = ''
        c = self.PAGETRY
        while c:
            log.debug('read phone image')
            try:
                rc = g.go(urljoin(url, egg))
                img = Image.open(StringIO(rc.body))
                phone = self.tess.from_image(img, basewidth=300,
                                             whitelist='0123456789-')
                break
            except Exception:
                g.change_proxy()
                log.exception('get_phone left %i', c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            log.debug('get phone error')
        return dict(item=item_id, title=title, photos=photos, price=price,
                    name=name, town=town, desc=desc, phone=phone)
def volume_info(url_volume, url_ranobe):
    """Return a dict with information about a ranobe volume."""
    g = Grab()
    g.setup(hammer_mode=True)

    # Open the volume page
    g.go(url_volume)
    if g.response.code != 200:
        print("Page: {}, return code: {}".format(url_volume,
                                                 g.response.code))
        return

    # Link to the full-size volume cover image
    url_cover_volume = get_url2full_image_cover(g.clone(), url_ranobe)

    # Get the list of two-column rows; each row holds one piece of
    # information about the volume: titles in several languages, series,
    # author, illustrator, and so on.
    list_info = g.doc.select('//table[@id="release-info"]/tr/td[2]')

    volume_name = None
    series = None
    author = None
    illustrator = None
    volume_isbn = None
    tr_team = None
    translators = None
    try:
        # list_info[0] holds the Japanese title, list_info[1] the English
        # title and list_info[7] the translation status; they are unused here
        volume_name = list_info[2].text()
        series = list_info[3].text()
        author = list_info[4].text()
        illustrator = list_info[5].text()
        volume_isbn = list_info[6].text()
        tr_team = list_info[8].text()
        translators = list_info[9].text().split(', ')
    except IndexError:
        print("Not enough volume info fields: {}".format(url_volume))

    # Get the list of the volume's chapters from the table of contents
    volume_pages = get_volume_pages(g)

    # If there is no table of contents, skip the volume
    if not volume_pages:
        print("No table of contents: {}".format(url_volume))
        return

    # Chapters of the volume
    chapters = list()
    # The remaining pages that are not chapters of the volume
    other_pages = dict()

    # Dict holding the volume information
    info = {
        "name": volume_name,
        "series": series,
        "author": author,
        "illustrator": illustrator,
        "ISBN": volume_isbn,
        "url_cover": url_cover_volume,
        "pages": {
            "chapters": chapters,
            "other": other_pages,
        },
        "translation": {
            "team": tr_team,
            "translators": translators,
        },
    }

    # Walk through all pages of the volume
    for page in volume_pages:
        # Address of the volume page (or a list of subchapters)
        name_ch, url_ch = page
        if not isinstance(url_ch, list):
            check = check_volume_page(url_ch)
            if check is False:
                return
            elif check is None:
                continue
        else:
            # Check the subchapters
            for sub_ch in url_ch:
                sub_name, sub_url = sub_ch
                check = check_volume_page(sub_url)
                if check is False:
                    return
                elif check is None:
                    continue

        # Split the page list by page type: chapters go to one list,
        # everything else goes elsewhere.
        if isinstance(url_ch, list):
            # The address is a list of subchapters: append it as a chapter
            chapters.append(page)
        else:
            # A volume page may be "opening illustrations", a prologue,
            # the chapters themselves, etc. The page types are described
            # above this function.
            volume_base_page = get_volume_base_page(url_ch)
            if type_pages_is_chapter(volume_base_page):
                # The page is a chapter: add its address to the list
                chapters.append(page)
            else:
                # Not a chapter: store it among the other pages
                other_pages[volume_base_page] = page

    return info
logger_network = logging.getLogger('grab.network')

# Configure the Grab object
g = Grab()
capUrl = 'http://is.fssprus.ru/ajax_search'
url = 'http://fssprus.ru/iss/ip/'
g.setup(
    log_file='fs_grab_log.txt',
    log_dir='~/workspace/',
    debug_post=True,
    debug=True,
    verbose_logging=True,
)
g.go(url, method="POST")
if g.go(url, post={'variant': '1',
                   'last_name': 'Антон',
                   'first_name': 'Петров'}):
    g1 = g.clone()
    print(g1.request_method)
else:
    print("Something went wrong")

# The original read `input(int())`, which merely uses 0 as the prompt;
# the intent is to read a number from the user
if int(input()) == 1:
    g2 = g1.clone()
class FreeLanceRu(Spider):
    PROJECT_BY_PID = 'http://www.free-lance.ru/projects/?pid=%d'
    INDEX_BY_PAGE = 'http://www.free-lance.ru/?page=%d'

    def __init__(self, pages_count=5, *args, **kwargs):
        self.pages_count = pages_count
        super(FreeLanceRu, self).__init__(*args, **kwargs)

    def prepare(self):
        self.grab = Grab()
        self.grab.setup(headers=additional_headers)

    def get_grab(self, url=None):
        grab = self.grab.clone()
        if url:
            grab.setup(url=url)
        return grab

    def get_task(self, **kwargs):
        url = None
        if 'url' in kwargs:
            url = kwargs['url']
            del kwargs['url']
        grab = self.get_grab(url=url)
        return Task(grab=grab, **kwargs)

    def task_generator(self):
        for index in range(self.pages_count):
            yield self.get_task(
                name='page',
                url=FreeLanceRu.INDEX_BY_PAGE % (index + 1)
            )

    def task_page(self, grab, task):
        pids = grab.xpath_list('//a[starts-with(@id, "prj_name_")]/@id')
        pids = map(lambda item: int(item.split('_')[-1]), pids)
        for pid in pids:
            url = FreeLanceRu.PROJECT_BY_PID % (pid)
            if model.Project.query.filter_by(url=url).first():
                continue
            yield self.get_task(name='project', pid=pid, url=url)

    def task_project(self, grab, task):
        project = None
        if grab.xpath_exists('//*[@class="contest-view"]'):
            project = self.parse_contest_view(grab, task)
        elif grab.xpath_exists('//*[@class="pay-prjct"]'):
            project = self.parse_pay_project(grab, task)
        else:
            project = self.parse_project(grab, task)
        if project:
            self.check_project(project)

    def parse_project(self, grab, task):
        project = {}

        project['url'] = FreeLanceRu.PROJECT_BY_PID % (task.pid)

        name = grab.xpath('//h1[@class="prj_name"]/text()')
        name = name.strip().encode('utf-8')
        project['name'] = name

        date = grab.xpath('//*[@class="user-about-r"]/p/text()')
        date = date.split('[', 1)[0]
        date = date.strip().encode('utf-8')
        date = datetime.datetime.strptime(date, "%d.%m.%Y | %H:%M")
        project['date'] = date

        category = grab.rex(
            u'<p class="crumbs">Разделы:    (.*?)(, |</p>)'
        )
        category = category.group(1)
        items = fromstring(category).xpath('./a/text()')
        if not items:
            items = category.split(' / ')
        category = items
        category = map(lambda a: a.strip().encode('utf-8'), category)
        project['category'] = category

        description = grab.xpath('//*[@class="prj_text"]/text()')
        description = description.encode('utf-8')
        project['description'] = description

        project['type'] = 'simple'

        return project

    def parse_contest_view(self, grab, task):
        project = {}
        project['url'] = FreeLanceRu.PROJECT_BY_PID % (task.pid)
        project['type'] = 'contest'
        return project

    def parse_pay_project(self, grab, task):
        project = {}
        project['url'] = FreeLanceRu.PROJECT_BY_PID % (task.pid)
        project['type'] = 'pay'
        return project

    def check_project(self, project):
        if model.Project.query.filter_by(url=project['url']).first():
            return
        category = None
        if 'category' in project:
            category = self.get_category(project['category'])
        model.Project(
            name=project.get('name', None),
            url=project['url'],
            description=project.get('description', None),
            project_type=project['type'],
            category=category,
            date=project.get('date', None),
            site=model.free_lance_ru
        )
        session.commit()

    def get_category(self, path):
        path.reverse()
        category = None
        while path:
            category = model.Category.query.filter_by(
                name=path.pop(),
                parent=category,
                site=model.free_lance_ru
            )
            category = category.first()
        return category
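# A sketch of running the spider above; run() is the usual entry point of
# Grab's Spider base class, and thread_number is an assumed option:
bot = FreeLanceRu(pages_count=3, thread_number=2)
bot.run()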