Example #1
    def transport_option_logic(self, curl_transport, fake_transport):
        g = Grab(transport=curl_transport)
        g.go(self.server.get_url())
        self.assertEqual(g.response.body, b'XYZ')

        g2 = g.clone()
        g2.go(self.server.get_url())
        self.assertEqual(g2.response.body, b'XYZ')

        g2_data = pickle.dumps(g2, pickle.HIGHEST_PROTOCOL)
        g3 = pickle.loads(g2_data)
        g3.go(self.server.get_url())
        self.assertEqual(g3.response.body, b'XYZ')

        g = Grab(transport=fake_transport)
        g.go(self.server.get_url())
        self.assertEqual(g.response.body, b'Faked XYZ')

        g2 = g.clone()
        g2.go(self.server.get_url())
        self.assertEqual(g2.response.body, b'Faked XYZ')

        g2_data = pickle.dumps(g2, pickle.HIGHEST_PROTOCOL)
        g3 = pickle.loads(g2_data)
        g3.go(self.server.get_url())
        self.assertEqual(g3.response.body, b'Faked XYZ')
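A note on the pattern above: it relies on clone() returning an independent Grab instance with a copy of the original's config (including the transport choice), and on clones surviving a pickle round-trip. A minimal sketch of that contract, assuming a local test server like the one used in these tests:

import pickle

from grab import Grab

g = Grab()
g2 = g.clone()  # independent copy of config, cookies and state

# The clone can be serialized and revived, then used like a fresh instance:
g3 = pickle.loads(pickle.dumps(g2, pickle.HIGHEST_PROTOCOL))
g3.go('http://localhost:8000/')  # assumed local server URL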
Example #2
    def transport_option_logic(self, curl_transport, fake_transport):
        g = Grab(transport=curl_transport)
        g.go(SERVER.BASE_URL)
        self.assertEqual(g.response.body, 'XYZ')

        g2 = g.clone()
        g2.go(SERVER.BASE_URL)
        self.assertEqual(g2.response.body, 'XYZ')

        g2_data = pickle.dumps(g2)
        g3 = pickle.loads(g2_data)
        g3.go(SERVER.BASE_URL)
        self.assertEqual(g3.response.body, 'XYZ')

        g = Grab(transport=fake_transport)
        g.go(SERVER.BASE_URL)
        self.assertEqual(g.response.body, 'Faked XYZ')

        g2 = g.clone()
        g2.go(SERVER.BASE_URL)
        self.assertEqual(g2.response.body, 'Faked XYZ')

        g2_data = pickle.dumps(g2)
        g3 = pickle.loads(g2_data)
        g3.go(SERVER.BASE_URL)
        self.assertEqual(g3.response.body, 'Faked XYZ')
Example #3
    def transport_option_logic(self, curl_transport, fake_transport):
        g = Grab(transport=curl_transport)
        g.go(SERVER.BASE_URL)
        self.assertEqual(g.response.body, "XYZ")

        g2 = g.clone()
        g2.go(SERVER.BASE_URL)
        self.assertEqual(g2.response.body, "XYZ")

        g2_data = pickle.dumps(g2, pickle.HIGHEST_PROTOCOL)
        g3 = pickle.loads(g2_data)
        g3.go(SERVER.BASE_URL)
        self.assertEqual(g3.response.body, "XYZ")

        g = Grab(transport=fake_transport)
        g.go(SERVER.BASE_URL)
        self.assertEqual(g.response.body, "Faked XYZ")

        g2 = g.clone()
        g2.go(SERVER.BASE_URL)
        self.assertEqual(g2.response.body, "Faked XYZ")

        g2_data = pickle.dumps(g2, pickle.HIGHEST_PROTOCOL)
        g3 = pickle.loads(g2_data)
        g3.go(SERVER.BASE_URL)
        self.assertEqual(g3.response.body, "Faked XYZ")
Example #4
 def assert_transport_pickle(self, transport, response):
     grab = Grab(transport=transport)
     grab2 = grab.clone()
     grab2_data = pickle.dumps(grab2, pickle.HIGHEST_PROTOCOL)
     grab3 = pickle.loads(grab2_data)
     grab3.go(self.server.get_url())
     self.assertEqual(grab3.doc.body, response)
Example #5
 def test_clone(self):
     g = Grab(transport=GRAB_TRANSPORT)
     SERVER.RESPONSE['get'] = 'Moon'
     g.go(SERVER.BASE_URL)
     self.assertTrue('Moon' in g.response.body)
     g2 = Grab(transport=GRAB_TRANSPORT)
     self.assertEqual(g2.response, None)
     g2 = g.clone()
      self.assertTrue('Moon' in g2.response.body)
Example #6
    def assert_transport_response(self, transport, response):
        self.server.response['get.data'] = response

        grab = Grab(transport=transport)
        grab.go(self.server.get_url())
        self.assertEqual(grab.doc.body, response)

        grab2 = grab.clone()
        grab2.go(self.server.get_url())
        self.assertEqual(grab2.doc.body, response)
Example #7
def yandex_request(query_url):
    g = Grab(cookiefile='var/yandex.cookies')
    g.go(query_url)

    while g.search(u'<title>Ой!') is not None:
        url = g.css('.b-captcha__image').get('src')
        solution = solve_captcha(g.clone(), url)
        g.set_input('rep', solution)
        g.submit()
        if g.search(u'<title>Ой!') is None:
            g.go(query_url)
    return g
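The clone here matters: fetching the CAPTCHA image through g.clone() keeps the session cookies but leaves g's current document (with the form that set_input()/submit() operate on) untouched. A hypothetical solve_captcha compatible with this call; the recognizer is a placeholder assumption:

def solve_captcha(grab, image_url):
    # Same cookies as the caller, so the image matches the challenge;
    # the caller's page and form state stay intact.
    grab.go(image_url)
    return recognize_image(grab.response.body)  # hypothetical OCR helper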
Example #8
def google_request(query_url):
    g = Grab(cookiefile='var/google.cookies', log_dir='log')
    g.go(query_url)

    while g.search('please type the characters below') is not None:
        url = g.get_xpath('//img', lambda x: 'sorry' in x.get('src')).get('src')
        solution = solve_captcha(g.clone(), url)
        g.set_input('captcha', solution)
        g.submit()
        if g.search('please type the characters below') is None:
            g.go(query_url)

    return g
Example #9
 def test_empty_clone(self):
     g = Grab()
     g.clone()
Example #10
    def fetch(self):
        """
        Download urls via multicurl.
        
        Get new tasks from queue.
        """
        m = pycurl.CurlMulti()
        m.handles = []

        # Create curl instances
        for x in xrange(self.thread_number):
            curl = pycurl.Curl()
            m.handles.append(curl)

        freelist = m.handles[:]

        # This is an infinite loop.
        # It can be broken only from outside code that
        # iterates over the result of this method.
        while True:

            cached_request = None

            while len(freelist):

                # Check the request limit
                if (self.request_limit is not None
                        and self.counters['request'] >= self.request_limit):
                    logging.debug('Request limit is reached: %s'
                                  % self.request_limit)
                    if len(freelist) == self.thread_number:
                        yield None
                    else:
                        break
                else:
                    try:
                        priority, task = self.taskq.get(True, 0.1)
                    except Empty:
                        # If all handlers are free and no tasks are queued,
                        # yield the None signal
                        if len(freelist) == self.thread_number:
                            yield None
                        else:
                            break
                    else:
                        if not self._preprocess_task(task):
                            continue

                        task.network_try_count += 1
                        if task.task_try_count == 0:
                            task.task_try_count = 1

                        if task.task_try_count > self.task_try_limit:
                            logging.debug('Task tries ended: %s / %s' %
                                          (task.name, task.url))
                            self.add_item('too-many-task-tries', task.url)
                            continue

                        if task.network_try_count > self.network_try_limit:
                            logging.debug('Network tries ended: %s / %s' %
                                          (task.name, task.url))
                            self.add_item('too-many-network-tries', task.url)
                            continue

                        if task.grab:
                            grab = task.grab
                        else:
                            # Set up curl instance via Grab interface
                            grab = Grab(**self.grab_config)
                            grab.setup(url=task.url)

                        if self.use_cache and not task.get('disable_cache'):
                            if grab.detect_request_method() == 'GET':
                                url = grab.config['url']
                                cache_item = self.cache.find_one({'_id': url})
                                if cache_item:
                                    cached_request = (grab, grab.clone(), task,
                                                      cache_item)
                                    grab.prepare_request()
                                    self.inc_count('request-cache')

                                    # break from the pre-request loop
                                    # and go to the response-processing code
                                    break

                        self.inc_count('request-network')
                        if self.proxylist_config:
                            args, kwargs = self.proxylist_config
                            grab.setup_proxylist(*args, **kwargs)

                        curl = freelist.pop()
                        curl.grab = grab
                        curl.grab.curl = curl
                        curl.grab_original = grab.clone()
                        curl.grab.prepare_request()
                        curl.task = task
                        # Add configured curl instance to multi-curl processor
                        m.add_handle(curl)

            # If any network requests were started
            if len(freelist) != self.thread_number:
                while True:
                    status, active_objects = m.perform()
                    if status != pycurl.E_CALL_MULTI_PERFORM:
                        break

            if cached_request:
                grab, grab_original, task, cache_item = cached_request
                url = task.url  # or grab.config['url']
                grab.fake_response(cache_item['body'])

                def custom_prepare_response(g):
                    g.response.head = cache_item['head'].encode('utf-8')
                    g.response.body = cache_item['body'].encode('utf-8')
                    g.response.code = cache_item['response_code']
                    g.response.time = 0
                    g.response.url = cache_item['url']
                    g.response.parse('utf-8')
                    g.response.cookies = g.extract_cookies()

                grab.process_request_result(custom_prepare_response)

                yield {
                    'ok': True,
                    'grab': grab,
                    'grab_original': grab_original,
                    'task': task,
                    'ecode': None,
                    'emsg': None
                }
                self.inc_count('request')

            while True:
                queued_messages, ok_list, fail_list = m.info_read()

                results = []
                for curl in ok_list:
                    results.append((True, curl, None, None))
                for curl, ecode, emsg in fail_list:
                    results.append((False, curl, ecode, emsg))

                for ok, curl, ecode, emsg in results:
                    res = self.process_multicurl_response(
                        ok, curl, ecode, emsg)
                    m.remove_handle(curl)
                    freelist.append(curl)
                    yield res
                    self.inc_count('request')

                if not queued_messages:
                    break

            m.select(0.5)
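fetch() above is a generator: outside code drives it and receives either None (all handles idle and the queue empty) or one result dict per completed request. A minimal consumer sketch, assuming a spider wired up as in this class; handle_result is a hypothetical callback:

for res in spider.fetch():
    if res is None:
        # Idle signal: nothing running, nothing queued.
        # The consumer decides whether to stop the infinite loop.
        break
    elif res['ok']:
        handle_result(res['grab'], res['task'])  # hypothetical callback
    else:
        print('Network error %s: %s' % (res['ecode'], res['emsg']))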
Example #11
    def task_collect_adv_data(self, grab, task):
        self.stats['taken'] += 1
        one = []
        url_adv = []
        for i in grab.doc.select('//@href'):
            one.append(i.text())
        for item in one:
            if DOMEN[self.city][29:] == 'kharkov':
                if item.startswith('/nedvizhimost/xarkov-'):
                    url_adv.append(item)
                else:
                    continue
            else:
                if item.startswith('/nedvizhimost/%s-' %
                                   DOMEN[self.city][29:]):
                    url_adv.append(item)
                else:
                    continue
        for one_adv in url_adv:
            self.stats['processed'] += 1
            addition = task.get('addition')
            g = Grab()
            g.go(DOMEN[self.city][:15] + one_adv)
            advert = Advert()
            extra_object = None  # referenced below even when no category matches
            advert.category_id = CATEGORIES[addition['category']]
            advert.city_id = self.city_id
            advert.author_id = self.author_id
            advert.link = DOMEN[self.city][:15] + one_adv

            categories = g.doc.select(
                '//div[@id="content_objectTabWidgetinfo_tab"]').text()

            if g.doc.select('//h1').text():
                title = g.doc.select('//h1').text()
                advert.title = title
            if g.doc.select('//p[@itemprop="average"]').text():
                numlist = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
                price = ""
                for i in g.doc.select('//p[@itemprop="average"]').text():
                    if i in numlist:
                        price += i
                advert.price_uah = int(price)
            if g.doc.select('//div[@class="objava_define"]').text():
                text = g.doc.select('//div[@class="objava_define"]').text()
                advert.main_text = text
            if g.doc.select('//p[@class="tel_user_obj tel"]').text():
                phones = g.doc.select('//p[@class="tel_user_obj tel"]').text()
                advert.raw_phones = phones
            if g.doc.select('//a[@class="ceeboxAuto"]').text():
                contact = g.doc.select('//a[@class="ceeboxAuto"]').text()
                advert.contact_name = contact
            if advert.category_id in [21, 11, 24, 27, 17]:
                extra_object = ExtraFlat()
                if u"Этаж" in categories:
                    separator1 = g.doc.select(
                        "//div[@class='kratkost']/p[contains(.,'%s')]" %
                        u'Этаж').text().find(":")
                    both = g.doc.select(
                        "//div[@class='kratkost']/p[contains(.,'%s')]" %
                        u'Этаж').text()[separator1 + 2:]
                    separator2 = both.find("/")
                    floors = both[separator2 + 1:]
                    floor = both[:separator2]
                    extra_object.floors = floors
                    extra_object.floor = floor
                if u"Комнат" in categories:
                    separator = g.doc.select(
                        "//div[@class='kratkost']/p[contains(.,'%s')]" %
                        u'Комнат').text().find(":")
                    rooms_number = g.doc.select(
                        "//div[@class='kratkost']/p[contains(.,'%s')]" %
                        u'Комнат').text()[separator + 2:separator + 3]
                    extra_object.rooms_number = rooms_number
                if u"Общая" in categories:
                    separator1 = g.doc.select(
                        "//div[@class='kratkost']/p[contains(.,'%s')]" %
                        u'Общая').text().find(":")
                    full_area = g.doc.select(
                        "//div[@class='kratkost']/p[contains(.,'%s')]" %
                        u'Общая').text()[separator1 + 2:]
                    separator2 = full_area.find(" ")
                    area = full_area[:separator2]
                    extra_object.total_area = area
            if advert.category_id in [14]:
                extra_object = ExtraHouse()
                if u"Этажей" in categories:
                    floors = g.doc.select(
                        "//div[@class='kratkost']/p[contains(.,'%s')]" %
                        u'Этажей').text()[8:]
                    extra_object.floors = floors
                if u"Общая" in categories:
                    separator1 = g.doc.select(
                        "//div[@class='kratkost']/p[contains(.,'%s')]" %
                        u'Общая').text().find(":")
                    full_area = g.doc.select(
                        "//div[@class='kratkost']/p[contains(.,'%s')]" %
                        u'Общая').text()[separator1 + 2:]
                    separator2 = full_area.find(" ")
                    area = full_area[:separator2]
                    extra_object.total_area = area
            if advert.category_id in [16, 26]:
                extra_object = ExtraLot()
                if u"Площадь" in categories:
                    separator1 = g.doc.select(
                        "//div[@class='kratkost']/p[contains(.,'%s')]" %
                        u'Площадь').text().find(":")
                    area = g.doc.select(
                        "//div[@class='kratkost']/p[contains(.,'%s')]" %
                        u'Площадь').text()[separator1 + 2:]
                    extra_object.total_area = area
                if u"Под ком. заст." or u"Под жил. заст." in categories:
                    granted = g.doc.select(
                        "//div[@class='kratkost']/p[contains(.,'%s')]" %
                        u'Под').text()
                    extra_object.intended_purpose = granted
            if u"Метро" in categories:
                separator = g.doc.select(
                    "//div[@class='kratkost']/p[contains(.,'%s')]" %
                    u'Метро').text().find("-")
                metro = g.doc.select(
                    "//div[@class='kratkost']/p[contains(.,'%s')]" %
                    u'Метро').text()[7:separator - 2]
                if metro == u'Дворец Спорта':
                    advert.metro_id = 76
                elif metro == u'Дружбы Народов':
                    advert.metro_id = 79
                elif metro == u'Красный Хутор':
                    advert.metro_id = 87
                elif metro == u'Демеевская':
                    advert.metro_id = 66
                elif metro == u'Советской армии':
                    advert.metro_id = 21
                elif metro == u'Маршала Жукова':
                    advert.metro_id = 12
                elif metro == u'Метростроителей им. Ващенко':
                    advert.metro_id = 13
                elif metro == u'им. А.С. Масельского':
                    advert.metro_id = 9
                else:
                    advert.metro_id = METRO_CIUA[metro]
            if g.doc.select("//div[@class='kratkost']").text():
                if DOMEN[self.city][29:] == 'kharkov':
                    separator1 = g.doc.select(
                        "//div[@class='kratkost']").text().find(",")
                    fulladress = g.doc.select(
                        "//div[@class='kratkost']").text()[separator1 + 11:]
                    separator2 = fulladress.find(",")
                    subloc = fulladress[:separator2]
                else:
                    separator1 = g.doc.select(
                        "//div[@class='kratkost']").text().find(",")
                    fulladress = g.doc.select(
                        "//div[@class='kratkost']").text()[separator1 + 2:]
                    separator2 = fulladress.find(",")
                    subloc = fulladress[:separator2]
                advert.sublocality_id = SUB_CIUA[subloc]
            same_adv = Advert.objects.filter(
                category_id=CATEGORIES[addition['category']],
                author_id=self.author_id,
                city_id=self.city_id,
                link=advert.link,
            ).first()
            if same_adv:
                self.stats['omited'] += 1
                if same_adv.date_of_update < (timezone.now() -
                                              datetime.timedelta(hours=20)):
                    same_adv.date_of_update = timezone.now()
                    same_adv.save()
                    self.stats['date_of_update'] += 1
                continue
            advert.save()
            photo_grab = g.clone()
            photo_grab.setup(proxy_auto_change=False, reuse_referer=False)
            img = []
            for i in g.doc.select('//@href'):
                if i.text().startswith("/pic/objects/"):
                    img.append(i.text())
                else:
                    continue
            for photo in img:
                photo_name_except = photo[22:54]
                photo_link = '%s%s' % (DOMEN[self.city][:15], photo)
                photos = []
                photo_links2 = []
                sleep(0.2)
                try:
                    photo_grab.go(photo_link)
                    if photo_grab.response.code == 200 and \
                            re.match('image/', photo_grab.response.headers['Content-Type']):
                        photos.append({
                            'body':
                            photo_grab.response.body,
                            'extention':
                            RE_EXTENTION.search(
                                photo_grab.config['url']).group()
                        })
                except GrabNetworkError as error:
                    photo_links2.append(photo_link)
                photo = Photo(advert_id=advert.id)
                try:
                    file_name = '%s.%s' % (hashlib.md5(
                        photo_grab.config['url']).hexdigest(),
                                           photos[0]['extention'])
                except UnicodeEncodeError:
                    file_name = '%s.%s' % (hashlib.md5(
                        photo_name_except).hexdigest(), photos[0]['extention'])
                photo.photo.save(file_name, ContentFile(photos[0]['body']))
            if extra_object:
                extra_object.advert = advert
                extra_object.save()
            self.stats['saved'] += 1
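The photo_grab = g.clone() step in this scraper (and in the similar ones below) is the typical clone() use case: keep the page session's cookies while tuning request options for a side channel of downloads. Distilled to a sketch; save_photo is a hypothetical persistence helper:

def download_photos(g, photo_urls):
    photo_grab = g.clone()  # same cookies/session as the page scraper
    photo_grab.setup(proxy_auto_change=False, reuse_referer=False)
    for url in photo_urls:
        photo_grab.go(url)
        if photo_grab.response.code == 200:
            save_photo(url, photo_grab.response.body)  # hypothetical helper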
Example #12
    def fetch(self):
        """
        Download urls via multicurl.
        
        Get new tasks from queue.
        """ 
        m = pycurl.CurlMulti()
        m.handles = []

        # Create curl instances
        for x in xrange(self.thread_number):
            curl = pycurl.Curl()
            m.handles.append(curl)

        freelist = m.handles[:]

        # This is an infinite loop.
        # It can be broken only from outside code that
        # iterates over the result of this method.
        while True:

            cached_request = None

            while len(freelist):

                # Check the request limit
                if (self.request_limit is not None and
                        self.counters['request'] >= self.request_limit):
                    logging.debug('Request limit is reached: %s'
                                  % self.request_limit)
                    if len(freelist) == self.thread_number:
                        yield None
                    else:
                        break
                else:
                    try:
                        priority, task = self.taskq.get(True, 0.1)
                    except Empty:
                        # If all handlers are free and no tasks are queued,
                        # yield the None signal
                        if len(freelist) == self.thread_number:
                            yield None
                        else:
                            break
                    else:
                        if not self._preprocess_task(task):
                            continue

                        task.network_try_count += 1
                        if task.task_try_count == 0:
                            task.task_try_count = 1

                        if task.task_try_count > self.task_try_limit:
                            logging.debug('Task tries ended: %s / %s' % (
                                          task.name, task.url))
                            self.add_item('too-many-task-tries', task.url)
                            continue
                        
                        if task.network_try_count > self.network_try_limit:
                            logging.debug('Network tries ended: %s / %s' % (
                                          task.name, task.url))
                            self.add_item('too-many-network-tries', task.url)
                            continue

                        if task.grab:
                            grab = task.grab
                        else:
                            # Set up curl instance via Grab interface
                            grab = Grab(**self.grab_config)
                            grab.setup(url=task.url)

                        if self.use_cache and not task.get('disable_cache'):
                            if grab.detect_request_method() == 'GET':
                                url = grab.config['url']
                                utf_url = url.encode('utf-8') if isinstance(url, unicode) else url
                                if self.cache_key_hash:
                                    url_hash = sha1(utf_url).hexdigest()
                                else:
                                    url_hash = url
                                cache_item = self.cache.find_one({'_id': url_hash})
                                if cache_item:
                                    logging.debug('From cache: %s' % url)
                                    cached_request = (grab, grab.clone(),
                                                      task, cache_item)
                                    grab.prepare_request()
                                    self.inc_count('request-cache')

                                    # break from the pre-request loop
                                    # and go to the response-processing code
                                    break

                        self.inc_count('request-network')
                        if self.proxylist_config:
                            args, kwargs = self.proxylist_config
                            grab.setup_proxylist(*args, **kwargs)

                        curl = freelist.pop()
                        curl.grab = grab
                        curl.grab.curl = curl
                        curl.grab_original = grab.clone()
                        curl.grab.prepare_request()
                        curl.task = task
                        # Add configured curl instance to multi-curl processor
                        m.add_handle(curl)


            # If any network requests were started
            if len(freelist) != self.thread_number:
                while True:
                    status, active_objects = m.perform()
                    if status != pycurl.E_CALL_MULTI_PERFORM:
                        break

            if cached_request:
                grab, grab_original, task, cache_item = cached_request
                url = task.url  # or grab.config['url']
                grab.fake_response(cache_item['body'])

                if self.use_cache_compression:
                    body = zlib.decompress(cache_item['body']) 
                else:
                    body = cache_item['body'].encode('utf-8')
                def custom_prepare_response(g):
                    g.response.head = cache_item['head'].encode('utf-8')
                    g.response.body = body
                    g.response.code = cache_item['response_code']
                    g.response.time = 0
                    g.response.url = cache_item['url']
                    g.response.parse('utf-8')
                    g.response.cookies = g.extract_cookies()

                grab.process_request_result(custom_prepare_response)

                yield {'ok': True, 'grab': grab, 'grab_original': grab_original,
                       'task': task, 'ecode': None, 'emsg': None}
                self.inc_count('request')

            while True:
                queued_messages, ok_list, fail_list = m.info_read()

                results = []
                for curl in ok_list:
                    results.append((True, curl, None, None))
                for curl, ecode, emsg in fail_list:
                    results.append((False, curl, ecode, emsg))

                for ok, curl, ecode, emsg in results:
                    res = self.process_multicurl_response(ok, curl,
                                                          ecode, emsg)
                    m.remove_handle(curl)
                    freelist.append(curl)
                    yield res
                    self.inc_count('request')

                if not queued_messages:
                    break

            m.select(0.5)
Example #13
    def task_collect_adv_data(self, grab, task):
        self.stats['taken'] += 1
        one = []
        for i in grab.doc.select('//@href'):
            one.append(i.text())
        url_adv = re.findall(r'adv-\d+\.\w+', ','.join(one))[0::3]
        for one_adv in url_adv:
            self.stats['processed'] += 1
            addition = task.get('addition')
            g = Grab()
            g.go(DOMEN + one_adv)
            advert = Advert()
            extra_object = None  # referenced below even when no category matches
            advert.category_id = CATEGORIES[addition['category']]
            advert.city_id = self.city_id
            advert.author_id = self.author_id
            advert.link = DOMEN + one_adv
            categories = g.doc.select(
                '//table[@class="adv_info_table"]').text()

            if g.doc.select('//h2[@class="pagetitle"]').text():
                title = g.doc.select('//h2[@class="pagetitle"]').text()
                advert.title = title
            price = g.doc.select('//td[@class="adv-price"]').text()
            if price:
                price_search = re.findall('\d+', price)
                price_one = ""
                for i in price_search:
                    price_one += i
                advert.price_uah = int(price_one)
            else:
                advert.price_uah = 1
            if u"Описание:" in categories:
                text = g.doc.select(
                    '//td[@style="border-bottom:none;"][@colspan="2"]').text()
                if text:
                    advert.main_text = text
            if u"Телефон:" in categories:
                phones = re.sub(
                    u'Телефон:', "",
                    g.doc.select(
                        "//table[@class='adv_info_table']/tr[contains(.,'%s')]"
                        % u'Телефон').text())
                if phones:
                    phon = re.sub(r'\-', "", phones)
                    advert.raw_phones = phon
            if u"Имя, фамилия:" in categories:
                contact = re.sub(
                    u'Имя, фамилия:', "",
                    g.doc.select(
                        "//table[@class='adv_info_table']/tr[contains(.,'%s')]"
                        % u'Имя').text())
                if contact:
                    advert.contact_name = contact
            if advert.category_id in [21, 11, 12]:
                extra_object = ExtraFlat()
                if u"Этажность" in categories:
                    floors = re.sub(
                        u'Этажность ', "",
                        g.doc.select(
                            "//table[@class='adv_info_table']/tr[contains(.,'%s')]"
                            % u'Этажность').text())
                    if floors:
                        extra_object.floors = floors
                if u"Этаж" in categories:
                    floor = re.sub(
                        u'Этаж ', "",
                        g.doc.select(
                            "//table[@class='adv_info_table']/tr[contains(.,'%s')]"
                            % u'Этаж').text())
                    if floor:
                        extra_object.floor = floor
                if u"Количество комнат" in categories:
                    rooms_number = re.sub(
                        u'Количество комнат ', "",
                        g.doc.select(
                            "//table[@class='adv_info_table']/tr[contains(.,'%s')]"
                            % u'Количество комнат').text())
                    if rooms_number:
                        extra_object.rooms_number = rooms_number
                if u"Общая площадь" in categories:
                    area = re.sub(
                        u'Общая площадь ', "",
                        g.doc.select(
                            "//table[@class='adv_info_table']/tr[contains(.,'%s')]"
                            % u'Общая площадь').text())
                    area_search = re.search('\d+', area)
                    if area_search:
                        extra_object.total_area = area_search.group()
            if advert.category_id in [14]:
                extra_object = ExtraHouse()
                if u"Этажность" in categories:
                    floors = re.sub(
                        u'Этажность ', "",
                        g.doc.select(
                            "//table[@class='adv_info_table']/tr[contains(.,'%s')]"
                            % u'Этажность').text())
                    if floors:
                        extra_object.floors = floors
                if u"Общая площадь" in categories:
                    area = re.sub(
                        u'Общая площадь ', "",
                        g.doc.select(
                            "//table[@class='adv_info_table']/tr[contains(.,'%s')]"
                            % u'Общая площадь').text())
                    area_search = re.search('\d+', area)
                    if area_search:
                        extra_object.total_area = area_search.group()
            if advert.category_id in [16]:
                extra_object = ExtraLot()
                if u"Общая площадь" in categories:
                    area = re.sub(
                        u'Общая площадь ', "",
                        g.doc.select(
                            "//table[@class='adv_info_table']/tr[contains(.,'%s')]"
                            % u'Общая площадь').text())
                    area_search = re.search('\d+', area)
                    if area_search:
                        extra_object.total_area = area_search.group()
            if self.metro_marker:
                metroc = advert.detect_metro_id(self.metro_marker)
                if metroc:
                    advert.metro_id = metroc
                if u"Метро" in categories:
                    metro = re.sub(
                        u'Метро ', "",
                        g.doc.select(
                            "//table[@class='adv_info_table']/tr[contains(.,'%s')]"
                            % u'Метро').text())
                    if metro:
                        advert.metro_id = METRO_PREM[metro]
            if self.sublocality_marker:
                subloc = advert.detect_sublocality_id(self.sublocality_marker)
                if subloc:
                    advert.sublocality_id = subloc
                if u"Район" in categories:
                    subloc = re.sub(
                        u'Район ', "",
                        g.doc.select(
                            "//table[@class='adv_info_table']/tr[contains(.,'%s')]"
                            % u'Район').text())
                    if subloc:
                        advert.sublocality_id = SUB_PREM[subloc]
            same_adv = Advert.objects.filter(
                category_id=CATEGORIES[addition['category']],
                author_id=self.author_id,
                city_id=self.city_id,
                link=advert.link,
            ).first()
            if same_adv:
                self.stats['omited'] += 1
                if same_adv.date_of_update < (timezone.now() -
                                              datetime.timedelta(hours=20)):
                    same_adv.date_of_update = timezone.now()
                    same_adv.save()
                    self.stats['date_of_update'] += 1
                continue
            advert.save()
            photo_grab = g.clone()
            photo_grab.setup(proxy_auto_change=False, reuse_referer=False)
            img = []
            for i in g.doc.select(
                    '//a[@data-lightbox="advertisement-images"]/@href'):
                img.append(i.text())
            for photo in img:
                photo_name_except = re.search(r'\d{8}', photo).group()
                photo_link = '%s%s' % (DOMEN2, photo)
                photos = []
                photo_links2 = []  # collects photo links that failed to download
                sleep(0.2)
                try:
                    photo_grab.go(photo_link)
                    if photo_grab.response.code == 200 and \
                            re.match('image/', photo_grab.response.headers['Content-Type']):
                        photos.append({
                            'body':
                            photo_grab.response.body,
                            'extention':
                            RE_EXTENTION.search(
                                photo_grab.config['url']).group()
                        })
                except GrabNetworkError as error:
                    photo_links2.append(photo_link)
                photo = Photo(advert_id=advert.id)
                try:
                    file_name = '%s.%s' % (hashlib.md5(
                        photo_grab.config['url']).hexdigest(),
                                           photos[0]['extention'])
                except UnicodeEncodeError:
                    file_name = '%s.%s' % (hashlib.md5(
                        photo_name_except).hexdigest(), photos[0]['extention'])
                photo.photo.save(file_name, ContentFile(photos[0]['body']))
            if extra_object:
                extra_object.advert = advert
                extra_object.save()
            self.stats['saved'] += 1
Example #14
class ParserWithProxy(Spider):
    u"""Базовый класс парсера для работы с прокси"""

    USE_PROXY = True

    def __init__(self, country_code, *args, **kwargs):
        super(ParserWithProxy, self).__init__(*args, **kwargs)

        self.country = countries.get(alpha2=country_code)
        self.proxies = []
        self.used_proxies = set()

        self.grab = None
        self.grab_use_count = None

        self.reinit_grab()

        self.setup_queue(getattr(config, 'QUEUE_BACKEND', 'memory'))
        if getattr(config, 'CACHE_ENABLED', False):
            self.setup_cache('mongo', getattr(config, 'CACHE_DATABASE', 'cache'))

    def check_grab(self, grab):
        return True

    def reinit_grab(self):
        if not self.grab:
            self.grab = Grab()

        self.grab_use_count = 0

        while True:
            self.grab.clear_cookies()
            self.grab.setup(**self.get_next_proxy())
            if self.check_grab(self.grab):
                break
            logger.info(u'Bad proxy. Switching...')

    def get_grab(self):
        self.grab_use_count += 1

        if self.grab_use_count > config.PROXY_USE_LIMIT:
            self.reinit_grab()

        return self.grab.clone()

    def get_next_proxy(self):
        u"""Получение следующей неиспользованной прокси"""

        if not self.USE_PROXY:
            return {}

        while not self.proxies:
            # fetch proxies and filter out the already-used ones
            self.proxies = get_proxy_list(self.country.alpha2, 100)
            self.proxies = filter(
                lambda proxy: tuple(proxy.values()) not in self.used_proxies,
                self.proxies
            )
            if not self.proxies:
                logger.info(u'Out of proxies, waiting for new ones')
                sleep(10)
            else:
                break
        # return the first proxy
        proxy = self.proxies[0]
        self.used_proxies.add(tuple(proxy.values()))
        del self.proxies[0]
        return proxy
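Here each caller of get_grab() receives a clone snapshotting the currently configured proxy, while reinit_grab() rotates to a fresh proxy once PROXY_USE_LIMIT uses are reached; mutating the returned clone never affects the shared self.grab. A minimal usage sketch, assuming config and get_proxy_list from the surrounding project:

parser = ParserWithProxy('UA')  # ISO alpha-2 country code (assumed value)
g = parser.get_grab()           # clone carrying the current proxy setup
g.go('http://example.com/')     # uses the snapshotted proxy; parser.grab stays untouched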
Example #15
    def task_collect_adv_data(self, grab, task):
        self.stats['taken'] += 1
        one = []
        url_adv = []
        for i in grab.doc.select('//@href'):
            one.append(i.text())
        for item in one:
            if item.startswith('view.php?ad_id=') and item not in url_adv:
                url_adv.append(item)
            else:
                continue
        for one_adv in url_adv:
            self.stats['processed'] += 1
            addition = task.get('addition')
            advert = Advert()
            extra_object = None  # referenced below even when no category matches
            if self.city == 'kharkov':
                advert.category_id = CATEGORIES_khar[addition['category']]
            else:
                advert.category_id = CATEGORIES_kiev[addition['category']]
            advert.city_id = self.city_id
            advert.author_id = self.author_id
            advert.link = DOMEN[:13] + one_adv
            g = Grab()
            g.go(DOMEN[:13] + "print_" + one_adv)
            if g.doc.select("//td/p[contains(.,'%s')]" % u'Тел:').text():
                phones = g.doc.select("//td/p[contains(.,'%s')]" %
                                      u'Тел:').text()[5:]
                advert.raw_phones = phones
            g.go(DOMEN[:13] + one_adv)

            categories = g.doc.select(
                '//div[@style="font-size: 11px;"]').text()

            if g.doc.select('//h1').text():
                title = g.doc.select('//h1').text()
                advert.title = title
            if g.doc.select('//p[@class="ad-price"]').text():
                numlist = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
                price = ""
                for i in g.doc.select('//p[@class="ad-price"]').text():
                    if i in numlist:
                        price += i
                    else:
                        continue
                if u'грн' in g.doc.select('//p[@class="ad-price"]').text():
                    if price:
                        advert.price_uah = int(price)
                else:
                    if price:
                        advert.price_usd = int(price)
            if g.doc.select('//p[@class="ad-desc"]').text():
                text = g.doc.select('//p[@class="ad-desc"]').text()
                advert.main_text = text
            if advert.category_id in [21, 11, 27, 17]:
                extra_object = ExtraFlat()
                if u"Этаж" in categories:
                    separator1 = g.doc.select(
                        "//div[@style='font-size: 11px;']/p[contains(.,'%s')]"
                        % u'Этаж').text().find(":")
                    both = g.doc.select(
                        "//div[@style='font-size: 11px;']/p[contains(.,'%s')]"
                        % u'Этаж').text()[separator1 + 2:]
                    separator2 = both.find("/")
                    floors = both[separator2 + 2:]
                    floor = both[:separator2 - 1]
                    extra_object.floors = floors
                    extra_object.floor = floor
                if u"Комнат" in categories:
                    rooms_number = g.doc.select(
                        '//p[@class="ad-contacts"]').text()[-1]
                    extra_object.rooms_number = rooms_number
                if u"Общая" in categories:
                    separator = g.doc.select(
                        "//div[@style='font-size: 11px;']/p[contains(.,'%s')]"
                        % u'Общая площадь').text().find(":")
                    area = g.doc.select(
                        "//div[@style='font-size: 11px;']/p[contains(.,'%s')]"
                        % u'Общая площадь').text()[separator + 2:-6]
                    extra_object.total_area = area
            if advert.category_id in [14, 24]:
                extra_object = ExtraHouse()
                if u"Этажность" in categories:
                    floors = g.doc.select(
                        "//div[@style='font-size: 11px;']/p[contains(.,'%s')]"
                        % u'Этаж').text()[-1]
                    extra_object.floors = floors
                if u"Площадь дома" in categories:
                    separator = g.doc.select(
                        "//div[@style='font-size: 11px;']/p[contains(.,'%s')]"
                        % u'Площадь дома').text().find(":")
                    area = g.doc.select(
                        "//div[@style='font-size: 11px;']/p[contains(.,'%s')]"
                        % u'Площадь дома').text()[separator + 2:-6]
                    extra_object.total_area = area
                if u"Площадь участка" in categories:
                    separator = g.doc.select(
                        "//div[@style='font-size: 11px;']/p[contains(.,'%s')]"
                        % u'Площадь участка').text().find(":")
                    area = g.doc.select(
                        "//div[@style='font-size: 11px;']/p[contains(.,'%s')]"
                        % u'Площадь участка').text()[separator + 2:]
                    extra_object.lot_area = area
            if advert.category_id in [16]:
                extra_object = ExtraLot()
                if u"Площадь участка" in categories:
                    separator1 = g.doc.select(
                        "//div[@style='font-size: 11px;']/p[contains(.,'%s')]"
                        % u'Площадь участка').text().find(":")
                    area = g.doc.select(
                        "//div[@style='font-size: 11px;']/p[contains(.,'%s')]"
                        % u'Площадь участка').text()[separator1 + 2:]
                    extra_object.total_area = area
                if u"Под строительство" in categories:
                    granted = g.doc.select(
                        "//ul[@style='list-style-type: none']/li[contains(.,'%s')]"
                        % u'Под').text()
                    extra_object.intended_purpose = granted
            if g.doc.select("//h3").text():
                if "," in g.doc.select("//h3").text():
                    subloc = g.doc.select("//h3").text()
                else:
                    separator = g.doc.select("//h3").text().find(" ")
                    subloc = g.doc.select("//h3").text()[:separator]
                advert.sublocality_id = SUB_FN[subloc]
            advert.metro_id = advert.detect_metro_id(self.metro_marker)
            same_adv = Advert.objects.filter(
                category_id=advert.category_id,
                author_id=self.author_id,
                city_id=self.city_id,
                link=advert.link,
            ).first()
            if same_adv:
                self.stats['omited'] += 1
                if same_adv.date_of_update < (timezone.now() -
                                              datetime.timedelta(hours=20)):
                    same_adv.date_of_update = timezone.now()
                    same_adv.save()
                    self.stats['date_of_update'] += 1
                continue
            advert.save()
            photo_grab = g.clone()
            photo_grab.setup(proxy_auto_change=False, reuse_referer=False)
            img = []
            for i in g.doc.select('//@href'):
                if i.text().startswith("./upload/pics/"):
                    img.append(i.text())
                else:
                    continue
            for photo in img:
                photo_name_except = photo[14:-4]
                photo_link = '%s%s' % (DOMEN[:13], photo)
                photos = []
                photo_links2 = []
                sleep(0.2)
                try:
                    photo_grab.go(photo_link)
                    if photo_grab.response.code == 200 and \
                            re.match('image/', photo_grab.response.headers['Content-Type']):
                        photos.append({
                            'body':
                            photo_grab.response.body,
                            'extention':
                            RE_EXTENTION.search(
                                photo_grab.config['url']).group()
                        })
                except GrabNetworkError as error:
                    photo_links2.append(photo_link)
                photo = Photo(advert_id=advert.id)
                try:
                    file_name = '%s.%s' % (hashlib.md5(
                        photo_grab.config['url']).hexdigest(),
                                           photos[0]['extention'])
                except UnicodeEncodeError:
                    file_name = '%s.%s' % (hashlib.md5(
                        photo_name_except).hexdigest(), photos[0]['extention'])
                photo.photo.save(file_name, ContentFile(photos[0]['body']))
            if extra_object:
                extra_object.advert = advert
                extra_object.save()
            self.stats['saved'] += 1
Example #16
 def create_advert(self, raw):
     grab = Grab()
     self.stats['taken'] += 1
     self.stats['processed'] += 1
     # general fields
     extra_object = None
     adv = Advert()
     adv.city_id = self.city_id
     adv.author_id = self.author_id
     if 'priceArr' in raw:
         if raw['priceArr']['3']:
             price = re.sub(r'\s', '', raw['priceArr']['3'])
             adv.price_uah = int(price)
         else:
             adv.price_uah = 1
     if 'description' in raw:
         adv.main_text = raw['description']
     if 'user' in raw:
         adv.contact_name = raw['user']['name']
     if 'user_id' in raw:
         us_id = raw['user_id']
         p = Grab()
         p.go(URLS_ID % us_id)
         main = p.doc.select("//li[@class='fieldWrap']").text()
         adv.raw_phones = re.sub(r'\s|\(|\)|\-', '', main)
     if 'street_name' in raw:
         adv.street = raw['street_name']
     if 'beautiful_url' in raw:
         url_domria = raw['beautiful_url']
         adv.link = (LINK_DOM % url_domria)
     if raw['advert_type_id'] in [1]:
         if raw['realty_type_name'] in [u'квартира', u'Квартира']:
             extra_object = ExtraFlat()
             adv.category_id = CATEGORIES['prodazha-kvartir']
             titles = u'Продажа квартиры %s'
             if 'floors_count' in raw:
                 extra_object.floors = raw['floors_count']
             if 'floor' in raw:
                 extra_object.floor = raw['floor']
             if 'rooms_count' in raw:
                 extra_object.rooms_number = raw['rooms_count']
             if 'total_square_meters' in raw:
                 extra_object.total_area = raw['total_square_meters']
         if raw['realty_type_name'] in [u'дом', u'Дом']:
             extra_object = ExtraHouse()
             adv.category_id = CATEGORIES['prodazha-domov']
             titles = u'Продажа дома %s'
             if 'total_square_meters' in raw:
                 extra_object.total_area = raw['total_square_meters']
     if raw['advert_type_id'] in [3, 4]:
         if raw['realty_type_name'] in [u'квартира', u'Квартира']:
             extra_object = ExtraFlat()
             adv.category_id = CATEGORIES['arenda-kvartir']
             titles = u'Аренда квартиры %s'
             if 'floors_count' in raw:
                 extra_object.floors = raw['floors_count']
             if 'floor' in raw:
                 extra_object.floor = raw['floor']
             if 'rooms_count' in raw:
                 extra_object.rooms_number = raw['rooms_count']
             if 'total_square_meters' in raw:
                 extra_object.total_area = raw['total_square_meters']
         if raw['realty_type_name'] in [u'дом', u'Дом']:
             extra_object = ExtraHouse()
             adv.category_id = CATEGORIES['arenda-domov']
             titles = u'Аренда дома %s'
             if 'total_square_meters' in raw:
                 extra_object.total_area = raw['total_square_meters']
     if 'district_name' in raw:
          subloc_id = int(SUB_CIUA[raw['district_name']])
          adv.sublocality_id = subloc_id
         adv.title = (titles % adv.sublocality.name)
     if 'district_name' not in raw:
         if 'street_name' in raw:
             adv.title = (titles % raw['street_name'])
         else:
             adv.title = (titles % self.city)
     if self.metro_marker:
         if 'metro_station_name' in raw:
             metro_station = METRO_CIUA[raw['metro_station_name']]
             adv.metro_id = metro_station
     adv.save()
     photo_grab = grab.clone()
     photo_grab.setup(proxy_auto_change=False, reuse_referer=False)
     for key in raw['photos']:
         key_photo = raw['photos'][key]['file']
         photo_link = (PHOTO_URL % (key_photo.replace('.', 'f.')))
          photos = []
          photo_links2 = []  # collects photo links that failed to download
         sleep(0.2)
         try:
             photo_grab.go(photo_link)
             if photo_grab.response.code == 200 and \
                     re.match('image/', photo_grab.response.headers['Content-Type']):
                 photos.append({
                     'body':
                     photo_grab.response.body,
                     'extention':
                     RE_EXTENTION.search(photo_grab.config['url']).group()
                 })
         except GrabNetworkError as error:
             photo_links2.append(photo_link)
         photo = Photo(advert_id=adv.id)
         try:
             file_name = '%s.%s' % (hashlib.md5(
                 photo_grab.config['url']).hexdigest(),
                                    photos[0]['extention'])
             photo.photo.save(file_name, ContentFile(photos[0]['body']))
         except IndexError:
             pass
     if extra_object:
         extra_object.advert = adv
         extra_object.save()
     self.stats['saved'] += 1
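Note that this example builds grab = Grab() only to clone it later for the photo downloads; cloning a fresh, never-used Grab simply yields another instance with copied defaults. The equivalent minimal form, under that reading:

from grab import Grab

base = Grab()          # never performs a request itself
worker = base.clone()  # fresh instance with base's (default) config
worker.setup(proxy_auto_change=False, reuse_referer=False)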
Example #17
class Avito:
    PAGETRY = 3
    SLEEP = 1

    def __init__(self, proxy=None):
        self.PAGETRY, rc = startup.CFG.value("pagetry", Avito.PAGETRY).toInt()
        self.SLEEP, rc = startup.CFG.value("sleep", Avito.SLEEP).toInt()

        self.g = Grab()
        self.g.setup(hammer_mode=True, hammer_timeouts=((2, 5), (5, 10)))
        if __name__ == "__main__":
            self.g.setup(log_dir="dump")
        if proxy:
            self.g.load_proxylist(proxy, "text_file", "http", auto_init=True, auto_change=False)
        self.tess = Tesseract()
        self.tess.set_pagemode(TessPageSegMode.PSM_SINGLE_LINE)

    def _go3(self, url, tag):
        c = self.PAGETRY
        while c:
            try:
                self.g.change_proxy()
                self.g.go(url)
                self.g.assert_substring(u'content="51a6d66a02fb23c7"')
                break
            except:
                log.exception("%s left %i", tag, c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            raise Exception("%s error" % tag)

    def get_links(self, url):
        self._go3(url, "start page")
        c = 999
        while True:
            links = self.g.doc.select('//h3[@class="title"]/a/@href')
            if not links:
                raise Exception("no links")
            for link in links:
                c -= 1
                if not c:
                    return
                yield urljoin(url, link.text())
            next = self.g.doc.select('//a[@class="next"]/@href')
            if not next:
                log.debug("last page?")
                break
            QtCore.QThread.sleep(self.SLEEP)
            nurl = urljoin(url, next.text())
            log.debug("open next page %s", nurl)
            self._go3(nurl, "next page")

    def get_photos(self, photos):
        g = self.g.clone()
        datas = []
        for url in photos:
            if not url.startswith("http:"):
                url = "http:" + url
            c = self.PAGETRY
            while c:
                try:
                    rc = g.go(url)
                    g.assert_substring("JFIF", byte=True)
                    datas.append(rc.body)
                    break
                except Exception:
                    log.exception("get_item left %i", c)
                    c -= 1
                    g.change_proxy()
                    QtCore.QThread.sleep(self.SLEEP)
            else:
                raise Exception("get photo error")
        return datas

    def get_item(self, url):
        g = self.g.clone()
        c = self.PAGETRY
        while c:
            try:
                g.change_proxy()
                g.go(url)
                g.assert_substring(u'content="499bdc75d3636c55"')  # avitos ya id
                break
            except Exception:
                log.exception("get_item left %i", c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            raise Exception("get item error")
        doc = g.doc
        title = doc.select('//h1[@itemprop="name"]').text()
        gallery = doc.select('//div[@class="gallery-item"]')
        photos = [s.text() for s in gallery.select('.//a[@class="gallery-link"]/@href')]
        if not photos:
            egg = doc.select('//img[contains(@class,"j-zoom-gallery-init")]/@src')
            if egg:
                photos.append(egg.text())
        item = doc.select('//div[@itemprop="offers"]')
        # price = item.select('.//strong[@itemprop="price"]').text()
        price = item.select('.//span[@itemprop="price"]').text()
        name = item.select('.//strong[@itemprop="name"]').text()

        try:
            town = item.select('.//span[@id="toggle_map"]/span[@itemprop="name"]').text()
        except Exception:
            log.warning("xpath town not found, try another way")
            town = item.select('.//div[@id="map"]/span[@itemprop="name"]').text()
        # desc = item.select('.//div[@id="desc_text"]').text()
        desc = doc.select('//div[contains(@class,"description-text")]').text()
        # <span id="item_id">348967351</span>
        item_id = doc.select('//span[@id="item_id"]').text()
        _url = g.rex_text("avito.item.url = '([^>]+?)'")
        _phone = g.rex_text("avito.item.phone = '([^>]+?)'")

        loc = {"item_phone": _phone, "item": {"id": item_id, "url": _url}}
        log.debug("jslock enter <--")
        with PyV8.JSContext(loc) as ctx:
            ctx.enter()
            ctx.eval(
                """function phoneDemixer(key){var
    pre=key.match(/[0-9a-f]+/g),mixed=(item.id%2===0?pre.reverse():pre).join(''),s=mixed.length,r='',k;for(k=0;k<s;++k){if(k%3===0){r+=mixed.substring(k,k+1);}}
    return r;}"""
            )

            # http://www.avito.ru/items/phone/348967351?pkey=6fec07a9bb9902ad4ccbd87c240cfdc4
            egg = ctx.eval("'/items/phone/'+item.id+'?pkey='+phoneDemixer(item_phone)")
            # egg = ctx.eval("'/items/phone/'+item.url+'?pkey='+phoneDemixer(item_phone)")
            log.debug("js rc %s", egg)
            ctx.leave()
        log.debug("jslock leave -->")
        phone = ""
        c = self.PAGETRY
        while c:
            log.debug("read phone image")
            try:
                rc = g.go(urljoin(url, egg))
                img = Image.open(StringIO(rc.body))
                phone = self.tess.from_image(img, basewidth=300, whitelist="0123456789-")
                break
            except Exception:
                g.change_proxy()
                log.exception("get_phone left %i", c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            log.debug("get phone error")

        return dict(item=item_id, title=title, photos=photos, price=price, name=name, town=town, desc=desc, phone=phone)
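
A brief driver sketch for the class above, assuming the surrounding module provides the startup, log, and Qt plumbing it references; the proxy file and search URL are placeholders:

# Hypothetical usage: walk one search listing and fetch each item.
avito = Avito(proxy='proxies.txt')
for link in avito.get_links('http://www.avito.ru/moskva?q=velosiped'):
    item = avito.get_item(link)
    photos = avito.get_photos(item['photos'])
    print(item['title'], item['price'], item['phone'], len(photos))
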
Exemplo n.º 21
0
def volume_info(url_volume, url_ranobe):
    """Функция возвращает словарь, содержащий информацию о томе ранобе."""

    g = Grab()
    g.setup(hammer_mode=True)

    # Open the volume page
    g.go(url_volume)

    if g.response.code != 200:
        print("Page: {}, return code: {}".format(url_volume, g.response.code))
        return

    # Link to the volume's cover image
    url_cover_volume = get_url2full_image_cover(g.clone(), url_ranobe)

    # Get a list of two-column rows; each row holds some piece of volume
    # information: titles in several languages, series, author,
    # illustrator, etc.
    list_info = g.doc.select('//table[@id="release-info"]/tr/td[2]')
    # volume_ja_name = None  # Volume title in Japanese
    # volume_en_name = None  # Volume title in English
    volume_name = None
    series = None
    author = None
    illustrator = None
    volume_isbn = None
    # status = None  # Status (probably the translation status)
    tr_team = None
    translators = None

    try:
        # volume_ja_name = list_info[0].text()  # Volume title in Japanese
        # volume_en_name = list_info[1].text()  # Volume title in English
        volume_name = list_info[2].text()
        series = list_info[3].text()
        author = list_info[4].text()
        illustrator = list_info[5].text()
        volume_isbn = list_info[6].text()
        # status = list_info[7].text()  # Status (probably the translation status)
        tr_team = list_info[8].text()
        translators = list_info[9].text().split(', ')
    except IndexError:
        print("Not enough volume info fields: {}".format(url_volume))

    # Get the list of the volume's chapter pages from the table of contents
    volume_pages = get_volume_pages(g)
    # If there is no table of contents, skip the volume
    if not volume_pages:
        print("No table of contents: {}".format(url_volume))
        return

    # List of the volume's chapters
    chapters = list()

    # Other pages that do not belong to the volume's chapters
    other_pages = dict()

    # Dictionary holding the volume information
    info = {
        "name": volume_name,
        "series": series,
        "author": author,
        "illustrator": illustrator,
        "ISBN": volume_isbn,
        "url_cover": url_cover_volume,
        "pages": {
            "chapters": chapters,
            "other": other_pages,
        },
        "translation": {
            "team": tr_team,  # команда перевода
            "translators": translators,  # переводчики
        },
    }

    # Iterate over all of the volume's pages
    for page in volume_pages:
        # Name and address of a volume chapter
        name_ch, url_ch = page

        if not isinstance(url_ch, list):
            check = check_volume_page(url_ch)
            if check is False:
                return
            elif check is None:
                continue
        else:
            # Check the sub-chapters:
            for sub_ch in url_ch:
                sub_name, sub_url = sub_ch
                check = check_volume_page(sub_url)
                if check is False:
                    return
                elif check is None:
                    continue

        # Split the page list by page type:
        # chapters in one bucket, everything else in another.

        # If the address is a list of sub-chapters
        if isinstance(url_ch, list):
            # Append the list of sub-chapters
            chapters.append(page)
        else:
            # A volume page type can be "Opening Illustrations", "Prologue", the chapters themselves, etc.
            # The page types are described above this function.
            volume_base_page = get_volume_base_page(url_ch)

            # If the page type is a chapter:
            if type_pages_is_chapter(volume_base_page):
                # Append the chapter address to the list
                chapters.append(page)
            else:
                # Add this page, which is not a chapter, to the dictionary
                other_pages[volume_base_page] = page

    return info
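
A short usage sketch, assuming get_url2full_image_cover, get_volume_pages, and the other helpers referenced above live in the same module; both URLs are placeholders:

# Hypothetical volume and series URLs.
info = volume_info('http://example.org/wiki/SomeNovel:Volume_1',
                   'http://example.org/wiki/SomeNovel')
if info:
    print(info['name'], info['ISBN'])
    print('chapters:', len(info['pages']['chapters']))
    print('team:', info['translation']['team'])
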
Exemplo n.º 22
0
class Avito():
    PAGETRY = 3
    SLEEP = 1

    def __init__(self, proxy=None):
        self.PAGETRY, rc = startup.CFG.value('pagetry', Avito.PAGETRY).toInt()
        self.SLEEP, rc = startup.CFG.value('sleep', Avito.SLEEP).toInt()

        self.g = Grab()
        self.g.setup(hammer_mode=True, hammer_timeouts=((2, 5), (5, 10)))
        if __name__ == '__main__':
            self.g.setup(log_dir='dump')
        if proxy:
            self.g.load_proxylist(proxy,
                                  'text_file',
                                  'http',
                                  auto_init=True,
                                  auto_change=False)
        self.tess = Tesseract()
        self.tess.set_pagemode(TessPageSegMode.PSM_SINGLE_LINE)

    def _go3(self, url, tag):
        c = self.PAGETRY
        while c:
            try:
                self.g.change_proxy()
                self.g.go(url)
                self.g.assert_substring(u'content="51a6d66a02fb23c7"')
                break
            except Exception:
                log.exception('%s left %i', tag, c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            raise Exception('%s error' % tag)

    def get_links(self, url):
        self._go3(url, 'start page')
        c = 999
        while True:
            links = self.g.doc.select('//h3[@class="title"]/a/@href')
            if not links:
                raise Exception('no links')
            for link in links:
                c -= 1
                if not c:
                    return
                yield urljoin(url, link.text())
            next_link = self.g.doc.select('//a[@class="next"]/@href')
            if not next_link:
                log.debug('last page?')
                break
            QtCore.QThread.sleep(self.SLEEP)
            nurl = urljoin(url, next_link.text())
            log.debug('open next page %s', nurl)
            self._go3(nurl, 'next page')

    def get_photos(self, photos):
        g = self.g.clone()
        datas = []
        for url in photos:
            if not url.startswith('http:'):
                url = 'http:' + url
            c = self.PAGETRY
            while c:
                try:
                    rc = g.go(url)
                    g.assert_substring('JFIF', byte=True)
                    datas.append(rc.body)
                    break
                except Exception:
                    log.exception('get_item left %i', c)
                    c -= 1
                    g.change_proxy()
                    QtCore.QThread.sleep(self.SLEEP)
            else:
                raise Exception('get photo error')
        return datas

    def get_item(self, url):
        g = self.g.clone()
        c = self.PAGETRY
        while c:
            try:
                g.change_proxy()
                g.go(url)
                g.assert_substring(
                    u'content="499bdc75d3636c55"')  # avitos ya id
                break
            except Exception:
                log.exception('get_item left %i', c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            raise Exception('get item error')
        doc = g.doc
        title = doc.select('//h1[@itemprop="name"]').text()
        gallery = doc.select('//div[@class="gallery-item"]')
        photos = [
            s.text()
            for s in gallery.select('.//a[@class="gallery-link"]/@href')
        ]
        if not photos:
            egg = doc.select(
                '//img[contains(@class,"j-zoom-gallery-init")]/@src')
            if egg:
                photos.append(egg.text())
        item = doc.select('//div[@itemprop="offers"]')
        #price = item.select('.//strong[@itemprop="price"]').text()
        price = item.select('.//span[@itemprop="price"]').text()
        name = item.select('.//strong[@itemprop="name"]').text()

        try:
            town = item.select(
                './/span[@id="toggle_map"]/span[@itemprop="name"]').text()
        except Exception:
            log.warning('xpath town not found, try another way')
            town = item.select(
                './/div[@id="map"]/span[@itemprop="name"]').text()
        #desc = item.select('.//div[@id="desc_text"]').text()
        desc = doc.select(
            "//div[contains(@class,\"description-text\")]").text()
        #<span id="item_id">348967351</span>
        item_id = doc.select('//span[@id="item_id"]').text()
        _url = g.rex_text("avito.item.url = '([^>]+?)'")
        _phone = g.rex_text("avito.item.phone = '([^>]+?)'")

        loc = {'item_phone': _phone, 'item': {'id': item_id, 'url': _url}}
        log.debug('jslock enter <--')
        with PyV8.JSContext(loc) as ctx:
            ctx.enter()
            ctx.eval('''function phoneDemixer(key){var
    pre=key.match(/[0-9a-f]+/g),mixed=(item.id%2===0?pre.reverse():pre).join(''),s=mixed.length,r='',k;for(k=0;k<s;++k){if(k%3===0){r+=mixed.substring(k,k+1);}}
    return r;}''')

            #http://www.avito.ru/items/phone/348967351?pkey=6fec07a9bb9902ad4ccbd87c240cfdc4
            egg = ctx.eval(
                "'/items/phone/'+item.id+'?pkey='+phoneDemixer(item_phone)")
            #egg = ctx.eval("'/items/phone/'+item.url+'?pkey='+phoneDemixer(item_phone)")
            log.debug('js rc %s', egg)
            ctx.leave()
        log.debug('jslock leave -->')
        phone = ''
        c = self.PAGETRY
        while c:
            log.debug('read phone image')
            try:
                rc = g.go(urljoin(url, egg))
                img = Image.open(StringIO(rc.body))
                phone = self.tess.from_image(img,
                                             basewidth=300,
                                             whitelist='0123456789-')
                break
            except Exception:
                g.change_proxy()
                log.exception('get_phone left %i', c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            log.debug('get phone error')

        return dict(item=item_id,
                    title=title,
                    photos=photos,
                    price=price,
                    name=name,
                    town=town,
                    desc=desc,
                    phone=phone)
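
The PyV8 block above exists only to evaluate the site's phoneDemixer() snippet. Under the assumption that the JavaScript behaves exactly as written, the same mixing can be done in pure Python, dropping the PyV8 dependency; this is a hedged sketch, not part of the original scraper:

import re

def phone_demixer(key, item_id):
    # Pure-Python port of the phoneDemixer() JavaScript above.
    pre = re.findall(r'[0-9a-f]+', key)
    mixed = ''.join(reversed(pre) if int(item_id) % 2 == 0 else pre)
    return mixed[::3]  # keep the characters at indices where k % 3 == 0

# Would replace the PyV8 call:
# egg = '/items/phone/%s?pkey=%s' % (item_id, phone_demixer(_phone, item_id))
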
Exemplo n.º 23
0
logger_network = logging.getLogger('grab.network')

# Grab object settings

g = Grab()
capUrl = 'http://is.fssprus.ru/ajax_search'  # AJAX search endpoint (unused in this snippet)
url = 'http://fssprus.ru/iss/ip/'
g.setup(
    log_file='fs_grab_log.txt',
    log_dir='~/workspace/',
    debug_post=True,
    debug=True,
    verbose_logging=True,
)


g.go(url, method="POST")

if g.go(url, post={
        'variant': '1',
        'last_name': 'Антон',
        'first_name': 'Петров',
        }):
    g1 = g.clone()
    print(g1.request_method)
else:
    print("Something went wrong")


if int(input()) == 1:
    g2 = g1.clone()
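
A cleaner sketch of the same form submission, assuming the FSSP search accepts the field names shown above:

# Submit the search form once and inspect the response.
g = Grab()
response = g.go('http://fssprus.ru/iss/ip/', post={
    'variant': '1',
    'last_name': 'Антон',
    'first_name': 'Петров',
})
if response.code == 200:
    print('received', len(response.body), 'bytes')
else:
    print('unexpected return code:', response.code)
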
Exemplo n.º 24
0
class FreeLanceRu(Spider):
    PROJECT_BY_PID = 'http://www.free-lance.ru/projects/?pid=%d'
    INDEX_BY_PAGE = 'http://www.free-lance.ru/?page=%d'

    def __init__(self, pages_count=5, *args, **kwargs):
        self.pages_count = pages_count
        super(FreeLanceRu, self).__init__(*args, **kwargs)

    def prepare(self):
        self.grab = Grab()
        self.grab.setup(headers=additional_headers)

    def get_grab(self, url=None):
        grab = self.grab.clone()
        if url:
            grab.setup(url=url)
        return grab

    def get_task(self, **kwargs):
        url = None
        if 'url' in kwargs:
            url = kwargs['url']
            del kwargs['url']
        grab = self.get_grab(url=url)
        return Task(
                grab=grab,
                **kwargs
            )

    def task_generator(self):
        for index in range(self.pages_count):
            yield self.get_task(
                    name='page',
                    url=FreeLanceRu.INDEX_BY_PAGE % (index + 1)
                )

    def task_page(self, grab, task):
        pids = grab.xpath_list('//a[starts-with(@id, "prj_name_")]/@id')
        pids = [int(item.split('_')[-1]) for item in pids]
        for pid in pids:
            url = FreeLanceRu.PROJECT_BY_PID % (pid)
            if model.Project.query.filter_by(url=url).first():
                continue
            yield self.get_task(
                    name='project',
                    pid=pid,
                    url=url
                )

    def task_project(self, grab, task):
        project = None
        if grab.xpath_exists('//*[@class="contest-view"]'):
            project = self.parse_contest_view(grab, task)
        elif grab.xpath_exists('//*[@class="pay-prjct"]'):
            project = self.parse_pay_project(grab, task)
        else:
            project = self.parse_project(grab, task)

        if project:
            self.check_project(project)

    def parse_project(self, grab, task):
        project = {}
        #
        project['url'] = FreeLanceRu.PROJECT_BY_PID % (task.pid)
        #
        name = grab.xpath('//h1[@class="prj_name"]/text()')
        name = name.strip().encode('utf-8')
        project['name'] = name
        #
        date = grab.xpath('//*[@class="user-about-r"]/p/text()')
        date = date.split('[', 1)[0]
        date = date.strip().encode('utf-8')
        date = datetime.datetime.strptime(
                date,
                "%d.%m.%Y | %H:%M"
            )
        project['date'] = date
        #
        category = grab.rex(
                u'<p class="crumbs">Разделы: &#160;&#160; (.*?)(, |</p>)'
            )
        category = category.group(1)
        items = fromstring(category).xpath('./a/text()')
        if not items:
            items = category.split(' / ')
        category = items
        category = [a.strip().encode('utf-8') for a in category]
        project['category'] = category
        #
        description = grab.xpath('//*[@class="prj_text"]/text()')
        description = description.encode('utf-8')
        project['description'] = description
        #
        project['type'] = 'simple'
        #
        return project

    def parse_contest_view(self, grab, task):
        project = {}
        project['url'] = FreeLanceRu.PROJECT_BY_PID % (task.pid)
        project['type'] = 'contest'
        return project

    def parse_pay_project(self, grab, task):
        project = {}
        project['url'] = FreeLanceRu.PROJECT_BY_PID % (task.pid)
        project['type'] = 'pay'
        return project

    def check_project(self, project):
        if model.Project.query.filter_by(url=project['url']).first():
            return
        category = None
        if 'category' in project:
            category = self.get_category(project['category'])
        model.Project(
                name=project.get('name', None),
                url=project['url'],
                description=project.get('description', None),
                project_type=project['type'],
                category=category,
                date=project.get('date', None),
                site=model.free_lance_ru
            )
        session.commit()

    def get_category(self, path):
        path.reverse()
        category = None
        while path:
            category = model.Category.query.filter_by(
                    name=path.pop(),
                    parent=category,
                    site=model.free_lance_ru
                )
            category = category.first()
        return category
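
A minimal run sketch for the spider, assuming the model and session objects it imports are already configured; the thread count is an arbitrary choice:

# Crawl the five most recent index pages with two worker threads.
spider = FreeLanceRu(pages_count=5, thread_number=2)
spider.run()
print(spider.render_stats())
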