예제 #1
0
    def work(self, wnum):
        """Run one crawler worker: read page tasks from the queue and store ships.

        Every task is a JSON message that must contain ``url`` (tasks without
        it are logged and acknowledged) and ``num``.  The page is opened in the
        browser returned by ``self.init_browser()``; each ``<tr>`` that has
        ``<td>`` cells is converted into a ``Ship`` and inserted, after which
        the page number is written to the ``pages`` table and the message is
        acknowledged.

        If processing a page raises, the message is re-queued, the proxy is
        handed back to ``self.proxy_gen`` (when ``USE_PROXY`` is truthy) and
        the browser is replaced by a fresh one.  A page with no ``<tr>`` at
        all re-queues the message and stops the worker.

        The queue connection, the DB connection and the browser are always
        released when the method leaves, including on unexpected exceptions.

        Args:
            wnum: Worker identifier; used only as a prefix in log messages.
        """
        self.log.debug(f'{wnum} worker started')
        rab_connection = db_connection = driver = None
        prox = None

        try:
            rab_connection = RabbitQueue(CRAWLER_EXCHANGE_NAME, CRAWLER_QUEUE_NAME)
            db_connection = DbPg(self.log)
            driver, prox = self.init_browser()

            for raw_msg in rab_connection.get_generator(self.exit_event):
                if not raw_msg:
                    # Falsy item from the generator: wait up to 2 s for the
                    # exit event and stop as soon as it is set.
                    if self.exit_event.wait(2):
                        break
                    continue

                msg = raw_msg.json()

                if 'url' not in msg:
                    self.log.warning(f'{wnum}: bad task: {msg}')
                    raw_msg.ack()
                    continue

                # Page 0 is requested without its query string.
                if msg['num'] == 0:
                    msg['url'] = msg['url'].split('?')[0]

                try:
                    driver.get(msg['url'])

                    self.log.debug(driver.current_url)
                    time.sleep(3)

                    # parse with selenium
                    rows = driver.find_elements_by_css_selector("tr")
                    if not rows:
                        self.log.debug(f'{wnum}: not rows in table')
                        raw_msg.nack(requeue=True)
                        break

                    for row in rows:
                        cells = row.find_elements_by_css_selector("td")
                        if not cells:
                            continue

                        # First text line of the second cell is the vessel
                        # name, the second line is its type.
                        name_lines = cells[1].text.split('\n')
                        data = {
                            'img_url':
                            cells[0].find_element_by_css_selector(
                                'img').get_attribute('src'),
                            'country':
                            cells[1].find_element_by_css_selector(
                                'span').get_attribute('title'),
                            'vessel_name': name_lines[0],
                            'vessel_type': name_lines[1],
                            'year': cells[2].text,
                            'gt': cells[3].text,
                            'dwt': cells[4].text,
                            'sz': cells[5].text,
                        }
                        # 'sz' is expected to look like "<length> / <width>".
                        vlength, vwidth = [
                            int(v.strip()) for v in data['sz'].split('/')
                        ]
                        self.log.debug(data)
                        ship = Ship(
                            sid=None,
                            name=data['vessel_name'],
                            country_name=data['country'],
                            description=f'{data["vessel_type"]}, {data["img_url"]}',
                            built_year=data['year'],
                            length=vlength,
                            width=vwidth,
                            gt=data['gt'],
                            dwt=data['dwt'])
                        db_connection.insert_ship(ship)

                    # The value comes from a queue message and is interpolated
                    # into SQL text, so force it to an integer first.
                    page_num = int(msg['num'])
                    db_connection.exec_query(f'''
                        INSERT INTO pages (page_num)
                        VALUES({page_num})
                    ''')
                    raw_msg.ack()
                except Exception as e0:
                    self.log.error(f'{wnum}: get page error: {e0}')
                    raw_msg.nack(requeue=True)
                    if USE_PROXY:
                        self.proxy_gen.back_proxy(prox, str(e0))
                    # The browser may already be dead (often the cause of the
                    # error), so shutting it down must not kill the worker.
                    try:
                        driver.quit()
                    except Exception as quit_err:
                        self.log.warning(
                            f'{wnum}: failed to quit browser: {quit_err}')
                    # Forget the old browser so it is not released twice if
                    # init_browser() raises.
                    driver = None
                    driver, prox = self.init_browser()
                # Random 1-4 s pause between pages.
                time.sleep(random.randrange(1, 5))
        finally:
            # Release each resource separately so one failing call cannot
            # leak the others.
            cleanup = (
                ('queue connection', rab_connection, 'close'),
                ('db connection', db_connection, 'close'),
                ('browser', driver, 'quit'),
            )
            for label, resource, method in cleanup:
                if resource is None:
                    continue
                try:
                    getattr(resource, method)()
                except Exception as release_err:
                    self.log.warning(
                        f'{wnum}: failed to release {label}: {release_err}')
            self.log.info(f'{wnum}: worker exit')
예제 #2
0
    def work(self, wnum):
        """Run one crawler worker: read page tasks from the queue and store ships.

        Every task is a JSON message that must contain ``url`` (tasks without
        it are acknowledged and skipped) and ``num``.  The page is loaded in
        the browser returned by ``self.init_browser()``, its HTML is parsed
        with lxml, each ``<tr>`` that has ``<td>`` cells is converted into a
        ``Ship`` and inserted, then the page number is written to the
        ``pages`` table and the message is acknowledged.

        If processing a page raises, the message is re-queued, the proxy is
        handed back to ``self.proxy_gen`` (when ``USE_PROXY`` is truthy) and,
        unless the exit event is set, the browser is replaced by a fresh one.
        A page with no ``<tr>`` at all re-queues the message and stops the
        worker.

        Args:
            wnum: Worker identifier; unused in this variant, kept for
                interface compatibility.
        """
        rab_connection = RabbitQueue(CRAWLER_EXCHANGE_NAME, CRAWLER_QUEUE_NAME)
        db_connection = DbPg(logger=None)
        driver = None

        try:
            driver, prox = self.init_browser()

            for raw_msg in rab_connection.get_generator(self.exit_event):
                if not raw_msg:
                    # Falsy item from the generator: wait up to 2 s for the
                    # exit event and stop as soon as it is set.
                    if self.exit_event.wait(2):
                        break
                    continue

                msg = raw_msg.json()

                if 'url' not in msg:
                    raw_msg.ack()
                    continue

                # Page 0 is requested without its query string.
                if msg['num'] == 0:
                    msg['url'] = msg['url'].split('?')[0]

                try:
                    driver.get(msg['url'])
                    time.sleep(3)

                    # Parse the rendered page source with lxml.
                    html = driver.page_source
                    dom = lxml_html.fromstring(html)

                    rows = dom.cssselect("tr")
                    if not rows:
                        raw_msg.nack(requeue=True)
                        break

                    for row in rows:
                        cells = row.cssselect("td")
                        if not cells:
                            continue

                        data = {
                            'img_url': cells[0].cssselect('img')[0].get('src'),
                            'country': cells[1].cssselect('span')[0].get('title'),
                            'vessel_name': cells[1].cssselect('a')[0].text_content().strip(),
                            'vessel_type': cells[1].cssselect('small')[0].text_content().strip(),
                            'year': cells[2].text_content(),
                            'gt': cells[3].text_content(),
                            'dwt': cells[4].text_content(),
                            'sz': cells[5].text_content(),
                        }
                        # 'sz' is expected to look like "<length> / <width>".
                        vlength, vwidth = [int(v.strip()) for v in data['sz'].split('/')]

                        ship = Ship(
                            sid=None,
                            name=data['vessel_name'],
                            country_name=data['country'],
                            description=f'{data["vessel_type"]}, {data["img_url"]}',
                            built_year=data['year'],
                            length=vlength,
                            width=vwidth,
                            gt=data['gt'],
                            dwt=data['dwt']
                        )
                        db_connection.insert_ship(ship)

                    # The value comes from a queue message and is interpolated
                    # into SQL text, so force it to an integer first.
                    page_num = int(msg['num'])
                    db_connection.exec_query(f'''
                        INSERT INTO pages (page_num)
                        VALUES({page_num})
                    ''')
                    raw_msg.ack()
                except Exception as e0:
                    raw_msg.nack(requeue=True)
                    if USE_PROXY:
                        self.proxy_gen.back_proxy(prox, str(e0))
                    # Best effort: the browser may already be dead.  Only
                    # ordinary errors are ignored, so KeyboardInterrupt and
                    # SystemExit still propagate.
                    try:
                        driver.quit()
                    except Exception:
                        pass
                    # Forget the old browser so it is not released twice.
                    driver = None
                    if not self.exit_event.is_set():
                        driver, prox = self.init_browser()
                # Random 1-4 s pause between pages.
                time.sleep(random.randrange(1, 5))
        finally:
            # Release each resource separately so one failing call cannot
            # leak the others; failures here are deliberately ignored.
            cleanup = (
                (rab_connection, 'close'),
                (db_connection, 'close'),
                (driver, 'quit'),
            )
            for resource, method in cleanup:
                if resource is None:
                    continue
                try:
                    getattr(resource, method)()
                except Exception:
                    pass
예제 #3
0
    def work(self, wnum):
        """Run one crawler worker: fetch listing pages and store the cars found.

        Every task is a JSON message that must contain ``url`` (tasks without
        it are logged and acknowledged) and ``num``; for ``num == 0`` the URL
        is replaced by ``PAGE_URL0``.  The page is downloaded with
        ``requests`` and parsed with BeautifulSoup.  Names, links, photos and
        prices are collected into parallel lists, zipped together, and each
        tuple is stored as a ``Cars`` row.  Finally the page number is stored
        as a ``Pages`` row and the message is acknowledged.

        If processing a page raises, the error is logged with its traceback
        and the message is re-queued.

        Args:
            wnum: Worker identifier; used only as a prefix in log messages.
        """
        self.log.debug(f'{wnum} worker started')
        rab_connection = RabbitQueue(CRAWLER_EXCHANGE_NAME, CRAWLER_QUEUE_NAME)
        # NOTE(review): this connection is never used or closed below; kept
        # because the constructor's side effects are not visible here.
        db_connection = DbPg(logger=None)
        # Seconds to wait on the HTTP request; without a timeout a stalled
        # server would block the worker indefinitely.
        request_timeout = 30

        try:
            for raw_msg in rab_connection.get_generator(self.exit_event):
                if not raw_msg:
                    # Falsy item from the generator: wait up to 2 s for the
                    # exit event and stop as soon as it is set.
                    if self.exit_event.wait(2):
                        break
                    continue

                msg = raw_msg.json()
                self.log.debug(f'{wnum}: task: {msg}')

                if 'url' not in msg:
                    self.log.warning(f'{wnum}: bad task: {msg}')
                    raw_msg.ack()
                    continue

                if msg['num'] == 0:
                    msg['url'] = PAGE_URL0

                try:
                    request = requests.get(
                        msg['url'], headers=HEADERS,
                        timeout=request_timeout).content
                    soup = BeautifulSoup(request, 'html.parser')

                    self.log.debug(msg['url'])
                    time.sleep(1)

                    # Title anchors carry both the car name and its link.
                    anchors = soup.select('div.information-container h2 a')
                    names_list = [anchor.text for anchor in anchors]
                    links = []
                    for anchor in anchors:
                        # Cut the href at the first '&', prefix the site
                        # root, then drop any remaining query string.
                        path = anchor['href'].split("&")[0]
                        full_link = "https://www.autotrader.co.uk" + path
                        links.append(full_link.split('?')[0])

                    photos = [
                        img['src']
                        for img in soup.select('figure.listing-main-image a img')
                    ]

                    list_price = []
                    container_text = soup.find_all(
                        "a",
                        attrs={
                            "class":
                            "js-click-handler listing-fpa-link listings-price-link tracking-standard-link"
                        })
                    for price_link in container_text:
                        pr = price_link.find_all(
                            "div", attrs={"class": "vehicle-price"})
                        # Join the comma-grouped digit runs, e.g. "12,345".
                        str_price = "".join(
                            re.findall(r'[0-9]{,3},[0-9]{,3}', str(pr)))
                        # NOTE(review): 27 is presumably a currency
                        # conversion rate - confirm.
                        price = 27 * int(str_price.replace(',', ''))
                        list_price.append(price)

                    # NOTE(review): db_session looks like a SQLAlchemy
                    # session; a failed commit may need a rollback - confirm.
                    for n, l, f, p in zip(names_list, links, photos, list_price):
                        db_session.add(Cars(n, l, f, p))
                        db_session.commit()

                        # Tab-separated record, including the price.
                        data = f'{n}\t{l}\t{f}\t{p}'
                        self.log.debug(data)

                    db_session.add(Pages(msg['num']))
                    db_session.commit()
                    raw_msg.ack()
                except Exception as e0:
                    # log.exception needs the message as its argument and
                    # records the traceback as well.
                    self.log.exception(f'{wnum}: get page error: {e0}')
                    raw_msg.nack(requeue=True)
                    # This variant has no browser or proxy of its own, so
                    # None is what gets handed back.
                    prox = None
                    if USE_PROXY:
                        self.proxy_gen.back_proxy(prox, str(e0))

                # Random 1-4 s pause between pages.
                time.sleep(random.randrange(1, 5))
        finally:
            # Close the queue connection even if an unexpected error escapes.
            rab_connection.close()
            self.log.info(f'{wnum}: worker exit')
예제 #4
0
    def work(self, wnum):
        """Run one crawler worker: fetch listing pages and store the cars found.

        Every task is a JSON message that must contain ``url`` (tasks without
        it are logged and acknowledged) and ``num``; for ``num == 0`` the URL
        is replaced by ``PAGE_URL0``.  The page is downloaded with
        ``requests`` and parsed with BeautifulSoup.  Names, links, photos and
        prices are collected into parallel lists, zipped together, and each
        tuple is stored as a ``Cars`` row.  Finally the page number is stored
        as a ``Pages`` row and the message is acknowledged.

        If processing a page raises, the error is logged with its traceback
        and the message is re-queued.

        Args:
            wnum: Worker identifier; used only as a prefix in log messages.
        """
        self.log.debug(f'{wnum} worker started')
        rab_connection = RabbitQueue(CRAWLER_EXCHANGE_NAME, CRAWLER_QUEUE_NAME)
        # NOTE(review): this connection is never used or closed below (the
        # close call had been commented out); kept because the constructor's
        # side effects are not visible here.
        db_connection = DbPg(logger=None)
        # Seconds to wait on the HTTP request; without a timeout a stalled
        # server would block the worker indefinitely.
        request_timeout = 30

        try:
            for raw_msg in rab_connection.get_generator(self.exit_event):
                if not raw_msg:
                    # Falsy item from the generator: wait up to 2 s for the
                    # exit event and stop as soon as it is set.
                    if self.exit_event.wait(2):
                        break
                    continue

                msg = raw_msg.json()
                self.log.debug(f'{wnum}: task: {msg}')

                if 'url' not in msg:
                    self.log.warning(f'{wnum}: bad task: {msg}')
                    raw_msg.ack()
                    continue

                if msg['num'] == 0:
                    msg['url'] = PAGE_URL0

                try:
                    request = requests.get(
                        msg['url'], headers=HEADERS,
                        timeout=request_timeout).content
                    soup = BeautifulSoup(request, 'html.parser')

                    self.log.debug(msg['url'])
                    time.sleep(1)

                    # Title anchors carry both the car name and its link.
                    anchors = soup.select('div.information-container h2 a')
                    names_list = [anchor.text for anchor in anchors]
                    links = []
                    for anchor in anchors:
                        # Cut the href at the first '&', prefix the site
                        # root, then drop any remaining query string.
                        path = anchor['href'].split("&")[0]
                        full_link = "https://www.autotrader.co.uk" + path
                        links.append(full_link.split('?')[0])

                    photos = [
                        img['src']
                        for img in soup.select('figure.listing-main-image a img')
                    ]

                    list_price = []
                    container_text = soup.find_all(
                        "a",
                        attrs={
                            "class":
                            "js-click-handler listing-fpa-link listings-price-link tracking-standard-link"
                        })
                    for price_link in container_text:
                        pr = price_link.find_all(
                            "div", attrs={"class": "vehicle-price"})
                        # Join the comma-grouped digit runs, e.g. "12,345".
                        str_price = "".join(
                            re.findall(r'[0-9]{,3},[0-9]{,3}', str(pr)))
                        # NOTE(review): 27 is presumably a currency
                        # conversion rate - confirm.
                        price = 27 * int(str_price.replace(',', ''))
                        list_price.append(price)

                    # NOTE(review): db_session looks like a SQLAlchemy
                    # session; a failed commit may need a rollback - confirm.
                    for n, l, f, p in zip(names_list, links, photos, list_price):
                        db_session.add(Cars(n, l, f, p))
                        db_session.commit()

                        # Tab-separated record, including the price.
                        data = f'{n}\t{l}\t{f}\t{p}'
                        self.log.debug(data)

                    db_session.add(Pages(msg['num']))
                    db_session.commit()
                    raw_msg.ack()
                except Exception as e0:
                    # log.exception needs the message as its argument and
                    # records the traceback as well.
                    self.log.exception(f'{wnum}: get page error: {e0}')
                    raw_msg.nack(requeue=True)
                    # This variant has no browser or proxy of its own, so
                    # None is what gets handed back.
                    prox = None
                    if USE_PROXY:
                        self.proxy_gen.back_proxy(prox, str(e0))

                # Random 1-4 s pause between pages.
                time.sleep(random.randrange(1, 5))
        finally:
            # Close the queue connection even if an unexpected error escapes.
            rab_connection.close()
            self.log.info(f'{wnum}: worker exit')