Example #3
class ShipsGenerator:
    def __init__(self, exit_event):
        self.exit_event = exit_event
        self.log = init_logger('ships_url_generator')
        self.was_pages = {}

        self.db = DbPg(self.log)
        self.rqueue = RabbitQueue(CRAWLER_EXCHANGE_NAME, CRAWLER_QUEUE_NAME)

        self.wait_queue()
        self.init_progress_table()
        self.get_ready_tasks()

    def wait_queue(self):
        while self.rqueue.count() > 0:
            self.log.info('Generator waiting ...')
            if self.exit_event.wait(10):
                break

    def get_ready_tasks(self):
        # Remember pages that were already crawled so run() can skip them.
        query = '''SELECT page_num FROM pages'''
        for row in self.db.get_query(query):
            self.was_pages[row[0]] = True
        self.log.debug(f'total ready tasks: {len(self.was_pages)}')

    def run(self):
        for i in range(MAX_PAGES):
            if self.exit_event.is_set():
                break

            if self.was_pages.get(i):
                continue

            msg = {'url': PAGE_URL.format(num=i),
                   'num': i}

            self.log.debug(f'[{i}]: queue size is: {self.rqueue.count()}')
            while self.rqueue.count() > MAX_QUEUE_SIZE:
                self.log.info('Queue too big, wait')
                if self.exit_event.wait(5):
                    return

            self.rqueue.publish(msg)
        self.log.info('all tasks are generated')

    def init_progress_table(self):
        query = '''CREATE TABLE IF NOT EXISTS pages (page_num integer UNIQUE)'''
        self.db.exec_query(query)
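
A minimal usage sketch for the generator above. The signal wiring is an assumption; the examples only show that exit_event supports wait() and is_set():

import signal
import threading

# Create the stop flag the generator polls and wire Ctrl+C to it
# (assumed setup, not shown in the original examples).
exit_event = threading.Event()
signal.signal(signal.SIGINT, lambda *_: exit_event.set())

generator = ShipsGenerator(exit_event)
generator.run()
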
Example #4
class ShipsGenerator:
    def __init__(self, exit_event):
        self.exit_event = exit_event

        self.was_pages = {}

        self.db = DbPg(logger=None)
        self.rqueue = RabbitQueue(CRAWLER_EXCHANGE_NAME, CRAWLER_QUEUE_NAME)

        self.wait_queue()
        self.init_progress_table()
        self.get_ready_tasks()

    def wait_queue(self):
        # Block until the queue drains or we are asked to exit.
        while self.rqueue.count() > 0:
            if self.exit_event.wait(10):
                break

    def get_ready_tasks(self):
        query = '''SELECT page_num FROM pages'''
        for row in self.db.get_query(query):
            self.was_pages[row[0]] = True

    def run(self):
        for i in range(MAX_PAGES):
            if self.exit_event.is_set():
                break

            if self.was_pages.get(i):
                continue

            msg = {'url': PAGE_URL.format(num=i),
                   'num': i}

            # Throttle: let consumers catch up before publishing more tasks.
            while self.rqueue.count() > MAX_QUEUE_SIZE:
                if self.exit_event.wait(5):
                    return
            self.rqueue.publish(msg)

    def init_progress_table(self):
        query = '''CREATE TABLE IF NOT EXISTS pages (page_num integer UNIQUE)'''
        self.db.exec_query(query)
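
Example #3 above and the CarGenerator below call an init_logger helper whose definition is not shown. A minimal stand-in, assuming the stdlib logging module:

import logging

def init_logger(name):
    # One stream handler per named logger; safe to call repeatedly.
    logger = logging.getLogger(name)
    if not logger.handlers:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter(
            '%(asctime)s %(name)s %(levelname)s: %(message)s'))
        logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)
    return logger
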
class CarGenerator:
    def __init__(self, exit_event):
        self.exit_event = exit_event
        self.log = init_logger('cars_url_generator')
        self.was_pages = {}
        self.db = DbPg(self.log)
        self.rqueue = RabbitQueue(CRAWLER_EXCHANGE_NAME, CRAWLER_QUEUE_NAME)
        self.wait_queue()
        self.init_progress_table()
        self.get_ready_tasks()

    def wait_queue(self):
        while self.rqueue.count() > 0:
            self.log.info('Generator waiting ...')
            if self.exit_event.wait(10):
                break

    def get_ready_tasks(self):
        for row in Pages.get_pages():
            self.was_pages[row[0]] = True
        self.log.debug(f'total ready tasks: {len(self.was_pages)}')

    def run(self):
        for i in range(MAX_PAGES):
            if self.exit_event.is_set():
                break

            if self.was_pages.get(i):
                continue

            msg = {'url': PAGE_URL.format(num=i), 'num': i}

            self.log.debug(f'[{i}]: queue size is: {self.rqueue.count()}')
            while self.rqueue.count() > MAX_QUEUE_SIZE:
                self.log.info('Queue too big, wait')
                if self.exit_event.wait(5):
                    return

            self.rqueue.publish(msg)
        self.log.info('all tasks are generated')

    def init_progress_table(self):
        # Reset progress via the ORM helpers (defined elsewhere in the project).
        delete_Pages()
        init_db()
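
CarGenerator swaps the raw SQL of the ships variants for ORM-style helpers (Pages.get_pages, delete_Pages, init_db) and the db_session used by the car workers below. Their definitions are not shown; one plausible shape, sketched with SQLAlchemy purely as an assumption (the Cars model would follow the same pattern):

from sqlalchemy import Column, Integer, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()
engine = create_engine('postgresql:///crawler')  # placeholder URL
db_session = sessionmaker(bind=engine)()

class Pages(Base):
    __tablename__ = 'pages'
    page_num = Column(Integer, primary_key=True)

    def __init__(self, page_num):
        self.page_num = page_num

    @staticmethod
    def get_pages():
        # Rows of (page_num,), matching the row[0] access in get_ready_tasks().
        return db_session.query(Pages.page_num).all()

def init_db():
    Base.metadata.create_all(engine)

def delete_Pages():
    Base.metadata.drop_all(engine, tables=[Pages.__table__])
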
Example #6
    def work(self, wnum):
        self.log.debug(f'{wnum} worker started')
        rab_connection = RabbitQueue(CRAWLER_EXCHANGE_NAME, CRAWLER_QUEUE_NAME)
        db_connection = DbPg(self.log)
        driver, prox = self.init_browser()

        for raw_msg in rab_connection.get_generator(self.exit_event):
            if not raw_msg:
                if self.exit_event.wait(2):
                    break
                continue

            msg = raw_msg.json()

            if 'url' not in msg:
                self.log.warning(f'{wnum}: bad task: {msg}')
                raw_msg.ack()
                continue

            if msg['num'] == 0:
                msg['url'] = msg['url'].split('?')[0]

            try:
                driver.get(msg['url'])

                self.log.debug(driver.current_url)
                time.sleep(3)

                # parse with Selenium (find_elements_by_* is the Selenium 3 API)
                rows = driver.find_elements_by_css_selector("tr")
                if not rows:
                    self.log.debug(f'{wnum}: no rows in table')
                    raw_msg.nack(requeue=True)
                    break

                for row in rows:
                    cells = row.find_elements_by_css_selector("td")
                    if not cells:
                        continue

                    data = {
                        'img_url': cells[0].find_element_by_css_selector('img').get_attribute('src'),
                        'country': cells[1].find_element_by_css_selector('span').get_attribute('title'),
                        'vessel_name': cells[1].text.split('\n')[0],
                        'vessel_type': cells[1].text.split('\n')[1],
                        'year': cells[2].text,
                        'gt': cells[3].text,
                        'dwt': cells[4].text,
                        'sz': cells[5].text
                    }
                    # 'sz' holds "length / width"; split it into two integers.
                    vlength, vwidth = [int(v.strip()) for v in data['sz'].split('/')]
                    self.log.debug(data)
                    ship = Ship(
                        sid=None,
                        name=data['vessel_name'],
                        country_name=data['country'],
                        description=f'{data["vessel_type"]}, {data["img_url"]}',
                        built_year=data['year'],
                        length=vlength,
                        width=vwidth,
                        gt=data['gt'],
                        dwt=data['dwt'])
                    db_connection.insert_ship(ship)
                db_connection.exec_query(f'''
                    INSERT INTO pages (page_num)
                    VALUES({msg['num']})
                ''')
                raw_msg.ack()
            except Exception as e0:
                self.log.error(f'{wnum}: get page error: {e0}')
                raw_msg.nack(requeue=True)
                if USE_PROXY:
                    self.proxy_gen.back_proxy(prox, str(e0))
                driver.close()
                driver, prox = self.init_browser()
            time.sleep(random.randrange(1, 5))

        rab_connection.close()
        db_connection.close()
        self.log.info(f'{wnum}: worker exit')
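
The Ship record built above is defined elsewhere in the project; only its constructor call is visible here. A hypothetical dataclass with that shape:

from dataclasses import dataclass
from typing import Optional

@dataclass
class Ship:
    # Field names mirror the keyword arguments used in the worker above;
    # the real definition (and DbPg.insert_ship) is not shown in these examples.
    sid: Optional[int]
    name: str
    country_name: str
    description: str
    built_year: str
    length: int
    width: int
    gt: str
    dwt: str
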
Example #7
    def work(self, wnum):
        self.log.debug(f'{wnum} worker started')
        rab_connection = RabbitQueue(CRAWLER_EXCHANGE_NAME, CRAWLER_QUEUE_NAME)
        db_connection = DbPg(logger=None)
        for raw_msg in rab_connection.get_generator(self.exit_event):
            if not raw_msg:
                if self.exit_event.wait(2):
                    break
                continue

            msg = raw_msg.json()

            if 'url' not in msg:
                self.log.warning(f'{wnum}: bad task: {msg}')
                raw_msg.ack()
                continue

            if msg['num'] == 0:
                msg['url'] = PAGE_URL0

            try:
                request = requests.get(msg['url'], headers=HEADERS).content
                soup = BeautifulSoup(request, 'html.parser')

                self.log.debug(msg['url'])
                time.sleep(1)

                names_list = []
                container_names = soup.select('div.information-container h2 a')
                for name in container_names:
                    names_list.append(name.text)

                links = []
                container_links = soup.select('div.information-container h2 a')
                for i in container_links:
                    ii = i['href'].split("&")[0]
                    full_link = ("https://www.autotrader.co.uk" + ii)
                    link = full_link.split('?')[0]
                    links.append(link)

                photos = []
                container_photo = soup.select(
                    'figure.listing-main-image a img')
                for link_photo in container_photo:
                    photos.append(link_photo['src'])

                list_price = []
                container_text = soup.find_all(
                    "a",
                    attrs={
                        "class": "js-click-handler listing-fpa-link listings-price-link tracking-standard-link"
                    })
                for i in container_text:
                    pr = i.find_all("div", attrs={"class": "vehicle-price"})
                    str_price = "".join(re.findall(r'[0-9]{,3},[0-9]{,3}', str(pr)))
                    # Fixed multiplier from the original example (presumably a
                    # hard-coded currency conversion).
                    price = 27 * int(str_price.replace(',', ''))
                    list_price.append(price)

                for n, l, f, p in zip(names_list, links, photos, list_price):
                    db_session.add(Cars(n, l, f, p))
                    db_session.commit()

                    # '\t'.join keeps all four fields; the original format string
                    # had fewer placeholders than arguments and silently dropped
                    # the price.
                    data = '\t'.join((n, l, f, str(p)))
                    self.log.debug(data)

                db_session.add(Pages(msg['num']))
                db_session.commit()
                raw_msg.ack()
            except Exception as e0:
                self.log.exception(f'{wnum}: get page error: {e0}')
                raw_msg.nack(requeue=True)
                # No browser/proxy in this requests-based variant.
                prox = None
                if USE_PROXY:
                    self.proxy_gen.back_proxy(prox, str(e0))

            time.sleep(random.randrange(1, 5))

        rab_connection.close()
        db_connection.close()
        self.log.info(f'{wnum}: worker exit')
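
The price extraction above is easy to check in isolation. A quick sketch with a made-up HTML fragment (the regex is copied verbatim from the example):

import re

# Made-up sample of the markup the worker scrapes.
pr = '<div class="vehicle-price">£12,345</div>'

str_price = "".join(re.findall(r'[0-9]{,3},[0-9]{,3}', str(pr)))
print(str_price)                             # 12,345
print(27 * int(str_price.replace(',', '')))  # 333315
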
Example #8
    def work(self, wnum):

        rab_connection = RabbitQueue(CRAWLER_EXCHANGE_NAME, CRAWLER_QUEUE_NAME)
        db_connection = DbPg(logger=None)
        driver, prox = self.init_browser()

        for raw_msg in rab_connection.get_generator(self.exit_event):
            if not raw_msg:
                if self.exit_event.wait(2):
                    break
                continue

            msg = raw_msg.json()

            if 'url' not in msg:
                raw_msg.ack()
                continue

            if msg['num'] == 0:
                msg['url'] = msg['url'].split('?')[0]

            try:
                driver.get(msg['url'])
                time.sleep(3)

                html = driver.page_source
                dom = lxml_html.fromstring(html)

                # parse the rendered page with lxml (not Selenium element lookups)
                rows = dom.cssselect("tr")
                if not rows:

                    raw_msg.nack(requeue=True)
                    break

                for row in rows:
                    cells = row.cssselect("td")
                    if not cells:
                        continue

                    data = {
                        'img_url': cells[0].cssselect('img')[0].get('src'),
                        'country': cells[1].cssselect('span')[0].get('title'),
                        'vessel_name': cells[1].cssselect('a')[0].text_content().strip(),
                        'vessel_type': cells[1].cssselect('small')[0].text_content().strip(),
                        'year': cells[2].text_content(),
                        'gt': cells[3].text_content(),
                        'dwt': cells[4].text_content(),
                        'sz': cells[5].text_content()
                    }
                    vlength, vwidth = [int(v.strip()) for v in data['sz'].split('/')]

                    ship = Ship(
                        sid=None,
                        name=data['vessel_name'],
                        country_name=data['country'],
                        description=f'{data["vessel_type"]}, {data["img_url"]}',
                        built_year=data['year'],
                        length=vlength,
                        width=vwidth,
                        gt=data['gt'],
                        dwt=data['dwt']
                    )
                    db_connection.insert_ship(ship)
                db_connection.exec_query(f'''
                    INSERT INTO pages (page_num)
                    VALUES({msg['num']})
                ''')
                raw_msg.ack()
            except Exception as e0:
                raw_msg.nack(requeue=True)
                if USE_PROXY:
                    self.proxy_gen.back_proxy(prox, str(e0))
                try:
                    driver.close()
                except Exception:
                    pass
                if not self.exit_event.is_set():
                    driver, prox = self.init_browser()
            time.sleep(random.randrange(1, 5))

        try:
            rab_connection.close()
            db_connection.close()
            driver.close()
        except Exception:
            pass
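
The progress insert above splices msg['num'] into the SQL with an f-string. Since the value arrives from the queue, a parameterized form is safer; this sketch assumes DbPg.exec_query can forward parameters psycopg-style, which the examples themselves do not show:

# Hypothetical parameterized variant of the progress insert.
db_connection.exec_query(
    'INSERT INTO pages (page_num) VALUES (%s) ON CONFLICT DO NOTHING',
    (msg['num'],),
)
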
Example #9
    def work(self, wnum):
        self.log.debug(f'{wnum} worker started')
        rab_connection = RabbitQueue(CRAWLER_EXCHANGE_NAME, CRAWLER_QUEUE_NAME)
        db_connection = DbPg(logger=None)
        for raw_msg in rab_connection.get_generator(self.exit_event):
            if not raw_msg:
                if self.exit_event.wait(2):
                    break
                continue

            msg = raw_msg.json()

            if 'url' not in msg:
                self.log.warning(f'{wnum}: bad task: {msg}')
                raw_msg.ack()
                continue

            if msg['num'] == 0:
                msg['url'] = PAGE_URL0

            try:
                request = requests.get(msg['url'], headers=HEADERS).content
                soup = BeautifulSoup(request, 'html.parser')

                self.log.debug(msg['url'])
                time.sleep(1)

                names_list = []
                container_names = soup.select('div.information-container h2 a')
                for name in container_names:
                    names_list.append(name.text)

                links = []
                container_links = soup.select('div.information-container h2 a')
                for i in container_links:
                    ii = i['href'].split("&")[0]
                    full_link = "https://www.autotrader.co.uk" + ii
                    link = full_link.split('?')[0]
                    links.append(link)

                photos = []
                container_photo = soup.select('figure.listing-main-image a img')
                for link_photo in container_photo:
                    photos.append(link_photo['src'])

                list_price = []
                container_text = soup.find_all("a", attrs={"class": "js-click-handler listing-fpa-link listings-price-link tracking-standard-link"})
                for i in container_text:
                    pr = i.find_all("div", attrs={"class": "vehicle-price"})
                    str_price = "".join(re.findall(r'[0-9]{,3},[0-9]{,3}', str(pr)))
                    price = 27 * int(str_price.replace(',', ''))
                    list_price.append(price)

                for n, l, f, p in zip(names_list, links, photos, list_price):
                    db_session.add(Cars(n, l, f, p))
                    db_session.commit()

                    # '\t'.join keeps all four fields; the original format string
                    # silently dropped the price.
                    data = '\t'.join((n, l, f, str(p)))
                    self.log.debug(data)

                db_session.add(Pages(msg['num']))
                db_session.commit()
                raw_msg.ack()
            except Exception as e0:
                self.log.exception(f'{wnum}: get page error: {e0}')
                raw_msg.nack(requeue=True)
                # No browser/proxy in this requests-based variant.
                prox = None
                if USE_PROXY:
                    self.proxy_gen.back_proxy(prox, str(e0))
            time.sleep(random.randrange(1, 5))

        rab_connection.close()
        db_connection.close()
        self.log.info(f'{wnum}: worker exit')
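
Each work(self, wnum) variant above is written to run as one of several concurrent workers draining the same queue. A hedged sketch of how they might be started (the Crawler name is a placeholder; only work() and exit_event appear in the examples):

import threading

exit_event = threading.Event()
crawler = Crawler(exit_event)  # placeholder for the class that defines work()

# One thread per worker number; all of them consume the shared RabbitMQ queue.
threads = [threading.Thread(target=crawler.work, args=(n,)) for n in range(4)]
for t in threads:
    t.start()

try:
    for t in threads:
        t.join()
except KeyboardInterrupt:
    exit_event.set()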