Example #1
def crawl_holders(data):
    logger.info('----crawling {} holders----'.format(data['companyName']))
    for i in data['holders']:
        if re.search(r'firm_(\w+).html',
                     i['url']) and i['name'] not in crawled:
            try:
                q = crawl_from_qichacha(i['name'], i['url'], {})
            except NeedValidationError as e:
                raise e
            except Exception as e:
                logger.info('error: {}, {}'.format(i['name'], i['url']))
                has_error.append((i['name'], i['url']))
                crawled.append(i['name'])
            else:
                if q['overview']['stock_code']:
                    crawl_stock(q['overview']['stock_code'])
                logger.info('crawl: {}'.format(i['name']))
                crawled.append(i['name'])
    logger.info('----crawling {} holders end----'.format(data['companyName']))
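crawl_holders only reads data['companyName'] and the name/url fields of each entry in data['holders'], and it skips holders whose URL does not match the firm_<id>.html pattern. Below is a minimal sketch of a call with placeholder values; the company names and the firm id are illustrative only, and it assumes the module-level crawled and has_error lists plus crawl_from_qichacha and crawl_stock are in scope as above.

# Placeholder input for crawl_holders(); all values here are illustrative.
sample_data = {
    'companyName': 'Sample Parent Co.',
    'holders': [
        {
            'name': 'Sample Holder Co.',
            'url': 'https://www.qichacha.com/firm_0123456789abcdef.html',
        },
    ],
}

crawl_holders(sample_data)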
Example #2
    def run(self):
        global single_time_crawled, need_validate

        while 1:
            try:
                # raises queue.Empty if the crawl queue has no data for 15 s
                unique, name, level = self.wait_crawl_q.get(timeout=15)
                logger1.info('+++++{} crawling ({})'.format(
                    self.thread_name, name))

                url = 'https://www.qichacha.com/firm_' + unique + '.html'

                # proxies are not used for now
                proxy = None

                # add a random delay
                time.sleep(random.uniform(1, 2))

                try:
                    qichacha, html = crawl_from_qichacha(name, url, proxy)

                # validation (CAPTCHA) error
                except NeedValidationError as e:
                    # wait two seconds, then retry
                    time.sleep(2)

                    try:
                        qichacha, html = crawl_from_qichacha(name, url, proxy)
                    except NeedValidationError as e:
                        # if validation is still required, do not add this
                        # company to the write queue

                        self.wait_crawl_q.task_done()

                        need_validate = True

                        # drain the crawl queue
                        logger1.error(
                            '===!!{} get Need Validation Error, clearing wait_crawl_q'
                            .format(self.thread_name))
                        while not self.wait_crawl_q.empty():
                            try:
                                self.wait_crawl_q.get_nowait()
                                self.wait_crawl_q.task_done()
                            except queue.Empty:
                                logger1.error(
                                    '!!!!!{} get Empty Error when clear wait_crawl_q'
                                    .format(self.thread_name))
                        logger1.error(
                            '===!!{} clear wait_crawl_q finished'.format(
                                self.thread_name))

                        continue

                # any other crawling error
                except Exception as e:
                    logger1.error('!!!!!{} crawl ({}, {}) error!!!!!'.format(
                        self.thread_name, name, url))
                    logger1.exception(e)

                    # items put on the write queue come in two kinds:
                    # flag = 0: update the record's crawled_date and has_error
                    # flag = 1: insert a new record with unique, name and level

                    # update this company's crawled_date and has_error
                    flag = 0
                    crawled_date = datetime.date.today()
                    has_error = 1
                    wait_write_q_item = (flag, unique, name, level,
                                         crawled_date, has_error)
                    self.wait_write_q.put(wait_write_q_item)

                    self.wait_crawl_q.task_done()

                    single_time_crawled += 1

                    continue

                # no error occurred
                else:
                    logger1.info('+++++{} crawled ({})'.format(
                        self.thread_name, name))

                    flag = 0
                    crawled_date = datetime.date.today()
                    has_error = 0

                    eastmoney, cninfo = '', ''
                    # if there is a stock code, also crawl the stock information
                    if qichacha['overview']['stock_code']:
                        try:
                            eastmoney, cninfo = crawl_stock(
                                qichacha['overview']['stock_code'])
                        except Exception as e:
                            # marks that crawling this company's stock information failed
                            has_error = 2

                            logger1.exception(e)
                            logger1.error('crawl stock {} error'.format(
                                qichacha['overview']['stock_code']))

                    # update this company's crawled_date and has_error
                    wait_write_q_item = (flag, unique, name, level,
                                         crawled_date, has_error)
                    self.wait_write_q.put(wait_write_q_item)

                    # store the crawl result in MongoDB
                    document = {
                        'unique': unique,
                        'company': qichacha['companyName'],
                        'html': html,
                        'qichacha': qichacha,
                        'eastmoney': eastmoney,
                        'cninfo': cninfo,
                        'crawl_time': str(datetime.date.today()),
                        'store_time': ''
                    }
                    self.mongo_collection.insert_one(document)

                    # depending on this company's level, queue its holders
                    # and/or investments for writing

                    # level == 0: root company, queue both holders and investments
                    if level == 0:
                        for holder in qichacha['holders']:
                            new_unique = re.search(r'firm_(\w+).html',
                                                   holder['url'])
                            if not new_unique:
                                continue

                            # insert the shareholder record
                            flag = 1
                            new_unique = new_unique.group(1)
                            new_name = holder['name']
                            new_level = level - 1
                            new_crawled_date = '2000-01-01'
                            new_has_error = ''
                            wait_write_q_item = (flag, new_unique, new_name,
                                                 new_level, new_crawled_date,
                                                 new_has_error)
                            self.wait_write_q.put(wait_write_q_item)

                        for investment in qichacha['investments']:
                            new_unique = re.search(r'firm_(\w+).html',
                                                   investment['url'])
                            if not new_unique:
                                continue

                            # insert the investment record
                            flag = 1
                            new_unique = new_unique.group(1)
                            new_name = investment['name']
                            new_level = level + 1
                            new_crawled_date = '2000-01-01'
                            new_has_error = ''
                            wait_write_q_item = (flag, new_unique, new_name,
                                                 new_level, new_crawled_date,
                                                 new_has_error)
                            self.wait_write_q.put(wait_write_q_item)

                    # -2 < level < 0: queue holders only
                    elif -2 < level < 0:
                        for holder in qichacha['holders']:
                            new_unique = re.search(r'firm_(\w+).html',
                                                   holder['url'])
                            if not new_unique:
                                continue

                            # insert the shareholder record
                            flag = 1
                            new_unique = new_unique.group(1)
                            new_name = holder['name']
                            new_level = level - 1
                            new_crawled_date = '2000-01-01'
                            new_has_error = ''
                            wait_write_q_item = (flag, new_unique, new_name,
                                                 new_level, new_crawled_date,
                                                 new_has_error)
                            self.wait_write_q.put(wait_write_q_item)

                    # 0 < level < 6: queue investments only
                    elif 0 < level < 6:
                        for investment in qichacha['investments']:
                            new_unique = re.search(r'firm_(\w+).html',
                                                   investment['url'])
                            if not new_unique:
                                continue

                            # insert the investment record
                            flag = 1
                            new_unique = new_unique.group(1)
                            new_name = investment['name']
                            new_level = level + 1
                            new_crawled_date = '2000-01-01'
                            new_has_error = ''
                            wait_write_q_item = (flag, new_unique, new_name,
                                                 new_level, new_crawled_date,
                                                 new_has_error)
                            self.wait_write_q.put(wait_write_q_item)

                    # level == -2 or level == 6: only the company itself is written
                    elif level == -2 or level == 6:
                        # no need to process holders or investments
                        pass

                self.wait_crawl_q.task_done()

                single_time_crawled += 1

            except queue.Empty:
                logger1.info('+++++{}: No data in wait_crawl_q'.format(
                    self.thread_name))
                logger1.info('+++++{} end+++++'.format(self.thread_name))

                return
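The class that owns this run() method is not shown in the listing. Assuming it is a threading.Thread subclass whose constructor stores wait_crawl_q, wait_write_q, mongo_collection and thread_name (the only attributes run() touches), the workers might be wired up roughly as below; CrawlThread and its constructor signature are hypothetical names for that missing class.

import queue

wait_crawl_q = queue.Queue()
wait_write_q = queue.Queue()

# Seed the crawl queue with (unique, name, level) tuples; level 0 is the root
# company, negative levels walk up through holders, positive levels walk down
# through investments.
wait_crawl_q.put(('bc79a2ed616358340df33a6155d399c1', '贵州汇生林业开发有限公司', 0))

# mongo_collection: a pymongo collection created elsewhere (not shown here).
threads = [
    CrawlThread(wait_crawl_q, wait_write_q, mongo_collection,
                thread_name='crawl-thread-{}'.format(i))
    for i in range(4)
]
for t in threads:
    t.start()

# run() calls task_done() for every item it takes, so join() returns once the
# queue has been fully processed (or drained after a validation error).
wait_crawl_q.join()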
Example #3
    def run(self):
        global single_time_crawled, need_validate

        while 1:
            try:
                # wait up to 15 s; raises queue.Empty if the crawl queue has no data
                name, unique = self.wait_crawl_q.get(timeout=15)
                # logger.info('+++++{} get ({}, {}) from wait_crawl_q'.format(self.name, name, unique))
                if unique:
                    url = 'https://www.qichacha.com/firm_' + unique + '.html'
                else:
                    url = ''

                # proxies are not used for now
                proxy = None

                # add a random delay
                time.sleep(random.uniform(2, 3))

                try:
                    qichacha = crawl_from_qichacha(name, url, proxy)
                except NeedValidationError as e:
                    wait_write_q_item = (name, unique, 1, 0)
                    self.wait_write_q.put(wait_write_q_item)
                    logger1.info(
                        '+++++{} put ({}, {}, {}, {}) into wait_write_q, remain: {}'
                        .format(self.name, *wait_write_q_item,
                                self.wait_crawl_q.qsize()))

                    self.wait_crawl_q.task_done()

                    need_validate = True

                    logger1.error(
                        '===!!{} get Need Validation Error, clearing q, qsize: {}, unfinished: {}'
                        .format(self.name, self.wait_crawl_q.qsize(),
                                self.wait_crawl_q.unfinished_tasks))
                    while not self.wait_crawl_q.empty():
                        try:
                            self.wait_crawl_q.get_nowait()
                            self.wait_crawl_q.task_done()
                        except queue.Empty:
                            logger1.error(
                                '!!!!!{} get Empty Error when clear q'.format(
                                    self.name))
                    logger1.error(
                        '===!!{} clear q finished, qsize: {}, unfinished: {}'.
                        format(self.name, self.wait_crawl_q.qsize(),
                               self.wait_crawl_q.unfinished_tasks))

                    continue

                except Exception as e:
                    logger1.exception(e)
                    logger1.error('!!!!!{} crawl ({}, {}) error!!!!!'.format(
                        self.name, name, url))

                    wait_write_q_item = (name, unique, 0, 1)
                    self.wait_write_q.put(wait_write_q_item)
                    logger1.info(
                        '+++++{} put ({}, {}, {}, {}) into wait_write_q, remain: {}'
                        .format(self.name, *wait_write_q_item,
                                self.wait_crawl_q.qsize()))

                    self.wait_crawl_q.task_done()

                    single_time_crawled += 1

                    continue

                else:
                    if not unique:
                        url = qichacha['url']
                        unique = re.search(r'firm_(\w+).html', url).group(1)

                    logger1.info('+++++{} crawled ({})'.format(
                        self.name, name))

                    wait_write_q_item = (name, unique, 0, 0)
                    self.wait_write_q.put(wait_write_q_item)
                    logger1.info(
                        '+++++{} put ({}, {}, {}, {}) into wait_write_q, remain: {}'
                        .format(self.name, *wait_write_q_item,
                                self.wait_crawl_q.qsize()))

                    if qichacha['overview']['stock_code']:
                        crawl_stock_thread = threading.Thread(
                            target=crawl_stock,
                            args=(qichacha['overview']['stock_code'], ),
                            name='crawl-stock-thread')
                        crawl_stock_thread.start()

                    for holder in qichacha['holders']:
                        unique = re.search(r'firm_(\w+).html', holder['url'])
                        if not unique:
                            continue
                        unique = unique.group(1)
                        name = holder['name']
                        wait_write_q_item = (name, unique, 1, 0)
                        self.wait_write_q.put(wait_write_q_item)

                    for investment in qichacha['investments']:
                        unique = re.search(r'firm_(\w+).html',
                                           investment['url'])
                        if not unique:
                            continue
                        unique = unique.group(1)
                        name = investment['company_name']
                        wait_write_q_item = (name, unique, 1, 0)
                        self.wait_write_q.put(wait_write_q_item)

                    self.wait_crawl_q.task_done()

                    single_time_crawled += 1

            except queue.Empty:
                logger1.info('+++++{}: No data in wait_crawl_q'.format(
                    self.name))
                logger1.info('+++++{} end+++++'.format(self.name))

                return
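In this example the write-queue items are bare 4-tuples: (name, unique, 1, 0) for companies that were newly discovered or blocked by validation, (name, unique, 0, 1) when the crawl failed, and (name, unique, 0, 0) on success. The writer thread itself is not shown; the sketch below is a hypothetical consumer under the assumption that the third field means "still needs crawling" and the fourth means "crawl failed".

import queue

# Hypothetical consumer for the (name, unique, needs_crawl, crawl_failed)
# tuples produced above; the real writer thread is not part of this listing.
def drain_write_queue(wait_write_q):
    while True:
        try:
            name, unique, needs_crawl, crawl_failed = wait_write_q.get(timeout=15)
        except queue.Empty:
            return
        if needs_crawl:
            print('queue for a later crawl:', name, unique)
        elif crawl_failed:
            print('crawl failed:', name, unique)
        else:
            print('crawled successfully:', name, unique)
        wait_write_q.task_done()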
Example #4
        '贵州汇生林业开发有限公司',
        'url':
        'https://www.qichacha.com/firm_bc79a2ed616358340df33a6155d399c1.html'
    },
]

with open('crawled.json', 'r', encoding='utf-8') as f:
    crawled = json.load(f)

with open('error.json', 'r', encoding='utf-8') as f:
    has_error = json.load(f)

for g in group_list:
    if not os.path.isfile('json/qichacha/' + g['name'] + '.json'):
        # the company itself
        q = crawl_from_qichacha(g['name'], '', {})
        if q['overview']['stock_code']:
            crawl_stock(q['overview']['stock_code'])
        crawled.append(g['name'])

    unique = re.search(r'_(\w+).html', g['url']).group(1)
    with open('json/qichacha/' + unique + '.json', 'r', encoding='utf-8') as f:
        root_data = json.load(f)

    try:
        # first-level parent companies (holders)
        crawl_holders(root_data)

        # first-level subsidiaries (investments)
        crawl_investments(root_data)
Example #5
        if stop:
            finish()

        unique, name, level = item
        logger1.info('Crawling (%s)' % name)

        url = 'https://www.qichacha.com/firm_' + unique + '.html'

        # proxies are not used for now
        proxy = None

        # add a random delay
        time.sleep(random.uniform(crawl_delay, crawl_delay + 3))

        try:
            qichacha, html = crawl_from_qichacha(name, url, proxy)

        # not-logged-in error
        except NotLoginError as e:
            logger1.error('Not Login, Please reset cookie')
            finish()

        # validation (CAPTCHA) error
        except NeedValidationError as e:
            # wait two seconds, then retry
            time.sleep(2)

            try:
                qichacha, html = crawl_from_qichacha(name, url, proxy)

            except NeedValidationError as e: