예제 #1
0
    def _run(self):
        """Search qianzhan for every key built from two entries of ``self._txt``.

        For each index pair ``(i, j)`` with ``968 <= i <= j < len(self._txt)``
        the two entries are concatenated into a search key.  Keys already
        flagged in Redis are skipped; every other key is URL-quoted into a
        search URL, handed to ``self._get_search`` and flagged in Redis once
        that call returns.

        Raises:
            Error302: a new instance carrying ``(i, j)`` when the search
                raises ``Error302``.
            Error403: a new instance carrying ``(i, j)`` when the search
                raises ``Error403``.

        Any other exception is logged with its traceback and the loop moves
        on to the next pair.
        """
        # NOTE(review): 968 looks like a resume offset left over from an
        # earlier interrupted run -- confirm before reusing this crawler.
        for i in range(968, len(self._txt)):
            for j in range(i, len(self._txt)):
                search_key = self._txt[i] + self._txt[j]
                already_searched = RedisClient.get_search_key_key(search_key)
                if already_searched:
                    continue

                progress = (
                    "++++++crawl 1000:->i: %d, j: %d, len: %d, search_key: %s"
                    % (i, j, len(self._txt), search_key))
                logging.info(progress)

                quoted_key = urllib.quote(search_key.encode('utf-8'))
                url = "".join([
                    "http://qiye.qianzhan.com/search/all/",
                    quoted_key,
                    "?o=0&area=11&areaN=%E5%8C%97%E4%BA%AC",
                ])

                try:
                    self._get_search(url)
                    RedisClient.set_search_key_key(search_key)
                except Error302:
                    # Re-created with the current position attached.
                    raise Error302(i, j)
                except Error403:
                    raise Error403(i, j)
                except Exception as e:
                    failure = (
                        "_get_search:->i: %d, j: %d, len: %d, search_key: %s, %s"
                        % (i, j, len(self._txt), search_key, e.message))
                    logging.exception(failure)
예제 #2
0
    def _get_search(self, url):
        """Fetch one search-result page and store each company not yet recorded.

        The page is requested through ``self._qianzhan_client`` and parsed
        with BeautifulSoup.  For every result link whose company name is not
        yet flagged in Redis, the company detail is fetched with
        ``self._get_company``; a non-empty result is tagged with
        ``"from.chinabidding"``, upserted into ``QianzhanDB`` and the company
        name is flagged in Redis.

        Raises:
            Error302: logged, then re-raised, when the detail fetch raises it.
            Error403: logged, then re-raised, when the detail fetch raises it.
        """
        response = self._qianzhan_client.get_search(url)
        soup = BeautifulSoup(response.text, 'lxml')

        result_links = soup.select(
            "body ul[class='list-search'] li p[class='tit'] a")
        for anchor in result_links:
            company_url = urljoin("http://qiye.qianzhan.com/", anchor['href'])
            company_name = anchor.text

            if RedisClient.get_company_name_detail_key(company_name):
                continue

            logging.info("company_name:->%s" % company_name)
            try:
                company = self._get_company(company_url)
                if not company:
                    # Nothing to store; the name is intentionally left unflagged.
                    continue
                company.update({"from.chinabidding": 1})
                QianzhanDB.upsert_company_detail(company)  # upsert company
                RedisClient.set_company_name_detail_key(company_name)
            except Error302 as err:
                logging.exception(
                    "get_company Error302, company_name:->%s, e:->%s"
                    % (company_name, err))
                raise err
            except Error403 as err:
                logging.exception(
                    "get_company Error403, company_name:->%s, e:->%s"
                    % (company_name, err))
                raise err
예제 #3
0
    def _run(self):
        """Search qianzhan for every company name returned by ``ZhaopinDB``.

        Each item's ``company_name`` is used as the search key.  Keys already
        flagged in Redis are skipped; otherwise the key is URL-quoted into a
        search URL, passed to ``self._get_search`` and flagged in Redis once
        that call returns.

        Raises:
            Error302: a fresh, argument-less instance when the search raises
                ``Error302``.
            Error403: a fresh, argument-less instance when the search raises
                ``Error403``.
        """
        for item in ZhaopinDB.get_companys():
            search_key = item['company_name']
            if RedisClient.get_search_key_detail_key(search_key):
                continue

            logging.info("++++++crawl zhaopin:->search_key: %s" % search_key)

            quoted_key = urllib.quote(search_key.encode('utf-8'))
            url = "".join([
                "http://qiye.qianzhan.com/search/all/",
                quoted_key,
                "?o=0&area=11&areaN=%E5%8C%97%E4%BA%AC",
            ])

            try:
                self._get_search(url)
                RedisClient.set_search_key_detail_key(search_key)
            except Error302:
                raise Error302()
            except Error403:
                raise Error403()
예제 #4
0
 def run(self):
     """Look up every company name from ``QianzhanDB`` and store what is found.

     For each item, the ``company_name`` is logged and, unless it is already
     flagged in Redis, passed to ``self.get_company``.  A non-empty result is
     upserted into ``BaidubaikeDB``; the name is then flagged in Redis and
     the loop sleeps for one second.  Any exception raised while handling an
     item is logged between two banner lines and the loop carries on with
     the next item.
     """
     banner = "********************"
     for item in QianzhanDB.get_all():
         try:
             search_key = item['company_name']
             logging.info("search key.........%s..........." % search_key)
             if RedisClient.get_search_key(search_key):
                 continue

             company = self.get_company(search_key)
             if company:
                 BaidubaikeDB.upsert_compnay(company)

             # Flagged whether or not a company was found.
             RedisClient.set_search_key(search_key)
             time.sleep(1)
         except Exception as e:
             logging.info(banner)
             logging.exception(e)
             logging.info(banner)
예제 #5
0
    def next_page(self, pg, p_area, province):
        """Load one qianlima company-list page and upsert every row it contains.

        The page URL is built from ``pg`` and ``p_area``.  If that URL is
        already flagged in Redis the method returns immediately.  Otherwise
        the page is opened in ``self._webdriver``, the method waits (up to
        60 seconds) for at least one row of the ``gz-news pool`` table to be
        present, parses the page source and upserts one record per table row
        into ``QianlimaDB``.  The URL is flagged in Redis afterwards.

        Rows with fewer ``<td>`` cells than the eight expected columns (for
        example header or spacer rows) are skipped with a warning instead of
        raising ``IndexError`` and aborting the whole page.

        Args:
            pg: page number placed in the ``pg`` query parameter.
            p_area: area code placed in the ``p_area`` query parameter and
                stored on every record.
            province: province value stored on every record.
        """
        next_url = "http://center.qianlima.com/db_qy.jsp?pg=" + str(
            pg) + "&p_area=" + str(p_area) + "&gsmc=null&lxrxm=null"
        if RedisClient.get_url_key(next_url):
            return
        logging.info("next_page+++++%s" % next_url)

        self._webdriver.get(next_url)
        WebDriverWait(self._webdriver, 60 * 1).until(
            EC.presence_of_element_located(
                (By.XPATH, '//table[@class="gz-news pool"]/tbody/tr')))
        soup = BeautifulSoup(self._webdriver.page_source, 'lxml')

        # Record field for each table column, in column order.
        column_fields = (
            "company_name",
            "contact",
            "phone",
            "mobile",
            "fax",
            "address",
            "describe",
            "area",
        )

        for tr in soup.select('table[class="gz-news pool"] tbody tr'):
            td_list = tr.select('td')
            if len(td_list) < len(column_fields):
                # Not a data row; indexing it positionally would raise.
                logging.warning(
                    "next_page: skipping row with %d cells, url: %s"
                    % (len(td_list), next_url))
                continue

            # zip() stops after the eight named columns; extra cells are
            # ignored, exactly as the positional indexing did.
            company = dict(
                (field, td.get_text())
                for field, td in zip(column_fields, td_list))
            company["p_area"] = p_area
            company["province"] = province
            QianlimaDB.upsert_company(company)

        RedisClient.set_url_key(next_url)
예제 #6
0
    def detail(self, detail_url):
        """Load one detail page and upsert its title, update time and body.

        Returns immediately when ``detail_url`` is already flagged in Redis.
        Otherwise the page is opened in ``self._webdriver``, the method waits
        (up to 60 seconds) for the element with id ``wen`` to become visible,
        and the page source is parsed.  The title, the update time (taken
        from the second row of the ``biaoge`` table) and the rendered
        contents of the ``wen`` element are upserted into ``QianlimaDB``
        together with the URL.  ``self._count`` is incremented and logged,
        and the URL is flagged in Redis at the end.

        Args:
            detail_url: address of the detail page to crawl.
        """
        if RedisClient.get_company_url_detail_key(detail_url):
            return
        logging.info("detail+++++%s" % detail_url)

        driver = self._webdriver
        driver.get(detail_url)
        body_visible = EC.visibility_of_element_located((By.ID, "wen"))
        WebDriverWait(driver, 60 * 1).until(body_visible)

        soup = BeautifulSoup(driver.page_source, 'lxml')

        title = soup.select_one('div[class="wenshang"] h2').getText()
        table_rows = soup.select(
            'div[class="wenzhong"] div[class="biaoge"] table tr')
        update_time = table_rows[1].select_one('td span').getText()
        detail = soup.select_one('div[id="wen"]').renderContents()

        self._count += 1
        logging.info("++++++count++++++++++%s" % self._count)

        record = {
            "title": title,
            "update_time": update_time,
            "detail": detail,
            "url": detail_url,
        }
        QianlimaDB.upsert_company(record)

        # NOTE: a disabled, commented-out heuristic that scanned the page text
        # for labelled company names used to sit here; only the raw page
        # fields above are stored.
        RedisClient.set_company_url_detail_key(detail_url)
예제 #7
0
            logging.error(e.message)
            return False
        except ErrorStatusCode, e:
            logging.error(e.message)
            exit(1)
        except Exception, e:
            logging.exception(e)
            exit(1)
        if not response:
            logging.info("response is None..exit....")
            # exit(1)
            time.sleep(10)
            return False
        logging.debug("++++url: %s+++++++" % response.url)

        if RedisClient.get_url(response.url):
            return False

        if response.url.find("/search/none") > 0:
            logging.info(".........not found company.........")
            RedisClient.set_url(response.url)
            return False
        elif response.url.find("baidu.com/view") > 0:
            logging.info(".........found view.........")
            pass
        elif response.url.find("baidu.com/item") > 0:
            logging.info(".........found item........")
            pass
        elif response.url.find("baidu.com/subview") > 0:
            logging.info(".........found subview........")
            pass
예제 #8
0
                pass
        try:
            next_page_href = soup.select_one('body a[class="next"]')['href']
        except Exception, e:
            next_page_href = None
            pass
        if next_page_href:
            if next_page_href.find("http") < 0:
                next_page_url = urljoin("http://qiye.qianzhan.com/",
                                        next_page_href)
            else:
                next_page_url = next_page_href
            logging.debug("next_page_url:->%s" % next_page_url)
            self._get_search(next_page_url)

        RedisClient.set_search_url_key(url)

    def _run(self):

        for i in range(968, len(self._txt)):
            for j in range(i, len(self._txt)):
                # if i % 2 == 0:
                # j = i + 5
                search_key = self._txt[i] + self._txt[j]
                # search_key = u'在线途游(北京)科技有限公司'
                # search_key = u'北京'
                if RedisClient.get_search_key_key(search_key):
                    continue
                logging.info(
                    "++++++crawl 1000:->i: %d, j: %d, len: %d, search_key: %s"
                    % (i, j, len(self._txt), search_key))
예제 #9
0
                continue
            logging.info(
                "++++++crawl zhaopin:->search_key: %s" % search_key)
            url = "http://qiye.qianzhan.com/search/qy/" + urllib.quote(
                search_key.encode('utf-8')) + "?o=0&area=11&areaN=%E5%8C%97%E4%BA%AC"

            try:
                self._get_search(url)
                RedisClient.set_search_key_detail_key(search_key)
            except Error302, err:
                raise Error302()
            except Error403, err:
                raise Error403()
            except Error400, err:
                logging.exception("_get_search Error400, search_key:->%s, e:->%s" % (search_key, err))
                RedisClient.set_search_key_detail_key(search_key)
                # raise err
            except Error404, err:
                logging.exception("_get_search Error404, search_key:->%s, e:->%s" % (search_key, err))
                RedisClient.set_search_key_detail_key(search_key)
                # raise err
            except Exception, e:
                logging.exception("_get_search:->search_key: %s, %s" % (search_key, e.message))
                pass

    def run(self):
        logging.info("+++++++++++++run++++++++++++++++")
        try:
            is_success = self._qianzhan_client.login()
            if is_success:
                self._run()
예제 #10
0
# -*- coding:utf-8 -*-
__author__ = 'zhaojm'

from mongo import ChinabiddingDB
from mredis import RedisClient

if __name__ == "__main__":
    # Flag the detail URL of every stored ChinabiddingDB record in Redis.
    for record in ChinabiddingDB.get_all():
        RedisClient.set_result_url_key(record['detail_url'])