def _run(self):
    # 968 is a hard-coded start offset, apparently a manual resume point
    for i in range(968, len(self._txt)):
        for j in range(i, len(self._txt)):
            # search key: an ordered pair of characters from the seed text
            search_key = self._txt[i] + self._txt[j]
            if RedisClient.get_search_key_key(search_key):
                continue
            logging.info(
                "++++++crawl 1000:->i: %d, j: %d, len: %d, search_key: %s"
                % (i, j, len(self._txt), search_key))
            # area=11&areaN=%E5%8C%97%E4%BA%AC restricts results to 北京
            url = "http://qiye.qianzhan.com/search/all/" + urllib.quote(
                search_key.encode('utf-8')) + "?o=0&area=11&areaN=%E5%8C%97%E4%BA%AC"
            try:
                self._get_search(url)
                RedisClient.set_search_key_key(search_key)
            except Error302, err:
                raise Error302(i, j)
            except Error403, err:
                raise Error403(i, j)
            except Exception, e:
                logging.exception(
                    "_get_search:->i: %d, j: %d, len: %d, search_key: %s, %s"
                    % (i, j, len(self._txt), search_key, e.message))
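The `RedisClient.get_search_key_key` / `set_search_key_key` pair above (and the other `get_*`/`set_*` pairs throughout these fragments) act as crawl-dedup markers. None of the fragments show their implementation; a minimal sketch of the implied pattern, assuming redis-py and a made-up key prefix:

# Sketch of the dedup helpers implied by the calls above.
# The class shape, key prefix, and connection settings are assumptions;
# only the exists/set pattern is implied by the calling code.
import redis

_redis = redis.StrictRedis(host='localhost', port=6379, db=0)


class RedisClient(object):

    @staticmethod
    def get_search_key_key(search_key):
        # truthy when this search_key was already crawled
        return _redis.exists(u'search_key:' + search_key)

    @staticmethod
    def set_search_key_key(search_key):
        # mark this search_key as done
        _redis.set(u'search_key:' + search_key, 1)

The remaining `get_*`/`set_*` pairs presumably repeat the same pattern with different key prefixes.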
def _get_search(self, url):
    response = self._qianzhan_client.get_search(url)
    soup = BeautifulSoup(response.text, 'lxml')
    link_li_list = soup.select("body ul[class='list-search'] li p[class='tit'] a")
    for tag in link_li_list:
        href = tag['href']
        company_name = tag.text
        company_url = urljoin("http://qiye.qianzhan.com/", href)
        if RedisClient.get_company_name_detail_key(company_name):
            continue
        logging.info("company_name:->%s" % company_name)
        try:
            company = self._get_company(company_url)
            if company:
                company.update({"from.chinabidding": 1})
                # upsert company
                QianzhanDB.upsert_company_detail(company)
            RedisClient.set_company_name_detail_key(company_name)
        except Error302, err:
            logging.exception("get_company Error302, company_name:->%s, e:->%s"
                              % (company_name, err))
            raise err
        except Error403, err:
            logging.exception("get_company Error403, company_name:->%s, e:->%s"
                              % (company_name, err))
            raise err
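`Error302` and `Error403` (plus `Error400`/`Error404` in a later fragment) are never defined in these excerpts; from the way they are caught and re-raised, they are presumably thin `Exception` subclasses the HTTP client raises on the corresponding status codes. A sketch under that assumption:

# Assumed definitions; only the names appear in the fragments.
class Error302(Exception):
    pass


class Error403(Exception):
    pass


class Error400(Exception):
    pass


class Error404(Exception):
    pass


# hypothetical raising site inside the HTTP client's get_search():
#     if response.status_code == 403:
#         raise Error403(url)

Raising on 302 suggests the client disables redirect-following and treats a redirect as an anti-crawler signal to be handled by the caller.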
def _run(self):
    cur = ZhaopinDB.get_companys()
    for item in cur:
        search_key = item['company_name']
        if RedisClient.get_search_key_detail_key(search_key):
            continue
        logging.info("++++++crawl zhaopin:->search_key: %s" % search_key)
        url = "http://qiye.qianzhan.com/search/all/" + urllib.quote(
            search_key.encode('utf-8')) + "?o=0&area=11&areaN=%E5%8C%97%E4%BA%AC"
        try:
            self._get_search(url)
            RedisClient.set_search_key_detail_key(search_key)
        except Error302, err:
            raise Error302()
        except Error403, err:
            raise Error403()
def run(self):
    cur = QianzhanDB.get_all()
    for item in cur:
        try:
            search_key = item['company_name']
            logging.info("search key.........%s..........." % search_key)
            if RedisClient.get_search_key(search_key):
                continue
            company = self.get_company(search_key)
            if company:
                BaidubaikeDB.upsert_compnay(company)
            RedisClient.set_search_key(search_key)
            time.sleep(1)  # throttle between searches
        except Exception, e:
            logging.info("********************")
            logging.exception(e)
            logging.info("********************")
            continue
def next_page(self, pg, p_area, province):
    next_url = "http://center.qianlima.com/db_qy.jsp?pg=" + str(
        pg) + "&p_area=" + str(p_area) + "&gsmc=null&lxrxm=null"
    if RedisClient.get_url_key(next_url):
        return
    logging.info("next_page+++++%s" % next_url)
    self._webdriver.get(next_url)
    WebDriverWait(self._webdriver, 60 * 1).until(
        EC.presence_of_element_located(
            (By.XPATH, '//table[@class="gz-news pool"]/tbody/tr')))
    page_source = self._webdriver.page_source
    soup = BeautifulSoup(page_source, 'lxml')
    tr_list = soup.select('table[class="gz-news pool"] tbody tr')
    for tr in tr_list:
        td_list = tr.select('td')
        company = {
            "company_name": td_list[0].get_text(),
            "contact": td_list[1].get_text(),
            "phone": td_list[2].get_text(),
            "mobile": td_list[3].get_text(),
            "fax": td_list[4].get_text(),
            "address": td_list[5].get_text(),
            "describe": td_list[6].get_text(),
            "area": td_list[7].get_text(),
            "p_area": p_area,
            "province": province
        }
        QianlimaDB.upsert_company(company)
    RedisClient.set_url_key(next_url)
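`self._webdriver`, `WebDriverWait`, `EC`, and `By` come from Selenium, but the setup never appears in these fragments. A sketch of the imports and initialization this code assumes (the class name and choice of Firefox are guesses):

# Assumed Selenium setup for next_page()/detail().
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class QianlimaCrawler(object):  # hypothetical class name
    def __init__(self):
        self._webdriver = webdriver.Firefox()  # any Selenium driver would do
        self._count = 0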
def detail(self, detail_url):
    if RedisClient.get_company_url_detail_key(detail_url):
        return
    logging.info("detail+++++%s" % detail_url)
    self._webdriver.get(detail_url)
    WebDriverWait(self._webdriver, 60 * 1).until(
        EC.visibility_of_element_located((By.ID, "wen")))
    page_source = self._webdriver.page_source
    soup = BeautifulSoup(page_source, 'lxml')
    title = soup.select_one('div[class="wenshang"] h2').getText()
    update_time = soup.select(
        'div[class="wenzhong"] div[class="biaoge"] table tr'
    )[1].select_one('td span').getText()
    # raw announcement HTML; renderContents() returns bytes
    detail = soup.select_one('div[id="wen"]').renderContents()
    self._count += 1
    logging.info("++++++count++++++++++%s" % self._count)
    QianlimaDB.upsert_company({
        "title": title,
        "update_time": update_time,
        "detail": detail,
        "url": detail_url
    })
    RedisClient.set_company_url_detail_key(detail_url)
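Taken together, `next_page` harvests the member-area company table page by page while `detail` archives individual announcement pages. A hypothetical driver loop (the class name, page range, and the `p_area` code are all assumptions):

# Hypothetical driver; p_area=2 standing for 北京 is a guess.
crawler = QianlimaCrawler()
for pg in range(1, 51):
    try:
        crawler.next_page(pg, p_area=2, province=u'北京')
    except Exception, e:
        logging.exception(e)
        break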
        logging.error(e.message)
        return False
    except ErrorStatusCode, e:
        logging.error(e.message)
        exit(1)
    except Exception, e:
        logging.exception(e)
        exit(1)
    if not response:
        logging.info("response is None..exit....")
        time.sleep(10)
        return False
    logging.debug("++++url: %s+++++++" % response.url)
    if RedisClient.get_url(response.url):
        return False
    # find() > 0 is safe here: the marker can never start at index 0 of a full URL
    if response.url.find("/search/none") > 0:
        logging.info(".........not found company.........")
        RedisClient.set_url(response.url)
        return False
    elif response.url.find("baidu.com/view") > 0:
        logging.info(".........found view.........")
    elif response.url.find("baidu.com/item") > 0:
        logging.info(".........found item........")
    elif response.url.find("baidu.com/subview") > 0:
        logging.info(".........found subview........")
    try:
        next_page_href = soup.select_one('body a[class="next"]')['href']
    except Exception, e:
        next_page_href = None
    if next_page_href:
        if next_page_href.find("http") < 0:
            next_page_url = urljoin("http://qiye.qianzhan.com/", next_page_href)
        else:
            next_page_url = next_page_href
        logging.debug("next_page_url:->%s" % next_page_url)
        # recurse into the next results page
        self._get_search(next_page_url)
    RedisClient.set_search_url_key(url)
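One caveat with this tail: `_get_search` recurses once per results page, so a key with very deep pagination can approach Python's default recursion limit (about 1000 frames). An iterative variant of the same loop, sketched with a hypothetical `_handle_result_list` standing in for the per-company loop shown earlier:

def _get_search(self, url):
    # iterate instead of recursing once per results page
    while url and not RedisClient.get_search_url_key(url):
        response = self._qianzhan_client.get_search(url)
        soup = BeautifulSoup(response.text, 'lxml')
        self._handle_result_list(soup)  # hypothetical: the per-company loop above
        RedisClient.set_search_url_key(url)
        tag = soup.select_one('body a[class="next"]')
        url = urljoin("http://qiye.qianzhan.com/", tag['href']) if tag else None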
            continue
        logging.info("++++++crawl zhaopin:->search_key: %s" % search_key)
        url = "http://qiye.qianzhan.com/search/qy/" + urllib.quote(
            search_key.encode('utf-8')) + "?o=0&area=11&areaN=%E5%8C%97%E4%BA%AC"
        try:
            self._get_search(url)
            RedisClient.set_search_key_detail_key(search_key)
        except Error302, err:
            raise Error302()
        except Error403, err:
            raise Error403()
        except Error400, err:
            # a 400 for this key will not succeed on retry; mark it done
            logging.exception("_get_search Error400, search_key:->%s, e:->%s"
                              % (search_key, err))
            RedisClient.set_search_key_detail_key(search_key)
        except Error404, err:
            logging.exception("_get_search Error404, search_key:->%s, e:->%s"
                              % (search_key, err))
            RedisClient.set_search_key_detail_key(search_key)
        except Exception, e:
            logging.exception("_get_search:->search_key: %s, %s"
                              % (search_key, e.message))

def run(self):
    logging.info("+++++++++++++run++++++++++++++++")
    try:
        is_success = self._qianzhan_client.login()
        if is_success:
            self._run()
    except Exception, e:  # assumed handler: the original fragment is truncated here
        logging.exception(e)
# -*- coding:utf-8 -*-
__author__ = 'zhaojm'

from mongo import ChinabiddingDB
from mredis import RedisClient

if __name__ == "__main__":
    # seed Redis with every chinabidding detail URL already stored in Mongo
    cur = ChinabiddingDB.get_all()
    for item in cur:
        RedisClient.set_result_url_key(item['detail_url'])