def _refresh_proxy(self):
    """Fetch a fresh proxy from the pool and rebuild the site client to use it.

    Side effects: updates ``self._proxy_ip`` / ``self._proxy_port`` and
    replaces ``self._client`` with a SiteClient routed through the new proxy.
    """
    # FIX: third element (proxy type) was bound to an unused local; use `_`
    # to make the discard explicit. Trailing `pass` removed (dead statement).
    self._proxy_ip, self._proxy_port, _ = self._getProxy.get_proxy()
    http_proxy = "http://%s:%s" % (self._proxy_ip, self._proxy_port)
    proxies = {"http": http_proxy}
    logging.info("++++++++proxies: %s++++++++++++" % proxies)
    self._client = SiteClient(proxies)
class Spider(object):
    """Run one fixed qianlima search and upsert the first matching zhaopin
    company page into ZhaopinDB."""

    def __init__(self):
        # NOTE(review): `username` / `password` are not defined in this chunk —
        # presumably module-level credentials; confirm against the full file.
        self._client = SiteClient(username, password)

    def _get_company(self, company_url):
        """Fetch a company page and parse it into a dict.

        Parsing is still a stub: the soup is built but no fields are
        extracted yet, so an empty dict is returned.
        """
        response = self._client.get_company(company_url)
        soup = BeautifulSoup(response.text, 'lxml')
        company = {}  # TODO: populate fields from `soup`
        logging.debug("company:->%s" % company)
        return company

    def _get_search(self):
        """Run the hard-coded search query and store the first result's company."""
        url = "http://search.qianlima.com/qlm_adv_se.jsp?kw1=&kw2=&kw3=&field=0&p_tflt=365&q_time_start=2015-08-16&q_time_end=2016-08-15&iarea=0&area=2&prg=0%3D3&x=64&y=14"
        # if RedisClient.get_search_url_key(url):
        #     return
        response = self._client.get_search(url)
        soup = BeautifulSoup(response.text, 'lxml')
        try:
            try:
                company_url = soup.select_one('td[class="gsmc"] a')['href']
                logging.info("company_url: %s" % company_url)
            except Exception:
                # FIX: Py3-compatible except syntax (was `except Exception, e`);
                # the bound name was unused. Best-effort: no result row, bail out.
                return
            if company_url.startswith('http://company.zhaopin.com'):
                # if RedisClient.get_company_url_key(company_url):
                #     return
                company = self._get_company(company_url)
                if company:
                    ZhaopinDB.upsert_company(company)  # upsert company
                    # RedisClient.set_company_url_key(company_url)
        except Error302 as err:
            # FIX: original interpolated `search_key`, which is undefined in
            # this scope and would raise NameError inside the handler; log the
            # search URL instead.
            logging.exception(
                "get_company Error302, url:->%s, e:->%s" % (url, err))
            # FIX: bare raise preserves the original traceback.
            raise
def _create_site_client(self):
    """Build and return a new SiteClient from this instance's stored config."""
    client = SiteClient(self._config)
    return client
def __init__(self):
    """Create a proxy-less site client and clear the viewstate slot."""
    self._site_client = SiteClient(proxies={})
    # Presumably an ASP.NET __VIEWSTATE token filled in by a later page
    # fetch — TODO confirm against the rest of the class.
    self.__VIEWSTATE = None
def __init__(self):
    """Wire up the site client plus the proxy and search-key helpers."""
    self._client = SiteClient()
    self._getProxy = GetProxy(3, 0)  # pool args: meaning defined by GetProxy
    self._getSearchKey = GetSearchKey()
def __init__(self, num):
    """Remember the worker number and wire up client/proxy/search helpers."""
    self.num = num
    self._client = SiteClient()
    # `num` is also forwarded to the proxy pool — TODO confirm its role there.
    self._getProxy = GetProxy(2, num)
    self._getSearchKey = GetSearchKey()
def __init__(self):
    """Create a proxy-less client and empty city/industry accumulators."""
    self._client = SiteClient(proxies={})
    self._city_list = []      # filled in later by crawling code
    self._industry_list = []  # filled in later by crawling code
def __init__(self):
    """Create the default site client."""
    self._client = SiteClient()
class Spider(object):
    """Crawl chinabidding.com project search pages and upsert each new
    detail entry (plus its detail page) into ChinabiddingDB."""

    def __init__(self):
        self._client = SiteClient()

    def _get_company(self, detail_url):
        """Fetch a detail page; return its <h1> title and pretty-printed
        <section> body as a dict."""
        response = self._client._verify_get(detail_url)
        soup = BeautifulSoup(response.content, 'lxml')
        company = {}
        company.update({"title": soup.select_one('h1').text})
        detail = soup.select_one('section')
        company.update({"detail": detail.prettify()})
        return company

    def _get_search(self, current_page):
        """POST one search-result page and store every not-yet-seen entry.

        Errors on a single entry are logged and skipped so one bad row does
        not abort the page.
        """
        logging.info("++++++++%s" % current_page)
        url = "http://www.chinabidding.com/search/proj.htm"
        form_data = {
            "fullText": "",
            "pubDate": "",
            "infoClassCodes": "0108",
            "normIndustry": "",
            "zoneCode": "",
            "fundSourceCodes": "",
            "poClass": "",
            "rangeType": "",
            "currentPage": "%s" % current_page  # 1,... (1-based page index)
        }
        response = self._client._verify_post(url, form_data)
        soup = BeautifulSoup(response.content, 'lxml')
        a_list = soup.select('a[class="as-pager-item"]')
        for a in a_list:
            try:
                detail_url = a['href']
                if ChinabiddingDB.get_one(detail_url):
                    continue  # already stored — skip
                logging.info("detail_url: %s" % detail_url)
                result = {}
                result.update({"detail_url": detail_url})
                result.update(
                    {"time": a.select_one('h5 span[class="time"]').text})
                # FIX: original ran this identical CSS select twice per item;
                # hoist it and index the single result list.
                strongs = a.select('div[class="as-p-ft"] dl dd span strong')
                result.update({
                    "industry": strongs[0].text,
                    "province": strongs[1].text
                })
                company = self._get_company(detail_url)
                result.update(company)
                ChinabiddingDB.upsert_result(result)
            except Exception as e:
                # FIX: Py3-compatible except syntax (was `except Exception, e`).
                logging.exception("get_company exception, e:->%s" % e)
def __init__(self):
    """Create a site client with an empty proxy mapping (direct connection)."""
    self._site_client = SiteClient(proxies={})
class Spider(object):
    """Crawl CBRC institution lists for each useState category (1/3/7/9)
    and upsert every new record into CbrcDB."""

    # (useState, offset) pairs fetched by a full run; page size is 50000.
    # useState 3 is paged over several offsets; the others fit in one page.
    _FULL_BATCHES = [
        (1, 0),
        (3, 0), (3, 50000), (3, 100000), (3, 150000), (3, 200000),
        (7, 0),
        (9, 0),
    ]

    def __init__(self):
        self._site_client = SiteClient()

    def run(self):
        """Full crawl: fetch every batch in _FULL_BATCHES (refactored from a
        copy-pasted fetch/parse pair per batch — same calls, same order)."""
        for use_state, start in self._FULL_BATCHES:
            response = self._site_client.list(use_state, start, 50000)
            self.parse_search_list(response, use_state)

    def run_crontab(self):
        """Incremental crawl: only the newest 1000 entries per category."""
        for use_state in (1, 3, 7, 9):
            response = self._site_client.list(use_state, 0, 1000)
            self.parse_search_list(response, use_state)

    def parse_search_list(self, response, useState):
        """Extract the JSON array embedded in `response` and upsert each record.

        The endpoint returns JSON-ish text; the first [...] section holds the
        records, so it is sliced out before parsing. Per-record errors are
        logged and skipped.
        """
        logging.info("parse_search_list..........")
        root = u"[" + response.text.split(u'[')[1].split(u']')[0] + u"]"
        # FIX: json.loads's second positional arg was the Py2-only `encoding`
        # parameter (removed in Py3, and pointless here since `root` is
        # already text); plain loads is correct on both.
        root_list = json.loads(root)
        for item in root_list:
            flowNo = item['flowNo']
            if CbrcDB.find_flowNo(flowNo, useState):
                continue  # already stored for this category
            try:
                # NOTE(review): get_company is defined elsewhere in this class.
                company = self.get_company(useState, flowNo)
                if useState == 1:
                    CbrcDB.upsert_company_1(company)  # recently established institutions
                elif useState == 3:
                    CbrcDB.upsert_company_3(company)  # institutions-held list
                elif useState == 7:
                    CbrcDB.upsert_company_7(company)  # institutions exited/withdrawn list
                elif useState == 9:
                    CbrcDB.upsert_company_9(company)  # loss-of-control cases
                else:
                    # FIX: typo "unknonw" -> "unknown".
                    raise Exception("unknown useState")
            except Exception as e:
                # FIX: Py3-compatible except syntax (was `except Exception, e`).
                logging.exception(e)
                continue
# -*- coding:utf-8 -*- __author__ = 'zhaojm' from site_client import SiteClient site_client = SiteClient() site_client.index_1() response = site_client.get_verify_img()
def __init__(self):
    """Create a proxy-less client and reset the position counter."""
    self._client = SiteClient(proxies={})
    self._index = 0  # advanced elsewhere as items are processed
def __init__(self):
    """Create an authenticated site client.

    NOTE(review): `username` / `password` are not defined in this chunk —
    presumably module-level credentials; confirm against the full file.
    """
    self._client = SiteClient(username, password)