def _refresh_proxy(self):
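        # Pull a fresh proxy from the pool and rebuild the HTTP client so
        # subsequent requests go through the new endpoint.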

        self._proxy_ip, self._proxy_port, _ = self._getProxy.get_proxy()
        http_proxy = "http://%s:%s" % (self._proxy_ip, self._proxy_port)
        proxies = {"http": http_proxy}
        logging.info("++++++++proxies: %s++++++++++++" % proxies)
        self._client = SiteClient(proxies)
Example #2
import logging

from bs4 import BeautifulSoup
from site_client import SiteClient


class Spider(object):
    def __init__(self, username, password):
        # username/password are supplied by the caller; ZhaopinDB, Error302
        # and RedisClient are project-local helpers (imports omitted here).
        self._client = SiteClient(username, password)

    def _get_company(self, company_url):
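        # Fetch one company page and parse it into a plain dict.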
        # logging.debug("_get_company.......")
        response = self._client.get_company(company_url)

        soup = BeautifulSoup(response.text, 'lxml')

        company = {}
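        # (field extraction from the soup is omitted in this example,
        # so the dict is logged and returned empty)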

        logging.debug("company:->%s" % company)

        return company

    def _get_search(self):
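        # Run the advanced-search query and follow the first company link, if any.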
        url = "http://search.qianlima.com/qlm_adv_se.jsp?kw1=&kw2=&kw3=&field=0&p_tflt=365&q_time_start=2015-08-16&q_time_end=2016-08-15&iarea=0&area=2&prg=0%3D3&x=64&y=14"

        # if RedisClient.get_search_url_key(url):
        #     return

        response = self._client.get_search(url)

        soup = BeautifulSoup(response.text, 'lxml')
        try:
            try:
                company_url = soup.select_one('td[class="gsmc"] a')['href']
                logging.info("company_url: %s" % company_url)
            except Exception:
                # No company link on this result page; nothing to crawl.
                return
            if company_url.startswith('http://company.zhaopin.com'):
                # if RedisClient.get_company_url_key(company_url):
                #     return
                company = self._get_company(company_url)
                if company:
                    ZhaopinDB.upsert_company(company)  # upsert company
                    # RedisClient.set_company_url_key(company_url)
        except Error302 as err:
            logging.exception(
                "get_company Error302, company_url:->%s, e:->%s" %
                (company_url, err))
            raise
Example #3
 def _create_site_client(self):
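     # Factory hook: build a SiteClient from this object's stored config.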
     return SiteClient(self._config)
Example #4
 def __init__(self):
     self._site_client = SiteClient(proxies={})
     self.__VIEWSTATE = None
Example #5
 def __init__(self):
     self._client = SiteClient()
     self._getProxy = GetProxy(3, 0)
     self._getSearchKey = GetSearchKey()
Example #6
 def __init__(self, num):
     self.num = num
     self._client = SiteClient()
     self._getProxy = GetProxy(2, num)
     self._getSearchKey = GetSearchKey()
Example #7
 def __init__(self):
     self._client = SiteClient(proxies={})
     self._city_list = []
     self._industry_list = []
Example #8
 def __init__(self):
     self._client = SiteClient()
Example #9
import logging

from bs4 import BeautifulSoup
from site_client import SiteClient


class Spider(object):
    def __init__(self):
        # ChinabiddingDB is a project-local persistence helper.
        self._client = SiteClient()

    def _get_company(self, detail_url):
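        # Fetch a detail page and pull out its title and body markup.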

        response = self._client._verify_get(detail_url)

        soup = BeautifulSoup(response.content, 'lxml')

        company = {}

        company.update({"title": soup.select_one('h1').text})

        detail = soup.select_one('section')

        company.update({"detail": detail.prettify()})

        return company

    def _get_search(self, current_page):
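        # POST the search form for one result page, then fetch and upsert
        # every listing that is not already in the database.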
        logging.info("++++++++%s" % current_page)
        url = "http://www.chinabidding.com/search/proj.htm"

        form_data = {
            "fullText": "",
            "pubDate": "",
            "infoClassCodes": "0108",
            "normIndustry": "",
            "zoneCode": "",
            "fundSourceCodes": "",
            "poClass": "",
            "rangeType": "",
            "currentPage": "%s" % current_page  # 1,...
        }
        response = self._client._verify_post(url, form_data)

        soup = BeautifulSoup(response.content, 'lxml')

        a_list = soup.select('a[class="as-pager-item"]')
        for a in a_list:
            try:

                detail_url = a['href']

                if ChinabiddingDB.get_one(detail_url):
                    continue

                logging.info("detail_url: %s" % detail_url)

                result = {}
                result.update({"detail_url": detail_url})
                result.update(
                    {"time": a.select_one('h5 span[class="time"]').text})
                strongs = a.select('div[class="as-p-ft"] dl dd span strong')
                result.update({
                    "industry": strongs[0].text,
                    "province": strongs[1].text
                })

                company = self._get_company(detail_url)

                result.update(company)

                ChinabiddingDB.upsert_result(result)

            except Exception as e:
                logging.exception("get_company exception, e:->%s" % e)
                continue
Example #10
 def __init__(self):
     self._site_client = SiteClient(proxies={})
Example #11
import json
import logging

from site_client import SiteClient


class Spider(object):
    def __init__(self):
        # CbrcDB is a project-local persistence helper.
        self._site_client = SiteClient()

    def run(self):
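        # Full crawl: page through the list endpoint for each useState
        # (1, 3, 7, 9) in 50,000-record chunks.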
        response = self._site_client.list(1, 0, 50000)
        self.parse_search_list(response, 1)

        # useState 3 has more rows; walk it in 50,000-record pages.
        for offset in range(0, 250000, 50000):
            response = self._site_client.list(3, offset, 50000)
            self.parse_search_list(response, 3)

        response = self._site_client.list(7, 0, 50000)
        self.parse_search_list(response, 7)

        response = self._site_client.list(9, 0, 50000)
        self.parse_search_list(response, 9)

    def run_crontab(self):
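        # Incremental run for cron: only the first 1,000 records per state.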
        response = self._site_client.list(1, 0, 1000)
        self.parse_search_list(response, 1)
        response = self._site_client.list(3, 0, 1000)
        self.parse_search_list(response, 3)
        response = self._site_client.list(7, 0, 1000)
        self.parse_search_list(response, 7)
        response = self._site_client.list(9, 0, 1000)
        self.parse_search_list(response, 9)

    def parse_search_list(self, response, useState):
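        # The response wraps a JSON array in extra text; extract the array,
        # then fetch and store each company, skipping flowNos already seen.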
        logging.info("parse_search_list..........")
        # logging.debug(response.content)

        # j = json.loads(response.text)
        # root_list = j['root']
        root = u"[" + response.text.split(u'[')[1].split(u']')[0] + u"]"
        # logging.debug(root)
        root_list = json.loads(root)

        for item in root_list:
            flowNo = item['flowNo']

            if CbrcDB.find_flowNo(flowNo, useState):
                continue
            try:
                company = self.get_company(useState, flowNo)

                if useState == 1:
                    CbrcDB.upsert_company_1(company)  # recently established institutions
                elif useState == 3:
                    CbrcDB.upsert_company_3(company)  # institutions currently held
                elif useState == 7:
                    CbrcDB.upsert_company_7(company)  # institutions exited
                elif useState == 9:
                    CbrcDB.upsert_company_9(company)  # out-of-control cases
                else:
                    raise Exception("unknown useState")
            except Exception as e:
                logging.exception(e)
                continue
Example #12
# -*- coding:utf-8 -*-
__author__ = 'zhaojm'

from site_client import SiteClient

site_client = SiteClient()
site_client.index_1()

response = site_client.get_verify_img()
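
# A plausible continuation (assumption: get_verify_img() returns a
# requests-style response whose body is the raw captcha image):
with open("verify_img.jpg", "wb") as f:
    f.write(response.content)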
Example #13
 def __init__(self):
     self._client = SiteClient(proxies={})
     self._index = 0
Example #14
 def __init__(self, username, password):
     # username/password are supplied by the caller.
     self._client = SiteClient(username, password)