Example No. 1
class LinkedInSpider(Spider):
    def __init__(self, thread_cnt, latest_type=None):
        super(LinkedInSpider, self).__init__(thread_cnt)
        self.page_store = PageStoreLinkedIn()
        self.speed_control_requests = SpeedControlRequests()
        self.latest_type = latest_type
        self.page_store.testmode = False

    def dispatch(self):

        if self.latest_type:
            LinkedInConfig.URL_TMPLATE += "&f_TP={}".format(self.latest_type)
        for locId in range(8876, 9046):
            for indId in range(0, 149):
                if indId == 2:
                    continue

                url = LinkedInConfig.URL_TMPLATE.format(locId, indId)
                self.add_main_job({"type":"search", "url": url})

        self.wait_q()
        self.add_main_job(None)

    def run_job(self, jobid):
        if not isinstance(jobid, dict):
            return

        if jobid.get("type", None) == "search":
            self.parse_page(jobid.get("url"))

        elif jobid.get("type", None) == "jdurl":
            url = jobid.get("url")
            jobid = jobid.get("jobid", None)
            if not jobid:
                return
            res = self.speed_control_requests.with_sleep_requests(url, 0.5)
            self.page_store.save(int(time.time()), jobid, url, res.text)

    def parse_page(self, url):

        for page_num in range(1, 41):
            real_url = url + "&start={}&count=25".format(25 * (page_num-1))
            page = self.speed_control_requests.with_sleep_requests(real_url, 0.5)
            jobids = re.findall(r'linkedin.com/jobs2/view/(\d+)', page.text, re.S)
            jobids = set(jobids)

            if not jobids:
                return

            for jobid in jobids:
                url_page = LinkedInConfig.PAGE_TMPLATE.format(jobid)
                self.add_job({"type":"jdurl", "url":url_page, "jobid": jobid}, False)

    def event_handler(self, evt, msg, **kwargs):

        if "DONE" == evt:
            spider.util.sendmail(["<*****@*****.**>"], "linkedin jd爬取", msg + '\nsaved: %d' % self.page_store.saved_count)
            return
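
A minimal driver for this spider might look like the sketch below; the run() entry point on the Spider base class and the f_TP filter value are assumptions, not shown in the snippet:

if __name__ == '__main__':
    # latest_type feeds LinkedIn's f_TP posted-date filter; '1' is a hypothetical value
    s = LinkedInSpider(10, latest_type='1')
    s.run()  # assumed Spider base-class entry point driving dispatch()/run_job()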
Example No. 2
 def __init__(self, threadcnt):
     super(BaiduSchoolSpider, self).__init__(threadcnt)
     self.speed_control_requests = SpeedControlRequests()
     self.wb = openpyxl.Workbook()
     self.ws = self.wb.active
     self.sheet_list = [
         u'高校', u'院校分类', u'办学性质', '211', '985', u'研究生院', u'院校隶属', u'办学类型',
         u'学历层次', u'标签'
     ]
     self.ws.append(self.sheet_list)
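
The header row is Chinese: 高校 (institution), 院校分类 (category), 办学性质 (ownership), 211, 985, 研究生院 (graduate school), 院校隶属 (affiliation), 办学类型 (institution type), 学历层次 (degree level), 标签 (tags). Nothing in the snippet writes the workbook to disk; persisting it would use the standard openpyxl API, e.g.:

spider.wb.save('schools.xlsx')  # Workbook.save writes the .xlsx file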
Example No. 3
class JobWealinkSpider(Spider):
    def __init__(self, thread_cnt):
        super(JobWealinkSpider, self).__init__(thread_cnt)
        self.page_store = PageStoreWL()
        self.speed_control_request = SpeedControlRequests()
        self.page_store.testmode = False

    def dispatch(self):

        for i in xrange(28261133, 31000000):
            self.add_main_job(str(i))
        self.wait_q()
        self.add_main_job(None)

    def run_job(self, jobid):

        if not self.page_store.check_should_fetch(jobid):
            return
        url = "http://www.wealink.com/zhiwei/view/{}/".format(jobid)
        res = self.speed_control_request.with_sleep_requests(url, sleep=0.1)
        if res is None:
            # a failed fetch has no .code/.text; retry it later
            self.re_add_job(jobid)
            Log.error("failed get url", url)
            return
        if res.code == 404:
            print "jobid: {} match nothing".format(jobid)
            return
        self.page_store.save(int(time.time()), jobid, url, res.text)
Example No. 4
class JobLagouSpider(Spider):
    def __init__(self, thread_cnt):
        super(JobLagouSpider, self).__init__(thread_cnt)
        self.page_store = PageStoreLG()
        self.speed_control_request = SpeedControlRequests()
        self.page_store.testmode = False

    def dispatch(self):
        self.bs = BinSaver('joblagou.bin')
        for i in xrange(0, 1500000):
            self.add_main_job(str(i))
        self.wait_q()
        self.add_main_job(None)

    def run_job(self, jobid):

        if not self.page_store.check_should_fetch(jobid):
            return
        url = "http://www.lagou.com/jobs/{}.html".format(jobid)
        res = self.speed_control_request.with_sleep_requests(url, sleep=0.1)
        if res is None:
            # retry later if the fetch failed outright
            self.re_add_job(jobid)
            Log.error("failed get url", url)
            return
        if htmlfind.findTag(res.text, 'div', 'position_del'):
            print "jobid: {} match nothing".format(jobid)
            return
        self.page_store.save(int(time.time()), jobid, url, res.text)
Example No. 5
 def __init__(self, thread_cnt, company):
     super(LagouBycompany, self).__init__(thread_cnt)
     self.page_store = PageStoreLG()
     self.speed_control_requests = SpeedControlRequests()
     self.page_store.testmode = False
     self.list = []
     with open(company) as file_:
         for line in file_:
             self.list.append(line.strip())
Example No. 6
class Jd58Spider(Spider):
    def __init__(self, thread_cnt):
        super(Jd58Spider, self).__init__(thread_cnt)
        self.page_store = Jd58PageStore()
        self.speed_control_request = SpeedControlRequests()
        self.page_store.testmode = False
        self.get_latest = None

    def real_dispatch(self):
        for url in urls:
            for ind in inds:
                i = 1
                while 1:
                    realUrl = url + ind + '/pn{}/'.format(i)
                    if self.get_latest:
                        l_time = spider.util.TimeHandler.getTimeOfNDayBefore(
                            self.get_latest) / 1000
                        l_time_local = time.localtime(l_time)
                        l_time_str = '%04d%02d%02d' % (
                            l_time_local[0], l_time_local[1], l_time_local[2])

                        h_time_local = time.localtime(time.time())
                        h_time_str = '%04d%02d%02d' % (
                            h_time_local[0], h_time_local[1], h_time_local[2])

                        realUrl += "?postdate={}_{}".format(
                            l_time_str, h_time_str)

                    # self.add_main_job({"urlpart": realUrl,  "type":"loadPage"})
                    has_next = self.parse_html(realUrl)
                    if not has_next:
                        break
                    i += 1

    def parse_html(self, url):
        res = self.speed_control_request.with_sleep_requests(url, sleep=0.05)
        if not res:
            return True
        els = re.findall(r'entityId=(\d+)', res.text)

        if not els:
            return False

        part = url.split("pn")[0]
        for el in els:
            self.add_main_job({"urlpart": part, "jobid": el, "type": "jdPage"})

        if re.search(ur'新信息较少', res.text):  # "few new listings" marker: stop paging
            return False

        return True
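
The hand-rolled '%04d%02d%02d' formatting of localtime tuples in real_dispatch is equivalent to time.strftime with the '%Y%m%d' pattern; a tidier helper might look like this sketch (not part of the original spider):

import time

def yyyymmdd(epoch_secs=None):
    # format an epoch timestamp (default: now) as e.g. '20240131'
    if epoch_secs is None:
        epoch_secs = time.time()
    return time.strftime('%Y%m%d', time.localtime(epoch_secs))

# realUrl += "?postdate={}_{}".format(yyyymmdd(l_time), yyyymmdd())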
Example No. 7
class LiepinSpider(Spider):
    def __init__(self, thread_cnt):
        super(LiepinSpider, self).__init__(thread_cnt)
        self.page_store = PageStoreLP()
        self.speed_control_requests = SpeedControlRequests()
        self.page_store.testmode = False

    def dispatch(self):

        for i in range(3362419 + 1, 9999999):
            self.add_main_job(i)
        self.wait_q()
        self.add_main_job(None)

    def run_job(self, jobid):

        if isinstance(jobid, int):
            jobid = str(jobid)

        url = "http://job.liepin.com/{}_{}/".format(jobid[:3], jobid)
        res = self.speed_control_requests.with_sleep_requests(url, 0.1)
        if res is None:
            print "%d failed, sleeping 10 secs." % jobid
            time.sleep(2)
            self.add_job(jobid)
            return

        if re.search(u'您访问的页面不存在或已删除', res.text):  # page missing or deleted
            print jobid, "match nothing"
        elif re.search(u'该职位已结束', res.text):  # position closed
            print jobid, "match ending"
        elif re.search(u'您查看的职位已过期', res.text):  # position expired
            print jobid, "match timeout"
        else:
            print "saving %s ..." % jobid
            self.page_store.save(int(time.time()), jobid, url, res.text)
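
Note that the URL template shards by the first three digits of the job id, e.g.:

url = "http://job.liepin.com/{}_{}/".format("3362420"[:3], "3362420")
# -> http://job.liepin.com/336_3362420/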
Example No. 8
 def __init__(self, thread_cnt):
     super(LiepinSpider, self).__init__(thread_cnt)
     self.page_store = PageStoreLP()
     self.speed_control_requests = SpeedControlRequests()
     self.page_store.testmode = False
Example No. 9
 def __init__(self, thread_cnt):
     super(JobWealinkSpider, self).__init__(thread_cnt)
     self.page_store = PageStoreWL()
     self.speed_control_request = SpeedControlRequests()
     self.page_store.testmode = False
Example No. 10
 def __init__(self, thread_cnt):
     super(Jd58Spider, self).__init__(thread_cnt)
     self.page_store = Jd58PageStore()
     self.speed_control_request = SpeedControlRequests()
     self.page_store.testmode = False
     self.get_latest = None
Example No. 11
 def __init__(self, thread_cnt, latest_type=None):
     super(LinkedInSpider, self).__init__(thread_cnt)
     self.page_store = PageStoreLinkedIn()
     self.speed_control_requests = SpeedControlRequests()
     self.latest_type = latest_type
     self.page_store.testmode = False
Example No. 12
class BaiduSchoolSpider(Spider):
    def __init__(self, threadcnt):
        super(BaiduSchoolSpider, self).__init__(threadcnt)
        self.speed_control_requests = SpeedControlRequests()
        self.wb = openpyxl.Workbook()
        self.ws = self.wb.active
        self.sheet_list = [
            u'高校', u'院校分类', u'办学性质', '211', '985', u'研究生院', u'院校隶属', u'办学类型',
            u'学历层次', u'标签'
        ]
        self.ws.append(self.sheet_list)

    def get_ids(self):
        url = 'http://baike.baidu.com/wikitag/api/getlemmas'
        form_data = {
            'limit': 30,
            'timeout': 3000,
            'filterTags': [0, 0, 0, 0, 0, 0, 0],
            'tagId': 60829,
            'fromLemma': 'false',
            'contentLength': 40,
            'page': 0
        }
        total_page = 81
        while form_data['page'] <= total_page:
            res = self.request_url(url, data=form_data)
            form_data['page'] += 1
            json_resp = json.loads(res.text)
            for item in json_resp['lemmaList']:
                lis = item['lemmaUrl'].split('/')
                # 6 means url format is http://baike.baidu.com/subview/d1/d2.htm
                if len(lis) == 6:
                    id_lis = [str(lis[4]), str(lis[5].split('.')[0])]
                    yield {'id': id_lis}
                else:
                    yield {'id': str(lis[4].split('.')[0])}

    def dispatch(self):
        for jobid in self.get_ids():
            self.add_main_job(jobid)
        self.wait_q()
        self.add_main_job(None)

    def get_info(self, content):
        raw_info = {
            u'高校': u'不确定',
            u'院校分类': u'不确定',
            u'办学性质': u'不确定',
            '211': u'否',
            '985': u'否',
            u'研究生院': u'否',
            u'院校隶属': u'不确定',
            u'办学类型': u'不确定',
            u'学历层次': u'不确定',
            u'标签': u'不确定'
        }
        doc = html.fromstring(content)
        tag_list = doc.xpath(
            '//*[@id="open-tag-item"]/span/a/text()|//*[@id="open-tag-item"]/span/text()'
        )
        tag_list = [i.strip() for i in tag_list]
        tag = ' ' + ' '.join(tag_list) + ' '
        raw_info[u'标签'] = tag
        if u'211高校' in tag:
            raw_info['211'] = u'是'
        if u'985高校' in tag:
            raw_info['985'] = u'是'
        if u'研究生院高校' in tag:
            raw_info[u'研究生院'] = u'是'

        gaoxiao = doc.xpath('//h1/text()')
        if gaoxiao:
            raw_info[u'高校'] = gaoxiao[0]

        fenlei = re.findall(ur'\s([\u4e00-\u9fa5]*?类)高校\s', tag)
        if fenlei:
            raw_info[u'院校分类'] = fenlei[0]
        xingzhi = re.findall(ur'\s([\u4e00-\u9fa5]*?办)高校\s', tag)
        if xingzhi:
            raw_info[u'办学性质'] = xingzhi[0]

        lishu = re.findall(ur'\s([\u4e00-\u9fa5]*?)隶属高校\s', tag)
        if lishu:
            raw_info[u'院校隶属'] = lishu[0]
        elif u'地方所属高校' in tag:
            raw_info[u'院校隶属'] = u'地方所属'

        if u'本科' in tag:
            raw_info[u'学历层次'] = u'本科'
        else:
            raw_info[u'学历层次'] = u'专科'

        if u' 大学 ' in tag:
            raw_info[u'办学类型'] = u'大学'
        elif u' 学院 ' in tag:
            raw_info[u'办学类型'] = u'学院'
        elif u'高等专科院校' in tag:
            raw_info[u'办学类型'] = u'高等专科院校'
        elif u'高等职业技术院校' in tag:
            raw_info[u'办学类型'] = u'高等职业技术院校'
        elif u'独立学院' in tag:
            raw_info[u'办学类型'] = u'独立学院'
        elif u'成人高等院校' in tag:
            raw_info[u'办学类型'] = u'成人高等院校'
        elif u'短期职业大学' in tag:
            raw_info[u'办学类型'] = u'短期职业大学'
        elif u'管理干部学院' in tag:
            raw_info[u'办学类型'] = u'管理干部学院'
        elif u'教育学院' in tag:
            raw_info[u'办学类型'] = u'教育学院'
        elif u'高等学校分校' in tag:
            raw_info[u'办学类型'] = u'高等学校分校'
        else:
            raw_info[u'办学类型'] = u'其他'

        new_list = [raw_info[i] for i in self.sheet_list]
        self.ws.append(new_list)

    def run_job(self, jobid):
        if isinstance(jobid['id'], list):
            url = 'http://baike.baidu.com/subview/{}/{}.htm'.format(
                jobid['id'][0], jobid['id'][1])
            res = self.speed_control_requests.with_sleep_requests(url, 0.1)
            jobid_str = '&'.join(jobid['id'])
            if res is not None:
                print "saving %s ..." % jobid_str
                self.get_info(res.text)
                # self.page_store.save(int(time.time()), jobid_str, url, res.text)
            else:
                print "%d failed, sleeping 10 secs." % jobid_str
                time.sleep(2)
                self.add_job(jobid)
        elif isinstance(jobid['id'], str):
            url = 'http://baike.baidu.com/view/{}.htm'.format(jobid['id'])
            res = self.speed_control_requests.with_sleep_requests(url, 0.1)
            if res is not None:
                print "saving %s ..." % jobid['id']
                self.get_info(res.text)
                # self.page_store.save(int(time.time()), jobid['id'], url, res.text)
            else:
                print "%d failed, sleeping 10 secs." % jobid['id']
                time.sleep(2)
                self.add_job(jobid)
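
A sketch of driving the full spider and persisting its workbook; the run() entry point is an assumption about the Spider base class:

if __name__ == '__main__':
    s = BaiduSchoolSpider(5)          # assumed thread count
    s.run()                           # assumed Spider base-class entry point
    s.wb.save('baidu_schools.xlsx')   # write the rows appended by get_info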
Example No. 13
class CV58Spider(Spider):
    def __init__(self, thread_cnt):
        super(CV58Spider, self).__init__(thread_cnt)
        self.page_store = CV58PageStore()
        self.speed_control_request = SpeedControlRequests()
        self.page_store.testmode = False
        self.get_latest = 3

    def real_dispatch(self):
        for url in urls:
            for ind in inds:
                i = 1
                while 1:
                    realUrl = "{}qz{}/pn{}".format(url, ind, i)
                    if self.get_latest:
                        l_time = TimeHandler.getTimeOfNDayBefore(
                            self.get_latest) / 1000
                        l_time_local = time.localtime(l_time)
                        l_time_str = '%04d%02d%02d' % (
                            l_time_local[0], l_time_local[1], l_time_local[2])

                        h_time_local = time.localtime(time.time())
                        h_time_str = '%04d%02d%02d' % (
                            h_time_local[0], h_time_local[1], h_time_local[2])

                        realUrl += "?postdate={}000000_{}000000".format(
                            l_time_str, h_time_str)

                    # self.add_main_job({"urlpart": realUrl,  "type":"loadPage"})
                    has_next = self.parse_html(realUrl)
                    if not has_next:
                        break
                    i += 1

    def parse_html(self, url):
        res = self.speed_control_request.with_sleep_requests(url, sleep=0.05)
        if res is None:
            return True  # fetch failed; keep paging rather than ending this category
        els = re.findall(r'resume/(\d+)', res.text)

        els = set(els)

        if not els:
            return False
        for el in els:
            self.add_main_job({"jobid": el})

        return True

    def dispatch(self):
        self.real_dispatch()
        self.wait_q()
        self.add_main_job(None)

    def run_job(self, jobid):

        url = "http://jianli.m.58.com/resume/{}/".format(jobid['jobid'])
        if not self.page_store.check_should_fetch(jobid['jobid']):
            return

        res = self.speed_control_request.with_sleep_requests(url, sleep=0.2)
        if res is not None:
            self.page_store.save(int(time.time()), jobid['jobid'], url,
                                 res.text)
        else:
            self.re_add_job(jobid)
            spider.util.Log.error(("failed get url", url))

    def event_handler(self, evt, msg, **kwargs):
        if "START" == evt:
            spider.util.sendmail(["<*****@*****.**>"], "58 jd爬取", msg)
            return

        if "DONE" == evt:
            spider.util.sendmail(["<*****@*****.**>"], "58 jd爬取", msg)
            return
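
Here parse_html doubles as the pagination test: real_dispatch keeps incrementing pn until it returns False, i.e. until a listing page yields no resume ids.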
Example No. 14
class LatestLagouSpider(Spider):
    def __init__(self, thread_cnt):
        super(LatestLagouSpider, self).__init__(thread_cnt)
        self.page_store = PageStoreLG()
        self.speed_control_requests = SpeedControlRequests()

        self.page_store.testmode = False

    def getIds(self, q):
        url = "http://www.lagou.com/jobs/positionAjax.json"
        hasNext = True
        pageIndex = 0
        total_num = 100
        while hasNext and pageIndex <= total_num:
            pageIndex += 1
            q["pn"] = pageIndex
            res = self.request_url(url, data=q)
            json_resp = json.loads(res.text)
            if "content" in json_resp and "positionResult" in json_resp["content"] \
                and "result" in json_resp["content"]["positionResult"]:

                # if pageIndex == 1:
                #     total_num = json_resp["content"]["totalPageCount"]

                if not json_resp["content"]["positionResult"]["result"]:
                    hasNext = False
                else:
                    for item in json_resp["content"]["positionResult"][
                            "result"]:
                        create_time = item['createTimeSort']
                        # anything posted before yesterday ends this query's scan
                        if TimeHandler.isBeforeNDay(create_time, 2):
                            yield item["positionId"]
                            break
                        yield item["positionId"]

    def dispatch(self):
        self.bs = BinSaver('joblagou.bin')
        for query in q:
            try:
                for jobid in self.getIds(query):
                    if isinstance(jobid, int):
                        jobid = str(jobid)
                    self.add_main_job(jobid)
            except Exception as e:
                continue
        self.wait_q()
        self.add_main_job(None)

    def run_job(self, jobid):
        if not self.page_store.check_should_fetch(jobid):
            return
        url = "http://www.lagou.com/jobs/{}.html".format(jobid)
        res = self.speed_control_requests.with_sleep_requests(url, sleep=0.1)
        if res is None:
            self.re_add_job(jobid)
            Log.error("failed get url", url)
            return
        if htmlfind.findTag(res.text, 'div', 'position_del'):
            print "jobid: {} match nothing".format(jobid)
            return
        self.page_store.save(int(time.time()), jobid, url, res.text)

    def event_handler(self, evt, msg, **kwargs):

        if "DONE" == evt:
            spider.util.sendmail(["<*****@*****.**>"], "lagou jd爬取", msg +
                                 '\nsaved: %d' % self.page_store.saved_count)
            return
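
The q iterated in dispatch is defined outside the snippet; each entry is a form-data dict for positionAjax.json in which getIds only ever sets the page number, e.g. (hypothetical fields apart from "pn"):

q = [{"kd": u"python", "pn": 1}]  # "kd" (keyword) is a guess; only "pn" appears in the snippet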