예제 #1
0
파일: cv_etl.py 프로젝트: haogods/etl_task
def run(q, channel, _type):

    if _type in ['raw', 'check']:
        handler = ETLRunnerFromRaw(channel)
    else:
        handler = ETLRunner(channel)
    sucess_cnt = 0
    fail_cnt = 0

    fail_save_file_name = 'result/%s_fail_ids_%d.txt' % (channel, os.getpid())
    fail_save_file = FileSave(fail_save_file_name)

    result_file_name = 'result/%s_statistics_%d.txt' % (channel, os.getpid())
    result_file = FileSave(result_file_name)

    while 1:
        job = get_job(q, _type)
        if job is None:
            break
        try:
            handler.run(job)
            sucess_cnt += 1
            print >> result_file.fd, "%s" % (job['indexUrl'])
            if sucess_cnt % 1000 == 0:
                print "process %d, time: %s, success copied: %d, " \
                      "fail copied: %d, fail_save_file: %s, result_file: %s" % (os.getpid(),
                                                                                time.ctime(),
                                                                                sucess_cnt,
                                                                                fail_cnt,
                                                                                fail_save_file_name,
                                                                                result_file_name)
        except Exception as e:
            traceback.print_exc()
            fail_cnt += 1
            fail_save_file.append_end_with(job['indexUrl'])
예제 #2
0
class GetCVJobList(FixLoc):
    """Write every ``jobList`` entry of each CV job to ``cv_jobLists.json``."""

    def __init__(self, thread_cnt):
        """Initialise the base task and open the output file.

        :param thread_cnt: forwarded to ``FixLoc.__init__``.
        """
        FixLoc.__init__(self, thread_cnt)
        self._save_file = FileSave('cv_jobLists.json')

    def _load_data(self):
        """Nothing to preload for this task."""
        pass

    def run_job(self, job):
        """Save each measured job-list entry of one CV.

        Every entry of ``job['measure']['jobList']`` is tagged with the CV id
        and with the ``incName`` of the raw entry at the same position, has
        its ``incDesc`` removed, and is appended to the save file.

        :param job: dict with ``'measure'`` and ``'raw'`` documents; falsy
            jobs are ignored.
        """
        if not job:
            return
        cvId = job['measure']['cvId']
        try:
            measure_doc = job['measure']
            raw_doc = job['raw']

            for index, jobItem in enumerate(measure_doc['jobList']):
                jobItem.update({'cvId': cvId})
                jobItem.update({"incName": raw_doc['jobList'][index].get('incName')})
                # An entry without 'incDesc' must not abort the remaining
                # entries of this CV, so drop the key only when present.
                jobItem.pop('incDesc', None)
                self._save_file.append_end_with(jobItem)
                self._inc_update_num()

            print("SUCESS COPY: %s" % cvId)
        except Exception:
            traceback.print_exc()
            print("Fail copy:  %s" % cvId)
예제 #3
0
    def __init__(self, thread_cnt):
        """Initialise counters, their locks and the output file.

        :param thread_cnt: forwarded to ``FixLoc.__init__``.
        """
        FixLoc.__init__(self, thread_cnt)

        # Both tallies start from zero.
        self.empty_address_cnt = self.fail_measure_cnt = 0

        self._save_file = FileSave('not_measure_addresses.json')

        # One re-entrant lock per tally.
        self.empty_lock, self.fail_measure_lock = threading.RLock(), threading.RLock()
예제 #4
0
    def __init__(self, owner, queue_size, thread_cnt):
        """Set up per-owner stores, the thrift measure client and output files.

        :param owner: channel name; also used to pick the stores and to build
            the page-store collection name.
        :param queue_size: forwarded to ``BaseTask.__init__``.
        :param thread_cnt: forwarded to ``BaseTask.__init__``.
        """
        BaseTask.__init__(self, owner, queue_size, thread_cnt)

        # Plain in-memory state.
        self.owner = owner
        self.process_item = dict()
        self.expired_items = list()

        # Stores, created in the same order as before.
        self._raw_store = self._get_raw_store(owner)
        self._measure_store = self._get_measure_store(owner)
        self._page_store = PageStore(owner)
        self.page_store_db = 'admin'
        self.page_store_coll = "page_store_{}".format(owner)

        # Thrift client and the measure service exposed by it.
        thrift = TClient('../../conf/thrift_conf.ini')
        self.thrift_client = thrift
        self.jd_measure_client = thrift.jd_measure_server_client

        self.test_mode = False

        # Output files configured on BaseTask.PathConfig.
        self.rs_file = FileSave(BaseTask.PathConfig.result_file)
        self.failfile = FileSave(BaseTask.PathConfig.etl_failids_file)
예제 #5
0
class GetNotMeasureAddress(FixLoc):
    """Collect CV addresses that ``measure_loc`` could not measure.

    Unmeasured addresses are appended to ``not_measure_addresses.json``;
    counters for empty and unmeasured addresses are written at the end.
    """

    def __init__(self, thread_cnt):
        """Initialise counters, their locks and the output file.

        :param thread_cnt: forwarded to ``FixLoc.__init__``.
        """
        FixLoc.__init__(self, thread_cnt)

        self.empty_address_cnt = 0
        self.fail_measure_cnt = 0
        self._save_file = FileSave('not_measure_addresses.json')
        self.empty_lock = threading.RLock()
        self.fail_measure_lock = threading.RLock()

    def _inc_empty_address_cnt(self):
        """Increment the empty-address counter under its lock."""
        with self.empty_lock:
            self.empty_address_cnt += 1

    def _inc_fail_measure_cnt(self):
        """Increment the failed-measure counter under its lock."""
        # ``+=`` on an attribute is a read-modify-write; guard it the same way
        # as the empty-address counter so concurrent workers lose no updates.
        with self.fail_measure_lock:
            self.fail_measure_cnt += 1

    def run_job(self, job):
        """Measure the current address of one CV and record failures.

        :param job: dict with ``'measure'`` and ``'raw'`` documents; falsy
            jobs are ignored.
        """
        if not job:
            return
        cvId = job['measure']['cvId']

        measure_doc = job['measure']
        raw_doc = job['raw']
        address = raw_doc['baseInfo']['nowAddress']

        if not address:
            self._inc_empty_address_cnt()

        try:
            base_info = measure_doc['baseInfo']
            if 'nowAddress' in base_info and address:
                measured = self.measure_loc(address)
                base_info['nowAddress'] = measured
                if measured:
                    self._inc_update_num()
                else:
                    # ``address`` is non-empty here and the measure result is
                    # falsy, so this is always a failed measurement.
                    tmp = {'cvId': cvId, 'nowAddress_raw': address, 'nowAddress_measure': measured}
                    self._save_file.append_end_with(tmp)
                    self._inc_fail_measure_cnt()

            print("SUCESS COPY: %s" % cvId)
        except Exception:
            traceback.print_exc()
            print("Fail copy:  %s" % cvId)

    def end_operation(self, *args, **kwargs):
        """Append the three summary counters to the save file."""
        self._save_file.append_end_with('empty address cnt: %s' % self.empty_address_cnt)
        self._save_file.append_end_with('fail measure cnt: %s' % self.fail_measure_cnt)
        self._save_file.append_end_with('measure success cnt: %s' % self.update_cnt)
예제 #6
0
    def __init__(self, thread_cnt):
        """Create the stores, Mongo/thrift clients and the fail-id file.

        :param thread_cnt: forwarded to ``BaseTask.__init__`` as ``thread_cnt``.
        """
        BaseTask.__init__(self, '2c_import', thread_cnt=thread_cnt)

        self.process_items = list()

        # File names configured on BaseTask.PathConfig.
        path_cfg = BaseTask.PathConfig
        self.fail_fn = path_cfg.toc_failids_file
        self.rs_fn = path_cfg.result_file

        # One store per source channel plus the 2C target store.
        self.lgstore = LgEtlStore()
        self.job51store = Job51EtlStore()
        self.zlstore = ZLEtlStore()
        self.tocstore = ToCMeasureStore()

        self.thrift_client = TClient('../../conf/thrift_conf.ini')

        # Mongo connections; the URLs are read from the lagou store's cmgClient.
        stats_url = self.lgstore.cmgClient.inc_stats2_mongo_url
        self.inc_stats2_client = pymongo.MongoClient(stats_url)
        salary_url = self.lgstore.cmgClient.zhineng_salary5_charts_mongo_url
        self.zhineng_salary5_charts_client = pymongo.MongoClient(salary_url)

        # Service handles exposed by the thrift client.
        self.edu_info_client = self.thrift_client.edu_info_client
        self.inc_info_clients = self.thrift_client.inc_idinfo_client

        self.toc_failids_fd = FileSave(path_cfg.toc_failids_file)
예제 #7
0
class ETLTask(BaseTask):
    def __init__(self, owner, queue_size, thread_cnt):
        BaseTask.__init__(self, owner, queue_size, thread_cnt)
        self.owner = owner
        self.process_item = {}
        self.expired_items = []
        self._raw_store = self._get_raw_store(owner)
        self._measure_store = self._get_measure_store(owner)
        self._page_store = PageStore(owner)
        self.page_store_db = 'admin'
        self.page_store_coll = "page_store_{}".format(owner)

        self.thrift_client = TClient('../../conf/thrift_conf.ini')
        self.jd_measure_client = self.thrift_client.jd_measure_server_client

        self.test_mode = False

        self.rs_file = FileSave(BaseTask.PathConfig.result_file)
        self.failfile = FileSave(BaseTask.PathConfig.etl_failids_file)

    def _get_raw_store(self, owner):
        if "jd_lagou" == owner:
            return LgEtlStore('raw')
        if "jd_51job" == owner:
            return Job51EtlStore('raw')
        if "jd_zhilian" == owner:
            return ZLEtlStore('raw')

        raise Exception(" unknown owner ")

    def _get_measure_store(self, owner):
        if "jd_lagou" == owner:
            return LgEtlStore('measure')
        if "jd_51job" == owner:
            return Job51EtlStore('measure')
        if "jd_zhilian" == owner:
            return ZLEtlStore('measure')

        raise Exception(" unknown owner ")

    def fill_data_with_flag(self, indexUrl, realUrl, contentSign, updateTime, filePath, flag):

        if indexUrl in self.process_item:
            if self.process_item[indexUrl]['updateTime'] < updateTime:
                self.process_item[indexUrl]['updateTime'] = updateTime
                self.process_item[indexUrl]['contentSign'] = contentSign
                self.process_item[indexUrl]['realUrl'] = realUrl
                self.process_item[indexUrl]['flag'] = flag
                self.process_item[indexUrl]['filePath'] = filePath

        else:
            self.process_item[indexUrl] = {
                'updateTime': updateTime,
                'contentSign': contentSign,
                'realUrl': realUrl,
                'filePath': filePath,
                'flag': flag,
            }

    def check(self, item):

        status = item.get('status', 0)
        isUpdated = item.get('isUpdated', 0)
        updateTime = item.get('updateTime')
        indexUrl = item.get('indexUrl')
        contentSign = item.get('contentSign')
        file_path = item.get('pageContentPath')
        jdUrl = item.get('realUrl')
        expired = item.get('isExpired', 0)

        if status == 0:
            self.fill_data_with_flag(indexUrl, jdUrl, contentSign, updateTime, file_path, 0)
        if status == 1 and isUpdated == 1:
            self.fill_data_with_flag(indexUrl, jdUrl, contentSign, updateTime, file_path, 1)
        if expired == 1:
            self.fill_data_with_flag(indexUrl, jdUrl, contentSign, updateTime,  file_path, 2)

    def _load_data(self):
        page_client = self._raw_store.cmgClient.page_store_mongo_client
        for item in page_client[self.page_store_db][self.page_store_coll].find():
            self.check(item)
            if len(self.process_item) and len(self.process_item) % 10000 == 0:
                print "load {} items".format(len(self.process_item))

        print "finish load {} items".format(len(self.process_item))
        print "finish load {} expired items".format(len(self.expired_items))
        # print "======================start dump ids to files================="
        # fd = FileSave('../data/results.txt')
        # for indexUrl in self.process_item:
        #     fd.append_end_with(indexUrl)
        #
        # print "=======================dump finish============================="

    def dispatcher(self, q):

        for item in self.process_item:
            q.put(item)

        for item in self.expired_items:
            q.put(item)

        q.put(None)
        self.wait_q()

    def getPageContent(self, filename):
        parts = filename.split("::")
        if len(parts) == 3:
            binReader = BinReader(parts[1])
            _, content = binReader.readone_at(int(parts[2]))
            if len(content) == 0:
                raise Exception("file name:{} , content error".format(filename))
            return content

        if len(parts) == 1:
            with open(filename) as f:
                content = f.read()
                if len(content) == 0:
                    raise Exception("file name:{} , content error".format(filename))
                return content

    def update_jd(self, item):
        jd_store_key = {'jdId': item}
        page_store_key = {'indexUrl': item, 'contentSign': self.process_item[item].get('contentSign')}
        updateTime = self.process_item[item].get("updateTime")
        strTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(updateTime/1000))

        jkey = json.dumps(jd_store_key)
        jdoc = json.dumps({"$set": {"pubDate": strTime}})
        jdoc_m = json.dumps({"$set": {"pubDateStamp": updateTime}})

        if self.test_mode:
            print " Update ===> jdkey: {} \n jdoc: {} \n".format(jkey, jdoc)
            return

        if not self._raw_store.save_one(jkey, jdoc, False):
            raise Exception("update jd_raw pubTime Exception")
        if not self._measure_store.save_one(jkey, jdoc_m, False):
            raise Exception("update jd_measure pubTime Exception")
        if not self._page_store.save_one(json.dumps(page_store_key), json.dumps({"$set": {"isUpdated": 0}}), False):
            raise Exception("set page store isUpdated status Exception")

    def check_is_student_job(self, jd_raw):
        if jd_raw.jobWorkAge in [u"在读学生", u"应届毕业生", u"无经验", u"无要求"]\
                or jd_raw.jobCate in [u"应届毕业生", u"储备干部", u"培训生", u"兼职", u"临时", u"实习生"]\
                or jd_raw.jobType in [u"兼职", u"实习"]:
            print "student job ===> ", jd_raw.jdId

            expired = 0
            if self.process_item[jd_raw.jdId]['flag'] == 2:
                expired = 1

            self.rs_file.append_end_with('\t'.join((jd_raw.jdId, str(expired))))
            return True
        return False

    def parse_measure_jd(self, item):
        try:
            jd_store_key = json.dumps({'jdId': item})
            page_store_key = json.dumps({'indexUrl': item, 'contentSign': self.process_item[item].get('contentSign')})
            fileName = self.process_item[item].get("filePath")
            pageContent = self.getPageContent(fileName)

            jd_raw = self.parse_by_owner(item, pageContent, self.owner)
            jd_measure = self.measure(jd_raw)

            if self.test_mode:
                print "Raw ===> {}".format(jd_raw.to_json())
                print "Measure ===> {}".format(jd_measure.to_json())
                return

            if not self._raw_store.save_one(jd_store_key, jd_raw.to_json(), True):
                raise Exception("set raw exception")

            if not self._measure_store.save_one(jd_store_key, jd_measure.to_json(), True):
                raise Exception("set measure exception")

            if not self._page_store.save_one(page_store_key, json.dumps({"$set": {"status": 1}}), False):
                raise Exception("set page store status exception")
            self.check_is_student_job(jd_raw)
        except Exception as e:
            self.failfile.append_end_with(item)
            raise e

    def parse_by_owner(self, jdId, pageContent, owner):
        if "jd_lagou" == owner:
            jdRaw = JdLagouHtmlFind(pageContent).find_fields()
        elif "jd_51job" == owner:
            jdRaw = Jd51JobHtmlFind(pageContent).find_fields()
        elif "jd_zhilian" == owner:
            jdRaw = JdZhilianHtmlFind(pageContent).find_fields()
        else:
            raise Exception("unknown owner")

        jdRaw.jdId = jdId
        jdRaw.jdFrom = self.owner
        jdRaw.jdUrl = self.process_item[jdId].get('realUrl')

        if not jdRaw.pubDate:
            stamp = self.process_item[jdId].get("updateTime")
            str_uptime = time.strftime("%Y-%m-%d %H:%m:%S", time.localtime(stamp/1000))
            jdRaw.pubDate = str_uptime

        return jdRaw

    def measure(self, jd_raw):
        raw_for_measure = construct_jd_raw(jd_raw)
        jd_measure = self.jd_measure_client.measureJd(raw_for_measure)
        jdMeasureObj = convertToJdMeasure(self.owner, jd_measure)

        self.set_md5_SimHash(jdMeasureObj, jd_raw)

        return jdMeasureObj

    def set_md5_SimHash(self, jd_measure, jd_raw):
        jd_measure.jdMd5 = self.get_jd_md5(jd_measure, jd_raw)
        jd_measure.jdSimHash = gen_sim_hash(jd_raw.jobDescription)

    def get_jd_md5(self, jd_measure, jd_raw):

        temp = dict({})
        temp["jdPosition"] = jd_raw.jobPosition
        temp["incName"] = jd_measure.incSegmentId
        temp["jdWorkLoc"] = jd_measure.jobWorkLocId

        return get_jd_measure_hash(temp)

    def event_handler(self, evt, msg, **kwargs):
        if "START" == evt:
            util.send_email(["<*****@*****.**>"], "{}_etl 任务".format(self.owner), msg)
            return

        if "DONE" == evt:
            util.send_email(["<*****@*****.**>"], "{}_etl 任务".format(self.owner), msg)
            return
예제 #8
0
class TCImportTask(BaseTask):
    def __init__(self, thread_cnt):
        BaseTask.__init__(self, '2c_import', thread_cnt=thread_cnt)
        self.process_items = []
        self.fail_fn = BaseTask.PathConfig.toc_failids_file
        self.rs_fn = BaseTask.PathConfig.result_file
        self.lgstore = LgEtlStore()
        self.job51store = Job51EtlStore()
        self.zlstore = ZLEtlStore()
        self.tocstore = ToCMeasureStore()

        self.thrift_client = TClient('../../conf/thrift_conf.ini')
        self.inc_stats2_client = pymongo.MongoClient(self.lgstore.cmgClient.inc_stats2_mongo_url)
        self.zhineng_salary5_charts_client = pymongo.MongoClient(self.lgstore.cmgClient.zhineng_salary5_charts_mongo_url)

        self.edu_info_client = self.thrift_client.edu_info_client
        self.inc_info_clients = self.thrift_client.inc_idinfo_client

        self.toc_failids_fd = FileSave(BaseTask.PathConfig.toc_failids_file)

    def _load_data(self):
        with open(self.rs_fn) as f:
            for line in f:
                if not line:
                    continue
                index, expired = line.split('\t')
                self.process_items.append({"index":index, "expired": int(expired)})

    def dispatcher(self):
        for item in self.process_items:
            self._queue.put(item)

        self._queue.put(None)
        self.wait_q()

    def get_jd_raw(self, index):
        channel = index.split("://")[0]
        if channel == 'jd_lagou':
            return self.lgstore.get_raw(index)
        elif channel == 'jd_51job':
            return self.job51store.get_raw(index)
        elif channel == 'jd_zhilian':
            return self.zlstore.get_raw(index)

    def get_jd_measure(self, index):
        channel = index.split("://")[0]
        if channel == 'jd_lagou':
            return self.lgstore.get_measure(index)
        elif channel == 'jd_51job':
            return self.job51store.get_measure(index)
        elif channel == 'jd_zhilian':
            return self.zlstore.get_measure(index)

    def constructMobileJdUrl(self, jdId, jdFrom):
        positionId = jdId.split("://")[1]
        if "jd_lagou" == jdFrom:
            return "http://www.lagou.com/center/jobs_{}.html?m=1".format(positionId)
        if "jd_51job" == jdFrom:
            return "http://m.51job.com/search/jobdetail.php?jobid={}".format(positionId)
        if "jd_zhilian" == jdFrom:
            part1 = positionId[:9]
            part2 = positionId[9:]
            realPositionId = "cc{}j90{}000".format(part1, part2)
            return "http://m.zhaopin.com/jobs/{}".format(realPositionId)

    def get_major_ids(self, major):
        # major = u"计算机科学与技术"
        flag = 1
        try:
            bk_major = self.thrift_client.edu_info_client.findBkMajorId("", major)
        except Exception as e:
            bk_major = ""
        try:
            zk_major = self.thrift_client.edu_info_client.findZkMajorId("", major)
        except Exception as e:
            zk_major = ""

        ids = [bk_major, zk_major]

        if len(bk_major) <= 0 and len(zk_major) <= 0:
            flag = 2
        else:
            print "======================= ids: ", ids

        return ids, flag

    def get_salary_ratio(self, jobCate):
        r = -1
        if len(jobCate) <= 0:
            return r
        q = self.zhineng_salary5_charts_client["zhineng_stats_v2"]["zhineng_salary5_charts"].\
            find_one({"zhineng_id": jobCate})
        if q and "greater" in q:
            r = q.get("greater")
        return r

    def get_total_ind_rank(self, incSegmentId):
        tagList=[]
        total_ind_rank_postion = -1
        incId = self.get_inc_id(incSegmentId)

        if not incId:
            return "", tagList, total_ind_rank_postion

        q = self.inc_stats2_client["inc_stats_v6"]["inc"].find_one({"_id":ObjectId(incId)})
        if q:
            if "tag_list" in q:
                tagList = q.get("tag_list")

            if "rank_info" in q and "total_ind_rank_postion" in q["rank_info"]:
                total_ind_rank_postion = q["rank_info"]["total_ind_rank_postion"]

        return incId, tagList, total_ind_rank_postion

    def get_inc_id(self, incSegmentId):
        try:
            idinfo = self.inc_info_clients.queryIncId(ServiceAccessToken(), "", incSegmentId)
        except Exception as e:
            print e
            raise e
        return idinfo

    def rebuild(self, raw, measure):
        tocmeasure = toCMeasurePageModel()

        tocmeasure.jdId = raw["jdId"]
        tocmeasure.jdUrl = raw["jdUrl"]
        tocmeasure.mobileJdUrl = self.constructMobileJdUrl(raw["jdId"], raw["jdFrom"])
        tocmeasure.channel = raw["jdFrom"]
        tocmeasure.jd_content_hash = measure["jdSimHash"]
        tocmeasure.jd_measure_hash = measure["jdMd5"]
        tocmeasure.publishTime = measure["pubDateStamp"]
        tocmeasure.isExpired = 0

        tocmeasure.jobDiploma = raw["jobDiploma"]
        tocmeasure.jobDiplomaId = measure["jobDiplomaId"]
        tocmeasure.jobPosition = raw["jobPosition"]
        tocmeasure.jobWorkLoc = raw["jobWorkLoc"]
        tocmeasure.jobSalaryMin = measure["jobSalaryMin"]
        tocmeasure.jobSalaryMax = measure["jobSalaryMax"]
        tocmeasure.jobWorkLocId = str(measure["jobWorkLocId"])
        tocmeasure.jobWorkLoc = raw["jobWorkLoc"]
        tocmeasure.jobWorkAgeMin = measure["jobWorkAgeMin"]
        tocmeasure.jobWorkAgeMax = measure["jobWorkAgeMax"]
        tocmeasure.jobCate = measure["jobCate"]
        tocmeasure.jobType = raw["jobType"]
        tocmeasure.major = measure["jobMajor"]
        tocmeasure.jobDescription = raw["jobDescription"]

        # majorIds, majorIdsFlag
        if measure["jobMajor"] == u"专业不限":
            tocmeasure.majorIds = []
            tocmeasure.majorIdsFlag = 0 #专业不限
        elif measure["jobMajor"] == "":
            tocmeasure.majorIds = []
            tocmeasure.majorIdsFlag = -1 #专业没有
        else:
            tocmeasure.majorIds, tocmeasure.majorIdsFlag = self.get_major_ids(measure["jobMajor"])

        tocmeasure.incName = raw["incName"]
        tocmeasure.incIntro = raw["incIntro"]
        tocmeasure.incIndustry = raw["incIndustry"]
        tocmeasure.incIndustryId = measure["incIndustryId"]
        tocmeasure.incType = measure["incType"]
        tocmeasure.incScaleMin = measure["incScaleMin"]
        tocmeasure.incScaleMax = measure["incScaleMax"]

        tocmeasure.salaryRatio = self.get_salary_ratio(measure["jobCate"])
        tocmeasure.incId, tocmeasure.tagList, tocmeasure.total_ind_rank_postion = self.get_total_ind_rank(measure["incSegmentId"])

        return tocmeasure

    def run_job(self, item):

        index = item.get('index')
        expired = item.get('expired')
        try:

            if expired:
                self.tocstore.set_expired({"jdId": index})
                print "complete set expired,  indexUrl: {}".format(index)
                return

            raw = self.get_jd_raw(index)

            measure = self.get_jd_measure(index)

            result = self.rebuild(raw, measure)
            key = json.dumps({"jdId": raw["jdId"]})

            doc = util.remove_empty_key(json.loads(result.to_json()), ['isExpired'])
            self.tocstore.save_one(key, json.dumps(doc), True)

            print "complete copy indexUrl: {}".format(index)
        except Exception as e:
            self.toc_failids_fd.append_end_with(index)
            print "failed copy indexUrl: {}".format(index)
            traceback.print_exc()
            raise e

    def event_handler(self, evt, msg, **kwargs):
        if "START" == evt:
            util.send_email(["<*****@*****.**>"], "2cimport 任务", msg)
            return

        if "DONE" == evt:
            util.send_email(["<*****@*****.**>"], "2cimport 任务", msg)
            return
예제 #9
0
 def __init__(self, thread_cnt):
     """Initialise the base task and create the output file handle.

     :param thread_cnt: forwarded unchanged to ``FixLoc.__init__``.
     """
     FixLoc.__init__(self,thread_cnt)
     # Target file handle for this task: 'cv_jobLists.json'.
     self._save_file = FileSave('cv_jobLists.json')