def run(q, channel, _type):
    """Run the ETL handler on every job fetched from ``q``.

    Jobs are fetched with ``get_job(q, _type)`` until it returns ``None``.
    For each successful job its ``indexUrl`` is written to a per-process
    statistics file; for each failing job the traceback is printed and its
    ``indexUrl`` is appended to a per-process fail-ids file. A progress line
    is printed after every 1000 successes.

    :param q: job source handed to ``get_job``.
    :param channel: channel name; passed to the handler and used as the
        prefix of both output file names.
    :param _type: run type; ``'raw'`` and ``'check'`` select
        ``ETLRunnerFromRaw``, anything else selects ``ETLRunner``.
    """
    handler = ETLRunnerFromRaw(channel) if _type in ['raw', 'check'] else ETLRunner(channel)

    ok_count = 0
    failed_count = 0

    # Both output files carry the pid in their name.
    pid = os.getpid()
    fail_save_file_name = 'result/%s_fail_ids_%d.txt' % (channel, pid)
    fail_save_file = FileSave(fail_save_file_name)
    result_file_name = 'result/%s_statistics_%d.txt' % (channel, pid)
    result_file = FileSave(result_file_name)

    while True:
        job = get_job(q, _type)
        if job is None:
            # None is the end-of-work marker.
            break
        try:
            handler.run(job)
            ok_count += 1
            print >> result_file.fd, "%s" % (job['indexUrl'])
            if ok_count % 1000 == 0:
                print("process %d, time: %s, success copied: %d, "
                      "fail copied: %d, fail_save_file: %s, result_file: %s"
                      % (pid, time.ctime(), ok_count, failed_count,
                         fail_save_file_name, result_file_name))
        except Exception:
            # A failing job is recorded and skipped; the loop keeps going.
            traceback.print_exc()
            failed_count += 1
            fail_save_file.append_end_with(job['indexUrl'])
class GetCVJobList(FixLoc):
    """Append every ``jobList`` entry of each processed CV to ``cv_jobLists.json``."""

    def __init__(self, thread_cnt):
        """Initialise the base task and open the output file.

        :param thread_cnt: forwarded unchanged to ``FixLoc.__init__``.
        """
        FixLoc.__init__(self, thread_cnt)
        self._save_file = FileSave('cv_jobLists.json')

    def _load_data(self):
        """Do nothing."""
        pass

    def run_job(self, job):
        """Save the measured job-list entries of one CV.

        Each entry of ``job['measure']['jobList']`` is tagged with the CV id,
        given the ``incName`` of the raw entry at the same position, stripped
        of ``incDesc`` and appended to the output file. Any exception is
        printed and reported as a failed copy; it is not re-raised.

        :param job: dict with ``'measure'`` and ``'raw'`` documents; falsy
            values are ignored.
        """
        if not job:
            return

        cvId = job['measure']['cvId']
        try:
            measure_doc = job['measure']
            raw_doc = job['raw']
            # NOTE(review): the original indentation was lost; the counter
            # increment is assumed to happen once per saved entry - confirm.
            for position, entry in enumerate(measure_doc['jobList']):
                entry['cvId'] = cvId
                entry['incName'] = raw_doc['jobList'][position].get('incName')
                del entry['incDesc']
                self._save_file.append_end_with(entry)
                self._inc_update_num()
            print("SUCESS COPY: %s" % cvId)
        except Exception:
            traceback.print_exc()
            print("Fail copy: %s" % cvId)
def __init__(self, thread_cnt):
    """Set up the counters, their locks and the output file.

    :param thread_cnt: forwarded unchanged to ``FixLoc.__init__``.
    """
    FixLoc.__init__(self, thread_cnt)

    # Re-entrant locks; going by their names each pairs with one of the
    # counters below.
    self.empty_lock = threading.RLock()
    self.fail_measure_lock = threading.RLock()

    self.empty_address_cnt = 0
    self.fail_measure_cnt = 0

    self._save_file = FileSave('not_measure_addresses.json')
def __init__(self, owner, queue_size, thread_cnt):
    """Create the stores, service clients and output files for ``owner``.

    :param owner: channel name; selects the raw/measure stores and the
        page-store collection.
    :param queue_size: forwarded to ``BaseTask.__init__``.
    :param thread_cnt: forwarded to ``BaseTask.__init__``.
    """
    BaseTask.__init__(self, owner, queue_size, thread_cnt)
    self.owner = owner

    # Plain state.
    self.test_mode = False
    self.process_item = {}
    self.expired_items = []
    self.page_store_db = 'admin'
    self.page_store_coll = "page_store_{}".format(owner)

    # Stores, created in the same relative order as before.
    self._raw_store = self._get_raw_store(owner)
    self._measure_store = self._get_measure_store(owner)
    self._page_store = PageStore(owner)

    # Thrift client and the measure service reached through it.
    thrift = TClient('../../conf/thrift_conf.ini')
    self.thrift_client = thrift
    self.jd_measure_client = thrift.jd_measure_server_client

    # Output files.
    self.rs_file = FileSave(BaseTask.PathConfig.result_file)
    self.failfile = FileSave(BaseTask.PathConfig.etl_failids_file)
class GetNotMeasureAddress(FixLoc):
    """Re-measure ``baseInfo.nowAddress`` and record the addresses left unmeasured.

    Records are appended to ``not_measure_addresses.json``; three summary
    counters are appended to the same file by ``end_operation``.
    """

    def __init__(self, thread_cnt):
        """Set up the counters, their locks and the output file.

        :param thread_cnt: forwarded unchanged to ``FixLoc.__init__``.
        """
        FixLoc.__init__(self, thread_cnt)
        self.empty_address_cnt = 0
        self.fail_measure_cnt = 0
        self._save_file = FileSave('not_measure_addresses.json')
        self.empty_lock = threading.RLock()
        self.fail_measure_lock = threading.RLock()

    def _inc_empty_address_cnt(self):
        """Increment ``empty_address_cnt`` while holding ``empty_lock``."""
        with self.empty_lock:
            self.empty_address_cnt += 1

    def _inc_fail_measure_cnt(self):
        """Increment ``fail_measure_cnt`` while holding ``fail_measure_lock``.

        The lock was created in ``__init__`` but never taken here, unlike the
        sibling counter; ``+=`` on an attribute is a read-modify-write and is
        not atomic across threads.
        """
        with self.fail_measure_lock:
            self.fail_measure_cnt += 1

    def run_job(self, job):
        """Measure the raw address of one CV and update the counters.

        Exceptions raised while measuring or saving are printed and reported
        as a failed copy; they are not re-raised.

        :param job: dict with ``'measure'`` and ``'raw'`` documents; falsy
            values are ignored.
        """
        if not job:
            return

        cvId = job['measure']['cvId']
        measure_doc = job['measure']
        raw_doc = job['raw']
        address = raw_doc['baseInfo']['nowAddress']
        if not address:
            self._inc_empty_address_cnt()

        try:
            base_info = measure_doc['baseInfo']
            # NOTE(review): the original indentation was lost; the three
            # checks below are assumed to be siblings - confirm.
            if 'nowAddress' in base_info and address:
                base_info['nowAddress'] = self.measure_loc(address)

            if base_info['nowAddress']:
                self._inc_update_num()
            else:
                tmp = {
                    'cvId': cvId,
                    'nowAddress_raw': address,
                    'nowAddress_measure': base_info['nowAddress'],
                }
                self._save_file.append_end_with(tmp)

            # A non-empty raw address that still has no measured value.
            if address and not base_info['nowAddress']:
                self._inc_fail_measure_cnt()
            print("SUCESS COPY: %s" % cvId)
        except Exception:
            traceback.print_exc()
            print("Fail copy: %s" % cvId)

    def end_operation(self, *args, **kwargs):
        """Append the three summary counters to the output file."""
        self._save_file.append_end_with('empty address cnt: %s' % self.empty_address_cnt)
        self._save_file.append_end_with('fail measure cnt: %s' % self.fail_measure_cnt)
        self._save_file.append_end_with('measure success cnt: %s' % self.update_cnt)
def __init__(self, thread_cnt):
    """Create the stores, service clients and fail-id file for the import.

    :param thread_cnt: forwarded to ``BaseTask.__init__`` as ``thread_cnt``.
    """
    BaseTask.__init__(self, '2c_import', thread_cnt=thread_cnt)

    # Plain state and file names.
    self.process_items = []
    self.fail_fn = BaseTask.PathConfig.toc_failids_file
    self.rs_fn = BaseTask.PathConfig.result_file

    # Per-channel stores plus the destination store.
    self.lgstore = LgEtlStore()
    self.job51store = Job51EtlStore()
    self.zlstore = ZLEtlStore()
    self.tocstore = ToCMeasureStore()

    self.thrift_client = TClient('../../conf/thrift_conf.ini')

    # Both Mongo URLs are read from the lagou store's cmgClient.
    cmg = self.lgstore.cmgClient
    self.inc_stats2_client = pymongo.MongoClient(cmg.inc_stats2_mongo_url)
    self.zhineng_salary5_charts_client = pymongo.MongoClient(
        cmg.zhineng_salary5_charts_mongo_url)

    # Services reached through the thrift client.
    self.edu_info_client = self.thrift_client.edu_info_client
    self.inc_info_clients = self.thrift_client.inc_idinfo_client

    self.toc_failids_fd = FileSave(BaseTask.PathConfig.toc_failids_file)
class ETLTask(BaseTask):
    """Parse the stored JD pages of one owner into raw and measure documents.

    ``_load_data`` scans the owner's page-store collection and keeps, per
    ``indexUrl``, the entry with the largest ``updateTime`` that needs work.
    Each kept entry carries a ``flag``: 0 for ``status == 0``, 1 for
    ``status == 1`` with ``isUpdated == 1``, 2 for ``isExpired == 1``.
    """

    # Single strftime pattern for every pubDate string produced here.
    _PUB_DATE_FORMAT = '%Y-%m-%d %H:%M:%S'

    def __init__(self, owner, queue_size, thread_cnt):
        """Create the stores, service clients and output files for ``owner``.

        :param owner: one of ``jd_lagou``, ``jd_51job``, ``jd_zhilian``.
        :param queue_size: forwarded to ``BaseTask.__init__``.
        :param thread_cnt: forwarded to ``BaseTask.__init__``.
        :raises Exception: if ``owner`` is not a known channel.
        """
        BaseTask.__init__(self, owner, queue_size, thread_cnt)
        self.owner = owner
        self.process_item = {}
        self.expired_items = []
        self._raw_store = self._get_raw_store(owner)
        self._measure_store = self._get_measure_store(owner)
        self._page_store = PageStore(owner)
        self.page_store_db = 'admin'
        self.page_store_coll = "page_store_{}".format(owner)
        self.thrift_client = TClient('../../conf/thrift_conf.ini')
        self.jd_measure_client = self.thrift_client.jd_measure_server_client
        self.test_mode = False
        self.rs_file = FileSave(BaseTask.PathConfig.result_file)
        self.failfile = FileSave(BaseTask.PathConfig.etl_failids_file)

    @staticmethod
    def _make_store(owner, kind):
        """Return the ETL store of ``owner`` opened for ``kind`` ('raw' or 'measure')."""
        if "jd_lagou" == owner:
            return LgEtlStore(kind)
        if "jd_51job" == owner:
            return Job51EtlStore(kind)
        if "jd_zhilian" == owner:
            return ZLEtlStore(kind)
        raise Exception(" unknown owner ")

    def _get_raw_store(self, owner):
        """Return the raw store for ``owner``; raise ``Exception`` if unknown."""
        return self._make_store(owner, 'raw')

    def _get_measure_store(self, owner):
        """Return the measure store for ``owner``; raise ``Exception`` if unknown."""
        return self._make_store(owner, 'measure')

    def fill_data_with_flag(self, indexUrl, realUrl, contentSign, updateTime, filePath, flag):
        """Remember a page to process, keeping only the newest entry per ``indexUrl``.

        An existing entry is overwritten only when its ``updateTime`` is
        strictly smaller than the new one.
        """
        if indexUrl not in self.process_item:
            self.process_item[indexUrl] = {
                'updateTime': updateTime,
                'contentSign': contentSign,
                'realUrl': realUrl,
                'filePath': filePath,
                'flag': flag,
            }
            return

        entry = self.process_item[indexUrl]
        if entry['updateTime'] < updateTime:
            entry['updateTime'] = updateTime
            entry['contentSign'] = contentSign
            entry['realUrl'] = realUrl
            entry['flag'] = flag
            entry['filePath'] = filePath

    def check(self, item):
        """Register a page-store ``item`` under every condition it matches.

        NOTE(review): when one item matches several conditions the first
        matching flag is kept, because later calls carry the same
        ``updateTime`` and the comparison is strict - confirm this is intended.
        """
        status = item.get('status', 0)
        isUpdated = item.get('isUpdated', 0)
        updateTime = item.get('updateTime')
        indexUrl = item.get('indexUrl')
        contentSign = item.get('contentSign')
        file_path = item.get('pageContentPath')
        jdUrl = item.get('realUrl')
        expired = item.get('isExpired', 0)

        if status == 0:
            self.fill_data_with_flag(indexUrl, jdUrl, contentSign, updateTime, file_path, 0)
        if status == 1 and isUpdated == 1:
            self.fill_data_with_flag(indexUrl, jdUrl, contentSign, updateTime, file_path, 1)
        if expired == 1:
            self.fill_data_with_flag(indexUrl, jdUrl, contentSign, updateTime, file_path, 2)

    def _load_data(self):
        """Scan the whole page-store collection and fill ``process_item``."""
        page_client = self._raw_store.cmgClient.page_store_mongo_client
        for item in page_client[self.page_store_db][self.page_store_coll].find():
            self.check(item)
            if len(self.process_item) and len(self.process_item) % 10000 == 0:
                print("load {} items".format(len(self.process_item)))
        print("finish load {} items".format(len(self.process_item)))
        print("finish load {} expired items".format(len(self.expired_items)))

    def dispatcher(self, q):
        """Queue every loaded key, then the ``None`` end marker, and wait."""
        for item in self.process_item:
            q.put(item)
        for item in self.expired_items:
            q.put(item)
        q.put(None)
        self.wait_q()

    def getPageContent(self, filename):
        """Return the page content addressed by ``filename``.

        ``filename`` is either a plain path, or three ``::``-separated parts
        of which the second is handed to ``BinReader`` and the third is the
        integer passed to ``readone_at``.

        :returns: the content, or ``None`` when ``filename`` splits into
            neither 1 nor 3 parts.
        :raises Exception: if the content read is empty.
        """
        parts = filename.split("::")
        if len(parts) == 3:
            _, content = BinReader(parts[1]).readone_at(int(parts[2]))
        elif len(parts) == 1:
            with open(filename) as f:
                content = f.read()
        else:
            return None

        if len(content) == 0:
            raise Exception("file name:{} , content error".format(filename))
        return content

    def update_jd(self, item):
        """Refresh the publish date of JD ``item`` in the raw, measure and page stores.

        ``updateTime`` is divided by 1000 before being formatted, i.e. it is
        treated as milliseconds. In test mode the update is only printed.

        :raises Exception: if any of the three store writes reports failure.
        """
        entry = self.process_item[item]
        jd_store_key = {'jdId': item}
        page_store_key = {'indexUrl': item, 'contentSign': entry.get('contentSign')}
        updateTime = entry.get("updateTime")
        strTime = time.strftime(self._PUB_DATE_FORMAT, time.localtime(updateTime / 1000))

        jkey = json.dumps(jd_store_key)
        jdoc = json.dumps({"$set": {"pubDate": strTime}})
        jdoc_m = json.dumps({"$set": {"pubDateStamp": updateTime}})

        if self.test_mode:
            print(" Update ===> jdkey: {} \n jdoc: {} \n".format(jkey, jdoc))
            return

        if not self._raw_store.save_one(jkey, jdoc, False):
            raise Exception("update jd_raw pubTime Exception")
        if not self._measure_store.save_one(jkey, jdoc_m, False):
            raise Exception("update jd_measure pubTime Exception")
        if not self._page_store.save_one(json.dumps(page_store_key),
                                         json.dumps({"$set": {"isUpdated": 0}}), False):
            raise Exception("set page store isUpdated status Exception")

    def check_is_student_job(self, jd_raw):
        """Record ``jd_raw`` in the result file when it looks like a student job.

        The result line is ``jdId<TAB>expired`` where ``expired`` is 1 when
        the loaded entry has ``flag == 2`` and 0 otherwise.

        :returns: ``True`` if the JD matched and was recorded, else ``False``.
        """
        is_student_job = (
            jd_raw.jobWorkAge in [u"在读学生", u"应届毕业生", u"无经验", u"无要求"]
            or jd_raw.jobCate in [u"应届毕业生", u"储备干部", u"培训生", u"兼职", u"临时", u"实习生"]
            or jd_raw.jobType in [u"兼职", u"实习"]
        )
        if not is_student_job:
            return False

        print("student job ===>  %s" % (jd_raw.jdId,))
        expired = 1 if self.process_item[jd_raw.jdId]['flag'] == 2 else 0
        self.rs_file.append_end_with('\t'.join((jd_raw.jdId, str(expired))))
        return True

    def parse_measure_jd(self, item):
        """Parse, measure and store JD ``item``; mark its page as processed.

        In test mode the raw and measure documents are only printed. On any
        failure ``item`` is appended to the fail file and the exception is
        re-raised.
        """
        try:
            jd_store_key = json.dumps({'jdId': item})
            page_store_key = json.dumps({
                'indexUrl': item,
                'contentSign': self.process_item[item].get('contentSign'),
            })
            fileName = self.process_item[item].get("filePath")
            pageContent = self.getPageContent(fileName)
            jd_raw = self.parse_by_owner(item, pageContent, self.owner)
            jd_measure = self.measure(jd_raw)

            if self.test_mode:
                print("Raw ===> {}".format(jd_raw.to_json()))
                print("Measure ===> {}".format(jd_measure.to_json()))
                return

            if not self._raw_store.save_one(jd_store_key, jd_raw.to_json(), True):
                raise Exception("set raw exception")
            if not self._measure_store.save_one(jd_store_key, jd_measure.to_json(), True):
                raise Exception("set measure exception")
            if not self._page_store.save_one(page_store_key,
                                             json.dumps({"$set": {"status": 1}}), False):
                raise Exception("set page store status exception")
            self.check_is_student_job(jd_raw)
        except Exception:
            self.failfile.append_end_with(item)
            # Bare raise keeps the original traceback (``raise e`` drops it
            # on Python 2).
            raise

    def parse_by_owner(self, jdId, pageContent, owner):
        """Extract the raw JD fields from ``pageContent`` with the owner's parser.

        When the parser found no ``pubDate`` it is derived from the loaded
        entry's ``updateTime``.

        :raises Exception: if ``owner`` is not a known channel.
        """
        if "jd_lagou" == owner:
            jdRaw = JdLagouHtmlFind(pageContent).find_fields()
        elif "jd_51job" == owner:
            jdRaw = Jd51JobHtmlFind(pageContent).find_fields()
        elif "jd_zhilian" == owner:
            jdRaw = JdZhilianHtmlFind(pageContent).find_fields()
        else:
            raise Exception("unknown owner")

        jdRaw.jdId = jdId
        jdRaw.jdFrom = self.owner
        jdRaw.jdUrl = self.process_item[jdId].get('realUrl')
        if not jdRaw.pubDate:
            stamp = self.process_item[jdId].get("updateTime")
            # Fixed: the pattern used here was "%H:%m:%S", which printed the
            # month in the minutes position.
            jdRaw.pubDate = time.strftime(self._PUB_DATE_FORMAT,
                                          time.localtime(stamp / 1000))
        return jdRaw

    def measure(self, jd_raw):
        """Send ``jd_raw`` to the measure service and return the measure object."""
        raw_for_measure = construct_jd_raw(jd_raw)
        jd_measure = self.jd_measure_client.measureJd(raw_for_measure)
        jdMeasureObj = convertToJdMeasure(self.owner, jd_measure)
        self.set_md5_SimHash(jdMeasureObj, jd_raw)
        return jdMeasureObj

    def set_md5_SimHash(self, jd_measure, jd_raw):
        """Set ``jdMd5`` and ``jdSimHash`` on ``jd_measure``."""
        jd_measure.jdMd5 = self.get_jd_md5(jd_measure, jd_raw)
        jd_measure.jdSimHash = gen_sim_hash(jd_raw.jobDescription)

    def get_jd_md5(self, jd_measure, jd_raw):
        """Hash the position, company segment id and work-location id of a JD."""
        temp = {
            "jdPosition": jd_raw.jobPosition,
            "incName": jd_measure.incSegmentId,
            "jdWorkLoc": jd_measure.jobWorkLocId,
        }
        return get_jd_measure_hash(temp)

    def event_handler(self, evt, msg, **kwargs):
        """Mail ``msg`` on the ``START`` and ``DONE`` events; ignore the rest."""
        if evt in ("START", "DONE"):
            util.send_email(["<*****@*****.**>"], "{}_etl 任务".format(self.owner), msg)
class TCImportTask(BaseTask):
    """Rebuild raw and measure JD documents into the to-C measure store.

    Work items come from the result file, one ``index<TAB>expired`` pair per
    line.
    """

    def __init__(self, thread_cnt):
        """Create the stores, service clients and fail-id file for the import.

        :param thread_cnt: forwarded to ``BaseTask.__init__`` as ``thread_cnt``.
        """
        BaseTask.__init__(self, '2c_import', thread_cnt=thread_cnt)
        self.process_items = []
        self.fail_fn = BaseTask.PathConfig.toc_failids_file
        self.rs_fn = BaseTask.PathConfig.result_file
        self.lgstore = LgEtlStore()
        self.job51store = Job51EtlStore()
        self.zlstore = ZLEtlStore()
        self.tocstore = ToCMeasureStore()
        self.thrift_client = TClient('../../conf/thrift_conf.ini')
        self.inc_stats2_client = pymongo.MongoClient(
            self.lgstore.cmgClient.inc_stats2_mongo_url)
        self.zhineng_salary5_charts_client = pymongo.MongoClient(
            self.lgstore.cmgClient.zhineng_salary5_charts_mongo_url)
        self.edu_info_client = self.thrift_client.edu_info_client
        self.inc_info_clients = self.thrift_client.inc_idinfo_client
        self.toc_failids_fd = FileSave(BaseTask.PathConfig.toc_failids_file)

    def _load_data(self):
        """Read ``index<TAB>expired`` lines from the result file into ``process_items``.

        Blank lines are skipped. Lines yielded by file iteration keep their
        trailing newline, so the line ending is stripped before the emptiness
        check; previously a blank line failed the tuple unpacking below.
        """
        with open(self.rs_fn) as f:
            for line in f:
                line = line.rstrip('\r\n')
                if not line.strip():
                    continue
                index, expired = line.split('\t')
                self.process_items.append({"index": index, "expired": int(expired)})

    def dispatcher(self):
        """Queue every loaded item, then the ``None`` end marker, and wait."""
        for item in self.process_items:
            self._queue.put(item)
        self._queue.put(None)
        self.wait_q()

    def _store_for_index(self, index):
        """Return the ETL store matching the channel prefix of ``index``, or ``None``."""
        channel = index.split("://")[0]
        if channel == 'jd_lagou':
            return self.lgstore
        if channel == 'jd_51job':
            return self.job51store
        if channel == 'jd_zhilian':
            return self.zlstore
        return None

    def get_jd_raw(self, index):
        """Return the raw document for ``index``; ``None`` for an unknown channel."""
        store = self._store_for_index(index)
        if store is None:
            return None
        return store.get_raw(index)

    def get_jd_measure(self, index):
        """Return the measure document for ``index``; ``None`` for an unknown channel."""
        store = self._store_for_index(index)
        if store is None:
            return None
        return store.get_measure(index)

    def constructMobileJdUrl(self, jdId, jdFrom):
        """Build the mobile-site URL of a JD; ``None`` for an unknown ``jdFrom``.

        The position id is the part of ``jdId`` after ``://``.
        """
        positionId = jdId.split("://")[1]
        if "jd_lagou" == jdFrom:
            return "http://www.lagou.com/center/jobs_{}.html?m=1".format(positionId)
        if "jd_51job" == jdFrom:
            return "http://m.51job.com/search/jobdetail.php?jobid={}".format(positionId)
        if "jd_zhilian" == jdFrom:
            # The mobile id wraps the first 9 characters and the remainder
            # in fixed markers.
            part1 = positionId[:9]
            part2 = positionId[9:]
            realPositionId = "cc{}j90{}000".format(part1, part2)
            return "http://m.zhaopin.com/jobs/{}".format(realPositionId)
        return None

    def get_major_ids(self, major):
        """Look up the two major ids (``findBkMajorId`` / ``findZkMajorId``) for ``major``.

        A failing lookup yields ``""`` for that id.

        :returns: ``([bk_major, zk_major], flag)`` where ``flag`` is 2 when
            both ids are empty and 1 otherwise.
        """
        flag = 1
        try:
            bk_major = self.thrift_client.edu_info_client.findBkMajorId("", major)
        except Exception:
            bk_major = ""
        try:
            zk_major = self.thrift_client.edu_info_client.findZkMajorId("", major)
        except Exception:
            zk_major = ""

        ids = [bk_major, zk_major]
        if len(bk_major) <= 0 and len(zk_major) <= 0:
            flag = 2
        else:
            print("======================= ids:  %s" % (ids,))
        return ids, flag

    def get_salary_ratio(self, jobCate):
        """Return the ``greater`` value stored for ``jobCate``, or -1 if unavailable."""
        r = -1
        if len(jobCate) <= 0:
            return r
        collection = self.zhineng_salary5_charts_client["zhineng_stats_v2"]["zhineng_salary5_charts"]
        q = collection.find_one({"zhineng_id": jobCate})
        if q and "greater" in q:
            r = q.get("greater")
        return r

    def get_total_ind_rank(self, incSegmentId):
        """Return ``(incId, tagList, total_ind_rank_postion)`` for a company segment id.

        Defaults are ``""``, ``[]`` and ``-1`` when the company id or its
        stats document is missing.
        """
        tagList = []
        total_ind_rank_postion = -1
        incId = self.get_inc_id(incSegmentId)
        if not incId:
            return "", tagList, total_ind_rank_postion

        q = self.inc_stats2_client["inc_stats_v6"]["inc"].find_one({"_id": ObjectId(incId)})
        if q:
            if "tag_list" in q:
                tagList = q.get("tag_list")
            if "rank_info" in q and "total_ind_rank_postion" in q["rank_info"]:
                total_ind_rank_postion = q["rank_info"]["total_ind_rank_postion"]
        return incId, tagList, total_ind_rank_postion

    def get_inc_id(self, incSegmentId):
        """Query the company id for ``incSegmentId``; print and re-raise on failure."""
        try:
            idinfo = self.inc_info_clients.queryIncId(ServiceAccessToken(), "", incSegmentId)
        except Exception as e:
            print(e)
            raise
        return idinfo

    def rebuild(self, raw, measure):
        """Combine a raw and a measure document into a ``toCMeasurePageModel``.

        ``majorIdsFlag`` is 0 when the measured major is the "no major
        restriction" marker, -1 when it is empty, otherwise the flag returned
        by ``get_major_ids``.
        """
        tocmeasure = toCMeasurePageModel()
        tocmeasure.jdId = raw["jdId"]
        tocmeasure.jdUrl = raw["jdUrl"]
        tocmeasure.mobileJdUrl = self.constructMobileJdUrl(raw["jdId"], raw["jdFrom"])
        tocmeasure.channel = raw["jdFrom"]
        tocmeasure.jd_content_hash = measure["jdSimHash"]
        tocmeasure.jd_measure_hash = measure["jdMd5"]
        tocmeasure.publishTime = measure["pubDateStamp"]
        tocmeasure.isExpired = 0
        tocmeasure.jobDiploma = raw["jobDiploma"]
        tocmeasure.jobDiplomaId = measure["jobDiplomaId"]
        tocmeasure.jobPosition = raw["jobPosition"]
        tocmeasure.jobWorkLoc = raw["jobWorkLoc"]
        tocmeasure.jobSalaryMin = measure["jobSalaryMin"]
        tocmeasure.jobSalaryMax = measure["jobSalaryMax"]
        tocmeasure.jobWorkLocId = str(measure["jobWorkLocId"])
        tocmeasure.jobWorkAgeMin = measure["jobWorkAgeMin"]
        tocmeasure.jobWorkAgeMax = measure["jobWorkAgeMax"]
        tocmeasure.jobCate = measure["jobCate"]
        tocmeasure.jobType = raw["jobType"]
        tocmeasure.major = measure["jobMajor"]
        tocmeasure.jobDescription = raw["jobDescription"]

        # majorIds, majorIdsFlag
        job_major = measure["jobMajor"]
        if job_major == u"专业不限":
            # no restriction on major
            tocmeasure.majorIds = []
            tocmeasure.majorIdsFlag = 0
        elif job_major == "":
            # no major given
            tocmeasure.majorIds = []
            tocmeasure.majorIdsFlag = -1
        else:
            tocmeasure.majorIds, tocmeasure.majorIdsFlag = self.get_major_ids(job_major)

        tocmeasure.incName = raw["incName"]
        tocmeasure.incIntro = raw["incIntro"]
        tocmeasure.incIndustry = raw["incIndustry"]
        tocmeasure.incIndustryId = measure["incIndustryId"]
        tocmeasure.incType = measure["incType"]
        tocmeasure.incScaleMin = measure["incScaleMin"]
        tocmeasure.incScaleMax = measure["incScaleMax"]
        tocmeasure.salaryRatio = self.get_salary_ratio(measure["jobCate"])
        (tocmeasure.incId,
         tocmeasure.tagList,
         tocmeasure.total_ind_rank_postion) = self.get_total_ind_rank(measure["incSegmentId"])
        return tocmeasure

    def run_job(self, item):
        """Process one work item: mark it expired, or rebuild and save it.

        On any failure the index is appended to the fail-id file, the
        traceback is printed and the exception is re-raised.

        :param item: dict with ``'index'`` and ``'expired'`` keys.
        """
        index = item.get('index')
        expired = item.get('expired')
        try:
            if expired:
                self.tocstore.set_expired({"jdId": index})
                print("complete set expired, indexUrl: {}".format(index))
                return

            raw = self.get_jd_raw(index)
            measure = self.get_jd_measure(index)
            result = self.rebuild(raw, measure)
            key = json.dumps({"jdId": raw["jdId"]})
            doc = util.remove_empty_key(json.loads(result.to_json()), ['isExpired'])
            self.tocstore.save_one(key, json.dumps(doc), True)
            print("complete copy indexUrl: {}".format(index))
        except Exception:
            self.toc_failids_fd.append_end_with(index)
            print("failed copy indexUrl: {}".format(index))
            traceback.print_exc()
            # Bare raise keeps the original traceback (``raise e`` drops it
            # on Python 2).
            raise

    def event_handler(self, evt, msg, **kwargs):
        """Mail ``msg`` on the ``START`` and ``DONE`` events; ignore the rest."""
        if evt in ("START", "DONE"):
            util.send_email(["<*****@*****.**>"], "2cimport 任务", msg)
def __init__(self, thread_cnt):
    """Initialise the base task and open the ``cv_jobLists.json`` output file.

    :param thread_cnt: forwarded unchanged to ``FixLoc.__init__``.
    """
    FixLoc.__init__(self, thread_cnt)
    output_name = 'cv_jobLists.json'
    self._save_file = FileSave(output_name)