def get_count(jd_or_cv, channel):
    """Return ``count_all()`` of the store selected by ``jd_or_cv``.

    ``"jd"`` selects ``PageStore``, ``"cv"`` selects ``CVPageStore`` and
    ``"co"`` selects ``CoPageStore``; each store is constructed with
    ``channel``.

    Raises:
        Exception: if ``jd_or_cv`` is none of the three known kinds.
    """
    if jd_or_cv == "jd":
        return PageStore(channel).count_all()
    if jd_or_cv == "cv":
        return CVPageStore(channel).count_all()
    if jd_or_cv == "co":
        return CoPageStore(channel).count_all()
    raise Exception("unknown jd_or_cv type: %s" % jd_or_cv)
def __init__(self, channel):
    """Set up expire detection state for one channel.

    Registers the task with ``BaseTask`` under the name
    ``"<channel>_expire_detect"`` and creates the page store and the
    page expire detector used by the rest of the task.
    """
    # Kept ahead of BaseTask.__init__, exactly as in the original ordering.
    self.channel = channel
    task_name = "{}_expire_detect".format(channel)
    BaseTask.__init__(self, task_name)

    self.page_store = PageStore(channel)
    # Filled with {"indexUrl", "realUrl"} dicts selected for checking.
    self.process_items = []
    self.page_expire_detect = PageExpireDetect()
    self.test_mode = False
def __init__(self, owner, queue_size, thread_cnt):
    """Set up the ETL task state for one owner (channel).

    Forwards ``owner``, ``queue_size`` and ``thread_cnt`` to ``BaseTask``
    and then creates the stores, the thrift client and the result/failure
    files the task writes to.
    """
    BaseTask.__init__(self, owner, queue_size, thread_cnt)
    self.owner = owner

    # Work lists filled while loading data.
    self.process_item = {}
    self.expired_items = []

    # Storage back-ends; construction order is kept from the original.
    self._raw_store = self._get_raw_store(owner)
    self._measure_store = self._get_measure_store(owner)
    self._page_store = PageStore(owner)
    self.page_store_db = 'admin'
    self.page_store_coll = "page_store_{}".format(owner)

    # Thrift client used to reach the JD measure service.
    thrift = TClient('../../conf/thrift_conf.ini')
    self.thrift_client = thrift
    self.jd_measure_client = thrift.jd_measure_server_client

    self.test_mode = False

    path_config = BaseTask.PathConfig
    self.rs_file = FileSave(path_config.result_file)
    self.failfile = FileSave(path_config.etl_failids_file)
class ETLTask(BaseTask):
    """Turn stored job-description pages into raw and measured JD records.

    Page-store documents are folded into ``self.process_item`` (keyed by
    ``indexUrl``); each entry can then be parsed with an owner-specific
    HTML finder, measured through the thrift measure client and written to
    the raw store, the measure store and the page store.
    """

    def __init__(self, owner, queue_size, thread_cnt):
        """Create stores, thrift client and output files for ``owner``.

        Raises:
            Exception: if ``owner`` has no matching ETL store.
        """
        BaseTask.__init__(self, owner, queue_size, thread_cnt)
        self.owner = owner
        # indexUrl -> {'updateTime', 'contentSign', 'realUrl', 'filePath', 'flag'}
        self.process_item = {}
        self.expired_items = []
        self._raw_store = self._get_raw_store(owner)
        self._measure_store = self._get_measure_store(owner)
        self._page_store = PageStore(owner)
        self.page_store_db = 'admin'
        self.page_store_coll = "page_store_{}".format(owner)
        self.thrift_client = TClient('../../conf/thrift_conf.ini')
        self.jd_measure_client = self.thrift_client.jd_measure_server_client
        self.test_mode = False
        self.rs_file = FileSave(BaseTask.PathConfig.result_file)
        self.failfile = FileSave(BaseTask.PathConfig.etl_failids_file)

    def _build_etl_store(self, owner, kind):
        """Return the ETL store of ``kind`` ('raw' or 'measure') for ``owner``."""
        if "jd_lagou" == owner:
            return LgEtlStore(kind)
        if "jd_51job" == owner:
            return Job51EtlStore(kind)
        if "jd_zhilian" == owner:
            return ZLEtlStore(kind)
        raise Exception(" unknown owner ")

    def _get_raw_store(self, owner):
        """Return the 'raw' ETL store for ``owner``; raise if unknown."""
        return self._build_etl_store(owner, 'raw')

    def _get_measure_store(self, owner):
        """Return the 'measure' ETL store for ``owner``; raise if unknown."""
        return self._build_etl_store(owner, 'measure')

    def fill_data_with_flag(self, indexUrl, realUrl, contentSign, updateTime, filePath, flag):
        """Record one page under ``indexUrl``, keeping the newest version.

        A new ``indexUrl`` is always stored. An existing entry is only
        overwritten when ``updateTime`` is strictly greater than the stored
        one, so an equal or older timestamp leaves the entry untouched.
        """
        entry = self.process_item.get(indexUrl)
        if entry is None:
            self.process_item[indexUrl] = {
                'updateTime': updateTime,
                'contentSign': contentSign,
                'realUrl': realUrl,
                'filePath': filePath,
                'flag': flag,
            }
            return
        if entry['updateTime'] < updateTime:
            entry['updateTime'] = updateTime
            entry['contentSign'] = contentSign
            entry['realUrl'] = realUrl
            entry['flag'] = flag
            entry['filePath'] = filePath

    def check(self, item):
        """Classify one page-store document and record it with a flag.

        Flags: 0 when ``status`` is 0, 1 when ``status`` is 1 and
        ``isUpdated`` is 1, 2 when ``isExpired`` is 1.
        """
        status = item.get('status', 0)
        isUpdated = item.get('isUpdated', 0)
        updateTime = item.get('updateTime')
        indexUrl = item.get('indexUrl')
        contentSign = item.get('contentSign')
        file_path = item.get('pageContentPath')
        jdUrl = item.get('realUrl')
        expired = item.get('isExpired', 0)

        if status == 0:
            self.fill_data_with_flag(indexUrl, jdUrl, contentSign, updateTime, file_path, 0)
        if status == 1 and isUpdated == 1:
            self.fill_data_with_flag(indexUrl, jdUrl, contentSign, updateTime, file_path, 1)
        # NOTE(review): when one of the branches above already stored this
        # document, the call below carries the same updateTime and is
        # ignored by fill_data_with_flag, so flag 2 is not applied in that
        # case -- confirm this is intended.
        if expired == 1:
            self.fill_data_with_flag(indexUrl, jdUrl, contentSign, updateTime, file_path, 2)

    def _load_data(self):
        """Scan the whole page-store collection and fill ``process_item``."""
        page_client = self._raw_store.cmgClient.page_store_mongo_client
        collection = page_client[self.page_store_db][self.page_store_coll]
        for item in collection.find():
            self.check(item)
            loaded = len(self.process_item)
            if loaded and loaded % 10000 == 0:
                print("load {} items".format(loaded))
        print("finish load {} items".format(len(self.process_item)))
        print("finish load {} expired items".format(len(self.expired_items)))

    def dispatcher(self, q):
        """Queue every ``process_item`` key and every expired item.

        A trailing ``None`` is queued afterwards (presumably the
        end-of-work marker for consumers -- confirm against BaseTask), then
        ``wait_q`` is called.
        """
        for item in self.process_item:
            q.put(item)
        for item in self.expired_items:
            q.put(item)
        q.put(None)
        self.wait_q()

    def getPageContent(self, filename):
        """Read the page content referenced by ``filename``.

        ``filename`` is either a plain file path, or three ``::``-separated
        parts where part 1 is passed to ``BinReader`` and part 2 is the
        integer position handed to ``readone_at``.

        Raises:
            Exception: if the content is empty, or if ``filename`` has
                neither 1 nor 3 ``::``-separated parts (previously this
                case silently returned None).
        """
        parts = filename.split("::")
        if len(parts) == 3:
            binReader = BinReader(parts[1])
            _, content = binReader.readone_at(int(parts[2]))
        elif len(parts) == 1:
            with open(filename) as f:
                content = f.read()
        else:
            raise Exception("file name:{} , unsupported path format".format(filename))
        if len(content) == 0:
            raise Exception("file name:{} , content error".format(filename))
        return content

    def update_jd(self, item):
        """Push the stored ``updateTime`` of ``item`` into the JD stores.

        Writes ``pubDate`` (local time string) to the raw store and
        ``pubDateStamp`` to the measure store, then resets ``isUpdated`` to
        0 on the matching page-store document. In test mode the update is
        only printed.

        Raises:
            Exception: if any of the three store updates reports failure.
        """
        entry = self.process_item[item]
        jd_store_key = {'jdId': item}
        page_store_key = {'indexUrl': item, 'contentSign': entry.get('contentSign')}
        updateTime = entry.get("updateTime")
        # updateTime is divided by 1000 before time.localtime, i.e. it is
        # handled as milliseconds.
        strTime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(updateTime/1000))
        jkey = json.dumps(jd_store_key)
        jdoc = json.dumps({"$set": {"pubDate": strTime}})
        jdoc_m = json.dumps({"$set": {"pubDateStamp": updateTime}})

        if self.test_mode:
            print(" Update ===> jdkey: {} \n jdoc: {} \n".format(jkey, jdoc))
            return

        if not self._raw_store.save_one(jkey, jdoc, False):
            raise Exception("update jd_raw pubTime Exception")
        if not self._measure_store.save_one(jkey, jdoc_m, False):
            raise Exception("update jd_measure pubTime Exception")
        if not self._page_store.save_one(json.dumps(page_store_key),
                                         json.dumps({"$set": {"isUpdated": 0}}), False):
            raise Exception("set page store isUpdated status Exception")

    def check_is_student_job(self, jd_raw):
        """Report whether ``jd_raw`` looks like a student/entry-level job.

        When it does, a ``"<jdId>\\t<expired>"`` line is appended to the
        result file, where ``expired`` is 1 if the entry's flag is 2.

        Returns:
            True for a student job, False otherwise.
        """
        if jd_raw.jobWorkAge in [u"在读学生", u"应届毕业生", u"无经验", u"无要求"]\
                or jd_raw.jobCate in [u"应届毕业生", u"储备干部", u"培训生", u"兼职", u"临时", u"实习生"]\
                or jd_raw.jobType in [u"兼职", u"实习"]:
            # Same output as the original two-argument print statement.
            print("student job ===>  %s" % (jd_raw.jdId,))
            expired = 0
            if self.process_item[jd_raw.jdId]['flag'] == 2:
                expired = 1
            self.rs_file.append_end_with('\t'.join((jd_raw.jdId, str(expired))))
            return True
        return False

    def parse_measure_jd(self, item):
        """Parse, measure and persist the page stored for ``item``.

        In test mode the raw and measured records are only printed. On any
        failure ``item`` is appended to the fail file and the exception is
        re-raised.
        """
        try:
            entry = self.process_item[item]
            jd_store_key = json.dumps({'jdId': item})
            page_store_key = json.dumps({'indexUrl': item,
                                         'contentSign': entry.get('contentSign')})
            fileName = entry.get("filePath")
            pageContent = self.getPageContent(fileName)
            jd_raw = self.parse_by_owner(item, pageContent, self.owner)
            jd_measure = self.measure(jd_raw)

            if self.test_mode:
                print("Raw ===> {}".format(jd_raw.to_json()))
                print("Measure ===> {}".format(jd_measure.to_json()))
                return

            if not self._raw_store.save_one(jd_store_key, jd_raw.to_json(), True):
                raise Exception("set raw exception")
            if not self._measure_store.save_one(jd_store_key, jd_measure.to_json(), True):
                raise Exception("set measure exception")
            if not self._page_store.save_one(page_store_key,
                                             json.dumps({"$set": {"status": 1}}), False):
                raise Exception("set page store status exception")
            self.check_is_student_job(jd_raw)
        except Exception:
            self.failfile.append_end_with(item)
            # Bare raise keeps the original traceback (``raise e`` drops it
            # on Python 2).
            raise

    def parse_by_owner(self, jdId, pageContent, owner):
        """Extract the raw JD fields from ``pageContent`` for ``owner``.

        The result gets ``jdId``, ``jdFrom`` and ``jdUrl`` filled in; when
        the page carries no ``pubDate`` the stored ``updateTime`` is used.

        Raises:
            Exception: if ``owner`` is not a known channel.
        """
        if "jd_lagou" == owner:
            jdRaw = JdLagouHtmlFind(pageContent).find_fields()
        elif "jd_51job" == owner:
            jdRaw = Jd51JobHtmlFind(pageContent).find_fields()
        elif "jd_zhilian" == owner:
            jdRaw = JdZhilianHtmlFind(pageContent).find_fields()
        else:
            raise Exception("unknown owner")

        jdRaw.jdId = jdId
        jdRaw.jdFrom = self.owner
        jdRaw.jdUrl = self.process_item[jdId].get('realUrl')
        if not jdRaw.pubDate:
            stamp = self.process_item[jdId].get("updateTime")
            # Fixed: the minutes field used "%m" (month) instead of "%M".
            jdRaw.pubDate = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(stamp/1000))
        return jdRaw

    def measure(self, jd_raw):
        """Send ``jd_raw`` to the measure service and return the result.

        The returned measure object also has ``jdMd5`` and ``jdSimHash``
        set.
        """
        raw_for_measure = construct_jd_raw(jd_raw)
        jd_measure = self.jd_measure_client.measureJd(raw_for_measure)
        jdMeasureObj = convertToJdMeasure(self.owner, jd_measure)
        self.set_md5_SimHash(jdMeasureObj, jd_raw)
        return jdMeasureObj

    def set_md5_SimHash(self, jd_measure, jd_raw):
        """Set ``jdMd5`` and ``jdSimHash`` on ``jd_measure``."""
        jd_measure.jdMd5 = self.get_jd_md5(jd_measure, jd_raw)
        jd_measure.jdSimHash = gen_sim_hash(jd_raw.jobDescription)

    def get_jd_md5(self, jd_measure, jd_raw):
        """Hash the position, company segment id and work location id."""
        fields = {
            "jdPosition": jd_raw.jobPosition,
            "incName": jd_measure.incSegmentId,
            "jdWorkLoc": jd_measure.jobWorkLocId,
        }
        return get_jd_measure_hash(fields)

    def event_handler(self, evt, msg, **kwargs):
        """Send a notification mail for the "START" and "DONE" events."""
        if evt in ("START", "DONE"):
            util.send_email(["<*****@*****.**>"], "{}_etl 任务".format(self.owner), msg)
            return
class JdExpireDetect(BaseTask):
    """Check whether stored job pages are still online and mark dead ones.

    Candidate pages are collected from the channel's page store, checked
    with the channel-specific detector of ``PageExpireDetect`` and marked
    as expired in the page store when the detector reports them gone.
    """

    # Minimum age of ``updateTime`` (milliseconds) before a page is checked.
    EXPIRE_CHECK_AGE_MS = 3 * 24 * 3600 * 1000

    def __init__(self, channel):
        """Register the task as ``"<channel>_expire_detect"`` and set up state."""
        self.channel = channel
        BaseTask.__init__(self, "{}_expire_detect".format(channel))
        self.page_store = PageStore(channel)
        # {"indexUrl", "realUrl"} dicts selected by pre_check.
        self.process_items = []
        self.page_expire_detect = PageExpireDetect()
        self.test_mode = False

    def pre_check(self, item):
        """Queue ``item`` for checking if it is old enough and not expired.

        An item qualifies when it has an ``updateTime`` older than
        ``EXPIRE_CHECK_AGE_MS`` and ``isExpired`` is missing or 0.
        """
        now = int(time.time() * 1000)
        # Fixed: ``now`` is in milliseconds, but the original subtracted
        # 3 * 24 * 3600 (seconds), giving a threshold of about 4 minutes.
        # NOTE(review): the original comment read "only check within 3
        # days", while the comparison selects pages whose updateTime is
        # older than the threshold; the comparison is kept as written.
        if "updateTime" in item and item["updateTime"] < now - self.EXPIRE_CHECK_AGE_MS:
            if "isExpired" not in item or item['isExpired'] == 0:
                self.process_items.append({"indexUrl": item["indexUrl"],
                                           "realUrl": item["realUrl"]})

    def _load_data(self):
        """Run ``pre_check`` over every document of the page store."""
        for item in self.page_store.get_all():
            self.pre_check(item)
            loaded = len(self.process_items)
            if loaded and loaded % 10000 == 0:
                print("load {} items".format(loaded))
        print("totally load {} items".format(len(self.process_items)))

    def dispatcher(self, q):
        """Queue every selected item, then ``None``, then call ``wait_q``."""
        for item in self.process_items:
            q.put(item)
        q.put(None)
        self.wait_q()

    def check_expire_by_channel(self, job, channel):
        """Run the detector matching ``channel`` on ``job["realUrl"]``.

        Returns:
            The detector's result; ``run_job`` treats a falsy value as
            "page is gone".

        Raises:
            Exception: if ``channel`` has no detector. Previously this case
                returned None, which ``run_job`` read as "expired" and
                would have marked every page of the channel as expired.
        """
        detect = self.page_expire_detect
        if "jd_lagou" == channel:
            return detect.lagou_page_detect(job["realUrl"])
        if "jd_51job" == channel:
            return detect.jd51job_page_detect(job["realUrl"])
        if "jd_zhilian" == channel:
            return detect.zhilian_page_detect(job["realUrl"])
        if "jd_wealink" == channel:
            return detect.wealink_page_detect(job["realUrl"])
        if 'jd_liepin' == channel:
            return detect.liepin_page_detect(job['realUrl'])
        raise Exception("unknown channel: {}".format(channel))

    def run_job(self, job):
        """Check one queued page and mark it expired when it is gone.

        Non-dict jobs are ignored. Any exception is printed together with
        the ``indexUrl`` and not propagated. In test mode the page store is
        left untouched.
        """
        if not isinstance(job, dict):
            return
        try:
            if not self.check_expire_by_channel(job, self.channel):
                if not self.test_mode:
                    self.page_store.set_expire({"indexUrl": job["indexUrl"]})
                print("set expired , indexUrl: {}".format(job["indexUrl"]))
        except Exception as e:
            print(e)
            print("failed , indexUrl: {}".format(job["indexUrl"]))

    def event_handler(self, evt, msg, **kwargs):
        """Send a notification mail for the "START" and "DONE" events."""
        if evt in ("START", "DONE"):
            util.send_email(["<*****@*****.**>"], "{} 过期检测".format(self.channel), msg)
            return