class ParseRunner(object):
    """Parses a CV page either in-process (via CvParser) or by delegating
    to a remote HTTP parse service.

    :param channel: source channel name, forwarded to the parser as ``cvFrom``.
    :param http_service_addr: optional URL of a remote parse service; when
        ``None`` (the default) parsing happens locally.
    """

    def __init__(self, channel, http_service_addr=None):
        self.channel = channel
        self.http_service_addr = http_service_addr
        self.cv_parser = CvParser()

    def run(self, pagecontent):
        """Parse one page of CV HTML and return the parsed structure.

        Local mode returns whatever ``CvParser.parser`` produces; remote
        mode POSTs the page to the service and decodes its JSON response.
        """
        if not self.http_service_addr:
            # Pass htmlContent by keyword for consistency with the other
            # CvParser.parser call sites in this module.
            return self.cv_parser.parser(htmlContent=pagecontent, cvFrom=self.channel)
        result = requests.post(self.http_service_addr,
                               data={'channel': self.channel,
                                     'pagecontent': pagecontent})
        # result.text is already unicode, so the former encoding='utf-8'
        # argument to json.loads was a no-op and has been dropped.
        return json.loads(result.text)
class CvPS(BaseTask): def __init__(self, channel, prefix=None): BaseTask.__init__(self, channel, thread_cnt=2) self.channel = channel self.cv_parser = CvParser() self._cvs_data = {} #对比 临时存放文件 self.diff_rs_dir = '%s_rs_diff' % self.channel if not prefix else '%s/%s_job_diff' % (prefix, self.channel) #机器解析 self.mechine_rs_dir = '%s_mechine_parsed' % self.channel if not prefix else '%s/%s_mechine_parsed' % (prefix, self.channel) #人工解析 self.person_rs_dir = '%s_person_parsed' % self.channel if not prefix else '%s/%s_person_parsed' % (prefix, self.channel) #结果 self.result_file = '%s_result.txt' % self.channel if not prefix else '%s/%s_result.txt' % (prefix, self.channel) # util.check_and_clear(os.path.dirname(self.result_file)) self.excel_save = ExcelFileSave(self.result_file) #样本 self.sample_dir = '%s_sample' % self.channel if not prefix else '%s/%s_sample' % (prefix, self.channel) def start_operation(self, *args, **kwargs): # 清理准备工作 print os.getcwd() for f in [self.diff_rs_dir, self.mechine_rs_dir]: if not os.path.isdir(f): os.system('mkdir -p %s' % f) else: os.system('rm -rf %s/*' % f) def dispatcher(self): fs = os.listdir(self.sample_dir) for file in fs: real_file = "%s/%s" % (self.sample_dir, file) if not os.path.isfile(real_file) or \ os.path.splitext(real_file)[1] != '.html': continue fname = os.path.split(real_file)[1] cvId = os.path.splitext(fname)[0] self._queue.put({'fn': real_file, 'cvId': cvId}) def run_job(self, job): if not job: return htmlfile = job.get('fn') cvId = job.get('cvId') with open(htmlfile, 'rb') as f: pagecontent = f.read() try: ps = self.cv_parser.parser(htmlContent=pagecontent, cvFrom=self.channel) cvRaw_obj = constructCvRawObj(ps) cvRaw_obj.cvId = "%s://%s" % (self.channel, cvId) cvRaw_obj.cvFrom = self.channel self._check_fields(cvRaw_obj.to_json(), cvId) # Logger.default_log("cvId: %s Ok" % cvId) except Exception as e: Logger.default_log("cvId: %s Fail" % cvId) traceback.print_exc() def _load_data(self): with open('%s/%s' % 
(self.person_rs_dir, 'parsed.csv'), 'rb') as csvfile: reader = csv.reader(csvfile, delimiter=',') for index, row in enumerate(reader): if index == 0: continue cvId = row[0].split('.')[0] if '简历ID:' in cvId: cvId = cvId.replace('简历ID:', '') parsed_data = row[1].replace('\n','') self._cvs_data.update({cvId: parsed_data}) def _check_fields(self, cvRaw_obj, cvId): cvRaw_obj = json.loads(cvRaw_obj) person_parsed = self._cvs_data.get(cvId, '') if not person_parsed: return person_parsed = json.loads(person_parsed) print person_parsed diff_rs = copy.deepcopy(person_parsed) for key, value in person_parsed.items(): if key == 'cvFrom': diff_rs['cvFrom'] = 0 continue if key == 'baseInfo': for key1, value2 in value.items(): # 忽略cvId if key1 == 'cvId': diff_rs['baseInfo'][key1] = 0 continue if value2 in [None,'None',''] and not cvRaw_obj['baseInfo'].get(key1, ''): diff_rs['baseInfo'][key1] = 0 continue if value2 != cvRaw_obj['baseInfo'].get(key1, ''): diff_rs['baseInfo'][key1] = 1 # 解析不相同 else: diff_rs['baseInfo'][key1] = 0 # 解析一样 elif isinstance(value, (str, unicode)): diff_rs[key] = 0 if person_parsed[key] == cvRaw_obj.get(key, '') else 1 elif key in ['languageList', 'proList', 'skillList', 'trainList', 'jobList', 'eduList', 'certList']: if not person_parsed[key]: if not cvRaw_obj.get(key, ''): diff_rs[key] = 0 else: diff_rs[key] = 1 continue for index, languageItem in enumerate(person_parsed[key]): # 为空 if not cvRaw_obj.get(key, ''): diff_rs[key] = 1 break elif cvRaw_obj.get(key, '') and len(cvRaw_obj.get(key)) < len(person_parsed[key]): diff_rs[key] = 1 break for key1, value1 in languageItem.items(): if cvRaw_obj[key][index].get(key1, '') != value1 and key1 != 'positionList': diff_rs[key][index][key1] = 1 # 解析不同 elif key1 == 'positionList': for positionIndex, positionItem in enumerate(value1): if len(cvRaw_obj[key][index].get(key1, [])) < len(value1): diff_rs[key][index][key1] = 1 continue for key2, value2 in positionItem.items(): 
diff_rs[key][index][key1][positionIndex][key2] = 0 if cvRaw_obj[key][index][key1][positionIndex].get(key2,'') == value2 else 1 else: diff_rs[key][index][key1] = 0 elif key in ['jobList']: if not person_parsed[key]: if not cvRaw_obj.get(key, ''): diff_rs[key] = 0 else: diff_rs[key] = 1 continue elif key in ['others', 'privateInfo', 'jobExp']: if not person_parsed[key] or person_parsed in ['None', None]: if not cvRaw_obj.get(key, ''): diff_rs[key] = 0 else: diff_rs[key] = 1 for key1, value1 in person_parsed[key].items(): if value1 in ['None', None, '']: if not cvRaw_obj[key].get(key1, ''): diff_rs[key][key1] = 0 else: diff_rs[key][key1] = 1 continue if cvRaw_obj[key].get(key1) != value1: diff_rs[key][key1] = 1 # 解析错误 else: diff_rs[key][key1] = 0 self._save_diff_rs(cvId, diff_rs) self._save_mechine_parsed(cvId, cvRaw_obj) self._save_people_parsed(cvId, person_parsed) # Logger.default_log('complete cvId: %s' % cvId) self._save_excel(cvId, person_parsed, cvRaw_obj, diff_rs) def _save_mechine_parsed(self, cvId, cvRaw_obj): with codecs.open("%s/%s.json" % (self.mechine_rs_dir, cvId), 'wb', encoding='utf-8') as f: f.write(json.dumps(cvRaw_obj, indent=4, ensure_ascii=False)) def _save_people_parsed(self, cvId, person_parsed): with codecs.open("%s/%s.json" % (self.person_rs_dir, cvId), 'wb', encoding='utf-8') as f: f.write(json.dumps(person_parsed, indent=4, ensure_ascii=False)) def _save_diff_rs(self, cvId, diff_rs): with open("%s/%s.json" % (self.diff_rs_dir, cvId), 'wb') as f: f.write(json.dumps(diff_rs, indent=4)) def _save_excel(self, cvid, person_parsed, mechine_obj, diff_rs): self.excel_save.append([cvid]) for key, value in diff_rs.items(): if isinstance(value, (str, unicode, int)): self.excel_save.append(['', key, person_parsed[key], mechine_obj.get(key, ''), diff_rs[key]]) if isinstance(value, dict): for key1, value2 in value.items(): if key1 == 'cvId': continue if isinstance(value2, (str, unicode, int)): self.excel_save.append(['',"%s.%s"%(key, key1), 
person_parsed[key][key1], mechine_obj[key].get(key1,''), diff_rs[key][key1]]) else: print type(value2) raise Exception("%s.%s"%(key, key1)) if isinstance(value, list): for index1, item1 in enumerate(value): if isinstance(item1, dict): for key2, value2 in item1.items(): if isinstance(value2, list): for index2, item2 in enumerate(value2): if isinstance(item2, dict): for key3, value3 in item2.items(): mechine_value = mechine_obj[key][index1].get(key2, '') if mechine_value and len(mechine_value) <= index2: mechine_value =0 else: mechine_value = mechine_value[index2].get(key3, '') self.excel_save.append(['',"%s.%d.%s.%d.%s" % (key, index1, key2, index2, key3), person_parsed[key][index1][key2][index2][key3], mechine_value, diff_rs[key][index1][key2][index2][key3]]) else: self.excel_save.append(['',"%s.%d.%s" % (key, index1, key2), person_parsed[key][index1][key2], mechine_obj[key][index1].get(key2, ''), diff_rs[key][index1][key2]]) else: raise Exception("%s.%d" % (key, index1)) self.excel_save.append(['']) def event_handler(self, evt, msg, **kwargs): if evt == 'DONE': util.send_email(["<*****@*****.**>"], "{} 样本对比".format(self.channel), msg) # 最终结果存放文件 filedest = 'app/share/%s_result.txt' % self.channel if os.path.exists(filedest): now = util.get_date_with_day_duration() history_fn = os.path.join(os.path.dirname(filedest), '%s_%s.txt' % (self.channel, '%d%02d%02d%02d%02d' % (now.year, now.month, now.day, now.hour, now.minute))) # 将结果移动到历史文件 os.system('mv %s %s' % (filedest, history_fn)) # 将最后计算结果放到share 目录下, 提供下载 os.system('mv %s %s' % (self.result_file, filedest))
class ETLBase(object): def __init__(self, channel): self.channel = channel self.process_item = {} self.cv_page_store = CVPageStore(channel) self.cv_parser = CvParser() self.test_mode = False conf_path = os.path.join(os.path.dirname(__file__), '../../conf/thrift_conf.ini') self.thrift_client = ThriftClient(conf_path) # jobList 哈希, 用于判断重复 self.cv_hash_table = CVHashTable() self._duplication_count = 0 self.rlock = threading.RLock() self.cv_raw_store = CVRawStore(self.channel, stage='raw') self.cv_measure_store = CVRawStore(self.channel, stage='measure') self.measure_client = self.thrift_client.cv_measure_server_client self.cv_51job_raw_store = CVRawStore('cv_51job', 'raw') self.cv_51job_measure_store = CVRawStore('cv_51job','measure') self.cv_zhilian_raw_store = CVRawStore('cv_zhilian', 'raw') self.cv_zhilian_measure_store = CVRawStore('cv_zhilian', 'measure') self.cv_liepin_raw_store = CVRawStore('cv_liepin', 'raw') self.cv_liepin_measure_store = CVRawStore('cv_liepin', 'measure') self.fail_save = FileSave('%s_fail_ids.txt' % self.channel) self.parsed_cnt = 0 self.parsed_cnt_lock = threading.RLock() # 二进制文件位置, 设置 # local 表示在本地 # remote 表示在远程 self.bin_file_location = os.environ['BIN_FILE_LOCATION'] if self.bin_file_location == 'remote': self.bin_read_client = self.thrift_client.bin_read_server_client def _inc_parsed_cnt(self): with self.parsed_cnt_lock: self.parsed_cnt += 1 def _inc_duplication_count(self): with self.rlock: self._duplication_count += 1 def fill_data_with_flag(self, indexUrl, realUrl, contentSign, updateTime, filePath, flag): self.process_item[indexUrl] = { 'updateTime': updateTime, 'contentSign': contentSign, 'realUrl': realUrl, 'filePath': filePath, 'flag': flag, } if len(self.process_item)%10000 == 0: print "load items: %d" % len(self.process_item) def store(self, cvId, raw, measure, bulk=False): if self.test_mode: print "====================test mode cvId: %s================" % cvId print "raw: ", raw.to_json() return key = {"cvId": cvId} if not 
bulk: self.cv_raw_store.save_one(key, raw.to_json(), isUpsert=True) self.cv_measure_store.save_one(key, measure.to_json(), isUpsert=True) else: self.cv_raw_store.bulk_upsert(key, raw.to_mongo()) self.cv_raw_store.bulk_upsert(key, measure.to_mongo()) def measure(self, raw): return self.measure_client.measureCv(raw) def check_and_put(self, item): updateTime = item.get('updateTime') indexUrl = item.get('indexUrl') contentSign = item.get('contentSign') file_path = item.get('pageContentPath') realUrl = item.get('realUrl') self.fill_data_with_flag(indexUrl, realUrl, contentSign, updateTime, file_path, 0) def _load_data(self): for item in self.cv_page_store.get_all(): self.check_and_put(item) print "============= totally load %d items ===============" % len(self.process_item) def dispatcher(self, q, from_which='db'): if from_which == 'db': self.dispatcher_from_db(q) elif from_which == 'file': self.dispatcher_from_file(q) else: raise Exception("unknown from_which") def dispatcher_from_db(self, q): i = 0 total_cnt = len(self.process_item) for item in self.process_item: pagecontent = self.getPageContent(self.process_item[item].get("filePath"), self.bin_file_location) q.put({'indexUrl': item, 'pagecontent': pagecontent, 'updateTime':self.process_item[item]['updateTime'], 'contentSign': self.process_item[item]['contentSign']}) i += 1 if i % 10000 == 0: print "processed %f%%" % (float(i*100/total_cnt)) q.put(None) def dispatcher_from_file(self, q): with open('%s_need_fix_ids.txt', 'rb') as f: for line in f: line = line.strip() if not line: continue doc = self.cv_page_store.get_one(line) pagecontent = self.getPageContent(doc.get('pageContentPath'), self.bin_file_location) q.put({'indexUrl': line, 'pagecontent': pagecontent, 'updateTime':doc['updateTime'], 'contentSign': doc['contentSign']}) q.put(None) def getPageContent(self, filename, from_where='local'): if from_where == 'local': parts = filename.split("::") if len(parts) == 3: binReader = BinReader(parts[1]) _, content = 
binReader.readone_at(int(parts[2])) if len(content) == 0: raise Exception("file name:{} , content error".format(filename)) return content if len(parts) == 1: with open(filename) as f: content = f.read() if len(content) == 0: raise Exception("file name:{} , content error".format(filename)) return content elif from_where == 'remote': #TODO # 从远程获取bin文件内容 content = self.bin_read_client.getHtml(filename) return content else: raise Exception("unknown from_where") def check_has_duplication(self, indexUrl, ps): for s in ps.get('jobList', []): incName = s.get('incName', '') jobPosition = s.get('jobPosition', '') jobDesc = s.get('jobDesc', '') # 都不为空, 才会判重 if not (incName and jobPosition and jobDesc): return False hash_value = util.md5([incName, jobPosition, jobDesc], remove_space=True) key = {'hash_value': hash_value} hash_doc = self.cv_hash_table.get_one(key) if hash_doc: # 统计重复数 self._inc_duplication_count() # 如果此渠道优先级比较大, 替换掉存在hash表中的 cvId_in_db = hash_doc.get('cvId') # 相同Id, 可能更新 if cvId_in_db == indexUrl: return False cv_channel_in_db = cvId_in_db.split('://')[0] if CHANNEL_PRIORITY.get(self.channel, 0) > CHANNEL_PRIORITY.get(cv_channel_in_db, 0): hash_doc['cvId'] = indexUrl self.cv_hash_table.upsert_one(key, hash_doc) # remove 优先级低的, 保持解析数据没有重复 self.remove_duplication(cvId_in_db, cv_channel_in_db) return False return True else: hash_doc = {'hash_value': hash_value, 'cvId': indexUrl} self.cv_hash_table.upsert_one(key, hash_doc) def remove_duplication(self, cvId, channel): key = {'cvId': cvId} if channel == 'cv_51job': self.cv_51job_raw_store.remove_one(key) self.cv_51job_measure_store.remove_one(key) elif channel == 'cv_zhilian': self.cv_zhilian_raw_store.remove_one(key) self.cv_zhilian_measure_store.remove_one(key) elif channel == 'cv_liepin': self.cv_liepin_raw_store.remove_one(key) self.cv_liepin_measure_store.remove_one(key) else: raise Exception('unknown channel') def real_run(self, job): indexUrl = job.get('indexUrl') pagecontent = job.get('pagecontent') #print 
pagecontent ps = self.cv_parser.parser(htmlContent=pagecontent, cvFrom=self.channel) # 判断是否重复cv if self.check_has_duplication(indexUrl, ps): return cvRaw_obj = constructCvRawObj(ps) cvRaw_obj.cvId = job.get('indexUrl') cvRaw_obj.cvFrom = self.channel cvRaw_obj.updateTime = job.get('updateTime') cvRaw_obj.contentSign = job.get('contentSign') cvRaw = convertToCvRaw(cvRaw_obj) cvMeasure = self.measure(cvRaw) if not cvMeasure: print cvRaw cvMeasured_obj = constructCvMeasureObj(cvMeasure) self.store(indexUrl, cvRaw_obj, cvMeasured_obj, False) def event_handler(self, evt, msg, **kwargs): if evt == 'DONE': util.send_email(["<*****@*****.**>"], "{} 样本对比".format(self.channel), msg + '\n duplicate cvs: %d' % self._duplication_count + '\n parsed count: %d' % self.parsed_cnt)