class CheckCvDupliRunner(object):
    """Cross-channel CV de-duplication worker.

    Hashes the first job entry of a parsed CV and consults a shared hash
    table to decide whether the CV duplicates one already stored.  When a
    duplicate is found, the copy from the lower-priority channel (per
    CHANNEL_PRIORITY) is deleted from its raw/measure stores and the
    removal is logged to a per-process result file.
    """

    def __init__(self, channel):
        self.channel = channel
        # Guards _duplication_count, which may be bumped from several threads.
        self.r_lock = threading.RLock()
        self._duplication_count = 0
        self.cv_hash_table = CVHashTable()
        # One raw + one measure store per supported crawl channel.
        self.cv_51job_raw_store = CVRawStore('cv_51job', 'raw')
        self.cv_51job_measure_store = CVRawStore('cv_51job', 'measure')
        self.cv_zhilian_raw_store = CVRawStore('cv_zhilian', 'raw')
        self.cv_zhilian_measure_store = CVRawStore('cv_zhilian', 'measure')
        self.cv_liepin_raw_store = CVRawStore('cv_liepin', 'raw')
        self.cv_liepin_measure_store = CVRawStore('cv_liepin', 'measure')
        # Per-process log of removed duplicate ids (pid in the name keeps
        # concurrent workers from clobbering each other's files).
        self.dupli_file_save = open(
            os.path.join(
                os.path.dirname(__file__),
                "../result/%s_duplicate_ids_%d" % (self.channel, os.getpid())),
            'wb')

    def _inc_duplication_count(self, indexUrl):
        """Thread-safely bump the running duplicate counter."""
        with self.r_lock:
            self._duplication_count += 1

    def remove_duplication(self, cvId_to_remove, channel, indexUrl):
        """Delete one CV from its channel's raw and measure stores.

        cvId_to_remove -- id of the losing (lower-priority) CV
        channel        -- channel the losing CV came from
        indexUrl       -- id of the surviving CV (for the log line only)
        """
        stores_by_channel = {
            'cv_51job': (self.cv_51job_raw_store, self.cv_51job_measure_store),
            'cv_zhilian': (self.cv_zhilian_raw_store, self.cv_zhilian_measure_store),
            'cv_liepin': (self.cv_liepin_raw_store, self.cv_liepin_measure_store),
        }
        if channel not in stores_by_channel:
            raise Exception('unknown channel')
        selector = {'cvId': cvId_to_remove}
        for store in stores_by_channel[channel]:
            store.delete_one(selector)
        print >> self.dupli_file_save, "[D]cvId: %s is removed for duplicating with cvId: %s" % (cvId_to_remove, indexUrl)

    def run(self, indexUrl, ps):
        """Check one parsed CV (``ps``) for duplication.

        Returns True when this CV is the duplicate that got removed (the
        stored CV wins), False otherwise.  Only the first job entry is
        ever examined: every branch inside the loop returns.
        """
        for job in ps.get('jobList', []):
            inc_name = job.get('incName', '')
            position = job.get('jobPosition', '')
            desc = job.get('jobDesc', '')
            # Dedupe only when all three fields are non-empty.
            if not inc_name or not position or not desc:
                return False
            digest = util.md5([inc_name, position, desc], remove_space=True)
            selector = {'hash_value': digest}
            existing = self.cv_hash_table.find_one(selector)
            if not existing:
                # First sighting of this hash: register it and keep the CV.
                fresh_doc = {'hash_value': digest, 'cvId': indexUrl}
                self.cv_hash_table.save_one(selector, fresh_doc, True)
                return False
            # Duplicate hash found -- count it.
            self._inc_duplication_count(indexUrl)
            stored_id = existing.get('cvId')
            if stored_id == indexUrl:
                # Same CV seen again; possibly an update, not a duplicate.
                return False
            stored_channel = stored_id.split('://')[0]
            if CHANNEL_PRIORITY.get(self.channel, 0) > CHANNEL_PRIORITY.get(stored_channel, 0):
                # Incoming channel outranks the stored one: take over the
                # hash-table slot and purge the lower-priority copy.
                existing['cvId'] = indexUrl
                self.cv_hash_table.save_one(selector, existing, True)
                self.remove_duplication(stored_id, stored_channel, indexUrl)
                return False
            # Stored CV wins: remove the incoming one instead.
            self.remove_duplication(indexUrl, indexUrl.split('://')[0], stored_id)
            return True
        return False