示例#1
0
class CheckCvDupliRunner(object):
    """Detect duplicate CVs across channels and drop the lower-priority copy.

    A CV is fingerprinted by hashing the company name, job position and job
    description of an entry in its ``jobList``. The fingerprint is looked up
    in a shared hash table; on a collision with a *different* CV id, the copy
    belonging to the channel with the lower ``CHANNEL_PRIORITY`` is deleted
    from that channel's raw and measure stores, and the removal is appended
    to a per-process log file.

    The instance owns an open log file. Call :meth:`close` when done, or use
    the instance as a context manager.
    """

    # Channels for which a raw store and a measure store exist on the instance
    # (attributes named ``<channel>_raw_store`` / ``<channel>_measure_store``).
    _KNOWN_CHANNELS = ('cv_51job', 'cv_zhilian', 'cv_liepin')

    def __init__(self, channel):
        """Create the stores and open the duplicate-id log file.

        :param channel: name of the channel whose CVs this runner processes;
            used for the priority comparison in :meth:`run` and in the log
            file name.
        """
        self.channel = channel
        # Guards _duplication_count.
        self.r_lock = threading.RLock()
        self._duplication_count = 0
        self.cv_hash_table = CVHashTable()

        self.cv_51job_raw_store = CVRawStore('cv_51job', 'raw')
        self.cv_51job_measure_store = CVRawStore('cv_51job', 'measure')

        self.cv_zhilian_raw_store = CVRawStore('cv_zhilian', 'raw')
        self.cv_zhilian_measure_store = CVRawStore('cv_zhilian', 'measure')

        self.cv_liepin_raw_store = CVRawStore('cv_liepin', 'raw')
        self.cv_liepin_measure_store = CVRawStore('cv_liepin', 'measure')

        # One log file per (channel, process), next to this module's
        # ../result directory. Opened in binary mode; see remove_duplication.
        log_name = "../result/%s_duplicate_ids_%d" % (self.channel, os.getpid())
        self.dupli_file_save = open(
            os.path.join(os.path.dirname(__file__), log_name), 'wb')

    def __enter__(self):
        """Return the runner itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the log file on leaving the ``with`` block."""
        self.close()

    def close(self):
        """Flush and close the duplicate-id log file."""
        self.dupli_file_save.close()

    def _inc_duplication_count(self, indexUrl):
        """Increment the duplicate counter under the instance lock.

        :param indexUrl: unused; kept so existing callers keep working.
        """
        with self.r_lock:
            self._duplication_count += 1

    def remove_duplication(self, cvId_to_remove, channel, indexUrl):
        """Delete one CV from a channel's raw and measure stores and log it.

        :param cvId_to_remove: id of the CV to delete.
        :param channel: channel owning ``cvId_to_remove``; must be one of
            ``cv_51job``, ``cv_zhilian`` or ``cv_liepin``.
        :param indexUrl: id of the CV that is kept; only written to the log.
        :raises Exception: ``'unknown channel'`` if ``channel`` is not one of
            the known channels (nothing is deleted or logged in that case).
        """
        if channel not in self._KNOWN_CHANNELS:
            raise Exception('unknown channel')

        key = {'cvId': cvId_to_remove}
        # Raw store first, then measure store.
        getattr(self, channel + '_raw_store').delete_one(key)
        getattr(self, channel + '_measure_store').delete_one(key)

        line = "[D]cvId: %s is removed for duplicating with cvId: %s\n" % (
            cvId_to_remove, indexUrl)
        # The log file is opened in binary mode, so it needs a byte string.
        # Encoding explicitly works on both Python 2 (unicode ids) and
        # Python 3 (str), unlike the Python-2-only ``print >> file`` form.
        if not isinstance(line, bytes):
            line = line.encode('utf-8')
        self.dupli_file_save.write(line)

    def run(self, indexUrl, ps):
        """Check one parsed CV against the hash table and resolve duplicates.

        :param indexUrl: id of the CV being processed, of the form
            ``<channel>://...``.
        :param ps: parsed CV dict; its ``jobList`` entries supply
            ``incName``, ``jobPosition`` and ``jobDesc``.
        :returns: ``True`` if ``indexUrl`` was found to duplicate a CV whose
            channel priority is at least as high, in which case ``indexUrl``
            has been removed from its stores. ``False`` in every other case.
        """
        # NOTE(review): every path through the loop body returns, so only the
        # first jobList entry is ever examined -- confirm this is intended.
        for job in ps.get('jobList', []):
            incName = job.get('incName', '')
            jobPosition = job.get('jobPosition', '')
            jobDesc = job.get('jobDesc', '')

            # Only deduplicate when all three fields are non-empty.
            if not (incName and jobPosition and jobDesc):
                return False

            hash_value = util.md5([incName, jobPosition, jobDesc], remove_space=True)
            key = {'hash_value': hash_value}
            hash_doc = self.cv_hash_table.find_one(key)

            if not hash_doc:
                # First time this fingerprint is seen: register it.
                hash_doc = {'hash_value': hash_value, 'cvId': indexUrl}
                self.cv_hash_table.save_one(key, hash_doc, True)
                return False

            # Count the duplicate.
            self._inc_duplication_count(indexUrl)

            # NOTE(review): a hash record without 'cvId' would make the
            # split below fail on None -- confirm records always carry it.
            cvId_in_db = hash_doc.get('cvId')
            # Same id: most likely an update of the same CV, not a duplicate.
            if cvId_in_db == indexUrl:
                return False

            cv_channel_in_db = cvId_in_db.split('://')[0]
            current_priority = CHANNEL_PRIORITY.get(self.channel, 0)
            stored_priority = CHANNEL_PRIORITY.get(cv_channel_in_db, 0)

            if current_priority > stored_priority:
                # This channel wins: take over the hash-table entry ...
                hash_doc['cvId'] = indexUrl
                self.cv_hash_table.save_one(key, hash_doc, True)
                # ... and remove the lower-priority CV so the parsed data
                # contains no duplicates.
                self.remove_duplication(cvId_in_db, cv_channel_in_db, indexUrl)
                return False

            # The stored CV wins (or ties): drop the incoming one.
            self.remove_duplication(indexUrl, indexUrl.split('://')[0], cvId_in_db)
            return True

        return False