class CheckCvDupliRunner(object):
    """Cross-channel CV de-duplication worker.

    Hashes the first job entry of a parsed CV and consults a shared hash
    table to decide whether the CV duplicates one already stored.  When a
    duplicate is found, the copy from the lower-priority channel (per
    CHANNEL_PRIORITY) is deleted from its raw/measure stores and the
    removal is logged to a per-process result file.
    """

    def __init__(self, channel):
        self.channel = channel
        # Guards _duplication_count, which may be bumped from several threads.
        self.r_lock = threading.RLock()
        self._duplication_count = 0
        self.cv_hash_table = CVHashTable()
        # One raw + one measure store per supported crawl channel.
        self.cv_51job_raw_store = CVRawStore('cv_51job', 'raw')
        self.cv_51job_measure_store = CVRawStore('cv_51job', 'measure')
        self.cv_zhilian_raw_store = CVRawStore('cv_zhilian', 'raw')
        self.cv_zhilian_measure_store = CVRawStore('cv_zhilian', 'measure')
        self.cv_liepin_raw_store = CVRawStore('cv_liepin', 'raw')
        self.cv_liepin_measure_store = CVRawStore('cv_liepin', 'measure')
        # Per-process log of removed duplicate ids (pid in the name keeps
        # concurrent workers from clobbering each other's files).
        self.dupli_file_save = open(
            os.path.join(
                os.path.dirname(__file__),
                "../result/%s_duplicate_ids_%d" % (self.channel, os.getpid())),
            'wb')

    def _inc_duplication_count(self, indexUrl):
        """Thread-safely bump the running duplicate counter."""
        with self.r_lock:
            self._duplication_count += 1

    def remove_duplication(self, cvId_to_remove, channel, indexUrl):
        """Delete one CV from its channel's raw and measure stores.

        cvId_to_remove -- id of the losing (lower-priority) CV
        channel        -- channel the losing CV came from
        indexUrl       -- id of the surviving CV (for the log line only)
        """
        stores_by_channel = {
            'cv_51job': (self.cv_51job_raw_store, self.cv_51job_measure_store),
            'cv_zhilian': (self.cv_zhilian_raw_store, self.cv_zhilian_measure_store),
            'cv_liepin': (self.cv_liepin_raw_store, self.cv_liepin_measure_store),
        }
        if channel not in stores_by_channel:
            raise Exception('unknown channel')
        selector = {'cvId': cvId_to_remove}
        for store in stores_by_channel[channel]:
            store.delete_one(selector)
        print >> self.dupli_file_save, "[D]cvId: %s is removed for duplicating with cvId: %s" % (cvId_to_remove, indexUrl)

    def run(self, indexUrl, ps):
        """Check one parsed CV (``ps``) for duplication.

        Returns True when this CV is the duplicate that got removed (the
        stored CV wins), False otherwise.  Only the first job entry is
        ever examined: every branch inside the loop returns.
        """
        for job in ps.get('jobList', []):
            inc_name = job.get('incName', '')
            position = job.get('jobPosition', '')
            desc = job.get('jobDesc', '')
            # Dedupe only when all three fields are non-empty.
            if not inc_name or not position or not desc:
                return False
            digest = util.md5([inc_name, position, desc], remove_space=True)
            selector = {'hash_value': digest}
            existing = self.cv_hash_table.find_one(selector)
            if not existing:
                # First sighting of this hash: register it and keep the CV.
                fresh_doc = {'hash_value': digest, 'cvId': indexUrl}
                self.cv_hash_table.save_one(selector, fresh_doc, True)
                return False
            # Duplicate hash found -- count it.
            self._inc_duplication_count(indexUrl)
            stored_id = existing.get('cvId')
            if stored_id == indexUrl:
                # Same CV seen again; possibly an update, not a duplicate.
                return False
            stored_channel = stored_id.split('://')[0]
            if CHANNEL_PRIORITY.get(self.channel, 0) > CHANNEL_PRIORITY.get(stored_channel, 0):
                # Incoming channel outranks the stored one: take over the
                # hash-table slot and purge the lower-priority copy.
                existing['cvId'] = indexUrl
                self.cv_hash_table.save_one(selector, existing, True)
                self.remove_duplication(stored_id, stored_channel, indexUrl)
                return False
            # Stored CV wins: remove the incoming one instead.
            self.remove_duplication(indexUrl, indexUrl.split('://')[0], stored_id)
            return True
        return False