Exemplo n.º 1
0
    def real_run(self, job):
        """Parse, de-duplicate, measure and store the CV carried by ``job``.

        ``job`` is read through ``.get`` for the keys ``indexUrl``,
        ``pagecontent``, ``updateTime`` and ``contentSign``.

        The page content is parsed with ``self.cv_parser``; when
        ``self.check_has_duplication`` reports a duplicate the method
        returns immediately and nothing is measured or stored.  Otherwise
        the raw CV object is built, measured with ``self.measure`` and
        handed to ``self.store``.

        Returns:
            None.
        """
        indexUrl = job.get('indexUrl')
        pagecontent = job.get('pagecontent')
        ps = self.cv_parser.parser(htmlContent=pagecontent, cvFrom=self.channel)

        # Stop here if this CV is a duplicate.
        if self.check_has_duplication(indexUrl, ps):
            return

        cvRaw_obj = constructCvRawObj(ps)
        cvRaw_obj.cvId = indexUrl
        cvRaw_obj.cvFrom = self.channel
        cvRaw_obj.updateTime = job.get('updateTime')
        cvRaw_obj.contentSign = job.get('contentSign')

        cvRaw = convertToCvRaw(cvRaw_obj)
        cvMeasure = self.measure(cvRaw)

        if not cvMeasure:
            # Call form of print: same output as the old print statement on
            # Python 2, and valid syntax on Python 3.
            print(cvRaw)
            # NOTE(review): processing continues with the falsy measurement,
            # exactly as before -- confirm constructCvMeasureObj accepts it.

        cvMeasured_obj = constructCvMeasureObj(cvMeasure)

        self.store(indexUrl, cvRaw_obj, cvMeasured_obj, False)
Exemplo n.º 2
0
    def run(self, job):
        """Measure and store a CV whose raw fields are already held in ``job``.

        ``job`` must contain the key ``cvId``.  The raw CV object is built
        directly from ``job``, measured with ``self.measure_runner`` and the
        measured object is passed to ``self.store_runner`` (with ``None`` in
        the raw-object position).

        Side effect: ``job['indexUrl']`` is set to the job's ``cvId``.

        Raises:
            AssertionError: if ``job`` has no ``cvId`` key.  The job is left
                unmodified in that case.
        """
        # Validate before touching the job so that an invalid job is not
        # mutated.  Raised explicitly rather than via ``assert`` so the
        # check is not stripped when Python runs with -O; the exception
        # type is the same as before.
        if "cvId" not in job:
            raise AssertionError("job is missing required key 'cvId'")

        # Only for consistency with the other runners: give the job an
        # indexUrl as well.
        job['indexUrl'] = job.get('cvId')

        cvRaw_obj = constructCvRawObj(job)
        cvRaw_obj.cvId = job.get('cvId')
        cvRaw_obj.cvFrom = self.channel
        cvRaw_obj.updateTime = job.get('updateTime')

        cvRaw = convertToCvRaw(cvRaw_obj)
        cvMeasure = self.measure_runner.run(cvRaw)
        cvMeasured_obj = constructCvMeasureObj(cvMeasure)
        self.store_runner.run(cvRaw.cvId, None, cvMeasured_obj)
Exemplo n.º 3
0
    def run(self, job):
        """Run the parse / de-duplicate / measure / store steps for ``job``.

        ``job`` is read through ``.get`` for ``indexUrl`` and
        ``pagecontent`` and is also passed on to ``self.add_other_fields``.
        When ``self.check_dupli_runner`` reports a duplicate, nothing
        further happens.  Always returns ``None``.
        """
        source_url = job.get('indexUrl')
        page_html = job.get('pagecontent')
        parsed = self.parse_runner.run(page_html)

        # Duplicate CVs are excluded from the rest of the pipeline.
        is_duplicate = self.check_dupli_runner.run(source_url, parsed)
        if not is_duplicate:
            raw_doc = constructCvRawObj(parsed)
            self.add_other_fields(raw_doc, self.channel, job)

            raw_cv = convertToCvRaw(raw_doc)
            measured = self.measure_runner.run(raw_cv)

            # Convert to the mongoengine model before storing.
            measured_doc = constructCvMeasureObj(measured)
            self.store_runner.run(source_url, raw_doc, measured_doc)
Exemplo n.º 4
0
    def run_job(self, job):
        """Parse the HTML file named by ``job`` and check the resulting fields.

        A falsy ``job`` is ignored.  Otherwise the file at ``job.get('fn')``
        is read in binary mode, parsed with ``self.cv_parser``, turned into
        a raw CV object and its JSON form is passed to
        ``self._check_fields`` together with the job's ``cvId``.

        Exceptions raised while parsing or checking are logged through
        ``Logger.default_log`` and their traceback is printed; they are not
        re-raised.  Errors from opening or reading the file are outside
        that handler and propagate to the caller.

        Returns:
            None.
        """
        if not job:
            return

        html_path = job.get('fn')
        cv_key = job.get('cvId')

        # Not guarded by the try block below: file errors reach the caller.
        with open(html_path, 'rb') as handle:
            raw_html = handle.read()

        try:
            parsed = self.cv_parser.parser(htmlContent=raw_html,
                                           cvFrom=self.channel)
            raw_doc = constructCvRawObj(parsed)

            # The stored id is the channel-qualified form of the job's cvId.
            qualified_id = "%s://%s" % (self.channel, cv_key)
            raw_doc.cvId = qualified_id
            raw_doc.cvFrom = self.channel

            as_json = raw_doc.to_json()
            self._check_fields(as_json, cv_key)
        except Exception:
            Logger.default_log("cvId: %s Fail" % cv_key)
            traceback.print_exc()