def real_run(self, job):
    """Parse one crawled CV page, measure it and store the result.

    Reads 'indexUrl', 'pagecontent', 'updateTime' and 'contentSign' from
    ``job`` via ``job.get``. Returns None in every case; when
    ``check_has_duplication`` reports a duplicate nothing is measured or
    stored.
    """
    url = job.get('indexUrl')
    html = job.get('pagecontent')
    parsed = self.cv_parser.parser(htmlContent=html, cvFrom=self.channel)

    # Check whether this CV is a duplicate; if so there is nothing to do.
    if self.check_has_duplication(url, parsed):
        return

    raw_obj = constructCvRawObj(parsed)
    raw_obj.cvId = url
    raw_obj.cvFrom = self.channel
    raw_obj.updateTime = job.get('updateTime')
    raw_obj.contentSign = job.get('contentSign')

    raw = convertToCvRaw(raw_obj)
    measured = self.measure(raw)
    if not measured:
        # Dump the raw CV when measuring produced a falsy result.
        # Processing still continues with that result below.
        print(raw)

    # The trailing False is passed through to store() unchanged.
    self.store(url, raw_obj, constructCvMeasureObj(measured), False)
def run(self, job):
    """Measure and store a CV whose fields arrive directly in ``job``.

    ``job`` must contain a 'cvId' key. Its value is copied to
    ``job['indexUrl']`` and used as the raw object's ``cvId``. The raw
    object is built from ``job`` itself, converted, measured through
    ``self.measure_runner`` and the measured object is handed to
    ``self.store_runner``.

    Raises:
        AssertionError: if ``job`` has no 'cvId' key. The check is made
            before ``job`` is modified, and it is raised explicitly so it
            is still enforced when Python runs with ``-O``.
    """
    if "cvId" not in job:
        raise AssertionError("job must contain a 'cvId' key")

    cv_id = job.get('cvId')
    # Only for consistency with the other runners, so the job has an indexUrl.
    job['indexUrl'] = cv_id

    raw_obj = constructCvRawObj(job)
    raw_obj.cvId = cv_id
    raw_obj.cvFrom = self.channel
    raw_obj.updateTime = job.get('updateTime')

    cvRaw = convertToCvRaw(raw_obj)
    measured_obj = constructCvMeasureObj(self.measure_runner.run(cvRaw))

    # None is passed in the raw-object position; only the measured object
    # is given to the store runner here.
    self.store_runner.run(cvRaw.cvId, None, measured_obj)
def run(self, job):
    """Parse, de-duplicate, measure and store one crawled CV page.

    Reads 'indexUrl' and 'pagecontent' from ``job`` via ``job.get``; the
    whole ``job`` is also passed to ``self.add_other_fields``. Returns
    None in every case; when ``self.check_dupli_runner`` reports a
    duplicate nothing is measured or stored.
    """
    url = job.get('indexUrl')
    parsed = self.parse_runner.run(job.get('pagecontent'))

    # Exclude duplicate CVs.
    if self.check_dupli_runner.run(url, parsed):
        return

    raw_obj = constructCvRawObj(parsed)
    self.add_other_fields(raw_obj, self.channel, job)

    measure_result = self.measure_runner.run(convertToCvRaw(raw_obj))

    # Convert to the mongoengine model.
    measured_obj = constructCvMeasureObj(measure_result)
    self.store_runner.run(url, raw_obj, measured_obj)
def run_job(self, job):
    """Parse a saved CV HTML file and run the field check on the result.

    A falsy ``job`` is ignored. Otherwise the file named by ``job['fn']``
    is read in binary mode, parsed, and the raw object's JSON form is
    passed to ``self._check_fields`` together with the job's 'cvId'.

    Any exception raised while parsing or checking is logged as a failure
    for that cvId and its traceback is printed; it is not re-raised.
    Errors from opening or reading the file are not caught here.
    """
    if not job:
        return

    path = job.get('fn')
    cvId = job.get('cvId')
    with open(path, 'rb') as handle:
        html = handle.read()

    try:
        parsed = self.cv_parser.parser(htmlContent=html, cvFrom=self.channel)
        raw_obj = constructCvRawObj(parsed)
        raw_obj.cvId = "%s://%s" % (self.channel, cvId)
        raw_obj.cvFrom = self.channel
        self._check_fields(raw_obj.to_json(), cvId)
    except Exception:
        Logger.default_log("cvId: %s Fail" % cvId)
        traceback.print_exc()