def run(q, channel, _type): if _type in ['raw', 'check']: handler = ETLRunnerFromRaw(channel) else: handler = ETLRunner(channel) sucess_cnt = 0 fail_cnt = 0 fail_save_file_name = 'result/%s_fail_ids_%d.txt' % (channel, os.getpid()) fail_save_file = FileSave(fail_save_file_name) result_file_name = 'result/%s_statistics_%d.txt' % (channel, os.getpid()) result_file = FileSave(result_file_name) while 1: job = get_job(q, _type) if job is None: break try: handler.run(job) sucess_cnt += 1 print >> result_file.fd, "%s" % (job['indexUrl']) if sucess_cnt % 1000 == 0: print "process %d, time: %s, success copied: %d, " \ "fail copied: %d, fail_save_file: %s, result_file: %s" % (os.getpid(), time.ctime(), sucess_cnt, fail_cnt, fail_save_file_name, result_file_name) except Exception as e: traceback.print_exc() fail_cnt += 1 fail_save_file.append_end_with(job['indexUrl'])
class FixCvField(BaseTask): def __init__(self, channel, field, field_pattern): BaseTask.__init__(self, "fix_cv_field") self.channel = channel self.field = field self.field_pattern = field_pattern self.bugfix = BugFixDispatcher(channel, self._queue) self.dispatcher = lambda q: self.bugfix.dispatcher() self.etl_runner = ETLRunnerFromRaw(channel) def run_job(self, job): try: if not isinstance(job, dict): return split_field = self.field.split('.') if len(split_field) == 2: field_1 = getattr(job, split_field[0]) if isinstance(field_1, list): for item in field_1: field_2 = getattr(item, split_field[1]) assert isinstance(field_2, basestring) if re.search(self.field_pattern, field_2): # 匹配模式,重新量化 print "cvId: ", job.get('cvId'), " || ", print self.field_pattern, " : ", field_2, " || ", self.etl_runner.run(job) except Exception as e: traceback.print_exc()