def real_mapper(queryset):
    """Process one queued crawl message and record the outcome on it.

    ``queryset`` is a single message-like object; this function reads its
    ``task``, ``payload`` (an iterable of ids) and ``year`` attributes.

    For every id in the payload the ``<task>Parse`` class looked up on
    ``parse`` is called page by page.  Each page's result is saved through
    the ``models`` class named after the task, and the parser's ``_alias``
    mapping is merged into ``models.AliasName``.  Once all pages of an id
    have been handled, the id is added to the ``IdFinished`` documents of
    ``queryset.year``.

    Bookkeeping on the ``Message`` documents matching the task and payload:
    ``inprocess`` is set to True up front; afterwards ``state`` becomes 2
    when the whole payload went through and 3 when an exception interrupted
    it, and ``inprocess`` is set back to False in both cases.  The exception
    itself is re-raised to the caller.

    Previously a failure re-raised before the failure flag was set, so the
    message kept ``inprocess=True`` and never reached state 3.
    """
    messages = Message.objects(task=queryset.task, payload=queryset.payload)
    Model = getattr(models, queryset.task)
    messages.update(set__inprocess=True)

    if queryset.task == 'Movie':
        # NOTE(review): this branch returns without touching ``state`` or
        # clearing ``inprocess``; kept as in the original -- confirm whether
        # Movie messages are meant to stay flagged like this.
        for process in queryset.payload:
            ret = parse.get_movie_info(process)
            ret['movieid'] = process
            models.Movie(**ret).save()
        return

    # Pessimistic default: only a payload processed to the end becomes 2.
    state = 3
    try:
        Parse = getattr(parse, queryset.task + 'Parse')
        for process in queryset.payload:
            p = Parse(process)
            count = 1
            while True:
                haspage = p()
                if haspage is None:
                    # Most likely a 404.
                    break
                result, hasnext = haspage
                Model(**result).save()
                # Alias registry: recording a person once globally is enough
                # to know all of their aliases.
                for k, v in p._alias.items():
                    alias_doc = models.AliasName.objects.get_or_create(
                        name=k)[0]
                    alias_doc.update(add_to_set__alias=v)
                if not hasnext:
                    # No next page: leave the loop.
                    break
                count += 1
                url = p.original_url
                p.set_url(url.replace('.html', '-{}.html'.format(count)))
            models.IdFinished.objects(year=queryset.year).update(
                add_to_set__ids=[process])
        state = 2
    finally:
        # Runs on success and on failure alike, so the message is never left
        # marked as in-process; an in-flight exception keeps propagating.
        messages.update(set__state=state)
        messages.update(set__inprocess=False)
def real_mapper(queryset):
    """Process one queued crawl message and record the outcome on it.

    ``queryset`` is a single message-like object; this function reads its
    ``task``, ``payload`` (an iterable of ids) and ``year`` attributes.

    For every id in the payload the ``<task>Parse`` class looked up on
    ``parse`` is called page by page.  Each page's result is saved through
    the ``models`` class named after the task, and the parser's ``_alias``
    mapping is merged into ``models.AliasName``.  Once all pages of an id
    have been handled, the id is added to the ``IdFinished`` documents of
    ``queryset.year``.

    Bookkeeping on the ``Message`` documents matching the task and payload:
    ``inprocess`` is set to True up front; afterwards ``state`` becomes 2
    when the whole payload went through and 3 when an exception interrupted
    it, and ``inprocess`` is set back to False in both cases.  The exception
    itself is re-raised to the caller.

    Previously a failure re-raised before the failure flag was set, so the
    message kept ``inprocess=True`` and never reached state 3.
    """
    messages = Message.objects(task=queryset.task, payload=queryset.payload)
    Model = getattr(models, queryset.task)
    messages.update(set__inprocess=True)

    if queryset.task == 'Movie':
        # NOTE(review): this branch returns without touching ``state`` or
        # clearing ``inprocess``; kept as in the original -- confirm whether
        # Movie messages are meant to stay flagged like this.
        for process in queryset.payload:
            ret = parse.get_movie_info(process)
            ret['movieid'] = process
            models.Movie(**ret).save()
        return

    # Pessimistic default: only a payload processed to the end becomes 2.
    state = 3
    try:
        Parse = getattr(parse, queryset.task + 'Parse')
        for process in queryset.payload:
            p = Parse(process)
            count = 1
            while True:
                haspage = p()
                if haspage is None:
                    # Most likely a 404.
                    break
                result, hasnext = haspage
                Model(**result).save()
                # Alias registry: recording a person once globally is enough
                # to know all of their aliases.
                for k, v in p._alias.items():
                    alias_doc = models.AliasName.objects.get_or_create(
                        name=k)[0]
                    alias_doc.update(add_to_set__alias=v)
                if not hasnext:
                    # No next page: leave the loop.
                    break
                count += 1
                url = p.original_url
                p.set_url(url.replace('.html', '-{}.html'.format(count)))
            models.IdFinished.objects(year=queryset.year).update(
                add_to_set__ids=[process])
        state = 2
    finally:
        # Runs on success and on failure alike, so the message is never left
        # marked as in-process; an in-flight exception keeps propagating.
        messages.update(set__state=state)
        messages.update(set__inprocess=False)
def __init__(self, map_func, num_workers=None, **kwargs):
    """Store the map function, select the messages to work on, start a pool.

    ``map_func`` is kept unchanged on the instance.  ``inputs`` is the
    ``Message`` query for documents whose ``state`` is not 2 and whose
    ``inprocess`` is not True.  ``num_workers`` and any extra keyword
    arguments are handed to ``multiprocessing.Pool``.
    """
    self.map_func = map_func
    pending = Message.objects(state__ne=2, inprocess__ne=True)
    self.inputs = pending
    worker_pool = multiprocessing.Pool(processes=num_workers, **kwargs)
    self.pool = worker_pool
def mtime_beat():
    '''Queue the crawl tasks for one year; each run handles a single year.

    Fetches every listing page of the year after ``get_year()``, collects the
    movie ids, drops the ids already recorded in ``IdFinished`` and saves one
    ``Message`` per (payload group, task).  The year is then recorded in
    ``YearFinished``.  When the first page cannot be parsed the run returns
    early after adjusting the scheduler interval.
    '''

    def _slow_down():
        # Adaptive interval: ask the scheduler for an increment, but only
        # while the current interval is still below TASK_BEAT * 7.
        if scheduler.get_interval < TASK_BEAT * 7:
            scheduler.change_interval(incr=True)

    year = get_year() + 1  # the year to crawl
    debug('Fetch Year: {} starting...'.format(year))
    listing = fetch(year, 1)
    page = get_movie_pages(listing)
    if page is None:
        warn('Movie"page has not fetched')
        _slow_down()
        return
    ids = get_movie_ids(listing)
    if ids is None:
        warn('Movie has not fetched')
        _slow_down()
        return

    # Fetching works again, so go back to the default interval.
    if scheduler.get_interval > TASK_BEAT:
        debug('Interval back to default')
        scheduler.change_interval(TASK_BEAT)

    year_ids = list(ids)
    if not year_ids:
        # No movies for this year: mark it finished and move on to the next.
        debug('Year: {} has not movie'.format(year))
        YearFinished(year=year).save()
        sleep2()
        return mtime_beat()

    # Remaining pages, if any (the loop body never runs when page < 2).
    page_no = 2
    while page_no <= page:
        listing = fetch(year, page_no)
        debug('Fetch Year:{} Page:{}'.format(year, page_no))
        ids = get_movie_ids(listing)
        if ids is None:
            # NOTE(review): the collapsed source does not show whether the
            # sleep/continue were nested under the interval check; this reads
            # them as unconditional, i.e. the same page is retried until it
            # parses -- confirm.
            _slow_down()
            # A captcha is probably being requested: enter it manually or
            # wait a while and retry until the site is usable again.
            sleep2(VERIFY_INTERVAL)
            continue
        year_ids.extend(ids)
        page_no += 1
        sleep2()

    finished_doc = IdFinished.objects(year=year).first()
    has_finished = [] if finished_doc is None else finished_doc.ids
    to_process = get_unfinished(has_finished, year_ids)

    # Push one message per task onto the corresponding queue.
    tasks = ('Fullcredits', 'Movie', 'Comment', 'Character', 'MicroComment',
             'Scenes', 'Awards', 'Plot', 'Details')
    for payload in group(to_process, TASK_BEAT_NUM):
        for task in tasks:
            debug('Push payload: {} to {} Queue'.format(payload, task))
            try:
                Message(year=year, task=task, payload=payload).save()
            except NotUniqueError:
                # An identical message already exists; only log it.
                debug('Duplicate insert: [{}], payload: {}'.format(task,
                                                                   payload))

    # Everything for this year has been queued.
    YearFinished(year=year).save()
    debug('Year: {} done'.format(year))
for k, v in p._alias.items(): models.AliasName.objects.get_or_create(name=k)[0].update( add_to_set__alias=v) if hasnext: count += 1 url = p.original_url p.set_url(url.replace('.html', '-{}.html'.format(count))) else: #没有下一页就退出循环 break except: raise STATE = False else: models.IdFinished.objects(year=queryset.year).update( add_to_set__ids=[process]) if STATE: this.update(set__state=2) else: this.update(set__state=3) this.update(set__inprocess=False) all = Message.objects(state__ne=2) for i in all: try: real_mapper(i) except: raise
# 别名体系, 这样只需要全局记录一个人物就知道他们的全部别名 for k, v in p._alias.items(): models.AliasName.objects.get_or_create( name=k)[0].update(add_to_set__alias=v) if hasnext: count += 1 url = p.original_url p.set_url(url.replace('.html', '-{}.html'.format(count))) else: #没有下一页就退出循环 break except: raise STATE = False else: models.IdFinished.objects( year=queryset.year ).update(add_to_set__ids=[process]) if STATE: this.update(set__state=2) else: this.update(set__state=3) this.update(set__inprocess=False) all = Message.objects(state__ne=2) for i in all: try: real_mapper(i) except: raise