import logging
import random
from collections import defaultdict

from django.db import transaction

LOG = logging.getLogger(__name__)


def do_update_story_images(
    ctx: ActorContext,
    feed_id: T.int,
    offset: T.int,
    story_url: T.url,
    images: T.list(T.dict(
        url=T.url,
        status=T.int,
    ))
):
    # save image info: deduplicate by url_root, keeping the
    # (status, url) tuple with the highest status code per root
    url_root_status = {}
    for img in images:
        url_root = ImageInfo.extract_url_root(img['url'])
        value = (img['status'], img['url'])
        if url_root in url_root_status:
            url_root_status[url_root] = max(value, url_root_status[url_root])
        else:
            url_root_status[url_root] = value
    # persist one ImageInfo row per url_root in a single transaction
    with transaction.atomic():
        image_info_objects = []
        for url_root, (status, url) in url_root_status.items():
            image_info_objects.append(ImageInfo(
                url_root=url_root,
                sample_url=url,
                referer=story_url,
                status_code=status,
            ))
        LOG.info(f'bulk create {len(image_info_objects)} ImageInfo objects')
        ImageInfo.objects.bulk_create(image_info_objects)
    _replace_story_images(feed_id, offset)
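# Illustrative sketch, not part of the project: it isolates the per-url_root
# dedup above. Tuple comparison via max() keeps, for each root, the
# (status, url) pair with the highest status code (ties broken by URL order),
# so exactly one sample_url and status land in ImageInfo per root.
# _extract_url_root below is a simplified stand-in for ImageInfo.extract_url_root.
from urllib.parse import urlsplit


def _extract_url_root(url):
    # simplified root: scheme + host
    parts = urlsplit(url)
    return f'{parts.scheme}://{parts.netloc}'


def _demo_url_root_dedup():
    images = [
        dict(url='https://img.example.com/a.png', status=200),
        dict(url='https://img.example.com/b.png', status=403),
        dict(url='https://cdn.example.org/c.png', status=200),
    ]
    url_root_status = {}
    for img in images:
        value = (img['status'], img['url'])
        url_root = _extract_url_root(img['url'])
        url_root_status[url_root] = max(value, url_root_status.get(url_root, value))
    # -> {'https://img.example.com': (403, 'https://img.example.com/b.png'),
    #     'https://cdn.example.org': (200, 'https://cdn.example.org/c.png')}
    return url_root_status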
def _detect_story_images(ctx, story):
    # collect image urls referenced by the story content
    image_processor = StoryImageProcessor(story.link, story.content)
    image_urls = _image_urls_of_indexs(image_processor.parse())
    if not image_urls:
        return
    # urls with no cached detection result still need probing, grouped by url_root
    image_statuses = ImageInfo.batch_detect_images(image_urls)
    num_todo_image_urls = 0
    todo_url_roots = defaultdict(list)
    for url in image_urls:
        status = image_statuses.get(url)
        if status is None:
            num_todo_image_urls += 1
            url_root = ImageInfo.extract_url_root(url)
            todo_url_roots[url_root].append(url)
    LOG.info(
        f'story#{story.feed_id},{story.offset} {story.link} has {len(image_urls)} images, '
        f'need to detect {num_todo_image_urls} images '
        f'from {len(todo_url_roots)} url_roots'
    )
    if todo_url_roots:
        # probe at most 3 urls per url_root to bound detection work
        todo_urls = []
        for items in todo_url_roots.values():
            if len(items) > 3:
                todo_urls.extend(random.sample(items, 3))
            else:
                todo_urls.extend(items)
        ctx.hope('worker_rss.detect_story_images', dict(
            feed_id=story.feed_id,
            offset=story.offset,
            story_url=story.link,
            image_urls=list(set(todo_urls)),
        ))
    else:
        # everything already detected, rewrite story images immediately
        _replace_story_images(feed_id=story.feed_id, offset=story.offset)
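# Illustrative sketch, not part of the project: it isolates the sampling step
# above. At most three urls per url_root get probed, presumably because
# ImageInfo rows are keyed by url_root, so a few samples stand in for every
# image under that root. _sample_todo_urls is a hypothetical helper mirroring
# the inline loop in _detect_story_images.
def _sample_todo_urls(todo_url_roots, k=3):
    todo_urls = []
    for items in todo_url_roots.values():
        # sample without replacement when a root has more than k urls
        todo_urls.extend(random.sample(items, k) if len(items) > k else items)
    return list(set(todo_urls))


def _demo_sample_todo_urls():
    todo_url_roots = {
        'https://img.example.com': [
            f'https://img.example.com/{i}.png' for i in range(5)
        ],
        'https://cdn.example.org': ['https://cdn.example.org/x.png'],
    }
    sampled = _sample_todo_urls(todo_url_roots)
    # 3 urls sampled from the first root + the single url from the second
    assert len(sampled) == 4
    return sampled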