def puts(cls, posts):
    """Create or update one post per input dict and link it to its tags.

    Args:
        posts: iterable of dicts. Each dict is read for the keys 'tags',
            'md5', 'ctime' and 'post_url'; every key except 'tags' is
            handed to the post's ``from_dict``.

    Raises:
        Exception: an existing post (looked up by ``post_url``) is attached
            to an entry whose md5 differs from the incoming 'md5'.
    """
    posts = list(posts)
    tagnames = set(chain.from_iterable(post['tags'] for post in posts))
    tags = {name: Tag.get_or_add_bi_name(name) for name in tagnames}

    def inner(posts):
        """Yield the new or updated post object for each input dict, in order."""
        for kargs in posts:
            entry = Entry.get_or_add_bi_md5(
                md5=kargs['md5'], ctime=kargs['ctime'])
            fields = {k: v for k, v in kargs.items() if k != 'tags'}
            query = cls.query.filter_by(
                post_url=fields['post_url']
            ).options(db.joinedload(cls.entry))
            try:
                post = query.one()
            except NoResultFound:
                # Unknown post_url: build a fresh post bound to the entry.
                post = cls(entry=entry)
                post.from_dict(fields)
                db.session.add(post)
            else:
                if post.entry.md5 != fields['md5']:
                    # Report the value that was actually compared.
                    raise Exception(
                        'md5 changed %s -> %s'
                        % (post.entry.md5, fields['md5']))
                post.from_dict(fields)
            yield post

    flushed = False
    seen = set()
    # list() drains the generator up front, so every post has been created
    # or updated before the first flush below.
    for kargs, post in zip(posts, list(inner(posts))):
        if post.id is None:
            if not flushed:
                db.session.flush()
                flushed = True
            # Mark 'id' as expired so it is reloaded on the next access.
            db.session.expire(post, ['id'])
        else:
            # Link tags only once per already-identified post.
            if post.id in seen:
                continue
            seen.add(post.id)
        for name in set(kargs['tags']):
            tag = tags[name]
            db.session.merge(Tagged(
                post_id=post.id, tag_id=tag.id, entry_id=post.entry.id))
def _update_images(begin=None, limit=65536):
    """Fetch posts from every source concurrently, then store them.

    Args:
        begin: forwarded to ``fetch_posts`` as its first positional argument.
        limit: forwarded to ``fetch_posts`` as its second positional argument.

    Each source is passed to ``fetch_posts`` as the third positional
    argument; the results of all sources are concatenated and handed to
    ``current_app.store.Post.puts``. The duration of both phases is logged.
    """
    from concurrent.futures import ThreadPoolExecutor

    start = time()
    sources = [make(dict) for make in makesources()]
    fetch = partial(fetch_posts, begin, limit)
    # ThreadPoolExecutor raises ValueError for max_workers <= 0, so keep at
    # least one worker even when there are no sources to fetch from.
    with ThreadPoolExecutor(max(1, len(sources))) as ex:
        posts = list(chain.from_iterable(ex.map(fetch, sources)))
    log.info(
        'fetch posts done, {} fetched, take {}',
        len(posts),
        time() - start
    )

    start = time()
    current_app.store.Post.puts(posts=posts)
    log.info('put posts done, take {}', time() - start)