def process_item(self, item, spider):
    """Store a scraped note and its images, pointing image URLs at local files.

    Items whose slug appears in ``BLOCK_LIST`` are returned untouched.
    Otherwise one ``Image`` row is inserted per entry in ``item["images"]``,
    every occurrence of that image URL in the content is replaced by
    ``../images/<name>``, and a ``Note`` row is inserted with the rewritten
    content.

    Args:
        item: Mapping read by key: "slug", "title", "content", "images",
            "url", "likes_count" and "views_count".
        spider: Not used by this method.

    Returns:
        The same ``item`` object, in every code path.
    """
    if item["slug"] in BLOCK_LIST:
        return item

    # Progress message on stdout for each item that is actually processed.
    print(u"抓取完毕: %s" % item["title"])

    # Replace every remote image URL in the content with its local path.
    content = item["content"]
    for img in item["images"]:
        path = get_image_name(img)
        Image.insert(slug=item["slug"], url=img, path=path).execute()
        content = content.replace(img, '../images/%s' % path)

    try:
        Note.insert(
            title=item["title"],
            slug=item["slug"],
            url=item["url"],
            content=content,
            likes_count=int(item["likes_count"]),
            views_count=int(item["views_count"]),
        ).execute()
    except IntegrityError as e:
        # NOTE(review): presumably raised for a note that is already stored;
        # confirm against the Note schema. The Image rows inserted above are
        # not undone here.
        # Logger.warn is a deprecated alias; warning() with lazy %-args
        # emits the same message text.
        logger.warning('%s SKIP E: (%s)', dict(item), e)
    return item
def write_summary():
    """Write ``output/SUMMARY.md``: a header line plus one link line per note.

    Notes are read with ``Note.select()`` ordered by ``views_count``
    descending, and each contributes a ``[title](markdown/<slug>.md)`` entry.
    The file is opened in ``'w'`` mode, so existing content is overwritten.
    """
    header = '* [Jianshu Hot](markdown/README.md)\n'
    notes_by_views = Note.select().order_by(Note.views_count.desc()).execute()
    entries = [
        ' - [%s](markdown/%s.md)\n' % (note.title, note.slug)
        for note in notes_by_views
    ]

    # NOTE(review): no encoding is passed to open(), so the platform default
    # is used; confirm that it can represent every note title.
    with open('output/SUMMARY.md', 'w') as f:
        f.write(header + ''.join(entries))
def process_item(self, item, spider):
    """Persist a scraped note and its images, rewriting image URLs in the content.

    An item whose slug is in ``BLOCK_LIST`` is returned without any work.
    For every URL in ``item["images"]`` an ``Image`` row is inserted and the
    URL is replaced in the content by ``../images/<name>``; a ``Note`` row
    holding the rewritten content is then inserted.

    Args:
        item: Mapping read by key: "slug", "title", "content", "images",
            "url", "likes_count" and "views_count".
        spider: Not used by this method.

    Returns:
        The same ``item`` object, in every code path.
    """
    if item["slug"] in BLOCK_LIST:
        return item

    # Progress message on stdout for each item that is actually processed.
    print(u"抓取完毕: %s" % item["title"])

    # Swap each remote image URL in the content for its local relative path.
    content = item["content"]
    for img in item["images"]:
        path = get_image_name(img)
        Image.insert(slug=item["slug"], url=img, path=path).execute()
        content = content.replace(img, '../images/%s' % path)

    try:
        Note.insert(
            title=item["title"],
            slug=item["slug"],
            url=item["url"],
            content=content,
            likes_count=int(item["likes_count"]),
            views_count=int(item["views_count"]),
        ).execute()
    except IntegrityError as e:
        # NOTE(review): presumably a duplicate note; confirm against the
        # Note schema. Image rows inserted above are left in place.
        # warning() replaces the deprecated warn() alias; lazy %-args
        # produce the same message text.
        logger.warning('%s SKIP E: (%s)', dict(item), e)
    return item
def gen_markdown():
    """Run the markdown generation steps in order.

    Calls ``write_readme()`` and ``write_summary()``, then passes every row
    returned by ``Note.select()`` to ``export_to_markdown()``.
    """
    write_readme()
    write_summary()

    all_notes = Note.select().execute()
    for note in all_notes:
        export_to_markdown(note)