def process_item(self, item, spider):
    """Store a scraped note and its images, pointing image links at local paths.

    Items whose slug is listed in ``BLOCK_LIST`` are returned untouched.
    For every other item, each URL in ``item["images"]`` is recorded in the
    ``Image`` table and replaced inside the note content by a relative
    ``../images/<name>`` path; the note is then inserted into the ``Note``
    table.

    Args:
        item: Mapping providing the keys ``slug``, ``title``, ``content``,
            ``images``, ``url``, ``likes_count`` and ``views_count``.
        spider: Not used by this method.

    Returns:
        The same ``item`` object that was passed in.
    """
    if item["slug"] in BLOCK_LIST:
        return item

    print(u"抓取完毕: %s" % item["title"])

    # Replace each remote image URL in the content with its local path.
    content = item["content"]
    for img in item["images"]:
        path = get_image_name(img)
        if not path:
            # No file name could be derived from this URL. Leave the link
            # as it is instead of storing a row without a path and
            # rewriting the content to '../images/None'.
            logger.warning('SKIP image without file name: %s' % img)
            continue
        Image.insert(slug=item["slug"], url=img, path=path).execute()
        content = content.replace(img, '../images/%s' % path)

    try:
        Note.insert(
            title=item["title"],
            slug=item["slug"],
            url=item["url"],
            content=content,
            likes_count=int(item["likes_count"]),
            views_count=int(item["views_count"]),
        ).execute()
    except IntegrityError as e:
        # The insert violated a database constraint; log the item and
        # carry on so the pipeline keeps running.
        logger.warning('%s SKIP E: (%s)' % (dict(item), str(e)))
    return item
def process_item(self, item, spider):
    """Store a scraped note and its images, pointing image links at local paths.

    Items whose slug is listed in ``BLOCK_LIST`` are returned untouched.
    For every other item, each URL in ``item["images"]`` is recorded in the
    ``Image`` table and replaced inside the note content by a relative
    ``../images/<name>`` path; the note is then inserted into the ``Note``
    table.

    Args:
        item: Mapping providing the keys ``slug``, ``title``, ``content``,
            ``images``, ``url``, ``likes_count`` and ``views_count``.
        spider: Not used by this method.

    Returns:
        The same ``item`` object that was passed in.
    """
    if item["slug"] in BLOCK_LIST:
        return item

    print(u"抓取完毕: %s" % item["title"])

    # Replace each remote image URL in the content with its local path.
    content = item["content"]
    for img in item["images"]:
        path = get_image_name(img)
        if not path:
            # No file name could be derived from this URL. Leave the link
            # as it is instead of storing a row without a path and
            # rewriting the content to '../images/None'.
            logger.warning('SKIP image without file name: %s' % img)
            continue
        Image.insert(slug=item["slug"], url=img, path=path).execute()
        content = content.replace(img, '../images/%s' % path)

    try:
        Note.insert(
            title=item["title"],
            slug=item["slug"],
            url=item["url"],
            content=content,
            likes_count=int(item["likes_count"]),
            views_count=int(item["views_count"]),
        ).execute()
    except IntegrityError as e:
        # The insert violated a database constraint; log the item and
        # carry on so the pipeline keeps running.
        logger.warning('%s SKIP E: (%s)' % (dict(item), str(e)))
    return item
# -*- coding: utf-8 -*- ''' File Name: jianshu/image.py Author: JackeyGao mail: [email protected] Created Time: 五 1/ 8 14:50:27 2016 ''' import requests, shutil, re, sys from jianshu.settings import DEFAULT_REQUEST_HEADERS as headers from jianshu.db import Image reload(sys) sys.setdefaultencoding('utf-8') images = Image.select().execute() def get_image_name(url): group = re.findall('\d+-\w+.\w+', url) if not group: return None image_name = group[0] if 'imageMogr' in image_name: image_name = image_name.replace('?imageMogr2', '.jpg') return image_name def request_image(url): image_name = get_image_name(url) print(u"正在下载 %s" % image_name) try: response = requests.get(url, headers=headers,
# -*- coding: utf-8 -*-
'''
File Name: jianshu/image.py
Author: JackeyGao
mail: [email protected]
Created Time: Fri 1/ 8 14:50:27 2016
'''
import requests, shutil, re, sys
from jianshu.settings import DEFAULT_REQUEST_HEADERS as headers
from jianshu.db import Image

# Python 2 idiom: reload sys so that setdefaultencoding can be called,
# making UTF-8 the default string encoding.
reload(sys)
sys.setdefaultencoding('utf-8')

# All stored image records, loaded once at import time.
images = Image.select().execute()


def get_image_name(url):
    """Derive a local file name from an image URL.

    The name is the first ``<digits>-<word>`` token in ``url`` followed by
    one more character and a further word, e.g. ``123-abc.png``.

    Args:
        url: Image URL to inspect.

    Returns:
        The extracted file name, or ``None`` when ``url`` contains no such
        token. A name captured as ``<digits>-<word>?imageMogr2`` is
        returned as ``<digits>-<word>.jpg``.
    """
    # The '.' is intentionally unescaped: besides the extension dot it also
    # matches the '?' in front of an 'imageMogr2' query, which the
    # replacement below depends on.
    match = re.search(r'\d+-\w+.\w+', url)
    if match is None:
        return None

    image_name = match.group(0)
    if 'imageMogr' in image_name:
        image_name = image_name.replace('?imageMogr2', '.jpg')
    return image_name


def request_image(url):
    """Announce the download of the image at ``url``."""
    # NOTE(review): only the name lookup and the progress message are
    # visible here; confirm whether the download logic continues elsewhere.
    image_name = get_image_name(url)
    print(u"正在下载 %s" % image_name)