__date__ = '2018/5/29'
__QQ__ = '376205871'
"""
# NOTE(review): the two assignments above appear to be text INSIDE a module
# docstring whose opening triple-quote lies before this chunk; the bare `"""`
# here closes it. Confirm against the full file before editing.

from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor
from corpus_health.items import CorpusHealthItem
from scrapy_redis.spiders import RedisCrawlSpider
import urllib.parse
from math import floor
import re
from corpus_health.Util.LogHandler import LogHandler

# Project logging helper; presumably stream=True also emits records to the
# console -- verify against corpus_health.Util.LogHandler.
logger = LogHandler(__name__, stream=True)


class Ask120Spider(RedisCrawlSpider):
    """Crawl spider for 120ask.com health Q&A listings.

    Inherits from scrapy_redis's RedisCrawlSpider, so start URLs are
    presumably pulled from a Redis queue rather than `start_urls`
    (note the commented-out list below) -- confirm in project settings.

    NOTE(review): the class body is truncated in this view; crawl rules
    and parse callbacks are expected to follow but are not visible here.
    """

    # Pass these HTTP error statuses through to the callbacks instead of
    # letting Scrapy's default middleware drop the responses.
    handle_httpstatus_list = [404, 403, 500]
    name = 'ask120'
    allowed_domains = ['120ask.com']

    # Historical seed URLs, kept for reference (disease-specific list pages):
    # start_urls = [
    #     'http://www.120ask.com/list/gaoxueya/',
    #     'http://www.120ask.com/list/gaoxueya/all/2/'
    #     'http://www.120ask.com/list/tangniaobing/'
    #     'http://www.120ask.com/list/guanxinbing/'
    #     'http://www.120ask.com/list/ganmao/'
    #     'http://www.120ask.com/list/jingzhuibing/'
    #     'https://www.120ask.com/list/zhifanggan/'
    #     'http://www.120ask.com/list/tongfeng/'
import pymysql
import pymongo
import os
import urllib
from datetime import datetime
import re
from urllib import parse
from scrapy.exceptions import DropItem
import pandas as pd
from corpus_health import settings
from corpus_health.items import CorpusHealthItem
from corpus_health.items import MedicineItem
from corpus_health.items import NewsItem
from corpus_health.Util.LogHandler import LogHandler

# Project logging helper; stream=False presumably suppresses console
# output -- verify against corpus_health.Util.LogHandler.
logger = LogHandler(__name__, stream=False)


class CorpusHealthPipeline(object):
    """Scrapy item pipeline that persists scraped items to MySQL.

    Connection parameters are read from the project's settings module
    (MYSQL_HOST / MYSQL_DBNAME / MYSQL_USER / MYSQL_PASSWD).
    """

    def __init__(self):
        # Open one MySQL connection for the pipeline's lifetime.
        # NOTE(review): no close_spider() is visible in this chunk, so it is
        # unclear where the connection/cursor are released -- confirm.
        self.connect = pymysql.connect(host=settings.MYSQL_HOST,
                                       db=settings.MYSQL_DBNAME,
                                       user=settings.MYSQL_USER,
                                       passwd=settings.MYSQL_PASSWD,
                                       charset='utf8',
                                       use_unicode=True)
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        """Dispatch an item to the appropriate storage path by item class.

        Called once per scraped item by Scrapy's pipeline machinery.
        """
        # Timestamp recorded alongside the item (local time, second precision).
        cur_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        if item.__class__ == CorpusHealthItem:
            # NOTE(review): the chunk ends here -- the body of this branch
            # (and handling for MedicineItem / NewsItem) is not visible.