def __init__(self):
    cf = tools.load_config()
    self.threshold = float(cf.get('Section', 'threshold'))
    self.days = int(cf.get('Section', 'days'))
    self.maxPage = int(cf.get('Section', 'maxPage'))
    self.decoding = cf.get('Section', 'decoding')
    target_path = cf.get('Section', 'target_path')
    stopwords_path = cf.get('Section', 'stopwords_path')
    dict_path = cf.get('Section', 'corpus')
    self.s = similarity.TextSimilarity(target_path, stopwords_path, dict_path)
    # Batch identifier for this scan
    self.scan_id = str(time.time())
    # Homepage (science)
    self.science_url = 'https://pacaio.match.qq.com/irs/rcd?cid=58&token=c232b098ee7611faeffc46409e836360&ext=tech&page='
    # Internet
    self.internet_url = 'https://pacaio.match.qq.com/irs/rcd?cid=52&token=8f6b50e1667f130c10f981309e1d8200&ext=614,603,605,611,612,613,615,620,618&page=1'
    # IT
    self.it_url = 'https://pacaio.match.qq.com/irs/rcd?cid=52&token=8f6b50e1667f130c10f981309e1d8200&ext=604,609&page='
    # Blockchain
    self.blockchain_url = 'https://pacaio.match.qq.com/tags/tag2articles?id=276813&num=15&page='
    # AI
    self.ai_url = 'https://pacaio.match.qq.com/irs/rcd?cid=52&token=8f6b50e1667f130c10f981309e1d8200&ext=602,608,622&page='
    # Startups and innovation
    self.innovate_url = 'https://pacaio.match.qq.com/irs/rcd?cid=52&token=8f6b50e1667f130c10f981309e1d8200&ext=619,617,610&page='
    # Cutting-edge technology
    self.leadingSci_url = 'https://pacaio.match.qq.com/irs/rcd?cid=52&token=8f6b50e1667f130c10f981309e1d8200&ext=607,616,623,624&page='
    # Add the category URLs to start_urls
    self.start_urls.append(self.science_url)
    self.start_urls.append(self.internet_url)
    self.start_urls.append(self.it_url)
    self.start_urls.append(self.blockchain_url)
    self.start_urls.append(self.ai_url)
    self.start_urls.append(self.innovate_url)
    self.start_urls.append(self.leadingSci_url)
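# The category endpoints above end in 'page=' so that a page index can be
# appended. A minimal sketch of how these templates could be turned into paged
# requests, assuming Scrapy's start_requests() hook, the configured
# self.maxPage limit, a module-level `import scrapy`, and a parse() callback
# (the starting page index is an assumption, not taken from the project):
def start_requests(self):
    for base_url in self.start_urls:
        for page in range(1, self.maxPage + 1):
            # e.g. ...&page=1, ...&page=2, up to maxPage
            yield scrapy.Request(base_url + str(page), callback=self.parse)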
def __init__(self):
    cf = tools.load_config()
    self.threshold = float(cf.get('Section', 'threshold'))
    self.days = int(cf.get('Section', 'days'))
    self.maxPage = int(cf.get('Section', 'maxPage'))
    target_path = cf.get('Section', 'target_path')
    stopwords_path = cf.get('Section', 'stopwords_path')
    dict_path = cf.get('Section', 'corpus')
    self.decoding = cf.get('Section', 'decoding')
    self.s = similarity.TextSimilarity(target_path, stopwords_path, dict_path)
    # Batch identifier for this scan
    self.scan_id = str(time.time())
def __init__(self):
    cf = tools.load_config()
    self.threshold = float(cf.get('Section', 'threshold'))
    self.days = int(cf.get('Section', 'days'))
    self.maxPage = int(cf.get('Section', 'maxPage'))
    target_path = cf.get('Section', 'target_path')
    dict_path = cf.get('Section', 'corpus')
    stopwords_path = cf.get('Section', 'stopwords_path')
    self.decoding = cf.get('Section', 'decoding')
    self.s = similarity.TextSimilarity(target_path, stopwords_path, dict_path)
    # Batch identifier for this scan
    self.scan_id = str(time.time())
    self.category_urls = []
    self.page = 1
    # URL of the Sina rolling news listing (page index is appended)
    self.tech_url = 'http://news.sina.com.cn/roll/#pageid=153&lid=2515&page='
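# Each of the spider constructors above pulls its settings from a single
# [Section] block returned by tools.load_config(). A minimal sketch of that
# helper and of the keys the spiders read, assuming an INI file parsed with
# configparser (the file name 'config.ini' and the sample values are
# illustrative, not taken from the project):
#
#   [Section]
#   threshold = 0.6
#   days = 3
#   maxPage = 5
#   decoding = utf-8
#   target_path = ./data/target.txt
#   stopwords_path = ./data/stopwords.txt
#   corpus = ./data/dict.txt
#   closespider_itemcount = 1000
import configparser

def load_config(path='config.ini'):
    cf = configparser.ConfigParser()
    cf.read(path, encoding='utf-8')
    return cf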
def __init__(self):
    # Initialization: load config and connect to the database
    cf = tools.load_config()
    self.decoding = cf.get('Section', 'decoding')
    settings = get_project_settings()
    # Connect to MySQL using the project settings
    self.connect = pymysql.connect(host=settings.get('MYSQL_HOST'),
                                   port=settings.get('MYSQL_PORT'),
                                   db=settings.get('MYSQL_DBNAME'),
                                   user=settings.get('MYSQL_USER'),
                                   passwd=settings.get('MYSQL_PASSWD'),
                                   charset='utf8',
                                   use_unicode=True)
    # Cursor used for all inserts, deletes, queries and updates
    self.cursor = self.connect.cursor()
    self.connect.autocommit(True)
    # Fetch the source URLs stored in the database
    self.cursor.execute(self.source_urlselect)
    for r in self.cursor:
        self.url_list.append(r[0])
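# The constructor above looks up the MySQL connection parameters through
# get_project_settings(); a minimal sketch of the corresponding settings.py
# entries, assuming plain module-level constants (host, database name and
# credentials are placeholders, not the project's real values):
MYSQL_HOST = '127.0.0.1'
MYSQL_PORT = 3306            # pymysql expects an integer port
MYSQL_DBNAME = 'news'
MYSQL_USER = 'root'
MYSQL_PASSWD = 'password'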
# -*- coding: utf-8 -*-

# Scrapy settings for news_spider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

import random

from news_spider.spiders import tools

cf = tools.load_config()

BOT_NAME = 'news_spider'

# Stop the spider after this many items have been scraped
CLOSESPIDER_ITEMCOUNT = int(cf.get('Section', 'closespider_itemcount'))

SPIDER_MODULES = ['news_spider.spiders']
NEWSPIDER_MODULE = 'news_spider.spiders'

LOG_LEVEL = 'ERROR'
ROBOTSTXT_OBEY = False
CONCURRENT_REQUESTS = 3
DOWNLOAD_TIMEOUT = 180
DOWNLOAD_DELAY = random.randint(1, 3)
RETRY_ENABLED = False
COOKIES_ENABLED = False
REDIRECT_ENABLED = False

DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
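# Note: DOWNLOAD_DELAY = random.randint(1, 3) above is evaluated once, when
# settings.py is imported, so a single delay is used for the whole crawl.
# If a per-request random delay was intended, Scrapy's built-in
# RANDOMIZE_DOWNLOAD_DELAY is the idiomatic alternative (a sketch, not part of
# the original settings):
#   DOWNLOAD_DELAY = 2
#   RANDOMIZE_DOWNLOAD_DELAY = True   # waits 0.5x to 1.5x DOWNLOAD_DELAY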