Example #1
 def __init__(self):
     cf = tools.load_config()
     self.threshold = float(cf.get('Section', 'threshold'))
     self.days = int(cf.get('Section', 'days'))
     self.maxPage = int(cf.get('Section', 'maxPage'))
     self.decoding = cf.get('Section', 'decoding')
     target_path = cf.get('Section', 'target_path')
     stopwords_path = cf.get('Section', 'stopwords_path')
     dict_path = cf.get('Section', 'corpus')
     self.s = similarity.TextSimilarity(target_path, stopwords_path,
                                        dict_path)
     # Scan batch ID
     self.scan_id = str(time.time())
     # Home page
     self.science_url = 'https://pacaio.match.qq.com/irs/rcd?cid=58&token=c232b098ee7611faeffc46409e836360&ext=tech&page='
     # Internet
     self.internet_url = 'https://pacaio.match.qq.com/irs/rcd?cid=52&token=8f6b50e1667f130c10f981309e1d8200&ext=614,603,605,611,612,613,615,620,618&page=1'
     # IT
     self.it_url = 'https://pacaio.match.qq.com/irs/rcd?cid=52&token=8f6b50e1667f130c10f981309e1d8200&ext=604,609&page='
     # Blockchain
     self.blockchain_url = 'https://pacaio.match.qq.com/tags/tag2articles?id=276813&num=15&page='
     # AI
     self.ai_url = 'https://pacaio.match.qq.com/irs/rcd?cid=52&token=8f6b50e1667f130c10f981309e1d8200&ext=602,608,622&page='
     # Startups and innovation
     self.innovate_url = 'https://pacaio.match.qq.com/irs/rcd?cid=52&token=8f6b50e1667f130c10f981309e1d8200&ext=619,617,610&page='
     # Frontier tech
     self.leadingSci_url = 'https://pacaio.match.qq.com/irs/rcd?cid=52&token=8f6b50e1667f130c10f981309e1d8200&ext=607,616,623,624&page='
     # Append every category URL to start_urls
     self.start_urls.append(self.science_url)
     self.start_urls.append(self.internet_url)
     self.start_urls.append(self.it_url)
     self.start_urls.append(self.blockchain_url)
     self.start_urls.append(self.ai_url)
     self.start_urls.append(self.innovate_url)
     self.start_urls.append(self.leadingSci_url)
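
Every example begins with tools.load_config(), whose source is not shown here. A minimal sketch of what such a helper presumably looks like, assuming a configparser-backed INI file; the config.ini filename and the function body are assumptions, not the project's actual code:

    # Plausible tools.load_config (assumption: configparser reading an
    # INI file named config.ini that sits next to the spiders package).
    import configparser
    import os

    def load_config(path=None):
        cf = configparser.ConfigParser()
        if path is None:
            # Assumed default location of the config file
            path = os.path.join(os.path.dirname(__file__), 'config.ini')
        cf.read(path, encoding='utf-8')
        return cf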
Example #2
 def __init__(self):
     cf = tools.load_config()
     self.threshold = float(cf.get('Section', 'threshold'))
     self.days = int(cf.get('Section', 'days'))
     self.maxPage = int(cf.get('Section', 'maxPage'))
     target_path = cf.get('Section', 'target_path')
     stopwords_path = cf.get('Section', 'stopwords_path')
     dict_path = cf.get('Section', 'corpus')
     self.decoding = cf.get('Section', 'decoding')
     self.s = similarity.TextSimilarity(target_path, stopwords_path,
                                        dict_path)
     # Scan batch ID
     self.scan_id = str(time.time())
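
Taken together, the cf.get('Section', ...) calls imply an INI file shaped roughly like the following; every value here is a placeholder, not the project's real configuration:

    ; Hypothetical config.ini implied by the cf.get('Section', ...) calls
    [Section]
    threshold = 0.8
    days = 3
    maxPage = 5
    decoding = utf-8
    target_path = data/target.txt
    stopwords_path = data/stopwords.txt
    corpus = data/corpus.dict
    closespider_itemcount = 1000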
Example #3
 def __init__(self):
     cf = tools.load_config()
     self.threshold = float(cf.get('Section', 'threshold'))
     self.days = int(cf.get('Section', 'days'))
     self.maxPage = int(cf.get('Section', 'maxPage'))
     target_path = cf.get('Section', 'target_path')
     dict_path = cf.get('Section', 'corpus')
     stopwords_path = cf.get('Section', 'stopwords_path')
     self.decoding = cf.get('Section', 'decoding')
     self.s = similarity.TextSimilarity(target_path, stopwords_path,
                                        dict_path)
     # Scan batch ID
     self.scan_id = str(time.time())
     self.category_urls = []
     self.page = 1
     # Paginated URL template for the tech news roll
     self.tech_url = 'http://news.sina.com.cn/roll/#pageid=153&lid=2515&page='
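
Example #3 only initializes category_urls, page, and the page= URL template; the pagination logic itself is not shown. A sketch of how the template would presumably be expanded up to maxPage (the helper name is hypothetical):

    # Hypothetical helper: expand a 'page=' URL template into one URL
    # per page, e.g. build_category_urls(self.tech_url, self.maxPage).
    def build_category_urls(tech_url, max_page):
        return [tech_url + str(page) for page in range(1, max_page + 1)]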
Example #4
 def __init__(self):
     # Initialization
     cf = tools.load_config()
     self.decoding = cf.get('Section', 'decoding')
     settings = get_project_settings()

     # Connect to the database
     self.connect = pymysql.connect(host=settings.get('MYSQL_HOST'),
                                    port=settings.get('MYSQL_PORT'),
                                    db=settings.get('MYSQL_DBNAME'),
                                    user=settings.get('MYSQL_USER'),
                                    passwd=settings.get('MYSQL_PASSWD'),
                                    charset='utf8',
                                    use_unicode=True)

     # Execute inserts, deletes, updates, and queries through this cursor
     self.cursor = self.connect.cursor()
     self.connect.autocommit(True)

     # Load the source URLs stored in the database
     self.cursor.execute(self.source_urlselect)
     for r in self.cursor:
         self.url_list.append(r[0])
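
source_urlselect and url_list are referenced above but defined elsewhere in the class. They are presumably a SELECT statement and an accumulator list along these lines; the table and column names are placeholders, not the project's actual schema:

    # Assumed class-level attributes (not shown in the original snippet)
    source_urlselect = 'SELECT url FROM source_urls'
    url_list = []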
Example #5
# -*- coding: utf-8 -*-

# Scrapy settings for news_spider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import random
from news_spider.spiders import tools
cf = tools.load_config()

BOT_NAME = 'news_spider'

CLOSESPIDER_ITEMCOUNT = int(cf.get('Section',
                                   'closespider_itemcount'))  # stop the crawl after this many items
SPIDER_MODULES = ['news_spider.spiders']
NEWSPIDER_MODULE = 'news_spider.spiders'
LOG_LEVEL = 'ERROR'
ROBOTSTXT_OBEY = False
CONCURRENT_REQUESTS = 3
DOWNLOAD_TIMEOUT = 180
DOWNLOAD_DELAY = random.randint(1, 3)
RETRY_ENABLED = False
COOKIES_ENABLED = False
REDIRECT_ENABLED = False
DEFAULT_REQUEST_HEADERS = {
    "User-Agent":
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
}