def __init__(self): """ Initializes database connection and sessionmaker. Creates deals table. """ engine = db_connect() create_news_table(engine) # 初始化对象属性Session为可调用对象 self.Session = sessionmaker(bind=engine) self.recent_links = None self.nowtime = datetime.datetime.now()
def init_rule():
    engine = db_connect()
    create_news_table(engine)
    Session = sessionmaker(bind=engine)
    with session_scope(Session) as session:
        article_rule = ArticleRule(
            name='huxiu',
            allow_domains='huxiu.com',
            start_urls='https://www.huxiu.com/',
            next_page='',
            allow_url=r'/article.*/\d+\.html',
            extract_from='//div[@class="mod-info-flow"]',
            title_xpath='//div[@class="article-wrap"]/h1/text()',
            body_xpath='//div[@class="article-content-wrap"]/p//text()',
            publish_time_xpath='//span[@class="article-time"]/text()',
            source_site='虎嗅网',
            enable=1
        )
        book_rule = BookRule(
            name='readnovel',
            allow_domains='readnovel.com',
            start_urls='https://www.readnovel.com/',
            next_page='//div[starts-with(@class,"chapter-control")]/a[@id="j_chapterNext"]',
            allow_book_url=r'.*/book/\d+',
            extract_book_from='//div[@class="book-rank-list"]',
            book_title_xpath='//div[@class="book-info"]/h1/em/text()',
            book_intro_xpath='//div[@class="book-info"]/p[@class="intro"]/text()',
            book_author_xpath='//div[@class="book-info"]/p[@class="author"]/a/text()',
            book_category_xpath='//div[@class="book-info"]/h1/i/text()',
            allow_chapter_url=r'.*/chapter/\d+/\d+',
            extract_chapter_from='//div[@class="book-info"]',
            chapter_title_xpath='//h3[@class="j_chapterName"]/text()',
            chapter_body_xpath='//div[starts-with(@class,"read-content")]',
            publish_time_xpath='//span[@class="j_updateTime"]/text()',
            source_site='小说阅读网',
            enable=1
        )
        session.add(article_rule)
        session.add(book_rule)
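session_scope is used here but not defined in these snippets; it is presumably the standard SQLAlchemy transactional-scope recipe, taking the sessionmaker as its argument as the call above suggests. A minimal sketch:

from contextlib import contextmanager

@contextmanager
def session_scope(Session):
    """Provide a transactional scope around a series of operations."""
    session = Session()
    try:
        yield session
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()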
import logging

from spiders.article import ArticleSpider
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging
from models import db_connect, create_news_table
from models import ArticleRule
from sqlalchemy.orm import sessionmaker

if __name__ == '__main__':
    settings = get_project_settings()
    configure_logging(settings)
    db = db_connect()
    create_news_table(db)
    Session = sessionmaker(bind=db)
    session = Session()
    rules = session.query(ArticleRule).filter(ArticleRule.enable == 1).all()
    session.close()

    runner = CrawlerRunner(settings)
    for rule in rules:
        # stop reactor when spider closes
        # runner.signals.connect(spider_closing, signal=signals.spider_closed)
        runner.crawl(ArticleSpider, rule=rule)

    logging.info(rules)
    if len(rules) > 0:
        d = runner.join()
        d.addBoth(lambda _: reactor.stop())
        # reactor.run() blocks the process, so always keep it as the last statement
        reactor.run()
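runner.crawl(ArticleSpider, rule=rule) hands each ArticleRule row to the spider as a keyword argument. A hedged sketch of how ArticleSpider's constructor might turn that row into CrawlSpider configuration (the attribute names follow the rule columns above; the parse_item details are assumptions, not the project's actual code):

# spiders/article.py (sketch) -- assumed implementation
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class ArticleSpider(CrawlSpider):
    def __init__(self, rule, *args, **kwargs):
        self.rule = rule
        self.name = rule.name
        self.allowed_domains = rule.allow_domains.split(',')
        self.start_urls = rule.start_urls.split(',')
        # build the CrawlSpider rules from the DB row; they must be set
        # before super().__init__(), which compiles them
        self.rules = (
            Rule(LinkExtractor(allow=[rule.allow_url],
                               restrict_xpaths=[rule.extract_from]),
                 callback='parse_item'),
        )
        super().__init__(*args, **kwargs)

    def parse_item(self, response):
        # extract fields with the XPaths stored on the rule
        yield {
            'title': response.xpath(self.rule.title_xpath).get(),
            'body': ''.join(response.xpath(self.rule.body_xpath).getall()),
        }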
def __init__(self):
    engine = db_connect()
    create_news_table(engine)
    self.Session = sessionmaker(bind=engine)
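This constructor reads like a Scrapy item pipeline's __init__; the matching process_item would open a short-lived session per item. A sketch, assuming an Article model mapped to the news table (the model name is an assumption):

def process_item(self, item, spider):
    """Persist each scraped item; roll back on error (sketch)."""
    session = self.Session()
    try:
        article = Article(**item)  # Article model name is assumed
        session.add(article)
        session.commit()
    except Exception:
        session.rollback()
        raise
    finally:
        session.close()
    return item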
def init_rule():
    engine = db_connect()
    create_news_table(engine)
    Session = sessionmaker(bind=engine)
    with session_scope(Session) as session:
        article_rule1 = ArticleRule(
            name='huxiu',
            allow_domains='huxiu.com',
            start_urls='http://www.huxiu.com/',
            next_page='',
            allow_url=r'/article/\d+/\d+\.html',
            extract_from='//div[@class="mod-info-flow"]',
            title_xpath='//div[@class="article-wrap"]/h1/text()',
            body_xpath='//div[@id="article_content"]/p//text()',
            author_xpath='//span[@class="muted"][2]/a/text()',
            publish_time_xpath='//span[@class="article-time"]/text()',
            source_site='虎嗅网',
            enable=0)
        article_rule2 = ArticleRule(
            name='osc',
            allow_domains='oschina.net',
            start_urls='http://www.oschina.net/',
            next_page='',
            allow_url=r'/news/\d+/',
            extract_from='//div[@id="IndustryNews"]',
            title_xpath='//h1[@class="OSCTitle"]/text()',
            author_xpath='//span[@class="muted"][2]/a/text()',
            publish_time_xpath='//div[@class="PubDate"]/text()',
            body_xpath='//div[starts-with(@class, "Body")]/p[position()>1]//text()',
            source_site='开源中国',
            enable=0)
        article_rule3 = ArticleRule(
            name='along',
            allow_domains='along.party',
            start_urls='https://www.along.party/',
            next_page='//div[@class="pagination"]/ul/li/a',
            allow_url=r'/?p=\d+',
            extract_from='//div[@class="content"]',
            title_xpath='//h1[@class="article-title"]/a/text()',
            body_xpath='//article[@class="article-content"]',
            author_xpath='//span[@class="muted"][2]/a/text()',
            publish_time_xpath='//span[@class="muted"]/text()',
            source_site='蜷缩的蜗牛',
            enable=0)
        article_rule4 = ArticleRule(
            name='xncoding',
            allow_domains='xncoding.com',
            start_urls='https://www.xncoding.com/',
            next_page='//*[@id="content"]/nav/a',
            allow_url=r'/\d+/\d+/\d+/.*/.*\.html',
            extract_from='//div[@class="post-block"]',
            title_xpath='//h1[@class="post-title"]/text()',
            body_xpath='//div[@class="post-body"]',
            author_xpath='//span[@class="site-title"]/text()',
            publish_time_xpath='//time[@itemprop="dateCreated datePublished"]/text()',
            source_site='会飞的污熊',
            enable=1)
        session.add(article_rule1)
        session.add(article_rule2)
        session.add(article_rule3)
        session.add(article_rule4)
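init_rule() only needs to run once to seed the rules table, so a bare entry point is enough:

if __name__ == '__main__':
    init_rule()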