Пример #1
0
 def __init__(self):
     """
     Prepare database access and per-instance bookkeeping.

     Obtains an engine from ``db_connect``, hands it to
     ``create_news_table`` (presumably creating the tables when they are
     missing -- confirm against its definition), and stores a session
     factory bound to that engine on the instance.
     """
     db_engine = db_connect()
     create_news_table(db_engine)
     # Session is a factory: call self.Session() to obtain a new session.
     session_factory = sessionmaker(bind=db_engine)
     self.Session = session_factory
     # No links have been loaded yet.
     self.recent_links = None
     # Naive local timestamp taken at construction time.
     self.nowtime = datetime.datetime.now()
Пример #2
0
def init_rule():
    """Insert the default crawl rules into the database.

    Builds one ``ArticleRule`` (huxiu.com) and one ``BookRule``
    (readnovel.com) and adds both to a session obtained through
    ``session_scope``. Committing is left to ``session_scope`` --
    presumably it commits on a clean exit; confirm against its definition.

    URL patterns are written as raw strings so that regex escapes such as
    ``\\d`` and ``\\.`` are not treated as (invalid) string escapes.
    """
    engine = db_connect()
    create_news_table(engine)
    Session = sessionmaker(bind=engine)
    with session_scope(Session) as session:
        article_rule = ArticleRule(
            name='huxiu',
            allow_domains='huxiu.com',
            start_urls='https://www.huxiu.com/',
            next_page='',
            allow_url=r'/article.*/\d+\.html',
            extract_from='//div[@class="mod-info-flow"]',
            title_xpath='//div[@class="article-wrap"]/h1/text()',
            body_xpath='//div[@class="article-content-wrap"]/p//text()',
            publish_time_xpath='//span[@class="article-time"]/text()',
            source_site='虎嗅网',
            enable=1,
        )
        book_rule = BookRule(
            name='readnovel',
            allow_domains='readnovel.com',
            start_urls='https://www.readnovel.com/',
            next_page='//div[starts-with(@class,"chapter-control")]/a[@id="j_chapterNext"]',
            # Book-level extraction.
            allow_book_url=r'.*/book/\d+',
            extract_book_from='//div[@class="book-rank-list"]',
            book_title_xpath='//div[@class="book-info"]/h1/em/text()',
            book_intro_xpath='//div[@class="book-info"]/p[@class="intro"]/text()',
            book_author_xpath='//div[@class="book-info"]/p[@class="author"]/a/text()',
            book_category_xpath='//div[@class="book-info"]/h1/i/text()',
            # Chapter-level extraction.
            allow_chapter_url=r'.*/chapter/\d+/\d+',
            extract_chapter_from='//div[@class="book-info"]',
            chapter_title_xpath='//h3[@class="j_chapterName"]/text()',
            chapter_body_xpath='//div[starts-with(@class,"read-content")]',
            publish_time_xpath='//span[@class="j_updateTime"]/text()',
            source_site='小说阅读网',
            enable=1,
        )
        session.add(article_rule)
        session.add(book_rule)
Пример #3
0
import logging
from spiders.article import ArticleSpider
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging
from models import db_connect, create_news_table
from models import ArticleRule
from sqlalchemy.orm import sessionmaker

if __name__ == '__main__':
    # Entry point: run one ArticleSpider per enabled ArticleRule in a single
    # Twisted reactor.
    settings = get_project_settings()
    configure_logging(settings)

    db = db_connect()
    create_news_table(db)
    Session = sessionmaker(bind=db)

    # Load the enabled rules; release the session even if the query fails.
    session = Session()
    try:
        rules = session.query(ArticleRule).filter(ArticleRule.enable == 1).all()
    finally:
        session.close()

    runner = CrawlerRunner(settings)
    for rule in rules:
        runner.crawl(ArticleSpider, rule=rule)

    logging.info(rules)
    if rules:
        # Stop the reactor once every scheduled crawl has finished.
        d = runner.join()
        d.addBoth(lambda _: reactor.stop())
        # CrawlerRunner does not start the reactor itself; without this call
        # the scheduled crawls never execute. Blocks until reactor.stop(),
        # so it must stay the last statement.
        reactor.run()
Пример #4
0
 def __init__(self):
   """Connect to the database and keep a session factory on the instance."""
   db_engine = db_connect()
   create_news_table(db_engine)
   # Session is a factory bound to the engine: call self.Session() for a session.
   session_factory = sessionmaker(bind=db_engine)
   self.Session = session_factory
Пример #5
0
def init_rule():
    engine = db_connect()
    create_news_table(engine)
    Session = sessionmaker(bind=engine)
    with session_scope(Session) as session:
        artile_rule1 = ArticleRule(
            name='huxiu',
            allow_domains='huxiu.com',
            start_urls='http://www.huxiu.com/',
            next_page='',
            allow_url='/article/\d+/\d+\.html',
            extract_from='//div[@class="mod-info-flow"]',
            title_xpath='//div[@class="article-wrap"]/h1/text()',
            body_xpath='//div[@id="article_content"]/p//text()',
            author_xpath='//span[@class="muted"][2]/a/text()',
            publish_time_xpath='//span[@class="article-time"]/text()',
            source_site='虎嗅网',
            enable=0)
        artile_rule2 = ArticleRule(
            name='osc',
            allow_domains='oschina.net',
            start_urls='http://www.oschina.net/',
            next_page='',
            allow_url='/news/\d+/',
            extract_from='//div[@id="IndustryNews"]',
            title_xpath='//h1[@class="OSCTitle"]/text()',
            author_xpath='//span[@class="muted"][2]/a/text()',
            publish_time_xpath='//div[@class="PubDate"]/text()',
            body_xpath=
            '//div[starts-with(@class, "Body")]/p[position()>1]//text()',
            source_site='开源中国',
            enable=0)
        artile_rule3 = ArticleRule(
            name='along',
            allow_domains='along.party',
            start_urls='https://www.along.party/',
            next_page='//div[@class="pagination"]/ul/li/a',
            allow_url='/?p=\d+',
            extract_from='//div[@class="content"]',
            title_xpath='//h1[@class="article-title"]/a/text()',
            body_xpath='//article[@class="article-content"]',
            author_xpath='//span[@class="muted"][2]/a/text()',
            publish_time_xpath='//span[@class="muted"]/text()',
            source_site='蜷缩的蜗牛',
            enable=0)
        artile_rule4 = ArticleRule(
            name='along',
            allow_domains='along.party',
            start_urls='https://www.xncoding.com/',
            next_page='//*[@id="content"]/nav/a',
            allow_url='/\d+/\d+/\d+/.*/.*.html',
            extract_from='//div[@class="post-block"]',
            title_xpath='//h1[@class="post-title"]/text()',
            body_xpath='//div[@class="post-body"]',
            author_xpath='//span[@class="site-title"]/text()',
            publish_time_xpath=
            '//time[@itemprop="dateCreated datePublished"]/text()',
            source_site='会飞的污熊',
            enable=1)
        session.add(artile_rule1)
        session.add(artile_rule2)
        session.add(artile_rule3)