Example #1
File: utils.py Project: 65kg/core-scrapy
def init_rule():
    engine = db_connect()
    create_news_table(engine)
    Session = sessionmaker(bind=engine)
    with session_scope(Session) as session:
        article_rule1 = ArticleRule(
            name='huxiu',
            allow_domains='huxiu.com',
            start_urls='http://www.huxiu.com/',
            next_page='',
            allow_url=r'/article/\d+/\d+\.html',
            extract_from='//div[@class="mod-info-flow"]',
            title_xpath='//div[@class="article-wrap"]/h1/text()',
            body_xpath='//div[@id="article_content"]/p//text()',
            publish_time_xpath='//span[@class="article-time"]/text()',
            source_site='虎嗅网',
            enable=1
        )
        article_rule2 = ArticleRule(
            name='osc',
            allow_domains='oschina.net',
            start_urls='http://www.oschina.net/',
            next_page='',
            allow_url=r'/news/\d+/',
            extract_from='//div[@id="IndustryNews"]',
            title_xpath='//h1[@class="OSCTitle"]/text()',
            publish_time_xpath='//div[@class="PubDate"]/text()',
            body_xpath='//div[starts-with(@class, "Body")]/p[position()>1]//text()',
            source_site='开源中国',
            enable=1
        )
        session.add(article_rule1)
        session.add(article_rule2)
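
The session_scope helper is used in both utils.py examples but never shown. A minimal sketch, assuming it follows the standard SQLAlchemy commit/rollback context-manager recipe that the with-statement above implies:

from contextlib import contextmanager

@contextmanager
def session_scope(Session):
    """Provide a transactional scope around a series of operations."""
    session = Session()
    try:
        yield session
        session.commit()    # persist the ArticleRule rows added above
    except Exception:
        session.rollback()  # undo partial work on any error
        raise
    finally:
        session.close()
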
Example #2
File: utils.py Project: skyhee/iScrapy
def init_rule():
    engine = db_connect()
    create_news_table(engine)
    Session = sessionmaker(bind=engine)
    with session_scope(Session) as session:
        article_rule1 = ArticleRule(
            name='huxiu',
            allow_domains='huxiu.com',
            start_urls='http://www.huxiu.com/',
            next_page='',
            allow_url=r'/article/\d+/\d+\.html',
            extract_from='//div[@class="mod-info-flow"]',
            title_xpath='//div[@class="article-wrap"]/h1/text()',
            body_xpath='//div[@id="article_content"]/p//text()',
            publish_time_xpath='//span[@class="article-time"]/text()',
            source_site='虎嗅网',
            enable=1)
        article_rule2 = ArticleRule(
            name='osc',
            allow_domains='oschina.net',
            start_urls='http://www.oschina.net/',
            next_page='',
            allow_url=r'/news/\d+/',
            extract_from='//div[@id="IndustryNews"]',
            title_xpath='//h1[@class="OSCTitle"]/text()',
            publish_time_xpath='//div[@class="PubDate"]/text()',
            body_xpath='//div[starts-with(@class, "Body")]/p[position()>1]//text()',
            source_site='开源中国',
            enable=1)
        session.add(article_rule1)
        session.add(article_rule2)
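
The ArticleRule model itself is not shown in either example. A plausible declarative sketch, with columns inferred from the keyword arguments used above (the table name, column types, and lengths are assumptions):

from sqlalchemy import Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class ArticleRule(Base):
    """Assumed model: one crawl/extraction rule per news site."""
    __tablename__ = 'article_rule'  # guessed table name

    id = Column(Integer, primary_key=True)
    name = Column(String(50))
    allow_domains = Column(String(100))
    start_urls = Column(String(100))
    next_page = Column(String(100))
    allow_url = Column(String(100))
    extract_from = Column(String(200))
    title_xpath = Column(String(200))
    body_xpath = Column(String(200))
    publish_time_xpath = Column(String(200))
    source_site = Column(String(50))
    enable = Column(Integer)
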
Example #3
File: pipelines.py Project: kensunp/scrapy
 def __init__(self):
     """
     Initializes database connection and sessionmaker.
     Creates deals table.
     """
     engine = db_connect()
     create_news_table(engine)
     # Initialize the Session attribute as a callable session factory
     self.Session = sessionmaker(bind=engine)
     self.recent_links = None
     self.nowtime = datetime.datetime.now()
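
Given the recent_links and nowtime attributes initialized above, this pipeline presumably deduplicates items against links already stored today. A hypothetical process_item sketch (the News model and its link/crawl_time fields are assumptions, not part of the original code):

 def process_item(self, item, spider):
     # Hypothetical: load today's stored links once, then skip duplicates.
     session = self.Session()
     try:
         if self.recent_links is None:
             rows = session.query(News.link).filter(
                 News.crawl_time >= self.nowtime.date()).all()
             self.recent_links = {row.link for row in rows}
         if item['link'] not in self.recent_links:
             session.add(News(**item))
             session.commit()
             self.recent_links.add(item['link'])
     except Exception:
         session.rollback()
         raise
     finally:
         session.close()
     return item
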
Example #4
File: pipelines.py Project: kensunp/scrapy
 def __init__(self):
     engine = db_connect()
     self.Session = sessionmaker(bind=engine)
Example #5
File: pipelines.py Project: kensunp/scrapy
 def __init__(self):
     engine = db_connect()
     create_news_table(engine)
     self.Session = sessionmaker(bind=engine)
Example #6
"""

import logging
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging
from coolscrapy.models import db_connect, create_news_table
from coolscrapy.models import ArticleRule
from sqlalchemy.orm import sessionmaker
from coolscrapy.spiders.article_spider import ArticleSpider

if __name__ == '__main__':
    settings = get_project_settings()
    configure_logging(settings)
    db = db_connect()
    Session = sessionmaker(bind=db)
    session = Session()
    rules = session.query(ArticleRule).filter(ArticleRule.enable == 1).all()
    session.close()
    runner = CrawlerRunner(settings)

    for rule in rules:
        # spider = ArticleSpider(rule)  # instantiate every spider using rule
        # stop reactor when spider closes
        # runner.signals.connect(spider_closing, signal=signals.spider_closed)
        runner.crawl(ArticleSpider, rule=rule)

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # block here until every crawl finishes and reactor.stop() fires
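
db_connect and create_news_table from coolscrapy.models are called throughout these examples but never shown. A minimal sketch, assuming the usual SQLAlchemy engine factory plus Base.metadata.create_all (the SQLite URL stands in for whatever connection settings the real project reads from its config; Base is the declarative base from the ArticleRule sketch above):

from sqlalchemy import create_engine

def db_connect():
    # Assumption: the real project builds the URL from configuration;
    # a SQLite file keeps this sketch self-contained.
    return create_engine('sqlite:///news.db')

def create_news_table(engine):
    # Create every table registered on the declarative Base (e.g. ArticleRule),
    # skipping tables that already exist.
    Base.metadata.create_all(engine)
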
Example #7
File: run.py Project: 65kg/core-scrapy
"""

import logging
from spiders.article_spider import ArticleSpider
from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.project import get_project_settings
from scrapy.utils.log import configure_logging
from coolscrapy.models import db_connect
from coolscrapy.models import ArticleRule
from sqlalchemy.orm import sessionmaker

if __name__ == '__main__':
    settings = get_project_settings()
    configure_logging(settings)
    db = db_connect()
    Session = sessionmaker(bind=db)
    session = Session()
    rules = session.query(ArticleRule).filter(ArticleRule.enable == 1).all()
    session.close()
    runner = CrawlerRunner(settings)

    for rule in rules:
        # spider = ArticleSpider(rule)  # instantiate every spider using rule
        # stop reactor when spider closes
        # runner.signals.connect(spider_closing, signal=signals.spider_closed)
        runner.crawl(ArticleSpider, rule=rule)

    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()  # block here until every crawl finishes and reactor.stop() fires
Example #8
 def __init__(self):
     logg.info("Init ProxyDatabasePipeline")
     engine = db_connect()
     self.Session = sessionmaker(bind=engine)
     logg.info(self.Session)
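
logg is not defined inside the snippet; presumably it is a module-level logger along these lines:

import logging

logg = logging.getLogger(__name__)  # assumed logger used by the pipeline above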