Example #1
File: run.py Project: icexia/mgtv
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

# DBSession and Rule come from the project's model package (see Example #3);
# the spider import paths below are assumptions, since the excerpt omits them.
from model.config import DBSession
from model.rule import Rule
from spiders.baidu_spider import BaiduSpider      # assumed module paths
from spiders.iqiyi_spider import IqiyiSpider
from spiders.youku_spider import YoukuSpider
from spiders.douban_spider import DoubanSpider
from spiders.mgtv_spider import MgtvSpider

settings = Settings()

# crawl settings
settings.set("USER_AGENT", "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36")
settings.set("ITEM_PIPELINES", {
    # 'pipelines.DuplicatesPipeline': 200,
    # 'pipelines.CountDropPipline': 100,
    'pipelines.MysqlPipeline': 300,
})

process = CrawlerProcess(settings)

# schedule one crawl per active rule, choosing the spider by allowed domain
db = DBSession()
rules = db.query(Rule).filter(Rule.status == 1)
for rule in rules:
    if rule.allowed_domains == 'v.baidu.com':
        process.crawl(BaiduSpider, rule)
    elif rule.allowed_domains == 'iqiyi.com':
        process.crawl(IqiyiSpider, rule)
    elif rule.allowed_domains == 'youku.com':
        process.crawl(YoukuSpider, rule)
    elif rule.allowed_domains == 'douban.com':
        process.crawl(DoubanSpider, rule)
    else:
        process.crawl(MgtvSpider, rule)
process.start()

Example #2
    def start_requests(self):
        db = DBSession()
        sections = db.query(ChangzhiServerSection).all()
        for section in sections:
            yield Request(section.url, self.parse, meta={'section_id': section.id})
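For context, this start_requests override normally lives inside a Scrapy spider. Below is a minimal sketch of such an enclosing class; the spider name, the parse callback, and the import paths are assumptions, since the snippet shows only the method itself.

# Hypothetical enclosing spider for the start_requests snippet above.
import scrapy
from scrapy import Request

from model.config import DBSession                # assumed, mirrors Example #3
from model.section import ChangzhiServerSection   # assumed module path


class ChangzhiSpider(scrapy.Spider):  # hypothetical name
    name = 'changzhi'

    def start_requests(self):
        # issue one request per section stored in the database
        db = DBSession()
        sections = db.query(ChangzhiServerSection).all()
        for section in sections:
            yield Request(section.url, self.parse, meta={'section_id': section.id})

    def parse(self, response):
        # placeholder callback; the real parsing logic is not part of the snippet
        self.logger.info('section %s -> %s', response.meta['section_id'], response.url)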
Example #3
# -*- coding: utf-8 -*-
from spiders.deep_spider import DeepSpider
from model.config import DBSession
from model.rule import Rule
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

settings = Settings()

# crawl settings
settings.set("USER_AGENT", "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36")
settings.set("ITEM_PIPELINES" , {
    'pipelines.DuplicatesPipeline': 200,
    # 'pipelines.CountDropPipline': 100,
    'pipelines.DataBasePipeline': 300,
})

process = CrawlerProcess(settings)

db = DBSession()
rules = db.query(Rule).filter(Rule.enable == 1)
for rule in rules:
    process.crawl(DeepSpider, rule)
process.start()
Example #4
    "USER_AGENT",
    "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36"
)
settings.set(
    "ITEM_PIPELINES",
    {
        # 'pipelines.DuplicatesPipeline': 200,
        # 'pipelines.CountDropPipline': 100,
        'pipelines.SeprojectPipeline': 300
    })
# settings.set("LOG_LEVEL","INFO")

# process = CrawlerProcess(settings)

db = DBSession()
rules = db.query(Rule).filter(Rule.enable == 1)

for rule in rules:
    crawler = Crawler(settings)
    spider = BeijingSpider(rule)  # instantiate every spider using rule
    RUNNING_CRAWLERS.append(spider)

    # stop reactor when spider closes
    crawler.signals.connect(spider_closing, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()

# blocks process so always keep as the last statement
reactor.run()
# print(rule.starturl)
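Example #4 relies on a RUNNING_CRAWLERS list and a spider_closing callback that the excerpt never defines. A minimal sketch of what those definitions typically look like in this legacy-Scrapy/Twisted pattern is shown below; the exact names and behaviour in the original project are assumptions.

# Hypothetical definitions assumed by Example #4 (not part of the original excerpt).
from twisted.internet import reactor

RUNNING_CRAWLERS = []  # spiders that have been scheduled and not yet closed


def spider_closing(spider):
    """Remove the finished spider and stop the reactor once none remain."""
    RUNNING_CRAWLERS.remove(spider)
    if not RUNNING_CRAWLERS:
        reactor.stop()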
Example #5
# Imports reconstructed for this excerpt; the original omits them.
from __future__ import print_function

import time

from scrapy.settings import Settings

from wuchong.model.config import DBSession   # assumed module paths
from wuchong.model.project import Project
from wuchong.model.rules import Rules

time_now = time.strftime("%Y-%m-%d %H:%M:%S")
print('Current time:', time_now)


# logging setup
# log.start(loglevel=log.DEBUG)

settings = Settings()
settings.set("USER_AGENT", "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36")
settings.set("ITEM_PIPELINES", {
    'wuchong.pipelines.DuplicatesPipeline': 200,
    # 'pipelines.CountDropPipline': 100,
    'wuchong.pipelines.DataBasePipeline': 300
})

# bail out early if there are no enabled projects
dbp = DBSession()
nump = dbp.query(Project).filter(Project.status == 1).distinct().count()
if nump == 0:
    print('No runnable projects at the moment')
    dbp.commit()
    dbp.close()
    exit()

# bail out early if there are no enabled rule sites
dbr = DBSession()
numr = dbr.query(Rules).filter(Rules.enable == 1).distinct().count()
if numr == 0:
    print('No runnable rule sites at the moment')
    dbr.commit()
    dbr.close()
    exit()