def run():
    name = sys.argv[1]  # command-line argument, here 'china'
    custom_settings = get_config(name)  # loads the china.json config file
    # name of the spider to use for the crawl
    spider = custom_settings.get('spider', 'universal')
    project_settings = get_project_settings()
    # copy the project settings into a plain dict so they can be updated in place
    settings = dict(project_settings.copy())
    # merge: the 'settings' block from china.json overrides the project settings
    settings.update(custom_settings.get('settings'))
    process = CrawlerProcess(settings)
    # start the crawl; the 'universal' spider name resolves to UniversalSpider
    process.crawl(spider, **{'name': name})
    process.start()
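Why copy the project settings into a dict before updating? get_project_settings() returns a Scrapy Settings object; converting a copy into a plain dict gives a mutable mapping that dict.update can overwrite without touching the global settings. A minimal illustration of the merge semantics (the setting names and values here are made up):

# Per-site settings override project-wide defaults, because dict.update
# replaces any key that appears in both mappings.
project = {'ROBOTSTXT_OBEY': True, 'DOWNLOAD_DELAY': 0}  # project-wide defaults
custom = {'DOWNLOAD_DELAY': 2}                           # 'settings' block from china.json
merged = dict(project)
merged.update(custom)
assert merged == {'ROBOTSTXT_OBEY': True, 'DOWNLOAD_DELAY': 2}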
def __init__(self, name, *args, **kwargs):
    config = get_config(name)
    self.config = config
    self.rules = rules.get(config.get('rules'))
    self.start_urls = config.get('start_urls')
    self.allowed_domains = config.get('allowed_domains')
    super(UniversalSpider, self).__init__(*args, **kwargs)
def __init__(self, name, *args, **kwargs):
    config = get_config(name)
    self.config = config
    self.rules = rules.get(config.get('rules'))  # the rules attribute is read separately from rules.py
    self.start_urls = config.get('start_urls')   # assign start_urls from the config
    self.allowed_domains = config.get('allowed_domains')  # assign allowed_domains from the config
    super(UniversalSpider, self).__init__(*args, **kwargs)
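The rules object that __init__ looks up is presumably a module-level dict in rules.py, mapping the config's 'rules' key to a tuple of CrawlSpider Rule objects. A minimal sketch of such a file; the key name, XPaths, and URL pattern below are illustrative, not the project's actual values:

# rules.py (sketch): named Rule sets, selected per site via the JSON config
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

rules = {
    'china': (
        # follow article links on the listing page and parse each detail page
        Rule(LinkExtractor(allow=r'article/.*\.html',
                           restrict_xpaths='//div[@id="left_side"]'),
             callback='parse_item'),
        # follow the "next page" link to keep paginating
        Rule(LinkExtractor(restrict_xpaths='//div[@id="pageStyle"]//a[contains(., "下一页")]')),
    )
}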
def run():
    name = sys.argv[1]
    print(name)
    custom_settings = get_config(name)
    spider = custom_settings.get('spider', 'universal')
    print(spider)
    rules = custom_settings.get('rules')
    print("rules: %s" % rules)
    project_settings = get_project_settings()
    print(project_settings)
    settings = dict(project_settings.copy())
    print(settings)
    settings.update(custom_settings.get('settings'))
    print(settings)
    process = CrawlerProcess(settings)
    print("process is: %s" % process)
    # c = process.crawl(spider, **{'name': name})
    c = process.crawl(UniversalSpider, **{'name': name})
    print("crawl() returned: %s" % c)
    process.start()
    print("process after start: %s" % process)
def __init__(self, name, *args, **kwargs):
    # load the custom JSON config file
    config = get_config(name)
    self.config = config
    # rules configuration, built via the custom SpiderRules helper
    self.rules = SpiderRules(
        detailUrlXpaths='//div[@class="p-con"]/div[@class="p-box"]/ul[@class="products"]',
        detailTags=('a', 'area'),
        detailAttrs=('href', ),
        detailCallback='parse_item',
        isSplash=True).rules.get(config.get('rules'))
    # self.rules = rules.get(config.get('rules'))
    # start_urls configuration
    start_urls = config.get('start_urls')
    if start_urls:
        if start_urls.get('type') == 'static':
            self.start_urls = start_urls.get('value')
        elif start_urls.get('type') == 'dynamic':
            self.start_urls = list(
                eval('urls.' + start_urls.get('method'))(
                    *start_urls.get('args', [])))
    # allowed_domains configuration
    self.allowed_domains = config.get('allowed_domains')
    super(UniversalSpider, self).__init__(*args, **kwargs)
def __init__(self, name, *args, **kwargs):
    # config is a dict (or list) deserialized from the JSON file
    config = get_config(name)
    self.config = config
    # look up the Rule set that corresponds to this spider in the rules mapping
    self.rules = rules.get(config.get('rules'))
    self.start_urls = config.get('start_urls')
    self.allowed_domains = config.get('allowed_domains')
    super(UniversalSpider, self).__init__(*args, **kwargs)
def run():
    name = sys.argv[1]
    custom_settings = get_config(name)
    spider = custom_settings.get('spider', 'universal')
    project_settings = get_project_settings()
    settings = dict(project_settings.copy())
    settings.update(custom_settings.get('settings'))
    process = CrawlerProcess(settings)
    process.crawl(spider, **{'name': name})
    process.start()
def __init__(self, name, *args, **kwargs):
    config = get_config(name)
    self.config = config
    self.rules = rules.get(config.get('rules'))
    start_urls = config.get('start_urls')
    if start_urls:
        if start_urls.get('type') == 'static':
            self.start_urls = start_urls.get('value')
        elif start_urls.get('type') == 'dynamic':
            self.start_urls = list(eval('urls.' + start_urls.get('method'))(*start_urls.get('args', [])))
    self.allowed_domains = config.get('allowed_domains')
    super(UniversalSpider, self).__init__(*args, **kwargs)
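The 'dynamic' branch resolves a generator function in a urls module by name (via eval('urls.' + method)) and calls it with the args list from the config. A sketch of what such a urls.py might contain; the function name and URL template are assumptions:

# urls.py (sketch): URL generators that the 'dynamic' start_urls type resolves by name
def china(start, end):
    # yield one listing-page URL per page number in [start, end]
    for page in range(start, end + 1):
        yield 'http://tech.china.com/articles/index_%s.html' % page

With this in place, a config entry like {'type': 'dynamic', 'method': 'china', 'args': [5, 10]} expands to six start URLs.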
def run():
    # read the command-line argument: the name of the JSON file, which is also the target site's name
    name = sys.argv[1]
    # pass that name to get_config to load the config file defined earlier
    custom_settings = get_config(name)
    # name of the Spider to use for the crawl
    spider = custom_settings.get('spider', 'universal')
    project_settings = get_project_settings()
    settings = dict(project_settings.copy())
    # merge the 'settings' block from the config file with the project-wide settings
    settings.update(custom_settings.get('settings'))
    # create a CrawlerProcess with the merged settings
    process = CrawlerProcess(settings)
    # call crawl to schedule the spider
    process.crawl(spider, **{'name': name})
    # call start to begin crawling
    process.start()
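get_config itself is not shown in these snippets. A minimal implementation consistent with how it is used here, assuming the JSON files live in a configs/ directory next to the script (the directory layout is an assumption):

# sketch of get_config, assuming configs/<name>.json next to this file
import json
from os.path import dirname, join, realpath

def get_config(name):
    path = join(dirname(realpath(__file__)), 'configs', name + '.json')
    with open(path, 'r', encoding='utf-8') as f:
        return json.loads(f.read())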
def run():
    name = sys.argv[1]
    custom_settings = get_config(name)
    # get the name of the spider to use for the crawl
    spider = custom_settings.get('spider')
    project_settings = get_project_settings()
    settings = dict(project_settings.copy())
    # merge: fold the settings from the JSON file into the settings auto-generated with the project
    settings.update(custom_settings.get('settings'))
    process = CrawlerProcess(settings)
    # start the crawl
    process.crawl(spider, **{'name': name})
    process.start()
def run():
    # name = sys.argv[1]  # name of the JSON config file
    name = 'china'
    custom_settings = get_config(name)
    spider = custom_settings.get('spider', 'universal')
    project_settings = get_project_settings()
    settings = dict(project_settings.copy())
    settings.update(custom_settings.get('settings'))
    process = CrawlerProcess(settings)
    # first argument: the spider to start; **{'name': name} is forwarded to
    # the 'name' parameter of UniversalSpider.__init__
    process.crawl(spider, **{'name': name})
    process.start()
def run():
    name = sys.argv[1]
    keyword = sys.argv[2]
    custom_settings = get_config(name)
    # name of the spider to use for the crawl
    spider = custom_settings.get('spider', 'universal')
    project_settings = get_project_settings()
    settings = dict(project_settings.copy())
    # merge configs; here the config's 'settings' is a list of dicts, one picked at random
    settings.update(random.choice(custom_settings.get('settings')))
    process = CrawlerProcess(settings)
    # start the crawl, forwarding both name and keyword to the spider
    process.crawl(spider, **{'name': name, 'keyword': keyword})
    process.start()
def __init__(self, name, keyword, *args, **kwargs):
    config = get_config(name)
    self.config = config
    self.rules = rules.get(config.get('rules'))
    self.name = name
    self.keyword = keyword
    # self.start_urls = ["http://106.38.57.66:8080/oasearch/front/search.do"]
    # if start_urls:
    #     if start_urls.get("type") == 'static':
    #         self.start_urls = start_urls.get("value")
    #     elif start_urls.get("type") == 'dynamic':
    #         self.start_urls = list(eval('urls.' + start_urls.get('method'))(*start_urls.get('args', []), keyword))
    self.allowed_domains = config.get('allowed_domains')
    super(UniversalSpider, self).__init__(*args, **kwargs)
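The commented-out dynamic branch above appends keyword after the args taken from the config, so the corresponding urls generator would have to accept it as its final parameter. A hypothetical example; the function name and URL shape are invented for illustration:

# urls.py (hypothetical): a generator whose final parameter is the search keyword
def search_pages(base, start, end, keyword):
    # yield one search-results URL per page, with the keyword in the query string
    for page in range(start, end + 1):
        yield '%s?q=%s&page=%d' % (base, keyword, page)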
def run():
    # get the command-line argument
    name = sys.argv[1]
    custom_settings = get_config(name)
    # get the spider name
    spider = custom_settings.get('spider', 'universal')
    # get the project's default settings
    project_settings = get_project_settings()
    settings = dict(project_settings.copy())
    # merge the configs
    settings.update(custom_settings.get('settings'))
    process = CrawlerProcess(settings)
    # start the crawl
    process.crawl(spider, **{'name': name})
    process.start()
def run():
    '''
    Entry-point script: reads the command-line argument and starts the Spider.
    With this file in place, the project is launched with: python run.py china
    '''
    name = sys.argv[1]  # command-line argument; this is also the spider config's name
    custom_settings = get_config(name)
    # name of the spider to run
    spider = custom_settings.get('spider', 'universal')
    project_settings = get_project_settings()
    settings = dict(project_settings.copy())
    # merge all settings
    settings.update(custom_settings.get('settings'))
    process = CrawlerProcess(settings)
    # start the crawl
    process.crawl(spider, **{'name': name})
    process.start()
def run():
    name = sys.argv[1]
    # load the data from the user-defined JSON file
    custom_settings = get_config(name)
    # name of the spider to use; falls back to 'universal' if not configured
    spider = custom_settings.get('spider', 'universal')
    print("spider_name: ", spider)
    project_settings = get_project_settings()  # read the project's settings file
    settings = dict(project_settings.copy())
    # merge: fold the custom config's settings into the project settings
    settings.update(custom_settings.get('settings'))
    process = CrawlerProcess(settings)  # instantiate the crawler via CrawlerProcess
    # start the crawl, passing the name; equivalent to `scrapy crawl spider_name`
    process.crawl(spider, **{'name': name})
    process.start()
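Taken together, the keys these snippets read (spider, settings, start_urls, allowed_domains, rules) imply a config file of roughly the following shape. The concrete values below are illustrative only:

# what get_config('china') might return after deserializing china.json (values illustrative)
example_config = {
    'spider': 'universal',
    'allowed_domains': ['tech.china.com'],
    'start_urls': {
        'type': 'static',
        'value': ['http://tech.china.com/articles/'],
    },
    'rules': 'china',                    # key into the rules dict in rules.py
    'settings': {'DOWNLOAD_DELAY': 2},   # overrides merged into the project settings
}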
def __init__(self, name, *args, **kwargs):
    # load the custom JSON config file
    config = get_config(name)
    self.config = config
    self.pageType = 1   # pagination type; selects which paging strategy to use
    self.pageTotal = 4  # total page count; click-driven paging needs this configured
    self.detailUrlXpaths = '//div[@class="p-con"]/div[@class="p-box"]/ul[@class="products"]'  # XPath of the detail-page links
    self.pageXpaths = '//div[@id="pageStyle"]//a[contains(., "下一页")]'  # XPath of the "next page" link
    self.selector = '.laypage_next'
    self.attribute = 'data-page'
    self.title = '//div[@class="pro-property"]/div[@class="pro-info"]/h2/text()'
    self.text = '//div[@class="pro-property"]/div[@class="pro-info"]/p/text()'
    # self.rules = rules.get(config.get('rules'))
    # pick the rules according to the pagination type
    if self.pageType == 0:
        # rules for href-based pagination
        self.rules = SpiderRules(detailUrlXpaths=self.detailUrlXpaths,
                                 pageXpaths=self.pageXpaths,
                                 detailCallback='parse_item',
                                 isSplash=False).rules.get('ruleHref')
    elif self.pageType == 1:
        # rules for click-driven pagination
        self.rules = SpiderRules(
            detailUrlXpaths=self.detailUrlXpaths,
            detailCallback='parse_item').rules.get('ruleClick')
    # start_urls configuration
    start_urls = config.get('start_urls')
    if start_urls:
        if start_urls.get('type') == 'static':
            self.start_urls = start_urls.get('value')
        elif start_urls.get('type') == 'dynamic':
            self.start_urls = list(
                eval('urls.' + start_urls.get('method'))(
                    *start_urls.get('args', [])))
    # allowed_domains configuration
    self.allowed_domains = config.get('allowed_domains')
    super(UniversalSpider, self).__init__(*args, **kwargs)
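SpiderRules is a custom helper that does not appear in these snippets; only its call sites do. One plausible shape, inferred from the keyword arguments and the 'ruleHref'/'ruleClick' lookups above, is a small wrapper that builds CrawlSpider Rule tuples. Everything in this sketch is an assumption:

# SpiderRules (sketch): a minimal wrapper consistent with the call sites above
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

class SpiderRules(object):
    def __init__(self, detailUrlXpaths, detailCallback,
                 detailTags=('a',), detailAttrs=('href',),
                 pageXpaths=None, isSplash=False):
        self.isSplash = isSplash  # in this sketch, Splash rendering is left to the caller
        # rule that extracts detail-page links and routes them to the callback
        detail = Rule(LinkExtractor(restrict_xpaths=detailUrlXpaths,
                                    tags=detailTags, attrs=detailAttrs),
                      callback=detailCallback)
        self.rules = {'ruleClick': (detail,)}
        if pageXpaths:
            # href-based pagination: additionally follow the "next page" link
            self.rules['ruleHref'] = (detail,
                                      Rule(LinkExtractor(restrict_xpaths=pageXpaths)))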
def __init__(self, name, *args, **kwargs):
    # forward the name to Spider.__init__ before reading the config
    super().__init__(name, *args, **kwargs)
    config = get_config(name)
    self.allowed_domains = config.get('allowed_domains')
    self.start_urls = config.get('start_urls')
    self.config = config