def make_page_extractor(self, obj):
    """Create ``self.page_extractor`` from the ``'pages'`` entry of *obj*.

    Nothing is assigned (and ``None`` is returned) when *obj* is not a dict
    or when it has no truthy ``'pages'`` entry.

    :param obj: a config value; only dicts (including dict subclasses) are
        inspected, any other type is ignored.
    """
    # isinstance rather than an exact type comparison, so that dict
    # subclasses (e.g. OrderedDict) are not silently skipped.
    if not isinstance(obj, dict):
        return

    pages = obj.get('pages')
    if not pages:
        return

    # Both patterns go through the macro expander before use.
    regex = self.macro.expand(pages.get('regex'))
    xpath = self.macro.expand(pages.get('xpath'))
    self.page_extractor = SgmlLinkExtractor(
        allow=regex,
        restrict_xpaths=xpath,
        process_value=utils.first_n_pages(regex, pages))
def make_page_extractor(self, obj):
    """Create ``self.page_extractor`` from the ``'pages'`` entry of *obj*.

    Nothing is assigned (and ``None`` is returned) when *obj* is not a dict
    or when it has no truthy ``'pages'`` entry.

    :param obj: a config value; only dicts (including dict subclasses) are
        inspected, any other type is ignored.
    """
    # isinstance rather than an exact type comparison, so that dict
    # subclasses (e.g. OrderedDict) are not silently skipped.
    if not isinstance(obj, dict):
        return

    pages = obj.get('pages')
    if not pages:
        return

    # Both patterns go through the macro expander before use.
    regex = self.macro.expand(pages.get('regex'))
    xpath = self.macro.expand(pages.get('xpath'))
    self.page_extractor = SgmlLinkExtractor(
        allow=regex,
        restrict_xpaths=xpath,
        process_value=utils.first_n_pages(regex, pages))
def load_config(self, config_path):
    """Load the JSON config at *config_path* and configure this spider.

    The file content is obtained with ``utils.load_file`` and parsed with
    ``json.loads``.  The following attributes are then set on ``self``:
    ``config``, ``debug``, ``site``, ``macro``, ``allowed_domains``,
    ``start_urls``, ``start_method``, ``rules``, ``page_extractor``,
    ``proxy``, ``dedup``, ``plugin``, any of ``mongo``/``mysql``/``zmq``
    present in the config, and one attribute per key of the config's
    ``'settings'`` mapping.

    :param config_path: location of the config, as accepted by
        ``utils.load_file``.
    :raises CloseSpider: when ``utils.load_file`` returns a falsy value.
    """
    txt = utils.load_file(config_path)
    if not txt:
        raise CloseSpider()
    conf = json.loads(txt)

    # unicode() raises TypeError when asked to decode a value that is
    # already unicode, so only decode when that is not the case.
    if isinstance(config_path, unicode):
        shown_path = config_path
    else:
        shown_path = unicode(config_path, encoding='utf-8')
    log.msg(u'loading config from <{}>:\n{}'.format(
        shown_path,
        json.dumps(conf, indent=4, ensure_ascii=False, sort_keys=True)))

    #### config
    self.config = config_path

    #### debug
    self.debug = conf.get('debug', False)

    #### site
    self.site = conf.get('site', u'未知站点')
    self.macro = utils.MacroExpander({
        'SITE': self.site,
        'CONF': json.dumps(conf)
    })

    #### allowed_domains
    self.allowed_domains = conf.get('domains', [])

    #### start_urls
    urls = conf.get('urls', [])
    self.start_urls = utils.generate_urls(urls, self.macro)
    # 'method' and 'headers' are only read when 'urls' is a mapping;
    # any other form falls back to GET with no extra headers.
    urls_is_mapping = isinstance(urls, dict)
    self.start_method = urls.get('method', 'GET') if urls_is_mapping else 'GET'
    self.make_headers(urls.get('headers', {}) if urls_is_mapping else {})

    #### rules
    self.rules = []
    self.page_extractor = None
    # Only the rule bodies are used; the rule names are ignored.
    for spec in conf.get('rules', {}).itervalues():
        follow = spec.get('follow', True)
        # Rules that do not follow links hand their pages to parse_page.
        callback = None if follow else 'parse_page'
        regex = self.macro.expand(spec.get('regex'))
        xpath = self.macro.expand(spec.get('xpath'))
        pages = spec.get('pages')
        sub = spec.get('sub')
        extractor = SgmlLinkExtractor(
            allow=regex,
            restrict_xpaths=xpath,
            process_value=utils.first_n_pages(regex, pages))
        self.rules.append(Rule(
            extractor,
            process_links=self.sub_links(sub),
            callback=callback,
            follow=follow))

    # NOTE(review): the source formatting was lost; the page-extractor call
    # is assumed to belong to this branch (no rules configured) - confirm.
    if not self.rules:
        self.parse = self.parse_page
        self.make_page_extractor(urls)

    ### mappings(loop/fields)
    self.build_mappings(conf)

    ### proxy
    self.proxy = conf.get('proxy', {})

    ### database
    for db in ['mongo', 'mysql', 'zmq']:
        if db in conf:
            setattr(self, db, conf[db])

    ### settings
    self.dedup = settings.DEFAULT_DEDUP
    # Every key in 'settings' becomes an attribute, so it can override
    # anything assigned above (and can define 'plugin', checked below).
    for k, v in conf.get('settings', {}).iteritems():
        # NOTE(review): byte-string template; with Python 2 and non-ASCII
        # unicode keys/values this format presumably raises
        # UnicodeEncodeError - confirm before relying on such settings.
        log.msg(utils.G('+SET {} = {}'.format(k, v)))
        setattr(self, k, v)

    ### plugin
    if hasattr(self, 'plugin'):
        # self.plugin holds the plugin reference until it is replaced by
        # the loaded plugin object.
        self.plugin = utils.load_plugin(self.plugin)
        self.plugin.spider = self
    else:
        self.plugin = None
def load_config(self, config_path):
    """Load the JSON config at *config_path* and configure this spider.

    The file content is obtained with ``utils.load_file`` and parsed with
    ``json.loads``.  The following attributes are then set on ``self``:
    ``config``, ``debug``, ``site``, ``macro``, ``allowed_domains``,
    ``start_urls``, ``start_method``, ``rules``, ``page_extractor``,
    ``proxy``, ``dedup``, ``plugin``, any of ``mongo``/``mysql``/``zmq``
    present in the config, and one attribute per key of the config's
    ``'settings'`` mapping.

    :param config_path: location of the config, as accepted by
        ``utils.load_file``.
    :raises CloseSpider: when ``utils.load_file`` returns a falsy value.
    """
    txt = utils.load_file(config_path)
    if not txt:
        raise CloseSpider()
    conf = json.loads(txt)

    # unicode() raises TypeError when asked to decode a value that is
    # already unicode, so only decode when that is not the case.
    if isinstance(config_path, unicode):
        shown_path = config_path
    else:
        shown_path = unicode(config_path, encoding='utf-8')
    log.msg(u'loading config from <{}>:\n{}'.format(
        shown_path,
        json.dumps(conf, indent=4, ensure_ascii=False, sort_keys=True)))

    #### config
    self.config = config_path

    #### debug
    self.debug = conf.get('debug', False)

    #### site
    self.site = conf.get('site', u'未知站点')
    self.macro = utils.MacroExpander({
        'SITE': self.site,
        'CONF': json.dumps(conf)
    })

    #### allowed_domains
    self.allowed_domains = conf.get('domains', [])

    #### start_urls
    urls = conf.get('urls', [])
    self.start_urls = utils.generate_urls(urls, self.macro)
    # 'method' and 'headers' are only read when 'urls' is a mapping;
    # any other form falls back to GET with no extra headers.
    urls_is_mapping = isinstance(urls, dict)
    self.start_method = urls.get('method', 'GET') if urls_is_mapping else 'GET'
    self.make_headers(urls.get('headers', {}) if urls_is_mapping else {})

    #### rules
    self.rules = []
    self.page_extractor = None
    # Only the rule bodies are used; the rule names are ignored.
    for spec in conf.get('rules', {}).itervalues():
        follow = spec.get('follow', True)
        # Rules that do not follow links hand their pages to parse_page.
        callback = None if follow else 'parse_page'
        regex = self.macro.expand(spec.get('regex'))
        xpath = self.macro.expand(spec.get('xpath'))
        pages = spec.get('pages')
        sub = spec.get('sub')
        extractor = SgmlLinkExtractor(
            allow=regex,
            restrict_xpaths=xpath,
            process_value=utils.first_n_pages(regex, pages))
        self.rules.append(Rule(
            extractor,
            process_links=self.sub_links(sub),
            callback=callback,
            follow=follow))

    # NOTE(review): the source formatting was lost; the page-extractor call
    # is assumed to belong to this branch (no rules configured) - confirm.
    if not self.rules:
        self.parse = self.parse_page
        self.make_page_extractor(urls)

    ### mappings(loop/fields)
    self.build_mappings(conf)

    ### proxy
    self.proxy = conf.get('proxy', {})

    ### database
    for db in ['mongo', 'mysql', 'zmq']:
        if db in conf:
            setattr(self, db, conf[db])

    ### settings
    self.dedup = settings.DEFAULT_DEDUP
    # Every key in 'settings' becomes an attribute, so it can override
    # anything assigned above (and can define 'plugin', checked below).
    for k, v in conf.get('settings', {}).iteritems():
        # NOTE(review): byte-string template; with Python 2 and non-ASCII
        # unicode keys/values this format presumably raises
        # UnicodeEncodeError - confirm before relying on such settings.
        log.msg(utils.G('+SET {} = {}'.format(k, v)))
        setattr(self, k, v)

    ### plugin
    if hasattr(self, 'plugin'):
        # self.plugin holds the plugin reference until it is replaced by
        # the loaded plugin object.
        self.plugin = utils.load_plugin(self.plugin)
        self.plugin.spider = self
    else:
        self.plugin = None