def parse_item(self, response):
    """Extract an item from *response* by trying each parser template
    registered for the response's domain.

    Reads from response.meta:
        domain  -- used to look up the ParserManager templates
        mission -- sequence; [1], [3], [4] are copied into the item
                   (presumably father-url number and marks -- TODO confirm)

    Returns the extracted item dict, augmented with bookkeeping fields.

    Raises:
        NotSupported -- no template module exists for the domain.
        DropItem     -- no template matched this page.
    """
    meta = response.meta
    try:
        pm = ParserManager(meta['domain'])
    except ImportError:
        raise NotSupported('Have no supported Template for domain:%s' % meta['domain'])
    item = {}
    match = False
    for tpl in pm.list():
        p = pm.create(tpl, response=response)
        # A template that cannot parse this page may raise; try the next one.
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate.
        try:
            item = p.extract()
            match = p.isMatch()
            if match:
                break
        except Exception:
            continue
    if not match:
        raise DropItem('This page has not been extracted!')
    item['father_url_number'] = meta['mission'][1]
    item['child_url'] = response.url
    item['sum_mark'] = meta['mission'][3]
    item['child_mark'] = meta['mission'][4]
    return item
def parse_item(self, response):
    """Extract an item from *response* using the domain's parser templates.

    Unlike the sibling variant that copies mission data from meta, this
    variant fills the bookkeeping fields with fixed values: an empty
    father_url_number and the literal mark "jr".

    Returns the extracted item dict.

    Raises:
        NotSupported -- no template module exists for the domain.
        DropItem     -- no template matched this page.
    """
    meta = response.meta
    try:
        pm = ParserManager(meta["domain"])
    except ImportError:
        raise NotSupported("Have no supported Template for domain:%s" % meta["domain"])
    item = {}
    match = False
    for tpl in pm.list():
        p = pm.create(tpl, response=response)
        # Try the next template on any parse failure. Narrowed from a bare
        # `except:` so KeyboardInterrupt/SystemExit still propagate.
        try:
            item = p.extract()
            match = p.isMatch()
            if match:
                break
        except Exception:
            continue
    if not match:
        raise DropItem("This page has not been extracted!")
    item["father_url_number"] = ""
    item["child_url"] = response.url
    item["sum_mark"] = "jr"
    item["child_mark"] = "jr"
    return item
def parse_item(self, response):
    """Extract an item from *response* via the domain's parser templates,
    then post-process group-buy aggregator pages.

    Reads from response.meta: domain, mission, area, and — for aggregator
    listings — parallel lists (hxs_a, content_image, content, title, price,
    deadline, source) indexed by meta['num'].

    Special cases (keyed off the link list meta['hxs_a']):
      * 'out.tuan800' links: item is rebuilt from the meta lists, with the
        content image appended to the content HTML.
      * 'tuan.27.cn' in the URL: price re-copied from meta.
      * 'nuomi.com' links: deadline is a unix-timestamp string; its first
        10 digits are converted to a 'YYYY-MM-DD' date.

    Raises:
        NotSupported -- no template module exists for the domain.
        DropItem     -- no template matched this page.
    """
    meta = response.meta
    try:
        pm = ParserManager(meta['domain'])
    except ImportError:
        raise NotSupported('Have no supported Template for domain:%s' % meta['domain'])
    item = {}
    match = False
    for tpl in pm.list():
        p = pm.create(tpl, response=response)
        # Try the next template on any parse failure. Narrowed from a bare
        # `except:` so KeyboardInterrupt/SystemExit still propagate.
        try:
            item = p.extract()
            match = p.isMatch()
            if match:
                break
        except Exception:
            continue
    if not match:
        raise DropItem('This page has not been extracted!')
    item['city'] = meta.get('area')
    item['father_url_number'] = meta['mission'][1]
    item['child_url'] = response.url
    item['sum_mark'] = meta['mission'][3]
    item['child_mark'] = meta['mission'][4]
    if len(meta['hxs_a']) > 0:
        if 'out.tuan800' in meta['hxs_a'][meta['num']]:
            content_image = meta['content_image'][meta['num']]
            item['image_urls'] = [content_image, ]
            image = "<br/><br/><img src='" + content_image + "'>"
            content = meta['content'][meta['num']]
            content = content + image
            item['title'] = meta['title'][meta['num']]
            item['content'] = content
            item['price'] = meta['price'][meta['num']].strip()
            item['source'] = meta['source']
        if 'tuan.27.cn' in response.url:
            item['price'] = meta['price'][meta['num']].strip()
        if 'nuomi.com' in meta['hxs_a'][meta['num']]:
            deadline = meta['deadline'][meta['num']]
            # First 10 chars of the string are the unix timestamp in seconds.
            deadline = deadline[0:10]
            # int() replaces the long-deprecated string.atoi(); identical result.
            deadline = int(deadline)
            deadline = time.localtime(deadline)
            item['deadline'] = time.strftime('%Y-%m-%d', deadline)
    return item
def parse_item(self, response):
    """Extract an item from *response* by trying each parser template for
    the response's domain.

    NOTE(review): unlike the sibling variants, this one neither raises
    DropItem on no-match nor returns the item -- it falls off the end and
    implicitly returns None. Looks truncated; confirm against callers.

    Raises:
        NotSupported -- no template module exists for the domain.
    """
    meta = response.meta
    try:
        pm = ParserManager(meta['domain'])
    except ImportError:
        raise NotSupported('Have no supported Template for domain: %s' % meta['domain'])
    item = {}
    match = False
    for tpl in pm.list():
        p = pm.create(tpl, response=response)
        # `except Exception, e:` is Python-2-only syntax and `e` was unused;
        # the plain form below is valid in both Python 2 and 3.
        try:
            item = p.extract()
            match = p.isMatch()
            if match:
                break
        except Exception:
            continue
def run(): parser = OptionParser() parser.add_option("-t", "--tpl", dest="template_name", help="specified a template") parser.add_option("-u", "--url", dest="request_url", help="specified a request url as 'http://www.baidu.com/'") (options, args) = parser.parse_args() tpl = options.template_name url = options.request_url if not tpl or not url: sys.stderr.write("Type './testunit.py --help' for usage.\n") sys.exit(1) spider = Crawl(url) spider.fetch() pm = ParserManager(urltools.get_domain(url)) p = pm.create(tpl, response = spider.response) item = p.extract() for kv in item.withdict().items(): print u'[%s]: %s' % kv