Exemplo n.º 1
0
    def parse_item(self, response):
        """Extract an item from ``response`` with the first matching template.

        A ``ParserManager`` is built for ``response.meta['domain']`` and each
        template it lists is tried in order.  The item produced by the first
        parser whose ``isMatch()`` is truthy is kept and then tagged with the
        response URL and values read from ``response.meta['mission']``.

        Returns:
            The extracted item with ``father_url_number``, ``child_url``,
            ``sum_mark`` and ``child_mark`` set.

        Raises:
            NotSupported: building the ``ParserManager`` raised ``ImportError``.
            DropItem: no template matched the page.
        """
        meta = response.meta
        try:
            pm = ParserManager(meta['domain'])
        except ImportError:
            raise NotSupported('Have no supported Template for domain:%s' % meta['domain'])

        item = {}
        match = False
        for tpl in pm.list():
            p = pm.create(tpl, response=response)
            try:
                item = p.extract()
                match = p.isMatch()
            except Exception:
                # A template that fails is skipped so the next one can be
                # tried.  ``Exception`` (not a bare ``except``) lets
                # KeyboardInterrupt / SystemExit propagate.
                continue
            if match:
                break

        if not match:
            raise DropItem('This page has not been extracted!')

        # Indices 1, 3 and 4 of meta['mission'] are used as-is; the layout of
        # that sequence is defined by whoever scheduled the request.
        item['father_url_number'] = meta['mission'][1]
        item['child_url'] = response.url
        item['sum_mark'] = meta['mission'][3]
        item['child_mark'] = meta['mission'][4]

        return item
Exemplo n.º 2
0
    def parse_item(self, response):
        """Extract an item from ``response`` with the first matching template.

        A ``ParserManager`` is built for ``response.meta["domain"]`` and each
        template it lists is tried in order.  The item produced by the first
        parser whose ``isMatch()`` is truthy is kept and then tagged with the
        response URL and fixed marker values.

        Returns:
            The extracted item with ``father_url_number`` set to ``""``,
            ``child_url`` set to the response URL, and both ``sum_mark`` and
            ``child_mark`` set to ``"jr"``.

        Raises:
            NotSupported: building the ``ParserManager`` raised ``ImportError``.
            DropItem: no template matched the page.
        """
        meta = response.meta
        try:
            pm = ParserManager(meta["domain"])
        except ImportError:
            raise NotSupported("Have no supported Template for domain:%s" % meta["domain"])

        item = {}
        match = False
        for tpl in pm.list():
            p = pm.create(tpl, response=response)
            try:
                item = p.extract()
                match = p.isMatch()
            except Exception:
                # A template that fails is skipped so the next one can be
                # tried.  ``Exception`` (not a bare ``except``) lets
                # KeyboardInterrupt / SystemExit propagate.
                continue
            if match:
                break

        if not match:
            raise DropItem("This page has not been extracted!")

        # These tags are constants here rather than values read from meta.
        item["father_url_number"] = ""
        item["child_url"] = response.url
        item["sum_mark"] = "jr"
        item["child_mark"] = "jr"

        return item
Exemplo n.º 3
0
    def parse_item(self, response):
        """Extract an item from ``response`` and enrich it from request meta.

        A ``ParserManager`` is built for ``response.meta['domain']`` and each
        template it lists is tried in order; the item from the first parser
        whose ``isMatch()`` is truthy is kept.  The item is then tagged with
        the city, the response URL and values from ``meta['mission']``.  When
        ``meta['hxs_a']`` is non-empty, extra fields are filled in depending
        on which site the entry at index ``meta['num']`` (or the response URL)
        refers to.

        Returns:
            The extracted and enriched item.

        Raises:
            NotSupported: building the ``ParserManager`` raised ``ImportError``.
            DropItem: no template matched the page.
        """
        meta = response.meta

        try:
            pm = ParserManager(meta['domain'])
        except ImportError:
            raise NotSupported('Have no supported Template for domain:%s' % meta['domain'])

        item = {}
        match = False
        for tpl in pm.list():
            p = pm.create(tpl, response=response)
            try:
                item = p.extract()
                match = p.isMatch()
            except Exception:
                # A template that fails is skipped so the next one can be
                # tried.  ``Exception`` (not a bare ``except``) lets
                # KeyboardInterrupt / SystemExit propagate.
                continue
            if match:
                break

        if not match:
            raise DropItem('This page has not been extracted!')

        item['city'] = meta.get('area')
        item['father_url_number'] = meta['mission'][1]
        item['child_url'] = response.url
        item['sum_mark'] = meta['mission'][3]
        item['child_mark'] = meta['mission'][4]

        if meta['hxs_a']:
            # The per-listing meta sequences are all indexed by meta['num'].
            num = meta['num']
            link = meta['hxs_a'][num]

            if 'out.tuan800' in link:
                # Title, content, price and source come from meta here, and
                # the listing image is appended to the content as an <img> tag.
                content_image = meta['content_image'][num]
                item['image_urls'] = [content_image]
                image = "<br/><br/><img src='" + content_image + "'>"
                item['title'] = meta['title'][num]
                item['content'] = meta['content'][num] + image
                item['price'] = meta['price'][num].strip()
                item['source'] = meta['source']

            if 'tuan.27.cn' in response.url:
                item['price'] = meta['price'][num].strip()

            if 'nuomi.com' in link:
                # Only the first 10 characters are parsed as an integer.
                # NOTE(review): presumably a Unix timestamp in seconds with
                # extra trailing digits -- confirm against the caller.
                # int() replaces string.atoi, which no longer exists in
                # Python 3 and was already deprecated in Python 2.
                stamp = int(meta['deadline'][num][0:10])
                item['deadline'] = time.strftime('%Y-%m-%d', time.localtime(stamp))

        return item
Exemplo n.º 4
0
    def parse_item(self, response):
        """Try each template for the response's domain until one matches.

        A ``ParserManager`` is built for ``response.meta['domain']`` and each
        template it lists is tried in order, stopping at the first parser
        whose ``isMatch()`` is truthy.

        NOTE(review): as shown here the method ends right after the loop, so
        ``item`` and ``match`` are computed but never returned or checked.
        This looks like a cut-off excerpt -- confirm against the full source.

        Raises:
            NotSupported: building the ``ParserManager`` raised ``ImportError``.
        """
        meta = response.meta
        try:
            pm = ParserManager(meta['domain'])
        except ImportError:
            raise NotSupported('Have no supported Template for domain: %s' % meta['domain'])

        item = {}
        match = False
        for tpl in pm.list():
            p = pm.create(tpl, response=response)
            try:
                item = p.extract()
                match = p.isMatch()
            except Exception:
                # A template that fails is skipped so the next one can be
                # tried.  The exception object was never used, so it is no
                # longer bound; this form parses on Python 2 and Python 3.
                continue
            if match:
                break
Exemplo n.º 5
0
def run():
    """Fetch a URL, run one named template against it and print the result.

    Command-line options:
        -t / --tpl   name of the template to apply
        -u / --url   URL to request

    Both options are required; if either is missing a usage hint is written
    to stderr and the process exits with status 1.  Otherwise the URL is
    fetched with ``Crawl``, a parser for the given template is created through
    the ``ParserManager`` of the URL's domain, and every key/value pair of
    the extracted item (via ``item.withdict()``) is printed on its own line.
    """
    parser = OptionParser()
    parser.add_option("-t", "--tpl", dest="template_name",
                      help="specified a template")
    parser.add_option("-u", "--url", dest="request_url",
                      help="specified a request url as 'http://www.baidu.com/'")

    (options, args) = parser.parse_args()
    tpl = options.template_name
    url = options.request_url
    if not tpl or not url:
        sys.stderr.write("Type './testunit.py --help' for usage.\n")
        sys.exit(1)

    spider = Crawl(url)
    spider.fetch()

    pm = ParserManager(urltools.get_domain(url))
    p = pm.create(tpl, response=spider.response)
    item = p.extract()

    for kv in item.withdict().items():
        # Parenthesised single-argument form: prints the same as the old
        # print statement on Python 2 and is valid syntax on Python 3.
        print(u'[%s]: %s' % kv)