Example #1
    def parse(self, response):
        """
        The lines below are a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html

        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """
        sites = response.xpath('//a[@href]')
        print("sites", sites)
        items = []

        for site in sites:
            item = Website()
            item['name'] = site.xpath('@alt').extract()
            item['url'] = site.xpath('@href').extract()
            # print item['url']

            # item['url'] = site.xpath(
            #     'a/@href').extract_first().strip()
            item['description'] = site.xpath('text()').extract()
            for i in item['description']:
                print(i)
            items.append(item)

        return items
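Note: every snippet in this collection instantiates a Website item and fills fields such as name, url and description. The actual class is defined in each project's items.py; a minimal sketch of the kind of item these examples assume (field names inferred from the snippets here, not taken from any one project) could look like this:

import scrapy


class Website(scrapy.Item):
    # Placeholder fields inferred from the parse() methods in this
    # collection; individual projects add further fields of their own.
    name = scrapy.Field()
    url = scrapy.Field()
    description = scrapy.Field()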
Example #2
File: quarks.py  Project: GdZ/dirbot
    def parse(self, response):
        """
        The lines below are a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html

        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """
        sel = Selector(response)
        # sites = sel.xpath('//ul[@class="directory-url"]/li')
        sites = sel.xpath(self.rules.filters['root']['0'])
        items = []

        for site in sites:
            item = Website()
            # item['name'] = site.xpath('a/text()').extract()
            item['quarks_title'] = site.xpath(
                self.rules.filters['title']['0']).extract()
            item['quarks_link'] = site.xpath(
                self.rules.filters['link']['0']).extract()
            item['quarks_description'] = site.xpath(self.rules.filters['description']['0'])\
                .re(self.rules.filters['description']['1'])
            # item['quarks_pubdate'] = site.xpath(self.rules.filters['pubDate']['0'])
            items.append(item)

        return items
Example #3
    def parse(self, response):
        """
        The lines below are a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html

        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """
        sel = Selector(response)
        sites = sel.xpath('//ul[@class="directory-url"]/li')
        items = []

        for site in sites:
            item = Website()
            item['name'] = site.xpath('a/text()').extract()
            item['url'] = site.xpath('a/@href').extract()
            item['description'] = site.xpath('text()').re('-\s[^\n]*\\r')
            items.append(item)

        # The original script simply ended with "return items". The lines
        # below were added to also dump the scraped items to a text file,
        # so they have to run before the return statement.
        saveFile = open('crawl_data.txt', 'w')
        saveFile.write(str(items))
        saveFile.close()

        return items
Example #4
    def parse(self, response):
        """
        The lines below are a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html

        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """
        #sel = Selector(response)
        #sites = sel.xpath('//ul[@class="directory-url"]/li')

        sites = response.css(
            '#site-list-content > div.site-item > div.title-and-desc')
        items = []

        for site in sites:
            item = Website()
            item['name'] = site.css(
                'a > div.site-title::text').extract_first().strip()
            item['url'] = site.xpath('a/@href').extract_first().strip()
            item['description'] = site.css(
                'div.site-descr::text').extract_first().strip()
            items.append(item)

        return items
Example #5
    def parse(self, response):
        """
        The lines below are a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html
        Use the `check` command to run the contract checks.

        @url http://dmoztools.net/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """
        nodes = response.xpath(
            '//div[contains(@class, "site-item")]/div[contains(@class, "title-and-desc")]'
        )
        items = []

        for node in nodes:
            item = Website()
            item['name'] = node.xpath(
                'a/div[contains(@class, "site-title")]/text()').re_first(
                    '^[\s\r\n]*(.*[^\s])[\s\r\n]*')
            item['url'] = node.xpath('a/@href').extract_first()
            item['description'] = node.xpath(
                'div[contains(@class, "site-descr")]/text()').re_first(
                    '^[\s\r\n]*(.*)[\s\r\n]*')
            items.append(item)

        return items
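The docstring above doubles as a Scrapy contract: running Scrapy's check command (for example, scrapy check <spider_name> inside the project directory) fetches the @url page and verifies that the returned items populate the @scrapes fields.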
Example #6
 def parse(self, response):
     hxs = HtmlXPathSelector(response)
     items = []
     global page_request, articl_request
     item = Website()
     item['articl_url'] = self.parse_articl_url(response)
     page_url = self.parse_page_url(response)
     item['page_url'] = []
     if page_url != 0:
         item['page_url'].append(page_url)
     item['content'] = []
     item['articl_name'] = []
     #print "articl url:%s\nlen:%d\n" %(item['articl_url'],len(item['articl_url']))
     for t_count in range(len(item['articl_url'])):
         articl_url = item['articl_url'][t_count]
         print("################## request url = %s\n" % articl_url)
         articl_request = Request(articl_url,
                                  callback=self.parse_articl_content)
         articl_request.meta['item'] = item
         yield articl_request
         if t_count == len(item['articl_url']) - 1:
             if item['page_url']:
                 #print '$$$$$$$$$$$$$$$$$$$$$$$$page url %s\n' %(item['page_url'][0])
                 page_request = Request(item['page_url'][0],
                                        callback=self.parse)
                 yield page_request
             else:
                 # Yielding a bare string is not valid in a Scrapy callback; log it instead.
                 self.log('go to the last page, done!!!')
Example #7
    def parse_items(self, response):
        #print response
        sel = Selector(response)
        sites = sel.xpath('//ul/li')


        if response.status in [404,500,303]:
            raise CloseSpider("Met the page which doesn't exist")

        url = response.request.meta['url']
        print("ss")
        print(url)
        items = []
        for site in sites:
            item = Website()
            item['name'] = site.xpath('a/text()').extract()
            item['url'] = site.xpath('a/@href').extract()
            item['description'] = site.xpath('text()').re('-\s[^\n]*\\r')
            if len(item['url']) != 0:
                if len(str(item['url'][0])) != 1:
                    new_url = str(self.sta[0]) + str(item['url'][0])
                    yield Request(new_url, meta={'item': item}, callback=self.parse_items)
                    self.dic[url] = items
            items.append(item)
            yield self.collect_item(self.dic)
Example #8
    def parse_dir_contents(self, response):

        genre_text = response.xpath(
            "//body//div[@id='mw-pages']//h2//span//text()").extract()[0]
        genre = genre_text.split("Genre/")[-1].strip('"')

        artists = response.xpath(
            '//body//div[@id="mw-pages"]//div[@class="mw-content-ltr"]')

        artists = artists.xpath('//tr//ul//li//a')

        for sel in artists:

            item = Website()

            url = sel.xpath('@href').extract()[0]
            url = response.urljoin(url)

            title = sel.xpath('@title').extract()[0]

            item['url'] = url
            item['title'] = title
            item['genre'] = genre

            yield item
Example #9
    def parse1(self, response):

        print(response)
        sel = Selector(response)
        sites = sel.xpath('//ul/li')

        items = []
        urls = []
        for site in sites:
            item = Website()
            item['name'] = site.xpath('a/text()').extract()
            item['url'] = site.xpath('a/@href').extract()
            item['description'] = site.xpath('text()').re('-\s[^\n]*\\r')
            new_url = str(self.start_urls[0]) + str(item['url'][0])
            yield Request(new_url,
                          meta={
                              'item': item,
                              'url': new_url
                          },
                          callback=self.parse_items)
            #print item['name'] ,item['url'] ,item['description']
            items.append(item)
        self.dic[str(self.start_urls[0])] = items
        print(self.dic)
        print("result")
Example #10
 def parse(self, response):
     hxs = HtmlXPathSelector(response)
     sites = hxs.select('//ul/li')
     items = []
     for site in sites:
         item = Website()
         item['name'] = site.select('a/text()').extract()
         item['url'] = site.select('a/@href').extract()
         item['description'] = site.select('text()').extract()
         items.append(item)
     return items
Example #11
    def parse(self, response):
        """
        The lines below are a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html

        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """
        sel = Selector(response)
        # sites = sel.xpath('//ul[@class="directory-url"]/li')
        # sites = sel.xpath(self.rules.filters['root']['0'])
        bodies = sel.xpath(self.rules.filters['body']['0'])

        items = []

        for body in bodies:
            item = Website()
            # item['name'] = site.xpath('a/text()').extract()
            # jd_root_nav
            item['jddj_root_nav'] = body.xpath(
                self.rules.filters['root_nav']['0']).extract()
            ## these do not need two separate variables, because 'jd_root_nav' is already a list
            #item['jd_fenlie'] = body.xpath(self.rules.filters['fenlie']['0']).extract()
            #item['jd_fenglie2'] =body.xpath(self.rules.filters['fenglie2']['0']).extract()

            # jd_product_intro
            # item['jd_product_intro'] = body.xpath(self.rules.filters['product_intro']['0'])
            item['jddj_spec_n1'] = body.xpath(
                self.rules.filters['spec_n1']['0']).extract()
            ## the 'p_ad' rule looks correct, but it does not select anything; reason unknown
            item['jddj_p_ad'] = body.xpath(
                self.rules.filters['p_ad']['0']).extract()

            ## the 'jd_price' rule looks correct, but it does not select anything; reason unknown
            item['jddj_jd_price'] = body.xpath(
                self.rules.filters['jd_price']['0']).extract()
            #item['jd_product_detail_1'] = body.xpath(self.rules.filters['product_detail_1']['0'])
            #item['jd_parameter2'] = body.xpath(self.rules.filters['parameter2']['0'])
            item['jddj_canshu'] = body.xpath(
                self.rules.filters['canshu']['0']).extract()

            # jd_promises
            #item['jd_promises'] = body.xpath(self.rules.filters['promises']['0'])
            #item['jd_zhengpin'] = body.xpath(self.rules.filters['zhengpin']['0']).extract()

            # jd_comment, this is comment list
            #item['jd_comment'] = body.xpath(self.rules.filters['comment']['0'])

            # add to list items
            items.append(item)

        return items
Example #12
    def parse(self, response):
        """
        The lines below are a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html

        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """
        sel = Selector(response)
        sites = sel.xpath('//ul[@class="directory-url"]/li')
        items = []

        for site in sites:
            item = Website()
            item['name'] = site.xpath('a/text()').extract()
            item['url'] = site.xpath('a/@href').extract()
            item['description'] = site.xpath('text()').re('-\s[^\n]*\\r')
            items.append(item)

            item.self_print()

        return items
Example #13
 def parse_single(self, response):
     w = Website()
     path = response.url.split('/')
     number = path[-1].split('.')[0].lstrip('p0')
     author = path[-2]
     image_base = path[:-1]
     hxs = Selector(response)
     image = hxs.xpath('//img/@src').extract()[0]
     image_base.append(image)
     w['number'] = number
     w['author'] = author
     w['image_urls'] = ['/'.join(image_base)]
     return w
Example #14
    def parse(self, response):
        sel = Selector(response)
        #sites = sel.xpath('//ul/li')
        sites = response.css('a')
        items = []

        for site in sites:
            item = Website()
            item['name'] = site.xpath('@title').extract()
            item['url'] = site.xpath('@href').extract()
            items.append(item)

        return items
Example #15
    def parse(self, response):
        """
        The lines below are a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html

        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """
        item = None
        print("===============")

        if 'item' in response.meta:
            item = response.meta['item']

        if item is None:
            item = Website()
            item['url'] = self.start_urls[0]
            item['depth']=0

        hxs = Selector(response)
        hrefs = hxs.xpath("/html/body//@href").extract()
        item['description'] = '测试'
        item['title']= 'title'
        yield item
        for a in hrefs:
            item_detail=Website()
            item_detail['depth'] = item['depth'] + 1
            item_detail['url'] = a
            if item_detail['depth'] > self.depth :
                return
            else:
                if a.startswith('http') and a.find('python') >= 0:
                    yield scrapy.Request(url=a, meta={'item': item_detail}, callback=self.parse, dont_filter=True)
                    pass
                pass
        pass
Example #16
    def parse(self, response):
        """
        The lines below is a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html

        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """
        sites = response.selector.xpath('//h3')
        items = []

        for site in sites:
            item = Website()
            item['name'] = site.xpath('//p').extract_first().strip()
            items.append(item)

        print(items)
        return items
Example #17
    def parse(self, response):
        """
        The lines below are a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html
        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """
        sel = Selector(response)
        sites = sel.xpath('//ul[@class="directory-url"]/li')
        items = []

        for site in sites:
            item = Website()
            item['name'] = site.xpath('title/text()').extract()
            items.append(item)

        return items
Example #18
    def parse(self, response):
        """
        The lines below are a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html

        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@id="main-content"]//td')
        items = []

        for site in sites:
            item = Website()
            item['url'] = site.select('a/@href').extract()
            items.append(item)
        #dict(item)

        return items
Example #19
 def parse_item(self, response):
     p = Pinyin()
     items = []
     sel = Selector(response)
     base_url = get_base_url(response)
     sites_even = sel.css('table.tablelist tr.even')
     for site in sites_even:
         item = Website()
         item['name'] = site.css('.l.square a').xpath('text()').extract()[0]
         item['description'] = site.css(
             'tr > td:nth-child(2)::text').extract()[0]
         url = site.css('tr > td:nth-child(4)::text').extract()[0]
         item['url'] = p.get_pinyin(url, u'')
         item['address'] = url
         item['num'] = int(
             site.css('tr > td:nth-child(3)::text').extract()[0])
         item['date'] = site.css('tr > td:nth-child(5)::text').extract()[0]
         item['uid'] = item['date'] + '-' + url + '-' + item['name']
         items.append(item)
     return items
Example #20
    def parse(self, response):
        """
        The lines below are a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html

        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//ul[@class="directory-url"]/li')
        items = []

        for site in sites:
            item = Website()
            item['name'] = site.select('a/text()').extract()
            item['url'] = site.select('a/@href').extract()
            item['description'] = site.select('text()').re('-\s([^\n]*?)\\n')
            items.append(item)

        return items
Example #21
 def parse(self, response):
     sel = Selector(response)
     sites = sel.xpath('//table/tbody')
     for site in sites:
         item = Website()
         item['program_name'] = site.xpath(
             './tr[1]/td[3]/a/text()').extract()
         item['license_no'] = site.xpath(
             './tr[1]/td[5]/a/text()').extract()
         item['area'] = site.xpath('./tr[2]/td[2]/text()').extract()
         item['open_time'] = site.xpath('./tr[2]/td[4]/text()').extract()
         item['program_type'] = site.xpath(
             './tr[3]/td[2]/text()').extract()
         item['sale_phone_no'] = site.xpath(
             './tr[3]/td[4]/text()').extract()
         item['program_addr'] = site.xpath(
             './tr[4]/td[2]/text()').extract()
         yield item
     sm = send_email()
     sm.send_email()
Example #22
    def parse_items(self, response):
        """
        The lines below are a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html

        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """
        #print response
        sel = Selector(response)
        sites = sel.xpath('//ul/li')
        """print "sunny"
        print sites
        print "prakash"

        """

        if response.status in [404, 500, 303]:
            raise CloseSpider("Met the page which doesn't exist")

        url = response.request.meta['url']
        print("ss")
        print(url)
        #urls=[]
        items = []
        for site in sites:
            item = Website()
            item['name'] = site.xpath('a/text()').extract()
            item['url'] = site.xpath('a/@href').extract()
            item['description'] = site.xpath('text()').re('-\s[^\n]*\\r')
            new_url = str(self.start_urls[0]) + str(item['url'][0])
            #print new_url;
            #urls.append(new_url);
            yield Request(new_url,
                          meta={'item': item},
                          callback=self.parse_items)
            #print item['name'] ,item['url'] ,item['description']
            items.append(item)
        self.dic[url] = items
        print("final")
        print(self.dic)
Example #23
    def parse(self, response):
        """
        The lines below are a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html

        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """
        sel = Selector(response)
        sites = sel.xpath('//div[@class="title-and-desc"]')
        items = []

        for site in sites:
            item = Website()
            item['name'] = site.xpath('a/div/text()').extract()
            item['url'] = site.xpath('a/@href').extract()
            item['description'] = site.xpath(
                'div[@class="site-descr "]/text()').extract()
            items.append(item)

        return items
Example #24
    def parse_category(self, response):
        # The main selector we're using to extract data from the page
        main_selector = HtmlXPathSelector(response)

        # The XPath to website links in the directory page
        xpath = '//td[descendant::a[contains(@href, "#pagerank")]]/following-sibling::td/font'

        # Get a list of (sub) selectors to each website node pointed by the XPath
        sub_selectors = main_selector.select(xpath)

        # Iterate over the sub-selectors to extract data for each website
        for selector in sub_selectors:
            item = Website()

            l = XPathItemLoader(item=item, selector=selector)
            l.add_xpath('name', 'a/text()')
            l.add_xpath('url', 'a/@href')
            l.add_xpath('description', 'font[2]/text()')

            # Here we populate the item and yield it
            yield l.load_item()
Example #25
    def parse_dir_contents(self, response):
        """
        The lines below are a spider contract. For more info see:
        http://doc.scrapy.org/en/latest/topics/contracts.html

        @url http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/
        @scrapes name
        """
        sites = response.xpath(
            '//table[@id="ctl00_cphMain_gridResult"]/tr/td[3]')
        # sites = response.css('#site-list-content > div.site-item > div.title-and-desc')
        items = []

        for site in sites:
            item = Website()
            item['name'] = site.xpath('a/text()').extract()
            item['url'] = site.xpath('a/@href').extract()
            item['description'] = site.xpath('a/@target').extract()
            items.append(item)

        return items
Example #26
 def parse1(self, response):
     sel = Selector(response)
     sites = sel.xpath('//ul/li')
     sites1 = sel.xpath('//a/@href').extract()
     print(sites)
     print("ssssssss")
     print(sites1)
     items = []
     urls=[]
     for site in sites:
         item = Website()
         item['name'] = site.xpath('a/text()').extract()
         item['url'] = site.xpath('a/@href').extract()
         item['description'] = site.xpath('text()').re('-\s[^\n]*\\r')
         #yield self.collect_item(item)
         if len(item['url']) != 0:
             if len(str(item['url'][0])) != 1:
                 new_url = str(self.sta[0]) + str(item['url'][0])
                 yield Request(new_url,
                               meta={'item': item, 'url': new_url},
                               callback=self.parse_items)
         items.append(item)
         yield self.collect_item(item)
     self.dic[str(self.start_urls[0])] = items
Example #27
 def parse1(self, response):
     print(response)
     sel = Selector(response)
     texts=sel.xpath("//input[@type='text']")
     print("ttttttt")
     print(texts)
     print("sssssss")
     sites = sel.xpath("//ul/li[@onclick]")
     print(sites)
     items = []
     urls=[]
     for site in sites:
         item = Website()
         item['name'] = site.xpath('a/text()').extract()
         item['url'] = site.xpath('a/@href').extract()
         item['description'] = site.xpath('text()').re('-\s[^\n]*\\r')
         yield self.collect_item(item)
         if len(item['url']) != 0:
             if len(str(item['url'][0])) != 1:
                 new_url = str(self.sta[0]) + str(item['url'][0])
                 print("new_url :" + new_url)
                 yield Request(new_url,
                               meta={'item': item, 'url': new_url},
                               callback=self.parse_items)
         items.append(item)
     self.dic[str(self.start_urls[0])] = items
Example #28
    def parse_page(self, response):
        for ext in self.filetypes:
            if (ext[1:] in response.headers['Content-Type'].upper() or
                ('Content-Disposition' in response.headers
                 and ext in response.headers['Content-Disposition'].upper())):
                print("Detected a downloadable, generated file")
                item = Website()

                item['URL_Datei'] = response.url
                item['Stadt_URL'] = unicode(self.domain, 'utf-8')
                #Not applicable
                item['URL_Text'] = unicode('', 'utf-8')
                if ('Content-Disposition' in response.headers):
                    item['URL_Dateiname'] = unicode(
                        response.headers['Content-Disposition'], 'utf-8')
                else:
                    item['URL_Dateiname'] = unicode(
                        item['URL_Datei']).split('/')[-1]
                item['Format'] = ext[1:]
                #if we just have e.g. "json" and we are dealing with DKAN, then we are probably dealing with an API item description and not a file
                if (item['URL_Dateiname'].upper()
                        == item['Format']) and 'node' in item['URL_Datei']:
                    return []
                if (ext in self.geofiletypes):
                    item['geo'] = 'x'
                else:
                    item['geo'] = u''
                item['URL_PARENT'] = u'Nicht moeglich kann aber nachtraeglich ermittelt werden'
                item['Title_PARENT'] = u'Nicht moeglich kann aber nachtraeglich ermittelt werden'
                self.writerdata.writerow(item)
                #Done
                return []

        if ('Content-Type' in response.headers
                and 'text/html' not in response.headers['Content-Type']):
            print("Not HTML or anything else of interest, giving up")
            print(response.headers)
            return []

        #Otherwise, it's HTML and we process all links on the page
        sel = Selector(response)

        #Title of the page we are on (this will be the 'parent')
        parent_title = sel.xpath('//title/text()').extract()
        if (len(parent_title) > 0): parent_title = parent_title[0]
        #URL of the page we are on (parent)
        parent_url = response.url

        #Get all links
        sites = sel.xpath('//body//a')
        #items = []

        for site in sites:
            item = Website()

            item['URL_Datei'] = unicode('', 'utf-8')
            url_file = site.xpath('@href').extract()
            if (len(url_file) > 0):
                item['URL_Datei'] = url_file[0]

            item['Stadt_URL'] = unicode(self.domain, 'utf-8')

            #Get ALL text of everything inside the link
            #First any sub-elements like <span>
            textbits = site.xpath('child::node()')
            item['URL_Text'] = unicode('', 'utf-8')
            for text in textbits:
                thetext = text.xpath('text()').extract()
                if (len(thetext) > 0): item['URL_Text'] += thetext[0]
            #Then the actual text
            directText = site.xpath('text()').extract()
            #If there's something there and it isn't a repetition, use it
            if (len(directText) > 0) and (directText != thetext):
                item['URL_Text'] += directText[0]
            item['URL_Text'] = item['URL_Text'].replace("\t", " ").replace(
                "\n", "").strip()

            #If that got us nothing, then look at the title and alt elements
            title_text = site.xpath('@title').extract()
            if (len(title_text) > 0) and (item['URL_Text'] == u''):
                item['URL_Datei'] = title_text[0]
            alt_text = site.xpath('@alt').extract()
            if (len(alt_text) > 0) and (item['URL_Text'] == u''):
                item['URL_Datei'] = alt_text[0]

            item['URL_Dateiname'] = unicode(item['URL_Datei']).split('/')[-1]
            item['Format'] = u'Not interesting'
            item['geo'] = u''
            item['URL_PARENT'] = parent_url
            item['Title_PARENT'] = parent_title

            #Is it a file? (i.e. does the filename contain one of the extensions,
            #including the '.'); if so, record the format with the '.' stripped
            for ext in self.filetypes:
                if ext in item['URL_Dateiname'].encode(
                        'ascii', errors='ignore').upper():
                    item['Format'] = ext[1:len(ext)]
                    #And is it one of our special geo filetypes?
                    if ext in self.geofiletypes:
                        item['geo'] = 'x'
                    self.writerdata.writerow(item)

            self.writer.writerow(item)
            #items.append(item)

        return []
Example #29
 def parse_address(self, response):
     webpage = Website()
     webpage['url'] = response.url
     webpage['body'] = response.css(
         "#ctl00_PlaceHolderMain_ctl00_resultsPanel").extract()
     return webpage