예제 #1
0
   def parse(self, response):
       """Build up to ten news items from the un.org listing in ``response``.

       Titles, thumbnail URLs, taglines and link paths are pulled with four
       separate XPath queries under ``//div[@id='text']/div`` and paired up
       by position; an entry is only emitted when all four values exist at
       that position.  Once the entries have been processed, the number of
       values returned by each query is printed.

       Args:
           response: Object exposing ``xpath(query).extract()`` that returns
               a list of text values (presumably a Scrapy response --
               confirm against the enclosing spider).

       Yields:
           NewsItem: One populated item per entry that could be built.
       """
       # Function-scope import: the file's import block is outside this view.
       import logging

       title = response.xpath("//div[@id='text']/div/a/text()").extract()
       thumbnail = response.xpath("//div[@id='text']/div/img/@src").extract()
       tagline = response.xpath("//div[@id='text']/div/p/text()").extract()
       link = response.xpath("//div[@id='text']/div/a/@href").extract()

       # Only the first ten positions are considered, as before.  zip() stops
       # at the shortest list, so a position lacking any of the four values is
       # dropped up front instead of being discovered through an IndexError,
       # possibly after uploadImg() had already been called for it.
       limit = 10
       entries = zip(title[:limit], thumbnail[:limit],
                     tagline[:limit], link[:limit])

       for entry_title, entry_thumb, entry_tagline, entry_link in entries:
           try:
               item = NewsItem()
               # encode(..., 'ignore') drops every non-ASCII character.
               item['title'] = entry_title.encode('ascii', 'ignore')
               item['articletype'] = "News"
               item['tags'] = "news"
               item['thumbnail'] = uploadImg(entry_thumb)
               item['tagline'] = entry_tagline.encode('ascii', 'ignore')
               item['website'] = "un.org"
               item['link'] = "http://www.un.org" + str(entry_link.encode('ascii', 'ignore'))
               item['created'] = datetime.date.today()
               item['hits'] = 0
               item['user_id'] = 1
           except Exception:
               # Best effort: one bad entry must not abort the remaining ones,
               # but it is recorded rather than dropped without a trace.
               logging.getLogger(__name__).exception(
                   "Skipping news entry %r", entry_link)
               continue

           # Yield outside the try block.  With the yield inside a bare
           # ``except``, GeneratorExit (raised here when the consumer closes
           # the generator) and exceptions thrown in by the consumer were
           # swallowed and the loop carried on.
           yield item

       # A single parenthesised argument prints identically on Python 2 and 3.
       print("Titles: " + str(len(title)))
       print("Thumbs: " + str(len(thumbnail)))
       print("Taglines: " + str(len(tagline)))
       print("Links: " + str(len(link)))
    def parse(self, response):
        """Yield a news item for every list entry that could be fully parsed.

        Each ``//ul[not(@id)]/li`` element contributes a title, a link, the
        link's host name (without a leading ``www.``) and a tagline cut to
        320 characters.  Thumbnails come from a separate
        ``//ul[not(@id)]/img`` query and are matched to entries by position.
        After the items have been produced, the size of every collected list
        is printed.

        NOTE(review): an entry that fails to parse gets "#" placeholders, but
        a missing image gets none, so the positional pairing of thumbnails
        and entries can drift -- confirm against the page markup.

        Args:
            response: Object exposing ``xpath(query)`` whose results support
                ``xpath(...)`` and ``extract()`` (presumably a Scrapy
                response -- confirm against the enclosing spider).

        Yields:
            NewsItem: One populated item per parsed entry that has a
            thumbnail at the same position.
        """
        # Function-scope import: the file's import block is outside this view.
        import logging

        titles = []
        taglines = []
        websites = []
        links = []

        for sel in response.xpath('//ul[not(@id)]/li'):
            try:
                title_item = str(sel.xpath("a/strong/text()").extract()[0].encode('ascii', 'ignore'))
                # The href is queried once and reused for link and host.
                link_item = str(sel.xpath("a/@href").extract()[0].encode('ascii', 'ignore'))
                # "scheme://host/path".split("/") -> [scheme:, "", host, ...]
                host = link_item.split("/")[2]
                # Strip "www." only when it is really there; slicing four
                # characters unconditionally mangled hosts without it.
                website_item = host[4:] if host.startswith("www.") else host
                tagline_item = str(sel.xpath("p/text()").extract()[0].encode('ascii', 'ignore'))[:320]
            except IndexError:
                # A sub-element is missing or the href has too few "/" parts:
                # mark the whole entry with placeholders so that it is
                # filtered out below while the list positions stay intact.
                title_item = "#"
                link_item = "#"
                website_item = "#"
                tagline_item = "#"

            titles.append(title_item)
            websites.append(website_item)
            links.append(link_item)
            taglines.append(tagline_item)

        thumbs = [
            str(sel.xpath("@src").extract()[0].encode('ascii', 'ignore'))
            for sel in response.xpath('//ul[not(@id)]/img')
        ]

        if len(thumbs) < len(titles):
            # Indexing thumbs[x] used to raise IndexError part-way through
            # the generator in this situation; entries without a thumbnail
            # are now skipped and the shortfall is reported instead.
            logging.getLogger(__name__).warning(
                "Found %d thumbnails for %d entries; entries without a "
                "thumbnail are skipped", len(thumbs), len(titles))

        # zip() stops at the shortest list, i.e. at the last entry that still
        # has a thumbnail at its position.
        rows = zip(titles, thumbs, taglines, websites, links)
        for title_item, thumb_item, tagline_item, website_item, link_item in rows:
            if link_item == "#":
                # Placeholder entry: extraction failed above.
                continue

            item = NewsItem()
            item['title'] = title_item
            item['articletype'] = "News"
            item['tags'] = "news"
            item['thumbnail'] = uploadImg(thumb_item)
            item['tagline'] = tagline_item
            item['website'] = website_item
            item['link'] = link_item
            item['created'] = datetime.date.today()
            item['hits'] = 0
            item['user_id'] = 1
            yield item

        # A single parenthesised argument prints identically on Python 2 and 3.
        print("Titles: " + str(len(titles)))
        print("Thumbs: " + str(len(thumbs)))
        print("Taglines: " + str(len(taglines)))
        print("Websites: " + str(len(websites)))
        print("Links: " + str(len(links)))