def parse(self, response):
    """Yield up to ten ``NewsItem`` objects built from the ``#text`` block.

    Titles, thumbnail URLs, taglines and link paths are extracted as four
    parallel lists and combined by position.  Only the first ten positions
    are considered, and a position is used only when all four lists have an
    entry for it.  After the last item, the length of each extracted list is
    printed.

    Args:
        response: Page response exposing ``xpath(...).extract()``
            (presumably a Scrapy response -- confirm against the spider).

    Yields:
        NewsItem: One item per complete row.  A row whose construction
        raises (for example inside ``uploadImg``) is logged and skipped so
        the remaining rows are still produced.
    """
    # Local import: the file's top-level import block is not visible here.
    import logging

    title = response.xpath("//div[@id='text']/div/a/text()").extract()
    thumbnail = response.xpath("//div[@id='text']/div/img/@src").extract()
    tagline = response.xpath("//div[@id='text']/div/p/text()").extract()
    link = response.xpath("//div[@id='text']/div/a/@href").extract()

    # zip() stops at the shortest list, so an incomplete row is never
    # started -- in particular no image is uploaded for a row that would
    # be dropped later because its tagline or link is missing.
    rows = zip(title[:10], thumbnail[:10], tagline[:10], link[:10])

    for headline, image_url, summary, href in rows:
        try:
            item = NewsItem()
            item['title'] = headline.encode('ascii', 'ignore')
            item['articletype'] = "News"
            item['tags'] = "news"
            item['thumbnail'] = uploadImg(image_url)
            item['tagline'] = summary.encode('ascii', 'ignore')
            item['website'] = "un.org"
            item['link'] = "http://www.un.org" + href.encode('ascii', 'ignore')
            item['created'] = datetime.date.today()
            item['hits'] = 0
            item['user_id'] = 1
        except Exception:
            # Best effort per row, but leave a traceback instead of
            # discarding the failure silently.
            logging.getLogger(__name__).exception(
                "Skipping un.org entry %r", href)
            continue
        # The yield sits outside the try block so that GeneratorExit
        # (raised at the yield when the consumer closes the generator)
        # is never intercepted.
        yield item

    # Single-argument parenthesised print produces the same output when
    # parsed as a Python 2 print statement.
    print("Titles: " + str(len(title)))
    print("Thumbs: " + str(len(thumbnail)))
    print("Taglines: " + str(len(tagline)))
    print("Links: " + str(len(link)))
def parse(self, response):
    """Yield a ``NewsItem`` for every usable ``<li>`` in the id-less ``<ul>`` lists.

    For each ``ul[not(@id)]/li`` node the title (``a/strong``), link
    (``a/@href``), host name derived from the link, and tagline (``p``,
    truncated to 320 characters) are collected.  If any of them is missing,
    all four are replaced by the placeholder ``"#"`` and the entry is skipped
    when items are built.  Thumbnails come from ``ul[not(@id)]/img/@src`` and
    are paired with the list entries by position.  After the last item, the
    length of each collected list is printed.

    Args:
        response: Page response exposing ``xpath(...)``
            (presumably a Scrapy response -- confirm against the spider).

    Yields:
        NewsItem: One item per entry that has a real link and a thumbnail
        at the same position.
    """
    thumbs = []
    titles = []
    taglines = []
    websites = []
    links = []

    for sel in response.xpath('//ul[not(@id)]/li'):
        try:
            title_item = sel.xpath("a/strong/text()").extract()[0].encode('ascii', 'ignore')
            link_item = sel.xpath("a/@href").extract()[0].encode('ascii', 'ignore')
            # Third "/"-separated piece of "scheme://host/..." is the host.
            website_item = link_item.split("/")[2]
            # Drop a leading "www." only when it is really there, so hosts
            # without that prefix are not truncated.
            if website_item.startswith("www."):
                website_item = website_item[4:]
            tagline_item = sel.xpath("p/text()").extract()[0].encode('ascii', 'ignore')[:320]
        except IndexError:
            # A missing node or a link without a host part: mark the whole
            # entry as unusable but keep its position in the lists.
            title_item = website_item = link_item = tagline_item = "#"
        titles.append(title_item)
        websites.append(website_item)
        links.append(link_item)
        taglines.append(tagline_item)

    for sel in response.xpath('//ul[not(@id)]/img'):
        thumbs.append(sel.xpath("@src").extract()[0].encode('ascii', 'ignore'))

    # zip() stops at the shortest list, so a page with fewer images than
    # <li> entries yields what it can instead of failing with IndexError
    # part-way through; the counts printed below expose any mismatch.
    entries = zip(titles, websites, links, taglines, thumbs)
    for title_item, website_item, link_item, tagline_item, thumb_item in entries:
        if link_item == "#":
            continue
        item = NewsItem()
        item['title'] = title_item
        item['articletype'] = "News"
        item['tags'] = "news"
        item['thumbnail'] = uploadImg(thumb_item)
        item['tagline'] = tagline_item
        item['website'] = website_item
        item['link'] = link_item
        item['created'] = datetime.date.today()
        item['hits'] = 0
        item['user_id'] = 1
        yield item

    # Single-argument parenthesised print produces the same output when
    # parsed as a Python 2 print statement.
    print("Titles: " + str(len(titles)))
    print("Thumbs: " + str(len(thumbs)))
    print("Taglines: " + str(len(taglines)))
    print("Websites: " + str(len(websites)))
    print("Links: " + str(len(links)))