def load_podcast_rss(self, response):
    x = XmlXPathSelector(response)
    x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
    x.register_namespace("itunes", "http://www.itunes.com/dtds/podcast-1.0.dtd")
    x.register_namespace("media", "http://search.yahoo.com/mrss/")
    metaData = response.meta['metaData']
    itunesTrackId = metaData['itunesTrackId']
    metaData['rssUrl'] = response.url
    # a limit of 50 episodes has been hard coded here; this should be in settings somewhere
    episodes = x.select('//channel/item[enclosure[contains(@type,"audio") or contains(@type,"video")]][position()<50]')
    items = []
    self.totalPodcastEpisodes = self.totalPodcastEpisodes + len(episodes)
    if len(episodes) == 0:
        self.logProgress('Empty feed', metaData['brandName'][0], '', itunesTrackId,
                         log.WARNING, ('No episodes for %s' % (response.url)))
        metaData['itemtype'] = ['noepisodes']
        item = self.load_item(x.select('//channel'), metaData)
        yield item
    else:
        podcastEpisodeIndex = str(len(items))
        podcastEpisodeCount = str(len(episodes))
        self.logProgress('load_podcast_rss', metaData['brandName'][0], '', itunesTrackId,
                         log.INFO, ('%s/%s' % (podcastEpisodeIndex, podcastEpisodeCount)))
        for episode in episodes:
            metaData['itemtype'] = ['ondemand']
            item = self.load_item(episode, metaData)
            yield item
def parse_page_content(self, response):
    xxs = XmlXPathSelector(response)
    page_text = xxs.select('/api/query/pages/page/revisions/rev/text()').extract()
    if page_text:
        url = xxs.select('/api/query/pages/page/@fullurl').extract()
        if url:
            url = url[0]
        else:
            url = None
        page_text = page_text[0]
        for md_full in RE_INFOBOX_PAINTING.finditer(page_text):
            infobox = md_full.groups()[0]
            md = RE_IB_LOCATION.search(infobox)
            if md:
                location = clean_wiki_string(md.groups()[0])
                artist = ''
                md_artist = RE_IB_ARTIST.search(infobox)
                if md_artist:
                    artist = clean_wiki_string(md_artist.groups()[0])
                name = ''
                md_name = RE_IB_NAME.search(infobox)
                if md_name:
                    name = clean_wiki_string(md_name.groups()[0])
                if location and artist and name:
                    yield ArtInfo(name=name, artist=artist, location=location, url=url)
def parse(self, response):
    xxs = XmlXPathSelector(response)
    routetitle = xxs.select('//predictions/@routeTitle').extract()[0]
    stoptag = xxs.select('//predictions/@stopTag').extract()[0]
    predictions = xxs.select('//prediction')
    items = []
    for prediction in predictions:
        item = EtaScraperItem()
        item['seconds'] = prediction.select('@seconds').extract()[0]
        item['minutes'] = prediction.select('@minutes').extract()[0]
        item['is_departure'] = prediction.select('@isDeparture').extract()[0]
        item['dir_tag'] = prediction.select('@dirTag').extract()[0]
        item['trip_tag'] = prediction.select('@tripTag').extract()[0]
        item['vehicle_id'] = prediction.select('@vehicle').extract()[0]
        abl = prediction.select('@affectedByLayover').extract()
        item['affected_by_layover'] = abl[0] if abl else 'false'
        item['routename'] = routetitle
        item['stoptag'] = stoptag
        item['created'] = time()
        item['thisdate'] = datetime.now().date()
        direction = item['dir_tag']
        if direction.find(DIRECTION_OPTS[0][0]) == -1 and direction.find(DIRECTION_OPTS[1][0]) == -1:
            direc = DIRECTION_OPTS[2][1]
        elif direction.find(DIRECTION_OPTS[0][0]) != -1:
            direc = DIRECTION_OPTS[0][1]
        else:
            direc = DIRECTION_OPTS[1][1]
        item['dir_tag'] = direc
        items.append(item)
    return items
def test_unquote(self):
    xmldoc = '\n'.join((
        '<root>',
        ' lala',
        ' <node>',
        ' blabla&more<!--comment-->a<b>test</b>oh',
        ' <![CDATA[lalalal&ppppp<b>PPPP</b>ppp&la]]>',
        ' </node>',
        ' pff',
        '</root>'))
    xxs = XmlXPathSelector(text=xmldoc)

    self.assertEqual(xxs.extract_unquoted(), u'')

    self.assertEqual(xxs.select('/root').extract_unquoted(), [u''])
    self.assertEqual(xxs.select('/root/text()').extract_unquoted(), [
        u'\n lala\n ',
        u'\n pff\n'])

    self.assertEqual(xxs.select('//*').extract_unquoted(), [u'', u'', u''])
    self.assertEqual(xxs.select('//text()').extract_unquoted(), [
        u'\n lala\n ',
        u'\n blabla&more',
        u'a',
        u'test',
        u'oh\n ',
        u'lalalal&ppppp<b>PPPP</b>ppp&la',
        u'\n ',
        u'\n pff\n'])
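# A minimal sketch of what the test above exercises, assuming the old Scrapy
# XPathSelector API: extract_unquoted() resolves entity references and drops
# CDATA wrappers on text nodes, while element selectors yield empty strings.
# The document below is illustrative, not part of the original test suite.
from scrapy.selector import XmlXPathSelector

xxs = XmlXPathSelector(text='<node>a &amp; b<![CDATA[c <b>d</b> e]]></node>')
for text_node in xxs.select('//node/text()'):
    print text_node.extract_unquoted()  # u'a & b', then u'c <b>d</b> e'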
def parse(self, response):
    xxs = XmlXPathSelector(response)
    xxs.register_namespace('soapenv', 'http://schemas.xmlsoap.org/soap/envelope/')
    xxs.register_namespace('xsd', 'http://www.w3.org/2001/XMLSchema')
    xxs.register_namespace('xsi', 'http://www.w3.org/2001/XMLSchema-instance')
    xxs.register_namespace(
        'CurrentsAndMetadata',
        'http://opendap.co-ops.nos.noaa.gov/axis/webservices/currents/wsdl')
    timelist = xxs.select('//CurrentsAndMetadata:data/CurrentsAndMetadata:item/CurrentsAndMetadata:timeStamp/text()').extract()
    cspdlist = xxs.select('//CurrentsAndMetadata:data/CurrentsAndMetadata:item/CurrentsAndMetadata:CS/text()').extract()
    cdirlist = xxs.select('//CurrentsAndMetadata:data/CurrentsAndMetadata:item/CurrentsAndMetadata:CD/text()').extract()
    print len(timelist)
    for i in range(0, len(cdirlist)):
        sql_str = self.SQL_INSERT_STUB.format(
            self.get_current_station().lower(),
            str(timelist[i])[0:-2],
            str(cspdlist[i]),
            str(cdirlist[i]),
            'datafactory_currentdata')
        #d_time = datetime.datetime(str(timelist[i])[0:-2], pytz.UTC)
        d_time_unware = datetime.datetime.strptime(str(timelist[i])[0:-2], "%Y-%m-%d %H:%M:%S")
        d_time1 = pytz.utc.localize(d_time_unware)
        d_time = d_time1.astimezone(pytz.utc)
        if self.needStore(d_time):
            self.db.query(sql_str)
            self.db.commit()
    if timelist:
        sql_str = "INSERT INTO {0} (sid, stime, etime) VALUES (\"{1}\", \"{2}\", \"{3}\")".format(
            DB_SETTINGS['DATABASE_TIME_TABLE'],
            self.get_current_station(),
            self.startDate.astimezone(pytz.utc).strftime("%Y-%m-%d %H:%M:%S"),
            self.endDate.astimezone(pytz.utc).strftime("%Y-%m-%d %H:%M:%S"))
        self.db.query(sql_str)
        self.db.commit()
    self.station_slot = self.station_slot + 1
    if self.station_slot < len(self.start_urls):
        yield self.start_urls[self.station_slot]
def parse(self, response):
    x = XmlXPathSelector(response)
    x.remove_namespaces()
    x.register_namespace("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
    items = x.select('//record/metadata/RDF')
    jsons = []
    for item in items:
        creator = item.select('MetaResource/creator/Agent/name/text()').extract()
        title = item.select('Resource/title/text()').extract()
        uri = item.select('Resource/screen/Image/@rdf:about').extract()
        tags = item.select('Resource/subject/Description/value/text()').extract()
        thumbnail = item.select('Resource/thumbnail/Image/@rdf:about').extract()
        lat = item.select('Resource/spatial/Description/lat/text()').extract()
        long = item.select('Resource/spatial/Description/long/text()').extract()
        locality = item.select('Resource/spatial/Description/locality/text()').extract()
        tags_string = '"' + '", "'.join(tags) + '"'
        newlat = lat[0] if lat else 'null'
        newlong = long[0] if long else 'null'
        newloc = locality[0] if locality else ''
        json_entry = ('{"title": "' + title[0] + '", "uri": "' + uri[0] +
                      '", "attribution_uri": "' + uri[0] +
                      '", "media_creator_username": "******", "thumbnail_url": "' + thumbnail[0] +
                      '", "media_geo_latitude": ' + newlat +
                      ', "media_geo_longitude": ' + newlong +
                      ', "location": "' + newloc +
                      '", "tags": [' + tags_string +
                      '], "archive":"Yahoo! Japan", "media_type": "Image", "layer_type": "Image", '
                      '"child_items_count":0, "published":1}, ')
        jsons.append(json_entry)
    resumptionToken = x.select('//resumptionToken/text()').extract()
    if not resumptionToken:
        # last page: no token, so write the remaining records and stop
        open('last.txt', 'wb').write(''.join(jsons).encode("UTF-8"))
    else:
        token = resumptionToken[0].encode('ascii')
        nextFileLink = ("http://search.shinrokuden.irides.tohoku.ac.jp/webapi/oaipmh"
                        "?verb=ListRecords&metadataPrefix=sdn&resumptionToken=" + token)
        open(token + '.txt', 'wb').write(''.join(jsons).encode("UTF-8"))
        yield Request(nextFileLink, callback=self.parse)
def parse_xml_document(self, response):
    xxs = XmlXPathSelector(response)
    votes = xxs.select('//meeting/vote')
    items = []
    for vote in votes:
        councilvote = VoteItem()
        votenum = int(vote.select('@number').extract()[0])
        councilvote['number'] = int(votenum)
        councilvote['date'] = vote.select('vote-date/text()').extract()[0]
        councilvote['time'] = vote.select('vote-time/text()').extract()[0]
        councilvote['motion_ch'] = vote.select('motion-ch/text()').extract()[0]
        councilvote['motion_en'] = vote.select('motion-en/text()').extract()[0]
        councilvote['mover_ch'] = vote.select('mover-ch/text()').extract()[0]
        councilvote['mover_en'] = vote.select('mover-en/text()').extract()[0]
        councilvote['mover_type'] = vote.select('mover-type/text()').extract()[0]
        councilvote['separate_mechanism'] = vote.select('vote-separate-mechanism/text()').extract()[0]
        if councilvote['separate_mechanism'] == 'Yes':
            mechanism = ['functional-constituency', 'geographical-constituency']
        else:
            mechanism = ['overall']
        for constituency in mechanism:
            if constituency == 'functional-constituency':
                short = 'fc_'
            elif constituency == 'geographical-constituency':
                short = 'gc_'
            else:
                short = ''
            for count_type in ['present', 'vote', 'yes', 'no', 'abstain']:
                councilvote[short + count_type] = int(vote.select(
                    'vote-summary/' + constituency + '/' + count_type + '-count/text()').extract()[0])
            councilvote[short + 'result'] = vote.select(
                'vote-summary/' + constituency + '/result/text()').extract()[0]
        councilvote['result'] = vote.select('vote-summary/overall/result/text()').extract()[0]
        items.append(councilvote)
        members = xxs.select('//meeting/vote[%s]/individual-votes/member' % votenum)
        for member in members:
            individualvote = IndividualVoteItem()
            individualvote['number'] = councilvote['number']
            individualvote['date'] = councilvote['date']
            individualvote['name_ch'] = member.select('@name-ch').extract()[0]
            individualvote['name_en'] = member.select('@name-en').extract()[0]
            individualvote['constituency'] = member.select('@constituency').extract()[0]
            individualvote['vote'] = member.select('vote/text()').extract()[0]
            items.append(individualvote)
    return items
def parse(self, response):
    xxs = XmlXPathSelector(response)
    eis = xxs.select('/api/query/embeddedin/ei')
    for ei in eis:
        pageid = ei.select('@pageid').extract()
        if pageid:
            yield Request('http://en.wikipedia.org/w/api.php?action=query'
                          '&prop=revisions|info&pageids=%s&rvprop=content'
                          '&inprop=url&format=xml' % pageid[0],
                          callback=self.parse_page_content)
    cont = xxs.select('/api/query-continue/embeddedin/@eicontinue').extract()
    if cont:
        yield Request('http://en.wikipedia.org/w/api.php?action=query&list=embeddedin&'
                      'eititle=Template:Infobox%%20artwork&eilimit=100'
                      '&eifilterredir=nonredirects&format=xml&eicontinue=%s' % cont[0],
                      callback=self.parse)
def parse_rss(self, response):
    item = response.request.meta['item']
    if response.status != 500:
        xxs = XmlXPathSelector(response)
        xxs.remove_namespaces()
        item['date'] = xxs.select('.//channel/date/text()').extract()
        description = xxs.select('.//channel/description/text()').extract()
        if (len(item.get('description', '')) < 10) and description:
            item['description'] = ''.join(description).strip()
    del item['subpage_urls']
    return item
def parse(self, response):
    x = XmlXPathSelector(response)
    x.register_namespace("im", "http://itunes.apple.com/rss")
    x.register_namespace('atom', 'http://www.w3.org/2005/Atom')
    feedCount = str(len(self.start_urls))
    self.i = self.i + 1
    self.log('Reading rss url [%s of %s]' % (self.i, feedCount), level=log.INFO)
    entries = x.select('//atom:entry')
    if entries:
        # an iTunes RSS feed
        for entry in entries:
            id = entry.select('./atom:id/@im:id').extract()
            self.log('Entry %s' % (str(id)), level=log.INFO)
            yield Request('http://itunes.apple.com/lookup?id=' + id[0],
                          callback=self.getItunesTrackJson)
    else:
        # a single feed
        l = XPathItemLoader(PodcastItem(), x)
        l.add_value('id', 'rssdisco_' + response.url)
        l.add_value('audioType', 'disco')
        l.add_value('brandFeed', response.url)
        l.add_xpath('brandName', '//./channel/title/text()')
        self.log('Feed from rss %s' % (response.url), level=log.INFO)
        item = l.load_item()
        yield item
def parse(self, response):
    x = XmlXPathSelector(response)
    #x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
    #programs = x.select('./body/outline[position()=4]/outline[position()<4]')
    programs = x.select('//body/outline/outline')
    podcastCount = str(len(programs))
    i = 0
    for program in programs:
        i = i + 1
        l = XPathItemLoader(PodcastItem(), selector=program)
        l.add_xpath('id', 'concat("dpc_", ./@xmlUrl)')
        l.add_value('audioType', 'disco')
        l.add_xpath('brandId', './@xmlUrl')
        l.add_xpath('brandFeed', './@xmlUrl')
        l.add_xpath('brandName', './@title')
        l.add_xpath('brandDescription', './@description')
        l.add_xpath('brandHomepage', './@htmlUrl')
        self.log('Discovering dpc [%s of %s] feeds' % (i, podcastCount), level=log.INFO)
        item = l.load_item()
        yield item
def parseSubGenre(self, response):
    x = XmlXPathSelector(response)
    x.register_namespace("kb", "http://www.kerbango.com/xml")
    metaData = response.meta['metaData']
    stations = x.select('//kb:results/kb:station_record')  # was limited to fewer than 5 for now!!!
    for station in stations:
        metaData['channelPlaylist'] = [station.select('./kb:station_url_record/kb:url/text()').extract()[0].rstrip('/ \r\n')]
        metaData['channelName'] = station.select('./kb:station/text()').extract()
        metaData['channelDescription'] = station.select('./kb:description/text()').extract()
        metaData['streamId'] = station.select('./kb:esid/text()').extract()
        metaData['streamBandwidth'] = station.select('./kb:station_url_record/kb:bandwidth_kbps/text()').extract()
        metaData['streamData'] = station.select('./kb:station_url_record/kb:status_code/text()').extract()
        metaData['channelGenreIds'] = metaData['genreId']
        metaData['channelGenres'] = metaData['genreName']
        metaData['channelCategory'] = metaData['genreName']
        self.log('parseSubGenre %s %s' % (metaData['genreName'], metaData['channelName']), level=log.INFO)
        channelName = metaData['channelName'][0]
        # cope with BBC names that include the bitrate in the name
        channelName = re.sub(r'Low$|High$', '', channelName).strip()
        tuneInSearchUrl = 'http://tunein.com/search/suggest/?query=' + channelName
        # assume all is well and the supplied url is indeed a playlist!
        request = Request(tuneInSearchUrl,
                          meta={'metaData': copy.deepcopy(metaData)},
                          callback=self.parseTuneInSearch,
                          errback=lambda x: self.parsePlaylist(x, copy.deepcopy(metaData)))
        yield request
def parseFeed(self, response):
    jsonResponse = response.meta['jsonResponse']
    brandStats = jsonResponse['stats']['stats_fields']['episodePublishDate']
    #maxDate = brandStats['max']
    #updateDoc = '<delete><query>brandFeed:"'+brandFeed+'"</query></delete>'
    x = XmlXPathSelector(response)
    x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
    x.register_namespace("itunes", "http://www.itunes.com/dtds/podcast-1.0.dtd")
    x.register_namespace("media", "http://search.yahoo.com/mrss/")
    newEpisodes = x.select('//channel/item[enclosure[contains(@type,"audio") or contains(@type,"video")]]')
    metaData = {}
    metaData['rssUrl'] = response.url
    episodes = []
    # create a single solr update doc that contains all the new episodes and deletes expired ones
    for xmlEpisode in newEpisodes:
        jsonBrand = jsonResponse['grouped']['brandFeed']['groups'][0]['doclist']['docs'][0]
        episode = self.load_item(jsonBrand, xmlEpisode, metaData).__dict__.values()[0]
        episodes.append(episode)
    updatejson = JSONEncoder().encode(episodes)
    yield Request(url=self.solrUpdateUrl,
                  method='POST',
                  body=updatejson,
                  headers={'Content-Type': 'application/json'},
                  callback=self.dummyEnd)
def handle_bug_xml(self, response):
    logging.info("STARTING XML")
    hxs = XmlXPathSelector(response)
    item = hxs.select('//item')
    try:
        parsed = bugimporters.items.ParsedBug({
            'title': item.select('title/text()').extract()[0],
            'description': item.select('description/text()').extract()[0],
            'status': item.select('status/text()').extract()[0],
            'people_involved': 0,  # TODO
            'date_reported': self.format_date(item.select('created/text()').extract()[0]),
            'last_touched': self.format_date(item.select('updated/text()').extract()[0]),
            'submitter_username': item.select('reporter/@username').extract()[0],
            'submitter_realname': item.select('reporter/text()').extract()[0],
            'canonical_bug_link': item.select('link/text()').extract()[0],
            'looks_closed': (item.select('status/text()').extract()[0] == 'Closed'),
            'last_polled': datetime.now(),
            # TODO tracker ids
            #'_project_name': self.tm.tracker_name,
            #'_tracker_name': self.tm.tracker_name,
        })
        yield parsed
    except IndexError as e:
        logging.exception(e)
        logging.debug("AHHHHHHHHHHHHHHHHHHHHHH!!!!!!!!!!!!!: {0}".format(
            item.select('title/text()').extract()[0]))
def parse(self, response):
    if self.value('link_extractor') is not None:
        xxs = XmlXPathSelector(response)
        links = xxs.select(self.value('link_extractor')).extract()
        return [Request(x, callback=self.parse_item) for x in links]
    else:
        return super(CommonSpider, self).parse(response)
def parse(self, response):
    # Create xml selector & get its contents as a string for regex parsing
    xxs = XmlXPathSelector(response)
    data = str(xxs.select('/courseinfo').extract())

    # Create course item
    item = CourseItem()

    # Get course number from url
    number_regex = re.compile('(..-...)')
    number_match = number_regex.search(response.url)
    if number_match is not None:
        item['number'] = number_match.group(1)

    # Construct regular expression for prerequisite decoding
    prereq_regex = re.compile(r'Prerequisite(?:s)?:(.*)\.')
    match = prereq_regex.search(data)
    if match is None:
        print item
        return
    else:
        print match.group(1)
        item['prereqs'] = match.group(1)
    print item
def parse(self, response):
    xxs = XmlXPathSelector(response)
    for product in xxs.select('//product'):
        category = product.select('./Category/text()').extract()
        loader = ProductLoader(item=Product(), selector=product)
        loader.add_xpath('identifier', './product-id/text()')
        loader.add_xpath('sku', './product-id/text()')
        loader.add_xpath('url', './product-url/text()')
        loader.add_xpath('name', './product-name/text()')
        loader.add_xpath('brand', './brand/text()')
        loader.add_value('price', extract_price_eu(' '.join(product.select('./price/text()').extract())))
        if category:
            loader.add_value('category', category[0].split('/')[-1].strip())
        loader.add_xpath('image_url', './image-url/text()')
        loader.add_xpath('stock', './stock/text()')
        if loader.get_output_value('price') > 499:
            loader.add_value('shipping_cost', '0')
        else:
            loader.add_value('shipping_cost', '25')
        yield loader.load_item()
def parse(self, response):
    # inspect_response(response, self)
    # return
    # hxs = HtmlXPathSelector(response)
    # file_path = "d:/work/GoogleFeed.xml"
    # f = open(file_path)
    # xxs = XmlXPathSelector(text=f.read())
    xxs = XmlXPathSelector(response)
    for sel in xxs.select('//channel/item'):
        loader = ProductLoader(item=Product(), response=response)
        # URL
        tmp = sel.select('link/text()').extract()
        if tmp:
            loader.add_value('url', tmp[0])
        # ID
        tmp = sel.select('*[name()="g:id"]/text()').extract()
        if tmp:
            loader.add_value('identifier', tmp[0])
        # SKU
        tmp = sel.select('*[name()="g:id"]/text()').extract()
        if tmp:
            loader.add_value('sku', tmp[0])
        # Name
        tmp = sel.select('title/text()').extract()
        if tmp:
            loader.add_value('name', tmp[0])
        # Price: prefer the sale price, fall back to the regular price
        tmp = sel.select('*[name()="g:sale_price"]/text()').extract()
        if not tmp:
            tmp = sel.select('*[name()="g:price"]/text()').extract()
        if tmp:
            price = round(extract_price(tmp[0]) / Decimal('1.20'), 2)
            loader.add_value('price', price)
        # Image URL
        tmp = sel.select('*[name()="g:image_link"]/text()').extract()
        if tmp:
            loader.add_value('image_url', tmp[0])
        # Brand
        tmp = sel.select('*[name()="g:brand"]/text()').extract()
        if tmp and tmp[0] != 'Alliance':
            loader.add_value('brand', tmp[0])
        # Category
        tmp = sel.select('*[name()="g:product_type"]/text()').extract()
        if tmp:
            try:
                loader.add_value('category', tmp[0].split('>')[1].strip())
            except:
                loader.add_value('category', tmp[0].strip())
        # Shipping cost
        price = loader.load_item()['price']
        if price and price < 50.00:
            loader.add_value('shipping_cost', 5.90)
        # Stock
        tmp = sel.select('*[name()="g:availability"]/text()').extract()
        if tmp and tmp[0] == 'in stock':
            loader.add_value('stock', 1)
        else:
            loader.add_value('stock', 0)
        yield loader.load_item()
def parsePage(self, response):
    x = XmlXPathSelector(response)
    feeds = x.select('//lst[@name="grouped"]/lst[@name="brandFeed"]/arr[@name="groups"]/lst')
    for feed in feeds:
        metaData = {}
        metaData['brandAvgDuration'] = feed.select('./result/doc/str[@name="brandAvgDuration"]/text()').extract()[:1]
        metaData['brandCurrentItem'] = feed.select('./result/doc/str[@name="brandCurrentItem"]/text()').extract()[:1]
        metaData['brandDescription'] = feed.select('./result/doc/str[@name="brandDescription"]/text()').extract()[:1]
        metaData['brandFeed'] = feed.select('./result/doc/str[@name="brandFeed"]/text()').extract()[:1]
        metaData['brandFrequency'] = feed.select('./result/doc/str[@name="brandFrequency"]/text()').extract()[:1]
        metaData['brandHomepage'] = feed.select('./result/doc/str[@name="brandHomepage"]/text()').extract()[:1]
        metaData['brandId'] = feed.select('./result/doc/str[@name="brandId"]/text()').extract()[:1]
        metaData['brandIds'] = feed.select('./result/doc/arr[@name="brandIds"]/text()').extract()
        metaData['brandImage'] = feed.select('./result/doc/str[@name="brandImage"]/text()').extract()[:1]
        metaData['brandName'] = feed.select('./result/doc/str[@name="brandName"]/text()').extract()[:1]
        metaData['brandShortName'] = feed.select('./result/doc/str[@name="brandShortName"]/text()').extract()[:1]
        metaData['brandTimes'] = feed.select('./result/doc/str[@name="brandTimes"]/text()').extract()
        metaData['brandRegions'] = feed.select('./result/doc/arr[@name="brandRegions"]/text()').extract()
        metaData['channelHomepage'] = feed.select('./result/doc/str[@name="channelHomepage"]/text()').extract()[:1]
        metaData['channelId'] = feed.select('./result/doc/str[@name="channelId"]/text()').extract()[:1]
        metaData['channelName'] = feed.select('./result/doc/str[@name="channelName"]/text()').extract()[:1]
        metaData['itunesArtistId'] = feed.select('./result/doc/str[@name="itunesArtistId"]/text()').extract()[:1]
        metaData['itunesPopular'] = feed.select('./result/doc/int[@name="itunesPopular"]/text()').extract()[:1]
        metaData['itunesPopularInGenre'] = feed.select('./result/doc/int[@name="itunesPopularInGenre"]/text()').extract()[:1]
        metaData['itunesSimilar'] = feed.select('./result/doc/str[@name="itunesSimilar"]/text()').extract()[:1]
        metaData['itunesRelated'] = feed.select('./result/doc/str[@name="itunesRelated"]/text()').extract()[:1]
        metaData['itunesTrackId'] = feed.select('./result/doc/str[@name="itunesTrackId"]/text()').extract()[:1]
        metaData['ownerHomepage'] = feed.select('./result/doc/str[@name="ownerHomepage"]/text()').extract()[:1]
        metaData['ownerId'] = feed.select('./result/doc/str[@name="ownerId"]/text()').extract()[:1]
        metaData['ownerImage'] = feed.select('./result/doc/str[@name="ownerImage"]/text()').extract()[:1]
        metaData['ownerKey'] = feed.select('./result/doc/str[@name="ownerKey"]/text()').extract()[:1]
        metaData['ownerName'] = feed.select('./result/doc/str[@name="ownerName"]/text()').extract()[:1]
        if metaData['itunesTrackId']:
            metaData['itunesTrackId'] = metaData['itunesTrackId'][0]
        # itunes podcast html: from an Id if we have one
        if 'itunesTrackId' in metaData and metaData['itunesTrackId']:
            self.logProgress('parsePage from Id', metaData['brandName'][0], '',
                             metaData['itunesTrackId'], log.INFO, str(metaData['itunesTrackId']))
            request = Request('http://itunes.apple.com/lookup?id=' + metaData['itunesTrackId'],
                              meta={'metaData': copy.deepcopy(metaData)},
                              callback=self.getItunesTrackJson)
        else:
            # if not, from the title
            self.logProgress('parsePage from title', metaData['brandName'], '', '---------', log.INFO)
            try:
                ownerName = metaData['ownerName'][0]
            except:
                ownerName = ''
            # &attribute=titleTerm removed whilst using the owner name in the string as well
            request = Request('http://itunes.apple.com/search?term=' + metaData['brandName'][0] + ' ' + ownerName + '&entity=podcast',
                              meta={'metaData': copy.deepcopy(metaData)},
                              callback=self.getItunesTrackJson)
        self.indexedPodcasts.append(1)
        yield request
def parse(self, response):
    if not hasattr(self, 'parse_node'):
        raise NotConfigured('You must define parse_node method in order to scrape this XML feed')
    response = self.adapt_response(response)
    if self.iterator == 'iternodes':
        nodes = xmliter(response, self.itertag)
    elif self.iterator == 'xml':
        selector = XmlXPathSelector(response)
        self._register_namespaces(selector)
        nodes = selector.select('//%s' % self.itertag)
    elif self.iterator == 'html':
        selector = HtmlXPathSelector(response)
        self._register_namespaces(selector)
        nodes = selector.select('//%s' % self.itertag)
    else:
        raise NotSupported('Unsupported node iterator')
    return self.parse_nodes(response, nodes)
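# The method above is the XMLFeedSpider dispatcher; the parse_node it insists
# on is supplied by subclasses. A minimal hedged sketch of such a subclass --
# the spider name, feed URL and XPath below are hypothetical, only the
# iterator/itertag/parse_node contract comes from the code above.
from scrapy.contrib.spiders import XMLFeedSpider

class ExampleFeedSpider(XMLFeedSpider):
    name = 'example-feed'                          # hypothetical
    start_urls = ['http://example.com/feed.xml']   # hypothetical feed
    iterator = 'xml'   # selects the XmlXPathSelector branch above
    itertag = 'item'   # parse_node is called once per <item> node

    def parse_node(self, response, node):
        # node is a selector positioned on a single <item>
        self.log('Title: %s' % node.select('./title/text()').extract())
        return []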
def parseSubGenre(self, response):
    x = XmlXPathSelector(response)
    metaData = response.meta['metaData']
    stations = x.select('//body/outline[@key="stations"]/outline')
    relateds = x.select('//body/outline[@key="related"]/outline')
    for station in stations:
        metaData['channelUrl'] = station.select('@URL').extract()
        metaData['channelName'] = station.select('@text').extract()
        metaData['channelDescription'] = station.select('@subtext').extract()
        metaData['channelGenreId'] = station.select('@genre_id').extract()
        metaData['channelFormats'] = station.select('@formats').extract()
        metaData['channelImage'] = station.select('@image').extract()
        metaData['channelTuneInItem'] = station.select('@item').extract()
        metaData['channelTuneInNowPlayingId'] = station.select('@now_playing_id').extract()
        metaData['channelTuneInPresetId'] = station.select('@preset_id').extract()
        metaData['channelTuneInType'] = station.select('@type').extract()
        metaData['channelTuneInBitrate'] = station.select('@bitrate').extract()
        metaData['channelTuneInReliability'] = station.select('@reliability').extract()
        metaData['channelTuneInGuideId'] = station.select('@guide_id').extract()
        metaData['channelTuneInShowId'] = station.select('@show_id').extract()
        metaData['channelTuneInCurrentTrack'] = station.select('@current_track').extract()
        if metaData['channelTuneInGuideId']:
            print '------------', metaData['channelTuneInGuideId']
            metaData['channelTuneInUrl'] = 'http://tunein.com/tuner/?StationId=' + metaData['channelTuneInGuideId'][0]
            self.indexedPodcasts.append(1)
            self.logProgress('parseSubGenre', metaData['channelUrl'], '', '', level=log.DEBUG)
            url = ('http://opml.radiotime.com/Tune.ashx?formats=aac,mp3,wma,wmpro,wmvoice,mp3raw'
                   '&render=json&id=' + metaData['channelTuneInGuideId'][0])
            request = Request(url, meta={'metaData': copy.deepcopy(metaData)}, callback=self.getStreams)
            yield request
        else:
            self.logProgress('parseSubGenre', metaData['channelName'], 'no guide id', '', level=log.WARNING)
            yield None
    for related in relateds:
        url = related.select('./@URL').extract()[0]
        self.logProgress('parseSubGenre related links', url, '', '', level=log.DEBUG)
        request = Request(url, meta={'metaData': copy.deepcopy(metaData)}, callback=self.parseSubGenre)
        yield request
def parse(self, response):
    xxs = XmlXPathSelector(response)
    stores = xxs.select('//locationinfo')
    items = []
    for store in stores:
        item = TutItem()
        item['address'] = store.select('address')
        item['address2'] = store.select('address2')
        items.append(item)
    return items
def parse(self, response):
    xxs = XmlXPathSelector(response)
    entries = xxs.select('//item')
    for entry in entries:
        item = ZoinkscraperItem()
        item['name'] = entry.select('./title/text()')[0].extract_unquoted()
        item['url'] = entry.select('./link/text()')[0].extract()
        item['date'] = datetime.strptime(entry.select('./pubDate/text()')[0].extract()[:-6],
                                         '%a, %d %b %Y %H:%M:%S')
        yield item
def parse(self, response):
    if not hasattr(self, 'parse_node'):
        raise NotConfigured('You must define parse_node method in order to scrape this XML feed')
    response = self.adapt_response(response)
    if self.iterator == 'iternodes':
        nodes = self._iternodes(response)
    elif self.iterator == 'xml':
        selector = XmlXPathSelector(response)
        self._register_namespaces(selector)
        nodes = selector.select('//%s' % self.itertag)
    elif self.iterator == 'html':
        selector = HtmlXPathSelector(response)
        self._register_namespaces(selector)
        nodes = selector.select('//%s' % self.itertag)
    else:
        raise NotSupported('Unsupported node iterator')
    return self.parse_nodes(response, nodes)
def test_selector_over_text(self):
    hxs = HtmlXPathSelector(text='<root>lala</root>')
    self.assertEqual(hxs.extract(), u'<html><body><root>lala</root></body></html>')

    xxs = XmlXPathSelector(text='<root>lala</root>')
    self.assertEqual(xxs.extract(), u'<root>lala</root>')

    xxs = XmlXPathSelector(text='<root>lala</root>')
    self.assertEqual(xxs.select('.').extract(), [u'<root>lala</root>'])
def parse_travel_asy(self, response):
    xxs = XmlXPathSelector(response)
    xxs.remove_namespaces()
    json_object = json.loads(xxs.select("//string/text()").extract()[0])
    request_list = []
    for product in json_object['product']:
        if product['isYuyue'] == 'True':
            url = 'http://www.zhongmin.cn/Product/ProductDetails.aspx?pid=%s&bid=11' % product['Id']
        else:
            url = 'http://www.zhongmin.cn/Travel/Product/TravelDetailArr%(Id)s-%(age)sd%(day)s.html' % product
        request_list.append(Request(url=url))
    return request_list
def load_rss(self, response):
    x = XmlXPathSelector(response)
    x.register_namespace("xsi", "http://www.w3.org/2001/XMLSchema-instance")
    x.register_namespace("itunes", "http://www.itunes.com/dtds/podcast-1.0.dtd")
    x.register_namespace("media", "http://search.yahoo.com/mrss/")
    title = x.select('//./channel/title/text()').extract()[0]
    parent = response.meta['parent']
    request = Request('http://itunes.apple.com/search?term=' + title + '&entity=podcast&attribute=titleTerm',
                      meta={'parent': parent, 'rss': x, 'rssUrl': response.url},
                      callback=self.get_itunes_info)
    return request
def parseDetails(self, response):
    x = XmlXPathSelector(response)
    metaData = response.meta['metaData']
    related = x.select('//body/outline[@key="genres"]/outline[@type="link"]')
    metaData['channelRelatedGenres'] = related.select('@text').extract()
    metaData['channelRelatedGenreIds'] = related.select('@guide_id').extract()
    recommended = x.select('//body/outline[@key="recommendations"]/outline[@type="audio"]')
    metaData['channelRecommended'] = recommended.select('@text').extract()
    metaData['channelRecommendedDescription'] = recommended.select('@subtext').extract()
    metaData['channelRecommendedId'] = recommended.select('@guide_id').extract()
    metaData['channelRecommendedFormats'] = recommended.select('@formats').extract()
    metaData['channelRecommendedType'] = recommended.select('@item').extract()
    metaData['channelRecommendedImage'] = recommended.select('@image').extract()
    listing = x.select('//body/outline[@key="listing"]/outline[@type="object"]/station')
    metaData['channelCallSign'] = listing.select('call_sign/text()').extract()
    metaData['channelSlogan'] = listing.select('slogan/text()').extract()
    metaData['channelUrl'] = listing.select('url/text()').extract()
    metaData['channelTuneInReportUrl'] = listing.select('report_url/text()').extract()
    metaData['channelTuneInDetailUrl'] = listing.select('detail_url/text()').extract()
    metaData['channelTuneInIsPreset'] = listing.select('is_preset/text()').extract()
    metaData['channelTuneInIsAvailable'] = listing.select('is_available/text()').extract()
    metaData['channelTuneInIsMusic'] = listing.select('is_music/text()').extract()
    metaData['channelTuneInHasSong'] = listing.select('has_song/text()').extract()
    metaData['channelTuneInHasSchedule'] = listing.select('has_schedule/text()').extract()
    metaData['channelTuneInHasTopics'] = listing.select('has_topics/text()').extract()
    metaData['channelTuneInTwitterId'] = listing.select('twitter_id/text()').extract()
    metaData['channelLogo'] = listing.select('logo/text()').extract()
    metaData['channelLocation'] = listing.select('location/text()').extract()
    metaData['channelEmail'] = listing.select('email/text()').extract()
    metaData['channelPhone'] = listing.select('phone/text()').extract()
    metaData['channelAddress'] = listing.select('mailing_address/text()').extract()
    metaData['channelLanguage'] = listing.select('language/text()').extract()
    if metaData['channelTuneInGuideId']:
        url = ('http://opml.radiotime.com/Describe.ashx?c=composite'
               '&detail=options,schedules,listing,affiliates,genres,recommendations'
               '&id=' + metaData['channelTuneInGuideId'][0])
        request = Request(url, meta={'metaData': copy.deepcopy(metaData)}, callback=self.createItem)
        yield request
def parse(self, response):
    x = XmlXPathSelector(response)
    total = int(x.select('//lst[@name="grouped"]/lst[@name="brandFeed"]/int[@name="ngroups"]/text()').extract()[0])
    pageSize = 100
    urlBase = response.url
    start = 0
    # try letting scrapy handle it all
    for i in range(start, total, pageSize):
        url = urlBase + '&start=' + str(i)
        self.log('Requesting next %d page %d of %d %s' % (pageSize, i, total, url), log.DEBUG)
        # dont_filter=True stops Scrapy's dupe filter from dropping the repeated solr requests
        request = Request(url, callback=self.parsePage, dont_filter=True)
        yield request
def parse(self, response):
    hxs = XmlXPathSelector(response)
    name = hxs.select('//name').extract()
    if self.task_id is not None:
        self.log('Processing item %s' % self.task_id, log.INFO)
        self.alert_context = 'task_id=%s' % self.task_id
        for item in self.process_item(self.bot_task_params(self.task_id)):
            yield item
    else:
        for item in self.process_items():
            yield item
def xmliter_lxml(obj, nodename, namespace=None):
    from lxml import etree
    reader = _StreamReader(obj)
    tag = '{%s}%s' % (namespace, nodename) if namespace else nodename
    iterable = etree.iterparse(reader, tag=tag, encoding=reader.encoding)
    selxpath = '//' + ('x:%s' % nodename if namespace else nodename)
    for _, node in iterable:
        nodetext = etree.tostring(node)
        node.clear()
        xs = XmlXPathSelector(text=nodetext)
        if namespace:
            xs.register_namespace('x', namespace)
        yield xs.select(selxpath)[0]
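# A short usage sketch for xmliter_lxml above, assuming a namespaced Atom-style
# feed; the namespace URI, element names and helper name are illustrative. Each
# yielded value is an XmlXPathSelector positioned on one node, so the whole
# document never has to be held in memory.
ATOM_NS = 'http://www.w3.org/2005/Atom'

def iter_feed_titles(response):  # hypothetical helper
    for entry in xmliter_lxml(response, 'entry', namespace=ATOM_NS):
        # namespaced children can be matched without registering a prefix
        print entry.select('./*[local-name()="title"]/text()').extract()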
def test_selector_namespaces_simple(self):
    body = """
    <test xmlns:somens="http://scrapy.org">
       <somens:a id="foo"/>
       <a id="bar">found</a>
    </test>
    """
    response = XmlResponse(url="http://example.com", body=body)
    x = XmlXPathSelector(response)
    x.register_namespace("somens", "http://scrapy.org")
    self.assertEqual(x.select("//somens:a").extract(), ['<somens:a id="foo"/>'])
def parse(self, response):
    """
    We define a custom parser here because we need to get the link from
    the feed item and then follow it to get the recipe data.

    Getting the data from <content:encoded> seems overly complex, as we
    would have to decode all the encoded characters and then build a DOM
    from that.
    """
    xxs = XmlXPathSelector(response)
    links = xxs.select("//item/*[local-name()='origLink']/text()").extract()
    # self.parse_item comes from OnehundredonecookbooksMixin
    return [Request(x, callback=self.parse_item) for x in links]
def parse(self, response):
    xxs = XmlXPathSelector(response)
    hxs = HtmlXPathSelector(response)
    links = xxs.select('//link/text()').extract()
    log.msg('Link length: %s' % len(links), level=log.ERROR)
    if len(links) <= 0:
        log.msg('no links found, using regular parser', level=log.ERROR)
        links = hxs.select('//a/@href').extract()
    msg = 'Links: %s' % links
    log.msg(msg, level=log.ERROR)
    return [Request(x, callback=self.parse_item) for x in links]
def detect_feed(self, response):
    """Just detects the feed in the links and returns an Item.

    Need to tweak the feedparser lib to reuse the headers/body from this
    response instead of downloading the feed page a second time.
    """
    xxs = XmlXPathSelector(response)
    if any(xxs.select("/%s" % feed_type) for feed_type in ['rss', 'feed', 'xml', 'rdf']):
        try:
            rssFeed = feedparser.parse(response.url)
            return self.extract_feed(rssFeed)
        except:
            raise Exception('Exception while parsing/extracting the feed')
    return None
def parsePart(self, response):
    item = response.meta['item']
    xxs = XmlXPathSelector(response)
    if len(xxs.select("//ERRORSEGMENT")) == 0:
        part_num = response.meta['part_num']
        end_range = response.meta['end_range']
        part_prefix = response.meta['part_prefix']
        item['parts'].append(self.part_format % (part_prefix, part_num))
        if part_num < end_range:
            yield self.makePartRequest(part_prefix, part_num + 1, item, end_range)
        else:
            yield item
    else:
        yield item
def parse(self, response):
    base_url = get_base_url(response)
    xxs = XmlXPathSelector(response)
    xxs.register_namespace("g", "http://base.google.com/ns/1.0")
    products = xxs.select('//channel/item')
    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        loader.add_xpath('url', 'link/text()')
        loader.add_xpath('name', 'title/text()')
        loader.add_xpath('image_url', 'g:image_link/text()')
        loader.add_xpath('price', 'g:price/text()')
        loader.add_xpath('brand', 'g:brand/text()')
        loader.add_xpath('category', 'g:brand/text()')
        loader.add_xpath('sku', 'g:id/text()')
        loader.add_xpath('identifier', 'g:id/text()')
        yield loader.load_item()
def parse(self, response):
    xxs = XmlXPathSelector(response)
    xxs.remove_namespaces()
    products = xxs.select('//item')
    for product in products:
        mpn = product.xpath('mpn/text()')
        if mpn:
            mpn = mpn[0].extract().upper().strip()
        else:
            mpn = None
        row = self.monitored_products.get(mpn) if mpn else None
        if row is None or (row and row['Discontinued'].lower().strip() == 'yes'):
            continue
        loader = ProductLoader(selector=product, item=Product())
        loader.add_xpath('identifier', 'id/text()')
        loader.add_xpath('sku', 'mpn/text()')
        loader.add_xpath('brand', 'brand/text()')
        loader.add_xpath('image_url', 'image_link/text()')
        loader.add_xpath('url', 'link/text()')
        loader.add_xpath('name', 'title/text()')
        price = product.select('sale_price/text()').extract()
        if not price:
            price = product.select('price/text()').extract()
        loader.add_value('price', extract_price(price[0]))
        categories = product.select('product_type/text()').extract()[-1].split('>')
        categories = map(lambda x: x.strip(), categories)
        loader.add_value('category', categories)
        shipping_cost = product.select('shipping/price/text()').extract()
        shipping_cost = extract_price(shipping_cost[0]) if shipping_cost else ''
        loader.add_value('shipping_cost', shipping_cost)
        in_stock = product.select('availability[contains(text(), "in stock")]').extract()
        if not in_stock:
            loader.add_value('price', 0)
        item = loader.load_item()
        item['metadata'] = RHSMeta()
        item['metadata']['cost_price'] = row['Cost Price']
        yield item
def get_products(self, meta, response, colors, colors_ids):
    hxs = XmlXPathSelector(response)
    names, ids = self.get_names(meta['base_name'], meta['product_id'],
                                meta['current_data'], colors, colors_ids)
    for i, name in enumerate(names):
        p = ProductLoader(item=Product(), response=response)
        p.add_value('identifier', ids[i])
        p.add_value('name', name)
        p.add_value('brand', meta['brand'])
        p.add_value('url', meta['url'])
        p.add_value('image_url', meta['image_url'])
        price = hxs.select('//cmd[@t="discounted_price"]/text()').extract()
        if price:
            price = price[0].replace('.', '').replace(',', '.')
            price = extract_price(price)
        if not price or price == Decimal(1):
            if not price:
                self.log('Price not found %s' % meta['url'])
            else:
                self.log('Price is one %s' % meta['url'])
            if not self.retries.get(meta['url']) or self.retries.get(meta['url']) < 3:
                self.log('Retrying %s' % meta['url'])
                self.retries[meta['url']] = self.retries.get(meta['url'], 0) + 1
                p = meta['url']
                yield Request(p,
                              meta={'category': response.meta.get('category', ''),
                                    'cookiejar': p + str(self.retries.get(meta['url']))},
                              callback=self.parse_product,
                              dont_filter=True)
            else:
                self.log('Max retries reached %s' % meta['url'])
            return
        p.add_value('price', price)
        p.add_value('shipping_cost', '0')
        p.add_value('category', response.meta.get('category'))
        yield p.load_item()
def parse(self, response):
    xxs = XmlXPathSelector(response)
    base_url = get_base_url(response)
    xxs.register_namespace("f", "http://www.w3.org/2005/Atom")
    products = xxs.select('//f:entry')
    for product in products:
        product.register_namespace("g", "http://base.google.com/ns/1.0")
        product.register_namespace("p", "http://www.w3.org/2005/Atom")
        product_loader = ProductLoader(item=Product(), selector=product)
        name = product.select('./p:title/text()').extract()[0]
        if 'B-STOCK' in name.upper():
            continue
        product_loader.add_value('name', name)
        url = product.select('./p:link/@href').extract()[0]
        product_loader.add_value('url', urljoin_rfc(base_url, url))
        image_url = product.select('./g:image_link/text()').extract()
        if image_url:
            product_loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
        category = product.select('./g:product_type/text()').extract()
        if category:
            product_loader.add_value('category', category[0])
        brand = product.select('./g:brand/text()').extract()
        if brand:
            product_loader.add_value('brand', brand[0])
        price = product.select('./g:sale_price/text()').extract()
        if price:
            product_loader.add_value('price', extract_price(price[0]))
        else:
            price = product.select('./g:price/text()').extract()
            product_loader.add_value('price', extract_price(price[0]))
        # sku = product.select('./g:gtin/text()').extract()
        # if sku:
        #     product_loader.add_value('sku', sku[0])
        identifier = product.select('./g:id/text()').extract()[0]
        product_loader.add_value('identifier', identifier)
        product_loader.add_value('sku', identifier)
        shipping_cost = product.select('./g:shipping/g:price/text()').extract()
        if shipping_cost:
            product_loader.add_value('shipping_cost', extract_price(shipping_cost[0]))
        product = product_loader.load_item()
        yield product
def parse(self, response):
    xxs = XmlXPathSelector(response)
    for productxs in xxs.select('//product[attribute_set/text()!="spares-accessories"]'):
        loader = ProductLoader(item=Product(), selector=productxs)
        loader.add_xpath('sku', './product_id/text()')
        loader.add_xpath('identifier', './product_id/text()')
        loader.add_xpath('price', './product_price/text()')
        loader.add_xpath('name', './product_name/text()')
        loader.add_xpath('url', './product_url/text()')
        loader.add_xpath('category', './attribute_set/text()')
        loader.add_xpath('brand', './manufacturer/text()')
        brand = loader.get_output_value('brand').strip().upper()
        if brand in self.ignore_brands:
            log.msg('Ignoring product %s because of brand %s'
                    % (loader.get_output_value('identifier'), brand))
            continue
        loader.add_value('stock', '1')
        item = loader.load_item()
        item['identifier'] = item['identifier'].upper()
        cost_price = productxs.select('./cost/text()').extract()
        metadata = CSCateringMeta()
        cost_price = cost_price[0].strip() if cost_price else '0.00'
        metadata['cost_price'] = cost_price
        item['metadata'] = metadata
        category = loader.get_output_value('category').strip().lower()
        if category in ignore_categories and not self.has_sku(item.get('sku', '')):
            log.msg('Ignoring product %s because of category %s'
                    % (loader.get_output_value('identifier'), category))
            continue
        yield Request(item['url'], callback=self.parse_img, meta={'item': item})
def parse(self, response):
    xxs = XmlXPathSelector(response)
    xxs.register_namespace("g", "http://base.google.com/ns/1.0")
    products = xxs.select('//channel/item')
    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        loader.add_xpath('url', 'link/text()')
        loader.add_xpath('name', 'title/text()')
        loader.add_xpath('image_url', 'g:image_link/text()')
        loader.add_xpath('price', 'g:price/text()')
        loader.add_xpath('brand', 'g:brand/text()')
        categories = product.select('g:product_type/text()').extract()[0].split(' > ')
        loader.add_value('category', categories)
        loader.add_xpath('sku', 'g:id/text()')
        loader.add_xpath('identifier', 'g:id/text()')
        stock = product.select('g:availability/text()').extract()[0].lower()
        if stock != 'in stock':
            loader.add_value('stock', 0)
        yield loader.load_item()
def scrape_rss(response):
    log.msg("inside scrape rss")
    xxs = XmlXPathSelector(response)
    items = []
    for item_tag in xxs.select('//item'):
        items.append(ArticleItem())
        if len(item_tag.select("title")) > 0:
            items[-1]["title"] = item_tag.select("title/text()")[0].extract()
        if len(item_tag.select("pubDate")) > 0:
            items[-1]["time_published"] = [item_tag.select("pubDate/text()")[0].extract()]
        if len(item_tag.select("link")) > 0:
            items[-1]["url"] = item_tag.select("link/text()")[0].extract()
        if len(item_tag.select("description")) > 0:
            items[-1]["summary"] = item_tag.select("description/text()")[0].extract()
        request = Request(items[-1]["url"], callback=extract_author_from_link)
        request.meta["item"] = items[-1]
        yield request
def parse(self, response):
    hxs = XmlXPathSelector(response)
    shows = hxs.select('//show')
    date_from = datetime.now()
    date_to = date_from + timedelta(days=7 * 6)
    for show in shows:
        name = show.select('./name/text()').extract()[0]
        url = show.select('./@href').extract()[0]
        show_id = url.split('/')[-1]
        show_data = SHOWS_DATA % (show_id,
                                  date_from.strftime('%Y-%m-%d'),
                                  date_to.strftime('%Y-%m-%d'))
        r = Request('https://api.entstix.com/api/v1/xlive/booking/book/availability/show',
                    method='POST',
                    body=show_data,
                    callback=self.parse_products,
                    meta={'name': name, 'id': show_id})
        yield r
def parse(self, response):
    xxs = XmlXPathSelector(response)
    xxs.remove_namespaces()
    urls = xxs.select('//loc/text()').extract()
    for url in urls:
        if 'brands-sitemap.xml' in url:
            continue
        if 'productbrand' in url:
            prod_id = re.findall('productbrand_(\d+).html', url)
            prod_id = prod_id[0] if prod_id else ''
            if prod_id:
                if prod_id in self.product_ids:
                    continue
                else:
                    self.product_ids.append(prod_id)
            yield Request(url, callback=self.parse_product, meta={"dont_merge_cookies": True})
        else:
            yield Request(url, meta={"dont_merge_cookies": True})
def parse_products(self, response):
    hxs = XmlXPathSelector(response)
    show_id = response.meta['id']
    name = response.meta['name']
    if not hxs.select('/availability/moreResults/text()'):
        self.log('No results for %s, %s' % (show_id, name))
        return
    if hxs.select('/availability/moreResults/text()')[0].extract() != 'false':
        self.log('There are more results!')
        date_from = datetime.now()
        date_to = date_from + timedelta(days=7 * 6)
        show_data = SHOWS_DATA_NEXT % (show_id,
                                       date_from.strftime('%Y-%m-%d'),
                                       date_to.strftime('%Y-%m-%d'),
                                       hxs.select('/availability/navigate/@key')[0].extract())
        r = Request('https://api.entstix.com/api/v1/xlive/booking/book/availability/show',
                    method='POST',
                    body=show_data,
                    callback=self.parse_products,
                    meta={'name': name, 'id': show_id})
        yield r
    products = hxs.select('.//performances/performance')
    weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    ids_seen = defaultdict(list)
    for product in products:
        loader = ProductLoader(item=Product(), selector=product)
        face_value = product.select('.//faceValue/text()')[0].extract()
        price = product.select('.//saleprice/text()')[0].extract()
        date_ = product.select('.//date/text()')[0].extract()[4:]
        date_ = datetime.strptime(date_, '%d-%b-%Y %H:%M')
        type_ = product.select('.//type/text()')[0].extract()
        identifier = ':'.join([show_id, date_.strftime('%Y-%m-%d'), type_, face_value])
        if identifier in ids_seen and price not in ids_seen[identifier]:
            ids_seen[identifier].append(price)
            identifier += '-' + product.select('.//block/@id')[0].extract()
        else:
            ids_seen[identifier].append(price)
        loader.add_value('identifier', identifier)
        loader.add_value('brand', face_value)
        loader.add_value('price', price)
        loader.add_value('name', name)
        loader.add_value('category', weekdays[date_.weekday()])
        p = loader.load_item()
        p['sku'] = date_.strftime('%d-%m-%y') + ' ' + type_.upper()
        yield p
def parse_vote(self, response):
    if not hasattr(response, 'body_as_unicode'):
        self.log('Cannot parse: {u}'.format(u=response.url), level=log.INFO)
        return
    x = XmlXPathSelector(response)
    info = x.select('//Resultado/Informacion')
    session_id = info.select('//Sesion/text()').extract()
    if not session_id:
        # can't identify session, so we skip this file
        self.log('Missing session ID: {u}'.format(u=response.url), level=log.INFO)
        return

    # general session info
    session_id = session_id[0]
    session_date = date_parser.parse(info.select('//Fecha/text()').extract()[0], dayfirst=True)
    session_instance, session_created = Session.objects.get_or_create(
        session=session_id, defaults={'date': session_date})
    if not session_created:
        session_instance.date = session_date
        session_instance.save()

    # specific voting session info
    voting_number = info.select('//NumeroVotacion/text()').extract()
    if not voting_number:
        self.log('Missing voting number: {u}'.format(u=response.url), level=log.INFO)
        return
    voting_number = voting_number[0]
    voting_title = info.select('//Titulo/text()').extract()[0]
    voting_text = info.select('//TextoExpediente/text()').extract()[0]
    voting_title_sub = info.select('//TituloSubGrupo/text()').extract()
    voting_title_sub = voting_title_sub[0] if voting_title_sub else ''
    voting_text_sub = info.select('//TextoSubGrupo/text()').extract()
    voting_text_sub = voting_text_sub[0] if voting_text_sub else ''
    voting_instance, voting_created = Voting.objects.get_or_create(
        session=session_instance, number=voting_number)
    voting_instance.title = voting_title
    voting_instance.record_text = voting_text
    voting_instance.subgroup_title = voting_title_sub
    voting_instance.subgroup_text = voting_text_sub

    # voting session counters
    counts = x.select('//Resultado/Totales')
    counts_assent = counts.select('//Asentimiento/text()').extract()[0]
    if counts_assent.lower() == 'no':
        counts_assent = False
    else:
        counts_assent = True
    if counts_assent is False:
        counts_presents = counts.select('//Presentes/text()').extract()[0]
        counts_for = counts.select('//AFavor/text()').extract()[0]
        counts_against = counts.select('//EnContra/text()').extract()[0]
        counts_abstentions = counts.select('//Abstenciones/text()').extract()[0]
        counts_dont = counts.select('//NoVotan/text()').extract()[0]
        voting_instance.attendee = counts_presents
        voting_instance.for_votes = counts_for
        voting_instance.against_votes = counts_against
        voting_instance.abstains = counts_abstentions
        voting_instance.no_votes = counts_dont
    voting_instance.assent = counts_assent
    record = response.meta['record']
    initiatives = Initiative.objects.filter(record__exact=record)
    if initiatives:
        voting_instance.initiative_set.add(initiatives.latest('id'))
    voting_instance.save()

    if counts_assent is False:
        # time to parse votes!
        votes = x.select('//Resultado/Votaciones/Votacion')
        Vote.objects.filter(voting=voting_instance).delete()
        votes_list = []
        for v in votes:
            member_seat = v.select('Asiento/text()').extract()[0]
            # @jneight: I don't like search members by name, seats better?
            full_name = v.select('Diputado/text()').extract()[0]
            second_name, first_name = full_name.split(',')
            vote_type = v.select('Voto/text()').extract()[0]
            member_pk = Member.objects.filter(
                name__iexact=first_name.strip(),
                second_name__iexact=second_name.strip()).values_list('pk', flat=True)
            if member_pk:
                votes_list.append(Vote(voting=voting_instance,
                                       member_id=member_pk[0],
                                       vote=vote_type))
        Vote.objects.bulk_create(votes_list)
    return voting_instance
from scrapy.selector import XmlXPathSelector

xml = """
<root>
    <foos>
        <foo>the quick <bar>brown </bar>fox</foo>
    </foos>
</root>
"""

xxs = XmlXPathSelector(text=xml)
foos = xxs.select('//foos')
for one in foos:
    text = one.select('./foo//text()').extract()
    text = ''.join(text)
    print(text)  # prints "the quick brown fox"

xml = """
<content type="text/xml">
    <s:dict>
        <s:key name="group_id">MAC</s:key>
        <s:key name="label">NOT FOR RESALE</s:key>
        <s:key name="max_violations">5</s:key>
        <s:key name="quota">1000000000</s:key>
        <s:key name="relative_expiration_interval">0</s:key>
        <s:key name="relative_expiration_start">0</s:key>
def parse_google_geocode(self, response):
    self.log('Parsing response from google geocoder\n%s' % (response.body), log.DEBUG)
    xxs = XmlXPathSelector(response=response)
    reportnum = response.request.meta['reportnum']
    source = response.request.meta['source']
    state = response.request.meta['state']
    geocode_cache_key = response.request.meta.get('cache_key', None)
    status = xxs.select('//status/text()').extract()
    if status:
        status = status[0]
        if status == u'OK':
            result_type = xxs.select('//result/type[1]/text()').extract()
            if result_type:
                result_type = result_type[0]
            location = xxs.select('//geometry/location')
            lat = location.select('lat/text()').extract()[0]
            lng = location.select('lng/text()').extract()[0]
            geocode_state = xxs.select('//address_component[type="administrative_area_level_1"]/short_name/text()')
            if geocode_state:
                geocode_state = geocode_state.extract()[0]
            else:
                geocode_state = xxs.select('//address_component[type="country"]/short_name/text()')
                if geocode_state:
                    geocode_state = geocode_state.extract()[0]
                if not geocode_state in self.us_territories:
                    geocode_state = None
            if source == 'ADDRESS':
                if result_type:
                    source = result_type
                else:
                    source = 'IGNORE'
            if source == 'ZIP' and result_type != 'postal_code':
                self.log('Bad zip code %s' % (geocode_cache_key), log.WARNING)
                source = 'IGNORE'
            if geocode_state:
                if geocode_state.lower() != state.lower():
                    self.log('Geocode state mismatch: expected %s, actual %s'
                             % (state, geocode_state), log.WARNING)
                    source = 'IGNORE'
            else:
                self.log('Geocode returned with no state code', log.WARNING)
                source = 'IGNORE'
            try:
                item = self.createGeocode(reportnum, source, lat, lng)
            except Exception as e:
                self.log('GeocodeError:%s\n\torig source %s, source %s, loc %s, %s'
                         % (e, response.request.meta['source'], source, lat, lng), log.ERROR)
                raise
            if item:
                if geocode_cache_key:
                    self.db.putGeocodeCache(geocode_cache_key, lat, lng)
                yield item
                self.item_completed(reportnum)
            else:
                self.log('Dropping geocoder response with result type: %s' % (result_type), log.INFO)
        elif status == 'OVER_QUERY_LIMIT':
            self.log('Geocode failed for task id %s \n%s\n%s'
                     % (reportnum, response.request, response.body), log.WARNING)
            # Do not mark the task as done, we will pick it up again on the next run
            self.item_processing(reportnum)
        else:
            msg = 'Google Geocode operation failed for task id %s : %s \n%s' % (
                reportnum, response.request, response.body)
            try:
                self.send_alert(msg, reportnum)
            except Exception:
                self.log(msg, log.ERROR)
                raise
def parse(self, response):
    xxs = XmlXPathSelector(response)
    links = xxs.select("//link/text()").extract()
    return [Request(x, callback=self.parse_item) for x in links]
def parse(self, response):
    xxs = XmlXPathSelector(response)
    for title in xxs.select("//item/title/text()").extract():
        log.msg(title)
def _extract_links(self, response):
    xxs = XmlXPathSelector(response)
    for url in xxs.select(self.xpath).extract():
        yield Link(url.encode(response.encoding))
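# For context, a hedged sketch of the class this method might live in; only
# the xpath attribute and the _extract_links body come from the snippet above,
# the class name, constructor and public extract_links wrapper are assumptions.
from scrapy.link import Link
from scrapy.selector import XmlXPathSelector

class XPathLinkExtractor(object):  # hypothetical name
    def __init__(self, xpath='//link/text()'):
        self.xpath = xpath  # any XPath that yields URL text nodes

    def _extract_links(self, response):
        xxs = XmlXPathSelector(response)
        for url in xxs.select(self.xpath).extract():
            yield Link(url.encode(response.encoding))

    def extract_links(self, response):
        return list(self._extract_links(response))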