def parse_searchrequest(self, response):
    """
    Parse the initial response of the ChemSpider Search API.

    Requires a valid token to function.
    :param response: the Response object to be parsed
    :return: a Request for the information page and a Request for the
        extendedinfo API call
    """
    sel = Selector(response)
    log.msg('chemspider parse_searchrequest', level=log.DEBUG)
    sel.register_namespace('cs', 'http://www.chemspider.com/')
    csids = sel.xpath('.//cs:int/text()').extract()
    if len(csids) == 0:
        log.msg('ChemSpider found nothing', level=log.ERROR)
        return
    elif len(csids) > 1:
        log.msg('ChemSpider found multiple substances, taking first '
                'element', level=log.DEBUG)
    csid = csids[0]
    structure_url = self.website[:-2].replace("\\", "") + self.structure % csid
    extendedinfo_url = self.website[:-2].replace("\\", "") + self.extendedinfo % csid
    log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
    return [Request(url=structure_url, callback=self.parse),
            Request(url=extendedinfo_url, callback=self.parse_extendedinfo)]
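# parse_searchrequest() above depends on three spider attributes: `website`,
# an escaped URL pattern whose trailing ".*" is dropped by the [:-2] slice
# (and whose backslashes are then stripped), plus `structure` and
# `extendedinfo`, path templates that take the CSID. A sketch of plausible
# values; the exact endpoint strings are assumptions, not confirmed:
import scrapy

class ChemSpider(scrapy.Spider):
    name = 'ChemSpider'
    website = 'http://www\\.chemspider\\.com/.*'     # [:-2] -> base URL pattern
    structure = 'Chemical-Structure.%s.html'         # %s -> csid
    extendedinfo = 'MassSpecAPI.asmx/GetExtendedCompoundInfo?csid=%s&token='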
def parse(self, response):
    sel = Selector(response)
    sel.register_namespace("redrover", "https://www.redroverapp.com/redrover")
    for event in sel.xpath("//redrover:event"):
        loader = NewVictoryLoader(item=NewVictoryItem(), selector=event)
        loader.selector.register_namespace("redrover",
                                           "https://www.redroverapp.com/redrover")
        # u"image",
        for fld in [u"name", u"description", u"start_date", u"start_time",
                    u"end_time", u"phone", u"link"]:
            fld_xpath = "redrover:%s/text()" % fld
            loader.add_xpath(fld, fld_xpath)
        for fld in [u"street_addr", u"state", u"zip", u"city", u"lat",
                    u"long", u"name"]:
            fld_xpath = "redrover:place/redrover:%s/text()" % fld
            loader.add_xpath(u"place_" + fld, fld_xpath)
        start_date = "".join(loader.get_collected_values("start_date"))
        loader.replace_value(u"end_freq", start_date)
        # get_collected_values() returns a list, so join it before comparing
        # (the original compared the list itself against a string, which is
        # always False)
        place_name = u"".join(loader.get_collected_values("place_name"))
        if place_name == u"The New Victory Theater":
            loader.replace_value(u"foursquare_id", u"4afaede8f964a520a01922e3")
        loader.add_value(u"rr_identifier", u"RFNN")
        loader.add_value(u"rr_publisher_market", u"NYC")
        item = loader.load_item()
        yield item
def parse(self, response):
    sel = Selector(response)
    sel.register_namespace("redrover", "https://www.redroverapp.com/redrover")
    # import ipdb; ipdb.set_trace()
    for event in sel.xpath(u"//item/event"):
        loader = EventLoader(item=EventItem(), selector=event)
        loader.selector.register_namespace(u"redrover",
                                           u"https://www.redroverapp.com/redrover")
        for fld in [u"name", u"description", u"start_date", u"start_time",
                    u"end_time", u"phone", u"link"]:
            fld_xpath = u"%s/text()" % fld
            loader.add_xpath(fld, fld_xpath)
        for fld in [u"name", u"street_addr", u"state", u"zip", u"city",
                    u"lat", u"long"]:
            fld_xpath = u"place/%s/text()" % fld
            loader.add_xpath(u"place_" + fld, fld_xpath)
        start_date = loader.get_collected_values(u"start_date")
        loader.replace_value(u"end_freq", start_date)
        loader.add_value(u"rr_identifier", response.meta[u"pub_code"])
        loader.add_value(u"rr_publisher_market", u"NYC")
        # TODO: add ages and categories
        yield loader.load_item()
def put(self, request, *args, **kwargs):
    archive_url = ('http://export.arxiv.org/api/query?id_list='
                   + request.data['old_paper_id'])
    print(archive_url)
    # libreq is assumed to be `import urllib.request as libreq`
    with libreq.urlopen(archive_url) as url:
        r = url.read()
    # decode the raw bytes: Selector(text=...) expects a str
    content_selector = Selector(text=r.decode('utf-8'))
    content_selector.register_namespace('arxiv', 'http://arxiv.org/schemas/atom')
    content_selector.register_namespace('xmlns', 'http://www.w3.org/2005/Atom')
    # with namespaces removed, the plain tag names below match directly
    content_selector.remove_namespaces()
    for line in content_selector.xpath('//feed/entry'):
        item = TestItem()
        item['id'] = line.xpath('id/text()').extract()
        item['title'] = line.xpath('title/text()').extract()
        item['links'] = line.xpath('link/@href').extract()
        item['authors'] = line.xpath('author/name/text()').extract()
        item['comments'] = line.xpath('comment/text()').extract()
        item['primary_category'] = line.xpath('primary_category/@term').extract()
        item['categories'] = line.xpath('category/@term').extract()
        item['summary'] = line.xpath('summary/text()').extract()
        self.saveItem2Db(item)
    # assumes the single-id query returned at least one entry; otherwise
    # `item` is unbound at this point
    return Response(item, status=status.HTTP_201_CREATED)
def xmliter_lxml(obj, nodename, namespace=None, prefix='x'):
    from lxml import etree
    reader = _StreamReader(obj)
    tag = '{%s}%s' % (namespace, nodename) if namespace else nodename
    iterable = etree.iterparse(reader, tag=tag, encoding=reader.encoding)
    selxpath = '//' + ('%s:%s' % (prefix, nodename) if namespace else nodename)
    for _, node in iterable:
        nodetext = etree.tostring(node, encoding='unicode')
        node.clear()
        xs = Selector(text=nodetext, type='xml')
        if namespace:
            xs.register_namespace(prefix, namespace)
        yield xs.xpath(selxpath)[0]
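# A minimal usage sketch for xmliter_lxml (the same helper ships in
# scrapy.utils.iterators): stream <product> nodes out of a namespaced feed
# without parsing the whole document at once. The URL and feed layout are
# made up for illustration.
from scrapy.http import XmlResponse

body = b"""<?xml version="1.0"?>
<catalog xmlns="http://example.com/ns">
  <product><name>foo</name></product>
  <product><name>bar</name></product>
</catalog>"""
response = XmlResponse(url='http://example.com/feed.xml', body=body)
for product in xmliter_lxml(response, 'product',
                            namespace='http://example.com/ns', prefix='p'):
    print(product.xpath('p:name/text()').get())  # foo, then bar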
def parseSigner(self, response):
    """Parse data for a signer (first name, last name, etc.)."""
    xxs = Selector(response)
    xxs.register_namespace("xmlns", "http://diavgeia.gov.gr/schema/v2")
    signers = xxs.xpath("//xmlns:signer")
    s = Signer()
    for signer in signers.xpath("*"):
        name = signer.xpath("name(.)").extract()[0]
        if len(signer.xpath("./text()")) != 0:
            value = signer.xpath("./text()").extract()[0]
            s[name] = value
    yield s
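# parseSigner() assigns arbitrary XML child-element names as item keys, so
# Signer must declare matching fields up front (scrapy.Item raises KeyError
# for undeclared keys). A plausible sketch; the field list is an assumption
# about what a Diavgeia v2 <signer> element contains, not taken from the
# source:
import scrapy

class Signer(scrapy.Item):
    uid = scrapy.Field()
    firstName = scrapy.Field()
    lastName = scrapy.Field()
    organizationId = scrapy.Field()
    active = scrapy.Field()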
def parse(self, response):
    # response.text gives a decoded str, which Selector(text=...) expects
    content_selector = Selector(text=response.text)
    # note: remove_namespaces() below strips all namespaces, so these two
    # register_namespace() calls end up having no effect on the XPaths
    content_selector.register_namespace('arxiv', 'http://arxiv.org/schemas/atom')
    content_selector.register_namespace('xmlns', 'http://www.w3.org/2005/Atom')
    content_selector.remove_namespaces()
    for line in content_selector.xpath('//feed/entry'):
        item = TestItem()
        item['id'] = line.xpath('id/text()').extract()
        item['title'] = line.xpath('title/text()').extract()
        item['links'] = line.xpath('link/@href').extract()
        item['authors'] = line.xpath('author/name/text()').extract()
        item['comments'] = line.xpath('comment/text()').extract()
        item['primary_category'] = line.xpath('primary_category/@term').extract()
        item['categories'] = line.xpath('category/@term').extract()
        item['summary'] = line.xpath('summary/text()').extract()
        yield item
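# Both arXiv parsers above populate a TestItem with the same eight keys,
# so the item class presumably looks like this minimal sketch (the class
# name comes from the code; only the field declarations are inferred):
import scrapy

class TestItem(scrapy.Item):
    id = scrapy.Field()
    title = scrapy.Field()
    links = scrapy.Field()
    authors = scrapy.Field()
    comments = scrapy.Field()
    primary_category = scrapy.Field()
    categories = scrapy.Field()
    summary = scrapy.Field()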
def parse(self, response):
    xxs = Selector(response)
    xxs.register_namespace("xmlns", "http://diavgeia.gov.gr/schema/v2")

    # Parse decisions
    decisions = xxs.xpath("//xmlns:decisions/xmlns:decision")

    # Debug-print the query field of the XML
    query = xxs.xpath("//xmlns:info/xmlns:query/text()").extract()[0]
    self.logger.debug("Query: %s" % query)

    for decision in decisions:
        d = DiavgeiaItem()
        for element in decision.xpath("*"):
            name = element.xpath("name(.)").extract()[0]
            if len(element.xpath("*")) != 0:
                # Handle elements with children here
                d[name] = []
                for child in element.xpath("*"):
                    # TODO: Add support for extraValues field
                    ch = {}
                    childname = child.xpath("name(.)").extract()[0]
                    if len(child.xpath("./text()")) != 0:
                        chvalue = child.xpath("./text()").extract()[0]
                        ch[childname] = chvalue
                        # Get information about signers, units etc.
                        if childname == "signerId":
                            yield Request(self.base_url + "signers/%s" % chvalue,
                                          self.parseSigner)
                    d[name].append(ch)
            if len(element.xpath("./text()")) != 0:
                value = element.xpath("./text()").extract()[0]
                d[name] = value
        yield d

    # Get next-page info
    total = int(xxs.xpath("//xmlns:info/xmlns:total/text()").extract()[0])
    page = int(xxs.xpath("//xmlns:info/xmlns:page/text()").extract()[0])
    size = int(xxs.xpath("//xmlns:info/xmlns:size/text()").extract()[0])
    if page * size <= total:
        yield Request(self.url % (page + 1), self.parse)
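# parse() above relies on two spider attributes: `base_url`, an API root
# that "signers/<id>" appends to, and `url`, a search template with a page
# placeholder for `self.url % (page + 1)`. A hypothetical sketch; both
# values are assumptions, not taken from the source:
import scrapy

class DiavgeiaSpider(scrapy.Spider):
    name = 'diavgeia'
    base_url = 'https://diavgeia.gov.gr/api/'        # assumed API root
    url = base_url + 'search?q=example&page=%d'      # assumed page template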
def do_parse(self, chart, response):
    selector = Selector(response=response)
    selector.register_namespace('itms', 'http://phobos.apple.com/rss/1.0/modules/itms/')
    selector.register_namespace('ns', 'http://www.w3.org/2005/Atom')
    selector.register_namespace('im', 'http://itunes.apple.com/rss')

    itms = selector.xpath("//item")
    im = selector.xpath("/ns:feed/ns:entry")

    # initialize so `ns` is never unbound when neither format matches
    ns = None
    if itms:
        ns = 'itms'
    if im:
        ns = 'im'

    items = itms or im
    item_type = self.do_get_type(chart)
    for rank, item in enumerate(items):
        entry = TomahawkItemLoader(selector=item)
        if ns == 'im':  # `is` on string literals is unreliable; compare with ==
            # use relative paths: the original absolute '//im:name/text()'
            # would match against the whole feed, not this entry
            entry.add_xpath(item_type, './/im:name/text()')
            entry.add_xpath('artist', './/im:artist/text()')
        else:
            entry.add_xpath('album', './/itms:album/text()')
            entry.add_xpath('artist', './/itms:artist/text()')
        entry.add_value('rank', rank)
        chart.add_value("list", entry.load_item())
    return self.do_process_item(chart)
def get_selector(cls, response):
    sel = Selector(response)
    for name, uri in cls.xml_namespaces:
        sel.register_namespace(name, uri)
    return sel
def get_selector_from_text(cls, text):
    text = u'<html><body>%s</body></html>' % text
    sel = Selector(text=text)
    for name, uri in cls.xml_namespaces:
        sel.register_namespace(name, uri)
    return sel
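# get_selector()/get_selector_from_text() read `cls.xml_namespaces`, so they
# presumably sit as classmethods on a spider or mixin that declares the
# (prefix, uri) pairs. A usage sketch under that assumption; the spider name
# and namespace list are illustrative, not from the source:
import scrapy

class FeedSpider(scrapy.Spider):  # assumes the two helpers above are attached
    name = 'feed'
    xml_namespaces = [
        ('atom', 'http://www.w3.org/2005/Atom'),
        ('media', 'http://search.yahoo.com/mrss/'),
    ]

    def parse(self, response):
        sel = self.get_selector(response)
        for entry in sel.xpath('//atom:entry'):
            yield {'title': entry.xpath('atom:title/text()').get()}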
def parse(self, response):
    sel = Selector(response)
    sel.register_namespace("redrover", "https://www.redroverapp.com/redrover")
    for event in sel.xpath("//redrover:event")[int(self.item_from):int(self.item_to)]:
        loader = MassAudubonLoader(item=EventItem(), selector=event)
        loader.selector.register_namespace("redrover",
                                           "https://www.redroverapp.com/redrover")
        # loop-invariant; could be hoisted out of the loop
        event_images = {
            'Boston Nature Center': ['BostonNatureCenter[1].jpg', ],
            'Broadmoor Wildlife Sanctuary': ['Brodmoor[1].jpg', ],
            'Drumlin Farm Wildlife Sanctuary': ['DrumlinFarm.jpg', ],
            'Joppa Flats Education Center': ['JoppaFlats[1].jpg', ],
            'Moose Hill Wildlife Sanctuary': ['MooseHill[1].jpg', ],
            'North River Wildlife Sanctuary': ['North River.jpg', ],
            'Pleasant Valley Wildlife Sanctuary': ['PleasantValley.jpg', ],
            'Stony Brook Wildlife Sanctuary': ['Stony Brook.jpg', ],
            'Wellfleet Bay Wildlife Sanctuary': ['WellfleetBay.jpg', ],
        }
        for fld in [u"name", u"description", u"start_date", u"start_time",
                    u"end_time", u"phone", ]:
            fld_xpath = "redrover:%s/text()" % fld
            loader.add_xpath(fld, fld_xpath)
        for fld in [u"name", u"street_addr", u"state", u"zip", u"city",
                    u"lat", u"long"]:
            fld_xpath = "redrover:place/redrover:%s/text()" % fld
            loader.add_xpath(u"place_" + fld, fld_xpath)
        place_name = "".join(loader.get_collected_values(u"place_name"))
        if u"Drumlin Farm Wildlife Sanctuary".lower() in place_name.lower():
            loader.replace_value(u"place_name", u"Drumlin Farm Wildlife Sanctuary")
            loader.replace_value(u"place_street_addr", u"208 South Great Road")
            loader.replace_value(u"place_state", u"MA")
            loader.replace_value(u"place_city", u"Lincoln")
            loader.replace_value(u"place_zip", u"")
            loader.replace_value(u"place_lat", u"42.40980100000001")
            loader.replace_value(u"place_long", u"-71.331795")
        start_date = loader.get_collected_values("start_date")
        loader.replace_value(u"end_freq", start_date)
        place_name = "".join(loader.get_collected_values(u"place_name"))
        if place_name in event_images:
            image_name = event_images.get(place_name)[0]
        else:
            # dict.keys() is not indexable in Python 3; give choice() a list
            place_name = random.choice(list(event_images))
            image_name = event_images.get(place_name)[0]
        loader.replace_value(u"image_name", image_name)
        loader.replace_value(
            u"image_urls",
            [u"%s/mass_audubon/%s" % (settings.EVENT_IMAGES_URL, image_name, )]
        )
        loader.add_value(u"ages", u"Kids")
        loader.add_value(u"rr_identifier", u"MAS")
        loader.add_value(u"rr_publisher_market", u"BOS")
        loader.add_xpath(u"link", u"redrover:link/text()")
        event_url = "".join(loader.get_collected_values(u"link"))
        if event_url:
            event_id = event_url.split("=")[-1]
            if event_id:
                yield FormRequest(
                    url="http://www.massaudubon.org/MAOPRAPI/api",
                    formdata={
                        'action': 'GetEventDetail',
                        'data[eventId]': event_id,
                    },
                    callback=self.parse_event_page,
                    meta={"loader": loader},
                )