Example #1
 def parse_searchrequest(self, response):
     """
     This function parses the initial response of the ChemSpider Search API
     Requires a valid token to function.
     :param response: the Response object to be parsed
     :return: A Request for the information page and a Request for the 
     extendedinfo API call
     """
     sel = Selector(response)
     log.msg('chemspider parse_searchrequest', level=log.DEBUG)
     sel.register_namespace('cs', 'http://www.chemspider.com/')
     csids = sel.xpath('.//cs:int/text()').extract()
     if len(csids) == 0:
         log.msg('ChemSpider found nothing', level=log.ERROR)
         return
     elif len(csids) > 1:
         log.msg('ChemSpider found multiple substances, taking first '
                 'element', level=log.DEBUG)
     csid = csids[0]
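     # self.website presumably stores an escaped URL pattern such as
     # r'http://www\.chemspider\.com/.*'; slicing off the trailing '.*' and
     # dropping the backslashes recovers the plain base URL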
     structure_url = self.website[:-2].replace("\\", "") + self.structure % csid
     extendedinfo_url = self.website[:-2].replace("\\", "") + self.extendedinfo % csid
     log.msg('chemspider URL: %s' % structure_url, level=log.DEBUG)
     return [Request(url=structure_url,
                     callback=self.parse),
             Request(url=extendedinfo_url,
                     callback=self.parse_extendedinfo)]
Example #2
    def parse(self, response):
        sel = Selector(response)
        sel.register_namespace("redrover", "https://www.redroverapp.com/redrover")

        for event in sel.xpath("//redrover:event"):
            loader = NewVictoryLoader(item=NewVictoryItem(), selector=event)
            loader.selector.register_namespace("redrover", "https://www.redroverapp.com/redrover")
            #  u"image",
            for fld in [u"name", u"description", u"start_date", u"start_time", u"end_time", u"phone", u"link"]:
                fld_xpath = "redrover:%s/text()" % fld
                loader.add_xpath(fld, fld_xpath)

            for fld in [u"street_addr", u"state", u"zip", u"city", u"lat", u"long", u"name"]:
                fld_xpath = "redrover:place/redrover:%s/text()" % fld
                loader.add_xpath(u"place_"+fld, fld_xpath)

            start_date = "".join(loader.get_collected_values("start_date"))
            loader.replace_value(u"end_freq", start_date)

            place_name = loader.get_collected_values("place_name")
            if place_name and place_name[0] == u"The New Victory Theater":
                loader.replace_value(u"foursquare_id", u"4afaede8f964a520a01922e3")

            loader.add_value(u"rr_identifier", u"RFNN")
            loader.add_value(u"rr_publisher_market", u"NYC")

            item = loader.load_item()

            yield item
Example #3
File: nypl.py Project: ayat-ra/scraping
    def parse(self, response):
        sel = Selector(response)
        sel.register_namespace("redrover", "https://www.redroverapp.com/redrover")

        for event in sel.xpath(u"//item/event"):
            loader = EventLoader(item=EventItem(), selector=event)
            loader.selector.register_namespace(u"redrover", u"https://www.redroverapp.com/redrover")

            for fld in [u"name", u"description", u"start_date", u"start_time", u"end_time", u"phone", u"link"]:
                fld_xpath = u"%s/text()" % fld
                loader.add_xpath(fld, fld_xpath)

            for fld in [u"name", u"street_addr", u"state", u"zip", u"city", u"lat", u"long"]:
                fld_xpath = u"place/%s/text()" % fld
                loader.add_xpath(u"place_"+fld, fld_xpath)

            start_date = loader.get_collected_values(u"start_date")
            loader.replace_value(u"end_freq", start_date)

            loader.add_value(u"rr_identifier", response.meta[u"pub_code"])
            loader.add_value(u"rr_publisher_market", u"NYC")

            #TODO add ages and categories

            yield loader.load_item()
Example #4
    def put(self, request, *args, **kwargs):
        archive_url = 'http://export.arxiv.org/api/query?id_list=' + \
            request.data['old_paper_id']
        print(archive_url)
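        # libreq is presumably urllib.request, imported at module level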
        with libreq.urlopen(archive_url) as url:
            r = url.read().decode('utf-8')  # the Atom response arrives as UTF-8 bytes
            content_selector = Selector(text=r)
            content_selector.register_namespace(
                'arxiv', 'http://arxiv.org/schemas/atom')
            content_selector.register_namespace(
                'xmlns', 'http://www.w3.org/2005/Atom')
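            # remove_namespaces() strips every namespace from the document,
            # so the register_namespace() calls above go unused and the
            # XPath queries below can use bare tag names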
            content_selector.remove_namespaces()

            for line in content_selector.xpath('//feed/entry'):
                item = TestItem()
                item['id'] = line.xpath('id/text()').extract()
                item['title'] = line.xpath('title/text()').extract()
                item['links'] = line.xpath('link/@href').extract()
                item['authors'] = line.xpath('author/name/text()').extract()
                item['comments'] = line.xpath('comment/text()').extract()
                item['primary_category'] = line.xpath(
                    'primary_category/@term').extract()
                item['categories'] = line.xpath('category/@term').extract()
                item['summary'] = line.xpath('summary/text()').extract()
                self.saveItem2Db(item)

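        # NOTE: only the last entry parsed above is returned, and this line
        # raises a NameError if the feed contained no entries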
        return Response(item, status=status.HTTP_201_CREATED)
Example #5
File: iterators.py Project: Digenis/scrapy
def xmliter_lxml(obj, nodename, namespace=None, prefix='x'):
    from lxml import etree
    reader = _StreamReader(obj)
    tag = '{%s}%s' % (namespace, nodename) if namespace else nodename
    iterable = etree.iterparse(reader, tag=tag, encoding=reader.encoding)
    selxpath = '//' + ('%s:%s' % (prefix, nodename) if namespace else nodename)
    for _, node in iterable:
        nodetext = etree.tostring(node, encoding='unicode')
        node.clear()
        xs = Selector(text=nodetext, type='xml')
        if namespace:
            xs.register_namespace(prefix, namespace)
        yield xs.xpath(selxpath)[0]
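
A minimal usage sketch for the iterator above; the feed URL, the sample document, and the "product" node name are illustrative. Each yielded value is a Selector positioned on one matching node, with the namespace bound to the x prefix:

from scrapy.http import XmlResponse

SAMPLE_FEED = b"""<?xml version="1.0"?>
<catalog xmlns="http://example.com/ns">
    <product><name>widget</name></product>
    <product><name>gadget</name></product>
</catalog>"""

response = XmlResponse(url="http://example.com/feed.xml", body=SAMPLE_FEED)
for product in xmliter_lxml(response, "product",
                            namespace="http://example.com/ns"):
    # re-bind the prefix defensively before querying child nodes
    product.register_namespace("x", "http://example.com/ns")
    print(product.xpath("x:name/text()").extract())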
Example #6
    def parseSigner(self, response):
        """ Parse data for signer (first name, last name etc) """
        xxs = Selector(response)
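        # XPath 1.0 cannot address a default namespace without a prefix, so
        # the feed's default namespace is bound to an ordinary prefix here
        # (the name "xmlns" is just a prefix label)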
        xxs.register_namespace("xmlns", "http://diavgeia.gov.gr/schema/v2")

        signers = xxs.xpath("//xmlns:signer")
        s = Signer()
        for signer in signers.xpath("*"):
            name = signer.xpath("name(.)").extract()[0]
            if signer.xpath("./text()"):
                value = signer.xpath("./text()").extract()[0]
                s[name] = value
        yield s
Example #7
    def parse(self, response):
        content_selector = Selector(text=response.text)
        content_selector.register_namespace('arxiv', 'http://arxiv.org/schemas/atom')
        content_selector.register_namespace('xmlns', 'http://www.w3.org/2005/Atom')
        content_selector.remove_namespaces()

        for line in content_selector.xpath('//feed/entry'):
            item = TestItem()
            item['id'] = line.xpath('id/text()').extract()
            item['title'] = line.xpath('title/text()').extract()
            item['links'] = line.xpath('link/@href').extract()
            item['authors'] = line.xpath('author/name/text()').extract()
            item['comments'] = line.xpath('comment/text()').extract()
            item['primary_category'] = line.xpath('primary_category/@term').extract()
            item['categories'] = line.xpath('category/@term').extract()
            item['summary'] = line.xpath('summary/text()').extract()
            yield item
Example #8
    def parse(self, response):
        xxs = Selector(response)
        xxs.register_namespace("xmlns", "http://diavgeia.gov.gr/schema/v2")

        # Parse Decision
        decisions = xxs.xpath("//xmlns:decisions/xmlns:decision")

        # Print debug the query field of the XML
        query = xxs.xpath("//xmlns:info/xmlns:query/text()").extract()[0]
        self.logger.debug("Query: %s" % query)
        for decision in decisions:
            d = DiavgeiaItem()
            for element in decision.xpath("*"):
                name = element.xpath("name(.)").extract()[0]
                if element.xpath("*"):
                    # Handle elements with children here
                    d[name] = []
                    for child in element.xpath("*"):
                        # TODO: Add support for extraValues field
                        ch = {}
                        childname = child.xpath("name(.)").extract()[0]
                        if child.xpath("./text()"):
                            chvalue = child.xpath("./text()").extract()[0]
                            ch[childname] = chvalue
                            # Get information about signers, units etc.
                            if childname == "signerId":
                                yield Request(
                                    self.base_url + "signers/%s" % chvalue,
                                    self.parseSigner)
                        d[name].append(ch)
                elif element.xpath("./text()"):
                    # leaf element: store its text directly
                    value = element.xpath("./text()").extract()[0]
                    d[name] = value
            yield d

        # Get next page info
        total = int(xxs.xpath("//xmlns:info/xmlns:total/text()").extract()[0])
        page = int(xxs.xpath("//xmlns:info/xmlns:page/text()").extract()[0])
        size = int(xxs.xpath("//xmlns:info/xmlns:size/text()").extract()[0])
        if page * size <= total:
            yield Request(self.url % (page + 1), self.parse)
Example #9
    def do_parse(self, chart, response):
        selector = Selector(response=response)
        selector.register_namespace('itms', 'http://phobos.apple.com/rss/1.0/modules/itms/')
        selector.register_namespace('ns', 'http://www.w3.org/2005/Atom')
        selector.register_namespace('im', 'http://itunes.apple.com/rss')

        itms = selector.xpath("//item")
        im = selector.xpath("/ns:feed/ns:entry")
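        # bare <item> nodes indicate an RSS feed (itms:* children); ns:entry
        # nodes indicate an Atom feed (im:* children). Whichever query matched
        # determines the prefix used when loading fields below.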

        if itms: ns = 'itms'
        if im: ns = 'im'

        items = itms or im
        item_type = self.do_get_type(chart)

        for rank, item in enumerate(items):
            entry = TomahawkItemLoader(selector=item)
            if ns == 'im':
                entry.add_xpath(item_type, './/im:name/text()')
                entry.add_xpath('artist', './/im:artist/text()')
            else:
                entry.add_xpath('album', './/itms:album/text()')
                entry.add_xpath('artist', './/itms:artist/text()')
            entry.add_value('rank', rank)
            chart.add_value("list", entry.load_item())

        return self.do_process_item(chart)
Example #10
 def get_selector(cls, response):
   sel = Selector(response)
   for name, uri in cls.xml_namespaces:
     sel.register_namespace(name, uri)
   return sel
Example #11
 def get_selector_from_text(cls, text):
   text = u'<html><body>%s</body></html>' % text
   sel = Selector(text=text)
   for name, uri in cls.xml_namespaces:
     sel.register_namespace(name, uri)
   return sel
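
These two helpers evidently belong to a class that lists its namespaces as (prefix, uri) pairs and, given the cls argument, defines them as classmethods. A minimal sketch of the assumed surrounding class; the class name and the namespace list are illustrative:

class XmlFeedSpiderBase(object):
    # (prefix, uri) pairs consumed by get_selector / get_selector_from_text
    xml_namespaces = [
        ("atom", "http://www.w3.org/2005/Atom"),
    ]

    # the two classmethods above would be defined here; a parse callback can
    # then do:
    #   sel = self.get_selector(response)
    #   entries = sel.xpath("//atom:entry")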
Example #12
    def parse(self, response):
        sel = Selector(response)
        sel.register_namespace("redrover", "https://www.redroverapp.com/redrover")

        for event in sel.xpath("//redrover:event")[int(self.item_from):int(self.item_to)]:
            loader = MassAudubonLoader(item=EventItem(), selector=event)
            loader.selector.register_namespace("redrover", "https://www.redroverapp.com/redrover")
            event_images = {
                'Boston Nature Center': ['BostonNatureCenter[1].jpg', ],
                'Broadmoor Wildlife Sanctuary': ['Brodmoor[1].jpg', ],
                'Drumlin Farm Wildlife Sanctuary': ['DrumlinFarm.jpg', ],
                'Joppa Flats Education Center': ['JoppaFlats[1].jpg', ],
                'Moose Hill Wildlife Sanctuary': ['MooseHill[1].jpg', ],
                'North River Wildlife Sanctuary': ['North River.jpg', ],
                'Pleasant Valley Wildlife Sanctuary': ['PleasantValley.jpg', ],
                'Stony Brook Wildlife Sanctuary': ['Stony Brook.jpg', ],
                'Wellfleet Bay Wildlife Sanctuary': ['WellfleetBay.jpg', ],
            }

            for fld in [u"name", u"description", u"start_date", u"start_time", u"end_time", u"phone", ]:
                fld_xpath = "redrover:%s/text()" % fld
                loader.add_xpath(fld, fld_xpath)

            for fld in [u"name", u"street_addr", u"state", u"zip", u"city", u"lat", u"long"]:
                fld_xpath = "redrover:place/redrover:%s/text()" % fld
                loader.add_xpath(u"place_"+fld, fld_xpath)

            place_name = "".join(loader.get_collected_values(u"place_name"))
            if u"Drumlin Farm Wildlife Sanctuary".lower() in place_name.lower():
                loader.replace_value(u"place_name", u"Drumlin Farm Wildlife Sanctuary")
                loader.replace_value(u"place_street_addr", u"208 South Great Road")
                loader.replace_value(u"place_state", u"MA")
                loader.replace_value(u"place_city", u"Lincoln")
                loader.replace_value(u"place_zip", u"")
                loader.replace_value(u"place_lat", u"42.40980100000001")
                loader.replace_value(u"place_long", u"-71.331795")

            start_date = loader.get_collected_values("start_date")
            loader.replace_value(u"end_freq", start_date)

            place_name = "".join(loader.get_collected_values(u"place_name"))
            if place_name in event_images:
                image_name = event_images.get(place_name)[0]
            else:
                place_name = random.choice(event_images.keys())
                image_name = event_images.get(place_name)[0]

            loader.replace_value(u"image_name", image_name)

            loader.replace_value(
                u"image_urls",
                [u"%s/mass_audubon/%s" % (settings.EVENT_IMAGES_URL, image_name,)]
            )

            loader.add_value(u"ages", u"Kids")

            loader.add_value(u"rr_identifier", u"MAS")
            loader.add_value(u"rr_publisher_market", u"BOS")

            loader.add_xpath(u"link", u"redrover:link/text()")
            event_url = "".join(loader.get_collected_values(u"link"))
            if event_url:
                event_id = event_url.split("=")[-1]
                if event_id:
                    yield FormRequest(
                        url="http://www.massaudubon.org/MAOPRAPI/api",
                        formdata={
                            'action': 'GetEventDetail',
                            'data[eventId]': event_id,
                        },
                        callback=self.parse_event_page,
                        meta={"loader": loader},
                    )