def parser(_, objconf, skip=False, **kwargs):
    """ Parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default: content)
        stream (dict): The original item

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko import get_path
        >>> from riko.utils import get_abspath
        >>> from meza.fntools import Objectify
        >>>
        >>> feed = 'http://feeds.feedburner.com/TechCrunch/'
        >>> url = 'http://query.yahooapis.com/v1/public/yql'
        >>> query = "select * from feed where url='%s'" % feed
        >>> conf = {'query': query, 'url': url, 'debug': False}
        >>> objconf = Objectify(conf)
        >>> url = get_abspath(get_path('yql.xml'))
        >>>
        >>> with fetch(url) as f:
        ...     kwargs = {'stream': {}, 'response': f}
        ...     result = parser(None, objconf, **kwargs)
        >>>
        >>> next(result)['title']
        'Bring pizza home'
    """
    # When skipping, pass the original item through untouched.
    if skip:
        return kwargs['stream']

    response = kwargs.get('response')

    if not response:
        params = {'q': objconf.query, 'diagnostics': objconf.debug}

        # Default to automatic cache selection when memoizing without an
        # explicit cache type.
        if objconf.memoize and not objconf.cache_type:
            objconf.cache_type = 'auto'

        response = fetch(params=params, **objconf)

    # TODO: consider paging for large result sets
    results = xml2etree(response).getroot().find('results')
    return (etree2dict(element) for element in results)
def parser(_, objconf, skip=False, **kwargs):
    """ Parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from meza.fntools import Objectify
        >>> from riko import get_path
        >>>
        >>> url = get_path('ouseful.xml')
        >>> objconf = Objectify({'url': url, 'xpath': '/rss/channel/item'})
        >>> result = parser(None, objconf, stream={})
        >>> title = 'Running “Native” Data Wrangling Applications'
        >>> next(result)['title'][:44] == title
        True
    """
    # When skipping, pass the original item through untouched.
    if skip:
        return kwargs['stream']

    # Treat the source as XML when the file extension says so, or when
    # strict parsing is explicitly configured.
    extension = splitext(get_abspath(objconf.url))[1].lstrip('.')
    is_xml = (extension == 'xml') or objconf.strict

    # Default to automatic cache selection when memoizing without an
    # explicit cache type.
    if objconf.memoize and not objconf.cache_type:
        objconf.cache_type = 'auto'

    with fetch(**objconf) as response:
        tree = xml2etree(response, xml=is_xml, html5=objconf.html5)
        matches = xpath(tree.getroot(), objconf.xpath)

    items = map(etree2dict, matches)

    if objconf.stringify:
        return ({kwargs['assign']: str(item)} for item in items)

    return items
def parser(_, objconf, skip=False, **kwargs):
    """ Parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from meza.fntools import Objectify
        >>> from riko import get_path
        >>>
        >>> url = get_path('ouseful.xml')
        >>> objconf = Objectify({'url': url, 'xpath': '/rss/channel/item'})
        >>> result = parser(None, objconf, stream={})
        >>> title = 'Running “Native” Data Wrangling Applications'
        >>> next(result)['title'][:44] == title
        True
    """
    if skip:
        # Pass the original item through untouched.
        stream = kwargs['stream']
    else:
        url = get_abspath(objconf.url)
        ext = splitext(url)[1].lstrip('.')

        # Treat the source as XML when the extension says so, or when
        # strict parsing is explicitly configured.
        xml = (ext == 'xml') or objconf.strict

        # Fix: like the sibling parsers in this file, fall back to automatic
        # cache selection when memoization is requested without an explicit
        # cache type — otherwise memoize is silently ineffective.
        if objconf.memoize and not objconf.cache_type:
            objconf.cache_type = 'auto'

        with fetch(**objconf) as f:
            root = xml2etree(f, xml=xml, html5=objconf.html5).getroot()
            elements = xpath(root, objconf.xpath)

        items = map(etree2dict, elements)

        # Optionally wrap each element's string form under the assigned key.
        stringified = ({kwargs['assign']: str(i)} for i in items)
        stream = stringified if objconf.stringify else items

    return stream