def parser(_, objconf, skip=False, **kwargs):
    """ Parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from meza.fntools import Objectify
        >>> from riko import get_path
        >>>
        >>> url = get_path('ouseful.xml')
        >>> objconf = Objectify({'url': url, 'xpath': '/rss/channel/item'})
        >>> result = parser(None, objconf, stream={})
        >>> title = 'Running “Native” Data Wrangling Applications'
        >>> next(result)['title'][:44] == title
        True
    """
    # NOTE(review): a second `parser` definition later in this file appears to
    # shadow this one (minus the memoize handling) — confirm which is intended.
    if skip:
        return kwargs['stream']

    source_url = get_abspath(objconf.url)
    extension = splitext(source_url)[1].lstrip('.')
    # Treat the document as strict XML when the extension says so or the
    # config explicitly requests strict parsing.
    treat_as_xml = extension == 'xml' or objconf.strict

    # Default the cache backend when memoization is requested without one.
    if objconf.memoize and not objconf.cache_type:
        objconf.cache_type = 'auto'

    with fetch(**objconf) as response:
        document_root = xml2etree(
            response, xml=treat_as_xml, html5=objconf.html5).getroot()
        matches = xpath(document_root, objconf.xpath)

    parsed = map(etree2dict, matches)

    if objconf.stringify:
        return ({kwargs['assign']: str(entry)} for entry in parsed)

    return parsed
def parser(_, objconf, skip=False, **kwargs):
    """ Parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from meza.fntools import Objectify
        >>> from riko import get_path
        >>>
        >>> url = get_path('ouseful.xml')
        >>> objconf = Objectify({'url': url, 'xpath': '/rss/channel/item'})
        >>> result = parser(None, objconf, stream={})
        >>> title = 'Running “Native” Data Wrangling Applications'
        >>> next(result)['title'][:44] == title
        True
    """
    if skip:
        return kwargs['stream']

    resolved_url = get_abspath(objconf.url)
    suffix = splitext(resolved_url)[1].lstrip('.')
    # Parse as XML when the file extension says so, or when strict mode is on.
    use_xml = suffix == 'xml' or objconf.strict

    with fetch(**objconf) as response:
        document_root = xml2etree(
            response, xml=use_xml, html5=objconf.html5).getroot()
        matches = xpath(document_root, objconf.xpath)

    records = map(etree2dict, matches)

    if objconf.stringify:
        return ({kwargs['assign']: str(record)} for record in records)

    return records
def async_parser(_, objconf, skip=False, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default: content)
        stream (dict): The original item

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from meza.fntools import Objectify
        >>>
        >>> @coroutine
        ... def run(reactor):
        ...     xml_url = get_path('ouseful.xml')
        ...     xml_conf = {'url': xml_url, 'xpath': '/rss/channel/item'}
        ...     xml_objconf = Objectify(xml_conf)
        ...     xml_args = (None, xml_objconf)
        ...     html_url = get_path('sciencedaily.html')
        ...     html_conf = {'url': html_url, 'xpath': '/html/head/title'}
        ...     html_objconf = Objectify(html_conf)
        ...     html_args = (None, html_objconf)
        ...     kwargs = {'stream': {}}
        ...
        ...     try:
        ...         xml_stream = yield async_parser(*xml_args, **kwargs)
        ...         html_stream = yield async_parser(*html_args, **kwargs)
        ...         print(next(xml_stream)['title'][:44])
        ...         print(next(html_stream))
        ...     except Exception as e:
        ...         logger.error(e)
        ...         logger.error(traceback.format_exc())
        ...
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        Running “Native” Data Wrangling Applications
        Help Page -- ScienceDaily
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = get_abspath(objconf.url)
        ext = splitext(url)[1].lstrip('.')
        # Parse as XML when the extension says so or strict mode is requested.
        xml = (ext == 'xml') or objconf.strict

        try:
            f = yield io.async_url_open(url)

            try:
                tree = yield util.xml2etree(f, xml=xml)
                elements = xpath(tree, objconf.xpath)
            finally:
                # BUGFIX: previously `f` was only closed on the success path;
                # close it even when parsing or the xpath lookup fails.
                f.close()
        except Exception as e:
            # Log for diagnostics, then re-raise. BUGFIX: the original
            # swallowed the exception here and then crashed on the undefined
            # `tree`/`elements` names below, masking the real error.
            logger.error(e)
            logger.error(traceback.format_exc())
            raise

        items = map(util.etree2dict, elements)
        stringified = ({kwargs['assign']: encode(i)} for i in items)
        stream = stringified if objconf.stringify else items

    return_value(stream)