def async_parser(_, objconf, skip=False, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default: content)
        stream (dict): The original item

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from meza.fntools import Objectify
        >>> from meza.compat import decode
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(decode(next(x)['content'][:32]))
        ...     url = get_path('cnn.html')
        ...     conf = {'url': url, 'start': '<title>', 'end': '</title>'}
        ...     objconf = Objectify(conf)
        ...     kwargs = {'stream': {}, 'assign': 'content'}
        ...     d = async_parser(None, objconf, **kwargs)
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        CNN.com International - Breaking
    """
    if skip:
        result = kwargs['stream']
    else:
        # Fetch the raw page body asynchronously, then slice out the
        # region between the configured start/end markers.
        raw = yield io.async_url_read(get_abspath(objconf.url))
        snippet = get_string(raw, objconf.start, objconf.end)

        # Optionally strip markup, then split on the token (if any).
        if objconf.detag:
            snippet = get_text(snippet)

        pieces = snippet.split(objconf.token) if objconf.token else [snippet]
        result = ({kwargs['assign']: piece} for piece in pieces)

    # Twisted inlineCallbacks-style return: hand the stream back to the caller
    return_value(result)
def parser(_, objconf, skip=False, **kwargs):
    """ Parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from meza.fntools import Objectify
        >>> from riko import get_path
        >>> from meza._compat import decode
        >>>
        >>> url = get_path('cnn.html')
        >>> conf = {'url': url, 'start': '<title>', 'end': '</title>'}
        >>> objconf = Objectify(conf)
        >>> kwargs = {'stream': {}, 'assign': 'content'}
        >>> result = parser(None, objconf, **kwargs)
        >>> resp = next(result)['content'][:21]
        >>> decode(resp) == 'CNN.com International'
        True
    """
    # Short-circuit: pass the original item through untouched.
    if skip:
        return kwargs['stream']

    url = get_abspath(objconf.url)

    # Stream the response, decoding lazily and keeping only the lines
    # between the configured start/end markers.
    with closing(urlopen(url)) as response:
        charset = get_response_encoding(response, 'utf-8')
        text_lines = iterdecode(response.fp, charset)
        window = betwix(text_lines, objconf.start, objconf.end, True)
        content = '\n'.join(window)

    # Trim the joined text to exactly the start/end span.
    extracted = get_string(content, objconf.start, objconf.end)
    cleaned = get_text(extracted) if objconf.detag else extracted

    if objconf.token:
        chunks = cleaned.split(objconf.token)
    else:
        chunks = [cleaned]

    return ({kwargs['assign']: chunk} for chunk in chunks)
def parser(_, objconf, skip=False, **kwargs):
    """ Parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from meza.fntools import Objectify
        >>> from riko import get_path
        >>> from meza.compat import decode
        >>>
        >>> url = get_path('cnn.html')
        >>> conf = {'url': url, 'start': '<title>', 'end': '</title>'}
        >>> objconf = Objectify(conf)
        >>> kwargs = {'stream': {}, 'assign': 'content'}
        >>> result = parser(None, objconf, **kwargs)
        >>> resp = next(result)['content'][:21]
        >>> decode(resp) == 'CNN.com International'
        True
    """
    # Short-circuit: pass the original item through untouched.
    if skip:
        return kwargs['stream']

    # Memoization requested but no cache backend chosen -> let riko pick one.
    if objconf.memoize and not objconf.cache_type:
        objconf.cache_type = 'auto'

    # Fetch (possibly from cache) and keep only the lines between the
    # configured start/end markers.
    with fetch(decode=True, **objconf) as page:
        window = betwix(page, objconf.start, objconf.end, True)
        content = '\n'.join(window)

    # Trim the joined text to exactly the start/end span.
    extracted = get_string(content, objconf.start, objconf.end)
    cleaned = get_text(extracted) if objconf.detag else extracted

    if objconf.token:
        chunks = cleaned.split(objconf.token)
    else:
        chunks = [cleaned]

    return ({kwargs['assign']: chunk} for chunk in chunks)
def parser(_, objconf, skip=False, **kwargs):
    """ Parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from meza.fntools import Objectify
        >>> from riko import get_path
        >>> from meza.compat import decode
        >>>
        >>> url = get_path('cnn.html')
        >>> conf = {'url': url, 'start': '<title>', 'end': '</title>'}
        >>> objconf = Objectify(conf)
        >>> kwargs = {'stream': {}, 'assign': 'content'}
        >>> result = parser(None, objconf, **kwargs)
        >>> resp = next(result)['content'][:21]
        >>> decode(resp) == 'CNN.com International'
        True
    """
    # Short-circuit: pass the original item through untouched.
    if skip:
        return kwargs['stream']

    # Fetch the page and keep only the lines between the configured
    # start/end markers.
    with fetch(decode=True, **objconf) as page:
        window = betwix(page, objconf.start, objconf.end, True)
        content = '\n'.join(window)

    # Trim the joined text to exactly the start/end span.
    extracted = get_string(content, objconf.start, objconf.end)
    cleaned = get_text(extracted) if objconf.detag else extracted

    if objconf.token:
        chunks = cleaned.split(objconf.token)
    else:
        chunks = [cleaned]

    return ({kwargs['assign']: chunk} for chunk in chunks)