def async_get_rss(url, convert_charrefs=False):
    # Fetch the feed; if `url` isn't a valid URL, treat it as inline feed content
    try:
        f = yield async_url_open(url, timeout=TIMEOUT)
    except ValueError:
        f = filter(None, url.splitlines())

    # Parse leniently so slightly malformed feeds still yield entries
    document = microdom.parse(f, lenient=True)
    return_value(doc2entries(document))
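# The parsers above and below are generator-based coroutines: `yield` suspends
# on a Deferred and `return_value` delivers the final result. In riko they are
# wired up with the @coroutine decorator from riko.bado (a thin wrapper around
# Twisted's inlineCallbacks / returnValue, or the asyncio equivalents). A
# minimal sketch of that pattern, reusing the async_url_open / TIMEOUT helpers
# already used above; `async_fetch_body` is a hypothetical name used only for
# illustration and is not part of riko:
from riko.bado import coroutine, return_value


@coroutine
def async_fetch_body(url):
    # Asynchronously open the URL; execution resumes once the Deferred fires
    f = yield async_url_open(url, timeout=TIMEOUT)

    # return_value() sets the coroutine's result (like Twisted's returnValue)
    return_value(f.read())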
def async_parser(_, objconf, skip, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Tuple(Iter[dict], bool): Tuple of (stream, skip)

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from riko.lib.utils import Objectify
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(next(x[0])['mileage'])
        ...     url = get_path('spreadsheet.csv')
        ...     conf = {'url': url, 'sanitize': True, 'skip_rows': 0}
        ...     objconf = Objectify(conf)
        ...     d = async_parser(None, objconf, False, stream={})
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        7213
    """
    if skip:
        stream = kwargs['stream']
    else:
        # TODO: write function to extract encoding from response
        url = utils.get_abspath(objconf.url)
        response = yield io.async_url_open(url)
        first_row, custom_header = objconf.skip_rows, objconf.col_names
        renamed = {'first_row': first_row, 'custom_header': custom_header}
        rkwargs = utils.combine_dicts(objconf, renamed)
        rkwargs['encoding'] = objconf.encoding
        stream = read_csv(response, **rkwargs)

    result = (stream, skip)
    return_value(result)
def asyncGetRSS(url, convert_charrefs=False):
    # TODO: implement via an async parser
    # maybe get twisted.web.microdom.parse working for HTML
    try:
        parser = LinkParser(convert_charrefs=convert_charrefs)
    except TypeError:
        # Older HTMLParser versions don't accept `convert_charrefs`
        parser = LinkParser()

    # Fetch the feed; if `url` isn't a valid URL, treat it as inline feed content
    try:
        f = yield async_url_open(url, timeout=TIMEOUT)
    except ValueError:
        f = filter(None, url.splitlines())

    return_value(gen_entries(f, parser))
def async_parser(_, objconf, skip=False, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from meza.fntools import Objectify
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(next(x)['mileage'])
        ...     url = get_path('spreadsheet.csv')
        ...     conf = {
        ...         'url': url, 'sanitize': True, 'skip_rows': 0,
        ...         'encoding': ENCODING}
        ...     objconf = Objectify(conf)
        ...     d = async_parser(None, objconf, stream={})
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        7213
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = get_abspath(objconf.url)
        r = yield io.async_url_open(url)
        first_row, custom_header = objconf.skip_rows, objconf.col_names
        renamed = {'first_row': first_row, 'custom_header': custom_header}
        rkwargs = merge([objconf, renamed])
        stream = auto_close(read_csv(r, **rkwargs), r)

    return_value(stream)
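# `auto_close` (used above and in the fetchtext parser below) keeps the response
# handle open while the lazy stream is being consumed and closes it once
# iteration finishes or fails. It is assumed to behave roughly like this sketch
# (illustrative; not the verbatim riko implementation):
def _auto_close_sketch(stream, f):
    # Lazily re-yield each record, then close the underlying file/response
    try:
        for record in stream:
            yield record
    finally:
        f.close()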
def async_parser(_, objconf, skip, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Tuple(Iter[dict], bool): Tuple of (stream, skip)

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from riko.lib.utils import Objectify
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(x[0][0]['title'])
        ...     url = get_path('gigs.json')
        ...     objconf = Objectify({'url': url, 'path': 'value.items'})
        ...     d = async_parser(None, objconf, False, stream={})
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        Business System Analyst
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = utils.get_abspath(objconf.url)
        ext = splitext(url)[1].lstrip('.')
        f = yield io.async_url_open(url)
        stream = utils.any2dict(f, ext, objconf.html5, path=objconf.path)
        f.close()

    result = (stream, skip)
    return_value(result)
def async_parser(_, objconf, skip=False, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default: content)
        stream (dict): The original item

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from meza.fntools import Objectify
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(next(x)['content'])
        ...     url = get_path('lorem.txt')
        ...     objconf = Objectify({'url': url, 'encoding': ENCODING})
        ...     d = async_parser(None, objconf, assign='content')
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        What is Lorem Ipsum?
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = get_abspath(objconf.url)
        f = yield io.async_url_open(url)
        assign = kwargs['assign']
        encoding = objconf.encoding
        _stream = ({assign: line.strip().decode(encoding)} for line in f)
        stream = auto_close(_stream, f)

    return_value(stream)
def async_parser(_, objconf, skip=False, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from meza.fntools import Objectify
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(x[0]['title'])
        ...     url = get_path('gigs.json')
        ...     objconf = Objectify({'url': url, 'path': 'value.items'})
        ...     d = async_parser(None, objconf, stream={})
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        Business System Analyst
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = get_abspath(objconf.url)
        ext = p.splitext(url)[1].lstrip('.')
        f = yield io.async_url_open(url)
        stream = any2dict(f, ext, objconf.html5, path=objconf.path)
        f.close()

    return_value(stream)
def async_parser(_, objconf, skip=False, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default: content)
        stream (dict): The original item

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from meza.fntools import Objectify
        >>>
        >>> @coroutine
        ... def run(reactor):
        ...     xml_url = get_path('ouseful.xml')
        ...     xml_conf = {'url': xml_url, 'xpath': '/rss/channel/item'}
        ...     xml_objconf = Objectify(xml_conf)
        ...     xml_args = (None, xml_objconf)
        ...     html_url = get_path('sciencedaily.html')
        ...     html_conf = {'url': html_url, 'xpath': '/html/head/title'}
        ...     html_objconf = Objectify(html_conf)
        ...     html_args = (None, html_objconf)
        ...     kwargs = {'stream': {}}
        ...
        ...     try:
        ...         xml_stream = yield async_parser(*xml_args, **kwargs)
        ...         html_stream = yield async_parser(*html_args, **kwargs)
        ...         print(next(xml_stream)['title'][:44])
        ...         print(next(html_stream))
        ...     except Exception as e:
        ...         logger.error(e)
        ...         logger.error(traceback.format_exc())
        ...
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        Running “Native” Data Wrangling Applications
        Help Page -- ScienceDaily
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = get_abspath(objconf.url)
        ext = splitext(url)[1].lstrip('.')
        xml = (ext == 'xml') or objconf.strict

        try:
            f = yield io.async_url_open(url)
            tree = yield util.xml2etree(f, xml=xml)
        except Exception as e:
            logger.error(e)
            logger.error(traceback.format_exc())

        elements = xpath(tree, objconf.xpath)
        f.close()
        items = map(util.etree2dict, elements)
        stringified = ({kwargs['assign']: encode(i)} for i in items)
        stream = stringified if objconf.stringify else items

    return_value(stream)