def parser(_, objconf, skip=False, **kwargs):
    """ Parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko import get_path
        >>> from meza.fntools import Objectify
        >>>
        >>> objconf = Objectify({'url': get_path('bbc.html')})
        >>> result = parser(None, objconf, stream={})
        >>> next(result)['title'] == 'Using NFC tags in the car'
        True
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = get_abspath(objconf.url)
        rss = autorss.get_rss(url)
        link = get_abspath(next(rss)['link'])
        parsed = parse_rss(link)
        stream = gen_entries(parsed)

    return stream


def parser(_, objconf, skip=False, **kwargs):
    """ Parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item
        conf (dict): The pipe configuration

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko import get_path
        >>> from meza.fntools import Objectify
        >>>
        >>> objconf = Objectify({'url': get_path('feed.xml'), 'sleep': 0})
        >>> result = parser(None, objconf, stream={})
        >>> next(result)['title'] == 'Donations'
        True
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = get_abspath(objconf.url)
        parsed = parse_rss(url, objconf.sleep)
        stream = gen_entries(parsed)

    return stream


def parser(_, objconf, skip=False, **kwargs):
    """ Parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko import get_path
        >>> from meza.fntools import Objectify
        >>>
        >>> url = get_path('gigs.json')
        >>> objconf = Objectify({'url': url, 'path': 'value.items'})
        >>> result = parser(None, objconf, stream={})
        >>> result[0]['title'] == 'Business System Analyst'
        True
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = get_abspath(objconf.url)
        ext = splitext(url)[1].lstrip('.')

        with closing(urlopen(url)) as f:
            stream = any2dict(f, ext, objconf.html5, path=objconf.path)

    return stream


def parser(_, objconf, skip=False, **kwargs):
    """ Parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko import get_path
        >>> from meza.fntools import Objectify
        >>>
        >>> objconf = Objectify({'url': get_path('bbc.html')})
        >>> result = parser(None, objconf, stream={})
        >>> next(result)['link'] == 'file://riko/data/greenhughes.xml'
        True
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = get_abspath(objconf.url)
        stream = autorss.get_rss(url)

    return stream


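# None of the parsers above are meant to be called directly in application
# code. In riko, each module wraps its parser with module-level ``pipe``
# (synchronous) and ``async_pipe`` (Twisted) processors that handle conf
# parsing and item assignment. A minimal usage sketch, assuming the standard
# wrapper and a hypothetical feed URL:
#
#     from riko.modules.fetch import pipe
#
#     stream = pipe(conf={'url': 'http://example.com/feed.xml'})
#     print(next(stream)['title'])

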
@coroutine
def async_parser(base, objconf, skip=False, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        base (str): The base currency (exchanging from)
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default: exchangerate)
        stream (dict): The original item

    Returns:
        Deferred: twisted.internet.defer.Deferred item

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from meza.fntools import Objectify
        >>>
        >>> def run(reactor):
        ...     url = get_path('quote.json')
        ...     conf = {
        ...         'url': url, 'currency': 'USD', 'sleep': 0, 'precision': 6}
        ...     item = {'content': 'GBP'}
        ...     objconf = Objectify(conf)
        ...     kwargs = {'stream': item, 'assign': 'content'}
        ...     d = async_parser(item['content'], objconf, **kwargs)
        ...     return d.addCallbacks(print, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        1.545801
    """
    same_currency = base == objconf.currency

    if skip:
        rate = kwargs['stream']
    elif same_currency:
        rate = Decimal(1)
    elif objconf.url.startswith('http'):
        r = yield treq.get(objconf.url, params=objconf.params)
        json = yield treq.json(r)
    else:
        url = get_abspath(objconf.url)
        content = yield io.async_url_read(url, delay=objconf.sleep)
        json = loads(decode(content))

    if not (skip or same_currency):
        places = Decimal(10)**-objconf.precision
        rates = parse_response(json)
        rate = calc_rate(base, objconf.currency, rates, places=places)

    return_value(rate)


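# A note on the async style used throughout: ``riko.bado`` supplies
# ``coroutine`` and ``return_value``, thin wrappers around Twisted's
# ``inlineCallbacks`` and ``returnValue``, so each ``yield`` above suspends
# until the underlying Deferred fires. A minimal sketch of the same pattern
# in plain Twisted (``fetch_json`` and the treq usage are illustrative, not
# riko code):
#
#     import treq
#     from twisted.internet.defer import inlineCallbacks, returnValue
#
#     @inlineCallbacks
#     def fetch_json(url):
#         response = yield treq.get(url)             # wait for the response
#         data = yield treq.json_content(response)   # wait for the body
#         returnValue(data)

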
@coroutine
def async_parser(_, objconf, skip=False, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Deferred: twisted.internet.defer.Deferred Iter[dict]

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from meza.fntools import Objectify
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(next(x)['title'])
        ...     objconf = Objectify({'url': get_path('bbc.html')})
        ...     d = async_parser(None, objconf, stream={})
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        Using NFC tags in the car
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = get_abspath(objconf.url)
        rss = yield autorss.async_get_rss(url)
        link = get_abspath(next(rss)['link'])
        content = yield io.async_url_read(link)
        parsed = parse_rss(content)
        stream = gen_entries(parsed)

    return_value(stream)


def parser(base, objconf, skip=False, **kwargs):
    """ Parses the pipe content

    Args:
        base (str): The base currency (exchanging from)
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default: exchangerate)
        stream (dict): The original item

    Returns:
        dict: The item

    Examples:
        >>> from riko import get_path
        >>> from meza.fntools import Objectify
        >>>
        >>> url = get_path('quote.json')
        >>> conf = {'url': url, 'currency': 'USD', 'sleep': 0, 'precision': 6}
        >>> item = {'content': 'GBP'}
        >>> objconf = Objectify(conf)
        >>> kwargs = {'stream': item, 'assign': 'content'}
        >>> parser(item['content'], objconf, **kwargs)
        Decimal('1.545801')
    """
    same_currency = base == objconf.currency

    if skip:
        rate = kwargs['stream']
    elif same_currency:
        rate = Decimal(1)
    elif objconf.url.startswith('http'):
        get = partial(requests.get, stream=True)
        sget = memoize(HALF_DAY)(get) if objconf.memoize else get
        r = sget(objconf.url, params=objconf.params)
        r.raw.decode_content = True
        json = next(items(r.raw, ''))
    else:
        context = SleepyDict(delay=objconf.sleep)
        url = get_abspath(objconf.url)

        try:
            with closing(urlopen(url, context=context)) as f:
                json = next(items(f, ''))
        except TypeError:
            with closing(urlopen(url)) as f:
                json = next(items(f, ''))

    if not (skip or same_currency):
        places = Decimal(10)**-objconf.precision
        rates = parse_response(json)
        rate = calc_rate(base, objconf.currency, rates, places=places)

    return rate


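# How the ``precision`` option drives rounding in both exchangerate parsers:
# ``Decimal(10)**-6`` is ``Decimal('1E-6')``, and quantizing a raw cross rate
# to it (presumably what ``calc_rate`` does with its ``places`` argument)
# yields six decimal places, matching the doctest output. A self-contained
# illustration (the raw rate below is made up):
#
#     from decimal import Decimal
#
#     places = Decimal(10) ** -6           # Decimal('1E-6')
#     raw_rate = Decimal('1.5458014223')
#     print(raw_rate.quantize(places))     # 1.545801

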
@coroutine
def async_parser(_, objconf, skip=False, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default: content)
        stream (dict): The original item

    Returns:
        Deferred: twisted.internet.defer.Deferred Iter[dict]

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from meza.fntools import Objectify
        >>> from meza._compat import decode
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(decode(next(x)['content'][:32]))
        ...     url = get_path('cnn.html')
        ...     conf = {'url': url, 'start': '<title>', 'end': '</title>'}
        ...     objconf = Objectify(conf)
        ...     kwargs = {'stream': {}, 'assign': 'content'}
        ...     d = async_parser(None, objconf, **kwargs)
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        CNN.com International - Breaking
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = get_abspath(objconf.url)
        content = yield io.async_url_read(url)
        parsed = get_string(content, objconf.start, objconf.end)
        detagged = get_text(parsed) if objconf.detag else parsed
        splits = detagged.split(objconf.token) if objconf.token else [detagged]
        stream = ({kwargs['assign']: chunk} for chunk in splits)

    return_value(stream)


@coroutine
def async_parser(_, objconf, skip=False, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Deferred: twisted.internet.defer.Deferred Iter[dict]

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from meza.fntools import Objectify
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(next(x)['mileage'])
        ...     url = get_path('spreadsheet.csv')
        ...     conf = {'url': url, 'sanitize': True, 'skip_rows': 0}
        ...     objconf = Objectify(conf)
        ...     d = async_parser(None, objconf, stream={})
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        7213
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = get_abspath(objconf.url)
        response = yield io.async_url_open(url)
        first_row, custom_header = objconf.skip_rows, objconf.col_names
        renamed = {'first_row': first_row, 'custom_header': custom_header}
        rkwargs = merge([objconf, renamed])
        _stream = read_csv(response, **rkwargs)
        stream = auto_close(_stream, response)

    return_value(stream)


@coroutine
def async_parser(_, objconf, skip=False, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Deferred: twisted.internet.defer.Deferred Iter[dict]

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from meza.fntools import Objectify
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(x[0]['title'])
        ...     url = get_path('gigs.json')
        ...     objconf = Objectify({'url': url, 'path': 'value.items'})
        ...     d = async_parser(None, objconf, stream={})
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        Business System Analyst
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = get_abspath(objconf.url)
        ext = splitext(url)[1].lstrip('.')
        f = yield io.async_url_open(url)
        stream = any2dict(f, ext, objconf.html5, path=objconf.path)
        f.close()

    return_value(stream)


@coroutine
def async_parser(_, objconf, skip=False, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item
        conf (dict): The pipe configuration

    Returns:
        Deferred: twisted.internet.defer.Deferred Iter[dict]

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from meza.fntools import Objectify
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(next(x)['title'])
        ...     objconf = Objectify({'url': get_path('feed.xml'), 'sleep': 0})
        ...     d = async_parser(None, objconf, stream={})
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        Donations
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = get_abspath(objconf.url)
        content = yield io.async_url_read(url, delay=objconf.sleep)
        parsed = parse_rss(content)
        stream = gen_entries(parsed)

    return_value(stream)


def parser(_, objconf, skip=False, **kwargs):
    """ Parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default: content)
        stream (dict): The original item

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from meza.fntools import Objectify
        >>> from riko import get_path
        >>> from meza._compat import decode
        >>>
        >>> url = get_path('cnn.html')
        >>> conf = {'url': url, 'start': '<title>', 'end': '</title>'}
        >>> objconf = Objectify(conf)
        >>> kwargs = {'stream': {}, 'assign': 'content'}
        >>> result = parser(None, objconf, **kwargs)
        >>> resp = next(result)['content'][:21]
        >>> decode(resp) == 'CNN.com International'
        True
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = get_abspath(objconf.url)

        with closing(urlopen(url)) as response:
            f = response.fp
            encoding = get_response_encoding(response, 'utf-8')
            decoded = iterdecode(f, encoding)
            sliced = betwix(decoded, objconf.start, objconf.end, True)
            content = '\n'.join(sliced)

        parsed = get_string(content, objconf.start, objconf.end)
        detagged = get_text(parsed) if objconf.detag else parsed
        splits = detagged.split(objconf.token) if objconf.token else [detagged]
        stream = ({kwargs['assign']: chunk} for chunk in splits)

    return stream


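# The core of both fetchpage parsers is the between-markers slice: ``betwix``
# keeps only the decoded text between ``start`` and ``end``, and
# ``get_string`` then trims the markers themselves. A rough pure-Python
# sketch of the trimming idea (``extract_between`` is a hypothetical
# stand-in, not the riko helper):
#
#     def extract_between(content, start, end):
#         """Return the text between the first `start` and the next `end`."""
#         tail = content.split(start, 1)[-1]  # drop through `start`
#         return tail.split(end, 1)[0]        # drop from `end` onward
#
#     title = extract_between('<title>CNN.com</title>', '<title>', '</title>')
#     assert title == 'CNN.com'

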
def parser(_, objconf, skip=False, **kwargs):
    """ Parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default: content)
        stream (dict): The original item

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from meza.fntools import Objectify
        >>> from riko import get_path
        >>>
        >>> url = get_path('ouseful.xml')
        >>> objconf = Objectify({'url': url, 'xpath': '/rss/channel/item'})
        >>> result = parser(None, objconf, stream={})
        >>> title = 'Running “Native” Data Wrangling Applications'
        >>> next(result)['title'][:44] == title
        True
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = get_abspath(objconf.url)
        ext = splitext(url)[1].lstrip('.')
        xml = (ext == 'xml') or objconf.strict

        with closing(urlopen(url)) as f:
            root = xml2etree(f, xml=xml, html5=objconf.html5).getroot()
            elements = xpath(root, objconf.xpath)

        items = map(etree2dict, elements)
        stringified = ({kwargs['assign']: str(i)} for i in items)
        stream = stringified if objconf.stringify else items

    return stream


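# What the xpath parsers reduce to: parse a document into an element tree,
# evaluate an XPath expression, and turn each matching element into a dict.
# A minimal sketch with the standard library (``xml.etree.ElementTree``
# supports only a limited XPath subset; the sample XML is made up):
#
#     import xml.etree.ElementTree as ET
#
#     doc = '<rss><channel><item><title>hi</title></item></channel></rss>'
#     root = ET.fromstring(doc)
#     for el in root.findall('./channel/item'):
#         print({child.tag: child.text for child in el})  # {'title': 'hi'}

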
def parser(_, objconf, skip=False, **kwargs):
    """ Parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko import get_path
        >>> from meza.fntools import Objectify
        >>>
        >>> url = get_path('spreadsheet.csv')
        >>> conf = {'url': url, 'sanitize': True, 'skip_rows': 0}
        >>> objconf = Objectify(conf)
        >>> result = parser(None, objconf, stream={})
        >>> next(result)['mileage'] == '7213'
        True
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = get_abspath(objconf.url)
        first_row, custom_header = objconf.skip_rows, objconf.col_names
        renamed = {'first_row': first_row, 'custom_header': custom_header}
        response = urlopen(url)
        encoding = get_response_encoding(response, objconf.encoding)
        rkwargs = merge([objconf, renamed])
        rkwargs['encoding'] = encoding
        _stream = read_csv(response, **rkwargs)
        stream = auto_close(_stream, response)

    return stream


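# Both csv parsers translate riko's option names into the ones meza's
# ``read_csv`` expects: ``skip_rows`` becomes ``first_row`` and ``col_names``
# becomes ``custom_header``. A plain-dict sketch of that translation (the
# option values are made up):
#
#     objconf = {'skip_rows': 1, 'col_names': ['make', 'model', 'mileage']}
#     renamed = {
#         'first_row': objconf['skip_rows'],
#         'custom_header': objconf['col_names'],
#     }
#     rkwargs = {**objconf, **renamed}  # roughly what merge([...]) builds

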
@coroutine
def async_parser(_, objconf, skip=False, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default: content)
        stream (dict): The original item

    Returns:
        Deferred: twisted.internet.defer.Deferred Iter[dict]

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from meza.fntools import Objectify
        >>>
        >>> @coroutine
        ... def run(reactor):
        ...     xml_url = get_path('ouseful.xml')
        ...     xml_conf = {'url': xml_url, 'xpath': '/rss/channel/item'}
        ...     xml_objconf = Objectify(xml_conf)
        ...     xml_args = (None, xml_objconf)
        ...     html_url = get_path('sciencedaily.html')
        ...     html_conf = {'url': html_url, 'xpath': '/html/head/title'}
        ...     html_objconf = Objectify(html_conf)
        ...     html_args = (None, html_objconf)
        ...     kwargs = {'stream': {}}
        ...
        ...     try:
        ...         xml_stream = yield async_parser(*xml_args, **kwargs)
        ...         html_stream = yield async_parser(*html_args, **kwargs)
        ...         print(next(xml_stream)['title'][:44])
        ...         print(next(html_stream))
        ...     except Exception as e:
        ...         logger.error(e)
        ...         logger.error(traceback.format_exc())
        ...
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        Running “Native” Data Wrangling Applications
        Help Page -- ScienceDaily
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = get_abspath(objconf.url)
        ext = splitext(url)[1].lstrip('.')
        xml = (ext == 'xml') or objconf.strict

        try:
            f = yield io.async_url_open(url)
            tree = yield util.xml2etree(f, xml=xml)
        except Exception as e:
            logger.error(e)
            logger.error(traceback.format_exc())

        elements = xpath(tree, objconf.xpath)
        f.close()
        items = map(util.etree2dict, elements)
        stringified = ({kwargs['assign']: encode(i)} for i in items)
        stream = stringified if objconf.stringify else items

    return_value(stream)