def parser(_, objconf, skip, **kwargs):
    """ Parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Tuple(Iter[dict], bool): Tuple of (stream, skip)

    Examples:
        >>> from riko import get_path
        >>> from riko.lib.utils import Objectify
        >>>
        >>> objconf = Objectify({'url': get_path('bbc.html')})
        >>> result, skip = parser(None, objconf, False, stream={})
        >>> next(result)['title'] == 'Using NFC tags in the car'
        True
    """
    if skip:
        return kwargs['stream'], skip

    # Auto-discover the feed link embedded in the page, then parse that feed.
    abs_url = utils.get_abspath(objconf.url)
    detected = autorss.get_rss(abs_url)
    feed_link = utils.get_abspath(next(detected)['link'])
    parsed = utils.parse_rss(feed_link)
    return utils.gen_entries(parsed), skip
def parser(_, objconf, skip, **kwargs):
    """ Parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item
        conf (dict): The pipe configuration

    Returns:
        Tuple(Iter[dict], bool): Tuple of (stream, skip)

    Examples:
        >>> from riko import get_path
        >>> from riko.lib.utils import Objectify
        >>>
        >>> objconf = Objectify({'url': get_path('feed.xml'), 'sleep': 0})
        >>> result, skip = parser(None, objconf, False, stream={})
        >>> next(result)['title'] == 'Donations'
        True
    """
    if skip:
        return kwargs['stream'], skip

    feed_url = utils.get_abspath(objconf.url)
    parsed = utils.parse_rss(feed_url, objconf.sleep)
    return utils.gen_entries(parsed), skip
def parser(_, objconf, skip, **kwargs):
    """ Parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content

    Returns:
        Tuple(Iter[dict], bool): Tuple of (stream, skip)

    Examples:
        >>> from riko import get_path
        >>> from riko.lib.utils import Objectify
        >>>
        >>> url = get_path('spreadsheet.csv')
        >>> conf = {'url': url, 'sanitize': True, 'skip_rows': 0}
        >>> objconf = Objectify(conf)
        >>> result, skip = parser(None, objconf, False, stream={})
        >>> next(result)['mileage'] == '7213'
        True
    """
    if skip:
        return kwargs['stream'], skip

    abs_url = utils.get_abspath(objconf.url)
    response = urlopen(abs_url)
    encoding = utils.get_response_encoding(response, objconf.encoding)

    # Translate conf keys to the keyword names read_csv expects.
    renamed = {
        'first_row': objconf.skip_rows, 'custom_header': objconf.col_names}

    rkwargs = utils.combine_dicts(objconf, renamed)
    rkwargs['encoding'] = encoding
    return read_csv(response, **rkwargs), skip
def parser(_, objconf, skip, **kwargs):
    """ Parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Tuple(Iter[dict], bool): Tuple of (stream, skip)

    Examples:
        >>> from riko import get_path
        >>> from riko.lib.utils import Objectify
        >>>
        >>> objconf = Objectify({'url': get_path('bbc.html')})
        >>> result, skip = parser(None, objconf, False, stream={})
        >>> next(result)['link'] == 'file://riko/data/greenhughes.xml'
        True
    """
    if skip:
        return kwargs['stream'], skip

    # Scan the page for an auto-discoverable feed link.
    abs_url = utils.get_abspath(objconf.url)
    return autorss.get_rss(abs_url), skip
def parser(_, objconf, skip, **kwargs):
    """ Parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Tuple(Iter[dict], bool): Tuple of (stream, skip)

    Examples:
        >>> from riko import get_path
        >>> from riko.lib.utils import Objectify
        >>>
        >>> url = get_path('gigs.json')
        >>> objconf = Objectify({'url': url, 'path': 'value.items'})
        >>> result, skip = parser(None, objconf, False, stream={})
        >>> result[0]['title'] == 'Business System Analyst'
        True
    """
    if skip:
        return kwargs['stream'], skip

    abs_url = utils.get_abspath(objconf.url)

    # The file extension tells any2dict which parser to use.
    ext = splitext(abs_url)[1].lstrip('.')

    with closing(urlopen(abs_url)) as response:
        stream = utils.any2dict(
            response, ext, objconf.html5, path=objconf.path)

    return stream, skip
def async_parser(_, objconf, skip, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Tuple(Iter[dict], bool): Tuple of (stream, skip)

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from riko.lib.utils import Objectify
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(next(x[0])['title'])
        ...     objconf = Objectify({'url': get_path('bbc.html')})
        ...     d = async_parser(None, objconf, False, stream={})
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        Using NFC tags in the car
    """
    if skip:
        return_value((kwargs['stream'], skip))

    # Auto-discover the feed link, fetch it, then parse the feed content.
    abs_url = utils.get_abspath(objconf.url)
    detected = yield autorss.asyncGetRSS(abs_url)
    feed_link = utils.get_abspath(next(detected)['link'])
    content = yield io.async_url_read(feed_link)
    parsed = utils.parse_rss(content)
    return_value((utils.gen_entries(parsed), skip))
def async_parser(base, objconf, skip, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        base (str): The base currency (exchanging from)
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default:
            exchangerate)
        stream (dict): The original item

    Returns:
        Deferred: twisted.internet.defer.Deferred Tuple of (item, skip)

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from riko.lib.utils import Objectify
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(x[0])
        ...     url = get_path('quote.json')
        ...     conf = {
        ...         'url': url, 'currency': 'USD', 'sleep': 0, 'precision': 6}
        ...     item = {'content': 'GBP'}
        ...     objconf = Objectify(conf)
        ...     kwargs = {'stream': item, 'assign': 'content'}
        ...     d = async_parser(item['content'], objconf, False, **kwargs)
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        1.545801
    """
    if skip:
        rate = kwargs['stream']
    else:
        if objconf.url.startswith('http'):
            # remote quote: fetch and decode over the network
            response = yield treq.get(objconf.url, params=objconf.params)
            json = yield treq.json(response)
        else:
            # local file: read asynchronously with an optional delay
            abs_url = utils.get_abspath(objconf.url)
            content = yield io.async_url_read(abs_url, delay=objconf.sleep)
            json = loads(decode(content))

        places = Decimal(10) ** -objconf.precision
        rates = parse_response(json)
        rate = calc_rate(base, objconf.currency, rates, places=places)

    return_value((rate, skip))
def parser(base, objconf, skip, **kwargs):
    """ Parses the pipe content

    Args:
        base (str): The base currency (exchanging from)
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default:
            exchangerate)
        stream (dict): The original item

    Returns:
        Tuple(dict, bool): Tuple of (item, skip)

    Examples:
        >>> from riko import get_path
        >>> from riko.lib.utils import Objectify
        >>>
        >>> url = get_path('quote.json')
        >>> conf = {'url': url, 'currency': 'USD', 'sleep': 0, 'precision': 6}
        >>> item = {'content': 'GBP'}
        >>> objconf = Objectify(conf)
        >>> kwargs = {'stream': item, 'assign': 'content'}
        >>> result, skip = parser(item['content'], objconf, False, **kwargs)
        >>> result
        Decimal('1.545801')
    """
    if skip:
        rate = kwargs['stream']
    else:
        if objconf.url.startswith('http'):
            fetch = partial(requests.get, stream=True)

            # optionally cache remote responses for half a day
            if objconf.memoize:
                fetch = utils.memoize(utils.HALF_DAY)(fetch)

            response = fetch(objconf.url, params=objconf.params)
            json = next(items(response.raw, ''))
        else:
            abs_url = utils.get_abspath(objconf.url)
            sleepy = utils.SleepyDict(delay=objconf.sleep)

            try:
                with closing(urlopen(abs_url, context=sleepy)) as f:
                    json = next(items(f, ''))
            except TypeError:
                # presumably this urlopen lacks the `context` kwarg —
                # TODO confirm; retry without it
                with closing(urlopen(abs_url)) as f:
                    json = next(items(f, ''))

        places = Decimal(10) ** -objconf.precision
        rates = parse_response(json)
        rate = calc_rate(base, objconf.currency, rates, places=places)

    return rate, skip
def async_parser(_, objconf, skip, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default: content)
        stream (dict): The original item

    Returns:
        Tuple(Iter[dict], bool): Tuple of (stream, skip)

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from riko.lib.utils import Objectify
        >>> from meza._compat import decode
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(decode(next(x[0])['content'][:32]))
        ...     url = get_path('cnn.html')
        ...     conf = {'url': url, 'start': '<title>', 'end': '</title>'}
        ...     objconf = Objectify(conf)
        ...     kwargs = {'stream': {}, 'assign': 'content'}
        ...     d = async_parser(None, objconf, False, **kwargs)
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        CNN.com International - Breaking
    """
    if skip:
        return_value((kwargs['stream'], skip))

    abs_url = utils.get_abspath(objconf.url)
    content = yield io.async_url_read(abs_url)

    # extract the text between the start/end markers, optionally strip tags
    sliced = get_string(content, objconf.start, objconf.end)
    text = get_text(sliced) if objconf.detag else sliced

    if objconf.token:
        chunks = text.split(objconf.token)
    else:
        chunks = [text]

    stream = ({kwargs['assign']: chunk} for chunk in chunks)
    return_value((stream, skip))
def async_parser(_, objconf, skip, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Tuple(Iter[dict], bool): Tuple of (stream, skip)

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from riko.lib.utils import Objectify
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(next(x[0])['mileage'])
        ...     url = get_path('spreadsheet.csv')
        ...     conf = {'url': url, 'sanitize': True, 'skip_rows': 0}
        ...     objconf = Objectify(conf)
        ...     d = async_parser(None, objconf, False, stream={})
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        7213
    """
    if skip:
        return_value((kwargs['stream'], skip))

    # TODO: write function to extract encoding from response
    abs_url = utils.get_abspath(objconf.url)
    response = yield io.async_url_open(abs_url)

    # Translate conf keys to the keyword names read_csv expects.
    renamed = {
        'first_row': objconf.skip_rows, 'custom_header': objconf.col_names}

    rkwargs = utils.combine_dicts(objconf, renamed)
    rkwargs['encoding'] = objconf.encoding
    return_value((read_csv(response, **rkwargs), skip))
def async_parser(_, objconf, skip, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Tuple(Iter[dict], bool): Tuple of (stream, skip)

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from riko.lib.utils import Objectify
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(x[0][0]['title'])
        ...     url = get_path('gigs.json')
        ...     objconf = Objectify({'url': url, 'path': 'value.items'})
        ...     d = async_parser(None, objconf, False, stream={})
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        Business System Analyst
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = utils.get_abspath(objconf.url)

        # The file extension tells any2dict which parser to use.
        ext = splitext(url)[1].lstrip('.')
        f = yield io.async_url_open(url)

        try:
            stream = utils.any2dict(f, ext, objconf.html5, path=objconf.path)
        finally:
            # always release the handle, even if parsing fails (the
            # original only closed it on the success path, leaking it
            # whenever any2dict raised)
            f.close()

    result = (stream, skip)
    return_value(result)
def async_parser(_, objconf, skip, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item
        conf (dict): The pipe configuration

    Returns:
        Deferred: twisted.internet.defer.Deferred Tuple(Iter[dict], bool)

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from riko.lib.utils import Objectify
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(next(x[0])['title'])
        ...     objconf = Objectify({'url': get_path('feed.xml'), 'sleep': 0})
        ...     d = async_parser(None, objconf, False, stream={})
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        Donations
    """
    if skip:
        return_value((kwargs['stream'], skip))

    feed_url = utils.get_abspath(objconf.url)
    content = yield io.async_url_read(feed_url, delay=objconf.sleep)
    parsed = utils.parse_rss(content)
    return_value((utils.gen_entries(parsed), skip))
def parser(_, objconf, skip, **kwargs):
    """ Parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content

    Returns:
        Tuple(Iter[dict], bool): Tuple of (stream, skip)

    Examples:
        >>> from riko.lib.utils import Objectify
        >>> from riko import get_path
        >>> from meza._compat import decode
        >>>
        >>> url = get_path('cnn.html')
        >>> conf = {'url': url, 'start': '<title>', 'end': '</title>'}
        >>> objconf = Objectify(conf)
        >>> kwargs = {'stream': {}, 'assign': 'content'}
        >>> result, skip = parser(None, objconf, False, **kwargs)
        >>> resp = next(result)['content'][:21]
        >>> decode(resp) == 'CNN.com International'
        True
    """
    if skip:
        return kwargs['stream'], skip

    abs_url = utils.get_abspath(objconf.url)

    with closing(urlopen(abs_url)) as response:
        raw = response.fp
        encoding = utils.get_response_encoding(response, 'utf-8')

        # decode lazily and keep only the region between the markers;
        # the join consumes the pipeline while the response is still open
        decoded = iterdecode(raw, encoding)
        between = utils.betwix(decoded, objconf.start, objconf.end, True)
        content = '\n'.join(between)

    # trim to the exact marker boundaries, optionally strip tags
    sliced = get_string(content, objconf.start, objconf.end)
    text = get_text(sliced) if objconf.detag else sliced

    if objconf.token:
        chunks = text.split(objconf.token)
    else:
        chunks = [text]

    return ({kwargs['assign']: chunk} for chunk in chunks), skip
def parser(_, objconf, skip, **kwargs):
    """ Parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content

    Returns:
        Tuple(Iter[dict], bool): Tuple of (stream, skip)

    Examples:
        >>> from riko.lib.utils import Objectify
        >>> from riko import get_path
        >>>
        >>> url = get_path('ouseful.xml')
        >>> objconf = Objectify({'url': url, 'xpath': '/rss/channel/item'})
        >>> result, skip = parser(None, objconf, False, stream={})
        >>> title = 'Running “Native” Data Wrangling Applications'
        >>> next(result)['title'][:44] == title
        True
    """
    if skip:
        return kwargs['stream'], skip

    abs_url = utils.get_abspath(objconf.url)
    ext = splitext(abs_url)[1].lstrip('.')

    # treat the document as XML when the extension says so or strict is set
    is_xml = (ext == 'xml') or objconf.strict

    with closing(urlopen(abs_url)) as f:
        root = utils.xml2etree(f, xml=is_xml, html5=objconf.html5).getroot()

    matched = utils.xpath(root, objconf.xpath)
    entries = map(utils.etree2dict, matched)

    if objconf.stringify:
        stream = ({kwargs['assign']: str(entry)} for entry in entries)
    else:
        stream = entries

    return stream, skip
def async_parser(_, objconf, skip, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default: content)
        stream (dict): The original item

    Returns:
        Tuple(Iter[dict], bool): Tuple of (stream, skip)

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from riko.lib.utils import Objectify
        >>>
        >>> @coroutine
        ... def run(reactor):
        ...     xml_url = get_path('ouseful.xml')
        ...     xml_conf = {'url': xml_url, 'xpath': '/rss/channel/item'}
        ...     xml_objconf = Objectify(xml_conf)
        ...     xml_args = (None, xml_objconf, False)
        ...     html_url = get_path('sciencedaily.html')
        ...     html_conf = {'url': html_url, 'xpath': '/html/head/title'}
        ...     html_objconf = Objectify(html_conf)
        ...     html_args = (None, html_objconf, False)
        ...     kwargs = {'stream': {}}
        ...
        ...     try:
        ...         xml_stream, _ = yield async_parser(*xml_args, **kwargs)
        ...         html_stream, _ = yield async_parser(*html_args, **kwargs)
        ...         print(next(xml_stream)['title'][:44])
        ...         print(next(html_stream))
        ...     except Exception as e:
        ...         logger.error(e)
        ...         logger.error(traceback.format_exc())
        ...
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        Running “Native” Data Wrangling Applications
        Help Page -- ScienceDaily
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = utils.get_abspath(objconf.url)
        ext = splitext(url)[1].lstrip('.')

        # treat the document as XML when the extension says so or strict is
        # set
        xml = (ext == 'xml') or objconf.strict

        try:
            f = yield io.async_url_open(url)

            try:
                # NOTE(review): `util` vs `utils` here mirrors the original;
                # presumably `util` is the async (bado) helper module —
                # confirm against the file's imports
                tree = yield util.xml2etree(f, xml=xml)
            finally:
                # always release the handle, even when parsing fails (the
                # original only closed it on the success path)
                f.close()
        except Exception as e:
            # log for visibility, then re-raise so the Deferred errbacks.
            # The original swallowed the error and fell through, which made
            # the next line raise a NameError on the undefined `tree`.
            logger.error(e)
            logger.error(traceback.format_exc())
            raise

        elements = utils.xpath(tree, objconf.xpath)
        items = map(util.etree2dict, elements)
        stringified = ({kwargs['assign']: encode(i)} for i in items)
        stream = stringified if objconf.stringify else items

    result = (stream, skip)
    return_value(result)