def async_parser(base, objconf, skip=False, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        base (str): The base currency (exchanging from)
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default:
            exchangerate)
        stream (dict): The original item

    Returns:
        Deferred: twisted.internet.defer.Deferred item

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from meza.fntools import Objectify
        >>>
        >>> def run(reactor):
        ...     url = get_path('quote.json')
        ...     conf = {
        ...         'url': url, 'currency': 'USD', 'delay': 0, 'precision': 6}
        ...     item = {'content': 'GBP'}
        ...     objconf = Objectify(conf)
        ...     kwargs = {'stream': item, 'assign': 'content'}
        ...     d = async_parser(item['content'], objconf, **kwargs)
        ...     return d.addCallbacks(print, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        1.275201
    """
    same_currency = base == objconf.currency

    if skip:
        rate = kwargs['stream']
    elif same_currency:
        rate = Decimal(1)
    elif objconf.url.startswith('http'):
        r = yield treq.get(objconf.url, params=objconf.params)
        json = yield treq.json(r)
    else:
        url = get_abspath(objconf.url)
        content = yield io.async_url_read(url, delay=objconf.delay)
        json = loads(decode(content))

    if not (skip or same_currency):
        places = Decimal(10) ** -objconf.precision
        rates = parse_response(json)
        rate = calc_rate(base, objconf.currency, rates, places=places)

    return_value(rate)

def async_parser(base, objconf, skip=False, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        base (str): The base currency (exchanging from)
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default:
            exchangerate)
        stream (dict): The original item

    Returns:
        Deferred: twisted.internet.defer.Deferred item

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from meza.fntools import Objectify
        >>>
        >>> def run(reactor):
        ...     url = get_path('quote.json')
        ...     conf = {
        ...         'url': url, 'currency': 'USD', 'sleep': 0, 'precision': 6}
        ...     item = {'content': 'GBP'}
        ...     objconf = Objectify(conf)
        ...     kwargs = {'stream': item, 'assign': 'content'}
        ...     d = async_parser(item['content'], objconf, **kwargs)
        ...     return d.addCallbacks(print, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        1.545801
    """
    same_currency = base == objconf.currency

    if skip:
        rate = kwargs['stream']
    elif same_currency:
        rate = Decimal(1)
    elif objconf.url.startswith('http'):
        r = yield treq.get(objconf.url, params=objconf.params)
        json = yield treq.json(r)
    else:
        url = get_abspath(objconf.url)
        content = yield io.async_url_read(url, delay=objconf.sleep)
        json = loads(decode(content))

    if not (skip or same_currency):
        places = Decimal(10) ** -objconf.precision
        rates = parse_response(json)
        rate = calc_rate(base, objconf.currency, rates, places=places)

    return_value(rate)

def async_get_rss(url, convert_charrefs=False):
    try:
        f = yield async_url_open(url, timeout=TIMEOUT)
    except ValueError:
        # `url` isn't a valid URL, so treat it as inline content
        f = filter(None, url.splitlines())

    document = microdom.parse(f, lenient=True)
    return_value(doc2entries(document))

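# Usage sketch for async_get_rss, grounded in the autorss doctest further
# down (bbc.html advertises greenhughes.xml); the exact import locations are
# an assumption:
#
#     >>> from riko import get_path
#     >>> from riko.bado import react
#     >>> from riko.bado.mock import FakeReactor
#     >>>
#     >>> def run(reactor):
#     ...     d = async_get_rss(get_path('bbc.html'))
#     ...     callback = lambda x: print(next(x)['link'])
#     ...     return d.addCallbacks(callback, logger.error)
#     >>>
#     >>> try:
#     ...     react(run, _reactor=FakeReactor())
#     ... except SystemExit:
#     ...     pass
#     ...
#     file://riko/data/greenhughes.xml
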
# Closure extracted from the regex-replace async_parser below: `kwargs` and
# `multi` are free variables bound by the enclosing function, which also
# applies the @coroutine decorator shown there.
@coroutine
def async_reducer(item, rules):
    field = rules[0]['field']
    word = item.get(field, **kwargs)

    # group the rules by regex flags so they can be applied in batches
    grouped = group_by(rules, 'flags')
    group_rules = [g[1] for g in grouped] if multi else rules
    reducer = multi_substitute if multi else substitute
    replacement = yield ait.coop_reduce(reducer, group_rules, word)
    combined = merge([item, {field: replacement}])
    return_value(DotDict(combined))

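# A self-contained sketch of the grouping step above, assuming group_by
# returns (key, group) pairs the way a materialized itertools.groupby does;
# the rule dicts are hypothetical.
from itertools import groupby
from operator import itemgetter

demo_rules = [
    {'field': 'content', 'flags': 0, 'find': 'hello', 'replace': 'bye'},
    {'field': 'content', 'flags': 0, 'find': 'world', 'replace': 'earth'},
]
grouped = [(k, list(g)) for k, g in groupby(demo_rules, key=itemgetter('flags'))]
group_rules = [g[1] for g in grouped]  # one batch of rules per flag value
# group_rules -> [[{... 'find': 'hello' ...}, {... 'find': 'world' ...}]]
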
def wrapper(item=None, **kwargs):
    module_name = wrapper.__module__.split('.')[-1]

    defaults = {
        'dictize': True, 'ftype': 'pass', 'ptype': 'pass',
        'objectify': True}

    combined = merge([self.defaults, defaults, self.opts, kwargs])
    is_source = combined['ftype'] == 'none'
    def_assign = 'content' if is_source else module_name
    extracted = 'extract' in combined
    pdictize = combined.get('listize') if extracted else True

    combined.setdefault('assign', def_assign)
    combined.setdefault('emit', is_source)
    combined.setdefault('pdictize', pdictize)
    conf = {k: combined[k] for k in self.defaults}
    conf.update(kwargs.get('conf', {}))
    combined.update({'conf': conf})

    uconf = DotDict(conf) if combined.get('dictize') else conf
    updates = {'conf': uconf, 'assign': combined.get('assign')}
    kwargs.update(updates)

    item = item or {}
    _input = DotDict(item) if combined.get('dictize') else item
    bfuncs = get_broadcast_funcs(**combined)
    skip = get_skip(_input, **combined)
    types = set([]) if skip else {combined['ftype'], combined['ptype']}

    if types.difference({'pass', 'none'}):
        dfuncs = get_dispatch_funcs(**combined)
    else:
        dfuncs = None

    parsed, orig_item = _dispatch(_input, bfuncs, dfuncs=dfuncs)
    kwargs.update({'skip': skip, 'stream': orig_item})

    if self.async:
        stream = yield pipe(*parsed, **kwargs)
    else:
        stream = pipe(*parsed, **kwargs)

    one, assignment = get_assignment(stream, skip=skip, **combined)

    if skip or combined.get('emit'):
        stream = assignment
    elif not skip:
        stream = assign(_input, assignment, one=one, **combined)

    if self.async:
        return_value(stream)
    else:
        for s in stream:
            yield s

def wrapper(item=None, **kwargs):
    module_name = wrapper.__module__.split('.')[-1]

    defaults = {
        'dictize': True, 'ftype': 'pass', 'ptype': 'pass',
        'objectify': True}

    combined = cdicts(self.defaults, defaults, self.opts, kwargs)
    is_source = combined['ftype'] == 'none'
    def_assign = 'content' if is_source else module_name
    extracted = 'extract' in combined
    pdictize = combined.get('listize') if extracted else True

    combined.setdefault('assign', def_assign)
    combined.setdefault('emit', is_source)
    combined.setdefault('pdictize', pdictize)
    conf = {k: combined[k] for k in self.defaults}
    conf.update(kwargs.get('conf', {}))
    combined.update({'conf': conf})

    # replace conf with dictized version so we can access its
    # attributes even if we already extracted a value
    updates = {'conf': DotDict(conf), 'assign': combined.get('assign')}
    kwargs.update(updates)

    item = item or {}
    _input = DotDict(item) if combined.get('dictize') else item
    bfuncs = get_broadcast_funcs(**combined)
    types = {combined['ftype'], combined['ptype']}

    if types.difference({'pass', 'none'}):
        dfuncs = get_dispatch_funcs(**combined)
    else:
        dfuncs = None

    parsed, orig_item = dispatch(_input, bfuncs, dfuncs=dfuncs)

    if self.async:
        stream, skip = yield pipe(*parsed, stream=orig_item, **kwargs)
    else:
        stream, skip = pipe(*parsed, stream=orig_item, **kwargs)

    one, assignment = get_assignment(stream, skip, **combined)

    if skip or combined.get('emit'):
        stream = assignment
    elif not skip:
        key = combined.get('assign')
        stream = assign(_input, assignment, key, one=one)

    if self.async:
        return_value(stream)
    else:
        for s in stream:
            yield s

def output(self):
    source = yield self.source
    async_pipeline = partial(self.async_pipe, **self.kwargs)

    if self.mapify:
        # run the pipeline over each source item and merge the results
        args = (async_pipeline, source, self.connections)
        mapped = yield ait.async_map(*args)
        output = multiplex(mapped)
    else:
        output = yield async_pipeline(source)

    return_value(output)

def async_parser(_, objconf, skip, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default: content)
        stream (dict): The original item

    Returns:
        Tuple(Iter[dict], bool): Tuple of (stream, skip)

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from riko.lib.utils import Objectify
        >>> from meza._compat import decode
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(decode(next(x[0])['content'][:32]))
        ...     url = get_path('cnn.html')
        ...     conf = {'url': url, 'start': '<title>', 'end': '</title>'}
        ...     objconf = Objectify(conf)
        ...     kwargs = {'stream': {}, 'assign': 'content'}
        ...     d = async_parser(None, objconf, False, **kwargs)
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        CNN.com International - Breaking
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = utils.get_abspath(objconf.url)
        content = yield io.async_url_read(url)
        parsed = get_string(content, objconf.start, objconf.end)
        detagged = get_text(parsed) if objconf.detag else parsed
        splits = detagged.split(objconf.token) if objconf.token else [detagged]
        stream = ({kwargs['assign']: chunk} for chunk in splits)

    result = (stream, skip)
    return_value(result)

def async_parser(_, objconf, skip, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Tuple(Iter[dict], bool): Tuple of (stream, skip)

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from riko.lib.utils import Objectify
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(next(x[0])['mileage'])
        ...     url = get_path('spreadsheet.csv')
        ...     conf = {'url': url, 'sanitize': True, 'skip_rows': 0}
        ...     objconf = Objectify(conf)
        ...     d = async_parser(None, objconf, False, stream={})
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        7213
    """
    if skip:
        stream = kwargs['stream']
    else:
        # TODO: write function to extract encoding from response
        url = utils.get_abspath(objconf.url)
        response = yield io.async_url_open(url)
        first_row, custom_header = objconf.skip_rows, objconf.col_names
        renamed = {'first_row': first_row, 'custom_header': custom_header}
        rkwargs = utils.combine_dicts(objconf, renamed)
        rkwargs['encoding'] = objconf.encoding
        stream = read_csv(response, **rkwargs)

    result = (stream, skip)
    return_value(result)

def asyncGetRSS(url, convert_charrefs=False):
    # TODO: implement via an async parser
    # maybe get twisted.web.microdom.parse working for HTML
    try:
        parser = LinkParser(convert_charrefs=convert_charrefs)
    except TypeError:
        # older HTMLParser (pre Python 3.4) doesn't accept convert_charrefs
        parser = LinkParser()

    try:
        f = yield async_url_open(url, timeout=TIMEOUT)
    except ValueError:
        # `url` isn't a valid URL, so treat it as inline content
        f = filter(None, url.splitlines())

    return_value(gen_entries(f, parser))

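# The ValueError fallback above means asyncGetRSS also accepts raw markup in
# place of a URL: async_url_open rejects the non-URL string, and the content
# is split into lines and fed to LinkParser. A heavily hedged sketch (the
# inline HTML and printed key are hypothetical):
#
#     >>> html = '<link rel="alternate" type="application/rss+xml" href="r.xml">'
#     >>> def run(reactor):
#     ...     d = asyncGetRSS(html)
#     ...     callback = lambda x: print(next(x)['link'])
#     ...     return d.addCallbacks(callback, logger.error)
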
def async_parser(_, objconf, skip=False, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from meza.fntools import Objectify
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(next(x)['mileage'])
        ...     url = get_path('spreadsheet.csv')
        ...     conf = {
        ...         'url': url, 'sanitize': True, 'skip_rows': 0,
        ...         'encoding': ENCODING}
        ...     objconf = Objectify(conf)
        ...     d = async_parser(None, objconf, stream={})
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        7213
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = get_abspath(objconf.url)
        r = yield io.async_url_open(url)
        first_row, custom_header = objconf.skip_rows, objconf.col_names
        renamed = {'first_row': first_row, 'custom_header': custom_header}
        rkwargs = merge([objconf, renamed])
        stream = auto_close(read_csv(r, **rkwargs), r)

    return_value(stream)

def async_parser(_, objconf, skip, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Tuple(Iter[dict], bool): Tuple of (stream, skip)

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from riko.lib.utils import Objectify
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(next(x[0])['title'])
        ...     objconf = Objectify({'url': get_path('bbc.html')})
        ...     d = async_parser(None, objconf, False, stream={})
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        Using NFC tags in the car
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = utils.get_abspath(objconf.url)
        rss = yield autorss.asyncGetRSS(url)
        link = utils.get_abspath(next(rss)['link'])
        content = yield io.async_url_read(link)
        parsed = utils.parse_rss(content)
        stream = utils.gen_entries(parsed)

    result = (stream, skip)
    return_value(result)

def async_parser(_, objconf, skip=False, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from meza.fntools import Objectify
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(next(x)['content'])
        ...     url = get_path('lorem.txt')
        ...     objconf = Objectify({'url': url, 'encoding': ENCODING})
        ...     d = async_parser(None, objconf, assign='content')
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        What is Lorem Ipsum?
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = get_abspath(objconf.url)
        f = yield io.async_url_open(url)
        assign = kwargs['assign']
        encoding = objconf.encoding
        _stream = ({assign: line.strip().decode(encoding)} for line in f)
        stream = auto_close(_stream, f)

    return_value(stream)

def async_parser(_, objconf, skip, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Tuple(Iter[dict], bool): Tuple of (stream, skip)

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from riko.lib.utils import Objectify
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(x[0][0]['title'])
        ...     url = get_path('gigs.json')
        ...     objconf = Objectify({'url': url, 'path': 'value.items'})
        ...     d = async_parser(None, objconf, False, stream={})
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        Business System Analyst
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = utils.get_abspath(objconf.url)
        ext = splitext(url)[1].lstrip('.')
        f = yield io.async_url_open(url)
        stream = utils.any2dict(f, ext, objconf.html5, path=objconf.path)
        f.close()

    result = (stream, skip)
    return_value(result)

def async_parser(_, objconf, skip=False, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from meza.fntools import Objectify
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(x[0]['title'])
        ...     url = get_path('gigs.json')
        ...     objconf = Objectify({'url': url, 'path': 'value.items'})
        ...     d = async_parser(None, objconf, stream={})
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        Business System Analyst
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = get_abspath(objconf.url)
        ext = p.splitext(url)[1].lstrip('.')
        f = yield io.async_url_open(url)
        stream = any2dict(f, ext, objconf.html5, path=objconf.path)
        f.close()

    return_value(stream)

def async_parser(_, objconf, skip, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item
        conf (dict): The pipe configuration

    Returns:
        Deferred: twisted.internet.defer.Deferred Tuple(Iter[dict], bool)

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from riko.lib.utils import Objectify
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(next(x[0])['title'])
        ...     objconf = Objectify({'url': get_path('feed.xml'), 'sleep': 0})
        ...     d = async_parser(None, objconf, False, stream={})
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        Donations
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = utils.get_abspath(objconf.url)
        content = yield io.async_url_read(url, delay=objconf.sleep)
        parsed = utils.parse_rss(content)
        stream = utils.gen_entries(parsed)

    result = (stream, skip)
    return_value(result)

def async_parser(word, rules, skip, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        word (str): The string to transform
        rules (List[obj]): the parsed rules (Objectify instances).
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default:
            strtransform)
        stream (dict): The original item

    Returns:
        Deferred: twisted.internet.defer.Deferred Tuple of (item, skip)

    Examples:
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from riko.lib.utils import Objectify
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(x[0])
        ...     item = {'content': 'hello world'}
        ...     conf = {'rule': {'transform': 'title'}}
        ...     rule = Objectify(conf['rule'])
        ...     kwargs = {'stream': item, 'conf': conf}
        ...     d = async_parser(item['content'], [rule], False, **kwargs)
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        Hello World
    """
    if skip:
        value = kwargs['stream']
    else:
        value = yield ait.coop_reduce(reducer, rules, word)

    result = (value, skip)
    return_value(result)

def async_parser(word, rules, skip, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        word (str): The string to transform
        rules (List[obj]): the parsed rules (Objectify instances).
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default:
            exchangerate)
        stream (dict): The original item

    Returns:
        Deferred: twisted.internet.defer.Deferred Tuple of (item, skip)

    Examples:
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from riko.lib.utils import Objectify
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(x[0])
        ...     item = {'content': 'hello world'}
        ...     conf = {'rule': {'find': 'hello', 'replace': 'bye'}}
        ...     rule = Objectify(conf['rule'])
        ...     kwargs = {'stream': item, 'conf': conf}
        ...     d = async_parser(item['content'], [rule], False, **kwargs)
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        bye world
    """
    if skip:
        value = kwargs['stream']
    else:
        value = yield ait.coop_reduce(reducer, rules, word)

    result = (value, skip)
    return_value(result)

def async_parser(item, rules, skip, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        item (obj): The entry to process (a DotDict instance)
        rules (List[obj]): the parsed rules (Objectify instances).
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Deferred: twisted.internet.defer.Deferred Tuple of (item, skip)

    Examples:
        >>> from riko.bado import react
        >>> from riko.lib.dotdict import DotDict
        >>> from riko.bado.mock import FakeReactor
        >>> from riko.lib.utils import Objectify
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(x[0] == {'greeting': 'hello world'})
        ...     item = DotDict({'content': 'hello world'})
        ...     rule = {'field': 'content', 'newval': 'greeting'}
        ...     kwargs = {'stream': item}
        ...     d = async_parser(item, [Objectify(rule)], False, **kwargs)
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        True
    """
    if skip:
        item = kwargs['stream']
    else:
        item = yield ait.coop_reduce(reducer, rules, item)

    result = (item, skip)
    return_value(result)

def async_parser(item, rules, skip=False, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        item (obj): The entry to process (a DotDict instance)
        rules (List[obj]): the parsed rules (Objectify instances).
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Deferred: twisted.internet.defer.Deferred item

    Examples:
        >>> from riko.bado import react
        >>> from riko.dotdict import DotDict
        >>> from riko.bado.mock import FakeReactor
        >>> from meza.fntools import Objectify
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(x == {'greeting': 'hello world'})
        ...     item = DotDict({'content': 'hello world'})
        ...     rule = {'field': 'content', 'newval': 'greeting'}
        ...     kwargs = {'stream': item}
        ...     d = async_parser(item, [Objectify(rule)], **kwargs)
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        True
    """
    if skip:
        item = kwargs['stream']
    else:
        item = yield ait.coop_reduce(reducer, rules, item)

    return_value(item)

def async_parser(_, objconf, skip=False, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Iter[dict]: Deferred stream

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from meza.fntools import Objectify
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(next(x)['link'])
        ...     objconf = Objectify({'url': get_path('bbc.html')})
        ...     d = async_parser(None, objconf, stream={})
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        file://riko/data/greenhughes.xml
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = get_abspath(objconf.url)
        stream = yield autorss.async_get_rss(url)

    return_value(stream)

def async_parser(item, rules, skip=False, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        item (obj): The entry to process (a DotDict instance)
        rules (List[obj]): the parsed rules (Objectify instances).
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        stream (dict): The original item

    Returns:
        Deferred: twisted.internet.defer.Deferred dict

    Examples:
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from riko.dotdict import DotDict
        >>> from meza.fntools import Objectify
        >>>
        >>> item = DotDict({'content': 'hello world', 'title': 'greeting'})
        >>> match = r'(\w+)\s(\w+)'
        >>> replace = '$2wide'
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(x['content'])
        ...     rule = {'field': 'content', 'match': match, 'replace': replace}
        ...     conf = {'rule': rule, 'multi': False, 'convert': True}
        ...     rules = [Objectify(rule)]
        ...     kwargs = {'stream': item, 'conf': conf}
        ...     d = async_parser(item, rules, **kwargs)
        ...     return d.addCallbacks(callback, logger.error)
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        worldwide
    """
    multi = kwargs['conf']['multi']
    recompile = not multi

    @coroutine
    def async_reducer(item, rules):
        field = rules[0]['field']
        word = item.get(field, **kwargs)
        grouped = group_by(rules, 'flags')
        group_rules = [g[1] for g in grouped] if multi else rules
        reducer = multi_substitute if multi else substitute
        replacement = yield ait.coop_reduce(reducer, group_rules, word)
        combined = merge([item, {field: replacement}])
        return_value(DotDict(combined))

    if skip:
        item = kwargs['stream']
    else:
        new_rules = [get_new_rule(r, recompile=recompile) for r in rules]
        grouped = group_by(new_rules, 'field')
        field_rules = [g[1] for g in grouped]
        item = yield ait.async_reduce(async_reducer, field_rules, item)

    return_value(item)

def async_parser(_, objconf, skip, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default: content)
        stream (dict): The original item

    Returns:
        Deferred: twisted.internet.defer.Deferred Tuple of (stream, skip)

    Examples:
        >>> from six.moves.urllib.request import urlopen
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from riko.lib.utils import Objectify, get_abspath
        >>>
        >>> feed = 'http://feeds.feedburner.com/TechCrunch/'
        >>> url = 'http://query.yahooapis.com/v1/public/yql'
        >>> query = "select * from feed where url='%s'" % feed
        >>> f = urlopen(get_abspath(get_path('yql.xml')))
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(next(x[0])['title'])
        ...     conf = {'query': query, 'url': url, 'debug': False}
        ...     objconf = Objectify(conf)
        ...     kwargs = {'stream': {}, 'response': f}
        ...     d = async_parser(None, objconf, False, **kwargs)
        ...     d.addCallbacks(callback, logger.error)
        ...     d.addCallback(lambda _: f.close())
        ...     return d
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        Bring pizza home
    """
    if skip:
        stream = kwargs['stream']
    else:
        f = kwargs.get('response')

        if not f:
            params = {'q': objconf.query, 'diagnostics': objconf.debug}
            r = yield treq.get(objconf.url, params=params)
            f = yield treq.content(r)

        tree = yield util.xml2etree(f)
        results = next(tree.getElementsByTagName('results'))
        stream = map(util.etree2dict, results.childNodes)

    result = (stream, skip)
    return_value(result)

def async_list_pipe(args):
    source, async_pipeline = args
    output = yield async_pipeline(source)
    return_value(list(output))

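# Usage sketch: async_list_pipe takes a (source, pipeline) pair, which makes
# it easy to build with partial() as in the output coroutine above; the conf
# values here are hypothetical:
#
#     async_pipeline = partial(async_pipe, conf={'url': url})
#     d = async_list_pipe((source, async_pipeline))
#     d.addCallbacks(print, logger.error)  # fires with the fully realized list
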
def list(self):
    result = yield self.async_fetch()
    return_value(list(result))

def async_fetch(self):
    """Fetch all source urls"""
    args = (async_get_pipe, self.zargs, self.connections)
    mapped = yield ait.async_map(*args)
    return_value(multiplex(mapped))

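# A minimal synchronous sketch of the fan-out/fan-in shape used above: map a
# fetcher over many inputs (what ait.async_map does, minus the bounded
# concurrency), then flatten the per-source streams into one. Treating
# multiplex as itertools.chain.from_iterable is an assumption.
from itertools import chain

def fetch_all(get_pipe, zargs):
    mapped = map(get_pipe, zargs)        # stand-in for ait.async_map
    return chain.from_iterable(mapped)   # stand-in for multiplex

# list(fetch_all(lambda url: iter([{'url': url}]), ['a', 'b']))
# -> [{'url': 'a'}, {'url': 'b'}]
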
def list(self):
    output = yield self.output
    return_value(list(output))

def wrapper(items=None, **kwargs):
    module_name = wrapper.__module__.split('.')[-1]
    wrapper.__dict__['name'] = module_name

    defaults = {
        'dictize': True, 'ftype': 'pass', 'ptype': 'pass',
        'objectify': True, 'emit': True, 'assign': module_name}

    combined = cdicts(self.defaults, defaults, self.opts, kwargs)
    extracted = 'extract' in combined
    pdictize = combined.get('listize') if extracted else True

    combined.setdefault('pdictize', pdictize)
    conf = {k: combined[k] for k in self.defaults}
    conf.update(kwargs.get('conf', {}))
    combined.update({'conf': conf})

    # replace conf with dictized version so we can access its
    # attributes even if we already extracted a value
    updates = {'conf': DotDict(conf), 'assign': combined.get('assign')}
    kwargs.update(updates)

    items = items or iter([])
    _INPUT = map(DotDict, items) if combined.get('dictize') else items
    bfuncs = get_broadcast_funcs(**combined)
    types = {combined['ftype'], combined['ptype']}

    if types.difference({'pass', 'none'}):
        dfuncs = get_dispatch_funcs(**combined)
    else:
        dfuncs = None

    pairs = (dispatch(item, bfuncs, dfuncs=dfuncs) for item in _INPUT)
    parsed, _ = dispatch(DotDict(), bfuncs, dfuncs=dfuncs)

    # - operators can't skip items
    # - purposely setting both variables to maps of the same iterable
    #   since only one is intended to be used at any given time
    # - `tuples` is an iterator of tuples of the first two `parsed`
    #   elements
    tuples = ((p[0][0], p[0][1]) for p in pairs)
    orig_stream = (p[0][0] for p in pairs)
    objconf = parsed[1]

    if self.async:
        stream = yield pipe(orig_stream, objconf, tuples, **kwargs)
    else:
        stream = pipe(orig_stream, objconf, tuples, **kwargs)

    sub_type = 'aggregator' if hasattr(stream, 'keys') else 'composer'
    wrapper.__dict__['sub_type'] = sub_type

    # operators can only assign one value per item and can't skip items
    _, assignment = get_assignment(stream, False, **combined)

    if combined.get('emit'):
        stream = assignment
    else:
        singles = (iter([v]) for v in assignment)
        key = combined.get('assign')
        assigned = (assign({}, s, key, one=True) for s in singles)
        stream = utils.multiplex(assigned)

    if self.async:
        return_value(stream)
    else:
        for s in stream:
            yield s

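# A self-contained sketch of the emit=False branch above: each computed value
# becomes a one-item stream, is assigned under the module-name key, and the
# per-item streams are multiplexed back together. Again, treating
# utils.multiplex as itertools.chain.from_iterable is an assumption.
from itertools import chain

def _demo_assign(values, key):
    singles = (iter([v]) for v in values)                  # one stream per value
    assigned = (iter([{key: next(s)}]) for s in singles)   # assign(..., one=True)
    return chain.from_iterable(assigned)                   # utils.multiplex

# list(_demo_assign([3, 7], 'count')) -> [{'count': 3}, {'count': 7}]
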
def async_parser(_, objconf, skip=False, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default: content)
        stream (dict): The original item

    Returns:
        Deferred: twisted.internet.defer.Deferred stream

    Examples:
        >>> from six.moves.urllib.request import urlopen
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from riko.utils import get_abspath
        >>> from meza.fntools import Objectify
        >>>
        >>> feed = 'http://feeds.feedburner.com/TechCrunch/'
        >>> url = 'http://query.yahooapis.com/v1/public/yql'
        >>> query = "select * from feed where url='%s'" % feed
        >>> f = urlopen(get_abspath(get_path('yql.xml')))
        >>>
        >>> def run(reactor):
        ...     callback = lambda x: print(next(x)['title'])
        ...     conf = {'query': query, 'url': url, 'debug': False}
        ...     objconf = Objectify(conf)
        ...     kwargs = {'stream': {}, 'response': f}
        ...     d = async_parser(None, objconf, **kwargs)
        ...     d.addCallbacks(callback, logger.error)
        ...     d.addCallback(lambda _: f.close())
        ...     return d
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ... finally:
        ...     f.close()
        Bring pizza home
    """
    if skip:
        stream = kwargs['stream']
    else:
        f = kwargs.get('response')

        if not f:
            params = {'q': objconf.query, 'diagnostics': objconf.debug}
            r = yield treq.get(objconf.url, params=params)
            f = yield treq.content(r)

        tree = yield util.xml2etree(f)
        results = next(tree.getElementsByTagName('results'))
        stream = map(util.etree2dict, results.childNodes)

    return_value(stream)

def async_parser(_, objconf, skip=False, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default: content)
        stream (dict): The original item

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from meza.fntools import Objectify
        >>>
        >>> @coroutine
        ... def run(reactor):
        ...     xml_url = get_path('ouseful.xml')
        ...     xml_conf = {'url': xml_url, 'xpath': '/rss/channel/item'}
        ...     xml_objconf = Objectify(xml_conf)
        ...     xml_args = (None, xml_objconf)
        ...     html_url = get_path('sciencedaily.html')
        ...     html_conf = {'url': html_url, 'xpath': '/html/head/title'}
        ...     html_objconf = Objectify(html_conf)
        ...     html_args = (None, html_objconf)
        ...     kwargs = {'stream': {}}
        ...
        ...     try:
        ...         xml_stream = yield async_parser(*xml_args, **kwargs)
        ...         html_stream = yield async_parser(*html_args, **kwargs)
        ...         print(next(xml_stream)['title'][:44])
        ...         print(next(html_stream))
        ...     except Exception as e:
        ...         logger.error(e)
        ...         logger.error(traceback.format_exc())
        ...
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        Running “Native” Data Wrangling Applications
        Help Page -- ScienceDaily
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = get_abspath(objconf.url)
        ext = splitext(url)[1].lstrip('.')
        xml = (ext == 'xml') or objconf.strict

        try:
            f = yield io.async_url_open(url)
            tree = yield util.xml2etree(f, xml=xml)
        except Exception as e:
            logger.error(e)
            logger.error(traceback.format_exc())

        elements = xpath(tree, objconf.xpath)
        f.close()
        items = map(util.etree2dict, elements)
        stringified = ({kwargs['assign']: encode(i)} for i in items)
        stream = stringified if objconf.stringify else items

    return_value(stream)