def get_string(content, start, end):
    """Return the chunk of `content` (as bytes) between `start` and `end`."""
    # TODO: convert relative links to absolute
    # TODO: remove the closing tag if using an HTML tag stripped of HTML tags
    # TODO: clean html with Tidy
    content = encode(content)
    start_pos = content.find(encode(start)) if start else 0
    right = content[start_pos + (len(start) if start else 0):]
    end_pos = right[1:].find(encode(end)) + 1 if end else len(right)
    return right[:end_pos] if end_pos > 0 else right
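# A minimal usage sketch for get_string. The encode() helper is not shown in
# this snippet, so a UTF-8 stand-in is defined here purely for illustration.
def encode(text):
    return text.encode('utf-8') if isinstance(text, str) else text

html = '<div><p class="lead">Hello, world!</p></div>'
print(get_string(html, '<p class="lead">', '</p>'))  # b'Hello, world!'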
def async_url_read(url, timeout=0, **kwargs):
    # Fetch remote URLs over HTTP; read anything else as a local file.
    if url.startswith('http'):
        content = getPage(encode(url), timeout=timeout)
    else:
        content = async_read_file(url, StringTransport(), **kwargs)

    return content
def writexml(self, stream, *args, **kwargs):
    if self.raw:
        # Raw nodes are written out verbatim.
        val = decode(self.nodeValue)
    else:
        # Otherwise optionally collapse whitespace, then XML-escape the text.
        v = decode(self.nodeValue)
        v = ' '.join(v.split()) if kwargs.get('strip') else v
        val = escape(v)

    val = encode(val)
    stream.write(val)
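# A stand-alone sketch of the strip-and-escape path above, assuming decode()
# returns text and escape() behaves like xml.sax.saxutils.escape.
from xml.sax.saxutils import escape

v = '  A  <b>bold</b>\n  claim  '
v = ' '.join(v.split())  # collapse runs of whitespace when 'strip' is set
print(escape(v))         # A &lt;b&gt;bold&lt;/b&gt; claim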
def async_url_open(url, timeout=0, **kwargs):
    if url.startswith('http'):
        # Download remote URLs to a temporary file first.
        page = NamedTemporaryFile(delete=False)
        new_url = page.name
        yield downloadPage(encode(url), page, timeout=timeout)
    else:
        page, new_url = None, url

    f = yield async_get_file(new_url, StringTransport(), **kwargs)

    if page:
        # Clean up the temporary download once the file object is obtained.
        page.close()
        remove(page.name)

    return_value(f)
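# A hedged usage sketch for async_url_open, patterned after the async_parser
# doctests below. It assumes riko's event-loop helpers (react, coroutine,
# FakeReactor, get_path) and that async_url_open is in scope; with a local
# path the HTTP branch is skipped and a closeable file-like object is returned.
from riko import get_path
from riko.bado import coroutine, react
from riko.bado.mock import FakeReactor

@coroutine
def demo(reactor):
    f = yield async_url_open(get_path('ouseful.xml'))
    f.close()  # async_parser below treats the result as a closeable file object

try:
    react(demo, _reactor=FakeReactor())
except SystemExit:
    pass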
def async_parser(_, objconf, skip=False, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default: content)
        stream (dict): The original item

    Returns:
        Iter[dict]: The stream of items

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from meza.fntools import Objectify
        >>>
        >>> @coroutine
        ... def run(reactor):
        ...     xml_url = get_path('ouseful.xml')
        ...     xml_conf = {'url': xml_url, 'xpath': '/rss/channel/item'}
        ...     xml_objconf = Objectify(xml_conf)
        ...     xml_args = (None, xml_objconf)
        ...     html_url = get_path('sciencedaily.html')
        ...     html_conf = {'url': html_url, 'xpath': '/html/head/title'}
        ...     html_objconf = Objectify(html_conf)
        ...     html_args = (None, html_objconf)
        ...     kwargs = {'stream': {}}
        ...
        ...     try:
        ...         xml_stream = yield async_parser(*xml_args, **kwargs)
        ...         html_stream = yield async_parser(*html_args, **kwargs)
        ...         print(next(xml_stream)['title'][:44])
        ...         print(next(html_stream))
        ...     except Exception as e:
        ...         logger.error(e)
        ...         logger.error(traceback.format_exc())
        ...
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        Running “Native” Data Wrangling Applications
        Help Page -- ScienceDaily
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = get_abspath(objconf.url)
        ext = splitext(url)[1].lstrip('.')
        xml = (ext == 'xml') or objconf.strict

        try:
            f = yield io.async_url_open(url)
            tree = yield util.xml2etree(f, xml=xml)
        except Exception as e:
            logger.error(e)
            logger.error(traceback.format_exc())

        elements = xpath(tree, objconf.xpath)
        f.close()
        items = map(util.etree2dict, elements)
        stringified = ({kwargs['assign']: encode(i)} for i in items)
        stream = stringified if objconf.stringify else items

    return_value(stream)
def writexml(self, stream, *args, **kwargs):
    val = encode(self.data)
    stream.write("<!--%s-->" % val)
def parseString(content, *args, **kwargs):
    # Wrap the string in a binary buffer so parse() can treat it as a file.
    f = BytesIO(encode(content))
    return parse(f, *args, **kwargs)
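# A minimal usage sketch for parseString, assuming parse() accepts any binary
# file-like object (e.g. a minidom- or etree-style parser) and that encode()
# yields UTF-8 bytes; the concrete return type depends on parse().
doc = parseString('<rss><channel><title>Example</title></channel></rss>')
# With a minidom-style parse(), doc.firstChild.tagName would be 'rss'.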
def async_parser(_, objconf, skip, **kwargs):
    """ Asynchronously parses the pipe content

    Args:
        _ (None): Ignored
        objconf (obj): The pipe configuration (an Objectify instance)
        skip (bool): Don't parse the content
        kwargs (dict): Keyword arguments

    Kwargs:
        assign (str): Attribute to assign parsed content (default: content)
        stream (dict): The original item

    Returns:
        Tuple(Iter[dict], bool): Tuple of (stream, skip)

    Examples:
        >>> from riko import get_path
        >>> from riko.bado import react
        >>> from riko.bado.mock import FakeReactor
        >>> from riko.lib.utils import Objectify
        >>>
        >>> @coroutine
        ... def run(reactor):
        ...     xml_url = get_path('ouseful.xml')
        ...     xml_conf = {'url': xml_url, 'xpath': '/rss/channel/item'}
        ...     xml_objconf = Objectify(xml_conf)
        ...     xml_args = (None, xml_objconf, False)
        ...     html_url = get_path('sciencedaily.html')
        ...     html_conf = {'url': html_url, 'xpath': '/html/head/title'}
        ...     html_objconf = Objectify(html_conf)
        ...     html_args = (None, html_objconf, False)
        ...     kwargs = {'stream': {}}
        ...
        ...     try:
        ...         xml_stream, _ = yield async_parser(*xml_args, **kwargs)
        ...         html_stream, _ = yield async_parser(*html_args, **kwargs)
        ...         print(next(xml_stream)['title'][:44])
        ...         print(next(html_stream))
        ...     except Exception as e:
        ...         logger.error(e)
        ...         logger.error(traceback.format_exc())
        ...
        >>>
        >>> try:
        ...     react(run, _reactor=FakeReactor())
        ... except SystemExit:
        ...     pass
        ...
        Running “Native” Data Wrangling Applications
        Help Page -- ScienceDaily
    """
    if skip:
        stream = kwargs['stream']
    else:
        url = utils.get_abspath(objconf.url)
        ext = splitext(url)[1].lstrip('.')
        xml = (ext == 'xml') or objconf.strict

        try:
            f = yield io.async_url_open(url)
            tree = yield util.xml2etree(f, xml=xml)
        except Exception as e:
            logger.error(e)
            logger.error(traceback.format_exc())

        elements = utils.xpath(tree, objconf.xpath)
        f.close()
        items = map(util.etree2dict, elements)
        stringified = ({kwargs['assign']: encode(i)} for i in items)
        stream = stringified if objconf.stringify else items

    result = (stream, skip)
    return_value(result)