def pipe_sort(context=None, _INPUT=None, conf=None, **kwargs): """An operator that sorts the input source according to the specified key. Not loopable. Not lazy. Parameters ---------- context : pipe2py.Context object _INPUT : pipe2py.modules pipe like object (iterable of items) kwargs -- other inputs, e.g. to feed terminals for rule values conf : { 'KEY': [ { 'field': {'type': 'text', 'value': 'title'}, 'dir': {'type': 'text', 'value': 'DESC'} } ] } Returns ------- _OUTPUT : generator of sorted items """ test = kwargs.pop('pass_if', None) _pass = utils.get_pass(test=test) key_defs = imap(DotDict, utils.listize(conf['KEY'])) get_value = partial(utils.get_value, **kwargs) parse_conf = partial(utils.parse_conf, parse_func=get_value, **kwargs) keys = imap(parse_conf, key_defs) order = ('%s%s' % ('-' if k.dir == 'DESC' else '', k.field) for k in keys) comparers = map(get_comparer, order) cmp_func = partial(multikeysort, comparers=comparers) _OUTPUT = _INPUT if _pass else iter(sorted(_INPUT, cmp=cmp_func)) return _OUTPUT
def pipe_feedautodiscovery(context=None, _INPUT=None, conf=None, **kwargs): """A source that searches for and returns feed links found in a page. Loopable. Parameters ---------- context : pipe2py.Context object _INPUT : pipeforever pipe or an iterable of items or fields conf : URL -- url Yields ------ _OUTPUT : items """ conf = DotDict(conf) urls = utils.listize(conf['URL']) for item in _INPUT: for item_url in urls: url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs) url = utils.get_abspath(url) if context and context.verbose: print "pipe_feedautodiscovery loading:", url for entry in autorss.getRSSLink(url.encode('utf-8')): yield {'link': entry} # todo: add rel, type, title if item.get('forever'): # _INPUT is pipeforever and not a loop, # so we just yield our item once break
def pipe_fetchsitefeed(context=None, _INPUT=None, conf=None, **kwargs): """A source that fetches and parses the first feed found on one or more sites. Loopable. Parameters ---------- context : pipe2py.Context object _INPUT : pipeforever pipe or an iterable of items or fields conf : URL -- url Yields ------ _OUTPUT : items """ conf = DotDict(conf) urls = utils.listize(conf['URL']) for item in _INPUT: for item_url in urls: url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs) url = utils.get_abspath(url) if context and context.verbose: print "pipe_fetchsitefeed loading:", url for link in autorss.getRSSLink(url.encode('utf-8')): parsed = speedparser.parse(urlopen(link).read()) for entry in utils.gen_entries(parsed): yield entry if item.get('forever'): # _INPUT is pipeforever and not a loop, # so we just yield our item once break
def _get_broadcast_funcs(pieces, ftype='with', **kwargs): test = kwargs.pop('pass_if', None) listize = kwargs.pop('listize', True) parse = kwargs.pop('parse', True) pdictize = kwargs.pop('pdictize', True) cust_func = kwargs.pop('cust_func', False) get_value = partial(utils.get_value, **kwargs) get_pass = partial(utils.get_pass, test=test) get_with = partial(utils.get_with, **kwargs) if parse: get_func = partial(utils.parse_conf, parse_func=get_value, **kwargs) else: get_func = get_value if listize: listed = utils.listize(pieces) piece_defs = map(DotDict, listed) if pdictize else listed get_pieces = lambda item: imap(get_func, piece_defs, repeat(item)) else: piece_defs = DotDict(pieces) if pdictize else pieces get_pieces = partial(get_func, piece_defs) return (get_pieces, get_with, get_pass, cust_func)
def pipe_filter(context=None, _INPUT=None, conf=None, **kwargs): """An operator that filters for source items matching the given rules. Not loopable. Parameters ---------- context : pipe2py.Context object _INPUT : pipe2py.modules pipe like object (iterable of items) conf : { 'MODE': {'value': <'permit' or 'block'>}, 'COMBINE': {'value': <'and' or 'or'>} 'RULE': [ { 'field': {'value': 'search field'}, 'op': {'value': 'one of SWITCH above'}, 'value': {'value': 'search term'} } ] } kwargs : other inputs, e.g., to feed terminals for rule values Returns ------- _OUTPUT : generator of filtered items Examples -------- >>> import os.path as p >>> from pipe2py.modules.pipeforever import pipe_forever >>> from pipe2py.modules.pipefetchdata import pipe_fetchdata >>> parent = p.dirname(p.dirname(__file__)) >>> file_name = p.abspath(p.join(parent, 'data', 'gigs.json')) >>> path = 'value.items' >>> url = 'file://%s' % file_name >>> conf = {'URL': {'value': url}, 'path': {'value': path}} >>> input = pipe_fetchdata(_INPUT=pipe_forever(), conf=conf) >>> mode = {'value': 'permit'} >>> combine = {'value': 'and'} >>> rule = [{'field': {'value': 'title'}, 'op': {'value': 'contains'}, \ 'value': {'value': 'web'}}] >>> conf = {'MODE': mode, 'COMBINE': combine, 'RULE': rule} >>> pipe_filter(_INPUT=input, conf=conf).next()['title'] u'E-Commerce Website Developer | Elance Job' >>> rule = [{'field': {'value': 'title'}, 'op': {'value': 'contains'}, \ 'value': {'value': 'kjhlked'}}] >>> conf = {'MODE': mode, 'COMBINE': combine, 'RULE': rule} >>> list(pipe_filter(_INPUT=input, conf=conf)) [] """ conf = DotDict(conf) test = kwargs.pop('pass_if', None) permit = conf.get('MODE', **kwargs) == 'permit' combine = conf.get('COMBINE', **kwargs) if not combine in {'and', 'or'}: raise Exception("Invalid combine: %s. (Expected 'and' or 'or')" % combine) rule_defs = map(DotDict, utils.listize(conf['RULE'])) get_pass = partial(utils.get_pass, test=test) get_value = partial(utils.get_value, **kwargs) parse_conf = partial(utils.parse_conf, parse_func=get_value, **kwargs) get_rules = lambda i: imap(parse_conf, rule_defs, repeat(i)) funcs = [COMBINE_BOOLEAN[combine], utils.passthrough, utils.passthrough] inputs = imap(DotDict, _INPUT) splits = utils.broadcast(inputs, get_rules, utils.passthrough, get_pass) outputs = starmap(partial(parse_rules, **kwargs), splits) parsed = utils.dispatch(outputs, *funcs) gathered = starmap(partial(parse_result, permit=permit), parsed) _OUTPUT = ifilter(None, gathered) return _OUTPUT
def pipe_filter(context=None, _INPUT=None, conf=None, **kwargs): """An operator that filters for source items matching the given rules. Not loopable. Parameters ---------- context : pipe2py.Context object _INPUT : pipe2py.modules pipe like object (iterable of items) conf : { 'MODE': {'value': <'permit' or 'block'>}, 'COMBINE': {'value': <'and' or 'or'>} 'RULE': [ { 'field': {'value': 'search field'}, 'op': {'value': 'one of SWITCH above'}, 'value': {'value': 'search term'} } ] } kwargs : other inputs, e.g., to feed terminals for rule values Returns ------- _OUTPUT : generator of filtered items Examples -------- >>> import os.path as p >>> from pipe2py.modules.pipeforever import pipe_forever >>> from pipe2py.modules.pipefetchdata import pipe_fetchdata >>> parent = p.dirname(p.dirname(__file__)) >>> file_name = p.abspath(p.join(parent, 'data', 'gigs.json')) >>> path = 'value.items' >>> url = 'file://%s' % file_name >>> conf = {'URL': {'value': url}, 'path': {'value': path}} >>> input = pipe_fetchdata(_INPUT=pipe_forever(), conf=conf) >>> mode = {'value': 'permit'} >>> combine = {'value': 'and'} >>> rule = [{'field': {'value': 'title'}, 'op': {'value': 'contains'}, \ 'value': {'value': 'web'}}] >>> conf = {'MODE': mode, 'COMBINE': combine, 'RULE': rule} >>> pipe_filter(_INPUT=input, conf=conf).next()['title'] u'E-Commerce Website Developer | Elance Job' >>> rule = [{'field': {'value': 'title'}, 'op': {'value': 'contains'}, \ 'value': {'value': 'kjhlked'}}] >>> conf = {'MODE': mode, 'COMBINE': combine, 'RULE': rule} >>> list(pipe_filter(_INPUT=input, conf=conf)) [] """ conf = DotDict(conf) test = kwargs.pop('pass_if', None) permit = conf.get('MODE', **kwargs) == 'permit' combine = conf.get('COMBINE', **kwargs) if not combine in {'and', 'or'}: raise Exception( "Invalid combine: %s. (Expected 'and' or 'or')" % combine) rule_defs = map(DotDict, utils.listize(conf['RULE'])) get_pass = partial(utils.get_pass, test=test) get_value = partial(utils.get_value, **kwargs) parse_conf = partial(utils.parse_conf, parse_func=get_value, **kwargs) get_rules = lambda i: imap(parse_conf, rule_defs, repeat(i)) funcs = [COMBINE_BOOLEAN[combine], utils.passthrough, utils.passthrough] inputs = imap(DotDict, _INPUT) splits = utils.broadcast(inputs, get_rules, utils.passthrough, get_pass) outputs = starmap(partial(parse_rules, **kwargs), splits) parsed = utils.dispatch(outputs, *funcs) gathered = starmap(partial(parse_result, permit=permit), parsed) _OUTPUT = ifilter(None, gathered) return _OUTPUT
def pipe_fetchpage(context=None, _INPUT=None, conf=None, **kwargs): """A source that fetches the content of a given web site as a string. Loopable. context : pipe2py.Context object _INPUT : pipeforever asyncPipe or an iterable of items or fields conf : dict URL -- url object contain the URL to download from -- string from where to start the input to -- string to limit the input token -- if present, split the input on this token to generate items Description: http://pipes.yahoo.com/pipes/docs?doc=sources#FetchPage TODOS: - don't retrieve pages larger than 200k - don't retrieve if page is not indexable. - item delimiter removes the closing tag if using a HTML tag (not documented but happens) - items should be cleaned, i.e. stripped of HTML tags Yields ------ _OUTPUT : items """ conf = DotDict(conf) split_token = conf.get('token', **kwargs) urls = utils.listize(conf['URL']) for item in _INPUT: for item_url in urls: url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs) url = utils.get_abspath(url) if not url: continue f = urlopen(url) # TODO: it seems that Yahoo! converts relative links to # absolute. This needs to be done on the content but seems to # be a non-trival task python? content = unicode(f.read(), 'utf-8') if context and context.verbose: print '............Content .................' print content print '...............EOF...................' parsed = _parse_content(content, conf, **kwargs) items = parsed.split(split_token) if split_token else [parsed] if context and context.verbose: print "FetchPage: found count items:", len(items) for i in items: if context and context.verbose: print "--------------item data --------------------" print i print "--------------EOF item data ----------------" yield {"content": i} if item.get('forever'): # _INPUT is pipeforever and not a loop, # so we just yield our item once break
def pipe_xpathfetchpage(context=None, _INPUT=None, conf=None, **kwargs): """A source that fetches the content of a given website as DOM nodes or a string. Loopable. context : pipe2py.Context object _INPUT : pipeforever pipe or an iterable of items or fields conf : dict URL -- url object contain the URL to download xpath -- xpath to extract html5 -- use html5 parser? useAsString -- emit items as string? TODOS: - don't retrieve pages larger than 1.5MB - don't retrieve if page is not indexable. Yields ------ _OUTPUT : items """ conf = DotDict(conf) urls = utils.listize(conf['URL']) for item in _INPUT: for item_url in urls: url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs) url = utils.get_abspath(url) f = urlopen(url) # TODO: it seems that Yahoo! converts relative links to # absolute. This needs to be done on the content but seems to # be a non-trival task python? content = unicode(f.read(), 'utf-8') if context and context.verbose: print '............Content .................' print content print '...............EOF...................' xpath = conf.get('xpath', **kwargs) html5 = conf.get('html5', **kwargs) == 'true' use_as_string = conf.get('useAsString', **kwargs) == 'true' tree = html5parser.parse(f) if html5 else html.parse(f) root = tree.getroot() items = root.xpath(xpath) if context and context.verbose: print 'XPathFetchPage: found count items:', len(items) for etree in items: i = utils.etree_to_dict(etree) if context and context.verbose: print '--------------item data --------------------' print i print '--------------EOF item data ----------------' if use_as_string: yield {'content': unicode(i)} else: yield i if item.get('forever'): # _INPUT is pipeforever and not a loop, # so we just yield our item once break