def pipe_feedautodiscovery(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that searches for and returns feed links found in a page.
    Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : URL -- url

    Yields
    ------
    _OUTPUT : items
    """
    conf = DotDict(conf)
    urls = utils.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = utils.get_abspath(url)

            if context and context.verbose:
                print "pipe_feedautodiscovery loading:", url

            for entry in autorss.getRSSLink(url.encode('utf-8')):
                yield {'link': entry}
                # todo: add rel, type, title

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
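
# Illustrative sketch (not part of pipe2py): one way a feed autodiscovery
# helper like autorss.getRSSLink can find feed links in a page -- scan <link>
# elements whose rel/type mark them as RSS or Atom feeds. lxml is assumed to
# be available, as elsewhere in this codebase; the function name is made up.
from lxml import html as _html

def find_feed_links(page_content, base_url):
    doc = _html.fromstring(page_content)
    # resolve relative hrefs against the page url
    doc.make_links_absolute(base_url)
    return doc.xpath(
        '//link[@rel="alternate"]'
        '[contains(@type, "rss") or contains(@type, "atom")]/@href')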
def pipe_subelement(context=None, _INPUT=None, conf=None, **kwargs):
    """An operator that extracts select sub-elements from a feed. Not
    loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipe2py.modules pipe like object (iterable of items)
    conf : {'path': {'value': <element path>}}

    Yields
    ------
    _OUTPUT : items
    """
    path = DotDict(conf).get('path', **kwargs)

    for item in _INPUT:
        element = DotDict(item).get(path, **kwargs)

        for i in utils.gen_items(element):
            yield {'content': i}

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
def _gen_rules(rule_defs, **kwargs):
    for rule in rule_defs:
        rule = DotDict(rule)

        # flags = re.DOTALL  # DOTALL was the default for pipe2py previously
        # flag 'm'
        flags = re.MULTILINE if 'multilinematch' in rule else 0

        # flag 'i'; this name is reversed from its meaning
        flags |= re.IGNORECASE if 'casematch' in rule else 0

        # flag 's'
        flags |= re.DOTALL if 'singlelinematch' in rule else 0

        # todo: 'globalmatch' is the default in python
        # todo: if set, re.sub() below would get count=0 and by default would
        # get count=1

        # todo: use subkey?
        match = rule.get('match', **kwargs)

        # compile for speed and we need to pass flags
        matchc = re.compile(match, flags)

        # todo: use subkey?
        replace = rule.get('replace', **kwargs) or ''

        # Convert regex to Python format
        # todo: use a common routine for this
        # map $1 to \1 etc.
        # todo: also need to escape any existing \1 etc.
        replace = re.sub(r'\$(\d+)', r'\\\1', replace)

        yield (rule.get('field'), matchc, replace)
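
# Illustrative sketch (not part of pipe2py): the '$1' -> '\1' conversion above
# turns Yahoo Pipes (PHP-style) backreferences into Python's re.sub syntax.
import re

replace = re.sub(r'\$(\d+)', r'\\\1', 'x=$1, y=$2')
assert replace == r'x=\1, y=\2'
# ... and the converted string works as a substitution template:
assert re.sub(r'(\w+)-(\w+)', replace, 'a-b') == 'x=a, y=b'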
def pipe_fetchsitefeed(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that fetches and parses the first feed found on one or more
    sites. Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : URL -- url

    Yields
    ------
    _OUTPUT : items
    """
    conf = DotDict(conf)
    urls = utils.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = utils.get_abspath(url)

            if context and context.verbose:
                print "pipe_fetchsitefeed loading:", url

            for link in autorss.getRSSLink(url.encode('utf-8')):
                parsed = speedparser.parse(urlopen(link).read())

                for entry in utils.gen_entries(parsed):
                    yield entry

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
def pipe_stringtokenizer(context=None, _INPUT=None, conf=None, **kwargs):
    """Splits a string into tokens delimited by separators.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        to-str -- separator string

    Yields (_OUTPUT):
    tokens of the input string
    """
    conf = DotDict(conf)
    delim = conf.get('to-str', **kwargs)

    for item in _INPUT:
        for chunk in item.split(delim):
            yield {'content': chunk}

        try:
            forever = item.get('forever')
        except AttributeError:
            forever = False

        if forever:
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
def pipe_dateformat(context=None, _INPUT=None, conf=None, **kwargs):
    """Formats a datetime value. Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipedatebuilder pipe like object (iterable of date timetuples)
    conf : {
        'format': {'value': <'%B %d, %Y'>},
        'timezone': {'value': <'EST'>}
    }

    Yields
    ------
    _OUTPUT : formatted dates
    """
    conf = DotDict(conf)
    loop_with = kwargs.pop('with', None)
    date_format = conf.get('format', **kwargs)
    # timezone = conf.get('timezone', **kwargs)

    for item in _INPUT:
        _with = item.get(loop_with, **kwargs) if loop_with else item

        try:
            # todo: check that all PHP formats are covered by Python
            date_string = time.strftime(date_format, _with)
        except TypeError as e:
            if context and context.verbose:
                print 'Error formatting date: %s' % item
                print e

            continue
        else:
            yield date_string
def pipe_dateformat(context=None, _INPUT=None, conf=None, **kwargs):
    """This source formats a date.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        format -- date format

    Yields (_OUTPUT):
    formatted date
    """
    conf = DotDict(conf)
    date_format = conf.get('format', **kwargs)

    for item in _INPUT:
        if hasattr(item, 'tm_year'):
            # item is already a timetuple
            date = item
        else:
            date = util.get_date(item)
            date = date.timetuple() if date else item

        if not date:
            raise Exception('Unexpected date format: %s' % date_format)

        try:
            # todo: check that all PHP formats are covered by Python
            date_string = time.strftime(date_format, date)
        except TypeError:
            # silent error handling e.g. if item is not a date
            continue
        else:
            yield date_string
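
# Illustrative sketch (not part of pipe2py): both pipe_dateformat variants
# bottom out in time.strftime over a timetuple. The assertion assumes an
# English locale, since %B expands to the locale's month name.
import time

tt = time.strptime('12/2/2014', '%m/%d/%Y')
assert time.strftime('%B %d, %Y', tt) == 'December 02, 2014'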
def _gen_rules(rule_defs, **kwargs):
    rule_defs = util.listize(rule_defs)

    # todo: use subkey?
    for rule_def in rule_defs:
        rule_def = DotDict(rule_def)
        op = rule_def.get('op', **kwargs)
        newfield = {'subkey': rule_def.get('field')}
        newval = rule_def.get('newval', **kwargs)
        yield (op, newfield, newval)
def pipe_createrss(context=None, _INPUT=None, conf=None, **kwargs):
    """An operator that converts a source into an RSS stream. Not loopable.
    """
    conf = DotDict(conf)

    for item in _INPUT:
        item = DotDict(item)

        yield {
            value: item.get(conf.get(key, **kwargs))
            for key, value in RSS_FIELDS.items()}
def pipe_yql(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that issues YQL queries. Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : yqlquery -- YQL query
        # todo: handle envURL

    Yields
    ------
    _OUTPUT : query results
    """
    # todo: get from a config/env file
    url = "http://query.yahooapis.com/v1/public/yql"
    conf = DotDict(conf)
    query = conf['yqlquery']

    for item in _INPUT:
        item = DotDict(item)
        yql = utils.get_value(query, item, **kwargs)

        # note: we use the default format of xml since json loses some
        # structure
        # todo: diagnostics=true e.g. if context.test
        # todo: consider paging for large result sets
        r = requests.get(url, params={'q': yql}, stream=True)

        # Parse the response
        tree = parse(r.raw)

        if context and context.verbose:
            print "pipe_yql loading xml:", yql

        root = tree.getroot()

        # note: query also has row count
        results = root.find('results')

        # Convert xml into generation of dicts
        for element in results.getchildren():
            yield utils.etree_to_dict(element)

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
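
# Illustrative sketch (not part of pipe2py): a minimal stand-in for the
# utils.etree_to_dict helper used above and in the fetch modules. The real
# helper is richer (attributes, repeated tags, namespaces); this one only
# maps child elements to keys and leaf text to values.
from xml.etree import ElementTree as ET

def etree_to_dict_sketch(element):
    # note: repeated child tags would collide in this naive version
    d = dict((child.tag, etree_to_dict_sketch(child)) for child in element)
    return d or (element.text or '')

root = ET.fromstring('<r><a>1</a><b><c>2</c></b></r>')
assert etree_to_dict_sketch(root) == {'a': '1', 'b': {'c': '2'}}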
def pipe_tail(context=None, _INPUT=None, conf=None, **kwargs):
    """This operator truncates the number of items in a feed.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- terminal, if the truncation value is wired in
    conf:
        count -- length of the truncated feed, if specified literally

    Yields (_OUTPUT):
    tail-truncated list of source items
    """
    conf = DotDict(conf)
    limit = conf.get('count', func=int, **kwargs)

    for item in deque(_INPUT, limit):
        yield item
def pipe_tail(context=None, _INPUT=None, conf=None, **kwargs):
    """Returns a specified number of items from the bottom of a feed.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipe2py.modules pipe like object (iterable of items)
    kwargs : terminal, if the truncation value is wired in
    conf : count -- length of the truncated feed, if specified literally

    Yields
    ------
    _OUTPUT : items
    """
    conf = DotDict(conf)
    limit = conf.get('count', func=int, **kwargs)

    for item in deque(_INPUT, limit):
        yield item
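
# Illustrative sketch (not part of pipe2py): a deque with a maxlen keeps only
# the last `limit` items pushed through it, which is how both pipe_tail
# variants return the bottom of a feed in one pass with bounded memory.
from collections import deque

assert list(deque(range(10), 3)) == [7, 8, 9]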
def pipe_datebuilder(context=None, _INPUT=None, conf=None, **kwargs):
    """A date module that converts a text string into a datetime value. Useful
    as terminal data. Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items
    conf : {'DATE': {'type': 'datetime', 'value': '12/2/2014'}}

    Yields
    ------
    _OUTPUT : date timetuples
    """
    conf = DotDict(conf)

    for item in _INPUT:
        _input = DotDict(item)
        date = utils.get_value(conf['DATE'], _input, **kwargs).lower()

        if date.endswith(' day') or date.endswith(' days'):
            count = int(date.split(' ')[0])
            new_date = dt.today() + timedelta(days=count)
        elif date.endswith(' year') or date.endswith(' years'):
            count = int(date.split(' ')[0])
            new_date = dt.today().replace(year=dt.today().year + count)
        else:
            new_date = SWITCH.get(date)

        if not new_date:
            new_date = utils.get_date(date)

        if not new_date:
            raise Exception('Unrecognized date string: %s' % date)

        yield new_date.timetuple()
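
# Illustrative sketch (not part of pipe2py): the relative-date branches above.
# A string like '2 days' becomes today plus a 2-day delta; '-3 days' also
# parses, since int() accepts a leading sign.
from datetime import datetime as dt, timedelta

date = '2 days'
count = int(date.split(' ')[0])
new_date = dt.today() + timedelta(days=count)
assert new_date.date() == (dt.today() + timedelta(days=2)).date()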
def pipe_uniq(context=None, _INPUT=None, conf=None, **kwargs):
    """This operator filters out non unique items according to the specified
    field.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        field -- field to be unique

    Yields (_OUTPUT):
    source items, one per unique field value
    """
    seen = set()
    conf = DotDict(conf)
    field = conf.get('field', **kwargs)

    for item in _INPUT:
        value = DotDict(item).get(field)

        if value not in seen:
            seen.add(value)
            yield item
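
# Illustrative sketch (not part of pipe2py): the seen-set pattern above keeps
# the first item per field value and preserves input order.
def uniq_by(items, field):
    seen = set()

    for item in items:
        value = item.get(field)

        if value not in seen:
            seen.add(value)
            yield item

assert list(uniq_by([{'id': 1}, {'id': 1}, {'id': 2}], 'id')) == [
    {'id': 1}, {'id': 2}]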
def pipe_substr(context=None, _INPUT=None, conf=None, **kwargs):
    """Returns a substring.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        from -- starting character
        length -- number of characters to return

    Yields (_OUTPUT):
    portion of source string
    """
    conf = DotDict(conf)
    start = conf.get('from', func=int, **kwargs)
    length = conf.get('length', func=int, **kwargs)

    for item in _INPUT:
        yield item[start:start + length]

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
def pipe_urlbuilder(context=None, _INPUT=None, conf=None, **kwargs):
    """This source builds a url and yields it forever.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- not used
    conf:
        BASE -- base
        PATH -- path elements
        PARAM -- query parameters

    Yields (_OUTPUT):
    url
    """
    conf = DotDict(conf)
    paths = util.listize(conf.get('PATH'))  # use .get() in case 'PATH' isn't set
    param_defs = util.listize(conf['PARAM'])
    url = None

    for item in _INPUT:
        # if _INPUT is pipeforever and not a loop, get values from cache
        if not url:
            item = DotDict(item)
            forever = item.get('forever')
            url = conf.get('BASE', **kwargs)

            if not url.endswith('/'):
                url += '/'

            url += "/".join(str(p) for p in paths if p)
            url = url.rstrip("/")

            # Ensure url is valid
            url = util.url_quote(url)

            params = dict(_gen_params(param_defs, item, **kwargs))

            if params and params.keys() != [u'']:
                url += "?" + urllib.urlencode(params)

        yield url
        url = url if forever else None
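
# Illustrative sketch (not part of pipe2py): the base/path/params assembly
# above, reduced to its core with Python 2's urllib, which pipe_urlbuilder
# also uses for the query string.
import urllib

base, paths, params = 'http://example.com', ['a', 'b'], {'q': 'x y'}
url = base.rstrip('/') + '/' + '/'.join(paths)

if params:
    url += '?' + urllib.urlencode(params)

assert url == 'http://example.com/a/b?q=x+y'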
def pipe_rssitembuilder(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that builds an rss item. Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever asyncPipe or an iterable of items or fields
    conf : {
        'mediaContentType': {'type': 'text', 'value': ''},
        'mediaContentHeight': {'type': 'text', 'value': ''},
        'mediaContentWidth': {'type': 'text', 'value': ''},
        'mediaContentURL': {'type': 'text', 'value': 'url'},
        'mediaThumbHeight': {'type': 'text', 'value': ''},
        'mediaThumbWidth': {'type': 'text', 'value': ''},
        'mediaThumbURL': {'type': 'text', 'value': 'url'},
        'description': {'type': 'text', 'value': 'description'},
        'pubdate': {'type': 'text', 'value': 'pubdate'},
        'author': {'type': 'text', 'value': 'author'},
        'title': {'type': 'text', 'value': 'title'},
        'link': {'type': 'text', 'value': 'url'},
        'guid': {'type': 'text', 'value': 'guid'},
    }

    Yields
    ------
    _OUTPUT : items
    """
    get_value = partial(utils.get_value, **kwargs)
    pkwargs = utils.combine_dicts({'parse_func': get_value}, kwargs)
    parse_conf = partial(utils.parse_conf, DotDict(conf), **pkwargs)
    get_RSS = lambda key, value: (RSS.get(key, key), value)
    get_YAHOO = lambda key, value: (YAHOO.get(key), value)
    make_dict = lambda func, conf: dict(starmap(func, conf.iteritems()))
    clean_dict = lambda d: dict(i for i in d.items() if all(i))
    funcs = [partial(make_dict, get_RSS), partial(make_dict, get_YAHOO)]

    finite = utils.finitize(_INPUT)
    inputs = imap(DotDict, finite)
    confs = imap(parse_conf, inputs)
    splits = utils.broadcast(confs, *funcs)
    combined = starmap(utils.combine_dicts, splits)
    result = imap(clean_dict, combined)
    _OUTPUT = imap(DotDict, result)
    return _OUTPUT
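
# Illustrative sketch (not part of pipe2py): the get_RSS/make_dict pairing
# above is a key-renaming map applied with starmap. The RENAMES table here is
# hypothetical, not the real RSS/YAHOO mapping; unknown keys fall through
# unchanged via dict.get(key, key).
from itertools import starmap

RENAMES = {'mediaContentURL': 'media:content.url'}  # hypothetical mapping
rename = lambda key, value: (RENAMES.get(key, key), value)

d = {'mediaContentURL': 'http://img', 'title': 't'}
assert dict(starmap(rename, d.iteritems())) == {
    'media:content.url': 'http://img', 'title': 't'}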
def _get_broadcast_funcs(pieces, ftype='with', **kwargs):
    test = kwargs.pop('pass_if', None)
    listize = kwargs.pop('listize', True)
    parse = kwargs.pop('parse', True)
    pdictize = kwargs.pop('pdictize', True)
    cust_func = kwargs.pop('cust_func', False)
    get_value = partial(utils.get_value, **kwargs)
    get_pass = partial(utils.get_pass, test=test)
    get_with = partial(utils.get_with, **kwargs)

    if parse:
        get_func = partial(utils.parse_conf, parse_func=get_value, **kwargs)
    else:
        get_func = get_value

    if listize:
        listed = utils.listize(pieces)
        piece_defs = map(DotDict, listed) if pdictize else listed
        get_pieces = lambda item: imap(get_func, piece_defs, repeat(item))
    else:
        piece_defs = DotDict(pieces) if pdictize else pieces
        get_pieces = partial(get_func, piece_defs)

    return (get_pieces, get_with, get_pass, cust_func)
def pipe_csv(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that fetches and parses a csv file to yield items. Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : URL -- url
        skip -- number of header rows to skip
        col_mode -- column name source: row=header row(s),
                    custom=defined in col_name
        col_name -- list of custom column names
        col_row_start -- first column header row
        col_row_end -- last column header row
        separator -- column separator

    Yields
    ------
    _OUTPUT : items

    Note:
    Current restrictions:
      separator must be 1 character
      assumes every row has exactly the expected number of fields, as defined
      in the header
    """
    conf = DotDict(conf)
    conf_sep = conf['separator']
    conf_mode = conf['col_mode']
    col_name = conf['col_name']

    for item in _INPUT:
        item = DotDict(item)
        url = utils.get_value(conf['URL'], item, **kwargs)
        url = utils.get_abspath(url)
        separator = utils.get_value(conf_sep, item, encode=True, **kwargs)
        skip = int(utils.get_value(conf['skip'], item, **kwargs))
        col_mode = utils.get_value(conf_mode, item, **kwargs)

        f = urlopen(url)

        if context and context.verbose:
            print "pipe_csv loading:", url

        for i in xrange(skip):
            f.next()

        reader = csv.UnicodeReader(f, delimiter=separator)
        fieldnames = []

        if col_mode == 'custom':
            fieldnames = [DotDict(x).get() for x in col_name]
        else:
            fieldnames = _gen_fieldnames(conf, reader, item, **kwargs)

        for rows in reader:
            yield dict(zip(fieldnames, rows))

        f.close()

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
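
# Illustrative sketch (not part of pipe2py): the skip/fieldnames/zip flow in
# pipe_csv, using the stdlib csv.reader instead of the project's
# csv.UnicodeReader helper (the stdlib reader is bytes-only in Python 2).
import csv
from StringIO import StringIO

f = StringIO('# a comment row\nname,age\nana,3\nbo,5\n')

for i in xrange(1):  # skip one header row
    f.next()

reader = csv.reader(f)
fieldnames = reader.next()
rows = [dict(zip(fieldnames, row)) for row in reader]
assert rows == [{'name': 'ana', 'age': '3'}, {'name': 'bo', 'age': '5'}]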
def pipe_filter(context=None, _INPUT=None, conf=None, **kwargs):
    """An operator that filters for source items matching the given rules. Not
    loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipe2py.modules pipe like object (iterable of items)
    conf : {
        'MODE': {'value': <'permit' or 'block'>},
        'COMBINE': {'value': <'and' or 'or'>}
        'RULE': [
            {
                'field': {'value': 'search field'},
                'op': {'value': 'one of SWITCH above'},
                'value': {'value': 'search term'}
            }
        ]
    }

    kwargs : other inputs, e.g., to feed terminals for rule values

    Returns
    -------
    _OUTPUT : generator of filtered items

    Examples
    --------
    >>> import os.path as p
    >>> from pipe2py.modules.pipeforever import pipe_forever
    >>> from pipe2py.modules.pipefetchdata import pipe_fetchdata
    >>> parent = p.dirname(p.dirname(__file__))
    >>> file_name = p.abspath(p.join(parent, 'data', 'gigs.json'))
    >>> path = 'value.items'
    >>> url = 'file://%s' % file_name
    >>> conf = {'URL': {'value': url}, 'path': {'value': path}}
    >>> input = pipe_fetchdata(_INPUT=pipe_forever(), conf=conf)
    >>> mode = {'value': 'permit'}
    >>> combine = {'value': 'and'}
    >>> rule = [{'field': {'value': 'title'}, 'op': {'value': 'contains'}, \
'value': {'value': 'web'}}]
    >>> conf = {'MODE': mode, 'COMBINE': combine, 'RULE': rule}
    >>> pipe_filter(_INPUT=input, conf=conf).next()['title']
    u'E-Commerce Website Developer | Elance Job'
    >>> rule = [{'field': {'value': 'title'}, 'op': {'value': 'contains'}, \
'value': {'value': 'kjhlked'}}]
    >>> conf = {'MODE': mode, 'COMBINE': combine, 'RULE': rule}
    >>> list(pipe_filter(_INPUT=input, conf=conf))
    []
    """
    conf = DotDict(conf)
    test = kwargs.pop('pass_if', None)
    permit = conf.get('MODE', **kwargs) == 'permit'
    combine = conf.get('COMBINE', **kwargs)

    if combine not in {'and', 'or'}:
        raise Exception(
            "Invalid combine: %s. (Expected 'and' or 'or')" % combine)

    rule_defs = map(DotDict, utils.listize(conf['RULE']))
    get_pass = partial(utils.get_pass, test=test)
    get_value = partial(utils.get_value, **kwargs)
    parse_conf = partial(utils.parse_conf, parse_func=get_value, **kwargs)
    get_rules = lambda i: imap(parse_conf, rule_defs, repeat(i))
    funcs = [COMBINE_BOOLEAN[combine], utils.passthrough, utils.passthrough]

    inputs = imap(DotDict, _INPUT)
    splits = utils.broadcast(inputs, get_rules, utils.passthrough, get_pass)
    outputs = starmap(partial(parse_rules, **kwargs), splits)
    parsed = utils.dispatch(outputs, *funcs)
    gathered = starmap(partial(parse_result, permit=permit), parsed)
    _OUTPUT = ifilter(None, gathered)
    return _OUTPUT
def pipe_fetchpage(context=None, _INPUT=None, conf=None, **kwargs):
    """Fetch Page module

    _INPUT -- not used since this does not have inputs.

    conf:
       URL -- url object containing the URL to download
       from -- string from where to start the input
       to -- string to limit the input
       token -- if present, split the input on this token to generate items

    Description: http://pipes.yahoo.com/pipes/docs?doc=sources#FetchPage

    TODOS:
    - don't retrieve pages larger than 200k
    - don't retrieve if page is not indexable.
    - item delimiter removes the closing tag if using an HTML tag
      (not documented but happens)
    - items should be cleaned, i.e. stripped of HTML tags
    """
    conf = DotDict(conf)
    split_token = conf.get('token', **kwargs)
    urls = util.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = util.get_abspath(url)

            if not url:
                continue

            f = urlopen(url)

            # TODO: it seems that Yahoo! converts relative links to
            # absolute. This needs to be done on the content but seems to
            # be a non-trivial task in Python.
            content = unicode(f.read(), 'utf-8')

            if context and context.verbose:
                print '............Content .................'
                print content
                print '...............EOF...................'

            parsed = _parse_content(content, conf, **kwargs)
            items = parsed.split(split_token) if split_token else [parsed]

            if context and context.verbose:
                print "FetchPage: found count items:", len(items)

            for i in items:
                if context and context.verbose:
                    print "--------------item data --------------------"
                    print i
                    print "--------------EOF item data ----------------"

                yield {"content": i}

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
def pipe_csv(context=None, _INPUT=None, conf=None, **kwargs):
    """This source fetches and parses a csv file to yield items.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- not used
    conf:
        URL -- url
        skip -- number of header rows to skip
        col_mode -- column name source: row=header row(s),
                    custom=defined in col_name
        col_name -- list of custom column names
        col_row_start -- first column header row
        col_row_end -- last column header row
        separator -- column separator

    Yields (_OUTPUT):
    file entries

    Note:
    Current restrictions:
      separator must be 1 character
      assumes every row has exactly the expected number of fields, as defined
      in the header
    """
    conf = DotDict(conf)
    conf_sep = conf['separator']
    conf_mode = conf['col_mode']
    col_name = conf['col_name']

    for item in _INPUT:
        item = DotDict(item)
        url = util.get_value(conf['URL'], item, **kwargs)
        url = util.get_abspath(url)
        separator = util.get_value(conf_sep, item, encode=True, **kwargs)
        skip = util.get_value(conf['skip'], item, func=int, **kwargs)
        col_mode = util.get_value(conf_mode, item, **kwargs)

        f = urlopen(url)

        if context and context.verbose:
            print "pipe_csv loading:", url

        for i in xrange(skip):
            f.next()

        reader = csv.UnicodeReader(f, delimiter=separator)
        fieldnames = []

        if col_mode == 'custom':
            fieldnames = [DotDict(x).get() for x in col_name]
        else:
            fieldnames = _gen_fieldnames(conf, reader, item, **kwargs)

        for rows in reader:
            yield dict(zip(fieldnames, rows))

        f.close()

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
def pipe_regex(context=None, _INPUT=None, conf=None, **kwargs):
    """Applies regex rules to _INPUT items.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : source generator of dicts
    conf : dict {
        'RULE': [
            {
                'field': {'value': 'search field'},
                'match': {'value': 'regex'},
                'replace': {'value': 'replacement'}
            }
        ]
    }

    Yields
    ------
    _OUTPUT : source pipe items post regexes application
    """
    rule_defs = util.listize(conf['RULE'])

    # use list bc iterator gets used up if there are no matching feeds
    rules = list(_gen_rules(rule_defs, **kwargs))

    for item in _INPUT:
        item = DotDict(item)

        def sub_fields(matchobj):
            return item.get(matchobj.group(1), **kwargs)

        for rule in rules:
            # todo: do we ever need get_value here instead of item[]?
            #
            # When the subject being examined is an HTML node, not a string,
            # unicode() converts the dict representing the node to a dict
            # literal, and then attempts to apply the pattern to the literal;
            # as an HTML element node, it may have attributes which then
            # appear in the literal. It should be only matching on (and
            # replacing the value of) the `.content` subelement.
            #
            # I'm not confident that what is below will work across the board,
            # nor that this is the right way to detect that we're looking at
            # an HTML node and not a plain string.
            if rule[0] in item and item[rule[0]]:
                sub_string = '\$\{(.+?)\}'

                if (
                    hasattr(item[rule[0]], 'keys')
                    and 'content' in item[rule[0]]
                ):
                    # this looks like an HTML node, so only do substitution on
                    # the content of the node. Possible gotcha: the content
                    # might be a subtree, in which case we revert to modifying
                    # the literal of the subtree dict.
                    args1 = _get_args(item, rule, rule[1], rule[2], 'content')
                    args2 = _get_args(item, rule, sub_string, sub_fields)
                else:
                    args1 = _get_args(item, rule, rule[1], rule[2])
                    args2 = _get_args(item, rule, sub_string, sub_fields)

                item.set(*args1)
                item.set(*args2)

        yield item
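
# Illustrative sketch (not part of pipe2py): the sub_fields callback above
# expands ${field} references in a replacement string against the current
# item, via re.sub's callable-replacement form.
import re

item = {'title': 'pipes'}
sub_fields = lambda m: item[m.group(1)]
assert re.sub(r'\$\{(.+?)\}', sub_fields, 'about ${title}') == 'about pipes'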
def pipe_fetchpage(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that fetches the content of a given web site as a string.
    Loopable.

    context : pipe2py.Context object
    _INPUT : pipeforever asyncPipe or an iterable of items or fields
    conf : dict
       URL -- url object containing the URL to download
       from -- string from where to start the input
       to -- string to limit the input
       token -- if present, split the input on this token to generate items

    Description: http://pipes.yahoo.com/pipes/docs?doc=sources#FetchPage

    TODOS:
    - don't retrieve pages larger than 200k
    - don't retrieve if page is not indexable.
    - item delimiter removes the closing tag if using an HTML tag
      (not documented but happens)
    - items should be cleaned, i.e. stripped of HTML tags

    Yields
    ------
    _OUTPUT : items
    """
    conf = DotDict(conf)
    split_token = conf.get('token', **kwargs)
    urls = utils.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = utils.get_abspath(url)

            if not url:
                continue

            f = urlopen(url)

            # TODO: it seems that Yahoo! converts relative links to
            # absolute. This needs to be done on the content but seems to
            # be a non-trivial task in Python.
            content = unicode(f.read(), 'utf-8')

            if context and context.verbose:
                print '............Content .................'
                print content
                print '...............EOF...................'

            parsed = _parse_content(content, conf, **kwargs)
            items = parsed.split(split_token) if split_token else [parsed]

            if context and context.verbose:
                print "FetchPage: found count items:", len(items)

            for i in items:
                if context and context.verbose:
                    print "--------------item data --------------------"
                    print i
                    print "--------------EOF item data ----------------"

                yield {"content": i}

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
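
# Illustrative sketch (not part of pipe2py): a plausible core for the
# _parse_content helper used by both pipe_fetchpage variants -- trim the page
# to the text between the configured 'from' and 'to' markers. The function
# and parameter names here are assumptions, not the real helper's signature.
def cut_between(content, from_str, to_str):
    start = content.find(from_str)
    start = start + len(from_str) if start >= 0 else 0
    end = content.find(to_str, start)
    end = end if end >= 0 else len(content)
    return content[start:end]

assert cut_between('<body>hello</body>', '<body>', '</body>') == 'hello'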
def pipe_fetchdata(context=None, _INPUT=None, conf=None, **kwargs):
    """Fetches and parses an XML or JSON file.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : source generator of dicts
    conf : dict {
        'URL': {'value': url},
        'path': {'value': dot separated path to data list}
    }

    Yields
    ------
    _OUTPUT : pipe items fetched from source

    Examples
    --------
    >>> from os import path as p
    >>> from pipe2py.modules.pipeforever import pipe_forever
    >>> parent = p.dirname(p.dirname(__file__))
    >>> abspath = p.abspath(p.join(parent, 'data', 'gigs.json'))
    >>> path = 'value.items'
    >>> url = "file://%s" % abspath
    >>> conf = {'URL': {'value': url}, 'path': {'value': path}}
    >>> pipe_fetchdata(_INPUT=pipe_forever(), conf=conf).next().keys()[:5]
    [u'y:repeatcount', u'description', u'pubDate', u'title', u'y:published']
    >>> abspath = p.abspath(p.join(parent, 'data', 'places.xml'))
    >>> path = 'appointment'
    >>> url = "file://%s" % abspath
    >>> conf = {'URL': {'value': url}, 'path': {'value': path}}
    >>> sorted(pipe_fetchdata(_INPUT=pipe_forever(), conf=conf).next().keys())
    ['alarmTime', 'begin', 'duration', 'places', 'subject', 'uid']
    >>> conf = {'URL': {'value': url}, 'path': {'value': ''}}
    >>> sorted(pipe_fetchdata(_INPUT=pipe_forever(), conf=conf).next().keys())
    ['appointment', 'reminder']
    """
    # todo: iCal and KML
    conf = DotDict(conf)
    urls = util.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            item = DotDict(item)
            url = util.get_value(DotDict(item_url), item, **kwargs)
            url = util.get_abspath(url)
            f = urlopen(url)
            path = util.get_value(conf['path'], item, **kwargs)
            split_path = path.split(".") if path else []
            res = {}

            try:
                tree = objectify.parse(f)
                root = tree.getroot()
            except XMLSyntaxError:
                if context and context.verbose:
                    print "pipe_fetchdata loading json:", url

                f = urlopen(url)
                element = loads(f.read())
            else:
                if context and context.verbose:
                    print "pipe_fetchdata loading xml:", url

                # print etree.tostring(element, pretty_print=True)
                element = util.etree_to_dict(root)
            finally:
                res = _parse_dict(split_path, element) if element else None

            for i in util.gen_items(res, True):
                yield i

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
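
# Illustrative sketch (not part of pipe2py): how a dot separated path like
# 'value.items' selects the item list out of the parsed document (the real
# traversal lives in the _parse_dict helper).
def follow_path(data, path):
    for key in path.split('.'):
        data = data[key]
    return data

parsed = {'value': {'items': [{'title': 't'}]}}
assert follow_path(parsed, 'value.items') == [{'title': 't'}]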
def _gen_order(keys, **kwargs):
    for key in keys:
        key = DotDict(key)
        field = key.get('field', **kwargs)
        sort_dir = key.get('dir', **kwargs)
        yield '%s%s' % ('-' if sort_dir == 'DESC' else '', field)
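
# Illustrative sketch (not part of pipe2py): one way a sort module could
# consume _gen_order's 'field'/'-field' strings -- a stable multi-key sort
# applying keys right to left. multikey_sort is a hypothetical consumer, not
# the project's actual sort implementation.
def multikey_sort(items, order):
    for key in reversed(list(order)):
        field = key.lstrip('-')
        items = sorted(items, key=lambda i: i[field],
                       reverse=key.startswith('-'))
    return items

rows = [{'a': 1, 'b': 2}, {'a': 1, 'b': 1}, {'a': 0, 'b': 3}]
assert multikey_sort(rows, ['a', '-b']) == [
    {'a': 0, 'b': 3}, {'a': 1, 'b': 2}, {'a': 1, 'b': 1}]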
def pipe_loop(context, _INPUT, conf, embed=None, **kwargs):
    """This operator loops over the input performing the embedded submodule.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        mode -- how to affect output - either assign or EMIT
        assign_to -- if mode is assign, which field to assign to
                     (new or existing)
        loop_with -- pass a particular field into the submodule rather than
                     the whole item
    embed -- embedded submodule

    Yields (_OUTPUT):
    source items after passing through the submodule and adding/replacing
    values
    """
    conf = DotDict(conf)
    mode = conf.get('mode')
    assign_to = conf.get('assign_to')
    assign_part = conf.get('assign_part')
    # TODO: what is this for??
    # emit_part = conf.get('emit_part')
    loop_with = conf.get('with')
    embed_conf = conf.get('embed')['conf']

    # Prepare the submodule to take parameters from the loop instead of from
    # the user
    embed_context = copy(context)
    embed_context.submodule = True

    for item in _INPUT:
        item = DotDict(item)
        inp = item.get(loop_with, **kwargs) if loop_with else item

        # prepare the submodule
        embed_context.inputs = dict(_gen_inputs(item, embed_conf))
        submodule = embed(embed_context, [inp], embed_conf)
        first = assign_part == 'first'
        results = _gen_results(submodule, mode, first)

        if not results:
            continue
        elif mode == 'EMIT':
            for i in results:
                yield i
        elif mode == 'assign':
            results = list(results)

            # this is a hack to make sure fetchpage works in and out of a
            # loop while not disturbing strconcat in a loop etc.
            # note: i suspect this needs to be more discerning and only happen
            # if the source can only ever deliver 1 result, e.g. strconcat vs.
            # fetchpage
            if len(results) == 1 and not hasattr(results[0], 'keys'):
                results = results[0]

            item.set(assign_to, results)
            yield item
        else:
            raise Exception(
                "Invalid mode: %s. (Expected 'assign' or 'EMIT')" % mode)
def pipe_xpathfetchpage(context=None, _INPUT=None, conf=None, **kwargs):
    """XPath Fetch Page module

    _INPUT -- not used since this does not have inputs.

    conf:
       URL -- url object containing the URL to download
       xpath -- xpath to extract
       html5 -- use html5 parser?
       useAsString -- emit items as string?

    Description: http://pipes.yahoo.com/pipes/docs?doc=sources#XPathFetchPage

    TODOS:
    - don't retrieve pages larger than 1.5MB
    - don't retrieve if page is not indexable.
    """
    conf = DotDict(conf)
    urls = util.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = util.get_abspath(url)
            f = urlopen(url)

            # TODO: it seems that Yahoo! converts relative links to
            # absolute. This needs to be done on the content but seems to
            # be a non-trivial task in Python.
            content = unicode(f.read(), 'utf-8')

            if context and context.verbose:
                print '............Content .................'
                print content
                print '...............EOF...................'

            xpath = conf.get('xpath', **kwargs)
            html5 = conf.get('html5', **kwargs) == 'true'
            use_as_string = conf.get('useAsString', **kwargs) == 'true'

            # re-open the url: the read() above exhausted the file object
            f = urlopen(url)
            tree = html5parser.parse(f) if html5 else html.parse(f)
            root = tree.getroot()
            items = root.xpath(xpath)

            if context and context.verbose:
                print 'XPathFetchPage: found count items:', len(items)

            for etree in items:
                i = util.etree_to_dict(etree)

                if context and context.verbose:
                    print '--------------item data --------------------'
                    print i
                    print '--------------EOF item data ----------------'

                if use_as_string:
                    yield {'content': unicode(i)}
                else:
                    yield i

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
def pipe_xpathfetchpage(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that fetches the content of a given website as DOM nodes or a
    string. Loopable.

    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : dict
       URL -- url object containing the URL to download
       xpath -- xpath to extract
       html5 -- use html5 parser?
       useAsString -- emit items as string?

    TODOS:
    - don't retrieve pages larger than 1.5MB
    - don't retrieve if page is not indexable.

    Yields
    ------
    _OUTPUT : items
    """
    conf = DotDict(conf)
    urls = utils.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = utils.get_abspath(url)
            f = urlopen(url)

            # TODO: it seems that Yahoo! converts relative links to
            # absolute. This needs to be done on the content but seems to
            # be a non-trivial task in Python.
            content = unicode(f.read(), 'utf-8')

            if context and context.verbose:
                print '............Content .................'
                print content
                print '...............EOF...................'

            xpath = conf.get('xpath', **kwargs)
            html5 = conf.get('html5', **kwargs) == 'true'
            use_as_string = conf.get('useAsString', **kwargs) == 'true'

            # re-open the url: the read() above exhausted the file object
            f = urlopen(url)
            tree = html5parser.parse(f) if html5 else html.parse(f)
            root = tree.getroot()
            items = root.xpath(xpath)

            if context and context.verbose:
                print 'XPathFetchPage: found count items:', len(items)

            for etree in items:
                i = utils.etree_to_dict(etree)

                if context and context.verbose:
                    print '--------------item data --------------------'
                    print i
                    print '--------------EOF item data ----------------'

                if use_as_string:
                    yield {'content': unicode(i)}
                else:
                    yield i

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
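
# Illustrative sketch (not part of pipe2py): the xpath extraction at the core
# of both pipe_xpathfetchpage variants, run on an in-memory document.
from lxml import html

root = html.fromstring('<ul><li>a</li><li>b</li></ul>')
assert [li.text for li in root.xpath('//li')] == ['a', 'b']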