Пример #1
def pipe_itembuilder(context, _INPUT, conf, **kwargs):
    """This source builds an item.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, passed through to util.get_value
    conf:
        attrs -- key, value pairs (one definition or a list of them)

    Yields (_OUTPUT):
    one dict per input item, filled from the attrs definitions
    """
    attrs = conf['attrs']
    if not isinstance(attrs, list):
        # a lone definition would otherwise be iterated key by key
        attrs = [attrs]

    for item in _INPUT:
        d = {}
        for attr in attrs:
            try:
                key = util.get_value(attr['key'], item, **kwargs)
                value = util.get_value(attr['value'], item, **kwargs)
            except KeyError:
                # ignore if the item is referenced but doesn't have our
                # source or target field
                # todo: issue a warning if debugging?
                continue

            util.set_value(d, key, value)

        yield d

        if item == True:
            # i.e. this is being fed forever, i.e. not in a loop,
            # so we just yield our item once
            break
Пример #2
def pipe_sort(context, _INPUT, conf, **kwargs):
    """This operator sorts the input source according to the specified key.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        KEY -- list of fields to sort by

    Yields (_OUTPUT):
    source items sorted by key
    """
    keys = conf['KEY']
    if not isinstance(keys, list):
        keys = [keys]

    order = []
    for key in keys:
        field = util.get_value(key['field'], None, **kwargs)
        sort_dir = util.get_value(key['dir'], None, **kwargs)
        # the field is prefixed with '-' when the direction is DESC
        prefix = '-' if sort_dir == 'DESC' else ''
        order.append('%s%s' % (prefix, field))

    # read all and sort: sorting needs the complete input in memory
    sorted_input = util.multikeysort(list(_INPUT), order)

    for item in sorted_input:
        yield item
Пример #3
def pipe_strreplace(context, _INPUT, conf, **kwargs):
    """Replaces text with replacement text.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, passed through to util.get_value
    conf:
        RULE -- rules - each rule comprising (find, param, replace):
            find -- text to find
            param -- type of match: 1=first, 2=last, 3=every
            replace -- text to replace with

    Yields (_OUTPUT):
    source string with replacements
    """
    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        # a single rule is not wrapped in a list
        rule_defs = [rule_defs]

    rules = []
    for rule in rule_defs:
        find = util.get_value(rule['find'], None, **kwargs)
        param = util.get_value(rule['param'], None, **kwargs)
        replace = util.get_value(rule['replace'], None, **kwargs)
        rules.append((find, param, replace))

    for item in _INPUT:
        t = item
        for find, param, replace in rules:
            if param == '1':
                t = t.replace(find, replace, 1)
            elif param == '2':
                t = util.rreplace(t, find, replace, 1)
            elif param == '3':
                t = t.replace(find, replace)
            # todo else assertion

        yield t
Пример #4
Пример #5
Пример #6
def pipe_strregex(context, _INPUT, conf, **kwargs):
    """This operator replaces values using regexes.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        RULE -- rules - each rule comprising (match, replace)

    Yields (_OUTPUT):
    source item after replacing values matching regexes
    """
    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]

    rules = []
    for rule in rule_defs:
        match = util.get_value(rule['match'], None, **kwargs)  # todo use subkey?
        replace = util.get_value(rule['replace'], None, **kwargs)  # todo use subkey?
        if replace is None:
            replace = ''

        # convert regex to Python format: map $1 to \1 etc.
        # todo: use a common routine for this
        # todo: also need to escape any existing \1 etc.
        replace = re.sub(r'\$(\d+)', r'\\\1', replace)
        rules.append((re.compile(match), replace))

    for item in _INPUT:
        # apply every rule in order; previously only the last rule's
        # match/replace pair was applied, once per rule
        for matchc, replace in rules:
            item = matchc.sub(replace, item)

        yield item
Пример #7
def pipe_regex(context, _INPUT, conf, **kwargs):
    """This operator replaces values using regexes.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        RULE -- rules - each rule comprising (field, match, replace)

    Yields (_OUTPUT):
    source items after replacing values matching regexes
    """
    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]

    rules = []
    for rule in rule_defs:
        # todo use the undocumented g,s,m,i flags here:
        # rule['singlelinematch']['value'] == 2 indicates re.DOTALL
        # so use that to pass to re.compile: see here for more
        # http://livedocs.adobe.com/flex/3/html/help.html?content=12_Using_Regular_Expressions_10.html
        match = util.get_value(rule['match'], None, **kwargs)  # todo use subkey?
        # compile for speed and we need to pass flags
        matchc = re.compile(match, re.DOTALL)
        replace = util.get_value(rule['replace'], None, **kwargs)  # todo use subkey?
        if replace is None:
            replace = ''

        # convert regex to Python format: map $1 to \1 etc.
        # todo: use a common routine for this
        # todo: also need to escape any existing \1 etc.
        replace = re.sub(r'\$(\d+)', r'\\\1', replace)

        rules.append((rule['field']['value'], matchc, replace))

    # matches ${name} references left in a value after substitution
    field_ref = re.compile(r'\$\{(.+)\}')

    for item in _INPUT:

        def sub_fields(matchobj):
            # resolve ${name} against the item currently being processed
            return util.get_value({'subkey': matchobj.group(1)}, item)

        for field, matchc, replace in rules:
            # todo: do we ever need get_value here instead of item[]?
            if field in item and item[field]:
                util.set_value(
                    item, field,
                    matchc.sub(replace, unicode(item[field])))

                util.set_value(
                    item, field,
                    field_ref.sub(sub_fields, unicode(item[field])))

        yield item
Пример #8
def pipe_fetchsitefeed(context, _INPUT, conf, **kwargs):
    """This source fetches and parses the feeds discovered on one or more
       sites to yield the feed entries.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator; its items are used to resolve the URLs
    kwargs -- other inputs, passed through to util.get_value
    conf:
        URL -- url

    Yields (_OUTPUT):
    feed entries
    """
    forever = pipe_forever(context, None, conf=None)
    urls = conf['URL']
    if not isinstance(urls, list):
        urls = [urls]

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(item_url, item, **kwargs)
            if '://' not in url:
                url = 'http://' + url

            if context.verbose:
                print("pipe_fetchsitefeed loading: %s" % url)

            # NOTE(review): every discovered feed is fetched here, not only
            # the first one -- confirm which behaviour is intended
            discovery_conf = {u'URL': {u'type': u'url', u'value': url}}
            for feed in pipe_feedautodiscovery(context, forever, discovery_conf):
                fetch_conf = {u'URL': {u'type': u'url', u'value': feed['link']}}
                for feed_item in pipe_fetch(context, forever, fetch_conf):
                    yield feed_item

        if item == True:
            # i.e. this is being fed forever, i.e. not in a loop,
            # so we just yield our item once
            break
Пример #9
Пример #10
def pipe_feedautodiscovery(context, _INPUT, conf, **kwargs):
    """This source searches for feed links in a page.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator; its items are used to resolve the URLs
    kwargs -- other inputs, passed through to util.get_value
    conf:
        URL -- url

    Yields (_OUTPUT):
    one {'link': ...} dict per feed link found
    """
    urls = conf['URL']
    if not isinstance(urls, list):
        urls = [urls]

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(item_url, item, **kwargs)

            if '://' not in url:
                url = 'http://' + url

            if context.verbose:
                print("pipe_feedautodiscovery loading: %s" % url)

            for entry in autorss.getRSSLink(url.encode('utf-8')):
                yield {'link': entry}
                # todo add rel, type, title

        if item == True:
            # i.e. this is being fed forever, i.e. not in a loop,
            # so we just yield our item once
            break
Пример #12
Пример #13
def pipe_feedautodiscovery(context=None, _INPUT=None, conf=None, **kwargs):
    """This source searches for feed links in a page.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator; its items are used to resolve the URLs
    kwargs -- other inputs, passed through to util.get_value
    conf:
        URL -- url

    Yields (_OUTPUT):
    one {'link': ...} dict per feed link found
    """
    conf = DotDict(conf)
    urls = util.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = util.get_abspath(url)

            if context and context.verbose:
                print("pipe_feedautodiscovery loading: %s" % url)

            for entry in autorss.getRSSLink(url.encode('utf-8')):
                yield {'link': entry}
                # todo: add rel, type, title

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
Пример #14
Пример #15
def pipe_subelement(context, _INPUT, conf, **kwargs):
    """Returns a subelement.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        path -- contains the value and type to select

    Yields (_OUTPUT):
    subelement of source item (each element separately when it is a list)
    """
    # switch to using the value as a reference; work on a copy so the
    # caller's conf is left intact and the pipe can be run more than once
    path = dict(conf['path'])
    path['subkey'] = path.pop('value')

    for item in _INPUT:
        t = util.get_value(path, item)
        if t:
            if isinstance(t, list):
                for nested_item in t:
                    yield nested_item
            else:
                yield t

        if item == True:
            # i.e. this is being fed forever, i.e. not in a loop,
            # so we just yield our item once
            break
Пример #16
def pipe_strconcat(context, _INPUT, conf, **kwargs):
    """This source builds a string.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, passed through to util.get_value
    conf:
        part -- parts (one definition or a list of them)

    Yields (_OUTPUT):
    the concatenation of the resolved parts, once per input item
    """
    parts = conf['part']
    if not isinstance(parts, list):
        # todo do we need to do this anywhere else?
        # kept local so that the caller's conf is not modified
        parts = [parts]

    for item in _INPUT:
        s = ""
        for part in parts:
            try:
                s += util.get_value(part, item, **kwargs)
            except AttributeError:
                # ignore if the item is referenced but doesn't have our
                # source field
                # todo: issue a warning if debugging?
                continue
            except TypeError:
                # the resolved part could not be appended to the string
                if context.verbose:
                    print("pipe_strconcat: TypeError")

        yield s
Пример #17
def pipe_rssitembuilder(context, _INPUT, conf, **kwargs):
    """This source builds an rss item.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, passed through to util.get_value
    conf:
        dictionary of key/values

    Yields (_OUTPUT):
    one dict per input item, holding the non-empty resolved values
    """
    for item in _INPUT:
        d = {}
        for key in conf:
            try:
                # todo really dereference item?
                # (sample pipe seems to suggest so: surprising)
                value = util.get_value(conf[key], item, **kwargs)
            except KeyError:
                # ignore if the source doesn't have our source field
                # todo: issue a warning if debugging?
                continue

            rss_key = map_key_to_rss.get(key, key)
            if value:
                if rss_key == 'title':
                    # the title is additionally stored under 'y:title'
                    util.set_value(d, 'y:%s' % rss_key, value)
                # todo also for guid -> y:id (is guid the only one?)

                # todo try/except?
                util.set_value(d, rss_key, value)

        yield d

        if item == True:
            # i.e. this is being fed forever, i.e. not in a loop,
            # so we just yield our item once
            break
Пример #18
Пример #19
def pipe_dateformat(context, _INPUT, conf, **kwargs):
    """This source formats a date.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, passed through to util.get_value
    conf:
        format -- date format

    Yields (_OUTPUT):
    formatted date
    """
    date_format = util.get_value(conf['format'], None, **kwargs)

    for item in _INPUT:
        s = item
        if isinstance(s, basestring):
            # string input: parse with the first known format that fits
            for df in util.ALTERNATIVE_DATE_FORMATS:
                try:
                    s = datetime.strptime(s, df).timetuple()
                    break
                except ValueError:
                    # not in this format, try the next one
                    continue
            # todo: raise an exception: unexpected date format

        # todo check all PHP formats are covered by Python
        s = time.strftime(date_format, s)
        # todo silent error handling? e.g. if item is not a date
        yield s
Пример #20
def pipe_fetch(context=None, _INPUT=None, conf=None, **kwargs):
    """Fetches and parses one or more feeds to yield the feed entries.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator; its items are used to resolve the URLs
    kwargs -- other inputs, passed through to util.get_value
    conf:
        URL -- url

    Yields (_OUTPUT):
    feed entries
    """
    conf = DotDict(conf)
    urls = util.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = util.get_abspath(url)

            if not url:
                # nothing to fetch for this definition
                continue

            if context and context.verbose:
                print("pipe_fetch loading: %s" % url)

            # close the connection once the payload has been read
            response = urlopen(url)
            try:
                parsed = feedparser.parse(response.read())
            finally:
                response.close()

            for entry in util.gen_entries(parsed):
                yield entry

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
Пример #21
def pipe_rename(context, _INPUT, conf, **kwargs):
    """This operator renames or copies fields in the input source.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        RULE -- rules - each rule comprising (op, field, newval)

    Yields (_OUTPUT):
    source items after copying/renaming
    """
    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]

    rules = []
    for rule in rule_defs:
        newval = util.get_value(rule['newval'], None, **kwargs)  # todo use subkey?

        # trick the get_value in the loop to mapping value onto an item key
        # (rather than taking it literally, i.e. make it a LHS reference,
        # not a RHS value); done on a copy so conf stays reusable
        newfield = dict(rule['field'])
        newfield['subkey'] = newfield.pop('value')

        rules.append((rule['op']['value'], newfield, newval))

    for item in _INPUT:
        for op, field, newval in rules:
            try:
                # forces an exception if any part is not found
                value = util.get_value(field, item, **kwargs)
                util.set_value(item, newval, value)
                if op == 'rename':
                    try:
                        util.del_value(item, field['subkey'])
                    except (KeyError, TypeError):
                        # TypeError catches pseudo subkeys,
                        # e.g. summary.content
                        # ignore if the target doesn't have our field
                        # todo: issue a warning if debugging?
                        pass
            except AttributeError:
                # ignore if the source doesn't have our field
                # todo: issue a warning if debugging?
                pass

        yield item
Пример #22
def pipe_regex(context, _INPUT, conf, **kwargs):
    """This operator replaces values using regexes.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        RULE -- rules - each rule comprising (field, match, replace)

    Yields (_OUTPUT):
    source items after replacing values matching regexes
    """
    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]

    rules = []
    for rule in rule_defs:
        # todo use the undocumented g,s,m,i flags here:
        # rule['singlelinematch']['value'] == 2 indicates re.DOTALL
        # so use that to pass to re.compile: see here for more
        # http://livedocs.adobe.com/flex/3/html/help.html?content=12_Using_Regular_Expressions_10.html
        match = util.get_value(rule['match'], None, **kwargs)  # todo use subkey?
        # compile for speed and we need to pass flags
        matchc = re.compile(match, re.DOTALL)
        replace = util.get_value(rule['replace'], None, **kwargs)  # todo use subkey?
        if replace is None:
            replace = ''

        # convert regex to Python format: map $1 to \1 etc.
        # todo: use a common routine for this
        # todo: also need to escape any existing \1 etc.
        replace = re.sub(r'\$(\d+)', r'\\\1', replace)

        rules.append((rule['field']['value'], matchc, replace))

    # matches ${name} references left in a value after substitution
    field_ref = re.compile(r'\$\{(.+)\}')

    for item in _INPUT:
        def sub_fields(matchobj):
            # resolve ${name} against the item currently being processed
            return util.get_value({'subkey': matchobj.group(1)}, item)

        for field, matchc, replace in rules:
            # todo: do we ever need get_value here instead of item[]?
            if field in item and item[field]:
                replaced = matchc.sub(replace, unicode(item[field]))
                util.set_value(item, field, replaced)
                expanded = field_ref.sub(sub_fields, unicode(item[field]))
                util.set_value(item, field, expanded)

        yield item
Пример #23
Пример #24
def pipe_urlbuilder(context, _INPUT, conf, **kwargs):
    """This source builds a url for every input item.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator; its items are used to resolve the values
    kwargs -- other inputs, passed through to util.get_value
    conf:
        BASE -- base
        PATH -- path elements (optional)
        PARAM -- query parameters (optional)

    Yields (_OUTPUT):
    url
    """
    for item in _INPUT:
        # note: we could cache get_value results if item==True
        url = util.get_value(conf['BASE'], item, **kwargs)
        if not url.endswith('/'):
            url += '/'

        if 'PATH' in conf:
            path = conf['PATH']
            if not isinstance(path, list):
                path = [path]
            path = [util.get_value(p, item, **kwargs) for p in path if p]

            url += "/".join(p for p in path if p)
        url = url.rstrip("/")

        # Ensure url is valid
        url = util.url_quote(url)

        # treat PARAM like PATH: absent means no query string
        param_defs = conf['PARAM'] if 'PARAM' in conf else []
        if not isinstance(param_defs, list):
            param_defs = [param_defs]

        params = dict((util.get_value(p['key'], item, **kwargs),
                       util.get_value(p['value'], item, **kwargs))
                      for p in param_defs if p)
        # skip the query string when the only key is the empty string
        if params and list(params.keys()) != [u'']:
            url += "?" + urllib.urlencode(params)

        yield url
Пример #25
Пример #26
Пример #27
Пример #28
Пример #29
Пример #30
def pipe_substr(context, _INPUT, conf, **kwargs):
    """Returns a substring.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, passed through to util.get_value
    conf:
        from -- starting character
        length -- number of characters to return

    Yields (_OUTPUT):
    portion of source string
    """
    sfrom = int(util.get_value(conf['from'], None, **kwargs))
    length = int(util.get_value(conf['length'], None, **kwargs))
    end = sfrom + length

    for item in _INPUT:
        yield item[sfrom:end]

        if item == True:
            # i.e. this is being fed forever, i.e. not in a loop,
            # so we just yield our item once
            break
Пример #31
Пример #32
def pipe_strregex(context, _INPUT, conf, **kwargs):
    """This operator replaces values using regexes.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        RULE -- rules - each rule comprising (match, replace)

    Yields (_OUTPUT):
    source item after replacing values matching regexes
    """
    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]

    rules = []
    for rule in rule_defs:
        match = util.get_value(rule['match'], None, **kwargs)  # todo use subkey?
        replace = util.get_value(rule['replace'], None, **kwargs)  # todo use subkey?

        # default a missing replacement before converting it; re.sub
        # cannot take None
        if replace is None:
            replace = ''

        # convert regex to Python format: map $1 to \1 etc.
        # todo: use a common routine for this
        # todo: also need to escape any existing \1 etc.
        replace = re.sub(r'\$(\d+)', r'\\\1', replace)

        rules.append((re.compile(match), replace))

    for item in _INPUT:
        # apply every rule in order; previously only the last rule's
        # match/replace pair was applied, once per rule
        for matchc, replace in rules:
            item = matchc.sub(replace, item)

        yield item
Пример #33
def pipe_strreplace(context, _INPUT, conf, **kwargs):
    """Replaces text with replacement text.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, passed through to util.get_value
    conf:
        RULE -- rules - each rule comprising (find, param, replace):
            find -- text to find
            param -- type of match: 1=first, 2=last, 3=every
            replace -- text to replace with

    Yields (_OUTPUT):
    source string with replacements
    """
    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]

    rules = []
    for rule in rule_defs:
        find = util.get_value(rule['find'], None, **kwargs)
        param = util.get_value(rule['param'], None, **kwargs)
        replace = util.get_value(rule['replace'], None, **kwargs)
        rules.append((find, param, replace))

    for item in _INPUT:
        t = item
        for find, param, replace in rules:
            if param == '1':
                # first occurrence only
                t = t.replace(find, replace, 1)
            elif param == '2':
                # last occurrence only
                t = util.rreplace(t, find, replace, 1)
            elif param == '3':
                # every occurrence
                t = t.replace(find, replace)
            # todo else assertion

        yield t
Пример #34
def pipe_fetch(context, _INPUT, conf, **kwargs):
    """This source fetches and parses one or more feeds to yield the feed
    entries.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator; its items are used to resolve the URLs
    kwargs -- other inputs, passed through to util.get_value
    conf:
        URL -- url

    Yields (_OUTPUT):
    feed entries
    """
    urls = conf['URL']
    if not isinstance(urls, list):
        urls = [urls]

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(item_url, item, **kwargs)

            if '://' not in url:
                url = 'http://' + url

            if context.verbose:
                print("pipe_fetch loading: %s" % url)
            d = feedparser.parse(url.encode('utf-8'))

            for entry in d['entries']:
                if 'updated_parsed' in entry:
                    # map from universal feedparser's normalised names
                    entry['pubDate'] = entry['updated_parsed']
                    # yahoo's own version
                    entry['y:published'] = entry['updated_parsed']
                if 'author' in entry:
                    entry['dc:creator'] = entry['author']
                if 'author_detail' in entry:
                    author_detail = entry['author_detail']
                    if 'href' in author_detail:
                        entry['author.uri'] = author_detail['href']
                    if 'name' in author_detail:
                        entry['author.name'] = author_detail['name']
                # todo more!?
                # yahoo's own versions
                if 'title' in entry:
                    entry['y:title'] = entry['title']
                if 'id' in entry:
                    entry['y:id'] = entry['id']
                # todo more!?
                yield entry

        if item == True:
            # i.e. this is being fed forever, i.e. not in a loop,
            # so we just yield our item once
            break
Пример #35
Пример #36
def pipe_datebuilder(context, _INPUT, conf, **kwargs):
    """This source builds a date for every input item.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator; its items are used to resolve the date
    kwargs -- other inputs, passed through to util.get_value
    conf:
        DATE -- date: a timestamp, a relative expression ("3 days",
                "1 year", "today", "tomorrow", "yesterday", "now") or a
                date string in one of util.ALTERNATIVE_DATE_FORMATS

    Yields (_OUTPUT):
    a datetime, or the lower-cased input string when it matched nothing
    """
    for item in _INPUT:
        date = util.get_value(conf['DATE'], item, **kwargs)
        try:
            date = float(date)
        except (ValueError, TypeError):
            # not numeric (or not a string/number at all): parse as text
            pass

        if isinstance(date, (int, float)):
            date = datetime.utcfromtimestamp(date)
        else:
            date = str(date).lower()
            if date.endswith(' day') or date.endswith(' days'):
                count = int(date.split(' ')[0])
                date = datetime.utcnow() + timedelta(days=count)
            elif date.endswith(' year') or date.endswith(' years'):
                count = int(date.split(' ')[0])
                date = datetime.utcnow()
                # NOTE: replace() raises ValueError on Feb 29 when the
                # target year is not a leap year
                date = date.replace(year=date.year + count)
            elif date == 'today':
                date = datetime.utcnow()
            elif date == 'tomorrow':
                date = datetime.utcnow() + timedelta(days=1)
            elif date == 'yesterday':
                date = datetime.utcnow() + timedelta(days=-1)
            elif date == 'now':  # todo is this allowed by Yahoo?
                date = datetime.utcnow()
            else:
                for df in util.ALTERNATIVE_DATE_FORMATS:
                    try:
                        date = datetime.strptime(date, df)
                        break
                    except ValueError:
                        # not in this format, try the next one
                        continue
                # todo: raise an exception: unexpected date format

        yield date
Пример #37
def pipe_yql(context=None, _INPUT=None, conf=None, **kwargs):
    """This source issues YQL queries.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator; its items are used to resolve the query
    kwargs -- other inputs, passed through to util.get_value
    conf:
        yqlquery -- YQL query
        # todo: handle envURL

    Yields (_OUTPUT):
    query results
    """
    # todo: get from a config/env file
    url = "http://query.yahooapis.com/v1/public/yql"
    conf = DotDict(conf)
    query = conf['yqlquery']

    for item in _INPUT:
        item = DotDict(item)
        yql = util.get_value(query, item, **kwargs)

        # note: we use the default format of xml since json loses some
        # structure
        # todo: diagnostics=true e.g. if context.test
        # todo: consider paging for large result sets
        r = requests.get(url, params={'q': yql}, stream=True)

        # Parse the response
        tree = parse(r.raw)

        if context and context.verbose:
            print("pipe_yql loading xml: %s" % yql)

        root = tree.getroot()

        # note: query also has row count
        results = root.find('results')

        # Convert xml into generation of dicts; a response without a
        # <results> element simply produces nothing
        if results is not None:
            for element in results:
                yield util.etree_to_dict(element)

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
Пример #38
def pipe_truncate(context, _INPUT, conf, **kwargs):
    """This operator truncates the number of items in a feed.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- terminal, if the truncation value is wired in
    conf:
        count -- length of the truncated feed, if specified literally

    Yields (_OUTPUT):
    truncated list of source items
    """
    count = conf['count']
    limit = int(util.get_value(count, None, **kwargs))
    if limit <= 0:
        return

    # stop right after the limit-th item so that no extra item is pulled
    # from _INPUT; a shorter input just ends the loop normally
    for position, item in enumerate(_INPUT, 1):
        yield item
        if position >= limit:
            break
Пример #39
Пример #40
Пример #41
def pipe_stringtokenizer(context, _INPUT, conf, **kwargs):
    """Splits a string into tokens delimited by separators.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, passed through to util.get_value
    conf:
        to-str -- separator string

    Yields (_OUTPUT):
    the list of tokens of each input string
    """
    delim = util.get_value(conf['to-str'], None, **kwargs)

    for item in _INPUT:
        if item is not None:
            yield item.split(delim)

        if item == True:
            # i.e. this is being fed forever, i.e. not in a loop,
            # so we just yield our item once
            break
Пример #42
Пример #43
Пример #44
def pipe_stringtokenizer(context, _INPUT, conf, **kwargs):
    """Splits a string into tokens delimited by separators.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, passed through to util.get_value
    conf:
        to-str -- separator string

    Yields (_OUTPUT):
    one {'content': token} dict per token of the input string
    """
    delim = util.get_value(conf['to-str'], None, **kwargs)

    for item in _INPUT:
        if item is not None:
            for chunk in item.split(delim):
                yield {'content': chunk}

        if item == True:
            # i.e. this is being fed forever, i.e. not in a loop,
            # so we just yield our item once
            break
Пример #45
Пример #46
Пример #47
def pipe_filter(context, _INPUT, conf, **kwargs):
    """This operator filters the input source, including or excluding fields,
    that match a set of defined rules.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        MODE -- filter mode, either "permit" or "block"
        COMBINE -- filter boolean combination, either "and" or "or"
        RULE -- rules - each rule comprising (field, op, value)

    Yields (_OUTPUT):
    source items that match the rules

    Raises an Exception while processing the first item when COMBINE is
    not a key of COMBINE_BOOLEAN.
    """
    mode = conf['MODE']['value']
    combine = conf['COMBINE']['value']

    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]

    rules = []
    for rule in rule_defs:
        field = rule['field']['value']
        value = util.get_value(rule['value'], None, **kwargs)  # todo use subkey?
        rules.append((field, rule['op']['value'], value))

    for item in _INPUT:
        if combine not in COMBINE_BOOLEAN:
            raise Exception("Invalid combine %s (expecting and or or)" %
                            combine)

        res = COMBINE_BOOLEAN[combine](_rulepass(rule, item)
                                       for rule in rules)

        if (res and mode == "permit") or (not res and mode == "block"):
            yield item
Пример #48
def _convert_item(rules, item, **kwargs):
    for rule in rules:
        value = util.get_value(rule[1], item, **kwargs)

            # forces an exception if any part is not found
            item.set(rule[2], value)
        except AttributeError:
            # ignore if the source doesn't have our field
            # todo: issue a warning if debugging?

        if rule[0] == 'rename':
            # TypeError catches pseudo subkeys, e.g. summary.content
            except (KeyError, TypeError):
                # ignore if the target doesn't have our field
                # todo: issue a warning if debugging?

    return item
Пример #49
def pipe_filter(context, _INPUT, conf, **kwargs):
    """This operator filters the input source, including or excluding fields,
    that match a set of defined rules.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        MODE -- filter mode, either "permit" or "block"
        COMBINE -- filter boolean combination, either "and" or "or"
        RULE -- rules - each rule comprising (field, op, value)

    Yields (_OUTPUT):
    source items that match the rules

    Raises an Exception while processing the first item when COMBINE is
    not a key of COMBINE_BOOLEAN.
    """
    mode = conf['MODE']['value']
    combine = conf['COMBINE']['value']

    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]

    rules = []
    for rule in rule_defs:
        field = rule['field']['value']
        value = util.get_value(rule['value'], None, **kwargs)  # todo use subkey?
        rules.append((field, rule['op']['value'], value))

    for item in _INPUT:
        if item == True:
            # NOTE(review): the body of this check was missing; stopping
            # here assumes True is the "fed forever" marker, as in the
            # sibling pipes -- confirm
            break

        if combine not in COMBINE_BOOLEAN:
            raise Exception("Invalid combine %s (expecting and or or)" % combine)

        res = COMBINE_BOOLEAN[combine](_rulepass(rule, item) for rule in rules)

        if (res and mode == "permit") or (not res and mode == "block"):
            yield item
Пример #50
def pipe_fetchdata(context, _INPUT, conf, **kwargs):
    """This source fetches and parses any XML or JSON file (todo iCal or KML)
    to yield a list of elements.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator; each item is used to resolve URL and path
    kwargs -- forwarded to util.get_value
    conf:
        URL -- url (a single definition or a list of them)
        path -- dot-separated path to the list

    Yields (_OUTPUT):
    elements found at the specified path (the whole document when no path is
    given)

    Each URL is first parsed as XML; if anything in that attempt fails, the
    URL is fetched again and parsed as JSON. If that fails too the URL is
    skipped (a message is printed when context.verbose is set).
    """
    urls = conf['URL']
    if not isinstance(urls, list):
        urls = [urls]

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(item_url, item, **kwargs)

            if '://' not in url:
                url = 'http://' + url
            path = util.get_value(conf['path'], item, **kwargs)
            match = None

            #Parse the file into a dictionary
            try:
                f = urllib2.urlopen(url)
                try:
                    ft = ElementTree.parse(f)
                finally:
                    # parse() has consumed the stream; release the connection
                    f.close()
                if context.verbose:
                    print("pipe_fetchdata loading xml: %s" % (url,))
                root = ft.getroot()
                #Move to the point referenced by the path
                #todo lxml would simplify and speed up this
                if path:
                    if root.tag[0] == '{':
                        namespace = root.tag[1:].split("}")[0]
                        for i in path.split(".")[:-1]:
                            root = root.find("{%s}%s" % (namespace, i))
                            if root is None:
                                # NOTE(review): the body of this check was
                                # missing from the file; stopping the generator
                                # when the path points at nothing is a
                                # reconstruction - confirm.
                                return
                        match = "{%s}%s" % (namespace, path.split(".")[-1])
                    else:
                        match = "%s" % (path.split(".")[-1])
                #Convert xml into generation of dicts
                if match:
                    for element in root.findall(match):
                        i = util.etree_to_pipes(element)
                        yield i
                else:
                    i = util.etree_to_pipes(root)
                    yield i

            except Exception:
                # XML attempt failed: retry the same URL as JSON
                try:
                    f = urllib2.urlopen(url)
                    try:
                        d = json.load(f)
                    finally:
                        f.close()
                    #todo test:-
                    if context.verbose:
                        print("pipe_fetchdata loading json: %s" % (url,))
                    if path:
                        for i in path.split(".")[:-1]:
                            d = d.get(i)
                        match = path.split(".")[-1]
                    if match:
                        for itemd in d:
                            if itemd == match:
                                if isinstance(d[itemd], list):
                                    for nested_item in d[itemd]:
                                        yield nested_item
                                else:
                                    yield [d[itemd]]
                    else:
                        yield d
                except Exception as e:
                    #todo try iCal and yield
                    #todo try KML and yield
                    if context.verbose:
                        print("xml and json both failed:")
                        print(e)

Пример #51
def pipe_loop(context, _INPUT, conf, embed=None, **kwargs):
    """This operator loops over the input performing the embedded submodule.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        mode -- how to affect output - either assign or EMIT
        assign_to -- if mode is assign, which field to assign to (new or existing)
        assign_part -- 'first' to take only the first submodule result,
            anything else to take all of them
        emit_part -- read from conf but not otherwise used here
        with -- pass a particular field into the submodule rather than the whole item
        embed -- holds the submodule's own conf under ['value']['conf']
    embed -- embedded submodule

    Yields (_OUTPUT):
    source items after passing through the submodule and adding/replacing values

    Raises:
    Exception -- when mode is neither 'assign' nor 'EMIT'
    """
    mode = conf['mode']['value']
    assign_to = conf['assign_to']['value']
    assign_part = conf['assign_part']['value']
    emit_part = conf['emit_part']['value']
    loop_with = conf['with']['value']
    embed_conf = conf['embed']['value']['conf']

    #Prepare the submodule to take parameters from the loop instead of from the user
    embed_context = copy.copy(context)
    embed_context.submodule = True

    for item in _INPUT:
        if loop_with:
            inp = item[loop_with]  #todo: get_value here?
        else:
            inp = item

        #Pass any input parameters into the submodule
        embed_context.inputs = {}
        for k in embed_conf:
            embed_context.inputs[k] = unicode(
                util.get_value(embed_conf[k], item))
        p = embed(embed_context, [inp], embed_conf)  #prepare the submodule

        results = None
        try:
            #loop over the submodule, emitting as we go or collecting results for later assignment
            for i in p:
                if assign_part == 'first':
                    if mode == 'EMIT':
                        yield i
                    else:
                        results = i
                    break  # only the first result is wanted
                else:  #all
                    if mode == 'EMIT':
                        yield i
                    elif results:
                        results.append(i)
                    else:
                        results = [i]
        except HTTPError:  #todo any other errors we want to continue looping after?
            if context.verbose:
                print("Submodule gave HTTPError - continuing the loop")
            continue

        if mode == 'assign':
            # Unwrap a single collected result. Only lists are unwrapped: in
            # 'first' mode results is the raw submodule item, and indexing a
            # one-key dict with [0] would raise KeyError.
            if isinstance(results, list) and len(results) == 1:
                results = results[0]
            util.set_value(item, assign_to, results)
            yield item
        elif mode == 'EMIT':
            pass  #already yielded
        else:
            raise Exception("Invalid mode %s (expecting assign or EMIT)" %
                            mode)
Пример #52
 def sub_fields(matchobj):
     """Return the value that util.get_value resolves for the subkey named by
     the match's first group.

     matchobj -- object exposing group(); presumably a regex match object
     ``item`` is taken from the enclosing scope, which is not shown here.
     """
     subkey_ref = {'subkey': matchobj.group(1)}
     return util.get_value(subkey_ref, item)
Пример #53
def pipe_fetchpage(context, _INPUT, conf, **kwargs):
    """Fetch Page module

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator; each item is used to resolve the conf values,
        and the loop stops after an item equal to True
    kwargs -- forwarded to util.get_value
    conf:
       URL -- url object contain the URL to download (one or a list)
       from -- string from where to start the input
       to -- string to limit the input
       token -- if present, split the input on this token to generate items

       Description: http://pipes.yahoo.com/pipes/docs?doc=sources#FetchPage

    Not implemented here (todo):
        - don't retrieve pages larger than 200k
        - don't retrieve if page is not indexable.
        - item delimiter removes the closing tag if using a HTML tag
          (not documented but happens)
        - items should be cleaned, i.e. stripped of HTML tags

    Yields (_OUTPUT):
    one {"content": text} dict per item found in each downloaded page

    A URL that fails to download or decode is skipped; the failure is only
    reported when context.verbose is set.
    """
    urls = conf['URL']
    if not isinstance(urls, list):
        urls = [urls]

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(item_url, item, **kwargs)
            if context.verbose:
                print("FetchPage: Preparing to download: %s" % (url,))

            try:
                request = urllib2.Request(url)
                request.add_header('User-Agent', 'Yahoo Pipes 1.0')
                response = urllib2.build_opener().open(request)
                try:
                    content_type = response.headers.get('content-type', '')
                    raw = response.read()
                finally:
                    response.close()

                # NOTE(review): the arguments of the original unicode(...) call
                # were missing from the file. Decoding with the charset
                # advertised in Content-Type (utf-8 when none is given) is a
                # reconstruction - confirm.
                if 'charset=' in content_type:
                    charset = content_type.split('charset=')[-1].strip().strip('"\'')
                else:
                    charset = 'utf-8'
                content = unicode(raw, charset)

                # TODO it seems that Yahoo! converts relative links to absolute
                # TODO this needs to be done on the content but seems to be a non-trival
                # TODO task python?

                if context.verbose:
                    print("............FetchPage: content .................")
                    print(content.encode("utf-8"))
                    print("............FetchPage: EOF     .................")

                # Resolve against the current item (as done for the URL), not
                # the _INPUT generator, so subkey-based values can be looked up.
                from_delimiter = util.get_value(conf["from"], item, **kwargs)
                to_delimiter = util.get_value(conf["to"], item, **kwargs)
                split_token = util.get_value(conf["token"], item, **kwargs)

                # determine from location, i.e. from where to start reading content
                from_location = 0
                if from_delimiter != "":
                    # find() returns -1 when the delimiter is absent; clamp it
                    # so the search for the "to" delimiter below still covers
                    # the whole content instead of starting at the last char.
                    from_location = max(content.find(from_delimiter), 0)
                    # Yahoo! does not strip off the from_delimiter.
                    #if from_location > 0:
                    #    from_location += len(from_delimiter)

                # determine to location, i.e. where to stop reading content
                to_location = 0
                if to_delimiter != "":
                    to_location = content.find(to_delimiter, from_location)

                # reduce the content depended on the to/from locations
                if from_location > 0 and to_location > 0:
                    content = content[from_location:to_location]
                elif from_location > 0:
                    content = content[from_location:]
                elif to_location > 0:
                    content = content[:to_location]

                # determine items depended on the split_token
                if split_token != "":
                    res_items = content.split(split_token)
                else:
                    res_items = [content]

                if context.verbose:
                    print("FetchPage: found count items: %s" % len(res_items))

                for res_item in res_items:
                    if context.verbose:
                        print("--------------item data --------------------")
                        print(res_item)
                        print("--------------EOF item data ----------------")
                    yield {"content": res_item}

            except Exception:
                if context.verbose:
                    print("FetchPage: failed to retrieve from: %s" % (url,))

                    print("----------------- FetchPage -----------------")
                    import traceback
                    traceback.print_exc()
                    print("----------------- FetchPage -----------------")

        if item == True:  #i.e. this is being fed forever, i.e. not in a loop, so we just yield our item once
            break
Пример #54
def pipe_loop(context, _INPUT, conf, embed=None, **kwargs):
    """This operator loops over the input performing the embedded submodule.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        mode -- how to affect output - either assign or EMIT
        assign_to -- if mode is assign, which field to assign to (new or existing)
        assign_part -- 'first' to take only the first submodule result,
            anything else to take all of them
        emit_part -- read from conf but not otherwise used here
        with -- pass a particular field into the submodule rather than the whole item
        embed -- holds the submodule's own conf under ['value']['conf']
    embed -- embedded submodule

    Yields (_OUTPUT):
    source items after passing through the submodule and adding/replacing values

    Raises:
    Exception -- when mode is neither 'assign' nor 'EMIT'
    """
    mode = conf['mode']['value']
    assign_to = conf['assign_to']['value']
    assign_part = conf['assign_part']['value']
    emit_part = conf['emit_part']['value']
    loop_with = conf['with']['value']
    embed_conf = conf['embed']['value']['conf']

    #Prepare the submodule to take parameters from the loop instead of from the user
    embed_context = copy.copy(context)
    embed_context.submodule = True

    for item in _INPUT:
        if loop_with:
            inp = util.get_subkey(loop_with, item)
        else:
            inp = item

        #Pass any input parameters into the submodule
        embed_context.inputs = {}
        for k in embed_conf:
            embed_context.inputs[k] = unicode(
                util.get_value(embed_conf[k], item))
        p = embed(embed_context, [inp], embed_conf)  #prepare the submodule

        results = None
        try:
            #loop over the submodule, emitting as we go or collecting results for later assignment
            for i in p:
                if assign_part == 'first':
                    if mode == 'EMIT':
                        yield i
                    else:
                        results = i
                    break  # only the first result is wanted
                else:  #all
                    if mode == 'EMIT':
                        yield i
                    elif results:
                        results.append(i)
                    else:
                        results = [i]
            if results and mode == 'assign':
                #this is a hack to make sure fetchpage works in an out of a loop while not disturbing strconcat in a loop etc.
                #(goes with the comment below about checking the delivery capability of the source)
                # Only a collected list is re-wrapped: in 'first' mode results
                # is the raw item and results[0] would be a key lookup.
                if (isinstance(results, list) and len(results) == 1
                        and isinstance(results[0], dict)):
                    results = [results]
        except HTTPError:  #todo any other errors we want to continue looping after?
            if context.verbose:
                print("Submodule gave HTTPError - continuing the loop")
            continue

        if mode == 'assign':
            #note: i suspect this needs to be more discerning and only happen if the source can only ever deliver 1 result, e.g. strconcat vs. fetchpage
            if isinstance(results, list) and len(results) == 1:
                results = results[0]
            util.set_value(item, assign_to, results)
            yield item
        elif mode == 'EMIT':
            pass  #already yielded
        else:
            raise Exception("Invalid mode %s (expecting assign or EMIT)" %
                            mode)
Пример #55
def pipe_xpathfetchpage(context, _INPUT, conf, **kwargs):
    """XPath Fetch Page module

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator; each item is used to resolve the conf values,
        and the loop stops after an item equal to True
    kwargs -- forwarded to util.get_value
    conf:
       URL -- url object contain the URL to download (one or a list)
       xpath -- xpath to extract
       html5 -- use html5 parser? ("true" enables it)
       useAsString -- emit items as string? ("true" enables it)

       Description: http://pipes.yahoo.com/pipes/docs?doc=sources#XPathFetchPage

    Not implemented here (todo):
        - don't retrieve pages larger than 1.5MB
        - don't retrieve if page is not indexable.

    Yields (_OUTPUT):
    one item per xpath match, converted with util.etree_to_pipes; wrapped as
    {"content": unicode(item)} when useAsString is enabled

    A URL that fails to download, decode or parse is skipped; the failure is
    only reported when context.verbose is set.
    """
    urls = conf['URL']
    if not isinstance(urls, list):
        urls = [urls]

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(item_url, item, **kwargs)
            if context.verbose:
                print("XPathFetchPage: Preparing to download: %s" % (url,))

            try:
                request = urllib2.Request(url)
                request.add_header('User-Agent', 'Yahoo Pipes 1.0')
                response = urllib2.build_opener().open(request)
                try:
                    content_type = response.headers.get('content-type', '')
                    raw = response.read()
                finally:
                    response.close()

                # NOTE(review): the arguments of the original unicode(...) call
                # were missing from the file. Decoding with the charset
                # advertised in Content-Type (utf-8 when none is given) is a
                # reconstruction - confirm.
                if 'charset=' in content_type:
                    charset = content_type.split('charset=')[-1].strip().strip('"\'')
                else:
                    charset = 'utf-8'
                content = unicode(raw, charset)

                # TODO it seems that Yahoo! converts relative links to absolute
                # TODO this needs to be done on the content but seems to be a non-trival
                # TODO task python?

                # Resolve against the current item (as done for the URL), not
                # the _INPUT generator, so subkey-based values can be looked up.
                xpath = util.get_value(conf["xpath"], item, **kwargs)
                html5 = False
                use_as_string = False
                if "html5" in conf:
                    html5 = util.get_value(conf["html5"], item,
                                           **kwargs) == "true"
                if "useAsString" in conf:
                    use_as_string = util.get_value(conf["useAsString"], item,
                                                   **kwargs) == "true"

                if html5:
                    #from lxml.html import html5parser
                    #root = html5parser.fromstring(content)
                    from html5lib import parse
                    # NOTE(review): the keyword arguments of this call were
                    # missing from the file. An lxml tree is needed for the
                    # .xpath() call below, and un-namespaced elements keep
                    # plain xpath expressions working - confirm.
                    root = parse(content,
                                 treebuilder='lxml',
                                 namespaceHTMLElements=False)
                else:
                    from lxml import etree
                    root = etree.HTML(content)
                res_items = root.xpath(xpath)

                if context.verbose:
                    print("XPathFetchPage: found count items: %s" % len(res_items))

                for res_item in res_items:
                    i = util.etree_to_pipes(
                        res_item)  #TODO xml_to_dict(res_item)
                    if context.verbose:
                        print("--------------item data --------------------")
                        print(i)
                        print("--------------EOF item data ----------------")
                    if use_as_string:
                        yield {"content": unicode(i)}
                    else:
                        yield i

            except Exception:
                if context.verbose:
                    print("XPathFetchPage: failed to retrieve from: %s" % (url,))

                    print("----------------- XPathFetchPage -----------------")
                    import traceback
                    traceback.print_exc()
                    print("----------------- XPathFetchPage -----------------")

        if item == True:  #i.e. this is being fed forever, i.e. not in a loop, so we just yield our item once
            break