Example #1
def pipe_feedautodiscovery(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that searches for and returns feed links found in a page.
    Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : URL -- url

    Yields
    ------
    _OUTPUT : items
    """
    conf = DotDict(conf)
    urls = utils.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = utils.get_abspath(url)

            if context and context.verbose:
                print "pipe_feedautodiscovery loading:", url

            for entry in autorss.getRSSLink(url.encode('utf-8')):
                yield {'link': entry}
                # todo: add rel, type, title

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
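A minimal usage sketch (requires network access; the single placeholder item stands in for pipeforever, and the discovered link is illustrative):

conf = {'URL': {'value': 'http://example.com/blog'}}

for found in pipe_feedautodiscovery(_INPUT=iter([{}]), conf=conf):
    print found  # e.g. {'link': 'http://example.com/blog/rss.xml'}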
Example #2
def pipe_subelement(context=None, _INPUT=None, conf=None, **kwargs):
    """An operator extracts select sub-elements from a feed. Not loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipe2py.modules pipe like object (iterable of items)
    conf : {'path': {'value': <element path>}}

    Yields
    ------
    _OUTPUT : items
    """
    path = DotDict(conf).get('path', **kwargs)

    for item in _INPUT:
        element = DotDict(item).get(path, **kwargs)

        for i in utils.gen_items(element):
            yield {'content': i}

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
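A minimal usage sketch, assuming DotDict.get resolves dotted paths and the {'value': ...} conf wrapper:

conf = {'path': {'value': 'a.b'}}
items = [{'a': {'b': ['one', 'two']}}]

for sub in pipe_subelement(_INPUT=iter(items), conf=conf):
    print sub  # {'content': 'one'}, then {'content': 'two'}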
Example #3
def _gen_rules(rule_defs, **kwargs):
    for rule in rule_defs:
        rule = DotDict(rule)

        # flags = re.DOTALL # DOTALL was the default for pipe2py previously
        # flag 'm'
        flags = re.MULTILINE if 'multilinematch' in rule else 0

        # flag 'i'; this name is reversed from its meaning
        flags |= re.IGNORECASE if 'casematch' in rule else 0

        # flag 's'
        flags |= re.DOTALL if 'singlelinematch' in rule else 0

        # todo: 'globalmatch' is the default in python; if set, re.sub()
        # below should use count=0 (replace all) and otherwise count=1

        # todo: use subkey?
        match = rule.get('match', **kwargs)

        # compile for speed and we need to pass flags
        matchc = re.compile(match, flags)

        # todo: use subkey?
        replace = rule.get('replace', **kwargs) or ''

        # Convert regex to Python format
        # todo: use a common routine for this
        # map $1 to \1 etc.
        # todo: also need to escape any existing \1 etc.
        replace = re.sub(r'\$(\d+)', r'\\\1', replace)
        yield (rule.get('field'), matchc, replace)
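The backreference conversion above, shown in isolation (a standalone sketch: Yahoo Pipes rules use $1-style backreferences, while Python's re.sub expects \1):

import re

replace = re.sub(r'\$(\d+)', r'\\\1', 'Hello $1!')
print repr(replace)  # 'Hello \\1!'
print re.sub('(world)', replace, 'world')  # Hello world!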
Example #4
def pipe_fetchsitefeed(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that fetches and parses the first feed found on one or more
    sites. Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : URL -- url

    Yields
    ------
    _OUTPUT : items
    """
    conf = DotDict(conf)
    urls = utils.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = utils.get_abspath(url)

            if context and context.verbose:
                print "pipe_fetchsitefeed loading:", url

            for link in autorss.getRSSLink(url.encode('utf-8')):
                parsed = speedparser.parse(urlopen(link).read())

                for entry in utils.gen_entries(parsed):
                    yield entry

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
Example #5
def pipe_stringtokenizer(context=None, _INPUT=None, conf=None, **kwargs):
    """Splits a string into tokens delimited by separators.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        to-str -- separator string

    Yields (_OUTPUT):
    tokens of the input string
    """
    conf = DotDict(conf)
    delim = conf.get('to-str', **kwargs)

    for item in _INPUT:
        for chunk in item.split(delim):
            yield {'content': chunk}

        try:
            forever = item.get('forever')
        except AttributeError:
            forever = False

        if forever:
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
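A minimal usage sketch, assuming DotDict.get resolves the {'value': ...} wrapper:

conf = {'to-str': {'value': ','}}

for token in pipe_stringtokenizer(_INPUT=iter(['a,b,c']), conf=conf):
    print token  # {'content': 'a'}, {'content': 'b'}, {'content': 'c'}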
Example #6
def pipe_dateformat(context=None, _INPUT=None, conf=None, **kwargs):
    """Formats a datetime value. Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipedatebuilder pipe like object (iterable of date timetuples)
    conf : {
        'format': {'value': <'%B %d, %Y'>},
        'timezone': {'value': <'EST'>}
    }

    Yields
    ------
    _OUTPUT : formatted dates
    """
    conf = DotDict(conf)
    loop_with = kwargs.pop('with', None)
    date_format = conf.get('format', **kwargs)
    # timezone = conf.get('timezone', **kwargs)

    for item in _INPUT:
        _with = item.get(loop_with, **kwargs) if loop_with else item

        try:
            # todo: check that all PHP formats are covered by Python
            date_string = time.strftime(date_format, _with)
        except TypeError as e:
            if context and context.verbose:
                print 'Error formatting date: %s' % item
                print e

            continue
        else:
            yield date_string
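A minimal usage sketch feeding a timetuple directly (as pipe_datebuilder would):

import time

conf = {'format': {'value': '%B %d, %Y'}}
dates = [time.strptime('2014-12-02', '%Y-%m-%d')]

for text in pipe_dateformat(_INPUT=iter(dates), conf=conf):
    print text  # December 02, 2014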
Example #7
def pipe_dateformat(context=None, _INPUT=None, conf=None, **kwargs):
    """This source formats a date.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        format -- date format

    Yields (_OUTPUT):
    formatted date
    """
    conf = DotDict(conf)
    date_format = conf.get('format', **kwargs)

    for item in _INPUT:
        # parse only when the item isn't already a timetuple; the else
        # branch resets date so a stale value can't leak between items
        date = util.get_date(item) if not hasattr(item, 'tm_year') else None
        date = date.timetuple() if date else item

        if not date:
            raise Exception('Unexpected date: %s' % item)

        try:
            # todo: check that all PHP formats are covered by Python
            date_string = time.strftime(date_format, date)
        except TypeError:
            # silent error handling e.g. if item is not a date
            continue
        else:
            yield date_string
Example #8
def pipe_createrss(context=None, _INPUT=None, conf=None, **kwargs):
    conf = DotDict(conf)

    for item in _INPUT:
        item = DotDict(item)

        yield {
            value: item.get(conf.get(key, **kwargs))
            for key, value in RSS_FIELDS.items()}
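A sketch of the field mapping, assuming a lookup table shaped like RSS_FIELDS = {'title': 'title', 'link': 'link'} (the real table is defined elsewhere in the module):

conf = {'title': {'value': 'headline'}, 'link': {'value': 'url'}}
item = {'headline': 'A post', 'url': 'http://example.com/a-post'}
# pipe_createrss would then yield:
# {'title': 'A post', 'link': 'http://example.com/a-post'}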
Example #9
def _gen_rules(rule_defs, **kwargs):
    rule_defs = util.listize(rule_defs)

    # todo: use subkey?
    for rule_def in rule_defs:
        rule_def = DotDict(rule_def)
        op = rule_def.get('op', **kwargs)
        newfield = {'subkey': rule_def.get('field')}
        newval = rule_def.get('newval', **kwargs)
        yield (op, newfield, newval)
Example #10
def pipe_createrss(context=None, _INPUT=None, conf=None, **kwargs):
    """An operator that converts a source into an RSS stream. Not loopable.

    """
    conf = DotDict(conf)

    for item in _INPUT:
        item = DotDict(item)

        yield {
            value: item.get(conf.get(key, **kwargs))
            for key, value in RSS_FIELDS.items()
        }
Example #11
def pipe_yql(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that issues YQL queries. Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : yqlquery -- YQL query
        # todo: handle envURL

    Yields
    ------
    _OUTPUT : query results
    """
    # todo: get from a config/env file
    url = "http://query.yahooapis.com/v1/public/yql"
    conf = DotDict(conf)
    query = conf['yqlquery']

    for item in _INPUT:
        item = DotDict(item)
        yql = utils.get_value(query, item, **kwargs)

        # note: we use the default format of xml since json loses some
        # structure
        # todo: diagnostics=true e.g. if context.test
        # todo: consider paging for large result sets
        r = requests.get(url, params={'q': yql}, stream=True)

        # Parse the response
        tree = parse(r.raw)

        if context and context.verbose:
            print "pipe_yql loading xml:", yql

        root = tree.getroot()

        # note: query also has row count
        results = root.find('results')

        # Convert xml into generation of dicts
        for element in results.getchildren():
            yield utils.etree_to_dict(element)

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
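A usage sketch (illustrative only: the public YQL endpoint has since been retired, and the query below is a placeholder):

conf = {'yqlquery': {'value': 'select title from rss where url="http://example.com/rss"'}}

for row in pipe_yql(_INPUT=iter([{}]), conf=conf):
    print row  # one dict per child element of <results>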
Example #12
def pipe_tail(context=None, _INPUT=None, conf=None, **kwargs):
    """This operator truncates the number of items in a feed.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- terminal, if the truncation value is wired in
    conf:
        count -- length of the truncated feed, if specified literally

    Yields (_OUTPUT):
    tail-truncated list of source items
    """
    conf = DotDict(conf)
    limit = conf.get('count', func=int, **kwargs)

    for item in deque(_INPUT, limit):
        yield item
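The deque(maxlen) idiom above, in isolation: a bounded deque silently drops older elements, keeping only the last `limit` pushed in.

from collections import deque

print list(deque(xrange(10), 3))  # [7, 8, 9]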
Example #13
def pipe_tail(context=None, _INPUT=None, conf=None, **kwargs):
    """Returns a specified number of items from the bottom of a feed.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipe2py.modules pipe like object (iterable of items)
    kwargs -- terminal, if the truncation value is wired in
    conf : count -- length of the truncated feed, if specified literally

    Yields
    ------
    _OUTPUT : items
    """
    conf = DotDict(conf)
    limit = conf.get('count', func=int, **kwargs)

    for item in deque(_INPUT, limit):
        yield item
Example #14
def pipe_datebuilder(context=None, _INPUT=None, conf=None, **kwargs):
    """A date module that converts a text string into a datetime value. Useful
    as terminal data. Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items
    conf : {'DATE': {'type': 'datetime', 'value': '12/2/2014'}}

    Yields
    ------
    _OUTPUT : date timetuples
    """
    conf = DotDict(conf)

    for item in _INPUT:
        _input = DotDict(item)
        date = utils.get_value(conf['DATE'], _input, **kwargs).lower()

        if date.endswith(' day') or date.endswith(' days'):
            count = int(date.split(' ')[0])
            new_date = dt.today() + timedelta(days=count)
        elif date.endswith(' year') or date.endswith(' years'):
            count = int(date.split(' ')[0])
            new_date = dt.today().replace(year=dt.today().year + count)
        else:
            new_date = SWITCH.get(date)

        if not new_date:
            new_date = utils.get_date(date)

        if not new_date:
            raise Exception('Unrecognized date string: %s' % date)

        yield new_date.timetuple()
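A minimal usage sketch (SWITCH, defined elsewhere, presumably maps keywords such as 'today'):

conf = {'DATE': {'value': '2 days'}}

for tt in pipe_datebuilder(_INPUT=iter([{}]), conf=conf):
    print tt  # timetuple two days from today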
Example #15
def pipe_uniq(context=None, _INPUT=None, conf=None, **kwargs):
    """This operator filters out non unique items according to the specified
    field.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        field -- field to be unique

    Yields (_OUTPUT):
    source items, one per unique field value
    """
    seen = set()
    conf = DotDict(conf)
    field = conf.get('field', **kwargs)

    for item in _INPUT:
        value = DotDict(item).get(field)

        if value not in seen:
            seen.add(value)
            yield item
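A minimal usage sketch: only the first item per distinct 'author' survives.

conf = {'field': {'value': 'author'}}
items = [{'author': 'ann', 'id': 1},
         {'author': 'bob', 'id': 2},
         {'author': 'ann', 'id': 3}]

for unique in pipe_uniq(_INPUT=iter(items), conf=conf):
    print unique['id']  # 1, then 2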
Example #16
def pipe_substr(context=None, _INPUT=None, conf=None, **kwargs):
    """Returns a substring.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    conf:
        from -- starting character
        length -- number of characters to return

    Yields (_OUTPUT):
    portion of source string
    """
    conf = DotDict(conf)
    start = conf.get('from', func=int, **kwargs)
    length = conf.get('length', func=int, **kwargs)

    for item in _INPUT:
        yield item[start:start + length]

        try:
            forever = item.get('forever')
        except AttributeError:  # item may be a plain string
            forever = False

        if forever:
            # _INPUT is pipeforever, not a loop; yield once
            break
Example #17
def pipe_urlbuilder(context=None, _INPUT=None, conf=None, **kwargs):
    """This source builds a url and yields it forever.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- not used
    conf:
        BASE -- base
        PATH -- path elements
        PARAM -- query parameters

    Yields (_OUTPUT):
    url
    """
    conf = DotDict(conf)
    paths = util.listize(conf.get('PATH'))  # use .get() in case 'PATH' isn't set
    param_defs = util.listize(conf['PARAM'])
    url = None

    for item in _INPUT:
        # if _INPUT is pipeforever and not a loop, get values from cache
        if not url:
            item = DotDict(item)
            forever = item.get('forever')
            url = conf.get('BASE', **kwargs)
            if not url.endswith('/'):
                url += '/'
            url += "/".join(str(p) for p in paths if p)
            url = url.rstrip("/")
            url = util.url_quote(url)  # Ensure url is valid
            params = dict(_gen_params(param_defs, item, **kwargs))

            if params and params.keys() != [u'']:
                url += "?" + urllib.urlencode(params)

        yield url
        url = url if forever else None
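A minimal usage sketch; the PARAM shape is an assumption, since _gen_params is defined elsewhere:

conf = {
    'BASE': {'value': 'http://example.com'},
    'PATH': {'value': 'search'},
    'PARAM': [{'key': {'value': 'q'}, 'value': {'value': 'pipes'}}],
}

print pipe_urlbuilder(_INPUT=iter([{}]), conf=conf).next()
# e.g. http://example.com/search?q=pipes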
Example #18
def pipe_createrss(context=None, _INPUT=None, conf=None, **kwargs):
    """An operator that converts a source into an RSS stream. Not loopable.

    """
    conf = DotDict(conf)

    for item in _INPUT:
        item = DotDict(item)

        yield {
            value: item.get(conf.get(key, **kwargs))
            for key, value in RSS_FIELDS.items()
        }
Example #19
def pipe_rssitembuilder(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that builds an rss item. Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever asyncPipe or an iterable of items or fields
    conf : {
        'mediaContentType': {'type': 'text', 'value': ''},
        'mediaContentHeight': {'type': 'text', 'value': ''},
        'mediaContentWidth': {'type': 'text', 'value': ''},
        'mediaContentURL': {'type': 'text', 'value': 'url'},
        'mediaThumbHeight': {'type': 'text', 'value': ''},
        'mediaThumbWidth': {'type': 'text', 'value': ''},
        'mediaThumbURL': {'type': 'text', 'value': 'url'},
        'description': {'type': 'text', 'value': 'description'},
        'pubdate': {'type': 'text', 'value': 'pubdate'},
        'author': {'type': 'text', 'value': 'author'},
        'title': {'type': 'text', 'value': 'title'},
        'link': {'type': 'text', 'value': 'url'},
        'guid': {'type': 'text', 'value': 'guid'},
    }

    Yields
    ------
    _OUTPUT : items
    """
    get_value = partial(utils.get_value, **kwargs)
    pkwargs = utils.combine_dicts({'parse_func': get_value}, kwargs)
    parse_conf = partial(utils.parse_conf, DotDict(conf), **pkwargs)
    get_RSS = lambda key, value: (RSS.get(key, key), value)
    get_YAHOO = lambda key, value: (YAHOO.get(key), value)
    make_dict = lambda func, conf: dict(starmap(func, conf.iteritems()))
    clean_dict = lambda d: dict(i for i in d.items() if all(i))
    funcs = [partial(make_dict, get_RSS), partial(make_dict, get_YAHOO)]

    finite = utils.finitize(_INPUT)
    inputs = imap(DotDict, finite)
    confs = imap(parse_conf, inputs)
    splits = utils.broadcast(confs, *funcs)
    combined = starmap(utils.combine_dicts, splits)
    result = imap(clean_dict, combined)
    _OUTPUT = imap(DotDict, result)
    return _OUTPUT
Example #20
def _get_broadcast_funcs(pieces, ftype='with', **kwargs):
    test = kwargs.pop('pass_if', None)
    listize = kwargs.pop('listize', True)
    parse = kwargs.pop('parse', True)
    pdictize = kwargs.pop('pdictize', True)
    cust_func = kwargs.pop('cust_func', False)
    get_value = partial(utils.get_value, **kwargs)
    get_pass = partial(utils.get_pass, test=test)
    get_with = partial(utils.get_with, **kwargs)

    if parse:
        get_func = partial(utils.parse_conf, parse_func=get_value, **kwargs)
    else:
        get_func = get_value

    if listize:
        listed = utils.listize(pieces)
        piece_defs = map(DotDict, listed) if pdictize else listed
        get_pieces = lambda item: imap(get_func, piece_defs, repeat(item))
    else:
        piece_defs = DotDict(pieces) if pdictize else pieces
        get_pieces = partial(get_func, piece_defs)

    return (get_pieces, get_with, get_pass, cust_func)
Example #21
def pipe_csv(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that fetches and parses a csv file to yield items. Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : URL -- url
        skip -- number of header rows to skip
        col_mode -- column name source: row=header row(s),
                    custom=defined in col_name
        col_name -- list of custom column names
        col_row_start -- first column header row
        col_row_end -- last column header row
        separator -- column separator

    Yields
    ------
    _OUTPUT : items

    Note:
    Current restrictions:
      separator must be 1 character
      assumes every row has exactly the expected number of fields, as defined
      in the header
    """
    conf = DotDict(conf)
    conf_sep = conf['separator']
    conf_mode = conf['col_mode']
    col_name = conf['col_name']

    for item in _INPUT:
        item = DotDict(item)
        url = utils.get_value(conf['URL'], item, **kwargs)
        url = utils.get_abspath(url)
        separator = utils.get_value(conf_sep, item, encode=True, **kwargs)
        skip = int(utils.get_value(conf['skip'], item, **kwargs))
        col_mode = utils.get_value(conf_mode, item, **kwargs)

        f = urlopen(url)

        if context and context.verbose:
            print "pipe_csv loading:", url

        for i in xrange(skip):
            f.next()

        reader = csv.UnicodeReader(f, delimiter=separator)
        fieldnames = []

        if col_mode == 'custom':
            fieldnames = [DotDict(x).get() for x in col_name]
        else:
            fieldnames = _gen_fieldnames(conf, reader, item, **kwargs)

        for rows in reader:
            yield dict(zip(fieldnames, rows))

        f.close()

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
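A minimal usage sketch against a hypothetical local file, with headers taken from the first row (col_mode 'custom' would use col_name instead):

conf = {
    'URL': {'value': 'file:///tmp/prices.csv'},
    'separator': {'value': ','},
    'skip': {'value': '0'},
    'col_mode': {'value': 'row'},
    'col_name': [],
    'col_row_start': {'value': '1'},
    'col_row_end': {'value': '1'},
}

for row in pipe_csv(_INPUT=iter([{}]), conf=conf):
    print row  # e.g. {u'name': u'widget', u'price': u'1.50'}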
Example #22
def pipe_filter(context=None, _INPUT=None, conf=None, **kwargs):
    """An operator that filters for source items matching the given rules.
    Not loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipe2py.modules pipe like object (iterable of items)
    conf : {
        'MODE': {'value': <'permit' or 'block'>},
        'COMBINE': {'value': <'and' or 'or'>}
        'RULE': [
            {
                'field': {'value': 'search field'},
                'op': {'value': 'one of SWITCH above'},
                'value': {'value': 'search term'}
            }
        ]
    }

    kwargs : other inputs, e.g., to feed terminals for rule values

    Returns
    -------
    _OUTPUT : generator of filtered items

    Examples
    --------
    >>> import os.path as p
    >>> from pipe2py.modules.pipeforever import pipe_forever
    >>> from pipe2py.modules.pipefetchdata import pipe_fetchdata
    >>> parent = p.dirname(p.dirname(__file__))
    >>> file_name = p.abspath(p.join(parent, 'data', 'gigs.json'))
    >>> path = 'value.items'
    >>> url = 'file://%s' % file_name
    >>> conf = {'URL': {'value': url}, 'path': {'value': path}}
    >>> input = pipe_fetchdata(_INPUT=pipe_forever(), conf=conf)
    >>> mode = {'value': 'permit'}
    >>> combine = {'value': 'and'}
    >>> rule = [{'field': {'value': 'title'}, 'op': {'value': 'contains'}, \
'value': {'value': 'web'}}]
    >>> conf = {'MODE': mode, 'COMBINE': combine, 'RULE': rule}
    >>> pipe_filter(_INPUT=input, conf=conf).next()['title']
    u'E-Commerce Website Developer | Elance Job'
    >>> rule = [{'field': {'value': 'title'}, 'op': {'value': 'contains'}, \
'value': {'value': 'kjhlked'}}]
    >>> conf = {'MODE': mode, 'COMBINE': combine, 'RULE': rule}
    >>> list(pipe_filter(_INPUT=input, conf=conf))
    []
    """
    conf = DotDict(conf)
    test = kwargs.pop('pass_if', None)
    permit = conf.get('MODE', **kwargs) == 'permit'
    combine = conf.get('COMBINE', **kwargs)

    if combine not in {'and', 'or'}:
        raise Exception("Invalid combine: %s. (Expected 'and' or 'or')" %
                        combine)

    rule_defs = map(DotDict, utils.listize(conf['RULE']))
    get_pass = partial(utils.get_pass, test=test)
    get_value = partial(utils.get_value, **kwargs)
    parse_conf = partial(utils.parse_conf, parse_func=get_value, **kwargs)
    get_rules = lambda i: imap(parse_conf, rule_defs, repeat(i))
    funcs = [COMBINE_BOOLEAN[combine], utils.passthrough, utils.passthrough]

    inputs = imap(DotDict, _INPUT)
    splits = utils.broadcast(inputs, get_rules, utils.passthrough, get_pass)
    outputs = starmap(partial(parse_rules, **kwargs), splits)
    parsed = utils.dispatch(outputs, *funcs)
    gathered = starmap(partial(parse_result, permit=permit), parsed)
    _OUTPUT = ifilter(None, gathered)
    return _OUTPUT
Example #23
def pipe_fetchpage(context=None, _INPUT=None, conf=None, **kwargs):
    """Fetch Page module

    _INPUT -- not used since this does not have inputs.

    conf:
       URL -- url object containing the URL to download
       from -- string from where to start the input
       to -- string to limit the input
       token -- if present, split the input on this token to generate items

       Description: http://pipes.yahoo.com/pipes/docs?doc=sources#FetchPage

       TODOS:
        - don't retrieve pages larger than 200k
        - don't retrieve if page is not indexable.
        - item delimiter removes the closing tag if using a HTML tag
          (not documented but happens)
        - items should be cleaned, i.e. stripped of HTML tags
    """
    conf = DotDict(conf)
    split_token = conf.get('token', **kwargs)
    urls = util.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = util.get_abspath(url)

            if not url:
                continue

            f = urlopen(url)

            # TODO: it seems that Yahoo! converts relative links to
            # absolute. This needs to be done on the content, but seems
            # to be a non-trivial task in Python.
            content = unicode(f.read(), 'utf-8')

            if context and context.verbose:
                print '............Content .................'
                print content
                print '...............EOF...................'

            parsed = _parse_content(content, conf, **kwargs)
            items = parsed.split(split_token) if split_token else [parsed]

            if context and context.verbose:
                print "FetchPage: found", len(items), "items"

            for i in items:
                if context and context.verbose:
                    print "--------------item data --------------------"
                    print i
                    print "--------------EOF item data ----------------"

                yield {"content": i}

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
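A minimal usage sketch (requires network access; _parse_content, defined elsewhere, presumably applies the from/to markers before the split on token):

conf = {
    'URL': {'value': 'http://example.com'},
    'from': {'value': '<ul>'},
    'to': {'value': '</ul>'},
    'token': {'value': '<li>'},
}

for piece in pipe_fetchpage(_INPUT=iter([{}]), conf=conf):
    print piece['content']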
Example #24
def pipe_csv(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that fetches and parses a csv file to yield items. Loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : URL -- url
        skip -- number of header rows to skip
        col_mode -- column name source: row=header row(s),
                    custom=defined in col_name
        col_name -- list of custom column names
        col_row_start -- first column header row
        col_row_end -- last column header row
        separator -- column separator

    Yields
    ------
    _OUTPUT : items

    Note:
    Current restrictions:
      separator must be 1 character
      assumes every row has exactly the expected number of fields, as defined
      in the header
    """
    conf = DotDict(conf)
    conf_sep = conf["separator"]
    conf_mode = conf["col_mode"]
    col_name = conf["col_name"]

    for item in _INPUT:
        item = DotDict(item)
        url = utils.get_value(conf["URL"], item, **kwargs)
        url = utils.get_abspath(url)
        separator = utils.get_value(conf_sep, item, encode=True, **kwargs)
        skip = int(utils.get_value(conf["skip"], item, **kwargs))
        col_mode = utils.get_value(conf_mode, item, **kwargs)

        f = urlopen(url)

        if context and context.verbose:
            print "pipe_csv loading:", url

        for i in xrange(skip):
            f.next()

        reader = csv.UnicodeReader(f, delimiter=separator)
        fieldnames = []

        if col_mode == "custom":
            fieldnames = [DotDict(x).get() for x in col_name]
        else:
            fieldnames = _gen_fieldnames(conf, reader, item, **kwargs)

        for rows in reader:
            yield dict(zip(fieldnames, rows))

        f.close()

        if item.get("forever"):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
Example #25
def pipe_csv(context=None, _INPUT=None, conf=None, **kwargs):
    """This source fetches and parses a csv file to yield items.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- not used
    conf:
        URL -- url
        skip -- number of header rows to skip
        col_mode -- column name source: row=header row(s),
                    custom=defined in col_name
        col_name -- list of custom column names
        col_row_start -- first column header row
        col_row_end -- last column header row
        separator -- column separator

    Yields (_OUTPUT):
    file entries

    Note:
    Current restrictions:
      separator must be 1 character
      assumes every row has exactly the expected number of fields, as defined
      in the header
    """
    conf = DotDict(conf)
    conf_sep = conf['separator']
    conf_mode = conf['col_mode']
    col_name = conf['col_name']

    for item in _INPUT:
        item = DotDict(item)
        url = util.get_value(conf['URL'], item, **kwargs)
        url = util.get_abspath(url)
        separator = util.get_value(conf_sep, item, encode=True, **kwargs)
        skip = util.get_value(conf['skip'], item, func=int, **kwargs)
        col_mode = util.get_value(conf_mode, item, **kwargs)

        f = urlopen(url)

        if context and context.verbose:
            print "pipe_csv loading:", url

        for i in xrange(skip):
            f.next()

        reader = csv.UnicodeReader(f, delimiter=separator)
        fieldnames = []

        if col_mode == 'custom':
            fieldnames = [DotDict(x).get() for x in col_name]
        else:
            fieldnames = _gen_fieldnames(conf, reader, item, **kwargs)

        for rows in reader:
            yield dict(zip(fieldnames, rows))

        f.close()

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
Example #26
def pipe_regex(context=None, _INPUT=None, conf=None, **kwargs):
    """Applies regex rules to _INPUT items.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : source generator of dicts
    conf: dict
        {
            'RULE': [
                {
                    'field': {'value': 'search field'},
                    'match': {'value': 'regex'},
                    'replace': {'value': 'replacement'}
                }
            ]
        }

    Yields
    ------
    _OUTPUT : source pipe items post regexes application
    """
    rule_defs = util.listize(conf['RULE'])

    # use a list because the iterator gets used up if there are no
    # matching feeds
    rules = list(_gen_rules(rule_defs, **kwargs))

    for item in _INPUT:
        item = DotDict(item)

        def sub_fields(matchobj):
            return item.get(matchobj.group(1), **kwargs)

        for rule in rules:
            # todo: do we ever need get_value here instead of item[]?
            # When the subject being examined is an HTML node rather than a
            # string, unicode() converts the dict representing the node into
            # a dict literal and the pattern is applied to that literal. An
            # HTML element node may have attributes, which then also appear
            # in the literal, so matching (and replacing) should be limited
            # to the `.content` subelement. It isn't certain that the check
            # below works across the board, nor that this is the right way
            # to detect an HTML node as opposed to a plain string.
            if rule[0] in item and item[rule[0]]:
                sub_string = r'\$\{(.+?)\}'

                if (
                    hasattr(item[rule[0]], 'keys')
                    and 'content' in item[rule[0]]
                ):
                    # this looks like an HTML node, so only do substitution on
                    # the content of the node possible gotcha: the content
                    # might be a subtree, in which case we revert to modifying
                    # the literal of the subtree dict
                    args1 = _get_args(item, rule, rule[1], rule[2], 'content')
                    args2 = _get_args(item, rule, sub_string, sub_fields)
                else:
                    args1 = _get_args(item, rule, rule[1], rule[2])
                    args2 = _get_args(item, rule, sub_string, sub_fields)

                item.set(*args1)
                item.set(*args2)

        yield item
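A minimal usage sketch, rewriting one field per rule (assuming the _gen_rules helper shown in Example #3):

conf = {'RULE': [{
    'field': {'value': 'title'},
    'match': {'value': 'cat'},
    'replace': {'value': 'dog'},
}]}

for fixed in pipe_regex(_INPUT=iter([{'title': 'cat nap'}]), conf=conf):
    print fixed['title']  # e.g. dog nap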
Example #27
def pipe_fetchpage(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that fetches the content of a given web site as a string.
    Loopable.

    context : pipe2py.Context object
    _INPUT : pipeforever asyncPipe or an iterable of items or fields

    conf : dict
       URL -- url object containing the URL to download
       from -- string from where to start the input
       to -- string to limit the input
       token -- if present, split the input on this token to generate items

       Description: http://pipes.yahoo.com/pipes/docs?doc=sources#FetchPage

       TODOS:
        - don't retrieve pages larger than 200k
        - don't retrieve if page is not indexable.
        - item delimiter removes the closing tag if using a HTML tag
          (not documented but happens)
        - items should be cleaned, i.e. stripped of HTML tags

    Yields
    ------
    _OUTPUT : items
    """
    conf = DotDict(conf)
    split_token = conf.get('token', **kwargs)
    urls = utils.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = utils.get_abspath(url)

            if not url:
                continue

            f = urlopen(url)

            # TODO: it seems that Yahoo! converts relative links to
            # absolute. This needs to be done on the content, but seems
            # to be a non-trivial task in Python.
            content = unicode(f.read(), 'utf-8')

            if context and context.verbose:
                print '............Content .................'
                print content
                print '...............EOF...................'

            parsed = _parse_content(content, conf, **kwargs)
            items = parsed.split(split_token) if split_token else [parsed]

            if context and context.verbose:
                print "FetchPage: found", len(items), "items"

            for i in items:
                if context and context.verbose:
                    print "--------------item data --------------------"
                    print i
                    print "--------------EOF item data ----------------"

                yield {"content": i}

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
Example #28
def pipe_filter(context=None, _INPUT=None, conf=None, **kwargs):
    """An operator that filters for source items matching the given rules.
    Not loopable.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : pipe2py.modules pipe like object (iterable of items)
    conf : {
        'MODE': {'value': <'permit' or 'block'>},
        'COMBINE': {'value': <'and' or 'or'>}
        'RULE': [
            {
                'field': {'value': 'search field'},
                'op': {'value': 'one of SWITCH above'},
                'value': {'value': 'search term'}
            }
        ]
    }

    kwargs : other inputs, e.g., to feed terminals for rule values

    Returns
    -------
    _OUTPUT : generator of filtered items

    Examples
    --------
    >>> import os.path as p
    >>> from pipe2py.modules.pipeforever import pipe_forever
    >>> from pipe2py.modules.pipefetchdata import pipe_fetchdata
    >>> parent = p.dirname(p.dirname(__file__))
    >>> file_name = p.abspath(p.join(parent, 'data', 'gigs.json'))
    >>> path = 'value.items'
    >>> url = 'file://%s' % file_name
    >>> conf = {'URL': {'value': url}, 'path': {'value': path}}
    >>> input = pipe_fetchdata(_INPUT=pipe_forever(), conf=conf)
    >>> mode = {'value': 'permit'}
    >>> combine = {'value': 'and'}
    >>> rule = [{'field': {'value': 'title'}, 'op': {'value': 'contains'}, \
'value': {'value': 'web'}}]
    >>> conf = {'MODE': mode, 'COMBINE': combine, 'RULE': rule}
    >>> pipe_filter(_INPUT=input, conf=conf).next()['title']
    u'E-Commerce Website Developer | Elance Job'
    >>> rule = [{'field': {'value': 'title'}, 'op': {'value': 'contains'}, \
'value': {'value': 'kjhlked'}}]
    >>> conf = {'MODE': mode, 'COMBINE': combine, 'RULE': rule}
    >>> list(pipe_filter(_INPUT=input, conf=conf))
    []
    """
    conf = DotDict(conf)
    test = kwargs.pop('pass_if', None)
    permit = conf.get('MODE', **kwargs) == 'permit'
    combine = conf.get('COMBINE', **kwargs)

    if combine not in {'and', 'or'}:
        raise Exception(
            "Invalid combine: %s. (Expected 'and' or 'or')" % combine)

    rule_defs = map(DotDict, utils.listize(conf['RULE']))
    get_pass = partial(utils.get_pass, test=test)
    get_value = partial(utils.get_value, **kwargs)
    parse_conf = partial(utils.parse_conf, parse_func=get_value, **kwargs)
    get_rules = lambda i: imap(parse_conf, rule_defs, repeat(i))
    funcs = [COMBINE_BOOLEAN[combine], utils.passthrough, utils.passthrough]

    inputs = imap(DotDict, _INPUT)
    splits = utils.broadcast(inputs, get_rules, utils.passthrough, get_pass)
    outputs = starmap(partial(parse_rules, **kwargs), splits)
    parsed = utils.dispatch(outputs, *funcs)
    gathered = starmap(partial(parse_result, permit=permit), parsed)
    _OUTPUT = ifilter(None, gathered)
    return _OUTPUT
Example #29
def pipe_fetchdata(context=None, _INPUT=None, conf=None, **kwargs):
    """Fetches and parses an XML or JSON file.

    Parameters
    ----------
    context : pipe2py.Context object
    _INPUT : source generator of dicts
    conf : dict
        {
            'URL': {'value': url},
            'path': {'value': dot separated path to data list}
        }

    Yields
    ------
    _OUTPUT : pipe items fetched from source

    Examples
    --------
    >>> from os import path as p
    >>> from pipe2py.modules.pipeforever import pipe_forever
    >>> parent = p.dirname(p.dirname(__file__))
    >>> abspath = p.abspath(p.join(parent, 'data', 'gigs.json'))
    >>> path = 'value.items'
    >>> url = "file://%s" % abspath
    >>> conf = {'URL': {'value': url}, 'path': {'value': path}}
    >>> pipe_fetchdata(_INPUT=pipe_forever(), conf=conf).next().keys()[:5]
    [u'y:repeatcount', u'description', u'pubDate', u'title', u'y:published']
    >>> abspath = p.abspath(p.join(parent, 'data', 'places.xml'))
    >>> path = 'appointment'
    >>> url = "file://%s" % abspath
    >>> conf = {'URL': {'value': url}, 'path': {'value': path}}
    >>> sorted(pipe_fetchdata(_INPUT=pipe_forever(), conf=conf).next().keys())
    ['alarmTime', 'begin', 'duration', 'places', 'subject', 'uid']
    >>> conf = {'URL': {'value': url}, 'path': {'value': ''}}
    >>> sorted(pipe_fetchdata(_INPUT=pipe_forever(), conf=conf).next().keys())
    ['appointment', 'reminder']
    """
    # todo: iCal and KML
    conf = DotDict(conf)
    urls = util.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            item = DotDict(item)
            url = util.get_value(DotDict(item_url), item, **kwargs)
            url = util.get_abspath(url)
            f = urlopen(url)
            path = util.get_value(conf['path'], item, **kwargs)
            split_path = path.split(".") if path else []
            res = {}

            try:
                tree = objectify.parse(f)
                root = tree.getroot()
            except XMLSyntaxError:
                if context and context.verbose:
                    print "pipe_fetchdata loading json:", url

                f = urlopen(url)
                element = loads(f.read())
            else:
                if context and context.verbose:
                    print "pipe_fetchdata loading xml:", url

                # print etree.tostring(element, pretty_print=True)
                element = util.etree_to_dict(root)
            finally:
                res = _parse_dict(split_path, element) if element else None

                for i in util.gen_items(res, True):
                    yield i

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
Example #30
def _gen_order(keys, **kwargs):
    for key in keys:
        key = DotDict(key)
        field = key.get('field', **kwargs)
        sort_dir = key.get('dir', **kwargs)
        yield '%s%s' % ('-' if sort_dir == 'DESC' else '', field)
Example #31
def pipe_loop(context, _INPUT, conf, embed=None, **kwargs):
    """This operator loops over the input performing the embedded submodule.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        mode -- how to affect output - either assign or EMIT
        assign_to -- if mode is assign, which field to assign to
            (new or existing)

        loop_with -- pass a particular field into the submodule rather than the
            whole item
    embed -- embedded submodule

    Yields (_OUTPUT):
    source items after passing through the submodule and adding/replacing values
    """
    conf = DotDict(conf)
    mode = conf.get('mode')
    assign_to = conf.get('assign_to')
    assign_part = conf.get('assign_part')
    # TODO: what is this for??
    # emit_part = conf.get('emit_part')
    loop_with = conf.get('with')
    embed_conf = conf.get('embed')['conf']

    # Prepare the submodule to take parameters from the loop instead of from
    # the user
    embed_context = copy(context)
    embed_context.submodule = True

    for item in _INPUT:
        item = DotDict(item)
        inp = item.get(loop_with, **kwargs) if loop_with else item

        # prepare the submodule
        embed_context.inputs = dict(_gen_inputs(item, embed_conf))
        submodule = embed(embed_context, [inp], embed_conf)
        first = assign_part == 'first'
        results = _gen_results(submodule, mode, first)

        if not results:
            continue
        elif mode == 'EMIT':
            for i in results:
                yield i
        elif mode == 'assign':
            results = list(results)

            # this is a hack to make sure fetchpage works in and out of a
            # loop while not disturbing strconcat in a loop, etc.
            # note: I suspect this needs to be more discerning and only
            # happen if the source can only ever deliver 1 result, e.g.
            # strconcat vs. fetchpage
            if len(results) == 1 and not hasattr(results[0], 'keys'):
                results = results[0]

            item.set(assign_to, results)
            yield item
        else:
            raise Exception(
                "Invalid mode: %s. (Expected 'assign' or 'EMIT')" % mode)
Example #32
def pipe_xpathfetchpage(context=None, _INPUT=None, conf=None, **kwargs):
    """XPath Fetch Page module

    _INPUT -- not used since this does not have inputs.

    conf:
       URL -- url object containing the URL to download
       xpath -- xpath to extract
       html5 -- use html5 parser?
       useAsString -- emit items as string?

       Description: http://pipes.yahoo.com/pipes/docs?doc=sources#XPathFetchPage

       TODOS:
        - don't retrieve pages larger than 1.5MB
        - don't retrieve if page is not indexable.
    """
    conf = DotDict(conf)
    urls = util.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = util.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = util.get_abspath(url)
            f = urlopen(url)

            # TODO: it seems that Yahoo! converts relative links to
            # absolute. This needs to be done on the content, but seems
            # to be a non-trivial task in Python.
            content = unicode(f.read(), 'utf-8')

            if context and context.verbose:
                print '............Content .................'
                print content
                print '...............EOF...................'

            xpath = conf.get('xpath', **kwargs)
            html5 = conf.get('html5', **kwargs) == 'true'
            use_as_string = conf.get('useAsString', **kwargs) == 'true'
            # note: f was consumed by f.read() above, so parse the saved
            # content instead (assumes StringIO.StringIO is imported)
            doc = StringIO(content.encode('utf-8'))
            tree = html5parser.parse(doc) if html5 else html.parse(doc)
            root = tree.getroot()
            items = root.xpath(xpath)

            if context and context.verbose:
                print 'XPathFetchPage: found', len(items), 'items'

            for etree in items:
                i = util.etree_to_dict(etree)

                if context and context.verbose:
                    print '--------------item data --------------------'
                    print i
                    print '--------------EOF item data ----------------'

                if use_as_string:
                    yield {'content': unicode(i)}
                else:
                    yield i

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
Example #33
def pipe_xpathfetchpage(context=None, _INPUT=None, conf=None, **kwargs):
    """A source that fetches the content of a given website as DOM nodes or a
    string. Loopable.

    context : pipe2py.Context object
    _INPUT : pipeforever pipe or an iterable of items or fields
    conf : dict
       URL -- url object containing the URL to download
       xpath -- xpath to extract
       html5 -- use html5 parser?
       useAsString -- emit items as string?

       TODOS:
        - don't retrieve pages larger than 1.5MB
        - don't retrieve if page is not indexable.

    Yields
    ------
    _OUTPUT : items
    """
    conf = DotDict(conf)
    urls = utils.listize(conf['URL'])

    for item in _INPUT:
        for item_url in urls:
            url = utils.get_value(DotDict(item_url), DotDict(item), **kwargs)
            url = utils.get_abspath(url)
            f = urlopen(url)

            # TODO: it seems that Yahoo! converts relative links to
            # absolute. This needs to be done on the content, but seems
            # to be a non-trivial task in Python.
            content = unicode(f.read(), 'utf-8')

            if context and context.verbose:
                print '............Content .................'
                print content
                print '...............EOF...................'

            xpath = conf.get('xpath', **kwargs)
            html5 = conf.get('html5', **kwargs) == 'true'
            use_as_string = conf.get('useAsString', **kwargs) == 'true'
            # note: f was consumed by f.read() above, so parse the saved
            # content instead (assumes StringIO.StringIO is imported)
            doc = StringIO(content.encode('utf-8'))
            tree = html5parser.parse(doc) if html5 else html.parse(doc)
            root = tree.getroot()
            items = root.xpath(xpath)

            if context and context.verbose:
                print 'XPathFetchPage: found', len(items), 'items'

            for etree in items:
                i = utils.etree_to_dict(etree)

                if context and context.verbose:
                    print '--------------item data --------------------'
                    print i
                    print '--------------EOF item data ----------------'

                if use_as_string:
                    yield {'content': unicode(i)}
                else:
                    yield i

        if item.get('forever'):
            # _INPUT is pipeforever and not a loop,
            # so we just yield our item once
            break
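A minimal usage sketch (requires network access; matched nodes come back as dicts via utils.etree_to_dict):

conf = {
    'URL': {'value': 'http://example.com'},
    'xpath': {'value': '//a'},
    'html5': {'value': 'false'},
    'useAsString': {'value': 'false'},
}

for node in pipe_xpathfetchpage(_INPUT=iter([{}]), conf=conf):
    print node  # one dict per matched <a> element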