Exemplo n.º 1
0
def pipe_uniq(context, _INPUT, conf, **kwargs):
    """This operator filters out non unique items according to the specified field.

    The whole input is read and passed through util.multikeysort on the
    field, then only the first item of each run of equal field values is
    yielded. Output therefore follows the order returned by the sort, not
    the original input order.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        field -- field to be unique

    Yields (_OUTPUT):
    source items, one per unique field value
    """
    field = util.get_value(conf['field'], None, **kwargs)
    #sort key with an empty prefix (presumably the sort-direction marker
    #understood by util.multikeysort -- confirm against util)
    order = ['%s%s' % ('', field)]

    #read all and sort
    sorted_input = util.multikeysort(list(_INPUT), order)

    #Use a private sentinel rather than None for "nothing seen yet":
    #starting from None made the first group be skipped entirely whenever
    #its field value was None (e.g. items lacking the field).
    unset = object()
    seen = unset
    for item in sorted_input:
        #todo: do we ever need get_value here instead of item[]?
        v = util.get_subkey(field, item)
        if seen is unset or seen != v:
            yield item
            seen = v
Exemplo n.º 2
0
def _rulepass(rule, item):
    """Test one item against one filter rule.

    Arguments:
    rule -- (field, op, value) tuple; the ops handled are "contains",
            "doesnotcontain", "matches", "is", "greater", "less", "after"
            and "before"
    item -- item whose field is fetched with util.get_subkey

    Returns True if the rule passes, otherwise False. A field value of None
    or an op that is not handled always gives False.
    """
    field, op, value = rule

    data = util.get_subkey(field, item)

    if data is None:
        return False

    #todo check which of these should be case insensitive
    if op == "contains":
        try:
            #an empty value never passes
            if value.lower() and value.lower() in data.lower():  #todo use regex?
                return True
        except UnicodeDecodeError:
            pass
    elif op == "doesnotcontain":
        try:
            #an empty value never passes
            if value.lower() and value.lower() not in data.lower():  #todo use regex?
                return True
        except UnicodeDecodeError:
            pass
    elif op == "matches":
        try:
            if re.search(value, data):
                return True
        except TypeError:
            #data is not something re can search
            return False
    elif op == "is":
        if data == value:
            return True
    elif op == "greater":
        try:
            if Decimal(data) > Decimal(value):
                return True
        except (ArithmeticError, TypeError, ValueError):
            #Not both convertible to Decimal (decimal.InvalidOperation is an
            #ArithmeticError): fall back to comparing the raw values.
            #Deliberately not a bare except, so KeyboardInterrupt/SystemExit
            #are no longer swallowed here.
            if data > value:
                return True
    elif op == "less":
        try:
            if Decimal(data) < Decimal(value):
                return True
        except (ArithmeticError, TypeError, ValueError):
            #same fallback as for "greater"
            if data < value:
                return True
    elif op == "after":
        #todo handle partial datetime values
        #a string value is parsed into a time tuple; data is compared as-is
        if isinstance(value, basestring):
            value = datetime.datetime.strptime(value, util.DATE_FORMAT).timetuple()
        if data > value:
            return True
    elif op == "before":
        #todo handle partial datetime values
        #a string value is parsed into a time tuple; data is compared as-is
        if isinstance(value, basestring):
            value = datetime.datetime.strptime(value, util.DATE_FORMAT).timetuple()
        if data < value:
            return True

    return False
Exemplo n.º 3
0
def transform_to_rss(item, conf):
    """Build a new dict holding the RSS fields configured in conf.

    For every name in RSS_FIELDS, conf[name]['value'] (when present and
    truthy) is used as the subkey to read from item; the result is stored
    under the same name. Names whose lookup raises KeyError are skipped.

    Returns the new dict; item itself is not modified here.
    """
    rss_item = {}
    for name in RSS_FIELDS:
        try:
            source_key = conf[name]['value']
            if source_key:
                rss_item[name] = util.get_subkey(source_key, item)
        except KeyError:
            #field not configured (or lookup failed) - leave it out
            pass
    return rss_item
Exemplo n.º 4
0
def transform_to_rss(item, conf):
    """Build a new dict holding the RSS fields configured in conf.

    For every key in RSS_FIELDS, conf[key]['value'] (when present and
    truthy) is used as the subkey to read from item; the result is stored
    under RSS_FIELDS[key]. Keys whose lookup raises KeyError are skipped.

    Returns the new dict; item itself is not modified here.
    """
    rss_item = {}
    for conf_key in RSS_FIELDS:
        try:
            source_key = conf[conf_key]['value']
            if not source_key:
                continue
            #fetch the value first, then resolve the output name
            fetched = util.get_subkey(source_key, item)
            rss_item[RSS_FIELDS[conf_key]] = fetched
        except KeyError:
            #field not configured (or lookup failed) - leave it out
            pass
    return rss_item
Exemplo n.º 5
0
def _rulepass(rule, item):
    """Test one item against one filter rule.

    Arguments:
    rule -- (field, op, value) tuple; the ops handled are "contains",
            "doesnotcontain", "matches", "is", "greater", "less", "after"
            and "before"
    item -- item whose field is fetched with util.get_subkey

    Returns True if the rule passes, otherwise False. A field value of None
    or an op that is not handled always gives False.
    """
    field, op, value = rule

    data = util.get_subkey(field, item)

    if data is None:
        return False

    #todo check which of these should be case insensitive
    if op == "contains":
        try:
            #an empty value never passes
            if value.lower() and value.lower() in data.lower():  #todo use regex?
                return True
        except UnicodeDecodeError:
            pass
    elif op == "doesnotcontain":
        try:
            #an empty value never passes
            if value.lower() and value.lower() not in data.lower():  #todo use regex?
                return True
        except UnicodeDecodeError:
            pass
    elif op == "matches":
        try:
            if re.search(value, data):
                return True
        except TypeError:
            #data is not something re can search (e.g. a non-string field):
            #treat it as "no match" instead of aborting the whole filter
            return False
    elif op == "is":
        if data == value:
            return True
    elif op == "greater":
        try:
            if Decimal(data) > Decimal(value):
                return True
        except (ArithmeticError, TypeError, ValueError):
            #Not both convertible to Decimal (decimal.InvalidOperation is an
            #ArithmeticError): fall back to comparing the raw values.
            #Deliberately not a bare except, so KeyboardInterrupt/SystemExit
            #are no longer swallowed here.
            if data > value:
                return True
    elif op == "less":
        try:
            if Decimal(data) < Decimal(value):
                return True
        except (ArithmeticError, TypeError, ValueError):
            #same fallback as for "greater"
            if data < value:
                return True
    elif op == "after":
        #todo handle partial datetime values
        #a string value is parsed into a time tuple; data is compared as-is
        if isinstance(value, basestring):
            value = datetime.datetime.strptime(value, util.DATE_FORMAT).timetuple()
        if data > value:
            return True
    elif op == "before":
        #todo handle partial datetime values
        #a string value is parsed into a time tuple; data is compared as-is
        if isinstance(value, basestring):
            value = datetime.datetime.strptime(value, util.DATE_FORMAT).timetuple()
        if data < value:
            return True

    return False
Exemplo n.º 6
0
def pipe_regex(context, _INPUT, conf, **kwargs):
    """This operator replaces values using regexes.

    Each rule names a field, a regex to match and a replacement. For every
    item, each rule's field is read, converted with util.as_unicode, run
    through the regex substitution, and written back with util.set_value.
    Any ${name} reference left in the result is then replaced by the value
    of that subkey of the item.

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        RULE -- rules - each rule comprising (field, match, replace);
                a single rule may be given without the enclosing list

    Yields (_OUTPUT):
    source items after replacing values matching regexes
    """
    rule_defs = conf['RULE']
    if not isinstance(rule_defs, list):
        rule_defs = [rule_defs]

    compiled_rules = []
    for rule_def in rule_defs:
        #todo use the undocumented g,s,m,i flags here: rule['singlelinematch']['value'] == 2 indicates re.DOTALL
        # so use that to pass to re.compile: see here for more http://livedocs.adobe.com/flex/3/html/help.html?content=12_Using_Regular_Expressions_10.html
        pattern_text = util.get_value(rule_def['match'], None, **kwargs) #todo use subkey?
        #compiled once up front: faster, and lets us pass flags
        pattern = re.compile(pattern_text, re.DOTALL)

        replacement = util.get_value(rule_def['replace'], None, **kwargs) #todo use subkey?
        if replacement is None:
            replacement = ''

        #convert regex to Python format: todo use a common routine for this
        #map $1 to \1 etc.   #todo: also need to escape any existing \1 etc.
        replacement = re.sub(r'\$(\d+)', r'\\\1', replacement)

        compiled_rules.append((rule_def['field']['value'], pattern, replacement))

    for item in _INPUT:
        def expand_reference(found):
            #resolve a ${name} reference against the current item
            return unicode(util.get_value({'subkey':found.group(1)}, item))

        for target_field, pattern, replacement in compiled_rules:
            text = util.as_unicode(util.get_subkey(target_field, item))
            text = pattern.sub(replacement, text)
            text = re.sub(r'\$\{([^\}]+)\}', expand_reference, text)
            util.set_value(item, target_field, text)

        yield item
Exemplo n.º 7
0
def pipe_loop(context, _INPUT, conf, embed=None, **kwargs):
    """This operator loops over the input performing the embedded submodule.

    For each source item the embedded submodule is called with a one-element
    input list. Its results are either yielded directly (mode EMIT) or stored
    on the source item, which is then yielded (mode assign).

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        mode -- how to affect output - either assign or EMIT
        assign_to -- if mode is assign, which field to assign to (new or existing)
        assign_part -- 'first' to take only the first submodule result; any other value takes all
        emit_part -- read from conf but not otherwise used in this function
        with -- pass a particular field into the submodule rather than the whole item
        embed -- its ['value']['conf'] is the configuration passed to the submodule
    embed -- embedded submodule, called as embed(context, input_list, conf)

    Yields (_OUTPUT):
    source items after passing through the submodule and adding/replacing values
    (mode assign), or the submodule's own results (mode EMIT)

    Raises:
    Exception -- if mode is neither assign nor EMIT; this is only checked per
                 item, after the submodule has already been run for it
    """
    mode = conf['mode']['value']
    assign_to = conf['assign_to']['value']
    assign_part = conf['assign_part']['value']
    emit_part = conf['emit_part']['value']  #unused below; the lookup still requires the key to exist
    loop_with = conf['with']['value']
    embed_conf = conf['embed']['value']['conf']
    
    #Prepare the submodule to take parameters from the loop instead of from the user
    #(a copy is flagged, so the submodule attribute is set on the copy rather than on context)
    embed_context = copy.copy(context)
    embed_context.submodule = True
    
    for item in _INPUT:        
        #feed the submodule either one field of the item or the whole item
        if loop_with:
            inp = util.get_subkey(loop_with, item)
        else:
            inp = item
            
        #Pass any input parameters into the submodule
        #(rebuilt for every item: each conf entry is resolved against the current item and coerced to unicode)
        embed_context.inputs = {}
        for k in embed_conf:
            embed_context.inputs[k] = unicode(util.get_value(embed_conf[k], item))
        p = embed(embed_context, [inp], embed_conf)  #prepare the submodule
        
        results = None
        try:
            #loop over the submodule, emitting as we go or collecting results for later assignment
            for i in p:
                if assign_part == 'first':
                    #only the first result is used; the rest of p is never consumed
                    if mode == 'EMIT':
                        yield i
                    else:
                        results = i  #the bare result itself, not a list
                    break
                else:  #all
                    if mode == 'EMIT':
                        yield i
                    else:
                        if results:
                            results.append(i)
                        else:
                            results = [i]
            if results and mode == 'assign':
                #this is a hack to make sure fetchpage works in an out of a loop while not disturbing strconcat in a loop etc.
                #(goes with the comment below about checking the delivery capability of the source)
                #Effect: a single dict result gets wrapped once more here, so the
                #unwrapping further down leaves it as a one-element list.
                #NOTE(review): with assign_part == 'first', results is the raw result
                #rather than a list, so len()/[0] are applied to it directly - this
                #looks like it could fail for e.g. a one-key dict or a number; confirm.
                if len(results) == 1 and isinstance(results[0], dict):
                    results = [results]
        except HTTPError:  #todo any other errors we want to continue looping after?
            #skip this item: nothing is assigned and the item is not yielded
            #(anything already yielded in EMIT mode stays emitted)
            if context.verbose:
                print "Submodule gave HTTPError - continuing the loop"
            continue
        
        if mode == 'assign':
            if results and len(results) == 1:  #note: i suspect this needs to be more discerning and only happen if the source can only ever deliver 1 result, e.g. strconcat vs. fetchpage
                results = results[0]           
            #results is still None here if the submodule produced nothing
            util.set_value(item, assign_to, results)
            yield item
        elif mode == 'EMIT':
            pass  #already yielded
        else:
            raise Exception("Invalid mode %s (expecting assign or EMIT)" % mode)
Exemplo n.º 8
0
def pipe_loop(context, _INPUT, conf, embed=None, **kwargs):
    """This operator loops over the input performing the embedded submodule.

    For each source item the embedded submodule is called with a one-element
    input list. Its results are either yielded directly (mode EMIT) or stored
    on the source item, which is then yielded (mode assign).

    Keyword arguments:
    context -- pipeline context
    _INPUT -- source generator
    kwargs -- other inputs, e.g. to feed terminals for rule values
    conf:
        mode -- how to affect output - either assign or EMIT
        assign_to -- if mode is assign, which field to assign to (new or existing)
        assign_part -- 'first' to take only the first submodule result; any other value takes all
        emit_part -- read from conf but not otherwise used in this function
        with -- pass a particular field into the submodule rather than the whole item
        embed -- its ['value']['conf'] is the configuration passed to the submodule
    embed -- embedded submodule, called as embed(context, input_list, conf)

    Yields (_OUTPUT):
    source items after passing through the submodule and adding/replacing values
    (mode assign), or the submodule's own results (mode EMIT)

    Raises:
    Exception -- if mode is neither assign nor EMIT; this is only checked per
                 item, after the submodule has already been run for it
    """
    mode = conf['mode']['value']
    assign_to = conf['assign_to']['value']
    assign_part = conf['assign_part']['value']
    emit_part = conf['emit_part']['value']  #unused below; the lookup still requires the key to exist
    loop_with = conf['with']['value']
    embed_conf = conf['embed']['value']['conf']

    #Prepare the submodule to take parameters from the loop instead of from the user
    #(a copy is flagged, so the submodule attribute is set on the copy rather than on context)
    embed_context = copy.copy(context)
    embed_context.submodule = True

    for item in _INPUT:
        #feed the submodule either one field of the item or the whole item
        if loop_with:
            inp = util.get_subkey(loop_with, item)
        else:
            inp = item

        #Pass any input parameters into the submodule
        #(rebuilt for every item: each conf entry is resolved against the current item and coerced to unicode)
        embed_context.inputs = {}
        for k in embed_conf:
            embed_context.inputs[k] = unicode(
                util.get_value(embed_conf[k], item))
        p = embed(embed_context, [inp], embed_conf)  #prepare the submodule

        results = None
        try:
            #loop over the submodule, emitting as we go or collecting results for later assignment
            for i in p:
                if assign_part == 'first':
                    #only the first result is used; the rest of p is never consumed
                    if mode == 'EMIT':
                        yield i
                    else:
                        results = i  #the bare result itself, not a list
                    break
                else:  #all
                    if mode == 'EMIT':
                        yield i
                    else:
                        if results:
                            results.append(i)
                        else:
                            results = [i]
            if results and mode == 'assign':
                #this is a hack to make sure fetchpage works in an out of a loop while not disturbing strconcat in a loop etc.
                #(goes with the comment below about checking the delivery capability of the source)
                #Effect: a single dict result gets wrapped once more here, so the
                #unwrapping further down leaves it as a one-element list.
                #NOTE(review): with assign_part == 'first', results is the raw result
                #rather than a list, so len()/[0] are applied to it directly - this
                #looks like it could fail for e.g. a one-key dict or a number; confirm.
                if len(results) == 1 and isinstance(results[0], dict):
                    results = [results]
        except HTTPError:  #todo any other errors we want to continue looping after?
            #skip this item: nothing is assigned and the item is not yielded
            #(anything already yielded in EMIT mode stays emitted)
            if context.verbose:
                print "Submodule gave HTTPError - continuing the loop"
            continue

        if mode == 'assign':
            if results and len(
                    results
            ) == 1:  #note: i suspect this needs to be more discerning and only happen if the source can only ever deliver 1 result, e.g. strconcat vs. fetchpage
                results = results[0]
            #results is still None here if the submodule produced nothing
            util.set_value(item, assign_to, results)
            yield item
        elif mode == 'EMIT':
            pass  #already yielded
        else:
            raise Exception("Invalid mode %s (expecting assign or EMIT)" %
                            mode)