def count_by(*args):
    """Wrap ``assembly.CountBy`` in a ``SubAssembly``.

    Up to the first two positional arguments are converted with
    ``coerce_to_fields``; any further arguments are forwarded untouched.
    """
    params = list(args)
    # Slice assignment converts only the entries that actually exist, so
    # calls with zero or one argument are handled without explicit guards.
    params[:2] = [coerce_to_fields(item) for item in params[:2]]
    return SubAssembly(assembly.CountBy, *params)
def average_by(*args):
    """Wrap ``assembly.AverageBy`` in a ``SubAssembly``.

    Up to the first three positional arguments are converted with
    ``coerce_to_fields``; any further arguments are forwarded untouched.
    """
    params = list(args)
    # Only the leading entries that are present get converted.
    params[:3] = [coerce_to_fields(item) for item in params[:3]]
    return SubAssembly(assembly.AverageBy, *params)
def _create_with_parent(self, parent):
    """Build an ``Each`` on top of ``parent`` and wrap it in a named ``Pipe``.

    The ``Each`` receives, in order: the argument selector (only if set),
    the stored function, and the output selector (only if set). Both
    selectors are converted with ``coerce_to_fields``.
    """
    each_args = []
    argument_selector = self.__argument_selector
    if argument_selector:
        each_args.append(coerce_to_fields(argument_selector))
    each_args.append(self.__function)
    output_selector = self.__output_selector
    if output_selector:
        each_args.append(coerce_to_fields(output_selector))

    each_pipe = cascading.pipe.Each(parent.get_assembly(), *each_args)
    # The Each is followed by an extra, uniquely named Pipe: otherwise joins
    # may not work, as Cascading apparently requires pipe names to differ.
    return cascading.pipe.Pipe(random_pipe_name('each'), each_pipe)
def rename(*args):
    """Rename fields in the tuple stream.

    Called with a single argument (a list of names), every field is renamed
    and the argument supplies the new names. Otherwise the first argument
    lists the fields to be renamed and the second lists their new names.
    """
    if len(args) != 1:
        old_names, new_names = args[0], args[1]
    else:
        # Only the new names were given: apply them to all fields.
        old_names, new_names = Fields.ALL, args[0]
    return SubAssembly(
        cascading.pipe.assembly.Rename,
        coerce_to_fields(old_names),
        coerce_to_fields(new_names))
def SelectFields(fields):
    """Keep only the given fields in the tuple stream.

    Arguments:
    fields -- a list of fields to keep, or a Cascading Fields wildcard
    """
    select = com.twitter.pycascading.SelectFields
    return select(coerce_to_fields(fields))
def un_group(*args):
    """Create a ``function.UnGroup`` from field-like positional arguments.

    Conversion rules, applied with ``coerce_to_fields``:
    - the first argument is converted directly;
    - the second argument is converted element by element when it is a
      list or tuple, and as a whole otherwise;
    - the third argument is converted element by element when it is a
      list or tuple, and is passed through unchanged otherwise.
    Any remaining arguments are forwarded as they are.
    """
    params = list(args)
    if params:
        params[0] = coerce_to_fields(params[0])
    if len(params) > 1:
        second = params[1]
        if isinstance(second, (list, tuple)):
            params[1] = [coerce_to_fields(item) for item in second]
        else:
            params[1] = coerce_to_fields(second)
    if len(params) > 2:
        third = params[2]
        # A non-sequence third argument is left exactly as given.
        if isinstance(third, (list, tuple)):
            params[2] = [coerce_to_fields(item) for item in third]
    return function.UnGroup(*params)
def __create_args(self, pipe=None, aggregator=None, output_selector=None,
                  assertion_level=None, assertion=None, buffer=None,
                  argument_selector=None):
    """Assemble the positional argument list from the keyword options.

    If ``self.__args`` is non-empty, its first element overrides
    ``aggregator`` (when it is a ``cascading.operation.Aggregator``) or
    ``buffer`` (anything else).

    The returned list holds, in order: ``pipe.get_assembly()``, the argument
    selector, then -- for each of aggregator / assertion / buffer that was
    supplied -- the corresponding entries (wrapped operation plus output
    selector, or assertion level plus assertion).
    """
    if self.__args:
        # An unnamed argument is classified by its type.
        unnamed = self.__args[0]
        if isinstance(unnamed, cascading.operation.Aggregator):
            aggregator = unnamed
        else:
            buffer = unnamed

    # Fill in defaults for selectors that were not given.
    if argument_selector is None:
        argument_selector = cascading.tuple.Fields.ALL
    if output_selector is None:
        if aggregator is None:
            output_selector = cascading.tuple.Fields.RESULTS
        else:
            # With an aggregator we want both the groupings and the results.
            output_selector = cascading.tuple.Fields.ALL

    args = [pipe.get_assembly()]
    if argument_selector is not None:
        args.append(coerce_to_fields(argument_selector))
    if aggregator is not None:
        # For now the aggregator is assumed to be a plain Cascading one.
        args.append(wrap_function(aggregator, CascadingAggregatorWrapper))
        if output_selector:
            args.append(coerce_to_fields(output_selector))
    if assertion_level is not None:
        args.extend([assertion_level, assertion])
    if buffer is not None:
        args.append(wrap_function(buffer, CascadingBufferWrapper))
        if output_selector:
            args.append(coerce_to_fields(output_selector))
    return args
def __create_args(self, group_name=None, pipes=None, group_fields=None,
                  sort_fields=None, reverse_order=None, pipe=None,
                  lhs_pipe=None, rhs_pipe=None):
    """Assemble a positional argument list from the keyword options.

    The list starts with ``group_name`` when it is truthy, followed by the
    entries for exactly one input form, checked in this order:

    - ``pipes``: a list of each pipe's ``get_assembly()`` result;
    - ``pipe``: that pipe's ``get_assembly()`` result;
    - ``lhs_pipe``: the assemblies of ``lhs_pipe`` and ``rhs_pipe`` followed
      by the coerced ``group_fields``.

    For the first two forms the optional ``group_fields``, ``sort_fields``
    and ``reverse_order`` are appended when truthy. If ``self.__args`` is
    non-empty, its first element replaces ``group_fields``.

    NOTE(review): the nesting of the ``sort_fields`` / ``reverse_order``
    checks under ``if group_fields`` was reconstructed from a source whose
    indentation had been lost -- confirm against the original layout.
    """
    # We can use an unnamed parameter only for group_fields
    if self.__args:
        group_fields = coerce_to_fields(self.__args[0])
    args = []
    if group_name:
        args.append(group_name)
    if pipes:
        # Several input pipes: pass their assemblies as one list.
        args.append([p.get_assembly() for p in pipes])
        if group_fields:
            args.append(coerce_to_fields(group_fields))
            if sort_fields:
                args.append(coerce_to_fields(sort_fields))
            if reverse_order:
                args.append(reverse_order)
    elif pipe:
        # Single input pipe.
        args.append(pipe.get_assembly())
        if group_fields:
            args.append(coerce_to_fields(group_fields))
            if sort_fields:
                args.append(coerce_to_fields(sort_fields))
            if reverse_order:
                args.append(reverse_order)
    elif lhs_pipe:
        # Two-sided form: rhs_pipe and group_fields are used unconditionally.
        args.append(lhs_pipe.get_assembly())
        args.append(rhs_pipe.get_assembly())
        args.append(coerce_to_fields(group_fields))
    return args
def average(*args):
    """Create an ``aggregator.Average``.

    The first positional argument, if any, is converted with
    ``coerce_to_fields``; the rest are forwarded unchanged.
    """
    if not args:
        return aggregator.Average()
    params = list(args)
    params[0] = coerce_to_fields(params[0])
    return aggregator.Average(*params)
def unique(*args):
    """Wrap ``assembly.Unique`` in a ``SubAssembly``.

    The first positional argument is converted with ``coerce_to_fields``;
    the rest are forwarded unchanged. The first argument is accessed
    without a guard, so calling this with no arguments raises IndexError.
    """
    params = list(args)
    leading = coerce_to_fields(params[0])
    return SubAssembly(assembly.Unique, leading, *params[1:])
def sum_by(*args):
    """Wrap ``assembly.SumBy`` in a ``SubAssembly``.

    The first three positional arguments are converted with
    ``coerce_to_fields``; anything after them is forwarded unchanged.
    Fewer than three arguments raises IndexError.
    """
    params = list(args)
    # SumBy has at least 3 parameters, so all three slots are indexed
    # unconditionally.
    for position in (0, 1, 2):
        params[position] = coerce_to_fields(params[position])
    return SubAssembly(assembly.SumBy, *params)
def min(*args):
    """Create an ``aggregator.Min``.

    The first positional argument, if any, is converted with
    ``coerce_to_fields``; the rest are forwarded unchanged.
    """
    params = list(args)
    # A one-element slice is empty when no arguments were given, so this
    # converts the first argument only if it exists.
    params[:1] = [coerce_to_fields(item) for item in params[:1]]
    return aggregator.Min(*params)
def sum(*args):
    """Create an ``aggregator.Sum``.

    The first positional argument, if any, is converted with
    ``coerce_to_fields``; the rest are forwarded unchanged.
    """
    params = list(args)
    if len(params) > 0:
        first = params[0]
        params[0] = coerce_to_fields(first)
    return aggregator.Sum(*params)
def __create_args(self, group_name=None, pipes=None, group_fields=None,
                  declared_fields=None, result_group_fields=None, joiner=None,
                  pipe=None, num_self_joins=None, lhs=None,
                  lhs_group_fields=None, rhs=None, rhs_group_fields=None):
    """Assemble a positional argument list from the keyword options.

    The list starts with ``str(group_name)`` when ``group_name`` is truthy,
    followed by the entries for exactly one input form, checked in this
    order:

    - ``lhs``: the assemblies and coerced group fields of both sides
      (``rhs`` and both ``*_group_fields`` are used unconditionally), then
      the optional declared / result-group fields and joiner;
    - ``pipes``: a list of assemblies and, when ``group_fields`` is truthy,
      a list of coerced group fields, the declared fields (or ``None`` as a
      placeholder), and a joiner, defaulting to
      ``cascading.pipe.cogroup.InnerJoin()``;
    - ``pipe``: the assembly, coerced ``group_fields`` and
      ``int(num_self_joins)``, then the optional declared / result-group
      fields and joiner.

    If ``self.__args`` is non-empty, its first element is iterated and each
    item coerced to replace ``group_fields``.

    NOTE(review): block nesting (in particular which ``if`` the ``else``
    belongs to, and what sits under ``if group_fields``) was reconstructed
    from a source whose indentation had been lost -- confirm against the
    original layout.
    """
    # We can use an unnamed parameter only for group_fields
    if self.__args:
        group_fields = [coerce_to_fields(f) for f in self.__args[0]]
    args = []
    if group_name:
        args.append(str(group_name))
    if lhs:
        # Two-sided form: left and right inputs with their own group fields.
        args.append(lhs.get_assembly())
        args.append(coerce_to_fields(lhs_group_fields))
        args.append(rhs.get_assembly())
        args.append(coerce_to_fields(rhs_group_fields))
        if declared_fields:
            args.append(coerce_to_fields(declared_fields))
            if result_group_fields:
                args.append(coerce_to_fields(result_group_fields))
        if joiner:
            args.append(joiner)
    elif pipes:
        # Multi-pipe form: assemblies and group fields are passed as lists.
        args.append([p.get_assembly() for p in pipes])
        if group_fields:
            args.append([coerce_to_fields(f) for f in group_fields])
            if declared_fields:
                args.append(coerce_to_fields(declared_fields))
                if result_group_fields:
                    args.append(coerce_to_fields(result_group_fields))
            else:
                # Keep the declared-fields position filled so the joiner
                # that follows stays in its slot.
                args.append(None)
            if joiner is None:
                joiner = cascading.pipe.cogroup.InnerJoin()
            args.append(joiner)
    elif pipe:
        # Self-join form: a single pipe plus the number of self joins.
        args.append(pipe.get_assembly())
        args.append(coerce_to_fields(group_fields))
        args.append(int(num_self_joins))
        if declared_fields:
            args.append(coerce_to_fields(declared_fields))
            if result_group_fields:
                args.append(coerce_to_fields(result_group_fields))
        if joiner:
            args.append(joiner)
    return args
def last(*args):
    """Create an ``aggregator.Last``.

    The first positional argument, if any, is converted with
    ``coerce_to_fields``; the rest are forwarded unchanged.
    """
    # Only the entry at index 0 is converted; an empty call yields no params.
    params = [
        coerce_to_fields(item) if index == 0 else item
        for index, item in enumerate(args)
    ]
    return aggregator.Last(*params)