def add_columns(self, frame, expression, schema, columns_accessed=None):
    """
    Build the argument payload for the 'add_columns' command and execute it
    against the given frame via execute_update_frame_command.

    :param frame: frame object; its ``uri`` is sent with the command and its
        ``schema`` is consulted when ``columns_accessed`` is non-empty
    :param expression: user expression wrapped by get_add_one_column_function
        or get_add_many_columns_function
    :param schema: either a single (name, type) pair whose first element is a
        string, or an indexable iterable of such pairs
    :param columns_accessed: None, a single column name, or a collection of
        column names used to select matching entries from ``frame.schema``
    :raises ValueError: if ``schema`` is empty or has no ``__iter__`` attribute
    """
    if not (schema and hasattr(schema, "__iter__")):
        raise ValueError("add_columns requires a non-empty schema of (name, type)")

    # A lone (name, type) pair is recognised by its first element being a
    # string; wrap it so the rest of the method always sees a list of pairs.
    only_one_column = isinstance(schema[0], basestring)
    if only_one_column:
        schema = [schema]

    names, data_types = zip(*self._format_schema(schema))

    if columns_accessed is None:
        # Nothing requested by the caller: send an empty list and hand None to
        # get_udf_arg. Per the original author's note this implies that the
        # full frame.schema is considered for evaluation.
        columns_accessed = []
        optimized_frame_schema = None
    else:
        optimized_frame_schema = []
        if columns_accessed:
            if isinstance(columns_accessed, basestring):
                columns_accessed = [columns_accessed]
            frame_schema = frame.schema
            # Keep the schema entries whose name (first element) was requested,
            # in the order the caller listed the column names.
            optimized_frame_schema = [entry
                                      for wanted in columns_accessed
                                      for entry in frame_schema
                                      if wanted == entry[0]]

    if only_one_column:
        add_columns_function = get_add_one_column_function(expression, data_types[0])
    else:
        add_columns_function = get_add_many_columns_function(expression, data_types)

    from itertools import imap

    frame_uri = frame.uri
    column_types = [get_rest_str_from_data_type(data_type) for data_type in data_types]
    udf = get_udf_arg(frame, add_columns_function, imap, optimized_frame_schema)
    arguments = {
        'frame': frame_uri,
        'column_names': names,
        'column_types': column_types,
        'udf': udf,
        'columns_accessed': columns_accessed,
    }

    execute_update_frame_command('add_columns', arguments, frame)
def aggregate_with_udf(self, frame, group_by_column_keys, aggregator_expression, output_schema, init_acc_values=None):
    """
    Build the argument payload for the 'frame/aggregate_with_udf' command and
    return whatever execute_new_frame_command produces for it.

    :param frame: frame object; its ``uri`` is sent with the command
    :param group_by_column_keys: sent unchanged as 'aggregate_by_column_keys'
    :param aggregator_expression: user expression wrapped by
        get_group_by_aggregator_function
    :param output_schema: either a single (name, type) pair whose first element
        is a string, or an indexable iterable of such pairs
    :param init_acc_values: forwarded unchanged to get_aggregator_udf_arg
    :raises ValueError: if ``output_schema`` is empty or has no ``__iter__``
        attribute
    :return: the result of execute_new_frame_command
    """
    if not (output_schema and hasattr(output_schema, "__iter__")):
        raise ValueError("aggregate_with_udf requires a non-empty schema of (name, type)")

    # A lone (name, type) pair is recognised by its first element being a
    # string; wrap it so it is handled like a one-entry schema.
    first_entry = output_schema[0]
    if isinstance(first_entry, basestring):
        output_schema = [output_schema]
    output_schema = self._format_schema(output_schema)

    names, data_types = zip(*output_schema)
    aggregate_with_udf_function = get_group_by_aggregator_function(aggregator_expression, data_types)

    from itertools import imap

    frame_uri = frame.uri
    column_types = [get_rest_str_from_data_type(data_type) for data_type in data_types]
    udf = get_aggregator_udf_arg(frame, aggregate_with_udf_function, imap, output_schema, init_acc_values)
    arguments = {
        "frame": frame_uri,
        "aggregate_by_column_keys": group_by_column_keys,
        "column_names": names,
        "column_types": column_types,
        "udf": udf,
    }

    return execute_new_frame_command('frame/aggregate_with_udf', arguments)
def from_types_to_strings(s):
    """
    Return a list of (name, type_string) tuples built from an iterable of
    (name, data_type) pairs, where each type_string is the result of
    get_rest_str_from_data_type(data_type).
    """
    converted = []
    for column_name, column_type in s:
        converted.append((column_name, get_rest_str_from_data_type(column_type)))
    return converted