def merge_rows_joined_on_values(left, right, left_schema, right_schema, how, on):
    """Merge a matched pair of rows from a join into one output row.

    ``left`` or ``right`` may be ``None`` when that side had no match;
    outer joins substitute an all-null row for the missing side.
    ``how`` must be one of the module's join-type constants, otherwise
    an ``IllegalArgumentException`` is raised.
    """
    left_names = left_schema.names
    right_names = right_schema.names
    left_on_fields, right_on_fields = get_on_fields(
        left_schema, right_schema, on)

    # Join-key values come from whichever side is present. This must be
    # captured BEFORE the missing side is replaced by an all-null row below.
    key_source = left if left is not None else right
    on_parts = [(on_field, key_source[on_field]) for on_field in on]

    if left is None and how in (FULL_JOIN, RIGHT_JOIN):
        left = create_row(left_names, [None] * len(left_names))
    if right is None and how in (LEFT_JOIN, FULL_JOIN):
        right = create_row(right_names, [None] * len(right_names))

    left_parts = (
        (field.name, value)
        for field, value in zip(left_schema.fields, left)
        if field not in left_on_fields
    )
    if how in (INNER_JOIN, CROSS_JOIN, LEFT_JOIN, FULL_JOIN, RIGHT_JOIN):
        right_parts = (
            (field.name, value)
            for field, value in zip(right_schema.fields, right)
            if field not in right_on_fields
        )
    elif how in (LEFT_SEMI_JOIN, LEFT_ANTI_JOIN):
        # Semi and anti joins only return columns from the left side.
        right_parts = ()
    else:
        raise IllegalArgumentException(
            "Argument 'how' cannot be '{0}'".format(how))

    return row_from_keyed_values(
        itertools.chain(on_parts, left_parts, right_parts))
def eval(self, row, schema):
    """Evaluate each column of this struct against ``row`` and assemble
    the resolved names/values into a single nested row."""
    names = []
    values = []
    for column in self.columns:
        resolved_names, resolved_values = resolve_column(
            column, row, schema, allow_generator=False)
        names.extend(resolved_names)
        # resolve_column returns per-row value lists; a struct takes the
        # first (and only) row's values.
        values.extend(resolved_values[0])
    return create_row(names, values)
def parse_record(record, schema, partition, partition_schema, options):
    """Parse one JSON document into a row, appending partition columns.

    :param record: a JSON document as ``str`` or ``bytes``; the top-level
        value must be a JSON object.
    :param schema: optional schema; when given, fields follow the record's
        order for names known to the schema, then schema fields missing from
        both the record and the partition columns (filled with ``None``).
    :param partition: values of the partition columns (may be None/empty).
    :param partition_schema: optional schema of the partition columns.
    :param options: reader options; ``options.encoding`` is used to decode
        byte records.
    :raises NotImplementedError: if the top-level JSON value is not an object.
    """
    # BUG FIX: json.loads() had its ``encoding`` parameter removed in
    # Python 3.9 (it was ignored since 3.1), so passing it raises TypeError.
    # Decode byte input explicitly with the configured encoding instead.
    if isinstance(record, (bytes, bytearray)):
        record = record.decode(options.encoding or "utf-8")
    raw_record_value = json.loads(record)
    if not isinstance(raw_record_value, dict):
        raise NotImplementedError(
            "Top level items should be JSON objects (dicts), got {0} with {1}".
            format(type(raw_record_value), raw_record_value))
    record_value = decode_record(raw_record_value)
    if schema is not None:
        record_fields = record_value.__fields__
        # Guard against a missing partition schema (sibling readers treat
        # partition_schema as optional).
        partition_names = (tuple(partition_schema.names)
                           if partition_schema else ())
        available_names = partition_names + record_fields
        # Record order first for fields the schema knows, then schema fields
        # absent from both the record and the partition columns.
        field_names = [
            name for name in record_fields if name in schema.names
        ] + [f.name for f in schema.fields if f.name not in available_names]
    else:
        field_names = list(record_value.__fields__)
    record_values = [
        record_value[field_name]
        if field_name in record_value.__fields__ else None
        for field_name in field_names
    ]
    partition_field_names = [f.name for f in partition_schema.fields
                             ] if partition_schema else []
    # pylint: disable=W0511
    # todo: handle nested rows
    row = create_row(itertools.chain(field_names, partition_field_names),
                     itertools.chain(record_values, partition or []))
    return row
def text_record_to_row(record, options, schema, partition_schema, partition):
    """Wrap one raw text line into a single-column row, followed by the
    partition column values (if any)."""
    if partition_schema:
        partition_names = [field.name for field in partition_schema.fields]
    else:
        partition_names = []
    # A text source always has exactly one data column: the first schema field.
    names = itertools.chain([schema.fields[0].name], partition_names)
    values = itertools.chain([record], partition or [])
    return create_row(names, values)
def csv_record_to_row(record, options, schema=None, header=None,
                      null_value=None, partition_schema=None, partition=None):
    """Split one CSV line into a row, appending partition columns.

    :param record: one raw CSV line (split on ``options.sep``; note this is a
        plain split and does not handle quoted separators).
    :param options: reader options; ``options.sep`` is the field separator.
    :param schema: optional schema supplying field names.
    :param header: optional list of header names, used when no schema is given.
    :param null_value: string value that is mapped to ``None``.
    :param partition_schema: optional schema of the partition columns.
    :param partition: values of the partition columns (may be None/empty).

    Field names fall back to Spark-style ``_c0``, ``_c1``, ... when neither a
    schema nor a header is available.
    """
    record_values = [
        val if val != null_value else None
        for val in record.split(options.sep)
    ]
    if schema is not None:
        field_names = [f.name for f in schema.fields]
    elif header is not None:
        field_names = header
    else:
        # FIX: the original comprehension bound an unused ``field`` variable;
        # only the index is needed for the generated names.
        field_names = ["_c{0}".format(i) for i in range(len(record_values))]
    partition_field_names = [f.name for f in partition_schema.fields
                             ] if partition_schema else []
    row = create_row(itertools.chain(field_names, partition_field_names),
                     itertools.chain(record_values, partition or []))
    return row
def merge_rows(left, right):
    """Concatenate two rows into one, keeping left fields before right."""
    merged_names = itertools.chain(left.__fields__, right.__fields__)
    return create_row(merged_names, left + right)
def do_cast_to_struct(value):
    """Cast each element of ``value`` with its positional caster (closing
    over ``names`` and ``casters`` from the enclosing scope), preserving the
    row's metadata."""
    cast_values = (cast(item) for cast, item in zip(casters, value))
    return create_row(names, cast_values, metadata=value.get_metadata())