Exemplo n.º 1
0
def merge_rows_joined_on_values(left, right, left_schema, right_schema, how,
                                on):
    """Assemble the output row for two rows joined on the ``on`` columns.

    The resulting (name, value) pairs are ordered as: the join columns, the
    left columns that are not join columns, then - for join types that keep
    them - the right columns that are not join columns.

    :param left: row from the left side; may be None
    :param right: row from the right side; may be None
    :param left_schema: schema of the left side (``names`` and ``fields``
        are read)
    :param right_schema: schema of the right side (``names`` and ``fields``
        are read)
    :param how: join type constant
    :param on: iterable of the join column names
    :return: the result of ``row_from_keyed_values`` on the ordered pairs
    :raises IllegalArgumentException: if ``how`` is not a handled join type
    """
    left_names = left_schema.names
    right_names = right_schema.names

    left_on_fields, right_on_fields = get_on_fields(
        left_schema, right_schema, on
    )

    # Join values come from the left row when it exists, otherwise from the
    # right row.
    key_source = right if left is None else left
    on_parts = [(key_name, key_source[key_name]) for key_name in on]

    # For outer joins a missing side is replaced by a row made only of None.
    if left is None and how in (FULL_JOIN, RIGHT_JOIN):
        left = create_row(left_names, [None] * len(left_names))
    if right is None and how in (LEFT_JOIN, FULL_JOIN):
        right = create_row(right_names, [None] * len(right_names))

    left_parts = (
        (column.name, cell)
        for column, cell in zip(left_schema.fields, left)
        if column not in left_on_fields
    )

    keeps_right_columns = how in (
        INNER_JOIN, CROSS_JOIN, LEFT_JOIN, FULL_JOIN, RIGHT_JOIN
    )
    if keeps_right_columns:
        right_parts = (
            (column.name, cell)
            for column, cell in zip(right_schema.fields, right)
            if column not in right_on_fields
        )
    elif how in (LEFT_SEMI_JOIN, LEFT_ANTI_JOIN):
        # Semi and anti joins only expose the left-hand columns.
        right_parts = ()
    else:
        raise IllegalArgumentException(
            "Argument 'how' cannot be '{0}'".format(how))

    keyed_values = itertools.chain(on_parts, left_parts, right_parts)
    return row_from_keyed_values(keyed_values)
Exemplo n.º 2
0
 def eval(self, row, schema):
     """Resolve each of ``self.columns`` against ``row`` and pack them in a row.

     :param row: the row the columns are resolved against
     :param schema: schema passed through to ``resolve_column``
     :return: a row built by ``create_row`` from the collected names and values
     """
     names = []
     values = []
     for column in self.columns:
         resolved_names, resolved_values = resolve_column(
             column, row, schema, allow_generator=False
         )
         names.extend(resolved_names)
         # Only the first entry of the resolved values is kept.
         values.extend(resolved_values[0])
     return create_row(names, values)
Exemplo n.º 3
0
def parse_record(record, schema, partition, partition_schema, options):
    """Parse one JSON-encoded record into a row, appending partition values.

    :param record: JSON text (``str``) or encoded JSON (``bytes``) whose top
        level value must be an object
    :param schema: optional schema; when given, only record fields named in
        the schema are kept and schema fields missing from both the record
        and the partition are added with a ``None`` value
    :param partition: iterable of partition values, or None
    :param partition_schema: schema describing ``partition``, or None
    :param options: reader options; ``options.encoding`` is used to decode
        ``bytes`` input
    :return: row built by ``create_row`` with the record columns followed by
        the partition columns
    :raises NotImplementedError: if the top-level JSON value is not an object
    """
    # json.loads() no longer accepts an ``encoding`` keyword (removed in
    # Python 3.9, ignored since 3.1), so encoded input is decoded explicitly.
    encoding = options.encoding
    if isinstance(record, (bytes, bytearray)) and encoding:
        record = record.decode(encoding)
    raw_record_value = json.loads(record)
    if not isinstance(raw_record_value, dict):
        raise NotImplementedError(
            "Top level items should be JSON objects (dicts), got {0} with {1}".
            format(type(raw_record_value), raw_record_value))
    record_value = decode_record(raw_record_value)
    record_fields = record_value.__fields__

    if schema is not None:
        # partition_schema is optional (see the guard further down), so it
        # must not be dereferenced unconditionally here.
        partition_names = (
            tuple(partition_schema.names) if partition_schema else ()
        )
        available_names = partition_names + tuple(record_fields)
        field_names = [
            name for name in record_fields if name in schema.names
        ] + [f.name for f in schema.fields if f.name not in available_names]
    else:
        field_names = list(record_fields)

    record_values = [
        record_value[field_name] if field_name in record_fields else None
        for field_name in field_names
    ]
    partition_field_names = [f.name for f in partition_schema.fields
                             ] if partition_schema else []
    # pylint: disable=W0511
    # todo: handle nested rows
    # ``partition or []`` tolerates a missing partition, as the text and CSV
    # record converters do.
    row = create_row(itertools.chain(field_names, partition_field_names),
                     itertools.chain(record_values, partition or []))
    return row
Exemplo n.º 4
0
def text_record_to_row(record, options, schema, partition_schema, partition):
    """Wrap a text record in a single-column row followed by partition columns.

    :param record: the value stored in the first (and only) data column
    :param options: reader options (not used here)
    :param schema: schema whose first field names the data column
    :param partition_schema: schema of the partition columns, or a falsy value
    :param partition: iterable of partition values, or a falsy value
    :return: row built by ``create_row``
    """
    if partition_schema:
        partition_field_names = [
            part_field.name for part_field in partition_schema.fields
        ]
    else:
        partition_field_names = []

    column_names = itertools.chain(
        [schema.fields[0].name], partition_field_names
    )
    column_values = itertools.chain([record], partition or [])
    return create_row(column_names, column_values)
Exemplo n.º 5
0
def csv_record_to_row(record,
                      options,
                      schema=None,
                      header=None,
                      null_value=None,
                      partition_schema=None,
                      partition=None):
    """Turn one delimited text record into a row, appending partition columns.

    The record is split on ``options.sep``; cells equal to ``null_value``
    become ``None``. Column names come from ``schema`` if given, else from
    ``header`` if given, else they are generated as ``_c0``, ``_c1``, ...

    :param record: the delimited line to split
    :param options: reader options; ``options.sep`` is the separator
    :param schema: optional schema providing the column names
    :param header: optional column names used when no schema is given
    :param null_value: cell content to be read as ``None``
    :param partition_schema: schema of the partition columns, or a falsy value
    :param partition: iterable of partition values, or a falsy value
    :return: row built by ``create_row``
    """
    cells = record.split(options.sep)
    record_values = [cell if cell != null_value else None for cell in cells]

    if schema is None and header is None:
        # No names available: fall back to positional column names.
        field_names = [
            "_c{0}".format(position)
            for position in range(len(record_values))
        ]
    elif schema is None:
        field_names = header
    else:
        field_names = [schema_field.name for schema_field in schema.fields]

    if partition_schema:
        partition_field_names = [
            part_field.name for part_field in partition_schema.fields
        ]
    else:
        partition_field_names = []

    column_names = itertools.chain(field_names, partition_field_names)
    column_values = itertools.chain(record_values, partition or [])
    return create_row(column_names, column_values)
Exemplo n.º 6
0
def merge_rows(left, right):
    """Combine two rows into one, left fields and values first.

    :param left: row providing the leading fields; must support
        ``left + right``
    :param right: row providing the trailing fields
    :return: row built by ``create_row`` from both rows' ``__fields__`` and
        the result of ``left + right``
    """
    combined_fields = itertools.chain(left.__fields__, right.__fields__)
    combined_values = left + right
    return create_row(combined_fields, combined_values)
Exemplo n.º 7
0
 def do_cast_to_struct(value):
     """Cast each element of ``value`` with its paired caster into a new row.

     ``names`` and ``casters`` come from the enclosing scope; elements are
     paired with casters positionally via ``zip``. The metadata returned by
     ``value.get_metadata()`` is passed on to the new row.
     """
     # Kept lazy: the casters run only when create_row consumes the values.
     converted = (convert(item) for convert, item in zip(casters, value))
     return create_row(names, converted, metadata=value.get_metadata())