def resolve_partitions(patterns):
    """
    Given a list of patterns, returns all the files matching one of them or
    located in a folder matching one of them.

    The files are returned as a dict mapping each file path to the partition
    keys and values found in its path (as a Row), or to None if the path does
    not encode any partition.

    In addition to this dict, return, if the data was partitioned, a schema for
    the partition keys, else None.

    :type patterns: list of str
    :rtype: Tuple[Dict[str, Optional[Row]], Optional[StructType]]
    """
    file_paths = File.get_content(patterns)
    if not file_paths:
        raise AnalysisException('Path does not exist: {0}'.format(patterns))
    partitions = {}
    for file_path in file_paths:
        if '=' in file_path:
            row = row_from_keyed_values(
                folder.split('=')
                for folder in file_path.split('/')[:-1]
                if folder.count('=') == 1
            )
            partitions[file_path] = row
        else:
            partitions[file_path] = None

    # Row.__fields__ is a list, which is not hashable: convert it to a tuple
    # before collecting the distinct sets of partitioning fields.
    partitioning_field_sets = set(
        tuple(p.__fields__) for p in partitions.values() if p is not None
    )
    if len(partitioning_field_sets) > 1:
        raise Exception(
            'Conflicting directory structures detected while reading {0}. '
            'All partitions must have the same partitioning fields, found fields {1}'.format(
                ','.join(patterns),
                ' and also '.join(str(fields) for fields in partitioning_field_sets),
            )
        )

    if partitioning_field_sets:
        if any(value is None for value in partitions.values()):
            raise AnalysisException(
                'Unable to parse those malformed folders: {1} of {0}'.format(
                    file_paths,
                    [path for path, value in partitions.items() if value is None],
                )
            )
        partitioning_fields = partitioning_field_sets.pop()
        partition_schema = guess_schema_from_strings(
            partitioning_fields, partitions.values(), options={}
        )
    else:
        partition_schema = None

    return partitions, partition_schema
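# A minimal, self-contained sketch of the partition-folder parsing performed above,
# using a plain dict in place of the library's row_from_keyed_values helper.
# The path below is a made-up example, not taken from the source.
example_path = 'data/year=2021/month=02/part-00000.csv'
partition_values = dict(
    folder.split('=')
    for folder in example_path.split('/')[:-1]
    if folder.count('=') == 1
)
print(partition_values)  # {'year': '2021', 'month': '02'}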
def get_checked_matches(matches, field_name, schema, show_id):
    if not matches:
        raise AnalysisException(
            "Unable to find the column '{0}' among {1}".format(
                field_name, format_schema(schema, show_id)))
    if len(matches) > 1:
        raise AnalysisException(
            "Reference '{0}' is ambiguous, found {1} columns matching it.".format(
                field_name, len(matches)))
    return matches.pop()
def eval(self, row, schema):
    value_1 = self.arg1.eval(row, schema)
    value_2 = self.arg2.eval(row, schema)
    if value_1 is None or value_2 is None:
        return None
    type_1 = value_1.__class__
    type_2 = value_2.__class__
    if type_1 == type_2:
        return self.unsafe_operation(value_1, value_2)
    try:
        order_1 = INTERNAL_TYPE_ORDER.index(type_1)
        order_2 = INTERNAL_TYPE_ORDER.index(type_2)
    except ValueError as e:
        raise AnalysisException(f'Unable to process type: {e}') from None
    spark_type_1 = python_to_spark_type(type_1)
    spark_type_2 = python_to_spark_type(type_2)
    # Cast the value whose type has the lower precedence to the other type.
    if order_1 > order_2:
        caster = get_caster(from_type=spark_type_2, to_type=spark_type_1, options={})
        value_2 = caster(value_2)
    elif order_1 < order_2:
        caster = get_caster(from_type=spark_type_1, to_type=spark_type_2, options={})
        value_1 = caster(value_1)
    return self.unsafe_operation(value_1, value_2)
def guess_type_from_values_as_string(values, options):
    # Reproduces inferences available in Spark
    # PartitioningUtils.inferPartitionColumnValue()
    # located in org.apache.spark.sql.execution.datasources
    tested_types = (
        IntegerType(),
        LongType(),
        DecimalType(),
        DoubleType(),
        TimestampType(),
        StringType(),
    )
    string_type = StringType()
    for tested_type in tested_types:
        type_caster = get_caster(from_type=string_type, to_type=tested_type, options=options)
        try:
            for value in values:
                casted_value = type_caster(value)
                if casted_value is None and value not in ('null', None):
                    raise ValueError
            return tested_type
        except ValueError:
            pass
    # Should never happen
    raise AnalysisException(
        'Unable to find a matching type for some fields, even StringType did not work'
    )
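# Hedged usage sketch for the inference above. The import path is an assumption,
# mirroring the fast_pyspark_tester.sql.* layout seen elsewhere in this file;
# the expected results follow from the ordered tuple of tested types.
from fast_pyspark_tester.sql.types import IntegerType, StringType  # assumed path

print(guess_type_from_values_as_string(['1', '2', '42'], options={}))
# expected: IntegerType, the first tested type that casts every value
print(guess_type_from_values_as_string(['foo', 'bar'], options={}))
# expected: StringType, the final fallback that accepts any value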
def cast_to_binary(value, from_type, options):
    if isinstance(from_type, StringType):
        # noinspection PyTypeChecker
        return bytearray(value, 'utf-8')
    if isinstance(from_type, BinaryType):
        return value
    raise AnalysisException('Cannot cast type {0} to binary'.format(from_type))
def eval(self, row, schema):
    column_value = self.column.eval(row, schema)
    if isinstance(column_value, (list, dict)):
        return len(column_value)
    raise AnalysisException(
        '{0} value should be an array or a map, got {1}'.format(
            self.column, type(column_value)))
def eval(self, row, schema):
    metadata = row.get_metadata()
    if metadata is None or 'grouping' not in metadata:
        raise AnalysisException(
            'grouping() can only be used with GroupingSets/Cube/Rollup')
    pos = self.column.find_position_in_schema(schema)
    return int(metadata['grouping'][pos])
def cast_to_boolean(value, from_type, options):
    if value == '' or value is None:
        return None
    if isinstance(from_type, StringType):
        # Only the case-insensitive literals 'true' and 'false' are recognised;
        # any other string becomes null.
        if value.lower() == 'true':
            return True
        if value.lower() == 'false':
            return False
        return None
    if isinstance(from_type, (NumericType, BooleanType)):
        return bool(value)
    raise AnalysisException('Cannot cast type {0} to boolean'.format(from_type))
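# Hedged usage sketch for cast_to_boolean; the import path is an assumption.
from fast_pyspark_tester.sql.types import StringType, IntegerType  # assumed path

print(cast_to_boolean('TRUE', StringType(), options={}))  # expected: True (case-insensitive match)
print(cast_to_boolean('no', StringType(), options={}))    # expected: None (only 'true'/'false' are recognised)
print(cast_to_boolean(0, IntegerType(), options={}))      # expected: False (numeric values go through bool())
print(cast_to_boolean('', StringType(), options={}))      # expected: None (empty string is treated as null)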
def eval(self, row, schema):
    metadata = row.get_metadata()
    if metadata is None or 'grouping' not in metadata:
        raise AnalysisException(
            'grouping_id() can only be used with GroupingSets/Cube/Rollup')
    id_binary_string_value = ''.join(
        '1' if grouping else '0' for grouping in metadata['grouping'])
    return int(id_binary_string_value, 2)
def cast_to_map(value, from_type, to_type, options):
    if isinstance(from_type, MapType):
        key_caster = get_caster(from_type=from_type.keyType,
                                to_type=to_type.keyType,
                                options=options)
        value_caster = get_caster(from_type=from_type.valueType,
                                  to_type=to_type.valueType,
                                  options=options)
        return {
            key_caster(key): (value_caster(sub_value) if sub_value is not None else None)
            for key, sub_value in value.items()
        }
    raise AnalysisException('Cannot cast type {0} to map'.format(from_type))
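# Hedged usage sketch for cast_to_map, converting map<string,string> to
# map<string,int>; the import path is an assumption, and nulls are preserved.
from fast_pyspark_tester.sql.types import MapType, StringType, IntegerType  # assumed path

source_type = MapType(StringType(), StringType())
target_type = MapType(StringType(), IntegerType())
print(cast_to_map({'a': '1', 'b': None}, source_type, target_type, options={}))
# expected: {'a': 1, 'b': None}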
def cast_to_float(value, from_type, options):
    # NB: fast_pyspark_tester does not mimic Spark's loss of accuracy, nor the
    # bounding of values between the float min and max values
    try:
        return cast_value(value, options=options)
    except ValueError:
        if isinstance(from_type, (DateType, TimestampType, NumericType, StringType)):
            return None
        raise AnalysisException(f'Cannot cast type {from_type} to float') from None
def get_caster(from_type, to_type, options):
    to_type_class = to_type.__class__
    if from_type == to_type:
        return partial(identity, options=options)
    if to_type_class == NullType:
        return partial(cast_from_none, from_type=from_type, options=options)
    if to_type_class == TimestampType:
        return get_datetime_parser(options.get('timestampFormat'))
    if to_type_class in DESTINATION_DEPENDENT_CASTERS:
        caster = DESTINATION_DEPENDENT_CASTERS[to_type_class]
        return partial(caster, from_type=from_type, to_type=to_type, options=options)
    if to_type_class in CASTERS:
        return partial(CASTERS[to_type_class], from_type=from_type, options=options)
    raise AnalysisException('Cannot cast from {0} to {1}'.format(from_type, to_type))
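# Hedged usage sketch for get_caster; the import path is an assumption, and the
# expected results assume CASTERS maps IntegerType to the bounded integer caster.
from fast_pyspark_tester.sql.types import StringType, IntegerType  # assumed path

to_int = get_caster(from_type=StringType(), to_type=IntegerType(), options={})
print(to_int('42'))  # expected: 42
identity_cast = get_caster(from_type=StringType(), to_type=StringType(), options={})
print(identity_cast('abc'))  # expected: 'abc' (identical types short-circuit to identity)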
def eval(self, row, schema):
    value = self.column.eval(row, schema)
    if not isinstance(value, str) or value == '':
        raise AnalysisException(
            'type mismatch: The input csv should be a string literal and not null; '
            'however, got {0}.'.format(value))
    # pylint: disable=import-outside-toplevel; circular import
    from fast_pyspark_tester.sql.internal_utils.readers.csvreader import csv_record_to_row
    from fast_pyspark_tester.sql.internal_utils.readers.utils import guess_schema_from_strings
    record_as_row = csv_record_to_row(value, self.options)
    schema = guess_schema_from_strings(record_as_row.__fields__, [record_as_row], self.options)
    return schema.simpleString()
def eval(self, row, schema):
    value_1 = self.arg1.eval(row, schema)
    value_2 = self.arg2.eval(row, schema)
    if value_1 is None or value_2 is None:
        return None
    type_1 = value_1.__class__
    type_2 = value_2.__class__
    if type_1 == type_2 or (isinstance(value_1, (int, float))
                            and isinstance(value_2, (int, float))):
        return self.unsafe_operation(value_1, value_2)
    raise AnalysisException(
        'Cannot resolve {0} due to data type mismatch, '
        'first value is {1}, second value is {2}.'.format(self, type_1, type_2))
def _cast_to_bounded_type(name, min_value, max_value, value, from_type, options):
    if value == '' or value is None:
        return None
    size = max_value - min_value + 1
    if isinstance(from_type, DateType):
        return None
    if isinstance(from_type, TimestampType):
        return _cast_to_bounded_type(
            name,
            min_value,
            max_value,
            cast_to_float(value, from_type, options=options),
            FloatType(),
            options=options,
        )
    if isinstance(from_type, StringType):
        casted_value = int(value)
        return casted_value if min_value <= casted_value <= max_value else None
    if isinstance(from_type, (NumericType, BooleanType)):
        value = int(value)
        return value % size if value % size <= max_value else value % -size
    raise AnalysisException('Cannot cast type {0} to {1}'.format(from_type, name))
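# Hedged usage sketch for _cast_to_bounded_type with a byte bounded to [-128, 127];
# the import path is an assumption.
from fast_pyspark_tester.sql.types import IntegerType, StringType  # assumed path

print(_cast_to_bounded_type('byte', -128, 127, 130, IntegerType(), options={}))
# expected: -126 (numeric values wrap around the range, mimicking integer overflow)
print(_cast_to_bounded_type('byte', -128, 127, '130', StringType(), options={}))
# expected: None (out-of-range strings become null instead of wrapping)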
def save(self):
    output_path = self.path
    mode = self.mode
    if os.path.exists(output_path):
        if mode == 'ignore':
            return
        if mode in ('error', 'errorifexists'):
            raise AnalysisException('path {0} already exists.;'.format(output_path))
        if mode == 'overwrite':
            shutil.rmtree(output_path)
            os.makedirs(output_path)
    else:
        os.makedirs(output_path)
    self.apply_on_aggregated_data(col(WriteInFolder(writer=self))).collect()
    success_path = os.path.join(output_path, '_SUCCESS')
    with open(success_path, 'w'):
        pass
def cast_to_date(value, from_type, options):
    if isinstance(value, datetime.datetime):
        return value.date()
    if isinstance(value, datetime.date):
        return value
    if isinstance(value, str):
        # Spark cast only considers the first non-empty part before a ' ' or a 'T'
        if ' ' in value:
            value = value.strip().split(' ')[0]
        if 'T' in value:
            value = value.split('T')[0]
        date_components = value.split('-')
        if len(date_components) > 3 or len(date_components[0]) != 4:
            return None
        # default month and day to 1
        date_components += [1] * (3 - len(date_components))
        try:
            return datetime.date(*map(int, date_components))
        except ValueError:
            return None
    if isinstance(from_type, (TimestampType, DateType, StringType)):
        # other values would have been handled in the lines above
        return None
    raise AnalysisException('Cannot cast type {0} to date'.format(from_type))
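# Hedged usage sketch for cast_to_date; the import path is an assumption.
from fast_pyspark_tester.sql.types import StringType  # assumed path

print(cast_to_date('2019-03-01T16:45:00', StringType(), options={}))
# expected: datetime.date(2019, 3, 1) -- the time part after 'T' is ignored
print(cast_to_date('2019', StringType(), options={}))
# expected: datetime.date(2019, 1, 1) -- missing month and day default to 1
print(cast_to_date('03/01/2019', StringType(), options={}))
# expected: None -- the first component is not a 4-digit year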
def cast_to_timestamp(value, from_type, options):
    if value == '' or value is None:
        return None
    if isinstance(value, str):
        date_as_string, time_as_string = split_datetime_as_string(value)
        date = cast_to_date(date_as_string, from_type, options=options)
        time_of_day = parse_time_as_string(time_as_string)
        return (
            None if date is None or time_of_day is None
            else datetime.datetime(year=date.year, month=date.month, day=date.day, **time_of_day)
            .astimezone(tzlocal())
            .replace(tzinfo=None)
        )
    if isinstance(value, datetime.datetime):
        return value
    if isinstance(value, datetime.date):
        return datetime.datetime(year=value.year, month=value.month, day=value.day)
    if isinstance(value, (int, float)):
        return datetime.datetime.fromtimestamp(value)
    if isinstance(from_type, (StringType, TimestampType, NumericType, BooleanType)):
        return None
    raise AnalysisException('Cannot cast type {0} to timestamp'.format(from_type))
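# Hedged usage sketch for cast_to_timestamp; the import path is an assumption.
import datetime
from fast_pyspark_tester.sql.types import DateType, LongType  # assumed path

print(cast_to_timestamp(datetime.date(2019, 3, 1), DateType(), options={}))
# expected: datetime.datetime(2019, 3, 1, 0, 0) -- dates are completed with midnight
print(cast_to_timestamp(0, LongType(), options={}))
# expected: the local-time datetime for the Unix epoch (1970-01-01 00:00:00 in UTC)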
def get_literal_value(self):
    if isinstance(self.expr, Expression):
        return self.expr.get_literal_value()
    raise AnalysisException("Expecting a Literal, but got {0}: {1}".format(type(self), self))
def cast_to_array(value, from_type, to_type, options):
    if isinstance(from_type, ArrayType):
        caster = get_caster(from_type=from_type.elementType,
                            to_type=to_type.elementType,
                            options=options)
        return [
            caster(sub_value) if sub_value is not None else None
            for sub_value in value
        ]
    raise AnalysisException('Cannot cast type {0} to array'.format(from_type))
def get_literal_value(self):
    raise AnalysisException("Expecting a Literal, but got {0}: {1}".format(type(self), self))
def cast_from_none(value, from_type, options):
    if value is None:
        return None
    raise AnalysisException(
        'Expected a null value from a field with type {0}, got {1}'.format(from_type, value))