def _create_table_or_temp_view_from_csv(
    self,
    name,
    path,
    schema=None,
    database=None,
    force=False,
    temp_view=False,
    format='parquet',
    **kwargs,
):
    # Merge caller-supplied reader options over the CSV defaults.
    options = _read_csv_defaults.copy()
    options.update(kwargs)

    if schema:
        # An explicit schema and inferSchema=True are mutually exclusive.
        assert ('inferSchema', True) not in options.items()
        options['schema'] = spark_dtype(schema)
    else:
        options['inferSchema'] = True

    df = self._session.read.csv(path, **options)

    if temp_view:
        if force:
            df.createOrReplaceTempView(name)
        else:
            df.createTempView(name)
    else:
        qualified_name = _fully_qualified_name(
            name, database or self.current_database
        )
        # force=True overwrites an existing table; otherwise raise on conflict.
        mode = 'overwrite' if force else 'error'
        df.write.saveAsTable(qualified_name, format=format, mode=mode)
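# A minimal usage sketch for the helper above; hedged, since this is a
# private method normally reached through the backend's public table-creation
# entry points. The connect call, path, and extra reader option are
# assumptions for illustration, not from the source.
from pyspark.sql import SparkSession
import ibis

session = SparkSession.builder.getOrCreate()
client = ibis.spark.connect(session)  # assumption: backend entry point

client._create_table_or_temp_view_from_csv(
    'fares',
    '/data/fares.csv',  # illustrative path
    temp_view=True,
    force=True,         # replace the view if it already exists
    header=True,        # forwarded to spark.read.csv via **kwargs
)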
def compile_reduction_udf(t, expr, scope, timecontext, context=None, **kwargs):
    op = expr.op()

    # Wrap the user function as a grouped-aggregate pandas UDF with the
    # Spark type corresponding to the declared ibis output type.
    spark_output_type = spark_dtype(op._output_type)
    spark_udf = pandas_udf(
        op.func, spark_output_type, PandasUDFType.GROUPED_AGG
    )
    func_args = (t.translate(arg, scope, timecontext) for arg in op.func_args)

    col = spark_udf(*func_args)
    if context in (AggregationContext.ENTIRE, AggregationContext.GROUP):
        # The surrounding aggregation supplies the grouping; return the
        # column for it to consume.
        return col
    elif context == AggregationContext.WINDOW:
        # Windowed reduction: apply the UDF over the translated window spec.
        window = kwargs['window']
        return col.over(window)
    else:
        # Standalone reduction: aggregate over the source table directly.
        src_table = t.translate(op.func_args[0].op().table, scope, timecontext)
        return src_table.agg(col)
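# For context, a hedged sketch of the user-facing reduction UDF that the
# rule above compiles. The decorator path reflects ibis's vectorized UDF API
# of the same era (ibis.udf.vectorized); treat it as an assumption, and
# `table` as a pre-existing ibis table expression.
import ibis.expr.datatypes as dt
from ibis.udf.vectorized import reduction

@reduction(input_type=[dt.double], output_type=dt.double)
def my_mean(series):
    # Receives a pandas Series per group and returns one scalar per group,
    # matching PandasUDFType.GROUPED_AGG semantics.
    return series.mean()

expr = table.group_by('key').aggregate(avg=my_mean(table['value']))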
def __init__(self, input_type, output_type):
    self.input_type = list(map(dt.dtype, input_type))
    self.output_type = dt.dtype(output_type)
    self.spark_output_type = spark_dtype(self.output_type)
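# Illustration of the normalization performed above, assuming ibis's
# datatypes module and the backend's spark_dtype converter: string specs
# and dtype instances both normalize to ibis dtypes, which then map to
# Spark SQL types.
import ibis.expr.datatypes as dt

assert dt.dtype('double') == dt.double
# spark_dtype(dt.double) would yield pyspark.sql.types.DoubleType()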
def compile_elementwise_udf(t, expr, scope, timecontext, **kwargs):
    op = expr.op()

    # Elementwise UDFs map one row in to one row out, so a scalar pandas
    # UDF suffices; no aggregation context is needed.
    spark_output_type = spark_dtype(op._output_type)
    spark_udf = pandas_udf(op.func, spark_output_type, PandasUDFType.SCALAR)
    func_args = (t.translate(arg, scope, timecontext) for arg in op.func_args)
    return spark_udf(*func_args)
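# Companion sketch: the user-facing elementwise UDF that the rule above
# compiles. As with the reduction example, the decorator path is an
# assumption based on ibis's vectorized UDF API of the same era, and
# `table` is a pre-existing ibis table expression.
import ibis.expr.datatypes as dt
from ibis.udf.vectorized import elementwise

@elementwise(input_type=[dt.double], output_type=dt.double)
def add_one(series):
    # Receives and returns a pandas Series of equal length, matching
    # PandasUDFType.SCALAR semantics.
    return series + 1.0

expr = table.mutate(value_plus_one=add_one(table['value']))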