def _get_schema(header, schema):
    if schema is None or len(schema) == 0:
        # Use header to generate schema
        if header is None or len(header) == 0:
            return None
        elif len(header) > 4:
            warnings.warn(WARNING_MOVIE_LENS_HEADER)
            header = header[:4]
        schema = StructType()
        try:
            schema.add(StructField(header[0], IntegerType())).add(
                StructField(header[1], IntegerType())
            ).add(StructField(header[2], FloatType())).add(
                StructField(header[3], LongType())
            )
        except IndexError:
            pass
    else:
        if header is not None:
            warnings.warn(WARNING_HAVE_SCHEMA_AND_HEADER)
        if len(schema) > 4:
            warnings.warn(WARNING_MOVIE_LENS_HEADER)
            schema = schema[:4]
    return schema
class SparkTemplate(object):
    def __init__(self, schema, table=None):
        self.default_values = schema
        self.spark_schema = StructType()
        self.table = table
        for col, default in schema.items():
            try:
                spark_type = MAP_TO_SPARK_TYPE[type(default)]()
            except KeyError:
                raise KeyError(
                    "No such spark_type for: '{}' on column '{}'".format(
                        type(default), col))
            self.spark_schema.add(StructField(col, spark_type))

    def __call__(self, data, table=None):
        if table is not None:
            self.table = table
        spark = local_session()
        data = [{**self.default_values, **new_values} for new_values in data]
        data = spark.createDataFrame(data, self.spark_schema)
        if self.table is not None:
            db, table = self.table.split('.')
            spark.sql('CREATE DATABASE IF NOT EXISTS {}'.format(db))
            data.write.saveAsTable(db + '.' + table, mode='append')
        return data
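# A minimal usage sketch for SparkTemplate above. MAP_TO_SPARK_TYPE and
# local_session() are not defined in this snippet, so the stand-ins below are
# assumptions made only for illustration.
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType, FloatType

MAP_TO_SPARK_TYPE = {int: IntegerType, str: StringType, float: FloatType}  # assumed mapping

def local_session():
    # assumed helper returning a local SparkSession
    return SparkSession.builder.master("local[1]").getOrCreate()

# The defaults double as the schema definition; each row only overrides what it needs.
template = SparkTemplate({"id": 0, "name": "", "score": 0.0})
df = template([{"id": 1, "name": "a"}, {"id": 2, "score": 0.5}])
df.show()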
def build_spark_schema(self, column_names, column_types, verbose=False):
    """Given a set of names and types, construct a Spark StructType schema
       for reading the data into a DataFrame"""

    # If we don't know the type, fall back to a string
    unknown_type = StringType()

    schema = StructType()
    for name, zeek_type in zip(column_names, column_types):

        # Grab the type
        spark_type = self.type_map.get(zeek_type)

        # Sanity check
        if not spark_type:
            if verbose:
                print('Could not find type for {:s}, using StringType...'.format(zeek_type))
            spark_type = unknown_type

        # Add the Spark type for this column
        schema.add(name, spark_type)

    # Return the Spark schema
    return schema
def test_convertToDelta(self) -> None:
    df = self.spark.createDataFrame([('a', 1), ('b', 2), ('c', 3)], ["key", "value"])
    df.write.format("parquet").save(self.tempFile)
    dt = DeltaTable.convertToDelta(self.spark, "parquet.`%s`" % self.tempFile)
    self.__checkAnswer(
        self.spark.read.format("delta").load(self.tempFile),
        [('a', 1), ('b', 2), ('c', 3)])

    # test if convert to delta with partition columns work
    tempFile2 = self.tempFile + "_2"
    df.write.partitionBy("value").format("parquet").save(tempFile2)
    schema = StructType()
    schema.add("value", IntegerType(), True)
    dt = DeltaTable.convertToDelta(
        self.spark, "parquet.`%s`" % tempFile2, schema)
    self.__checkAnswer(
        self.spark.read.format("delta").load(tempFile2),
        [('a', 1), ('b', 2), ('c', 3)])
    self.assertEqual(type(dt), DeltaTable)

    # convert to delta with partition column provided as a string
    tempFile3 = self.tempFile + "_3"
    df.write.partitionBy("value").format("parquet").save(tempFile3)
    dt = DeltaTable.convertToDelta(
        self.spark, "parquet.`%s`" % tempFile3, "value int")
    self.__checkAnswer(
        self.spark.read.format("delta").load(tempFile3),
        [('a', 1), ('b', 2), ('c', 3)])
    self.assertEqual(type(dt), DeltaTable)
def fix_schema(schema):
    fixed_schema = StructType()
    for field in schema.fields:
        if field.name != "_corrupt_record":
            fixed_schema.add(field)
    fixed_schema = ArrayType(fixed_schema)
    return fixed_schema
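# Hedged sketch of one way fix_schema above might be applied: drop the
# _corrupt_record column that spark.read.json adds in PERMISSIVE mode, wrap the
# remaining fields in an ArrayType, and use that to parse a string column that
# holds a JSON array. An existing SparkSession `spark` is assumed, and the
# column and field names are illustrative assumptions.
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, LongType, StringType

inferred = (StructType()
            .add("id", LongType())
            .add("name", StringType())
            .add("_corrupt_record", StringType()))
array_schema = fix_schema(inferred)  # ArrayType(StructType(id, name))

df = spark.createDataFrame([('[{"id": 1, "name": "a"}]',)], ["payload"])
parsed = df.withColumn("records", from_json(col("payload"), array_schema))
parsed.show(truncate=False)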
def get_ipdata_schema():
    schema = StructType()
    schema.add('start', LongType(), True)
    schema.add('stop', LongType(), True)
    schema.add('continent_code', StringType(), True)
    schema.add('continent_name', StringType(), True)
    schema.add('country_iso_code', StringType(), True)
    schema.add('country_name', StringType(), True)
    return schema
def read_schema(schema_arg):
    mydict = {"StringType()": StringType(), "IntegerType()": IntegerType()}
    split_val = schema_arg.split(",")
    schema = StructType()
    for i in split_val:
        x = i.split(" ")
        schema.add(x[0], mydict[x[1]], True)
    return schema
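# Hedged usage sketch for read_schema above: the argument is comma-separated
# "name Type()" pairs with a single space between name and type and no space
# after the comma. An existing SparkSession `spark` is assumed; the column
# names and sample rows are illustrative.
schema = read_schema("name StringType(),age IntegerType()")
df = spark.createDataFrame([("alice", 30), ("bob", 25)], schema)
df.printSchema()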
def make_schema(schema_file):
    """Define schema based on .csv input."""
    with open('/home/ubuntu/data-processing/' + schema_file, newline='') as f:
        reader = csv.reader(f)
        schema_cols = list(reader)
    schema = StructType()
    for i, name in enumerate(schema_cols[0]):
        schema.add(name, schema_cols[1][i], True)
    return schema
def get_schema(num_col_names, str_col_names):
    assert isinstance(num_col_names[0], str)
    assert isinstance(str_col_names[0], str)
    _struct = StructType()
    for i in num_col_names:
        _struct.add(i, FloatType())
    for i in str_col_names:
        _struct.add(i, StringType())
    return _struct
def test_wrong_schema_raises_error(self):
    data = [['A', 1], ['B', 2]]
    schema = StructType()
    schema.add('Name', StringType(), True)
    schema.add('Group', IntegerType(), True)
    data_df = self.session.createDataFrame(data, schema=schema)
    self.assertRaises(AssertionError, MyEntity, data_df)
def get_spark_schema(header=DEFAULT_HEADER):
    ## create schema
    schema = StructType()
    ## do label + ints
    n_ints = 14
    for i in range(n_ints):
        schema.add(StructField(header[i], IntegerType()))
    ## do categoricals
    for i in range(26):
        schema.add(StructField(header[i + n_ints], StringType()))
    return schema
def schema_for(descriptor):
    if descriptor is None:
        return None
    struct_type = StructType()
    for field_descriptor in sorted(descriptor.fields, key=lambda x: x.name):
        struct_type.add(
            field_descriptor.name,
            __type_for(field_descriptor),
            field_descriptor.label != field_descriptor.LABEL_REQUIRED)
    return struct_type
def test_wrong_output_schema_raises_error(self):
    input_schema = StructType()
    input_schema.add('Batsman', StringType(), True)
    input_schema.add('Match', StringType(), True)
    input_schema.add('Runs', IntegerType(), True)
    data = [['Sachin', 'M1', 100], ['Dravid', 'M1', 50], ['Laxman', 'M2', 150]]
    input_df = self.session.createDataFrame(data, schema=input_schema)
    trans = MyTransformation()
    trans.output_schema.add('Batsman', StringType(), True)
    self.assertRaises(ValueError, trans.transform, input_df)
def readHbaseTableToDataFrame(spark, zk_hosts, zk_parent, hbase_table, rowkey_name):
    sc = spark.sparkContext
    rdd = readHbaseRDD(sc, zk_hosts, zk_parent, hbase_table)
    # rdd2 = rdd.flatMap(lambda x: [(x[0], json.loads(e)) for e in x[1].split('\n')])
    # figure out how many columns should be used
    columns = rdd.flatMap(lambda x: [(x[0], json.loads(e)) for e in x[1].split('\n')]) \
        .map(lambda col: col[1]['qualifier']).distinct().collect()
    schema = StructType()
    schema.add(StructField(rowkey_name, StringType(), True))
    for c in columns:
        schema.add(StructField(c, StringType(), True))
    rdd2 = rdd.map(lambda x: (x[0], [json.loads(e) for e in x[1].split('\n')])) \
        .map(lambda line: jsonListToDict(line, rowkey_name))
    df = spark.createDataFrame(rdd2, schema)
    return df
def __init__(self, predAndLabelsWithOptWeight):
    sc = predAndLabelsWithOptWeight.ctx
    sql_ctx = SQLContext.getOrCreate(sc)
    numCol = len(predAndLabelsWithOptWeight.first())
    schema = StructType([
        StructField("prediction", DoubleType(), nullable=False),
        StructField("label", DoubleType(), nullable=False)])
    if numCol == 3:
        schema.add("weight", DoubleType(), False)
    df = sql_ctx.createDataFrame(predAndLabelsWithOptWeight, schema)
    java_class = sc._jvm.org.apache.spark.mllib.evaluation.MulticlassMetrics
    java_model = java_class(df._jdf)
    super(MulticlassMetrics, self).__init__(java_model)
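# Hedged usage sketch for the MulticlassMetrics constructor above, mirroring the
# pyspark.mllib.evaluation API: an RDD of (prediction, label) pairs; an optional
# third element per tuple would take the schema.add("weight", ...) branch. An
# existing SparkContext `sc` is assumed, and the numbers are illustrative.
predictionAndLabels = sc.parallelize(
    [(0.0, 0.0), (0.0, 1.0), (1.0, 1.0), (1.0, 1.0)])
metrics = MulticlassMetrics(predictionAndLabels)
print(metrics.accuracy)
print(metrics.confusionMatrix().toArray())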
def __init__(self, predictionAndObservations):
    sc = predictionAndObservations.ctx
    sql_ctx = SQLContext.getOrCreate(sc)
    numCol = len(predictionAndObservations.first())
    schema = StructType([
        StructField("prediction", DoubleType(), nullable=False),
        StructField("observation", DoubleType(), nullable=False)])
    if numCol == 3:
        schema.add("weight", DoubleType(), False)
    df = sql_ctx.createDataFrame(predictionAndObservations, schema=schema)
    java_class = sc._jvm.org.apache.spark.mllib.evaluation.RegressionMetrics
    java_model = java_class(df._jdf)
    super(RegressionMetrics, self).__init__(java_model)
def __init__(self, scoreAndLabels):
    sc = scoreAndLabels.ctx
    sql_ctx = SQLContext.getOrCreate(sc)
    numCol = len(scoreAndLabels.first())
    schema = StructType([
        StructField("score", DoubleType(), nullable=False),
        StructField("label", DoubleType(), nullable=False)])
    if numCol == 3:
        schema.add("weight", DoubleType(), False)
    df = sql_ctx.createDataFrame(scoreAndLabels, schema=schema)
    java_class = sc._jvm.org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
    java_model = java_class(df._jdf)
    super(BinaryClassificationMetrics, self).__init__(java_model)
def schema_for(descriptor):
    if descriptor is None:
        return None
    struct_type = StructType()
    for field_descriptor in descriptor.fields:
        struct_type.add(
            field_descriptor.name,
            __type_for(field_descriptor),
            field_descriptor.label != field_descriptor.LABEL_REPEATED
            and field_descriptor.label != field_descriptor.LABEL_REQUIRED
        )
    return struct_type
def get_schema(csv):
    """
    Get a Spark schema for the collected csv

    Args:
        csv: the collected csv data

    Returns:
        StructType describing the columns
    """
    schemas = StructType()
    for field in build_table_schema(csv, index=False)["fields"]:
        field_type = StringType() if field["type"] == "string" \
            else FloatType() if field["type"] == "number" \
            else IntegerType()
        schemas.add(StructField(field["name"], field_type, True))
    return schemas
def get_featureengineering_schema():
    schema = StructType()
    schema.add('ip', StringType(), True).add('maxScore', DoubleType(), True).add('minScore', DoubleType(), True)
    schema.add('avgScore', DoubleType(), True).add('lastScore', DoubleType(), True).add('trendUp', DoubleType(), True)
    schema.add('trendDown', DoubleType(), True).add('trueCount', DoubleType(), True).add('dataSetCount', DoubleType(), True)
    schema.add('mostCommonCustomerHit', StringType(), True).add('totalCustomersHit', DoubleType(), True)
    schema.add('0549ca77de276efd3b57753b1489a17748b85da3', DoubleType(), True).add('07d5d0d8f59cd398a6beeb58a06d4c522a57f112', DoubleType(), True)
    return schema
def __init__(self, predictionAndLabels: RDD[Tuple[float, float]]):
    sc = predictionAndLabels.ctx
    sql_ctx = SQLContext.getOrCreate(sc)
    numCol = len(predictionAndLabels.first())
    schema = StructType([
        StructField("prediction", DoubleType(), nullable=False),
        StructField("label", DoubleType(), nullable=False),
    ])
    if numCol >= 3:
        schema.add("weight", DoubleType(), False)
    if numCol == 4:
        schema.add("probability", ArrayType(DoubleType(), False), False)
    df = sql_ctx.createDataFrame(predictionAndLabels, schema)
    assert sc._jvm is not None
    java_class = sc._jvm.org.apache.spark.mllib.evaluation.MulticlassMetrics
    java_model = java_class(df._jdf)
    super(MulticlassMetrics, self).__init__(java_model)
def get_spark_schema(header=DEFAULT_HEADER):
    """Get Spark schema from header.

    Args:
        header (list): Dataset header names.

    Returns:
        pyspark.sql.types.StructType: Spark schema.
    """
    # create schema
    schema = StructType()
    # do label + ints
    n_ints = 14
    for i in range(n_ints):
        schema.add(StructField(header[i], IntegerType()))
    # do categoricals
    for i in range(26):
        schema.add(StructField(header[i + n_ints], StringType()))
    return schema
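# Hedged usage sketch for get_spark_schema above: the 14-integer / 26-categorical
# layout matches a Criteo-style TSV, so a header of that shape is assumed here,
# as are the file path and an existing SparkSession `spark`; all are illustrative.
header = ["label"] + ["int%02d" % i for i in range(13)] + ["cat%02d" % i for i in range(26)]
schema = get_spark_schema(header)
df = spark.read.csv("criteo_sample.txt", schema=schema, sep="\t", header=False)
df.printSchema()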
def build_polygon_layer(sc, spark, polygon, wkid):
    from pyspark.sql.types import DoubleType, ArrayType, StructType, StructField, IntegerType
    from pyspark.sql import Row

    # two fields:
    row = Row(1, [polygon])
    rows = [row]
    rdd = sc.parallelize(rows)

    fschema = StructType()
    fschema.add("ID", data_type=IntegerType())
    rings_Type = StructType(
        [StructField("rings", ArrayType(elementType=ArrayType(elementType=ArrayType(elementType=DoubleType()))))])
    meta = {'geometry': {'type': 'polygon', 'spatialReference': {'latestWkid': wkid, 'wkid': wkid}}}
    fschema.add("$geometry", data_type=rings_Type, metadata=meta)

    df = spark.createDataFrame(rdd, fschema)
    return df
def get_dataframe_schema(df_dict):
    dtypes = {
        "IntegerType()": IntegerType(),
        "StringType()": StringType(),
        "DoubleType()": DoubleType()
    }
    cust_schema = StructType()
    column_count = len(df_dict['sources']['driverSource']['fields'])
    print(column_count)
    for i in range(0, column_count):
        cust_schema.add(df_dict['sources']['driverSource']['fields'][i]['name'],
                        dtypes[df_dict['sources']['driverSource']['fields'][i]['type']], True)
    return cust_schema
def ray_dataset_to_spark_dataframe(spark: sql.SparkSession,
                                   arrow_schema: "pa.lib.Schema",
                                   blocks: List[ObjectRef],
                                   locations: List[bytes]) -> DataFrame:
    if not isinstance(arrow_schema, pa.lib.Schema):
        raise RuntimeError(f"Schema is {type(arrow_schema)}, required pyarrow.lib.Schema. \n"
                           f"to_spark does not support converting non-arrow ray datasets.")
    schema = StructType()
    for field in arrow_schema:
        schema.add(field.name, from_arrow_type(field.type), nullable=field.nullable)
    # TODO how to branch on type of block?
    sample = ray.get(blocks[0])
    if isinstance(sample, bytes):
        return _convert_by_rdd(spark, blocks, locations, schema)
    elif isinstance(sample, pa.Table):
        return _convert_by_udf(spark, blocks, locations, schema)
    else:
        raise RuntimeError("ray.to_spark only supports arrow type blocks")
def build_schema(self, metadata, fileconfig):
    data_types = metadata.data_types
    derived_columns = fileconfig.get('derived_columns')
    if derived_columns:
        col_num = len(data_types) - len(derived_columns)
    else:
        col_num = len(data_types)
    columns_to_drop = fileconfig.get('columns_to_drop')
    if columns_to_drop:
        col_num = col_num + len(columns_to_drop.split(','))
    schema = StructType()
    for i in range(col_num):
        schema.add(StructField("_c" + str(i), StringType(), True))
    self.logger.info('schema=%s' % schema.simpleString())
    return schema
def custom_rule_check(spark, jdbcUrl, connectionProperties, FinalDf, entryid):
    import pyspark.sql.functions as f
    from pyspark.sql.types import StructType, StringType, IntegerType, StructField
    import mysql.connector

    pdq_1 = "(select Custom_rule from `deaccelator`.`custom_rule_metadata` where `EntryID` = {0} ) as pdq".format(
        entryid)
    df_custom = spark.read.jdbc(url=jdbcUrl, table=pdq_1, properties=connectionProperties)
    try:
        customdict = df_custom.first().asDict()
        customlist = customdict["Custom_rule"].split(',')
        for item in customlist:
            pdq_3 = "(select Definition from `deaccelator`.`centralrulerepo` where `Name` = '{0}' )".format(
                item)
            cnx = mysql.connector.connect(
                user="******",
                password="******",
                host="demetadata.mysql.database.azure.com",
                database='deaccelator')
            cur = cnx.cursor()
            cur.execute(pdq_3)
            rule_definition = (cur.fetchone())[0]
            FinalDf = FinalDf.select(f.col("*"), f.expr(rule_definition).alias(item))
        newrdd1 = FinalDf.rdd.map(lambda row: row_function_1(row, customlist))
        to_prepend = [StructField("Flag", StringType(), True)]
        updated_schema = StructType(to_prepend + FinalDf.schema.fields)
        schema_sorted = StructType()
        structfield_list_sorted = sorted(updated_schema, key=lambda x: x.name)
        for item in structfield_list_sorted:
            schema_sorted.add(item)
        newdf = spark.createDataFrame(newrdd1, schema_sorted)
    except:
        newdf = FinalDf.withColumn("Flag", f.lit("VALID"))
    silver_df = newdf.filter(newdf['Flag'] == 'VALID')
    silver_df = silver_df.drop('Flag')
    silver_df = silver_df.select(list(Metadata_Dict.keys()))
    reject_df = newdf.filter(newdf['Flag'] != 'VALID')
    return silver_df, reject_df
def Business_Rule_Check(spark, jdbcUrl, connectionProperties, df, entryid):
    pdq_1 = "(select * from `deaccelator`.`business_rule_metadata` where EntryID = {0} ) pdq_1".format(
        entryid)
    df_rule = spark.read.jdbc(url=jdbcUrl, table=pdq_1, properties=connectionProperties)
    from pyspark.sql.functions import regexp_replace
    df_rule = df_rule.withColumn(
        "ColumnName", regexp_replace(df_rule["ColumnName"], " ", ""))
    lookup_df = spark.createDataFrame([], df_rule.schema)
    parameter = []
    rddobject = df_rule.rdd
    for x in rddobject.collect():
        adict = x.asDict()
        parameter.append(adict)
    for x in parameter:
        if x['RuleName'] == 'Lookup':
            lookuplist = x['RuleParameters'].split(',', 3)
            connection_dictionary = eval(lookuplist[3])
            lookup_df = ReadBlob(spark, connection_dictionary)
        else:
            lookup_df = spark.createDataFrame([], df_rule.schema)
    global lookuprdd
    lookuprdd = sc.broadcast(lookup_df.collect())
    import pyspark.sql.functions as f
    from pyspark.sql.types import StructType, StringType, IntegerType, StructField
    newrdd = df.rdd.map(lambda row: row_function(row, parameter))
    to_prepend = [StructField("Flag", StringType(), True)]
    updated_schema = StructType(to_prepend + df.schema.fields)
    schema_sorted = StructType()
    structfield_list_sorted = sorted(updated_schema, key=lambda x: x.name)
    for item in structfield_list_sorted:
        schema_sorted.add(item)
    newdf = spark.createDataFrame(newrdd, schema_sorted)
    silver_df = newdf.filter(newdf['Flag'] == 'VALID')
    silver_df = silver_df.drop('Flag')
    silver_df = silver_df.select(list(Metadata_Dict.keys()))
    reject_df = newdf.filter(newdf['Flag'] != 'VALID')
    return silver_df, reject_df
def get_dark3_schema():
    schema = StructType()
    schema.add('record_id', StringType(), True).add('report_date', StringType(), True).add('anonymous_id', StringType(), True)
    schema.add('sector', StringType(), True).add('size', StringType(), True).add('data', StringType(), True)
    schema.add('count_seen', DoubleType(), True).add('first_seen', StringType(), True).add('last_seen', StringType(), True)
    schema.add('dark3_score', DoubleType(), True).add('estimate', StringType(), True)
    return schema
def buildSchema(*fields):
    """
    Creates a schema from a list of field tuples

    The resulting schema is an instance of a Spark StructType.

    :param fields:
    :type fields: tuple<String, DataType>
    :return:
    :rtype: StructType
    """
    schema = StructType()
    for fieldName, fieldType in fields:
        schema = schema.add(fieldName, fieldType, False, None)
    return schema
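# Hedged usage sketch for buildSchema above: each positional argument is a
# (name, DataType) tuple, and every field is added as non-nullable. The names
# and types below are illustrative.
from pyspark.sql.types import StringType, IntegerType

schema = buildSchema(("name", StringType()), ("age", IntegerType()))
print(schema.simpleString())  # struct<name:string,age:int>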
def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
    """
    Create a DataFrame from a given pandas.DataFrame by slicing it into partitions, converting
    to Arrow data, then sending to the JVM to parallelize. If a schema is passed in, the
    data types will be used to coerce the data in Pandas to Arrow conversion.
    """
    from distutils.version import LooseVersion
    from pyspark.serializers import ArrowStreamPandasSerializer
    from pyspark.sql.types import from_arrow_type, to_arrow_type, TimestampType
    from pyspark.sql.utils import require_minimum_pandas_version, \
        require_minimum_pyarrow_version

    require_minimum_pandas_version()
    require_minimum_pyarrow_version()

    from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
    import pyarrow as pa

    # Create the Spark schema from list of names passed in with Arrow types
    if isinstance(schema, (list, tuple)):
        if LooseVersion(pa.__version__) < LooseVersion("0.12.0"):
            temp_batch = pa.RecordBatch.from_pandas(pdf[0:100], preserve_index=False)
            arrow_schema = temp_batch.schema
        else:
            arrow_schema = pa.Schema.from_pandas(pdf, preserve_index=False)
        struct = StructType()
        for name, field in zip(schema, arrow_schema):
            struct.add(name, from_arrow_type(field.type), nullable=field.nullable)
        schema = struct

    # Determine arrow types to coerce data when creating batches
    if isinstance(schema, StructType):
        arrow_types = [to_arrow_type(f.dataType) for f in schema.fields]
    elif isinstance(schema, DataType):
        raise ValueError("Single data type %s is not supported with Arrow" % str(schema))
    else:
        # Any timestamps must be coerced to be compatible with Spark
        arrow_types = [to_arrow_type(TimestampType())
                       if is_datetime64_dtype(t) or is_datetime64tz_dtype(t) else None
                       for t in pdf.dtypes]

    # Slice the DataFrame to be batched
    step = -(-len(pdf) // self.sparkContext.defaultParallelism)  # round int up
    pdf_slices = (pdf[start:start + step] for start in xrange(0, len(pdf), step))

    # Create list of Arrow (columns, type) for serializer dump_stream
    arrow_data = [[(c, t) for (_, c), t in zip(pdf_slice.iteritems(), arrow_types)]
                  for pdf_slice in pdf_slices]

    jsqlContext = self._wrapped._jsqlContext

    safecheck = self._wrapped._conf.arrowSafeTypeConversion()
    col_by_name = True  # col by name only applies to StructType columns, can't happen here
    ser = ArrowStreamPandasSerializer(timezone, safecheck, col_by_name)

    def reader_func(temp_filename):
        return self._jvm.PythonSQLUtils.readArrowStreamFromFile(jsqlContext, temp_filename)

    def create_RDD_server():
        return self._jvm.ArrowRDDServer(jsqlContext)

    # Create Spark DataFrame from Arrow stream file, using one batch per partition
    jrdd = self._sc._serialize_to_jvm(arrow_data, ser, reader_func, create_RDD_server)
    jdf = self._jvm.PythonSQLUtils.toDataFrame(jrdd, schema.json(), jsqlContext)
    df = DataFrame(jdf, self._wrapped)
    df._schema = schema
    return df