Example #1
 def __init__(self):
     self.spark = SparkSession.builder.appName("DatasetClient").getOrCreate()  # TODO make config, and per (linux) user
     self.df = None  # start empty so the first PATH below initializes the DataFrame
     for path in PATHS:
         if self.df is None:
             self.df = self.spark.read.schema(StructType.fromJson(json.loads(PATH_TO_SCHEMA[path]))).parquet(path)
         else:
             new_df = self.spark.read.schema(StructType.fromJson(json.loads(PATH_TO_SCHEMA[path]))).parquet(path)
             self.df = self.df.join(new_df, on='user_id')
Example #2
def test_rmse():
    # TODO: revise so that it takes the user's inputs instead of hardcoded values

    movies_schema = None
    ratings_schema = None

    # load the schemas
    with open("movielens_20m_movies_schema.json", "r") as json_schema_file:
        movies_schema = StructType.fromJson(json.load(json_schema_file))

    with open("movielens_20m_ratings_schema.json", "r") as json_schema_file:
        ratings_schema = StructType.fromJson(json.load(json_schema_file))

    # create a hdfs directory
    os.system("hdfs dfs -mkdir datasets")

    # load the json file into the hdfs directory
    os.system("hdfs dfs -put movielens_10m_ratings.json.gz datasets/movielens_10m_ratings.json.gz")

    # create a DataFrame based on the content of the json file
    ratingsDF = scsingleton.sqlCtx.read.json("hdfs://localhost:9000/datasets/movielens_10m_ratings.json.gz", schema=ratings_schema)
    # explicitly repartition RDD after loading so that more tasks can run on it in parallel
    # by default, defaultMinPartitions == defaultParallelism == estimated # of cores across all of the machines in your cluster
    ratingsDF = ratingsDF.repartition(scsingleton.sc.defaultParallelism * 3)    

    # parse ratings DataFrame into an RDD of [(userId, itemId, rating)]
    ratingsRDD = ratingsDF.map(lambda row: (row.user_id, row.movie_id, row.rating))
    ratingsRDD.cache()

    # split data into train (60%), test (40%)
    # TODO: add validation in the future? train (60%), validation (20%), test(20%)?
    trainingRDD, testRDD = ratingsRDD.randomSplit([0.6, 0.4])
    trainingRDD.cache()
    testRDD.cache()

    # run training algorithm to build the model
    # without validation
    with Timer() as t:
        model = ALS.train(trainingRDD, rank=3)
    print "ALS.train(trainingRDD, rank=3): %s seconds" % t.secs

    # make a prediction
    with Timer() as t:
        testPredRDD = model.predictAll( testRDD.map( lambda x: (x[0], x[1]) ) ).cache()
    print "testPredRDD: %s seconds" % t.secs

    # calculate RMSE
    with Timer() as t:
        testRmse = pm.calculate_rmse_using_rdd(testRDD, testPredRDD)
    print "testRmse: %s seconds" % t.secs
    print "testRmse", testRmse

    return
Example #3
    def _retrieve_schema(self, manifest_file):
        rdd = self.spark.sparkContext.wholeTextFiles(manifest_file)
        text = rdd.collect()[0][1]
        schema_dict = json.loads(text)
        custom_schema = StructType.fromJson(schema_dict)

        return custom_schema
Example #4
def get_dataset(sc, spark, base_path, connector, input_path, start_day,
                end_day):
    # Ship code to executors
    ship_dir(base_path + "/algos", sc, base_path)
    ship_dir(base_path + "/core", sc, base_path)
    ship_dir(base_path + "/connectors", sc, base_path)

    # Find connector
    connector_module = my_import(connector, sc)

    # Parse dates
    y1, m1, d1 = start_day.split("_")
    date1 = date(int(y1), int(m1), int(d1))
    y2, m2, d2 = end_day.split("_")
    date2 = date(int(y2), int(m2), int(d2))

    # Instantiate connector
    connector_instance = connector_module(input_path, date1, date2)

    # Get and enforce Schema
    output_type = connector_instance.output_type
    schema_file = base_path + "/schema/" + output_type + ".json"
    schema_json = json.load(open(schema_file, "r"))
    schema = StructType.fromJson(schema_json)
    connector_instance.set_schema(schema)

    # Get Dataset
    dataset = connector_instance.get_DF(sc, spark)

    # Return
    return dataset
Example #5
def json_to_spark_schema(json_schema: Dict[str, JsonSchemaType]) -> StructType:
    """
    Return Spark Schema for a JSON schema.

    Args:
        json_schema (Dict[str, JSON_SCHEMA_TYPE]): schema in json format.

    Returns:
        StructType: Spark Schema for the corresponding JSON schema.

    Raises:
        KeyError: Missing Schema key fields Name/Field/Nullable
        TypeError: Invalid json was provided

    """
    try:
        return StructType.fromJson(json_schema)
    except KeyError as key_error:
        LOGGING.error(str(key_error))
        raise KeyError('Missing key: {0}. Valid format: {1}'.format(
            str(key_error),
            'All schema columns must have a name, type and nullable key'))
    except TypeError as key_error:
        LOGGING.error(str(key_error))
        raise TypeError('Invalid json was provided')
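A minimal usage sketch (not part of the original example); the field names are illustrative, and the dict follows the name/type/nullable layout the docstring requires:

example_schema = {
    "type": "struct",
    "fields": [
        {"name": "user_id", "type": "long", "nullable": False, "metadata": {}},
        {"name": "rating", "type": "double", "nullable": True, "metadata": {}},
    ],
}
spark_schema = json_to_spark_schema(example_schema)
print(spark_schema.simpleString())  # struct<user_id:bigint,rating:double>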
Example #6
def getSchema(dataPath, externalSystem, schema, table, stepLogGuid, basepath, samplingRatio=.5, timeout=6000, zone="silver", delimiter="", header=True, multiLine="False"):
  if zone == "silver":
    path = "{0}/query/schemas/{1}/{2}/{3}/schema.json".format(basepath, externalSystem, schema, table)
    args = {
      "stepLogGuid": stepLogGuid,
      "dataPath": dataPath,
      "externalSystem": externalSystem,
      "schemaName": schema, 
      "tableName": table,
      "samplingRatio": samplingRatio,
      "schemaPath": path
    }
  elif zone == "bronze":
    path = "{0}/raw/schemas/{1}/{2}/schema.json".format(basepath, externalSystem, table)
    args = {
      "stepLogGuid": stepLogGuid,
      "dataPath": dataPath,
      "externalSystem": externalSystem,
      "tableName": tableName,
      "samplingRatio": samplingRatio,
      "delimiter": delimiter,
      "hasHeader": header,
      "schemaPath": path,
      "multiLine": multiLine
    }
  
  try:
    head = dbutils.fs.head(path, 256000)
  except Exception as e:
    dbutils.notebook.run("/Framework/Data Engineering/Silver Zone/Get Schema", timeout, args)
    head = dbutils.fs.head(path, 256000)
    
  import json
  from pyspark.sql.types import StructType
  return StructType.fromJson(json.loads(head))
Example #7
        def run_job(job_info, df):
            table_name = job_info.target_table_name if job_info.target_table_name else "ods_{}.sync_{}".format(
                job_info.source_db_name, job_info.source_table_name)
            schema = table_name.split(".")[0]

            if job_info.target_type == 'hive':
                spark.sql("create database if not exists {}".format(schema))
                df.write.mode("overwrite").format("orc").saveAsTable(table_name)

            if job_info.target_type == 'phoenix':
                df.write \
                    .format("org.apache.phoenix.spark") \
                    .mode("overwrite") \
                    .option("table", table_name) \
                    .option("zkUrl", job_info.target_host) \
                    .save()

            if job_info.target_type in ('mysql', 'postgresql'):
                jdbc_url = "jdbc:{}://{}:{}/{}".format(job_info.target_type, job_info.target_host, job_info.target_port,
                                                       job_info.target_db_name)

                properties = {"user": job_info.target_db_user, "password": job_info.target_db_psw}

                # Convert struct/array and other complex columns to string
                schema = df.schema.jsonValue()
                for field in schema.get("fields"):
                    if isinstance(field.get("type"), dict):
                        field.update(type='string')
                struct = StructType.fromJson(schema)

                new_df = spark.createDataFrame(df.rdd, struct)

                new_df.write \
                    .mode("overwrite") \
                    .option("truncate", True) \
                    .jdbc(jdbc_url, table_name, properties=properties)
Example #8
 def schema(self):
     """load schema from the json file"""
     if (self.fullSchemaPath() is None):
         return None
     else:
         with open(self.fullSchemaPath(), "r") as sj:
             schema_st = sj.read()
         return StructType.fromJson(json.loads(schema_st))
Example #9
 def read_source(self, source):
     """
     get spark dataframe from source
     :param source: a source that contains file_path, source_type, options and schema_json
     :return: spark dataframe
     """
     if source.schema_json:
         if isinstance(source.schema_json, str):
             schema = StructType.fromJson(json.loads(source.schema_json))
         elif isinstance(source.schema_json, dict):
             schema = StructType.fromJson(source.schema_json)
         else:
             raise TypeError("source schema should be str")
         return self.sparkSession.read.format(source.source_type) \
             .options(**json.loads(source.options)).schema(schema).load(source.file_path).cache()
     else:
         return self.sparkSession.read.format(source.source_type) \
             .options(**json.loads(source.options)).load(source.file_path).cache()
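Illustrative only: a minimal stand-in for the source argument, with the attribute names read_source expects (file_path, source_type, options, schema_json); the paths, options and the reader object are assumptions, not from the original code.

from types import SimpleNamespace

source = SimpleNamespace(
    file_path="/data/events.csv",
    source_type="csv",
    options='{"header": "true"}',
    schema_json='{"type": "struct", "fields": '
                '[{"name": "id", "type": "long", "nullable": true, "metadata": {}}]}',
)
df = reader.read_source(source)  # "reader" is any object exposing read_source and a sparkSession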
Example #10
def load_df_schema(bucket, fname):
  import boto3, json
  from pyspark.sql.types import StructType 

  # reload the schema from s3
  s3 = boto3.client('s3')

  obj = s3.get_object(Bucket=bucket, Key=fname)
  json_schema = obj['Body'].read().decode('utf-8')
  return StructType.fromJson(json.loads(json_schema))
Example #11
def schema_load(option):
    """Loads Spark DataFrame schema from JSON file.

    :param option: File name suffix for the DataFrame schema.
    :type option: string
    :returns: DataFrame schema.
    :rtype: StructType
    """
    with open(f"trending_{option}.json", "r", encoding="UTF-8") as f_schema:
        return StructType.fromJson(load(f_schema))
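A one-line usage sketch; the "videos" suffix is hypothetical and simply selects trending_videos.json on disk:

videos_schema = schema_load("videos")  # opens trending_videos.json and returns a StructType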
Example #12
 def BuildSparkSchema(table,
                      forceAllFieldsToString=False,
                      useValidation=False,
                      excludeComputed=False):
     '''
     returns the schema for spark
     '''
     from pyspark.sql.types import StructType  #@UnresolvedImport
     schemaJson = SparkUtilities.BuildSparkSchemaJson(
         table, forceAllFieldsToString, useValidation, excludeComputed)
     schema = StructType.fromJson(schemaJson)
     return schema
Example #13
 def load_table(self, sc, spark, table_path, table_name):
     parquet_reader = spark.read.format('parquet')
     if self.args.table_schema is not None:
         self.get_logger(sc).info("Reading table schema from {}".format(
             self.args.table_schema))
         with open(self.args.table_schema, 'r') as s:
             schema = StructType.fromJson(json.loads(s.read()))
         parquet_reader = parquet_reader.schema(schema)
     df = parquet_reader.load(table_path)
     df.createOrReplaceTempView(table_name)
     self.get_logger(sc).info("Schema of table {}:\n{}".format(
         table_name, df.schema))
Example #14
def sample_data(spark):
    root = os.path.dirname(__file__)
    schema_path = os.path.join(root, "resources",
                               "experiments-summary.schema.json")
    with open(schema_path) as f:
        d = json.load(f)
        schema = StructType.fromJson(d)
    rows_path = os.path.join(root, "resources",
                             "experiments-summary-190-rows.json")
    # FAILFAST causes us to abort early if the data doesn't match
    # the given schema. Without this there was a very annoying
    # problem where dataframe.collect() would return an empty set.
    frame = spark.read.json(rows_path, schema, mode="FAILFAST")
    return frame
Example #15
def process_json_to_dataframe(schema_name, paths):
    """Processes JSON to Spark DataFrame.

    :param schema_name: Schema name.
    :type schema_name: string
    :param paths: S3 paths to process.
    :type paths: list
    :returns: Spark DataFrame.
    :rtype: DataFrame
    """
    drop_subset = [
        "dut_type", "dut_version",
        "passed",
        "test_name_long", "test_name_short",
        "test_type",
        "version"
    ]

    # load schemas
    with open(f"iterative_{schema_name}.json", "r", encoding="UTF-8") as f_schema:
        schema = StructType.fromJson(load(f_schema))

    # create empty DF out of schemas
    sdf = spark.createDataFrame([], schema)

    # filter list
    filtered = [path for path in paths if schema_name in path]

    # select
    for path in filtered:
        print(path)

        sdf_loaded = spark \
            .read \
            .option("multiline", "true") \
            .schema(schema) \
            .json(path) \
            .withColumn("job", lit(path.split("/")[4])) \
            .withColumn("build", lit(path.split("/")[5]))
        sdf = sdf.unionByName(sdf_loaded, allowMissingColumns=True)

    # drop rows with all nulls and drop rows with null in critical frames
    sdf = sdf.na.drop(how="all")
    sdf = sdf.na.drop(how="any", thresh=None, subset=drop_subset)

    # flatten frame
    sdf = flatten_frame(sdf)

    return sdf
Example #16
    def __init__(self, jresolved_table_schema):
        """
        Create a resolved table schema from the underlying Java object.

        :param jresolved_table_schema: Java object of ResolvedTableSchema
        """
        from pyspark.sql.types import StructType
        table_name = jresolved_table_schema.tableName()
        json_schema = json.loads(jresolved_table_schema.schema().json())
        jschema = StructType.fromJson(json_schema)

        pk_columns = []
        it = jresolved_table_schema.pkColumns().iterator()
        while it.hasNext():
            pk_columns.append(it.next())

        sharding_columns = []
        it = jresolved_table_schema.shardingColumns().iterator()
        while it.hasNext():
            sharding_columns.append(it.next())

        self.pk_indexes = []
        it = jresolved_table_schema.pkIndex().iterator()
        while it.hasNext():
            jindex_spec = it.next()
            json_ispec = json.loads(jindex_spec.toJsonStr())
            self.pk_indexes.append(IndexSpecification.from_json(json_ispec))

        partition_columns = []
        it_option_wrapper = jresolved_table_schema.partitionColumns().iterator()
        if "{}".format(it_option_wrapper) == "non-empty iterator":
            #remove the Option wrapper
            it_partCols = it_option_wrapper.next().iterator()
            if "{}".format(it_partCols) == "non-empty iterator":
                while it_partCols.hasNext():
                    partition_columns.append(it_partCols.next())
            # else: an empty list for partition_columns
        elif "{}".format(it_option_wrapper) != "empty iterator":
            raise Exception("Expected returned value for partition_columns to be a JavaObject representing an iterator not {}".format(it))
        #else: print ("a wrapper for None")

        if len(partition_columns) > 0:
            super(ResolvedTableSchema, self).__init__(table_name, jschema,
                                                  sharding_columns, pk_columns, partition_columns)
        else:
            super(ResolvedTableSchema, self).__init__(table_name, jschema,
                                                  sharding_columns, pk_columns)
        self.jresolved_table_schema = jresolved_table_schema
Example #17
    def init_source(self, spark_session, options):
        source_format = options.get("format", "text")
        schema = options["schema"]
        input_path = options["path"]
        max_files_in_batch = options["max-files-per-trigger"]

        print(schema)
        schema_type = StructType.fromJson(loads(schema))

        return spark_session \
            .readStream \
            .format(source_format) \
            .schema(schema_type) \
            .option("path", input_path) \
            .option("maxFilesPerTrigger", max_files_in_batch) \
            .load()
Example #18
def schema_from_json(path):
    """ Create a pyspark schema from the json representation.

    The json representation must be from a StructType. This can be
    generated from any StructType using the `.json()` method. The
    schema for a dataframe can be obtained using the `.schema`
    accessor. For example, to generate the json from the
    `topline_summary`, run the following in the pyspark repl:

    >>> path = 's3a://telemetry-parquet/topline_summary/v1/mode=weekly'
    >>> json_data = spark.read.parquet(path).schema.json()

    :path str: Path the the json data
    """
    with pkg_resources.resource_stream(mozetl.topline.__name__, path) as f:
        data = json.load(f)
        return StructType.fromJson(data)
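A hedged round-trip sketch of the serialization the docstring describes; the DataFrame, column names, and the spark session are illustrative assumptions, not from mozetl:

import json
from pyspark.sql.types import StructType

df = spark.createDataFrame([(1, "a")], ["id", "label"])  # assumes an active SparkSession named spark
schema_json = df.schema.json()                           # StructType -> JSON string
restored = StructType.fromJson(json.loads(schema_json))
assert restored == df.schema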
Example #19
    def do(self, workflow, etl_process):

        from pyspark.sql.types import StructType, StructField, StringType
        from json import load

        self.create()
        self.location = self.action_details.pop("location")
        self.format = self.action_details.pop("format")
        self.schema = self.action_details.pop("schema", None)

        # self.schema should be a relative filepath to a spark-compliant schema
        if self.schema:
            with open(self.schema) as fp:
                schema = StructType.fromJson(load(fp))

        # if there is no spark-compliant schema, attempt to use the columns and set all
        # fields to string
        elif self.columns:
            schema = StructType(
                [StructField(i, StringType()) for i in self.columns])

        # if there are no columns, then allow the schema to be inferred using the inference
        # rules native to the format
        else:
            schema = None

        schema_types = {
            "csv": self.spark.read.csv,
            "json": self.spark.read.json
        }

        schemaless_types = {
            "parquet": self.spark.read.parquet,
            "orc": self.spark.read.orc,
        }

        if self.format in schema_types:
            reader = schema_types[self.format]
            workflow.df = reader(self.location,
                                 schema=schema,
                                 **self.action_details)
        else:
            reader = schemaless_types[self.format]
            workflow.df = reader(self.location, **self.action_details)
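For reference, hypothetical action_details payloads for two of the schema paths handled above (the keys mirror the pop() calls in do(); the locations and file names are made up):

csv_with_schema  = {"location": "s3://bucket/data.csv", "format": "csv",
                    "schema": "schemas/data.schema.json", "header": True}
parquet_inferred = {"location": "s3://bucket/data.parquet", "format": "parquet"}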
Example #20
def _generate_struct(message_definition):
    lexer = RosMessageLexer(InputStream(message_definition))
    stream = CommonTokenStream(lexer)
    parser = RosMessageParser(stream)
    tree = parser.rosbag_input()
    visitor = RosMessageSchemaVisitor()
    visitor.visit(tree)

    struct_fields = []
    for f in visitor.fields:
        struct_fields.append({
            'metadata': {},
            'name': f[0],
            'nullable': True,
            'type': 'integer'
        })

    schema_dict = {'fields': struct_fields, 'type': 'struct'}
    return StructType.fromJson(schema_dict)
Example #21
def generate_schema(columns, nullable_columns='all'):
    """
    Parameters
    ----------
    columns: dict of column names (keys) and types (values)
    nullable_columns: list of nullable columns, optional, default is 'all'

    Returns
    -------
    schema: StructType
        Spark DataFrame schema corresponding to Python/numpy types.
    """
    columns = sorted(columns.items())
    colnames = list(map(itemgetter(0), columns))
    coltypes = list(map(itemgetter(1), columns))

    invalid_types = []
    new_types = []
    keys = list(map(itemgetter(0), list(_mapping.items())))
    for coltype in coltypes:
        if coltype not in keys:
            invalid_types.append(coltype)
        else:
            if coltype == np.dtype('O'):
                new_types.append(str)
            else:
                new_types.append(keys[keys.index(coltype)])
    assert len(invalid_types) == 0, "Invalid type(s) specified: {}".format(
        str(invalid_types))

    if nullable_columns == 'all':
        nullables = [True] * len(colnames)
    else:
        nullables = [col in nullable_columns for col in colnames]

    fields = [{
        "metadata": {},
        "name": name,
        "nullable": nullable,
        "type": _mapping[typ]
    } for name, typ, nullable in zip(colnames, new_types, nullables)]
    return StructType.fromJson({"type": "struct", "fields": fields})
Example #22
    def run(self, processor_context: ProcessorContext) -> Dependency:
        dependency_config = {}

        default_options = processor_context.get_property_group(
            self.DEFAULT_PROPS_GROUP)
        load_options = processor_context.get_property_group(
            self.LOAD_OPTIONS_GROUP)

        view_name = default_options.get_property(self.VIEW_NAME)
        if view_name is not None:
            dependency_config['view_name'] = view_name

        path = default_options.get_property(self.PATH)
        load_format = default_options.get_property(self.FORMAT)
        schema = default_options.get_property(self.SCHEMA)

        struct_type = StructType.fromJson(schema)

        df = processor_context.spark_session.readStream.load(
            path=path, format=load_format, schema=struct_type, **load_options)

        return Dependency(df, dependency_config)
Example #23
def generate_schema(colnames, coltypes, nullables=None):
    """
    Parameters
    ----------
    colnames: list of string
    coltypes: list of type
    nullables: list of boolean, optional

    Returns
    -------
    schema: StructType
        Spark DataFrame schema corresponding to Python/numpy types.
    """
    assert len(colnames) == len(
        coltypes), "You must specify types for all columns."
    invalid_types = []
    new_types = []
    keys = list(map(itemgetter(0), list(_mapping.items())))
    for coltype in coltypes:
        if coltype not in keys:
            invalid_types.append(coltype)
        else:
            if coltype == np.dtype('O'):
                new_types.append(str)
            else:
                new_types.append(keys[keys.index(coltype)])
    assert len(invalid_types) == 0, "Invalid type(s) specified: {}".format(
        str(invalid_types))

    if nullables is None:
        nullables = [True] * len(colnames)

    fields = [{
        "metadata": {},
        "name": name,
        "nullable": nullable,
        "type": _mapping[typ]
    } for name, typ, nullable in zip(colnames, new_types, nullables)]
    return StructType.fromJson({"type": "struct", "fields": fields})
Example #24
def test_drop_dup_keep_latest(tsv_path, csv_schema_path, json_schema_path,
                              id_col, date_col, keep_date_null):
    # setup
    spark = pytest.spark
    pwd = os.path.dirname(os.path.abspath(__file__))
    with open(os.path.join(pwd, csv_schema_path)) as f:
        csv_schema = StructType.fromJson(json.load(f))
    with open(os.path.join(pwd, json_schema_path)) as f:
        json_schema = ArrayType.fromJson(json.load(f))

    df = spark.read.csv(os.path.join(pwd, tsv_path),
                        header=True,
                        sep='\t',
                        schema=csv_schema)
    df = df.withColumn(
        'tmp_payload',
        functions.explode(
            functions.from_json(functions.col('payload'), json_schema)))
    df = df.withColumn('status', functions.col('tmp_payload.status')) \
           .withColumn('is_old', functions.col('tmp_payload.is_old')) \
           .withColumn('order_date', functions.col('tmp_payload.order_date')) \
           .withColumn('timestamp', functions.col('tmp_payload.timestamp')) \
           .drop('payload', 'tmp_payload')

    # exec
    tmp_df = df.groupBy(id_col).agg(functions.max(date_col).alias(date_col))\
               .sort(id_col)
    if not keep_date_null:
        tmp_df = tmp_df.dropna(subset=date_col)
    res_df = drop_dup_keep_latest(pytest.spark, df, id_col, date_col,
                                  keep_date_null)

    # assert
    ans = [list(row) for row in tmp_df.collect()]
    res = [list(row) for row in res_df.select(id_col, date_col).collect()]
    assert res == ans
Example #25
def init_schema(json_location) -> StructType:
    with open(json_location) as source:
        data = json.load(source)
        return StructType.fromJson(data)
Example #26
import ujson as json
import os
import pkg_resources

from pyspark.sql.types import StructType

import mozetl
SCHEMA_DIR = 'json'
MAIN_SUMMARY_SCHEMA_BASENAME = 'main_summary.v4.schema.json'
main_summary_path = os.path.join(SCHEMA_DIR, MAIN_SUMMARY_SCHEMA_BASENAME)

with pkg_resources.resource_stream(mozetl.__name__, main_summary_path) as f:
    d = json.load(f)
    MAIN_SUMMARY_SCHEMA = StructType.fromJson(d)
Example #27
 def str_to_schema(s):
     return StructType.fromJson(json.loads(s))
Example #28
def get_twitter_schema(json_file_name):
    schema_dict = json.load(open(json_file_name))
    schema_struct = StructType.fromJson(schema_dict)
    return schema_struct
Example #29
import json
import pyspark
import configFile
from pyspark.sql import functions
from pyspark.sql.functions import explode
from pyspark.sql.types import *
from pyspark.sql.types import StructType, StructField, StringType

#Create Spark Session
sparkSession = pyspark.sql.SparkSession.builder.config(conf=configFile.sparkConfig).getOrCreate()

dfDatabaseData = sparkSession.read.option("multiline", "true")\
    .json(configFile.dbaasSourceFile)

dbaas_schema = StructType.fromJson(json.load(open("/Users/prammitr/Documents/Doc/my_projects/pyspark/dbaas_excra_schema.json")))

# 1. dbaas_db_system_dim
## Dependencies:
### $.dbSystemData  --> dbaas_db_system_dim
dfdbaas_db_system_dim = sparkSession.read.schema(dbaas_schema).option("multiline", "true")\
    .json(configFile.dbaasSourceFile)\
    .withColumn("dbSystemData", explode("dbSystemData")) \
    .select("dbSystemData.*")
dfdbaas_db_system_dim.createOrReplaceTempView("dbaas_db_system_dim")
dbaas_db_system_dim = sparkSession.sql(
     "select id,displayName,computeShape,dbSystemShape,databaseEdition,nodeCount,timeCreated,nodeCount,licenseType,tempHostSerial from dbaas_db_system_dim a")

#print("Printing Data for ---- > dbSystemData  --> dbaas_db_system_dim")
#dbaas_db_system_dim.show(2)

Example #30
dbutils.fs.help()

# COMMAND ----------

dbutils.fs.put(untappd_raw_path, json.dumps(full_data), True)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Ingest our schema file from the Data Lake

# COMMAND ----------

head = dbutils.fs.head(untappd_raw_schema_path, 10000)
schema = StructType.fromJson(json.loads(head))

# COMMAND ----------

df = spark.read.schema(schema).json(untappd_raw_path)

# COMMAND ----------

# df.show()

# COMMAND ----------

df.write.format('delta').mode("append").save(untappd_raw_delta_path)

# COMMAND ----------
Example #31
COUNTRIES_FILE_PATH  = '/opt/SparkDatasets/geography/countries.csv'
CITIES_FILE_PATH     = '/opt/SparkDatasets/geography/cities.csv'

CONTINENT_STRUCTURE = \
    [ ( 'continent_id'  , 'integer' )
    , ( 'continent_name', 'string'  ) ]
COUNTRY_STRUCTURE = \
    [ ( 'country_id'  , 'integer' )
    , ( 'continent_id', 'integer' )
    , ( 'country_name', 'string'  ) ]
CITY_STRUCTURE = \
    [ ( 'city_id'   , 'integer' )
    , ( 'country_id', 'integer' )
    , ( 'city_name' , 'string'  ) ]

CONTINENT_SCHEMA = StructType.fromJson( generate_schema_dict(CONTINENT_STRUCTURE) )
COUNTRY_SCHEMA   = StructType.fromJson( generate_schema_dict(COUNTRY_STRUCTURE)   )
CITY_SCHEMA      = StructType.fromJson( generate_schema_dict(CITY_STRUCTURE)      )

spark = SparkSession.builder.getOrCreate()

continents_df = generate_dataframe( spark, CONTINENT_SCHEMA, CONTINENTS_FILE_PATH )
countries_df  = generate_dataframe( spark, COUNTRY_SCHEMA  , COUNTRIES_FILE_PATH  )
cities_df     = generate_dataframe( spark, CITY_SCHEMA     , CITIES_FILE_PATH     )

continents_df.registerTempTable('continents')
countries_df.registerTempTable('countries')
cities_df.registerTempTable('cities')

print continents_df.count()
print countries_df.count()

Example #32
df_json_vals = trainer_df.toJSON().collect()[0]
df_json_schema = trainer_df.schema.jsonValue()

full_scoring_record = {"vals":json.loads(df_json_vals),"schema":df_json_schema}
json.dump({"vals":json.loads(df_json_vals),"schema":df_json_schema},"full_scoring_record.json")

spark.createDataFrame(full_scoring_record['vals'])

json.dumps({"vals":json.loads(df_json_vals),"schema":df_json_schema})


with open('json_vals.json','w') as json_vals_out_file:
    json_vals_out_file.write(df_json_vals)

with open('json_schema.json','w') as json_schema_out_file:
    json.dump(df_json_schema,json_schema_out_file)


with open('json_vals.json','r') as json_vals_in_file:
    in_df_json_vals = json.load(json_vals_in_file)

with open('json_schema.json','r') as json_schema_in_file:
    in_df_json_schema = StructType.fromJson(json.load(json_schema_in_file))


in_df = spark.read.json(
    'json_vals.json'
    ,schema = in_df_json_schema)