Example #1
dataPath = "file:///home/karthik/srikarthik/twitter/"

"""
On Windows, use the absolute path of the folder, including the drive letter.

Example: dataPath = "C:\\filepath\\twitter\\"

Note: specify only the folder path, not a filename; otherwise the statements below throw an error.

"""


# Defining Schema

dataSchema = StructType([StructField("Text",StringType(),True),StructField("Score",DoubleType(),True)])

"""
Notes:

*The method SparkSession.readStream returns a DataStreamReader used to configure the stream.

*Every streaming DataFrame must have a schema - the definition of column names and data types -
otherwise it throws an error.

*The type of stream can be: files, Kafka, TCP/IP, console, etc. We are using files as the stream.
In our example below, we will be consuming files written continuously to a pre-defined directory. To control how much data is pulled into Spark at once, we can specify the option maxFilesPerTrigger. For example, we will read in only one file for every trigger.

* We have stored tweets with the fields text and score in a text file using '|' as the delimiter, so we will read it with the csv format and '|' as the separator.

Note: Streaming DataFrame
"""
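# A minimal sketch (not from the original snippet) of the streaming read described in the
# notes above, assuming a SparkSession named `spark`:
streamingDF = (spark.readStream
               .format("csv")
               .option("sep", "|")
               .option("maxFilesPerTrigger", 1)
               .schema(dataSchema)
               .load(dataPath))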
Example #2
pdf_system_alerts= pd.DataFrame(system_alerts)
pdf_system_regions= pd.DataFrame(system_regions)


# Datatype change: expand the station_ids list into one station_id string per row, and rename the column
columns = ['alert_id','last_updated', 'station_id','summary','type']
new_pdf_system_alerts = pd.DataFrame(columns=columns)
for index, row in pdf_system_alerts.iterrows():
    if len(row.station_ids)==1:
        new_pdf_system_alerts.loc[len(new_pdf_system_alerts)]=[row['alert_id'],row['last_updated'],row['station_ids'][0],row['summary'],row['type']] 
    else:
        for val in row.station_ids[:]:
            new_pdf_system_alerts.loc[len(new_pdf_system_alerts)]=[row['alert_id'],row['last_updated'],val,row['summary'],row['type']]
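# Side note (illustrative, assumes pandas >= 0.25): DataFrame.explode can do the same
# list-to-rows expansion more idiomatically, e.g.
# new_pdf_system_alerts = (pdf_system_alerts
#                          .explode('station_ids')
#                          .rename(columns={'station_ids': 'station_id'})[columns])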

            
schema_system_alerts = StructType([StructField("alert_id", LongType(), True), StructField("last_updated", LongType(), False), StructField("station_id", StringType(), False), StructField("summary", StringType(), False), StructField("type", StringType(), False)])            
sqldf_system_alerts = sqlContext.createDataFrame([],schema_system_alerts)

# Convert the pandas DataFrames to Spark DataFrames for aggregation
sqldf_station_info=sqlContext.createDataFrame(pdf_station_info)   
sqldf_station_status=sqlContext.createDataFrame(pdf_station_status)
if len(pdf_system_alerts) > 0:
    sqldf_system_alerts=sqlContext.createDataFrame(new_pdf_system_alerts)  
if len(pdf_system_regions):
    sqldf_system_regions=sqlContext.createDataFrame(pdf_system_regions)




joined_info_status = sqldf_station_info.join(sqldf_station_status,['station_id'],'outer')
join_info_status_alert = joined_info_status.join(sqldf_system_alerts,['station_id'],'outer')
jdf = spark.read.json(df.select("emp_json").rdd.map(lambda s: str(s)))

jdf.show(truncate=False)
jdf.printSchema()

schema = \
    StructType([
        StructField("emp",ArrayType(
            StructType([
                StructField("address",
                        StructType([
                            StructField("city",  StringType(), True),
                            StructField("street",StringType(), True),
                            StructField("unit",  StringType(), True)]),
                        True),
                StructField("name",
                        StructType([
                            StructField("firstName", StringType(), True),
                            StructField("lastName", StringType(), True)]),
                        True)
            ])
        ,True)
   ,True)
])

#    .select(F.explode(F.col("employee")).alias("emp")) \

jdf = jdf.select(F.col("*"),
                 F.from_json(F.col("_corrupt_record"), schema).alias("emp"))

jdf = jdf.select("emp.name.firstName", "emp.name.lastName",
Example #4
def cleaner(df_union, spark):

    # Cast intervalSize to integer
    df_union = df_union.withColumn(
        "intervalSize", df_union["intervalSize"].cast(IntegerType()))

    # Make the request to fetch the configuration JSON
    base_url = "http://80f56133-default-platforms-88e5-2108999998.us-east-1.elb.amazonaws.com/platform_settings/get_configuration"

    headers = {
        "Content-Type": "application/json",
        "Transaction-Id": "test",
        "User-Id": "test",
        "owner": "test"
    }

    # Make the request using urllib
    req = request.Request(base_url, headers=headers)
    response = request.urlopen(req).read()
    response = json.loads(response)
    req_config = response["data"][0]

    # Extract the attributes and store them in variables
    for item in req_config["configPastFutureDays"]:
        if item['type'] == 'LP':
            lp_futuro = item['future']
            lp_pasado = item['past']
        else:
            re_futuro = item['future']
            re_pasado = item['past']

    ceros = req_config["withNullsOrCero"]["withCeros"]

    lista_apagon = req_config["QualityCode"]["PowerOutage"]

    def check_empty(df):
        return len(df.head(1)) > 0

    # Define the dictionary and the function for converting UTC to readingLocalTime; this is used in step 5

    utc_dicc = {
        '0': -12,
        '1': -11,
        '2': -10,
        '3': -9,
        '4': -8,
        '5': -7,
        '6': -7,
        '7': -6,
        '8': -6,
        '9': -6,
        '10': -5,
        '11': -5,
        '12': -5,
        '13': -4,
        '14': -4,
        '15': -3.5,
        '16': -3,
        '17': -3,
        '18': -2,
        '19': -1,
        '20': 0,
        '21': 0,
        '22': 1,
        '23': 1,
        '24': 1,
        '25': 2,
        '26': 2,
        '27': 2,
        '28': 2,
        '29': 2,
        '30': 3,
        '31': 3,
        '32': 3.5,
        '33': 4,
        '34': 4.5,
        '35': 5,
        '36': 5.5,
        '37': 6,
        '38': 7,
        '39': 8,
        '40': 8,
        '41': 9,
        '42': 9.5,
        '43': 9.5,
        '44': 10,
        '45': 10,
        '46': 10,
        '47': 11,
        '48': 12,
        '49': 12,
        '50': -8,
        '51': -7,
        '52': -6,
        '53': -6,
        '54': -6,
        '55': -6,
        '56': 10
    }

    def conversor_utc_local(row):
        if not (row.readingLocalTime):
            fecha = datetime.strptime(row.readingUtcLocalTime,
                                      '%Y-%m-%d %H:%M:%S')
            fecha = fecha + timedelta(hours=utc_dicc[row.servicePointTimeZone])
            return fecha.strftime('%Y-%m-%d %H:%M:%S')
        else:
            return row.readingLocalTime

    # PART 1: FLAGS
    # Discard flagged records: if any flag is false, the record is discarded (it is kept in another
    # dataframe that will be used to build the logs)
    print(df_union.head(1))
    print(len(df_union.head(1)))
    print(len(df_union.head(1)) > 0)
    if check_empty(df_union):
        # First check the association-status flag; if it is false, all records matching that condition
        # are split off from the original dataframe into the logs dataframe

        df_logs = df_union.filter(df_union.result_rD == False).withColumn(
            "Descripcion_log", lit("Estado de asociacion"))
        df_logs.show()
        df_logs.printSchema()

        df_union = df_union.filter(df_union.result_rD == True)

        # Now split off the records where the variable status or the save flag is false
        # and append them to the logs df

        df_logs_aux = df_union.filter((df_union.result_sPV == False) | (df_union.toStock == False))\
        .withColumn("Descripcion_log",lit("result variable - to stock "))

        df_logs = df_logs.union(df_logs_aux).coalesce(1)

        df_union = df_union.filter((df_union.result_sPV == True)
                                   & (df_union.toStock == True))

        df_union.write.format('csv').mode("overwrite").save(
            "./output/cleaned/paso1_banderas", header="true", emptyValue="")

    # PART 2: PAST/FUTURE DAYS
    # Based on the request response (req_config), keep only the readings whose dates fall within the allowed window

    if check_empty(df_union):

        # Records where readingUtcLocalTime is null cannot be validated at all, so we
        # drop them from the original df and add them to the logs
        df_union = df_union.filter(df_union.readingUtcLocalTime.isNotNull())

        # Create the function to apply to each row of the df
        hoy = datetime.today()
        date_format = '%Y-%m-%d'

        def pasado_futuro(row):
            last_date = datetime.strptime(row.lastReadDate[:10], date_format)
            asoc = datetime.strptime(row.relationStartDate[:10], date_format)
            reading_time = datetime.strptime(row.readingUtcLocalTime[:10],
                                             date_format)
            if (asoc - last_date).days > 0:
                last_date = asoc

            if row.readingType == "LOAD PROFILE READING":
                return (last_date - timedelta(days=lp_pasado)
                        ) <= reading_time <= (hoy + timedelta(days=lp_futuro))
            else:
                return (last_date - timedelta(days=re_pasado)
                        ) <= reading_time <= (hoy + timedelta(days=re_futuro))

        # Apply the function to the dataframe
        udf_object = udf(pasado_futuro, BooleanType())
        df_union = df_union.withColumn(
            "lastReadDate_result",
            udf_object(struct([df_union[x] for x in df_union.columns])))

        # Split off the records where lastReadDate_result is false
        # and append them to the logs df

        df_logs_aux = df_union.filter(df_union.lastReadDate_result == False)\
        .withColumn("Descripcion_log",lit("dias pasado-futuro"))\
        .drop("lastReadDate_result")

        df_logs = df_logs.union(df_logs_aux).coalesce(1)

        df_union = df_union.filter(df_union.lastReadDate_result == True)
        df_union = df_union.drop("lastReadDate_result")

        df_union.write.format('csv').mode("overwrite").save(
            "./output/cleaned/paso2_pasadoFuturo",
            header="true",
            emptyValue="")

    # PART 3: DUPLICATES AND INCONSISTENCIES
    # For fully repeated readings, discard all but one (step 1). If readings share the same key set
    # (servicePointId, deviceId, meteringType, variableId, readingLocalTime, logNumber) but have a different
    # readingsValue, discard all of those rows because the data is inconsistent (step 2).
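    # Condensed sketch of the same idea (illustrative, not the exact code path used below):
    #   step 1: dropDuplicates on the full key plus readingsValue keeps one copy of each exact duplicate
    #   step 2: keys that still map to more than one readingsValue after step 1 are inconsistent and are dropped entirely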

    if check_empty(df_union):

        # step 1 (DUPLICATES)
        #union_unique = union.dropDuplicates(['servicePointId','deviceId','meteringType','variableId','readingLocalTime','logNumber','readingsValue'])
        #df_logs_aux = union.subtract(union_unique).withColumn("Descripcion_log",lit("Duplicados"))

        ### Get Duplicate rows in pyspark

        df_logs_aux = df_union.groupBy('servicePointId', 'deviceId',
                                       'meteringType', 'variableId',
                                       'readingLocalTime', 'logNumber',
                                       'readingsValue').count()
        df_logs_aux = df_logs_aux.filter(df_logs_aux["count"] > 1)
        df_logs_aux = df_logs_aux.select(
            col("servicePointId").alias("servicePointId_2"),
            col("deviceId").alias("deviceId_2"),
            col("meteringType").alias("meteringType_2"),
            col("variableId").alias("variableId_2"),
            col("readingLocalTime").alias("readingLocalTime_2"),
            col("logNumber").alias("logNumber_2"),
            col("readingsValue").alias("readingsValue_2"))
        df_logs_aux = df_logs_aux.withColumn("Descripcion_log",
                                             lit("Duplicados"))
        df_logs_aux = df_union.join(
            df_logs_aux,
            [(df_union.servicePointId == df_logs_aux.servicePointId_2),
             (df_union.deviceId == df_logs_aux.deviceId_2),
             (df_union.meteringType == df_logs_aux.meteringType_2),
             (df_union.variableId == df_logs_aux.variableId_2),
             (df_union.readingLocalTime == df_logs_aux.readingLocalTime_2),
             (df_union.logNumber.eqNullSafe(df_logs_aux.logNumber_2)),
             (df_union.readingsValue == df_logs_aux.readingsValue_2)],
            how='inner').coalesce(1)
        df_logs_aux = df_logs_aux.dropDuplicates([
            'servicePointId', 'deviceId', 'meteringType', 'variableId',
            'readingLocalTime', 'logNumber', 'readingsValue'
        ])
        columns_to_drop = [
            "servicePointId_2", "deviceId_2", "meteringType_2", "variableId_2",
            "readingLocalTime_2", "logNumber_2", "readingsValue_2"
        ]
        df_logs_aux = df_logs_aux.drop(*columns_to_drop)

        df_union = df_union.dropDuplicates([
            'servicePointId', 'deviceId', 'meteringType', 'variableId',
            'readingLocalTime', 'logNumber', 'readingsValue'
        ])

        df_logs = df_logs.union(df_logs_aux).coalesce(1)

        # step 2 (INCONSISTENCIES)
        # implemented with a left anti join
        union_unique = df_union.dropDuplicates([
            'servicePointId', 'deviceId', 'meteringType', 'variableId',
            'readingLocalTime', 'logNumber'
        ])
        df_duplicates = df_union.subtract(union_unique).select(
            'servicePointId', 'deviceId', 'meteringType', 'variableId',
            'readingLocalTime', 'logNumber')
        df_final = union_unique.join(
            df_duplicates,
            [(union_unique.servicePointId == df_duplicates.servicePointId),
             (union_unique.deviceId == df_duplicates.deviceId),
             (union_unique.meteringType == df_duplicates.meteringType),
             (union_unique.variableId == df_duplicates.variableId),
             (union_unique.readingLocalTime == df_duplicates.readingLocalTime),
             (union_unique.logNumber.eqNullSafe(df_duplicates.logNumber))],
            how='left_anti').coalesce(1)
        # eqNullSafe handles the null-safe comparison on logNumber: (union_unique.logNumber.eqNullSafe(df_duplicates.logNumber))

        df_logs_aux = df_union.subtract(union_unique).withColumn(
            "Descripcion_log", lit("Valores inconsitentes")).coalesce(1)
        df_logs = df_logs.union(df_logs_aux).coalesce(1)

        df_union = df_final

        df_union.write.format('csv').mode("overwrite").save(
            "./output/cleaned/paso3_duplicados", header="true", emptyValue="")

    # PART 4: VERSIONING

    # Create the estimation and edition columns with empty values
    df_union = df_union.withColumn("estimationReading", lit(""))\
                    .withColumn("estimationValid", lit(""))\
                    .withColumn("editionReading", lit(""))\
                    .withColumn("editionValid", lit(""))

    # Check whether the df has REGISTERS or LOAD PROFILE READING data
    if check_empty(
            df_union.filter((df_union.readingType == "REGISTERS") | (
                df_union.readingType == "LOAD PROFILE READING"))):

        df_nuevo = df_union.filter((df_union.readingType == "REGISTERS") | (
            df_union.readingType == "LOAD PROFILE READING"))

        min_ts = df_nuevo.agg(min('readingUtcLocalTime')).collect()[0][0]

        # Set every version value to 1, as if the readings were new; they are adjusted later
        df_nuevo = df_nuevo.withColumn("version", lit(1))

        # This is the df stored in trustdata (the database); run a query that brings rows with readingType
        # REGISTERS or LOAD PROFILE READING and a date between max_ts and min_ts
        df_orig = spark.read.csv(
            "../tests/Examples/original_modificado_glue.csv",
            header=True,
            mode="DROPMALFORMED")

        df_orig = df_orig.filter(df_orig.readingUtcLocalTime >= min_ts)

        df_orig.write.format('csv').mode("overwrite").save(
            "./output/cleaned/trustdata_para_versionado",
            header="true",
            emptyValue="")
        # ATHENA

        # Check that the df from the database is not empty so the joins can be done correctly; if it is
        # empty, the version field of the processed df is simply left at 1
        if len(df_orig.head(1)) > 0:

            df_orig = df_orig.filter((df_orig.readingType == "REGISTERS") | (
                df_orig.readingType == "LOAD PROFILE READING"))

            # Get the info of the latest version of each unique reading
            df_orig = df_orig.withColumn(
                "version", df_orig["version"].cast(IntegerType()))
            df_maxs = df_orig.groupby([
                'servicePointId', "variableId", "logNumber",
                "readingUtcLocalTime", "meteringType", "readingsSource",
                "owner"
            ]).agg(max("version")).drop('version').withColumnRenamed(
                'max(version)', 'version').coalesce(1)
            df_orig = df_orig.withColumnRenamed('servicePointId', 'servicePointId_drop')\
                            .withColumnRenamed('variableId', 'variableId_drop')\
                            .withColumnRenamed('logNumber', 'logNumber_drop')\
                            .withColumnRenamed('readingUtcLocalTime', 'readingUtcLocalTime_drop')\
                            .withColumnRenamed('meteringType', 'meteringType_drop')\
                            .withColumnRenamed('version', 'version_drop')\
                            .withColumnRenamed('readingsSource', 'readingsSource_drop')\
                            .withColumnRenamed('owner', 'owner_drop')

            # Bring in the data of the latest version of each unique reading
            df_orig = df_maxs.join(
                df_orig,
                [(df_maxs.servicePointId == df_orig.servicePointId_drop),
                 (df_maxs.variableId == df_orig.variableId_drop),
                 (df_maxs.readingsSource == df_orig.readingsSource_drop),
                 (df_maxs.owner == df_orig.owner_drop),
                 (df_maxs.meteringType == df_orig.meteringType_drop),
                 (df_maxs.readingUtcLocalTime
                  == df_orig.readingUtcLocalTime_drop),
                 (df_maxs.logNumber.eqNullSafe(df_orig.logNumber_drop)),
                 (df_maxs.version == df_orig.version_drop)],
                how='inner').coalesce(1)

            columns_to_drop = [
                item for item in df_orig.columns if "_drop" in item
            ]
            df_orig = df_orig.drop(*columns_to_drop)

            # Select the columns of interest from the trustdata database
            df_orig = df_orig.select(
                col("servicePointId").alias("servicePointId_orig"),
                col("variableId").alias("variableId_orig"),
                col("owner").alias("owner_orig"),
                col("logNumber").alias("logNumber_orig"),
                col("meteringType").alias("meteringType_orig"),
                col("readingUtcLocalTime").alias("readingUtcLocalTime_orig"),
                col("readingsSource").alias("readingsSource_orig"),
                col("readingsValue").alias("readingsValue_orig"),
                col("version").alias("version_orig"),
                col("estimationReading").alias("estimationReading_orig"),
                col("estimationValid").alias("estimationValid_orig"),
                col("editionReading").alias("editionReading_orig"),
                col("editionValid").alias("editionValid_orig"))

            # Left join between the df being processed and the df from the database
            df_final = df_nuevo.join(
                df_orig,
                [(df_nuevo.servicePointId == df_orig.servicePointId_orig),
                 (df_nuevo.variableId == df_orig.variableId_orig),
                 (df_nuevo.readingsSource == df_orig.readingsSource_orig),
                 (df_nuevo.owner == df_orig.owner_orig),
                 (df_nuevo.meteringType == df_orig.meteringType_orig),
                 (df_nuevo.readingUtcLocalTime
                  == df_orig.readingUtcLocalTime_orig),
                 (df_nuevo.logNumber.eqNullSafe(df_orig.logNumber_orig))],
                how='left_outer').coalesce(1)

            def versionamiento(row):
                if row.readingsValue_orig is None:
                    return 1
                else:
                    if row.readingsValue_orig == row.readingsValue:
                        return None
                    else:
                        return row.version_orig + 1

            udf_object = udf(versionamiento, IntegerType())
            df_final = df_final.withColumn(
                "version_ref",
                udf_object(struct([df_final[x] for x in df_final.columns])))

            # Logs and resulting df based on the values returned by the versioning function
            df_logs_aux = df_final.filter(
                df_final.version_ref.isNull()).withColumn(
                    "Descripcion_log", lit("Versionamiento duplicados"))

            df_final = df_final.filter(df_final.version_ref.isNotNull())
            df_final = df_final.drop("version").withColumnRenamed(
                "version_ref", 'version')
            df_final = df_final.drop("estimationReading").withColumnRenamed(
                "estimationReading_orig", 'estimationReading')
            df_final = df_final.drop("estimationValid").withColumnRenamed(
                "estimationValid_orig", 'estimationValid')
            df_final = df_final.drop("editionReading").withColumnRenamed(
                "editionReading_orig", 'editionReading')
            df_final = df_final.drop("editionValid").withColumnRenamed(
                "editionValid_orig", 'editionValid')
            # Keep the estimation and edition values from the previous version

        else:
            df_final = df_nuevo

        def usage_reading(row):
            string = ":".join(row.multiplier.split(":")[1:])
            string = string.replace("'", "\"")
            dicti = json.loads(string)
            val = float(row.readingsValue)
            for value in dicti.values():
                if value:
                    val = val * float(value)
            return val
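        # Illustrative only (assumed format): `multiplier` is expected to look roughly like
        # "factor:{'ke': '1.5', 'ct': ''}" -- everything after the first ':' is parsed as JSON
        # and every truthy value multiplies readingsValue.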

        udf_object = udf(usage_reading, StringType())
        df_final = df_final.withColumn(
            "usageReading",
            udf_object(struct([df_final[x] for x in df_final.columns])))

        lista_columnas = [
            "servicePointId", "readingType", "variableId", "deviceId",
            "meteringType", "readingUtcLocalTime", "readingDateSource",
            "readingLocalTime", "dstStatus", "channel", "unitOfMeasure",
            "qualityCodesSystemId", "qualityCodesCategorization",
            "qualityCodesIndex", "intervalSize", "logNumber", "version",
            "readingsValue", "primarySource", "readingsSource", "owner",
            "guidFile", "estatus", "registersNumber", "eventsCode", "agentId",
            "agentDescription", "multiplier", "deviceMaster",
            "deviceDescription", "deviceStatus", "serial", "accountNumber",
            "servicePointTimeZone", "connectionType", "relationStartDate",
            "relationEndDate", "deviceType", "brand", "model", "idVdi",
            "identReading", "idenEvent", "idDateYmd", "idDateYw",
            'dateCreation', "usageReading", "estimationReading",
            "estimationValid", "editionReading", "editionValid"
        ]

        df_final = df_final.select(*lista_columnas)

        # Union with the EVENTS records
        df_union = df_union.filter((df_union.readingType == "EVENTS"))
        df_union = df_union.withColumn("usageReading",
                                       lit('')).select(*lista_columnas)
        df_union = df_union.union(df_final).coalesce(1)

        # logs
        columnas_log = df_logs.columns
        df_logs_aux = df_logs_aux.select(*columnas_log)
        df_logs = df_logs.union(df_logs_aux).coalesce(1)

        df_union.write.format('csv').mode("overwrite").save(
            "./output/cleaned/paso4_versionado", header="true", emptyValue="")

    # PART 5: FILLING WITH ZEROS AND NULLS

    if check_empty(
            df_union.filter(df_union.readingType == "LOAD PROFILE READING")):
        # Only LOAD PROFILE READING data is processed here
        df_load_profile = df_union.filter(
            df_union.readingType == "LOAD PROFILE READING")
        # Create the quality_flag column
        df_load_profile = df_load_profile.withColumn(
            "quality_flag",
            concat(col("qualityCodesSystemId"), lit("-"),
                   col("qualityCodesCategorization"), lit("-"),
                   col("qualityCodesIndex")))

        variables_id = df_load_profile.select(
            "variableId").distinct().collect()
        variables_id = [row.variableId for row in variables_id]

        # Define some helper functions

        def elem_ranges(all_elem, elem):
            """
                        Funcion que devuelve una lista con listas incluyendo los rangos de tiempos faltantes.
                        Input:
                        all_elem (string): todos los timestamps separados por coma en un string
                        elem (list): lista de los timestamp presentes en nuestro df a enriquecer
                        """
            aux_list = []
            for i in elem[1:]:
                include, ommit = all_elem.split(i)
                aux_list.append(include[:-1])
                all_elem = i + ommit
            aux_list.append(elem[-1])
            return aux_list
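        # Illustrative example (hypothetical values): with
        #   all_elem = "t1,t2,t3,t4" and elem = ["t1", "t3", "t4"]
        # elem_ranges returns ["t1,t2", "t3", "t4"], i.e. each present timestamp leads a
        # comma-joined range covering the missing timestamps that follow it.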

        # Function that fills readingsValue with nulls or zeros depending on whether the flag indicates
        # a power outage (relleno is assigned further below, before this UDF is applied)
        def func_apagon(row):
            if (row.quality_flag in lista_apagon) and (not row.referencia):
                return relleno
            else:
                return row.readingsValue_ref

        # Create a function that fills another column (which we call validacion_intervalos) with '1' when
        # the quality_flag indicates an outage (or the row comes from the original data), and null otherwise.
        # This new column lets us decide whether a record should be counted during the interval
        # validation step
        def func_apagon_intervalos(row):
            if (row.quality_flag in lista_apagon) or (row.referencia):
                return "1"
            else:
                return None

        # Here starts the for loop over each element of variables_id
        for iteracion, variable in enumerate(variables_id):

            df_lp = df_load_profile.filter(
                df_load_profile.variableId == variable)

            # Get the time bounds and convert them to timestamps
            max_ts = datetime.strptime(
                (df_lp.agg(max('readingUtcLocalTime')).collect()[0][0]),
                "%Y-%m-%d %H:%M:%S")
            min_ts = datetime.strptime(
                (df_lp.agg(min('readingUtcLocalTime')).collect()[0][0]),
                "%Y-%m-%d %H:%M:%S")
            delta = max_ts - min_ts

            # interval is the interval size in minutes
            interval = df_lp.select("intervalSize").first()[0]
            mins = delta.seconds // 60
            lista = [min_ts]
            for rep in range(1, mins // interval):
                lista.append(min_ts + timedelta(minutes=rep * interval))
            if min_ts != max_ts:
                lista.append(max_ts)

            all_elem = [fecha.strftime('%Y-%m-%d %H:%M:%S') for fecha in lista]
            all_elem = ",".join(all_elem)

            elem = df_lp.select("readingUtcLocalTime").orderBy(
                "readingUtcLocalTime", ascending=True).collect()
            elem = [row.readingUtcLocalTime for row in elem]

            # Build the list that will become part of the df
            to_df = elem_ranges(all_elem, elem)
            to_df = [ls.split(",") for ls in to_df]
            # Build the list for the join; the values are the first element of each list in to_df
            to_df_join = [item[0] for item in to_df]
            # Create the df from the generated lists
            data = zip(to_df_join, to_df)
            schema = StructType([
                StructField('readingUtcLocalTime_2', StringType(), True),
                StructField('timestamps_arrays', ArrayType(StringType()), True)
            ])
            rdd = spark.sparkContext.parallelize(data)
            df_timestamps = spark.createDataFrame(rdd, schema).coalesce(1)

            # Join the exploded timestamp df with a slice of the original df to get a new df carrying the
            # value and quality flags as they originally are for the dates present in the original df, and null where they do not exist.
            df_portion = df_lp.select(col("readingUtcLocalTime").alias("readingUtcLocalTime_ref")
                                    ,col("readingLocalTime").alias("readingLocalTime_ref")
                                    ,col("readingDateSource").alias("readingDateSource_ref")
                                    ,col("readingsValue").alias("readingsValue_ref")
                                    ,col("qualityCodesSystemId").alias("qualityCodesSystemId_ref")
                                    ,col("qualityCodesCategorization").alias("qualityCodesCategorization_ref")
                                    ,col("qualityCodesIndex").alias("qualityCodesIndex_ref")
                                    )\
                                    .withColumn("referencia",lit("Original"))

            df_timestamps_comp = df_timestamps.withColumn(
                "complete_interval_ref",
                explode("timestamps_arrays")).select("complete_interval_ref")

            df_reference = df_timestamps_comp.join(
                df_portion, [(df_timestamps_comp.complete_interval_ref
                              == df_portion.readingUtcLocalTime_ref)],
                how='left').coalesce(1)

            # Join the original df with the timestamp df to obtain the data for the timestamps missing
            # from the original df
            df_lp = df_lp.join(df_timestamps,
                               [(df_lp.readingUtcLocalTime
                                 == df_timestamps.readingUtcLocalTime_2)],
                               how='inner').coalesce(1)
            df_lp = df_lp.drop("readingUtcLocalTime_2")

            # Explode the arrays in the column
            df_lp = df_lp.withColumn("complete_interval",
                                     explode("timestamps_arrays"))
            df_lp = df_lp.drop("timestamps_arrays")

            # Join the original df with the reference df so the value and quality-flag fields stay consistent
            df_lp = df_lp.join(df_reference, [
                (df_lp.complete_interval == df_reference.complete_interval_ref)
            ],
                               how='inner').coalesce(1)

            # Do the filling with nulls or zeros
            # Decide whether to fill with zeros or with nulls

            if ceros:
                relleno = "0"
            else:
                relleno = None

            # Create your UDF object (which accepts func_apagon  python function)
            udf_object = udf(func_apagon, StringType())
            df_lp = df_lp.withColumn(
                "readingsValue_ref",
                udf_object(struct([df_lp[x] for x in df_lp.columns])))

            # Create your UDF object (which accepts func_apagon_intervalos python function)
            udf_object = udf(func_apagon_intervalos, StringType())
            df_lp = df_lp.withColumn(
                "validacion_intervalos",
                udf_object(struct([df_lp[x] for x in df_lp.columns])))

            # Drop some columns and rename others to keep the expected output format

            columns_to_drop = [
                'readingUtcLocalTime_ref', 'complete_interval_ref',
                'readingUtcLocalTime', 'readingDateSource', 'readingLocalTime',
                "qualityCodesSystemId", "qualityCodesCategorization",
                'qualityCodesIndex', "readingsValue", "referencia",
                "quality_flag"
            ]
            df_lp = df_lp.drop(*columns_to_drop)

            if iteracion == 0:
                df_load_profile_final = df_lp
            else:
                df_load_profile_final = df_load_profile_final.union(
                    df_lp).coalesce(1)

        df_load_profile = df_load_profile_final.withColumnRenamed(
            "complete_interval", "readingUtcLocalTime").withColumnRenamed(
                "readingLocalTime_ref", "readingLocalTime").withColumnRenamed(
                    "readingDateSource_ref",
                    "readingDateSource").withColumnRenamed(
                        "qualityCodesSystemId_ref",
                        "qualityCodesSystemId").withColumnRenamed(
                            "qualityCodesCategorization_ref",
                            "qualityCodesCategorization").withColumnRenamed(
                                "qualityCodesIndex_ref",
                                "qualityCodesIndex").withColumnRenamed(
                                    "readingsValue_ref", "readingsValue")

        # Apply the date-conversion function to the dataframe to obtain all the readingLocalTime
        # and readingDateSource values
        udf_object = udf(conversor_utc_local, StringType())
        df_load_profile = df_load_profile.withColumn("readingLocalTime", udf_object(struct([df_load_profile[x] for x in df_load_profile.columns])))\
                                        .withColumn("readingDateSource", col("readingLocalTime"))

        # Select the columns in their expected order and union with the original df filtered by
        # the other reading types

        df_registers_events = df_union.filter(
            (df_union.readingType == "REGISTERS")
            | (df_union.readingType == "EVENTS"))
        df_registers_events = df_registers_events.withColumn(
            "validacion_intervalos", lit(""))

        lista_columnas = [
            "servicePointId", "readingType", "variableId", "deviceId",
            "meteringType", "readingUtcLocalTime", "readingDateSource",
            "readingLocalTime", "dstStatus", "channel", "unitOfMeasure",
            "qualityCodesSystemId", "qualityCodesCategorization",
            "qualityCodesIndex", "intervalSize", "logNumber", "version",
            "readingsValue", "primarySource", "readingsSource", "owner",
            "guidFile", "estatus", "registersNumber", "eventsCode", "agentId",
            "agentDescription", "multiplier", "deviceMaster",
            "deviceDescription", "deviceStatus", "serial", "accountNumber",
            "servicePointTimeZone", "connectionType", "relationStartDate",
            "relationEndDate", "deviceType", "brand", "model", "idVdi",
            "identReading", "idenEvent", "idDateYmd", "idDateYw",
            'dateCreation', "validacion_intervalos", "usageReading",
            "estimationReading", "estimationValid", "editionReading",
            "editionValid"
        ]

        df_load_profile = df_load_profile.select(*lista_columnas)
        df_registers_events = df_registers_events.select(*lista_columnas)

        df_union = df_load_profile.union(df_registers_events).coalesce(1)

        df_union.write.format('csv').mode("overwrite").save(
            "./output/cleaned/paso5_rellenoNulosyCeros",
            header="true",
            emptyValue="")

    # PART 6: INTERVAL VALIDATION

    if check_empty(
            df_union.filter(df_union.readingType == "LOAD PROFILE READING")):
        df_union = df_union.withColumn(
            "intervalSize", df_union["intervalSize"].cast(IntegerType()))

        df_loadprofile = df_union.filter(
            df_union.readingType == "LOAD PROFILE READING")

        variables_id = df_loadprofile.select("variableId").distinct().collect()
        variables_id = [row.variableId for row in variables_id]

        # Here starts the for loop over each element of variables_id
        for iteracion, variable in enumerate(variables_id):

            df_lp = df_loadprofile.filter(
                df_loadprofile.variableId == variable)

            # Get the time bounds and convert them to timestamps
            max_ts = datetime.strptime(
                (df_lp.agg(max('readingUtcLocalTime')).collect()[0][0]),
                "%Y-%m-%d %H:%M:%S")
            min_ts = datetime.strptime(
                (df_lp.agg(min('readingUtcLocalTime')).collect()[0][0]),
                "%Y-%m-%d %H:%M:%S")
            delta = max_ts - min_ts

            # interval is the interval size in minutes
            interval = df_lp.select("intervalSize").first()[0]
            mins = delta.seconds // 60
            cant_lecturas = (mins // interval) + 1

            if df_lp.filter(df_lp.validacion_intervalos.isNotNull()).count(
            ) == cant_lecturas:
                df_lp = df_lp.withColumn("usageValid", lit(True)).withColumn(
                    "ValidationDetail",
                    lit("")).withColumn("IsGrouped", lit(False))
            else:
                df_lp_success = df_lp.filter(
                    df_lp.validacion_intervalos.isNotNull())
                df_lp_success = df_lp_success.withColumn(
                    "usageValid", lit(True)).withColumn(
                        "ValidationDetail",
                        lit("")).withColumn("IsGrouped", lit(False))

                df_lp_failure = df_lp.filter(
                    df_lp.validacion_intervalos.isNull())
                df_lp_failure = df_lp_failure.withColumn("usageValid",lit(False))\
                                        .withColumn("ValidationDetail",lit("{'Usage':{'IntervalsError':'Interval not exist'}}"))\
                                        .withColumn("IsGrouped",lit(False))

                df_lp = df_lp_success.union(df_lp_failure).coalesce(1)
                df_lp = df_lp.orderBy("readingUtcLocalTime", ascending=True)

            if iteracion == 0:
                df_loadprofile_final = df_lp
            else:
                df_loadprofile_final = df_loadprofile_final.union(
                    df_lp).coalesce(1)

        df_loadprofile = df_loadprofile_final

        # Select the columns in their expected order and union with the original df filtered by
        # the other reading types

        df_registers_events = df_union.filter(
            (df_union.readingType == "REGISTERS")
            | (df_union.readingType == "EVENTS"))
        df_registers_events = df_registers_events.withColumn("usageValid",lit(None))\
                                                .withColumn("ValidationDetail",lit(""))\
                                                .withColumn("IsGrouped",lit(None))

        lista_columnas = [
            "servicePointId", "readingType", "variableId", "deviceId",
            "meteringType", "readingUtcLocalTime", "readingDateSource",
            "readingLocalTime", "dstStatus", "channel", "unitOfMeasure",
            "qualityCodesSystemId", "qualityCodesCategorization",
            "qualityCodesIndex", "intervalSize", "logNumber", "version",
            "readingsValue", "primarySource", "readingsSource", "owner",
            "guidFile", "estatus", "registersNumber", "eventsCode", "agentId",
            "agentDescription", "multiplier", "deviceMaster",
            "deviceDescription", "deviceStatus", "serial", "accountNumber",
            "servicePointTimeZone", "connectionType", "relationStartDate",
            "relationEndDate", "deviceType", "brand", "model", "idVdi",
            "identReading", "idenEvent", "idDateYmd", "idDateYw",
            'dateCreation', "usageReading", "usageValid", "ValidationDetail",
            "IsGrouped", "estimationReading", "estimationValid",
            "editionReading", "editionValid"
        ]

        df_loadprofile = df_loadprofile.select(*lista_columnas)
        df_registers_events = df_registers_events.select(*lista_columnas)

        df_union = df_loadprofile.union(df_registers_events).coalesce(1)

        df_union.write.format('csv').mode("overwrite").save(
            "./output/cleaned/paso6_validacionIntervalos",
            header="true",
            emptyValue="")

    # Arrange the df columns before writing, and create any missing columns with empty values
    lista_columnas = [
        "servicePointId", "readingType", "variableId", "deviceId",
        "meteringType", "readingUtcLocalTime", "readingDateSource",
        "readingLocalTime", "dstStatus", "channel", "unitOfMeasure",
        "qualityCodesSystemId", "qualityCodesCategorization",
        "qualityCodesIndex", "intervalSize", "logNumber", "version",
        "readingsValue", "primarySource", "guidFile", "estatus",
        "registersNumber", "eventsCode", "agentId", "agentDescription",
        "multiplier", "deviceMaster", "deviceDescription", "deviceStatus",
        "serial", "accountNumber", "servicePointTimeZone", "connectionType",
        "relationStartDate", "relationEndDate", "deviceType", "brand", "model",
        "idVdi", "identReading", "idenEvent", "idDateYmd", 'dateCreation',
        "usageReading", "usageValid", "ValidationDetail", "IsGrouped",
        "estimationReading", "estimationValid", "editionReading",
        "editionValid", "owner", "readingsSource", "idDateYw"
    ]

    cols_df = set(df_union.columns)

    for columna in set(lista_columnas).difference(cols_df):
        df_union = df_union.withColumn(columna, lit(''))

    df_union = df_union.select(*lista_columnas)

    # Get the year and ISO week used to name the output file
    fecha = date.today()
    anio, sem, _ = fecha.isocalendar()
    anio = str(anio)
    sem = str(sem)

    # Write the CSV files
    df_union.write.format('csv').mode("overwrite").save(
        "./output/cleaned/final/" + anio + "/" + sem,
        header="true",
        emptyValue="")
    df_logs.write.format('csv').mode("overwrite").save(
        "./output/cleaned/removed_rows", header="true", emptyValue="")
    return df_union
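# Hypothetical invocation sketch (not part of the original snippet): assumes an input CSV
# at ./input/union.csv with the columns referenced inside cleaner().
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("cleaner").getOrCreate()
df_input = spark.read.csv("./input/union.csv", header=True)
df_clean = cleaner(df_input, spark)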
Example #5
      ).select("movie1", "movie2", "score", "numPairs")

    return result

# Get the movie name for a given movie id
def getMovieName(movieNames, movieId):
    result = movieNames.filter(func.col("movieID") == movieId) \
        .select("movieTitle").collect()[0]

    return result[0]


spark = SparkSession.builder.appName("MovieSimilarities").master("local[*]").getOrCreate()

movieNamesSchema = StructType([ \
                               StructField("movieID", IntegerType(), True), \
                               StructField("movieTitle", StringType(), True) \
                               ])
    
moviesSchema = StructType([ \
                     StructField("userID", IntegerType(), True), \
                     StructField("movieID", IntegerType(), True), \
                     StructField("rating", IntegerType(), True), \
                     StructField("timestamp", LongType(), True)])
    
    
# Create a broadcast dataset of movieID and movieTitle.
# Apply ISO-8859-1 charset
movieNames = spark.read \
      .option("sep", "|") \
      .option("charset", "ISO-8859-1") \
      .schema(movieNamesSchema) \
Example #6
def make_df_solid(context):
    schema = StructType([StructField('name', StringType()), StructField('age', IntegerType())])
    rows = [Row(name='John', age=19), Row(name='Jennifer', age=29), Row(name='Henry', age=50)]
    return context.resources.pyspark.spark_session.createDataFrame(rows, schema)
Example #7
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructField, StructType, FloatType, StringType, BooleanType, ArrayType, DateType

SPARK_HOST = "spark://spark:7077"
KAFKA_HOST = "kafka:19092"

spark = SparkSession.builder.master(SPARK_HOST).appName("customer-deposits").getOrCreate()
spark.sparkContext.setLogLevel("WARN")

BankDepositsSchema = StructType([
    StructField("accountNumber", StringType()),
    StructField("amount", FloatType()),
    StructField("dateAndTime", StringType())
])

BankCustomersSchema = StructType([
    StructField("customerName", StringType()),
    StructField("email", StringType()),
    StructField("phone", StringType()),
    StructField("birthDay", StringType()),
    StructField("accountNumber", StringType()),
    StructField("location", StringType()),

])

bankDepositsRawStreamingDF = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", KAFKA_HOST) \
    .option("subscribe", "bank-deposits") \
Example #8
def from_arrow_type(at: "pa.DataType",
                    prefer_timestamp_ntz: bool = False) -> DataType:
    """Convert pyarrow type to Spark data type."""
    from distutils.version import LooseVersion
    import pyarrow as pa
    import pyarrow.types as types

    spark_type: DataType
    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at) and prefer_timestamp_ntz and at.tz is None:
        spark_type = TimestampNTZType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " +
                            str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_map(at):
        if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
            raise TypeError(
                "MapType is only supported with pyarrow 2.0.0 and above")
        if types.is_timestamp(at.key_type) or types.is_timestamp(at.item_type):
            raise TypeError("Unsupported type in conversion from Arrow: " +
                            str(at))
        spark_type = MapType(from_arrow_type(at.key_type),
                             from_arrow_type(at.item_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError(
                "Nested StructType not supported in conversion from Arrow: " +
                str(at))
        return StructType([
            StructField(field.name,
                        from_arrow_type(field.type),
                        nullable=field.nullable) for field in at
        ])
    elif types.is_dictionary(at):
        spark_type = from_arrow_type(at.value_type)
    elif types.is_null(at):
        spark_type = NullType()
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " +
                        str(at))
    return spark_type
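# Quick usage sketch (assuming pyarrow is installed and the pyspark.sql.types classes
# referenced above are imported):
import pyarrow as pa

assert from_arrow_type(pa.int64()) == LongType()
assert from_arrow_type(pa.list_(pa.string())) == ArrayType(StringType())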
Example #9

def top_dataframe(aggregated_df):
    return aggregated_df.sort('Carrera').sort('Promedio', ascending=False)


if __name__ == '__main__':
    import sys

    spark = SparkSession.builder.appName('Mejores Promedios').getOrCreate()

    # Students file

    estudiante_schema = StructType([
        StructField('Carnet', IntegerType()),
        StructField('Nombre', StringType()),
        StructField('Carrera', StringType())
    ])

    estudiante_df = spark.read.csv(sys.argv[1],
                                   schema=estudiante_schema,
                                   header=False)

    estudiante_df.show()

    # Courses file

    curso_schema = StructType([
        StructField('Curso', IntegerType()),
        StructField('Creditos', IntegerType()),
        StructField('Carrera', StringType())
Example #10
    sc._jsc.hadoopConfiguration().set("fs.s3a.access.key", args.access_key)

if args.secret_key is not None:
    sc._jsc.hadoopConfiguration().set("fs.s3a.secret.key", args.secret_key)

spark = SparkSession(sc)

reviewsSchema = \
   StructType() \
      .add("marketplace", "string") \
      .add("customer_id", "integer") \
      .add("review_id", "string") \
      .add("product_id", "string") \
      .add("product_title", "string") \
      .add("product_category", "string") \
      .add("star_rating", "integer") \
      .add("helpful_votes", "integer") \
      .add("total_votes", "integer") \
      .add("vine", "boolean") \
      .add("verified_purchase", "boolean") \
      .add("review_headline", "string") \
      .add("review_body", "string") \
      .add("review_date", "date")

df = spark.read \
          .format('csv') \
          .option('sep','\t') \
          .option('header','true') \
          .schema(reviewsSchema) \
          .load(args.source)
        ts_col.append(t)
    return window_col, ts_col


def combine_data(window_col):
    feature_matrix = np.zeros((0, 11))
    for i, item in enumerate(window_col):
        feature = rr_feature_computation(item[:, 0], item[:, 1])
        feature_matrix = np.concatenate(
            (feature_matrix, np.array(feature).reshape(-1, 11)))
    return feature_matrix


schema = StructType([
    StructField("user", StringType()),
    StructField("timestamp", TimestampType()),
    StructField("rr_feature", ArrayType(FloatType())),
])


@pandas_udf(schema, PandasUDFType.GROUPED_MAP)
def rr_interval_feature_extraction(data: object) -> object:
    winsor_limit = 0.1  #FIXME - this must be passed or configurable

    mean = data['rr_interval'].mean()
    std = data['rr_interval'].std()

    data['rr_interval'] = (data['rr_interval'] - mean) / std

    window_col, ts_col = get_windows(data)
    X = combine_data(window_col)
def vectorizeBi(row, dico):
    vector_dict = {}
    for w in row.bigrams:
        if w in dico:
            vector_dict[dico[w]] = 1
    return (row.label, SparseVector(len(dico), vector_dict))



from pyspark.mllib.linalg import VectorUDT
from pyspark.sql.types import StructType, StructField, DoubleType

schema = StructType([
    StructField('label', DoubleType(), True),
    StructField('bigramVectors', VectorUDT(), True)
])


from functools import partial
print "Converting bigrams to sparse vectors in a dataframe for the train set"
t0 = time()
features = dfTrain.map(partial(vectorizeBi,
                               dico=dict_broad.value)).toDF(schema)
features.take(1)
tt = time() - t0
print "Done in {} second".format(round(tt, 3))

Example #13
def get_hrv_features(rr_data, acceptable_percentage=50, window_length=60):
    """

    Args:
        rr_data (DataStream):
        acceptable_percentage (int):
        window_length (int):

    Returns:

    """
    stream_name = 'org.md2k.autosense.ecg.features'

    def get_metadata():
        stream_metadata = Metadata()
        stream_metadata.set_name(stream_name).set_description("HRV Features from ECG RR interval") \
            .add_input_stream(rr_data.metadata.get_name()) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("var")
                .set_type("double")
                .set_attribute("description","variance")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("iqr")
                .set_type("double")
                .set_attribute("description","Inter Quartile Range")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("mean")
                .set_type("double")
                .set_attribute("description","Mean RR Interval")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("median")
                .set_type("double")
                .set_attribute("description","Median RR Interval")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("80th")
                .set_type("double")
                .set_attribute("description","80th percentile RR Interval")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("20th")
                .set_type("double")
                .set_attribute("description","20th percentile RR Interval")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("heartrate")
                .set_type("double")
                .set_attribute("description","Heart Rate in BPM")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("vlf")
                .set_type("double")
                .set_attribute("description","Very Low Frequency Energy")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("lf")
                .set_type("double")
                .set_attribute("description","Low Frequency Energy")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("hf")
                .set_type("double")
                .set_attribute("description","High Frequency Energy")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("lfhf")
                .set_type("double")
                .set_attribute("description","Low frequency to High Frequency energy ratio")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("window")
                .set_type("struct")
                .set_attribute("description","window start and end time in UTC")
                .set_attribute('start','start of window')
                .set_attribute('end','end of window')) \
            .add_module(
            ModuleMetadata().set_name("HRV Features from ECG RR Interval")
                .set_attribute("url", "http://md2k.org/")
                .set_attribute('algorithm','ecg feature computation')
                .set_attribute('unit','ms')
                .set_author("Md Azim Ullah", "*****@*****.**"))
        return stream_metadata

    def get_rr_features(a):
        return np.array([
            np.var(a),
            iqr(a),
            np.mean(a),
            np.median(a),
            np.percentile(a, 80),
            np.percentile(a, 20), 60000 / np.median(a)
        ])

    def frequencyDomain(RRints,
                        tmStamps,
                        band_type=None,
                        lf_bw=0.11,
                        hf_bw=0.1,
                        vlf=(0.003, 0.04),
                        lf=(0.04, 0.15),
                        hf=(0.15, 0.4)):
        """

        Args:
            RRints:
            tmStamps:
            band_type:
            lf_bw:
            hf_bw:
            vlf:
            lf:
            hf:

        Returns:

        """
        NNs = RRints
        tss = tmStamps
        frequency_range = np.linspace(0.001, 1, 10000)
        NNs = np.array(NNs)
        NNs = NNs - np.mean(NNs)
        result = signal.lombscargle(tss, NNs, frequency_range)

        #Pwelch w/ zero pad
        fxx = frequency_range
        pxx = result

        if band_type == 'adapted':

            vlf_peak = fxx[np.where(pxx == np.max(pxx[np.logical_and(
                fxx >= vlf[0], fxx < vlf[1])]))[0][0]]
            lf_peak = fxx[np.where(pxx == np.max(pxx[np.logical_and(
                fxx >= lf[0], fxx < lf[1])]))[0][0]]
            hf_peak = fxx[np.where(pxx == np.max(pxx[np.logical_and(
                fxx >= hf[0], fxx < hf[1])]))[0][0]]

            peak_freqs = (vlf_peak, lf_peak, hf_peak)

            hf = (peak_freqs[2] - hf_bw / 2, peak_freqs[2] + hf_bw / 2)
            lf = (peak_freqs[1] - lf_bw / 2, peak_freqs[1] + lf_bw / 2)
            vlf = (0.003, lf[0])

            if lf[0] < 0:
                print(
                    '***Warning***: Adapted LF band lower bound spills into negative frequency range'
                )
                print('Lower threshold of LF band has been set to zero')
                print('Adjust LF and HF bandwidths accordingly')
                lf = (0, lf[1])
                vlf = (0, 0)
            elif hf[0] < 0:
                print(
                    '***Warning***: Adapted HF band lower bound spills into negative frequency range'
                )
                print('Lower threshold of HF band has been set to zero')
                print('Adjust LF and HF bandwidths accordingly')
                hf = (0, hf[1])
                lf = (0, 0)
                vlf = (0, 0)

        df = fxx[1] - fxx[0]
        vlf_power = np.trapz(pxx[np.logical_and(fxx >= vlf[0], fxx < vlf[1])],
                             dx=df)
        lf_power = np.trapz(pxx[np.logical_and(fxx >= lf[0], fxx < lf[1])],
                            dx=df)
        hf_power = np.trapz(pxx[np.logical_and(fxx >= hf[0], fxx < hf[1])],
                            dx=df)
        totalPower = vlf_power + lf_power + hf_power

        #Normalize and take log
        vlf_NU_log = np.log((vlf_power / (totalPower - vlf_power)) + 1)
        lf_NU_log = np.log((lf_power / (totalPower - vlf_power)) + 1)
        hf_NU_log = np.log((hf_power / (totalPower - vlf_power)) + 1)
        lfhfRatio_log = np.log((lf_power / hf_power) + 1)

        freqDomainFeats = {
            'VLF_Power': vlf_NU_log,
            'LF_Power': lf_NU_log,
            'HF_Power': hf_NU_log,
            'LF/HF': lfhfRatio_log
        }

        return freqDomainFeats

    schema = StructType([
        StructField("timestamp", TimestampType()),
        StructField("start", TimestampType()),
        StructField("end", TimestampType()),
        StructField("localtime", TimestampType()),
        StructField("version", IntegerType()),
        StructField("user", StringType()),
        StructField("features", ArrayType(DoubleType()))
    ])

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    @CC_MProvAgg('org.md2k.autosense.ecg.rr', 'get_hrv_features', stream_name,
                 ['user', 'timestamp'], ['user', 'timestamp'])
    def ecg_r_peak(key, data):
        """

        Args:
            key:
            data:

        Returns:

        """
        if data.shape[0] >= acceptable_percentage * window_length / 100:
            data = data.sort_values('time')
            data['time'] = 1000 * data['time']
            a = data['rr'].values
            features = [
                np.double(
                    np.array(
                        list(get_rr_features(a)) + list(
                            frequencyDomain(
                                np.array(a) / 1000,
                                np.cumsum(a) / 1000).values())))
            ]
            data = data[:1]
            data['features'] = features
            data['start'] = [key[2]['start']]
            data['end'] = [key[2]['end']]
            data = data[[
                'timestamp', 'localtime', 'version', 'user', 'start', 'end',
                'features'
            ]]
            return data
        else:
            return pd.DataFrame([],
                                columns=[
                                    'timestamp', 'localtime', 'version',
                                    'user', 'features', 'start', 'end'
                                ])

    rr_data = rr_data.withColumn('time', F.col('timestamp').cast('double'))
    ecg_features = rr_data.compute(ecg_r_peak,
                                   windowDuration=window_length,
                                   startTime='0 seconds')
    df = ecg_features.select('timestamp',
                             F.struct('start', 'end').alias('window'),
                             'localtime', 'features', 'user', 'version')
    df = df.withColumn('var', F.col('features').getItem(0))
    df = df.withColumn('iqr', F.col('features').getItem(1))
    df = df.withColumn('vlf', F.col('features').getItem(7))
    df = df.withColumn('lf', F.col('features').getItem(8))
    df = df.withColumn('hf', F.col('features').getItem(9))
    df = df.withColumn('lfhf', F.col('features').getItem(10))
    df = df.withColumn('mean', F.col('features').getItem(2))
    df = df.withColumn('median', F.col('features').getItem(3))
    df = df.withColumn('80th', F.col('features').getItem(4))
    df = df.withColumn('20th', F.col('features').getItem(5))
    ecg_features_final = df.withColumn('heartrate',
                                       F.col('features').getItem(6))
    ecg_features_final = ecg_features_final.drop('features')
    ecg_features_final.metadata = get_metadata()

    feature_names = [
        'var', 'iqr', 'mean', 'median', '80th', '20th', 'heartrate', 'vlf',
        'lf', 'hf', 'lfhf'
    ]
    stress_features = ecg_features_final.withColumn(
        'features', F.array([F.col(i) for i in feature_names]))

    return stress_features
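# A possible next step (not part of the snippet above, shown only as a hedged sketch): the
# individual feature columns kept in stress_features can be assembled into a Spark ML vector,
# assuming access to the returned DataFrame and the same feature_names list.
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=feature_names, outputCol="features_vec")
stress_features_vec = assembler.transform(stress_features)
stress_features_vec.select("user", "window", "features_vec").show(5, truncate=False)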
Exemplo n.º 14
0
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover, CountVectorizer, RegexTokenizer, Word2Vec

spark = SparkSession.builder.appName("process_marc").getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)

list = []


fields = [StructField("id", StringType(), True),
          StructField("marc", StringType(), True),
          StructField("titles", ArrayType(StringType()), True),
          StructField("subjects", ArrayType(StringType()), True),
          StructField("notes", ArrayType(StringType()), True)
          ]
schema = StructType(fields)

# 10,000
# marc/data/alma_bibs__2017090800_4547647970003811_new_1.xml
# 7
# marc/data/test_small_alma_bibs__2017102418_4593081760003811_new.xml

def array_merge(titles, subjects, notes):
    new_arr = []
    new_arr.extend(titles)
    new_arr.extend(subjects)
    new_arr.extend(notes)
    return new_arr

exclude = set(string.punctuation)
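# The snippet above ends before any records are processed. As a hedged illustration of how the
# imported transformers and array_merge could be combined (marc_df is a hypothetical DataFrame
# matching `schema`), the merged token array can be turned into TF-IDF features like this:
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType

merge_udf = F.udf(array_merge, ArrayType(StringType()))

merged_df = marc_df.withColumn("all_terms", merge_udf("titles", "subjects", "notes"))

hashing_tf = HashingTF(inputCol="all_terms", outputCol="tf", numFeatures=1 << 18)
tf_df = hashing_tf.transform(merged_df)
tfidf_df = IDF(inputCol="tf", outputCol="tfidf").fit(tf_df).transform(tf_df)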
Exemplo n.º 15
0
Arquivo: df_1.py Projeto: sidaker/dq
# local mode vs cluster mode
# The driver runs in its own Java process (JVM), and each executor runs in its own JVM as well.
# spark-submit --executor-memory 3G --total-executor-cores 2 sampleSpark.py


spark = SparkSession.builder.appName('mytestSparkapp').getOrCreate()

filename =  "/Users/sbommireddy/Documents/python-spark-tutorial-master/in/airports.text"

myManualSchema = StructType([
StructField("AirportId",LongType(),True),
StructField("AirportName",StringType(),True),
StructField("City",StringType(),True),
StructField("Country",StringType(),True),
StructField("AirportCode",StringType(),True),
StructField("ICAOCode",StringType(),True),
StructField("Latitude",FloatType(),False),
StructField("Longitude",FloatType(),False),
StructField("Altitude",FloatType(),False),
StructField("timezoneDST",StringType(),False),
StructField("unknown",StringType(),False),
StructField("timeZoneinOlson",StringType(),False)
])

airports = spark.read.format("csv").schema(myManualSchema).load(filename)
# option("inferSchema", "true").option("header","true")
print(airports.rdd.getNumPartitions())
# Each partition is a separate CSV file when you write a DataFrame to disk.

# make the dataframe a table or view.
airports.createOrReplaceTempView("airports_all")
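# Example query against the temporary view (illustrative only): the ten countries with the
# most airports in the dataset.
top_countries = spark.sql(
    "SELECT Country, COUNT(*) AS airport_count "
    "FROM airports_all GROUP BY Country ORDER BY airport_count DESC")
top_countries.show(10)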
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType

spark = SparkSession.builder.appName("TotalSpentByCustomer").master(
    "local[*]").getOrCreate()

# Create schema when reading customer-orders
customerOrderSchema = StructType([ \
                                  StructField("cust_id", IntegerType(), True),
                                  StructField("item_id", IntegerType(), True),
                                  StructField("amount_spent", FloatType(), True)
                                  ])

# Load up the data into spark dataset
customersDF = spark.read.schema(customerOrderSchema).csv(
    "file:///SparkCourse/customer-orders.csv")

totalByCustomer = customersDF.groupBy("cust_id").agg(func.round(func.sum("amount_spent"), 2) \
                                      .alias("total_spent"))

totalByCustomerSorted = totalByCustomer.sort("total_spent")

totalByCustomerSorted.show(totalByCustomerSorted.count())

spark.stop()
Exemplo n.º 17
0
#destDataDirRoot = "/home/allenk/tmp/"
srcDataDirRoot = "abfs://[email protected]/"
destDataDirRoot = "abfs://[email protected]/"

TripSchemaColList = [
    "year", "month", "day", "city", "vendor", "pickup_time", "pickup_zone",
    "dropoff_time", "dropoff_zone", "trip_time_by_sec", "fare", "tips",
    "total", "tips_rate"
]

TripSchema = StructType([
    StructField("vendor", StringType(), True),
    StructField("pickup_zone", IntegerType(), True),
    StructField("dropoff_zone", IntegerType(), True),
    StructField("trip_time_by_sec", IntegerType(), True),
    StructField("pickup_time", TimestampType(), True),
    StructField("dropoff_time", TimestampType(), True),
    StructField("fare", DoubleType(), True),
    StructField("tips", DoubleType(), True),
    StructField("total", DoubleType(), True),
    StructField("verbose", StringType(), True)
])


# Cleanse the data: drop the verbose column and derive the city, tips_rate, year, month and day columns
def cleansingTripDataframe(sourceDF):
    sourceDF = (sourceDF
        .withColumn("city", lit("shanghai"))
        .withColumn("tips_rate", round(col("tips") / col("total"), 2))
        .withColumn("year", substring(col("pickup_time"), 0, 4))
        .withColumn("month", substring(col("pickup_time"), 6, 2))
        # the snippet is cut off here in the source; the day column is assumed to follow the
        # same substring pattern as year and month above
        .withColumn("day", substring(col("pickup_time"), 9, 2))
        .drop("verbose"))
    return sourceDF
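# A minimal usage sketch (the sub-folders and file format below are assumptions, and an existing
# SparkSession `spark` is presumed): read the raw trips with TripSchema, cleanse them and write
# them out partitioned by year and month.
rawTripDF = spark.read.schema(TripSchema).csv(srcDataDirRoot + "trips/")      # hypothetical sub-folder
cleanTripDF = cleansingTripDataframe(rawTripDF).select(TripSchemaColList)
(cleanTripDF.write
    .mode("overwrite")
    .partitionBy("year", "month")
    .parquet(destDataDirRoot + "trips_curated/"))                             # hypothetical sub-folder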
Exemplo n.º 18
0
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 25 16:55:00 2019

@author: willy
"""

if __name__ == '__main__':
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import explode
    from pyspark.sql.functions import split
    from pyspark.sql.types import StructType, StructField, StringType, LongType
    import findspark


    spark = SparkSession.builder.appName("StructuredNetworkWordCount").getOrCreate()

    userSchema = StructType([
                            StructField("id", LongType(), True),
                            StructField("text", StringType(), True),
                            ])

    df = spark.readStream.schema(userSchema).json('/home/jovyan/git/personal/lab/spark101/twitter/tweets/')
    a =  "/home/jovyan/git/personal/lab/spark101/twitter/twitter/final/final.txt"
    b = "/home/jovyan/git/personal/lab/spark101/twitter/twitter/final/final.txt"
    df.writeStream.format("console").outputMode("append").start().awaitTermination()
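    # The unused explode/split imports above suggest that a word count over the tweet text was
    # intended; a hedged sketch of that variant (replacing the plain pass-through query) is:
    words = df.select(explode(split(df.text, " ")).alias("word"))
    word_counts = words.groupBy("word").count()
    word_counts.writeStream.format("console").outputMode("complete").start().awaitTermination()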
Exemplo n.º 19
0
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, IntegerType, LongType

spark = SparkSession.builder.appName("PopularMovies").getOrCreate()

# Create schema when reading u.data
schema = StructType([ \
                     StructField("userID", IntegerType(), True), \
                     StructField("movieID", IntegerType(), True), \
                     StructField("rating", IntegerType(), True), \
                     StructField("timestamp", LongType(), True)])

# Load up movie data as dataframe
moviesDF = spark.read.option("sep", "\t").schema(schema).csv("ml-100k/u.data")

# Some SQL-style magic to sort all movies by popularity in one line!
topMovieIDs = moviesDF.groupBy("movieID").count().orderBy(desc("count"))

# Grab the top 10
topMovieIDs.show(10)

# Stop the session
spark.stop()
Exemplo n.º 20
0
def main():
    random.seed(2018)

    # spark config
    conf = SparkConf()
    conf.setMaster("local").setAppName("MemoryBasedCF")
    conf.set("spark.network.timeout", "3600s")
    conf.set("spark.executor.heartbeatInterval", "3000s")

    sc = SparkContext(conf=conf)
    #sc.setLogLevel("ERROR")
    sc.setCheckpointDir("checkpoint")
    sqlContext = SQLContext(sc)

    '''
    load train data
    '''
    train_path = 'PA/Restaurants/train/'
    train_user = train_path + 'PA_train_yelp_academic_dataset_user.csv'
    train_review = train_path + 'PA_train_yelp_academic_dataset_review.csv'
    train_business = train_path + 'PA_train_yelp_academic_dataset_business.csv'
    train_tips = train_path + 'PA_train_yelp_academic_dataset_tip.csv'
    train_checkin = train_path + 'PA_train_yelp_academic_dataset_checkin.csv'

    schema_review = StructType([
        StructField("funny", IntegerType()),
        StructField("user_id", StringType()),
        StructField("review_id", StringType()),
        StructField("text", StringType()),
        StructField("business_id", StringType()),
        StructField("stars", IntegerType()),
        StructField("date", StringType()),
        StructField("useful", IntegerType()),
        StructField("cool", IntegerType()),
        StructField("1overN", DoubleType()),
        StructField("2overN", DoubleType()),
        StructField("percentile", DoubleType())
    ])

    rawData_review = sqlContext.read.format("com.databricks.spark.csv") \
        .option("header", "true") \
        .option("inferschema", "true") \
        .option("mode", "DROPMALFORMED") \
        .schema(schema_review) \
        .load(train_review).rdd

    rawData_business = sqlContext.read.format("com.databricks.spark.csv")\
        .option("header", "true")\
        .option("inferschema", "true")\
        .option("mode", "DROPMALFORMED")\
        .load(train_business).rdd

    # Step1: find nn for users using review text
    userReviewVec, lshf, transformer, idMap, stop_words = findNN(rawData_review)

    print "Step1 Completed"

    # Step2: find business attr mappings
    busiAttrMap = findBusiAttrMapping(rawData_business)

    print "Step2 Completed"

    # Step3: get user business map
    userBusiMap = rawData_review.map(lambda x: (x[1], [x[4]])).reduceByKey(lambda x, y: x + y).collectAsMap()

    print "Step3 Completed"

    # Step4: for each user in knn find its business, then compute a weighted vote on their business
    #userReviewVec = userReviewVec.collectAsMap()
    #print(weightedVote(userReviewVec['IjVuk0tawvT0ygazmrBQEg'], userBusiMap, busiAttrMap))
    userReviewVec = userReviewVec.map(lambda x: (x[0], weightedVote(x[1], userBusiMap, busiAttrMap)))
    #print userReviewVec.collectAsMap()['IjVuk0tawvT0ygazmrBQEg']

    print "Step4 Completed"

    # Step5: find true business mapping
    # run train on train for test first
    #true_review = rawData_review.map(lambda x: (x[1], busiAttrMap[str(x[4])]))
    true_review = rawData_review.map(lambda x: (x[1], busiAttrMap[str(x[4])])).collectAsMap()

    print "Step5 Completed"

    # Step6: join prediction and true val
    #result = userReviewVec.collect()
    #result2 = true_review.collect()#.join(true_review)

    result = userReviewVec.filter(lambda x: x[0] in true_review).map(lambda x: (x[0], (x[1], true_review[x[0]])))

    print "Step6 Completed"

    # Step7: Compute error between prediction and true mapping
    MSE = result.map(lambda x: computeMSE(x[1][0], x[1][1])).collect()
    MSE = np.mean(MSE, axis=0)
    RMSE = MSE ** 0.5

    print "Step7 Completed"

    # Step8: Output the results

    print RMSE

    with open('result_train.txt', 'w') as f:
        f.writelines([str(RMSE)])

    result = result.sortByKey()
    if os.path.exists('result/train_result'):
        shutil.rmtree('result/train_result')
    result.saveAsTextFile('result/train_result')

    print "Step8 Completed"

    # Step9: Run validation data

    valid_path = 'PA/Restaurants/valid/'
    valid_review = valid_path + 'PA_valid_yelp_academic_dataset_review.csv'
    valid_business = valid_path + 'PA_valid_yelp_academic_dataset_business.csv'

    runPrediction(sc, sqlContext, valid_review, valid_business, schema_review, userBusiMap, busiAttrMap, lshf, transformer, idMap, stop_words, "valid")

    print "Step9 Completed"

    # Step10: Run Test data

    test_path = 'PA/Restaurants/test/'
    test_review = test_path + 'PA_test_yelp_academic_dataset_review.csv'
    test_business = test_path + 'PA_test_yelp_academic_dataset_business.csv'

    runPrediction(sc, sqlContext, test_review, test_business, schema_review, userBusiMap, busiAttrMap, lshf, transformer, idMap, stop_words, "test")

    print "Step10 Completed"

    return
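# The helpers used above (findNN, findBusiAttrMapping, weightedVote, computeMSE, runPrediction)
# are defined elsewhere in the project. As an illustration only, computeMSE could return the
# per-attribute squared error between the predicted and true business-attribute vectors, which
# Step 7 then averages across users before taking the square root:
def computeMSE(pred_vec, true_vec):
    pred = np.asarray(pred_vec, dtype=float)
    true = np.asarray(true_vec, dtype=float)
    return (pred - true) ** 2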
Exemplo n.º 21
0
from pyspark.sql import SparkSession, Row, functions
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

if __name__ == "__main__":
    spark = SparkSession.builder.appName("PopularMovies").getOrCreate()

    # Define the schemas for the tables
    ratingsSchema = StructType([
        StructField("userId", IntegerType(), True),
        StructField("movieId", IntegerType(), True),
        StructField("rating", IntegerType(), True)
    ])

    itemSchema = StructType([
        StructField("movieId", IntegerType(), True),
        StructField("movieTitle", StringType(), True),
        StructField("releaseDate", StringType(), True),
        StructField("videoReleaseDate", StringType(), True),
        StructField("imdb", StringType(), True),
        StructField("unknown", IntegerType(), True),
        StructField("Action", IntegerType(), True),
        StructField("Adventure", IntegerType(), True),
        StructField("Animation", IntegerType(), True),
        StructField("Childrens", IntegerType(), True),
        StructField("Comedy", IntegerType(), True),
        StructField("Crime", IntegerType(), True),
        StructField("Documentary", IntegerType(), True),
        StructField("Drama", IntegerType(), True),
        StructField("Fantasy", IntegerType(), True),
        StructField("FilmNoir", IntegerType(), True),
        StructField("Horror", IntegerType(), True),
Exemplo n.º 22
0
pdline = pandas.read_csv("/Users/xiejinxin/datafloder/test/copy.csv")

spkdf = sqlc.createDataFrame(pdline)

print(pdline)

print(spkdf)
spkdf.createOrReplaceTempView("tmp1")
data = sqlc.sql(
    "select row_number() over(order by relation_year) as id, cust_level,relation_year,sex,age,cust_status,is_xyk_kk,eva_bal_rmb,raroc_bal_rmb,cnt,transamt,liucun_bal,aum_bal,s_aum_bal,h_aum_bal,d_aum_bal,loan_bal,if(finance_bal > 0,1,0) as label from tmp1"
)

data.show()
tran = data.rdd.map(lambda x: LabeledPoint(
    list(x)[17], SparseVector(16, [i for i in range(16)],
                              list(x)[1:17])))
#cust_level,relation_year,sex,age,eva_bal_rmb,cnt,transamt,liucun_bal
#cust_level,relation_year,sex,age,cust_status,is_xyk_kk,degree,eva_bal_rmb,raroc_bal_rmb,cnt,transamt,liucun_bal,aum_bal,s_aum_bal,h_aum_bal,d_aum_bal,loan_bal,finance_bal,finance_bal_bb,finance_bal_fbb,invest_bal,ldjj_bal,gz_aum_bal,b_aum_bal,gold_bal,trust_bal,insurance_bal,third_bal,loan_house_bal,loan_car_bal,loan_mana_bal,loan_stuty_bal,loan_other_bal,ola_aum_bal,b_z_cd_aum_bal,loan_z_cd,zhc_aum_bal,jer_bal,dly_bal,hxlc_bal,jeqj_bal,jegd_bal,jewy_bal,dzzh_bal,decd_bal,xfc_aum_bal,jj_tot_vol,card_xy_bal_last_m_avg,card_xy_bal_last_m_avg_y,card_swing_bal_avg,card_swing_bal_avg_y,card_swing_num_avg,card_swing_num_avg_y,corpname,tran_amt_1m,tran_num_1m,tran_amt_3m,tran_num_3m,tran_amt_6m,tran_num_6m,day_cnt,tran_wy_amt_1m,tran_wy_num_1m,tran_wy_amt_3m,tran_wy_num_3m,tran_wy_amt_6m,tran_wy_num_6m,day_wy_cnt,tran_dz_amt_1m,tran_dz_num_1m,tran_dz_amt_3m,tran_dz_num_3m,tran_dz_amt_6m,tran_dz_num_6m,day_dz_cnt,tran_atm_amt_1m,tran_atm_num_1m,tran_atm_amt_3m,tran_atm_num_3m,tran_atm_amt_6m,tran_atm_num_6m,day_atm_cnt,tran_gt_amt_1m,tran_gt_num_1m,tran_gt_amt_3m,tran_gt_num_3m,tran_gt_amt_6m,tran_gt_num_6m,day_gt_cnt,tran_pos_amt_1m,tran_pos_num_1m,tran_pos_amt_3m,tran_pos_num_3m,tran_pos_amt_6m,tran_pos_num_6m,day_pos_cnt,tran_sj_amt_1m,tran_sj_num_1m,tran_sj_amt_3m,tran_sj_num_3m,tran_sj_amt_6m,tran_sj_num_6m,day_sj_cnt,tran_dh_amt_1m,tran_dh_num_1m,tran_dh_amt_3m,tran_dh_num_3m,tran_dh_amt_6m,tran_dh_num_6m,day_dh_cnt,is_despoit,is_fixed,is_finance,is_fund,is_gz_aum,is_insurance,is_gold,is_third,is_trust,is_loan,is_cbank,is_xyk,is_finance_bb,is_finance_fbb,is_ldjj,is_loan_house,is_loan_car,is_loan_mana,is_loan_stuty,is_loan_other,is_ola_aum,is_zhc_aum,is_jer,is_dly,is_hxlc,is_jeqj,is_jewy,is_decd,is_xfc_aum,'

model = GradientBoostedTrees.trainRegressor(tran, {}, numIterations=10)

model.save(sc, "./gbdymodelonlionev1")

a = StructType([
    StructField("ID", StringType(), False),
    StructField("cust_type", StringType(), True),
    StructField("cust_level", IntegerType(), True)
])

print(a)
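# Illustrative follow-up (not in the original snippet): reload the saved model and score the
# training RDD again. GradientBoostedTreesModel.load and model.predict are standard MLlib calls.
from pyspark.mllib.tree import GradientBoostedTreesModel

loaded_model = GradientBoostedTreesModel.load(sc, "./gbdymodelonlionev1")
predictions = loaded_model.predict(tran.map(lambda lp: lp.features))
labels_and_predictions = tran.map(lambda lp: lp.label).zip(predictions)
print(labels_and_predictions.take(5))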
    spark = SparkSession\
        .builder\
        .appName("StructuredNetworkWordCount")\
        .getOrCreate()

    # Create DataFrame representing the stream of input lines from connection to host:port
    """lines = spark\
        .readStream\
        .format('socket')\
        .option('host', host)\
        .option('port', port)\
        .load()"""

#"ID","language","Date","source","len","likes","RTs","Hashtags","Usernames","Userid","name","Place","followers","friends"
    user_schema = (StructType()
        .add("ID", "string").add("language", "string").add("Date", "string")
        .add("source", "string").add("len", "string").add("likes", "integer")
        .add("RTs", "integer").add("Hashtags", "string").add("Usernames", "string")
        .add("Userid", "string").add("name", "string").add("Place", "string")
        .add("followers", "integer").add("friends", "integer"))
    lines = spark\
        .readStream\
        .option("sep", ";")\
        .schema(user_schema)\
        .csv("hdfs://localhost:9000/stream")

    # Select the user name together with the friends-to-followers ratio of each tweet
    words = lines.select(
        lines.name, lines.friends/lines.followers
    )

    # Keep the maximum friends/followers ratio per user name and take the top 5
    wordCounts = words.groupBy(lines.name).agg({"(friends / followers)": "max" }).limit(5)
    wordCounts = wordCounts.select(col("name"), col("max((friends / followers))").alias("FRRatio"))
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType

spark = SparkSession \
    .builder \
    .appName("Bigdata") \
    .getOrCreate()

userSchema = (StructType()
    .add("ID", "string").add("Lang", "string").add("Date", "string")
    .add("Source", "string").add("len", "integer").add("Likes", "integer")
    .add("RTs", "integer").add("Hashtags", "string").add("UserMentionNames", "string")
    .add("UserMentionID", "string").add("Name", "string").add("Place", "string")
    .add("Followers", "integer").add("Friends", "integer"))
'''csvDF = spark \
    .readStream \
    .option("sep", ";") \
    .schema(userSchema) \
    .csv("/input")
'''

dfCSV = spark.readStream.option("sep", ";").option("header", "false").schema(userSchema).csv("/stream1")
dfCSV.createOrReplaceTempView("Bigdata")
summ = spark.sql("select Name,(Followers/Friends) as FRratio from Bigdata order by Frratio desc limit 5")
# start() returns a StreamingQuery; awaitTermination must be called on that query, not on the writer
query = summ.writeStream.outputMode("complete").format("console").start()
query.awaitTermination(100)
spark.stop()
Exemplo n.º 25
0
# Databricks notebook source
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType

# COMMAND ----------

from pyspark.sql.types import *

# COMMAND ----------

userSchema = StructType([
    StructField("desc", StringType(), True),
    StructField("zip", IntegerType(), True),
    StructField("title", StringType(), True),
    StructField("timeStamp", StringType(), True),
    StructField("twp", StringType(), True),
    StructField("addr", StringType(), True)
])

# COMMAND ----------

fileStreamDf = spark.readStream.option(
    "header", "true").schema(userSchema).csv("/FileStore/tables/")

# COMMAND ----------

fileStreamDf.createOrReplaceTempView("salary")

# COMMAND ----------

totalSalary = spark.sql("select count(title),title from salary group by title")
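# COMMAND ----------

# To actually run the streaming aggregation (illustrative; in a Databricks notebook you could
# also simply call display(totalSalary)):
query = totalSalary.writeStream.outputMode("complete").format("console").start()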
Exemplo n.º 26
0
#Read All JSON files from a directory
df3 = spark.read.json("resources/*.json")
df3.show()

# Define custom schema
schema = StructType([
    StructField("RecordNumber", IntegerType(), True),
    StructField("Zipcode", IntegerType(), True),
    StructField("ZipCodeType", StringType(), True),
    StructField("City", StringType(), True),
    StructField("State", StringType(), True),
    StructField("LocationType", StringType(), True),
    StructField("Lat", DoubleType(), True),
    StructField("Long", DoubleType(), True),
    StructField("Xaxis", IntegerType(), True),
    StructField("Yaxis", DoubleType(), True),
    StructField("Zaxis", DoubleType(), True),
    StructField("WorldRegion", StringType(), True),
    StructField("Country", StringType(), True),
    StructField("LocationText", StringType(), True),
    StructField("Location", StringType(), True),
    StructField("Decommisioned", BooleanType(), True),
    StructField("TaxReturnsFiled", StringType(), True),
    StructField("EstimatedPopulation", IntegerType(), True),
    StructField("TotalWages", IntegerType(), True),
    StructField("Notes", StringType(), True)
])

df_with_schema = spark.read.schema(schema) \
        .json("resources/zipcodes.json")
df_with_schema.printSchema()
df_with_schema.show()
Exemplo n.º 27
0
    join staging_songs as s
    on e.song like s.title
    where e.page like 'NextSong' and e.song is not null and s.artist_id is not null
    and e.artist like s.artist_name
    """)

insert_table_queries = [
    user_table_insert, song_table_insert, artist_table_insert,
    time_table_insert, songplay_table_insert
]


usr_schema = StructType([ \
    StructField("userid", IntegerType(), False), \
    StructField("firstname", StringType(), True), \
    StructField("lastname", StringType(), True), \
    StructField("gender", StringType(), True), \
    StructField("level", StringType(), True), \
    ])


song_schema = StructType([ \
    StructField("song_id", StringType(), False), \
    StructField("title", StringType(), True), \
    StructField("artist_id", StringType(), True), \
    StructField("year", IntegerType(), True), \
    StructField("duration", FloatType(), True), \
    ])


artist_schema = StructType([ \
schema = StructType([
    StructField('geo',StringType(), True),
    StructField('yr1980', DoubleType(), False),
    StructField('yr1981', DoubleType(), False),
    StructField('yr1982', DoubleType(), False),
    StructField('yr1983', DoubleType(), False),
    StructField('yr1984', DoubleType(), False),
    StructField('yr1985', DoubleType(), False),
    StructField('yr1986', DoubleType(), False),
    StructField('yr1987', DoubleType(), False),
    StructField('yr1988', DoubleType(), False),
    StructField('yr1989', DoubleType(), False),
    StructField('yr1990', DoubleType(), False),
    StructField('yr1991', DoubleType(), False),
    StructField('yr1992', DoubleType(), False),
    StructField('yr1993', DoubleType(), False),
    StructField('yr1994', DoubleType(), False),
    StructField('yr1995', DoubleType(), False),
    StructField('yr1996', DoubleType(), False),
    StructField('yr1997', DoubleType(), False),
    StructField('yr1998', DoubleType(), False),
    StructField('yr1999', DoubleType(), False),
    StructField('yr2000', DoubleType(), False),
    StructField('yr2001', DoubleType(), False),
    StructField('yr2002', DoubleType(), False),
    StructField('yr2003', DoubleType(), False),
    StructField('yr2004', DoubleType(), False),
    StructField('yr2005', DoubleType(), False),
    StructField('yr2006', DoubleType(), False),
    StructField('yr2007', DoubleType(), False),
    StructField('yr2008', DoubleType(), False),
    StructField('yr2009', DoubleType(), False),
    StructField('yr2010', DoubleType(), False)
])
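# Illustrative use of the schema above (the file name, header option and an existing SparkSession
# `spark` are assumptions): read the wide table and unpivot the yr1980..yr2010 columns into rows.
pop_df = spark.read.schema(schema).option("header", "true").csv("population_by_year.csv")

year_cols = [f for f in schema.fieldNames() if f.startswith("yr")]
stack_expr = "stack({n}, {pairs}) as (year, value)".format(
    n=len(year_cols),
    pairs=", ".join("'{0}', {0}".format(c) for c in year_cols))
long_df = pop_df.selectExpr("geo", stack_expr)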
Exemplo n.º 29
0
# All the queries are based largely on queries found in the book:
#       Bill Chambers & Matei Zaharia ,
#       Spark: The Definitive Guide: Big Data Processing Made Simple,
#       O'Reilly, 2018

# ------------------------------------------------------------------------------
#                    Apache Spark 2.x DataFrames and Python Pyspark
# ------------------------------------------------------------------------------
# ------------------We first create product and customer dataframes-------------
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, FloatType

# ------------------------------------------------------------------------------
# Create products dataframe
manual_schema_products = StructType([
    StructField('product_id', IntegerType(), False),
    StructField('product_name', StringType(), True),
    StructField('product_specs', StringType(), True),
    StructField('price', FloatType(), True)
])

array_temp1 = [[1, 'Apple iPhone 6', '16GB memory', 200.0],
               [2, 'Samsung Galaxy 5', '16 megapixel camera', 100.0]]

in_memory_products_dataframe = spark.createDataFrame(array_temp1,
                                                     manual_schema_products)

in_memory_products_dataframe.show(2)
in_memory_products_dataframe.cache()

# -----------------------------------------------------------------------------
# Create customers dataframe
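# The customers dataframe is cut off in this excerpt; a sketch mirroring the products dataframe
# above (the field names and sample rows below are hypothetical):
manual_schema_customers = StructType([
    StructField('customer_id', IntegerType(), False),
    StructField('customer_name', StringType(), True)
])

array_temp2 = [[1, 'Alice'], [2, 'Bob']]

in_memory_customers_dataframe = spark.createDataFrame(array_temp2,
                                                      manual_schema_customers)
in_memory_customers_dataframe.show(2)
in_memory_customers_dataframe.cache()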
Exemplo n.º 30
0
def main():
    spark = SparkSession.builder.appName("DBPediaSpark").getOrCreate()

    args = getResolvedOptions(sys.argv, [
        'S3_INPUT_BUCKET', 'S3_INPUT_KEY_PREFIX', 'S3_OUTPUT_BUCKET',
        'S3_OUTPUT_KEY_PREFIX', 'S3_MODEL_BUCKET', 'S3_MODEL_KEY_PREFIX'
    ])

    # This is needed for saving RDDs, which is the only way here to write nested DataFrames out in CSV format
    spark.sparkContext._jsc.hadoopConfiguration().set(
        "mapred.output.committer.class",
        "org.apache.hadoop.mapred.FileOutputCommitter")

    # Define the schema for the input data; the input files do not contain a header row.
    schema = StructType([
        StructField("label", IntegerType(), True),
        StructField("title", StringType(), True),
        StructField("abstract", StringType(), True)
    ])

    # Download the data from S3 into two separate Dataframes
    traindf = spark.read.csv(('s3://' + os.path.join(
        args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'], 'train.csv')),
                             header=False,
                             schema=schema,
                             encoding='UTF-8')
    validationdf = spark.read.csv(('s3://' + os.path.join(
        args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'], 'test.csv')),
                                  header=False,
                                  schema=schema,
                                  encoding='UTF-8')

    # Tokenize the abstract column which contains the input text
    tokenizer = Tokenizer(inputCol="abstract", outputCol="tokenized_abstract")

    # Save transformed training data to CSV in S3 by converting to RDD.
    transformed_traindf = tokenizer.transform(traindf)
    transformed_train_rdd = transformed_traindf.rdd.map(
        lambda x: (x.label, x.tokenized_abstract))
    lines = transformed_train_rdd.map(csv_line)
    lines.coalesce(1).saveAsTextFile('s3://' + os.path.join(
        args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'], 'train'))

    # Similar data processing for validation dataset.
    transformed_validation = tokenizer.transform(validationdf)
    transformed_validation_rdd = transformed_validation.rdd.map(
        lambda x: (x.label, x.tokenized_abstract))
    lines = transformed_validation_rdd.map(csv_line)
    lines.coalesce(1).saveAsTextFile('s3://' + os.path.join(
        args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'], 'validation'))

    # Serialize the tokenizer via MLeap and upload to S3
    SimpleSparkSerializer().serializeToBundle(tokenizer,
                                              "jar:file:/tmp/model.zip",
                                              transformed_validation)

    # Unzip as SageMaker expects a .tar.gz file but MLeap produces a .zip file.
    import zipfile
    with zipfile.ZipFile("/tmp/model.zip") as zf:
        zf.extractall("/tmp/model")

    # Write back the content as a .tar.gz file
    import tarfile
    with tarfile.open("/tmp/model.tar.gz", "w:gz") as tar:
        tar.add("/tmp/model/bundle.json", arcname='bundle.json')
        tar.add("/tmp/model/root", arcname='root')

    s3 = boto3.resource('s3')
    file_name = os.path.join(args['S3_MODEL_KEY_PREFIX'], 'model.tar.gz')
    s3.Bucket(args['S3_MODEL_BUCKET']).upload_file('/tmp/model.tar.gz',
                                                   file_name)
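# csv_line (used above when saving the RDDs) is defined elsewhere in the script; an illustrative
# version that writes the label followed by the tokenized abstract as one comma-separated line
# (the exact output format is an assumption) could be:
def csv_line(record):
    label, tokens = record
    return str(label) + "," + ",".join(tokens)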