Example #1
    def get_nunique(self, df, columns=[]):
        """return dict with number of unique entries for given columns

        :param df: input (spark) data frame
        :param columns: columns to select (optional)
        """
        if not columns:
            columns = df.columns
        qdf = df.agg(*(approxCountDistinct(sparkcol(c)).alias(c) for c in columns))
        return qdf.toPandas().T[0].to_dict()
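The method above is taken from a larger class, so only the snippet is shown. A minimal standalone sketch of the same idea, with illustrative data and the imports the snippet appears to assume (approxCountDistinct, and col aliased as sparkcol), could look like this:

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import approxCountDistinct, col as sparkcol

    spark = SparkSession.builder.appName("nunique-sketch").getOrCreate()

    # Tiny illustrative DataFrame (not from the original snippet).
    df = spark.createDataFrame(
        [(1, "a"), (2, "a"), (2, "b"), (3, "b")], ["id", "grp"])

    def get_nunique(df, columns=None):
        """Return a dict mapping each column to its approximate distinct count."""
        if not columns:
            columns = df.columns
        qdf = df.agg(*(approxCountDistinct(sparkcol(c)).alias(c) for c in columns))
        return qdf.toPandas().T[0].to_dict()

    print(get_nunique(df))  # e.g. {'id': 3, 'grp': 2}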
Example #2
    def test_aggregator(self):
        df = self.df
        g = df.groupBy()
        self.assertEqual([99, 100], sorted(g.agg({'key': 'max', 'value': 'count'}).collect()[0]))
        self.assertEqual([Row(**{"AVG(key#0)": 49.5})], g.mean().collect())

        from pyspark.sql import functions
        self.assertEqual((0, u'99'),
                         tuple(g.agg(functions.first(df.key), functions.last(df.value)).first()))
        self.assertTrue(95 < g.agg(functions.approxCountDistinct(df.key)).first()[0])
        self.assertEqual(100, g.agg(functions.countDistinct(df.value)).first()[0])
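Note that approxCountDistinct has been deprecated since Spark 2.1 in favour of approx_count_distinct; the optional rsd argument bounds the relative standard deviation of the estimate (default 0.05). A small self-contained sketch of the newer spelling, with made-up data:

    from pyspark.sql import SparkSession
    from pyspark.sql import functions as F

    spark = SparkSession.builder.appName("acd-sketch").getOrCreate()

    # 100 rows but only 10 distinct keys (illustrative data).
    df = spark.createDataFrame([(i % 10,) for i in range(100)], ["key"])

    # rsd=0.01 asks for a tighter estimate than the 0.05 default.
    n = df.agg(F.approx_count_distinct("key", rsd=0.01).alias("n")).first()["n"]
    print(n)  # 10 here; for large cardinalities the result is approximate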
Example #3
    spark = SparkSession\
        .builder\
        .appName("StructuredKafkaWordCount")\
        .getOrCreate()

    lines = spark\
        .readStream\
        .format("kafka")\
        .option("kafka.bootstrap.servers", bootstrapServers)\
        .option(subscribeType, topics)\
        .load()\
        .selectExpr("CAST(value AS STRING)")

    probes = lines.select(
        split(lines.value, ',')[0].alias('timestamp'),
        split(lines.value, ',')[1].alias('mac'),
        split(lines.value, ',')[2].alias('SSID'),
        split(lines.value, ',')[3].alias('fornecedor'),
        split(lines.value, ',')[4].alias('macId'))

    pnl = probes.filter('SSID != "BROADCAST"').select(
        approxCountDistinct('mac', rsd=0.01).alias('count'))

    query = pnl\
        .writeStream\
        .outputMode("complete")\
        .foreach(processRow)\
        .start()

    query.awaitTermination()
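The streaming examples here and below hand a processRow callback to .foreach(...), but its definition is not part of the snippets. A minimal hypothetical handler compatible with the foreach row sink (purely an assumption about what the original might do) could be:

    def processRow(row):
        # Hypothetical sink: called once per result row of the streaming query.
        # The real processRow is not shown in these examples.
        print(row.asDict())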
Example #4
    topics = sys.argv[3]

    spark = SparkSession\
        .builder\
        .appName("StructuredKafkaWordCount")\
        .getOrCreate()

    lines = spark\
        .readStream\
        .format("kafka")\
        .option("kafka.bootstrap.servers", bootstrapServers)\
        .option(subscribeType, topics)\
        .load()\
        .selectExpr("CAST(value AS STRING)")

    probes = lines.select(
        split(lines.value, ',')[0].alias('timestamp'),
        split(lines.value, ',')[1].alias('mac'),
        split(lines.value, ',')[2].alias('SSID'),
        split(lines.value, ',')[3].alias('fornecedor'))

    probesDir = probes.filter('SSID != "BROADCAST"').select(
        approxCountDistinct('timestamp', rsd=0.01).alias('count'))

    query = probesDir\
        .writeStream\
        .outputMode("complete")\
        .foreach(processRow)\
        .start()

    query.awaitTermination()
Example #5
    spark = SparkSession\
        .builder\
        .appName("StructuredKafkaWordCount")\
        .getOrCreate()

    lines = spark\
        .readStream\
        .format("kafka")\
        .option("kafka.bootstrap.servers", bootstrapServers)\
        .option(subscribeType, topics)\
        .load()\
        .selectExpr("CAST(value AS STRING)")
    
    probes = lines.select(
        split(lines.value, ',')[0].alias('timestamp'),
        split(lines.value, ',')[1].alias('mac'),
        split(lines.value, ',')[2].alias('SSID'),
        split(lines.value, ',')[3].alias('fornecedor'),
        split(lines.value, ',')[4].alias('macId')
    )

    ssid = probes.filter('SSID != "BROADCAST"').select(
        approxCountDistinct('SSID', rsd=0.01).alias('count')
    )
    
    query = ssid\
        .writeStream\
        .outputMode("complete")\
        .foreach(processRow)\
        .start()

    query.awaitTermination()
Example #6
    def number_distinct_values(col):
        return F.approxCountDistinct(col)
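The helper above simply wraps F.approxCountDistinct and returns a Column expression. A short usage sketch (the SparkSession, DataFrame, and column name are illustrative assumptions, not part of the original snippet):

    from pyspark.sql import SparkSession
    from pyspark.sql import functions as F

    spark = SparkSession.builder.appName("distinct-helper-sketch").getOrCreate()
    df = spark.createDataFrame([("a",), ("a",), ("b",)], ["letter"])

    def number_distinct_values(col):
        return F.approxCountDistinct(col)

    # The helper returns a Column, so it composes directly with .agg().
    df.agg(number_distinct_values("letter").alias("n_letters")).show()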
Example #7
    # Create DataSet representing the stream of input lines from kafka
    lines = spark\
        .readStream\
        .format("kafka")\
        .option("kafka.bootstrap.servers", bootstrapServers)\
        .option(subscribeType, topics)\
        .load()\
        .selectExpr("CAST(value AS STRING)")

    # Create the data table
    dados = lines.select(
        split(lines.value, ', ')[0].alias("Source"),
        split(lines.value, ', ')[1].alias("Time"),
        split(lines.value, ', ')[2].alias("ssid"),
        split(lines.value, ', ')[3].alias("marca")
    )  
    
    # remove broadcast-type probes
    directProbes = dados.filter('ssid != "Wildcard (Broadcast)"')

    # count the number of distinct SSIDs
    
    qtdSsidDif = directProbes.agg(approxCountDistinct('ssid'))
    query = qtdSsidDif\
        .writeStream\
        .outputMode('complete')\
        .foreach(processRow)\
        .start()

    query.awaitTermination()
Example #8
    # Create DataSet representing the stream of input lines from kafka
    lines = spark\
        .readStream\
        .format("kafka")\
        .option("kafka.bootstrap.servers", bootstrapServers)\
        .option(subscribeType, topics)\
        .load()\
        .selectExpr("CAST(value AS STRING)")

    # Create the data table
    dados = lines.select(
        split(lines.value, ', ')[0].alias("Source"),
        split(lines.value, ', ')[1].alias("Time"),
        split(lines.value, ', ')[2].alias("ssid"),
        split(lines.value, ', ')[3].alias("marca"))

    # remove broadcast-type probes
    directProbes = dados.filter('ssid != "Wildcard (Broadcast)"')

    # total number of directed probes

    totalDirect = directProbes.agg(approxCountDistinct('Time'))

    query = totalDirect\
        .writeStream\
        .outputMode('complete')\
        .format('console')\
        .start()

    query.awaitTermination()
Example #9
    lines = spark\
        .readStream\
        .format("kafka")\
        .option("kafka.bootstrap.servers", bootstrapServers)\
        .option(subscribeType, topics)\
        .load()\
        .selectExpr("CAST(value AS STRING)")

    probes = lines.select(
        split(lines.value, ',')[0].alias('timestamp'),
        split(lines.value, ',')[1].alias('mac'),
        split(lines.value, ',')[2].alias('SSID'),
        split(lines.value, ',')[3].alias('fornecedores'),
        split(lines.value, ',')[4].alias('macId'))

    dispositivos = probes.select(approxCountDistinct('mac', rsd=0.01))
    pnl = probes.select('mac', 'SSID',
                        'macId').filter('SSID != "BROADCAST"').distinct()

    pnl1 = pnl.select(approxCountDistinct('macId', rsd=0.01))

    probesTot = probes.select(approxCountDistinct('timestamp', rsd=0.01))

    probesBroad = probes.filter('SSID == "BROADCAST"').select(
        approxCountDistinct('timestamp', rsd=0.01))

    probesDir = probes.filter('SSID != "BROADCAST"').select(
        approxCountDistinct('timestamp', rsd=0.01))

    # macs = probes.groupBy('mac').count()
Example #10
        .format("kafka")\
        .option("kafka.bootstrap.servers", bootstrapServers)\
        .option(subscribeType, topics)\
        .load()\
        .selectExpr("CAST(value AS STRING)")

    # Create the data table
    dados = lines.select(
        split(lines.value, ', ')[0].alias("Source"),
        split(lines.value, ', ')[1].alias("Time"),
        split(lines.value, ', ')[2].alias("ssid"),
        split(lines.value, ', ')[3].alias("marca"))

    # count the number of distinct devices

    qtdDeviceDif = dados.agg(approxCountDistinct('Source'))
    query = qtdDeviceDif\
        .writeStream\
        .outputMode('complete')\
        .format('console')\
        .start()

    query.awaitTermination()

Example #11
    # count the number of distinct devices

    qtdDeviceDif = dados.agg(approxCountDistinct('Source'))
    query = qtdDeviceDif\
        .writeStream\
        .outputMode('complete')\
        .foreach(processRow)\
        .start()

    query.awaitTermination()

    lines = spark\
        .readStream\
        .format("kafka")\
        .option("kafka.bootstrap.servers", bootstrapServers)\
        .option(subscribeType, topics)\
        .load()\
        .selectExpr("CAST(value AS STRING)")
    
    probes = lines.select(
        split(lines.value, ',')[0].alias('timestamp'),
        split(lines.value, ',')[1].alias('mac'),
        split(lines.value, ',')[2].alias('SSID'),
        split(lines.value, ',')[3].alias('fornecedor'),
        split(lines.value, ',')[4].alias('macId')
    )

    dispositivos = probes.select(
        approxCountDistinct('mac', rsd=0.01)
    )

    pnl = probes.filter('SSID != "BROADCAST"').select(
        'mac'
    ).distinct()

    pnl1 = pnl.select(
        approxCountDistinct('mac', rsd=0.01)
    )

    probesTot = probes.select(
        approxCountDistinct('timestamp', rsd=0.01)
    )

    probesBroad = probes.filter('SSID == "BROADCAST"').select(
        approxCountDistinct('timestamp', rsd=0.01))
Example #12
def get_builtin_aggregator_column(agg, ctx):
    try:
        aggregator = ctx.aggregators[agg["aggregator"]]

        try:
            input = ctx.populate_values(agg["input"],
                                        aggregator["input"],
                                        preserve_column_refs=False)
        except CortexException as e:
            e.wrap("input")
            raise

        if aggregator["name"] == "approx_count_distinct":
            return F.approxCountDistinct(input["col"],
                                         input.get("rsd")).alias(agg["name"])
        if aggregator["name"] == "avg":
            return F.avg(input).alias(agg["name"])
        if aggregator["name"] in {
                "collect_set_int", "collect_set_float", "collect_set_string"
        }:
            return F.collect_set(input).alias(agg["name"])
        if aggregator["name"] == "count":
            return F.count(input).alias(agg["name"])
        if aggregator["name"] == "count_distinct":
            return F.countDistinct(*input).alias(agg["name"])
        if aggregator["name"] == "covar_pop":
            return F.covar_pop(input["col1"], input["col2"]).alias(agg["name"])
        if aggregator["name"] == "covar_samp":
            return F.covar_samp(input["col1"],
                                input["col2"]).alias(agg["name"])
        if aggregator["name"] == "kurtosis":
            return F.kurtosis(input).alias(agg["name"])
        if aggregator["name"] in {"max_int", "max_float", "max_string"}:
            return F.max(input).alias(agg["name"])
        if aggregator["name"] == "mean":
            return F.mean(input).alias(agg["name"])
        if aggregator["name"] in {"min_int", "min_float", "min_string"}:
            return F.min(input).alias(agg["name"])
        if aggregator["name"] == "skewness":
            return F.skewness(input).alias(agg["name"])
        if aggregator["name"] == "stddev":
            return F.stddev(input).alias(agg["name"])
        if aggregator["name"] == "stddev_pop":
            return F.stddev_pop(input).alias(agg["name"])
        if aggregator["name"] == "stddev_samp":
            return F.stddev_samp(input).alias(agg["name"])
        if aggregator["name"] in {"sum_int", "sum_float"}:
            return F.sum(input).alias(agg["name"])
        if aggregator["name"] in {"sum_distinct_int", "sum_distinct_float"}:
            return F.sumDistinct(input).alias(agg["name"])
        if aggregator["name"] == "var_pop":
            return F.var_pop(input).alias(agg["name"])
        if aggregator["name"] == "var_samp":
            return F.var_samp(input).alias(agg["name"])
        if aggregator["name"] == "variance":
            return F.variance(input).alias(agg["name"])

        raise ValueError("missing builtin aggregator")  # unexpected

    except CortexException as e:
        e.wrap("aggregate " + agg["name"])
        raise
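All of the branches above wrap standard pyspark.sql.functions aggregates and alias each result with the aggregate's name. A small self-contained sketch exercising a few of the same functions directly (the data is illustrative, not part of the original code):

    from pyspark.sql import SparkSession
    from pyspark.sql import functions as F

    spark = SparkSession.builder.appName("builtin-agg-sketch").getOrCreate()
    df = spark.createDataFrame([(1, 2.0), (2, 4.0), (2, 6.0)], ["k", "v"])

    df.agg(
        F.approxCountDistinct("k").alias("approx_count_distinct"),
        F.countDistinct("k").alias("count_distinct"),
        F.avg("v").alias("avg"),
        F.stddev("v").alias("stddev"),
        F.collect_set("k").alias("collect_set_int"),
    ).show()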
Example #13
        .appName("consumidor")\
        .getOrCreate()

    # Create DataSet representing the stream of input lines from kafka
    lines = spark\
        .readStream\
        .format("kafka")\
        .option("kafka.bootstrap.servers", bootstrapServers)\
        .option(subscribeType, topics)\
        .load()\
        .selectExpr("CAST(value AS STRING)")

    # Create the data table
    dados = lines.select(
        split(lines.value, ', ')[0].alias("Source"),
        split(lines.value, ', ')[1].alias("Time"),
        split(lines.value, ', ')[2].alias("ssid"),
        split(lines.value, ', ')[3].alias("marca"))

    # total number of directed probes

    totalProbes = dados.agg(approxCountDistinct('Time'))

    query = totalProbes\
        .writeStream\
        .outputMode('complete')\
        .format('console')\
        .start()

    query.awaitTermination()
Example #14
    def number_distinct_values(col):  # function to calculate distinct values
        return F.approxCountDistinct(col)
Example #15
    spark = SparkSession\
        .builder\
        .appName("consumidor")\
        .getOrCreate()

    # Create DataSet representing the stream of input lines from kafka
    lines = spark\
        .readStream\
        .format("kafka")\
        .option("kafka.bootstrap.servers", bootstrapServers)\
        .option(subscribeType, topics)\
        .load()\
        .selectExpr("CAST(value AS STRING)")

    # Create the data table
    dados = lines.select(
        split(lines.value, ', ')[0].alias("Source"),
        split(lines.value, ', ')[1].alias("Time"),
        split(lines.value, ', ')[2].alias("ssid"),
        split(lines.value, ', ')[3].alias("marca"))

    # count the number of devices with a PNL >= 1

    qtdDeviceComPNL = dados.agg(approxCountDistinct('Source'))
    query = qtdDeviceComPNL\
        .writeStream\
        .outputMode('complete')\
        .foreach(processRow)\
        .start()

    query.awaitTermination()
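All of these streaming examples run an unwindowed aggregation, which is why they use outputMode('complete') rather than the default append mode (append is not allowed for aggregations without a watermark). A self-contained sketch of the same pattern, using the built-in rate source in place of Kafka so it runs without a broker (the source swap is an assumption for illustration only):

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import approx_count_distinct

    spark = SparkSession.builder.appName("streaming-distinct-sketch").getOrCreate()

    # The rate source emits (timestamp, value) rows; it stands in for Kafka here.
    stream = spark.readStream.format("rate").option("rowsPerSecond", 5).load()

    distinct_values = stream.agg(approx_count_distinct("value").alias("count"))

    query = distinct_values\
        .writeStream\
        .outputMode("complete")\
        .format("console")\
        .start()

    query.awaitTermination()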