Example #1
def register_udf(spark):
    udf = spark.udf
    udf.register('get_year_month', get_year_month, returnType=StringType())
    udf.register('filter_shixi', filter_shixi, returnType=BooleanType())
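
# Once registered, the UDFs can be called from Spark SQL by name. A minimal usage
# sketch (the table name `events` and the column names are assumptions for
# illustration; get_year_month and filter_shixi are defined elsewhere in the
# project):
def example_query(spark):
    register_udf(spark)
    return spark.sql(
        "SELECT get_year_month(event_date) AS year_month "
        "FROM events "
        "WHERE filter_shixi(job_title)"
    )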
Example #2
    def make_not_terminal_udf():
        """ Return true iff next_action is an empty map """
        def get_not_terminal(next_action):
            return len(next_action) > 0

        return udf(get_not_terminal, BooleanType())
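
    # Illustrative usage (not from the original snippet): the returned UDF flags
    # rows whose next_action map is non-empty, e.g.
    #   not_terminal = make_not_terminal_udf()
    #   df = df.withColumn("not_terminal", not_terminal(df["next_action"]))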
Example #3
        false for bad alert, and true for good alert.

    Examples
    ----------
    >>> pdf = pd.read_parquet('datatest')
    >>> classification = sso_fink_candidates_(pdf['roid'])
    >>> print(len(pdf[classification]['objectId'].values))
    3

    >>> assert 'ZTF21acqeepb' in pdf[classification]['objectId'].values
    """
    f_roid = roid.astype(int) == 2

    return f_roid

@pandas_udf(BooleanType(), PandasUDFType.SCALAR)
def sso_fink_candidates(roid) -> pd.Series:
    """ Pandas UDF version of sso_fink_candidates_ for Spark

    Parameters
    ----------
    roid: Spark DataFrame Column
        Column containing the Solar System label

    Returns
    ----------
    out: pandas.Series of bool
        Return a Pandas DataFrame with the appropriate flag:
        false for bad alert, and true for good alert.

    """
Example #4
    StructField("coolantTemp", FloatType(), False),
    StructField("intakeAirTemp", FloatType(), False),
    StructField("intakeAirFlowSpeed", FloatType(), False),
    StructField("batteryPercentage", FloatType(), False),
    StructField("batteryVoltage", FloatType(), False),
    StructField("speed", FloatType(), False),
    StructField("engineVibrationAmplitude", FloatType(), False),
    StructField("throttlePos", FloatType(), False),
    StructField("tirePressure11", FloatType(), False),
    StructField("tirePressure12", FloatType(), False),
    StructField("tirePressure21", FloatType(), False),
    StructField("tirePressure22", FloatType(), False),
    StructField("accelerometer11Value", FloatType(), False),
    StructField("accelerometer12Value", FloatType(), False),
    StructField("accelerometer21Value", FloatType(), False),
    StructField("accelerometer22Value", FloatType(), False),
    StructField("controlUnitFirmware", IntegerType(), False),
    StructField("failureOccurred", BooleanType(), False)
])

# parsing the JSONs based on the schema
unlabeled = rows.map(lambda x: json.loads(x[1], schema)).drop(
    "failureOccurred")

# make the predictions
predictionsDF = model.transform(unlabeled)

# start the streaming
ssc.start()
ssc.awaitTermination()
    return partial


# Boilerplate for generating example main_summary tables
def generate_search_count(engine='google', source='urlbar', count=4):
    return {
        'engine': engine,
        'source': source,
        'count':  count,
    }


addons_type = ArrayType(StructType([
    StructField('addon_id', StringType(), False),
    StructField('blocklisted', BooleanType(), True),
    StructField('name', StringType(), True),
    StructField('user_disabled', BooleanType(), True),
    StructField('app_disabled', BooleanType(), True),
    StructField('version', StringType(), True),
    StructField('scope', LongType(), True),
    StructField('type', StringType(), True),
    StructField('foreign_install', BooleanType(), True),
    StructField('has_binary_components', BooleanType(), True),
    StructField('install_day', LongType(), True),
    StructField('update_day', LongType(), True),
    StructField('signed_state', LongType(), True),
    StructField('is_system', BooleanType(), True),
    StructField('is_web_extension', BooleanType(), True),
    StructField('multiprocess_compatible', BooleanType(), True),
]))
schema = StructType() \
      .add("RecordNumber",IntegerType(),True) \
      .add("Zipcode",IntegerType(),True) \
      .add("ZipCodeType",StringType(),True) \
      .add("City",StringType(),True) \
      .add("State",StringType(),True) \
      .add("LocationType",StringType(),True) \
      .add("Lat",DoubleType(),True) \
      .add("Long",DoubleType(),True) \
      .add("Xaxis",IntegerType(),True) \
      .add("Yaxis",DoubleType(),True) \
      .add("Zaxis",DoubleType(),True) \
      .add("WorldRegion",StringType(),True) \
      .add("Country",StringType(),True) \
      .add("LocationText",StringType(),True) \
      .add("Location",StringType(),True) \
      .add("Decommisioned",BooleanType(),True) \
      .add("TaxReturnsFiled",StringType(),True) \
      .add("EstimatedPopulation",IntegerType(),True) \
      .add("TotalWages",IntegerType(),True) \
      .add("Notes",StringType(),True)

df_with_schema = spark.read.format("csv") \
      .option("header", True) \
      .schema(schema) \
      .load("C:/apps/sparkbyexamples/src/pyspark-examples/resources/zipcodes.csv")
df_with_schema.printSchema()

df2.write.option("header",True) \
 .csv("/tmp/spark_output/zipcodes123")
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, to_json, col, unbase64, base64, split, expr
from pyspark.sql.types import StructField, StructType, StringType, BooleanType, ArrayType, DateType

# TO-DO: create a StructType for the Kafka redis-server topic which has all changes made to Redis - before Spark 3.0.0, schema inference is not automatic
redisServerSchema = StructType([
    StructField("key", StringType()),
    StructField("value", StringType()),
    StructField("expiredType", StringType()),
    StructField("expiredValue", StringType()),
    StructField("existType", StringType()),
    StructField("ch", StringType()),
    StructField("incr", BooleanType()),
    StructField(
        "zSetEntries",
        ArrayType(
            StructType([
                StructField("element", StringType()),
                StructField("score", StringType())
            ])))
])

# TO-DO: create a StructType for the Customer JSON that comes from Redis- before Spark 3.0.0, schema inference is not automatic
customersSchema = StructType([
    StructField("customerName", StringType()),
    StructField("email", StringType()),
    StructField("phone", StringType()),
    StructField("birthDay", StringType())
])

# TO-DO: create a StructType for the Kafka stedi-events topic which has the Customer Risk JSON that comes from Redis- before Spark 3.0.0, schema inference is not automatic
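# A possible shape for that schema -- a sketch only; the exact field names of the
# Customer Risk JSON are not shown in this snippet and are assumed here:
customerRiskSchema = StructType([
    StructField("customer", StringType()),
    StructField("score", StringType())
])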
Example #8
    def test_as_spark_type_pandas_on_spark_dtype(self):
        type_mapper = {
            # binary
            np.character: (np.character, BinaryType()),
            np.bytes_: (np.bytes_, BinaryType()),
            np.string_: (np.bytes_, BinaryType()),
            bytes: (np.bytes_, BinaryType()),
            # integer
            np.int8: (np.int8, ByteType()),
            np.byte: (np.int8, ByteType()),
            np.int16: (np.int16, ShortType()),
            np.int32: (np.int32, IntegerType()),
            np.int64: (np.int64, LongType()),
            np.int: (np.int64, LongType()),
            int: (np.int64, LongType()),
            # floating
            np.float32: (np.float32, FloatType()),
            np.float: (np.float64, DoubleType()),
            np.float64: (np.float64, DoubleType()),
            float: (np.float64, DoubleType()),
            # string
            np.str: (np.unicode_, StringType()),
            np.unicode_: (np.unicode_, StringType()),
            str: (np.unicode_, StringType()),
            # bool
            np.bool: (np.bool, BooleanType()),
            bool: (np.bool, BooleanType()),
            # datetime
            np.datetime64: (np.datetime64, TimestampType()),
            datetime.datetime: (np.dtype("datetime64[ns]"), TimestampType()),
            # DateType
            datetime.date: (np.dtype("object"), DateType()),
            # DecimalType
            decimal.Decimal: (np.dtype("object"), DecimalType(38, 18)),
            # ArrayType
            np.ndarray: (np.dtype("object"), ArrayType(StringType())),
            List[bytes]: (np.dtype("object"), ArrayType(BinaryType())),
            List[np.character]: (np.dtype("object"), ArrayType(BinaryType())),
            List[np.bytes_]: (np.dtype("object"), ArrayType(BinaryType())),
            List[np.string_]: (np.dtype("object"), ArrayType(BinaryType())),
            List[bool]: (np.dtype("object"), ArrayType(BooleanType())),
            List[np.bool]: (np.dtype("object"), ArrayType(BooleanType())),
            List[datetime.date]: (np.dtype("object"), ArrayType(DateType())),
            List[np.int8]: (np.dtype("object"), ArrayType(ByteType())),
            List[np.byte]: (np.dtype("object"), ArrayType(ByteType())),
            List[decimal.Decimal]:
            (np.dtype("object"), ArrayType(DecimalType(38, 18))),
            List[float]: (np.dtype("object"), ArrayType(DoubleType())),
            List[np.float]: (np.dtype("object"), ArrayType(DoubleType())),
            List[np.float64]: (np.dtype("object"), ArrayType(DoubleType())),
            List[np.float32]: (np.dtype("object"), ArrayType(FloatType())),
            List[np.int32]: (np.dtype("object"), ArrayType(IntegerType())),
            List[int]: (np.dtype("object"), ArrayType(LongType())),
            List[np.int]: (np.dtype("object"), ArrayType(LongType())),
            List[np.int64]: (np.dtype("object"), ArrayType(LongType())),
            List[np.int16]: (np.dtype("object"), ArrayType(ShortType())),
            List[str]: (np.dtype("object"), ArrayType(StringType())),
            List[np.unicode_]: (np.dtype("object"), ArrayType(StringType())),
            List[datetime.datetime]:
            (np.dtype("object"), ArrayType(TimestampType())),
            List[np.datetime64]:
            (np.dtype("object"), ArrayType(TimestampType())),
            # CategoricalDtype
            CategoricalDtype(categories=["a", "b", "c"]): (
                CategoricalDtype(categories=["a", "b", "c"]),
                LongType(),
            ),
        }

        for numpy_or_python_type, (dtype, spark_type) in type_mapper.items():
            self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)
            self.assertEqual(pandas_on_spark_type(numpy_or_python_type),
                             (dtype, spark_type))

        with self.assertRaisesRegex(TypeError,
                                    "Type uint64 was not understood."):
            as_spark_type(np.dtype("uint64"))

        with self.assertRaisesRegex(TypeError,
                                    "Type object was not understood."):
            as_spark_type(np.dtype("object"))

        with self.assertRaisesRegex(TypeError,
                                    "Type uint64 was not understood."):
            pandas_on_spark_type(np.dtype("uint64"))

        with self.assertRaisesRegex(TypeError,
                                    "Type object was not understood."):
            pandas_on_spark_type(np.dtype("object"))
Example #9
def process_gdelt_data(spark, datestr):
    """
    Processes the gdelt dataset by day.

    Transforms datatypes and adds a column for the month.

    Reduces the dataset to the necessary columns.

    Maps the country code to the ISO code used in the
    Google and Oxford datasets.

    Determines if the URL contains COVID-related content.

    Writes the cleaned data back to S3 partitioned by country code.

    spark: spark session
    datestr: day to be processed, needs to be a string in the format YYYYMMDD
    """

    # get schema of gdelt data
    schema = schema_helper.gdelt_s0_schema

    # read in original data from S3
    gdelt_file = 'gdelt/' + datestr + '.export.CSV'
    df_gdelt = spark.read.option("delimiter", "\t") \
                .csv(folder_s0 + gdelt_file,header=False,schema=schema)

    # convert date column to date type
    datefunc = F.udf(lambda x: datetime.strptime(x, '%Y%m%d'), DateType())
    df_gdelt = df_gdelt.withColumn('date', datefunc(col('SQLDATE')))
    #create month column
    df_gdelt = df_gdelt.withColumn("month", F.month("date"))

    #reduce dataset by selecting specific columns
    df_gdelt_reduced = df_gdelt.select("GLOBALEVENTID",\
                                    "date",\
                                    "month",\
                                    "Year",\
                                    "Actor1Code",\
                                    "Actor1Name",\
                                    "Actor1CountryCode",\
                                    "Actor2Code",\
                                    "Actor2Name",\
                                    "Actor2CountryCode",\
                                    "IsRootEvent",\
                                    "EventCode",\
                                    "EventBaseCode",\
                                    "EventRootCode",\
                                    "QuadClass",\
                                    "GoldsteinScale",\
                                    "NumMentions",\
                                    "NumSources",\
                                    "NumArticles",\
                                    "AvgTone",\
                                    "ActionGeo_Type",\
                                    "ActionGeo_CountryCode",\
                                    "ActionGeo_ADM1Code",\
                                    "SOURCEURL")

    #determines if url contains covid related content
    #and creates column covid (True=contains keywords)
    covidFunc = F.udf(lambda x: containsCovidContent(x), BooleanType())
    df_gdelt_reduced = df_gdelt_reduced. \
                        withColumn('covid', covidFunc(col('SOURCEURL')))

    #rename column Year to year
    df_gdelt_reduced = df_gdelt_reduced.withColumnRenamed("Year", "year")

    #maps fips based country code to iso
    df_mapping = read_fips2iso_mapping(spark)
    df_gdelt_reduced = df_gdelt_reduced \
                        .join(df_mapping, \
                            on=['ActionGeo_CountryCode'], how='left')

    #write data back to S3 partitioned by country_code
    df_gdelt_reduced.write.mode('overwrite') \
                        .partitionBy("country_code") \
                        .parquet(folder_s1+"gdelt/gdelt.parquet")
def main(context):
    """Main function takes a Spark SQL context."""
    # skips to task 10 if results from 9 are stored
    if os.path.isfile("full_sentiment_data.parquet/._SUCCESS.crc"):
        full_sentiment_data = context.read.parquet(
            "full_sentiment_data.parquet")
    else:
        # TASK 1
        if os.path.isfile("comments.parquet/._SUCCESS.crc") and os.path.isfile(
                "submissions.parquet/._SUCCESS.crc") and os.path.isfile(
                    "labeled_data.parquet/._SUCCESS.crc"):
            # print("WE HERE")
            comments = context.read.parquet("comments.parquet")
            submissions = context.read.parquet("submissions.parquet")
            labeled_data = context.read.parquet("labeled_data.parquet")
        else:
            comments = context.read.json("comments-minimal.json.bz2")
            comments.write.parquet("comments.parquet")

            submissions = context.read.json("submissions.json.bz2")
            submissions.write.parquet("submissions.parquet")

            labeled_data = context.read.csv("labeled_data.csv", header='true')
            labeled_data.write.parquet("labeled_data.parquet")

        # Create temporary views for later
        comments.createGlobalTempView("comments")
        labeled_data.createGlobalTempView("labeled_data")
        submissions.createGlobalTempView("submissions")

        # TASK 4
        context.registerFunction("sanitize", modified_sanitize,
                                 ArrayType(StringType()))

        # TASK 5
        if os.path.isfile("joined_data.parquet/._SUCCESS.crc"):
            joined_data = context.read.parquet("joined_data.parquet")
        else:
            joined_data = generate_joined_data(labeled_data, comments, context)
            joined_data.write.parquet("joined_data.parquet")

        # code to run sanitize on joined data
        if os.path.isfile("ngrams.parquet/._SUCCESS.crc"):
            ngrams = context.read.parquet("ngrams.parquet")
        else:
            joined_data.createOrReplaceTempView("joined_data")

            ngram_sql = """
        SELECT
            Input_id,
            labeldem,
            labelgop,
            labeldjt,
            sanitize(body) AS body
        FROM joined_data"""

            ngrams = context.sql(ngram_sql)
            ngrams.write.parquet("ngrams.parquet")

        # TASK 6A
        # REFERENCE:
        # https://spark.apache.org/docs/latest/ml-features.html#countvectorizer
        # couldn't figure out how to save this simply, so it always has to be rerun? Ah well, it was fast to run ¯\_(ツ)_/¯
        vectorizer = CountVectorizer(inputCol="body",
                                     outputCol="features",
                                     minDF=MIN_DF,
                                     binary=True)
        model = vectorizer.fit(ngrams)

        # TASK 6B
        if os.path.isdir("result.parquet"):
            result = context.read.parquet("result.parquet")
        else:
            result = model.transform(ngrams)
            result.write.parquet("result.parquet")

        if os.path.isdir("sentiment_data.parquet"):
            sentiment_data = context.read.parquet("sentiment_data.parquet")
        else:
            djt_sentiment_sql = """
        SELECT
            *,
            if (labeldjt = 1, 1, 0) AS pos_label,
            if (labeldjt = -1, 1, 0) AS neg_label
        FROM result"""
            result.createOrReplaceTempView("result")
            sentiment_data = context.sql(djt_sentiment_sql)
            sentiment_data.write.parquet("sentiment_data.parquet")
            # sentiment_data.show()

        # TASK 7
        if os.path.isfile(
                "project2/pos.model/bestModel/data/._SUCCESS.crc"
        ) and os.path.isfile(
                "project2/neg.model/bestModel/data/._SUCCESS.crc"):
            pos_model = CrossValidatorModel.load("project2/pos.model")
            neg_model = CrossValidatorModel.load("project2/neg.model")
        else:
            # Initialize two logistic regression models.
            # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
            poslr = LogisticRegression(labelCol="pos_label",
                                       featuresCol="features",
                                       maxIter=10)
            neglr = LogisticRegression(labelCol="neg_label",
                                       featuresCol="features",
                                       maxIter=10)
            # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
            posEvaluator = BinaryClassificationEvaluator(labelCol="pos_label")
            negEvaluator = BinaryClassificationEvaluator(labelCol="neg_label")
            # There are a few parameters associated with logistic regression. We do not know what they are a priori.
            # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
            # We will assume the parameter is 1.0. Grid search takes forever.
            posParamGrid = ParamGridBuilder().addGrid(poslr.regParam,
                                                      [1.0]).build()
            negParamGrid = ParamGridBuilder().addGrid(neglr.regParam,
                                                      [1.0]).build()
            # We initialize a 5 fold cross-validation pipeline.
            posCrossval = CrossValidator(estimator=poslr,
                                         evaluator=posEvaluator,
                                         estimatorParamMaps=posParamGrid,
                                         numFolds=5)
            negCrossval = CrossValidator(estimator=neglr,
                                         evaluator=negEvaluator,
                                         estimatorParamMaps=negParamGrid,
                                         numFolds=5)
            # Although crossvalidation creates its own train/test sets for
            # tuning, we still need a labeled test set, because it is not
            # accessible from the crossvalidator (argh!)
            # Split the data 50/50
            posTrain, posTest = sentiment_data.randomSplit([0.5, 0.5])
            negTrain, negTest = sentiment_data.randomSplit([0.5, 0.5])
            # Train the models
            print("Training positive classifier...")
            pos_model = posCrossval.fit(posTrain)
            print("Training negative classifier...")
            neg_model = negCrossval.fit(negTrain)

            # Once we train the models, we don't want to do it again. We can save the models and load them again later.
            pos_model.save("project2/pos.model")
            neg_model.save("project2/neg.model")

        # TASK 8
        # had to downsample because of RAM issues; this seemed like the correct place to do it, albeit with a little redundant reloading
        # (Windows ate up way too much RAM on the desktop we used, and both of our laptops are too underpowered to run any of this)
        comments = context.read.parquet("comments.parquet").sample(
            False, 0.2, None)
        submissions = context.read.parquet("submissions.parquet").sample(
            False, 0.2, None)

        comments.createOrReplaceTempView("comments")
        submissions.createOrReplaceTempView("submissions")

        full_comments_data = generate_full_comments_data(
            submissions, comments, context)
        # full_comments_data.filter("state is not null").show()

        # TASK 9

        # task 4 redone
        # re-register the function in case the Task 4 registration above was skipped
        context.registerFunction("sanitize", modified_sanitize,
                                 ArrayType(StringType()))

        # task 5 redone
        sanitized_full_comments = generate_sanitized_full_comments(
            context, full_comments_data)
        # sanitized_full_comments.show()

        # task 6A
        result_full_data = model.transform(sanitized_full_comments)
        # result_full_data.show()

        # classification part of task 9
        pos_result = pos_model.transform(result_full_data)
        # pos_result.show()
        neg_result = neg_model.transform(result_full_data)
        # neg_result.show()

        # probability threshold application from task 9
        context.registerFunction("first_element", lambda x: float(x[1]),
                                 FloatType())
        threshold_sql = """
    SELECT
        a.comment_id AS comment_id,
        a.submission_id AS submission_id,
        a.timestamp AS timestamp,
        a.state AS state,
        a.title AS title,
        if (first_element(a.probability) > 0.2, 1, 0) AS pos,
        if (first_element(b.probability) > 0.25, 1, 0) AS neg,
        a.comment_score AS comment_score,
        a.submission_score AS submission_score
    FROM pos_result a
    INNER JOIN neg_result b ON a.comment_id = b.comment_id
    """
        pos_result.createOrReplaceTempView("pos_result")
        neg_result.createOrReplaceTempView("neg_result")
        # pos_result.printSchema()
        # full_sentiment_data = context.sql(threshold_sql).explain()
        full_sentiment_data = context.sql(threshold_sql)
        full_sentiment_data.write.parquet("full_sentiment_data.parquet")
        # full_sentiment_data.show()
        # full_sentiment_data.show(20, False)
        # exit(1)

    # TASK 10

    # part 1
    percent_sql = """
SELECT
    AVG(pos) * 100.0 AS Positive,
    AVG(neg) * 100.0 AS Negative
FROM full_sentiment_data"""
    full_sentiment_data.createOrReplaceTempView("full_sentiment_data")
    task10_1 = context.sql(percent_sql)
    # task10_1.show()
    if os.path.isdir("raw_percentages.csv"):
        shutil.rmtree("raw_percentages.csv")
    task10_1.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("raw_percentages.csv")

    # part 2
    percent_by_day_sql = """
SELECT
    FROM_UNIXTIME(timestamp, 'YYYY-MM-dd') AS date,
    AVG(pos) * 100.0 AS Positive,
    AVG(neg) * 100.0 AS Negative
FROM full_sentiment_data
GROUP BY date
ORDER BY date"""
    # full_sentiment_data.createOrReplaceTempView("full_sentiment_data")
    task10_2 = context.sql(percent_by_day_sql)
    # task10_2.show()
    if os.path.isdir("time_data.csv"):
        shutil.rmtree("time_data.csv")
    task10_2.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("time_data.csv")

    # part 3
    context.registerFunction("valid_state", lambda x: x in STATES,
                             BooleanType())
    task10_3_sql = """
SELECT
    state AS state,
    AVG(pos) * 100.0 AS Positive,
    AVG(neg) * 100.0 AS Negative
FROM full_sentiment_data
WHERE (valid_state(state))
GROUP BY state
    """
    # full_sentiment_data.createOrReplaceTempView("full_sentiment_data")
    task10_3 = context.sql(task10_3_sql)
    if os.path.isdir("state_data.csv"):
        shutil.rmtree("state_data.csv")
    task10_3.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("state_data.csv")

    # part 4
    task_10_4_sql_submission = """
SELECT
    submission_score,
    AVG(pos) * 100.0 AS Positive,
    AVG(neg) * 100.0 AS Negative
FROM full_sentiment_data
GROUP BY submission_score
    """

    task_10_4_sql_comment = """
SELECT
    comment_score,
    AVG(pos) * 100.0 AS Positive,
    AVG(neg) * 100.0 AS Negative
FROM full_sentiment_data
GROUP BY comment_score
    """
    # full_sentiment_data.createOrReplaceTempView("full_sentiment_data")

    percent_submission = context.sql(task_10_4_sql_submission)
    # full_sentiment_data.createOrReplaceTempView("full_sentiment_data")
    percent_comment = context.sql(task_10_4_sql_comment)

    if os.path.isdir("submission_score.csv"):
        shutil.rmtree("submission_score.csv")
    percent_submission.repartition(1).write.format(
        "com.databricks.spark.csv").option("header",
                                           "true").save("submission_score.csv")
    if os.path.isdir("comment_score.csv"):
        shutil.rmtree("comment_score.csv")
    percent_comment.repartition(1).write.format(
        "com.databricks.spark.csv").option("header",
                                           "true").save("comment_score.csv")

    # also for part 4: get the top ten pos and neg submissions
    top_pos_submissions_sql = """
SELECT
    submission_id,
    submission_score,
    AVG(pos) * 100.0 AS Positive,
    AVG(neg) * 100.0 AS Negative
FROM full_sentiment_data
GROUP BY submission_score, submission_id
ORDER BY Positive DESC
LIMIT 10
    """

    top_neg_submissions_sql = """
SELECT
    submission_id,
    submission_score,
    AVG(pos) * 100.0 AS Positive,
    AVG(neg) * 100.0 AS Negative
FROM full_sentiment_data
GROUP BY submission_score, submission_id
ORDER BY Negative DESC
LIMIT 10
    """
    # full_sentiment_data.show()
    top_pos_submissions = context.sql(top_pos_submissions_sql)
    # top_pos_submissions.show()

    top_neg_submissions = context.sql(top_neg_submissions_sql)
    # top_neg_submissions.show()
    if os.path.isdir("top_pos_submissions.csv"):
        shutil.rmtree("top_pos_submissions.csv")
    top_pos_submissions.repartition(1).write.format(
        "com.databricks.spark.csv").option(
            "header", "true").save("top_pos_submissions.csv")
    if os.path.isdir("top_neg_submissions.csv"):
        shutil.rmtree("top_neg_submissions.csv")
    top_neg_submissions.repartition(1).write.format(
        "com.databricks.spark.csv").option(
            "header", "true").save("top_neg_submissions.csv")
Example #11
            StructField("medium", StringType(), True),
            StructField("campaign", StringType(), True),
            StructField("content", StringType(), True)
        ]), True),
    StructField("channel", StringType(), True),
    StructField("client_id", StringType(), True),
    StructField("country", StringType(), True),
    StructField("default_search_engine", StringType(), True),
    StructField("distribution_id", StringType(), True),
    StructField("locale", StringType(), True),
    StructField("normalized_channel", StringType(), True),
    StructField("profile_creation_date", LongType(), True),
    StructField("submission_date_s3", StringType(), False),
    StructField("subsession_length", LongType(), True),
    StructField("subsession_start_date", StringType(), True),
    StructField("sync_configured", BooleanType(), True),
    StructField("sync_count_desktop", IntegerType(), True),
    StructField("sync_count_mobile", IntegerType(), True),
    StructField("timestamp", LongType(), True),
    StructField(SPBE + "total_uri_count", IntegerType(), True),
    StructField(SPBE + "unique_domains_count", IntegerType(), True)
])

default_sample = {
    "app_version": "57.0.0",
    "attribution": {
        "source": "source-value",
        "medium": "medium-value",
        "campaign": "campaign-value",
        "content": "content-value"
    },
STRING = 'string'
INTEGER = 'integer'
DOUBLE = 'double'
BOOLEAN = 'boolean'
TIMESTAMP = 'timestamp'

try:
    from pyspark.sql.types import (StructType, StructField, StringType,
                                   IntegerType, DoubleType, BooleanType)

    SPARK_DTYPE_MAPPING = {
        STRING: StringType(),
        INTEGER: IntegerType(),
        DOUBLE: DoubleType(),
        BOOLEAN: BooleanType()
    }
except (ModuleNotFoundError, NameError):
    pass
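
# Hedged sketch (not part of the original module): the mapping above can be used
# to build a Spark StructType from a plain {field_name: type_string} schema dict
# like the ones defined in MODELS below. The helper name is invented for
# illustration and assumes pyspark imported successfully.
def schema_dict_to_struct_type(schema_dict):
    return StructType([
        StructField(name, SPARK_DTYPE_MAPPING[type_name], True)
        for name, type_name in schema_dict.items()
    ])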

MODELS = {
    'SubmissionProducer': {
        'schema': {
            'id': STRING,
            'subreddit': STRING,
            'subreddit_subscribers': INTEGER,
            'title': STRING,
            'author': STRING,
            'created_utc': DOUBLE,
            'over_18': BOOLEAN,
            'selftext': STRING
        },
Example #13
    import requests
    import boto3
    bucket = "podcast-mp3-bucket"
    try:
        s3 = boto3.resource('s3')
        mp3data = requests.get(url).content
        s3Object = s3.Object(bucket, key)
        s3Object.put(Body=mp3data)
        # success
        return True
    except Exception:
        # error
        return False


url_to_s3_udf = udf(lambda x, z: download_to_s3(x, z), BooleanType())

#Get all podcast episodes in Elasticsearch
dfES = spark.read.format('org.elasticsearch.spark.sql')\
    .option('es.nodes', '10.0.0.6:9200, 10.0.0.14:9200, 10.0.0.10:9200')\
    .option('es.resource', "podcast_v1")\
    .option('es.read.metadata', 'true')\
    .load()

dfES = dfES.select(
    col("_metadata").getItem("_id").alias("idES"), col("downloaded"),
    col("audiourl"))

#Get list of MP3 files in S3
dfS3 = spark.read.format("binaryFile")\
    .option("pathGlobFilter", "*")\
Example #14
# init Spark with spark.sql.shuffle.partitions set to 2
spark = SparkSession.builder.master("local").appName("HouseScrape").config(
    "spark.sql.shuffle.partitions",
    2).config('spark.sql.warehouse.dir',
              'file:///C:/path/to/my/').getOrCreate()

# read data from certain csv files
lands = spark.read.format('csv').option('header', 'true').load('Downloads/*')
# drop rows containing nulls (na.drop returns a new DataFrame, so reassign it)
lands = lands.na.drop()

# filter the data
lands = lands.filter((lands["主要用途"] == "住家用")
                     & (lands["建物型態"].contains("住宅大樓")))
# filter the data with udf
filter_udf = udf(filter_13, BooleanType())
lands = lands.filter(filter_udf(lands["總樓層數"]))

# create a new column for city name

lands = lands.withColumn("縣市名稱", (lands["土地區段位置建物區段門牌"]).substr(1, 3))
# select the columns that match the requirement
lands = lands.select(lands["縣市名稱"], lands['交易年月日'], lands['鄉鎮市區'],
                     lands['建物型態'])
# rename the columns to the required English names
lands = lands.withColumnRenamed('縣市名稱', 'city').withColumnRenamed(
    '交易年月日', 'date').withColumnRenamed('鄉鎮市區', 'district').withColumnRenamed(
        '建物型態', 'building_state')

# convert the date from the Taiwanese (ROC) calendar to the AD (Gregorian) date via udf
converter_udf = udf(covert_date, StringType())
Example #15
from pyspark.sql.functions import isnan, when, count, col
business_df.select([
    count(when(isnan(c) | col(c).isNull(), c)).alias(c)
    for c in business_df.columns
]).show()

# Drop rows with null values in categories column
business_df = business_df.dropna(subset='categories')
# business_df.show(5)

#Only keeping the businesses that are restaurants or are food related
from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType


@udf(returnType=BooleanType())
def filter_restaurants(col):
    for category in col.split(','):
        if 'restaurant' in category.lower() or 'food' in category.lower():
            return True
    return False


@udf(returnType=BooleanType())
def filter_non_restaurants(col):
    category = ' '.join(col.split(','))
    if 'restaurant' not in category.lower() and 'food' not in category.lower():
        return True
    return False
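

# The snippet does not show these UDFs being applied; a plausible usage on the
# 'categories' column (illustrative, not from the original) would be:
restaurants_df = business_df.filter(filter_restaurants(col('categories')))
non_restaurants_df = business_df.filter(filter_non_restaurants(col('categories')))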

Example #16
        'USAWV', 'USAWY', 'USSR', 'UZBK', 'VANU', 'VCAN', 'VEN', 'VI', 'VIETN',
        'WAFR', 'WALLIS', 'WASIA', 'WEEC', 'WEIND', 'WESTW', 'WEUR', 'WORLD',
        'WSOMOA', 'YEMAR', 'YUG', 'ZAIRE', 'ZAMBIA', 'ZIMBAB'
    ])

Filter = Filter_regions.transform(limit_df)

Result_df = Filter.select('DocID', 'Topics').repartition(4)


def inter(a, b):
    match = set(a).issubset(set(b))
    return match


inter_udf = udf(inter, BooleanType())


def threshold(value):
    return 1 if value else 0


threshold_udf = udf(threshold, IntegerType())

df1 = Result_df.join(Result_df.alias("Result_df1").select(col("DocID").alias("DocID2"),col("Topics").alias("Topics2")),col("DocID") < col("DocID2"), 'inner')\
       .withColumn('Intersect_Score',inter_udf(col('Topics'),col('Topics2')))\
       .withColumn('True_match',threshold_udf(col('Intersect_Score')))
Example #17
SCHEMA_TRANSACTIONS = StructType([
    StructField('dt', DateType()),
    StructField('payer_account', StringType()),
    StructField('beneficiary_account', StringType()),
    StructField('amount', DoubleType())
])

SCHEMA_ACCOUNT_INFO = StructType([
    StructField('account', StringType()),
    StructField('name', StringType()),
    StructField('country', StringType())
])

SCHEMA_COUNTRIES = StructType([
    StructField('country', StringType()),
    StructField('allowed', BooleanType())
])

ACCOUNT_INFO_ROWS = [("NL99INGB9999999999", "John Muller BV", "NL"),
                     ("NL88RABO8888888888", "Kris Geusebroek NV", "NL"),
                     ("NL29ABNA5612457383", "Super mooie laptops BV", "NL"),
                     ("BE59587979732526", "Ahmet Erdem Belgian Investment",
                      "BE"), ("BE31199386628955", "Vlaamse Patat", "BE"),
                     ("BE29587431928864", "Gauffre Belgique", "BE"),
                     ("PL84109024029551596171791699", "Polski Beat", "PL"),
                     ("PL75109024026862879594797792", "Zywiec", "PL"),
                     ("NK1", "Kim Jong Un Industries", "NK"),
                     ("NK2", "Kim Jong Un Investment", "NK")]


def generate_transactions(number):
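    # Body not shown in the snippet. A plausible sketch (assumed, not from the
    # original source): build `number` random rows matching SCHEMA_TRANSACTIONS,
    # drawing payer/beneficiary accounts from ACCOUNT_INFO_ROWS.
    import random
    from datetime import date, timedelta

    accounts = [row[0] for row in ACCOUNT_INFO_ROWS]
    transactions = []
    for _ in range(number):
        payer, beneficiary = random.sample(accounts, 2)
        dt = date(2020, 1, 1) + timedelta(days=random.randint(0, 365))
        amount = round(random.uniform(1.0, 10000.0), 2)
        transactions.append((dt, payer, beneficiary, amount))
    return transactions
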
from pyspark.sql.types import BooleanType
import re


def regex_filter(x):
    regexs = [r'\d+']

    if x and x.strip():
        for r in regexs:
            if re.match(r, x, re.IGNORECASE):
                return True
    return False


filter_udf = udf(regex_filter, BooleanType())

data_filter = data.filter(filter_udf(data.listing_id))

# COMMAND ----------

# Create separate dataframe based on the outliers in room pricing,
# data_filter_pos --> greater than average price
# data_filter_neg --> lower than average price
data_filter_pos = data_filter.where(
    col("listing_id").isin([
        "7921556", "18479564", "10452642", "14859885", "6794333", "12382366",
        "3629096", "16031982", "17494091", "7330060"
    ]))

data_filter_neg = data_filter.where(
Example #19
                      "short": "short",
                      "binary": "binary",
                      "null": "null"

                      # "vector": "vector"
                      }

SPARK_DTYPES_DICT = {"string": StringType, "int": IntegerType, "float": FloatType,
                     "double": DoubleType, "boolean": BooleanType, "struct": StructType, "array": ArrayType,
                     "bigint": LongType, "date": DateType, "byte": ByteType, "short": ShortType,
                     "datetime": TimestampType, "binary": BinaryType, "null": NullType
                     }

SPARK_DTYPES_DICT_OBJECTS = \
    {"string": StringType(), "int": IntegerType(), "float": FloatType(),
     "double": DoubleType(), "boolean": BooleanType(), "struct": StructType(), "array": ArrayType(StringType()),
     "bigint": LongType(), "date": DateType(), "byte": ByteType(), "short": ShortType(),
     "datetime": TimestampType(), "binary": BinaryType(), "null": NullType()
     }

# Profiler
PROFILER_TYPES = {"int", "float", "string", "bool", "date", "null", "array", "double"}
PROFILER_LEGEND_TYPES = {"string": "ABC", "int": "#", "integer": "#", "float": "##.#", "double": "##.#", "bigint": "#"}
PROFILER_COLUMN_TYPES = {"categorical", "numeric", "date", "bool", "null", "array"}

# Strings and Function Messages
JUST_CHECKING = "Just checking that Spark and all necessary environment vars are present..."
STARTING_SPARK = "Starting or getting SparkSession and SparkContext..."
STARTING_OPTIMUS = "Transform and Roll out..."

SUCCESS = "Optimus successfully imported. Have fun :)."
Example #20
lambdaDF.show()
lambdaDF.count()


# To use a lambda (or any Python function) in a DataFrame query, we wrap it in a
# Spark _User Defined Function_ (UDF). A UDF is a special wrapper around a
# function, allowing the function to be used in a DataFrame query,
# and requires both the function and the return type to be defined.

# In[ ]:


from pyspark.sql.types import BooleanType
from pyspark.sql.functions import udf

less_ten = udf(lambda s: s < 10, BooleanType())
lambdaDF = subDF.filter(less_ten(subDF.age))
lambdaDF.show()
lambdaDF.count()


# Let's try another example below.

# In[ ]:


# Let's collect the even values less than 10
even = udf(lambda s: s % 2 == 0, BooleanType())
evenDF = lambdaDF.filter(even(lambdaDF.age))
evenDF.show()
evenDF.count()
Example #21
This script is used to:
1. Generate predictions for a set of reviews using a given lexicon.
2. Find the following metrics given the true values and predictions: Accuracy, Precision, Recall, F1 Score.
3. Store the values for all combinations in a CSV.
"""
from itertools import product
from gcloud import storage
from pyspark.sql.functions import concat_ws, split, explode, sum, avg, udf
from pyspark.sql.types import StructType, StringType, DoubleType, BooleanType


def score_to_sentiment(score):
    return score >= 0


scoreUDF = udf(score_to_sentiment, BooleanType())
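

# Hedged sketch (not in the original script): given a DataFrame with boolean
# columns holding the true sentiment and the lexicon-based prediction, the
# metrics listed in the module docstring can be computed with plain
# aggregations. The column names are assumptions for illustration.
def compute_metrics(df, label_col="label", pred_col="prediction"):
    from pyspark.sql import functions as F
    counts = df.select(
        F.sum((F.col(label_col) & F.col(pred_col)).cast("int")).alias("tp"),
        F.sum((~F.col(label_col) & F.col(pred_col)).cast("int")).alias("fp"),
        F.sum((F.col(label_col) & ~F.col(pred_col)).cast("int")).alias("fn"),
        F.sum((~F.col(label_col) & ~F.col(pred_col)).cast("int")).alias("tn"),
    ).first()
    tp, fp, fn, tn = counts["tp"], counts["fp"], counts["fn"], counts["tn"]
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    return accuracy, precision, recall, f1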


def generate_predictions(review_parquet, lexicon_csv):
    # TODO: Check if columns exist
    amazon = spark.read.parquet(review_parquet)

    # TODO: Change lexicon structure so that 'review_word' is used instead of 'word'
    struct = StructType().add(field="word", data_type=StringType()).add(
        field="score", data_type=DoubleType())
    words = spark.read.csv(lexicon_csv, schema=struct)

    # TODO: Should we create reviewID while processing it (instead of here) to be on the safer side?
    amazon = amazon.withColumn('reviewID',
                               concat_ws('-', amazon.asin, amazon.reviewerID))
    exploded_words = amazon.withColumn(
Example #22
    def test_fillna(self):
        schema = StructType(
            [
                StructField("name", StringType(), True),
                StructField("age", IntegerType(), True),
                StructField("height", DoubleType(), True),
                StructField("spy", BooleanType(), True),
            ]
        )

        # fillna shouldn't change non-null values
        row = self.spark.createDataFrame([("Alice", 10, 80.1, True)], schema).fillna(50).first()
        self.assertEqual(row.age, 10)

        # fillna with int
        row = self.spark.createDataFrame([("Alice", None, None, None)], schema).fillna(50).first()
        self.assertEqual(row.age, 50)
        self.assertEqual(row.height, 50.0)

        # fillna with double
        row = self.spark.createDataFrame([("Alice", None, None, None)], schema).fillna(50.1).first()
        self.assertEqual(row.age, 50)
        self.assertEqual(row.height, 50.1)

        # fillna with bool
        row = self.spark.createDataFrame([("Alice", None, None, None)], schema).fillna(True).first()
        self.assertEqual(row.age, None)
        self.assertEqual(row.spy, True)

        # fillna with string
        row = self.spark.createDataFrame([(None, None, None, None)], schema).fillna("hello").first()
        self.assertEqual(row.name, "hello")
        self.assertEqual(row.age, None)

        # fillna with subset specified for numeric cols
        row = (
            self.spark.createDataFrame([(None, None, None, None)], schema)
            .fillna(50, subset=["name", "age"])
            .first()
        )
        self.assertEqual(row.name, None)
        self.assertEqual(row.age, 50)
        self.assertEqual(row.height, None)
        self.assertEqual(row.spy, None)

        # fillna with subset specified for string cols
        row = (
            self.spark.createDataFrame([(None, None, None, None)], schema)
            .fillna("haha", subset=["name", "age"])
            .first()
        )
        self.assertEqual(row.name, "haha")
        self.assertEqual(row.age, None)
        self.assertEqual(row.height, None)
        self.assertEqual(row.spy, None)

        # fillna with subset specified for bool cols
        row = (
            self.spark.createDataFrame([(None, None, None, None)], schema)
            .fillna(True, subset=["name", "spy"])
            .first()
        )
        self.assertEqual(row.name, None)
        self.assertEqual(row.age, None)
        self.assertEqual(row.height, None)
        self.assertEqual(row.spy, True)

        # fillna with dictionary for boolean types
        row = self.spark.createDataFrame([Row(a=None), Row(a=True)]).fillna({"a": True}).first()
        self.assertEqual(row.a, True)
Example #23
def read_dataset(filename):
    spark = init_spark()
    schema = StructType([
        StructField('ID', StringType(), True),
        StructField('Source', StringType(), True),
        StructField('TMC', StringType(), True),
        StructField('Severity', IntegerType(), True),
        StructField('Start_Time', TimestampType(), True),
        StructField('End_Time', TimestampType(), True),
        StructField('Start_Lat', FloatType(), True),
        StructField('Start_Lng', FloatType(), True),
        StructField('End_Lat', FloatType(), True),
        StructField('End_Lng', FloatType(), True),
        StructField('Distance(mi)', FloatType(), True),
        StructField('Description', StringType(), True),
        StructField('Number', StringType(), True),
        StructField('Street', StringType(), True),
        StructField('Side', StringType(), True),
        StructField('City', StringType(), True),
        StructField('County', StringType(), True),
        StructField('State', StringType(), True),
        StructField('Zipcode', StringType(), True),
        StructField('Country', StringType(), True),
        StructField('Timezone', StringType(), True),
        StructField('Airport_Code', StringType(), True),
        StructField('Weather_Timestamp', TimestampType(), True),
        StructField('Temperature(F)', FloatType(), True),
        StructField('Wind_Chill(F)', FloatType(), True),
        StructField('Humidity(%)', FloatType(), True),
        StructField('Pressure(in)', FloatType(), True),
        StructField('Visibility(mi)', FloatType(), True),
        StructField('Wind_Direction', StringType(), True),
        StructField('Wind_Speed(mph)', FloatType(), True),
        StructField('Precipitation(in)', FloatType(), True),
        StructField('Weather_Condition', StringType(), True),
        StructField('Amenity', BooleanType(), True),
        StructField('Bump', BooleanType(), True),
        StructField('Crossing', BooleanType(), True),
        StructField('Give_Way', BooleanType(), True),
        StructField('Junction', BooleanType(), True),
        StructField('No_Exit', BooleanType(), True),
        StructField('Railway', BooleanType(), True),
        StructField('Roundabout', BooleanType(), True),
        StructField('Station', BooleanType(), True),
        StructField('Stop', BooleanType(), True),
        StructField('Traffic_Calming', BooleanType(), True),
        StructField('Traffic_Signal', BooleanType(), True),
        StructField('Turning_Loop', BooleanType(), True),
        StructField('Sunrise_Sunset', StringType(), True),
        StructField('Civil_Twilight', StringType(), True),
        StructField('Nautical_Twilight', StringType(), True),
        StructField('Astronomical_Twilight', StringType(), True)
    ])
    total_accidents_data = spark.read.schema(schema).csv(filename,
                                                         header=True,
                                                         mode="DROPMALFORMED",
                                                         encoding="ISO-8859-1",
                                                         inferSchema=True)
    # drop meaningless and single-class columns
    final_result = total_accidents_data.drop('Country').drop('Turning_Loop').drop('id', 'Source', 'Description')\
        .withColumn('start_year', sql_func.date_format(total_accidents_data.Start_Time, 'y'))
    final_result = final_result.filter(col('start_year') == 2019)
    final_result.show()
    print(final_result.count())
    return final_result
Example #24
simpleData = [("James", 34, "2006-01-01", "true", "M", 3000.60),
              ("Michael", 33, "1980-01-10", "true", "F", 3300.80),
              ("Robert", 37, "06-01-1992", "false", "M", 5000.50)]

columns = [
    "firstname", "age", "jobStartDate", "isGraduated", "gender", "salary"
]
df = spark.createDataFrame(data=simpleData, schema=columns)
df.printSchema()
df.show(truncate=False)

from pyspark.sql.functions import col
from pyspark.sql.types import StringType, BooleanType, DateType
df2 = df.withColumn("age",col("age").cast(StringType())) \
    .withColumn("isGraduated",col("isGraduated").cast(BooleanType())) \
    .withColumn("jobStartDate",col("jobStartDate").cast(DateType()))
df2.printSchema()

df3 = df2.selectExpr("cast(age as int) age",
                     "cast(isGraduated as string) isGraduated",
                     "cast(jobStartDate as string) jobStartDate")
df3.printSchema()
df3.show(truncate=False)

df3.createOrReplaceTempView("CastExample")
df4 = spark.sql(
    "SELECT STRING(age),BOOLEAN(isGraduated),DATE(jobStartDate) from CastExample"
)
df4.printSchema()
df4.show(truncate=False)
Example #25
    def make_not_terminal_udf(actions: List[str]):
        """ Return true iff next_action is terminal (i.e. idx = len(actions)). """
        def get_not_terminal(next_action):
            return next_action < len(actions)

        return udf(get_not_terminal, BooleanType())
Example #26
        "timestamp": (int(
            (timestamp - epoch).total_seconds() * nanoseconds_per_second))
    }

    return date_snippet


def search_row(engine='hooli', count=1, source='searchbar'):
    return Row(engine=text_type(engine), source=text_type(source), count=count)


schema = StructType([
    StructField("document_id", StringType(), True),
    StructField("client_id", StringType(), True),
    StructField("timestamp", StringType(), True),
    StructField("is_default_browser", BooleanType(), True),
    StructField(
        "search_counts",
        ArrayType(
            StructType([
                StructField("engine", StringType(), True),
                StructField("source", StringType(), True),
                StructField("count", LongType(), True)
            ]), True), True),
    StructField("country", StringType(), True),
    StructField("profile_creation_date", LongType(), True),
    StructField("normalized_channel", StringType(), True),
    StructField("os", StringType(), True),
    StructField("subsession_length", LongType(), True),
    StructField("submission_date_s3", StringType(), True),
])
Example #27
def select_relevant_columns(df,
                            discrete_action: bool = True,
                            include_possible_actions: bool = True):
    """ Select all the relevant columns and perform type conversions. """
    if not discrete_action and include_possible_actions:
        raise NotImplementedError(
            "currently we don't support include_possible_actions")

    select_col_list = [
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("reward").cast(FloatType()),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("state_features").cast(ArrayType(FloatType())),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("state_features_presence").cast(ArrayType(BooleanType())),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("next_state_features").cast(ArrayType(FloatType())),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("next_state_features_presence").cast(ArrayType(BooleanType())),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("not_terminal").cast(BooleanType()),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("action_probability").cast(FloatType()),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("mdp_id").cast(LongType()),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("sequence_number").cast(LongType()),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("step").cast(LongType()),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("time_diff").cast(LongType()),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("metrics").cast(ArrayType(FloatType())),
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        col("metrics_presence").cast(ArrayType(BooleanType())),
    ]

    if discrete_action:
        select_col_list += [
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("action").cast(LongType()),
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("next_action").cast(LongType()),
        ]
    else:
        select_col_list += [
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("action").cast(ArrayType(FloatType())),
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("next_action").cast(ArrayType(FloatType())),
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("action_presence").cast(ArrayType(BooleanType())),
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("next_action_presence").cast(ArrayType(BooleanType())),
        ]

    if include_possible_actions:
        select_col_list += [
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("possible_actions_mask").cast(ArrayType(LongType())),
            # pyre-fixme[16]: Module `functions` has no attribute `col`.
            col("possible_next_actions_mask").cast(ArrayType(LongType())),
        ]

    return df.select(*select_col_list)
Example #28
 "isinf":
 lambda c: c == float("inf"),
 "isnan":
 F.isnan,
 "isnat":
 lambda c: NotImplemented,  # Koalas and PySpark do not have a NaT concept.
 "log":
 F.log,
 "log10":
 F.log10,
 "log1p":
 F.log1p,
 "log2":
 F.pandas_udf(lambda s: np.log2(s), DoubleType()),
 "logical_not":
 lambda c: ~(c.cast(BooleanType())),
 "matmul":
 lambda _: NotImplemented,  # Can return a NumPy array in pandas.
 "negative":
 lambda c: c * -1,
 "positive":
 lambda c: c,
 "rad2deg":
 F.pandas_udf(lambda s: np.rad2deg(s), DoubleType()),
 "radians":
 F.radians,
 "reciprocal":
 F.pandas_udf(lambda s: np.reciprocal(s), DoubleType()),
 "rint":
 F.pandas_udf(lambda s: np.rint(s), DoubleType()),
 "sign":
 StructField("geoNetwork_subContinent", StringType(), True),
 StructField("geoNetwork_country", StringType(), True),
 StructField("geoNetwork_region", StringType(), True),
 StructField("geoNetwork_metro", StringType(), True),
 StructField("geoNetwork_city", StringType(), True),
 StructField("geoNetwork_cityId", IntegerType(), True),
 StructField("geoNetwork_networkDomain", StringType(), True),
 StructField("geoNetwork_latitude", DoubleType(), True),
 StructField("geoNetwork_longitude", DoubleType(), True),
 StructField("geoNetwork_networkLocation", StringType(), True),
 StructField("device_browser", StringType(), True),
 StructField("device_browserVersion", DoubleType(), True),
 StructField("device_browserSize", StringType(), True),
 StructField("device_operatingSystem", StringType(), True),
 StructField("device_operatingSystemVersion", StringType(), True),
 StructField("device_isMobile", BooleanType(), True),
 StructField("device_mobileDeviceBranding", StringType(), True),
 StructField("device_mobileDeviceModel", StringType(), True),
 StructField("device_mobileInputSelector", StringType(), True),
 StructField("device_mobileDeviceInfo", StringType(), True),
 StructField("device_mobileDeviceMarketingName", StringType(), True),
 StructField("device_flashVersion", IntegerType(), True),
 StructField("device_javaEnabled", StringType(), True),
 StructField("device_language", StringType(), True),
 StructField("device_screenColors", StringType(), True),
 StructField("device_screenResolution", StringType(), True),
 StructField("device_deviceCategory", StringType(), True),
 StructField("totals_transactionRevenue", StringType(), True),
 StructField("landingPage", StringType(), True),
 StructField("hits_type", StringType(), True),
 StructField("touchpoints", ArrayType(StringType()), True),
Example #30
    StructField(
        "build",
        ArrayType(
            StructType([StructField("application_name", StringType(), True)]),
            True), True),
    StructField(
        "settings",
        ArrayType(StructType([StructField("locale", StringType(), True)]),
                  True), True),
    StructField(
        "active_addons",
        ArrayType(
            MapType(
                StringType(),
                StructType([
                    StructField("blocklisted", BooleanType(), True),
                    StructField("type", StringType(), True),
                    StructField("signed_state", LongType(), True),
                    StructField("user_disabled", BooleanType(), True),
                    StructField("app_disabled", BooleanType(), True),
                    StructField("is_system", BooleanType(), True)
                ]), True), True))
])

default_sample = {
    "client_id":
    "client-id",
    "normalized_channel":
    "release",
    "build": [{
        "application_name": "Firefox"