Exemplo n.º 1
0
# schema used to parse tweets json.
tweet_schema = StructType([
    StructField("created_at", StringType(), nullable=True),
    StructField("text", StringType(), nullable=True),
    StructField("place",
                StructType([
                    StructField("name", StringType(), nullable=True),
                    StructField("country_code", StringType(), nullable=True)
                ]),
                nullable=True),
    StructField("user",
                StructType([
                    StructField("location", StringType(), nullable=True),
                    StructField("created_at", StringType(), nullable=True),
                    StructField("updateTime", StringType(), nullable=True)
                ]),
                nullable=True),
    StructField("entities",
                StructType([
                    StructField("hashtags",
                                ArrayType(
                                    StructType([
                                        StructField("text",
                                                    StringType(),
                                                    nullable=True)
                                    ])),
                                nullable=True)
                ]),
                nullable=True)
])
Exemplo n.º 2
0
    def to_array(col):
        def to_array_(v):
            return v.toArray().tolist()

        return udf(to_array_, ArrayType(DoubleType()))(col)
Exemplo n.º 3
0
def preprocess_file(bucket_name, file_name):

    raw_data = sql_context.read.json("s3a://{0}/{1}".format(
        bucket_name, file_name))

    # Clean question body
    if (config.LOG_DEBUG):
        print(colored("[PROCESSING]: Cleaning question body...", "green"))
    clean_body = udf(lambda body: filter_body(body), StringType())
    partially_cleaned_data = raw_data.withColumn("cleaned_body",
                                                 clean_body("body"))

    # Concat cleaned question body and question title to form question vector
    if (config.LOG_DEBUG):
        print(
            colored(
                "[PROCESSING]: Concating question body and question title...",
                "green"))
    data = partially_cleaned_data.withColumn(
        "text_body", concat(col("title"), lit(" "), col("body")))

    # Tokenize question title
    if (config.LOG_DEBUG):
        print(colored("[PROCESSING]: Tokenizing text vector...", "green"))
    tokenizer = Tokenizer(inputCol="text_body",
                          outputCol="text_body_tokenized")
    tokenized_data = tokenizer.transform(data)

    # Remove stop words
    if (config.LOG_DEBUG):
        print(colored("[PROCESSING]: Removing stop words...", "green"))
    stop_words_remover = StopWordsRemover(
        inputCol="text_body_tokenized",
        outputCol="text_body_stop_words_removed")
    stop_words_removed_data = stop_words_remover.transform(tokenized_data)

    # Stem words
    if (config.LOG_DEBUG):
        print(colored("[PROCESSING]: Stemming tokenized vector...", "green"))
    stem = udf(lambda tokens: lemmatize(tokens), ArrayType(StringType()))
    stemmed_data = stop_words_removed_data.withColumn(
        "text_body_stemmed", stem("text_body_stop_words_removed"))

    # Shingle resulting body
    # if (config.LOG_DEBUG): print(colored("[PROCESSING] Shingling resulting text body...", "green"))
    # shingle = udf(lambda tokens: get_two_gram_shingles(tokens), ArrayType(ArrayType(StringType())))
    # shingled_data = stemmed_data.withColumn("text_body_shingled", shingle("text_body_stemmed"))
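    # A hypothetical sketch of the helper referenced above (the real
    # get_two_gram_shingles is defined elsewhere in this project):
    # def get_two_gram_shingles(tokens):
    #     # Pair each token with its successor: [a, b, c] -> [[a, b], [b, c]]
    #     return [[tokens[i], tokens[i + 1]] for i in range(len(tokens) - 1)]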

    # Extract data that we want
    final_data = stemmed_data
    final_data.registerTempTable("final_data")

    preprocessed_data = sql_context.sql(
        "SELECT title, body, creation_date, text_body, text_body_stemmed, post_type_id, tags, score, comment_count, view_count, id from final_data"
    )

    # Write to AWS
    if (config.LOG_DEBUG):
        print(colored("[UPLOAD]: Writing preprocessed data to AWS...",
                      "green"))
    write_aws_s3(config.S3_BUCKET_BATCH_PREPROCESSED, file_name,
                 preprocessed_data)
Exemplo n.º 4
0
def generate_frequent_words(filename_read_S3, filename_write_S3):
    """
    Generate a list of most frequent words for each subreddit.

    :param filename_read_S3: S3 file location to be read
    :param filename_write_S3: S3 file to write to
    :return: None
    """
    # get data -
    print("Step 1: read cleaned file into Dataframe from S3")
    comments_df1 = sqlContext.read.parquet(filename_read_S3)

    # select limited columns
    comments_df2 = comments_df1.select('subreddit', 'subreddit_id', 'year',
                                       'month', 'body_without_stopwords')
    print("schema of dataset - {0}".format(comments_df2.printSchema()))

    # -------------------------
    # WORD Count
    # -------------------------
    print("Step  3: Apply punctuation to a body_without_stopwords")
    comments_df3 = comments_df2.select(
        'subreddit', 'subreddit_id', 'year', 'month',
        removePunctuation(col('body_without_stopwords')))

    print("Step 3.1: Apply word lemmatization to generate base words")
    # register UDF
    spark.udf.register("lemma", lemma, ArrayType(StringType()))
    lemma_udf = udf(lemma)
    # run transformation
    comments_df31 = comments_df3.withColumn("lemmatized_body",
                                            lemma_udf(col("cleaned_body")))
    comments_df31.printSchema()

    # remove punctuations again
    comments_df32 = comments_df31.select(
        'subreddit', 'subreddit_id', 'year', 'month',
        removePunctuation(col('lemmatized_body')))

    print("Step  4: Split lines to words to generate the count")
    comments_df4 = (comments_df31.select(
        explode(split(comments_df31.cleaned_body, ' ')).alias('word'),
        'subreddit', 'subreddit_id', 'year', 'month').where(col('word') != ''))

    comments_df4.printSchema()

    print("Step  5: Get WordCount")
    comments_df5 = wordCount(comments_df4).orderBy("count", ascending=False)

    print("Step  6: Get ranking of each word based on count by windo")
    window = Window.partitionBy(comments_df5['subreddit_id']).orderBy(
        comments_df5['count'].desc())

    print("Step 7: Get words with rankin > 5")
    comments_df6 = comments_df5.select(
        '*',
        rank().over(window).alias('rank')).filter(col('rank') <= 5)

    print("writing data to S3")
    # ----------------------
    # Store to S3 - as Parquet
    # ----------------------
    print("Step 8: Generate parquet file for the words and load to S3")
    comments_df6.write.parquet(filename_write_S3)
    print("Completed writing data to S3")
    return
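
# NOTE: removePunctuation, wordCount and lemma are helpers defined elsewhere in
# this project and are not part of this snippet. Hypothetical sketches, inferred
# from how they are used above (later steps reference a 'cleaned_body' column and
# a 'count' column), might look like:

def removePunctuation_sketch(column):
    from pyspark.sql.functions import lower, regexp_replace, trim
    # Strip punctuation, lowercase and trim, aliasing the result as 'cleaned_body'.
    return trim(lower(regexp_replace(column, r'[^\w\s]', ''))).alias('cleaned_body')

def wordCount_sketch(word_df):
    # Count occurrences of each word per subreddit and month.
    return word_df.groupBy('subreddit', 'subreddit_id', 'year', 'month',
                           'word').count()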
Exemplo n.º 5
0
	return [wordnet_lemmatizer.lemmatize(word,pos="v") for word in line]

text = removedsw.withColumn("lemma",lemma(removedsw.filtered_words))



def unitoarr(line):
	s = []
	for w in line:
		w = w.strip()
		if len(w) != 1 and w != "" and w != ' ' and len(w)>2:
			s.append(w) 
	return s


unitoarr_udf =  udf(unitoarr, ArrayType(StringType()))

text2 = text.withColumn("review",unitoarr_udf(text.lemma)).withColumn("label",change_labels(text.stars))


ngram = NGram(n=2, inputCol="review", outputCol="ngrams")

ngramDataFrame = ngram.transform(text2)

cv = CountVectorizer(inputCol="ngrams", outputCol="features")

models = cv.fit(ngramDataFrame)

result = models.transform(ngramDataFrame)

result1 = result.select("business_id","text","stars","label","features","ngrams")
Exemplo n.º 6
0
    "boolean": BooleanType,
    "struct": StructType,
    "array": ArrayType,
    "bigint": LongType,
    "date": DateType,
    "byte": ByteType,
    "short": ShortType,
    "datetime": TimestampType,
    "binary": BinaryType,
    "null": NullType,
    "vector": VectorUDT
}

SPARK_DTYPES_DICT_OBJECTS = \
    {"string": StringType(), "int": IntegerType(), "float": FloatType(),
     "double": DoubleType(), "boolean": BooleanType(), "struct": StructType(), "array": ArrayType(StringType()),
     "bigint": LongType(), "date": DateType(), "byte": ByteType(), "short": ShortType(),
     "datetime": TimestampType(), "binary": BinaryType(), "null": NullType()
     }

# Profiler
PROFILER_TYPES = {
    "int", "float", "string", "bool", "date", "null", "array", "double"
}
PROFILER_LEGEND_TYPES = {
    "string": "ABC",
    "int": "#",
    "integer": "#",
    "float": "##.#",
    "double": "##.#",
    "bigint": "#"
Exemplo n.º 7
0
# save model run to mlflow
with mlflow.start_run(run_name='deployment run') as run:
    mlflow.pyfunc.log_model('model',
                            python_model=_lifetimesModelWrapper(model),
                            conda_env=conda_env)

# COMMAND ----------

# MAGIC %md Now that our model, along with its dependency information and class wrapper, has been recorded, let's use mlflow to convert the model into a function we can employ against a Spark DataFrame:

# COMMAND ----------

from pyspark.sql.types import ArrayType, FloatType

# define the schema of the values returned by the function
result_schema = ArrayType(FloatType())

# define function based on mlflow recorded model
probability_alive_udf = mlflow.pyfunc.spark_udf(spark,
                                                'runs:/{0}/model'.format(
                                                    run.info.run_id),
                                                result_type=result_schema)

# register the function for use in SQL
_ = spark.udf.register('probability_alive', probability_alive_udf)

# COMMAND ----------

# MAGIC %md Assuming we had access to customer metrics for frequency, recency and age, we can now use our function to generate some predictions:

# COMMAND ----------
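# The original snippet is truncated here. A hedged sketch of what the next cell
# might look like, assuming a customer_metrics table with frequency, recency and
# T (customer age) columns (the table and column names are assumptions):
customer_metrics = spark.table('customer_metrics')
predictions = customer_metrics.withColumn(
    'prob_alive',
    probability_alive_udf('frequency', 'recency', 'T'))
predictions.show()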
Exemplo n.º 8
0
from pyspark.ml import PipelineModel
from pyspark.ml.linalg import Vectors, VectorUDT

# Command to run this application:
# spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.1,org.elasticsearch:elasticsearch-spark-30_2.12:7.12.1 --master local[*] app.py
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s',
                    level=logging.ERROR)

elastic_host = "elasticsearch"
elastic_index = "matches"
kafkaServer = "kafkaserver:9092"
topic = "dota_lineup"

# Schema of the input data
schema = StructType([
    StructField("dire_lineup", ArrayType(IntegerType(), False), False),
    StructField("radiant_lineup", ArrayType(IntegerType(), False), False),
    StructField("radiant_win", BooleanType(), False),
    StructField("match_seq_num", LongType(), False)
])

# Spark configuration, mainly needed for the elasticsearch plugin
sparkConf = SparkConf().set("spark.app.name", "dotingestion2") \
                        .set("es.nodes", "elasticsearch") \
                        .set("es.port", "9200") \
                        .set("es.mapping.id", "match_seq_num") \
                        .set("es.write.operation", "upsert")

# Load the hero_id conversions
with open("heroes.json", 'r', encoding="utf-8") as f:
    heroes_dict = {hero['id']: i for i, hero in enumerate(loads(f.read()))}
Exemplo n.º 9
0
def convert_types_for_es(df: DataFrame) -> DataFrame:
    to_array = udf(lambda v: v.toArray().tolist(), ArrayType(FloatType()))

    return df.withColumn("radiant_win_prediction", df.prediction.cast(BooleanType())) \
             .withColumn("probability_arr", to_array(df.probability))
Exemplo n.º 10
0
# connection to the broker
v_broker = "ec2-34-236-190-208.compute-1.amazonaws.com:9092"
v_ckpt_loc = "/tmp/checkpoint"

spark = SparkSession.builder.appName("Structured").getOrCreate()
# read the stream from the topic
raw=spark.readStream.format("kafka")\
.option("kafka.bootstrap.servers",v_broker)\
.option("startingOffsets", "earliest")\
.option("subscribe","sensor").load()

# Schema of the incoming data
schema = StructType()\
.add("current", StructType()\
.add("fromDateTime", StringType())\
.add("indexes", ArrayType(StructType().add("advice",StringType()).add("color",StringType()).add("description",StringType()).add("description",StringType()).add("name",StringType()).add("value",DoubleType())))\
.add("standards", ArrayType(StructType().add("averaging",StringType()).add("limit",StringType()).add("name",StringType()).add("percent",DoubleType()).add("pollutant",StringType())))\
.add("tillDateTime", StringType())\
.add("values", ArrayType(StructType().add("name",StringType()).add("value",StringType())))
      )\
.add("forecast", ArrayType(StructType().add("fromDateTime",StringType())\
.add("indexes",ArrayType(StructType().add("advice",StringType()).add("advice",StringType()).add("color",StringType()).add("description",StringType()).add("value",DoubleType())) ) \
.add("standarts",ArrayType(StructType().add("averaging",StringType()).add("limit",StringType()).add("name",StringType()).add("percent",DoubleType()).add("pollutant",StringType()))) \
.add("tillDateTime", StringType())\
.add("values", ArrayType(StructType().add("name",StringType()).add("value",StringType())))\
))\
.add("history", ArrayType(StructType().add("fromDateTime",StringType())\
.add("indexes",ArrayType(StructType().add("advice",StringType()).add("advice",StringType()).add("color",StringType()).add("description",StringType()).add("value",DoubleType())) ) \
.add("standarts",ArrayType(StructType().add("averaging",StringType()).add("limit",StringType()).add("name",StringType()).add("percent",DoubleType()).add("pollutant",StringType()))) \
.add("tillDateTime", StringType())\
.add("values", ArrayType(StructType().add("name",StringType()).add("value",StringType())))\
Exemplo n.º 11
0
    studentMarks2 = [
        Row(1, Row("john", "doe"), 6, [70.0, 35.0, 85.0]),
        Row(2, Row("jane", "doe"), 9, [80.0, 35.0, 92.5, 35.0, 46.0])
    ]
    
    studentMarks2Rdd = spark.sparkContext.parallelize(studentMarks2, 4)

    schema2 = StructType()\
        .add("id", IntegerType(), nullable=True)\
        .add("name", StructType()\
             .add("first", StringType(), nullable=True)\
             .add("last", StringType(), nullable=True)
             , nullable=True)\
        .add("standard", IntegerType(), True)\
        .add("marks", ArrayType(DoubleType(), containsNull=False), nullable = True)

    studentMarks2DF = spark.createDataFrame(studentMarks2Rdd, schema2)

    print("Schema with array")
    studentMarks2DF.printSchema()

    print("DataFrame with array")
    studentMarks2DF.show()

    print("Count elements of each array in the column")
    studentMarks2DF.select("id", F.size("marks").alias("count")).show()

    print("Explode the array elements out into additional rows")
    studentMarks2DF.select("id", F.explode("marks").alias("scores")).show()
Exemplo n.º 12
0
        # weeks.append(long(date_item.strftime("%Y%W")))
        text_week_time = date_item + timedelta(7)
        week_id = long(date_item.strftime("%Y%W"))
        next_week_id = long(text_week_time.strftime("%Y%W"))
        week_and_next_week = Row("week_id", "next_week_id")(week_id,
                                                            next_week_id)
        weeks.append(week_and_next_week)
        date_item += timedelta(7)

    if len(weeks) == 0:
        weeks = [Row("week_id", "next_week_id")(week_fake, week_fake)]

    return weeks


get_weeks = f.udf(get_weeks, ArrayType(TimeStructType))

# def get_week_timestamp_from_week_id(week_id):
#     return 0L
#
# get_week_timestamp_from_week_id = f.udf(get_week_timestamp_from_week_id, LongType())


def get_df_student_package(glueContext):
    dyf_student_package = glueContext.create_dynamic_frame.from_options(
        connection_type="redshift",
        connection_options={
            "url":
            REDSHIFT_DATABASE,
            "user":
            REDSHIFT_USERNAME,
Exemplo n.º 13
0
from pyspark.sql.functions import from_json, to_json, col, unbase64, base64, split, expr
from pyspark.sql.types import StructField, StructType, StringType, BooleanType, ArrayType, DateType, FloatType

# TO-DO: create a StructType for the Kafka redis-server topic which has all changes made to Redis - before Spark 3.0.0, schema inference is not automatic
redisMessageSchema = StructType(
    [
        StructField("key", StringType()),
        StructField("value", StringType()),
        StructField("expiredType", StringType()),
        StructField("expiredValue",StringType()),
        StructField("existType", StringType()),
        StructField("ch", StringType()),
        StructField("incr",BooleanType()),
        StructField("zSetEntries", ArrayType( \
            StructType([
                StructField("element", StringType()),\
                StructField("score", StringType())   \
            ]))                                      \
        )

    ]
)

# TO-DO: create a StructType for the Customer JSON that comes from Redis- before Spark 3.0.0, schema inference is not automatic

customerJSONSchema = StructType([
    StructField("customerName", StringType()),
    StructField("email", StringType()),
    StructField("phone", StringType()),
    StructField("birthDay", StringType())
])
Exemplo n.º 14
0
def generate_column_names(initial, intermediate_count, final):
    columns = ["__col_{:02d}".format(idx) for idx in range(intermediate_count)]
    columns.insert(0, initial)
    columns.append(final)
    return columns


if __name__ == "__main__":
    sc = pyspark.SparkContext('local[*]', 'PipelineFlow')
    sess = pyspark.sql.SparkSession(sc)
    rdd = sc.wholeTextFiles('data/*')
    rdd = rdd.map(lambda x: (x[0], json.loads(x[1])))
    print(type(rdd.take(1)[0][1][0]))
    schema = StructType([
        StructField('file', StringType(), True),
        StructField('content', ArrayType(MapType(StringType(), StringType())),
                    True)
    ])
    df = sess.createDataFrame(rdd, schema)

    trans_manager = TransformerModuleManager("modules")
    print("Available transformers' names: {}".format(", ".join(
        trans_manager.loaded_transformers_names)))

    loaded_transformers = trans_manager.loaded_transformers
    col_names = generate_column_names("content",
                                      len(loaded_transformers) - 1,
                                      "sentences")

    stages = list(
        map(
Exemplo n.º 15
0
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('unit3').getOrCreate()

tokensDF = spark.read.json(
    "/user/cs4984cs5984f18_team4/4_Attack_Westminster_big/Attack_Westminster_big_tokenized.json"
)

import nltk
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType
from collections import Counter
from nltk.corpus import stopwords
import string

custom_stopwords = [
    "``", "''", "'s", "said", "could", "also", "news", "--", "..."
]
stop_words = set(
    stopwords.words('english') + list(string.punctuation) + custom_stopwords)

# [1] Tag tokens with POS
POSTagUDF = udf(lambda x: nltk.pos_tag(x), ArrayType(ArrayType(StringType())))
posRDD = tokensDF.rdd.flatMap(
    lambda x: nltk.pos_tag(x.tokens_with_stopwords)).map(lambda x: (x[0].lower(
    ), x[1])).filter(lambda x: x[0] not in stop_words)

# [2] Get most frequent nouns
counter = Counter(
    posRDD.filter(lambda x: x[1][0] == 'N').map(lambda x: x[0]).collect())
countDF = spark.createDataFrame(counter.most_common(100), ['noun', 'count'])
countDF.write.csv(
Exemplo n.º 16
0
    def hist_date(df, col_name):
        """
        Create a histogram for a date type column
        :param df: Dataframe to be analyzed
        :param col_name: Dataframe column to be analyzed
        :return:
        """
        col_info = {}

        # Create year/month/week day/hour/minute

        def func_infer_date(value, args):
            if value is None:
                result = [None]
            else:
                date = dateutil.parser.parse(value)
                result = [
                    date.year, date.month,
                    date.weekday(), date.hour, date.minute
                ]
            return result

        df = (df.cols.select(col_name).cols.apply(
            col_name, func_infer_date, ArrayType(
                LongType())).cols.unnest(col_name).h_repartition().cache())

        for i in range(5):
            key_name = ""
            temp_col = col_name + "_" + str(i)
            # Years
            if i == 0:
                buckets_date = 100
                key_name = "years"

                min_value = df.cols.min(temp_col)
                max_value = df.cols.max(temp_col)

            # Months
            elif i == 1:
                buckets_date = 12
                min_value = 0
                max_value = 12
                key_name = "months"

            # Weekdays
            elif i == 2:
                buckets_date = 7
                min_value = 0
                max_value = 7
                key_name = "weekdays"

            # Hours
            elif i == 3:
                buckets_date = 24
                min_value = 0
                max_value = 24
                key_name = "hours"

            # Minutes
            elif i == 4:
                buckets_date = 60
                min_value = 0
                max_value = 60
                key_name = "minutes"

            col_info[key_name] = df.cols.hist(temp_col, min_value, max_value,
                                              buckets_date)

        return col_info
Exemplo n.º 17
0
    in_df.coupon.cast(DecimalType(10, 5)),
    in_df['yield'].cast(DecimalType(10, 5)), in_df.type,
    in_df.duration.cast(IntegerType()))

in_cast_df.show()

periodic_value_schema = StructType([
    StructField("period", IntegerType(), True),
    StructField("cp", DecimalType(12, 5), True),
    StructField("pv", DecimalType(12, 5), True),
    StructField("aggpv", DecimalType(12, 5), True),
    StructField("quote", DecimalType(12, 5), True)
])

udf_calc_periodic_value = udf(calc_periodic_value,
                              ArrayType(periodic_value_schema))

in_cast_df.withColumn("periodic_value", udf_calc_periodic_value(in_cast_df["value"], in_cast_df["coupon"],
                                                                in_cast_df["yield"], in_cast_df["type"],
                                                                in_cast_df["duration"]))\
    .withColumn("periodic_value", explode("periodic_value"))\
    .select("id", "value", "periodic_value.period", "periodic_value.cp", "periodic_value.pv", "periodic_value.aggpv",
            "periodic_value.quote")\
    .createOrReplaceTempView("periodic_value_table")

spark.sql(
    "select id as `Bond ID`, period as `Period`, cp as `Coupon payment`, pv as `PV of periodic payments`, "
    "aggpv as A from periodic_value_table").show(50)

spark.sql(
    "select id as `Bond ID`, aggpv as A, value as `FV`, quote as `Quote` "
Exemplo n.º 18
0
total_counts = rawFeatures.rdd.map(lambda row: row['rawFeatures'].toArray(
)).reduce(lambda x, y: [x[i] + y[i] for i in range(len(y))])

vectorizerModel = model.stages[1]
vocabList = vectorizerModel.vocabulary
d = {'vocabList': vocabList, 'counts': total_counts}
spark.createDataFrame(np.array(list(d.values())).T.tolist(),
                      list(d.keys())).show()

from pyspark.sql.functions import udf
import pyspark.sql.functions as F
from pyspark.sql.types import DoubleType, IntegerType
from pyspark.sql.types import ArrayType, StringType

indices_udf = udf(lambda vector: vector.indices.tolist(),
                  ArrayType(IntegerType()))
values_udf = udf(lambda vector: vector.toArray().tolist(),
                 ArrayType(DoubleType()))


def termsIdx2Term(vocabulary):
    def termsIdx2Term(termIndices):
        return [vocabulary[int(index)] for index in termIndices]

    return udf(termsIdx2Term, ArrayType(StringType()))


rawFeatures.withColumn('indices', indices_udf(F.col('rawFeatures'))) \
    .withColumn('values', values_udf(F.col('rawFeatures'))) \
    .withColumn("Terms", termsIdx2Term(vocabList)("indices")).show()
Exemplo n.º 19
0
def select_relevant_columns(df,
                            discrete_action: bool = True,
                            include_possible_actions: bool = True):
    """ Select all the relevant columns and perform type conversions. """
    if not discrete_action and include_possible_actions:
        raise NotImplementedError(
            "currently we don't support include_possible_actions")

    select_col_list = [
        col("reward").cast(FloatType()),
        col("state_features").cast(ArrayType(FloatType())),
        col("state_features_presence").cast(ArrayType(BooleanType())),
        col("next_state_features").cast(ArrayType(FloatType())),
        col("next_state_features_presence").cast(ArrayType(BooleanType())),
        col("not_terminal").cast(BooleanType()),
        col("action_probability").cast(FloatType()),
        col("mdp_id").cast(LongType()),
        col("sequence_number").cast(LongType()),
        col("step").cast(LongType()),
        col("time_diff").cast(LongType()),
        col("metrics").cast(ArrayType(FloatType())),
        col("metrics_presence").cast(ArrayType(BooleanType())),
    ]

    if discrete_action:
        select_col_list += [
            col("action").cast(LongType()),
            col("next_action").cast(LongType()),
        ]
    else:
        select_col_list += [
            col("action").cast(ArrayType(FloatType())),
            col("next_action").cast(ArrayType(FloatType())),
            col("action_presence").cast(ArrayType(BooleanType())),
            col("next_action_presence").cast(ArrayType(BooleanType())),
        ]

    if include_possible_actions:
        select_col_list += [
            col("possible_actions_mask").cast(ArrayType(LongType())),
            col("possible_next_actions_mask").cast(ArrayType(LongType())),
        ]

    return df.select(*select_col_list)
Exemplo n.º 20
0
def termsIdx2Term(vocabulary):
    def termsIdx2Term(termIndices):
        return [vocabulary[int(index)] for index in termIndices]

    return udf(termsIdx2Term, ArrayType(StringType()))
Exemplo n.º 21
0
def pyspark_script_console11(inputs, settings):
    data = inputs.get('data', None)
    df = inputs.get('df', None)
    df1 = inputs.get('df1', None)
    df2 = inputs.get('df2', None)
    df3 = inputs.get('df3', None)
    transformer = inputs.get('transformer', None)
    estimator = inputs.get('estimator', None)
    model = inputs.get('model', None)

    import re
    import datetime
    from pyspark.sql.functions import udf
    from pyspark.sql.types import IntegerType, StringType, ArrayType

    def p_ordinalDate(string):
        start = datetime.datetime.strptime(string.strip(), '%d/%m/%Y')
        return start.toordinal()

    def p_time(string):
        hours = int(string.split(":")[0])
        if "PM" in string: hours += 12
        return hours

    def p_entryLocation(string):
        vectors1 = ['PREMISES-REAR', 'PREMISES-FRONT', 'PREMISES-SIDE']
        for x in vectors1:
            if x in string: return x
        return "UNKNOWN"

    def p_entryPoint(string):
        vectors2 = ['POINT OF ENTRY-DOOR', 'POINT OF ENTRY-WINDOW', \
                    'POINT OF ENTRY-FENCE', 'POINT OF ENTRY-DOOR: GARAGE']
        vectors3 = [
            'POE - DOOR', 'POE - WINDOW', 'POE - FENCE', 'POE - GARAGE'
        ]
        for x, y in list(zip(vectors2, vectors3)):
            if x in string or y in string: return x
        return "UNKNOWN"

    def p_dayOfWeek(string):
        start = datetime.datetime.strptime(string, '%d/%m/%Y')
        return start.weekday()

    def p_northingEasting(string, string2):
        return "%s-%s" % (string, string2)

    def p_methodOfEntry(string):
        if string is None:
            return ''

        narrative = string.split(
            "__________________________________ CREATED BY")[-1]
        if 'NARRATIVE' in narrative or 'CIRCUMSTANCES' in narrative:
            narrative = re.split('NARRATIVE|CIRCUMSTANCES', narrative)[-1]
            narrative = re.split("\*|:", narrative[1:])[0]
        return narrative

        # Classifies if the search was messy

    def p_messy(string):
        negations = ["NOT ", "NO ", "HAVEN'T", "DIDN'T", 'DIDNT', "HAVENT"]
        messywords = ['MESSY', 'MESSIL', 'RUMMAG', 'TIPPED']
        sentences = [
            sentence + '.' for sentence in string.split(".")
            if any(word in sentence for word in messywords)
        ]
        c = 0
        for x in sentences:
            if any(word in x for word in negations):
                c -= 1
            else:
                c += 1
        return 1 if c > 0 else 0

    def p_signature(string):
        if "DEFECA" in string:
            return 1
        if "URINAT" in string:
            return 2
        if "MASTURB" in string:
            return 3
        if "GRAFFIT" in string:
            return 4
        return "UNKNOWN"

    def p_propertySecure(string):
        verbs = ['LOCKED', 'FENCED', 'GATED', 'SECURED', 'BOLTED']
        negations = ["NOT ", "NO ", "HAVEN'T", "DIDN'T", 'DIDNT', "HAVENT"]
        c = 0
        sentences = [
            sentence + '.' for sentence in string.split(".")
            if any(word in sentence for word in verbs)
        ]
        for x in sentences:
            if any(word in x for word in negations):
                c -= 1
            else:
                c += 1
        return 1 if c > 0 else 0

    import nltk
    from nltk.parse.stanford import StanfordDependencyParser
    import string as string_module

    stemmer = nltk.stem.porter.PorterStemmer()
    parser = StanfordDependencyParser(
        path_to_models_jar=
        '/Users/Chao/nzpolice/summer/stanford-parser/stanford-parser-3.8.0-models.jar',
        model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
        path_to_jar=
        '/Users/Chao/nzpolice/summer/stanford-parser/stanford-parser.jar',
        java_options='-Xmx1000M',
        verbose=False)
    remove_punctuation_map = dict(
        (ord(char), None) for char in string_module.punctuation)
    unigram_tagger = nltk.tag.UnigramTagger(nltk.corpus.brown.tagged_sents())
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # For vectorizing text
    def stem_tokens(tokens):
        return [stemmer.stem(item) for item in tokens]

        # Normalizes text (i.e, tokenizes and then stems words)

    def normalize(text):
        return stem_tokens(
            nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))

    def p_propertyStolenList(string):
        if "PROPERTY" not in string:
            return []
        property_list = " ".join([
            re.split(':|_', listing)[0] for listing in re.split(
                "PROPERTY LIST SUMMARY:|PROPERTY STOLEN:", string)
        ])
        text = normalize(property_list)
        tagged = unigram_tagger.tag(text)
        removable = [
            'modus', 'operandi', 'call', 'with', 'list', 'of', 'location',
            'point', 'entry', 'value', 'property',
            'police', 'stage', 'name', 'details', 'insured', 'victim',
            'address'
        ]
        o = []
        for x in tagged:
            if (not (x[1] in ["NN", "NNS"])) or (x[0] in removable):
                pass
            else:
                if not len(x[0]) < 3:
                    o.append(x[0])
        return o

    def p_pullMOTags(string):
        sentences = sent_tokenizer.tokenize(string)
        sentences = [sent.lower().capitalize() for sent in sentences]
        x_relations = []
        for sent in sentences:
            if len(sent.split(" ")) > 100: continue
            try:
                parsed = parser.raw_parse(sent)
                triples = [parse.triples() for parse in parsed]
                selected = [
                    triple for triple in triples[0]
                    if (triple[1] in ("dobj", "nsubjpass"))
                ]
            except:
                continue
            for x in selected:
                x_relations.append(x)
        return x_relations

        # def stem_tokens(tokens):

    # 	return [stemmer.stem(item) for item in tokens]
    #
    #
    # # Normalizes text (i.e, tokenizes and then stems words)
    # def normalize(text):
    # 	if text is None:
    # 		return []
    # 	return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))

    udf_ordinal_date = udf(p_ordinalDate, IntegerType())
    udf_time = udf(p_time, IntegerType())
    udf_entry_location = udf(p_entryLocation, StringType())
    udf_entry_point = udf(p_entryPoint, StringType())
    udf_day_of_week = udf(p_dayOfWeek, IntegerType())
    udf_northing_easting = udf(p_northingEasting, StringType())
    udf_method_of_entry = udf(p_methodOfEntry, StringType())  # *
    udf_messy = udf(p_messy, IntegerType())
    udf_signature = udf(p_signature, IntegerType())
    udf_property_secure = udf(p_propertySecure, IntegerType())
    udf_property_stolen_list = udf(p_propertyStolenList,
                                   ArrayType(StringType()))
    udf_pull_mo_tags = udf(p_pullMOTags, ArrayType(StringType()))

    # udf_normalize = udf(normalize, ArrayType(StringType()))

    FEATURES_TO_USE = [
        ('ordinalDate', 'Occurrence Start Date', udf_ordinal_date),
        ('time', 'Occurrence Start Time', udf_time),
        ('entryLocation', 'Narrative', udf_entry_location),
        ('entryPoint', 'Narrative', udf_entry_point),
        ('dayOfWeek', 'Occurrence Start Date', udf_day_of_week),
        ('northingEasting', ('NZTM Location Northing',
                             'NZTM Location Easting'), udf_northing_easting),
        ('methodOfEntry', 'Narrative', udf_method_of_entry),
        ('messy', 'methodOfEntry', udf_messy),
        ('signature', 'Narrative', udf_signature),
        ('propertySecure', 'Narrative', udf_property_secure),
        ('propertyStolenWordnet', 'Narrative', udf_property_stolen_list),
        # ('cosineTFIDF', 'Narrative', udf_method_of_entry),
        # ('cosineTFIDF2', 'Narrative', udf_method_of_entry),
        ('cosineMO', 'methodOfEntry', udf_pull_mo_tags),
        # ('propertyStolenWordNetNA', 'Narrative', udf_property_stolen_list),
        # ('listSimilarity', 'Narrative', udf_property_stolen_list),
        # ('moSim', 'methodOfEntry', udf_pull_mo_tags),
    ]

    df = df.na.fill({'Narrative': ''})
    # df.na.drop(subset=["Narrative"])

    for t in FEATURES_TO_USE:
        new_col = t[0]
        func = t[2]
        in_cols = t[1]
        params = (df[c] for c in t[1]) if isinstance(in_cols,
                                                     tuple) else [df[in_cols]]
        df = df.withColumn(new_col, func(*params))

    return {
        'data': data,
        'df': df,
        'df1': df1,
        'df2': df2,
        'df3': df3,
        'transformer': transformer,
        'estimator': estimator,
        'model': model
    }
Exemplo n.º 22
0
# MAGIC Let's look at one way to apply the spaCy NLP pipeline to our tweets using SQL and a user defined function (UDF):

# COMMAND ----------

from pyspark.sql.types import ArrayType, FloatType, StringType
import spacy
nlp = spacy.load("en_core_web_sm")


def getVerbs(text):
    doc = nlp(text)
    verbs = [token.lemma_ for token in doc if token.pos_ == "VERB"]
    return verbs


spark.udf.register("getVerbs", getVerbs, ArrayType(StringType()))

# COMMAND ----------

# MAGIC %md
# MAGIC We can now use the UDF in our SQL statements to extract verbs from a tweet:

# COMMAND ----------

# MAGIC %sql
# MAGIC select normalized_text
# MAGIC       ,getVerbs(normalized_text) as `verbs`
# MAGIC from tweets_clean_for_nlp

# COMMAND ----------
Exemplo n.º 23
0
def get_data():
    data = """
[
  {
    "friends": [
      {
        "id": 0,
        "name": "Georgina Sears"
      },
      {
        "id": 1,
        "name": "Miranda Tillman"
      },
      {
        "id": 2,
        "name": "Rosario Doyle"
      }
    ]
  },
  {
    "friends": [
      {
        "id": 0,
        "name": "Manuela Noble"
      },
      {
        "id": 1,
        "name": "Aguilar Roy"
      },
      {
        "id": 2,
        "name": "Holt Espinoza"
      }
    ]
  },
  {
    "friends": [
      {
        "name": "Manuela Noble"
      },
      {
        "name": "Aguilar Roy"
      },
      {
        "id": 2,
        "name": "Holt Espinoza"
      }
    ]
  }
]
    """
    data_dict = json.loads(data)
    print(data_dict)
    schema = StructType().add(
        "friends",
        ArrayType(
            StructType([
                StructField("id", StringType()),
                StructField("name", StringType())
            ])))
    df = spark.createDataFrame(data_dict, schema)
    return df
Exemplo n.º 24
0
def test_exploding_data_frame(spark_session):
    sc = spark_session.sparkContext
    ### Generating a JSON-formatted string:

    _data_js_string = [
        '{"numero_caja":"3","compras":[[{"cantidad":"2","nombre":"Harina","precio_unitario":"1500"},\
				   		           {"cantidad":"5","nombre":"Arroz","precio_unitario":"1000"}],\
                                                          [{"cantidad":"4","nombre":"Frijoles","precio_unitario":"800"}],\
                                                          [{"cantidad":"7","nombre":"Manzana","precio_unitario":"500"},\
				   		           {"cantidad":"2","nombre":"JugoNaranja","precio_unitario":"1800"},\
                                                           {"cantidad":"6","nombre":"Carbon","precio_unitario":"1500"},\
				   		           {"cantidad":"3","nombre":"Pera","precio_unitario":"400"}]]}\
 				                          '
    ]

    ## Defining the schema so that it matches the one developed in the assignment

    schema = StructType([
        StructField(
            "compras",
            ArrayType(
                ArrayType(
                    StructType([
                        StructField("cantidad", StringType()),
                        StructField("nombre", StringType()),
                        StructField("precio_unitario", StringType())
                    ])))),
        StructField("numero_caja", StringType())
    ])
    ## Converting the JSON string to a dataframe

    _data_js = spark_session.read.schema(schema).json(
        sc.parallelize(_data_js_string))

    ## The method under test requires the dataframe after it has been converted from string to JSON
    ## The exploding_data_frame method performs two explodes so that each product is separated out of the array

    _dato_calculado = exploding_data_frame(_data_js)

    ## Generate the expected dataframe with the data sent to the method:

    _dato_esperado = [("3", ["2", "Harina", "1500"]),
                      ("3", ["5", "Arroz", "1000"]),
                      ("3", ["4", "Frijoles", "800"]),
                      ("3", ["7", "Manzana", "500"]),
                      ("3", ["2", "JugoNaranja", "1800"]),
                      ("3", ["6", "Carbon", "1500"]),
                      ("3", ["3", "Pera", "400"])]

    schema = StructType([
        StructField("numero_caja", StringType()),
        StructField(
            "col",
            StructType([
                StructField("cantidad", StringType()),
                StructField("nombre", StringType()),
                StructField("precio_unitario", StringType())
            ]))
    ])

    ## Build the expected data frame
    _dato_esperado_df = spark_session.createDataFrame(_dato_esperado, schema)

    _dato_esperado_df.show()
    _dato_calculado.show()
    assert _dato_esperado_df.collect() == _dato_calculado.collect()
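
# Hypothetical sketch of the exploding_data_frame method exercised above (the real
# implementation is not included in this snippet): two explodes flatten the nested
# array-of-arrays of purchases into one row per product, matching the expected
# (numero_caja, col) layout asserted in the test.
def exploding_data_frame_sketch(df):
    from pyspark.sql.functions import col, explode
    return (df
            .select(col("numero_caja"), explode(col("compras")).alias("compra"))
            .select(col("numero_caja"), explode(col("compra")).alias("col")))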
Exemplo n.º 25
0
spark = SparkSession.builder.getOrCreate()
# Omit all logs except errors
spark.sparkContext.setLogLevel('ERROR')
# Read each file in cricket folder as a separate record
rdd = spark.sparkContext.wholeTextFiles('/user/root/Final/cricket/')
# Suppress hortonworks path prefix from filename and create
# data frame with 2 columns ('doc' and 'text')
data = rdd.map(lambda x: (x[0].replace('hdfs://sandbox-hdp.hortonworks.com:8020', ''), x[1])).toDF(['doc', 'text'])
# Get total document count
total_docs = data.count()

# utility method for tokenizing a piece of text
def tokenize(text):
    return re.findall('\\w+', text.lower())
# Register the tokenize method as a udf
tokenize_udf = F.udf(tokenize, ArrayType(StringType()))
# tokenize all the text
data = data.select(['doc', tokenize_udf('text').alias('text')])
# make 1 separate row for each token
data_tokens = data.withColumn("token", F.explode('text'))

# calculate term frequency
tf = data_tokens.groupBy('doc', 'token').agg(F.count('text').alias('tf'))
# calculate document frequency
df = data_tokens.groupBy('token').agg(F.countDistinct('doc').alias('df'))

# utility method for calculating inverse document frequency
def inverse_doc_frequency(doc_frequency):
    return math.log((total_docs + 1) * 1.0 / (doc_frequency + 1))

# register inverse document frequency as a udf
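# The snippet is cut off here. A hedged sketch of how it might continue: register
# the helper as a UDF, then join document frequencies onto term frequencies to
# compute tf-idf (the tfidf variable name below is an assumption).
from pyspark.sql.types import DoubleType
idf_udf = F.udf(inverse_doc_frequency, DoubleType())
tfidf = tf.join(df, on='token') \
          .withColumn('idf', idf_udf('df')) \
          .withColumn('tf_idf', F.col('tf') * F.col('idf'))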
Exemplo n.º 26
0
    predictions = model.transform(test)
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    print("Root-mean-square error = " + str(rmse))

    # Generate top 10 movie recommendations for each user
    userRecs = model.recommendForAllUsers(10)

    # Cast Float to Double (Float is not supported by the Mongo connector)
    userRecs = userRecs.withColumn(
        'recommendations', userRecs['recommendations'].cast(
            ArrayType(
                StructType([
                    StructField('movie_id', IntegerType()),
                    StructField('rating', DoubleType())
                ]))))

    # Write recommendations to the DB
    userRecs.write.format("com.mongodb.spark.sql.DefaultSource").options(
        uri=uri, collection="user_recommendations").mode("overwrite").save()

    # Generate top 10 user recommendations for each movie
    movieRecs = model.recommendForAllItems(10)

    # Generate top 10 movie recommendations for a specified set of users
    users = ratings.select(als.getUserCol()).distinct().limit(3)
    userSubsetRecs = model.recommendForUserSubset(users, 10)
    # Generate top 10 user recommendations for a specified set of movies
    movies = ratings.select(als.getItemCol()).distinct().limit(3)
Exemplo n.º 27
0
def build_passage_to_entity_maps(content_path,
                                 spark,
                                 max_rank,
                                 dir_path,
                                 dataset_metadata=dataset_metadata):
    """" """
    df = spark.read.parquet(content_path)
    df.printSchema()

    @udf(returnType=ArrayType(StringType()))
    def get_ents(content_bytearray):
        synthetic_entity_links = document_pb2.DocumentContent().FromString(
            pickle.loads(content_bytearray)).synthetic_entity_links
        entity_links = []
        for synthetic_entity_link in synthetic_entity_links:
            entity_links.append(str(synthetic_entity_link.entity_id))
        return entity_links

    df_entity = df.withColumn("entities", get_ents("content_bytearray"))
    df_entity.printSchema()

    for dataset in ['dev', 'train', 'test']:
        dateset_dir = dir_path + '{}_data/'.format(dataset)
        passage_name = 'passage' + '_{}'.format(dataset)
        passage_path = dataset_metadata[passage_name][0]
        print('================================')
        print('Building passage->entity mappings for {}: {}'.format(
            dataset, passage_path))
        run_dict = {}
        doc_ids_list = []
        with open(passage_path, 'r') as f:
            for line in f:

                query = line.split()[0]
                doc_id = line.split()[2]
                rank = int(line.split()[3])

                if rank <= max_rank:

                    if query not in run_dict:
                        run_dict[query] = []
                    run_dict[query].append(doc_id)
                    doc_ids_list.append(doc_id)

        query_list = sorted(list(run_dict.keys()))

        doc_ids_list = list(set(doc_ids_list))
        print("doc_ids_list len = {}".format(len(doc_ids_list)))
        dataset_df = df_entity[df_entity['content_id'].isin(
            doc_ids_list)].select("content_id", "entities")
        print("dataset_map len = {}".format(dataset_df.count()))
        print(dataset_df.head())

        dataset_dict = {}
        for row in dataset_df.collect():
            dataset_dict[row[0]] = row[1]

        print("dataset_dict len = {}".format(len(dataset_dict)))

        write_json_path = dateset_dir + 'passage_to_entity.json'
        print('writing to: {}'.format(write_json_path))
        with open(write_json_path, 'w') as f:
            json.dump(dataset_dict, f, indent=4)
Exemplo n.º 28
0
input_path = sys.argv[1]
output_path = sys.argv[2]
df = spark.read.csv(input_path, header=True, inferSchema=True)
names = df.columns

import pandas as pd
from pyspark.sql.functions import col, pandas_udf, size
from pyspark.sql.types import DoubleType, ArrayType

def predict(*series) -> pd.Series:
    import pandas as pd
    import numpy as np
    from numpy import nan
    from scipy.special._ufuncs import expit
    from scoring_h2oai_experiment_336ccd12_cbb4_11ea_8496_ac1f6b68b7be import Scorer # update with your key
    scorer = Scorer()
    merged = pd.concat(series, axis=1)
    merged.columns = names
    output = scorer.score_batch(merged)
    return pd.Series(output.values.tolist())

    
predict_udf = pandas_udf(predict, returnType=ArrayType(DoubleType()))
columns = [col(name) for name in df.columns]
withPredictions = df.withColumn("prediction", predict_udf(*columns))

# If working with multi-class, can expand prediction, e.g. 3 classes:
num_cols = withPredictions.withColumn("size", size(col("prediction"))).agg({"size": "max"}).head()[0] # To be performant, specify the value, e.g. num_cols=3
withPredictions = withPredictions.select(col("*"), *(col('prediction').getItem(i).alias(f'prediction_{i}') for i in range(num_cols)))
withPredictions = withPredictions.drop(col("prediction"))
Exemplo n.º 29
0
def get_schema(schema_name):
    schema = None
    if schema_name == 'interim_parkingbay_schema':
        schema = StructType([
            StructField('bay_id', IntegerType(), False),
            StructField('last_edit', StringType()),
            StructField('marker_id', StringType()),
            StructField('meter_id', StringType()),
            StructField('rd_seg_id', StringType()),
            StructField('rd_seg_dsc', StringType()),
            StructField(
                'the_geom',
                StructType([
                    StructField(
                        'coordinates',
                        ArrayType(ArrayType(ArrayType(ArrayType(
                            DoubleType()))))),
                    StructField('type', StringType())
                ])),
            StructField('load_id', StringType()),
            StructField('loaded_on', TimestampType())
        ])
    elif schema_name == 'interim_sensor':
        schema = StructType([
            StructField('bay_id', IntegerType(), False),
            StructField('st_marker_id', StringType()),
            StructField('lat', FloatType()),
            StructField('lon', FloatType()),
            StructField(
                'location',
                StructType([
                    StructField('coordinates', ArrayType(DoubleType())),
                    StructField('type', StringType())
                ]), False),
            StructField('status', StringType()),
            StructField('load_id', StringType()),
            StructField('loaded_on', TimestampType())
        ])
    elif schema_name == 'dw_dim_parking_bay':
        schema = StructType([
            StructField('dim_parking_bay_id', StringType(), False),
            StructField('bay_id', IntegerType(), False),
            StructField('marker_id', StringType()),
            StructField('meter_id', StringType()),
            StructField('rd_seg_id', StringType()),
            StructField('rd_seg_dsc', StringType()),
            StructField(
                'the_geom',
                StructType([
                    StructField(
                        'coordinates',
                        ArrayType(ArrayType(ArrayType(ArrayType(
                            DoubleType()))))),
                    StructField('type', StringType())
                ])),
            StructField('load_id', StringType()),
            StructField('loaded_on', TimestampType())
        ])
    elif schema_name == 'dw_dim_location':
        schema = StructType([
            StructField('dim_location_id', StringType(), False),
            StructField(
                'location',
                StructType([
                    StructField('coordinates', ArrayType(DoubleType())),
                    StructField('type', StringType())
                ]), False),
            StructField('lat', FloatType()),
            StructField('lon', FloatType()),
            StructField('load_id', StringType()),
            StructField('loaded_on', TimestampType())
        ])
    elif schema_name == 'dw_dim_st_marker':
        schema = StructType([
            StructField('dim_st_marker_id', StringType(), False),
            StructField('st_marker_id', StringType()),
            StructField('load_id', StringType()),
            StructField('loaded_on', TimestampType())
        ])
    return schema
Exemplo n.º 30
0
from pyspark.sql import Window
import pyspark.sql.functions as F
from pyspark.ml.feature import CountVectorizer
from pyspark.sql.types import FloatType
from pyspark.sql.types import ArrayType, StructType, StructField, IntegerType, StringType, BooleanType
import pyspark.sql.functions as F
import numpy as np
from operator import add
from functools import reduce


@F.udf(
    ArrayType(
        StructType([
            # Adjust types to reflect data types
            StructField("item0", StringType()),
            StructField("item1", IntegerType()),
            StructField("item2", FloatType())
        ])))
def ImpPrice(imp, price):
    imp_rank = range(len(imp))
    price = np.array(price).astype(float).tolist()
    return zip(imp, imp_rank, price)


def getPriceImpressionRank():
    funcs = []
    for col in ["price", 'imp_rank']:
        for func in [F.min, F.max, F.mean, F.stddev]:
            funcs.append(func(col).alias(col + "_" + func.func_name))
    funcs.append(F.count("price").alias('impression_freqs'))