# Schema used to parse tweets JSON.
tweet_schema = StructType([
    StructField("created_at", StringType(), nullable=True),
    StructField("text", StringType(), nullable=True),
    StructField("place", StructType([
        StructField("name", StringType(), nullable=True),
        StructField("country_code", StringType(), nullable=True)
    ]), nullable=True),
    StructField("user", StructType([
        StructField("location", StringType(), nullable=True),
        StructField("created_at", StringType(), nullable=True),
        StructField("updateTime", StringType(), nullable=True)
    ]), nullable=True),
    StructField("entities", StructType([
        StructField("hashtags", ArrayType(StructType([
            StructField("text", StringType(), nullable=True)
        ])), nullable=True)
    ]), nullable=True)
])
def to_array(col):
    def to_array_(v):
        return v.toArray().tolist()
    return udf(to_array_, ArrayType(DoubleType()))(col)
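A brief usage sketch for the helper above; the `spark` session, the example DataFrame, and the column names are illustrative and not part of the original snippet.

from pyspark.ml.linalg import Vectors

# Hypothetical DataFrame with an ML Vector column named "features"
vec_df = spark.createDataFrame(
    [(1, Vectors.dense([0.1, 0.2])), (2, Vectors.dense([0.3, 0.4]))],
    ["id", "features"])

# to_array() turns the Vector column into a plain array<double> column
vec_df.withColumn("features_arr", to_array("features")).show(truncate=False)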
def preprocess_file(bucket_name, file_name):
    raw_data = sql_context.read.json("s3a://{0}/{1}".format(bucket_name, file_name))

    # Clean question body
    if (config.LOG_DEBUG):
        print(colored("[PROCESSING]: Cleaning question body...", "green"))
    clean_body = udf(lambda body: filter_body(body), StringType())
    partially_cleaned_data = raw_data.withColumn("cleaned_body", clean_body("body"))

    # Concat cleaned question body and question title to form question vector
    if (config.LOG_DEBUG):
        print(colored("[PROCESSING]: Concatenating question body and question title...", "green"))
    data = partially_cleaned_data.withColumn(
        "text_body", concat(col("title"), lit(" "), col("body")))

    # Tokenize question title
    if (config.LOG_DEBUG):
        print(colored("[PROCESSING]: Tokenizing text vector...", "green"))
    tokenizer = Tokenizer(inputCol="text_body", outputCol="text_body_tokenized")
    tokenized_data = tokenizer.transform(data)

    # Remove stop words
    if (config.LOG_DEBUG):
        print(colored("[PROCESSING]: Removing stop words...", "green"))
    stop_words_remover = StopWordsRemover(
        inputCol="text_body_tokenized", outputCol="text_body_stop_words_removed")
    stop_words_removed_data = stop_words_remover.transform(tokenized_data)

    # Stem words
    if (config.LOG_DEBUG):
        print(colored("[PROCESSING]: Stemming tokenized vector...", "green"))
    stem = udf(lambda tokens: lemmatize(tokens), ArrayType(StringType()))
    stemmed_data = stop_words_removed_data.withColumn(
        "text_body_stemmed", stem("text_body_stop_words_removed"))

    # Shingle resulting body
    # if (config.LOG_DEBUG):
    #     print(colored("[PROCESSING] Shingling resulting text body...", "green"))
    # shingle = udf(lambda tokens: get_two_gram_shingles(tokens), ArrayType(ArrayType(StringType())))
    # shingled_data = stemmed_data.withColumn("text_body_shingled", shingle("text_body_stemmed"))

    # Extract data that we want
    final_data = stemmed_data
    final_data.registerTempTable("final_data")
    preprocessed_data = sql_context.sql(
        "SELECT title, body, creation_date, text_body, text_body_stemmed, post_type_id, tags, score, comment_count, view_count, id FROM final_data"
    )

    # Write to AWS
    if (config.LOG_DEBUG):
        print(colored("[UPLOAD]: Writing preprocessed data to AWS...", "green"))
    write_aws_s3(config.S3_BUCKET_BATCH_PREPROCESSED, file_name, preprocessed_data)
def generate_frequent_words(filename_read_S3, filename_write_S3):
    """
    Generate a list of most frequent words for each subreddit.
    :param filename_read_S3: S3 file location to be read
    :param filename_write_S3: S3 file to write to
    :return: None
    """
    # Get data
    print("Step 1: read cleaned file into Dataframe from S3")
    comments_df1 = sqlContext.read.parquet(filename_read_S3)

    # Step 2: select limited columns
    comments_df2 = comments_df1.select('subreddit', 'subreddit_id', 'year',
                                       'month', 'body_without_stopwords')
    print("schema of dataset - {0}".format(comments_df2.printSchema()))

    # -------------------------
    # WORD COUNT
    # -------------------------
    print("Step 3: Remove punctuation from body_without_stopwords")
    comments_df3 = comments_df2.select(
        'subreddit', 'subreddit_id', 'year', 'month',
        removePunctuation(col('body_without_stopwords')))

    print("Step 3.1: Apply word lemmatization to generate base words")
    # Register UDF
    spark.udf.register("lemma", lemma, ArrayType(StringType()))
    lemma_udf = udf(lemma)
    # Run transformation
    comments_df31 = comments_df3.withColumn("lemmatized_body",
                                            lemma_udf(col("cleaned_body")))
    print(comments_df31.printSchema())

    # Remove punctuation again
    comments_df32 = comments_df31.select(
        'subreddit', 'subreddit_id', 'year', 'month',
        removePunctuation(col('lemmatized_body')))

    print("Step 4: Split lines into words to generate the count")
    comments_df4 = (comments_df31.select(
        explode(split(comments_df31.cleaned_body, ' ')).alias('word'),
        'subreddit', 'subreddit_id', 'year', 'month').where(col('word') != ''))
    print(comments_df4.printSchema())

    print("Step 5: Get word count")
    comments_df5 = wordCount(comments_df4).orderBy("count", ascending=False)

    print("Step 6: Get ranking of each word based on count by window")
    window = Window.partitionBy(comments_df5['subreddit_id']).orderBy(
        comments_df5['count'].desc())

    print("Step 7: Keep words with rank <= 5")
    comments_df6 = comments_df5.select(
        '*', rank().over(window).alias('rank')).filter(col('rank') <= 5)

    print("writing data to S3")
    # ----------------------
    # Store to S3 - as Parquet
    # ----------------------
    print("Step 8: Generate parquet file for the words and load to S3")
    comments_df6.write.parquet(filename_write_S3)
    print("Completed writing data to S3")
    return
# NOTE: the definition of `lemma` was cut off in the original snippet; it is
# reconstructed here as a UDF, inferred from how it is used on the next line.
def lemma(line):
    return [wordnet_lemmatizer.lemmatize(word, pos="v") for word in line]

lemma = udf(lemma, ArrayType(StringType()))

text = removedsw.withColumn("lemma", lemma(removedsw.filtered_words))


def unitoarr(line):
    # Keep only stripped tokens longer than 2 characters
    s = []
    for w in line:
        w = w.strip()
        if len(w) != 1 and w != "" and w != ' ' and len(w) > 2:
            s.append(w)
    return s


unitoarr_udf = udf(unitoarr, ArrayType(StringType()))
text2 = text.withColumn("review", unitoarr_udf(text.lemma)) \
            .withColumn("label", change_labels(text.stars))

ngram = NGram(n=2, inputCol="review", outputCol="ngrams")
ngramDataFrame = ngram.transform(text2)

cv = CountVectorizer(inputCol="ngrams", outputCol="features")
models = cv.fit(ngramDataFrame)
result = models.transform(ngramDataFrame)
result1 = result.select("business_id", "text", "stars", "label", "features", "ngrams")
"boolean": BooleanType, "struct": StructType, "array": ArrayType, "bigint": LongType, "date": DateType, "byte": ByteType, "short": ShortType, "datetime": TimestampType, "binary": BinaryType, "null": NullType, "vector": VectorUDT } SPARK_DTYPES_DICT_OBJECTS = \ {"string": StringType(), "int": IntegerType(), "float": FloatType(), "double": DoubleType(), "boolean": BooleanType(), "struct": StructType(), "array": ArrayType(StringType()), "bigint": LongType(), "date": DateType(), "byte": ByteType(), "short": ShortType(), "datetime": TimestampType(), "binary": BinaryType(), "null": NullType() } # Profiler PROFILER_TYPES = { "int", "float", "string", "bool", "date", "null", "array", "double" } PROFILER_LEGEND_TYPES = { "string": "ABC", "int": "#", "integer": "#", "float": "##.#", "double": "##.#", "bigint": "#"
# save model run to mlflow
with mlflow.start_run(run_name='deployment run') as run:
    mlflow.pyfunc.log_model(
        'model',
        python_model=_lifetimesModelWrapper(model),
        conda_env=conda_env)

# COMMAND ----------

# MAGIC %md Now that our model along with its dependency information and class wrapper have been recorded, let's use mlflow to convert the model into a function we can employ against a Spark DataFrame:

# COMMAND ----------

from pyspark.sql.types import ArrayType, FloatType

# define the schema of the values returned by the function
result_schema = ArrayType(FloatType())

# define function based on mlflow recorded model
probability_alive_udf = mlflow.pyfunc.spark_udf(
    spark,
    'runs:/{0}/model'.format(run.info.run_id),
    result_type=result_schema)

# register the function for use in SQL
_ = spark.udf.register('probability_alive', probability_alive_udf)

# COMMAND ----------

# MAGIC %md Assuming we had access to customer metrics for frequency, recency and age, we can now use our function to generate some predictions:

# COMMAND ----------
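The original notebook continues with the actual scoring query, which is not shown here. As a rough sketch only, a call to the registered SQL function might look like the following; the `customer_metrics` table and its frequency/recency/T column names are assumptions, not taken from the notebook.

# Sketch only: table and column names below are assumed
pred = spark.sql("""
    SELECT
        customer_id,
        probability_alive(frequency, recency, T) AS p_alive
    FROM customer_metrics
""")
pred.show(5)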
from pyspark.ml import PipelineModel
from pyspark.ml.linalg import Vectors, VectorUDT

# Command to run this application:
# spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.1,org.elasticsearch:elasticsearch-spark-30_2.12:7.12.1 --master local[*] app.py

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s',
                    level=logging.ERROR)

elastic_host = "elasticsearch"
elastic_index = "matches"
kafkaServer = "kafkaserver:9092"
topic = "dota_lineup"

# Schema of the input data
schema = StructType([
    StructField("dire_lineup", ArrayType(IntegerType(), False), False),
    StructField("radiant_lineup", ArrayType(IntegerType(), False), False),
    StructField("radiant_win", BooleanType(), False),
    StructField("match_seq_num", LongType(), False)
])

# Spark configuration, mainly needed for the elasticsearch plugin
sparkConf = SparkConf().set("spark.app.name", "dotingestion2") \
                       .set("es.nodes", "elasticsearch") \
                       .set("es.port", "9200") \
                       .set("es.mapping.id", "match_seq_num") \
                       .set("es.write.operation", "upsert")

# Load the hero_id conversions
with open("heroes.json", 'r', encoding="utf-8") as f:
    heroes_dict = {hero['id']: i for i, hero in enumerate(loads(f.read()))}
def convert_types_for_es(df: DataFrame) -> DataFrame:
    # Cast the prediction to boolean and convert the ML Vector probability
    # column to a plain float array so it can be written to Elasticsearch
    to_array = udf(lambda v: v.toArray().tolist(), ArrayType(FloatType()))
    return df.withColumn("radiant_win_prediction", df.prediction.cast(BooleanType())) \
             .withColumn("probability_arr", to_array(df.probability))
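A minimal usage sketch for the conversion helper above; the `predictions` DataFrame and the selected columns are illustrative and assume the output of a fitted PipelineModel.

# Illustrative: `predictions` is assumed to be the output of PipelineModel.transform()
es_ready = convert_types_for_es(predictions) \
    .select("match_seq_num", "radiant_win_prediction", "probability_arr")

# Write to Elasticsearch; connection settings are assumed to be configured elsewhere
es_ready.write \
    .format("org.elasticsearch.spark.sql") \
    .option("es.resource", elastic_index) \
    .mode("append") \
    .save()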
# Connection to the broker
v_broker = "ec2-34-236-190-208.compute-1.amazonaws.com:9092"
v_ckpt_loc = "/tmp/checkpoint"

spark = SparkSession.builder.appName("Structured").getOrCreate()

# Read the stream from the topic
raw = spark.readStream.format("kafka")\
    .option("kafka.bootstrap.servers", v_broker)\
    .option("startingOffsets", "earliest")\
    .option("subscribe", "sensor").load()

# Schema of the incoming data
schema = StructType()\
    .add("current", StructType()\
        .add("fromDateTime", StringType())\
        .add("indexes", ArrayType(StructType().add("advice", StringType()).add("color", StringType()).add("description", StringType()).add("name", StringType()).add("value", DoubleType())))\
        .add("standards", ArrayType(StructType().add("averaging", StringType()).add("limit", StringType()).add("name", StringType()).add("percent", DoubleType()).add("pollutant", StringType())))\
        .add("tillDateTime", StringType())\
        .add("values", ArrayType(StructType().add("name", StringType()).add("value", StringType()))) )\
    .add("forecast", ArrayType(StructType().add("fromDateTime", StringType())\
        .add("indexes", ArrayType(StructType().add("advice", StringType()).add("color", StringType()).add("description", StringType()).add("value", DoubleType())))\
        .add("standards", ArrayType(StructType().add("averaging", StringType()).add("limit", StringType()).add("name", StringType()).add("percent", DoubleType()).add("pollutant", StringType())))\
        .add("tillDateTime", StringType())\
        .add("values", ArrayType(StructType().add("name", StringType()).add("value", StringType())))\
    ))\
    .add("history", ArrayType(StructType().add("fromDateTime", StringType())\
        .add("indexes", ArrayType(StructType().add("advice", StringType()).add("color", StringType()).add("description", StringType()).add("value", DoubleType())))\
        .add("standards", ArrayType(StructType().add("averaging", StringType()).add("limit", StringType()).add("name", StringType()).add("percent", DoubleType()).add("pollutant", StringType())))\
        .add("tillDateTime", StringType())\
        .add("values", ArrayType(StructType().add("name", StringType()).add("value", StringType())))\
studentMarks2 = [
    Row(1, Row("john", "doe"), 6, [70.0, 35.0, 85.0]),
    Row(2, Row("jane", "doe"), 9, [80.0, 35.0, 92.5, 35.0, 46.0])
]
studentMarks2Rdd = spark.sparkContext.parallelize(studentMarks2, 4)

schema2 = StructType() \
    .add("id", IntegerType(), nullable=True) \
    .add("name", StructType()
         .add("first", StringType(), nullable=True)
         .add("last", StringType(), nullable=True), nullable=True) \
    .add("standard", IntegerType(), True) \
    .add("marks", ArrayType(DoubleType(), containsNull=False), nullable=True)

studentMarks2DF = spark.createDataFrame(studentMarks2Rdd, schema2)

print("Schema with array")
studentMarks2DF.printSchema()

print("DataFrame with array")
studentMarks2DF.show()

print("Count elements of each array in the column")
studentMarks2DF.select("id", F.size("marks").alias("count")).show()

print("Explode the array elements out into additional rows")
studentMarks2DF.select("id", F.explode("marks").alias("scores")).show()
        # weeks.append(long(date_item.strftime("%Y%W")))
        text_week_time = date_item + timedelta(7)
        week_id = long(date_item.strftime("%Y%W"))
        next_week_id = long(text_week_time.strftime("%Y%W"))
        week_and_next_week = Row("week_id", "next_week_id")(week_id, next_week_id)
        weeks.append(week_and_next_week)
        date_item += timedelta(7)
    if len(weeks) == 0:
        weeks = [Row("week_id", "next_week_id")(week_fake, week_fake)]
    return weeks


get_weeks = f.udf(get_weeks, ArrayType(TimeStructType))

# def get_week_timestamp_from_week_id(week_id):
#     return 0L
#
# get_week_timestamp_from_week_id = f.udf(get_week_timestamp_from_week_id, LongType())


def get_df_student_package(glueContext):
    dyf_student_package = glueContext.create_dynamic_frame.from_options(
        connection_type="redshift",
        connection_options={
            "url": REDSHIFT_DATABASE,
            "user": REDSHIFT_USERNAME,
from pyspark.sql.functions import from_json, to_json, col, unbase64, base64, split, expr
from pyspark.sql.types import StructField, StructType, StringType, BooleanType, ArrayType, DateType, FloatType

# TO-DO: create a StructType for the Kafka redis-server topic which has all changes made to Redis - before Spark 3.0.0, schema inference is not automatic
redisMessageSchema = StructType([
    StructField("key", StringType()),
    StructField("value", StringType()),
    StructField("expiredType", StringType()),
    StructField("expiredValue", StringType()),
    StructField("existType", StringType()),
    StructField("ch", StringType()),
    StructField("incr", BooleanType()),
    StructField("zSetEntries", ArrayType(
        StructType([
            StructField("element", StringType()),
            StructField("score", StringType())
        ])
    ))
])

# TO-DO: create a StructType for the Customer JSON that comes from Redis - before Spark 3.0.0, schema inference is not automatic
customerJSONSchema = StructType([
    StructField("customerName", StringType()),
    StructField("email", StringType()),
    StructField("phone", StringType()),
    StructField("birthDay", StringType())
])
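As a minimal sketch of how the first schema above would typically be applied; the broker address and topic name here are assumptions for illustration, not taken from the original exercise.

# Assumed broker/topic names for illustration only
redisRawDF = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "redis-server") \
    .option("startingOffsets", "earliest") \
    .load()

# Kafka values arrive as binary; cast to string, then parse with redisMessageSchema
redisParsedDF = redisRawDF \
    .selectExpr("cast(value as string) as value") \
    .withColumn("message", from_json(col("value"), redisMessageSchema)) \
    .select("message.*")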
def generate_column_names(initial, intermediate_count, final):
    columns = ["__col_{:02d}".format(idx) for idx in range(intermediate_count)]
    columns.insert(0, initial)
    columns.append(final)
    return columns


if __name__ == "__main__":
    sc = pyspark.SparkContext('local[*]', 'PipelineFlow')
    sess = pyspark.sql.SparkSession(sc)

    rdd = sc.wholeTextFiles('data/*')
    rdd = rdd.map(lambda x: (x[0], json.loads(x[1])))
    print(type(rdd.take(1)[0][1][0]))

    schema = StructType([
        StructField('file', StringType(), True),
        StructField('content', ArrayType(MapType(StringType(), StringType())), True)
    ])
    df = sess.createDataFrame(rdd, schema)

    trans_manager = TransformerModuleManager("modules")
    print("Available transformers' names: {}".format(", ".join(
        trans_manager.loaded_transformers_names)))
    loaded_transformers = trans_manager.loaded_transformers

    col_names = generate_column_names("content", len(loaded_transformers) - 1, "sentences")
    stages = list(
        map(
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('unit3').getOrCreate()
tokensDF = spark.read.json(
    "/user/cs4984cs5984f18_team4/4_Attack_Westminster_big/Attack_Westminster_big_tokenized.json"
)

import nltk
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType
from collections import Counter
from nltk.corpus import stopwords
import string

# Build the stop word set before it is referenced in the filter below
custom_stopwords = [
    "``", "''", "'s", "said", "could", "also", "news", "--", "..."
]
stop_words = set(
    stopwords.words('english') + list(string.punctuation) + custom_stopwords)

# [1] Tag tokens with POS
POSTagUDF = udf(lambda x: nltk.pos_tag(x), ArrayType(ArrayType(StringType())))
posRDD = tokensDF.rdd.flatMap(
    lambda x: nltk.pos_tag(x.tokens_with_stopwords)).map(
        lambda x: (x[0].lower(), x[1])).filter(lambda x: x[0] not in stop_words)

# [2] Get most frequent nouns
counter = Counter(
    posRDD.filter(lambda x: x[1][0] == 'N').map(lambda x: x[0]).collect())
countDF = spark.createDataFrame(counter.most_common(100), ['noun', 'count'])
countDF.write.csv(
def hist_date(df, col_name): """ Create a histogram for a date type column :param df: Dataframe to be analyzed :param col_name: Dataframe column to be analyzed :return: """ col_info = {} # Create year/month/week day/hour/minute def func_infer_date(value, args): if value is None: result = [None] else: date = dateutil.parser.parse(value) result = [ date.year, date.month, date.weekday(), date.hour, date.minute ] return result df = (df.cols.select(col_name).cols.apply( col_name, func_infer_date, ArrayType( LongType())).cols.unnest(col_name).h_repartition().cache()) for i in range(5): key_name = "" temp_col = col_name + "_" + str(i) # Years if i == 0: buckets_date = 100 key_name = "years" min_value = df.cols.min(temp_col) max_value = df.cols.max(temp_col) # Months elif i == 1: buckets_date = 12 min_value = 0 max_value = 12 key_name = "months" # Weekdays elif i == 2: buckets_date = 7 min_value = 0 max_value = 7 key_name = "weekdays" # Hours elif i == 3: buckets_date = 24 min_value = 0 max_value = 24 key_name = "hours" # Minutes elif i == 4: buckets_date = 60 min_value = 0 max_value = 60 key_name = "minutes" col_info[key_name] = df.cols.hist(temp_col, min_value, max_value, buckets_date) return col_info
in_df.coupon.cast(DecimalType(10, 5)), in_df['yield'].cast(DecimalType(10, 5)), in_df.type, in_df.duration.cast(IntegerType())) in_cast_df.show() periodic_value_schema = StructType([ StructField("period", IntegerType(), True), StructField("cp", DecimalType(12, 5), True), StructField("pv", DecimalType(12, 5), True), StructField("aggpv", DecimalType(12, 5), True), StructField("quote", DecimalType(12, 5), True) ]) udf_calc_periodic_value = udf(calc_periodic_value, ArrayType(periodic_value_schema)) in_cast_df.withColumn("periodic_value", udf_calc_periodic_value(in_cast_df["value"], in_cast_df["coupon"], in_cast_df["yield"], in_cast_df["type"], in_cast_df["duration"]))\ .withColumn("periodic_value", explode("periodic_value"))\ .select("id", "value", "periodic_value.period", "periodic_value.cp", "periodic_value.pv", "periodic_value.aggpv", "periodic_value.quote")\ .createOrReplaceTempView("periodic_value_table") spark.sql( "select id as `Bond ID`, period as `Period`, cp as `Coupon payment`, pv as `PV of periodic payments`, " "aggpv as A from periodic_value_table").show(50) spark.sql( "select id as `Bond ID`, aggpv as A, value as `FV`, quote as `Quote` "
total_counts = rawFeatures.rdd.map(
    lambda row: row['rawFeatures'].toArray()).reduce(
        lambda x, y: [x[i] + y[i] for i in range(len(y))])

vectorizerModel = model.stages[1]
vocabList = vectorizerModel.vocabulary
d = {'vocabList': vocabList, 'counts': total_counts}
spark.createDataFrame(np.array(list(d.values())).T.tolist(), list(d.keys())).show()

from pyspark.sql.functions import udf
import pyspark.sql.functions as F
from pyspark.sql.types import DoubleType, IntegerType
from pyspark.sql.types import ArrayType, StringType

indices_udf = udf(lambda vector: vector.indices.tolist(), ArrayType(IntegerType()))
values_udf = udf(lambda vector: vector.toArray().tolist(), ArrayType(DoubleType()))


def termsIdx2Term(vocabulary):
    def termsIdx2Term(termIndices):
        return [vocabulary[int(index)] for index in termIndices]
    return udf(termsIdx2Term, ArrayType(StringType()))


rawFeatures.withColumn('indices', indices_udf(F.col('rawFeatures'))) \
    .withColumn('values', values_udf(F.col('rawFeatures'))) \
    .withColumn("Terms", termsIdx2Term(vocabList)("indices")).show()
def select_relevant_columns(df,
                            discrete_action: bool = True,
                            include_possible_actions: bool = True):
    """ Select all the relevant columns and perform type conversions. """
    if not discrete_action and include_possible_actions:
        raise NotImplementedError("currently we don't support include_possible_actions")

    select_col_list = [
        col("reward").cast(FloatType()),
        col("state_features").cast(ArrayType(FloatType())),
        col("state_features_presence").cast(ArrayType(BooleanType())),
        col("next_state_features").cast(ArrayType(FloatType())),
        col("next_state_features_presence").cast(ArrayType(BooleanType())),
        col("not_terminal").cast(BooleanType()),
        col("action_probability").cast(FloatType()),
        col("mdp_id").cast(LongType()),
        col("sequence_number").cast(LongType()),
        col("step").cast(LongType()),
        col("time_diff").cast(LongType()),
        col("metrics").cast(ArrayType(FloatType())),
        col("metrics_presence").cast(ArrayType(BooleanType())),
    ]

    if discrete_action:
        select_col_list += [
            col("action").cast(LongType()),
            col("next_action").cast(LongType()),
        ]
    else:
        select_col_list += [
            col("action").cast(ArrayType(FloatType())),
            col("next_action").cast(ArrayType(FloatType())),
            col("action_presence").cast(ArrayType(BooleanType())),
            col("next_action_presence").cast(ArrayType(BooleanType())),
        ]

    if include_possible_actions:
        select_col_list += [
            col("possible_actions_mask").cast(ArrayType(LongType())),
            col("possible_next_actions_mask").cast(ArrayType(LongType())),
        ]

    return df.select(*select_col_list)
def termsIdx2Term(vocabulary):
    def termsIdx2Term(termIndices):
        return [vocabulary[int(index)] for index in termIndices]
    return udf(termsIdx2Term, ArrayType(StringType()))
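A short usage sketch for the factory above, assuming a fitted CountVectorizerModel `cvModel` and a `topics` DataFrame with a `termIndices` array column; both names are illustrative, not defined in the snippet.

import pyspark.sql.functions as F

# cvModel and topics are assumed to exist; the UDF maps term indices back to vocabulary terms
vocab = cvModel.vocabulary
topics.withColumn("terms", termsIdx2Term(vocab)(F.col("termIndices"))) \
      .show(truncate=False)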
def pyspark_script_console11(inputs, settings): data = inputs.get('data', None) df = inputs.get('df', None) df1 = inputs.get('df1', None) df2 = inputs.get('df2', None) df3 = inputs.get('df3', None) transformer = inputs.get('transformer', None) estimator = inputs.get('estimator', None) model = inputs.get('model', None) import re import datetime from pyspark.sql.functions import udf from pyspark.sql.types import IntegerType, StringType, ArrayType def p_ordinalDate(string): start = datetime.datetime.strptime(string.strip(), '%d/%m/%Y') return start.toordinal() def p_time(string): hours = int(string.split(":")[0]) if "PM" in string: hours += 12 return hours def p_entryLocation(string): vectors1 = ['PREMISES-REAR', 'PREMISES-FRONT', 'PREMISES-SIDE'] for x in vectors1: if x in string: return x return "UNKNOWN" def p_entryPoint(string): vectors2 = ['POINT OF ENTRY-DOOR', 'POINT OF ENTRY-WINDOW', \ 'POINT OF ENTRY-FENCE', 'POINT OF ENTRY-DOOR: GARAGE'] vectors3 = [ 'POE - DOOR', 'POE - WINDOW', 'POE - FENCE', 'POE - GARAGE' ] for x, y in list(zip(vectors2, vectors3)): if x in string or y in string: return x return "UNKNOWN" def p_dayOfWeek(string): start = datetime.datetime.strptime(string, '%d/%m/%Y') return start.weekday() def p_northingEasting(string, string2): return "%s-%s" % (string, string2) def p_methodOfEntry(string): if string is None: return '' narrative = string.split( "__________________________________ CREATED BY")[-1] if 'NARRATIVE' in narrative or 'CIRCUMSTANCES' in narrative: narrative = re.split('NARRATIVE|CIRCUMSTANCES', narrative)[-1] narrative = re.split("\*|:", narrative[1:])[0] return narrative # Classifies if the search was messy def p_messy(string): negations = ["NOT ", "NO ", "HAVEN'T", "DIDN'T", 'DIDNT', "HAVENT"] messywords = ['MESSY', 'MESSIL', 'RUMMAG', 'TIPPED'] sentences = [ sentence + '.' for sentence in string.split(".") if any(word in sentence for word in messywords) ] c = 0 for x in sentences: if any(word in x for word in negations): c -= 1 else: c += 1 return 1 if c > 0 else 0 def p_signature(string): if "DEFECA" in string: return 1 if "URINAT" in string: return 2 if "MASTURB" in string: return 3 if "GRAFFIT" in string: return 4 return "UNKNOWN" def p_propertySecure(string): verbs = ['LOCKED', 'FENCED', 'GATED', 'SECURED', 'BOLTED'] negations = ["NOT ", "NO ", "HAVEN'T", "DIDN'T", 'DIDNT', "HAVENT"] c = 0 sentences = [ sentence + '.' 
for sentence in string.split(".") if any(word in sentence for word in verbs) ] for x in sentences: if any(word in x for word in negations): c -= 1 else: c += 1 return 1 if c > 0 else 0 import nltk from nltk.parse.stanford import StanfordDependencyParser import string as string_module stemmer = nltk.stem.porter.PorterStemmer() parser = StanfordDependencyParser( path_to_models_jar= '/Users/Chao/nzpolice/summer/stanford-parser/stanford-parser-3.8.0-models.jar', model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz', path_to_jar= '/Users/Chao/nzpolice/summer/stanford-parser/stanford-parser.jar', java_options='-Xmx1000M', verbose=False) remove_punctuation_map = dict( (ord(char), None) for char in string_module.punctuation) unigram_tagger = nltk.tag.UnigramTagger(nltk.corpus.brown.tagged_sents()) sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') # For vectorizing text def stem_tokens(tokens): return [stemmer.stem(item) for item in tokens] # Normalizes text (i.e, tokenizes and then stems words) def normalize(text): return stem_tokens( nltk.word_tokenize(text.lower().translate(remove_punctuation_map))) def p_propertyStolenList(string): if "PROPERTY" not in string: return [] property_list = " ".join([ re.split(':|_', listing)[0] for listing in re.split( "PROPERTY LIST SUMMARY:|PROPERTY STOLEN:", string) ]) text = normalize(property_list) tagged = unigram_tagger.tag(text) removable = [ 'modus', 'operandi', 'call', 'with', 'list', 'of', 'location', 'point', 'entry', 'value', 'property' 'police', 'stage', 'name', 'details', 'insured', 'victim', 'address' ] o = [] for x in tagged: if (not (x[1] in ["NN", "NNS"])) or (x[0] in removable): pass else: if not len(x[0]) < 3: o.append(x[0]) return o def p_pullMOTags(string): sentences = sent_tokenizer.tokenize(string) sentences = [sent.lower().capitalize() for sent in sentences] x_relations = [] for sent in sentences: if len(sent.split(" ")) > 100: continue try: parsed = parser.raw_parse(sent) triples = [parse.triples() for parse in parsed] selected = [ triple for triple in triples[0] if (triple[1] in ("dobj", "nsubjpass")) ] except: continue for x in selected: x_relations.append(x) return x_relations # def stem_tokens(tokens): # return [stemmer.stem(item) for item in tokens] # # # # Normalizes text (i.e, tokenizes and then stems words) # def normalize(text): # if text is None: # return [] # return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map))) udf_ordinal_date = udf(p_ordinalDate, IntegerType()) udf_time = udf(p_time, IntegerType()) udf_entry_location = udf(p_entryLocation, StringType()) udf_entry_point = udf(p_entryPoint, StringType()) udf_day_of_week = udf(p_dayOfWeek, IntegerType()) udf_northing_easting = udf(p_northingEasting, StringType()) udf_method_of_entry = udf(p_methodOfEntry, StringType()) # * udf_messy = udf(p_messy, IntegerType()) udf_signature = udf(p_signature, IntegerType()) udf_property_secure = udf(p_propertySecure, IntegerType()) udf_property_stolen_list = udf(p_propertyStolenList, ArrayType(StringType())) udf_pull_mo_tags = udf(p_pullMOTags, ArrayType(StringType())) # udf_normalize = udf(normalize, ArrayType(StringType())) FEATURES_TO_USE = [ ('ordinalDate', 'Occurrence Start Date', udf_ordinal_date), ('time', 'Occurrence Start Time', udf_time), ('entryLocation', 'Narrative', udf_entry_location), ('entryPoint', 'Narrative', udf_entry_point), ('dayOfWeek', 'Occurrence Start Date', udf_day_of_week), ('northingEasting', ('NZTM Location Northing', 'NZTM Location Easting'), 
udf_northing_easting), ('methodOfEntry', 'Narrative', udf_method_of_entry), ('messy', 'methodOfEntry', udf_messy), ('signature', 'Narrative', udf_signature), ('propertySecure', 'Narrative', udf_property_secure), ('propertyStolenWordnet', 'Narrative', udf_property_stolen_list), # ('cosineTFIDF', 'Narrative', udf_method_of_entry), # ('cosineTFIDF2', 'Narrative', udf_method_of_entry), ('cosineMO', 'methodOfEntry', udf_pull_mo_tags), # ('propertyStolenWordNetNA', 'Narrative', udf_property_stolen_list), # ('listSimilarity', 'Narrative', udf_property_stolen_list), # ('moSim', 'methodOfEntry', udf_pull_mo_tags), ] df = df.na.fill({'Narrative': ''}) # df.na.drop(subset=["Narrative"]) for t in FEATURES_TO_USE: new_col = t[0] func = t[2] in_cols = t[1] params = (df[c] for c in t[1]) if isinstance(in_cols, tuple) else [df[in_cols]] df = df.withColumn(new_col, func(*params)) return { 'data': data, 'df': df, 'df1': df1, 'df2': df2, 'df3': df3, 'transformer': transformer, 'estimator': estimator, 'model': model }
# MAGIC Let's look at one way to apply the spaCy NLP pipeline to our tweets using SQL and a user defined function (UDF):

# COMMAND ----------

from pyspark.sql.types import ArrayType, FloatType, StringType
import spacy

nlp = spacy.load("en_core_web_sm")


def getVerbs(text):
    doc = nlp(text)
    verbs = [token.lemma_ for token in doc if token.pos_ == "VERB"]
    return verbs


spark.udf.register("getVerbs", getVerbs, ArrayType(StringType()))

# COMMAND ----------

# MAGIC %md
# MAGIC We can now use the UDF in our SQL statements to extract verbs from a tweet:

# COMMAND ----------

# MAGIC %sql
# MAGIC select normalized_text
# MAGIC        ,getVerbs(normalized_text) as `verbs`
# MAGIC from tweets_clean_for_nlp

# COMMAND ----------
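For reference, the same registered UDF can also be invoked through the DataFrame API; this is a sketch only, with `tweets_clean_for_nlp` taken from the SQL cell above and everything else illustrative.

from pyspark.sql.functions import expr

# Equivalent of the %sql cell, using the registered UDF via expr()
tweets_df = spark.table("tweets_clean_for_nlp")
tweets_df.select("normalized_text",
                 expr("getVerbs(normalized_text)").alias("verbs")).show(5, truncate=False)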
def get_data(): data = """ [ { "friends": [ { "id": 0, "name": "Georgina Sears" }, { "id": 1, "name": "Miranda Tillman" }, { "id": 2, "name": "Rosario Doyle" } ] }, { "friends": [ { "id": 0, "name": "Manuela Noble" }, { "id": 1, "name": "Aguilar Roy" }, { "id": 2, "name": "Holt Espinoza" } ] }, { "friends": [ { "name": "Manuela Noble" }, { "name": "Aguilar Roy" }, { "id": 2, "name": "Holt Espinoza" } ] } ] """ data_dict = json.loads(data) print(data_dict) schema = StructType().add( "friends", ArrayType( StructType([ StructField("id", StringType()), StructField("name", StringType()) ]))) df = spark.createDataFrame(data_dict, schema) return df
def test_exploding_data_frame(spark_session):
    sc = spark_session.sparkContext

    ### Generating a string in JSON format:
    _data_js_string = [
        '{"numero_caja":"3","compras":[[{"cantidad":"2","nombre":"Harina","precio_unitario":"1500"},\
        {"cantidad":"5","nombre":"Arroz","precio_unitario":"1000"}],\
        [{"cantidad":"4","nombre":"Frijoles","precio_unitario":"800"}],\
        [{"cantidad":"7","nombre":"Manzana","precio_unitario":"500"},\
        {"cantidad":"2","nombre":"JugoNaranja","precio_unitario":"1800"},\
        {"cantidad":"6","nombre":"Carbon","precio_unitario":"1500"},\
        {"cantidad":"3","nombre":"Pera","precio_unitario":"400"}]]}\
        '
    ]

    ## Defining the schema to match the one developed in the assignment
    schema = StructType([
        StructField(
            "compras",
            ArrayType(
                ArrayType(
                    StructType([
                        StructField("cantidad", StringType()),
                        StructField("nombre", StringType()),
                        StructField("precio_unitario", StringType())
                    ])))),
        StructField("numero_caja", StringType())
    ])

    ## Converting the JSON string into a DataFrame
    _data_js = spark_session.read.schema(schema).json(
        sc.parallelize(_data_js_string))

    ## The method under test expects the DataFrame obtained after parsing the string as JSON.
    ## exploding_data_frame performs two explodes so that each product in the nested arrays gets its own row.
    _dato_calculado = exploding_data_frame(_data_js)

    ## Build the expected DataFrame from the data sent to the method:
    _dato_esperado = [("3", ["2", "Harina", "1500"]),
                      ("3", ["5", "Arroz", "1000"]),
                      ("3", ["4", "Frijoles", "800"]),
                      ("3", ["7", "Manzana", "500"]),
                      ("3", ["2", "JugoNaranja", "1800"]),
                      ("3", ["6", "Carbon", "1500"]),
                      ("3", ["3", "Pera", "400"])]
    schema = StructType([
        StructField("numero_caja", StringType()),
        StructField(
            "col",
            StructType([
                StructField("cantidad", StringType()),
                StructField("nombre", StringType()),
                StructField("precio_unitario", StringType())
            ]))
    ])

    ## The expected DataFrame
    _dato_esperado_df = spark_session.createDataFrame(_dato_esperado, schema)

    _dato_esperado_df.show()
    _dato_calculado.show()
    assert _dato_esperado_df.collect() == _dato_calculado.collect()
spark = SparkSession.builder.getOrCreate()

# Omit all logs except errors
spark.sparkContext.setLogLevel('ERROR')

# Read each file in cricket folder as a separate record
rdd = spark.sparkContext.wholeTextFiles('/user/root/Final/cricket/')

# Suppress hortonworks path prefix from filename and create
# data frame with 2 columns ('doc' and 'text')
data = rdd.map(lambda x: (x[0].replace('hdfs://sandbox-hdp.hortonworks.com:8020', ''), x[1])).toDF(['doc', 'text'])

# Get total document count
total_docs = data.count()


# utility method for tokenizing a piece of text
def tokenize(text):
    return re.findall('\\w+', text.lower())


# Register the tokenize method as a udf
tokenize_udf = F.udf(tokenize, ArrayType(StringType()))

# tokenize all the text
data = data.select(['doc', tokenize_udf('text').alias('text')])

# make 1 separate row for each token
data_tokens = data.withColumn("token", F.explode('text'))

# calculate term frequency
tf = data_tokens.groupBy('doc', 'token').agg(F.count('text').alias('tf'))

# calculate document frequency
df = data_tokens.groupBy('token').agg(F.countDistinct('doc').alias('df'))


# utility method for calculating inverse document frequency
def inverse_doc_frequency(doc_frequency):
    return math.log((total_docs + 1) * 1.0 / (doc_frequency + 1))


# register inverse document frequency as a udf
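The snippet breaks off at the UDF registration comment; a rough sketch of how that step and the TF-IDF join could continue is below. The continuation is an assumption for illustration, not the original code.

from pyspark.sql.types import DoubleType

# Assumed continuation: register the IDF helper as a UDF ...
idf_udf = F.udf(inverse_doc_frequency, DoubleType())

# ... then join term frequency with document frequency and compute tf-idf per (doc, token)
tf_idf = tf.join(df, on='token') \
    .withColumn('idf', idf_udf(F.col('df'))) \
    .withColumn('tf_idf', F.col('tf') * F.col('idf'))
tf_idf.show(10)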
predictions = model.transform(test) evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction") rmse = evaluator.evaluate(predictions) print("Root-mean-square error = " + str(rmse)) # Generate top 10 movie recommendations for each user userRecs = model.recommendForAllUsers(10) # Cast Float to Double (Float is not supported by the Mongo connector) userRecs = userRecs.withColumn( 'recommendations', userRecs['recommendations'].cast( ArrayType( StructType([ StructField('movie_id', IntegerType()), StructField('rating', DoubleType()) ])))) # Write recommendations to the DB userRecs.write.format("com.mongodb.spark.sql.DefaultSource").options( uri=uri, collection="user_recommendations").mode("overwrite").save() # Generate top 10 user recommendations for each movie movieRecs = model.recommendForAllItems(10) # Generate top 10 movie recommendations for a specified set of users users = ratings.select(als.getUserCol()).distinct().limit(3) userSubsetRecs = model.recommendForUserSubset(users, 10) # Generate top 10 user recommendations for a specified set of movies movies = ratings.select(als.getItemCol()).distinct().limit(3)
def build_passage_to_entity_maps(content_path, spark, max_rank, dir_path, dataset_metadata=dataset_metadata): """" """ df = spark.read.parquet(content_path) df.printSchema() @udf(returnType=ArrayType(StringType())) def get_ents(content_bytearray): synthetic_entity_links = document_pb2.DocumentContent().FromString( pickle.loads(content_bytearray)).synthetic_entity_links entity_links = [] for synthetic_entity_link in synthetic_entity_links: entity_links.append(str(synthetic_entity_link.entity_id)) return entity_links df_entity = df.withColumn("entities", get_ents("content_bytearray")) df_entity.printSchema() for dataset in ['dev', 'train', 'test']: dateset_dir = dir_path + '{}_data/'.format(dataset) passage_name = 'passage' + '_{}'.format(dataset) passage_path = dataset_metadata[passage_name][0] print('================================') print('Building passage->entity mappings for {}: {}'.format( dataset, passage_path)) run_dict = {} doc_ids_list = [] with open(passage_path, 'r') as f: for line in f: query = line.split()[0] doc_id = line.split()[2] rank = int(line.split()[3]) if rank <= max_rank: if query not in run_dict: run_dict[query] = [] run_dict[query].append(doc_id) doc_ids_list.append(doc_id) query_list = sorted(list(run_dict.keys())) doc_ids_list = list(set(doc_ids_list)) print("doc_ids_list len = {}".format(len(doc_ids_list))) dataset_df = df_entity[df_entity['content_id'].isin( doc_ids_list)].select("content_id", "entities") print("dataset_map len = {}".format(dataset_df.count())) print(dataset_df.head()) dataset_dict = {} for row in dataset_df.collect(): dataset_dict[row[0]] = row[1] print("dataset_dict len = {}".format(len(dataset_dict))) write_json_path = dateset_dir + 'passage_to_entity.json' print('writing to: {}'.format(write_json_path)) with open(write_json_path, 'w') as f: json.dump(dataset_dict, f, indent=4)
input_path = sys.argv[1] output_path = sys.argv[2] df = spark.read.csv(input_path, header=True, inferSchema=True) names = df.columns import pandas as pd from pyspark.sql.functions import col, pandas_udf, size from pyspark.sql.types import DoubleType, ArrayType def predict(*series) -> pd.Series: import pandas as pd import numpy as np from numpy import nan from scipy.special._ufuncs import expit from scoring_h2oai_experiment_336ccd12_cbb4_11ea_8496_ac1f6b68b7be import Scorer # update with your key scorer = Scorer() merged = pd.concat(series, axis=1) merged.columns = names output = scorer.score_batch(merged) return pd.Series(output.values.tolist()) predict_udf = pandas_udf(predict, returnType=ArrayType(DoubleType())) columns = [col(name) for name in df.columns] withPredictions = df.withColumn("prediction", predict_udf(*columns)) # If working with multi-class, can expand prediction, e.g. 3 classes: num_cols = withPredictions.withColumn("size", size(col("prediction"))).agg({"size": "max"}).head()[0] # To be performant, specify the value, e.g. num_cols=3 withPredictions = withPredictions.select(col("*"), *(col('prediction').getItem(i).alias(f'prediction_{i}') for i in range(num_cols))) withPredictions = withPredictions.drop(col("prediction"))
def get_schema(schema_name): schema = None if schema_name == 'interim_parkingbay_schema': schema = StructType([ StructField('bay_id', IntegerType(), False), StructField('last_edit', StringType()), StructField('marker_id', StringType()), StructField('meter_id', StringType()), StructField('rd_seg_id', StringType()), StructField('rd_seg_dsc', StringType()), StructField( 'the_geom', StructType([ StructField( 'coordinates', ArrayType(ArrayType(ArrayType(ArrayType( DoubleType()))))), StructField('type', StringType()) ])), StructField('load_id', StringType()), StructField('loaded_on', TimestampType()) ]) elif schema_name == 'interim_sensor': schema = StructType([ StructField('bay_id', IntegerType(), False), StructField('st_marker_id', StringType()), StructField('lat', FloatType()), StructField('lon', FloatType()), StructField( 'location', StructType([ StructField('coordinates', ArrayType(DoubleType())), StructField('type', StringType()) ]), False), StructField('status', StringType()), StructField('load_id', StringType()), StructField('loaded_on', TimestampType()) ]) elif schema_name == 'dw_dim_parking_bay': schema = StructType([ StructField('dim_parking_bay_id', StringType(), False), StructField('bay_id', IntegerType(), False), StructField('marker_id', StringType()), StructField('meter_id', StringType()), StructField('rd_seg_id', StringType()), StructField('rd_seg_dsc', StringType()), StructField( 'the_geom', StructType([ StructField( 'coordinates', ArrayType(ArrayType(ArrayType(ArrayType( DoubleType()))))), StructField('type', StringType()) ])), StructField('load_id', StringType()), StructField('loaded_on', TimestampType()) ]) elif schema_name == 'dw_dim_location': schema = StructType([ StructField('dim_location_id', StringType(), False), StructField( 'location', StructType([ StructField('coordinates', ArrayType(DoubleType())), StructField('type', StringType()) ]), False), StructField('lat', FloatType()), StructField('lon', FloatType()), StructField('load_id', StringType()), StructField('loaded_on', TimestampType()) ]) elif schema_name == 'dw_dim_st_marker': schema = StructType([ StructField('dim_st_marker_id', StringType(), False), StructField('st_marker_id', StringType()), StructField('load_id', StringType()), StructField('loaded_on', TimestampType()) ]) return schema
from pyspark.sql import Window
import pyspark.sql.functions as F
from pyspark.ml.feature import CountVectorizer
from pyspark.sql.types import FloatType
from pyspark.sql.types import ArrayType, StructType, StructField, IntegerType, StringType, BooleanType
import numpy as np
from operator import add
from functools import reduce


@F.udf(ArrayType(StructType([
    # Adjust types to reflect data types
    StructField("item0", StringType()),
    StructField("item1", IntegerType()),
    StructField("item2", FloatType())
])))
def ImpPrice(imp, price):
    imp_rank = range(len(imp))
    price = np.array(price).astype(float).tolist()
    return zip(imp, imp_rank, price)


def getPriceImpressionRank():
    funcs = []
    for col in ["price", 'imp_rank']:
        for func in [F.min, F.max, F.mean, F.stddev]:
            funcs.append(func(col).alias(col + "_" + func.func_name))
    funcs.append(F.count("price").alias('impression_freqs'))