def group_by_grid_square_and_tokenize(spark_session, latlongrid, tweets_df):
    """Calculates the grid square id from 'lat' and 'lon' columns in tweets_df,
    and then groups the tweets by grid square. Tweets are tokenized.

    Returned dataframe has columns ['grid_square', 'tokens'], where 'tokens' is a
    list of all tokens from every tweet within an entry's 'grid_square'.

    Args:
        spark_session -- An active SparkSession.
        latlongrid -- A LatLonGrid object.
        tweets_df -- A dataframe with columns ['lat', 'lon', 'tweet'] of types
            [DoubleType, DoubleType, StringType].
    """
    # Tokenize each tweet with a UDF returning an array of string tokens.
    sql_tokenize = functions.udf(
        lambda tweet: twokenize.tokenize(tweet),
        returnType=types.ArrayType(types.StringType()))
    tweets_df = (tweets_df
                 .withColumn('tweet_tokens', sql_tokenize(tweets_df['tweet']))
                 .drop('tweet'))

    # Map each row to (grid_square_index, tokens) and concatenate token lists per key.
    row_to_gridsquare_tokens = lambda row: (
        latlongrid.grid_square_index(lat=row['lat'], lon=row['lon']),
        row['tweet_tokens'])
    tokens_rdd = (tweets_df.rdd
                  .map(row_to_gridsquare_tokens)
                  .reduceByKey(operator.concat))

    tokens_df_schema = types.StructType([
        types.StructField('grid_square', types.IntegerType()),
        types.StructField('tokens', types.ArrayType(types.StringType()))
    ])
    tokens_df = spark_session.createDataFrame(tokens_rdd, schema=tokens_df_schema)
    return tokens_df
def frontend_result(sc, dataframe, buckets=20, prediction_col='prediction'):
    n_buckets = sc.broadcast(buckets)
    buckets_list_udf = F.udf(
        f=lambda dist, ratio, boundary: ShowResults.make_buckets(
            distances=dist, ratio=ratio, boundary=boundary,
            n_buckets=n_buckets.value),
        returnType=T.ArrayType(
            elementType=T.ArrayType(elementType=T.IntegerType(), containsNull=True),
            containsNull=True))

    tmp = (dataframe
           .groupBy(prediction_col, F.col('computed_boundary'))
           .agg(F.min('distance').alias('min'),
                F.max('distance').alias('max'),
                F.sum('is_outlier').alias('n_outliers'),
                F.collect_list('distance').alias('distances'))
           .withColumn(colName='ratio', col=F.col('max') / n_buckets.value)
           .withColumn(colName='buckets',
                       col=buckets_list_udf('distances', 'ratio', 'computed_boundary')))

    return tmp.select(prediction_col, 'buckets')
def calc(df):
    ## function to calculate the approximating function and its derivative
    def foo(x, y):
        y_arr = np.array(y)
        gy = g(y_arr)
        gp = gprime(y_arr)
        x_arr = np.array(x)
        res = np.outer(gy, x_arr)
        return [res.flatten().tolist(), gp.tolist()]

    udf_foo = f.udf(foo, t.ArrayType(t.ArrayType(t.DoubleType())))

    df2 = df.withColumn("vals", udf_foo("features", "Y"))
    df2 = df2.select("id",
                     f.col("vals").getItem(0).alias("gy"),
                     f.col("vals").getItem(1).alias("gy_"))
    GY_ = np.array(
        df2.agg(f.array([f.sum(f.col("gy")[i]) for i in range(n_comp ** 2)]))
        .collect()[0][0]).reshape(n_comp, n_comp) / num_rows
    GY_AVG_V = np.array(
        df2.agg(f.array([f.avg(f.col("gy_")[i]) for i in range(n_comp)]))
        .collect()[0][0]).reshape(n_comp, 1) * V
    return (GY_, GY_AVG_V)
def test_undefined_field(self):
    with six.assertRaisesRegex(self, KeyError, 'f2'):
        schema_has(
            T.StructType([T.StructField('f1', T.IntegerType())]),
            T.StructType([T.StructField('f2', T.LongType())]),
        )

    with six.assertRaisesRegex(self, KeyError, r'f1\.element\.s2'):
        schema_has(
            T.StructType([
                T.StructField(
                    'f1',
                    T.ArrayType(T.StructType([T.StructField('s1', T.IntegerType())])),
                ),
            ]),
            T.StructType([
                T.StructField(
                    'f1',
                    T.ArrayType(T.StructType([T.StructField('s2', T.LongType())])),
                ),
            ]),
        )

    with six.assertRaisesRegex(self, TypeError,
                               'element is IntegerType, expected LongType'):
        schema_has(
            T.ArrayType(T.IntegerType()),
            T.ArrayType(T.LongType()),
        )
def sum_word_vectors(urls_and_weighted_word_vectors: DataFrame) -> DataFrame:
    """
    Sums weighted word vectors and their corresponding coefficients for each URL.
    :param urls_and_weighted_word_vectors: A DataFrame of URLs and weighted word
        vectors with columns: id, url, pos, word, weighted_word_vector, coefficient.
    :return: A DataFrame of URLs and their corresponding sum of word vectors and
        sum of coefficients with columns: id, url, split_url, coefficients,
        summed_vectors, summed_coefficients.
    """
    word_array_sorter_udf = F.udf(
        URLVectorCalculator.sort_list_of_2_tuples_by_0th_item,
        T.ArrayType(T.StringType()))
    coefficient_array_sorter_udf = F.udf(
        URLVectorCalculator.sort_list_of_2_tuples_by_0th_item,
        T.ArrayType(T.DoubleType()))
    vector_size = len(
        urls_and_weighted_word_vectors.select('weighted_word_vector').first()[0])
    return urls_and_weighted_word_vectors \
        .groupBy("id", "url") \
        .agg(F.collect_list(F.struct("pos", "word")).alias("positions_and_words"),
             F.collect_list(F.struct("pos", "coefficient")).alias("positions_and_coefficients"),
             F.sum("coefficient").alias("summed_coefficients"),
             F.array(*[F.sum(F.col("weighted_word_vector")[i])
                       for i in range(vector_size)]).alias("summed_vectors")) \
        .select("id", "url", "summed_coefficients", "summed_vectors",
                word_array_sorter_udf("positions_and_words").alias("split_url"),
                coefficient_array_sorter_udf("positions_and_coefficients").alias("coefficients"))
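# Hedged mini-sketch (not part of the original module): the element-wise array sum
# used above -- F.array(*[F.sum(F.col(c)[i]) for i in range(n)]) -- shown on a toy
# DataFrame. The names `spark` and `toy_df` are assumptions for illustration only.
toy_df = spark.createDataFrame(
    [("a", [1.0, 2.0]), ("a", [3.0, 4.0]), ("b", [5.0, 6.0])],
    schema="key string, vec array<double>")
summed = toy_df.groupBy("key").agg(
    F.array(*[F.sum(F.col("vec")[i]) for i in range(2)]).alias("summed_vec"))
# key "a" -> [4.0, 6.0], key "b" -> [5.0, 6.0]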
def get_df_schema(self):
    return tp.StructType([
        tp.StructField('added_date', tp.DateType(), True),
        tp.StructField('release_year', tp.IntegerType(), True),
        tp.StructField('title', tp.StringType(), False),
        tp.StructField('director', tp.StringType(), True),
        tp.StructField('type', tp.StringType(), False),
        tp.StructField('duration', tp.StringType(), True),
        tp.StructField('description', tp.StringType(), True),
        tp.StructField(
            'comments',
            tp.ArrayType(
                tp.StructType([
                    tp.StructField('body', tp.StringType(), True),
                    tp.StructField('author', tp.StringType(), True),
                    tp.StructField('created_utc', tp.TimestampType(), True),
                    tp.StructField('score', tp.IntegerType(), True),
                    tp.StructField('sentiment', tp.StringType(), True),
                    tp.StructField('description_word', tp.StringType(), True),
                    tp.StructField('source', tp.StringType(), True)
                ])), True),
        tp.StructField(
            'actors',
            tp.ArrayType(
                tp.StructType(
                    [tp.StructField('name', tp.StringType(), True)])), True)
    ])
def hook_spark_pipeline_init(self, sc, sqlc, schema, indexer):
    if self.include_external:
        schema.append(
            SparkTypes.StructField(
                "external_links",
                SparkTypes.ArrayType(
                    SparkTypes.StructType([
                        SparkTypes.StructField("href", SparkTypes.StringType(),
                                               nullable=False),
                        SparkTypes.StructField("text", SparkTypes.StringType(),
                                               nullable=True)
                    ])),
                nullable=True))
    if self.include_internal:
        schema.append(
            SparkTypes.StructField(
                "internal_links",
                SparkTypes.ArrayType(
                    SparkTypes.StructType([
                        SparkTypes.StructField("path", SparkTypes.StringType(),
                                               nullable=False),
                        SparkTypes.StructField("text", SparkTypes.StringType(),
                                               nullable=True)
                    ])),
                nullable=True))
def main():
    #input_comments = '/Users/Mehvish/Documents/SFU/BigDataLab/Metabot/comments/RC_2016-01-aaaa.json.gz'
    change_to_str = F.udf(to_text, returnType=types.ArrayType(types.StringType()))
    sub_comments = spark.read.json(input_comments, schema=comments_schema).repartition(500)
    comm = sub_comments.select(
        sub_comments['subreddit'].alias('id'),
        sub_comments['body'].alias('comments'),
        sub_comments['ups'].alias('ups')
    )  #.where(sub_comments['subreddit'] == 'AskReddit').limit(10)

    preprocess = F.udf(clean_data, returnType=types.ArrayType(types.StringType()))
    comm_cleaned = comm.select(comm['id'],
                               preprocess(comm['comments']).alias('comments'),
                               comm['ups'])
    #comm_cleaned.show(truncate=False)

    subreddit_group = comm_cleaned.groupBy(comm_cleaned['id']) \
        .agg(change_to_str(F.collect_list('comments')).alias('comments'),
             F.sum('ups').alias('ups'),
             F.count('id').alias('count')) \
        .select('id', 'comments', 'ups', 'count')
    #subreddit_group.show(20, False)
    #print("done")

    subreddit_group.write.format('parquet').save(output, mode='overwrite')
def main():
    spark.sql("CLEAR CACHE")
    business = spark.read.parquet("yelp-etl/business_etl").repartition(8)
    business.createOrReplaceTempView("business")
    review = spark.read.parquet("yelp-etl/review_etl").repartition(16)  #.cache()
    review.createOrReplaceTempView("review")

    ## Location based reviews
    # spark.sql("SELECT b.state, COUNT(*) AS bus_rev_count FROM business b INNER JOIN review r ON b.business_id = r.business_id GROUP BY b.state ORDER BY bus_rev_count DESC").show()

    ## Choosing reviews from Pennsylvania (state = "PA")
    pa_bus_rev = spark.sql("SELECT r.review_id, b.business_id, r.text, r.label FROM business b INNER JOIN review r ON b.business_id = r.business_id WHERE b.state = 'PA' AND r.label = 1")

    ## Remove punctuations and spaces
    punct_remover = functions.udf(lambda x: remove_punct(x))
    review_df = pa_bus_rev.select('review_id', 'business_id', punct_remover('text')) \
        .withColumnRenamed('<lambda>(text)', 'text')

    ## Tokenize
    tok = Tokenizer(inputCol="text", outputCol="words")

    ## Remove stop words
    stopwordList = [
        '', 'i', 'get', 'got', 'also', 'really', 'would', 'one', 'good', 'like',
        'great', 'tri', 'love', 'two', 'three', 'took', 'awesome', 'me', 'bad',
        'horrible', 'disgusting', 'terrible', 'fabulous', 'amazing', 'terrific',
        'worst', 'best', 'fine', 'excellent', 'acceptable', 'my', 'exceptional',
        'satisfactory', 'satisfying', 'super', 'awful', 'atrocious', 'unacceptable',
        'poor', 'sad', 'gross', 'authentic', 'myself', 'cheap', 'expensive', 'we',
        'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves',
        'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its',
        'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
        'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was',
        'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
        'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as',
        'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against',
        'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below',
        'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
        'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
        'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some',
        'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too',
        'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll',
        'm', 'o', 're', 've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn',
        'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn',
        'wasn', 'weren', 'won', 'wouldn']
    stopword_rm = StopWordsRemover(inputCol="words", outputCol="words_nsw",
                                   stopWords=stopwordList)

    pipestages = [tok, stopword_rm]
    pipeline = Pipeline(stages=pipestages)
    model = pipeline.fit(review_df)
    tokenized_df = model.transform(review_df)

    ## Lemmatizing
    lemmatize_udf = functions.udf(lambda x: lemmatize(x),
                                  types.ArrayType(types.StringType()))
    lemmatized_df = tokenized_df.withColumn("lemmatized", lemmatize_udf("words_nsw")) \
        .select("review_id", "business_id", "lemmatized")

    ## Stemming
    stemmer_udf = functions.udf(lambda x: stem(x),
                                types.ArrayType(types.StringType()))
    stemmed_df = lemmatized_df.withColumn("stemmed", stemmer_udf("lemmatized")) \
        .drop(lemmatized_df["lemmatized"])

    ## Count Vectorizer
    cv = CountVectorizer(inputCol="stemmed", outputCol="vectors")
    cv_model = cv.fit(stemmed_df)
    cv_df = cv_model.transform(stemmed_df).drop(stemmed_df["stemmed"])
    cv_model.save("topic_modelling/cvmodel_pos")

    idf = IDF(inputCol="vectors", outputCol="tfidf")
    idf_model = idf.fit(cv_df)
    result = idf_model.transform(cv_df)
    result = result.select("review_id", "business_id", "tfidf")

    lda = LDA(featuresCol='tfidf', k=5, seed=42, maxIter=50)
    model = lda.fit(result)
    model.write().overwrite().save("topic_modelling/ldamodel_pos")
    transformed = model.transform(result)
    transformed.write.parquet("topic_modelling/review_topics_pos", mode="overwrite")

    spark.stop()
def get_resume_er_schema():
    return types.StructType([
        types.StructField('id', types.LongType(), nullable=False),
        types.StructField('job_title', types.ArrayType(types.StringType()),
                          nullable=False),
        types.StructField('job_details', types.ArrayType(types.StringType()),
                          nullable=False),
    ])
def make_img_df(sqlContext, keys_rdd):
    kmeta_df = sqlContext.createDataFrame(keys_rdd.map(lambda x: x._asdict()))
    # applying python functions to DataFrames is more difficult and requires using typed UDFs
    twod_arr_type = sq_types.ArrayType(sq_types.ArrayType(sq_types.IntegerType()))
    # the pull_input_tile function is wrapped into a udf so it can be applied to create the new image column
    # numpy data is not directly supported and typed arrays must be used instead, therefore we run the .tolist command
    pull_tile_udf = F.udf(lambda x: pull_input_tile(x_to_tile(x)).tolist(),
                          returnType=twod_arr_type)
    kimg_df = kmeta_df.withColumn('Image', pull_tile_udf(kmeta_df['x']))
    s_query = kimg_df.where(kimg_df['x'] > 99)
    return s_query.show()
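# Hedged illustration (not from the original module): the .tolist() pattern noted
# above, shown standalone. A UDF must return plain Python lists to match
# ArrayType(ArrayType(IntegerType())); returning a numpy array directly does not
# serialize. `sqlContext` and `np_identity_udf` are assumed names.
import numpy as np
np_identity_udf = F.udf(lambda n: np.eye(n, dtype=int).tolist(),
                        returnType=sq_types.ArrayType(
                            sq_types.ArrayType(sq_types.IntegerType())))
demo_df = sqlContext.createDataFrame([(2,), (3,)], ['n'])
demo_df.withColumn('Image', np_identity_udf('n')).show(truncate=False)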
def __init__(self, configuration: StatsExtractionConfig):
    self._filters = FilterTypesEnum
    self._columns = DataframeColumnsEnum
    self._stats = StatsExtractionEnum
    self._purging = PurgingEnum
    self._configuration = configuration

    standardisation_config_dict = self._configuration.standardisation_config
    standardisation_config = [
        FilterConfiguration(name=name, parameters=params)
        for name, params in standardisation_config_dict.items()
    ]
    dec_separator = self._stats.DECORATION_SEPARATOR_TOKEN
    attachment_token = self._stats.ATTACHMENT_POINT_TOKEN

    self._mol_wts_udf = psf.udf(
        lambda x: ExactMolWt(Chem.MolFromSmiles(x)), pst.FloatType())
    self._num_rings_udf = psf.udf(
        lambda x: rdMolDescriptors.CalcNumRings(Chem.MolFromSmiles(x)),
        pst.IntegerType())
    self._num_atoms_udf = psf.udf(
        lambda x: Chem.MolFromSmiles(x).GetNumHeavyAtoms(), pst.IntegerType())
    self._num_aromatic_rings_udf = psf.udf(
        lambda x: rdMolDescriptors.CalcNumAromaticRings(Chem.MolFromSmiles(x)),
        pst.IntegerType())
    self._hbond_donors_udf = psf.udf(
        lambda x: rdMolDescriptors.CalcNumHBD(Chem.MolFromSmiles(x)),
        pst.IntegerType())
    self._hbond_acceptors_udf = psf.udf(
        lambda x: rdMolDescriptors.CalcNumHBA(Chem.MolFromSmiles(x)),
        pst.IntegerType())
    self._hetero_atom_ratio_udf = psf.udf(
        lambda x: len([
            atom for atom in Chem.MolFromSmiles(x).GetAtoms()
            if atom.GetAtomicNum() == 6
        ]) / Chem.MolFromSmiles(x).GetNumHeavyAtoms(), pst.FloatType())
    self._make_canonical_udf = psf.udf(
        lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x)), pst.StringType())
    self._standardise_smiles_udf = psf.udf(
        lambda x: RDKitStandardizer(standardisation_config, None).apply_filter(x),
        pst.StringType())

    pattern = self._stats.REGEX_TOKENS
    self.regex = re.compile(pattern)
    self._tokeniser_udf = psf.udf(self.regex.findall,
                                  pst.ArrayType(pst.StringType()))
    self._decoration_split_udf = psf.udf(lambda x: x.split(dec_separator),
                                         pst.ArrayType(pst.StringType()))
    self._count_decorations_udf = psf.udf(
        lambda s: list(s).count(attachment_token), pst.IntegerType())
def create_credits_dataframe():
    creditsDF = (spark.read.csv(
        "/Users/butterflyeffect/Downloads/tmdb-5000-movie-dataset/tmdb_5000_credits.csv",
        header=True,
        quote='"',
        escape='"',
    ))
    # Define non string columns into their corresponding datatypes
    credits_cols = {
        "movie_id": T.IntegerType,
    }
    # Define json columns into their corresponding types
    credits_json_cols = {
        "cast": T.ArrayType(
            T.StructType([
                T.StructField("cast_id", T.IntegerType()),
                T.StructField("character", T.StringType()),
                T.StructField("credit_id", T.StringType()),
                T.StructField("gender", T.IntegerType()),
                T.StructField("id", T.IntegerType()),
                T.StructField("name", T.StringType()),
                T.StructField("order", T.IntegerType()),
            ])),
        "crew": T.ArrayType(
            T.StructType([
                T.StructField("credit_id", T.StringType()),
                T.StructField("department", T.StringType()),
                T.StructField("gender", T.IntegerType()),
                T.StructField("id", T.IntegerType()),
                T.StructField("job", T.StringType()),
                T.StructField("name", T.StringType()),
            ])),
    }
    for col, schema in credits_cols.items():
        creditsDF = creditsDF.withColumn(col, F.col(col).astype(schema()))
    for col, schema in credits_json_cols.items():
        creditsDF = creditsDF.withColumn(col, F.from_json(col, schema))
    # Validate Schema
    # creditsDF.printSchema()
    # Validate column names and types
    # print (creditsDF.columns)
    # print (creditsDF.dtypes)
    # Validate Rows
    # creditsDF.show(2, False)
    return creditsDF
def as_spark_type(tpe) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    # TODO: Add "boolean" and "string" types.
    # ArrayType
    if tpe in (np.ndarray,):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, list):
        return types.ArrayType(as_spark_type(tpe.__args__[0]))
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date,):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal,):
        # TODO: considering about the precision & scale for decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int, np.int64, "int", "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampType()
    else:
        raise TypeError("Type %s was not understood." % tpe)
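# Hedged usage sketch (illustration only, derived from the branches above; assumes
# Python 3.7+ typing semantics so that List[float].__origin__ is `list`):
from typing import List

assert as_spark_type(int) == types.LongType()
assert as_spark_type(float) == types.DoubleType()
assert as_spark_type(List[float]) == types.ArrayType(types.DoubleType())
assert as_spark_type(np.ndarray) == types.ArrayType(types.StringType())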
def get_schema(data_type=None):
    type_lib = {
        'int': types.IntegerType(),
        'float': types.FloatType(),
        'str': types.StringType(),
        'dt': types.TimestampType(),
        'arr_int': types.ArrayType(types.IntegerType()),
        'arr_float': types.ArrayType(types.FloatType()),
        'arr_str': types.ArrayType(types.StringType()),
    }
    return types.StructType([
        types.StructField(fname, type_lib[ftype])
        for fname, ftype in cols_dict[data_type]
    ])
def streaming_sent(dfX):
    # apply sentiment analysis to text stream
    df = pipeline.transform(dfX)

    # select sentiment column from pipeline output
    df = df.select('sentiment.result', "sentiment.metadata") \
        .withColumn('result', F.concat_ws(',', 'result')) \
        .withColumn("result", regexp_replace('result', "positive", '1')) \
        .withColumn("result", regexp_replace('result', "na", '0')) \
        .withColumn("result", regexp_replace('result', "negative", '-1')) \
        .select(F.split('result', ',').alias('sents'), 'metadata')

    # Convert datatypes
    mapper = F.udf(lambda x: [i['confidence'] for i in x],
                   T.ArrayType(T.StringType()))
    df = df.withColumn("metadata", mapper('metadata'))
    df = df.withColumn("metadata", df.metadata.cast("array<float>"))

    # Compute column product
    df_product = df.withColumn(
        "product",
        F.expr("transform(arrays_zip(sents, metadata), x -> x.sents * x.metadata)"))

    # Average array
    array_mean = F.udf(lambda x: float(np.mean(x)), T.FloatType())
    sent_df = df_product.select(array_mean("product").alias("value"))

    return sent_df
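# Hedged mini-demo (not part of the original function): the higher-order
# `transform(arrays_zip(...))` expression above computes an element-wise product of
# two array columns (available in Spark SQL from 2.4). `spark` is an assumed
# SparkSession; the data is illustrative.
pair_df = spark.createDataFrame(
    [([1.0, -1.0, 0.0], [0.9, 0.8, 0.5])],
    schema="sents array<double>, metadata array<double>")
pair_df.withColumn(
    "product",
    F.expr("transform(arrays_zip(sents, metadata), x -> x.sents * x.metadata)")
).show(truncate=False)
# -> product = [0.9, -0.8, 0.0]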
def test_generated_rings(self):
    num_samples = 500
    # make a simple unit circle
    theta = np.linspace(0, 2 * np.pi, num_samples)
    X1 = np.random.rand(num_samples, 2) + np.transpose(
        [0.5 * np.cos(theta), 0.5 * np.sin(theta)])
    X2 = np.random.rand(num_samples, 2) + np.transpose(
        [5 * np.cos(theta), 5 * np.sin(theta)])
    X = np.concatenate([X1, X2])

    db = DBSCAN(eps=0.3, min_samples=5).fit(X)
    labels = db.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)

    labels_spark = np.zeros_like(db.labels_)
    labels_spark[:] = -1

    data = [(i, [float(item) for item in X[i]]) for i in range(X.shape[0])]
    schema = T.StructType([
        T.StructField("id", T.IntegerType(), False),
        T.StructField("value", T.ArrayType(T.FloatType()), False)
    ])
    df = self.spark.createDataFrame(data, schema=schema)

    df_clusters = dbscan.process(self.spark, df, .3, 5, dist, 2, "checkpoint")
    out = df_clusters.distinct().collect()
    for item in out:
        labels_spark[item.point] = item.component

    n_clusters_spark_ = len(set(labels_spark)) - (1 if -1 in labels else 0)
    n_noise_spark_ = list(labels_spark).count(-1)

    self.assertEqual(n_clusters_, n_clusters_spark_)
    self.assertEqual(n_noise_, n_noise_spark_)
def test_generated_blobs(self):
    centers = [[1, 1], [-1, -1], [1, -1]]
    # with following data operations with sklearn dbscan 750*749/2 = 280875
    # for spark 149716 (.2) 217624 (0.3)
    X, labels_true = make_blobs(n_samples=750, centers=centers,
                                cluster_std=0.4, random_state=5)

    db = DBSCAN(eps=0.2, min_samples=10).fit(X)
    labels = db.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)

    labels_spark = np.zeros_like(db.labels_)
    labels_spark[:] = -1

    data = [(i, [float(item) for item in X[i]]) for i in range(X.shape[0])]
    schema = T.StructType([
        T.StructField("id", T.IntegerType(), False),
        T.StructField("value", T.ArrayType(T.FloatType()), False)
    ])
    df = self.spark.createDataFrame(data, schema=schema)

    df_clusters = dbscan.process(self.spark, df, .2, 10, dist, 2, "checkpoint")
    out = df_clusters.distinct().collect()
    for item in out:
        labels_spark[item.point] = item.component

    n_clusters_spark_ = len(set(labels_spark)) - (1 if -1 in labels else 0)
    n_noise_spark_ = list(labels_spark).count(-1)

    self.assertEqual(n_clusters_, n_clusters_spark_)
    self.assertEqual(n_noise_, n_noise_spark_)
def getRedditDataFrameSchema(self):
    return tp.StructType([
        tp.StructField('show_title', tp.StringType(), True),
        tp.StructField('show_director', tp.StringType(), True),
        tp.StructField('submission_id', tp.StringType(), True),
        tp.StructField('source', tp.StringType(), True),
        tp.StructField('title', tp.StringType(), True),
        tp.StructField('description', tp.StringType(), True),
        tp.StructField('created_utc', tp.TimestampType(), True),
        tp.StructField('author', tp.StringType(), True),
        tp.StructField('score', tp.IntegerType(), True),
        tp.StructField('spoiler', tp.BooleanType(), True),
        tp.StructField('is_original_content', tp.BooleanType(), True),
        tp.StructField('distinguished', tp.StringType(), True),
        tp.StructField('link', tp.StringType(), True),
        tp.StructField(
            'comments',
            tp.ArrayType(
                tp.StructType([
                    tp.StructField('comment_id', tp.StringType(), True),
                    tp.StructField('body', tp.StringType(), True),
                    tp.StructField('created_utc', tp.TimestampType(), True),
                    tp.StructField('score', tp.IntegerType(), True),
                    tp.StructField('parent_id', tp.StringType(), True),
                    tp.StructField('submission_id', tp.StringType(), True)
                ])), True)
    ])
def test_group_by(self):
    df = self.spark.createDataFrame(
        data=[
            ('k4', 'k14', [1, 14, 141]),
            ('k1', 'k12', [1, 12, 121]),
            ('k1', 'k11', [1, 11, 111]),
            ('k1', 'k13', [1, 13, 131]),
        ],
        schema=T.StructType([
            T.StructField('key_1', T.StringType()),
            T.StructField('key_2', T.StringType()),
            T.StructField('aux_data', T.ArrayType(T.IntegerType())),
        ])
    )

    df.write_ext.by_url(
        'redis://redis.docker?keyBy=key_1&groupByKey=true&maxPipelineSize=2'
    )

    redis_client = redis.StrictRedis('redis.docker')
    self.assertRowsEqual(redis_client.keys(), [b'k1', b'k4'], ignore_order=True)

    written_data = [json.loads(redis_client.get(key)) for key in [b'k1', b'k4']]
    expected = [
        [
            {'key_1': 'k1', 'key_2': 'k11', 'aux_data': [1, 11, 111]},
            {'key_1': 'k1', 'key_2': 'k12', 'aux_data': [1, 12, 121]},
            {'key_1': 'k1', 'key_2': 'k13', 'aux_data': [1, 13, 131]},
        ],
        [{'key_1': 'k4', 'key_2': 'k14', 'aux_data': [1, 14, 141]}],
    ]
    self.assertRowsEqual(written_data, expected, ignore_order=True)
def test_redis_client_init(self):
    df = self.spark.createDataFrame(
        data=[
            ('k1', 'k14', [1, 14, 141]),
        ],
        schema=T.StructType([
            T.StructField('key_1', T.StringType()),
            T.StructField('key_2', T.StringType()),
            T.StructField('aux_data', T.ArrayType(T.IntegerType())),
        ])
    )

    df.write_ext.redis(
        key_by=['key_2'],
        max_pipeline_size=3,
        redis_client_init=partial(redis.StrictRedis, 'redis.docker'),
    )

    redis_client = redis.StrictRedis('redis.docker')
    self.assertEqual(redis_client.keys(), [b'k14'])

    written_data = json.loads(redis_client.get('k14'))
    expected = {'key_1': 'k1', 'key_2': 'k14', 'aux_data': [1, 14, 141]}
    self.assertEqual(written_data, expected)
def test_coalescing_heavy_type_mismatch(self):
    first_df = self.spark.createDataFrame(
        data=[(1, None), (2, 'hi'), (3, None), (4, 'may')],
        schema=T.StructType([
            T.StructField('id', T.IntegerType()),
            T.StructField('value', T.StringType()),
        ]),
    )
    second_df = self.spark.createDataFrame(
        data=[(2, [2]), (3, [3]), (4, None)],
        schema=T.StructType([
            T.StructField('id', T.IntegerType()),
            T.StructField('value', T.ArrayType(T.IntegerType())),
        ]),
    )

    with self.assertRaises(U.AnalysisException):
        SF.multijoin([first_df, second_df], on='id', how='inner',
                     coalesce=['value'])
def preprocessDF(self, df, cols):
    """
    Input: $df represents a DataFrame
           $cols represents the list of columns (in $df) that will be concatenated
           and tokenized

    Output: Return a new DataFrame that adds the "joinKey" column to the input $df

    Comments: The "joinKey" column is a list of tokens, which is generated as follows:
              (1) concatenate the $cols in $df;
              (2) apply the tokenizer to the concatenated string
    Here is how the tokenizer should work:
              (1) Use "re.split(r'\W+', string)" to split a string into a set of tokens
              (2) Convert each token to its lower-case
              (3) Remove stop words
    """
    stop_words = self.stopWordsBC

    def tokenized_filterized_string(string):
        # Remove extra whitespace, strip trailing spaces and lower-case the string
        string = re.sub(r'\s+', ' ', string).strip().lower()
        tokens = re.split(r'\W+', string)
        stop_words.add('')
        tokens = set(tokens) - stop_words
        return list(tokens)

    get_tokenized_string = functions.udf(tokenized_filterized_string,
                                         types.ArrayType(types.StringType()))
    concatanated_column = 'joinKey'
    df = df.withColumn(concatanated_column,
                       concat_ws(' ', df[cols[0]], df[cols[1]]))
    df = df.withColumn(concatanated_column,
                       get_tokenized_string(df[concatanated_column]))
    return df
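# Hedged illustration (the sample string and stop-word set are assumptions, not
# taken from the original class): what the tokenizer above produces for one
# concatenated string.
sample = "Sony  Cyber-shot DSC-W310 / 12.1 MP Camera"
cleaned = re.sub(r'\s+', ' ', sample).strip().lower()
tokens = set(re.split(r'\W+', cleaned)) - {'', 'the', 'a', 'of'}
# tokens -> {'sony', 'cyber', 'shot', 'dsc', 'w310', '12', '1', 'mp', 'camera'}
print(sorted(tokens))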
def polynomialExpansionCore(requestStr, df):
    # Convert the request parameters from JSON into a dict and parse them
    requestDict = json.loads(requestStr)
    columnNames = requestDict['columnNames']

    # The new column name defaults to the joined columnNames plus "_PolynomialExpansion";
    # if the user specifies a name, the user-specified name takes precedence
    try:
        newColumnName = requestDict['newColumnName']
    except:
        newColumnName = "_".join(columnNames) + "_PolynomialExpansion"

    # Assemble the columns into a vector; input columns must be numerical,
    # otherwise return an error message
    vecAssembler = VectorAssembler(inputCols=columnNames, outputCol="features")
    try:
        df = vecAssembler.transform(df)
    except utils.IllegalArgumentException:
        return "error_numerical"

    # Set up the polynomial expansion model
    px = PolynomialExpansion(inputCol="features", outputCol=newColumnName)
    # Apply the polynomial expansion
    df = px.transform(df)

    # Convert the data format of the new column
    def do_something(col):
        try:
            floatrow = []
            for i in list(col):
                floatrow.append(float(i))
            return floatrow
        except:
            return []

    udf_dosth = F.udf(do_something, T.ArrayType(T.FloatType()))
    df = df.withColumn(newColumnName, udf_dosth(df[newColumnName]))
    df = df.drop("features")
    # df.show()
    return df
def piStrOneHotEncoding(featurename, dataframe):
    from pyspark.ml.feature import OneHotEncoder
    from pyspark.ml.feature import StringIndexer
    #from pyspark.ml.feature import VectorIndexer

    indexed = dataframe
    indexer = StringIndexer(inputCol=featurename, outputCol=featurename + "HE")
    indexed = indexer.fit(indexed).transform(indexed)

    encoder = OneHotEncoder(inputCols=[featurename + "HE"],
                            outputCols=[featurename + "OHE"])
    indexed = encoder.fit(indexed).transform(indexed)

    def convertSparseVectortoDenseVectorInt(v):
        v = DenseVector(v)
        new_array = list([int(x) for x in v])
        return new_array

    toDenseVectorUdfInt = F.udf(convertSparseVectortoDenseVectorInt,
                                T.ArrayType(T.IntegerType()))

    from pyspark.ml.feature import Interaction, VectorAssembler
    assembler1 = VectorAssembler(inputCols=[featurename + "OHE"], outputCol="vec1")
    assembled1 = assembler1.transform(indexed)
    a = assembled1.toPandas()

    indexed = indexed.drop(featurename).drop(featurename + "HE") \
        .withColumn(featurename, toDenseVectorUdfInt(featurename + "OHE")) \
        .drop(featurename + "OHE")

    #indexer = VectorIndexer(inputCol=featurename+"OHE", outputCol=featurename+"tHE", maxCategories=10)
    #indexerModel = indexer.fit(indexed)
    #indexed = indexerModel.transform(indexed)
    return indexed
def get_coefficients(
        split_urls_and_word_frequency_orders: DataFrame,
        s: float,
        additional_weight_function: Callable[[int], float] = lambda e: 1
) -> DataFrame:
    """
    :param split_urls_and_word_frequency_orders: A DataFrame of split URLs and word
        frequency orders with columns: id, url, split_url, word_frequency_orders.
    :param s: s parameter of Zipf distribution.
    :param additional_weight_function: function that applies an additional weight,
        besides Zipf, to each term of the word vector.
    :return: A DataFrame of split URLs and the coefficient of each term with
        columns: id, url, split_url, coefficients
    """
    def calculate_coefficients(word_frequency_orders):
        coefficients = []
        for i in range(len(word_frequency_orders)):
            coefficients.append(
                additional_weight_function(i) *
                URLVectorCalculator.get_zipf_coefficient(
                    word_frequency_orders[i], s))
        return coefficients

    get_coefficients_udf = F.udf(calculate_coefficients,
                                 T.ArrayType(T.DoubleType()))
    split_urls_and_coefficients = split_urls_and_word_frequency_orders \
        .select("id", "url", "split_url",
                get_coefficients_udf("word_frequency_orders").alias("coefficients"))
    return split_urls_and_coefficients
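# Hedged usage sketch (illustration only; `split_urls_df` is an assumed input
# DataFrame with the columns documented above): passing a positional decay as the
# additional weight so that earlier tokens in a URL receive a larger coefficient.
weighted = get_coefficients(
    split_urls_and_word_frequency_orders=split_urls_df,
    s=1.0,
    additional_weight_function=lambda i: 0.9 ** i)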
def extract_embedding(spark, glove_model_path, output_folder):
    glove = Glove.load(glove_model_path)

    dictionary_schema = T.StructType([
        T.StructField('index', T.IntegerType(), True),
        T.StructField('standard_concept_id', T.IntegerType(), True)
    ])
    dictionary_df = spark.createDataFrame([
        Row(index=k, standard_concept_id=int(v))
        for k, v in glove.inverse_dictionary.items()
    ], dictionary_schema)

    vector_schema = T.StructType([
        T.StructField('index', T.IntegerType(), True),
        T.StructField('vector', T.ArrayType(T.DoubleType()), True)
    ])
    vector_df = spark.createDataFrame([
        Row(index=idx, vector=vector.tolist())
        for idx, vector in enumerate(glove.word_vectors)
    ], vector_schema)

    dictionary_df.join(vector_df, 'index') \
        .select('standard_concept_id', 'vector') \
        .write.mode('overwrite').parquet(output_folder)
def calc_TX_PVLS(patient_agg_obs: DataFrame,
                 VL_code: str,
                 end_date_str: str = None) -> pandas.DataFrame:
    """Calculates TX_PVLS indicator with its corresponding disaggregations.

    Args:
        patient_agg_obs: A DataFrame generated by `join_patients_agg_obs()`.
        VL_code: The code for viral load values.
        end_date_str: The string representation of the last date as 'YYYY-MM-DD'.

    Returns:
        A Pandas DataFrame with one row per (sup_VL, agg_bucket) pair, containing
        the patient count and its ratio over all patients.
    """
    end_date = datetime.today()
    if end_date_str:
        end_date = datetime.strptime(end_date_str, '%Y-%m-%d')

    agg_buckets_udf = F.UserDefinedFunction(
        lambda a, g: agg_buckets(a, g, end_date), T.ArrayType(T.StringType()))
    VL_df = patient_agg_obs.withColumn(
        'sup_VL', patient_agg_obs[VL_code + '_max_value'] < 150).withColumn(
            'agg_buckets',
            agg_buckets_udf(patient_agg_obs['birthDate'],
                            patient_agg_obs['gender']))
    num_patients = VL_df.count()
    VL_agg_P = VL_df.select(
        VL_df.sup_VL,
        F.explode(VL_df.agg_buckets).alias('agg_bucket')).groupBy(
            'sup_VL', 'agg_bucket').agg(
                F.count('*').alias('count')).toPandas().sort_values(
                    ['agg_bucket', 'sup_VL'])
    VL_agg_P['ratio'] = VL_agg_P['count'] / num_patients
    return VL_agg_P
def run(self, data, *cols):
    """
    Runs model on each row of the data.

    :param pyspark.sql.DataFrame data: spark dataframe with one row per model.
    :param cols: column name(s) to run model on.
    :return: a spark dataframe
    """
    import pyspark.sql.functions as F
    import pyspark.sql.types as T
    from datetime import date, datetime

    def json_serialize(obj):
        """
        JSON serializer for objects not serializable by default json code.
        This function currently only handles datetime and date objects.

        :param obj: Object to serialize
        :return: json serialized object
        """
        if isinstance(obj, (datetime, date)):
            return obj.isoformat()
        raise TypeError("Type %s not serializable" % type(obj))

    def _run(*inp):
        """
        Function to call the model _run function.

        :param tuple inp: inputs passed to the function.
            TimeSeries Example:
            ([Row(index=datetime.datetime(2019, 1, 1, 0, 0), raw=1197387.0, interpolated=1197387.0),
              Row(index=datetime.datetime(2019, 1, 2, 0, 0), raw=1449210.0, interpolated=1449210.0),
              ...],
             Row(_MetricName=u'injections', email_routing_domain=u'att.net'),
             datetime.datetime(2019, 3, 1, 16, 30))
        :return: model result - list of tuples
            e.g. [("{}", "{}", Timestamp), ("{}", "{}", Timestamp), ...]
        """
        import json
        output = self._run(*inp)
        if isinstance(output, list):
            output = [(json.dumps(model_attribute, default=json_serialize),
                       json.dumps(model_result, default=json_serialize),
                       data_date)
                      for model_attribute, model_result, data_date in output]
        elif isinstance(output, tuple):
            output = [(json.dumps(output[0], default=json_serialize),
                       json.dumps(output[1], default=json_serialize),
                       output[2])]
        return output

    run_udf = F.udf(_run, T.ArrayType(
        T.StructType([T.StructField('model_attributes', T.StringType()),
                      T.StructField('model_results', T.StringType()),
                      T.StructField('data_date', T.TimestampType())])))

    new_df = data.withColumn('model_output', run_udf(*cols))
    return new_df
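# Hedged follow-up sketch (names assumed, not from the original class): a common
# way to flatten the ArrayType(StructType(...)) column produced by run() into one
# row per model result, assuming `result_df` is the DataFrame returned above.
import pyspark.sql.functions as F
flat_df = (result_df
           .select(F.explode('model_output').alias('out'))
           .select('out.model_attributes', 'out.model_results', 'out.data_date'))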
def _proto3_field_to_spark_data_type(field_desc: FieldDescriptor) -> DataType:
    """Convert ProtoBuf field descriptor to Spark `DataType` or `StructField` object.

    Args:
        field_desc (FieldDescriptor): A ProtoBuf field descriptor.

    Returns:
        DataType: A Spark `DataType` or `StructField` object.
    """
    # map type field
    if _IsMapEntry(field_desc):
        key_field_desc = field_desc.message_type.fields_by_name["key"]
        value_field_desc = field_desc.message_type.fields_by_name["value"]
        key_struct_type = _proto3_field_to_spark_data_type(key_field_desc)
        value_struct_type = _proto3_field_to_spark_data_type(value_field_desc)
        return types.MapType(key_struct_type, value_struct_type)

    if field_desc.type == FieldDescriptor.TYPE_MESSAGE:
        # nested message
        field_data_type = _proto3_message_descriptor_to_spark_schema(
            field_desc.message_type)
    else:
        # scalar value types
        field_data_type = _SPARK_SQL_TYPE_MAP[field_desc.type]

    # list type field
    if field_desc.label == FieldDescriptor.LABEL_REPEATED:
        return types.ArrayType(field_data_type)

    return field_data_type