def mediaUrls(df):
    # Photos come from entities.media; animated GIFs and videos live under
    # extended_entities.media. Use isNotNull() and & so the predicates are
    # evaluated as Spark column expressions rather than Python booleans.
    animated_gif_urls = df.where(
        col("extended_entities").isNotNull()
        & col("extended_entities.media").isNotNull()
        & array_contains(col("extended_entities.media.type"), "animated_gif")
    ).select(
        explode(col("extended_entities.media.media_url_https")).alias(
            "animated_gif_url"))
    image_urls = df.where(
        col("entities.media").isNotNull()
        & array_contains(col("extended_entities.media.type"), "photo")
    ).select(
        explode(col("entities.media.media_url_https")).alias("image_url"))
    video_urls = (df.where(
        col("extended_entities").isNotNull()
        & col("extended_entities.media").isNotNull()
        & col("extended_entities.media.video_info").isNotNull()
        & array_contains(col("extended_entities.media.type"), "video")
    ).select(
        explode(col("extended_entities.media.video_info.variants")).alias(
            "video_info")).filter("video_info is not NULL").select(
            explode(col("video_info"))).withColumn(
            "video_url", col("col.url")).drop(col("col")))
    return image_urls.union(video_urls.union(animated_gif_urls))
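# Hedged usage sketch (not part of the snippet above): one way mediaUrls might be
# driven. It assumes tweets stored as line-delimited Twitter JSON with the usual
# `entities`/`extended_entities` structs; the helper name and file path below are
# illustrative only, and `col`, `explode`, `array_contains` must already be imported.
def demo_media_urls(spark, tweets_path="tweets.jsonl"):
    tweets_df = spark.read.json(tweets_path)   # illustrative path
    urls_df = mediaUrls(tweets_df)
    urls_df.show(20, truncate=False)
    return urls_df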
def test_group_status(spark: SparkSession, df_group_status: DataFrame) -> None:
    from pyspark.sql import functions as F
    from pyspark.sql.types import BooleanType

    df_group_status.show()
    df_group_status.printSchema()

    df_enrich: DataFrame = df_group_status \
        .withColumn("cond1", when(col("dt") >= to_date(lit('2020-01-01'), 'yyyy-MM-dd'),
                                  lit(True)).otherwise(lit(False))) \
        .withColumn("cond2", when(col("dt") >= to_date(lit('2021-01-01'), 'yyyy-MM-dd'),
                                  lit(True)).otherwise(lit(False)))
    df_enrich.show()
    df_enrich.printSchema()

    df_enrich_further: DataFrame = df_enrich.groupBy("grp") \
        .agg(F.collect_set("cond1"), F.collect_set("cond2")) \
        .toDF(*["grp", "cond1_set", "cond2_set"])
    df_enrich_further.show()
    df_enrich_further.printSchema()

    df_final: DataFrame = df_enrich_further \
        .withColumn("from_cond1_set", ~F.array_contains(F.col("cond1_set"), False)) \
        .withColumn("from_cond2_set", ~F.array_contains(F.col("cond2_set"), False))
    df_final.show()
    df_final.printSchema()

    df_final: DataFrame = df_final.drop(*["cond1_set", "cond2_set"])
    df_enrich: DataFrame = df_enrich.drop(*["cond1", "cond2"])
    df_enrich.join(df_final, df_enrich["grp"] == df_final["grp"], "inner").show()
def test_array_contains(data_gen):
    arr_gen = ArrayGen(data_gen)
    lit = gen_scalar(data_gen, force_no_nulls=True)
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: two_col_df(spark, arr_gen, data_gen).select(
            array_contains(col('a'), lit.cast(data_gen.data_type)),
            array_contains(col('a'), col('b')),
            array_contains(col('a'), col('a')[5])),
        no_nans_conf)
def test_np_scalar_input(self):
    import numpy as np
    from pyspark.sql.functions import array_contains, array_position

    df = self.spark.createDataFrame([([1, 2, 3],), ([],)], ["data"])
    for dtype in [np.int8, np.int16, np.int32, np.int64]:
        self.assertEqual(df.select(lit(dtype(1))).dtypes, [("1", "int")])
        res = df.select(array_contains(df.data, dtype(1)).alias("b")).collect()
        self.assertEqual([Row(b=True), Row(b=False)], res)
        res = df.select(array_position(df.data, dtype(1)).alias("c")).collect()
        self.assertEqual([Row(c=1), Row(c=0)], res)

    # java.lang.Integer max: 2147483647
    max_int = 2147483647
    # Convert int to bigint automatically
    self.assertEqual(
        df.select(lit(np.int32(max_int))).dtypes, [("2147483647", "int")])
    self.assertEqual(
        df.select(lit(np.int64(max_int + 1))).dtypes, [("2147483648", "bigint")])

    df = self.spark.createDataFrame([([1.0, 2.0, 3.0],), ([],)], ["data"])
    for dtype in [np.float32, np.float64]:
        self.assertEqual(df.select(lit(dtype(1))).dtypes, [("1.0", "double")])
        res = df.select(array_contains(df.data, dtype(1)).alias("b")).collect()
        self.assertEqual([Row(b=True), Row(b=False)], res)
        res = df.select(array_position(df.data, dtype(1)).alias("c")).collect()
        self.assertEqual([Row(c=1), Row(c=0)], res)
def selectGenreWord(self, genreIds):
    result = None
    for gId in genreIds:
        # rows whose genre array contains the current genre id
        res = self.curDataWord.filter(
            functions.array_contains(self.curDataWord.genre, gId)
        ).select('word', 'priority', 'searchApp', 'searchCount', 'genre').distinct()
        if result is None:
            result = res
        else:
            result = result.unionAll(res)
    return result
def main():
    stagemetrics = StageMetrics(spark)
    stagemetrics.begin()

    titles_df = read_csv('./data/titles.tsv')
    ratings_df = read_csv('./data/ratings.tsv')
    principals_df = read_csv('./data/principals.tsv')
    names_df = read_csv('./data/names.tsv')
    episodes_df = read_csv('./data/episodes.tsv')
    crew_df = read_csv('./data/crew.tsv')
    akas_df = read_csv('./data/akas.tsv')

    titles_df = titles_df.filter(f.array_contains(f.split(f.col('genres'), ','), 'Comedy')
                                 & (f.col('titleType') != 'short'))
    ratings_df = ratings_df.filter(f.col('averageRating') > 5.0)
    ratings_df = ratings_df.filter(f.col('numVotes').cast('int') > 8376)
    principals_df = principals_df.filter((f.col('category') != 'self')
                                         & (f.col('category') != 'cinematographer'))
    principals_df = principals_df.orderBy('tconst')
    principals_df = principals_df.filter(f.col('ordering').cast('int') <= 3)
    names_df = names_df.filter(f.col('deathYear').isNotNull())
    names_df = names_df.filter(f.array_contains(f.split(f.col('primaryProfession'), ','), 'miscellaneous'))
    akas_df = akas_df.filter(f.col('isOriginalTitle') == '1')
    akas_df = akas_df.filter(f.col('region') == 'US')

    full_movie_df = titles_df.join(akas_df, f.col('tconst') == f.col('titleId'), 'full') \
        .join(ratings_df, on=['tconst']).join(episodes_df, on=['tconst'])
    titles_grp_df = titles_df.withColumn('genres', f.explode(f.split(f.col('genres'), ','))) \
        .groupBy('genres').agg(f.count('tconst'))
    episode_mov_df = episodes_df.filter(f.col('episodeNumber').cast('int') > 10) \
        .join(titles_df, on=['tconst'])

    print(titles_df.count())
    print(ratings_df.count())
    print(principals_df.count())
    print(names_df.count())
    print(episodes_df.count())
    print(crew_df.count())
    print(akas_df.count())
    print(full_movie_df.count())
    print(episode_mov_df.count())
    print(titles_grp_df.count())

    print(titles_df.show(100))
    print(ratings_df.show(100))
    print(principals_df.show(100))
    print(names_df.show(100))
    print(episodes_df.show(100))
    print(crew_df.show(100))
    print(akas_df.show(100))
    print(full_movie_df.show(100))
    print(episode_mov_df.show(100))
    print(titles_grp_df.show(100))

    stagemetrics.end()
    stagemetrics.print_report()
def selectAppIdWord(self, appIds):
    result = None
    for appId in appIds:
        # rows whose searchapp array contains the current app id
        res = self.curDataWord.filter(
            functions.array_contains(self.curDataWord.searchapp, appId)
        ).select('word', 'priority', 'searchApp', 'searchCount', 'genre').distinct()
        if result is None:
            result = res
        else:
            result = result.unionAll(res)
    word = result.select('word')
    result = result.dropna(how='any')
    return result, word
def add_revert_types(wmhist, comment_column='event_comment'):
    wmhist = wmhist.withColumn("revert_tools_match",
                               match_comment(f.col(comment_column),
                                             f.col("wiki_db"),
                                             f.col("event_timestamp")))
    wmhist = wmhist.withColumn("is_undo",
                               f.array_contains(col='revert_tools_match', value='undo'))
    wmhist = wmhist.withColumn("is_rollback",
                               f.array_contains(col='revert_tools_match', value='rollback'))

    tool_priority = ['huggle', 'twinkle', 'fastbuttons', 'LiveRC', 'rollback', 'undo']
    tool_column_names = ["revert_tool_{0}".format(tool) for tool in tool_priority]
    for tool, tool_column_name in zip(tool_priority, tool_column_names):
        wmhist = wmhist.withColumn(tool_column_name,
                                   f.when(f.array_contains(f.col("revert_tools_match"), tool),
                                          tool).otherwise(None))
    wmhist = wmhist.withColumn("revert_tool", f.coalesce(*tool_column_names))
    wmhist = wmhist.fillna('otherTool', subset=['revert_tool'])
    return wmhist
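# Hedged, self-contained sketch (not from the original module) of the priority
# pattern add_revert_types uses: one when(array_contains(...)) column per tool,
# then coalesce() keeps the highest-priority match. Names and data are made up.
from pyspark.sql import SparkSession, functions as F

def demo_tool_priority():
    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(
        [(["undo"],), (["rollback", "undo"],), (["manual"],)],
        ["revert_tools_match"])
    tool_priority = ["rollback", "undo"]  # highest priority first
    tool_cols = [F.when(F.array_contains("revert_tools_match", tool), tool)
                 for tool in tool_priority]
    return df.withColumn("revert_tool", F.coalesce(*tool_cols)) \
             .fillna("otherTool", subset=["revert_tool"])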
def random_text_classifier(input_loc, output_loc):
    """
    This is a dummy function that mocks the following steps:

        1. clean input data (tokenization, remove stop words)
        2. use a pre-trained model to make a prediction
        3. write predictions to an HDFS output

    Naively marks reviews having the text "good" as positive and
    the rest as negative.
    """
    # read input
    df_raw = spark.read.option("header", True).csv(input_loc)

    # perform text cleaning

    # Tokenize text
    tokenizer = Tokenizer(inputCol='review_str', outputCol='review_token')
    df_tokens = tokenizer.transform(df_raw).select('cid', 'review_token')

    # Remove stop words
    remover = StopWordsRemover(inputCol='review_token', outputCol='review_clean')
    df_clean = remover.transform(df_tokens).select('cid', 'review_clean')

    # check presence of "good" and naively assume it's a positive review
    df_out = df_clean.select(
        'cid',
        array_contains(df_clean.review_clean, "good").alias('positive_review'))
    df_out.write.mode("overwrite").parquet(output_loc)
def random_text_classifier(input_loc, output_loc):
    """
    This is a dummy function to show how to use Spark. It is supposed to mock
    the following steps:

        1. clean input data
        2. use a pre-trained model to make a prediction
        3. write predictions to an HDFS output

    Since this is meant as an example, we skip building a model; instead we
    naively mark reviews containing the text "good" as positive and the rest
    as negative.
    """
    # read input
    df_raw = spark.read.option("header", True).csv(input_loc)

    # Perform text cleaning

    # tokenize text
    tokenizer = Tokenizer(inputCol="review_str", outputCol="review_token")
    df_tokens = tokenizer.transform(df_raw).select("cid", "review_token")

    # remove stop words
    remover = StopWordsRemover(inputCol="review_token", outputCol="review_clean")
    df_clean = remover.transform(df_tokens).select("cid", "review_clean")

    # now check presence of "good" and naively classify as a positive review
    df_out = df_clean.select(
        "cid",
        array_contains(df_clean.review_clean, "good").alias("positive_review"))
    df_out.write.mode("overwrite").parquet(output_loc)
def outputTopRated(metadata, reviews, category):
    '''
    Input:  metadata and reviews collections, and a category name
    Output: top 1 product in that category
    '''
    cate_filtered = metadata.filter(
        func.array_contains(metadata["categories"], category))

    # join cate_filtered dataframe with the reviews collection
    # and select the id, title and overall columns
    inner_join = cate_filtered.join(
        reviews, cate_filtered.asin == reviews.asin).select(
            cate_filtered['asin'], 'title', 'overall')

    # map each row to a list
    map_join = inner_join.rdd.map(list)

    # map: (id, title) is the key, (rating, 1) is the value
    # reduce: sum all ratings and all 1s (the latter is the number of reviews)
    counts = map_join.map(lambda x: ((x[0], x[1]), (x[2], 1))).reduceByKey(
        lambda a, b: (a[0] + b[0], a[1] + b[1])).sortBy(lambda x: x[1])
    counts = counts.map(
        lambda x: [category, x[0][1], x[1][0], x[1][1]]).sortBy(
            lambda x: float(x[2]) / float(x[3]), ascending=False)
    countsDF = counts.toDF()

    # keep only products with more than 100 reviews
    countsDF_filtered = countsDF.filter(countsDF[3] > 100)
    return countsDF_filtered.limit(1)
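# Hedged usage sketch (not part of the original script): outputTopRated above
# expects Amazon-review-style dataframes, `metadata` with asin/title/categories
# (array column) and `reviews` with asin/overall, and it resolves `func` as
# pyspark.sql.functions at module level. Paths and the category are placeholders.
def demo_top_rated(spark):
    metadata = spark.read.json("metadata.jsonl")   # illustrative path
    reviews = spark.read.json("reviews.jsonl")     # illustrative path
    return outputTopRated(metadata, reviews, "Books")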
def test_array_contains_function(self):
    from pyspark.sql.functions import array_contains

    df = self.spark.createDataFrame([(["1", "2", "3"], ), ([], )], ['data'])
    actual = df.select(array_contains(df.data, "1").alias('b')).collect()
    self.assertEqual([Row(b=True), Row(b=False)], actual)
def random_text_classifier(input_loc, output_loc):
    """
    This is a dummy function to show how to use Spark. It is supposed to mock
    the following steps:

        1. clean input data
        2. use a pre-trained model to make a prediction
        3. write predictions to an HDFS output

    Since this is meant as an example, we skip building a model; instead we
    naively mark reviews containing the text "good" as positive and the rest
    as negative.
    """
    # read input
    df_raw = spark.read.option("header", True).csv(input_loc)

    # perform text cleaning

    # Tokenize text
    tokenizer = Tokenizer(inputCol='review_str', outputCol='review_token')
    df_tokens = tokenizer.transform(df_raw).select('cid', 'review_token')

    # Remove stop words
    remover = StopWordsRemover(inputCol='review_token', outputCol='review_clean')
    df_clean = remover.transform(df_tokens).select('cid', 'review_clean')

    # check presence of "good" in the cleaned tokens
    df_out = df_clean.select(
        'cid',
        array_contains(df_clean.review_clean, "good").alias('positive_review'))

    # parquet is a popular columnar storage format; we use it here
    df_out.write.mode("overwrite").parquet(output_loc)
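# Hedged driver sketch (not from the original job): random_text_classifier above
# resolves `spark`, `Tokenizer`, `StopWordsRemover` and `array_contains` at module
# level, so a driver would look roughly like this. The app name and the S3 paths
# are placeholders; the input CSV needs `cid` and `review_str` columns.
if __name__ == "__main__":
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import array_contains
    from pyspark.ml.feature import Tokenizer, StopWordsRemover

    spark = SparkSession.builder.appName("random_text_classifier").getOrCreate()
    random_text_classifier("s3://bucket/reviews.csv", "s3://bucket/review-preds/")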
def main():
    Spark = get_Spark_Session("Json_reabd")
    S = SparkSession.builder.getOrCreate()

    schema = StructType([
        StructField("name", StringType()),
        StructField("age", IntegerType()),
        StructField("cars", StructType([
            StructField("car1", StringType()),
            StructField("car2", StringType()),
            StructField("car3", StringType())]))])

    Json_Df = Spark.read.option("multiline", "true").schema(schema).json("d:/Nested_Json.json")
    Json_Df1 = Json_Df
    Json_Df1.show()

    for col_name in Json_Df.columns:
        print(col_name)

    # flatten the nested `cars` struct into top-level columns
    for i in Json_Df.select("cars.*").columns:
        c_name = "cars." + i
        Json_Df = Json_Df.withColumn(i, F.col(c_name))

    structureSchema = StructType() \
        .add("id", StringType()) \
        .add("dept", StringType()) \
        .add("properties", StructType()
             .add("salary", IntegerType())
             .add("location", StringType()))
    print(structureSchema)

    Json_Df6 = Spark.read.option("multiline", "true").schema(schema).json("d:/Nested_Json.json")
    l = []
    for i in Json_Df6.select("cars.*").schema.names:
        l.append(F.col(i))
    print(l)

    Json_Df6.createOrReplaceTempView("god")
    print(Spark.sql("select name, age, array(cars.*) as dd from god")
          .filter(array_contains(F.col("dd"), "BMW")).show())
def animatedGifUrls(df):
    # use isNotNull() and & so the checks are evaluated as column expressions
    return df.where(
        col("extended_entities").isNotNull()
        & col("extended_entities.media").isNotNull()
        & array_contains(col("extended_entities.media.type"), "animated_gif")
    ).select(
        explode(col("extended_entities.media.media_url_https")).alias(
            "animated_gif_url"))
def defineHeuristic4Miners(dfvj):
    p = list(pools)
    test = dfvj.where(size(col("vj_dest_address")) > 100).where(
        array_contains(col("vj_dest_address"), p[0]))
    for pool in p:
        tmp = dfvj.where(size(col("vj_dest_address")) > 100).where(
            array_contains(col("vj_dest_address"), pool))
        test = test.unionAll(tmp)
    tmp = test.selectExpr("vj_dest_address as a").collect()
    nonTrivialMiners = []
    for row in tmp:
        nonTrivialMiners += row.a
    nonTrivialMiners = set(nonTrivialMiners)
    return nonTrivialMiners
def videoUrls(df):
    # use isNotNull() and & so the checks are evaluated as column expressions
    return (df.where(
        col("extended_entities").isNotNull()
        & col("extended_entities.media").isNotNull()
        & col("extended_entities.media.video_info").isNotNull()
        & array_contains(col("extended_entities.media.type"), "video")
    ).select(
        explode(col("extended_entities.media.video_info.variants")).alias(
            "video_info")).filter("video_info is not NULL").select(
            explode(col("video_info"))).withColumn(
            "video_url", col("col.url")).drop(col("col")))
def group_tweets_from_hashtag_by_hour(self, hashtag):
    tweetsByHashtag = self.__tweets_df.where(
        array_contains(self.__tweets_df.hashtags, hashtag))
    result = tweetsByHashtag \
        .groupBy(hour('created_at').alias('created_at_hour')) \
        .count() \
        .orderBy('created_at_hour')
    return parse_json_response(result.toJSON().collect())
def explode_sampling(df):
    base_columns = df.columns
    columns = list(map(lambda c: split(col(c), args.split_char).alias(c)
                       if c == args.class_col else col(c), base_columns))
    with_category_array = df.select(*columns)
    df = df.select(columns)

    classes = list(map(lambda row: row[args.class_col],
                       df.select(explode(args.class_col).alias(args.class_col))
                       .distinct().collect()))

    if seed:
        sample = with_category_array.filter(
            array_contains(with_category_array[args.class_col], classes[0])).sample(fraction, seed)
    else:
        sample = with_category_array.filter(
            array_contains(with_category_array[args.class_col], classes[0])).sample(fraction)

    for clazz in classes[1:]:
        if seed:
            sample = sample.union(with_category_array.filter(
                array_contains(with_category_array[args.class_col], clazz)).sample(fraction, seed))
        else:
            sample = sample.union(with_category_array.filter(
                array_contains(with_category_array[args.class_col], clazz)).sample(fraction))

    select = list(map(lambda c: concat_ws(";", col(c)).alias(c)
                      if c == args.class_col else col(c), base_columns))
    return sample.select(select), len(classes)
def process_text(df: DataFrame) -> DataFrame:
    """Process features extracted from text fields"""
    df = df.withColumn(
        "flag_energy_title",
        sf.array_contains("title_text_features", "energy").astype(IntegerType()))
    df = df.withColumn(
        "flag_energy_abstract",
        sf.array_contains("abstract_text_features", "energy").astype(IntegerType()))
    df = df.withColumn(
        "flag_energy_claims",
        sf.array_contains("claims_text_features", "energy").astype(IntegerType()))

    feature_cols = [
        "english_text_features", "flag_energy_title", "flag_energy_abstract",
        "flag_energy_claims"
    ]
    df = df.select("_file", *feature_cols)
    return df
def get_most_followed_users(self, hashtag, limit=5):
    tweetsByHashtag = self.__tweets_df.where(
        array_contains(self.__tweets_df.hashtags, hashtag)).alias('tweet')
    users = self.__users_df.alias('user')
    usersByHashtag = users.join(tweetsByHashtag,
                                users.user_id == tweetsByHashtag.user_id)
    result = usersByHashtag \
        .select(users.user_id, users.screen_name, users.followers_count) \
        .orderBy(users.updated_at.desc()) \
        .dropDuplicates(['user_id'])
    result = result.orderBy(result.followers_count.desc()).limit(limit)
    return parse_json_response(result.toJSON().collect())
def createDict(df, all_plants):
    dict_list = [()]
    for state in states:
        plant_names = df.select(df.plant_name).where(
            array_contains(df.states, state)).rdd.flatMap(lambda x: x).collect()
        dict1 = dict([(plant_name, 1) if plant_name in plant_names
                      else (plant_name, 0) for plant_name in all_plants])
        tuple_data = (state, dict1)
        dict_list.append(tuple_data)
    rdd = sc.parallelize(dict_list[1:])
    return rdd
def process_ipcr(df: DataFrame) -> DataFrame:
    """
    Generates a flag column for each section and combination of section/class
    of the patents indicating if the patent is part of this categorization
    """
    col = "bibliographic-data_classifications-ipcr_classification-ipcr"
    df = df.withColumn("ipcr_values", extract_ipcr(sf.col(col)))
    df = df.withColumn("ipcr_sections", sf.col("ipcr_values.sections"))
    df = df.withColumn("ipcr_sections_class", sf.col("ipcr_values.sections_class"))

    for section in SECTIONS_IPCR:
        df = df.withColumn(
            f"section_{section}",
            sf.array_contains(sf.col("ipcr_sections"), section).astype(IntegerType()))
    for section_class in SECTIONS_CLASS_IPCR:
        df = df.withColumn(
            f"section_class_{section_class}",
            sf.array_contains(sf.col("ipcr_sections_class"),
                              section_class).astype(IntegerType()))
    return df
def __init__(self):
    super(FeatureJsTotal, self).__init__()
    self.group_by_aggs = {
        'js_count': F.count(F.when(
            F.col('is_js') == True,  # noqa
            F.col('is_js')))
    }
    self.pre_group_by_calcs = {
        'is_js': F.array_contains(
            F.split(F.col('content_type'), '/'), 'javascript')
    }
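# Hedged, standalone sketch (not from the original feature class) of what the
# `is_js` pre-group-by calculation above evaluates: split the MIME content_type
# on '/' and test the pieces with array_contains. All names here are made up.
from pyspark.sql import SparkSession, functions as F

def demo_is_js_flag():
    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(
        [("application/javascript",), ("text/html",), ("text/javascript",)],
        ["content_type"])
    # the two javascript rows get is_js == True, the text/html row gets False
    return df.withColumn(
        "is_js",
        F.array_contains(F.split(F.col("content_type"), "/"), "javascript"))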
def customFunction(row):
    # person = rdd.map(lambda r: row(*r))
    # temp_df = sqlContext.createDataFrame(person)
    print("reaching\n", row['categories'])
    t = str(row['categories'])
    # review_df.createTempView("rev")
    # temp_df = review_df.filter(review_df["business_id"] == row["business_id"])
    temp_df = bus_df.withColumn('cat_true',
                                func.array_contains(bus_df['categories'], t))
    # temp_df = bus_df.where((t in bus_df["categories"]))
    # temp_df = sqlContext.sql("SELECT * FROM rev where rev['business_id']=t")
    # print(temp_df)
    df_x = temp_df.filter(temp_df.cat_true == True).drop('cat_true')
    df_x.write.json("/Users/apple/Desktop/dataset/businesses/" + str(t) + ".json")
def one_hot_encode_top_n_tags(dataframe, n):
    """Produces a PySpark dataframe containing columns indicating whether each of the top n tags are present.

    :param dataframe: the PySpark dataframe
    :param n: the number of the top ranked tags to return as tag fields
    :returns: the PySpark dataframe containing the top n tag fields and all fields in the supplied dataframe
    """
    top_n = [t.tag for t in
             df_tag_freq.orderBy(desc("frequency")).select("tag").limit(n).collect()]
    for tag in top_n:
        # replace tag name ".net" with "dotnet", for example, to avoid problems with periods in tag names
        tag_column_name = ("tag_" + tag).replace(".", "dot")
        dataframe = dataframe.withColumn(tag_column_name,
                                         array_contains(dataframe.tags_split, tag).cast("int"))
    return dataframe
def __init__(self):
    super(FeatureImageTotal, self).__init__()
    self.group_by_aggs = {
        'image_count': F.count(
            F.when(
                F.col('is_image') == True,  # noqa
                F.col('is_image')))
    }
    self.pre_group_by_calcs = {
        'is_image': F.array_contains(
            F.split(F.col('content_type'), '/'), 'image')
    }
def transform(self, df):
    embeddings = df \
        .select('embedding_document') \
        .filter(~array_contains('embedding_document', np.nan)) \
        .toPandas()['embedding_document'] \
        .to_list()
    embeddings = np.array(embeddings, dtype=np.float32)

    model = umap.UMAP(n_neighbors=15, n_components=5, metric='cosine')
    model.fit(embeddings)
    return model
def transform(self, data):
    df_article, df_clustering, df_embeddings = data

    df_article_topic = df_article \
        .join(df_clustering, on='url_id') \
        .join(df_embeddings, on='url_id') \
        .select('url_id', 'time', 'header', 'tags', 'topic_id', 'embedding_document') \
        .filter(col('topic_id') != -1) \
        .filter(~array_contains('embedding_document', np.nan)) \
        .orderBy('time') \
        .toPandas()

    n_articles = 50000
    df_article_topic = df_article_topic.iloc[-n_articles:]
    return df_article_topic
def one_hot_encode_top_n_tags(dataframe, n):
    """Produces a PySpark dataframe containing columns indicating whether each of the top n tags are present.

    :param dataframe: the PySpark dataframe
    :param n: the number of the top ranked tags to return as tag fields
    :returns: the PySpark dataframe containing the top n tag fields and all fields in the supplied dataframe
    """
    top_n = [
        t.tag for t in df_tag_freq.orderBy(desc("frequency")).select(
            "tag").limit(n).collect()
    ]
    for tag in top_n:
        # replace tag name ".net" with "dotnet", for example, to avoid problems with periods in tag names
        tag_column_name = ("tag_" + tag).replace(".", "dot")
        dataframe = dataframe.withColumn(
            tag_column_name,
            array_contains(dataframe.tags_split, tag).cast("int"))
    return dataframe
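# Hedged usage sketch (not from the original notebook): one_hot_encode_top_n_tags
# above closes over a module-level `df_tag_freq` (tag/frequency columns) and
# expects the input dataframe to carry an array column named `tags_split`, with
# `desc` and `array_contains` imported. All data and names below are made up.
from pyspark.sql import SparkSession
from pyspark.sql.functions import desc, array_contains

spark = SparkSession.builder.getOrCreate()
df_tag_freq = spark.createDataFrame(
    [("python", 10), (".net", 7), ("spark", 3)], ["tag", "frequency"])
df_posts = spark.createDataFrame(
    [(1, ["python", "spark"]), (2, [".net"])], ["post_id", "tags_split"])
df_posts_encoded = one_hot_encode_top_n_tags(df_posts, n=2)  # adds tag_python, tag_dotnet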
def __init__(self):
    super(FeatureImageToHtmlRatio, self).__init__()
    self.group_by_aggs = {
        'html_count': F.count(
            F.when(F.col('is_html') == True, F.col('is_html'))  # noqa
        ),
        'image_count': F.count(
            F.when(F.col('is_image') == True, F.col('is_image'))  # noqa
        )
    }
    self.pre_group_by_calcs = {
        'is_html': F.col('content_type') == 'text/html',
        'is_image': F.array_contains(
            F.split(F.col('content_type'), '/'), 'image')
    }
def test_array_contains_function(self):
    from pyspark.sql.functions import array_contains

    df = self.spark.createDataFrame([(["1", "2", "3"],), ([],)], ['data'])
    actual = df.select(array_contains(df.data, "1").alias('b')).collect()
    self.assertEqual([Row(b=True), Row(b=False)], actual)
# COMMAND ----------

df.select(split(col("Description"), " ").alias("array_col"))\
  .selectExpr("array_col[0]").show(2)


# COMMAND ----------

from pyspark.sql.functions import size
df.select(size(split(col("Description"), " "))).show(2) # shows 5 and 3


# COMMAND ----------

from pyspark.sql.functions import array_contains
df.select(array_contains(split(col("Description"), " "), "WHITE")).show(2)


# COMMAND ----------

from pyspark.sql.functions import split, explode

df.withColumn("splitted", split(col("Description"), " "))\
  .withColumn("exploded", explode(col("splitted")))\
  .select("Description", "InvoiceNo", "exploded").show(2)


# COMMAND ----------

from pyspark.sql.functions import create_map
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
def augment(df):
    if 'addons' in df.columns:
        df = df.select(['*'] +
                       [create_get_addon_name_udf(addon)(df['addons']).alias(addon.replace('.', '__DOT__'))
                        for addon in all_addons] +
                       [create_get_addon_version_udf(addon)(df['addons']).alias(addon.replace('.', '__DOT__') + '-version')
                        for addon in all_addons])

    if 'json_dump' in df.columns:
        df = df.select(['*'] +
                       [functions.array_contains(df['json_dump']['modules']['filename'], module_name).alias(module_id)
                        for module_id, module_name in module_ids.items()])

    if 'plugin_version' in df.columns:
        df = df.withColumn('plugin', df['plugin_version'].isNotNull())

    if 'app_notes' in df.columns:
        df = df.select(['*'] +
                       [(functions.instr(df['app_notes'], app_note.replace('__DOT__', '.')) != 0).alias(app_note)
                        for app_note in all_app_notes] +
                       [(functions.instr(df['app_notes'], 'Has dual GPUs') != 0).alias('has dual GPUs')])

    if 'graphics_critical_error' in df.columns:
        df = df.select(['*'] +
                       [(functions.instr(df['graphics_critical_error'], error.replace('__DOT__', '.')) != 0).alias(error)
                        for error in all_gfx_critical_errors])

    if 'total_virtual_memory' in df.columns and 'platform_version' in df.columns and 'platform' in df.columns:
        def get_arch(total_virtual_memory, platform, platform_version):
            if total_virtual_memory:
                try:
                    if int(total_virtual_memory) < 2684354560:
                        return 'x86'
                    else:
                        return 'amd64'
                except:
                    return 'unknown'
            elif platform == 'Mac OS X':
                return 'amd64'
            else:
                if 'i686' in platform_version:
                    return 'x86'
                elif 'x86_64' in platform_version:
                    return 'amd64'

        get_arch_udf = functions.udf(get_arch, StringType())
        df = df.withColumn('os_arch', get_arch_udf(df['total_virtual_memory'], df['platform'], df['platform_version']))

    if 'adapter_driver_version' in df.columns:
        def get_driver_version(adapter_vendor_id, adapter_driver_version):
            # XXX: Sometimes we have a driver which is not actually made by the vendor,
            # in those cases these rules are not valid (e.g. 6.1.7600.16385).
            if adapter_driver_version:
                if adapter_vendor_id == '0x8086' or adapter_vendor_id == '8086':
                    return adapter_driver_version[adapter_driver_version.rfind('.') + 1:]
                elif adapter_vendor_id == '0x10de' or adapter_vendor_id == '10de':
                    return adapter_driver_version[-6:-5] + adapter_driver_version[-4:-2] + '.' + adapter_driver_version[-2:]
                # TODO: AMD?

            return adapter_driver_version

        get_driver_version_udf = functions.udf(get_driver_version, StringType())
        df = df.withColumn('adapter_driver_version_clean', get_driver_version_udf(df['adapter_vendor_id'], df['adapter_driver_version']))

    if 'cpu_info' in df.columns:
        df = df.withColumn('CPU Info', functions.substring_index(df['cpu_info'], ' | ', 1))
        df = df.withColumn('Is Multicore', functions.substring_index(df['cpu_info'], ' | ', -1) != '1')

    if 'dom_ipc_enabled' in df.columns:
        df = df.withColumnRenamed('dom_ipc_enabled', 'e10s_enabled')

    if 'memory_ghost_windows' in df.columns:
        df = df.withColumn('ghost_windows > 0', df['memory_ghost_windows'] > 0)

    if 'memory_top_none_detached' in df.columns:
        df = df.withColumn('top(none)/detached > 0', df['memory_top_none_detached'] > 0)

    return df