def process_data(spark, data_folder):
    """Reads the gzip files from a folder and parses them into a Spark DataFrame

    Arguments:
    spark -- Spark Session
    data_folder -- folder containing the gzip files

    Returns:
    Spark DataFrame
    """
    # Reading files from folder
    df_nasa = spark.read.text(data_folder)

    # Regex for group matching
    parse_regex = r'(.*) - - \[([\w:/]+\s[+\-]\d{4})\] \"(.*)\" (\d{3}) ([0-9]*|-)'

    # Create columns based on Regex group match
    df = df_nasa.withColumn('host', F.regexp_extract(F.col('value'), parse_regex, 1)) \
        .withColumn('timestamp', F.regexp_extract(F.col('value'), parse_regex, 2)) \
        .withColumn('request', F.regexp_extract(F.col('value'), parse_regex, 3)) \
        .withColumn('status_code', F.regexp_extract(F.col('value'), parse_regex, 4).cast(IntegerType())) \
        .withColumn('bytes', F.regexp_extract(F.col('value'), parse_regex, 5).cast(IntegerType())) \
        .drop('value')

    return df
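# A minimal usage sketch for process_data above. The SparkSession, the
# `pyspark.sql.functions as F` import, the IntegerType import and the sample
# log folder are assumptions for illustration; they are not part of the
# original snippet.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.appName("nasa-logs").getOrCreate()
logs_df = process_data(spark, "data/nasa_logs/")  # hypothetical folder of gzip logs
logs_df.printSchema()
logs_df.show(5, truncate=False)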
def calculateDifficulty(self, dfIngridients):
    dfIngridients.persist(StorageLevel.MEMORY_AND_DISK)
    dfIngridients = dfIngridients.filter(
        dfIngridients.ingridents.contains("beef"))
    # Extract the numeric part of the cook/prep time strings
    dfIngridients = dfIngridients.withColumn(
        "cookTimeInt", regexp_extract(dfIngridients.cookTime, r"(\d+)", 1)) \
        .withColumn("prepTimeInt", regexp_extract(dfIngridients.prepTime, r"(\d+)", 1))
    dfIngridients = dfIngridients.withColumn(
        "totalTime", dfIngridients.cookTimeInt + dfIngridients.prepTimeInt)
    dfIngridients_filtered = dfIngridients.withColumn(
        "difficulty",
        when(dfIngridients.totalTime > 60, lit("Hard"))
        .when((dfIngridients.totalTime > 30) & (dfIngridients.totalTime < 60), lit("Medium"))
        .when(dfIngridients.totalTime < 30, lit("Easy"))
        .otherwise(lit("Unknown"))) \
        .withColumn("currentDate", unix_timestamp() * 1000)
    dfIngridients_filtered.repartition("difficulty").persist(
        StorageLevel.MEMORY_AND_DISK)
    self.log.info("dfIngridients_filtered: {}".format(
        dfIngridients_filtered.rdd.count()))
    self.writeToImpala(
        dataToWrite=dfIngridients_filtered,
        table=self.config.get('impala').get('tablename'),
        properties=self.ImpalaProperties.get('impala'))
def analysis_7(units_df, damages_df, log):
    """Logs the result for Query 7.

    :param units_df: DataFrame Units_use.
    :param damages_df: DataFrame Damages_use.
    :param log: Logger.
    :return None
    """
    filtered_units = (
        units_df
        .filter(col("FIN_RESP_TYPE_ID").contains("INSURANCE"))
        .withColumn("VEH_DMAG_SCL_1_ID",
                    when(units_df.VEH_DMAG_SCL_1_ID.contains("DAMAGED"),
                         regexp_extract(col("VEH_DMAG_SCL_1_ID"), "(\\d{1})", 1)).otherwise(0))
        .withColumn("VEH_DMAG_SCL_2_ID",
                    when(units_df.VEH_DMAG_SCL_2_ID.contains("DAMAGED"),
                         regexp_extract(col("VEH_DMAG_SCL_2_ID"), "(\\d{1})", 1)).otherwise(0))
        .filter((col("VEH_DMAG_SCL_1_ID") > 4) | (col("VEH_DMAG_SCL_2_ID") > 4))
    )

    units_damages_left_join_filtered = (
        filtered_units.alias("U")
        .join(damages_df.alias("D"), col("U.CRASH_ID") == col("D.CRASH_ID"), "left")
        .filter(col("DAMAGED_PROPERTY").contains("NONE")
                | col("DAMAGED_PROPERTY").contains("NO DAMAGE")
                | col("DAMAGED_PROPERTY").isNull())
        .select("U.CRASH_ID")
        .distinct()
    )

    crash_count = units_damages_left_join_filtered.count()
    log.warn("Result for Query 7")
    log.warn(
        "Distinct Crash IDs where No Damaged Property was observed and Damage Level (VEH_DMAG_SCL~) is above 4 and "
        "car avails Insurance = {}".format(crash_count))
    return None
def custom_to_dataframe(self, filename):
    # custom_schema = StructType([self.schema[i] for i in [0,9,10]])
    custom_data = spark.read.text(self.path + filename)
    r = "user_id=(.+)feature_9=(.+)feature_10=(.+)"
    # cast before alias so the resulting columns keep their names
    custom_data = custom_data.select(
        regexp_extract('value', r, 1).alias('user_id'),
        regexp_extract('value', r, 2).cast("double").alias('feature_9'),
        regexp_extract('value', r, 3).cast("double").alias('feature_10'))
    return custom_data
def main(log_file):
    sc = utils.setup_spark_context()
    sqlContext = SQLContext(sc)

    try:
        sql_log_data = sqlContext.read.text(log_file)
    except Exception:
        print("######################")
        print("Bad file name!")
        return

    splited_data_frame = sql_log_data.select(
        regexp_extract('value', r'^([^\s]+\s)', 1).alias('host'),
        regexp_extract('value', r'^.*\[(\d\d/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} -\d{4})]', 1).alias('timestamp'),
        regexp_extract('value', r'^.*"\w+\s+([^\s]+)\s+HTTP.*"', 1).alias('request'),
        regexp_extract('value', r'^.*"\s+([^\s]+)', 1).cast('integer').alias('http_status'),
        regexp_extract('value', r'^.*\s+(\d+)$', 1).cast('integer').alias('content_size_in_bytes'))
    splited_data_frame.cache()

    data_frames = {}
    data_frames['unique_hosts'] = splited_data_frame.groupBy(
        'host').count().filter('count = 1').select('host')
    data_frames['top_20_request'] = splited_data_frame.groupBy(
        'request').count().sort(desc("count")).limit(20)
    data_frames['total_http_404'] = splited_data_frame.groupBy(
        'http_status').count().filter('http_status = "404"')
    data_frames['frequency_status'] = splited_data_frame.groupBy(
        'http_status').count()
    data_frames['top_5_hosts_http_404'] = splited_data_frame.filter(
        'http_status = "404"').groupBy('request').count().sort(
            col("count").desc()).limit(5)
    data_frames['qty_http_404_per_day'] = splited_data_frame.filter(
        'http_status = "404"').groupBy(
            splited_data_frame.timestamp.substr(1, 11).alias('day')).count().sort(
                desc('day'))
    data_frames['sum_bytes'] = splited_data_frame.select(
        'content_size_in_bytes').groupBy().sum()
    data_frames['bytes_per_day'] = splited_data_frame.select(
        'content_size_in_bytes', 'timestamp').groupBy(
            splited_data_frame.timestamp.substr(1, 11).alias('day')).sum().sort('day')

    utils.export_all_queries_to_csv(data_frames)
def main():
    sc = SparkContext()
    sqlContext = SQLContext(sc)

    # Collect all the log files from the files folder
    files = listdir('./files')

    schema_blank = StructType([StructField("value", StringType(), True)])
    # Empty DataFrame used to union all the files in ./files
    main_df = sqlContext.createDataFrame([], schema_blank)
    for file in files:
        path_file = './files/' + file
        temp_df = sqlContext.read.text(path_file)
        main_df = main_df.union(temp_df)

    main_df_format = main_df.select(
        regexp_extract('value', r'^([^\s]+\s)', 1).alias('host'),
        regexp_extract('value', r'^.*\[(\d\d/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} -\d{4})]', 1).alias('timestamp'),
        regexp_extract('value', r'^.*"\w+\s+([^\s]+)\s+HTTP.*"', 1).alias('URL'),
        regexp_extract('value', r'^.*"\s+([^\s]+)', 1).cast('integer').alias('codeHTTP'),
        regexp_extract('value', r'^.*\s+(\d+)$', 1).cast('integer').alias('byte'))

    # 1. Number of unique hosts:
    host_uniques = main_df_format.groupBy('host').count().filter(
        'count = 1').count()
    with open('./resultados/host_uniques.csv', 'w') as file:
        writer = csv.writer(file)
        writer.writerow([',host_uniques'])
        writer.writerow(['0,{}'.format(host_uniques)])

    # 2. Total number of 404 errors:
    main_df_format.groupBy('codeHTTP').count().filter(
        'codeHTTP = "404"').toPandas().to_csv('./resultados/total_404.csv')

    # 3. The 5 URLs that caused the most 404 errors
    main_df_format.filter('codeHTTP = "404"').groupBy('URL').count().sort(
        col("count").desc()).limit(5).toPandas().to_csv(
            './resultados/top_five_404.csv')

    # 4. Number of 404 errors per day
    main_df_format.filter('codeHTTP = "404"').groupBy(
        main_df_format.timestamp.substr(1, 11).alias(
            'day')).count().toPandas().to_csv('./resultados/per_day_404.csv')

    # 5. Total number of bytes returned:
    main_df_format.select('byte').groupBy().sum().toPandas().to_csv(
        './resultados/total_bytes.csv')

    print('Files exported to the resultados folder')
def clean(spark, rows):
    # Load Data
    df = spark.createDataFrame(Row(**row) for row in rows)

    # Clean column country
    re_country = "[a-zA-Z][a-zA-Z\s\-]*"
    df = df.withColumn(
        "country",
        (F.lower(F.trim(F.regexp_extract("country", re_country, 0)))),
    )

    # Clean column campus
    re_campus = "([a-zA-Z]+[_\ \-]?)+"
    df = df.withColumn(
        "campus", (F.lower(F.trim(F.regexp_extract("campus", re_campus, 0)))))

    # Clean column mobility
    re_mobility = "([a-zA-Z0-9]+[\ \-]?)+"
    df = df.withColumn(
        "mobility", (F.lower(F.trim(F.regexp_extract("mobility", re_mobility, 0)))))

    # Clean column contracts
    df = df.withColumn(
        "contracts", null_negative_int(df["contracts"].cast(T.IntegerType())))

    # Clean column alternative_choice
    re_alternative_choice = "([a-zA-Z]+[_\ \-]?)+"
    df = df.withColumn(
        "alternative_choice",
        (F.lower(
            F.trim(
                F.regexp_extract("alternative_choice", re_alternative_choice, 0)))),
    )

    # Clean column distance
    re_distance = "[0-9]+"
    df = df.withColumn(
        "distance",
        (F.lower(F.trim(F.regexp_extract("distance", re_distance, 0))).cast(
            T.IntegerType())),
    )

    # Clean column pro_contract
    df = df.withColumn("pro_contract", df["pro_contract"].cast(T.BooleanType()))

    return df
def house_number_extract(df):
    # make address_line_1 all uppercase
    df = df.withColumn('address_line_1', f.upper('address_line_1'))
    # extract house number or box number into column housenumber
    df = df.withColumn(
        'housenumber',
        f.when(
            f.col('address_line_1').rlike('^[A-Z]{2}'),
            f.regexp_extract(f.col('address_line_1'), '(BOX\\s)([0-9]+[0-9A-Z.*-]*)', 2))
        .otherwise(
            f.regexp_extract(f.col('address_line_1'), '^([A-Z]*[0-9]+[0-9A-Z.*-]*)', 1)))
    return df
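# A minimal sketch showing house_number_extract on a couple of sample rows.
# The SparkSession, the `pyspark.sql.functions as f` alias and the sample
# addresses are assumptions for illustration only.
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

spark = SparkSession.builder.getOrCreate()
addresses = spark.createDataFrame(
    [("po box 123",), ("42B Main Street",)], ["address_line_1"])
house_number_extract(addresses).show(truncate=False)
# Expected: "123" from the BOX pattern, "42B" from the leading-number pattern.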
def prepare_google_trend():
    # Extract week start date and state.
    google_trend_all = google_trend_csv \
        .withColumn('Date', F.regexp_extract(google_trend_csv.week, '(.*?) -', 1)) \
        .withColumn('State', F.regexp_extract(google_trend_csv.file, 'Rossmann_DE_(.*)', 1))

    # Map state NI -> HB,NI to align with other data sources.
    google_trend_all = google_trend_all \
        .withColumn('State', F.when(google_trend_all.State == 'NI', 'HB,NI').otherwise(google_trend_all.State))

    # Expand dates.
    return expand_date(google_trend_all)
def nasa_ingestao(df):
    return (df.select(
        regexp_extract('value', r'^([^\s]+\s)', 1).alias('host'),
        regexp_extract('value', r'^.*\[(\d\d/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} -\d{4})]', 1).alias('timestamp'),
        regexp_extract('value', r'^.*"\w+\s+([^\s]+)\s+HTTP.*"', 1).alias('url'),
        regexp_extract('value', r'^.*"\s+([^\s]+)', 1).cast('integer').alias('status'),
        regexp_extract('value', r'^.*\s+(\d+)$', 1).cast('integer').alias('bytes')))
def server_finder(column):
    column = F.lower(column)
    regex_exp = r'(apache|nginx|microsoft)([^\/]+|)(\/|)((\d+\.|\d+\b|)+)'
    server = F.regexp_extract(column, regex_exp, 1)
    version = F.regexp_extract(column, regex_exp, 4)
    server = F.when(server == '', 'other') \
        .otherwise(server)
    column = F.when(version == '', server) \
        .otherwise(F.concat_ws('/', server, version))
    return column
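# A minimal sketch of server_finder applied to a column of Server response
# headers. The SparkSession, the `F` alias and the sample header values are
# assumptions for illustration; the function itself only needs a Column.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
headers = spark.createDataFrame(
    [("Apache/2.4.41",), ("nginx",), ("gws",)], ["server_header"])
headers.withColumn("server", server_finder(F.col("server_header"))).show()
# Expected: "apache/2.4.41", "nginx" (no version), and "other" for unmatched values.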
def process_equipment_failure_sensors(spark, input_data, output_data):
    '''
    Write parquet files on the gold path

    Parameters:
        spark : Spark Session
        input_data (str): Path to input data
        output_data (str): Path to output data
    '''
    try:
        df_data = spark.read.format('csv').option("sep", '\t').load(input_data)
        print('1->Read {} - OK'.format(input_data))
    except IOError:
        print('read error')

    df_data = df_data.withColumn(
        'date',
        regexp_extract('_c0', r'(\d+-\d+-\d+\s\d+:\d+:\d+)', 1).alias('Date').cast('timestamp')).drop('_c0')
    df_data = df_data.withColumn(
        'error', when(df_data._c1 == 'ERROR', 1).otherwise(0)).drop('_c1')
    df_data = df_data.withColumn(
        'sensor_id', regexp_replace('_c2', r'(\D)', '').cast('integer')).drop('_c2')
    df_data = df_data.drop('_c3')
    df_data = df_data.withColumn(
        'temperature', regexp_extract('_c4', r'(\d+\.\d+)', 1).cast('float')).drop('_c4')
    df_data = df_data.withColumn(
        'vibration', regexp_extract('_c5', r'([\-\+]?\d+\.\d+)', 1).cast('float')).drop('_c5')
    print('2--->Format and clean data {} - OK'.format(input_data))

    try:
        df_data.write.format('parquet').mode('overwrite').save(output_data)
        print('3----->Write OK')
    except IOError:
        print('write error')
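# A minimal usage sketch for process_equipment_failure_sensors. The
# SparkSession and the input/output paths are assumptions for illustration;
# the input is expected to be the tab-separated sensor log the function parses.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("equipment-failures").getOrCreate()
process_equipment_failure_sensors(
    spark,
    input_data="bronze/equipment_failure_sensors.txt",   # hypothetical path
    output_data="gold/equipment_failure_sensors/")       # hypothetical path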
def prepare_dataset(self):
    """
    Compute common intermediate DataFrames and cache to reduce the execution time.
    """
    # A DataFrame of movies where the genre cells are split into several rows
    self.__movies_df_split_genres = self.__movies_df \
        .withColumn('genres', explode(split(self.__movies_df.genres, "\\|"))) \
        .filter(self.__movies_df.genres != "(no genres listed)") \
        .filter(self.__movies_df.genres != "(s listed)") \
        .dropna()

    # A DataFrame of the movies where the title and year of the movie are in separate columns
    self.__movies_df_with_year_col = self.__movies_df \
        .withColumn('year', regexp_extract(self.__movies_df['title'], '[1-2][0-9][0-9][0-9]', 0).cast(IntegerType())) \
        .withColumn('title', split(self.__movies_df['title'], '\([1-2][0-9][0-9][0-9]\)').getItem(0))

    # A DataFrame that contains only the movies that have been rated or tagged
    self.__reduced_ratings = self.__ratings_df.select(
        col("userId"), col("movieId")).distinct()
    self.__reduced_tags = self.__tags_df.select(
        col("userId"), col("movieId")).distinct()
    self.__movies_user_df = self.__reduced_ratings.union(
        self.__reduced_tags).distinct().cache()

    # A DataFrame combining average rating per movie where genres are split in rows
    self.__favor_genre_df = self.__movies_df_split_genres \
        .join(self.__ratings_df, self.__movies_df_split_genres.movieId == self.__ratings_df.movieId) \
        .drop(self.__ratings_df.movieId) \
        .drop(self.__ratings_df.timestamp)
def dccon_parse(df, col):
    return df.withColumn(
        col,
        F.when(
            F.col(col).startswith('<video'),
            F.concat(F.lit('<dccon> '),
                     F.regexp_extract(col, r'data-src="[^?]*\?no=([^"]+)"', 1),
                     F.lit(' '),
                     F.regexp_extract(col, r'title="([^"]*)"', 1)))
        .when(
            F.col(col).startswith('<img'),
            F.concat(F.lit('<dccon> '),
                     F.regexp_extract(col, r'src="[^?]*\?no=([^"]+)"', 1),
                     F.lit(' '),
                     F.regexp_extract(col, r'title="([^"]*)"', 1)))
        .otherwise(F.col(col)))
def main(inputFile, outputFile, configFile, contentMapping):
    # config
    uc = popularityCalculator(configFile)

    df = spark.read.parquet(inputFile + '/*').dropDuplicates().na.drop()
    contentMapping = spark.read.csv(contentMapping, header='true')

    # get rid of ".mp3" in item_name
    df = df.withColumnRenamed("item_name", "to_del")
    df = df.withColumn("item_name", F.split(df['to_del'], '\.')[0])
    df = df.drop('to_del')

    # turn Content Mapping string length into a timedelta object
    strp_time = udf(lambda x: datetime.strptime(x, "%M:%S"))
    time_delta = udf(lambda y: timedelta(minutes=y.minute, seconds=y.second))
    contentMapping = contentMapping.withColumn("strptime", strp_time(F.col("Length")))
    contentMapping = contentMapping.withColumn("Content Length", time_delta(F.col("strptime")))
    contentMapping = contentMapping.drop('strptime')
    contentMapping = contentMapping.withColumnRenamed("Title", "item_name")

    # merge df and contentMapping
    df = df.join(contentMapping, ["item_name"], "outer")

    # get time played for
    df = df.withColumn(
        "Played For",
        F.unix_timestamp(df["end"]) - F.unix_timestamp(df["start"]))

    # get total seconds of song as string, convert to bigInt
    df = df.withColumn(
        "Song Duration Str",
        F.regexp_extract(df["Content Length"], "(?<=total: )(.*)(?= seconds)", 0))
    df = df.withColumn("Song Duration Int",
                       df["Song Duration Str"].cast(IntegerType()))

    # compute percentage played
    df = df.withColumn("PercentPlayed",
                       df["Played For"] / df["Song Duration Int"])

    # keep only the columns we need at this point
    df = df.select(["device_id", "item_name", "PercentPlayed"])

    # assign weights based on percent played
    df = df.withColumn(
        'weight',
        F.when((F.col("PercentPlayed") >= 0.0) & (F.col("PercentPlayed") < 0.25), uc.first)
        .when((F.col("PercentPlayed") >= 0.25) & (F.col("PercentPlayed") < 0.50), uc.second)
        .when((F.col("PercentPlayed") >= 0.50) & (F.col("PercentPlayed") < 0.75), uc.third)
        .when((F.col("PercentPlayed") >= 0.75) & (F.col("PercentPlayed") <= 1.00), uc.fourth)
        .otherwise(-999.999)
    )

    # drop the rows with invalid percent played
    df = df.filter(df.weight != -999.999)

    df.write.parquet(outputFile)  # write onto output Parquet
def create_values(cols):
    values = []
    for col in cols:
        if col.is_lookup == 1:
            values.append(
                f.when(
                    f.col(col.demographic_key).isNull(),
                    f.concat_ws('_', f.lit(col.demographic_key), f.lit('9999')))
                .when(
                    f.trim(f.col(col.demographic_key)) == '',
                    f.concat_ws('_', f.lit(col.demographic_key), f.lit('9999')))
                .when(
                    f.length(
                        f.regexp_extract(
                            f.col(col.demographic_key).astype('string'),
                            '(\d+)', 1)) > 0,
                    f.concat_ws(
                        '_', f.lit(col.demographic_key),
                        f.col(col.demographic_key).astype('int').astype('string')))
                .otherwise(
                    f.concat_ws('_', f.lit(col.demographic_key),
                                f.col(col.demographic_key))))
        else:
            values.append(f.col(col.demographic_key))
    return values
def compute(
    self,
    biomarkers_table: str,
    source_table: str,
    disease_table: str,
    drug_index: str,
    output_file: str
) -> None:
    """Loads and processes inputs to generate the Cancer Biomarkers evidence strings"""

    # Import data
    biomarkers_df = self.spark.read.csv(biomarkers_table, sep='\t', header=True)
    source_df = self.spark.read.json(source_table).select(
        col('label').alias('niceName'), 'source', 'url')
    disease_df = self.spark.read.json(disease_table).select(
        regexp_replace(col('name'), '_', '').alias('tumor_type'),
        regexp_extract(col('url'), r'[^/]+$', 0).alias('diseaseFromSourceMappedId'))
    drugs_df = self.spark.read.parquet(drug_index).select(
        col('id').alias('drugId'), col('name').alias('drug'))

    # Process inputs to generate evidence strings
    evidence = self.process_biomarkers(
        biomarkers_df, source_df, disease_df, drugs_df
    )

    # Write evidence strings
    write_evidence_strings(evidence, output_file)
    logging.info(f'{evidence.count()} evidence strings have been saved to {output_file}.')
def transform(retail_df):
    """
    transformations:
        extract color name from the Description attribute
        select 'Country', 'Quantity', 'UnitPrice' and update product_color by replacing empty values with 'NOCOLOR'
        groupBy country and product color
        sum Quantity and UnitPrice as total_quantity and total_price respectively
        add column avg_spent = total_price / total_quantity
    :param retail_df:
    :return:
    """
    from pyspark.sql.functions import regexp_extract, col, count, sum, expr, regexp_replace

    extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"
    transformed_retail = (retail_df.withColumn(
        'product_color',
        regexp_extract(col("Description"), extract_str, 1)).select(
            'Country', 'Quantity', 'UnitPrice',
            regexp_replace(col("product_color"), '^$', "NOCOLOR").alias('product_color')).groupBy(
                'Country', 'product_color').agg(
                    sum('Quantity').alias('total_quantity'),
                    sum('UnitPrice').alias('total_price')).withColumn(
                        'avg_spent (dollars)',
                        expr('total_price/total_quantity')))
    return transformed_retail
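# A minimal sketch of transform() on a couple of hand-made retail rows. The
# SparkSession and the sample data are assumptions for illustration; the real
# input would normally be a retail dataset with these column names.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
retail_df = spark.createDataFrame(
    [("United Kingdom", 6, 2.55, "WHITE HANGING HEART T-LIGHT HOLDER"),
     ("France", 3, 3.39, "RED WOOLLY HOTTIE")],
    ["Country", "Quantity", "UnitPrice", "Description"])
transform(retail_df).show(truncate=False)
# One row per (Country, product_color) with total_quantity, total_price and avg_spent.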
def create_cmpgn_event_pivot_col(spark_df_model_email):
    '''
    The following function generates a new column named CMPGN_NM_EVENT.
    This column is the concatenation of the Campaign Name and Email for each row.

    The input spark dataframe must have the following columns:
        - CMPGN_NM,
        - VENDOR_EVENT_TYPE_TXT

    ACCEPTS:
        - spark dataframe
    RETURNS:
        - spark dataframe
    '''
    # Start of string up to the third occurrence of "_"
    regex_str = '^(?:[^_]*\_){2}([^_]*)'
    # idx = 0 to grab the entire match
    idx = 0

    spark_df_model_email = spark_df_model_email.withColumn(
        'CMPGN_NM_REG', F.regexp_extract(F.col('CMPGN_NM'), regex_str, idx)) \
        .withColumn('CMPGN_NM_EVENT',
                    F.concat_ws("_", F.col('CMPGN_NM_REG'), F.col('VENDOR_EVENT_TYPE_TXT'))) \
        .drop('CMPGN_NM_REG')

    return spark_df_model_email
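# A minimal sketch of create_cmpgn_event_pivot_col on a hand-made row. The
# SparkSession, the `F` alias and the sample campaign name are assumptions for
# illustration only.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("2021_Q3_SPRING_SALE", "OPEN")],
    ["CMPGN_NM", "VENDOR_EVENT_TYPE_TXT"])
create_cmpgn_event_pivot_col(df).show(truncate=False)
# CMPGN_NM_EVENT ends up as "2021_Q3_SPRING_OPEN": the regex keeps the campaign
# name up to its third "_", and the event type is appended with "_".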
def invalid_dbs_present_phedex(self):
    '''
    Returns a dataframe with datasets which have "INVALID" status in DBS and are "PRESENT" in phedex
    :func: run_consistency.invalid_dbs_present_phedex()

    for reference, dbs d_dataset_access_type_id:
        1  : valid
        2  : invalid
        42 : Deprecated
        41 : Production
        81 : Deleted
    '''
    invalid_dbs_present_phedex = (self.dbs_datasets
        .filter(col('d_dataset_access_type_id') == '2')
        .join(self.dbs_blocks, col('d_dataset_id') == col('b_dataset_id'))
        .join(self.phedex_block_replicas, col('d_dataset') == col('dataset_name'))
        .filter(col('dataset_name').isNotNull())
        .withColumn('input_campaign',
                    fn.regexp_extract(col('d_dataset'),
                                      "^/[^/]*/((?:HI|PA|PN|XeXe|)Run201\d\w-[^-]+|CMSSW_\d+|[^-]+)[^/]*/",
                                      1))
        .select('input_campaign', 'd_dataset', 'd_last_modified_by')  # you can select more columns for detail info
        .distinct())

    invalid_dbs_present_phedex.groupby("input_campaign").agg(fn.count(fn.col("d_dataset"))).show()

    return invalid_dbs_present_phedex.select("d_dataset")
def _add_special_dates(self, dcc_experiment_df: DataFrame):
    """
    Takes in a DataFrame with experimental data, parses out the metadata values
    for special dates, and adds those values as new columns.
    """
    for col_name, date_prefixes in {
        "_dateOfBloodCollection": [
            "date and time of blood collection = ",
            "date/time of blood collection = ",
        ],
        "_dateOfSacrifice": [
            "date and time of sacrifice = ",
            "date of sacrifice = ",
        ],
    }.items():
        escaped_prefixes = [
            prefix.replace("/", ".") for prefix in date_prefixes
        ]
        prefix_regex = f"(?i)(.*)({'|'.join(escaped_prefixes)})(.*)"
        dcc_experiment_df = dcc_experiment_df.withColumn(
            col_name + "Array",
            expr(
                f'filter(metadata, metadataValue -> metadataValue rlike "{prefix_regex}" )'
            ),
        )
        dcc_experiment_df = dcc_experiment_df.withColumn(
            col_name,
            regexp_extract(
                col(col_name + "Array").getItem(0),
                prefix_regex, 3).astype(DateType()),
        )
    return dcc_experiment_df
def compile_regex_extract(t, expr, scope, **kwargs):
    op = expr.op()

    src_column = t.translate(op.arg, scope)
    pattern = op.pattern.op().value
    idx = op.index.op().value
    return F.regexp_extract(src_column, pattern, idx)
def get_memedroid_data(memes_df):
    memedroid_df = memes_df.filter(memes_df.source == 'memedroid')

    memedroid_schema = StructType().add(
        'title', StringType(), True).add(
        'tags', StringType(), True).add(
        'date', StringType(), True).add(
        'popularity', StringType(), True)

    memedroid_data = memedroid_df.select(
        functions.col('id'),
        functions.from_json(
            functions.col('additional_data'),
            schema=memedroid_schema
        ).alias("data")
    ).select('id', 'data.*')

    upvote_percentage = pyspark.sql.functions.split(memedroid_data['popularity'], '%').getItem(0)
    number_of_votes = pyspark.sql.functions.split(memedroid_data['popularity'], '%').getItem(1)

    memedroid_data = memedroid_data.withColumn(
        'upvote_percentage', upvote_percentage.cast("Integer")).withColumn(
        'number_of_votes', regexp_extract(number_of_votes, '[0-9]+', 0).cast("Integer"))

    upvotes = (memedroid_data.upvote_percentage * memedroid_data.number_of_votes * 0.01)
    memedroid_data = memedroid_data.withColumn('upvotes', upvotes.cast("Integer"))
    memedroid_data = memedroid_data.filter(memedroid_data.upvotes > 100)

    return memedroid_data
def extract_state(df):
    data = df.withColumn('State', regexp_replace(
        regexp_extract('Location', r'(, )(\w\w)', 2),
        r'^$', 'none'
    ))
    return data
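# A minimal sketch of extract_state on a couple of sample locations. The
# SparkSession and the sample strings are assumptions for illustration; the
# regex pulls the two-letter state after a ", " and falls back to 'none'.
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_extract, regexp_replace

spark = SparkSession.builder.getOrCreate()
locations = spark.createDataFrame(
    [("Austin, TX",), ("Remote",)], ["Location"])
extract_state(locations).show()
# Expected State values: "TX" and "none".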
def _process_pipeline(self, read_stream):
    # filter useless data
    filtered_stream = read_stream.where(
        (col("duration_ms").cast("long") != 0) &
        ~ (col("requested_url").startswith("GET /info")
           | col("requested_url").startswith("GET /prometheus"))
    )

    mapped_stream = filtered_stream \
        .withColumn("country",
                    when(col("stack").isNotNull(), regexp_extract("stack", r".*-(\w+)$", 1))
                    .otherwise("undefined"))

    average_duration = mapped_stream.aggregate(
        Avg(group_fields=["country", "host", "app", "app_version", "api_method"],
            aggregation_field="duration_ms",
            aggregation_name=self._component_name))

    count_by_status = mapped_stream.aggregate(
        Count(group_fields=["country", "host", "app", "app_version", "api_method", "status"],
              aggregation_name=self._component_name))

    request_stream = read_stream \
        .where(col("header_x-dev").isNotNull()) \
        .withColumn("country",
                    when(col("stack").isNotNull(), regexp_extract("stack", r".*-(\w+)$", 1))
                    .otherwise("undefined"))

    count_by_app = request_stream.aggregate(
        Count(group_fields=["country", "app"],
              aggregation_name=self._component_name + ".requests"))

    count_by_app_with_status = request_stream \
        .where(col("status").isNotNull()) \
        .withColumn("status",
                    custom_translate_regex(
                        source_field=col("status"),
                        mapping={r"^2\d\d": "successful"},
                        default_value="failure")) \
        .aggregate(Count(group_fields=["country", "app", "status"],
                         aggregation_name=self._component_name + ".requests"))

    count_stb_requests = request_stream \
        .aggregate(Count(group_fields=["country", "header_x-dev"],
                         aggregation_name=self._component_name + ".requests"))

    return [average_duration, count_by_status, count_stb_requests,
            count_by_app, count_by_app_with_status]
def get_crab_popularity_ds(start_date, end_date, verbose=False, base=_BASE_PATH):
    """
    Queries the hdfs data and returns a pandas dataframe with:
    Datatier, Dataset, CMSPrimaryPrimaryDataset, job_count, workflow_count, ChirpCMSSWReadBytes

    args:
        - start_date datetime Start of the query period (RecordTime)
        - end_date datetime End of the query period
    """
    start = int(start_date.timestamp() * 1000)
    end = int(end_date.timestamp() * 1000)
    spark = get_spark_session(yarn=True, verbose=verbose)
    dfs_crabdb = (
        spark.read.option("basePath", base)
        .json(
            _get_candidate_files(start_date, end_date, spark, base=base),
            schema=_get_crab_condor_schema(),
        )
        .select("metadata.timestamp", "data.*")
        .filter(
            """Status in ('Completed', 'Removed')
               AND CRAB_DataBlock is not NULL
               AND timestamp >= {}
               AND timestamp <= {}""".format(start, end)
        )
        .repartition("CRAB_DataBlock")
        .drop_duplicates(["GlobalJobId"])
        .withColumnRenamed("CMSPrimaryPrimaryDataset", "PrimaryDataset")
        .withColumn("Dataset", regexp_extract("CRAB_DataBlock", "^(.*)/([^/]*)#.*$", 1))
        .withColumn("Datatier", regexp_extract("CRAB_DataBlock", "^(.*)/([^/]*)#.*$", 2))
    )
    dfs_crabdb = (
        dfs_crabdb.groupBy("Datatier", "PrimaryDataset", "Dataset")
        .agg(
            _max(col("RecordTime")),
            _min(col("RecordTime")),
            count(lit(1)),
            countDistinct("CRAB_Workflow"),
            _sum(col("ChirpCMSSWReadBytes")),
        )
        .withColumnRenamed("count(1)", "job_count")
        .withColumnRenamed("count(DISTINCT CRAB_Workflow)", "workflow_count")
        .withColumnRenamed("sum(ChirpCMSSWReadBytes)", "ChirpCMSSWReadBytes")
        .na.fill("Unknown", ["Datatier", "PrimaryDataset", "Dataset"])
    )
    return dfs_crabdb.toPandas()
def __ring_status_node_warnings(self, events):
    return events \
        .where("message like '%Unable to determine external address "
               "of node with internal address %'") \
        .withColumn("host",
                    regexp_extract("message",
                                   r".*Unable\s+to\s+determine\s+external\s+address\s+of\s+"
                                   r"node\s+with\s+internal\s+address\s+'(\S+)'.*",
                                   1)) \
        .aggregate(Count(group_fields=["hostname", "host"],
                         aggregation_name=self._component_name + ".ring_status_node_warnings"))
def parse(self, raw_df):
    input = raw_df
    cols = []
    for col_name in self.groups_to_cols.keys():
        meta = self.groups_to_cols[col_name]
        cols.append(col_name)
        if meta['type'] == 'timestamp':
            input = input.withColumn(
                col_name,
                udf.parse_ts_udf(
                    regexp_extract(input.value, self.regexp, meta['group'])
                ).cast(meta['type'])
            )
        else:
            input = input.withColumn(
                col_name,
                regexp_extract(input.value, self.regexp, meta['group']).cast(meta['type'])
            )
    return input.select(*cols)
def prepare_google_trend(
    google_trend_csv: pyspark.sql.DataFrame,
) -> pyspark.sql.DataFrame:
    google_trend_all = google_trend_csv.withColumn(
        "Date",
        F.regexp_extract(google_trend_csv.week, "(.*?) -", 1)).withColumn(
            "State",
            F.regexp_extract(google_trend_csv.file, "Rossmann_DE_(.*)", 1))

    # map state NI -> HB,NI to align with other data sources
    google_trend_all = google_trend_all.withColumn(
        "State",
        F.when(google_trend_all.State == "NI", "HB,NI").otherwise(google_trend_all.State),
    )

    # expand dates
    return expand_date(google_trend_all)
def parse_genetics_evidence(genetics_df: DataFrame) -> DataFrame:
    """The JSON Schema format is applied to the df."""
    return (
        genetics_df
        .withColumn(
            'literature',
            when(col('pmid') != '', array(regexp_extract(col('pmid'), r'PMID:(\d+)$', 1)))
            .when(col('study_id').contains('SAIGE'), array(lit('30104761')))
        )
        .withColumn(
            'cohortId',
            when(col('study_id').contains('SAIGE'), array(lit('UK Biobank 500k')))
            .when(col('study_id').contains('NEALE'), array(lit('UK Biobank 500k'))),
        )
        .select(
            lit('ot_genetics_portal').alias('datasourceId'),
            lit('genetic_association').alias('datatypeId'),
            col('gene_id').alias('targetFromSourceId'),
            col('efo').alias('diseaseFromSourceMappedId'),
            col('literature'),
            col('pub_author').alias('publicationFirstAuthor'),
            'projectId',
            substring(col('pub_date'), 1, 4).cast(IntegerType()).alias('publicationYear'),
            col('trait_reported').alias('diseaseFromSource'),
            col('study_id').alias('studyId'),
            col('sample_size').alias('studySampleSize'),
            col('pval_mantissa').alias('pValueMantissa'),
            col('pval_exponent').alias('pValueExponent'),
            col('odds_ratio').alias('oddsRatio'),
            col('oddsr_ci_lower').alias('oddsRatioConfidenceIntervalLower'),
            col('oddsr_ci_upper').alias('oddsRatioConfidenceIntervalUpper'),
            col('beta').alias('beta'),
            col('beta_ci_lower').alias('betaConfidenceIntervalLower'),
            col('beta_ci_upper').alias('betaConfidenceIntervalUpper'),
            col('y_proba_full_model').alias('resourceScore'),
            col('rsid').alias('variantRsId'),
            concat_ws('_', col('chrom'), col('pos'), col('ref'), col('alt')).alias('variantId'),
            regexp_extract(col('consequence_link'), r'\/(SO.+)$', 1).alias('variantFunctionalConsequenceId'),
        )
        .dropDuplicates(['variantId', 'studyId', 'targetFromSourceId', 'diseaseFromSourceMappedId'])
    )
# MAGIC | _status_ | The HTTP status code the server sent back to the client. |
# MAGIC | _bytes_ | The number of bytes (`Content-Length`) transferred to the client. |
# MAGIC
# MAGIC
# MAGIC Next, we have to parse it into individual columns. We'll use the special built-in [regexp\_extract()](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.regexp_extract)
# MAGIC function to do the parsing. This function matches a column against a regular expression with one or more [capture groups](http://regexone.com/lesson/capturing_groups) and allows you to extract one of the matched groups. We'll use one regular expression for each field we wish to extract.
# MAGIC
# MAGIC If you can't read these regular expressions, don't worry. Trust us: They work. If you find regular expressions confusing (and they certainly _can_ be), and you want to learn more about them, start with the
# MAGIC [RegexOne web site](http://regexone.com/). You might also find [_Regular Expressions Cookbook_](http://shop.oreilly.com/product/0636920023630.do), by Jan Goyvaerts and Steven Levithan, to be helpful.
# MAGIC
# MAGIC _Some people, when confronted with a problem, think "I know, I'll use regular expressions." Now they have two problems._ (attributed to Jamie Zawinski)

# COMMAND ----------

from pyspark.sql.functions import split, regexp_extract

split_df = base_df.select(
    regexp_extract('value', r'^([^\s]+\s)', 1).alias('host'),
    regexp_extract('value', r'^.*\[(\d\d/\w{3}/\d{4}:\d{2}:\d{2}:\d{2} -\d{4})]', 1).alias('timestamp'),
    regexp_extract('value', r'^.*"\w+\s+([^\s]+)\s+HTTP.*"', 1).alias('path'),
    regexp_extract('value', r'^.*"\s+([^\s]+)', 1).cast('integer').alias('status'),
    regexp_extract('value', r'^.*\s+(\d+)$', 1).cast('integer').alias('content_size'))
split_df.show(truncate=False)

# COMMAND ----------

# MAGIC %md
# MAGIC ### (2c) Data Cleaning
# MAGIC
# MAGIC Let's see how well our parsing logic worked. First, let's verify that there are no null rows in the original data set.

# COMMAND ----------
def main():
    "Main function"
    optmgr = OptionParser()
    opts = optmgr.parser.parse_args()

    # setup spark/sql context to be used for communication with HDFS
    sc = SparkContext(appName="phedex_br")
    if not opts.yarn:
        sc.setLogLevel("ERROR")
    sqlContext = HiveContext(sc)

    schema_def = schema()

    # read given file(s) into RDD
    if opts.fname:
        pdf = sqlContext.read.format('com.databricks.spark.csv')\
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(opts.fname, schema = schema_def)
    elif opts.basedir:
        fromdate, todate = defDates(opts.fromdate, opts.todate)
        files = getFileList(opts.basedir, fromdate, todate)
        msg = "Between dates %s and %s found %d directories" % (fromdate, todate, len(files))
        print(msg)

        if not files:
            return
        pdf = unionAll([sqlContext.read.format('com.databricks.spark.csv')
                        .options(treatEmptyValuesAsNulls='true', nullValue='null')\
                        .load(file_path, schema = schema_def) \
                        for file_path in files])
    else:
        raise ValueError("File or directory not specified. Specify fname or basedir parameters.")

    # parsing additional data (to given data adding: group name, node kind, acquisition era, data tier, now date)
    groupdic, nodedic = getJoinDic()
    acquisition_era_reg = r"^/[^/]*/([^/^-]*)-[^/]*/[^/]*$"
    data_tier_reg = r"^/[^/]*/[^/^-]*-[^/]*/([^/]*)$"
    groupf = udf(lambda x: groupdic[x], StringType())
    nodef = udf(lambda x: nodedic[x], StringType())

    ndf = pdf.withColumn("br_user_group", groupf(pdf.br_user_group_id)) \
         .withColumn("node_kind", nodef(pdf.node_id)) \
         .withColumn("now", from_unixtime(pdf.now_sec, "YYYY-MM-dd")) \
         .withColumn("acquisition_era", when(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1) == "",\
                    lit("null")).otherwise(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1))) \
         .withColumn("data_tier", when(regexp_extract(pdf.dataset_name, data_tier_reg, 1) == "",\
                    lit("null")).otherwise(regexp_extract(pdf.dataset_name, data_tier_reg, 1)))

    # print dataframe schema
    if opts.verbose:
        ndf.show()
        print("pdf data type", type(ndf))
        ndf.printSchema()

    # process aggregation parameters
    keys = [key.lower().strip() for key in opts.keys.split(',')]
    results = [result.lower().strip() for result in opts.results.split(',')]
    aggregations = [agg.strip() for agg in opts.aggregations.split(',')]
    order = [orde.strip() for orde in opts.order.split(',')] if opts.order else []
    asc = [asce.strip() for asce in opts.asc.split(',')] if opts.order else []
    filtc, filtv = opts.filt.split(":") if opts.filt else (None, None)

    validateAggregationParams(keys, results, aggregations, order, filtc)

    if filtc and filtv:
        ndf = ndf.filter(getattr(ndf, filtc) == filtv)

    # if delta aggregation is used
    if DELTA in aggregations:
        validateDeltaParam(opts.interval, results)
        result = results[0]

        #1 for all dates generate interval group dictionary
        datedic = generateDateDict(fromdate, todate, opts.interval)
        boundic = generateBoundDict(datedic)
        max_interval = max(datedic.values())

        interval_group = udf(lambda x: datedic[x], IntegerType())
        interval_start = udf(lambda x: boundic[x][0], StringType())
        interval_end = udf(lambda x: boundic[x][1], StringType())

        #2 group data by block, node, interval and last result in the interval
        ndf = ndf.select(ndf.block_name, ndf.node_name, ndf.now, getattr(ndf, result))
        idf = ndf.withColumn("interval_group", interval_group(ndf.now))
        win = Window.partitionBy(idf.block_name, idf.node_name, idf.interval_group).orderBy(idf.now.desc())
        idf = idf.withColumn("row_number", rowNumber().over(win))
        rdf = idf.where((idf.row_number == 1) & (idf.interval_group != 0))\
                 .withColumn(result, when(idf.now == interval_end(idf.interval_group),
                                          getattr(idf, result)).otherwise(lit(0)))
        rdf = rdf.select(rdf.block_name, rdf.node_name, rdf.interval_group, getattr(rdf, result))
        rdf.cache()

        #3 create intervals that do not exist but have a minus delta
        win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group)
        adf = rdf.withColumn("interval_group_aft", lead(rdf.interval_group, 1, 0).over(win))
        hdf = adf.filter(((adf.interval_group + 1) != adf.interval_group_aft) & (adf.interval_group != max_interval))\
                 .withColumn("interval_group", adf.interval_group + 1)\
                 .withColumn(result, lit(0))\
                 .drop(adf.interval_group_aft)

        #4 join data frames
        idf = rdf.unionAll(hdf)

        #5 join every interval with the previous interval
        win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group)
        fdf = idf.withColumn("delta", getattr(idf, result) - lag(getattr(idf, result), 1, 0).over(win))

        #6 calculate delta_plus and delta_minus columns and aggregate by date and node
        ddf = fdf.withColumn("delta_plus", when(fdf.delta > 0, fdf.delta).otherwise(0)) \
                 .withColumn("delta_minus", when(fdf.delta < 0, fdf.delta).otherwise(0))

        aggres = ddf.groupBy(ddf.node_name, ddf.interval_group).agg(sum(ddf.delta_plus).alias("delta_plus"),\
                                                                    sum(ddf.delta_minus).alias("delta_minus"))

        aggres = aggres.select(aggres.node_name, interval_end(aggres.interval_group).alias("date"),
                               aggres.delta_plus, aggres.delta_minus)
    else:
        resAgg_dic = zipResultAgg(results, aggregations)
        order, asc = formOrdAsc(order, asc, resAgg_dic)

        # perform aggregation
        if order:
            aggres = ndf.groupBy(keys).agg(resAgg_dic).orderBy(order, ascending=asc)
        else:
            aggres = ndf.groupBy(keys).agg(resAgg_dic)

    # output results
    if opts.fout:
        fout_header = formFileHeader(opts.fout)
        if opts.header:
            aggres.write.format('com.databricks.spark.csv').options(header = 'true').save(fout_header)
        else:
            aggres.write.format('com.databricks.spark.csv').save(fout_header)
    else:
        aggres.show(50)
  col("Description")).show(2)


# COMMAND ----------

from pyspark.sql.functions import translate
df.select(translate(col("Description"), "LEET", "1337"), col("Description"))\
  .show(2)


# COMMAND ----------

from pyspark.sql.functions import regexp_extract
extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"
df.select(
    regexp_extract(col("Description"), extract_str, 1).alias("color_clean"),
    col("Description")).show(2)


# COMMAND ----------

from pyspark.sql.functions import instr
containsBlack = instr(col("Description"), "BLACK") >= 1
containsWhite = instr(col("Description"), "WHITE") >= 1
df.withColumn("hasSimpleColor", containsBlack | containsWhite)\
  .where("hasSimpleColor")\
  .select("Description").show(3, False)


# COMMAND ----------