import builtins as builtin  # avoid shadowing by pyspark.sql.functions.min/max

from pyspark.sql.functions import lit, split, substring_index


def CreateSubstring(df, inCol, outCol, strLen, delim, startPos, endPos, makeList=False):
    # here we create a substring of a string column; an empty range yields an empty string
    if endPos <= startPos:
        return df.withColumn(outCol, lit(''))
    # clamp the requested positions to [0, strLen]
    startPos = builtin.min(builtin.max(0, startPos), strLen)
    endPos = builtin.min(builtin.max(startPos, endPos), strLen)
    # if one end of string coincides with beginning
    if startPos == 0:
        df = df.withColumn(outCol, substring_index(inCol, delim, endPos))
    # if one end of string coincides with end
    elif endPos == strLen:
        df = df.withColumn(outCol, substring_index(inCol, delim, startPos - endPos))
    # if string is in middle
    else:
        # extract string from beginning up to position, then extract the right end of that
        df = df.withColumn(outCol, substring_index(inCol, delim, endPos)) \
               .withColumn(outCol, substring_index(outCol, delim, startPos - endPos))
    # if string should be broken into list
    if makeList:
        df = df.withColumn(outCol, split(outCol, delim))
    return df
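`CreateSubstring` leans entirely on `substring_index`'s sign convention: a positive count keeps everything left of the Nth delimiter, a negative count keeps everything right of the Nth delimiter counting from the end. A minimal standalone sketch of that convention (the column name and data are made up):

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import substring_index

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("a.b.c.d",)], ["s"])

df.select(
    substring_index("s", ".", 2).alias("left2"),    # "a.b" - everything before the 2nd "."
    substring_index("s", ".", -1).alias("right1"),  # "d"   - everything after the last "."
    substring_index("s", ".", 0).alias("zero"),     # ""    - a count of 0 yields an empty string
).show()
```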
def multi_day_client_df(client_addons_df):
    """A single-locale version of the `clients_daily` data."""
    df = (
        client_addons_df
        .where("locale = 'en-US'")
        .withColumn("client_id", substring_index("client_id", "_", -1))
        .where(substring_index("client_id", "-", -1).isin([1, 2, 3, 4, 5]))
    )
    df.createOrReplaceTempView("clients_daily")
    return df
def run(rucio_path, dbs_path, output, verbose):
    start = time.time()
    spark = SparkSession.builder.appName("rucio_dumps_test").getOrCreate()
    csvreader = spark.read.format("csv") \
        .option("nullValue", "null") \
        .option("mode", "FAILFAST")
    avroreader = spark.read.format("avro")

    rucio_info = avroreader.load(rucio_path) \
        .withColumn("filename", fn.input_file_name())
    logger.debug("Rucio data types")
    logger.debug(rucio_info.dtypes)
    # rucio_info.show(5, False)

    dbs_files = csvreader.schema(schemas.schema_files()) \
        .load(dbs_path) \
        .select("f_logical_file_name", "f_dataset_id")
    # dbs_files.show(5, False)

    rucio_df = (
        rucio_info
        .withColumn("tmp1", fn.substring_index("filename", "/rucio/", -1))
        .withColumn("tally_date", fn.substring_index("tmp1", "/", 1))
        .withColumn("create_day",
                    fn.date_format(
                        fn.to_date((rucio_info.CREATED_AT / fn.lit(1000))
                                   .cast(types.LongType())
                                   .cast(types.TimestampType())),
                        "yyyyMMdd"))
        .withColumn("tally_day", fn.date_format(fn.to_date("tally_date", "yyyy-MM-dd"), "yyyyMMdd"))
        .select("RSE_ID", "BYTES", "NAME", "SCOPE", "tally_day", "create_day")
    )
    # rucio_df.show(5, False)

    rucio_df = rucio_df \
        .join(dbs_files, dbs_files.f_logical_file_name == rucio_df.NAME) \
        .groupBy("RSE_ID", "f_dataset_id", "SCOPE", "tally_day", "create_day") \
        .agg(fn.sum("BYTES").alias("rep_size"))
    # rucio_df.show(5, False)

    rucio_df.write.option("compression", "snappy").parquet(output, mode="overwrite")

    end = time.time()
    logger.info("Elapsed Time: {min} min, {sec} sec.".format(min=(end - start) // 60,
                                                             sec=(end - start) % 60))
def test_substring_index(data_gen, delim):
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: unary_op_df(spark, data_gen).select(
            f.substring_index(f.col('a'), delim, 1),
            f.substring_index(f.col('a'), delim, 3),
            f.substring_index(f.col('a'), delim, 0),
            f.substring_index(f.col('a'), delim, -1),
            f.substring_index(f.col('a'), delim, -4)))
def do(self, workflow, etl_process):
    from pyspark.sql.functions import substring, substring_index, split, col

    self.new_column = self.action_details.pop("name")
    self.target = self.action_details.pop("target")
    self.type = self.action_details.pop("type", "simple")

    if self.type == "simple":
        # fixed-position substring: 1-based start position and length
        self.pos = self.action_details.pop("pos", 1)
        self.len = self.action_details.pop("len")
        workflow.df = workflow.df \
            .withColumn(self.new_column, substring(col(self.target), self.pos, self.len))
    else:
        self.delim = self.action_details.pop("delim")
        self.index = self.action_details.pop("index", 1)
        if self.type == "delim":
            # everything before (or after, if index is negative) the Nth delimiter
            workflow.df = workflow.df \
                .withColumn(self.new_column, substring_index(col(self.target), self.delim, self.index))
        elif self.type == "delim_index":
            # a single token: getItem converts the 1-based index to 0-based
            workflow.df = workflow.df \
                .withColumn(self.new_column, split(col(self.target), self.delim).getItem(self.index - 1))
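The `"delim"` and `"delim_index"` branches are easy to confuse: `substring_index` keeps the whole prefix (or suffix) up to the Nth delimiter, while `split(...).getItem(n - 1)` extracts a single token. A small sketch of the difference, using a fabricated date string:

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import substring_index, split, col

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("2020-01-15",)], ["d"])

df.select(
    substring_index(col("d"), "-", 2).alias("prefix"),  # "2020-01" - everything before the 2nd "-"
    split(col("d"), "-").getItem(1).alias("token"),     # "01"      - just the 2nd token
).show()
```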
def get_column_spec(
    self, source_df: Optional[DataFrame], current_column: Optional[Column]
) -> Column:
    column_spec = substring_index(
        self.column.get_column_spec(
            source_df=source_df, current_column=current_column
        ),
        self.delimiter,
        self.delimiter_count,
    )
    return column_spec
def get_dataset(xmlQuery):
    """
    Runs an RCSB PDB Advanced Search web service using an XML query description.
    See https://www.rcsb.org/pdb/staticHelp.do?p=help/advancedSearch.html Advanced Search

    The returned dataset contains one of the following fields, depending on the query type:
    # structureId, e.g., 1STP
    # structureChainId, e.g., 4HHB.A
    # ligandId, e.g., HEM

    :param xmlQuery: RCSB PDB advanced query xml string
    :return: dataset with matching ids
    """
    # run advanced query
    ids = post_query(xmlQuery)

    # convert list of ids to a list of lists (required for dataframe creation below)
    id_list = [[i] for i in ids]

    # convert list of lists to a dataframe
    spark = SparkSession.builder.getOrCreate()

    # distinguish 3 types of results based on the length of the id string:
    # pdbEntityId: > 4 (e.g., 4HHB:1)
    # ligandId:    < 4 (e.g., HEM)
    # pdbId:       = 4 (e.g., 4HHB)
    if len(ids[0]) > 4:
        ds: DataFrame = spark.createDataFrame(id_list, ['pdbEntityId'])
        # results contain an entity id, e.g., 101M:1, so map entityId to pdbChainId
        ds = ds.withColumn("pdbId", substring_index(ds.pdbEntityId, ':', 1))
        ds = ds.withColumn("entityId", substring_index(ds.pdbEntityId, ':', -1))
        mapping = __get_entity_to_chain_id()
        ds = ds.join(mapping, (ds.pdbId == mapping.structureId) & (ds.entityId == mapping.entity_id))
        ds = ds.select(ds.pdbChainId)
    elif len(ids[0]) < 4:
        ds: DataFrame = spark.createDataFrame(id_list, ['ligandId'])
    else:
        ds: DataFrame = spark.createDataFrame(id_list, ['pdbId'])

    return ds
def community_indicators(spark: SparkSession) -> DataFrame:
    indicators_file = os.environ["CORE_CONF_fs_defaultFS"] + "/datasets/Community_Resiliency_Indicator_System.csv"

    # Read csv file
    indicators = spark.read \
        .format("csv") \
        .option("header", "true") \
        .load(indicators_file) \
        .cache()

    indicators = indicators.select(indicators["Neighborhood"].alias("neighborhood"),
                                   indicators["Haz_Score"].alias("hazard_score"),
                                   indicators["Env_Score"].alias("environment_score"),
                                   indicators["VCrim_Rate"].alias("violent_crime_rate"),
                                   indicators["Citz_Per"].alias("citizen_density"),
                                   indicators["Com_Score"].alias("community_score"),
                                   indicators["Food_Score"].alias("food_score"),
                                   indicators["Emp_per"].alias("employment_rate"),
                                   indicators["PopDens"].alias("population_density"),
                                   indicators["DayPopDens"].alias("population_density_day"))

    # Separate neighborhoods that have been combined with a "/"
    paired_neighborhoods = indicators.where(indicators["neighborhood"].like("%/%"))
    left_of_pairs = paired_neighborhoods.withColumn(
        "neighborhood", substring_index(paired_neighborhoods["neighborhood"], "/", 1))
    right_of_pairs = paired_neighborhoods.withColumn(
        "neighborhood", substring_index(paired_neighborhoods["neighborhood"], "/", -1))
    indicators = indicators.where(~indicators["neighborhood"].like("%/%")) \
        .unionAll(left_of_pairs) \
        .unionAll(right_of_pairs)

    # Use neighborhood areas to convert population density to population count
    areas = neighborhood_boundaries(spark) \
        .withColumn("area", calculate_area("polygon")) \
        .drop("polygon")
    indicators = indicators.join(areas, "neighborhood")
    indicators = indicators.withColumn("population", indicators["population_density"] * indicators["area"]) \
        .withColumn("population_day", indicators["population_density_day"] * indicators["area"]) \
        .drop("area")

    return indicators
def main(business_file, postcode_file, ethnicity_file):
    # load files
    df_business = spark.read.parquet(business_file)
    df_postcode = spark.read.csv(postcode_file, header=True)
    df_ethnicity = spark.read.csv(ethnicity_file, header=True)

    # filter for businesses in toronto
    df_toronto = df_business.where("City like '%Toronto%'")

    # Combine external wellbeing datasets with the yelp datasets and pre-process
    # NOTE: df_wellbeing is assumed to be loaded elsewhere; it is not defined in this snippet
    df_join = df_wellbeing.join(df_postcode, on=['Neighbourhood'], how='left')
    df_join = df_join.drop('Combined Indicators', 'Borough')

    # strip stray spaces from the ethnicity column names
    new_cols_ethn = [c.strip(' ') for c in df_ethnicity.columns]
    old_cols_ethn = df_ethnicity.schema.names
    df_ethnicity = reduce(
        lambda df_ethnicity, idx: df_ethnicity.withColumnRenamed(
            old_cols_ethn[idx], new_cols_ethn[idx]),
        range(len(old_cols_ethn)), df_ethnicity)

    df_indian = df_toronto.where("Categories like '%Indian%'")
    df_ethnicity = df_ethnicity.join(df_postcode, on=['Neighbourhood'], how='left')
    df_ethnicity_small = df_ethnicity.select('Neighbourhood', 'Total Population',
                                             'South Asian', 'Postcode')

    # normalize each ethnicity count by total population
    df_ethn_norm = df_ethnicity_small
    cols = [
        'Chinese', 'South Asian', 'Black', 'Filipino', 'Latin American',
        'Southeast Asian', 'Arab', 'West Asian', 'Korean', 'Japanese',
        'Not a Visible Minority'
    ]
    for field in df_ethnicity_small.columns:
        if field in cols:
            df_ethn_norm = df_ethn_norm.withColumn(
                field, col(field) / col("Total Population"))

    # the first token of the postal code (before the space) is the forward sortation area
    df_indian_ethn = df_indian.withColumn(
        "PostCode",
        functions.substring_index(col("PostalCode"), " ", 1)) \
        .join(df_ethn_norm, on='PostCode', how='left')

    df_ind_eth_sort = df_indian_ethn.orderBy('BusinessStars', ascending=False) \
        .select('BusinessID', 'Name', 'Latitude', 'Longitude',
                'BusinessStars', 'Neighbourhood', 'South Asian')
    df_ind_eth_sort.coalesce(1).write.csv('df_ind_eth_sort.csv')
def get_column_spec(
    self,
    source_df: Optional[DataFrame],
    current_column: Optional[Column],
    parent_columns: Optional[List[Column]],
) -> Column:
    column_spec = substring_index(
        self.column.get_column_spec(
            source_df=source_df,
            current_column=current_column,
            parent_columns=parent_columns,
        ),
        self.delimiter,
        self.delimiter_count,
    )
    return column_spec
def reformat_v1_0(flight, pqFolder, pqFileName):
    """
    Read in the original v1.0 dataframe and save as a new parquet file compatible with v1.1
    @params:
        flight     - Required : original v1.0 data (Spark DataFrame)
        pqFolder   - Required : folder to save the parquet files into (Str)
        pqFileName - Required : parquet file name (Str)
    """
    flight2 = (flight.withColumn('stayDays', correct_stay_days_UDF(col('trip'), col('stay_days')))
               .drop('stay_days')
               .withColumnRenamed('start_date', 'depDate')
               .withColumn('depDate', to_date('depDate'))
               .selectExpr('*', 'date_add(depDate, stayDays) as retDate')  # this is when the return trip starts, might arrive a day later
               .withColumnRenamed('from_city_name', 'fromCity')
               .withColumnRenamed('to_city_name', 'toCity')
               .withColumnRenamed('search_date', 'searchDate')
               .withColumn('searchDate', to_date('searchDate'))
               .withColumnRenamed('company', 'airlineName')
               .withColumnRenamed('dep_time', 'departureTime')
               .withColumnRenamed('arr_time', 'arrivalTime')
               # "XhYm" durations: hours are left of 'h', minutes sit between 'h' and 'm'
               .withColumn('duration_h', split(flight.duration, 'h').getItem(0))
               .withColumn('duration_m', F.substring_index(split(flight.duration, 'h').getItem(1), 'm', 1))
               # .withColumn('duration', F.struct(col('duration_h'), col('duration_m')))
               .withColumn('duration_m', (col('duration_h') * 60 + col('duration_m')))
               .drop('duration', 'duration_h', 'flight_number')
               .withColumnRenamed('price_code', 'currencyCode')
               .withColumnRenamed('stop', 'stops')
               .withColumn('stops', col('stops').cast('byte'))
               .withColumn('stop_info', split(col('stop_info'), ';'))
               # .withColumn('stop_duration', take_all_duration_UDF(col('stop_info')))
               .withColumn('noOfTicketsLeft', correct_tickets_left_UDF('ticket_left'))
               .withColumn('noOfTicketsLeft', col('noOfTicketsLeft').cast('byte'))
               .drop('ticket_left')
               .withColumnRenamed('table_name', 'tableName')
               .withColumn('task_id', col('task_id').cast('long'))
               .withColumn('span_days', col('span_days').cast('integer'))
               .select('price', 'version', 'searchDate', 'tableName', 'task_id', 'currencyCode',
                       'fromCity', 'toCity', 'trip', 'depDate', 'retDate', 'stayDays',
                       'departureTime', 'arrivalTime', 'airlineName', 'duration_m', 'flight_code',
                       'plane', 'stops', 'noOfTicketsLeft', 'airline_code', 'airline_codes',
                       'stop_info', 'span_days', 'power', 'video', 'wifi')  # 'stop_duration',
               )
    flight2.repartition(1).write.parquet(os.path.join(pqFolder, pqFileName))
def test_auto_mapper_substring_by_delimiter(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, 'Qureshi', 'Imran', "1970-01-01"),
            (2, 'Vidal', 'Michael', "1970-02-02"),
        ],
        ['member_id', 'last_name', 'first_name', "date_of_birth"]
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(
        my_column=A.substring_by_delimiter(A.column("last_name"), "s", 1)
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert str(sql_expressions["my_column"]) == str(
        substring_index(col("b.last_name"), "s", 1).alias("my_column")
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    # "Qureshi" is truncated at the first "s" -> "Qure"; "Vidal" has no "s", so it is returned whole
    assert result_df.where("member_id == 1").select("my_column").collect()[0][0] == "Qure"
    assert result_df.where("member_id == 2").select("my_column").collect()[0][0] == "Vidal"
def main(business_data, labour_data, postcode_data):
    df_business = spark.read.parquet(business_data)
    df_labour = spark.read.csv(labour_data, header=True)
    df_postcode = spark.read.csv(postcode_data, header=True)

    df_toronto = df_business.where("City like '%Toronto%'")

    # Strip spaces from columns
    new_cols_lb = [c.strip(' ') for c in df_labour.columns]
    old_cols_lb = df_labour.schema.names
    df_labour = reduce(
        lambda df_labour, idx: df_labour.withColumnRenamed(
            old_cols_lb[idx], new_cols_lb[idx]),
        range(len(old_cols_lb)), df_labour)

    df_labour = df_labour.join(df_postcode, on=['Neighbourhood'], how='left')
    df_labour = df_labour.drop('CombinedIndicators', 'Borough', 'TotalPopulation')

    df_lb_norm = df_labour
    df_lb_norm = df_lb_norm.withColumn(
        "LabourForceCategory",
        df_lb_norm["LabourForceCategory"].cast(IntegerType()))
    df_lb_norm = df_lb_norm.withColumn(
        "InLabourForce",
        df_lb_norm["InLabourForce"].cast(IntegerType()))

    # the first token of the postal code (before the space) is the forward sortation area
    df_lb = df_toronto.withColumn("PostCode", functions.substring_index(col("PostalCode"), " ", 1)) \
        .join(df_lb_norm, on='PostCode', how='left')

    df_lb_pandas = df_lb.toPandas()
    df_lb_pandas = df_lb_pandas.dropna()  # neighbourhoods not present in the toronto data are dropped
    df_lb_pandas['ratio_emply'] = df_lb_pandas['InLabourForce'] / df_lb_pandas['LabourForceCategory']

    df_lb_pandas_emply = df_lb_pandas[[
        'BusinessID', 'Neighbourhood', 'BusinessStars', 'ratio_emply'
    ]]
    df_lb_pandas_emply = df_lb_pandas_emply.astype({'BusinessStars': 'double'})
    df_group_lb = df_lb_pandas_emply.groupby('Neighbourhood').mean()
    df_group_lb.to_csv('df_lb_pandas_emply')
def variants_from_vcf(vcf):
    """
    Given a VCF file in a data frame, extract the first 9 variant columns and give them
    unique identifiers. Include genotype columns as an array parsed out with a pandas udf
    """
    # Get the main data and put a unique index on each variant
    maindata = vcf.filter(~vcf.data.startswith('#'))

    # substring_index matches a literal delimiter (VCF is tab-delimited),
    # while split takes a regular expression
    splitdata = maindata.select(
        "filename",
        f.split(f.substring_index('data', "\t", 9), "[\t ]+").alias("split_data"),
        maindata.lineid.alias("VAR_IDX"))

    # Now pull out the columns one at a time, casting non-strings to the appropriate type.
    # Split out INFO and FORMAT here
    variant = splitdata.select(
        "filename", "VAR_IDX",
        f.element_at(splitdata.split_data, 1).alias("CHR"),
        f.element_at(splitdata.split_data, 2).cast(IntegerType()).alias("POS"),
        f.element_at(splitdata.split_data, 3).alias("ID"),
        f.element_at(splitdata.split_data, 4).alias("REF"),
        f.element_at(splitdata.split_data, 5).alias("ALT"),
        f.element_at(splitdata.split_data, 6).cast(FloatType()).alias("QUAL"),
        f.element_at(splitdata.split_data, 7).alias("FILTER"),
        f.split(f.element_at(splitdata.split_data, 8), ";").alias("INFO"),
        f.split(f.element_at(splitdata.split_data, 9), ":").alias("FORMAT"))

    return variant
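Note that `substring_index` matches a literal delimiter while `split` takes a Java regex, which is why the version above uses `"\t"` for the former and keeps the regex for the latter. A toy check on a fabricated VCF-like line:

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

spark = SparkSession.builder.getOrCreate()
row = spark.createDataFrame(
    [("chr1\t123\trs1\tA\tG\t50\tPASS\tDP=10\tGT\t0/1\t1/1",)], ["data"])

# keep the first 9 tab-separated fields, then split them into an array
row.select(
    f.split(f.substring_index("data", "\t", 9), "\t").alias("first9")
).show(truncate=False)
```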
def user_dimension(user_df):
    # cleaning user data and dropping unnecessary columns
    user_dim = user_df.withColumn("user_friend_count", F.size(F.split(user_df.friends, ','))) \
        .withColumn('year', F.year(user_df.yelping_since)) \
        .withColumn('month', F.month(user_df.yelping_since)) \
        .withColumnRenamed("fans", "user_fans") \
        .withColumnRenamed("review_count", "user_review_count") \
        .withColumnRenamed("useful", "useful_vote_cnt") \
        .withColumnRenamed("funny", "funny_vote_cnt") \
        .withColumnRenamed("cool", "cool_vote_cnt") \
        .withColumnRenamed("average_stars", "user_average_stars") \
        .withColumn("user_elite_year_cnt",
                    F.when(F.length(user_df.elite) == 0, 0)
                    .otherwise(F.size(F.split(user_df.elite, ',')))) \
        .withColumn("is_user_elite",
                    # the last element of the comma-separated elite list is the most recent elite year
                    F.when(F.substring_index(user_df.elite, ',', -1) == currentYear - 1, True)
                    .otherwise(False)) \
        .select("user_id", "name", "user_review_count", "yelping_since", "user_friend_count",
                "useful_vote_cnt", "funny_vote_cnt", "cool_vote_cnt", "user_average_stars",
                "user_elite_year_cnt", "is_user_elite", "user_fans", "year", "month")
    return user_dim
shutil.rmtree(imagePath, ignore_errors=True)
shutil.rmtree(deltaPath, ignore_errors=True)

request.urlretrieve(imageGzipUrl, imageGzipPath)
shutil.unpack_archive(imageGzipPath, imagePath)

# read the images from the flowers dataset
images = spark.read.format("binaryFile").\
    option("recursiveFileLookup", "true").\
    option("pathGlobFilter", "*.jpg").\
    load(imagePath)

# Knowing the file path, extract the flower type and filename using substring_index.
# Remember, Spark dataframes are immutable; here we are just reusing the images dataframe
images = images.withColumn("flowerType_filename", fn.substring_index(images.path, "/", -2))
images = images.withColumn("flowerType", fn.substring_index(images.flowerType_filename, "/", 1))
images = images.withColumn("filename", fn.substring_index(images.flowerType_filename, "/", -1))
images = images.drop("flowerType_filename")
images.show()

# Select the columns we want to write out
df = images.select("path", "content", "flowerType", "filename").repartition(4)
df.show()

# Write out the delta table to the given path; this will overwrite any table that is currently there
df.write.format("delta").mode("overwrite").save(deltaPath)

# Read the delta table that was just written
dfDelta = spark.read.format("delta").load(deltaPath)
def get_spark_commits(date_str):
    # 2.1: Change the github_api_url so that it queries with the input date
    # Convert the date string into date format
    fromDate = datetime.strptime(date_str, '%Y%m%d').date()
    toDate = fromDate + timedelta(days=1)

    # Construct the Git URL to fetch JSON object(s)
    request = 'https://api.github.com/repos/apache/spark/commits?since=' + str(
        fromDate) + 'T00:00:00Z&until=' + str(toDate) + 'T00:00:00Z'
    print('Beginning file download from: ' + request)

    import urllib.request, urllib.error
    try:
        # Get the json object(s) with the Git URL
        response = urllib.request.urlopen(request)
    except urllib.error.HTTPError as e:
        # Return code error (e.g. 404, 501, ...)
        print('HTTPError: {}'.format(e.code))
    except urllib.error.URLError as e:
        # Not an HTTP-specific error (e.g. connection refused)
        print('URLError: {}'.format(e.reason))
    else:
        # 200
        sourceJSON = response.read()

        import pandas as pd

        # response.read() returns a bytes object, which is just a sequence of bytes.
        # It needs to be decoded first, because Python doesn't know what the bytes represent.
        jsonData = json.loads(sourceJSON.decode('utf-8'))

        from pyspark import SparkContext

        # Create the Spark context directly by passing the config parameters
        sc = SparkContext("local[*]", "PySpark Electronic Arts Test")

        from pyspark.sql import SparkSession
        spark = SparkSession(sc)

        # Create a Spark DataFrame from a Pandas DataFrame using Arrow.
        # A Pandas DataFrame is not distributed; it exists on the driver node only.
        # In order to achieve parallelism we need to distribute the data across the cluster,
        # and a Spark DataFrame does exactly that.
        source_df = spark.createDataFrame(pd.DataFrame(jsonData))
        source_df.printSchema()
        source_df.show()

        from pyspark.sql.types import DateType, IntegerType

        # Create a new DataFrame by selecting only a few key/value pairs from the original JSON object(s)
        jsonDF = source_df.select(source_df.sha.alias('sha'),
                                  source_df.author.login.alias('login_name'),
                                  source_df.committer.id.cast(IntegerType()).alias('commiter_id'),
                                  F.concat_ws(' ', F.map_values(source_df.commit.message)).alias('message'),
                                  source_df.commit.author.date.cast(DateType()).alias('commit_date'),
                                  source_df.commit.author.email.alias('email'),
                                  # everything after the '@' is the company domain
                                  F.substring_index(source_df.commit.author.email, '@', -1).alias('email_company'),
                                  source_df.url.alias('url'))

        # Cache this DataFrame in memory as it will be used multiple times below
        jsonDF.cache()
        jsonDF.printSchema()
        jsonDF.show()

        # Set parameters for the PostgreSQL database connection
        url_connect = "jdbc:postgresql://pa1postgreserver.postgres.database.azure.com:5432/postgres?"
        commitTable = "F_SPARK_COMMITS"
        authorTable = "F_SPARK_AUTHORS"
        mode = "append"
        db_properties = {
            "user": "******",
            "password": "******",
            "driver": "org.postgresql.Driver"
        }

        # Read the Authors table from the PostgreSQL DB into a Spark DataFrame
        readAuthorTableDF = spark.read.jdbc(url=url_connect, table=authorTable, properties=db_properties)

        # Check if the Authors table is empty or not.
        # If the table in the db is empty then insert the authors dataframe directly.
        # If the table is not empty, join the 2 author tables and filter out the authors already in the db,
        # inserting only the new author records into the DB table.
        if len(readAuthorTableDF.head(1)) > 0:
            authDF = jsonDF.join(readAuthorTableDF,
                                 jsonDF.login_name == readAuthorTableDF.login_name,
                                 how='left') \
                .filter(readAuthorTableDF.login_name.isNull()) \
                .select(jsonDF.login_name,
                        jsonDF.commiter_id,
                        jsonDF.email,
                        jsonDF.email_company)
        else:
            authDF = jsonDF.select(jsonDF.login_name,
                                   jsonDF.commiter_id,
                                   jsonDF.email,
                                   jsonDF.email_company)

        authDF.write.jdbc(url=url_connect, table=authorTable, mode="append", properties=db_properties)
        authDF.show()

        # Re-read the Authors table after inserting the new authors
        readAuthorTableDF = spark.read.jdbc(url=url_connect, table=authorTable, properties=db_properties)

        # Read the Commits table from the PostgreSQL DB into a Spark DataFrame before the update
        readCommitTableDF = spark.read.jdbc(url=url_connect, table=commitTable, properties=db_properties)

        # Create a DataFrame by joining the DataFrame created from the source JSON with the authors table contents.
        # Do an inner join with the authors db table DataFrame to fetch only the records
        # that have a commit id in the authors table
        commitDF = jsonDF.join(readAuthorTableDF,
                               jsonDF.commiter_id == readAuthorTableDF.commiter_id,
                               how='inner')

        from pyspark.sql import Row

        # Check if the Commits table is empty or not.
        # If the table in the db is empty then insert the commits dataframe directly.
        # If the table is not empty then check the last executed date in the commits db table
        # and filter out all the records with the current date as the last executed datetime.
        # This step makes the process idempotent.
        # Insert only the new commit records into the Commits DB table.
        if len(readCommitTableDF.head(1)) > 0:
            maxDate = readCommitTableDF.orderBy(
                readCommitTableDF.creation_date.desc()).head(1)[0].creation_date
            commitDF = commitDF.filter(F.current_timestamp().cast(DateType()) != maxDate) \
                .select(jsonDF.sha,
                        jsonDF.url,
                        jsonDF.message,
                        jsonDF.commit_date,
                        readAuthorTableDF.author_id,
                        readAuthorTableDF.creation_date)
        else:
            commitDF = commitDF.select(jsonDF.sha,
                                       jsonDF.url,
                                       jsonDF.message,
                                       jsonDF.commit_date,
                                       readAuthorTableDF.author_id,
                                       readAuthorTableDF.creation_date)

        commitDF.show()
        commitDF.write.jdbc(url=url_connect, table=commitTable, mode="append", properties=db_properties)
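The `email_company` column a few steps back relies on `substring_index(email, '@', -1)` keeping everything after the last `'@'`. A self-contained check of that idiom (the sample addresses are invented):

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
emails = spark.createDataFrame(
    [("alice@example.com",), ("bob@dev.example.org",)], ["email"])

# a count of -1 keeps everything after the last '@', i.e. the domain
emails.select(F.substring_index("email", "@", -1).alias("email_company")).show()
```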
def augment(df):
    if 'addons' in df.columns:
        df = df.select(['*'] +
                       [create_get_addon_name_udf(addon)(df['addons']).alias(addon.replace('.', '__DOT__'))
                        for addon in all_addons] +
                       [create_get_addon_version_udf(addon)(df['addons']).alias(addon.replace('.', '__DOT__') + '-version')
                        for addon in all_addons])

    if 'json_dump' in df.columns:
        df = df.select(['*'] +
                       [functions.array_contains(df['json_dump']['modules']['filename'], module_name).alias(module_id)
                        for module_id, module_name in module_ids.items()])

    if 'plugin_version' in df.columns:
        df = df.withColumn('plugin', df['plugin_version'].isNotNull())

    if 'app_notes' in df.columns:
        df = df.select(['*'] +
                       [(functions.instr(df['app_notes'], app_note.replace('__DOT__', '.')) != 0).alias(app_note)
                        for app_note in all_app_notes] +
                       [(functions.instr(df['app_notes'], 'Has dual GPUs') != 0).alias('has dual GPUs')])

    if 'graphics_critical_error' in df.columns:
        df = df.select(['*'] +
                       [(functions.instr(df['graphics_critical_error'], error.replace('__DOT__', '.')) != 0).alias(error)
                        for error in all_gfx_critical_errors])

    if 'total_virtual_memory' in df.columns and 'platform_version' in df.columns and 'platform' in df.columns:
        def get_arch(total_virtual_memory, platform, platform_version):
            if total_virtual_memory:
                try:
                    if int(total_virtual_memory) < 2684354560:
                        return 'x86'
                    else:
                        return 'amd64'
                except (ValueError, TypeError):
                    return 'unknown'
            elif platform == 'Mac OS X':
                return 'amd64'
            else:
                if 'i686' in platform_version:
                    return 'x86'
                elif 'x86_64' in platform_version:
                    return 'amd64'

        get_arch_udf = functions.udf(get_arch, StringType())
        df = df.withColumn('os_arch', get_arch_udf(df['total_virtual_memory'], df['platform'], df['platform_version']))

    if 'adapter_driver_version' in df.columns:
        def get_driver_version(adapter_vendor_id, adapter_driver_version):
            # XXX: Sometimes we have a driver which is not actually made by the vendor,
            # in those cases these rules are not valid (e.g. 6.1.7600.16385).
            if adapter_driver_version:
                if adapter_vendor_id == '0x8086' or adapter_vendor_id == '8086':
                    return adapter_driver_version[adapter_driver_version.rfind('.') + 1:]
                elif adapter_vendor_id == '0x10de' or adapter_vendor_id == '10de':
                    return adapter_driver_version[-6:-5] + adapter_driver_version[-4:-2] + '.' + adapter_driver_version[-2:]
                # TODO: AMD?
                return adapter_driver_version

        get_driver_version_udf = functions.udf(get_driver_version, StringType())
        df = df.withColumn('adapter_driver_version_clean',
                           get_driver_version_udf(df['adapter_vendor_id'], df['adapter_driver_version']))

    if 'cpu_info' in df.columns:
        # cpu_info looks like "<model> | <core count>"
        df = df.withColumn('CPU Info', functions.substring_index(df['cpu_info'], ' | ', 1))
        df = df.withColumn('Is Multicore', functions.substring_index(df['cpu_info'], ' | ', -1) != '1')

    if 'dom_ipc_enabled' in df.columns:
        df = df.withColumnRenamed('dom_ipc_enabled', 'e10s_enabled')

    if 'memory_ghost_windows' in df.columns:
        df = df.withColumn('ghost_windows > 0', df['memory_ghost_windows'] > 0)

    if 'memory_top_none_detached' in df.columns:
        df = df.withColumn('top(none)/detached > 0', df['memory_top_none_detached'] > 0)

    return df
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName('fifth_exercise').getOrCreate()

df = spark.read.csv("price_paid_records.csv", header=True)

# the year is everything before the first "-" in the transfer date
df = df.select(
    F.col('`Date of Transfer`'),
    F.substring_index(F.col('`Date of Transfer`'), "-", 1).alias('year'))
df = df.select('year', F.split('`Date of Transfer`', "-")[1].alias('month'))
df = df.withColumn('month', df.month.cast('integer'))

# for each year, keep the month with the highest transaction count
df1 = df.groupBy('year', 'month').count().groupBy('year').max('count').alias('counts')
df = df.groupBy('year', 'month').count().alias('original')
df = df.join(df1, (F.col('original.count') == F.col('counts.max(count)')) &
                  (F.col('original.year') == F.col('counts.year')))
df = df.select('original.year', 'original.month')
df.show()
def read_spark_df_from_msexchange_data_store(self, **args):
    url = args["hbase_url"]
    r = requests.get(url)

    # Converting api data into json
    try:
        d = r.json()
    except ValueError:
        print("Invalid URL")

    # Checking for data availability
    if len(d) == 0:
        print("There are no events to process. Please enter a different search criteria in the url.")

    # Converting API data into a Spark DataFrame
    print("Reading the data from profiler...")
    spark = SparkSession.builder.appName('mseapi').enableHiveSupport().getOrCreate()
    sc = spark.sparkContext
    tsRDD = sc.parallelize(d)
    df_mail = spark.read.option('multiline', "true").json(tsRDD)

    total_evt_count = df_mail.count()
    print("Total number of records: " + str(total_evt_count))

    if total_evt_count > 0:
        mail_len = f.udf(lambda s: len(s), LongType())
        mail_sum = f.udf(lambda s: sum(s), LongType())

        # if mail_size starts with ',', keep only what follows the last comma; then strip spaces,
        # collect the sizes per mail_id, and derive count / total / average email size
        df_mail_grp = df_mail.filter(f.length(f.trim(df_mail["mail_size"])) > 0) \
            .withColumn("check", f.when(f.instr(df_mail["mail_size"], ',') == 1,
                                        f.substring_index(df_mail["mail_size"], ',', -1))
                        .otherwise(df_mail["mail_size"])) \
            .withColumn("ext_sndrs", df_mail["ext_sndrs"].cast(LongType())) \
            .withColumn("mail_size", f.regexp_replace('check', ' ', '')) \
            .groupBy(["mail_id"]).agg(f.split(f.concat_ws(",", f.collect_list("mail_size")), ',')
                                      .cast(ArrayType(IntegerType())).alias("email_size"),
                                      f.sum("ext_sndrs").alias("ext_sndrs")) \
            .withColumn("no_of_emails", mail_len("email_size")) \
            .withColumn("tot_email_size", mail_sum("email_size")) \
            .withColumn("avg_email_size", f.round(f.col("tot_email_size") / f.col("no_of_emails"), 4)) \
            .drop("email_size")

        df_mail_grp.show(3)
        return df_mail_grp
    else:
        schema = StructType([])
        sqlContext = SQLContext(sc)
        sdf = sqlContext.createDataFrame(sc.emptyRDD(), schema)
        return sdf
os.remove(imageGzipPath)
shutil.rmtree(imagePath, ignore_errors=True)
shutil.rmtree(deltaPath, ignore_errors=True)

request.urlretrieve(imageGzipUrl, imageGzipPath)
shutil.unpack_archive(imageGzipPath, imagePath)

# read the images from the flowers dataset
images = spark.read.format("binaryFile").\
    option("recursiveFileLookup", "true").\
    option("pathGlobFilter", "*.jpg").\
    load(imagePath)

# Knowing the file path, extract the flower type and filename using substring_index.
# Remember, Spark dataframes are immutable; here we are just reusing the images dataframe
images = images.withColumn("flowerType_filename", fn.substring_index(images.path, "/", -2))
images = images.withColumn("flowerType", fn.substring_index(images.flowerType_filename, "/", 1))
images = images.withColumn("filename", fn.substring_index(images.flowerType_filename, "/", -1))
images = images.drop("flowerType_filename")
images.show()

# Select the columns we want to write out
df = images.select("path", "content", "flowerType", "filename").repartition(4)
df.show()

# Write out the delta table to the given path; this will overwrite any table that is currently there
df.write.format("delta").mode("overwrite").save(deltaPath)

# Read the delta table that was just written
dfDelta = spark.read.format("delta").load(deltaPath)
dfDelta.show()
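The chained counts above (-2, then 1 and -1) peel the path apart from the right: the last two path segments first, then the directory and file name out of that pair. A standalone sketch with a made-up path:

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as fn

spark = SparkSession.builder.getOrCreate()
paths = spark.createDataFrame(
    [("/data/flowers/daisy/100080576_f52e8ee070_n.jpg",)], ["path"])

paths.select(
    fn.substring_index("path", "/", -2).alias("last_two"),      # "daisy/100080576_f52e8ee070_n.jpg"
    fn.substring_index(fn.substring_index("path", "/", -2),
                       "/", 1).alias("flowerType"),             # "daisy"
    fn.substring_index("path", "/", -1).alias("filename"),      # "100080576_f52e8ee070_n.jpg"
).show(truncate=False)
```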
    avg("col_2").over(spec)).withColumn(
        "rank_val", rank().over(spec)).withColumn(
        "dense_rank_val", dense_rank().over(spec)).show()

# COMMAND ----------

from pyspark.sql.functions import col, substring, substring_index, instr, split, concat_ws, repeat
from pyspark.sql.types import StringType

# substring
# orders_new_col.show()
func_df = orders_new_col.select(
    'order_status',
    substring('order_status', 1, 2).alias("sub"),
    substring_index('order_status', "E", -3).alias("sub_ind")).select(
        "*",
        instr('sub_ind', 'E').alias("instr_val"),
        split('order_status', "_")[0].alias("split_val")).select(
            "*",
            concat_ws("|", "order_status", "sub").alias("concat_val"))

func_df.withColumn("repeat_val", repeat("instr_val", 3)).select(
    "*",
    concat_ws("|", *func_df.columns).alias("conc_ws")).show(truncate=False)

# orders_new_col.select(substring_index('order_status', "_", 2)).show()
# list_1 = ["col_1", "col_2"]
# df_1 = spark.createDataFrame(list_1, StringType())
# df_1.select(substring_index("value", "_", 1)).show()

# COMMAND ----------
from pyspark.sql import *
from pyspark.sql.functions import regexp_extract, substring_index

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .master("local[3]") \
        .appName("LogFileDemo") \
        .getOrCreate()

    file_df = spark.read.text("data/apache_logs.txt")
    file_df.printSchema()

    log_reg = r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\S+) "(\S+)" "([^"]*)'

    logs_df = file_df.select(regexp_extract('value', log_reg, 1).alias('ip'),
                             regexp_extract('value', log_reg, 4).alias('date'),
                             regexp_extract('value', log_reg, 6).alias('request'),
                             regexp_extract('value', log_reg, 10).alias('referrer'))

    # keep only the scheme and host of the referrer (everything before the third "/")
    logs_df \
        .where("trim(referrer) != '-' ") \
        .withColumn("referrer", substring_index("referrer", "/", 3)) \
        .groupBy("referrer") \
        .count() \
        .show(100, truncate=False)
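Trimming the referrer with a count of 3 works because the third "/" in a URL such as http://host/path comes right after the host, so substring_index keeps just the scheme and host. A quick standalone check with an invented URL:

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import substring_index

spark = SparkSession.builder.getOrCreate()
refs = spark.createDataFrame(
    [("http://example.com/docs/page.html",)], ["referrer"])

# a count of 3 keeps "http://example.com" -- scheme + host, dropping the path
refs.select(substring_index("referrer", "/", 3).alias("site")).show(truncate=False)
```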