def get_last_user_event_value( target_column: str, user_column: str = "user_id" ) -> Column: return F.last(F.col(target_column)).over( Window() .partitionBy(user_column) .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing) )
def add_paths(df) -> Dataset: df = index_state(df) user_window = Window().partitionBy(USER_ID) w = user_window.orderBy(F.col(EVENT_TIME).asc()) df = df.withColumn(PATH, F.collect_list(STATE_INDEX).over(w)) last_date_col = "last_date" df = df.withColumn(last_date_col, F.max(EVENT_TIME).over(user_window)) df = df.where(df[EVENT_TIME] == df[last_date_col]) return df
def filter_events(self, input_data: DataFrame) -> DataFrame: self._log_info("filtering events") user_window = Window().partitionBy("user_id") article_not_null = F.col("article_id").isNotNull() wiki_not_null = F.col("wiki_id").isNotNull() more_than_one_visit = F.col("visits") > 1 return (input_data.where(article_not_null & wiki_not_null).withColumn( "visits", func.count_user_events()).where(more_than_one_visit).drop("visits") )
def sql_window_api(spark): print("Start running Window and WindowSpec API") sc = spark.sparkContext sqlContext = SQLContext(sc) # orderBy, partitionBy, rowsBetween, rangeBetween df = spark.createDataFrame([("Alice", 2, 50), ("Alice", 3, 50), ("Alice", 2, 60), ("Alice", 3, 60), ("Alice", 2, 70), ("Bob", 3, 50), ("Bob", 3, 60), ("Bob", 4, 50)], ["name", "age", "height"]) window = Window().partitionBy("name") df.withColumn("mean", mean("height").over(window)).show() window = Window().partitionBy("name").orderBy("height").rangeBetween(-4, 0) df.withColumn("mean", mean("height").over(window)).show() window = Window().partitionBy("name").orderBy("height").rowsBetween( Window.currentRow, 1) df.withColumn("mean", mean("height").over(window)).show() print("Finish running Window and WindowSpec API")
def find_edge_events(self, input_data: DataFrame, user_column: str = "user_id") -> DataFrame: user_window = (Window().partitionBy(user_column).rowsBetween( Window.unboundedPreceding, Window.unboundedFollowing)) return (input_data.orderBy("timestamp", ascending=True).select( "*", F.array(F.col("article_id"), F.col("wiki_id")).alias("article_wiki_ids"), ).select( "user_id", F.first("article_wiki_ids").over(user_window).alias("first_event"), F.last("article_wiki_ids").over(user_window).alias("last_event"), ))
import pyspark.sql.functions as f def quarterly_asset_change(w): return (f.col('TOTAL ASSETS') - f.lag('TOTAL ASSETS', 1).over(w)) / f.lag('TOTAL ASSETS', 1).over(w) if __name__ == '__main__': spark = SparkSession.builder \ .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.11:2.2.2') \ .getOrCreate() banks_df = spark.read.format('com.mongodb.spark.sql.DefaultSource').option('uri', 'mongodb://127.0.0.1:27017/precision_lender.banks').load() banks_df = banks_df.select('IDRSSD', 'Financial Institution Name').distinct() balance_sheet_df = spark.read.format('com.mongodb.spark.sql.DefaultSource').option('uri', 'mongodb://127.0.0.1:27017/precision_lender.balance_sheet').load() w = Window().partitionBy('IDRSSD').orderBy('Reporting Period') average_quarterly_asset_growth = balance_sheet_df \ .withColumn('Quarterly Percent Change', quarterly_asset_change(w)) \ .groupBy('IDRSSD').agg(f.mean('Quarterly Percent Change').alias('Mean Quarterly Percent Change')) df = banks_df.join(average_quarterly_asset_growth, on=['IDRSSD']) \ .select('Financial Institution Name', 'Mean Quarterly Percent Change') \ .sort(f.col('Mean Quarterly Percent Change').desc()) df.show(truncate=False)
def count_user_events(user_column: str = "user_id") -> Column: user_window = Window().partitionBy(user_column) return F.count("*").over(user_window)
def process_log_data(spark, input_data, output_data): """ Reads log dataset from S3 and transfroms it into users, time & songplays tables, finally these tables are written back to S3. Args: spark : Spark Session object input_data (str): input S3 bucket path output_data (str): output S3 bucket path """ # Get filepath to log data file log_data_path = os.path.join(input_data, 'log-data/*/*/*.json') print("{}: start processing {}".format(log_prefix, log_data_path)) # Define the schema for log data files schema_log = StructType([ StructField('artist', StringType(), True), StructField('auth', StringType(), True), StructField('firstName', StringType(), True), StructField('gender', StringType(), True), StructField('itemInSession', LongType(), True), StructField('lastName', StringType(), True), StructField('length', DoubleType(), True), StructField('level', StringType(), True), StructField('location', StringType(), True), StructField('method', StringType(), True), StructField('page', StringType(), True), StructField('registration', StringType(), True), StructField('sessionId', LongType(), True), StructField('song', StringType(), True), StructField('status', LongType(), True), StructField('ts', LongType(), True), StructField('userAgent', StringType(), True), StructField('userId', StringType(), True) ]) # Read log data file df_log = spark.read.json(log_data_path, schema=schema_log) # Repartition df_log = df_log.repartition(num_partitions) # Persist logs dataframe for reuse df_log.persist(StorageLevel.MEMORY_AND_DISK) # Filter by actions for song plays df_log_nextSong = df_log.filter( df_log.userId.isNotNull()).filter(df_log.page == 'NextSong') # Extract columns for users table users_latest_state = df_log_nextSong.groupBy('userId').max('ts') \ .select("userId", col("max(ts)").alias("ts")) users_table = df_log_nextSong.join(users_latest_state, on = ['userId','ts']) \ .select(col('userId').alias('user_id'), \ col('firstName').alias('first_name'), \ col('lastName').alias('last_name'), \ 'gender', 'level') # Write users table to parquet files users_table.write.parquet(os.path.join(output_data, 'users')) print("{}: users table is written into S3".format(log_prefix)) # Create datetime column from original timestamp column convert_ms_to_s = udf(lambda x: x // 1000, LongType()) df_timestamp = df_log_nextSong.select( col('ts').alias('start_time')).dropDuplicates() df_timestamp = df_timestamp.withColumn( "datetime", from_unixtime(convert_ms_to_s(df_timestamp.start_time))) # Extract columns to create time table time_table = df_timestamp.withColumn("hour", hour("datetime")) \ .withColumn("day", dayofmonth("datetime")) \ .withColumn("week", weekofyear("datetime")) \ .withColumn("month", month("datetime")) \ .withColumn("year", year("datetime")) \ .withColumn("weekday", dayofweek("datetime")) \ .drop('datetime') # Write time table to parquet files partitioned by year and month time_table.write.partitionBy('year', 'month').parquet( os.path.join(output_data, 'time')) print("{}: time table is written into S3".format(log_prefix)) # Read in songs & artists tables to use for songplays table songs_table = spark.read.parquet(os.path.join(output_data, 'songs')) artists_table = spark.read.parquet(os.path.join(output_data, 'artists')) # Extract columns from joined song and log datasets to create songplays table songplays_table = df_log_nextSong.join(songs_table, df_log_nextSong.song == songs_table.title) \ .join(artists_table, df_log_nextSong.artist == artists_table.name) \ .join(time_table, df_log_nextSong.ts == time_table.start_time) \ .select(df_log_nextSong.ts.alias('start_time'), \ df_log_nextSong.userId.alias('user_id'), \ df_log_nextSong.level, \ songs_table.song_id, \ artists_table.artist_id, \ df_log_nextSong.sessionId.alias('session_id'), \ df_log_nextSong.location, \ df_log_nextSong.userAgent.alias('user_agent'), \ time_table.year, \ time_table.month ) \ .withColumn('songplay_id', row_number().over(Window().orderBy('song_id'))) # Write songplays table to parquet files partitioned by year and month songplays_table.write.partitionBy('year', 'month').parquet( os.path.join(output_data, 'songplays')) # Unpersist song dataframe df_log.unpersist() print("{}: logs dataset processing is finished".format(log_prefix))
def process_log_data(spark, input_data, output_data, song_df): """ Description: This function can be used to process log-data files from the given input path and transform the data from json files into users, time and songplays spark tables and writing these tables to the given output path as parquet tables. Arguments: spark: SparkSession object. input_data: Path to the input JSON files. output_data: Path to the output directory that stores output parquet tables. song_df: Song data dataframe. Returns: None. """ # get filepath to log data file log_data = input_data + 'log-data/2018/11' # define schema for log data file log_schema = t.StructType([ t.StructField("artist", t.StringType(), True), t.StructField("auth", t.StringType(), True), t.StructField("firstName", t.StringType(), True), t.StructField("gender", t.StringType(), True), t.StructField("itemInSession", t.IntegerType(), True), t.StructField("lastName", t.StringType(), True), t.StructField("length", t.DecimalType(12, 7), True), t.StructField("level", t.StringType(), True), t.StructField("location", t.StringType(), True), t.StructField("method", t.StringType(), True), t.StructField("page", t.StringType(), True), t.StructField("registration", t.DecimalType(16, 2), True), t.StructField("sessionId", t.IntegerType(), True), t.StructField("song", t.StringType(), True), t.StructField("status", t.IntegerType(), True), t.StructField("ts", t.LongType(), True), t.StructField("userAgent", t.StringType(), True), t.StructField("userId", t.StringType(), True) ]) # read log data file using schema df = spark \ .read \ .format("json") \ .schema(log_schema) \ .load(log_data) # filter by actions for song plays df = df \ .filter('page = "NextSong"') # group by userId for unique users users_list = df \ .groupBy('userId') \ .agg(f.max('ts').alias('ts')) # extract columns to create users table users_table = df \ .join(users_list, ['userId', 'ts'], 'inner') \ .select([df.userId.cast(t.IntegerType()).alias('user_id'), col('firstName').alias('first_name'), col('lastName').alias('last_name'), 'gender', 'level']) \ .dropDuplicates() # write users table to parquet files users_output = output_data + 'users' users_table \ .write \ .option("path", users_output) \ .saveAsTable('users', format='parquet') # create timestamp column from original timestamp column df = df \ .withColumn('timestamp', f.from_utc_timestamp((df.ts/1000.0).cast('timestamp'), 'UTC')) # create datetime column from original timestamp column get_datetime = udf(lambda ts: datetime.fromtimestamp(ts / 1000.0), t.TimestampType()) df = df.withColumn('datetime', get_datetime('ts')) # extract columns to create time table time_table = df \ .select([col('datetime').alias('start_time'), dayofmonth(col('datetime')).alias('day'), weekofyear(col('datetime')).alias('week'), month(col('datetime')).alias('month'), year(col('datetime')).alias('year'), dayofweek(col('datetime')).alias('weekday')]) \ .dropDuplicates() # write time table to parquet files partitioned by year and month time_output = output_data + 'time' time_table \ .write \ .partitionBy('year', 'month') \ .option("path", time_output) \ .saveAsTable('time', format='parquet') # join and extract columns from song and log datasets to create songplays table cond = [ df.artist == song_df.artist_name, df.song == song_df.title, df.length == song_df.duration ] songplays_df = df.join(song_df, cond, 'left') songplays_df = songplays_df \ .select(df.datetime.alias('start_time'), df.userId.alias('user_id'), df.level.alias('level'), song_df.song_id.alias('song_id'), song_df.artist_id.alias('artist_id'), df.sessionId.alias('session_id'), df.location.alias('location'), df.userAgent.alias('user_agent'), year(df.datetime).alias('year'), month(df.datetime).alias('month')) w = Window().orderBy(f.lit('A')) songplays_table = songplays_df.withColumn('songplay_id', f.row_number().over(w)) # write songplays table to parquet files partitioned by year and month songplays_output = output_data + 'songplays' songplays_table \ .select(['songplay_id', 'start_time', 'user_id', 'level', 'song_id', 'artist_id', 'session_id', 'location', 'user_agent', 'year', 'month'])\ .write \ .partitionBy('year', 'month') \ .option("path", songplays_output) \ .saveAsTable('songplays', format='parquet')
#Разделить поле title на два отдельных – smartphone_brand и smartphone_model. df1 = df.withColumn('smartphone_brand', F.split(df['title'],' ').getItem(0)) df1 = df1.withColumn('smartphone_brand1', split_col.getItem(F.size(split_col)-1)) df1 = df1.withColumn('smartphone_brand2', split_col.getItem(F.size(split_col)-2)) df1 = df1.drop('seller','ram','memory') df1 = df1.withColumn('smartphone_model',F.expr('substring(title,length(smartphone_brand)+2, length(title)-length(smartphone_brand)-length(smartphone_brand1)-length(smartphone_brand2)-3)')).drop('smartphone_brand2','smartphone_brand1') # я не знал как сделать по-другому - ничего не нашел #Найти средние стоимости моделей смартфонов. df1 = df1.withColumn("price", df1["price"].cast(IntegerType())) df1 = df1.groupby('smartphone_brand','smartphone_model').agg(F.avg('price').alias('smartphone_price')) #Каждой уникальной записи dataframe’а необходимо присвоить свой порядковый smartphone_id, начиная с 1 df1= df1.withColumn("new_column",F.lit("ABC")) w = Window().partitionBy('new_column').orderBy(F.lit('A')) df1 = df1.withColumn("smartphone_id", F.row_number().over(w)).drop("new_column") df1 = df1.select('smartphone_id', 'smartphone_brand', 'smartphone_model', 'smartphone_price') df1.show(10,False) # In[11]: df1.write.mode('overwrite').parquet("phones.parquet") # In[33]:
# ## Data Enrichment & Additional Transformations (Continued...) # In[8]: # Filter - Keep where Playtype in ['Run','Pass'] nfldata2 = nfldata2.filter((nfldata2.PlayType == "Run") | (nfldata2.PlayType == "Pass")) # Derive Date var(s) nfldata2 = nfldata2.withColumn( "month_day", concat(nfldata2["Date"].substr(6, 2), nfldata2["Date"].substr(9, 2)).cast("int")) # Lag (Get previous PlayType) w = Window().partitionBy('GameID', 'Drive').orderBy('GameID', 'Drive', col('TimeSecs').desc()) nfldata2 = nfldata2.withColumn("PlayType_lag", lag("PlayType").over(w)).withColumn( "PlayType_lag", when(isnull('PlayType_lag'), 'FirstPlay').otherwise( col('PlayType_lag'))).orderBy( 'GameID', 'Drive', col('TimeSecs').desc()) # Print Results #nfldata2.select(["GameID","Drive","qtr","down","TimeSecs","PlayType","PlayType_lag","yrdline100","posteam","month_day"]).show(50,False) # Split into "Run" and "Pass" (I want to build two models) nfldata2_run = nfldata2.filter(col('PlayType') == 'Run') nfldata2_pass = nfldata2.filter(col('PlayType') == 'Pass')
.reduceByKey(lambda a, b: a + b) \ # convert the RDD into a DataFrame temporarily used to further convert into a Pandas DataFrame columns = ["Word", "Frequency"] df = word_and_frequency_pairs.toDF(columns) # convert the dataFrame into a Pandas dataframe for easy sorting pandasDataframe = df.toPandas() pandasDataframeSorted = pandasDataframe.sort_values( by=['Frequency', 'Word'], ascending=[False, True]) # convert back to a PySpark DataFrame pySparkDataFrame = spark.createDataFrame(pandasDataframeSorted) # Add Rank (row index) column w = Window().orderBy(F.col("Frequency").desc(), F.col("Word").asc()) pySparkDataFrame = pySparkDataFrame.withColumn("Rank", F.row_number().over(w)) categoriseAndPrintEntities(distinct_words, pySparkDataFrame, 'words') print('----------------------\n') # extract letters from each word - the words are in the RDD created before letters = words.flatMap(lambda word: [character for character in word]) \ .filter(lambda letter: letter != '-') distinct_letters = letters.distinct() # print totals printTotalsInformation(letters, distinct_letters, 'letters')
def analyze(spark: SparkSession, input_file='emissions.parquet', output_file='rasters.parquet', time_granularity=600, num_cols=100, cell_size=None, use_type=False): prt_high(""" Running compute emissions. ################################## Parameters - Input file: {} - Output file: {} - Time granularity: {} - Cell size: {} - Number of raster columns: {} - Rasters by type: {} ################################## """.format(input_file, output_file, time_granularity, cell_size, num_cols, use_type)) cell_size_meters = None # Process parameters if cell_size is not None: prt_high("Info: Cell size set in parameters, using it") if cell_size[-1] == 'm': # Meters cell_size_meters = int(cell_size[:-1]) cell_size = meters_to_deg(cell_size_meters) # All except the last char else: cell_size = int(cell_size) else: num_cols = int(num_cols) time_granularity = int(time_granularity) # TODO: FILL NUM_VARS automatically prt_warn("WARNING: Number of variables is manually set to 10") num_vars = 10 df = spark.read.parquet(input_file) min_max_lat_lon = df.agg(F.min(df.latitude), F.max(df.latitude), F.min(df.longitude), F.max(df.longitude)) min_max_time = df.agg(F.min(df.time), F.max(df.time)) min_max_row = min_max_lat_lon.first() min_lat = min_max_row[0] max_lat = min_max_row[1] min_lon = min_max_row[2] max_lon = min_max_row[3] min_max_time = min_max_time.first() min_time = min_max_time[0] max_time = min_max_time[1] if cell_size is None: # Calculate cell dimension and number of rows using the number of # columns defined cell_size = (max_lon - min_lon) / num_cols else: num_cols = int(np.ceil((max_lon - min_lon) / cell_size)) num_rows = int(np.ceil((max_lat - min_lat) / cell_size)) prt_info("NUM COLS: " + str(num_cols)) prt_info("NUM ROWS: " + str(num_rows)) prt_info("CELL DIM: " + str(cell_size)) # Create metadata file prt_info("Building metadata") meta = [(num_cols, num_rows, num_vars, min_lat, max_lat, min_lon, max_lon, cell_size, cell_size_meters, min_time, max_time, time_granularity) ] rdd = spark.sparkContext.parallelize(meta) metarow = rdd.map(lambda x: Row(num_cols=int(x[0]), num_rows=int(x[1]), num_vars=int(x[2]), min_lat=float(x[3]), max_lat=float(x[4]), min_lon=float(x[5]), max_lon=float(x[6]), cell_size=float(x[7]), cell_size_meters=float(x[8]), min_time=int(x[9]), max_time=int(x[10]), time_granularity=int(x[11]))) metadf = spark.createDataFrame(metarow) # Create rasters prt_info("Building rasters") # TODO: Implement a way to define cell size to_cell = partial(lat_lon_to_cell, min_lon, min_lat, num_cols, num_rows, cell_size) convertToCellFunc = udf(to_cell, IntegerType()) to_hours = partial(to_time_resolution, time_granularity) convertToHours = udf(to_hours, IntegerType()) df_cell = df.withColumn('cell', convertToCellFunc(df['longitude'], df['latitude'])) df_cell = df_cell.withColumn('hour', convertToHours(df['time'])) # use desc order and first to get the last value # https://stackoverflow.com/questions/43114445/how-to-use-first-and-last-function-in-pyspark w = Window().partitionBy('imo', 'hour').orderBy(df_cell.time.desc()) # lower than any possible sin/cos value so max only gets valid values df_cell = df_cell.withColumn('last_time', first("time").over(w)) # PLACEHOLDER = -9999.0 # df_cell = df_cell.withColumn('last_amp_v', # when(df_cell['last_time'] == df_cell['time'], # df_cell['amp_v']).otherwise(PLACEHOLDER)) # df_cell = df_cell.withColumn('last_cos', # when(df_cell['last_time'] == df_cell['time'], # df_cell['cos_v']).otherwise(PLACEHOLDER)) # df_cell = df_cell.withColumn('last_sin', # when(df_cell['last_time'] == df_cell['time'], # df_cell['sin_v']).otherwise(PLACEHOLDER)) # potential bug: max(last_amp_v), max('last_cos'), max('last_sin'). Each # may be from different ships if they happen to be in the same raster cell if use_type: raster = df_cell.groupBy("cell", "hour", "type") else: raster = df_cell.groupBy("cell", "hour") raster = raster.agg( F.sum("sox_me").alias("sox_me"), F.sum("sox_ae").alias("sox_ae"), F.sum('co2_me').alias('co2_me'), F.sum("co2_ae").alias("co2_ae"), F.sum("nox_me").alias("nox_me"), F.sum('nox_ae').alias('nox_ae'), # F.max('last_amp_v').alias('last_amp_v'), # F.max('last_cos').alias('last_cos'), # F.max('last_sin').alias('last_sin'), F.count('*').alias('sample_count')) # Lower limit for these attributes # raster = raster.withColumn( # 'last_amp_v', when(raster['last_amp_v'] <= (PLACEHOLDER + 1), 0) # .otherwise(raster['last_amp_v'])) # raster = raster.withColumn( # 'last_cos', when(raster['last_cos'] <= (PLACEHOLDER + 1), 0) # .otherwise(raster['last_cos'])) # raster = raster.withColumn( # 'last_sin', when(raster['last_sin'] <= (PLACEHOLDER + 1), 0) # .otherwise(raster['last_sin'])) raster = change_column_type(raster, 'sample_count', IntegerType(), force=True) raster = change_column_type(raster, 'sox_me', FloatType(), force=True) raster = change_column_type(raster, 'sox_ae', FloatType(), force=True) raster = change_column_type(raster, 'co2_me', FloatType(), force=True) raster = change_column_type(raster, 'co2_ae', FloatType(), force=True) raster = change_column_type(raster, 'nox_me', FloatType(), force=True) raster = change_column_type(raster, 'nox_ae', FloatType(), force=True) # raster = change_column_type(raster, 'last_amp_v', FloatType(), # force=True) # raster = change_column_type(raster, 'last_cos', FloatType(), force=True) # raster = change_column_type(raster, 'last_sin', FloatType(), force=True) raster = ensure_columns_type(raster) # Write rasters raster.write.mode('overwrite').parquet(output_file) # Write metadata metadf.write.mode('overwrite').parquet(output_file + '.meta') return
## Extracting weekday from nav_date column data = data.withColumn('weekday', f.date_format('nav_date', 'E')) ## converting weekday from string to numerical EX: Monday as 0 so on data = data.withColumn('weekday', weekday(data.weekday)) ## Using window function to calculate cumulative sum windowval = Window.partitionBy('nav_date').orderBy('Bin').rangeBetween( Window.unboundedPreceding, 0) data = data.withColumn('cumsum', f.sum('count').over(windowval)) df_bin_sorted = data.withColumn('cumsum', data.cumsum.cast(IntegerType())) ### w = Window().partitionBy('nav_date') df_bin_sorted = df_bin_sorted.select( "*", f.collect_list("cumsum").over(w).alias("group_cumsum")) def cumsumpct_calculate(num, maximum): return num / float(np.max(maximum)) udf_cum_pct = f.udf(cumsumpct_calculate, FloatType()) df_bin_sorted = df_bin_sorted.withColumn( 'cumsumpct', f.format_number( udf_cum_pct(df_bin_sorted.cumsum, df_bin_sorted.group_cumsum), 4))
''' Created on 27-Jun-2020 Window Functions or Windowed Aggregates: Window (also, windowing or windowed) functions perform a calculation over a set of rows. It is an important tool to do statistics. Most Databases support Window functions. Spark from version 1.4 start supporting Window functions. Spark Window Functions have the following traits: a. perform a calculation over a group of rows, called the Frame. b. a frame corresponding to the current row c. return a new value to for each row by an aggregate/window function d. Can use SQL grammar or DataFrame API. @author: kasho ''' from com.pyspark.poc.utils.BaseConfUtils import BaseConfUtils from pyspark.sql import Window from pyspark.sql.types import * from pyspark.sql.functions import * conf = BaseConfUtils() sparkContxt = conf.createSparkContext("Windows Fun") sqlContxt = conf.createSQLContext(sparkContxt) @udf("long") def median_udf(s): index = int(len(s) / 2) return s[index]
"RespondsToMailOffers", "OptOutMailings", "NonUSTravel", "OwnsComputer", "HasCreditCard", "NewCellphoneUser", "MadeCallToRetentionTeam", "CreditRating", "PrizmCode", "Occupation", "MaritalStatus" } # Cast given columns to type double churn = casting(churn, string_columns) # Change output ame churn = churn.withColumn("label", col("Churn") == "Yes") # Cast output to type int churn = churn.withColumn("label", col("label").cast('int')) # Generate window w = Window().orderBy(lit('A')) # Create column to produce row numbers churn = churn.withColumn("row_num", row_number().over(w)) ######################## #---Machine Learning---# ######################## # Create machine learning pipeline piped = ml_pipeline(churn) # Standardize dataset training, testing = piped.randomSplit([.75, .25]) # Create evaluator