Example #1
 def generate_age_information(self, dcc_experiment_df: DataFrame,
                              mice_df: DataFrame):
     experiment_df_a = dcc_experiment_df.alias("exp")
     mice_df_a = mice_df.alias("mice")
     dcc_experiment_df = experiment_df_a.join(
         mice_df_a,
         (experiment_df_a["specimenID"] == mice_df["_specimenID"])
         & (experiment_df_a["_centreID"] == mice_df["_centreID"]),
         "left_outer",
     )
     dcc_experiment_df = self._add_special_dates(dcc_experiment_df)
     dcc_experiment_df = dcc_experiment_df.withColumn(
         "ageInDays",
         datediff(
             when(
                 col("_dateOfBloodCollection").isNotNull(),
                 col("_dateOfBloodCollection"),
             ).when(
                 col("_dateOfSacrifice").isNotNull(),
                 col("_dateOfSacrifice")).otherwise(
                     col("_dateOfExperiment")),
             col("mice._DOB"),
         ),
     )
     dcc_experiment_df = dcc_experiment_df.withColumn(
         "ageInWeeks",
         udf(lambda x: math.floor(x / 7) if x is not None else None,
             IntegerType())(col("ageInDays")),
     )
     return dcc_experiment_df.select("exp.*", "ageInWeeks", "ageInDays")
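The ageInWeeks step above goes through a Python UDF; the same result can be obtained with built-in column functions, which avoids the UDF serialization overhead. A minimal sketch, assuming the same column names as in the example:

from pyspark.sql.functions import col, floor

# floor() returns a long; cast to int to match the original UDF's IntegerType.
dcc_experiment_df = dcc_experiment_df.withColumn(
    "ageInWeeks", floor(col("ageInDays") / 7).cast("int"))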
Example #2
def process_fact_parking(sensordata_sdf: DataFrame,
                         dim_parkingbay_sdf: DataFrame,
                         dim_location_sdf: DataFrame,
                         dim_st_marker_sdf: DataFrame, load_id, loaded_on):
    """Transform sensordata into fact_parking"""

    dim_date_id = loaded_on.strftime("%Y%m%d")
    midnight = loaded_on.replace(hour=0, minute=0, second=0, microsecond=0)
    dim_time_id = (loaded_on - midnight).seconds

    # Build fact
    fact_parking = sensordata_sdf\
        .join(dim_parkingbay_sdf.alias("pb"), "bay_id", "left_outer")\
        .join(dim_location_sdf.alias("l"), ["lat", "lon"], "left_outer")\
        .join(dim_st_marker_sdf.alias("st"), "st_marker_id", "left_outer")\
        .select(
            lit(dim_date_id).alias("dim_date_id"),
            lit(dim_time_id).alias("dim_time_id"),
            when(col("pb.dim_parking_bay_id").isNull(), lit(EMPTY_UUID))
            .otherwise(col("pb.dim_parking_bay_id")).alias("dim_parking_bay_id"),
            when(col("l.dim_location_id").isNull(), lit(EMPTY_UUID))
            .otherwise(col("l.dim_location_id")).alias("dim_location_id"),
            when(col("st.dim_st_marker_id").isNull(), lit(EMPTY_UUID))
            .otherwise(col("st.dim_st_marker_id")).alias("dim_st_marker_id"),
            "status",
            lit(load_id).alias("load_id"),
            lit(loaded_on.isoformat()).cast("timestamp").alias("loaded_on")
        )
    return fact_parking
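Several of these fact and dimension examples rely on helpers defined outside the snippets, notably EMPTY_UUID and uuidUdf(). Plausible minimal definitions, assuming EMPTY_UUID is the all-zero UUID used for unresolved dimension members and uuidUdf produces a random surrogate key per row:

import uuid

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# All-zero UUID used when a dimension lookup finds no match.
EMPTY_UUID = "00000000-0000-0000-0000-000000000000"

# Calling uuidUdf() yields a Column with one random UUID per row,
# used as the surrogate key of newly inserted dimension rows.
uuidUdf = udf(lambda: str(uuid.uuid4()), StringType())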
Example #3
def resolve_image_record_parameter_association(
        image_record_observation_df: DataFrame,
        simple_observations_df: DataFrame):
    simple_df = simple_observations_df.alias("simple")
    image_df = image_record_observation_df.alias("image").withColumn(
        "parameterAsc",
        explode("image.seriesMediaParameterValue.parameterAssociation"))
    image_vs_simple_parameters_df = image_df.join(
        simple_df,
        (col("simple.experiment_id") == col("image.experiment_id"))
        & (col("simple.parameter_stable_id")
           == col("parameterAsc._parameterID")),
    )
    image_vs_simple_parameters_df = image_vs_simple_parameters_df.withColumn(
        "paramName", col("simple.parameter_name"))
    image_vs_simple_parameters_df = image_vs_simple_parameters_df.withColumn(
        "paramSeq", lit("0"))
    image_vs_simple_parameters_df = image_vs_simple_parameters_df.withColumn(
        "paramValue",
        when(col("data_point").isNotNull(), col("data_point")).otherwise(
            when(col("category").isNotNull(),
                 col("category")).otherwise(col("text_value"))),
    )
    image_vs_simple_parameters_df = image_vs_simple_parameters_df.groupBy(
        col("image.observation_id"), col("image.parameter_stable_id")).agg(
            collect_set("parameterAsc._parameterID").alias("paramIDs"),
            collect_set("paramName").alias("paramNames"),
            collect_set("paramSeq").alias("paramSeqs"),
            collect_set("paramValue").alias("paramValues"),
        )
    image_vs_simple_parameters_df = image_vs_simple_parameters_df.withColumnRenamed(
        "observation_id",
        "img_observation_id").withColumnRenamed("parameter_stable_id",
                                                "img_parameter_stable_id")
    image_vs_simple_parameters_df = image_vs_simple_parameters_df.select(
        "img_observation_id",
        "img_parameter_stable_id",
        "paramIDs",
        "paramNames",
        "paramSeqs",
        "paramValues",
    )
    image_record_observation_df = image_record_observation_df.join(
        image_vs_simple_parameters_df,
        (image_record_observation_df["observation_id"]
         == image_vs_simple_parameters_df["img_observation_id"])
        & (image_record_observation_df["parameter_stable_id"]
           == image_vs_simple_parameters_df["img_parameter_stable_id"]),
        "left_outer",
    )
    image_record_observation_df = (
        image_record_observation_df.withColumnRenamed(
            "paramIDs", "parameter_association_stable_id").withColumnRenamed(
                "paramNames", "parameter_association_name").withColumnRenamed(
                    "paramSeqs",
                    "parameter_association_sequence_id").withColumnRenamed(
                        "paramValues", "parameter_association_value"))
    return image_record_observation_df
Example #4
def process_dim_location(sensordata_sdf: DataFrame, dim_location: DataFrame,
                         load_id, loaded_on):
    """Transform sensordata into dim_location"""

    # Get distinct rows from the landing data
    sensordata_sdf = sensordata_sdf\
        .select(["lat", "lon"]).distinct()

    # Using a left_outer join,
    # identify rows that exist in the existing dimension table but NOT in the landing data
    oldrows_sdf = dim_location.alias("dim")\
        .join(sensordata_sdf, ["lat", "lon"], "left_outer")\
        .where(sensordata_sdf["lat"].isNull() & sensordata_sdf["lon"].isNull())\
        .select(col("dim.*"))

    # Using a left_outer join,
    # identify rows that exist in both the incoming landing data and the existing dimension table,
    # and take the values from the incoming landing data. That is, we update existing table values.
    existingrows_sdf = sensordata_sdf.alias("in")\
        .join(dim_location.alias("dim"), ["lat", "lon"], "left_outer")\
        .where(dim_location["lat"].isNotNull() & dim_location["lon"].isNotNull())\
        .select(
            col("dim.dim_location_id"),
            col("in.lat"),
            col("in.lon")
        )

    # Using a left_outer join,
    # identify rows that exist in the landing data but NOT in the existing dimension table
    newrows_sdf = sensordata_sdf.alias("in")\
        .join(dim_location, ["lat", "lon"], "left_outer")\
        .where(dim_location["lat"].isNull() & dim_location["lon"].isNull())\
        .select(col("in.*"))

    # Add load_id, loaded_on and dim_location_id
    existingrows_sdf = existingrows_sdf.withColumn("load_id", lit(load_id))\
        .withColumn("loaded_on", lit(loaded_on.isoformat()).cast("timestamp"))
    newrows_sdf = newrows_sdf.withColumn("load_id", lit(load_id))\
        .withColumn("loaded_on", lit(loaded_on.isoformat()).cast("timestamp"))\
        .withColumn("dim_location_id", uuidUdf())

    # Select relevant columns
    relevant_cols = ["dim_location_id", "lat", "lon", "load_id", "loaded_on"]
    oldrows_sdf = oldrows_sdf.select(relevant_cols)
    existingrows_sdf = existingrows_sdf.select(relevant_cols)
    newrows_sdf = newrows_sdf.select(relevant_cols)

    allrows_sdf = oldrows_sdf\
        .union(existingrows_sdf)\
        .union(newrows_sdf)

    return allrows_sdf
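DataFrame.union matches columns by position, which is why the three frames are first projected onto the same relevant_cols ordering. unionByName is a by-name alternative; a minimal sketch of the final step under that assumption:

allrows_sdf = oldrows_sdf\
    .unionByName(existingrows_sdf)\
    .unionByName(newrows_sdf)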
Example #5
    def get_history_product(self, old_dataframe: DataFrame,
                            new_dataframe: DataFrame):
        # We expect the same set of columns for both DF
        col_names = old_dataframe.columns
        # Exclude the column on which dataframes are joined
        col_names.remove('id')

        # Join DFs
        df_old = old_dataframe.alias("df_old")
        df_new = new_dataframe.alias("df_new")
        joined_df = df_old.join(df_new, on='id', how='outer')

        # Distinguish columns from old and new DF
        old_columns = [f'df_old.{c_name}' for c_name in col_names]
        new_columns = [f'df_new.{c_name}' for c_name in col_names]

        # Prepare expressions for finding the values in the new column
        old_columns_null_expr = ' IS NULL AND '.join(old_columns) + ' IS NULL'
        new_columns_null_expr = ' IS NULL AND '.join(new_columns) + ' IS NULL'
        # Note: str.rstrip(' AND ') strips a set of characters rather than the
        # suffix, so build the expression with a join instead.
        equals_expr = ' AND '.join(f'{old_c} = {new_c}'
                                   for old_c, new_c in zip(old_columns, new_columns))

        # Add 'meta' column
        meta_col = when(expr(equals_expr), 'not_changed').when(
            expr(old_columns_null_expr),
            'inserted').when(expr(new_columns_null_expr),
                             'deleted').otherwise('changed')
        joined_df = joined_df.withColumn('meta', meta_col)

        # Create final set of columns
        final_df = joined_df
        for c_name in col_names:
            c_name_new_column = when(expr('meta = "deleted"'),
                                     expr(f'df_old.{c_name}')).otherwise(
                                         expr(f'df_new.{c_name}'))
            final_df = final_df.withColumn(f'new_{c_name}', c_name_new_column)

        # Drop the temporary columns and return the result.
        # Note: joined_df.drop(*old_columns) does not work here, and
        # result.drop(new_dataframe[c_name]) only handles a single column,
        # so the ambiguous columns are replaced one by one instead.
        result = final_df
        for c_name in col_names:
            result = result.drop(c_name)
            result = result.withColumn(c_name, expr(f'new_{c_name}'))
            result = result.drop(f'new_{c_name}')

        return result.select('id', 'name', 'score', 'meta')
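A hypothetical usage sketch for get_history_product, assuming a SparkSession named spark, two frames sharing the columns id, name and score, and transformer being the object that owns the method:

old_df = spark.createDataFrame([(1, "a", 10), (2, "b", 20)], ["id", "name", "score"])
new_df = spark.createDataFrame([(1, "a", 10), (3, "c", 30)], ["id", "name", "score"])
history_df = transformer.get_history_product(old_df, new_df)
# id 1 -> not_changed, id 2 -> deleted, id 3 -> inserted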
Example #6
    def transform_with_data_frame(self, df: DataFrame, source_df: DataFrame,
                                  keys: List[str]) -> DataFrame:
        # append this mapper's column spec to the existing columns
        column_spec: Column = self.get_column_spec(source_df=source_df)
        conditions = [col(f'b.{key}') == col(f'a.{key}') for key in keys]

        existing_columns: List[Column] = [
            col('a.' + column_name) for column_name in df.columns
        ]

        result_df: DataFrame = df.alias('a').join(
            source_df.alias('b'),
            conditions).select(existing_columns + [column_spec])
        return result_df
Example #7
def transform_imm_city_demographics(imm_df: DataFrame,
                                    airport_detail_df: DataFrame,
                                    demo_graph_df: DataFrame) -> DataFrame:
    """Transform to imm_city_demographics dataframe."""
    t1 = imm_df.alias('t1')
    t2 = airport_detail_df.alias('t2')
    # imm_df join with airport_detail_df to get city column
    imm_city_df = t1.join(
        t2,
        [t1.state_code == t2.state_code, t1.airport_code == t2.airport_code
         ]).select(
             col('t1.cicid').alias('cicid'),
             col('t1.month').alias('month'),
             col('t1.year').alias('year'),
             col('t1.birth_country').alias('birth_country'),
             col('t1.arrival_date').alias('arrival_date'),
             col('t1.departure_date').alias('departure_date'),
             col('t1.birth_year').alias('birth_year'),
             col('t1.gender').alias('gender'),
             col('t2.city').alias('city'),
             col('t2.name').alias('airport_name'),
             col('t1.state_code').alias('state_code'),
             col('t1.airport_code').alias('airport_code'),
             col('t1.visa_code').alias('visa_code'))
    # imm_city_df join with demo_graph_df to get city demographics info
    t1 = imm_city_df.alias('t1')
    t2 = demo_graph_df.alias('t2')
    imm_city_demograph_df = t1.join(t2, t1.city == t2.city).select(
        col('t1.cicid').alias('cicid'),
        col('t1.month').alias('month'),
        col('t1.year').alias('year'),
        col('t1.birth_country').alias('birth_country'),
        col('t1.arrival_date').alias('arrival_date'),
        col('t1.departure_date').alias('departure_date'),
        col('t1.birth_year').alias('birth_year'),
        col('t1.gender').alias('gender'),
        col('t1.city').alias('city'),
        col('t2.median_age').alias('city_median_age'),
        col('t2.male_population').alias('city_male_population'),
        col('t2.female_population').alias('city_female_population'),
        col('t2.total_population').alias('city_total_population'),
        col('t2.veteran_count').alias('city_veteran_count'),
        col('t2.foreign_born').alias('city_foreign_born_count'),
        col('t1.airport_name').alias('airport_name'),
        col('t1.state_code').alias('state_code'),
        col('t1.airport_code').alias('airport_code'),
        col('t1.visa_code').alias('visa_code'))
    return imm_city_demograph_df
Example #8
def even_repartition(session: SparkSession, df: ps.DataFrame, num: int,
                     cols: List[Any]) -> ps.DataFrame:
    if num == 1:
        return _single_repartition(df)
    if len(cols) == 0:
        if num == 0:
            return df
        rdd = (_zipWithIndex(df.rdd).partitionBy(
            num, lambda k: k).mapPartitions(_to_rows))
        return session.createDataFrame(rdd, df.schema)
    else:
        keys = df.select(*cols).distinct()
        krdd = _zipWithIndex(keys.rdd, True)
        new_schema = to_spark_schema(
            to_schema(df.schema).extract(cols) +
            f"{_PARTITION_DUMMY_KEY}:long")
        idx = session.createDataFrame(krdd, new_schema)
        if num <= 0:
            idx = idx.persist()
            num = idx.count()
        idf = (df.alias("df").join(idx.alias("idx"), on=cols,
                                   how="inner").select(_PARTITION_DUMMY_KEY,
                                                       "df.*"))

        def _to_kv(rows: Iterable[Any]) -> Iterable[Any]:  # pragma: no cover
            for row in rows:
                yield (row[0], row[1:])

        rdd = (idf.rdd.mapPartitions(_to_kv).partitionBy(
            num, lambda k: k).mapPartitions(_to_rows))
        return session.createDataFrame(rdd, df.schema)
Example #9
def process_dim_parking_bay(parkingbay_sdf: DataFrame,
                            dim_parkingbay_sdf: DataFrame, load_id, loaded_on):
    """Transform sensordata into dim_parking_bay"""

    # Get distinct rows from the landing data
    parkingbay_sdf = parkingbay_sdf\
        .select([
            "bay_id",
            "marker_id",
            "meter_id",
            "rd_seg_dsc",
            "rd_seg_id",
            "the_geom"])\
        .distinct()

    # Get new rows to insert through LEFT JOIN with existing rows
    nr_parkingbay_sdf = parkingbay_sdf.alias("pb")\
        .join(dim_parkingbay_sdf, "bay_id", "left_outer")\
        .where(dim_parkingbay_sdf["bay_id"].isNull())\
        .select(col("pb.*"))

    # Add load_id, loaded_on and dim_parking_bay_id
    nr_parkingbay_sdf = nr_parkingbay_sdf.withColumn("load_id", lit(load_id))\
        .withColumn("loaded_on", lit(loaded_on.isoformat()))\
        .withColumn("dim_parking_bay_id", uuidUdf())

    # Select relevant columns
    nr_parkingbay_sdf = nr_parkingbay_sdf.select([
        "dim_parking_bay_id", "bay_id", "marker_id", "meter_id", "rd_seg_dsc",
        "rd_seg_id", "the_geom", "load_id", "loaded_on"
    ])
    return nr_parkingbay_sdf
Example #10
def process_data_frame(destination_file_path: str,
                        join_column_name: str,
                        partition_column_name: str,
                        spark: SparkSession,
                        renamed_df: DataFrame):
    """
    Process the csv files for a given destination.
    It first checks whether the destination already exists. If it does not, it is created; if it does, only new data is appended.
    No data is overwritten, so it is safe to run this function as many times with the same data as you want.
    """
    
    destination_data = destination_file_path.split("/")[-1]
    
    try:
        existing_data = spark.read.parquet(destination_file_path).select(F.col(join_column_name)).distinct()
    except AnalysisException:
        # The destination does not exist yet.
        existing_data = None

    if existing_data is not None:
        
        renamed_df = (renamed_df.alias("new")
                                .join(existing_data.alias("cur"),
                                        on=["match_id"],
                                        how="left_anti"
                                    )
                                .select("new.*")
                     )
    
        print("Processing " + str(renamed_df.count()) + f" new rows for {destination_data}.")
        renamed_df.write.format("parquet").mode("append").partitionBy(partition_column_name).save(destination_file_path)
              
    else:
        print(f"Processing complete csv file for {destination_data}, " + str(renamed_df.count()) + " rows.")
        renamed_df.write.format("parquet").mode("overwrite").partitionBy(partition_column_name).save(destination_file_path)
Example #11
def find_similar_movies_by_movie_id(
        movie_id: int,
        movie_ratings_df: DataFrame,
        movie_names_df: DataFrame,
        similar_movies_count: int = 10,
        score_threshold: float = 0.97,
        co_occurrence_threshold: int = 50) -> DataFrame:

    # Remove useless columns to shrink dataframe before join.
    movie_ratings_df = movie_ratings_df.select('user_id', 'movie_id', 'rating')

    movies_similarities_df = cal_movies_similarities(movie_ratings_df).cache()

    movies_similarities_df = movies_similarities_df.repartition(100)
    similar_movies_df = movies_similarities_df. \
        filter(((func.col('movie_id_1') == movie_id) | (func.col('movie_id_2') == movie_id))
               & (func.col('co_occurrence_count') > co_occurrence_threshold)
               & (func.col('similarity_score') > score_threshold)). \
        orderBy(func.desc('similarity_score'), func.desc('co_occurrence_count')).\
        limit(similar_movies_count).\
        select(func.when(func.col('movie_id_1') == movie_id, func.col('movie_id_2')).otherwise(func.col('movie_id_1')).alias('movie_id'),
               func.col('similarity_score'),
               func.col('co_occurrence_count')
               )

    similar_movies_names_df = similar_movies_df.alias('sm').\
        join(movie_names_df.alias('mn'), func.col('mn.id') == func.col('sm.movie_id')).\
        select(func.col('sm.movie_id').alias('movie_id'),
               func.col('mn.name').alias('movie_name'),
               func.col('sm.similarity_score').alias('similarity_score'),
               func.col('sm.co_occurrence_count').alias('co_occurrence_count')).\
        orderBy(func.desc('similarity_score'), func.desc('co_occurrence_count'))

    return similar_movies_names_df
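A hypothetical call, assuming movie_ratings_df carries the columns user_id, movie_id and rating, and movie_names_df carries id and name:

similar_movies_df = find_similar_movies_by_movie_id(
    movie_id=50,
    movie_ratings_df=movie_ratings_df,
    movie_names_df=movie_names_df,
    similar_movies_count=10,
)
similar_movies_df.show()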
Example #12
def create_songplays(song_data: DataFrame, log_data: DataFrame) -> DataFrame:
    """
    Process the song and log data and create table "songplays". Processing:
    - user ID and timestamp cannot be empty
    - convert unix timestamp into timestamp
    - join the song and log tables (explanation below)
    - add an ID column
    - select the relevant columns and rename them

    Since there is no song ID nor artist ID in the log data, we need to join
    both tables using the song title and artist name, which is the best we can
    do, because there is no other information about the songs in the log data.

    Song duration is not used in the join: recordings that share the same
    artist and title do not always report the same duration, so joining on it
    would drop otherwise valid matches.

    Parameters
    ----------
    song_data : pyspark.sql.dataframe.DataFrame
        Song dataset.
    log_data : pyspark.sql.dataframe.DataFrame
        Log dataset.

    Returns
    -------
    songplays : pyspark.sql.dataframe.DataFrame
    """
    return (log_data.alias('logs').where(
        (F.col('userId').isNotNull()) & (F.col('ts').isNotNull())).withColumn(
            'start_time',
            F.from_unixtime(F.col('ts') / 1000).cast('timestamp')).join(
                song_data.alias('songs'),
                (F.col('logs.song') == F.col('songs.title')) &
                (F.col('logs.artist') == F.col('songs.artist_name')),
                'inner').withColumn(
                    'songplay_id', F.monotonically_increasing_id()).select(
                        'songplay_id', 'start_time',
                        F.year('start_time').alias('year'),
                        F.month('start_time').alias('month'),
                        F.col('logs.userId').alias('user_id'), 'logs.level',
                        'songs.song_id', 'songs.artist_id',
                        F.col('logs.sessionId').alias('session_id'),
                        'logs.location',
                        F.col('logs.userAgent').alias('user_agent')))
Example #13
    def cross_reference_specimens(
        self, specimen_df: DataFrame, colonies_df: DataFrame, entity_type: str
    ) -> DataFrame:
        """
        Cross-reference Specimen data with colony tracking report information.
        """
        specimen_df = specimen_df.alias("specimen")
        colonies_df = colonies_df.alias("colony")

        # join the specimen information with the colony information
        specimen_df = specimen_df.join(
            colonies_df,
            (specimen_df["_colonyID"] == colonies_df["colony_name"]),
            "left_outer",
        )

        # generate the specimen allelic composition using the colony information
        # override 3i specimen project using the colony phenotyping consortium
        # override the data source and the project info for MGP and MGP Legacy colonies
        specimen_df = (
            specimen_df.transform(self.generate_specimen_allelic_composition)
            .transform(override_europhenome_datasource)
            .transform(self.override_3i_specimen_project)
        )
        specimen_df = specimen_df.withColumn(
            "_productionCentre",
            when(
                col("_productionCentre").isNull(),
                when(
                    col("_phenotypingCentre").isNotNull(), col("_phenotypingCentre")
                ).otherwise(col("colony.production_centre")),
            ).otherwise(col("_productionCentre")),
        )
        specimen_df = specimen_df.select(
            "specimen.*",
            "_productionCentre",
            "allelicComposition",
            "colony.phenotyping_consortium",
        )

        if entity_type == "embryo":
            specimen_df = specimen_df.transform(self.add_embryo_life_stage_acc)
        if entity_type == "mouse":
            specimen_df = specimen_df.transform(self.add_mouse_life_stage_acc)
        return specimen_df
Example #14
def cal_movies_similarities(movie_ratings_df: DataFrame):
    """
    Calculate similarity of movie pairs based on their co-occurrence
    and the cosine similarity of their ratings when watched by same person.
    :param movie_ratings_df:
    :return:
    """
    # Find all pair of different movies watched by the same person.
    # func.col('mr1.movie_id') < func.col('mr2.movie_id') to avoid duplication.
    # Parenthesis is mandatory for combined condition (e.g. &, |)
    ratings_pairs_df = movie_ratings_df.alias('mr1'). \
        join(movie_ratings_df.alias('mr2'),
             (func.col('mr1.user_id') == func.col('mr2.user_id')) & (
                         func.col('mr1.movie_id') < func.col('mr2.movie_id'))). \
        select(
        func.col('mr1.movie_id').alias('movie_id_1'),
        func.col('mr2.movie_id').alias('movie_id_2'),
        func.col('mr1.rating').alias('rating_1'),
        func.col('mr2.rating').alias('rating_2')
    )

    # Calculate the dot product (numerator) and the product of magnitudes (denominator) of the cosine similarity equation.
    # Each movie is considered a vector of its ratings.
    ratings_pairs_df = ratings_pairs_df.groupBy('movie_id_1', 'movie_id_2'). \
        agg(func.sum(func.col('rating_1') * func.col('rating_2')).alias('sim_dot_product'),
            (func.sqrt(func.sum(func.pow(func.col('rating_1'), 2))) * func.sqrt(
                func.sum(func.pow(func.col('rating_2'), 2)))).alias('sim_magnitude'),
            func.count(func.col('movie_id_1')).alias('co_occurrence_count')
            )

    # Calculate cosine similarity as a new column:
    # (dot product of the two rating vectors / product of their magnitudes)
    movies_similarities_df = ratings_pairs_df. \
        withColumn('similarity_score',
                   func.when(func.col('sim_magnitude') != 0,
                             func.col('sim_dot_product') / func.col('sim_magnitude')).otherwise(0)
                   ).select('movie_id_1', 'movie_id_2', 'similarity_score', 'co_occurrence_count')

    return movies_similarities_df
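Written out, the aggregation above computes the cosine similarity between the co-rating vectors of the two movies:

# similarity_score = sum(r1_i * r2_i) / (sqrt(sum(r1_i ** 2)) * sqrt(sum(r2_i ** 2)))
# where r1_i and r2_i are the ratings given by the same user i to movie 1 and movie 2.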
Example #15
def find_movie_recommendations_by_user_id(
        user_id: int, ratings_df: DataFrame, movie_names_df: DataFrame,
        spark: SparkSession) -> Union[DataFrame, bool]:
    """
    Recommendation algorithm:
    Use collaborative filtering (ALS) to recommend new movies to a user based on the ratings of similar users.
    Collaborative filtering is based on the assumption that people who agreed in the past will agree in the future,
    and that they will like similar kinds of items as they liked in the past.
    https://spark.apache.org/docs/3.0.1/ml-collaborative-filtering.html
    :param user_id:
    :param ratings_df:
    :param movie_names_df:
    :param spark:
    :return:
    """
    als_model = train_als_model_by_ratings(ratings_df)

    # Examples of find recommendations for top numItems movies (items) or users
    # // Generate top 10 movie recommendations for each user
    # userRecs = model.recommendForAllUsers(numItems)
    # // Generate top 10 user recommendations for each movie
    # movieRecs = model.recommendForAllItems(numItems)
    # // Generate top 10 movie recommendations for a specified set of users (in dataframe format).
    # userSubsetRecs = model.recommendForUserSubset(users, numItems)
    # // Generate top 10 user recommendations for a specified set of movies (in dataframe format).
    # movieSubSetRecs = model.recommendForItemSubset(movies, numItems)

    numItems = 10
    # The recommendForUserSubset input must be a dataframe which only include one column called "user_id".
    user_schema = types.StructType(
        [types.StructField('user_id', types.IntegerType())])
    users_df = spark.createDataFrame([[
        user_id,
    ]], user_schema)
    # movies_recs_df is a dataframe of Rows with two columns, "user_id" and "recommendations".
    # Each row holds the recommendations for one user in the users_df subset (users_df has only one item here).
    # "recommendations" are stored as an array of (movie_id, rating) Rows.
    # "rating" is the predicted rating that the given user would give to each movie.
    movies_recs_df = als_model.recommendForUserSubset(users_df, numItems)

    if not movies_recs_df.take(1):
        return False

    movie_rating_df = spark.createDataFrame(
        movies_recs_df.collect()[0].recommendations)

    movie_rating_df = movie_rating_df.alias('mr').join(movie_names_df.alias('mn'), func.col('mr.movie_id') == func.col('mn.id')).\
        select(func.col('mr.movie_id').alias('id'), func.col('mn.name').alias('name'), func.col('mr.rating').alias('predicted_rating'))

    return movie_rating_df
Example #16
def override_3i_specimen_data(dcc_specimen_df: DataFrame) -> DataFrame:
    dcc_specimen_df_a = dcc_specimen_df.alias("a")
    dcc_specimen_df_b = dcc_specimen_df.alias("b")
    dcc_specimen_df = dcc_specimen_df_a.join(
        dcc_specimen_df_b,
        (dcc_specimen_df_a["_specimenID"] == dcc_specimen_df_b["_specimenID"])
        & (dcc_specimen_df_a["_centreID"] == dcc_specimen_df_b["_centreID"])
        &
        (dcc_specimen_df_a["_dataSource"] != dcc_specimen_df_b["_dataSource"]),
        "left_outer",
    )
    dcc_specimen_df = dcc_specimen_df.where(
        col("b._specimenID").isNull()
        | ((col("b._specimenID").isNotNull())
           & (col("a._dataSource") != "3i")))
    return dcc_specimen_df.select("a.*")
Example #17
    def transform_with_data_frame(self, df: DataFrame,
                                  source_df: Optional[DataFrame],
                                  keys: List[str]) -> DataFrame:
        assert source_df
        # append this mapper's column spec to the existing columns
        column_spec: Column = self.get_column_spec(source_df=source_df)
        conditions = [col(f"b.{key}") == col(f"a.{key}") for key in keys]

        existing_columns: List[Column] = [
            col("a." + column_name) for column_name in df.columns
        ]

        result_df: DataFrame = (df.alias("a").join(
            source_df.alias("b"),
            conditions).select(existing_columns + [column_spec]))
        return result_df
Example #18
 def update_sum_count_or_insert(self, new_data: DataFrame, table: str, id_col: str):
     try:
         delta_table = DeltaTable.forPath(self.spark_configuration.spark_session, self.delta_src + table)
     except AnalysisException:
         # If the Delta table does not exist yet, just create it
         new_data.write \
             .format("delta") \
             .save(self.delta_src + table)
         return
     delta_table.alias("current_data").merge(
         new_data.alias("updates"),
         "current_data.{0} = updates.{0}".format(id_col))\
         .whenMatchedUpdate(set={
             "count": "current_data.count + updates.count"
         }) \
         .whenNotMatchedInsertAll() \
         .execute()
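A hypothetical call, assuming per-batch word counts with a numeric count column and repository being the object that owns the method:

# word_counts_df has the columns: word, count
repository.update_sum_count_or_insert(word_counts_df, table="word_counts", id_col="word")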
Example #19
    def merge(
            self,
            df: DataFrame,
            location: str,
            condition: str,  # Only supports SQL-like string condition
            match_update_dict: dict,  # "target_column": "expression"
            insert_when_not_matched: bool = False,  # Set to True for upsert
            save_mode: str = 'table'):
        '''Merge a dataframe into a target table or path.

        This merge operation can represent both an update and an upsert operation.
        The source and target tables are aliased as 'SRC' and 'TGT' by default; these aliases can be used in the condition string and in the update/insert expressions.
        Args:
            df (DataFrame): The source dataframe to write.
            save_mode (str): 'table' or 'path'
            location (str): The table name or path to merge into.
            condition (str): The merge condition in SQL-like string form.
            match_update_dict (dict): Contains ("target_column": "expression").
                This represents the updated values if matched. When
                insert_when_not_matched is True, the same mapping is also used
                as the inserted values when not matched; other columns which
                are not specified shall be null.
                NOTE: "target_column"s come without a schema prefix ("SRC" or "TGT").
            insert_when_not_matched (bool): If True, rows without a match are
                inserted as well (upsert); if False, only matched rows are updated.
        '''
        super(DeltaDataSource,
              self).merge(df,
                          condition,
                          match_update_dict,
                          insert_when_not_matched=insert_when_not_matched)
        save_mode = save_mode.lower()
        if save_mode == "table":
            target_table = DeltaTable.forName(self.spark, location)
        elif save_mode == "path":
            target_table = DeltaTable.forPath(self.spark, location)
        else:
            raise ValueError("save_mode should be 'path' or 'table'.")

        merger = target_table.alias("TGT").merge(df.alias("SRC"), condition)
        merger = merger.whenMatchedUpdate(set=match_update_dict)

        if insert_when_not_matched:
            merger = merger.whenNotMatchedInsert(values=match_update_dict)

        merger.execute()
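A hypothetical upsert into a Delta table keyed on id; datasource, updates_df and the table name are illustrative only:

datasource.merge(
    df=updates_df,
    location="analytics.customers",
    condition="TGT.id = SRC.id",
    match_update_dict={"name": "SRC.name", "email": "SRC.email"},
    insert_when_not_matched=True,
    save_mode="table",
)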
Example #20
 def _upsert_save(self, data: DataFrame) -> None:
     if self._load().rdd.isEmpty():
         self._insert_save(data)
     else:
         joined_data = data.alias("new").join(self._load().alias("old"),
                                              self._table_pk, "outer")
         upsert_dataset = joined_data.select([  # type: ignore
             coalesce(f"new.{col_name}", f"old.{col_name}").alias(col_name)
             for col_name in set(data.columns) -
             set(self._table_pk)  # type: ignore
         ] + self._table_pk)
         temporary_persisted_tbl_name = f"temp_{uuid.uuid4().int}"
         with StagedHiveDataSet(
                 upsert_dataset,
                 stage_database_name=self._database,
                 stage_table_name=temporary_persisted_tbl_name,
         ) as temp_table:
             self._overwrite_save(temp_table.staged_data)
Example #21
 def override_3i_specimen_data(self,
                               dcc_specimen_df: DataFrame) -> DataFrame:
     """
      Whenever a Specimen is present both in the 3i project and in any other data source (e.g. EuroPhenome or IMPC), the other data source's specimen data should be used instead of the 3i one.
     """
     dcc_specimen_df_a = dcc_specimen_df.alias("a")
     dcc_specimen_df_b = dcc_specimen_df.alias("b")
     dcc_specimen_df = dcc_specimen_df_a.join(
         dcc_specimen_df_b,
         (dcc_specimen_df_a["_specimenID"]
          == dcc_specimen_df_b["_specimenID"])
         &
         (dcc_specimen_df_a["_centreID"] == dcc_specimen_df_b["_centreID"])
         & (dcc_specimen_df_a["_dataSource"] !=
            dcc_specimen_df_b["_dataSource"]),
         "left_outer",
     )
     dcc_specimen_df = dcc_specimen_df.where(
         col("b._specimenID").isNull()
         | ((col("b._specimenID").isNotNull())
            & (col("a._dataSource") != "3i")))
     return dcc_specimen_df.select("a.*").dropDuplicates()
Example #22
def process_dim_parking_bay(parkingbay_sdf: DataFrame,
                            dim_parkingbay_sdf: DataFrame, load_id, loaded_on):
    """Transform incoming parkingbay_sdf data and existing dim_parking_bay
    into the latest version of records of dim_parking_bay data.
    """
    # Get distinct rows from the landing data
    parkingbay_sdf = parkingbay_sdf\
        .select([
            "bay_id",
            "marker_id",
            "meter_id",
            "rd_seg_dsc",
            "rd_seg_id"])\
        .distinct()

    # Using a left_outer join on the business key (bay_id),
    # identify rows that exist in the existing dimension table but NOT in the landing data
    oldrows_parkingbay_sdf = dim_parkingbay_sdf.alias("dim")\
        .join(parkingbay_sdf, "bay_id", "left_outer")\
        .where(parkingbay_sdf["bay_id"].isNull())\
        .select(col("dim.*"))

    # Using a left_outer join on the business key (bay_id),
    # identify rows that exist in both the incoming landing data and the existing dimension table,
    # and take the values from the incoming landing data. That is, we update existing table values.
    existingrows_parkingbay_sdf = parkingbay_sdf.alias("pb")\
        .join(dim_parkingbay_sdf.alias("dim"), "bay_id", "left_outer")\
        .where(dim_parkingbay_sdf["bay_id"].isNotNull())\
        .select(
            col("dim.dim_parking_bay_id"),
            col("pb.bay_id"),
            col("pb.marker_id"),
            col("pb.meter_id"),
            col("pb.rd_seg_dsc"),
            col("pb.rd_seg_id")
        )

    # Using a left_outer join on the business key (bay_id),
    # identify rows that exist in the landing data but NOT in the existing dimension table
    newrows_parkingbay_sdf = parkingbay_sdf.alias("pb")\
        .join(dim_parkingbay_sdf, "bay_id", "left_outer")\
        .where(dim_parkingbay_sdf["bay_id"].isNull())\
        .select(col("pb.*"))

    # Add load_id, loaded_on and dim_parking_bay_id
    existingrows_parkingbay_sdf = existingrows_parkingbay_sdf.withColumn("load_id", lit(load_id))\
        .withColumn("loaded_on", lit(loaded_on.isoformat()).cast("timestamp"))
    newrows_parkingbay_sdf = newrows_parkingbay_sdf.withColumn("load_id", lit(load_id))\
        .withColumn("loaded_on", lit(loaded_on.isoformat()).cast("timestamp"))\
        .withColumn("dim_parking_bay_id", uuidUdf())

    # Select relevant columns
    relevant_cols = [
        "dim_parking_bay_id", "bay_id", "marker_id", "meter_id", "rd_seg_dsc",
        "rd_seg_id", "load_id", "loaded_on"
    ]
    oldrows_parkingbay_sdf = oldrows_parkingbay_sdf.select(relevant_cols)
    existingrows_parkingbay_sdf = existingrows_parkingbay_sdf.select(
        relevant_cols)
    newrows_parkingbay_sdf = newrows_parkingbay_sdf.select(relevant_cols)

    allrows_parkingbay_sdf = oldrows_parkingbay_sdf\
        .union(existingrows_parkingbay_sdf)\
        .union(newrows_parkingbay_sdf)

    return allrows_parkingbay_sdf
Example #23
    def get_associated_body_weight(
        self,
        specimen_level_experiment_df: DataFrame,
        mouse_specimen_df: DataFrame,
        impress_df: DataFrame,
    ) -> DataFrame:
        """
        Takes in a DataFrame with experimental data, one with mouse specimens and one with IMPReSS information,
        and applies the algorithm to select the associated BW for any given experiment
        and to calculate the specimen age at the selected BW measurement.
        """
        # Explode the nested experiment DF structure so every row represents an observation
        weight_observations: DataFrame = specimen_level_experiment_df.withColumn(
            "simpleParameter", explode_outer("simpleParameter"))

        # Select the parameter relevant pieces from the IMPReSS DF
        parameters = impress_df.select(
            "pipelineKey",
            "procedure.procedureKey",
            "parameter.parameterKey",
            "parameter.analysisWithBodyweight",
        ).distinct()

        # Filter the IMPReSS using the analysisWithBodyweight flag
        weight_parameters = parameters.where(
            col("analysisWithBodyweight").isin(
                ["is_body_weight", "is_fasted_body_weight"]))

        # Join both the  observations DF and the BW parameters DF to obtain the observations that are BW
        weight_observations = weight_observations.join(
            weight_parameters,
            ((weight_observations["_pipeline"]
              == weight_parameters["pipelineKey"])
             & (weight_observations["_procedureID"]
                == weight_parameters["procedureKey"])
             & (weight_observations["simpleParameter._parameterID"]
                == weight_parameters["parameterKey"])),
        )
        # Create a boolean flag for fasted BW procedures
        weight_observations = weight_observations.withColumn(
            "weightFasted",
            col("analysisWithBodyweight") == "is_fasted_body_weight")

        weight_observations = weight_observations.select(
            "specimenID",
            "_centreID",
            col("unique_id").alias("sourceExperimentId"),
            col("_dateOfExperiment").alias("weightDate"),
            col("simpleParameter._parameterID").alias("weightParameterID"),
            col("simpleParameter.value").alias("weightValue"),
            "weightFasted",
        )
        weight_observations = weight_observations.where(
            col("weightValue").isNotNull())

        # Join the body weight observations so we can determine the  age of the specimen for any BW measurement
        weight_observations = weight_observations.join(
            mouse_specimen_df,
            (weight_observations["specimenID"]
             == mouse_specimen_df["_specimenID"])
            & (weight_observations["_centreID"]
               == mouse_specimen_df["_centreID"]),
        )
        weight_observations = weight_observations.withColumn(
            "weightDaysOld", datediff("weightDate", "_DOB"))

        #  Group the weight observations by Specimen
        weight_observations = weight_observations.groupBy("specimenID").agg(
            collect_set(
                struct(
                    "sourceExperimentId",
                    "weightDate",
                    "weightParameterID",
                    "weightValue",
                    "weightDaysOld",
                    "weightFasted",
                )).alias("weight_observations"))

        # Create a temporary "procedureGroup" column to be used in the  BW selection
        specimen_level_experiment_df = specimen_level_experiment_df.withColumn(
            "procedureGroup",
            udf(lambda prod_id: prod_id[:prod_id.rfind("_")],
                StringType())(col("_procedureID")),
        )

        # Join all the observations with the BW observations grouped by specimen
        specimen_level_experiment_df = specimen_level_experiment_df.join(
            weight_observations, "specimenID", "left_outer")
        # Schema for the struct that is going to group all the associated BW data
        output_weight_schema = StructType([
            StructField("sourceExperimentId", StringType()),
            StructField("weightDate", DateType()),
            StructField("weightParameterID", StringType()),
            StructField("weightValue", StringType()),
            StructField("weightDaysOld", IntegerType()),
            StructField("error", ArrayType(StringType())),
        ])

        # Alias both the experiment and the specimen df so is easier to join and manipulate
        experiment_df_a = specimen_level_experiment_df.alias("exp")
        mice_df_a = mouse_specimen_df.alias("mice")

        specimen_level_experiment_df = experiment_df_a.join(
            mice_df_a,
            (specimen_level_experiment_df["specimenID"]
             == mouse_specimen_df["_specimenID"])
            & (specimen_level_experiment_df["_centreID"]
               == mouse_specimen_df["_centreID"]),
            "left_outer",
        )

        # Add special dates to the experiment x specimen dataframe
        # for some experiments the date of sacrifice or date of blood collection
        # has to be used as reference for age and  BW calculations
        specimen_level_experiment_df = self._add_special_dates(
            specimen_level_experiment_df)
        get_associated_body_weight_udf = udf(self._get_closest_weight,
                                             output_weight_schema)
        specimen_level_experiment_df = specimen_level_experiment_df.withColumn(
            "weight",
            get_associated_body_weight_udf(
                when(
                    col("_dateOfBloodCollection").isNotNull(),
                    col("_dateOfBloodCollection"),
                ).when(
                    col("_dateOfSacrifice").isNotNull(),
                    col("_dateOfSacrifice")).otherwise(
                        col("_dateOfExperiment")),
                col("procedureGroup"),
                col("weight_observations"),
            ),
        )
        specimen_level_experiment_df = specimen_level_experiment_df.select(
            "exp.*", "weight")
        return specimen_level_experiment_df
Example #24
def analyze_stream_data(lrModelList=None, batch_summary_df: DataFrame = None):

    os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.4 ' \
                                        'pyspark-shell'
    spark = SparkSession.builder.appName("TwitterSparkStream").getOrCreate()
    spark.conf.set("spark.sql.crossJoin.enabled", "true")

    schema = StructType([
        StructField("timestamp", TimestampType(), True),
        StructField("text", StringType(), True),
        StructField("sentiment", DoubleType(), True)
    ])
    # initialize Spark Structured Stream with KafkaConsumer subscribing to topic "tweet" from twitter_stream.py
    # parse the kafka stream to the schema
    stream_df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "tweet") \
        .option("startingOffsets", "latest") \
        .load() \
        .select(from_json(col("value").cast("string"), schema).alias("data")) \
        .select(col("data.timestamp").alias("timestamp"), col("data.text").alias("text"),
                col("data.sentiment").alias("sentiment"))

    # set window of 30 seconds with watermark of 10 seconds and aggregate min, avg, and max sentiment over the windows,
    # as well as the amount of tweets with positive, negative and neutral sentiment in that window
    windowed_stream_df = stream_df.withWatermark("timestamp", "10 seconds") \
        .groupBy(window("timestamp", "30 seconds")) \
        .agg(avg("sentiment").alias("avg_sentiment"),
             max("sentiment").alias("max_sentiment"),
             min("sentiment").alias("min_sentiment"),
             count(when(stream_df["sentiment"] > 0.1, True)).alias("pos_tweet_count_stream"),
             count(when((stream_df["sentiment"] <= 0.1) & (stream_df["sentiment"] >= -0.1), True)).alias(
                 "neutral_tweet_count_stream"),
             count(when(stream_df["sentiment"] < -0.1, True)).alias("neg_tweet_count_stream"))

    windowed_stream_df = windowed_stream_df.select(
        windowed_stream_df.window.start.alias("start"), "min_sentiment",
        "avg_sentiment", "max_sentiment", "pos_tweet_count_stream",
        "neutral_tweet_count_stream",
        "neg_tweet_count_stream").sort("start", ascending=False)

    if lrModelList is not None:
        # if there are LinearRegression Models as function arguments, predict the amount of positive, negative and
        # neutral tweets, based on the models trained on the offline data in spark_offline.py
        featAssembler = VectorAssembler(
            inputCols=["avg_sentiment", "max_sentiment", "min_sentiment"],
            outputCol="features")
        windowed_stream_df = featAssembler.transform(windowed_stream_df)

        windowed_stream_df = lrModelList[0].transform(windowed_stream_df)
        windowed_stream_df = lrModelList[1].transform(windowed_stream_df)
        windowed_stream_df = lrModelList[2].transform(windowed_stream_df)

        windowed_stream_df = windowed_stream_df.select(
            "start", "avg_sentiment", "max_sentiment", "min_sentiment",
            "pos_tweet_count_stream", "pos_tweet_count_pred",
            "neutral_tweet_count_stream", "neutral_tweet_count_pred",
            "neg_tweet_count_stream", "neg_tweet_count_pred")

    if batch_summary_df is not None:
        # If a dataframe with the average values of these metrics from the offline data was passed in,
        # calculate the difference between the number of positive, negative and neutral tweets in the online data
        # and the corresponding offline averages, for comparison.
        windowed_stream_df = windowed_stream_df.withColumn("tmp1", lit("mean"))
        windowed_stream_df = windowed_stream_df.alias("stream").join(
            batch_summary_df.alias("batch"),
            col("stream.tmp1") == col("batch.summary"), "inner")

        windowed_stream_df = windowed_stream_df\
                                .withColumn("pos_tweet_count_diff", col("pos_tweet_count_stream") - col("pos_tweet_count_batch"))\
                                .withColumn("neutral_tweet_count_diff", col("neutral_tweet_count_stream") - col("neutral_tweet_count_batch")) \
                                .withColumn("neg_tweet_count_diff", col("neg_tweet_count_stream") - col("neg_tweet_count_batch"))

        windowed_stream_df = windowed_stream_df.select(
            "start", "min_sentiment", "avg_sentiment", "max_sentiment",
            "pos_tweet_count_batch", "pos_tweet_count_stream",
            "pos_tweet_count_diff", "pos_tweet_count_pred",
            "neutral_tweet_count_batch", "neutral_tweet_count_stream",
            "neutral_tweet_count_diff", "neutral_tweet_count_pred",
            "neg_tweet_count_batch", "neg_tweet_count_stream",
            "neg_tweet_count_diff", "neg_tweet_count_pred")

    # write spark stream to console in output mode "complete"
    query = windowed_stream_df.writeStream\
                            .format("console")\
                            .outputMode("complete")\
                            .option("truncate", False)\
                            .start()
    query.awaitTermination()
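Each Kafka message value is expected to be a JSON document matching the stream schema above; a hypothetical example:

# {"timestamp": "2021-05-01T12:00:00.000Z", "text": "example tweet", "sentiment": 0.4}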
Example #25
def map_experiments_to_observations(
    experiment_df: DataFrame,
    line_df: DataFrame,
    mouse_df: DataFrame,
    embryo_df: DataFrame,
    allele_df: DataFrame,
    colony_df: DataFrame,
    pipeline_df: DataFrame,
    strain_df: DataFrame,
    ontology_df: DataFrame,
):
    experiment_df = experiment_df.withColumnRenamed("_sourceFile",
                                                    "experiment_source_file")
    experiment_df = experiment_df.withColumnRenamed("unique_id",
                                                    "experiment_id")
    experiment_df = experiment_df.alias("experiment")

    colony_df = colony_df.alias("colony")
    embryo_df = embryo_df.withColumn("_DOB", lit(None).cast(StringType()))
    embryo_df = embryo_df.withColumn("_VALUE", lit(None).cast(StringType()))
    mouse_df = mouse_df.withColumn("_stage", lit(None).cast(StringType()))
    mouse_df = mouse_df.withColumn("_stageUnit", lit(None).cast(StringType()))
    specimen_df = mouse_df.union(embryo_df.select(mouse_df.columns))

    specimen_df = specimen_df.withColumnRenamed("_sourceFile",
                                                "specimen_source_file")
    specimen_df = specimen_df.withColumnRenamed("unique_id", "specimen_id")
    specimen_df = specimen_df.alias("specimen")

    allele_df = allele_df.alias("allele")
    strain_df = strain_df.alias("strain")

    observation_df: DataFrame = experiment_df.join(
        specimen_df,
        (experiment_df["experiment._centreID"]
         == specimen_df["specimen._centreID"])
        & (experiment_df["experiment.specimenID"]
           == specimen_df["specimen._specimenID"]),
        "left_outer",
    )
    observation_df = observation_df.join(
        colony_df,
        (observation_df["specimen._colonyID"]
         == colony_df["colony.colony_name"]),
        "left_outer",
    )
    observation_df = observation_df.join(
        allele_df,
        observation_df["colony.allele_symbol"] ==
        allele_df["allele.alleleSymbol"],
        "left_outer",
    )

    experimental_observation_df = observation_df.where(
        (lower(col("specimen._colonyID")) != "baseline")
        & (col("specimen._isBaseline") != True)).join(
            strain_df,
            col("colony.colony_background_strain") == col("strain.strainName"),
            "left_outer",
        )

    baseline_observation_df = observation_df.where(
        (lower(col("specimen._colonyID")) == "baseline")
        | (col("specimen._isBaseline") == True)).join(
            strain_df,
            when(
                concat(lit("MGI:"),
                       col("specimen._strainID")) == col("strain.mgiStrainID"),
                concat(lit("MGI:"),
                       col("specimen._strainID")) == col("strain.mgiStrainID"),
            ).otherwise(col("specimen._strainID") == col("strain.strainName")),
            "left_outer",
        )

    # TODO: fall back to imits when it is missing and do the join again

    observation_df = baseline_observation_df.union(experimental_observation_df)

    simple_observation_df = process_parameter_values(observation_df,
                                                     pipeline_df,
                                                     "simpleParameter")
    simple_observation_df = add_observation_type(simple_observation_df)
    simple_observation_df = resolve_simple_value(simple_observation_df,
                                                 pipeline_df)
    simple_observation_df = unify_schema(simple_observation_df).select(
        Constants.OBSERVATION_COLUMNS)

    line_df = (line_df.withColumnRenamed(
        "_sourceFile", "experiment_source_file").withColumnRenamed(
            "unique_id",
            "experiment_id").withColumn("specimen_source_file",
                                        lit(None)).alias("experiment"))

    line_observation_df = line_df.join(
        colony_df, line_df["_colonyID"] == colony_df["colony.colony_name"])

    line_observation_df = line_observation_df.join(
        strain_df,
        col("colony.colony_background_strain") == col("strain.strainName"))

    line_observation_df = line_observation_df.join(
        allele_df,
        observation_df["colony.allele_symbol"] ==
        allele_df["allele.alleleSymbol"],
        "left_outer",
    )
    line_simple_observation_df = process_parameter_values(line_observation_df,
                                                          pipeline_df,
                                                          "simpleParameter",
                                                          exp_type="line")
    line_simple_observation_df = add_observation_type(
        line_simple_observation_df)
    line_simple_observation_df = resolve_simple_value(
        line_simple_observation_df, pipeline_df)
    line_simple_observation_df = line_simple_observation_df.withColumn(
        "specimen_id", lit(None))
    line_simple_observation_df = unify_schema(
        line_simple_observation_df).select(Constants.OBSERVATION_COLUMNS)

    simple_observation_df = simple_observation_df.union(
        line_simple_observation_df)

    body_weight_curve_observation_df = get_body_weight_curve_observations(
        simple_observation_df.where(
            col("observation_type") == "unidimensional"))

    simple_media_observation_df = process_parameter_values(
        observation_df, pipeline_df, "mediaParameter")
    if simple_media_observation_df is not None:
        simple_media_observation_df = resolve_simple_media_value(
            simple_media_observation_df)
        simple_media_observation_df = unify_schema(
            simple_media_observation_df).select(Constants.OBSERVATION_COLUMNS)

    ontological_observation_df = process_parameter_values(
        observation_df, pipeline_df, "ontologyParameter")
    ontological_observation_df = resolve_ontology_value(
        ontological_observation_df, ontology_df)
    ontological_observation_df = unify_schema(
        ontological_observation_df).select(Constants.OBSERVATION_COLUMNS)

    time_series_observation_df = process_parameter_values(
        observation_df, pipeline_df, "seriesParameter")
    time_series_observation_df = resolve_time_series_value(
        time_series_observation_df)
    time_series_observation_df = unify_schema(
        time_series_observation_df).select(Constants.OBSERVATION_COLUMNS)

    image_record_observation_df = process_parameter_values(
        observation_df, pipeline_df, "seriesMediaParameter")
    image_record_observation_df = resolve_image_record_value(
        image_record_observation_df)
    image_record_observation_df = resolve_image_record_parameter_association(
        image_record_observation_df, simple_observation_df)
    image_record_observation_df = unify_schema(
        image_record_observation_df).select(Constants.OBSERVATION_COLUMNS)

    observation_df = (
        simple_observation_df.union(ontological_observation_df).union(
            image_record_observation_df).union(time_series_observation_df).
        union(body_weight_curve_observation_df))
    if simple_media_observation_df is not None:
        observation_df = observation_df.union(simple_media_observation_df)
    observation_df = observation_df.where(col("parameter_status").isNull())
    observation_df = format_columns(observation_df).drop_duplicates()
    observation_df = observation_df.withColumn(
        "experiment_source_file",
        regexp_extract(col("experiment_source_file"),
                       "(.*\/)(.*\/.*\.xml)",
                       idx=2),
    )
    observation_df = observation_df.withColumn(
        "specimen_source_file",
        regexp_extract(col("specimen_source_file"),
                       "(.*\/)(.*\/.*\.xml)",
                       idx=2),
    )
    observation_df = observation_df.withColumn("life_stage_name", lit(None))
    observation_df = observation_df.withColumn("life_stage_acc", lit(None))
    for life_stage in Constants.PROCEDURE_LIFE_STAGE_MAPPER:
        life_stage_name = life_stage["lifeStage"]
        observation_df = observation_df.withColumn(
            "life_stage_name",
            when(
                col("life_stage_name").isNull(),
                when(
                    (col("procedure_stable_id").rlike("|".join(
                        [f"({ proc })" for proc in life_stage["procedures"]]))
                     | (col("developmental_stage_name") == life_stage_name)),
                    lit(life_stage_name),
                ).otherwise(lit(None)),
            ).otherwise(col("life_stage_name")),
        )
        observation_df = observation_df.withColumn(
            "life_stage_acc",
            when(
                col("life_stage_acc").isNull(),
                when(
                    (col("procedure_stable_id").rlike("|".join(
                        [f"({ proc })" for proc in life_stage["procedures"]]))
                     | (col("developmental_stage_name") == life_stage_name)),
                    lit(life_stage["lifeStageAcc"]),
                ).otherwise(lit(None)),
            ).otherwise(col("life_stage_acc")),
        )
    observation_df = observation_df.withColumn(
        "life_stage_name",
        when((col("life_stage_name").isNull()),
             lit("Early adult")).otherwise(col("life_stage_name")),
    )
    observation_df = observation_df.withColumn(
        "life_stage_acc",
        when((col("life_stage_acc").isNull()),
             lit("IMPCLS:0005")).otherwise(col("life_stage_acc")),
    )
    return observation_df
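
The life-stage block above only fills life_stage_name/life_stage_acc while they are still null, matching procedure_stable_id against a pipe-joined regex of procedure prefixes and falling back to "Early adult". A minimal standalone sketch of that pattern, using a made-up mapper and toy rows instead of Constants.PROCEDURE_LIFE_STAGE_MAPPER:

# Sketch only: toy_mapper and the rows below are invented for illustration.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when

spark = SparkSession.builder.master("local[1]").getOrCreate()

toy_mapper = [
    {"lifeStage": "Embryo", "lifeStageAcc": "TOY:0001", "procedures": ["TOY_EMB", "TOY_EVM"]},
    {"lifeStage": "Late adult", "lifeStageAcc": "TOY:0002", "procedures": ["TOY_LA"]},
]

df = spark.createDataFrame(
    [("TOY_EMB_044_001", None), ("TOY_CAL_003_001", None), ("TOY_XXX_001_001", "Late adult")],
    schema="procedure_stable_id string, developmental_stage_name string",
)

df = df.withColumn("life_stage_name", lit(None).cast("string"))
for life_stage in toy_mapper:
    pattern = "|".join(f"({proc})" for proc in life_stage["procedures"])
    df = df.withColumn(
        "life_stage_name",
        when(
            col("life_stage_name").isNull()
            & (col("procedure_stable_id").rlike(pattern)
               | (col("developmental_stage_name") == life_stage["lifeStage"])),
            lit(life_stage["lifeStage"]),
        ).otherwise(col("life_stage_name")),
    )
# rows matched by no mapper entry get the same default as above
df = df.withColumn(
    "life_stage_name",
    when(col("life_stage_name").isNull(), lit("Early adult")).otherwise(col("life_stage_name")),
)
df.show()  # expect Embryo / Early adult / Late adult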
Exemplo n.º 26
0
 def alias(self, *args, **kwargs):
     return DataFrame.alias(self, *args, **kwargs)
Exemplo n.º 27
0
    def transform_with_data_frame_single_select(self, df: DataFrame,
                                                source_df: DataFrame,
                                                keys: List[str]) -> DataFrame:
        # get all the column specs
        column_specs: List[Column] = [
            child_mapper.get_column_specs(source_df=source_df)[column_name]
            for column_name, child_mapper in self.mappers.items()
        ]

        try:
            # print("========== NEW 2 ==============")
            if not self.drop_key_columns:
                column_specs = [col(f"b.{c}") for c in keys] + column_specs

            if self.enable_logging:
                print(f"-------- automapper ({self.view}) column specs ------")
                print(self.to_debug_string(source_df=source_df))
                print(
                    f"-------- end automapper ({self.view}) column specs ------"
                )
                print(
                    f"-------- automapper ({self.source_view}) source_df schema ------"
                )
                source_df.printSchema()
                print(
                    f"-------- end automapper ({self.source_view}) source_df schema ------"
                )

            if self.check_schema_for_all_columns:
                for column_name, mapper in self.mappers.items():
                    check_schema_result: Optional[
                        CheckSchemaResult] = mapper.check_schema(
                            parent_column=None, source_df=source_df)
                    if check_schema_result and len(
                            check_schema_result.result.errors) > 0:
                        print(
                            f"==== ERROR: Schema Mismatch [{column_name}] ==="
                            f"{str(check_schema_result)}")
                    else:
                        print(f"==== Schema Matches: [{column_name}] ====")

            # run all the selects
            df = source_df.alias("b").select(*column_specs)
            # write out final checkpoint for this automapper
            if self.checkpoint_path:
                checkpoint_path = Path(self.checkpoint_path).joinpath(
                    self.view or "df").joinpath("final")
                df.write.parquet(str(checkpoint_path))
                df = df.sql_ctx.read.parquet(str(checkpoint_path))
        except AnalysisException:
            # iterate through each column to find the problem child
            for column_name, mapper in self.mappers.items():
                try:
                    print(f"========= Processing {column_name} =========== ")
                    column_spec = mapper.get_column_specs(
                        source_df=source_df)[column_name]
                    source_df.alias("b").select(column_spec).limit(1).count()
                    print(
                        f"========= Done Processing {column_name} =========== "
                    )
                except AnalysisException as e2:
                    print(
                        f"========= checking Schema {column_name} =========== "
                    )
                    check_schema_result = mapper.check_schema(
                        parent_column=None, source_df=source_df)
                    msg: str = ""
                    if e2.desc.startswith("cannot resolve 'array"):
                        msg = "Looks like the elements of the array have different structures.  " \
                              "All items in an array should have the exact same structure.  " \
                              "You can pass in include_nulls to AutoMapperDataTypeComplexBase to force it to create " \
                              "null values for each element in the structure. "
                    msg += self.get_message_for_exception(
                        column_name + ": " + str(check_schema_result), df, e2,
                        source_df)
                    raise AutoMapperException(msg) from e2
        except Exception as e:
            print("====  OOPS ===========")
            msg = self.get_message_for_exception("", df, e, source_df)
            raise Exception(msg) from e

        print(f"========= Finished AutoMapper {self.view} =========== ")
        return df
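
The except branch above narrows an AnalysisException down by re-running every column spec on its own against the aliased source frame. A standalone sketch of that isolation trick with hypothetical column specs, one of them deliberately unresolvable:

# Sketch only: the column specs are invented; "no_such_column" is broken on
# purpose so the per-column retry can point at it.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.utils import AnalysisException

spark = SparkSession.builder.master("local[1]").getOrCreate()
source_df = spark.createDataFrame([(1, "a")], ["id", "name"])

column_specs = {
    "id_out": col("b.id"),
    "broken_out": col("b.no_such_column"),
    "name_out": col("b.name"),
}

try:
    source_df.alias("b").select(
        *[spec.alias(name) for name, spec in column_specs.items()]).show()
except AnalysisException:
    # same trick as above: try each spec on its own to find the problem child
    for name, spec in column_specs.items():
        try:
            source_df.alias("b").select(spec.alias(name)).limit(1).count()
            print(f"{name}: ok")
        except AnalysisException as e:
            print(f"{name}: failed -> {e}")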
Exemplo n.º 28
0
    def get_history_product(self, old_dataframe: DataFrame, new_dataframe: DataFrame):

        cols = old_dataframe.columns

        try:
            pk = self.primary_keys
            expressions = []
            for p in pk:
                expressions.append(f'old.{p} <=> new.{p}')
            expression = ' and '.join(expressions)
        except TypeError:
            expressions = []
            for c in cols:
                expressions.append(f'old.{c} <=> new.{c}')
            expression = ' and '.join(expressions)

        not_changed_keys = []
        for c in cols:
            not_changed_keys.append(f'old.{c} <=> new.{c}')
        not_changed_expression = ' and '.join(not_changed_keys)

        old_dataframe = old_dataframe.alias('old')
        new_dataframe = new_dataframe.alias('new')

        not_changed_rows = old_dataframe.join(new_dataframe, expr(not_changed_expression), how='semi')

        changed_rows_new = new_dataframe.join(old_dataframe, expr(expression), how='semi')\
            .exceptAll(not_changed_rows)

        changed_rows_old = old_dataframe.join(new_dataframe, expr(expression), how='semi') \
            .exceptAll(not_changed_rows)

        old_rows = old_dataframe\
            .exceptAll(not_changed_rows) \
            .exceptAll(changed_rows_old)

        new_rows = new_dataframe\
            .exceptAll(not_changed_rows) \
            .exceptAll(changed_rows_new)

        not_changed_rows = not_changed_rows.withColumn('meta', lit('not_changed'))
        changed_rows = changed_rows_new.withColumn('meta', lit('changed'))
        old_rows = old_rows.withColumn('meta', lit('deleted'))
        new_rows = new_rows.withColumn('meta', lit('inserted'))

        result_dataframe = not_changed_rows\
                            .union(changed_rows)\
                            .union(old_rows)\
                            .union(new_rows)\
                            .sort('id', 'name')

        return result_dataframe
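
A toy usage sketch of the snapshot comparison above: null-safe key equality (<=>) plus exceptAll splits an old and a new snapshot into not-changed, changed, deleted and inserted rows. The data and the single-column key are invented.

# Sketch only: toy data and a hypothetical primary key ("id").
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, lit

spark = SparkSession.builder.master("local[1]").getOrCreate()

old_df = spark.createDataFrame(
    [(1, "apple"), (2, "banana"), (3, "cherry")], ["id", "name"]).alias("old")
new_df = spark.createDataFrame(
    [(1, "apple"), (2, "blueberry"), (4, "date")], ["id", "name"]).alias("new")

# <=> is null-safe equality, so null keys still compare as equal
same_key = expr("old.id <=> new.id")
same_row = expr("old.id <=> new.id and old.name <=> new.name")

not_changed = old_df.join(new_df, same_row, "semi")
changed_new = new_df.join(old_df, same_key, "semi").exceptAll(not_changed)
changed_old = old_df.join(new_df, same_key, "semi").exceptAll(not_changed)
deleted = old_df.exceptAll(not_changed).exceptAll(changed_old)
inserted = new_df.exceptAll(not_changed).exceptAll(changed_new)

(not_changed.withColumn("meta", lit("not_changed"))
 .union(changed_new.withColumn("meta", lit("changed")))
 .union(deleted.withColumn("meta", lit("deleted")))
 .union(inserted.withColumn("meta", lit("inserted")))
 .orderBy("id")
 .show())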
Exemplo n.º 29
0
    def _transform_with_data_frame_single_select(
        self, df: DataFrame, source_df: DataFrame, keys: List[str]
    ) -> DataFrame:
        """
        This internal function transforms the data frame using the mappings in a single select command


        :param df: destination data frame
        :param source_df: source data frame
        :param keys: key columns
        """
        # get all the column specs
        column_specs: List[Column] = [
            child_mapper.get_column_specs(source_df=source_df)[column_name]
            for column_name, child_mapper in self.mappers.items()
        ]

        try:
            # print("========== NEW 2 ==============")
            if self.copy_all_unmapped_properties:
                # find all source properties not mapped
                source_properties: List[str] = source_df.columns
                mapped_properties: List[str] = list(self.mappers.keys())
                unmapped_properties: List[str] = [
                    p for p in source_properties if p not in mapped_properties
                ]
                copy_all_unmapped_properties_exclude: List[str] = (
                    self.copy_all_unmapped_properties_exclude or []
                )
                column_schema: Dict[str, StructField] = (
                    {f.name: f for f in source_df.schema} if self.use_schema else {}
                )

                # for each unmapped property add a simple A.column()
                column_specs.extend(
                    [
                        AutoMapperDataTypeColumn(column_name)
                        .get_column_spec(
                            source_df=source_df,
                            current_column=None,
                            parent_columns=None,
                        )
                        .cast(column_schema[column_name].dataType)
                        if column_name in column_schema
                        else AutoMapperDataTypeColumn(column_name).get_column_spec(
                            source_df=source_df,
                            current_column=None,
                            parent_columns=None,
                        )
                        for column_name in unmapped_properties
                        if column_name not in copy_all_unmapped_properties_exclude
                    ]
                )

            if not self.drop_key_columns:
                column_specs = [col(f"b.{c}") for c in keys] + column_specs

            print(f"COLUMN SPECS --- {column_specs}")

            self.logger.debug(f"-------- automapper ({self.view}) column specs ------")
            self.logger.debug(self.to_debug_string(source_df=source_df))
            self.logger.debug(
                f"-------- end automapper ({self.view}) column specs ------"
            )
            self.logger.debug(
                f"-------- automapper ({self.source_view}) source_df schema ------"
            )
            # noinspection PyProtectedMember
            self.logger.debug(source_df._jdf.schema().treeString())  # type: ignore
            self.logger.debug(
                f"-------- end automapper ({self.source_view}) source_df schema ------"
            )

            if self.check_schema_for_all_columns:
                for column_name, mapper in self.mappers.items():
                    check_schema_result: Optional[
                        CheckSchemaResult
                    ] = mapper.check_schema(parent_column=None, source_df=source_df)
                    if (
                        check_schema_result
                        and len(check_schema_result.result.errors) > 0
                    ):
                        self.logger.info(
                            f"==== Schema Mismatch [{column_name}] ==="
                            f"{str(check_schema_result)}"
                        )
                    else:
                        self.logger.debug(f"==== Schema Matches: [{column_name}] ====")

            # run all the selects
            df = source_df.alias("b").select(*column_specs)
            # write out final checkpoint for this automapper
            if self.checkpoint_path:
                checkpoint_path = (
                    Path(self.checkpoint_path)
                    .joinpath(self.view or "df")
                    .joinpath("final")
                )
                df.write.parquet(str(checkpoint_path))
                df = df.sql_ctx.read.parquet(str(checkpoint_path))
        except (AnalysisException, ValueError):
            self.logger.warning(
                f"-------- automapper ({self.view}) column specs ------"
            )
            self.logger.warning(self.to_debug_string(source_df=source_df))
            self.logger.warning(
                f"-------- end automapper ({self.view}) column specs ------"
            )
            self.logger.debug(
                f"-------- automapper ({self.source_view}) source_df schema ------"
            )
            # noinspection PyProtectedMember
            self.logger.debug(source_df._jdf.schema().treeString())  # type: ignore
            self.logger.debug(
                f"-------- end automapper ({self.source_view}) source_df schema ------"
            )
            # iterate through each column to find the problem child
            for column_name, mapper in self.mappers.items():
                try:
                    self.logger.debug(
                        f"========= Processing {column_name} =========== "
                    )
                    column_spec = mapper.get_column_specs(source_df=source_df)[
                        column_name
                    ]
                    source_df.alias("b").select(column_spec).limit(1).count()
                    self.logger.debug(
                        f"========= Done Processing {column_name} =========== "
                    )
                except (AnalysisException, ValueError) as e2:
                    self.logger.error(
                        f"=========  Processing {column_name} FAILED =========== "
                    )
                    column_spec = mapper.get_column_specs(source_df=source_df)[
                        column_name
                    ]
                    self.logger.warning(
                        ColumnSpecWrapper(column_spec).to_debug_string()
                    )
                    self.logger.error(
                        f"========= checking schema for failed column {column_name} =========== "
                    )
                    check_schema_result = mapper.check_schema(
                        parent_column=None, source_df=source_df
                    )
                    msg: str = ""
                    if isinstance(e2, AnalysisException) and e2.desc.startswith(
                        "cannot resolve 'array"
                    ):
                        msg = (
                            "Looks like the elements of the array have different structures.  "
                            "All items in an array should have the exact same structure.  "
                            "You can pass in include_nulls to AutoMapperDataTypeComplexBase to force it to create "
                            "null values for each element in the structure. \n"
                        )
                        # find the data types of each item in the list
                    column_values: Optional[List[Any]] = None
                    # This can cause GC overhead limit reached error obfuscating the actual error
                    # try:
                    #     # get column value in first row
                    #     column_values: Optional[List[Any]] = (
                    #         [
                    #             row.asDict(recursive=True)[column_name]
                    #             for row in source_df.select(column_name)
                    #             .limit(5)
                    #             .collect()
                    #         ]
                    #         if bool(source_df.head(1))  # df is not empty
                    #         else None
                    #     )
                    # except Exception as e3:
                    #     print(e3)
                    #     column_values = None

                    msg += self._get_message_for_exception(
                        column_name=column_name,
                        check_schema_result=check_schema_result,
                        df=df,
                        e=e2,
                        source_df=source_df,
                        column_values=column_values,
                    )
                    raise AutoMapperAnalysisException(
                        automapper_name=self.view,
                        msg=msg,
                        column_name=column_name,
                        check_schema_result=check_schema_result,
                        column_values=column_values,
                    ) from e2
        except Exception as e:
            self.logger.error("====  OOPS ===========")
            msg = self._get_message_for_exception(
                column_name="",
                check_schema_result=None,
                df=df,
                e=e,
                source_df=source_df,
                column_values=None,
            )
            raise Exception(msg) from e

        self.logger.debug(f"========= Finished AutoMapper {self.view} =========== ")
        return df
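
The checkpoint step above is a plain parquet round-trip: write the mapped frame out, then read it back so downstream stages start from a fresh plan instead of the full mapping lineage. A minimal sketch of that trick; the path and frame are made up, and spark.read stands in for the snippet's df.sql_ctx.read:

# Sketch only: hypothetical path and data.
from pathlib import Path
from tempfile import mkdtemp

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.range(10).withColumnRenamed("id", "member_id")

checkpoint_path = Path(mkdtemp()).joinpath("my_view").joinpath("final")
df.write.mode("overwrite").parquet(str(checkpoint_path))
# re-reading the files yields a frame whose plan starts at the parquet scan,
# not at the (potentially huge) mapping expression tree
df = spark.read.parquet(str(checkpoint_path))
print(df.count())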
Exemplo n.º 30
0
def transform_data(clickstream_df: DataFrame, purchases_df: DataFrame,
                   logger: Log4jWrapper) -> DataFrame:
    """
    Builds purchases attribution dataset from mobile app clickstream
    and user purchases data.

    :param clickstream_df: mobile app clickstream events as DataFrame.
    :param purchases_df: user purchases as DataFrame.
    :param logger: logging wrapper used for progress messages.
    :return: purchases attribution DataFrame.
    """
    user_window = Window.partitionBy("userId").orderBy("eventTime")

    logger.warning("build leading events dataset ...")
    leading_events_df = (
        clickstream_df
        .filter(F.col("eventType").isin({"app_open", "app_close"}))
        .withColumn("nextEventId", F.lead(F.col("eventId"), 1).over(user_window))
        .select(
            F.col("userId"),
            F.col("eventTime").alias("startSessionEventTime"),
            F.col("eventType").alias("startSessionEventType"),
            F.col("attributes"),
            F.col("nextEventId"),
        )
        .filter(F.col("startSessionEventType") == "app_open")
        .withColumn("campaignId",
                    F.get_json_object(F.col("attributes"), "$.campaign_id"))
        .withColumn("channelId",
                    F.get_json_object(F.col("attributes"), "$.channel_id"))
        .drop("attributes"))

    leading_events_df.show()

    logger.warning("calculate user sessions ...")
    sessions_df = (leading_events_df.alias("leading_events").join(
        clickstream_df.alias("all_events"),
        on=F.col("leading_events.nextEventId") == F.col("all_events.eventId"),
        how="left").select(
            F.col("leading_events.userId"),
            F.col("leading_events.startSessionEventTime"),
            F.col("leading_events.campaignId"),
            F.col("leading_events.channelId"),
            F.col("all_events.eventTime").alias("endSessionEventTime"),
        ).withColumn("sessionId", F.monotonically_increasing_id()))

    sessions_df.show()

    logger.warning("append session to each event ...")
    sessioned_purchases_df = (clickstream_df.alias("c").filter(
        F.col("c.eventType") == "purchase").join(
            sessions_df.alias("s"),
            on=[
                F.col("c.userId") == F.col("s.userId"),
                F.col("c.eventTime") >= F.col("s.startSessionEventTime"),
                (F.col("c.eventTime") <= F.col("s.endSessionEventTime"))
                | F.col("s.endSessionEventTime").isNull(),
            ]).select(
                F.col("s.userId"),
                F.col("s.sessionId"),
                F.col("s.campaignId"),
                F.col("s.channelId"),
                F.get_json_object(F.col("c.attributes"),
                                  "$.purchase_id").alias("purchaseId"),
            ).orderBy(F.col("userId"), F.col("eventTime")))

    sessioned_purchases_df.show()

    logger.warning("build purchases attribution ...")
    projection_df = (sessioned_purchases_df.alias("s").join(
        purchases_df.alias("p"),
        on=F.col("p.purchaseId") == F.col("s.purchaseId")).select(
            F.col("p.purchaseId"), F.col("p.purchaseTime"),
            F.col("p.billingCost"), F.col("p.isConfirmed"),
            F.col("s.sessionId"), F.col("s.campaignId"), F.col("s.channelId")))

    projection_df.show()

    return projection_df
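
A small standalone sketch of the sessionisation step at the top of this function: keep only app_open/app_close events and use lead() over a per-user window to attach the id of the next boundary event to every app_open. The clickstream rows are invented:

# Sketch only: made-up clickstream data.
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()

clickstream_df = spark.createDataFrame(
    [
        ("u1", "e1", "app_open", "2024-01-01 10:00:00"),
        ("u1", "e2", "purchase", "2024-01-01 10:05:00"),
        ("u1", "e3", "app_close", "2024-01-01 10:10:00"),
        ("u1", "e4", "app_open", "2024-01-01 11:00:00"),
    ],
    ["userId", "eventId", "eventType", "eventTime"],
)

user_window = Window.partitionBy("userId").orderBy("eventTime")

sessions = (
    clickstream_df
    .filter(F.col("eventType").isin("app_open", "app_close"))
    .withColumn("nextEventId", F.lead("eventId", 1).over(user_window))
    .filter(F.col("eventType") == "app_open")
    .select("userId",
            F.col("eventTime").alias("startSessionEventTime"),
            "nextEventId")
)
sessions.show()
# e1 pairs with e3 (closed session); e4 has no next event yet, which is the
# null endSessionEventTime case the join above allows for.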