Example #1
import logging

from pyspark.sql import DataFrame
from pyspark.sql.functions import col


def remove_false_positives(az_phewas_df: DataFrame) -> DataFrame:
    """Remove associations present in the synonymous negative control."""

    # Gene/phenotype pairs flagged by the synonymous collapsing model are
    # treated as false positives.
    false_positives = (
        az_phewas_df.filter(col('CollapsingModel') == 'syn')
        .select('Gene', 'Phenotype')
        .distinct()
    )
    # Keep only the associations that do not appear in the negative control.
    true_positives = az_phewas_df.join(
        false_positives, on=['Gene', 'Phenotype'], how='left_anti'
    ).distinct()
    logging.info(
        f'{az_phewas_df.count() - true_positives.count()} false positive association records have been dropped.'
    )

    return true_positives
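A minimal usage sketch for the function above; the SparkSession setup and the toy gene/phenotype rows are assumptions for illustration, not data from the source:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
az_phewas_df = spark.createDataFrame(
    [('BRCA1', 'Breast cancer', 'ptv'),
     ('BRCA1', 'Breast cancer', 'syn'),  # pair flagged by the negative control
     ('LDLR', 'High cholesterol', 'ptv')],
    ['Gene', 'Phenotype', 'CollapsingModel'])

# Only the LDLR row survives: the BRCA1 / Breast cancer pair also appears
# under the synonymous model, so every record for that pair is removed.
remove_false_positives(az_phewas_df).show()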
Example #2
from pyspark.sql import dataframe


def type_replace(
    main_df: dataframe.DataFrame,
    type_df: dataframe.DataFrame,
    replaced_column: str,
) -> dataframe.DataFrame:
    """Add vehicle type information from the type table to the preliminary
    temporary table when the type codes in both tables match."""
    return (
        main_df.join(
            type_df,
            main_df[replaced_column] == type_df['TXTCode'],
            'left',
        )
        .drop(replaced_column, 'TXTCode')
        .withColumnRenamed('TXTTextLong', replaced_column)
    )
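A hypothetical call to type_replace; the TXTCode/TXTTextLong column names come from the function above, while the vehicle rows and the type_code column are invented for illustration:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
main_df = spark.createDataFrame([(1, 'PKW'), (2, 'LKW')],
                                ['vehicle_id', 'type_code'])
type_df = spark.createDataFrame([('PKW', 'Passenger car'), ('LKW', 'Truck')],
                                ['TXTCode', 'TXTTextLong'])

# After the call, 'type_code' holds the long description instead of the raw code.
type_replace(main_df, type_df, 'type_code').show()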
Example #3
def _apply_joiner_to_df(self, source_name: str, joiner: dict,
                        df: DataFrame) -> DataFrame:
    """Join the joiner's target dataframe onto df unless it was joined already."""
    target_join_df = joiner["target_join_df"]
    if not isinstance(target_join_df, DataFrame):
        exception_msg = "{} cannot be resolved! Please add the source to the partner.".format(
            source_name)
        raise Exception(exception_msg)

    joiner_df = target_join_df.alias(source_name)
    if self._is_join_needed(joiner_df, df):
        # Fall back to an empty optimizer hint when none is configured.
        joiner.setdefault("optimizer", "")
        optimized_joiner_df = self._optimize_joiner(joiner_df,
                                                    joiner["optimizer"])
        df = df.join(optimized_joiner_df,
                     on=joiner["filter_condition"],
                     how=joiner["join_type"])
    else:
        print("{} has already been joined to base df.".format(source_name))
    return df
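The method above only documents the joiner dict through the keys it reads, so the configuration below is a reconstruction under assumptions: the DataFrames, the key values, and the loader object in the commented call are all hypothetical:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
base_df = spark.createDataFrame([(1, 'Alice')], ['customer_id', 'name'])
orders_df = spark.createDataFrame([(1, 42.0)], ['customer_id', 'amount'])

# Hypothetical joiner configuration, inferred only from the keys read above.
joiner = {
    "target_join_df": orders_df,          # DataFrame to join onto the base df
    "filter_condition": ["customer_id"],  # forwarded to DataFrame.join(on=...)
    "join_type": "left",                  # forwarded to DataFrame.join(how=...)
    # "optimizer" is optional; the method falls back to "" when it is missing.
}
# The call itself needs the surrounding class (with _is_join_needed and
# _optimize_joiner), e.g. df = loader._apply_joiner_to_df("orders", joiner, base_df)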
Example #4
from pyspark.sql import DataFrame as SparkDataFrame
from pyspark.sql.functions import isnull, udf, when


def model_dim_countries(df_countries: SparkDataFrame,
                        df_temperature: SparkDataFrame, path: str) -> None:
    """Model the countries dimension data.

    Join the countries data with the temperature data and write the result
    to a parquet file.

    :param df_countries: countries data frame
    :param df_temperature: temperature data frame
    :param path: the path of the parquet file to write
    """
    # join the tables on the lower-cased country name
    df = df_countries.join(df_temperature, 'lower_name', how='left')
    # normalize the name: fall back to a title-cased lower_name when missing
    title_case_udf = udf(lambda x: x and x.title())
    df = df.withColumn(
        'name',
        when(isnull(df['name']),
             title_case_udf(df['lower_name'])).otherwise(df['name']))
    # drop duplicated columns
    df = df.drop('lower_name', 'country')

    # write the data to a parquet file
    df.write.mode('overwrite').parquet(path)
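A small sketch of how model_dim_countries might be called; the SparkSession, the toy country and temperature rows, and the ./dim_countries output path are assumptions:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df_countries = spark.createDataFrame(
    [('germany', 'Germany', 'DE'), ('france', None, 'FR')],
    ['lower_name', 'name', 'country'])
df_temperature = spark.createDataFrame(
    [('germany', 9.6), ('france', 11.2)],
    ['lower_name', 'avg_temperature'])

# The missing French name is rebuilt as 'France' by the title-case UDF,
# and the joined result is written to ./dim_countries as parquet.
model_dim_countries(df_countries, df_temperature, './dim_countries')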
Example #5
from pyspark.sql import DataFrame
from pyspark.sql.functions import avg, col


def create_final_dataframe(distances: DataFrame, review: DataFrame,
                           yearly_weather_pivot: DataFrame) -> DataFrame:
    """
    Combines the 3 dataframes from the previous steps to create the final dataframe. This dataframe contains
    not only the actual review and the weather metrics, but also the date, state, and city, so one can easily
    extend this final dataframe further.
    """
    distance_review_join_condition = [
        distances.business_id == review.business_id
    ]

    distances_review = (
        distances.join(review,
                       on=distance_review_join_condition,
                       how='inner')
        .select(col('city'), col('state'), col('text'), col('review_date'),
                col('us_station_id'))
        .repartition(200, 'us_station_id', 'review_date')
    )

    distances_review_weather_join_condition = [
        distances_review.review_date == yearly_weather_pivot.weather_date,
        distances_review.us_station_id == yearly_weather_pivot.station_id
    ]

    final_table = (
        distances_review.join(yearly_weather_pivot,
                              on=distances_review_weather_join_condition,
                              how='inner')
        .groupby('city', 'state', 'review_date', 'text')
        .agg(avg('PRCP').alias('prcp'),
             avg('SNOW').alias('snow'),
             avg('SNWD').alias('snwd'),
             avg('TMAX').alias('tmax'),
             avg('TMIN').alias('tmin'))
        .repartition(200)
    )

    return final_table
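A toy invocation of create_final_dataframe; the three input schemas below are guesses inferred from the columns the function touches, and the rows are made up:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
distances = spark.createDataFrame(
    [('b1', 's1', 'Phoenix', 'AZ')],
    ['business_id', 'us_station_id', 'city', 'state'])
review = spark.createDataFrame(
    [('b1', 'Great tacos.', '2019-07-04')],
    ['business_id', 'text', 'review_date'])
yearly_weather_pivot = spark.createDataFrame(
    [('s1', '2019-07-04', 0.0, 0.0, 0.0, 41.1, 27.8)],
    ['station_id', 'weather_date', 'PRCP', 'SNOW', 'SNWD', 'TMAX', 'TMIN'])

# Yields one row per (city, state, review_date, text) with the averaged metrics.
create_final_dataframe(distances, review, yearly_weather_pivot).show()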
def join_loans_and_repayments_combined(df1: DataFrame, df2: DataFrame) -> DataFrame:
    """Join the loans and repayments dataframes on their shared LoanID column."""
    return df1.join(df2, "LoanID")
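A quick illustrative call; the loan and repayment rows are invented. Joining on the column name (rather than an equality expression) keeps a single LoanID column in the result:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
loans = spark.createDataFrame([('L1', 1000.0)], ['LoanID', 'amount'])
repayments = spark.createDataFrame([('L1', 250.0)], ['LoanID', 'repaid'])

# Result columns: LoanID, amount, repaid; the join key appears only once.
join_loans_and_repayments_combined(loans, repayments).show()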