def remove_false_positives(az_phewas_df: DataFrame) -> DataFrame:
    """Remove associations present in the synonymous negative control.

    Gene/phenotype pairs that show an association under the synonymous
    ('syn') collapsing model are treated as false positives and removed
    from the whole dataset with a left anti-join.

    :param az_phewas_df: AZ PheWAS associations; must contain the
        'CollapsingModel', 'Gene' and 'Phenotype' columns.
    :return: the distinct input rows without the false-positive pairs.
    """
    false_positives = (
        az_phewas_df.filter(col('CollapsingModel') == 'syn')
        .select('Gene', 'Phenotype')
        .distinct()
    )
    true_positives = az_phewas_df.join(
        false_positives, on=['Gene', 'Phenotype'], how='left_anti'
    ).distinct()
    # Lazy %-style args so the message (and the string formatting) is only
    # built when INFO is enabled. NOTE(review): the two count() calls each
    # trigger a Spark action on an uncached dataframe.
    logging.info(
        '%d false positive evidence of association have been dropped.',
        az_phewas_df.count() - true_positives.count(),
    )
    return true_positives
def type_replace(
    main_df: dataframe.DataFrame,
    type_df: dataframe.DataFrame,
    replaced_column: str,
) -> dataframe.DataFrame:
    """Replace a vehicle-type code column with its long-text description.

    Left-joins the type table where ``replaced_column`` equals the type
    table's 'TXTCode', drops both code columns, then renames the joined
    'TXTTextLong' description column back to ``replaced_column``.

    :param main_df: preliminary temporary table holding the type code.
    :param type_df: type lookup table with 'TXTCode' and 'TXTTextLong'.
    :param replaced_column: name of the code column to replace in place.
    :return: dataframe with the code column swapped for its description.
    """
    code_matches = main_df[replaced_column] == type_df['TXTCode']
    joined = main_df.join(type_df, code_matches, 'left')
    without_codes = joined.drop(replaced_column, "TXTCode")
    return without_codes.withColumnRenamed('TXTTextLong', replaced_column)
def _apply_joiner_to_df(self, source_name: str, joiner: dict, df: DataFrame):
    """Join the joiner's target dataframe onto ``df`` when not already joined.

    :param source_name: alias given to the joined source; also used in
        the skip/error messages.
    :param joiner: configuration dict with keys 'target_join_df',
        'filter_condition', 'join_type' and optionally 'optimizer'
        (defaulted to '' in place when absent).
    :param df: base dataframe to extend.
    :return: ``df`` with the optimized joiner dataframe joined on, or
        ``df`` unchanged when the join is not needed.
    :raises Exception: when 'target_join_df' is not a DataFrame.
    """
    target_df = joiner["target_join_df"]
    # Bug fix: validate the type BEFORE calling .alias(). Previously the
    # alias call ran first, so a non-DataFrame target (e.g. an unresolved
    # source) raised AttributeError instead of the intended error below.
    if not isinstance(target_df, DataFrame):
        exception_msg = "{} cannot be resolved!!! Please add the source to the partner.".format(
            source_name)
        raise Exception(exception_msg)
    joiner_df = target_df.alias(source_name)
    if self._is_join_needed(joiner_df, df):
        # Default the optimizer hint in place when the caller omitted it
        # (same mutation semantics as the original ternary assignment).
        joiner.setdefault("optimizer", "")
        optimized_joiner_df = self._optimize_joiner(joiner_df, joiner["optimizer"])
        df = df.join(
            optimized_joiner_df,
            on=joiner["filter_condition"],
            how=joiner["join_type"],
        )
    else:
        print("{} has already been joined to base df.".format(source_name))
    return df
def model_dim_countries(df_countries: SparkDataFrame, df_temperature: SparkDataFrame, path: str) -> None:
    """Model dimension countries data

    Join countries data and temperature data, fill missing country names
    from the lower-cased name, and write the result as parquet.

    :param df_countries: countries data frame
    :param df_temperature: temperature data frame
    :param path: the path of parquet file to write
    """
    # Left join keeps every country even without temperature rows.
    joined = df_countries.join(df_temperature, 'lower_name', how='left')

    # Backfill missing names by title-casing the lower-cased join key
    # (the `v and ...` guard keeps None inputs as None).
    to_title = udf(lambda v: v and v.title())
    named = joined.withColumn(
        'name',
        when(isnull(joined['name']),
             to_title(joined['lower_name'])).otherwise(joined['name']))

    # Drop the join key and the duplicated country column before writing.
    named.drop('lower_name', 'country').write.mode('overwrite').parquet(path)
def create_final_dataframe(distances: DataFrame, review: DataFrame,
                           yearly_weather_pivot: DataFrame) -> DataFrame:
    """
    Combines the 3 dataframes from the previous steps to create the final
    dataframe. The result carries not only the actual review and the
    weather metrics, but also the date, state, and city, so this final
    dataframe can easily be extended further.
    """
    # Attach each review to its nearest US weather station.
    on_business = [distances.business_id == review.business_id]
    distances_review = (
        distances.join(review, on=on_business, how='inner')
        .select(
            col('city'),
            col('state'),
            col('text'),
            col('review_date'),
            col('us_station_id'),
        )
        # Repartition on the upcoming join keys to limit shuffling.
        .repartition(200, 'us_station_id', 'review_date')
    )

    # Match weather readings by station and date, then average the
    # metrics per city/state/date/review text.
    on_station_and_date = [
        distances_review.review_date == yearly_weather_pivot.weather_date,
        distances_review.us_station_id == yearly_weather_pivot.station_id,
    ]
    final_table = (
        distances_review.join(
            yearly_weather_pivot, on=on_station_and_date, how='inner')
        .groupby('city', 'state', 'review_date', 'text')
        .agg(
            avg('PRCP').alias('prcp'),
            avg('SNOW').alias('snow'),
            avg('SNWD').alias('snwd'),
            avg('TMAX').alias('tmax'),
            avg('TMIN').alias('tmin'),
        )
        .repartition(200)
    )
    return final_table
def join_loans_and_repayments_combined(df1: DataFrame, df2: DataFrame):
    """Inner-join the loans and repayments dataframes on 'LoanID'."""
    return df1.join(df2, on="LoanID")