import numpy as np
import pandas as pd
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import functions as F


def mutual_info(sdf, colnames):
    """Pairwise mutual information (in bits) between the given discrete columns.

    `check_columns`, `distribution`, `dense_to_array` and `disassemble` are
    helper functions assumed to be defined or imported elsewhere in this module.
    """
    check_columns(sdf, colnames)
    n = len(colnames)

    # Marginal distributions p(x) for every column.
    probs = []
    for i in range(n):
        probs.append(distribution(sdf, colnames[i]))

    res = np.zeros(shape=(n, n))
    for i in range(n):
        for j in range(i, n):
            # Joint distribution p(x, y) for the column pair.
            tdf = VectorAssembler(inputCols=[colnames[i], colnames[j]],
                                  outputCol='__vectors').transform(sdf)
            tdf = distribution(tdf, '__vectors')
            tdf = disassemble(dense_to_array(tdf, '__col', '__features'),
                              '__features')
            # Attach the marginals, then sum p(x, y) * log2(p(x, y) / (p(x) p(y))).
            tdf = tdf.join(probs[i].toDF('__features_0', '__p0'),
                           on='__features_0')
            tdf = tdf.join(probs[j].toDF('__features_1', '__p1'),
                           on='__features_1')
            mi = tdf.select(
                F.sum(
                    F.expr(
                        'log2(__probability / (__p0 * __p1)) * __probability'))
            ).take(1)[0][0]
            res[i, j] = mi
            res[j, i] = mi
    return pd.DataFrame(res, index=colnames, columns=colnames)
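# Minimal usage sketch for mutual_info (illustrative only): the toy DataFrame,
# the column names, and the `spark` session below are hypothetical, and the
# helper functions used inside mutual_info are assumed to be available.
def _example_mutual_info(spark):
    """Hypothetical sketch: pairwise MI over two discrete toy columns."""
    sdf = spark.createDataFrame(
        [(0, 0), (0, 1), (1, 0), (1, 1), (1, 1)], ['x', 'y'])
    # Expected result: a symmetric 2x2 pandas DataFrame whose (i, j) entry is
    # I(col_i; col_j) = sum over (x, y) of p(x, y) * log2(p(x, y) / (p(x) * p(y))).
    return mutual_info(sdf, ['x', 'y'])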
def annotate_pval_dataset(self, cur_df):
    import pyspark

    # Reuse the cached train/test annotations when they already exist.
    try:
        tr_inst = self.spark.read.parquet(self.training_temp_dir)
        te_inst = self.spark.read.parquet(self.testing_temp_dir)
        return tr_inst, te_inst
    except pyspark.sql.utils.AnalysisException as ex:
        template = "An exception of type {0} occurred. Arguments:\n{1!r}"
        message = template.format(type(ex).__name__, ex.args)
        self.logger.info(message)
        self.logger.info("PROCESS")
        self.logger.debug("NOTEXISTS ANNOTATE_FILE")
        self.logger.debug("RUN_PROCESS")
    except Exception:
        self.logger.info("TEST_PURPOSE")

    from pyspark.ml.feature import VectorAssembler

    postfix = self.postfix.format(self.sel_top)
    obs_df = cur_df

    # Feature columns are every column except the non-feature (ID/time) ones.
    cur_cols = obs_df.columns
    for i in self.non_feature_column:
        cur_cols.remove(i)
    self.logger.debug("feature_columns")
    cur_cols = sorted(cur_cols)
    self.logger.debug(cur_cols)

    import json
    with open(self.json_feature_dump_loc, "w") as fp:
        json.dump({"non_demo_features": cur_cols}, fp)

    obs_df = VectorAssembler(
        inputCols=cur_cols,
        outputCol="features_imputed").transform(obs_df)
    cur_time_list = obs_df.select("ID", "TIME_SPAN")
    of_annotated = obs_df
    of_excl_training = dict()

    # Append demographic features and assemble the final feature vector.
    demo_feature = self.add_demo()
    of_annotated = VectorAssembler(
        inputCols=["features_imputed", "demo_feature"],
        outputCol="features").transform(
            of_annotated.join(demo_feature, "ID"))
    of_annotated.show()

    from pyspark.sql.functions import col, lit, when
    self.logger.debug("ANNOTATED")

    cur_test_ids = self.get_target_test_id()
    self.logger.debug(cur_test_ids)
    # TODO CHECK why I put 'why 0' comment over here?
    self.logger.debug(len(cur_test_ids))

    # Split instances into training/testing sets by patient ID.
    tr_inst, te_inst = self.cur_annotator.prep_TR_TE(
        of_annotated, test_id_list=cur_test_ids)
    self.logger.debug("IDS")
    self.logger.debug(
        (tr_inst.select("ID").distinct().count(),
         te_inst.select("ID").distinct().count()))
    self.logger.debug("TR_TE_CNT:{0}_{1}".format(tr_inst.count(),
                                                 te_inst.count()))

    train_data_ID = tr_inst.select("ID").distinct().rdd.flatMap(
        list).collect()
    testing_data_ID = te_inst.select("ID").distinct().rdd.flatMap(
        list).collect()

    self.action_df.show()
    train_action_df = self.action_df.where(
        col("ID").isin(train_data_ID)).persist()
    self.logger.debug(train_action_df.select("ID").distinct().count())
    train_terminal_outcome = self.terminal_outcome.where(
        col("ID").isin(train_data_ID)).persist()
    self.logger.debug(
        train_terminal_outcome.select("ID").distinct().count())

    # Rank interventions by p-value against the terminal outcome,
    # computed on training patients only.
    intv_w_p_val = self.identify_relevant_action(
        train_action_df, train_terminal_outcome,
        tr_inst.select("ID").distinct().count())
    intv_w_p_val.join(
        self.def_df.where(col("SOURCE").isin(["CPT", "MED", "PROC"])),
        self.itemid).orderBy("p_val").show(100, truncate=False)

    from pyspark.sql.functions import sum, rand, max, lit
    from pyspark.ml.feature import VectorAssembler

    cur_annot_topk = self.sel_top
    self.action_df.show()
    self.terminal_outcome.show()
    annot_df = self.action_df.join(self.terminal_outcome, "ID").persist()
    annot_df.show()

    pos_inst_dict = dict()
    from pyspark.sql.functions import count
    for cur_of in [self.target_disch_col]:
        # For debug purposes: skip this outcome if it is never identified.
        self.logger.debug(cur_of)
        intv_w_p_val.where("DISCH_DX == '{0}'".format(cur_of)).orderBy(
            col("p_val").cast("double")).show(50, truncate=False)

        # Keep the top-k interventions most associated with this outcome.
        target_annot_criteria = intv_w_p_val.where(
            "DISCH_DX == '{0}'".format(cur_of)).orderBy(
                col("p_val").cast("double")).limit(cur_annot_topk)
        target_annot_criteria.write.save(
            self.annot_intv_dir.format(cur_of, cur_annot_topk),
            mode="overwrite")
        target_annot_criteria = target_annot_criteria.select(
            self.itemid).rdd.flatMap(list).collect()

        if len(target_annot_criteria) == 0:
            self.logger.info(
                "NO TERMINAL DX {0} identified from pts".format(cur_of))
            pos_inst_dict[cur_of] = None
            continue

        self.logger.debug(target_annot_criteria)
        self.logger.debug(len(target_annot_criteria))
        self.logger.debug("selected intv!!")
        self.def_df.where(col(
            self.itemid).isin(target_annot_criteria)).show(
                cur_annot_topk, truncate=False)

        # Positive instances: patient-days where a selected intervention
        # co-occurs with the target discharge diagnosis.
        pos_inst_dict[cur_of] = annot_df.where(
            (col(self.itemid).isin(target_annot_criteria))
            & (col("DISCH_DX") == cur_of))\
            .select("ID",
                    col("TIME_OBS").cast("date").alias("TIME_OBS"),
                    lit("1").cast("double").alias(
                        "{0}_label".format(cur_of)))\
            .distinct().persist()
        pos_inst_dict[cur_of].groupBy("{0}_label".format(cur_of)).agg(
            count("*")).show()

        from pyspark.sql.functions import broadcast
        true_inst = annot_df.where(
            (col(self.itemid).isin(target_annot_criteria))
            & (col("DISCH_DX") == cur_of))

        # Exclude patients who reached the target outcome but never received
        # any of the selected interventions.
        excl_id = annot_df.withColumn(
            "IS_TARGET_OF",
            when(col("DISCH_DX") == cur_of,
                 lit("1").cast("double")).otherwise(
                     lit("0").cast("double")))\
            .withColumn(
                "IS_REL_INTV",
                when(col(self.itemid).isin(target_annot_criteria),
                     lit("1").cast("double")).otherwise(
                         lit("0").cast("double")))\
            .groupBy("ID").agg(
                sum("IS_TARGET_OF").alias("SUM_IS_TARGET_OF"),
                sum("IS_REL_INTV").alias("SUM_IS_REL_INTV"))\
            .where("(SUM_IS_TARGET_OF <> 0) AND (SUM_IS_REL_INTV == 0)")\
            .select("ID").distinct().rdd.flatMap(list).collect()
        self.logger.debug("NUM_PTS_EXCLUDED:{0}".format(len(excl_id)))
        self.logger.debug("TRAINING_INST_COUNT:{0}".format(
            tr_inst.count()))

        # Label training instances; rows with no matching positive instance
        # become negatives (0.0).
        tr_inst = tr_inst.withColumn(
            "TIME_OBS", col("TIME_SPAN.TIME_TO").cast("date"))\
            .withColumn("{0}_excl".format(cur_of),
                        col("ID").isin(excl_id).cast("double"))\
            .repartition("ID", "TIME_OBS")\
            .join(broadcast(pos_inst_dict[cur_of]), ["ID", "TIME_OBS"],
                  "left_outer")\
            .fillna(value=0.0, subset=["{0}_label".format(cur_of)])\
            .persist()
        print(tr_inst.count())
        tr_inst.groupBy("{0}_label".format(cur_of),
                        "{0}_excl".format(cur_of)).agg(count("*")).show()

        # Label testing instances the same way.
        te_inst = te_inst.withColumn(
            "TIME_OBS", col("TIME_SPAN.TIME_TO").cast("date"))\
            .withColumn("{0}_excl".format(cur_of),
                        col("ID").isin(excl_id).cast("double"))\
            .repartition("ID", "TIME_OBS")\
            .join(broadcast(pos_inst_dict[cur_of]), ["ID", "TIME_OBS"],
                  "left_outer")\
            .fillna(value=0.0, subset=["{0}_label".format(cur_of)])\
            .persist()
        print(te_inst.count())
        te_inst.groupBy("{0}_label".format(cur_of),
                        "{0}_excl".format(cur_of)).agg(count("*")).show()

        # Per-patient label/exclusion breakdowns for sanity checking.
        tr_inst.groupBy("ID").agg(
            max("{0}_label".format(cur_of)).alias(
                "{0}_label".format(cur_of)),
            max("{0}_excl".format(cur_of)).alias(
                "{0}_excl".format(cur_of))).groupBy(
                    "{0}_label".format(cur_of),
                    "{0}_excl".format(cur_of)).agg(count("*")).show()
        te_inst.groupBy("ID").agg(
            max("{0}_label".format(cur_of)).alias(
                "{0}_label".format(cur_of)),
            max("{0}_excl".format(cur_of)).alias(
                "{0}_excl".format(cur_of))).groupBy(
                    "{0}_label".format(cur_of),
                    "{0}_excl".format(cur_of)).agg(count("*")).show()

    # Persist the annotated datasets and re-read them so later stages work
    # from the cached parquet files.
    tr_inst.write.save(self.training_temp_dir, mode="overwrite")
    te_inst.write.save(self.testing_temp_dir, mode="overwrite")
    tr_inst = self.spark.read.parquet(self.training_temp_dir)
    te_inst = self.spark.read.parquet(self.testing_temp_dir)
    # te_inst.show()
    return (tr_inst, te_inst)
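# Usage sketch (hypothetical): `pipeline` stands for an instance of the
# enclosing class with its Spark session, temp directories, and source
# DataFrames already configured, and `obs_df` for the imputed observation
# DataFrame passed in as `cur_df`.
#
#   tr_inst, te_inst = pipeline.annotate_pval_dataset(obs_df)
#   tr_inst.groupBy(
#       "{0}_label".format(pipeline.target_disch_col)).count().show()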