Example #1
import numpy as np
import pandas as pd
import pyspark.sql.functions as F
from pyspark.ml.feature import VectorAssembler


# check_columns, distribution, dense_to_array and disassemble are helper
# utilities assumed to be defined elsewhere alongside this example.
def mutual_info(sdf, colnames):
    """Pairwise mutual information (log base 2) between columns of a Spark
    DataFrame; the diagonal holds each column's entropy."""
    check_columns(sdf, colnames)
    n = len(colnames)
    probs = []
    for i in range(n):
        probs.append(distribution(sdf, colnames[i]))
    res = np.zeros(shape=(n, n))
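    # For each pair of columns, assemble them into a single vector column, estimate
    # the joint distribution, join the marginal probabilities back in, and sum
    # p(x, y) * log2(p(x, y) / (p(x) * p(y))).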
    for i in range(n):
        for j in range(i, n):
            tdf = VectorAssembler(inputCols=[colnames[i], colnames[j]],
                                  outputCol='__vectors').transform(sdf)
            tdf = distribution(tdf, '__vectors')
            tdf = disassemble(dense_to_array(tdf, '__col', '__features'),
                              '__features')
            tdf = tdf.join(probs[i].toDF('__features_0', '__p0'),
                           on='__features_0')
            tdf = tdf.join(probs[j].toDF('__features_1', '__p1'),
                           on='__features_1')
            mi = tdf.select(
                F.sum(
                    F.expr(
                        'log2(__probability / (__p0 * __p1)) * __probability'))
            ).take(1)[0][0]
            res[i, j] = mi
            res[j, i] = mi
    return pd.DataFrame(res, index=colnames, columns=colnames)
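
# A minimal, hypothetical usage sketch (assumes an existing SparkSession named
# `spark` plus the helper utilities noted above; the column names are
# illustrative only):
#
#   pdf = pd.DataFrame({'a': [0, 0, 1, 1], 'b': [0, 1, 0, 1]})
#   mi = mutual_info(spark.createDataFrame(pdf), ['a', 'b'])
#   print(mi)
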
    # Excerpt of a method from a larger class: it relies on attributes such as
    # self.spark, self.logger, self.cur_annotator, self.action_df and
    # self.terminal_outcome that are defined elsewhere in that class.
    def annotate_pval_dataset(self, cur_df):
        from pyspark.sql.utils import AnalysisException
        try:
            tr_inst = self.spark.read.parquet(self.training_temp_dir)
            te_inst = self.spark.read.parquet(self.testing_temp_dir)
            return tr_inst, te_inst
        except AnalysisException as ex:
            template = "An exception of type {0} occurred. Arguments:\n{1!r}"
            message = template.format(type(ex).__name__, ex.args)
            self.logger.info(message)
            self.logger.info("PROCESS")
            self.logger.debug("NOTEXISTS ANNOTATE_FILE")
            self.logger.debug("RUN_PROCESS")
        except Exception:
            self.logger.info("TEST_PURPOSE")

        from pyspark.ml.feature import VectorAssembler
        postfix = self.postfix.format(self.sel_top)
        obs_df = cur_df

        cur_cols = obs_df.columns
        for i in self.non_feature_column:
            cur_cols.remove(i)
            self.logger.debug("feature_columns")
        cur_cols = sorted(cur_cols)
        self.logger.debug(cur_cols)
        import json

        with open(self.json_feature_dump_loc, "w") as fp:
            json.dump({"non_demo_features": cur_cols}, fp)

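        # Assemble the remaining (sorted, non-demographic) feature columns into a
        # single vector column.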
        obs_df = VectorAssembler(
            inputCols=cur_cols, outputCol="features_imputed").transform(obs_df)

        cur_time_list = obs_df.select("ID", "TIME_SPAN")
        of_annotated = obs_df
        of_excl_training = dict()

        demo_feature = self.add_demo()

        of_annotated = VectorAssembler(
            inputCols=["features_imputed", "demo_feature"],
            outputCol="features").transform(
                of_annotated.join(demo_feature, "ID"))

        of_annotated.show()

        from pyspark.sql.functions import col, lit, when
        self.logger.debug("ANNOTATED")

        cur_test_ids = self.get_target_test_id()
        self.logger.debug(cur_test_ids)
        # TODO: check why the 'why 0' comment was originally placed here.
        self.logger.debug(len(cur_test_ids))
        tr_inst, te_inst = self.cur_annotator.prep_TR_TE(
            of_annotated, test_id_list=cur_test_ids)

        self.logger.debug("IDS")
        self.logger.debug("TR_TE_DISTINCT_ID_CNT:{0}_{1}".format(
            tr_inst.select("ID").distinct().count(),
            te_inst.select("ID").distinct().count()))

        self.logger.debug("TR_TE_CNT:{0}_{1}".format(tr_inst.count(),
                                                     te_inst.count()))

        train_data_ID = tr_inst.select("ID").distinct().rdd.flatMap(
            list).collect()

        testing_data_ID = te_inst.select("ID").distinct().rdd.flatMap(
            list).collect()

        self.action_df.show()

        train_action_df = self.action_df.where(
            col("ID").isin(train_data_ID)).persist()

        self.logger.debug(train_action_df.select("ID").distinct().count())

        train_terminal_outcome = self.terminal_outcome.where(
            col("ID").isin(train_data_ID)).persist()

        self.logger.debug(
            train_terminal_outcome.select("ID").distinct().count())

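        # identify_relevant_action (defined elsewhere) is expected to score candidate
        # interventions against the terminal outcome, returning a p_val per item and
        # discharge diagnosis.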
        intv_w_p_val = self.identify_relevant_action(
            train_action_df, train_terminal_outcome,
            tr_inst.select("ID").distinct().count())
        intv_w_p_val.join(
            self.def_df.where(col("SOURCE").isin(["CPT", "MED", "PROC"])),
            self.itemid).orderBy("p_val").show(100, truncate=False)

        from pyspark.sql.functions import sum, rand, max, lit
        from pyspark.ml.feature import VectorAssembler
        cur_annot_topk = self.sel_top

        self.action_df.show()
        self.terminal_outcome.show()

        annot_df = self.action_df.join(self.terminal_outcome, "ID").persist()
        annot_df.show()
        pos_inst_dict = dict()
        from pyspark.sql.functions import count
        for cur_of in [self.target_disch_col]:
            # For debugging purposes: skip this outcome if no relevant interventions are identified.
            self.logger.debug(cur_of)
            intv_w_p_val.where("DISCH_DX == '{0}'".format(cur_of)).orderBy(
                col("p_val").cast("double")).show(50, truncate=False)
            target_annot_criteria = intv_w_p_val.where(
                "DISCH_DX == '{0}'".format(cur_of)).orderBy(
                    col("p_val").cast("double")).limit(cur_annot_topk)
            target_annot_criteria.write.save(self.annot_intv_dir.format(
                cur_of, cur_annot_topk),
                                             mode="overwrite")
            target_annot_criteria = target_annot_criteria.select(
                self.itemid).rdd.flatMap(list).collect()
            if len(target_annot_criteria) == 0:
                self.logger.info(
                    "NO TERMINAL DX {0} idenfieid from pts".format(cur_of))
                pos_inst_dict[cur_of] = None
                continue
            self.logger.debug(target_annot_criteria)
            self.logger.debug(len(target_annot_criteria))
            self.logger.debug("selected intv!!")
            self.def_df.where(col(
                self.itemid).isin(target_annot_criteria)).show(cur_annot_topk,
                                                               truncate=False)
            pos_inst_dict[cur_of] = annot_df.where((col(self.itemid).isin(target_annot_criteria)) & (col("DISCH_DX") == cur_of))\
                .select("ID", col("TIME_OBS").cast("date").alias("TIME_OBS"), lit("1").cast("double").alias("{0}_label".format(cur_of)))\
                .distinct().persist()
            pos_inst_dict[cur_of].groupBy("{0}_label".format(cur_of)).agg(
                count("*")).show()
            from pyspark.sql.functions import broadcast

            true_inst = annot_df.where(
                (col(self.itemid).isin(target_annot_criteria))
                & (col("DISCH_DX") == cur_of))
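            # Patients whose discharge diagnosis is the target outcome but who have
            # none of the selected relevant interventions are flagged for exclusion.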
            excl_id = annot_df.withColumn("IS_TARGET_OF",when(col("DISCH_DX") ==cur_of,lit("1").cast("double")).otherwise(lit("0").cast("double")))\
                .withColumn("IS_REL_INTV", when(col(self.itemid).isin(target_annot_criteria), lit("1").cast("double")).otherwise(lit("0").cast("double")))\
                .groupBy("ID").agg(sum("IS_TARGET_OF").alias("SUM_IS_TARGET_OF"),sum("IS_REL_INTV").alias("SUM_IS_REL_INTV"))\
                .where("(SUM_IS_TARGET_OF <> 0) AND (SUM_IS_REL_INTV == 0)").select("ID").distinct().rdd.flatMap(list).collect()
            self.logger.debug("NUM_PTS_EXCLUDED:{0}".format(len(excl_id)))
            self.logger.debug("TRAINING_INST_COUNT:{0}".format(
                tr_inst.count()))
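            # Label each (ID, TIME_OBS) training instance by left-joining the
            # positive instances, filling missing labels with 0.0, and flagging
            # patients on the exclusion list.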
            tr_inst = tr_inst.withColumn("TIME_OBS",col("TIME_SPAN.TIME_TO").cast("date"))\
                .withColumn("{0}_excl".format(cur_of), col("ID").isin(excl_id).cast("double")).repartition("ID","TIME_OBS")\
                .join(broadcast(pos_inst_dict[cur_of]),["ID","TIME_OBS"],"left_outer").fillna(value=0.0,subset=["{0}_label".format(cur_of)]).persist()
            print(tr_inst.count())
            tr_inst.groupBy("{0}_label".format(cur_of),
                            "{0}_excl".format(cur_of)).agg(count("*")).show()
            te_inst = te_inst.withColumn("TIME_OBS",col("TIME_SPAN.TIME_TO").cast("date"))\
                .withColumn("{0}_excl".format(cur_of), col("ID").isin(excl_id).cast("double")).repartition("ID","TIME_OBS")\
                .join(broadcast(pos_inst_dict[cur_of]),["ID","TIME_OBS"],"left_outer").fillna(value=0.0, subset=["{0}_label".format(cur_of)]).persist()
            print(te_inst.count())
            te_inst.groupBy("{0}_label".format(cur_of),
                            "{0}_excl".format(cur_of)).agg(count("*")).show()

            tr_inst.groupBy("ID").agg(
                max("{0}_label".format(cur_of)).alias(
                    "{0}_label".format(cur_of)),
                max("{0}_excl".format(cur_of)).alias(
                    "{0}_excl".format(cur_of))).groupBy(
                        "{0}_label".format(cur_of),
                        "{0}_excl".format(cur_of)).agg(count("*")).show()
            te_inst.groupBy("ID").agg(
                max("{0}_label".format(cur_of)).alias(
                    "{0}_label".format(cur_of)),
                max("{0}_excl".format(cur_of)).alias(
                    "{0}_excl".format(cur_of))).groupBy(
                        "{0}_label".format(cur_of),
                        "{0}_excl".format(cur_of)).agg(count("*")).show()

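        # Persist both splits and re-read them from parquet so the returned
        # DataFrames are backed by the materialized files.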
        tr_inst.write.save(self.training_temp_dir, mode="overwrite")
        te_inst.write.save(self.testing_temp_dir, mode="overwrite")

        tr_inst = self.spark.read.parquet(self.training_temp_dir)
        te_inst = self.spark.read.parquet(self.testing_temp_dir)
        #te_inst.show()

        return (tr_inst, te_inst)