Пример #1
0
    def test_is_duration_events(self):
        """Check ``Cohort.is_duration_events`` on three kinds of cohorts.

        Covers a cohort built without events, a cohort whose events carry
        both a start and an end timestamp, and a cohort whose events have
        ``None`` as their end timestamp.
        """
        patients = self.spark.createDataFrame(
            pd.DataFrame({"patientID": [1, 2, 3]})
        )
        event_schema = StructType([
            StructField("patientID", IntegerType(), True),
            StructField("start", TimestampType(), True),
            StructField("end", TimestampType(), True),
        ])
        event_date = datetime(1993, 10, 9)

        # No events at all: the cohort must not report duration events.
        without_events = Cohort("patients", "patients", patients, None)
        self.assertFalse(without_events.is_duration_events())

        # Start and end are both populated: duration events are expected.
        bounded_events = self.spark.createDataFrame(
            data=[(1, event_date, event_date)],
            schema=event_schema,
        )
        bounded_cohort = Cohort("patients", "patients", patients,
                                bounded_events)
        self.assertTrue(bounded_cohort.is_duration_events())

        # Every end timestamp is None: not duration events.
        open_ended_events = self.spark.createDataFrame(
            data=[(1, event_date, None), (2, event_date, None)],
            schema=event_schema,
        )
        open_ended_cohort = Cohort("patients", "patients", patients,
                                   open_ended_events)
        self.assertFalse(open_ended_cohort.is_duration_events())
def plot_mean_duration_per_value(figure: Figure, cohort: Cohort) -> Figure:
    """Draw a horizontal bar chart of the mean duration for each value.

    The events of ``cohort`` are aggregated with ``agg_by_col`` (grouping
    on ``"value"``, taking the ``"mean"`` of ``"duration"``) and sorted by
    ``value``. One bar is drawn per row of the aggregated frame, labelled
    with the corresponding ``value``.

    Args:
        figure: Figure whose current axes (``figure.gca()``) receive the
            bars.
        cohort: Cohort to plot; ``cohort.is_duration_events()`` must be
            true.

    Returns:
        The same ``figure`` that was passed in.

    Raises:
        AssertionError: If the cohort does not hold duration events.
    """
    assert cohort.is_duration_events()

    df = agg_by_col(cohort.events, frozenset(["value"]), "duration",
                    "mean").sort_values("value")
    # One bar per aggregated row, placed at consecutive integer positions.
    positions = range(len(df))
    ax = figure.gca()
    ax.barh(positions, df["avg(duration)"].values)
    # Tick positions must be fixed before the labels are assigned;
    # matplotlib documents that labelling first may put labels at
    # unexpected positions (and it warns about the formatter/locator pair).
    ax.set_yticks(positions)
    ax.set_yticklabels(df["value"].values)
    return figure
def plot_duration_distribution_per_month_as_bar(figure: Figure,
                                                cohort: Cohort) -> Figure:
    """Draw a bar chart of event counts per 30-unit duration bucket.

    The per-duration counts returned by ``event_duration_agg(cohort,
    "count")`` are bucketed by ``ceil(duration / 30)`` and the counts of
    rows sharing a bucket are summed. One bar is drawn per bucket, labelled
    with the bucket number.

    NOTE(review): the function name suggests durations are expressed in
    days, making each bucket roughly one month -- confirm against
    ``event_duration_agg``.

    Args:
        figure: Figure whose current axes (``figure.gca()``) receive the
            bars.
        cohort: Cohort to plot; ``cohort.is_duration_events()`` must be
            true.

    Returns:
        The same ``figure`` that was passed in.

    Raises:
        AssertionError: If the cohort does not hold duration events.
    """
    assert cohort.is_duration_events()

    df = event_duration_agg(cohort, "count").sort_values("duration")
    # Replace each duration by its bucket index, rounding up so that a
    # partially filled bucket still counts as a bucket of its own.
    df["duration"] = np.ceil(df["duration"] / 30).astype("int32")
    # Merge the rows that landed in the same bucket.
    df = df.groupby("duration").sum().reset_index()
    positions = range(len(df))
    ax = figure.gca()
    ax.bar(positions, df["count(1)"].values)
    # Tick positions must be fixed before the labels are assigned;
    # matplotlib documents that labelling first may put labels at
    # unexpected positions (and it warns about the formatter/locator pair).
    ax.set_xticks(positions)
    ax.set_xticklabels(df["duration"].values)

    return figure
def plot_duration_distribution_per_day_as_line(figure: Figure,
                                               cohort: Cohort) -> Figure:
    """Draw event counts against duration as a line on a log-scaled y axis.

    The data comes from ``event_duration_agg(cohort, "count")`` sorted by
    ``duration``. The x axis gets a minor tick every 30 units and a major
    tick every 365 units; grid lines are drawn for the major x ticks.

    Args:
        figure: Figure whose current axes (``figure.gca()``) receive the
            line.
        cohort: Cohort to plot; ``cohort.is_duration_events()`` must be
            true.

    Returns:
        The same ``figure`` that was passed in.

    Raises:
        AssertionError: If the cohort does not hold duration events.
    """
    assert cohort.is_duration_events()

    counts = event_duration_agg(cohort, "count").sort_values("duration")
    axes = figure.gca()
    axes.plot(counts["duration"], counts["count(1)"])
    axes.set_yscale("log")

    # Minor ticks every 30 units, major ticks every 365 units.
    x_axis = axes.xaxis
    x_axis.set_minor_locator(IndexLocator(30, +0.0))
    x_axis.set_major_locator(IndexLocator(365, +0.0))
    axes.grid(True, which="major", axis="x")
    return figure