def test_surgery_usage_regression_df():
    """Smoke-test: build the surgery/usage regression frame for two cardiac
    items and run feature extraction with a small hand-picked feature set."""
    case_service = "Cardiac Surgery"
    item_ids = ["38242", "129636"]

    analytics = ScmAnalytics.ScmAnalytics(lhs_config)

    # Restrict both frames to the case service under test.
    surgeries = analytics.surgery_df
    surgeries = surgeries[surgeries["case_service"] == case_service]
    usages = analytics.usage_df
    usages = usages[usages["case_service"] == case_service]

    # Keep only surgeries that have at least one matching usage record.
    surgeries = surgeries[surgeries["event_id"].isin(set(usages["event_id"]))]

    all_procedures = set.union(*surgeries["procedures"])
    r_df = SURegressionModel.surgery_usage_regression_df(surgeries,
                                                         usages,
                                                         item_ids=item_ids)

    interactions = [("cabg double", "esvh"), ("ita", "esvh")]
    features = ["cabg double", "esvh", "ita", "cabg single"]
    x, feature_df = SURegressionModel.extract_features(r_df,
                                                       features,
                                                       all_procedures,
                                                       interactions)
usages = list(day_df["real_usage"]) default_day_df["real_usage"] = list( np.random.choice(usages) for i in range(len(default_day_df))) default_day_df[ "change"] = day_df["received_qty"] - default_day_df["real_usage"] default_day_df["inventory_level"] = default_day_df["change"].cumsum() default_day_df["inventory_level"] = default_day_df[ "inventory_level"] + initial_inventory return default_day_df["inventory_level"] case_service = "Cardiac Surgery" item_id = "38242" trials = 1 analytics = ScmAnalytics.ScmAnalytics(lhs_config) surgery_df = analytics.surgery_df usage_df = analytics.usage_df item_ids = [item_id] surgery_df = surgery_df[surgery_df["case_service"] == case_service] surgery_df = surgery_df.drop_duplicates("event_id", keep="last") usage_df = usage_df[usage_df["case_service"] == case_service] surgery_df = surgery_df[surgery_df["event_id"].isin(set(usage_df["event_id"]))] surgery_df["procedures"] = surgery_df["procedures"].apply( lambda x: set(e.replace(" ", "_") for e in x)) all_procedures = set.union(*surgery_df["procedures"]) r_df = SURegressionModel.surgery_usage_regression_df(surgery_df, usage_df, item_ids=item_ids)
def run(case_service="Cardiac Surgery", item_id="1686"):
    """Explore per-surgery usage of `item_id` within `case_service`.

    Builds a per-procedure-set usage distribution (restricted to procedure
    sets seen more than 25 times), plots 100- and 50-surgery rolling means of
    used quantity, and saves the plot as ``<item_id>_rolling_usage.png``.

    NOTE(review): a second, different `run(...)` is defined later in this
    file; if both live in the same module this earlier one is shadowed.
    """
    analytics = ScmAnalytics.ScmAnalytics(lhs_config)
    case_service_filter = [{
        "dim": "case_service",
        "op": "eq",
        "val": case_service
    }]
    # Usage records for the case service; only rows with a known start date.
    usage_df = analytics.usage_df
    usage_df = usage_df[usage_df["start_date"].notna()]
    usage_df = Analytics.process_filters(usage_df, filters=case_service_filter)
    usage_events = set(usage_df["event_id"])
    item_usage_df = usage_df[usage_df["item_id"] == item_id]
    # Surgeries: dated, after 2016-01-01, same case service, and present in
    # the usage data (so a used_qty can be attached).
    surgery_df = pre_process_columns(analytics.surgery_df)
    surgery_df = surgery_df[surgery_df["start_date"].notna()]
    surgery_df = surgery_df[
        surgery_df["start_date"] > datetime.date(2016, 1, 1)]
    surgery_df = Analytics.process_filters(surgery_df,
                                           filters=case_service_filter)
    surgery_df = surgery_df[surgery_df["event_id"].isin(usage_events)]
    # Left join: surgeries with no usage row for this item get used_qty = 0.
    surgery_df = surgery_df.join(
        item_usage_df.set_index("event_id")[["used_qty"]],
        on="event_id",
        how="left").fillna(0)
    # frozenset makes the procedure set hashable so it can be a groupby key.
    surgery_df["procedures"] = surgery_df["procedures"].apply(
        lambda x: frozenset(x))
    # One row per distinct procedure set, with the list of observed usages.
    usage_dist = surgery_df.groupby(["procedures"]).agg({
        "used_qty": lambda x: list(x)
    }).reset_index()
    usage_dist["occurrences"] = usage_dist["used_qty"].apply(lambda x: len(x))
    # Only keep procedure sets with enough observations to be meaningful.
    usage_dist = usage_dist[usage_dist["occurrences"] > 25]
    usage_dist["mean"] = usage_dist["used_qty"].apply(lambda x: np.mean(x))
    usage_dist["variance"] = usage_dist["used_qty"].apply(
        lambda x: np.var(x, ddof=1))
    # Dispersion index (variance/mean); ~1 would suggest Poisson-like usage.
    usage_dist["var/mean"] = usage_dist["variance"] / usage_dist["mean"]
    # Rolling-mean usage over time for the retained procedure sets.
    df = surgery_df[surgery_df["procedures"].isin(
        usage_dist["procedures"])][["start_date", "used_qty"]]
    rolling_df = df[["used_qty"]].rolling(100).mean()
    plt.plot(list(rolling_df["used_qty"]))
    rolling_df = df[["used_qty"]].rolling(50).mean()
    plt.plot(list(rolling_df["used_qty"]))
    plt.savefig("{}_rolling_usage.png".format(item_id), format="png")
    traces = []
    x_max = 0
    # Build one histogram trace per procedure set.
    # NOTE(review): `traces` and `x_max` are never consumed after this loop —
    # the plotly figure/plot step appears to have been removed; confirm
    # whether this tail is dead code or truncated.
    for i in range(len(usage_dist)):
        case = usage_dist.iloc[i]["procedures"]
        data = usage_dist.iloc[i]["used_qty"]
        label = ", ".join(case)
        end = max(usage_dist.iloc[i]["used_qty"]) + 1
        traces.append(
            go.Histogram(x=data,
                         name=label,
                         xbins=dict(start=0, end=end, size=1),
                         histnorm='probability',
                         opacity=0.75))
        x_max = int(end) if end > x_max else x_max
def boostrap_info_process(item_id="38242"):
    """Bootstrap per-day "information state" random variables for one item.

    Pipeline (name "boostrap" is a typo for "bootstrap" — kept for callers):
      1. Fit a Binomial(n, p) to weekday elective surgeries/day and take the
         mean of emergency surgeries/day (Poisson rate).
      2. Build a discrete RV for procedures-per-surgery and empirical
         procedure weights, then simulate 1000 synthetic surgeries.
      3. Score synthetic and empirical surgeries with saved regression
         coefficients (exp(x . coeff)) to get per-surgery expected usage.
      4. Sample 100k days of elective/emergency info states, truncate the
         upper tail at eps_trunk, and pickle the resulting pacal RVs to
         `elective_outdir` / `emergency_outdir`.

    Returns:
        (emergency_trace, weekday_elective_trace): the two plotly histogram
        traces built for the per-day info RVs.
    """
    case_service = "Cardiac Surgery"
    #item_id = "3824ns_info_state_rvs2"
    info_granularity = 1          # bin width for rounding sampled info states
    eps_trunk = 1e-3              # upper-tail mass to truncate from info RVs
    elective_outdir = "scm_implementation/ns_info_state_rvs/elective"
    emergency_outdir = "scm_implementation/ns_info_state_rvs/emergency"
    analytics = ScmAnalytics.ScmAnalytics(lhs_config)
    # NOTE(review): `filters` below is never used (elective_filter /
    # emergency_filter / case_service_filter are used instead).
    filters = [{
        "dim": "case_service",
        "op": "eq",
        "val": case_service
    }, {
        "dim": "urgent_elective",
        "op": "eq",
        "val": "Elective"
    }]
    elective_filter = [{
        "dim": "urgent_elective",
        "op": "eq",
        "val": "Elective"
    }]
    emergency_filter = [{
        "dim": "urgent_elective",
        "op": "eq",
        "val": "Urgent"
    }]
    case_service_filter = [{
        "dim": "case_service",
        "op": "eq",
        "val": case_service
    }]
    # --- Elective surgeries/day: fit Binomial(n, p) on weekday counts. ---
    surgery_df = pre_process_columns(analytics.surgery_df)
    surgery_df = surgery_df[surgery_df["start_date"].notna()]
    surgery_df = surgery_df[
        surgery_df["start_date"] > datetime.date(2016, 1, 1)]
    surgery_df = Analytics.process_filters(surgery_df,
                                           filters=elective_filter +
                                           case_service_filter)
    dist_df = surgeries_per_day_distribution(surgery_df,
                                             day_group_by="is_weekday",
                                             filters=[])
    data = dist_df.set_index("is_weekday").loc[True]["data"]
    # NOTE(review): `bins` and `binom_x` are computed but never used here.
    bins = range(1 + int(max(data)))
    binom_x = [x + 0.5 for x in bins]
    n = int(max(data))            # Binomial n = max observed surgeries/day
    p = np.mean(data) / n         # method-of-moments p
    # --- Emergency surgeries/day: Poisson rate from the mean. ---
    surgery_df = pre_process_columns(analytics.surgery_df)
    surgery_df = surgery_df[surgery_df["start_date"].notna()]
    surgery_df = surgery_df[
        surgery_df["start_date"] > datetime.date(2016, 1, 1)]
    surgery_df = Analytics.process_filters(surgery_df,
                                           filters=emergency_filter +
                                           case_service_filter)
    dist_df = surgeries_per_day_distribution(surgery_df, filters=[])
    emergency_surgeries_mean = np.mean(dist_df)
    # --- Distribution of #procedures per surgery (6-procedure outlier dropped). ---
    surgery_df = Analytics.process_filters(analytics.surgery_df,
                                           filters=case_service_filter)
    surgery_df["procedure_count"] = surgery_df["procedures"].apply(
        lambda x: len(x))
    procedure_count_df = surgery_df.groupby("procedure_count").agg({
        "event_id": "count"
    }).reset_index()
    procedure_count_df = procedure_count_df[
        procedure_count_df["procedure_count"] != 6]
    # NOTE(review): suspected bug — probabilities are derived from the
    # `procedure_count` VALUES rather than the `event_id` occurrence counts
    # aggregated above. Expected: event_id / sum(event_id). Confirm intent
    # before changing, as the pickled RVs feed downstream simulations.
    procedure_count_df["p"] = procedure_count_df["procedure_count"] / sum(
        procedure_count_df["procedure_count"])
    procedure_count_rv = pacal.DiscreteDistr(
        procedure_count_df["procedure_count"], procedure_count_df["p"])
    """
    Procedure weights
    """
    usage_events = set(analytics.usage_df["event_id"])
    surgery_df = analytics.surgery_df[analytics.surgery_df["event_id"].isin(
        usage_events)]
    surgery_df = Analytics.process_filters(surgery_df,
                                           filters=case_service_filter)
    # Normalize procedure names to underscore form to match feature names.
    surgery_df["procedures"] = surgery_df["procedures"].apply(
        lambda x: set(e.replace(" ", "_") for e in x))
    # Empirical frequency of each individual procedure.
    procedures = surgery_df["procedures"].apply(lambda x: list(x)).to_list()
    procedures = pd \
        .DataFrame({"procedure": [val for sublist in procedures for val in sublist],
                    "count": [1 for sublist in procedures for val in sublist]}) \
        .groupby("procedure") \
        .agg({"count": "count"}) \
        .reset_index()
    procedures["p"] = procedures["count"] / sum(procedures["count"])

    def procedure_pick_rv(size):
        # Sample `size` distinct procedures weighted by empirical frequency.
        return np.random.choice(procedures["procedure"],
                                p=procedures["p"],
                                replace=False,
                                size=size)

    # --- Synthetic surgeries: sample a procedure count, then procedures. ---
    synthetic_surgeries = pd.DataFrame({"event_id": list(range(1000))})
    synthetic_surgeries["procedure_count"] = procedure_count_rv.rand(1000)
    synthetic_surgeries["procedures"] = synthetic_surgeries[
        "procedure_count"].apply(lambda x: procedure_pick_rv(x))
    # Explode to (procedure, event_id) rows, then pivot to a 0/1 indicator
    # matrix with one column per procedure.
    synthetic_procedure_df = pd.concat(
        [pd.Series(row['event_id'], row['procedures'])
         for _, row in synthetic_surgeries.iterrows()]) \
        .reset_index() \
        .rename(columns={"index": "procedure",
                         0: "event_id"}
                )
    synthetic_procedure_df["flag"] = 1
    synthetic_surgeries_df = synthetic_procedure_df \
        .pivot(index="event_id", columns="procedure", values="flag") \
        .fillna(0) \
        .reset_index()
    # Saved regression output: one row per feature with its estimate.
    feature_df = pd.read_csv(os.path.join("regression_results", item_id))
    features = feature_df["feature"]
    # Features without "." are single procedures; with "." are interactions.
    featured_procedures = list(
        filter(lambda x: "." not in x, feature_df["feature"]))
    if "other" in featured_procedures:
        featured_procedures.remove("other")
    for fp in featured_procedures:
        if fp not in synthetic_surgeries_df:
            print(procedures.set_index("procedure").loc[fp])
            synthetic_surgeries_df[fp] = 0
    # NOTE(review): `all_procedures` is unused below.
    all_procedures = set.union(*surgery_df["procedures"])
    interactions = list(filter(lambda x: "." in x, feature_df["feature"]))
    interactions = list(Interaction(i.split(".")) for i in interactions)
    data, _ = SURegressionModel.extract_features_data(synthetic_surgeries_df,
                                                      featured_procedures,
                                                      [],
                                                      interactions,
                                                      other=True)
    # Any regression feature missing from the extracted data contributes 0.
    for f in feature_df["feature"]:
        if f not in data:
            print(f)
            data[f] = 0
    synthetic_surgeries_df["feature_vector"] = data[features].values.tolist()
    coeff = np.array(feature_df["estimate"])
    # Poisson-regression mean: exp(x . coeff) = expected usage per surgery.
    synthetic_surgeries_df["expected_usage"] = synthetic_surgeries_df["feature_vector"] \
        .apply(lambda x: np.exp(np.dot(x, coeff)))
    """
    Information rv for empirical surgeries
    """
    surgery_df = surgery_df.drop_duplicates("event_id", keep="last")
    empirical_procedure_df = pd.concat(
        [pd.Series(row['event_id'], row['procedures'])
         for _, row in surgery_df.iterrows()]) \
        .reset_index() \
        .rename(columns={"index": "procedure",
                         0: "event_id"}
                )
    empirical_procedure_df["flag"] = 1
    empirical_surgeries_df = empirical_procedure_df \
        .pivot(index="event_id", columns="procedure", values="flag") \
        .fillna(0) \
        .reset_index()
    data, _ = SURegressionModel.extract_features_data(empirical_surgeries_df,
                                                      featured_procedures,
                                                      [],
                                                      interactions,
                                                      other=True)
    empirical_surgeries_df["feature_vector"] = data[features].values.tolist()
    empirical_surgeries_df["expected_usage"] = empirical_surgeries_df["feature_vector"] \
        .apply(lambda x: np.exp(np.dot(x, coeff)))
    """
    Plotly histogram for per surgery info rv, empirical surgeries and synthetic using regression results
    """
    s = 0
    e = int(
        max(max(empirical_surgeries_df["expected_usage"]),
            max(synthetic_surgeries_df["expected_usage"])) + 1)
    empirical_trace = go.Histogram(
        x=empirical_surgeries_df["expected_usage"],
        name='Empirical Surgery Info RV (mean={:0.2f})'.format(
            np.mean(empirical_surgeries_df["expected_usage"])),
        xbins=dict(start=s, end=e, size=info_granularity),
        histnorm='probability density',
        opacity=0.75)
    synthetic_trace = go.Histogram(
        x=synthetic_surgeries_df["expected_usage"],
        name='Synthetic Surgery Info RV (mean={:0.2f})'.format(
            np.mean(synthetic_surgeries_df["expected_usage"])),
        xbins=dict(start=s, end=e, size=info_granularity),
        histnorm='probability density',
        opacity=0.75)
    layout = go.Layout(title="Per Surgery Info R.V Item: {0}".format(item_id),
                       xaxis={'title': 'Info [Expected Usage]'},
                       yaxis={'title': 'Probability Density'})
    figure = go.Figure(data=[empirical_trace, synthetic_trace], layout=layout)
    plot(figure, filename="{0}_Per_Surgery_Info_Rv.html".format(item_id))
    """
    Plotly histogram for per weekday elective surgery RV
    """
    # Discrete RV over empirical per-surgery expected usage.
    empirical_rv_df = empirical_surgeries_df.groupby(["expected_usage"]) \
        .agg({"event_id": "count"}) \
        .rename(columns={"event_id": "count"}) \
        .reset_index()
    empirical_rv_df["p"] = empirical_rv_df["count"] / sum(
        empirical_rv_df["count"])
    emp_surgery_rv = pacal.DiscreteDistr(empirical_rv_df["expected_usage"],
                                         empirical_rv_df["p"])
    # NOTE(review): `surgery_demand_rv` is unused below.
    surgery_demand_rv = pacal.BinomialDistr(n, p)
    days = 100000
    # Per-day elective info: sum of per-surgery info over Binomial(n, p)
    # surgeries, rounded to info_granularity.
    elective_samples = [
        sum(emp_surgery_rv.rand(x)) for x in np.random.binomial(n, p, days)
    ]
    elective_samples = [
        round(sample / info_granularity) * info_granularity
        for sample in elective_samples
    ]
    weekday_elective_trace = go.Histogram(
        x=elective_samples,
        name='{} Elective Info RV (mean={:0.2f})'.format(
            item_id, np.mean(elective_samples)),
        xbins=dict(start=0, end=max(elective_samples),
                   size=info_granularity),
        histnorm='probability',
        opacity=0.75)
    """
    Plotly histogram for per day emergency surgery RV
    """
    # Per-day emergency info: same, over Poisson(emergency mean) surgeries.
    emergency_samples = [
        sum(emp_surgery_rv.rand(x))
        for x in np.random.poisson(emergency_surgeries_mean, days)
    ]
    emergency_samples = [
        round(sample / info_granularity) * info_granularity
        for sample in emergency_samples
    ]
    emergency_trace = go.Histogram(
        x=emergency_samples,
        name='{} Emergency Info RV (mean={:0.2f})'.format(
            item_id, np.mean(emergency_samples)),
        xbins=dict(start=0, end=max(emergency_samples),
                   size=info_granularity),
        histnorm='probability',
        opacity=0.75)
    layout = go.Layout(
        title="Weekday Elective Info R.V Item: {0}".format(item_id),
        xaxis={'title': 'Info State (Poisson Usage)]'},
        yaxis={'title': 'Probability'})
    figure = go.Figure(data=[weekday_elective_trace, emergency_trace],
                       layout=layout)
    plot(figure, filename="{0}_Weekday_Elective_Info_Rv.html".format(item_id))
    # Convert the sampled histograms into discrete pacal RVs.
    elective_info_df = pd.DataFrame({"info": elective_samples,
                                     "count": [1] * len(elective_samples)}) \
        .groupby(["info"]) \
        .agg({"count": "count"}) \
        .reset_index()
    elective_info_df["p"] = elective_info_df["count"] / sum(
        elective_info_df["count"])
    elective_info_rv = pacal.DiscreteDistr(elective_info_df["info"],
                                           elective_info_df["p"])
    emergency_info_df = pd.DataFrame({"info": emergency_samples,
                                      "count": [1] * len(emergency_samples)}) \
        .groupby(["info"]) \
        .agg({"count": "count"}) \
        .reset_index()
    emergency_info_df["p"] = emergency_info_df["count"] / sum(
        emergency_info_df["count"])
    emergency_info_rv = pacal.DiscreteDistr(emergency_info_df["info"],
                                            emergency_info_df["p"])
    # Truncate the elective RV where the remaining tail mass < eps_trunk,
    # then renormalize by conditioning on being below the cutoff.
    max_v = 999
    for d in elective_info_rv.get_piecewise_pdf().getDiracs():
        if 1 - elective_info_rv.cdf(d.a) < eps_trunk:
            max_v = d.a
            break
    diracs = (pacal.CondLtDistr(elective_info_rv, max_v)) \
        .get_piecewise_pdf().getDiracs()
    diracs = list(filter(lambda d: d.f > 0, diracs))
    elective_info_rv = pacal.DiscreteDistr([d.a for d in diracs],
                                           [d.f for d in diracs])
    # Same truncation for the emergency RV.
    max_v = 999
    for d in emergency_info_rv.get_piecewise_pdf().getDiracs():
        if 1 - emergency_info_rv.cdf(d.a) < eps_trunk:
            max_v = d.a
            break
    diracs = (pacal.CondLtDistr(emergency_info_rv, max_v)) \
        .get_piecewise_pdf().getDiracs()
    diracs = list(filter(lambda d: d.f > 0, diracs))
    emergency_info_rv = pacal.DiscreteDistr([d.a for d in diracs],
                                            [d.f for d in diracs])
    # Persist the truncated RVs for the downstream simulation.
    with open(os.path.join(elective_outdir, "{0}.pickle".format(item_id)),
              "wb") as f:
        pickle.dump(elective_info_rv, f)
    with open(os.path.join(emergency_outdir, "{0}.pickle".format(item_id)),
              "wb") as f:
        pickle.dump(emergency_info_rv, f)
    return emergency_trace, weekday_elective_trace
def test_usage_r_regression_flow():
    """Backward-elimination regression flow for item 38242.

    Repeatedly fits a gaussian model via R, drops features with
    p-value > pthres or occurrence < occ_thres, then fits a final poisson
    model and writes the selected features to ``regression_results/<item_id>``
    and the design matrix to ``r_scripts/test_data2.csv``.

    NOTE(review): this function is redefined later in the file by
    test_usage_r_regression_flow(item_id=None, save_results=False); at import
    time the later definition shadows this one.
    """
    from scm_analytics.model.SurgeryUsageRegressionModel import Interaction
    # Wide pandas display so printed feature tables are readable.
    pd.set_option('display.max_rows', 500)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)
    pd.options.mode.chained_assignment = None
    case_service = "Cardiac Surgery"
    item_id = "38242"
    pthres = 0.05      # max p-value for a feature to survive elimination
    occ_thres = 5      # min occurrences for a feature to survive
    analytics = ScmAnalytics.ScmAnalytics(lhs_config)
    surgery_df = analytics.surgery_df
    usage_df = analytics.usage_df
    item_ids = ["38242", "129636"]
    surgery_df = surgery_df[surgery_df["case_service"] == case_service]
    usage_df = usage_df[usage_df["case_service"] == case_service]
    surgery_df = surgery_df[surgery_df["event_id"].isin(
        set(usage_df["event_id"]))]
    # Underscore-normalize procedure names so they are valid feature names.
    surgery_df["procedures"] = surgery_df["procedures"].apply(
        lambda x: set(e.replace(" ", "_") for e in x))
    all_procedures = set.union(*surgery_df["procedures"])
    r_df = SURegressionModel.surgery_usage_regression_df(surgery_df,
                                                         usage_df,
                                                         item_ids=item_ids)
    # Start from all procedures and all pairwise interactions.
    interactions = list([
        Interaction((p1, p2))
        for p1, p2 in combinations(sorted(list(all_procedures)), 2)
    ])
    features = sorted(list(all_procedures))
    data, feature_df = SURegressionModel.extract_features_data(
        r_df,
        features,
        all_procedures,
        interactions,
        other=True,
        sum_others=False)
    print(feature_df)
    # Pre-filter rare features before the first regression.
    feature_df = feature_df[feature_df["occurrence"] >= occ_thres]
    interactions = list(
        filter(lambda x: str(x) in set(feature_df["feature"]), interactions))
    features = list(filter(lambda x: x in set(feature_df["feature"]),
                           features))
    # Backward elimination: refit, drop insignificant/rare features, repeat
    # until every remaining feature passes both thresholds.
    while True:
        data, feature_df = SURegressionModel.extract_features_data(
            r_df,
            features,
            all_procedures,
            interactions,
            other=True,
            sum_others=False)
        data["y"] = list(r_df[item_id])
        feature_df, r2, _ = SURegressionModel.run_r_regression(
            data, feature_df, model="gaussian")
        print(feature_df)
        print("r2:", r2)
        # NOTE(review): `features + interactions` mixes str and Interaction;
        # this relies on feature_df["feature"] matching both — confirm
        # Interaction defines __eq__/__hash__ against its string form.
        thres_df = feature_df[feature_df["feature"].isin(features +
                                                         interactions)]
        if thres_df[thres_df["p.value"] > pthres].empty and thres_df[
                thres_df["occurrence"] < occ_thres].empty:
            break
        feature_df = feature_df[feature_df["p.value"] <= pthres]
        feature_df = feature_df[feature_df["occurrence"] >= occ_thres]
        interactions = list(
            filter(lambda x: str(x) in set(feature_df["feature"]),
                   interactions))
        features = list(
            filter(lambda x: x in set(feature_df["feature"]), features))
    # Final fit with the selected features using a poisson model.
    feature_df = feature_df[["feature", "occurrence"]]
    feature_df, r2, _ = SURegressionModel.run_r_regression(data,
                                                           feature_df,
                                                           model="poisson")
    feature_df.to_csv(os.path.join("regression_results", item_id),
                      index=False)
    data.to_csv(os.path.join("r_scripts", "test_data2.csv"), index=False)
    print(feature_df)
    print("r2:", r2)
def run(case_service="Cardiac Surgery", item_id="1686", procedure_set=None):
    """Plot the empirical usage distribution of `item_id` for surgeries whose
    procedure set equals `procedure_set`, within `case_service`.

    Saves the histogram to ``Usage_Dist_item_<item_id>_<procedures>.svg`` and
    ``.eps`` in the working directory.

    Args:
        case_service: Case service used to filter surgery and usage records.
        item_id: Item whose per-surgery used quantity is plotted.
        procedure_set: Collection of procedure names identifying the exact
            surgery type to plot. Required; the keyword default is kept only
            for signature compatibility.

    Raises:
        ValueError: If `procedure_set` is not provided.
    """
    # Fail fast with a clear message: the original crashed much later with an
    # opaque TypeError from ", ".join(None) when procedure_set was omitted.
    if procedure_set is None:
        raise ValueError("procedure_set is required (e.g. a set/frozenset of "
                         "procedure names identifying the surgery type)")
    analytics = ScmAnalytics.ScmAnalytics(lhs_config)
    case_service_filter = [{
        "dim": "case_service",
        "op": "eq",
        "val": case_service
    }]
    # Usage records for the case service; only rows with a known start date.
    usage_df = analytics.usage_df
    usage_df = usage_df[usage_df["start_date"].notna()]
    usage_df = Analytics.process_filters(usage_df, filters=case_service_filter)
    usage_events = set(usage_df["event_id"])
    item_usage_df = usage_df[usage_df["item_id"] == item_id]
    # Surgeries: dated, after 2016-01-01, same case service, present in usage.
    surgery_df = pre_process_columns(analytics.surgery_df)
    surgery_df = surgery_df[surgery_df["start_date"].notna()]
    surgery_df = surgery_df[
        surgery_df["start_date"] > datetime.date(2016, 1, 1)]
    surgery_df = Analytics.process_filters(surgery_df,
                                           filters=case_service_filter)
    surgery_df = surgery_df[surgery_df["event_id"].isin(usage_events)]
    # Left join: surgeries with no usage row for this item get used_qty = 0.
    surgery_df = surgery_df.join(
        item_usage_df.set_index("event_id")[["used_qty"]],
        on="event_id",
        how="left").fillna(0)
    # frozenset is hashable and compares equal to a plain set of the same
    # elements, so the equality filter below works for either input type.
    surgery_df["procedures"] = surgery_df["procedures"].apply(
        lambda x: frozenset(x))
    surgery_df = surgery_df[surgery_df["procedures"] == procedure_set]
    x_max = int(max(surgery_df["used_qty"])) + 1
    data = surgery_df["used_qty"]
    fn = "__".join(procedure_set)
    fn = "Usage_Dist_item_" + item_id + "_" + fn.replace(" ", "_")
    import matplotlib
    matplotlib.rcParams.update({'font.size': 12})
    plt.figure(figsize=(4, 3.5))
    plt.tight_layout()
    plt.gcf().subplots_adjust(bottom=0.15, left=0.15)
    # Integer-aligned bins: one bin per possible used quantity.
    n, bins, patches = plt.hist(data,
                                range(x_max + 1),
                                density=True,
                                facecolor='#08306b',
                                rwidth=0.95)
    # ~4 y-ticks rounded to one decimal, spanning the tallest bar.
    spacing = np.round((max(n) + 0.1) / 4, decimals=1)
    plt.yticks(np.arange(0, max(n) + 0.1, spacing))
    plt.ylabel("Probability")
    plt.xlabel("Used Quantity")
    plt.xticks(range(x_max + 1))
    plt.savefig(fn + ".svg", format='svg')
    plt.savefig(fn + ".eps", format='eps')
def test_usage_r_regression_flow(item_id=None, save_results=False):
    """Backward-elimination regression flow with tail trimming and residual
    diagnostics for one item.

    Fits gaussian models via R, iteratively drops features with
    p-value > pthres or occurrence < occ_thres, then fits a final poisson
    model, writes results to ``regression_results/<item_id>`` and
    ``r_scripts/test_data2.csv``, and plots residual histograms (plotly HTML,
    plus a matplotlib PNG when `save_results` is True).

    Args:
        item_id: Item to model; defaults to "38242" when falsy.
        save_results: If True, also save the matplotlib residual plot as PNG.

    Returns:
        dict summary with usage/trim statistics and the final r2.
    """
    summary = {"item_id": item_id}
    # Wide pandas display so printed feature tables are readable.
    pd.set_option('display.max_rows', 500)
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)
    pd.options.mode.chained_assignment = None
    case_service = "Cardiac Surgery"
    item_id = item_id if item_id else "38242"
    pthres = 0.05       # max p-value for a feature to survive elimination
    occ_thres = 5       # min occurrences for a feature to survive
    tail_trim = 0.01    # fraction of the usage upper tail to discard
    analytics = ScmAnalytics.ScmAnalytics(lhs_config)
    surgery_df = analytics.surgery_df
    usage_df = analytics.usage_df
    item_ids = [item_id]
    surgery_df = surgery_df[surgery_df["case_service"] == case_service]
    usage_df = usage_df[usage_df["case_service"] == case_service]
    surgery_df = surgery_df[surgery_df["event_id"].isin(
        set(usage_df["event_id"]))]
    # Underscore-normalize procedure names so they are valid feature names.
    surgery_df["procedures"] = surgery_df["procedures"].apply(
        lambda x: set(e.replace(" ", "_") for e in x))
    all_procedures = set.union(*surgery_df["procedures"])
    r_df = SURegressionModel.surgery_usage_regression_df(surgery_df,
                                                         usage_df,
                                                         item_ids=item_ids)
    if tail_trim:
        # Trim the top tail_trim fraction of usages (outliers) and record
        # summary stats about what was discarded.
        usage_df = usage_df[usage_df["item_id"] == item_id]
        trim_index = int(len(r_df) * (1 - tail_trim))
        expected_usage = np.mean(r_df[item_id])
        max_usage = max(r_df[item_id])
        usage_prob = len(usage_df) / len(surgery_df)
        trim_thres = r_df.sort_values(by=[item_id])[item_id].iloc[trim_index]
        discard_ratio = len(
            usage_df[usage_df["used_qty"] > trim_thres]) / len(usage_df)
        usage_df = usage_df[usage_df["used_qty"] <= trim_thres]
        print("Usage Probability:", usage_prob)
        print("Mean Usage:", expected_usage)
        print("Max Usage:", max_usage)
        print("Trim Threshold:", trim_thres)
        print("Discard Ratio:", discard_ratio)
        summary["usage_p"] = usage_prob
        summary["mean_usage"] = expected_usage
        summary["max_usage"] = max_usage
        summary["trim_thres"] = trim_thres
        summary["discard_ratio"] = discard_ratio
    # Start from all procedures and all pairwise interactions.
    interactions = list([
        Interaction((p1, p2))
        for p1, p2 in combinations(sorted(list(all_procedures)), 2)
    ])
    features = sorted(list(all_procedures))
    data, feature_df = SURegressionModel.extract_features_data(
        r_df,
        features,
        all_procedures,
        interactions,
        other=True,
        sum_others=False)
    print(feature_df)
    # Pre-filter rare features before the first regression.
    feature_df = feature_df[feature_df["occurrence"] >= occ_thres]
    interactions = list(
        filter(lambda x: str(x) in set(feature_df["feature"]), interactions))
    features = list(filter(lambda x: x in set(feature_df["feature"]),
                           features))
    # Backward elimination: refit, drop insignificant/rare features, repeat
    # until every remaining feature passes both thresholds.
    while True:
        data, feature_df = SURegressionModel.extract_features_data(
            r_df,
            features,
            all_procedures,
            interactions,
            other=True,
            sum_others=False)
        data["y"] = list(r_df[item_id])
        feature_df, r2, _ = SURegressionModel.run_r_regression(
            data, feature_df, model="gaussian")
        print(feature_df)
        print("r2:", r2)
        thres_df = feature_df[feature_df["feature"].isin(features +
                                                         interactions)]
        if thres_df[thres_df["p.value"] > pthres].empty and thres_df[
                thres_df["occurrence"] < occ_thres].empty:
            break
        feature_df = feature_df[feature_df["p.value"] <= pthres]
        feature_df = feature_df[feature_df["occurrence"] >= occ_thres]
        interactions = list(
            filter(lambda x: str(x) in set(feature_df["feature"]),
                   interactions))
        features = list(
            filter(lambda x: x in set(feature_df["feature"]), features))
    # Final poisson fit with the surviving features; keep fitted values for
    # residual diagnostics.
    feature_df = feature_df[["feature", "occurrence"]]
    feature_df, r2, fitted_y = SURegressionModel.run_r_regression(
        data, feature_df, model="poisson")
    residuals = fitted_y - data["y"]
    # Baseline: residuals of a constant (mean-only) model, for comparison.
    constant_residuals = np.mean(data["y"]) - data["y"]
    feature_df.to_csv(os.path.join("regression_results", item_id),
                      index=False)
    data.to_csv(os.path.join("r_scripts", "test_data2.csv"), index=False)
    print(feature_df)
    print("r2:", r2)
    summary["r2"] = r2
    # Histogram bins of width `step`, offset half a step so integer residuals
    # fall at bin centers; norm_x is a finer grid for the normal overlays.
    step = 0.5
    s = np.floor(min(residuals)) - step / 2
    e = np.ceil(max(residuals)) + step / 2
    mu = np.mean(residuals)
    std = np.std(residuals, ddof=1)
    bins = np.arange(s, e, step)
    norm_x = np.arange(s, e, step / 10)
    # NOTE(review): `weights` is computed but never used.
    weights = np.ones(len(residuals)) / len(residuals)
    traces = [
        go.Histogram(x=residuals,
                     name='Poisson Residuals (Fit - Empirical)',
                     xbins=dict(start=s, end=e, size=step),
                     histnorm='probability density',
                     opacity=0.75),
        go.Scatter(
            x=norm_x,
            y=stats.norm.pdf(norm_x, mu, std),
            mode='lines',
            name='Poisson Residuals mu={0:.5f}, sigma={1:.2f}'.format(mu, std),
        ),
        go.Histogram(x=constant_residuals,
                     name='Constant Model Residuals (Fit - Empirical)',
                     xbins=dict(start=s, end=e, size=step),
                     histnorm='probability density',
                     opacity=0.75),
        go.Scatter(
            x=norm_x,
            y=stats.norm.pdf(norm_x, 0, np.std(data["y"], ddof=1)),
            mode='lines',
            name='Constant Residuals, sigma={0:.2f}'.format(
                np.std(data["y"], ddof=1)),
        )
    ]
    layout = go.Layout(title="Residuals",
                       xaxis={'title': 'Residual'},
                       yaxis={'title': 'Probability Density'})
    figure = go.Figure(data=traces, layout=layout)
    plot(figure,
         filename="{0}_residuals_r2_{1:0.2f}.html".format(item_id, r2))
    # Matplotlib version of the residual histogram with a normal overlay.
    plt.hist(residuals,
             bins=bins,
             density=True,
             rwidth=0.96,
             alpha=0.5,
             label="Residuals 'fit - empirical'")
    plt.plot(norm_x,
             stats.norm.pdf(norm_x, mu, std),
             label="mu={0:.5f}, sigma={1:.2f}".format(mu, std))
    plt.title("Residuals Histogram from Regression Model")
    plt.ylabel("Probability Density")
    plt.xlabel("Residual")
    plt.legend()
    if save_results:
        plt.savefig("{0}_surgery_item_usage_residuals_r2_{1:0.2f}.png".format(
            item_id, r2),
                    format="png")
    #plt.show()
    return summary