def get_naive_decision_tree():
    """Expand sparse thresholds, build the decision tree, and return it as JSON.

    Expects a JSON payload with "labels", "records" and "outPath".  Even-indexed
    columns of the records are low thresholds, odd-indexed are high thresholds.
    Returns the expanded threshold matrix (stringified), its column labels, and
    the constructed tree's JSON representation.
    """
    payload = request.get_json()
    labels, records, out_path = (
        payload["labels"],
        payload["records"],
        payload["outPath"],
    )
    df = pandas.DataFrame.from_records(records, columns=labels)
    # Even columns hold the low thresholds, odd columns the high thresholds.
    thrL, thrH = df.iloc[:, ::2], df.iloc[:, 1::2]
    dt = DecisionTree(thrL_in=thrL, thrH_in=thrH)
    dt.create()
    df_out = dt.thrL_out.join(dt.thrH_out)
    # BUGFIX: DataFrame.as_matrix() was deprecated in pandas 0.23 and removed
    # in pandas 1.0; use .values instead (consistent with the other endpoints
    # in this module).
    matrix = [[str(cell) for cell in row] for row in df_out.values.tolist()]
    predictor_matrix = ASCIIDecoder(path=out_path).dataframe
    tree = dt.construct_tree(predictor_matrix).json
    return jsonify({
        "records": matrix,
        "labels": list(df_out.columns),
        "tree": [tree],
    })
def test_decision_tree_with_predefined_threshold_splits():
    """Blank cells in a sparse breakpoint table inherit the enclosing split,
    expanding into the full cartesian product of threshold rows.

    Only WSPD (2 splits) and SR (3 splits) vary, so 6 weather types result.
    """
    # Removed commented-out dead code (`# root = dt.construct_tree()`).
    records = [
        ("-inf", "0.25", "-inf", "2", "5", "20", "-inf", "inf", "-inf", "70"),
        ("", "", "", "", "20", "inf", "", "", "70", "275"),
        ("", "", "", "", "", "", "", "", "275", "inf"),
    ]
    labels = [
        "CPR_thrL",
        "CPR_thrH",
        "TP_thrL",
        "TP_thrH",
        "WSPD_thrL",
        "WSPD_thrH",
        "CAPE_thrL",
        "CAPE_thrH",
        "SR_thrL",
        "SR_thrH",
    ]
    df = pandas.DataFrame.from_records(records, columns=labels)
    thrL, thrH = df.iloc[:, ::2], df.iloc[:, 1::2]
    dt = DecisionTree(thrL_in=thrL, thrH_in=thrH)
    dt.create()
    expected_thrL_matrix = [
        [float("-inf"), float("-inf"), 5., float("-inf"), float("-inf")],
        [float("-inf"), float("-inf"), 5., float("-inf"), 70.],
        [float("-inf"), float("-inf"), 5., float("-inf"), 275.],
        [float("-inf"), float("-inf"), 20., float("-inf"), float("-inf")],
        [float("-inf"), float("-inf"), 20., float("-inf"), 70.],
        [float("-inf"), float("-inf"), 20., float("-inf"), 275.],
    ]
    assert np.array_equal(dt.thrL_out, expected_thrL_matrix)
    expected_thrH_matrix = [
        [0.25, 2., 20., float("inf"), 70.],
        [0.25, 2., 20., float("inf"), 275.],
        [0.25, 2., 20., float("inf"), float("inf")],
        [0.25, 2., float("inf"), float("inf"), 70.],
        [0.25, 2., float("inf"), float("inf"), 275.],
        [0.25, 2., float("inf"), float("inf"), float("inf")],
    ]
    assert np.array_equal(dt.thrH_out, expected_thrH_matrix)
def create_weather_types_matrix():
    """Expand sparse thresholds into the full weather-type matrix.

    Returns the expanded low/high threshold grid, re-ordered to the incoming
    label order, with every cell stringified for JSON transport.
    """
    body = request.get_json()
    labels = body["labels"]
    frame = pandas.DataFrame.from_records(body["records"], columns=labels)
    # Low thresholds live in the even columns, high thresholds in the odd ones.
    low_in = frame.iloc[:, ::2]
    high_in = frame.iloc[:, 1::2]
    tree = DecisionTree(thrL_in=low_in, thrH_in=high_in)
    low_out, high_out = tree.create()
    combined = pandas.concat([low_out, high_out], axis=1)[labels]
    rows = [[str(value) for value in row] for row in combined.values]
    return jsonify({"matrix": rows})
def get_error_rep():
    """Compute the representative error matrix for the given thresholds.

    Reads the predictor data from "path", evaluates ``cal_rep_error`` with
    "numCols" bins, and returns the result serialized as CSV text.
    """
    body = request.get_json()
    labels = body["labels"]
    raw_matrix = body["matrix"]
    data_path = body["path"]
    num_cols = body["numCols"]
    numeric = [[float(value) for value in row] for row in raw_matrix]
    frame = pandas.DataFrame.from_records(numeric, columns=labels)
    # Even columns = low thresholds, odd columns = high thresholds.
    low = frame.iloc[:, ::2]
    high = frame.iloc[:, 1::2]
    predictor_matrix = ASCIIDecoder(path=data_path).dataframe
    rep = DecisionTree.cal_rep_error(
        predictor_matrix, thrL_out=low, thrH_out=high, nBin=int(num_cols)
    )
    buffer = StringIO()
    np.savetxt(buffer, rep, delimiter=",")
    return jsonify(buffer.getvalue())
def get_decision_tree():
    """Construct a decision tree from an already-expanded threshold matrix
    and return its JSON representation wrapped in a single-element list."""
    body = request.get_json()
    labels = body["labels"]
    numeric = [[float(value) for value in row] for row in body["matrix"]]
    frame = pandas.DataFrame.from_records(numeric, columns=labels)
    # Even columns = low thresholds, odd columns = high thresholds.
    low = frame.iloc[:, ::2]
    high = frame.iloc[:, 1::2]
    root = DecisionTree.construct_tree(thrL_out=low, thrH_out=high)
    return jsonify([root.json])
def get_error_rep():
    """Compute the representative error matrix and return it as CSV text.

    Sanitizes the incoming data path, loads the point data (optionally with
    the "cheaper" loader), and evaluates ``cal_rep_error`` with "numCols" bins.
    """
    body = request.get_json()
    labels = body["labels"]
    raw_matrix = body["matrix"]
    data_path = sanitize_path(body["path"])
    num_cols = body["numCols"]
    cheaper = body["cheaper"]
    ranges = body["ranges"]
    numeric = [[float(value) for value in row] for row in raw_matrix]
    frame = pandas.DataFrame.from_records(numeric, columns=labels)
    # Even columns = low thresholds, odd columns = high thresholds.
    low = frame.iloc[:, ::2]
    high = frame.iloc[:, 1::2]
    loader = load_point_data_by_path(data_path, cheaper=cheaper)
    tree = DecisionTree(threshold_low=low, threshold_high=high, ranges=ranges)
    rep = tree.cal_rep_error(loader, nBin=int(num_cols))
    buffer = StringIO()
    rep.to_csv(buffer)
    return jsonify(buffer.getvalue())
def get_wt_codes():
    """Return the weather-type codes for the given threshold matrix."""
    body = request.get_json()
    labels = body["labels"]
    numeric = [[float(value) for value in row] for row in body["matrix"]]
    frame = pandas.DataFrame.from_records(numeric, columns=labels)
    # Even columns = low thresholds, odd columns = high thresholds.
    low = frame.iloc[:, ::2]
    high = frame.iloc[:, 1::2]
    return jsonify({"codes": DecisionTree.wt_code(low, high)})
def get_decision_tree():
    """Build a decision tree from expanded thresholds plus field ranges and
    return its JSON representation wrapped in a single-element list."""
    body = request.get_json()
    labels = body["labels"]
    ranges = body["fieldRanges"]
    numeric = [[float(value) for value in row] for row in body["matrix"]]
    frame = pandas.DataFrame.from_records(numeric, columns=labels)
    # Even columns = low thresholds, odd columns = high thresholds.
    low = frame.iloc[:, ::2]
    high = frame.iloc[:, 1::2]
    tree = DecisionTree(threshold_low=low, threshold_high=high, ranges=ranges)
    return jsonify([tree.tree.json])
def get_wt_codes():
    """Return the leaf (weather-type) codes of the tree built from the given
    expanded thresholds and field ranges."""
    body = request.get_json()
    labels = body["labels"]
    ranges = body["fieldRanges"]
    numeric = [[float(value) for value in row] for row in body["matrix"]]
    frame = pandas.DataFrame.from_records(numeric, columns=labels)
    # Even columns = low thresholds, odd columns = high thresholds.
    low = frame.iloc[:, ::2]
    high = frame.iloc[:, 1::2]
    tree = DecisionTree(threshold_low=low, threshold_high=high, ranges=ranges)
    return jsonify({"codes": tree.leaf_codes})
def create_weather_types_matrix():
    """Expand sparse thresholds (with field ranges) into the full weather-type
    matrix, re-ordered to the incoming label order and stringified."""
    body = request.get_json()
    labels = body["labels"]
    ranges = body["fieldRanges"]
    frame = pandas.DataFrame.from_records(body["records"], columns=labels)
    # Even columns = low thresholds, odd columns = high thresholds.
    low_in = frame.iloc[:, ::2]
    high_in = frame.iloc[:, 1::2]
    tree = DecisionTree.create_from_sparse_thresholds(
        low=low_in, high=high_in, ranges=ranges
    )
    combined = pandas.concat(
        [tree.threshold_low, tree.threshold_high], axis=1
    )[labels]
    rows = [[str(value) for value in row] for row in combined.values]
    return jsonify({"matrix": rows})
def test_decision_tree_with_predefined_threshold_splits(sparse_breakpoints):
    """The sparse breakpoint fixture expands into the full 6-row threshold
    grid (two splits on the third predictor times three on the fifth)."""
    low_in, high_in, ranges = sparse_breakpoints
    tree = DecisionTree.create_from_sparse_thresholds(
        low=low_in, high=high_in, ranges=ranges
    )
    neg = float("-inf")
    pos = float("inf")
    want_low = [
        [neg, neg, 5.0, neg, neg],
        [neg, neg, 5.0, neg, 70.0],
        [neg, neg, 5.0, neg, 275.0],
        [neg, neg, 20.0, neg, neg],
        [neg, neg, 20.0, neg, 70.0],
        [neg, neg, 20.0, neg, 275.0],
    ]
    assert np.array_equal(tree.threshold_low, want_low)
    want_high = [
        [0.25, 2.0, 20.0, pos, 70.0],
        [0.25, 2.0, 20.0, pos, 275.0],
        [0.25, 2.0, 20.0, pos, pos],
        [0.25, 2.0, pos, pos, 70.0],
        [0.25, 2.0, pos, pos, 275.0],
        [0.25, 2.0, pos, pos, pos],
    ]
    assert np.array_equal(tree.threshold_high, want_high)
# NOTE(review): the source's line-wrapping is mangled here; tokens are kept
# verbatim rather than risking a restructure of this literal.
# Verifies the complete JSON tree built from the `breakpoints` fixture:
# splits over predictors cpr (2) x tp_acc (2) x cp_acc (3) x sr24h (3)
# give 36 leaves (idxWT 0..35).  Every node carries a "meta" dict with the
# predictor name, its depth "level", a weather-type index "idxWT", and a
# digit-per-level "code" string; node shapes are stripped by
# `strip_node_shape` before comparison.  `cape_wa` appears in `ranges` but
# not in any node name -- presumably its range is unbounded so it produces
# no split; TODO confirm against DecisionTree.
def test_decision_tree_construction(breakpoints): low, high = breakpoints ranges = { "cpr": ["-inf", "inf"], "tp_acc": ["-inf", "inf"], "cp_acc": ["-inf", "inf"], "sr24h": ["-inf", "inf"], "cape_wa": ["-inf", "inf"], } dt = DecisionTree(threshold_low=low, threshold_high=high, ranges=ranges) expected = { "name": "Root", "children": [ { "name": "-inf < cpr < 0.25", "children": [ { "name": "-inf < tp_acc < 2", "children": [ { "name": "-inf < cp_acc < 5", "children": [ { "name": "-inf < sr24h < 70", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 0, "code": "11101", }, }, { "name": "70 < sr24h < 275", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 1, "code": "11102", }, }, { "name": "275 < sr24h < inf", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 2, "code": "11103", }, }, ], "parent": None, "meta": { "predictor": "cp_acc", "level": 2, "idxWT": 2, "code": "11100", }, }, { "name": "5 < cp_acc < 20", "children": [ { "name": "-inf < sr24h < 70", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 3, "code": "11201", }, }, { "name": "70 < sr24h < 275", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 4, "code": "11202", }, }, { "name": "275 < sr24h < inf", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 5, "code": "11203", }, }, ], "parent": None, "meta": { "predictor": "cp_acc", "level": 2, "idxWT": 5, "code": "11200", }, }, { "name": "20 < cp_acc < inf", "children": [ { "name": "-inf < sr24h < 70", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 6, "code": "11301", }, }, { "name": "70 < sr24h < 275", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 7, "code": "11302", }, }, { "name": "275 < sr24h < inf", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, 
"idxWT": 8, "code": "11303", }, }, ], "parent": None, "meta": { "predictor": "cp_acc", "level": 2, "idxWT": 8, "code": "11300", }, }, ], "parent": None, "meta": { "predictor": "tp_acc", "level": 1, "idxWT": 6, "code": "11000", }, }, { "name": "2 < tp_acc < inf", "children": [ { "name": "-inf < cp_acc < 5", "children": [ { "name": "-inf < sr24h < 70", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 9, "code": "12101", }, }, { "name": "70 < sr24h < 275", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 10, "code": "12102", }, }, { "name": "275 < sr24h < inf", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 11, "code": "12103", }, }, ], "parent": None, "meta": { "predictor": "cp_acc", "level": 2, "idxWT": 11, "code": "12100", }, }, { "name": "5 < cp_acc < 20", "children": [ { "name": "-inf < sr24h < 70", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 12, "code": "12201", }, }, { "name": "70 < sr24h < 275", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 13, "code": "12202", }, }, { "name": "275 < sr24h < inf", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 14, "code": "12203", }, }, ], "parent": None, "meta": { "predictor": "cp_acc", "level": 2, "idxWT": 14, "code": "12200", }, }, { "name": "20 < cp_acc < inf", "children": [ { "name": "-inf < sr24h < 70", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 15, "code": "12301", }, }, { "name": "70 < sr24h < 275", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 16, "code": "12302", }, }, { "name": "275 < sr24h < inf", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 17, "code": "12303", }, }, ], "parent": None, "meta": { "predictor": "cp_acc", "level": 2, "idxWT": 17, "code": "12300", }, }, ], 
"parent": None, "meta": { "predictor": "tp_acc", "level": 1, "idxWT": 15, "code": "12000", }, }, ], "parent": None, "meta": { "predictor": "cpr", "level": 0, "idxWT": 9, "code": "10000" }, }, { "name": "0.25 < cpr < inf", "children": [ { "name": "-inf < tp_acc < 2", "children": [ { "name": "-inf < cp_acc < 5", "children": [ { "name": "-inf < sr24h < 70", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 18, "code": "21101", }, }, { "name": "70 < sr24h < 275", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 19, "code": "21102", }, }, { "name": "275 < sr24h < inf", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 20, "code": "21103", }, }, ], "parent": None, "meta": { "predictor": "cp_acc", "level": 2, "idxWT": 20, "code": "21100", }, }, { "name": "5 < cp_acc < 20", "children": [ { "name": "-inf < sr24h < 70", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 21, "code": "21201", }, }, { "name": "70 < sr24h < 275", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 22, "code": "21202", }, }, { "name": "275 < sr24h < inf", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 23, "code": "21203", }, }, ], "parent": None, "meta": { "predictor": "cp_acc", "level": 2, "idxWT": 23, "code": "21200", }, }, { "name": "20 < cp_acc < inf", "children": [ { "name": "-inf < sr24h < 70", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 24, "code": "21301", }, }, { "name": "70 < sr24h < 275", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 25, "code": "21302", }, }, { "name": "275 < sr24h < inf", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 26, "code": "21303", }, }, ], "parent": None, "meta": { "predictor": "cp_acc", "level": 2, "idxWT": 26, "code": "21300", }, }, 
], "parent": None, "meta": { "predictor": "tp_acc", "level": 1, "idxWT": 24, "code": "21000", }, }, { "name": "2 < tp_acc < inf", "children": [ { "name": "-inf < cp_acc < 5", "children": [ { "name": "-inf < sr24h < 70", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 27, "code": "22101", }, }, { "name": "70 < sr24h < 275", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 28, "code": "22102", }, }, { "name": "275 < sr24h < inf", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 29, "code": "22103", }, }, ], "parent": None, "meta": { "predictor": "cp_acc", "level": 2, "idxWT": 29, "code": "22100", }, }, { "name": "5 < cp_acc < 20", "children": [ { "name": "-inf < sr24h < 70", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 30, "code": "22201", }, }, { "name": "70 < sr24h < 275", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 31, "code": "22202", }, }, { "name": "275 < sr24h < inf", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 32, "code": "22203", }, }, ], "parent": None, "meta": { "predictor": "cp_acc", "level": 2, "idxWT": 32, "code": "22200", }, }, { "name": "20 < cp_acc < inf", "children": [ { "name": "-inf < sr24h < 70", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 33, "code": "22301", }, }, { "name": "70 < sr24h < 275", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 34, "code": "22302", }, }, { "name": "275 < sr24h < inf", "children": [], "parent": None, "meta": { "predictor": "sr24h", "level": 4, "idxWT": 35, "code": "22303", }, }, ], "parent": None, "meta": { "predictor": "cp_acc", "level": 2, "idxWT": 35, "code": "22300", }, }, ], "parent": None, "meta": { "predictor": "tp_acc", "level": 1, "idxWT": 33, "code": "22000", }, }, ], "parent": None, "meta": { 
"predictor": "cpr", "level": 0, "idxWT": 27, "code": "20000" }, }, ], "parent": None, "meta": { "level": -1, "idxWT": 18, "code": "00000" }, } assert strip_node_shape(dt.tree.json) == expected
# NOTE(review): the source's line-wrapping is mangled here; tokens are kept
# verbatim.  This endpoint persists tool output according to payload["mode"]:
#   "breakpoints" - write the raw breakpoints CSV to the output path
#   "mf"          - compute the representative-error matrix via
#                   DecisionTree.cal_rep_error and write it as CSV
#   "wt"          - evaluate each weather type row-by-row and save one
#                   histogram PNG per WT code
#   "bias"        - evaluate each weather type and write a (WT Code, Bias) CSV
#   "all"         - do all of the above into a versioned output directory
#                   ({family}{accumulation}{datasetName}_{version}), plus a
#                   README.txt and a clone of the PDT without the excluded
#                   predictor columns (ascii/parquet extension chosen from
#                   pdt_path; anything else falls back to "ascii")
# Note that `path` is rebound inside each mode branch; the branches are
# order-dependent and intentionally share `output_path`/`loader` setup.
def save_operation(): payload = request.get_json() labels = payload["labels"] matrix = payload["matrix"] ranges = payload["fieldRanges"] pdt_path = sanitize_path(payload["pdtPath"]) mf_cols = payload["mfcols"] cheaper = payload["cheaper"] mode = payload["mode"] output_path = Path(sanitize_path(payload["outPath"])) if mode == "all": version = payload["version"] family = payload["family"] accumulation = payload["accumulation"] accumulation = f"{accumulation}h" if accumulation else "" dataset_name = payload["datasetName"] output_path = output_path / f"{family}{accumulation}{dataset_name}_{version}" os.makedirs(output_path, exist_ok=True) if mode in ["breakpoints", "all"]: csv = payload["breakpointsCSV"] path = output_path if mode == "all": path = path / "BP.csv" with open(path, "w") as f: f.write(csv) if mode in ["mf", "all"]: matrix = [[float(cell) for cell in row] for row in matrix] df = pandas.DataFrame.from_records(matrix, columns=labels) thrL, thrH = df.iloc[:, ::2], df.iloc[:, 1::2] loader = load_point_data_by_path(pdt_path, cheaper=cheaper) dt = DecisionTree(threshold_low=thrL, threshold_high=thrH, ranges=ranges) rep = dt.cal_rep_error(loader, nBin=int(mf_cols)) path = output_path if mode == "all": path = path / f"{loader.error_type.name}.csv" with open(path, "w") as f: rep.to_csv( f, header=[str(i + 1) for i in range(int(mf_cols))], index_label="WT Code", ) if mode in ["wt", "all"]: ylim = payload["yLim"] bins = payload["bins"] num_bins = payload["numBins"] thrGridOut = payload["thrGridOut"] matrix = [[float(cell) for cell in row[1:]] for row in thrGridOut] df = pandas.DataFrame.from_records(matrix, columns=labels) loader = load_point_data_by_path(pdt_path, cheaper=cheaper) bins = [float(each) for each in bins] thrL_out, thrH_out = df.iloc[:, ::2], df.iloc[:, 1::2] path = output_path if mode == "all": path = path / "WTs" os.makedirs(path, exist_ok=True) for idx in range(len(thrL_out)): thrL = thrL_out.iloc[idx] thrH = thrH_out.iloc[idx] wt = WeatherType( 
thrL=thrL, thrH=thrH, thrL_labels=labels[::2], thrH_labels=labels[1::2] ) dataframe, title_tokens = wt.evaluate(loader.error_type.name, loader=loader) title = wrap_title(title=title_tokens, chunk_size=6) error = dataframe[loader.error_type.name] wt_code = thrGridOut[idx][0] wt.plot( error, bins, title, y_lim=int(ylim), num_bins=int(num_bins), out_path=os.path.join(path, f"WT_{wt_code}.png"), ) if mode in ["bias", "all"]: thrGridOut = payload["thrGridOut"] bins = payload["bins"] num_bins = payload["numBins"] bins = [float(each) for each in bins] matrix = [[float(cell) for cell in row[1:]] for row in thrGridOut] df = pandas.DataFrame.from_records(matrix, columns=labels) loader = load_point_data_by_path(pdt_path, cheaper=cheaper) thrL_out, thrH_out = df.iloc[:, ::2], df.iloc[:, 1::2] path = output_path if mode == "all": path = path / "Bias.csv" csv = [] for idx in range(len(thrL_out)): thrL = thrL_out.iloc[idx] thrH = thrH_out.iloc[idx] wt = WeatherType( thrL=thrL, thrH=thrH, thrL_labels=labels[::2], thrH_labels=labels[1::2] ) dataframe, title_tokens = wt.evaluate(loader.error_type.name, loader=loader) error = dataframe[loader.error_type.name] discretized_error = wt.discretize_error(error=error, num_bins=int(num_bins)) bias = loader.error_type.bias( error=discretized_error, low=bins[0], high=bins[-1] ) bias = f"{bias:.2f}" wt_code = thrGridOut[idx][0] csv += [(wt_code, bias)] pandas.DataFrame.from_records(csv, columns=["WT Code", "Bias"]).to_csv( path, index=False ) if mode == "all": family = payload["family"] version = payload["version"] accumulation = payload["accumulation"] accumulation = f", {accumulation}-hourly" if accumulation else "" with open(output_path / "README.txt", "w") as f: text = dedent( f""" ecPoint-{family}{accumulation} Version: {version} Timestamp: {datetime.now()} """ ) f.write(text.lstrip()) loader = load_point_data_by_path(pdt_path, cheaper=cheaper) if pdt_path.endswith(".ascii"): ext = "ascii" elif pdt_path.endswith(".parquet"): ext = 
"parquet" else: ext = "ascii" exclude_cols = payload["excludePredictors"] cols = [col for col in loader.columns if col not in exclude_cols] loader.clone(*cols, path=output_path / f"PDT.{ext}") return Response(json.dumps({}), mimetype="application/json")