def get_model_data(model_type, model_name, compression_options=None):
    """Load the benchmark records stored for a model, with in-memory caching.

    The records are read from the packaged resource
    ``shrynk/<model_type>_<model_name>.jsonl.gzip`` (inside the ``data``
    package). If that raises ``FileNotFoundError`` the plain
    ``<model_type>_<model_name>.jsonl`` file resolved through ``shrynk_path``
    is read instead; if that is missing too, an empty list is used.

    The unfiltered records are memoised in ``data_cache`` under the key
    ``"<model_type>_<model_name>"``. The objects returned when
    ``compression_options`` is None are the cached ones, so callers should
    treat them as read-only.

    Args:
        model_type: First part of the resource/file name and of the cache key.
        model_name: Second part of the resource/file name and of the cache key.
        compression_options: Optional iterable of JSON-serialisable option
            objects. When given, each record's ``"bench"`` list only keeps
            the entries whose ``"kwargs"`` string equals the JSON dump of one
            of these options.

    Returns:
        A list with one decoded JSON object per non-empty line of the source.
    """
    from preconvert.output import json

    # The cache is keyed by type AND name. Checking membership with the bare
    # model name (as the code used to do) never matched a stored key, so the
    # cache was effectively dead.
    cache_key = model_type + "_" + model_name
    if cache_key in data_cache:
        data = data_cache[cache_key]
    else:
        try:
            raw = pkgutil.get_data(
                "data", "shrynk/{}_{}.jsonl.gzip".format(model_type, model_name)
            )
            data = [
                json.loads(line)
                for line in decompress(raw).decode("utf8").split("\n")
                if line.strip()
            ]
        except FileNotFoundError:
            try:
                with open(shrynk_path("{}_{}.jsonl".format(model_type, model_name))) as f:
                    data = [json.loads(x) for x in f.read().split("\n") if x]
            except FileNotFoundError:
                data = []
        data_cache[cache_key] = data

    if compression_options is None:
        return data

    # Filter on shallow copies so the cached records stay complete; a later
    # call with different compression options must still see every entry.
    known_kwargs = {json.dumps(x) for x in compression_options}
    return [
        dict(record, bench=[y for y in record["bench"] if y["kwargs"] in known_kwargs])
        for record in data
    ]
def iread(fn):
    """Lazily yield one decoded JSON value per line of a JSON-lines file.

    This is a generator: nothing (including the argument check) runs until
    the first item is requested.

    Args:
        fn: Path of the file to read; must be a ``str``.

    Yields:
        The result of ``json.loads`` for each line, in file order.

    Raises:
        TypeError: If ``fn`` is not a string.
        Exception: If a line cannot be processed; the arguments are a message
            carrying the zero-based index of the line, and the raw line text.
    """
    from preconvert.output import json

    if isinstance(fn, str):
        with open(fn) as handle:
            for index, raw_line in enumerate(handle):
                try:
                    # The yield stays inside the try block on purpose, so an
                    # exception thrown into the generator at this point is
                    # wrapped the same way as a decoding failure.
                    yield json.loads(raw_line)
                except Exception:
                    template = "JSON-L parsing error in line number {} in the jsonl file"
                    raise Exception(template.format(index), raw_line)
    else:
        raise TypeError("Cannot iteratively read compressed file now")
def predict(self, features):
    """Return the decoded prediction of ``self.clf`` for the given input.

    Args:
        features: A pandas DataFrame (converted with ``self.get_features``
            first), a dict of feature values (wrapped into a one-row
            DataFrame), or anything else that offers ``fillna`` and is
            accepted by ``self.clf.predict``.

    Returns:
        The JSON-decoded first prediction. Presumably this is the dict of
        compression keyword arguments chosen by the model -- confirm against
        the training code.
    """
    from preconvert.output import json

    if isinstance(features, pd.DataFrame):
        features = self.get_features(features)
    # Deliberately a second, independent check: get_features may itself
    # return a dict that still has to be wrapped into a frame.
    frame = pd.DataFrame([features]) if isinstance(features, dict) else features

    # Installs a process-wide filter hiding DeprecationWarnings raised from
    # modules whose name matches the 'sklearn*' pattern.
    warnings.filterwarnings(
        action='ignore', category=DeprecationWarning, module='sklearn*'
    )

    # Missing feature values are replaced by the sentinel -100 before predicting.
    first = self.clf.predict(frame.fillna(-100))[0]
    if isinstance(first, str):
        encoded = first
    else:
        # Non-string result: take its first element as the encoded label.
        encoded = first[0]
    return json.loads(encoded)
def get_benchmark_html(df, fname):
    """Benchmark ``df``, compare against the prediction and render HTML.

    Steps, in order:
      1. Featurize ``df`` with ``pdc.get_features``.
      2. Obtain benchmark results. In production they are read from the blob
         returned by ``get_blob(features)`` when it exists; otherwise (and
         always outside production) ``pdc.run_benchmarks`` is executed.
      3. Rank the benchmarked options by the weighted, scaled score stored
         in the ``"z <weights>"`` column.
      4. Locate the option predicted by ``pdc.infer`` in that ranking.
      5. In production, when the benchmarks were freshly run, attach request
         metadata under ``results["web"]`` and upload the results to the blob.
      6. Build the HTML fragment.

    Args:
        df: The data to benchmark; passed to ``pdc.get_features`` and
            ``pdc.run_benchmarks``.
        fname: Display name of the uploaded file. It is HTML-escaped before
            being written into the page, and stored unescaped in the saved
            results.

    Returns:
        The HTML fragment as a plain ``str``.
    """
    # Local import keeps this block self-contained; stdlib only.
    from html import escape as html_escape

    features = pdc.get_features(df)
    bench_res = None
    save = False
    if IN_PRODUCTION:
        blob = get_blob(features)
        if blob.exists():
            results = json.loads(blob.download_as_string())
            bench_res = results["bench"]
        else:
            results = pdc.run_benchmarks(df, save=False, ignore_seen=False, timeout=False)[0]
            # make a copy not to pop kwargs from results object which will be saved
            bench_res = deepcopy(results)["bench"]
            save = True
    else:
        bench_res = pdc.run_benchmarks(df, save=False, ignore_seen=False, timeout=False)[0]["bench"]

    # The serialized kwargs of every benchmark entry become the row labels.
    kwargs = [x.pop("kwargs") for x in bench_res]
    bench_res = pd.DataFrame(bench_res, index=kwargs)
    inferred = pdc.infer(features)

    z_name = "z {}".format(tuple(weights))
    bench_res[z_name] = (scale(bench_res) * weights).sum(axis=1)
    bench_res = bench_res.round(5).sort_values(z_name)
    bench_res = bench_res[[z_name, "size", "write_time", "read_time"]]

    # 1-based rank(s) of the predicted option; the trailing -1 guarantees
    # that res_index[0] exists even when the prediction is not in the table.
    y = json.dumps(inferred)
    res_index = [i + 1 for i, x in enumerate(bench_res.index) if x == y] + [-1]

    if save:
        # Only the first address of the forwarded-for chain is kept.
        ip = request.environ.get("HTTP_X_FORWARDED_FOR", "")
        ip = ip.split(",")[0]
        results["web"] = {
            "utctime": datetime.utcnow().isoformat(),
            "ip": ip,
            "predicted": inferred,
            "res_index": res_index[0],  # 1 is 1st, 2 is 2nd
            "filename": fname,
            "weights": weights.tolist(),
        }
        blob.upload_from_string(json.dumps(results))
        print("saved blob")

    # Replace the JSON row labels by a readable "key=value ..." form.
    bench_res.index = [
        " ".join(["{}={!r}".format(k, v) for k, v in json.loads(x).items()])
        for x in bench_res.index
    ]

    # The "Wrong!" notice is hidden (display: none) when the prediction ranked first.
    learning = "none" if res_index and res_index[0] == 1 else "inherit"
    # The last two characters of nth double as a CSS class ("st", "nd", "rd",
    # "th", or "99" for the not-found sentinel) and the leading part as the
    # row number handed to replacenth, so the plain "<n>th" fallback is kept.
    nth = {
        1: "1st",
        2: "2nd",
        3: "3rd",
        -1: "999"
    }.get(res_index[0], str(res_index[0]) + "th")
    # upload(features, "{}-{}".format(file.filename, time.time()))
    features = {
        k.replace("quantile_proportion", "quantile"): round(v, 3) if isinstance(v, float) else v
        for k, v in features.items()
    }

    # fname originates from the request and the page is wrapped in Markup
    # (i.e. declared safe), so it must be escaped explicitly to prevent
    # HTML/script injection through a crafted file name.
    safe_fname = html_escape(fname)
    # NOTE(review): format_res still receives the raw fname; confirm that it
    # does not write the name into its HTML output unescaped.
    return str(
        Markup(
            '<center> <h5 class="tagline"> Results: </h5></center>'
            + '<div class="container" style="margin-top: 2rem"><div class="row">'
            + '<div class="col l10 offset-l2" style="padding-bottom: 2rem; padding-top: 1rem;">The data was featurized, and a prediction was made. Then, all the compressions were ran for this file so we can see if the prediction was correct (the ground truth).</div>'
            + '<div class="col s12 m6 l3 offset-l2">'
            + "<b>Filename: </b>"
            + safe_fname
            + "<br><b>Features: </b>"
            + '<code class="codes">'
            + json.dumps(features, indent=4)
            + "</code>"
            + '</div>'
            + '<div class="col s12 m6 l3 offset-l3">'
            + "<br><center style='line-height: 3'><b>Predicted: </b><br>"
            # just using features here instead of data to be faster
            + " ".join(["{}={!r}".format(k, v) for k, v in inferred.items()])
            + "<br><b>Result:</b><br><span class='result {}'>{}</span> / {}<br><div style='display: {}'><span style='color: #ee6e73'>Wrong!</span> We will learn from this...</div>"
            .format(nth[-2:], nth, bench_res.shape[0], learning)
            + "</center></div></div>"
            + "<center><h4>Ground truth</h4><div class='show-on-small hide-on-med-and-up' style='padding: 0.5rem; color: grey'> -- scroll -> </center>"
            + replacenth(
                format_res(bench_res, tuple(weights), fname),
                "<tr ",
                '<tr class="resultinv {}" '.format(nth[-2:]),
                int(nth[:-2]),
            )
        )
    )