def compute_features(params: Namespace) -> tuple:
    """Compute features to use for training.

    Downloads the projects dataset, joins each project's title and
    description into a single `text` column and stores the resulting
    records as `features.json` under `config.DATA_DIR`.

    Args:
        params (Namespace): Input parameters for operations. Only
            `params.seed` is read by this function.

    Returns:
        tuple: `(df, features)` where `df` is the DataFrame restricted to the
        `id`, `created_on`, `text` and `tags` columns and `features` is the
        same data as a list of per-row dicts (`to_dict(orient="records")`).
    """
    # Set up
    utils.set_seed(seed=params.seed)

    # Load data
    projects_url = (
        "https://raw.githubusercontent.com/GokuMohandas/MadeWithML/main/datasets/projects.json"
    )
    projects = utils.load_json_from_url(url=projects_url)
    df = pd.DataFrame(projects)

    # Compute features
    df["text"] = df.title + " " + df.description
    # Selecting the wanted columns already leaves `title` and `description`
    # behind, so a separate in-place drop is unnecessary.
    df = df[["id", "created_on", "text", "tags"]]

    # Save
    features = df.to_dict(orient="records")
    df_dict_fp = Path(config.DATA_DIR, "features.json")
    utils.save_dict(d=features, filepath=df_dict_fp)

    return df, features
def diff(
    author: str = config.AUTHOR,
    repo: str = config.REPO,
    tag_a: str = "workspace",
    tag_b: str = "",
):  # pragma: no cover, can't be certain what diffs will exist
    """Report parameter and performance differences between two release tags.

    Args:
        author (str): Owner segment of the GitHub repository URL.
        repo (str): Repository segment of the GitHub repository URL.
        tag_a (str): First tag to compare. Defaults to "workspace".
        tag_b (str): Second tag to compare. When left empty, the first entry
            returned by the GitHub tags endpoint for the repository is used.

    Returns:
        tuple: `(params_diff, performance_diff)`, each the result of
        `utils.dict_diff` for the corresponding pair of dictionaries.
    """
    # Fall back to the first listed tag when no second tag was given
    if tag_b == "":
        listed_tags = utils.load_json_from_url(
            url=f"https://api.github.com/repos/{author}/{repo}/tags"
        )
        tag_b = listed_tags[0]["name"]
    logger.info(f"Comparing {tag_a} with {tag_b}:")

    # Keyword arguments shared by every lookup / comparison below
    source = {"author": author, "repo": repo, "verbose": False}
    labels = {"d_a_name": tag_a, "d_b_name": tag_b}

    # Params
    first_params = params(tag=tag_a, **source)
    second_params = params(tag=tag_b, **source)
    params_diff = utils.dict_diff(d_a=first_params, d_b=second_params, **labels)
    logger.info(f"Parameter differences: {json.dumps(params_diff, indent=2)}")

    # Performance
    first_performance = performance(tag=tag_a, **source)
    second_performance = performance(tag=tag_b, **source)
    performance_diff = utils.dict_diff(
        d_a=first_performance, d_b=second_performance, **labels
    )
    logger.info(f"Performance differences: {json.dumps(performance_diff, indent=2)}")

    return params_diff, performance_diff
def df():
    """Load the remote projects dataset into a pandas DataFrame."""
    projects_url = (
        "https://raw.githubusercontent.com/GokuMohandas/madewithml/main/datasets/projects.json"
    )
    records = utils.load_json_from_url(url=projects_url)
    return pd.DataFrame(records)
def tags():
    """Return the `tag` value of every entry in the remote tags file."""
    tags_url = "https://raw.githubusercontent.com/GokuMohandas/madewithml/main/datasets/tags.json"
    return [entry["tag"] for entry in utils.load_json_from_url(url=tags_url)]
def download_data():
    """Download the projects and tags datasets and store them locally.

    Both files are fetched first and only then written to
    `config.DATA_DIR` under their original file names.

    Note:
        We could've just copied files from `datasets` but we'll
        use this later on with other data sources.
    """
    # Local file name -> remote location
    sources = {
        "projects.json": "https://raw.githubusercontent.com/GokuMohandas/applied-ml/main/datasets/projects.json",
        "tags.json": "https://raw.githubusercontent.com/GokuMohandas/applied-ml/main/datasets/tags.json",
    }

    # Download everything before touching the disk
    downloaded = {
        filename: utils.load_json_from_url(url=url) for filename, url in sources.items()
    }

    # Save data
    for filename, payload in downloaded.items():
        utils.save_dict(d=payload, filepath=Path(config.DATA_DIR, filename))
    logger.info("✅ Data downloaded!")
def download_auxiliary_data():
    """Fetch the tags dataset from its URL and write it to `config.DATA_DIR`."""
    # Download auxiliary data
    payload = utils.load_json_from_url(
        url="https://raw.githubusercontent.com/GokuMohandas/MadeWithML/main/datasets/tags.json"
    )

    # Save data
    utils.save_dict(d=payload, filepath=Path(config.DATA_DIR, "tags.json"))
    logger.info("✅ Auxiliary data downloaded!")
def performance(
    author: str = config.AUTHOR,
    repo: str = config.REPO,
    tag: str = "workspace",
    verbose: bool = True,
):
    """Load the performance dictionary for a tag.

    Args:
        author (str): Owner segment of the raw GitHub URL.
        repo (str): Repository segment of the raw GitHub URL.
        tag (str): Tag to load. "workspace" reads the local
            `performance.json` in `config.MODEL_DIR`; any other value is
            fetched from `model/performance.json` at that tag on GitHub.
        verbose (bool): Log the loaded dictionary as indented JSON.

    Returns:
        The loaded performance data.
    """
    if tag != "workspace":
        remote_url = (
            f"https://raw.githubusercontent.com/{author}/{repo}/{tag}/model/performance.json"
        )
        metrics = utils.load_json_from_url(url=remote_url)
    else:
        local_fp = Path(config.MODEL_DIR, "performance.json")
        metrics = utils.load_dict(filepath=local_fp)

    if verbose:
        logger.info(json.dumps(metrics, indent=2))
    return metrics
def get_tags(author=config.AUTHOR, repo=config.REPO):
    """Collect params and flattened performance for the workspace and every tag.

    Args:
        author: Owner segment of the GitHub repository URL.
        repo: Repository segment of the GitHub repository URL.

    Returns:
        dict: Maps each tag name ("workspace" first, then the names returned
        by the GitHub tags endpoint) to a dict with two keys: "params" (the
        result of `cli.params`) and "performance" (the result of
        `cli.performance` flattened to dot-separated keys).
    """
    # Get list of tags
    tags_url = f"https://api.github.com/repos/{author}/{repo}/tags"
    tags_list = ["workspace"] + [
        tag["name"] for tag in utils.load_json_from_url(url=tags_url)
    ]

    # Get metadata by tag. `author` and `repo` are forwarded so the metadata
    # is read from the same repository the tag names were listed from.
    tags = {}
    for tag in tags_list:
        tag_params = cli.params(author=author, repo=repo, tag=tag, verbose=False)
        tag_performance = cli.performance(author=author, repo=repo, tag=tag, verbose=False)
        tags[tag] = {
            "params": tag_params,
            # Flatten nested metrics into a single level with "." separators
            "performance": pd.json_normalize(tag_performance, sep=".").to_dict(
                orient="records"
            )[0],
        }

    return tags
def test_load_json_from_url():
    """The remote tags file, keyed by `tag`, should include "transformers"."""
    url = "https://raw.githubusercontent.com/GokuMohandas/applied-ml/main/datasets/tags.json"
    entries = utils.load_json_from_url(url=url)
    by_tag = utils.list_to_dict(entries, key="tag")
    assert "transformers" in by_tag
def tags():
    """Return the `tag` field of each entry in the remote tags dataset."""
    # Load tags
    entries = utils.load_json_from_url(
        url="https://raw.githubusercontent.com/GokuMohandas/MadeWithML/main/datasets/tags.json"
    )
    return [entry["tag"] for entry in entries]
def _load_commit_json(commit: str, filename: str, workspace_fp: Path):  # pragma: no cover
    """Load `model/<filename>` at `commit` from GitHub, or `workspace_fp` for "workspace"."""
    if commit == "workspace":
        return utils.load_dict(filepath=workspace_fp)
    url = f"https://raw.githubusercontent.com/GokuMohandas/applied-ml/{commit}/model/{filename}"
    return utils.load_json_from_url(url=url)


def diff(commit_a: str = "workspace", commit_b: str = "head"):  # pragma: no cover
    """Compare relevant differences (params, metrics) between commits.

    Inspired by DVC's `dvc metrics diff` but repurposed to
    display diffs pertinent to our experiments.

    Args:
        commit_a (str, optional): Primary commit. Defaults to "workspace".
        commit_b (str, optional): Commit to compare to. Defaults to "head".
            For both arguments "head" and "current" (case-insensitive) are
            translated to "main", and "workspace" reads the local files.

    Raises:
        ValueError: The two commits do not expose the same set of
            (flattened) metric names.

    Returns:
        dict: `{"params": {...}, "metrics": {"improvements": {...},
        "regressions": {...}}}`. Every entry maps a name to the value seen
        at each commit; metric entries additionally carry "diff"
        (`commit_a` value minus `commit_b` value).
    """
    diffs = {}
    if commit_a.lower() in ("head", "current"):
        commit_a = "main"
    if commit_b.lower() in ("head", "current"):
        commit_b = "main"

    # Get params
    params_fp = Path(config.CONFIG_DIR, "params.json")
    params_a = _load_commit_json(commit_a, "params.json", params_fp)
    params_b = _load_commit_json(commit_b, "params.json", params_fp)

    # Parameter differences. Walk the union of both key sets (first-seen
    # order) so a parameter that exists in only one commit is reported,
    # with None on the side where it is absent, instead of raising KeyError
    # or being silently skipped.
    diffs["params"] = {}
    for arg in dict.fromkeys([*params_a, *params_b]):
        missing = arg not in params_a or arg not in params_b
        a = params_a.get(arg)
        b = params_b.get(arg)
        if missing or a != b:
            diffs["params"][arg] = {commit_a: a, commit_b: b}
    logger.info(
        f"Parameter differences:\n{json.dumps(diffs['params'], indent=2)}")

    # Get metrics
    metrics_fp = Path(config.MODEL_DIR, "performance.json")
    raw_metrics_a = _load_commit_json(commit_a, "performance.json", metrics_fp)
    raw_metrics_b = _load_commit_json(commit_b, "performance.json", metrics_fp)

    # Recursively flatten nested metrics into dot-separated keys
    metrics_a = pd.json_normalize(raw_metrics_a, sep=".").to_dict(orient="records")[0]
    metrics_b = pd.json_normalize(raw_metrics_b, sep=".").to_dict(orient="records")[0]
    if metrics_a.keys() != metrics_b.keys():
        # ValueError is a subclass of Exception, so callers that catch
        # Exception keep working.
        raise ValueError(
            "Cannot compare these commits because they have different metrics."
        )

    # Metric differences
    diffs["metrics"] = {"improvements": {}, "regressions": {}}
    for metric, value_a in metrics_a.items():
        value_b = metrics_b[metric]  # present: key sets were verified equal above
        if metric.split(".")[-1] == "num_samples":
            continue  # sample counts are not a quality measure
        if not (
            isinstance(value_a, numbers.Number) and isinstance(value_b, numbers.Number)
        ):
            continue  # only numeric values can be subtracted
        if value_a == value_b:
            continue
        item = {
            commit_a: value_a,
            commit_b: value_b,
            "diff": value_a - value_b,
        }
        bucket = "improvements" if item["diff"] >= 0.0 else "regressions"
        diffs["metrics"][bucket][metric] = item
    logger.info(
        f"Metric differences:\n{json.dumps(diffs['metrics'], indent=2)}")

    return diffs