def test_fred_category_series_function(self):
    """category_series(123) should GET the category/series endpoint with key, id and json file type."""
    fred.key('my_fred_key')
    fred.category_series(123)
    expected_url = 'http://api.stlouisfed.org/fred/category/series'
    expected_params = {
        'file_type': 'json',
        'category_id': 123,
        'api_key': 'my_fred_key',
    }
    self.get.assert_called_with(expected_url, params=expected_params)
def test_fred_category_series_function(self):
    """category_series(123) should GET the HTTPS category/series endpoint with key, id and json file type."""
    fred.key('my_fred_key')
    fred.category_series(123)
    url = 'https://api.stlouisfed.org/fred/category/series'
    query = dict(api_key='my_fred_key', category_id=123, file_type='json')
    self.get.assert_called_with(url, params=query)
def source_FRED(credentials, small_sample: bool = False, id_freq_list_path: str = "") -> None:
    """
    Source the FRED dataset and save it to JSON files.

    https://fred.stlouisfed.org/

    Args:
        credentials: object exposing ``API_KEY_FED.key`` (the FRED API key).
        small_sample: if True, download only a small id->frequency list
            (from ``id_freq_list_path``, a cached JSON under
            ``cfg.source.path.FRED.meta``, or a fresh crawl of category
            33936) instead of the full catalogue via ``download_ids``.
        id_freq_list_path: optional path to a JSON mapping series ids to
            frequency codes; only consulted when ``small_sample`` is True.

    Returns:
        None. Results are written as JSON files under
        ``cfg.source.path.FRED.raw`` and ``cfg.source.path.FRED.meta``.
    """
    # Setup directories if they do not exist
    # TODO
    # Create fred connection using api-key
    fred.key(credentials.API_KEY_FED.key)
    if small_sample:
        try:
            if id_freq_list_path == "":
                # No explicit list given: fall back to the cached dummy list.
                filename="dummy_id_freq_list.json"
                with open(cfg.source.path.FRED.meta + filename, "r") as fp:
                    ids_freqs = json.load(fp)
            else:
                try:
                    with open(id_freq_list_path, "r") as fp:
                        ids_freqs = json.load(fp)
                except Exception as e:
                    logger.warning(e)
                    logger.warning(f"Not able to read provided file in path {id_freq_list_path}.")
                    logger.info("Using precomputed list for retrieval from FRED.")
                    # NOTE(review): ids_freqs is NOT assigned on this path --
                    # the "precomputed list" is never actually loaded here, so
                    # the download loop below would hit a NameError. Confirm
                    # whether this was meant to re-read the cached dummy list.
        except Exception as e:
            logger.info(e)
            logger.info("Not able to find predefined list of ids. Crawling FRED instead.")
            # Crawl to get a full list of available time series.
            ids_freqs = {}
            for s in fred.category_series(33936)["seriess"]:
                ids_freqs[s["id"]] = s["frequency_short"]
            # Cache the crawled list so the next run can skip the crawl.
            filename="dummy_id_freq_list.json"
            #path = os.path.join(cfg.source.path.FRED.meta, filename)
            with open(cfg.source.path.FRED.meta + filename, "w") as fp:
                json.dump(ids_freqs, fp, sort_keys=True, indent=4, separators=(",", ": "))
        # Download and save all time series. saving each sample as a JSON
        for id in ids_freqs.keys():  # NOTE: `id` shadows the builtin of the same name
            observations = fred.observations(id)
            json_out = {
                "source" : "FRED",
                "id" : id,
                "frequency" : ids_freqs[id],
                # NOTE(review): float() will raise if FRED returns its "."
                # placeholder for a missing observation -- confirm the data
                # requested here is always numeric.
                "values" : [float(obs["value"]) for obs in observations["observations"]]
            }
            filename=f"{id}.json"
            #path = os.path.join(cfg.source.path.FRED.raw, filename)
            with open(cfg.source.path.FRED.raw + filename, "w") as fp:
                json.dump(json_out, fp)
        # Statistics of sourcing
        # Random dummy data for preprocessing
        num_preprocessed = 0
        for i in range(10000):
            # Start a new sub-directory every 1000 samples.
            if num_preprocessed % 1000 == 0:
                curr_dir = f"dir{num_preprocessed // 1000 :03d}/"
                os.makedirs(cfg.source.path.FRED.raw + curr_dir, exist_ok=True)
            out = {
                "source" : "FRED",
                "id" : f"{i:04d}",
                "frequency" : np.random.choice(["Y", "Q", "M", "W", "D", "H"]),
                "values" : list(np.random.rand(100)),
            }
            filename = f"{i:04d}.json"
            with open(cfg.source.path.FRED.raw + curr_dir + filename, "w") as fp:
                json.dump(out, fp)
            num_preprocessed += 1
    else:
        # Crawl to get a full list of available time series.
        # save every n minutes to avoid having to go redo...
        #if not os.path.isfile(os.path.join(cfg.source.path.FRED.meta, "ids_freq_list_test.json")):
        #logger.info("Crawling FRED.")
        #crawl_fred(api_key=credentials.API_KEY_FED.key, nodes_to_visit=[0], sleep_time=cfg.source.api.FRED.sleep, rate_limit=cfg.source.api.FRED.limit)
        #logger.info("Done crawling.")
        #path = os.path.join(cfg.source.path.FRED.meta, "ids_meta.json")
        logger.info(f"Downloading.")
        download_ids(api_key=credentials.API_KEY_FED.key, sleep_time=cfg.source.api.FRED.sleep, rate_limit=cfg.source.api.FRED.limit)
def crawl_fred(api_key: str, nodes_to_visit: "List[int] | None" = None, sleep_time: int = 60, rate_limit: int = 100) -> None:
    """
    Crawl the FRED category tree, saving all time-series ids and metadata.

    Starting from ``nodes_to_visit`` (default: the root category 0), walks
    the category hierarchy, collecting every series' metadata and writing it
    out in batches of ``cfg.source.samples_per_json`` entries per JSON file,
    ``cfg.source.files_per_folder`` files per sub-directory under
    ``cfg.source.path.FRED.meta``.

    Args:
        api_key: FRED API key.
        nodes_to_visit: category ids to start the crawl from. The list is
            consumed (popped) and extended in place during the crawl.
            Defaults to ``[0]`` (the FRED root category).
        sleep_time: seconds to sleep once ``rate_limit`` requests were made.
        rate_limit: maximum number of API requests before sleeping.
    """
    # BUGFIX: the default used to be the mutable literal [0]; since the list
    # is popped/appended in place, a second call relying on the default would
    # start from whatever the previous crawl left behind (usually an empty
    # list, i.e. no crawl at all). Use a None sentinel instead.
    if nodes_to_visit is None:
        nodes_to_visit = [0]
    fred.key(api_key)
    tot_downloaded = 0
    num_nodes_visited = 0
    num_requests = 0
    list_json: List[Dict] = []
    num_files_written = 0
    curr_dir = f"dir{tot_downloaded // cfg.source.files_per_folder :04d}/"  # initialize
    # Seed the id -> (name, parent) lookup for the starting nodes.
    category_names = {}
    for node in nodes_to_visit:
        node_children = fred.children(node)
        for child in node_children["categories"]:
            category_names[child["id"]] = {"name": child["name"], "parent_id": child["parent_id"]}
    while nodes_to_visit:
        curr_node = nodes_to_visit.pop()
        #logger.info(f"Current node: {curr_node:>4}")
        try:
            # Enqueue this node's children and remember their names/parents.
            children = fred.children(curr_node)
            num_requests += 1
            if children["categories"]:
                for child in children["categories"]:
                    nodes_to_visit.append(child["id"])
                    category_names[child["id"]] = {"name": child["name"], "parent_id": child["parent_id"]}
            # Collect metadata for every series directly under this node.
            seriess = fred.category_series(curr_node)["seriess"]
            num_requests += 1
            for ts in seriess:
                id_meta = ts
                id_meta["source"] = "FRED"
                id_meta["node_id"] = curr_node
                id_meta["category_name"] = category_names[curr_node]["name"]
                id_meta["parent_id"] = category_names[curr_node]["parent_id"]
                tot_downloaded += 1
                list_json.append(id_meta)
                # Flush a batch to disk once it exceeds the configured size.
                if len(list_json) > cfg.source.samples_per_json:
                    filename = f"meta_{num_files_written:>06}.json"
                    if num_files_written % cfg.source.files_per_folder == 0:
                        curr_dir = f"dir{num_files_written // cfg.source.files_per_folder :04d}/"
                        os.makedirs(os.path.join(cfg.source.path.FRED.meta, curr_dir), exist_ok=True)
                    with open(os.path.join(*[cfg.source.path.FRED.meta, curr_dir, filename]), "w") as fp:
                        json.dump(list_json, fp, sort_keys=True, indent=4, separators=(",", ": "))
                    num_files_written += 1
                    list_json = []
            num_nodes_visited += 1
            if num_nodes_visited % 100 == 0:
                logger.info(f"Visited {num_nodes_visited:>5} nodes and currently have {tot_downloaded:>6} time series ids saved")
                # Checkpoint the frontier so an interrupted crawl can resume.
                fname = time.ctime().replace(" ", "-").replace(":","-")+"-nodes-to-visit.txt"
                with open(os.path.join(cfg.source.path.FRED.meta, fname), "w") as f:
                    f.write("\n".join([str(node) for node in nodes_to_visit]))
            # Throttle: back off once we exceed the per-window request budget.
            if num_requests > rate_limit:
                time.sleep(sleep_time)
                num_requests = 0
        except Exception as e:
            # Best-effort crawl: log the failure and move on to the next node.
            logger.debug(e)
            logger.debug(f"Current node {curr_node}")
            logger.debug(f"{num_requests:>3} requests last minute")
def test_fred_category_series_function(self):
    """Calling category_series() with no arguments should send only the api_key."""
    fred.key('my_fred_key')
    fred.category_series()
    endpoint = 'http://api.stlouisfed.org/fred/category/series'
    self.get.assert_called_with(endpoint, params={'api_key': 'my_fred_key'})
def analysis_series_child(series_id):
    """Persist every series under the given FRED category, then process each one's observations.

    NOTE(review): despite the name, ``series_id`` is passed to
    ``fred.category_series`` -- it appears to be a category id; confirm
    against callers.
    """
    for series in fred.category_series(series_id)["seriess"]:
        series["parent_id"] = series_id
        insert_sql('series', series)
        analysis_observations_child(series['id'])