def alphavantage_import_eod_prices(
    ctx: Context,
    tickers_input: Optional[Reference[Ticker]],
    api_key: str,
    tickers: Optional[List] = None,
) -> Iterator[Records[AlphavantageEodPrice]]:
    assert api_key is not None
    tickers = prepare_tickers(tickers, tickers_input)
    if not tickers:
        return None
    ticker_latest_dates_imported = (
        ctx.get_state_value("ticker_latest_dates_imported") or {}
    )
    conn = JsonHttpApiConnection()

    def fetch_prices(params: Dict, tries: int = 0) -> Optional[Records]:
        if tries > 2:
            return None
        resp = conn.get(ALPHAVANTAGE_API_BASE_URL, params, stream=True)
        try:
            record = resp.json()
            # A json response means an error; successful price data comes back as CSV
            if is_alphavantage_error(record):
                # TODO: Log this failure?
                print(f"Error for {params} {record}")
                return None
            if is_alphavantage_rate_limit(record):
                time.sleep(60)
                return fetch_prices(params, tries=tries + 1)
        except Exception:
            # Response was not json, so we have the expected CSV payload
            pass
        records = list(read_csv(resp.iter_lines()))
        return records

    for ticker in tickers:
        assert isinstance(ticker, str)
        latest_date_imported = ensure_datetime(
            ticker_latest_dates_imported.get(ticker, MIN_DATETIME)
        )
        assert latest_date_imported is not None
        if utcnow() - ensure_utc(latest_date_imported) < timedelta(days=1):
            # Only check once a day
            continue
        params = prepare_params_for_ticker(ticker, ticker_latest_dates_imported)
        params["apikey"] = api_key
        records = fetch_prices(params)
        if records:
            # Symbol not included in the response, so add it to each record
            for r in records:
                r["symbol"] = ticker
            yield records
        # Update state
        ticker_latest_dates_imported[ticker] = utcnow()
        ctx.emit_state_value(
            "ticker_latest_dates_imported", ticker_latest_dates_imported
        )
        if not ctx.should_continue():
            break
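# Neither is_alphavantage_error nor is_alphavantage_rate_limit is shown in this
# excerpt. A minimal sketch of what they could check, assuming Alphavantage's
# documented behavior of returning an "Error Message" key on bad requests and a
# "Note" key when the call-frequency limit is hit (an assumption for
# illustration, not necessarily the project's actual implementation):
from typing import Dict


def is_alphavantage_error(record: Dict) -> bool:
    return isinstance(record, dict) and "Error Message" in record


def is_alphavantage_rate_limit(record: Dict) -> bool:
    return isinstance(record, dict) and "Note" in record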
def timestamp_increment_key(prefix: str = "", max_length: int = 28) -> str:
    """
    Generates keys that are unique and monotonic in time for a given run.
    Appends random chars to ensure multiple processes can run at once and not collide.
    """
    curr_ms = utcnow().strftime("%y%m%d_%H%M%S%f")
    rand_len = max_length - (21 + len(prefix))
    key = f"{prefix}_{curr_ms}_{rand_str(rand_len).lower()}"
    return key
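# Hypothetical usage sketch (illustration only, not from the source): because the
# timestamp component precedes the random suffix, keys generated later in a run
# never sort before earlier ones, and total length stays within max_length.
a = timestamp_increment_key(prefix="run")
b = timestamp_increment_key(prefix="run")
assert len(a) <= 28 and len(b) <= 28
assert a.split("_")[1:3] <= b.split("_")[1:3]  # timestamp parts are non-decreasing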
def should_continue(self) -> bool:
    """
    Long-running snaps should call this periodically to honor time limits.
    """
    if not self.execution_context.execution_timelimit_seconds:
        return True
    seconds_elapsed = (utcnow() - self.snap_log.started_at).total_seconds()
    return seconds_elapsed < self.execution_context.execution_timelimit_seconds
def marketstack_import_tickers(
    ctx: Context,
    access_key: str,
    exchanges: List = ["XNYS", "XNAS"],
) -> Iterator[Records[MarketstackTicker]]:
    use_https = False  # TODO: when do we want this True?
    # default_from_date = ctx.get_param("from_date", MIN_DATE)
    assert access_key is not None
    assert isinstance(exchanges, list)
    last_imported_at = ensure_datetime(
        ctx.get_state_value("last_imported_at") or "2020-01-01 00:00:00"
    )
    assert last_imported_at is not None
    last_imported_at = ensure_utc(last_imported_at)
    if utcnow() - last_imported_at < timedelta(days=1):  # TODO: from config
        return
    conn = JsonHttpApiConnection()
    if use_https:
        endpoint_url = HTTPS_MARKETSTACK_API_BASE_URL + "tickers"
    else:
        endpoint_url = MARKETSTACK_API_BASE_URL + "tickers"
    for exchange in exchanges:
        params = {
            "limit": 1000,
            "offset": 0,
            "access_key": access_key,
            "exchange": exchange,
        }
        while ctx.should_continue():
            resp = conn.get(endpoint_url, params)
            json_resp = resp.json()
            assert isinstance(json_resp, dict)
            records = json_resp["data"]
            if len(records) == 0:
                # All done
                break
            # Add a flattened exchange indicator
            for r in records:
                r["exchange_acronym"] = r.get("stock_exchange", {}).get("acronym")
            yield records
            # Setup for next page
            params["offset"] = params["offset"] + len(records)
    ctx.emit_state_value("last_imported_at", utcnow())
def should_continue(self) -> bool:
    """
    Long-running functions should call this periodically to honor time limits.
    """
    if (
        not self.execution_context.execution_timelimit_seconds
        or not self.execution_start_time
    ):
        return True
    seconds_elapsed = (utcnow() - self.execution_start_time).total_seconds()
    return seconds_elapsed < self.execution_context.execution_timelimit_seconds
def test_fred():
    api_key = ensure_api_key()
    from snapflow_fred import module as fred

    env = Environment(metadata_storage="sqlite://")
    g = graph()
    # Initial graph
    gdp = g.create_node(
        "fred.import_observations",
        params={"api_key": api_key, "series_id": "gdp"},
    )
    blocks = produce(gdp, env=env, modules=[fred])
    records = blocks[0].as_records()
    # GDP is reported quarterly, so expect roughly four observations per year since the late 1940s
    assert len(records) >= (utcnow().year - 1946) * 4 - 1
    assert len(records) < (utcnow().year + 1 - 1946) * 4 - 1
def ensure_log(self, block: DataBlockMetadata, direction: Direction, name: str):
    if self.metadata_api.execute(
        select(DataBlockLog).filter_by(
            function_log_id=self.function_log.id,
            stream_name=name,
            data_block_id=block.id,
            direction=direction,
        )
    ).scalar_one_or_none():
        return
    drl = DataBlockLog(  # type: ignore
        function_log_id=self.function_log.id,
        stream_name=name,
        data_block_id=block.id,
        direction=direction,
        processed_at=utcnow(),
    )
    self.metadata_api.add(drl)
def prepare_params_for_ticker(
    ticker: str, ticker_latest_dates_imported: Dict[str, datetime]
) -> Dict:
    latest_date_imported = ensure_datetime(
        ticker_latest_dates_imported.get(ticker, MIN_DATETIME)
    )
    if ensure_utc(latest_date_imported) <= utcnow() - timedelta(days=100):
        # More than 100 days worth, get full
        outputsize = "full"
    else:
        # Less than 100 days, compact will suffice
        outputsize = "compact"
    params = {
        "symbol": ticker,
        "outputsize": outputsize,
        "datatype": "csv",
        "function": "TIME_SERIES_DAILY_ADJUSTED",
    }
    return params
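# A hypothetical quick check (illustration only, not from the source) of the two
# branches above: a ticker imported within the last 100 days only needs
# Alphavantage's "compact" payload (the latest ~100 data points), while a stale
# or never-imported ticker needs the "full" history.
assert prepare_params_for_ticker("AAPL", {"AAPL": utcnow() - timedelta(days=10)})["outputsize"] == "compact"
assert prepare_params_for_ticker("AAPL", {})["outputsize"] == "full"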
def import_observations(
    ctx: Context,
    api_key: str,
    series_id: str,
) -> Iterator[Records[FredObservation]]:
    latest_fetched_at = ctx.get_state_value("latest_fetched_at")
    if latest_fetched_at:
        # Two year curing window (to be safe)
        obs_start = ensure_datetime(latest_fetched_at) - timedelta(days=365 * 2)
    else:
        obs_start = MIN_DATE
    params = {
        "file_type": "json",
        "api_key": api_key,
        "series_id": series_id,
        "observation_start": obs_start,
        "offset": 0,
        "limit": 100000,
    }
    conn = JsonHttpApiConnection(date_format="%Y-%m-%d")
    endpoint_url = FRED_API_BASE_URL + "series/observations"
    while True:
        resp = conn.get(endpoint_url, params)
        json_resp = resp.json()
        assert isinstance(json_resp, dict)
        records = json_resp["observations"]
        if len(records) == 0:
            # All done
            break
        for r in records:
            # Add series ID to data so we know what the data is
            r["series_id"] = params["series_id"]
            # FRED quirk, returns empty decimal number "." instead of null
            r["value"] = None if r["value"] == "." else r["value"]
        yield records
        num_returned = len(records)
        if num_returned < json_resp["limit"]:
            # We got back less than limit, so must be done (no other way to tell?)
            break
        params["offset"] += num_returned
    # We only update date if we have fetched EVERYTHING available as of now
    ctx.emit_state_value("latest_fetched_at", utcnow())
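# Note on the stopping rule above: the FRED observations response also includes a
# "count" field with the total number of observations for the series, so a more
# explicit termination check is possible. A sketch of that alternative inside the
# paging loop (assuming the "count" field as documented by FRED; not the
# project's implementation):
#
#     if params["offset"] + num_returned >= json_resp["count"]:
#         break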
def base_import(
    data_source: str, ctx: DataFunctionContext, user_key: str, use_sample: bool = False
):
    params = {
        "user_key": user_key,
    }
    url = BULK_CSV_SAMPLE_URL if use_sample else BULK_CSV_URL
    while ctx.should_continue():
        resp = HttpApiConnection().get(url=url, params=params)
        ib = io.BytesIO(resp.content)
        with tarfile.open(fileobj=ib) as csv_files:
            raw = csv_files.extractfile("{}.csv".format(data_source))
            ctx.emit_state_value("imported_{}".format(data_source), True)
            ctx.emit(
                raw,
                data_format=CsvFileFormat,
                schema=CRUNCHBASE_CSV_TO_SCHEMA_MAP[data_source],
            )
            ctx.emit_state_value("latest_imported_at_{}".format(data_source), utcnow())
            return
def start_function_run(
    self, node: Node, bound_interface: BoundInterface
) -> Iterator[DataFunctionContext]:
    from snapflow.core.graph import GraphMetadata

    # assert self.current_runtime is not None, "Runtime not set"
    md = self.env.get_metadata_api()
    node_state_obj = node.get_state(self.env)
    if node_state_obj is None:
        node_state = {}
    else:
        node_state = node_state_obj.state
    new_graph_meta = node.graph.get_metadata_obj()
    graph_meta = md.execute(
        select(GraphMetadata).filter(GraphMetadata.hash == new_graph_meta.hash),
        filter_env=False,
    ).scalar_one_or_none()
    if graph_meta is None:
        md.add(new_graph_meta)
        md.flush()  # [new_graph_meta])
        graph_meta = new_graph_meta
    function_log = DataFunctionLog(  # type: ignore
        graph_id=graph_meta.hash,
        node_key=node.key,
        node_start_state=node_state.copy(),  # {k: v for k, v in node_state.items()},
        node_end_state=node_state,
        function_key=node.function.key,
        function_params=node.params,
        # runtime_url=self.current_runtime.url,
        started_at=utcnow(),
    )
    md.add(function_log)
    md.flush([function_log])
    function_ctx = DataFunctionContext(
        env=self.env,
        function=self.exe.function,
        node=self.exe.node,
        executable=self.exe,
        metadata_api=self.env.md_api,
        inputs=bound_interface.inputs,
        function_log=function_log,
        bound_interface=bound_interface,
        execution_context=self.exe.execution_context,
        execution_start_time=self.start_time,
    )
    try:
        yield function_ctx
        # Validate local memory objects: Did we leave any non-storeables hanging?
        validate_data_blocks(self.env)
    except Exception as e:
        # Don't worry about exhaustion exceptions
        if not isinstance(e, InputExhaustedException):
            logger.debug(f"Error running node:\n{traceback.format_exc()}")
            function_log.set_error(e)
            function_log.persist_state(self.env)
            function_log.completed_at = utcnow()
            # TODO: should clean this up so transaction surrounds things that you DO
            # want to rollback, obviously
            # md.commit()  # MUST commit here since the re-raised exception will issue a rollback
            if self.exe.execution_context.abort_on_function_error:
                raise e
    finally:
        function_ctx.finish_execution()
        # Persist state on success OR error:
        function_log.persist_state(self.env)
        function_log.completed_at = utcnow()
def __init__(self, exe: Executable):
    self.exe = exe
    self.env = exe.execution_context.env
    self.logger = exe.execution_context.logger
    self.node = self.exe.node
    self.start_time = utcnow()
def alphavantage_import_company_overview(
    ctx: Context,
    tickers_input: Optional[Reference[Ticker]],
    api_key: str,
    tickers: Optional[List] = None,
) -> Iterator[Records[AlphavantageCompanyOverview]]:
    assert api_key is not None
    tickers = prepare_tickers(tickers, tickers_input)
    if tickers is None:
        # We didn't get an input block for tickers AND
        # the config is empty, so we are done
        return None
    ticker_latest_dates_imported = (
        ctx.get_state_value("ticker_latest_dates_imported") or {}
    )
    conn = JsonHttpApiConnection()
    batch_size = 50
    records = []
    tickers_updated = []

    def fetch_overview(params: Dict, tries: int = 0) -> Optional[Dict]:
        if tries > 2:
            return None
        resp = conn.get(ALPHAVANTAGE_API_BASE_URL, params)
        record = resp.json()
        # Alphavantage returns 200 and json error message on failure
        if is_alphavantage_error(record):
            # TODO: Log this failure?
            # print(f"Error for ticker {params['symbol']}: {record}")
            return None
        if is_alphavantage_rate_limit(record):
            time.sleep(20)
            return fetch_overview(params, tries=tries + 1)
        return record

    for i, ticker in enumerate(tickers):
        assert isinstance(ticker, str)
        latest_date_imported = ensure_datetime(
            ticker_latest_dates_imported.get(ticker, MIN_DATETIME)
        )
        assert latest_date_imported is not None
        # Refresh at most once a day
        # TODO: make this configurable instead of hard-coded 1 day
        if utcnow() - ensure_utc(latest_date_imported) < timedelta(days=1):
            continue
        params = {
            "apikey": api_key,
            "symbol": ticker,
            "function": "OVERVIEW",
        }
        record = fetch_overview(params)
        if not record:
            continue
        # Clean up json keys to be more DB friendly
        record = {title_to_snake_case(k): v for k, v in record.items()}
        records.append(record)
        tickers_updated.append(ticker)
        if len(records) >= batch_size or i == len(tickers) - 1:
            yield records
            # Update state
            for updated_ticker in tickers_updated:
                ticker_latest_dates_imported[updated_ticker] = utcnow()
                ctx.emit_state_value(
                    "ticker_latest_dates_imported", ticker_latest_dates_imported
                )
            if not ctx.should_continue():
                break
            records = []
            tickers_updated = []
def import_subscription_items(
    ctx: Context, api_key: str, curing_window_days: int = 90
) -> Iterator[Records[StripeSubscriptionItemRaw]]:
    """
    # TODO: repeated code
    """
    latest_full_import_at = ctx.get_state_value("latest_full_import_at")
    latest_full_import_at = ensure_datetime(latest_full_import_at)
    current_starting_after = ctx.get_state_value("current_starting_after")
    params = {
        "limit": 100,
        "status": "all",
    }
    if latest_full_import_at and curing_window_days:
        # Import only records more recent than the latest full import, offset by a curing window
        # (default 90 days) to capture updates to objects (refunds, etc)
        params["created[gte]"] = int(
            (latest_full_import_at - timedelta(days=int(curing_window_days))).timestamp()
        )
    if current_starting_after:
        params["starting_after"] = current_starting_after
    conn = JsonHttpApiConnection()
    endpoint_url = STRIPE_API_BASE_URL + "subscriptions"
    all_done = False
    while ctx.should_continue():
        resp = conn.get(endpoint_url, params, auth=HTTPBasicAuth(api_key, ""))
        json_resp = resp.json()
        assert isinstance(json_resp, dict)
        records = json_resp["data"]
        if len(records) == 0:
            # All done
            all_done = True
            break
        for record in records:
            item_params = {
                "limit": 100,
                "subscription": record["id"],
            }
            while True:
                items_url = STRIPE_API_BASE_URL + "subscription_items"
                items_resp = conn.get(items_url, item_params, auth=HTTPBasicAuth(api_key, ""))
                items_json_resp = items_resp.json()
                assert isinstance(items_json_resp, dict)
                items = items_json_resp["data"]
                if len(items) == 0:
                    # All done
                    break
                yield items
                if not items_json_resp.get("has_more"):
                    break
                latest_item_id = items[-1]["id"]
                item_params["starting_after"] = latest_item_id
            if not ctx.should_continue():
                break
        latest_object_id = records[-1]["id"]
        if not json_resp.get("has_more"):
            all_done = True
            break
        params["starting_after"] = latest_object_id
        ctx.emit_state_value("current_starting_after", latest_object_id)
    else:
        # Don't update any state, we just timed out
        return
    # We only update state if we have fetched EVERYTHING available as of now
    if all_done:
        ctx.emit_state_value("latest_full_import_at", utcnow())
        # IMPORTANT: reset the starting_after cursor so the next run starts from the beginning again
        ctx.emit_state_value("current_starting_after", None)
def stripe_importer(
    endpoint: str,
    ctx: Context,
    api_key: str,
    curing_window_days: Optional[int] = None,
    extra_params: Optional[Dict] = None,
):
    """
    Stripe only allows fetching records in one order: from newest to oldest, so we
    use its cursor-based pagination to iterate once through all historical records.
    Stripe has no way to request records by "updated at" time, so we must refresh
    old records according to our own logic, using a "curing window" to re-import
    sufficiently recent records (the importers here default to a 90 day window).
    """
    latest_full_import_at = ctx.get_state_value("latest_full_import_at")
    latest_full_import_at = ensure_datetime(latest_full_import_at)
    current_starting_after = ctx.get_state_value("current_starting_after")
    params = {
        "limit": 100,
    }
    if extra_params:
        params.update(extra_params)
    if latest_full_import_at and curing_window_days:
        # Import only records more recent than the latest full import, offset by a curing window
        # (default 90 days) to capture updates to objects (refunds, etc)
        params["created[gte]"] = int(
            (latest_full_import_at - timedelta(days=int(curing_window_days))).timestamp()
        )
    if current_starting_after:
        params["starting_after"] = current_starting_after
    conn = JsonHttpApiConnection()
    endpoint_url = STRIPE_API_BASE_URL + endpoint
    all_done = False
    while ctx.should_continue():
        resp = conn.get(endpoint_url, params, auth=HTTPBasicAuth(api_key, ""))
        json_resp = resp.json()
        assert isinstance(json_resp, dict)
        records = json_resp["data"]
        if len(records) == 0:
            # All done
            all_done = True
            break
        # Return actual data
        yield records
        latest_object_id = records[-1]["id"]
        if not json_resp.get("has_more"):
            all_done = True
            break
        params["starting_after"] = latest_object_id
        ctx.emit_state_value("current_starting_after", latest_object_id)
    else:
        # Don't update any state, we just timed out
        return
    # We only update state if we have fetched EVERYTHING available as of now
    if all_done:
        ctx.emit_state_value("latest_full_import_at", utcnow())
        # IMPORTANT: reset the starting_after cursor so the next run starts from the beginning again
        ctx.emit_state_value("current_starting_after", None)
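# A hypothetical thin wrapper (illustration only; the per-endpoint importers are
# not shown in this excerpt) demonstrating how stripe_importer is meant to be
# reused for each Stripe endpoint, with the same curing-window default as
# import_subscription_items above:
def import_charges(
    ctx: Context, api_key: str, curing_window_days: int = 90
) -> Iterator[Records]:
    yield from stripe_importer(
        "charges", ctx, api_key, curing_window_days=curing_window_days
    )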