Example #1
def alphavantage_import_eod_prices(
    ctx: Context,
    tickers_input: Optional[Reference[Ticker]],
    api_key: str,
    tickers: Optional[List] = None,
) -> Iterator[Records[AlphavantageEodPrice]]:
    assert api_key is not None
    tickers = prepare_tickers(tickers, tickers_input)
    if not tickers:
        return None
    ticker_latest_dates_imported = (
        ctx.get_state_value("ticker_latest_dates_imported") or {})
    conn = JsonHttpApiConnection()

    def fetch_prices(params: Dict, tries: int = 0) -> Optional[Records]:
        if tries > 2:
            return None
        resp = conn.get(ALPHAVANTAGE_API_BASE_URL, params, stream=True)
        try:
            record = resp.json()
            # Json response means error
            if is_alphavantage_error(record):
                # TODO: Log this failure?
                print(f"Error for {params} {record}")
                return None
            if is_alphavantage_rate_limit(record):
                time.sleep(60)
                return fetch_prices(params, tries=tries + 1)
        except ValueError:
            # Body is not JSON, so we received the expected CSV payload
            pass
        records = list(read_csv(resp.iter_lines()))
        return records

    for ticker in tickers:
        assert isinstance(ticker, str)
        latest_date_imported = ensure_datetime(
            ticker_latest_dates_imported.get(ticker, MIN_DATETIME))
        assert latest_date_imported is not None
        if utcnow() - ensure_utc(latest_date_imported) < timedelta(days=1):
            # Only check once a day
            continue
        params = prepare_params_for_ticker(ticker,
                                           ticker_latest_dates_imported)
        params["apikey"] = api_key
        records = fetch_prices(params)
        if records:
            # Symbol is not included in the CSV response, so add it to each record
            for r in records:
                r["symbol"] = ticker
            yield records
        # Update state
        ticker_latest_dates_imported[ticker] = utcnow()
        ctx.emit_state_value("ticker_latest_dates_imported",
                             ticker_latest_dates_imported)
        if not ctx.should_continue():
            break
Example #2
def timestamp_increment_key(prefix: str = "", max_length: int = 28) -> str:
    """
    Generates keys that are unique and monotonic in time for a given run.
    Appends random chars to ensure multiple processes can run at once and not collide.
    """
    curr_ms = utcnow().strftime("%y%m%d_%H%M%S%f")
    rand_len = max_length - (21 + len(prefix))
    key = f"{prefix}_{curr_ms}_{rand_str(rand_len).lower()}"
    return key
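
A quick usage sketch (key values below are illustrative, not from the source; it assumes rand_str(n) returns n random characters): because the timestamp component is fixed-width and zero-padded, keys produced later in a run sort lexicographically after earlier ones, and the random suffix only matters for ties within the same microsecond.

# Illustrative usage only
first = timestamp_increment_key(prefix="run")   # e.g. "run_210304_131500123456_a3kq"
second = timestamp_increment_key(prefix="run")
# The deterministic prefix + timestamp portion is monotonic within a run
assert first[:24] <= second[:24]
assert len(first) <= 28  # default max_length
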
Example #3
    def should_continue(self) -> bool:
        """
        Long running snaps should check this function periodically
        to honor time limits.
        """
        if not self.execution_context.execution_timelimit_seconds:
            return True
        seconds_elapsed = (utcnow() - self.snap_log.started_at).total_seconds()
        return seconds_elapsed < self.execution_context.execution_timelimit_seconds
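
This is the calling pattern the importers elsewhere in this set follow (see Examples #1 and #4); a minimal sketch, where work_items, process_item, and ctx are hypothetical stand-ins:

for item in work_items:            # hypothetical iterable of pending work
    process_item(item)             # hypothetical per-item work
    if not ctx.should_continue():  # stop once the configured time limit is exceeded
        break
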
Example #4
def marketstack_import_tickers(
    ctx: Context,
    access_key: str,
    exchanges: List = ["XNYS", "XNAS"],
) -> Iterator[Records[MarketstackTicker]]:
    use_https = False  # TODO: when do we want this True?
    # default_from_date = ctx.get_param("from_date", MIN_DATE)
    assert access_key is not None
    assert isinstance(exchanges, list)
    last_imported_at = ensure_datetime(
        ctx.get_state_value("last_imported_at") or "2020-01-01 00:00:00")
    assert last_imported_at is not None
    last_imported_at = ensure_utc(last_imported_at)
    if utcnow() - last_imported_at < timedelta(days=1):  # TODO: from config
        return
    conn = JsonHttpApiConnection()
    if use_https:
        endpoint_url = HTTPS_MARKETSTACK_API_BASE_URL + "tickers"
    else:
        endpoint_url = MARKETSTACK_API_BASE_URL + "tickers"
    for exchange in exchanges:
        params = {
            "limit": 1000,
            "offset": 0,
            "access_key": access_key,
            "exchange": exchange,
        }
        while ctx.should_continue():
            resp = conn.get(endpoint_url, params)
            json_resp = resp.json()
            assert isinstance(json_resp, dict)
            records = json_resp["data"]
            if len(records) == 0:
                # All done
                break
            # Add a flattened exchange indicator
            for r in records:
                r["exchange_acronym"] = r.get("stock_exchange",
                                              {}).get("acronym")
            yield records
            # Setup for next page
            params["offset"] = params["offset"] + len(records)
    ctx.emit_state_value("last_imported_at", utcnow())
Example #5
    def should_continue(self) -> bool:
        """
        Long running functions should check this function periodically
        to honor time limits.
        """
        if (not self.execution_context.execution_timelimit_seconds
                or not self.execution_start_time):
            return True
        seconds_elapsed = (utcnow() - self.execution_start_time).total_seconds()
        return seconds_elapsed < self.execution_context.execution_timelimit_seconds
Example #6
def test_fred():
    api_key = ensure_api_key()

    from snapflow_fred import module as fred

    env = Environment(metadata_storage="sqlite://")

    g = graph()

    # Initial graph
    gdp = g.create_node(
        "fred.import_observations",
        params={
            "api_key": api_key,
            "series_id": "gdp"
        },
    )
    blocks = produce(gdp, env=env, modules=[fred])
    records = blocks[0].as_records()
    assert len(records) >= (utcnow().year - 1946) * 4 - 1
    assert len(records) < (utcnow().year + 1 - 1946) * 4 - 1
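
Worked arithmetic for the bounds above (illustrative; it assumes quarterly GDP observations beginning in 1947, which is what FRED publishes for this series):

# For a run in 2021:
year = 2021
lower = (year - 1946) * 4 - 1      # 299: at least this many quarterly observations
upper = (year + 1 - 1946) * 4 - 1  # 303: strictly fewer than this
assert (lower, upper) == (299, 303)
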
Example #7
    def ensure_log(self, block: DataBlockMetadata, direction: Direction,
                   name: str):
        if self.metadata_api.execute(
                select(DataBlockLog).filter_by(
                    function_log_id=self.function_log.id,
                    stream_name=name,
                    data_block_id=block.id,
                    direction=direction,
                )).scalar_one_or_none():
            return
        drl = DataBlockLog(  # type: ignore
            function_log_id=self.function_log.id,
            stream_name=name,
            data_block_id=block.id,
            direction=direction,
            processed_at=utcnow(),
        )
        self.metadata_api.add(drl)
Example #8
def prepare_params_for_ticker(
    ticker: str, ticker_latest_dates_imported: Dict[str, datetime]
) -> Dict:
    latest_date_imported = ensure_datetime(
        ticker_latest_dates_imported.get(ticker, MIN_DATETIME))
    if ensure_utc(latest_date_imported) <= utcnow() - timedelta(days=100):
        # More than 100 days worth, get full
        outputsize = "full"
    else:
        # Less than 100 days, compact will suffice
        outputsize = "compact"
    params = {
        "symbol": ticker,
        "outputsize": outputsize,
        "datatype": "csv",
        "function": "TIME_SERIES_DAILY_ADJUSTED",
    }
    return params
Example #9
def import_observations(
    ctx: Context,
    api_key: str,
    series_id: str,
) -> Iterator[Records[FredObservation]]:
    latest_fetched_at = ctx.get_state_value("latest_fetched_at")
    if latest_fetched_at:
        # Two year curing window (to be safe)
        obs_start = ensure_datetime(latest_fetched_at) - timedelta(days=365 * 2)
    else:
        obs_start = MIN_DATE
    params = {
        "file_type": "json",
        "api_key": api_key,
        "series_id": series_id,
        "observation_start": obs_start,
        "offset": 0,
        "limit": 100000,
    }
    conn = JsonHttpApiConnection(date_format="%Y-%m-%d")
    endpoint_url = FRED_API_BASE_URL + "series/observations"
    while True:
        resp = conn.get(endpoint_url, params)
        json_resp = resp.json()
        assert isinstance(json_resp, dict)
        records = json_resp["observations"]
        if len(records) == 0:
            # All done
            break
        for r in records:
            # Add series ID to data so we know what the data is
            r["series_id"] = params["series_id"]
            # FRED quirk: returns an empty decimal number "." instead of null
            r["value"] = None if r["value"] == "." else r["value"]
        yield records
        num_returned = len(records)
        if num_returned < json_resp["limit"]:
            # we got back less than limit, so must be done (no other way to tell?)
            break
        params["offset"] += num_returned
    # We only update date if we have fetched EVERYTHING available as of now
    ctx.emit_state_value("latest_fetched_at", utcnow())
Example #10
def base_import(
    data_source: str, ctx: DataFunctionContext, user_key: str, use_sample: bool = False
):
    params = {
        "user_key": user_key,
    }
    url = BULK_CSV_SAMPLE_URL if use_sample else BULK_CSV_URL

    while ctx.should_continue():
        resp = HttpApiConnection().get(url=url, params=params,)

        ib = io.BytesIO(resp.content)

        with tarfile.open(fileobj=ib) as csv_files:
            raw = csv_files.extractfile("{}.csv".format(data_source))
            ctx.emit_state_value("imported_{}".format(data_source), True)
            ctx.emit(
                raw,
                data_format=CsvFileFormat,
                schema=CRUNCHBASE_CSV_TO_SCHEMA_MAP[data_source],
            )
            ctx.emit_state_value("latest_imported_at".format(data_source), utcnow())

        return
Example #11
    def start_function_run(
            self, node: Node,
            bound_interface: BoundInterface) -> Iterator[DataFunctionContext]:
        from snapflow.core.graph import GraphMetadata

        # assert self.current_runtime is not None, "Runtime not set"
        md = self.env.get_metadata_api()
        node_state_obj = node.get_state(self.env)
        if node_state_obj is None:
            node_state = {}
        else:
            node_state = node_state_obj.state
        new_graph_meta = node.graph.get_metadata_obj()
        graph_meta = md.execute(
            select(GraphMetadata).filter(
                GraphMetadata.hash == new_graph_meta.hash),
            filter_env=False,
        ).scalar_one_or_none()
        if graph_meta is None:
            md.add(new_graph_meta)
            md.flush()
            graph_meta = new_graph_meta

        function_log = DataFunctionLog(  # type: ignore
            graph_id=graph_meta.hash,
            node_key=node.key,
            node_start_state=node_state.copy(),
            node_end_state=node_state,
            function_key=node.function.key,
            function_params=node.params,
            # runtime_url=self.current_runtime.url,
            started_at=utcnow(),
        )
        md.add(function_log)
        md.flush([function_log])
        function_ctx = DataFunctionContext(
            env=self.env,
            function=self.exe.function,
            node=self.exe.node,
            executable=self.exe,
            metadata_api=self.env.md_api,
            inputs=bound_interface.inputs,
            function_log=function_log,
            bound_interface=bound_interface,
            execution_context=self.exe.execution_context,
            execution_start_time=self.start_time,
        )
        try:
            yield function_ctx
            # Validate local memory objects: Did we leave any non-storeables hanging?
            validate_data_blocks(self.env)
        except Exception as e:
            # Don't worry about exhaustion exceptions
            if not isinstance(e, InputExhaustedException):
                logger.debug(f"Error running node:\n{traceback.format_exc()}")
                function_log.set_error(e)
                function_log.persist_state(self.env)
                function_log.completed_at = utcnow()
                # TODO: should clean this up so transaction surrounds things that you DO
                #       want to rollback, obviously
                # md.commit()  # MUST commit here since the re-raised exception will issue a rollback
                if self.exe.execution_context.abort_on_function_error:
                    raise e
        finally:
            function_ctx.finish_execution()
            # Persist state on success OR error:
            function_log.persist_state(self.env)
            function_log.completed_at = utcnow()
Example #12
    def __init__(self, exe: Executable):
        self.exe = exe
        self.env = exe.execution_context.env
        self.logger = exe.execution_context.logger
        self.node = self.exe.node
        self.start_time = utcnow()
Example #13
def alphavantage_import_company_overview(
    ctx: Context,
    tickers_input: Optional[Reference[Ticker]],
    api_key: str,
    tickers: Optional[List] = None,
) -> Iterator[Records[AlphavantageCompanyOverview]]:
    assert api_key is not None
    tickers = prepare_tickers(tickers, tickers_input)
    if tickers is None:
        # We didn't get an input block for tickers AND
        # the config is empty, so we are done
        return None
    ticker_latest_dates_imported = (
        ctx.get_state_value("ticker_latest_dates_imported") or {})
    conn = JsonHttpApiConnection()
    batch_size = 50
    records = []
    tickers_updated = []

    def fetch_overview(params: Dict, tries: int = 0) -> Optional[Dict]:
        if tries > 2:
            return None
        resp = conn.get(ALPHAVANTAGE_API_BASE_URL, params)
        record = resp.json()
        # Alphavantage returns 200 and json error message on failure
        if is_alphavantage_error(record):
            # TODO: Log this failure?
            # print(f"Error for ticker {params['symbol']}: {record}")
            return None
        if is_alphavantage_rate_limit(record):
            time.sleep(20)
            return fetch_overview(params, tries=tries + 1)
        return record

    for i, ticker in enumerate(tickers):
        assert isinstance(ticker, str)
        latest_date_imported = ensure_datetime(
            ticker_latest_dates_imported.get(ticker, MIN_DATETIME))
        assert latest_date_imported is not None
        # Refresh at most once a day
        # TODO: make this configurable instead of hard-coded 1 day
        if utcnow() - ensure_utc(latest_date_imported) < timedelta(days=1):
            continue
        params = {
            "apikey": api_key,
            "symbol": ticker,
            "function": "OVERVIEW",
        }
        record = fetch_overview(params)
        if not record:
            continue

        # Clean up json keys to be more DB friendly
        record = {title_to_snake_case(k): v for k, v in record.items()}
        records.append(record)
        tickers_updated.append(ticker)
        if len(records) >= batch_size or i == len(tickers) - 1:
            yield records
            # Update state
            for updated_ticker in tickers_updated:
                ticker_latest_dates_imported[updated_ticker] = utcnow()
                ctx.emit_state_value("ticker_latest_dates_imported",
                                     ticker_latest_dates_imported)
            if not ctx.should_continue():
                break
            records = []
            tickers_updated = []
Example #14
def import_subscription_items(
    ctx: Context,
    api_key: str,
    curing_window_days: int = 90
) -> Iterator[Records[StripeSubscriptionItemRaw]]:
    """
    # TODO: repeated code
    """
    latest_full_import_at = ctx.get_state_value("latest_full_import_at")
    latest_full_import_at = ensure_datetime(latest_full_import_at)
    current_starting_after = ctx.get_state_value("current_starting_after")
    params = {
        "limit": 100,
        "status": "all",
    }
    # if earliest_created_at_imported <= latest_full_import_at - timedelta(days=int(curing_window_days)):
    if latest_full_import_at and curing_window_days:
        # Import only more recent than latest imported at date, offset by a curing window
        # (default 90 days) to capture updates to objects (refunds, etc)
        params["created[gte]"] = int(
            (latest_full_import_at -
             timedelta(days=int(curing_window_days))).timestamp())
    if current_starting_after:
        params["starting_after"] = current_starting_after
    conn = JsonHttpApiConnection()
    endpoint_url = STRIPE_API_BASE_URL + "subscriptions"
    all_done = False
    while ctx.should_continue():
        resp = conn.get(endpoint_url, params, auth=HTTPBasicAuth(api_key, ""))
        json_resp = resp.json()
        assert isinstance(json_resp, dict)
        records = json_resp["data"]
        if len(records) == 0:
            # All done
            all_done = True
            break

        for record in records:
            item_params = {
                "limit": 100,
                "subscription": record["id"],
            }
            while True:
                items_url = STRIPE_API_BASE_URL + "subscription_items"
                items_resp = conn.get(items_url,
                                      item_params,
                                      auth=HTTPBasicAuth(api_key, ""))
                items_json_resp = items_resp.json()
                assert isinstance(items_json_resp, dict)
                items = items_json_resp["data"]
                if len(items) == 0:
                    # All done
                    break
                yield items
                if not items_json_resp.get("has_more"):
                    break
                latest_item_id = items[-1]["id"]
                item_params["starting_after"] = latest_item_id
            if not ctx.should_continue():
                break

        latest_object_id = records[-1]["id"]
        if not json_resp.get("has_more"):
            all_done = True
            break
        params["starting_after"] = latest_object_id
        ctx.emit_state_value("current_starting_after", latest_object_id)
    else:
        # Don't update any state, we just timed out
        return
    # We only update state if we have fetched EVERYTHING available as of now
    if all_done:
        ctx.emit_state_value("latest_imported_at", utcnow())
        # IMPORTANT: we reset the starting after cursor so we start from the beginning again on next run
        ctx.emit_state_value("current_starting_after", None)
Example #15
def stripe_importer(
    endpoint: str,
    ctx: Context,
    api_key: str,
    curing_window_days: Optional[int] = None,
    extra_params: Optional[Dict] = None,
):
    """
    Stripe only allows fetching records in one order: from newest to oldest,
    so we use its cursor based pagination to iterate once through all historical.

    Stripe doesn't have a way to request by "updated at" times, so we must
    refresh old records according to our own logic, using a "curing window"
    to re-import records up to one year (the default) old.
    """
    latest_full_import_at = ctx.get_state_value("latest_full_import_at")
    latest_full_import_at = ensure_datetime(latest_full_import_at)
    current_starting_after = ctx.get_state_value("current_starting_after")
    params = {
        "limit": 100,
    }
    if extra_params:
        params.update(extra_params)
    # if earliest_created_at_imported <= latest_full_import_at - timedelta(days=int(curing_window_days)):
    if latest_full_import_at and curing_window_days:
        # Import only more recent than latest imported at date, offset by a curing window
        # (default 90 days) to capture updates to objects (refunds, etc)
        params["created[gte]"] = int(
            (latest_full_import_at -
             timedelta(days=int(curing_window_days))).timestamp())
    if current_starting_after:
        params["starting_after"] = current_starting_after
    conn = JsonHttpApiConnection()
    endpoint_url = STRIPE_API_BASE_URL + endpoint
    all_done = False
    while ctx.should_continue():
        resp = conn.get(endpoint_url, params, auth=HTTPBasicAuth(api_key, ""))
        json_resp = resp.json()
        assert isinstance(json_resp, dict)
        records = json_resp["data"]
        if len(records) == 0:
            # All done
            all_done = True
            break

        # Return actual data
        yield records

        latest_object_id = records[-1]["id"]
        if not json_resp.get("has_more"):
            all_done = True
            break
        params["starting_after"] = latest_object_id
        ctx.emit_state_value("current_starting_after", latest_object_id)
    else:
        # Don't update any state, we just timed out
        return
    # We only update state if we have fetched EVERYTHING available as of now
    if all_done:
        ctx.emit_state_value("latest_imported_at", utcnow())
        # IMPORTANT: we reset the starting after cursor so we start from the beginning again on next run
        ctx.emit_state_value("current_starting_after", None)