Пример #1
0
async def LoadData(asset_db_writer, daily_bar_writer, show_progress,
                   start_session, end_session):
    """Ingest BitMEX bars and futures metadata into the given writers.

    Parameters
    ----------
    asset_db_writer
        Object whose ``write(futures=..., root_symbols=...)`` receives the
        accumulated asset metadata after all bars have been written.
    daily_bar_writer
        Object whose ``write(...)`` receives the per-sid OHLCV data of every
        chunk yielded by the provider.
    show_progress : bool
        Forwarded to the progress bar and to ``daily_bar_writer.write``.
    start_session, end_session
        Bounds handed to ``provider.BitmexDataProvider``.

    Notes
    -----
    The provider is closed even when ingestion fails part-way through.
    """
    bmdp = provider.BitmexDataProvider(start_session, end_session)
    try:
        futures_df = pd.DataFrame()
        urls = await bmdp.GetTradeFileUrls()
        # TODO: Pass granularity at command line.
        granularity = provider.BitmexDataProvider.Granularity.DAY
        with cli.maybe_show_progress(range(len(urls)),
                                     show_progress=show_progress,
                                     label="Loading BitMEX data",
                                     item_show_func=lambda x: urls[x]
                                     if x is not None else '') as progress:
            # NOTE(review): the provider stream is re-entered once per URL;
            # presumably ``LoadData`` resumes where it stopped -- confirm.
            for _ in progress:
                async for ohlcv in bmdp.LoadData(granularity=granularity):
                    asset_details = await bmdp.GetAssetDetails(ohlcv)
                    detail_frames = []
                    for asset_detail in asset_details.values():
                        detail_data = GetFutureNeededAssetDetails(asset_detail)
                        if detail_data is not None:
                            detail_frames.append(
                                pd.DataFrame(pd.Series(detail_data)).T)
                    if detail_frames:
                        new_details_df = pd.concat(detail_frames,
                                                   ignore_index=True)
                    else:
                        new_details_df = pd.DataFrame()

                    # The futures sid is the futures_df index. Known symbols
                    # come first and survive de-duplication in order, so
                    # renumbering keeps their sids stable while giving new
                    # symbols fresh, non-clashing sids.
                    futures_df = (pd.concat([futures_df, new_details_df])
                                  .drop_duplicates('symbol')
                                  .reset_index(drop=True)
                                  .rename_axis('sid'))
                    futures_df['sid'] = futures_df.index
                    # flatten the multi-index
                    ohlcv.columns = ohlcv.columns.droplevel()
                    daily_bar_writer.write(
                        GetOHLCVPerSid(ohlcv, futures_df),
                        show_progress=show_progress)

        root_symbols_df = futures_df[['root_symbol',
                                      'exchange']].drop_duplicates()
        root_symbols_df['root_symbol_id'] = root_symbols_df.index.values
        asset_db_writer.write(futures=futures_df,
                              root_symbols=root_symbols_df)
    finally:
        await bmdp.Close()
Пример #2
0
    def write(self,
              data,
              assets=None,
              show_progress=False,
              invalid_data_behavior='warn'):
        """Convert every chunk with ``to_ctable`` and write the stream.

        Parameters
        ----------
        data : iterable[tuple[int, pandas.DataFrame or bcolz.ctable]]
            The data chunks to write. Each chunk should be a tuple of sid
            and the data for that asset.
        assets : set[int], optional
            The assets that should be in ``data``. When given, its size is
            used as the progress bar length and it is forwarded to the
            internal writer.
        show_progress : bool, optional
            Whether or not to show a progress bar while writing.
        invalid_data_behavior : {'warn', 'raise', 'ignore'}, optional
            Forwarded to ``to_ctable``: what to do when data is encountered
            that is outside the range of a uint32.

        Returns
        -------
        table : bcolz.ctable
            Whatever ``self._write_internal`` returns for the stream.
        """
        # Lazily converted so chunks are processed as the writer pulls them.
        converted = (
            (sid, to_ctable(frame, invalid_data_behavior))
            for sid, frame in data
        )
        progress = maybe_show_progress(
            converted,
            show_progress=show_progress,
            item_show_func=self.progress_bar_item_show_func,
            label=self.progress_bar_message,
            length=None if assets is None else len(assets),
        )
        with progress as chunks:
            table = self._write_internal(chunks, assets)
            return table
Пример #3
0
        def _pricing_iter():
            """Yield ``(sid, frame)`` for each symbol, in iteration order.

            A frame is taken from ``cache`` when present; otherwise it is
            fetched through ``DataReader`` (source ``'yahoo'``), sorted by
            index and stored in ``cache``. As side effects, row ``sid`` of
            ``metadata`` is set to (first index, last index, symbol) and the
            frame's OHLCV columns are renamed to lower case in place.
            """
            lower_case_columns = {
                'Open': 'open',
                'High': 'high',
                'Low': 'low',
                'Close': 'close',
                'Volume': 'volume',
            }
            with maybe_show_progress(
                    symbols,
                    show_progress,
                    label='Downloading Yahoo pricing data: ') as it:
                with requests.Session() as session:
                    for sid, symbol in enumerate(it):
                        path = _cachpath(symbol, 'ohlcv')
                        try:
                            df = cache[path]
                        except KeyError:
                            fetched = DataReader(
                                symbol,
                                'yahoo',
                                start,
                                end,
                                session=session,
                            )
                            df = fetched.sort_index()
                            cache[path] = df

                        # the start date is the date of the first trade and
                        # the end date is the date of the last trade
                        first_trade, last_trade = df.index[0], df.index[-1]
                        metadata.iloc[sid] = first_trade, last_trade, symbol
                        df.rename(columns=lower_case_columns, inplace=True)
                        yield sid, df
Пример #4
0
    def write(self, data, show_progress=False, invalid_data_behavior='warn'):
        """Write a stream of minute data, one ``write_sid`` call per element.

        Parameters
        ----------
        data : iterable[(int, pd.DataFrame)]
            The data to write. Each element should be a tuple of sid, data
            where data has the following format:
              columns : ('open', 'high', 'low', 'close', 'volume')
                  open : float64
                  high : float64
                  low  : float64
                  close : float64
                  volume : float64|int64
              index : DatetimeIndex of market minutes.
            A given sid may appear more than once in ``data``; however,
            the dates must be strictly increasing.
        show_progress : bool, optional
            Whether or not to show a progress bar while writing.
        invalid_data_behavior : optional
            Passed through unchanged to every ``self.write_sid`` call.
        """
        def _describe(element):
            # ``None`` is handed back untouched; otherwise show the sid.
            if element is None:
                return element
            return str(element[0])

        progress = maybe_show_progress(
            data,
            show_progress=show_progress,
            item_show_func=_describe,
            label="Merging minute equity files:",
        )
        emit = self.write_sid
        with progress as elements:
            for element in elements:
                emit(*element, invalid_data_behavior=invalid_data_behavior)
Пример #5
0
def insert(dest, codes):
    """Insert minute-level data for the given stock codes.

    A new ``BcolzMinuteBarWriter`` is created at ``dest`` from the calendar,
    start and end returned by ``info_func()``; each code with data is then
    written under ``sid = int(code)``.
    """
    c, s, e = info_func()
    writer = BcolzMinuteBarWriter(dest, c, s, e, CN_EQUITIES_MINUTES_PER_DAY)

    def _show_code(code):
        return code

    ctx = maybe_show_progress(
        codes,
        show_progress=True,
        item_show_func=_show_code,
        label="【新增】分钟级别数据",
    )

    d_fmt = r"%Y-%m-%d"
    start_str = s.strftime(d_fmt)
    end_str = e.strftime(d_fmt)
    m_index = c.minutes_for_sessions_in_range(start_str, end_str)

    with ctx as it:
        for code in it:
            sid = int(code)
            df = fetch_single_minutely_equity(code, s.date(), e.date())
            if not df.empty:
                # Must be converted to the UTC time zone.
                localized = df.tz_localize('Asia/Shanghai')
                in_utc = localized.tz_convert('UTC')
                # Index minute-level data contains every minute, so it has to
                # be cut down to the trading minutes.
                aligned = in_utc.reindex(m_index, method='ffill')
                writer.write_sid(sid, aligned)
Пример #6
0
def insert_equity_extra_data_sf1(sharadar_metadata_df, sf1_df, cursor, show_progress=True):
    """Store every non-NaN SF1 value as an equity supplementary mapping.

    For each ticker in ``sf1_df`` the rows are indexed by ``datekey`` and
    walked newest first. Every column except ``dimension`` becomes one
    ``INSERT OR REPLACE`` into ``equity_supplementary_mappings`` with

    * ``field``      -- ``<column>_<dimension lower-cased>``
    * ``start_date`` -- ``(datekey + ONE_DAY).value``
    * ``end_date``   -- always ``-1`` (not used)
    * ``value``      -- ``str(value)``

    Parameters
    ----------
    sharadar_metadata_df : pd.DataFrame
        Metadata frame; its ``relatedtickers`` column is used for sid lookup.
    sf1_df : pd.DataFrame
        Fundamental data with at least ``ticker``, ``datekey``,
        ``dimension``, ``lastupdated`` and ``calendardate`` columns.
    cursor
        Object with an ``execute(sql, params)`` method.
    show_progress : bool, optional
        Whether to show a progress bar over the tickers.
    """
    # end_date not used (set -1)
    sql = ("INSERT OR REPLACE INTO equity_supplementary_mappings "
           "(sid, field, start_date, end_date, value) VALUES(?, ?, ?, -1, ?)")

    tickers = sf1_df['ticker'].unique()
    related_tickers = sharadar_metadata_df['relatedtickers'].dropna()
    # Add a space at the begin and end of relatedtickers, search for ' TICKER '
    related_tickers = ' ' + related_tickers.astype(str) + ' '

    with maybe_show_progress(tickers, show_progress, label='Parsing fundamental data: ') as it:
        for ticker in it:
            # Non-inplace chain: avoids mutating a filtered slice of sf1_df.
            df_ticker = (
                sf1_df[sf1_df['ticker'] == ticker]
                .set_index('datekey')
                .sort_index(ascending=False)
                .drop(['ticker', 'lastupdated', 'calendardate'], axis=1)
            )

            sid = lookup_sid(sharadar_metadata_df, related_tickers, ticker)

            for datekey, row in df_ticker.iterrows():
                # Both are constant for the whole row.
                suffix = '_' + row['dimension'].lower()
                start_date = (datekey + ONE_DAY).value
                for column, value in row.items():
                    if column == 'dimension':
                        continue
                    # isinstance also covers float subclasses (np.float64).
                    if isinstance(value, float) and np.isnan(value):
                        continue
                    cursor.execute(
                        sql, (sid, column + suffix, start_date, str(value)))
Пример #7
0
        def _pricing_iter():
            """Generate ``(sid, frame)`` pairs of Yahoo pricing data.

            Sids are assigned consecutively from 0 in the order ``symbols``
            is iterated. Frames missing from ``cache`` are fetched with
            ``DataReader`` over ``start``..``end``, sorted by index and
            cached. Each step also records (first index, last index, symbol)
            in ``metadata.iloc[sid]`` and lower-cases the OHLCV column names
            of the frame in place.
            """
            renames = {
                'Open': 'open',
                'High': 'high',
                'Low': 'low',
                'Close': 'close',
                'Volume': 'volume',
            }
            progress = maybe_show_progress(
                symbols,
                show_progress,
                label='Downloading Yahoo pricing data: ')
            with progress as it, requests.Session() as session:
                for sid, symbol in enumerate(it):
                    path = _cachpath(symbol, 'ohlcv')
                    try:
                        df = cache[path]
                    except KeyError:
                        downloaded = DataReader(
                            symbol,
                            'yahoo',
                            start,
                            end,
                            session=session,
                        )
                        df = downloaded.sort_index()
                        cache[path] = df

                    # the start date is the date of the first trade and
                    # the end date is the date of the last trade
                    metadata.iloc[sid] = df.index[0], df.index[-1], symbol
                    df.rename(columns=renames, inplace=True)
                    yield sid, df
Пример #8
0
    def write(self, data, show_progress=False):
        """Write a stream of minute data, one ``write_sid`` call per element.

        Parameters
        ----------
        data : iterable[(int, pd.DataFrame)]
            The data to write. Each element should be a tuple of sid, data
            where data has the following format:
              columns : ('open', 'high', 'low', 'close', 'volume')
                  open : float64
                  high : float64
                  low  : float64
                  close : float64
                  volume : float64|int64
              index : DatetimeIndex of market minutes.
            A given sid may appear more than once in ``data``; however,
            the dates must be strictly increasing.
        show_progress : bool, optional
            Whether or not to show a progress bar while writing.
        """
        def _sid_label(element):
            # ``None`` is handed back untouched; otherwise show the sid.
            if element is None:
                return element
            return str(element[0])

        progress = maybe_show_progress(
            data,
            show_progress=show_progress,
            item_show_func=_sid_label,
            label="Merging minute equity files:",
        )
        emit = self.write_sid
        with progress as elements:
            for element in elements:
                emit(*element)
Пример #9
0
def fetch_symbol_metadata_frame(api_key,
                                cache,
                                retries=5,
                                environ=None,
                                show_progress=False):
    """
    Download Quandl symbol metadata.

    Parameters
    ----------
    api_key : str
        The quandl api key to use. If this is None then no api key will be
        sent.
    cache : DataFrameCache
        The cache to use for persisting the intermediate data.
    retries : int, optional
        The number of times to retry each request before failing.
    environ : mapping[str -> str], optional
        The environment to use to find the zipline home. By default this
        is ``os.environ``.
    show_progress : bool, optional
        Show a progress bar for the download of this data.

    Returns
    -------
    metadata_frame : pd.DataFrame
        A dataframe with the following columns:
          symbol: the asset's symbol
          asset_name: the full name of the asset
          start_date: the first date of data for this asset
          end_date: the last date of data for this asset
          auto_close_date: end_date + one day
          exchange: the exchange for the asset; this is always 'quandl'
        The index of the dataframe will be used for symbol->sid mappings but
        otherwise does not have specific meaning.
    """
    raw_iter = _fetch_raw_metadata(api_key, cache, retries, environ)

    # The default argument holds a counter private to this call.
    # NOTE(review): it advances on every invocation, so the page number is
    # exact only if the progress bar calls this once per item -- confirm.
    def item_show_func(_, _it=count()):
        return 'Downloading page: %d' % next(_it)

    with maybe_show_progress(raw_iter,
                             show_progress,
                             item_show_func=item_show_func,
                             label='Downloading WIKI metadata: ') as blocks:
        data = pd.concat(blocks, ignore_index=True).rename(
            columns={
                'dataset_code': 'symbol',
                'name': 'asset_name',
                'oldest_available_date': 'start_date',
                'newest_available_date': 'end_date',
            }).sort_values('symbol')

    data = data[~data.symbol.isin(excluded_symbols)]
    # cut out all the other stuff in the name column
    # we need to escape the paren because it is actually splitting on a regex
    data.asset_name = data.asset_name.str.split(r' \(', n=1).str.get(0)
    data['exchange'] = 'quandl'
    data['auto_close_date'] = data['end_date'] + pd.Timedelta(days=1)
    return data
Пример #10
0
    def write(self,
              data,
              assets=None,
              show_progress=False,
              invalid_data_behavior='warn'):
        """Run each chunk through ``to_ctable`` and write the result.

        Parameters
        ----------
        data : iterable[tuple[int, pandas.DataFrame or bcolz.ctable]]
            The data chunks to write. Each chunk should be a tuple of sid
            and the data for that asset.
        assets : set[int], optional
            The assets that should be in ``data``. If provided, ``len(assets)``
            becomes the progress bar length and the set is passed on to the
            internal writer.
        show_progress : bool, optional
            Whether or not to show a progress bar while writing.
        invalid_data_behavior : {'warn', 'raise', 'ignore'}, optional
            What to do when data is encountered that is outside the range of
            a uint32; handed to ``to_ctable``.

        Returns
        -------
        table : bcolz.ctable
            The value returned by ``self._write_internal``.
        """
        ctable_chunks = (
            (sid, to_ctable(df, invalid_data_behavior)) for sid, df in data
        )
        bar = maybe_show_progress(
            ctable_chunks,
            show_progress=show_progress,
            item_show_func=self.progress_bar_item_show_func,
            label=self.progress_bar_message,
            length=None if assets is None else len(assets),
        )
        with bar as stream:
            written = self._write_internal(stream, assets)
            return written
Пример #11
0
def append(dest, codes):
    """Append minute-level data for the given stock codes.

    The existing ``BcolzMinuteBarWriter`` at ``dest`` is opened and, for each
    code, data is fetched from the day after the last date already written
    for that sid (or from the global start when nothing was written yet).
    Codes that are already up to date, or return no data, are skipped.
    """
    c, s, e = info_func()
    writer = BcolzMinuteBarWriter.open(dest, e)

    def _show_code(code):
        return code

    ctx = maybe_show_progress(
        codes,
        show_progress=True,
        item_show_func=_show_code,
        label="【更新】分钟级别数据",
    )

    d_fmt = r"%Y-%m-%d"
    start_str = s.strftime(d_fmt)
    end_str = e.strftime(d_fmt)
    # NOTE(review): the index spans the full s..e range even when only the
    # tail is fetched -- confirm this is what write_sid expects.
    m_index = c.minutes_for_sessions_in_range(start_str, end_str)

    with ctx as it:
        for code in it:
            sid = int(code)
            last_dt = writer.last_date_in_output_for_sid(sid)
            start = s if last_dt is pd.NaT else last_dt + c.day
            if start > e:
                continue
            df = fetch_single_minutely_equity(code, start.date(), e.date())
            if not df.empty:
                # Must be converted to the UTC time zone.
                localized = df.tz_localize('Asia/Shanghai')
                in_utc = localized.tz_convert('UTC')
                # Index minute-level data contains every minute, so it has to
                # be cut down to the trading minutes.
                aligned = in_utc.reindex(m_index, method='ffill')
                writer.write_sid(sid, aligned)
Пример #12
0
        def _pricing_iter():
            """Yield ``(sid, frame)`` for each symbol of the multi-symbol CSV.

            Sids are assigned consecutively from 0 in iteration order. For
            every symbol, row ``sid`` of ``metadata`` is filled with
            (start_date, end_date, auto_close_date, expiration, strike,
            callput, style, underlying, exchange, symbol); the contract
            fields are read from the first row of the symbol's frame.
            """
            # NOTE: frames are not cached; every symbol is extracted from
            # ``df_multiSymbol`` on each run.
            with maybe_show_progress(
                    symbols,
                    show_progress,
                    label='Processing CSV: ') as it:
                for sid, symbol in enumerate(it):
                    df = LoadOneSymbol(df_multiSymbol, symbol)

                    # the start date is the date of the first trade and
                    # the end date is the date of the last trade
                    start_date = df.index[0]
                    end_date = df.index[-1]
                    # The auto_close date is the day after the last trade.
                    ac_date = end_date + pd.Timedelta(days=1)
                    # ``.iloc[0]`` is explicitly positional; plain ``[0]`` on
                    # a non-integer index relies on a deprecated fallback.
                    ex_date = pd.to_datetime(df["expiration"].iloc[0])
                    strike = df["strike"].iloc[0]
                    callput = df["callput"].iloc[0]
                    style = df["style"].iloc[0]
                    underlying = df["underlying"].iloc[0]
                    exchange = df["exchange"].iloc[0]
                    metadata.iloc[sid] = (
                        start_date, end_date, ac_date, ex_date, strike,
                        callput, style, underlying, exchange, symbol)

                    yield sid, df
Пример #13
0
def fetch_symbol_metadata_frame(api_key,
                                cache,
                                retries=5,
                                environ=None,
                                show_progress=False):
    """
    Download Quandl symbol metadata.

    Parameters
    ----------
    api_key : str
        The quandl api key to use. If this is None then no api key will be
        sent.
    cache : DataFrameCache
        The cache to use for persisting the intermediate data.
    retries : int, optional
        The number of times to retry each request before failing.
    environ : mapping[str -> str], optional
        The environment to use to find the zipline home. By default this
        is ``os.environ``.
    show_progress : bool, optional
        Show a progress bar for the download of this data.

    Returns
    -------
    metadata_frame : pd.DataFrame
        A dataframe with the following columns:
          symbol: the asset's symbol
          asset_name: the full name of the asset
          start_date: the first date of data for this asset
          end_date: the last date of data for this asset
          auto_close_date: end_date + one day
          exchange: the exchange for the asset; this is always 'QUANDL'
        The index of the dataframe will be used for symbol->sid mappings but
        otherwise does not have specific meaning.
    """
    raw_iter = _fetch_raw_metadata(api_key, cache, retries, environ)

    # The default argument holds a counter private to this call.
    # NOTE(review): it advances on every invocation, so the page number is
    # exact only if the progress bar calls this once per item -- confirm.
    def item_show_func(_, _it=count()):
        return 'Downloading page: %d' % next(_it)

    with maybe_show_progress(raw_iter,
                             show_progress,
                             item_show_func=item_show_func,
                             label='Downloading WIKI metadata: ') as blocks:
        data = pd.concat(blocks, ignore_index=True).rename(columns={
            'dataset_code': 'symbol',
            'name': 'asset_name',
            'oldest_available_date': 'start_date',
            'newest_available_date': 'end_date',
        }).sort_values('symbol')

    data = data[~data.symbol.isin(excluded_symbols)]
    # cut out all the other stuff in the name column
    # we need to escape the paren because it is actually splitting on a regex
    data.asset_name = data.asset_name.str.split(r' \(', n=1).str.get(0)
    data['exchange'] = 'QUANDL'
    data['auto_close_date'] = data['end_date'] + pd.Timedelta(days=1)
    return data
Пример #14
0
def _pricing_iter(csvdir, symbols, metadata, divs_splits, show_progress):
    """Yield ``(sid, frame)`` for each symbol's CSV file in ``csvdir``.

    Parameters
    ----------
    csvdir : str
        Directory that is listed once and searched for ``<symbol>.csv``.
    symbols : iterable[str]
        Symbols to load; the sid is the position in this iterable.
    metadata : pd.DataFrame
        Row ``sid`` is set to (start_date, end_date, auto_close_date, symbol).
    divs_splits : dict
        ``divs_splits['splits']`` and ``divs_splits['divs']`` are replaced by
        frames extended with the rows found in ``split`` / ``dividend``
        columns, when those columns exist.
    show_progress : bool
        Whether to show a progress bar.

    Raises
    ------
    ValueError
        If no file name in ``csvdir`` contains ``<symbol>.csv``.
    """
    # Local import: ``DataFrame.append`` no longer exists in pandas 2.x.
    from pandas import concat

    with maybe_show_progress(symbols,
                             show_progress,
                             label="Loading custom pricing data: ") as it:
        files = os.listdir(csvdir)
        for sid, symbol in enumerate(it):
            logger.debug("%s: sid %s" % (symbol, sid))

            csv_name = "%s.csv" % symbol
            candidates = [fname for fname in files if csv_name in fname]
            if not candidates:
                raise ValueError("%s.csv file is not in %s" % (symbol, csvdir))
            # A bare substring test lets symbol "A" match "AA.csv"; prefer
            # names starting with "<symbol>.csv" and only then fall back to
            # the first substring match.
            preferred = [
                fname for fname in candidates if fname.startswith(csv_name)
            ]
            fname = (preferred or candidates)[0]

            dfr = read_csv(
                os.path.join(csvdir, fname),
                parse_dates=[0],
                infer_datetime_format=True,
                index_col=0,
            ).sort_index()

            start_date = dfr.index[0]
            end_date = dfr.index[-1]

            # The auto_close date is the day after the last trade.
            ac_date = end_date + Timedelta(days=1)
            metadata.iloc[sid] = start_date, end_date, ac_date, symbol

            if "split" in dfr.columns:
                # Stored ratio is the reciprocal of the CSV value; rows with
                # a split of exactly 1.0 are ignored.
                tmp = 1.0 / dfr[dfr["split"] != 1.0]["split"]
                split = DataFrame(data=tmp.index.tolist(),
                                  columns=["effective_date"])
                split["ratio"] = tmp.tolist()
                split["sid"] = sid

                splits = divs_splits["splits"]
                # Continue the running integer index of the accumulated frame.
                index = Index(
                    range(splits.shape[0], splits.shape[0] + split.shape[0]))
                split.set_index(index, inplace=True)
                divs_splits["splits"] = concat([splits, split])

            if "dividend" in dfr.columns:
                # ex_date   amount  sid record_date declared_date pay_date
                tmp = dfr[dfr["dividend"] != 0.0]["dividend"]
                div = DataFrame(data=tmp.index.tolist(), columns=["ex_date"])
                div["record_date"] = NaT
                div["declared_date"] = NaT
                div["pay_date"] = NaT
                div["amount"] = tmp.tolist()
                div["sid"] = sid

                divs = divs_splits["divs"]
                ind = Index(range(divs.shape[0], divs.shape[0] + div.shape[0]))
                div.set_index(ind, inplace=True)
                divs_splits["divs"] = concat([divs, div])

            yield sid, dfr
Пример #15
0
            def _pricing_iter():
                """Yield ``(sid, frame)`` for every symbol that has data.

                Symbols whose download raises, or that return no rows, are
                logged, printed and skipped; sids are therefore consecutive
                over the successful symbols only. For each yielded frame the
                index is shifted by +3 hours and -1 day, row ``sid`` of
                ``metadata`` is set to (start_date, end_date,
                auto_close_date, symbol) and the bar columns are renamed to
                'open'/'high'/'low'/'close'/'volume' in place.
                """
                sid = 0
                with maybe_show_progress(
                        symbols,
                        show_progress,
                        label='Downloading Tradea Database pricing data '
                ) as it:
                    for symbol in it:
                        logger.debug('zipline bundle downloading %s' % symbol)
                        try:
                            instrument = Instrument(
                                symbol=symbol, asset_type=AssetType.us_equity)

                            df = self.historical_market_data_service.getHistoricalData(
                                instrument,
                                period=Period.day,
                                number_of_periods=1,
                                fromDate=start,
                                toDate=end,
                                bar_type=BarType.time_bar,
                                force_download=False,
                                cleanOutliers=False)
                        except Exception as e:
                            # Best effort: report the failure and move on.
                            message = (
                                'Error downloading bundle zipline %s : %s' %
                                (symbol, str(e)))
                            logger.error(message)
                            print(message)
                            continue

                        # An empty result cannot provide first/last dates;
                        # skip it like a failed download instead of crashing.
                        if df is None or len(df.index) == 0:
                            message = (
                                'Error downloading bundle zipline %s : %s' %
                                (symbol, 'no data returned'))
                            logger.error(message)
                            print(message)
                            continue

                        # Shift every timestamp by +3 hours, then -1 day.
                        df.index = (df.index + pd.DateOffset(hours=3)
                                    ) - pd.DateOffset(days=1)

                        # the start date is the date of the first trade and
                        # the end date is the date of the last trade
                        start_date = df.index[0]
                        end_date = df.index[-1]
                        # The auto_close date is the day after the last trade.
                        ac_date = end_date + pd.Timedelta(days=1)
                        metadata.iloc[
                            sid] = start_date, end_date, ac_date, symbol

                        df.rename(
                            columns={
                                Bar.open: 'open',
                                Bar.high: 'high',
                                Bar.low: 'low',
                                Bar.close: 'close',
                                Bar.volume: 'volume',
                            },
                            inplace=True,
                        )
                        yield sid, df
                        sid += 1
Пример #16
0
    def _pricing_iter(self):
        """Yield ``(sid, frame)`` for each symbol's ``<symbol>.csv`` file.

        Files are read from ``self.csvdir``; the sid is the position of the
        symbol in ``self.symbols``. For each symbol, row ``sid`` of
        ``self.metadata`` is set to (start_date, end_date, auto_close_date,
        symbol). When the CSV has a ``split`` / ``dividend`` column, the
        matching rows are added to ``self.splits`` / ``self.dividends``
        (created on first use).
        """
        # Local import: ``DataFrame.append`` no longer exists in pandas 2.x.
        from pandas import concat

        with maybe_show_progress(self.symbols,
                                 self.show_progress,
                                 label='Loading custom pricing data: ') as it:
            for sid, symbol in enumerate(it):
                logger.debug('%s: sid %s' % (symbol, sid))

                dfr = read_csv(os.path.join(self.csvdir, '%s.csv' % symbol),
                               parse_dates=[0],
                               infer_datetime_format=True,
                               index_col=0).sort_index()

                # the start date is the date of the first trade and
                # the end date is the date of the last trade
                start_date = dfr.index[0]
                end_date = dfr.index[-1]

                # The auto_close date is the day after the last trade.
                ac_date = end_date + Timedelta(days=1)
                self.metadata.iloc[sid] = start_date, end_date, ac_date, symbol

                if 'split' in dfr.columns:
                    if self.splits is None:
                        self.splits = DataFrame()
                    # The ratio is stored exactly as read from the CSV; rows
                    # with a split of exactly 1.0 are ignored.
                    tmp = dfr[dfr['split'] != 1.0]['split']
                    split = DataFrame(data=tmp.index.tolist(),
                                      columns=['effective_date'])
                    split['ratio'] = tmp.tolist()
                    split['sid'] = sid

                    # Continue the running integer index of the accumulator.
                    index = Index(
                        range(self.splits.shape[0],
                              self.splits.shape[0] + split.shape[0]))
                    split.set_index(index, inplace=True)
                    self.splits = concat([self.splits, split])

                if 'dividend' in dfr.columns:
                    if self.dividends is None:
                        self.dividends = DataFrame()
                    # ex_date   amount  sid record_date declared_date pay_date
                    tmp = dfr[dfr['dividend'] != 0.0]['dividend']
                    div = DataFrame(data=tmp.index.tolist(),
                                    columns=['ex_date'])
                    div['record_date'] = NaT
                    div['declared_date'] = NaT
                    div['pay_date'] = NaT
                    div['amount'] = tmp.tolist()
                    div['sid'] = sid
                    ind = Index(
                        range(self.dividends.shape[0],
                              self.dividends.shape[0] + div.shape[0]))
                    div.set_index(ind, inplace=True)
                    self.dividends = concat([self.dividends, div])

                yield sid, dfr
Пример #17
0
def _pricing_iter(csvdir, symbols, metadata, divs_splits, show_progress):
    """Yield ``(sid, frame)`` for each symbol's CSV file in ``csvdir``.

    Parameters
    ----------
    csvdir : str
        Directory that is listed once and searched for ``<symbol>.csv``.
    symbols : iterable[str]
        Symbols to load; the sid is the position in this iterable.
    metadata : pd.DataFrame
        Row ``sid`` is set to (start_date, end_date, auto_close_date, symbol).
    divs_splits : dict
        ``divs_splits['splits']`` and ``divs_splits['divs']`` are replaced by
        frames extended with the rows found in ``split`` / ``dividend``
        columns, when those columns exist.
    show_progress : bool
        Whether to show a progress bar.

    Raises
    ------
    ValueError
        If no file name in ``csvdir`` contains ``<symbol>.csv``.
    """
    # Local import: ``DataFrame.append`` no longer exists in pandas 2.x.
    from pandas import concat

    with maybe_show_progress(symbols,
                             show_progress,
                             label='Loading custom pricing data: ') as it:
        files = os.listdir(csvdir)
        for sid, symbol in enumerate(it):
            logger.debug(f'{symbol}: sid {sid}')

            csv_name = '%s.csv' % symbol
            candidates = [fname for fname in files if csv_name in fname]
            if not candidates:
                raise ValueError(f"{symbol}.csv file is not in {csvdir}")
            # A bare substring test lets symbol "A" match "AA.csv"; prefer
            # names starting with "<symbol>.csv" and only then fall back to
            # the first substring match.
            preferred = [
                fname for fname in candidates if fname.startswith(csv_name)
            ]
            fname = (preferred or candidates)[0]

            dfr = read_csv(os.path.join(csvdir, fname),
                           parse_dates=[0],
                           infer_datetime_format=True,
                           index_col=0).sort_index()

            start_date = dfr.index[0]
            end_date = dfr.index[-1]

            # The auto_close date is the day after the last trade.
            ac_date = end_date + Timedelta(days=1)
            metadata.iloc[sid] = start_date, end_date, ac_date, symbol

            if 'split' in dfr.columns:
                # Stored ratio is the reciprocal of the CSV value; rows with
                # a split of exactly 1.0 are ignored.
                tmp = 1. / dfr[dfr['split'] != 1.0]['split']
                split = DataFrame(data=tmp.index.tolist(),
                                  columns=['effective_date'])
                split['ratio'] = tmp.tolist()
                split['sid'] = sid

                splits = divs_splits['splits']
                # Continue the running integer index of the accumulated frame.
                index = Index(
                    range(splits.shape[0], splits.shape[0] + split.shape[0]))
                split.set_index(index, inplace=True)
                divs_splits['splits'] = concat([splits, split])

            if 'dividend' in dfr.columns:
                # ex_date   amount  sid record_date declared_date pay_date
                tmp = dfr[dfr['dividend'] != 0.0]['dividend']
                div = DataFrame(data=tmp.index.tolist(), columns=['ex_date'])
                div['record_date'] = NaT
                div['declared_date'] = NaT
                div['pay_date'] = NaT
                div['amount'] = tmp.tolist()
                div['sid'] = sid

                divs = divs_splits['divs']
                ind = Index(range(divs.shape[0], divs.shape[0] + div.shape[0]))
                div.set_index(ind, inplace=True)
                divs_splits['divs'] = concat([divs, div])

            yield sid, dfr
Пример #18
0
def _pricing_iter(csvdir, symbols, metadata, divs_splits, show_progress):
    """Yield ``(sid, DataFrame)`` pairs read from per-symbol CSV files.

    For every entry of ``symbols`` the matching CSV file in ``csvdir`` is
    read with its first column parsed as dates and used as the index, sorted
    by that index, and yielded together with the symbol's position in
    ``symbols`` (used as the sid).

    Side effects:
      * ``metadata.iloc[sid]`` is set to
        ``(start_date, end_date, auto_close_date, symbol)``.
      * When the file has a ``split`` column, one row per value different
        from 1.0 is added to ``divs_splits['splits']``.
      * When the file has a ``dividend`` column, one row per non-zero value
        is added to ``divs_splits['divs']``.

    Raises:
        ValueError: if no file for a symbol can be found in ``csvdir``.
    """
    # DataFrame.append was removed in pandas 2.0; concat is the equivalent
    # that works across pandas versions.
    from pandas import concat

    with maybe_show_progress(symbols, show_progress,
                             label='Loading custom pricing data: ') as it:
        files = os.listdir(csvdir)
        for sid, symbol in enumerate(it):
            logger.debug('%s: sid %s' % (symbol, sid))

            wanted = '%s.csv' % symbol
            if wanted in files:
                # Prefer the exact file name: a plain substring test would
                # let e.g. 'AA.csv' satisfy the lookup for symbol 'A'.
                fname = wanted
            else:
                # Fall back to the historical substring match so names that
                # merely contain '<symbol>.csv' keep working.
                candidates = [name for name in files if wanted in name]
                if not candidates:
                    raise ValueError(
                        "%s.csv file is not in %s" % (symbol, csvdir))
                fname = candidates[0]

            dfr = read_csv(os.path.join(csvdir, fname),
                           parse_dates=[0],
                           infer_datetime_format=True,
                           index_col=0).sort_index()

            start_date = dfr.index[0]
            end_date = dfr.index[-1]

            # The auto_close date is the day after the last trade.
            ac_date = end_date + Timedelta(days=1)
            metadata.iloc[sid] = start_date, end_date, ac_date, symbol

            if 'split' in dfr.columns:
                # Stored ratio is the inverse of the value found in the file.
                tmp = 1. / dfr[dfr['split'] != 1.0]['split']
                split = DataFrame(data=tmp.index.tolist(),
                                  columns=['effective_date'])
                split['ratio'] = tmp.tolist()
                split['sid'] = sid

                splits = divs_splits['splits']
                # Continue the integer index where the accumulated frame ends.
                index = Index(range(splits.shape[0],
                                    splits.shape[0] + split.shape[0]))
                split.set_index(index, inplace=True)
                divs_splits['splits'] = concat([splits, split])

            if 'dividend' in dfr.columns:
                # ex_date   amount  sid record_date declared_date pay_date
                tmp = dfr[dfr['dividend'] != 0.0]['dividend']
                div = DataFrame(data=tmp.index.tolist(), columns=['ex_date'])
                div['record_date'] = NaT
                div['declared_date'] = NaT
                div['pay_date'] = NaT
                div['amount'] = tmp.tolist()
                div['sid'] = sid

                divs = divs_splits['divs']
                ind = Index(range(divs.shape[0], divs.shape[0] + div.shape[0]))
                div.set_index(ind, inplace=True)
                divs_splits['divs'] = concat([divs, div])

            yield sid, dfr
Пример #19
0
    def _pricing_iter():
        """Yield ``(sid, ohlcv_frame)`` for every row of ``tickers_df``.

        For each row the price history of ``row['asset_name']`` is taken
        from ``cache`` or, on a cache miss, read from the ``stock_spy``
        table through ``sqlite_conn`` and stored in the cache.  The frame is
        reduced to the open/high/low/close/volume columns and reindexed onto
        the calendar sessions between its first and last date, with missing
        values filled with 0.0.

        Side effect: ``metadata.iloc[sid]`` is set to
        ``(start_date, end_date, auto_close_date, symbol)`` where ``sid`` is
        ``row['sid']``.
        """
        with maybe_show_progress(
                tickers_df.iterrows(),
                show_progress,
                label='Fetch stocks pricing data from db: ') as it:
            # Consume the progress-wrapped iterator; looping over a second
            # ``tickers_df.iterrows()`` would leave the progress bar idle.
            for index, row in it:
                symbol = row['asset_name']
                path = _cachpath(symbol, 'ohlcv')

                try:
                    data = cache[path]
                except KeyError:
                    # NOTE(review): ``symbol`` is interpolated straight into
                    # the SQL text; switch to a bound parameter if symbols
                    # can ever come from an untrusted source.
                    sql_text = "SELECT tran_date as date, open, high, low, close, volume FROM `stock_spy` WHERE name='{0}' order by tran_date desc".format(
                        symbol)
                    data = cache[path] = pd.read_sql(
                        sql_text,
                        con=sqlite_conn,
                        index_col='date',
                        parse_dates=['date']).sort_index()
                    if traceDebug:
                        print("read {} sql and get df data:".format(symbol))
                        print(data)

                # the start date is the date of the first trade and
                # the end date is the date of the last trade
                start_date = pd.to_datetime(data.iloc[0].name)
                end_date = pd.to_datetime(data.iloc[-1].name)
                if traceDebug:
                    print("start_date: ")
                    print(start_date)
                # The auto_close date is the day after the last trade.
                ac_date = end_date + pd.Timedelta(days=1)

                sid = row['sid']
                if traceDebug:
                    # The placeholders were previously printed verbatim
                    # because the values were passed as extra print args.
                    print("sid-{}:symbol-{}".format(sid, symbol))
                    print("start_date", type(start_date), start_date)
                    print("end_date", type(end_date), end_date)
                    print("ac_date", type(ac_date), ac_date)

                metadata.iloc[sid] = start_date, end_date, ac_date, symbol
                new_index = ['open', 'high', 'low', 'close', 'volume']
                data_df = data.reindex(columns=new_index,
                                       copy=False)

                # Align to the calendar sessions; gaps become 0.0.
                sessions = calendar.sessions_in_range(start_date, end_date)
                data_df = data_df.reindex(
                    sessions.tz_localize(None),
                    copy=False,
                ).fillna(0.0)

                yield sid, data_df
Пример #20
0
def ingest(environ,
           asset_db_writer,
           minute_bar_writer,
           daily_bar_writer,
           adjustment_writer,
           calendar,
           start_session,
           end_session,
           cache,
           show_progress,
           output_dir):
    """Ingest daily bars for a fixed symbol list through tushare.

    For each symbol the bars between ``start_session`` and ``end_session``
    are requested with ``ts.pro_bar`` (``adj='qfq'``), reduced to a
    day-indexed open/high/low/close/volume frame and written with
    ``daily_bar_writer``.  The collected metadata is written as equities
    with the exchange set to 'SSE', and ``adjustment_writer.write`` is
    called with ``None``.

    ``environ``, ``minute_bar_writer``, ``calendar``, ``cache`` and
    ``output_dir`` are not referenced in this function.
    """
    symbols = ['600019.SH']

    pro_api = ts.pro_api(tushare_token)

    metadata_dtype = [
        ('start_date', 'datetime64[ns]'),
        ('end_date', 'datetime64[ns]'),
        ('auto_close_date', 'datetime64[ns]'),
        ('symbol', 'object'),
    ]
    metadata = DataFrame(empty(len(symbols), dtype=metadata_dtype))

    progress_label = 'Loading CN A %s pricing data: ' % symbols
    with maybe_show_progress(symbols, show_progress,
                             label=progress_label) as progress:
        for sid, symbol in enumerate(progress):
            bars = ts.pro_bar(
                pro_api=pro_api,
                ts_code=symbol,
                asset='E',
                start_date=start_session.strftime('%Y%m%d'),
                end_date=end_session.strftime('%Y%m%d'),
                adj='qfq',
            )

            # Derive the column names expected further down from the
            # tushare ones.
            bars['day'] = pd.to_datetime(bars['trade_date'])
            bars['volume'] = bars['vol']
            bars['id'] = bars['ts_code']

            wanted_columns = ['day', 'open', 'high', 'low', 'close', 'volume']
            bars = (bars.filter(items=wanted_columns)
                        .set_index('day')
                        .sort_index())

            first_day = bars.index[0]
            last_day = bars.index[-1]
            if first_day > last_day:
                last_day = first_day

            # Positions are auto-closed the day after the last trade.
            auto_close = last_day + Timedelta(days=1)
            metadata.iloc[sid] = first_day, last_day, auto_close, symbol

            daily_bar_writer.write([(sid, bars)],
                                   show_progress=show_progress)

    metadata['exchange'] = 'SSE'
    asset_db_writer.write(equities=metadata)
    adjustment_writer.write(None)
Пример #21
0
        def _pricing_iter():
            """Yield ``(sid, frame)`` for each entry of ``symbols``.

            The price frame is taken from ``cache`` or, on a ``KeyError``,
            fetched with ``get_data(symbol, start, end)`` and cached.  Rows
            with non-positive ``Volume`` are dropped, the frame is reindexed
            onto the entries of ``trading_days`` on or after the first
            remaining date, missing volume becomes 0 and the other gaps are
            forward-filled.  Columns are renamed to lower case before the
            frame is yielded.

            Side effect: ``metadata.iloc[sid]`` is set to
            ``(start_date, end_date, auto_close_date, symbol)``; sids are
            assigned in iteration order starting at 0.
            """
            with maybe_show_progress(
                    symbols,
                    show_progress,
                    label='Downloading Yahoo pricing data: ') as it:
                for sid, symbol in enumerate(it):
                    # Function-call form is valid on both Python 2 and 3;
                    # the print statement is a syntax error on Python 3.
                    print(symbol)
                    path = _cachpath(symbol, 'ohlcv')
                    try:
                        df = cache[path]
                    except KeyError:
                        df = cache[path] = get_data(
                            symbol,
                            start,
                            end
                        )

                    # the start date is the date of the first trade and
                    # the end date is the date of the last trade
                    df = df[df.Volume > 0]
                    start_date = df.index[0]
                    end_date = df.index[-1]
                    df = df.reindex(trading_days[(trading_days >= start_date)])
                    df.Volume = df.Volume.fillna(0)
                    df = df.ffill()
                    # The auto_close date is the day after the last trade.
                    ac_date = end_date + pd.Timedelta(days=1)
                    metadata.iloc[sid] = start_date, end_date, ac_date, symbol

                    df.rename(
                        columns={
                            'Open': 'open',
                            'High': 'high',
                            'Low': 'low',
                            'Close': 'close',
                            'Volume': 'volume',
                        },
                        inplace=True,
                    )
                    yield sid, df
                    sid += 1
Пример #22
0
 def _read_and_convert(self, calendar, show_progress):
     """Generate ``(symbol_index, price_frame)`` pairs for ``self._symbols``.

     Each symbol is fetched through ``self._downloader``; when
     ``self._filter`` is set its return value replaces the fetched frame.
     ``self._update_symbol_metadata`` is called for every symbol before the
     pair is yielded.  ``calendar`` is not referenced here.
     """
     with maybe_show_progress(
             self._symbols,
             show_progress,
             label='Downloading from {}: '.format(self._exchange)) as progress:
         for position, ticker in enumerate(progress):
             frame = self._downloader(ticker)
             # The filter is optional.
             if self._filter is not None:
                 frame = self._filter(frame)
             self._update_symbol_metadata(position, ticker, frame)
             yield position, frame
Пример #23
0
def _pricing_iter(mongo_db, symbols, metadata, divs_splits, show_progress):
    """Yield ``(sid, DataFrame)`` pairs read from per-symbol collections.

    For every entry of ``symbols`` the object ``mongo_db[symbol]`` is passed
    to ``read_mongo``; the result is sorted by index and yielded together
    with the symbol's position in ``symbols`` (used as the sid).

    Side effects:
      * ``metadata.iloc[sid]`` is set to
        ``(start_date, end_date, auto_close_date, symbol)``.
      * When the frame has a ``split`` column, one row per value different
        from 1.0 is added to ``divs_splits['splits']``.
      * When the frame has a ``dividend`` column, one row per non-zero value
        is added to ``divs_splits['divs']``.
    """
    # DataFrame.append was removed in pandas 2.0; concat is the equivalent
    # that works across pandas versions.
    from pandas import concat

    with maybe_show_progress(symbols,
                             show_progress,
                             label='Loading custom pricing data: ') as it:
        for sid, symbol in enumerate(it):
            logger.debug('%s: sid %s' % (symbol, sid))
            collector = mongo_db[symbol]

            dfr = read_mongo(collector).sort_index()

            start_date = dfr.index[0]
            end_date = dfr.index[-1]

            # The auto_close date is the day after the last trade.
            ac_date = end_date + Timedelta(days=1)
            metadata.iloc[sid] = start_date, end_date, ac_date, symbol

            if 'split' in dfr.columns:
                # Stored ratio is the inverse of the value found in the data.
                tmp = 1. / dfr[dfr['split'] != 1.0]['split']
                split = DataFrame(data=tmp.index.tolist(),
                                  columns=['effective_date'])
                split['ratio'] = tmp.tolist()
                split['sid'] = sid

                splits = divs_splits['splits']
                # Continue the integer index where the accumulated frame ends.
                index = Index(
                    range(splits.shape[0], splits.shape[0] + split.shape[0]))
                split.set_index(index, inplace=True)
                divs_splits['splits'] = concat([splits, split])

            if 'dividend' in dfr.columns:
                # ex_date   amount  sid record_date declared_date pay_date
                tmp = dfr[dfr['dividend'] != 0.0]['dividend']
                div = DataFrame(data=tmp.index.tolist(), columns=['ex_date'])
                div['record_date'] = NaT
                div['declared_date'] = NaT
                div['pay_date'] = NaT
                div['amount'] = tmp.tolist()
                div['sid'] = sid

                divs = divs_splits['divs']
                ind = Index(range(divs.shape[0], divs.shape[0] + div.shape[0]))
                div.set_index(ind, inplace=True)
                divs_splits['divs'] = concat([divs, div])

            yield sid, dfr
Пример #24
0
    def create_equities_df(df, tickers, sessions, sharadar_metadata_df,
                           show_progress):
        """Build the equities metadata frame, one row per ticker.

        The returned frame has the ``METADATA_HEADERS`` columns and is
        indexed by sid.  The sid is the first value of the ``sid`` index
        level of the ticker's rows in ``df``; the descriptive fields come
        from the row of ``sharadar_metadata_df`` whose ``permaticker``
        equals that sid.  ``synch_to_calendar`` is called for every ticker
        with the first and last ``date`` index values of its rows.
        """
        # Empty frame to be filled row by row; its index is the sid.
        equities_df = pd.DataFrame(columns=METADATA_HEADERS)
        with maybe_show_progress(tickers,
                                 show_progress,
                                 label='Loading custom pricing data: ') as it:
            for ticker in it:
                ticker_rows = df[df['ticker'] == ticker].sort_index()
                sid = ticker_rows.index.get_level_values('sid')[0]

                matches = sharadar_metadata_df[
                    sharadar_metadata_df['permaticker'] == sid]
                info = matches.iloc[0, :]

                asset_name = info.loc['name']
                # First and last price dates as recorded in the metadata.
                start_date = info.loc['firstpricedate']
                end_date = info.loc['lastpricedate']
                # The first-traded date is taken to be the start date.
                first_traded = start_date
                # Positions are closed the day after the last price date.
                auto_close_date = end_date + pd.Timedelta(days=1)

                # Exchange name; a missing value (None or the string 'None')
                # is replaced by 'OTC'.
                exchange = info.loc['exchange']
                if exchange is None or exchange == 'None':
                    exchange = 'OTC'

                # Synch to the official exchange calendar, if necessary.
                dates = ticker_rows.index.get_level_values('date')
                synch_to_calendar(sessions, dates[0], dates[-1],
                                  ticker_rows, df)

                # Add this ticker's row to the metadata frame.
                equities_df.loc[sid] = (
                    ticker,
                    asset_name,
                    start_date,
                    end_date,
                    first_traded,
                    auto_close_date,
                    exchange,
                )
        return equities_df
Пример #25
0
        def _pricing_iter():
            """Yield ``(sid, ohlcv_frame)`` for each entry of ``symbols``.

            A frame is taken from ``cache`` or, on a ``KeyError``, read with
            ``pd.read_sql`` from the table named after the symbol and stored
            in the cache.  It is reduced to open/high/low/close/volume and
            reindexed onto the calendar sessions between its first and last
            date, with missing values filled with 0.0.

            Side effect: ``metadata.iloc[sid]`` is set to
            ``(start_date, end_date, auto_close_date, symbol)``; sids are
            assigned in iteration order starting at 0.
            """
            with maybe_show_progress(
                    symbols,
                    show_progress,
                    label='Fetch stocks pricing data from db: ') as it, \
                    requests.Session() as session:
                for sid, symbol in enumerate(it):
                    path = _cachpath(symbol, 'ohlcv')
                    try:
                        df = cache[path]
                    except KeyError:
                        query = "select * from '%s' order by date desc" % symbol
                        df = pd.read_sql(
                            sql=query,
                            con=conn,
                            index_col='date',
                            parse_dates=['date']).sort_index()
                        cache[path] = df
                        if boDebug:
                            print("read_sqllite df", type(df), "length",
                                  len(df))

                    # First and last trade dates bound the asset's lifetime;
                    # auto-close is the day after the last trade.
                    start_date, end_date = df.index[0], df.index[-1]
                    ac_date = end_date + pd.Timedelta(days=1)
                    if boDebug:
                        print("start_date", type(start_date), start_date)
                        print("end_date", type(end_date), end_date)
                        print("ac_date", type(ac_date), ac_date)

                    metadata.iloc[sid] = start_date, end_date, ac_date, symbol

                    ohlcv_columns = ['open', 'high', 'low', 'close', 'volume']
                    df = df.reindex(columns=ohlcv_columns, copy=False)

                    # Align to the calendar sessions; gaps become 0.0.
                    sessions = calendar.sessions_in_range(start_date, end_date)
                    naive_sessions = sessions.tz_localize(None)
                    df = df.reindex(naive_sessions, copy=False).fillna(0.0)

                    yield sid, df
Пример #26
0
def _pricing_iter(metadata, symbols, show_progress, start_session, end_session,
                  cache):
    """Yield ``(sid, bars)`` once per symbol and per day of the range.

    Days are generated by ``pd.date_range(start_session, end_session,
    freq='D', closed='left')``.  The value for a ``'<symbol>-<YYYY-MM-DD>'``
    key is produced with ``_get_minute_bar`` only when the key is not
    already in ``cache``.  ``_get_metadata`` is called once per symbol
    before its days are processed; sids follow the order of ``symbols``
    starting at 0.
    """
    with maybe_show_progress(symbols,
                             show_progress,
                             label='BitMex pricing data: ') as progress:
        for sid, symbol in enumerate(progress):
            _get_metadata(sid, symbol, metadata)
            days = pd.date_range(start_session,
                                 end_session,
                                 freq='D',
                                 closed='left')
            for day in days:
                cache_key = symbol + '-' + day.strftime("%Y-%m-%d")
                if cache_key not in cache:
                    # Fetch only the days that have not been cached yet.
                    cache[cache_key] = _get_minute_bar(symbol, day)
                yield sid, cache[cache_key]
Пример #27
0
        def _pricing_iter():
            """Yield ``(sid, frame)`` read from ``<csvdir>/<symbol>.csv``.

            The first CSV column is parsed as dates and used as the index,
            and the frame is sorted by it.  ``metadata.iloc[sid]`` is set to
            ``(start_date, end_date, auto_close_date, symbol)``; the sid is
            the symbol's position in ``symbols``.
            """
            with maybe_show_progress(symbols, show_progress,
                                     label='Loading custom pricing data: ') as progress:
                for sid, symbol in enumerate(progress):
                    logger.debug('%s: sid %s' % (symbol, sid))

                    csv_path = os.path.join(csvdir, '%s.csv' % symbol)
                    raw = pandas.read_csv(csv_path,
                                          parse_dates=[0],
                                          infer_datetime_format=True,
                                          index_col=0)
                    df = raw.sort_index()

                    # First and last trade dates; positions are auto-closed
                    # the day after the last trade.
                    first_day, last_day = df.index[0], df.index[-1]
                    auto_close = last_day + pandas.Timedelta(days=1)
                    metadata.iloc[sid] = first_day, last_day, auto_close, symbol

                    yield sid, df
Пример #28
0
        def _pricing_iter():
            """Yield ``(sid, frame)`` for each ticker in ``symbols``.

            A frame is taken from ``cache`` or, on a ``KeyError``, fetched
            with ``DataReader`` from the 'yahoo' source for
            ``start_session``..``end_session``, sorted and cached.  Its
            columns are renamed in place to lower-case names, with
            'Adj Close' becoming 'price'.

            Side effect: ``metadata.iloc[sid]`` is set to
            ``(start_date, end_date, auto_close_date, ticker)``; sids are
            assigned in iteration order starting at 0.
            """
            renamed_columns = {
                'Open': 'open',
                'High': 'high',
                'Low': 'low',
                'Close': 'close',
                'Volume': 'volume',
                'Adj Close': 'price',
            }
            with maybe_show_progress(
                    symbols,
                    show_progress,
                    label='Downloading Yahoo pricing data: ') as it, \
                    requests.Session() as session:
                for sid, ticker in enumerate(it):
                    path = _cache_path(ticker, 'ohlcv')
                    try:
                        df = cache[path]
                    except KeyError:
                        df = DataReader(
                            ticker,
                            'yahoo',
                            start_session,
                            end_session,
                            session=session,
                        ).sort_index()
                        cache[path] = df

                    # First and last trade dates; positions are auto-closed
                    # the day after the last trade.
                    first_day, last_day = df.index[0], df.index[-1]
                    auto_close = last_day + pd.Timedelta(days=1)
                    metadata.iloc[sid] = first_day, last_day, auto_close, ticker

                    # In-place so the object held by the cache is renamed too.
                    df.rename(columns=renamed_columns, inplace=True)
                    yield sid, df
Пример #29
0
        def _pricing_iter():
            """Yield ``(sid, frame)`` for each entry of ``symbols``.

            A frame is taken from ``cache`` or, on a ``KeyError``, fetched
            with ``get_historical_data(symbol, start=start, end=None,
            output_format='pandas')``, sorted and cached.  Its index is
            converted with ``pd.to_datetime`` and its columns are renamed in
            place to lower-case names.

            Side effect: ``metadata.iloc[sid]`` is set to
            ``(start_date, end_date, auto_close_date, symbol)``; sids are
            assigned in iteration order starting at 0.
            """
            renamed_columns = {
                'Open': 'open',
                'High': 'high',
                'Low': 'low',
                'Close': 'close',
                'Volume': 'volume',
            }
            with maybe_show_progress(
                    symbols,
                    show_progress,
                    label='Downloading IEX pricing data: ') as it, \
                    requests.Session() as session:
                for sid, symbol in enumerate(it):
                    path = _cachpath(symbol, 'ohlcv')
                    try:
                        df = cache[path]
                    except KeyError:
                        df = get_historical_data(
                            symbol,
                            start=start,
                            end=None,
                            output_format='pandas').sort_index()
                        cache[path] = df
                    df.index = pd.to_datetime(df.index)

                    # First and last trade dates; positions are auto-closed
                    # the day after the last trade.
                    first_day, last_day = df.index[0], df.index[-1]
                    auto_close = last_day + pd.Timedelta(days=1)
                    metadata.iloc[sid] = first_day, last_day, auto_close, symbol

                    # In-place so the object held by the cache is renamed too.
                    df.rename(columns=renamed_columns, inplace=True)
                    yield sid, df
Пример #30
0
def insert_daily_metrics(sharadar_metadata_df, daily_df, cursor, show_progress=True):
    """Insert every non-NaN daily metric into ``equity_supplementary_mappings``.

    For each distinct ticker of ``daily_df`` the rows are indexed by
    ``date``, sorted newest first, and stripped of the ``ticker`` and
    ``lastupdated`` columns.  Each remaining non-NaN cell is written with
    ``cursor.execute`` as ``(sid, field, date.value, str(value))``, where
    the sid comes from ``lookup_sid``.
    """
    insert_sql = "INSERT OR REPLACE INTO equity_supplementary_mappings (sid, field, start_date, end_date, value) VALUES(?, ?, ?, -1, ?)"

    tickers = daily_df['ticker'].unique()
    # Pad relatedtickers with a space on both sides so a lookup can search
    # for ' TICKER '.
    related_tickers = ' ' + sharadar_metadata_df['relatedtickers'].dropna().astype(str) + ' '

    with maybe_show_progress(tickers, show_progress, label='Parsing fundamental data: ') as progress:
        for ticker in progress:
            metrics = (daily_df[daily_df['ticker'] == ticker]
                       .set_index('date')
                       .sort_index(ascending=False)
                       .drop(['ticker', 'lastupdated'], axis=1))

            sid = lookup_sid(sharadar_metadata_df, related_tickers, ticker)

            for date, row in metrics.iterrows():
                for field in row.index:
                    value = row[field]
                    if not np.isnan(value):
                        # end_date is not used; the statement stores -1.
                        cursor.execute(insert_sql,
                                       (sid, field, date.value, str(value)))
Пример #31
0
        def _pricing_iter():
            """Yield ``(sid, frame)`` for each entry of ``symbols``.

            A frame is taken from ``cache`` or, on a ``KeyError``, loaded
            with ``load_yahoo_csv`` from a fixed test-resource path (the
            same file for every symbol) and cached.  Its columns are renamed
            in place: Open -> adjusted_close, High -> ask, Low -> bid,
            Close -> close, Volume -> volume.

            Side effect: ``metadata.iloc[sid]`` is set to
            ``(start_date, end_date, auto_close_date, symbol)``; sids are
            assigned in iteration order starting at 0.
            """
            renamed_columns = {
                'Open': 'adjusted_close',
                'High': 'ask',
                'Low': 'bid',
                'Close': 'close',
                'Volume': 'volume',
            }
            with maybe_show_progress(
                    symbols,
                    show_progress,
                    label='Downloading Yahoo pricing data: ') as it, \
                    requests.Session() as session:
                for sid, symbol in enumerate(it):
                    path = _cachpath(symbol, 'csv')
                    try:
                        df = cache[path]
                    except KeyError:
                        df = load_yahoo_csv(
                            r"/mnt/hgfs/595P/project_Z/tests/resources/yahoo-test.csv",
                            identifier_col="Date")
                        cache[path] = df

                    # First and last trade dates; positions are auto-closed
                    # the day after the last trade.
                    first_day, last_day = df.index[0], df.index[-1]
                    auto_close = last_day + pd.Timedelta(days=1)
                    metadata.iloc[sid] = first_day, last_day, auto_close, symbol

                    # In-place so the object held by the cache is renamed too.
                    df.rename(columns=renamed_columns, inplace=True)
                    yield sid, df
Пример #32
0
 def _read_and_convert(self, symbols, show_progress):
     """Generate ``(symbol_index, price_frame)`` pairs from per-symbol CSVs.

     Files are looked up as ``<csvdir>/<symbol>.csv`` under the directory
     returned by ``self._get_csvdir``.  Symbols without a file are skipped
     silently and yield nothing, but still consume their position in the
     enumeration.
     """
     csv_dir = self._get_csvdir(show_progress)
     with maybe_show_progress(symbols,
                              show_progress,
                              label='Loading csv files: ') as progress:
         for position, ticker in enumerate(progress):
             csv_file = '{0}/{1}.csv'.format(csv_dir, ticker)
             if not os.path.exists(csv_file):
                 continue

             # Load the file, index it by the configured column and sort.
             # NOTE(review): the dtype key is spelled "Volumn" - confirm
             # that it matches the actual CSV header.
             frame = pd.read_csv(csv_file,
                                 index_col=self._index_column,
                                 parse_dates=True,
                                 dtype={
                                     "Volumn": np.uint64
                                 }).sort_index()
             # Column renaming is optional.
             if self._column_mapper:
                 frame.rename(columns=self._column_mapper, inplace=True)
             # The filter's return value is not used here.
             self._filter(frame)
             self._update_symbol_metadata(position, ticker, frame)
             yield position, frame
Пример #33
0
    def ingest(
            environ,
            asset_db_writer,
            minute_bar_writer,  # unused
            daily_bar_writer,
            adjustment_writer,
            fundamental_writer,
            calendar,
            start_session,
            end_session,
            cache,
            show_progress,
            output_dir,
            # pass these as defaults to make them 'nonlocal' in py2
            start=start,
            end=end):
        """Ingest Yahoo daily bars, equity metadata, splits and dividends.

        Steps, in order:
          1. Daily bars for every entry of ``symbols`` are written with
             ``daily_bar_writer``; consuming the pricing generator also
             fills ``metadata``.
          2. ``metadata`` is written as equities with exchange "YAHOO".
          3. 'yahoo-actions' data is collected per symbol, split into
             SPLIT and DIVIDEND rows and written with ``adjustment_writer``.

        ``start`` falls back to ``start_session`` when it is None.
        ``environ``, ``minute_bar_writer``, ``fundamental_writer``,
        ``calendar``, ``end_session`` and ``output_dir`` are not referenced
        in this function.
        """
        if start is None:
            start = start_session
        # NOTE(review): ``end`` has no fallback - a None value is handed to
        # DataReader unchanged.  Confirm whether ``end_session`` was meant.

        metadata_dtype = [
            ('start_date', 'datetime64[ns]'),
            ('end_date', 'datetime64[ns]'),
            ('auto_close_date', 'datetime64[ns]'),
            ('symbol', 'object'),
        ]
        metadata = pd.DataFrame(np.empty(len(symbols), dtype=metadata_dtype))

        def _pricing_iter():
            # Yields (sid, frame) per symbol and records each symbol's
            # metadata row as a side effect.
            renamed_columns = {
                'Open': 'open',
                'High': 'high',
                'Low': 'low',
                'Close': 'close',
                'Volume': 'volume',
            }
            with maybe_show_progress(
                    symbols,
                    show_progress,
                    label='Downloading Yahoo pricing data: ') as it, \
                    requests.Session() as session:
                for sid, symbol in enumerate(it):
                    path = _cachpath(symbol, 'ohlcv')
                    try:
                        df = cache[path]
                    except KeyError:
                        df = DataReader(
                            symbol,
                            'yahoo',
                            start,
                            end,
                            session=session,
                        ).sort_index()
                        cache[path] = df

                    # First and last trade dates; positions are auto-closed
                    # the day after the last trade.
                    first_day, last_day = df.index[0], df.index[-1]
                    auto_close = last_day + pd.Timedelta(days=1)
                    metadata.iloc[sid] = first_day, last_day, auto_close, symbol

                    # In-place so the object held by the cache is renamed too.
                    df.rename(columns=renamed_columns, inplace=True)
                    yield sid, df

        # Must run before ``metadata`` is read below: the writer drains the
        # generator, which is what populates ``metadata``.
        daily_bar_writer.write(_pricing_iter(), show_progress=show_progress)

        # Maps symbol -> sid (the positional index of the metadata row).
        symbol_map = pd.Series(metadata.symbol.index, metadata.symbol)

        # Hardcode the exchange to "YAHOO" for all assets and (elsewhere)
        # register "YAHOO" to resolve to the NYSE calendar, because these are
        # all equities and thus can use the NYSE calendar.
        metadata['exchange'] = "YAHOO"
        asset_db_writer.write(equities=metadata)

        adjustments = []
        with maybe_show_progress(
                symbols,
                show_progress,
                label='Downloading Yahoo adjustment data: ') as it, \
                requests.Session() as session:
            for symbol in it:
                path = _cachpath(symbol, 'adjustment')
                try:
                    actions = cache[path]
                except KeyError:
                    actions = DataReader(
                        symbol,
                        'yahoo-actions',
                        start,
                        end,
                        session=session,
                    ).sort_index()
                    cache[path] = actions

                actions['sid'] = symbol_map[symbol]
                adjustments.append(actions)

        adj_df = pd.concat(adjustments)
        adj_df.index.name = 'date'
        adj_df.reset_index(inplace=True)

        split_rows = adj_df[adj_df.action == 'SPLIT']
        splits = split_rows.rename(
            columns={'value': 'ratio', 'date': 'effective_date'},
        ).drop('action', axis=1)

        dividend_rows = adj_df[adj_df.action == 'DIVIDEND']
        dividends = dividend_rows.rename(
            columns={'value': 'amount', 'date': 'ex_date'},
        ).drop('action', axis=1)
        # we do not have this data in the yahoo dataset
        for missing_column in ('record_date', 'declared_date', 'pay_date'):
            dividends[missing_column] = pd.NaT

        adjustment_writer.write(splits=splits, dividends=dividends)
Пример #34
0
def _pricing_iter(csvdir, symbols, metadata, divs_splits, show_progress):
    """Yield ``(sid, frame)`` pairs read from per-symbol CSV files.

    For every symbol, ``<symbol>.csv`` is read from ``csvdir`` with its first
    column parsed as the date index, and the frame is sorted by that index.
    The sid of a symbol is its position in ``symbols``.

    Side effects:
        * ``metadata.iloc[sid]`` is set to
          ``(start_date, end_date, auto_close_date, symbol)``.
        * If the file has a ``split`` column, the rows where it differs from
          1.0 are appended to ``divs_splits['splits']`` with
          ``ratio = 1 / split``.
        * If the file has a ``dividend`` column, the rows where it differs
          from 0.0 are appended to ``divs_splits['divs']``.

    Raises:
        ValueError: if ``<symbol>.csv`` is not present in ``csvdir``.
    """
    # DataFrame.append was removed in pandas 2.0; concat behaves the same on
    # every pandas version.
    from pandas import concat

    with maybe_show_progress(symbols,
                             show_progress,
                             label='Loading custom pricing data: ') as it:
        # List the directory once; only an exact file-name match counts, so
        # 'A.csv' is never confused with 'AA.csv'.
        files = set(os.listdir(csvdir))
        for sid, symbol in enumerate(it):
            logger.debug('%s: sid %s' % (symbol, sid))

            fname = '%s.csv' % symbol
            if fname not in files:
                raise ValueError("%s.csv file is not in %s" % (symbol, csvdir))

            dfr = read_csv(os.path.join(csvdir, fname),
                           parse_dates=[0],
                           infer_datetime_format=True,
                           index_col=0).sort_index()

            start_date = dfr.index[0]
            end_date = dfr.index[-1]

            # The auto_close date is the day after the last trade.
            ac_date = end_date + Timedelta(days=1)
            metadata.iloc[sid] = start_date, end_date, ac_date, symbol

            if 'split' in dfr.columns:
                tmp = 1. / dfr[dfr['split'] != 1.0]['split']
                split = DataFrame(data=tmp.index.tolist(),
                                  columns=['effective_date'])
                split['ratio'] = tmp.tolist()
                split['sid'] = sid

                splits = divs_splits['splits']
                # Number the new rows after the ones already collected.
                split.index = Index(
                    range(splits.shape[0], splits.shape[0] + split.shape[0]))
                divs_splits['splits'] = concat([splits, split])

            if 'dividend' in dfr.columns:
                # ex_date   amount  sid record_date declared_date pay_date
                tmp = dfr[dfr['dividend'] != 0.0]['dividend']
                div = DataFrame(data=tmp.index.tolist(), columns=['ex_date'])
                div['record_date'] = NaT
                div['declared_date'] = NaT
                # The CSV carries no pay date; the ex-date is used instead of
                # NaT (the original author notes this "fixed the problem").
                div['pay_date'] = div['ex_date']
                div['amount'] = tmp.tolist()
                div['sid'] = sid

                divs = divs_splits['divs']
                # Number the new rows after the ones already collected.
                div.index = Index(
                    range(divs.shape[0], divs.shape[0] + div.shape[0]))
                divs_splits['divs'] = concat([divs, div])

            yield sid, dfr
Пример #35
0
        def _pricing_iter():
            """Yield ``(sid, frame)`` pairs of daily bars, one per symbol.

            Each symbol is looked up in ``cache`` first. On a miss it is
            downloaded from Yahoo (``.ss`` suffix when the symbol starts with
            '6', ``.sz`` otherwise); if that fails it is downloaded from
            tushare; if that fails too the process exits via ``sys.exit()``.

            For every symbol, cached or freshly downloaded, the matching row
            of ``metadata`` is set to
            ``(start_date, end_date, auto_close_date, symbol)``, the columns
            are normalised to lowercase OHLCV names, and the frame is
            reindexed onto the calendar sessions between its first and last
            row with missing values filled with 0.0.
            """
            sid = 0
            with maybe_show_progress(
                    symbols,
                    show_progress,
                    label='Downloading Yahoo pricing data: ') as it, \
                    requests.Session() as session:
                for symbol in it:
                    path = _cachpath(symbol, 'ohlcv')
                    try:
                        df = cache[path]
                    except KeyError:
                        provider = "yahoo"
                        try:
                            print("To Download symbol:", symbol, path)
                            suffix = '.ss' if symbol.startswith('6') else '.sz'
                            df = DataReader(
                                name=symbol + suffix,
                                data_source='yahoo',
                                start=start,
                                end=end,
                                retry_count=1,
                                session=session,
                            )
                            # Check before sorting so the intended error is
                            # raised instead of an AttributeError on None.
                            if df is None:  # FIXIT timeout maybe
                                raise Exception("Empty Result!", symbol)
                            df = cache[path] = df.sort_index()
                        except Exception as e:
                            print(
                                'Got a Exception - reason "%s" for stock(%s) in yahoo, try tushare'
                                % (str(e), symbol))
                            # Imported lazily: only needed as a fallback.
                            import tushare as ts
                            try:
                                df = ts.get_h_data(
                                    symbol,
                                    start=(start.strftime("%Y-%m-%d")
                                           if start is not None else None),
                                    end=(end.strftime("%Y-%m-%d")
                                         if end is not None else None),
                                    retry_count=5,
                                    pause=1)
                                if df is None:  # FIXIT timeout maybe
                                    raise Exception("Empty Result!", symbol)
                                df = cache[path] = df.sort_index()
                                provider = 'tushare'
                            except Exception as e1:
                                print(
                                    'Got a Exception - reason "%s" for stock(%s) in tushare, ignore it'
                                    % (str(e1), symbol))
                                sys.exit()

                        print("Got stock(%s) from provide(%s)" %
                              (symbol, provider))
                    else:
                        # Cache hit: the cache does not record which provider
                        # produced the frame, so infer it from the columns
                        # (only Yahoo frames are renamed from 'Adj Close').
                        # NOTE(review): assumes Yahoo frames always carry an
                        # 'Adj Close' column - confirm.
                        provider = ('yahoo' if 'Adj Close' in df.columns
                                    else 'tushare')

                    # The steps below run for cached frames too; previously
                    # they were nested in the ``except KeyError`` branch, so a
                    # cached symbol was never yielded and ``sid`` never moved.

                    # the start date is the date of the first trade and
                    # the end date is the date of the last trade
                    start_date = df.index[0]
                    end_date = df.index[-1]
                    # The auto_close date is the day after the last trade.
                    ac_date = end_date + pd.Timedelta(days=1)
                    metadata.iloc[sid] = start_date, end_date, ac_date, symbol

                    if provider == 'tushare':
                        df = df.reindex(
                            columns=['open', 'high', 'low', 'close', 'volume'],
                            copy=False)
                    else:
                        # Not in place, so the cached object is left untouched.
                        df = df.rename(
                            columns={
                                'Open': 'open',
                                'High': 'high',
                                'Low': 'low',
                                'Adj Close': 'close',
                                'Volume': 'volume',
                            },
                        )
                    sessions = calendar.sessions_in_range(
                        start_date, end_date)
                    df = df.reindex(
                        sessions.tz_localize(None),
                        copy=False,
                    ).fillna(0.0)
                    yield sid, df
                    sid += 1
Пример #36
0
def tiingo_metadata(tickers='ALL', asset_finder=None):
    """Build the asset metadata frame for NYSE/NASDAQ stocks known to tiingo.

    Args:
        tickers: ``'ALL'`` to keep every listed stock, a single ticker
            string, or a list-like of ticker strings to restrict the result.
        asset_finder: passed through to ``asset_to_sid_map``.

    Returns:
        A ``(tickers_df, assets_to_sids)`` tuple. ``tickers_df`` is indexed by
        sid and has the columns ``symbol``, ``exchange``, ``start_date``,
        ``end_date``, ``first_traded`` and ``auto_close_date`` (the day after
        ``end_date``). ``assets_to_sids`` is whatever ``asset_to_sid_map``
        returned for the symbols.
    """
    tickers_df = pd.DataFrame(CLIENT.list_stock_tickers())

    listed_stock = (tickers_df['exchange'].isin(['NYSE', 'NASDAQ'])
                    & (tickers_df['assetType'] == 'Stock'))
    # Copy so the column assignments below never write into a slice.
    tickers_df = tickers_df.loc[listed_stock].copy()

    # Compare against 'ALL' only for strings: ``array != 'ALL'`` is
    # element-wise and cannot be used as a condition.
    select_all = isinstance(tickers, str) and tickers == 'ALL'
    if not select_all:
        wanted = [tickers] if isinstance(tickers, str) else tickers
        tickers_df = tickers_df.loc[tickers_df['ticker'].isin(wanted)].copy()

    tickers_df['startDate'] = pd.to_datetime(tickers_df['startDate'])
    tickers_df['endDate'] = pd.to_datetime(tickers_df['endDate'])
    tickers_df = tickers_df.dropna()

    # we currently don't support it when a symbol is held by more than
    # one security at a time. for the ones duplicated, we choose to
    # get the currently traded
    # Each repeated ticker is listed once, however many times it occurs, so
    # it is fetched and added back exactly once.
    repeated = tickers_df.loc[tickers_df.duplicated(subset=['ticker']),
                              'ticker'].unique().tolist()
    tickers_df = tickers_df[~tickers_df['ticker'].isin(repeated)]
    tickers_df = tickers_df.drop(columns=['assetType', 'priceCurrency'])

    ex_duplicates = []
    with maybe_show_progress(
            repeated,
            True,
            item_show_func=lambda e: e if e is None else str(e),
            label='Retrieving metadata for duplicate tickers: ') as it:

        for ticker in it:
            ticker_meta = CLIENT.get_ticker_metadata(ticker)
            ex_duplicates.append({
                'ticker': ticker_meta['ticker'],
                'exchange': ticker_meta['exchangeCode'],
                'startDate': pd.to_datetime(ticker_meta['startDate']),
                'endDate': pd.to_datetime(ticker_meta['endDate']),
            })

    if ex_duplicates:
        tickers_df = pd.concat([tickers_df, pd.DataFrame(ex_duplicates)])

    tickers_df = tickers_df.dropna().reset_index(drop=True)
    tickers_df = tickers_df.rename(columns={
        'ticker': 'symbol',
        'startDate': 'start_date',
        'endDate': 'end_date',
    })

    assets_to_sids = asset_to_sid_map(asset_finder,
                                      tickers_df['symbol'].values)

    tickers_df.index = pd.Index(
        [assets_to_sids[symbol] for symbol in tickers_df['symbol']],
        name='sid')

    tickers_df['first_traded'] = tickers_df['start_date']
    tickers_df['auto_close_date'] = tickers_df['end_date'] + Timedelta(days=1)

    return tickers_df, assets_to_sids
Пример #37
0
def _pricing_iter(symbols,
                  divs_splits,
                  show_progress,
                  metadata,
                  sids_written,
                  assets_to_sids=None):
    """Yield ``(sid, frame)`` pairs of daily tiingo prices, one per symbol.

    Args:
        symbols: ticker symbols to download.
        divs_splits: dict holding the ``'splits'`` and ``'divs'`` frames;
            rows found for each yielded symbol are appended to them.
        show_progress: forwarded to ``maybe_show_progress``.
        metadata: not used by this function.
        sids_written: list that receives the sid of every yielded symbol.
        assets_to_sids: mapping from symbol to sid.

    A symbol with no data, or whose download or processing raises, is
    reported on stdout and skipped: nothing is yielded for it and no splits
    or dividends are recorded. Dividend ``pay_date`` is taken as the tenth
    NYSE session after the ex-date; record and declared dates are set to the
    placeholder 1800-01-01.

    Raises:
        KeyError: if a symbol is missing from ``assets_to_sids``.
    """
    if assets_to_sids is None:
        assets_to_sids = {}

    start_date = pd.to_datetime('2000-1-1', utc=True)
    end_date = pd.to_datetime('today', utc=True) + Timedelta(days=20)

    cal: TradingCalendar = trading_calendars.get_calendar('NYSE')
    sessions = cal.sessions_in_range(start_date, end_date)

    # Placeholder written into the date columns that have no source value.
    placeholder_date = pd.to_datetime('1800-1-1')

    with maybe_show_progress(symbols,
                             show_progress,
                             item_show_func=lambda e: e
                             if e is None else str(e),
                             label='Loading tiingo pricing data: ') as it:

        for symbol in it:
            sid = assets_to_sids[symbol]
            split = None
            div = None

            try:
                df = pd.DataFrame(
                    CLIENT.get_ticker_price(symbol,
                                            fmt='json',
                                            startDate=start_date,
                                            frequency='daily'))

                if df.empty:
                    print(f'No data for {symbol}, skpping...')
                    continue

                df.index = pd.to_datetime(df['date'], utc=True)
                df.drop(columns=[
                    'date', 'adjOpen', 'adjHigh', 'adjLow', 'adjClose',
                    'adjVolume'
                ],
                        inplace=True)
                df.rename(columns={
                    'splitFactor': 'split',
                    'divCash': 'dividend'
                },
                          inplace=True)
                df = fill_daily_gaps(df)
                df = drop_extra_sessions(df)

                if 'split' in df.columns:
                    tmp = 1. / df[df['split'] != 1.0]['split']
                    split = DataFrame(data=tmp.index.tz_convert(None).tolist(),
                                      columns=['effective_date'])
                    split['ratio'] = tmp.tolist()
                    split['sid'] = sid

                if 'dividend' in df.columns:
                    # ex_date   amount  sid record_date declared_date pay_date
                    tmp = df[df['dividend'] != 0.0]['dividend']
                    div = DataFrame(data=tmp.index.tz_convert(None).tolist(),
                                    columns=['ex_date'])
                    div['record_date'] = placeholder_date
                    div['declared_date'] = placeholder_date
                    div['pay_date'] = [
                        sessions[sessions.get_loc(ex_date) +
                                 10].tz_convert(None)
                        for ex_date in div['ex_date']
                    ]
                    div['amount'] = tmp.tolist()
                    div['sid'] = sid

            except KeyboardInterrupt:
                exit()

            except Exception as e:
                print(f'\nException for symbol {symbol}')
                print(e)
                # Skip the symbol: ``df`` is unbound, half-processed or left
                # over from the previous symbol, so it must not be yielded
                # under this sid.
                continue

            # Record adjustments only once the whole symbol has succeeded, so
            # the adjustment tables never reference a sid that was skipped.
            if split is not None:
                splits = divs_splits['splits']
                split.index = Index(
                    range(splits.shape[0], splits.shape[0] + split.shape[0]))
                # pd.concat instead of DataFrame.append (removed in pandas 2.0)
                divs_splits['splits'] = pd.concat([splits, split])

            if div is not None:
                divs = divs_splits['divs']
                div.index = Index(
                    range(divs.shape[0], divs.shape[0] + div.shape[0]))
                divs_splits['divs'] = pd.concat([divs, div])

            sids_written.append(sid)
            yield sid, df
Пример #38
0
    def ingest(environ,
               asset_db_writer,
               minute_bar_writer,  # unused
               daily_bar_writer,
               adjustment_writer,
               calendar,
               cache,
               show_progress,
               output_dir,
               # pass these as defaults to make them 'nonlocal' in py2
               start=start,
               end=end):
        """Download Yahoo pricing and adjustment data and write the bundle.

        For each of the enclosing scope's ``symbols`` the daily bars are
        fetched through ``DataReader`` (or taken from ``cache``) and written
        with ``daily_bar_writer``. The per-symbol metadata is then written
        with ``asset_db_writer``, and the 'yahoo-actions' data is split into
        splits and dividends and written with ``adjustment_writer``.

        ``start`` defaults to ``calendar[0]`` when None. ``end`` is passed to
        ``DataReader`` unchanged, None included.
        """
        if start is None:
            start = calendar[0]

        # One row per symbol, filled in by _pricing_iter as data arrives.
        metadata = pd.DataFrame(np.empty(len(symbols), dtype=[
            ('start_date', 'datetime64[ns]'),
            ('end_date', 'datetime64[ns]'),
            ('auto_close_date', 'datetime64[ns]'),
            ('symbol', 'object'),
        ]))

        def _pricing_iter():
            """Yield ``(sid, frame)`` per symbol and fill in ``metadata``."""
            sid = 0
            with maybe_show_progress(
                    symbols,
                    show_progress,
                    label='Downloading Yahoo pricing data: ') as it, \
                    requests.Session() as session:
                for symbol in it:
                    path = _cachpath(symbol, 'ohlcv')
                    try:
                        df = cache[path]
                    except KeyError:
                        df = cache[path] = DataReader(
                            symbol,
                            'yahoo',
                            start,
                            end,
                            session=session,
                        ).sort_index()

                    # the start date is the date of the first trade and
                    # the end date is the date of the last trade
                    start_date = df.index[0]
                    end_date = df.index[-1]
                    # The auto_close date is the day after the last trade.
                    ac_date = end_date + pd.Timedelta(days=1)
                    metadata.iloc[sid] = start_date, end_date, ac_date, symbol

                    df.rename(
                        columns={
                            'Open': 'open',
                            'High': 'high',
                            'Low': 'low',
                            'Close': 'close',
                            'Volume': 'volume',
                        },
                        inplace=True,
                    )
                    yield sid, df
                    sid += 1

        # Honour the caller's flag instead of always showing progress.
        daily_bar_writer.write(_pricing_iter(), show_progress=show_progress)

        # symbol -> sid lookup (the sid is the row position in metadata)
        symbol_map = pd.Series(metadata.symbol.index, metadata.symbol)
        asset_db_writer.write(equities=metadata)

        adjustments = []
        with maybe_show_progress(
                symbols,
                show_progress,
                label='Downloading Yahoo adjustment data: ') as it, \
                requests.Session() as session:
            for symbol in it:
                path = _cachpath(symbol, 'adjustment')
                try:
                    df = cache[path]
                except KeyError:
                    df = cache[path] = DataReader(
                        symbol,
                        'yahoo-actions',
                        start,
                        end,
                        session=session,
                    ).sort_index()

                df['sid'] = symbol_map[symbol]
                adjustments.append(df)

        adj_df = pd.concat(adjustments)
        adj_df.index.name = 'date'
        adj_df.reset_index(inplace=True)

        splits = adj_df[adj_df.action == 'SPLIT']
        splits = splits.rename(
            columns={'value': 'ratio', 'date': 'effective_date'},
        )
        splits.drop('action', axis=1, inplace=True)

        dividends = adj_df[adj_df.action == 'DIVIDEND']
        dividends = dividends.rename(
            columns={'value': 'amount', 'date': 'ex_date'},
        )
        dividends.drop('action', axis=1, inplace=True)
        # we do not have this data in the yahoo dataset
        dividends['record_date'] = pd.NaT
        dividends['declared_date'] = pd.NaT
        dividends['pay_date'] = pd.NaT

        adjustment_writer.write(splits=splits, dividends=dividends)
Пример #39
0
    def ingest(environ,
               asset_db_writer,
               minute_bar_writer,  # unused
               daily_bar_writer,
               adjustment_writer,
               calendar,
               start_session,
               end_session,
               cache,
               show_progress,
               output_dir,
               # pass these as defaults to make them 'nonlocal' in py2
               start=start,
               end=end):
        """Fetch pricing and adjustment data for ``symbols`` and write the bundle.

        Daily bars come from ``get_data`` (or ``cache``): rows with zero
        volume are dropped, the frame is reindexed onto the 'SH' calendar
        sessions from the first traded day onward, missing volume becomes 0
        and the remaining gaps are forward-filled. Metadata is written with
        the exchange set to "hs300". Adjustments come from the "Divid"
        request; only those dated after 2010-01-01 are kept, split into
        splits and dividends and written with ``adjustment_writer``.

        ``start`` defaults to ``start_session`` when None. ``end`` is passed
        to ``get_data`` unchanged, None included.
        """
        if start is None:
            start = start_session

        # One row per symbol, filled in by _pricing_iter as data arrives.
        metadata = pd.DataFrame(np.empty(len(symbols), dtype=[
            ('start_date', 'datetime64[ns]'),
            ('end_date', 'datetime64[ns]'),
            ('auto_close_date', 'datetime64[ns]'),
            ('symbol', 'object'),
        ]))

        trading_days = get_calendar('SH').all_sessions
        trading_days = trading_days.astype("datetime64[ns]")

        def _pricing_iter():
            """Yield ``(sid, frame)`` per symbol and fill in ``metadata``."""
            sid = 0
            with maybe_show_progress(
                    symbols,
                    show_progress,
                    label='Downloading Yahoo pricing data: ') as it:
                for symbol in it:
                    # Call form works on both Python 2 and Python 3.
                    print(symbol)
                    path = _cachpath(symbol, 'ohlcv')
                    try:
                        df = cache[path]
                    except KeyError:
                        df = cache[path] = get_data(
                            symbol,
                            start,
                            end
                        )

                    # the start date is the date of the first trade and
                    # the end date is the date of the last trade
                    df = df[df.Volume > 0]
                    start_date = df.index[0]
                    end_date = df.index[-1]
                    df = df.reindex(trading_days[(trading_days >= start_date)])
                    # Sessions without a trade: zero volume, prices carried
                    # forward from the last traded day.
                    df.Volume = df.Volume.fillna(0)
                    df = df.ffill()
                    # The auto_close date is the day after the last trade.
                    ac_date = end_date + pd.Timedelta(days=1)
                    metadata.iloc[sid] = start_date, end_date, ac_date, symbol

                    df.rename(
                        columns={
                            'Open': 'open',
                            'High': 'high',
                            'Low': 'low',
                            'Close': 'close',
                            'Volume': 'volume',
                        },
                        inplace=True,
                    )
                    yield sid, df
                    sid += 1

        daily_bar_writer.write(_pricing_iter(), show_progress=show_progress)

        # symbol -> sid lookup (the sid is the row position in metadata)
        symbol_map = pd.Series(metadata.symbol.index, metadata.symbol)

        # Tag every asset with the "hs300" exchange name.
        # NOTE(review): presumably "hs300" is registered elsewhere to resolve
        # to a trading calendar - confirm.
        metadata['exchange'] = "hs300"
        asset_db_writer.write(equities=metadata)

        adjustments = []
        with maybe_show_progress(
                symbols,
                show_progress,
                label='Downloading Yahoo adjustment data: ') as it:
            for symbol in it:
                path = _cachpath(symbol, 'adjustment')
                # The cache holds the raw response, so build the frame on both
                # the hit and the miss path.
                try:
                    data = cache[path]
                except KeyError:
                    data = cache[path] = request(
                        "123.56.77.52:10030",
                        "Divid",
                        {"symbol": symbol}
                    )
                df = pd.DataFrame(data).sort_index()

                df['sid'] = symbol_map[symbol]
                adjustments.append(df)

        adj_df = pd.concat(adjustments)
        adj_df.index.name = 'date'
        adj_df.reset_index(inplace=True)
        adj_df.date = pd.to_datetime(adj_df.date)
        adj_df = adj_df[adj_df.date > pd.Timestamp("2010-01-01")]

        splits = adj_df[adj_df.action == 'SPLIT']
        splits = splits.rename(
            columns={'value': 'ratio', 'date': 'effective_date'},
        )
        splits.drop('action', axis=1, inplace=True)

        dividends = adj_df[adj_df.action == 'DIVIDEND']
        dividends = dividends.rename(
            columns={'value': 'amount', 'date': 'ex_date'},
        )
        dividends.drop('action', axis=1, inplace=True)
        # this data is not available from the source, so leave it as NaT
        dividends['record_date'] = pd.NaT
        dividends['declared_date'] = pd.NaT
        dividends['pay_date'] = pd.NaT

        adjustment_writer.write(splits=splits, dividends=dividends)