Пример #1
0
def legacy_kdata_to_csv():
    """Convert the legacy JSON k-data files of every security to CSV files.

    Both legacy directories of a security (``fuquan`` True and False) are
    scanned for JSON files whose name does not contain ``all``.  The first two
    ``_``-separated parts of the file name are forwarded to ``get_kdata_path``
    together with the tag ``'hfq'`` (``fuquan`` True) or ``'bfq'``.  Files whose
    target CSV already exists are skipped.
    """
    # Columns picked from the legacy JSON before they are renamed for 'hfq' output.
    fq_source_columns = ['timestamp', 'code', 'low', 'open', 'close', 'high', 'volume', 'turnover',
                         'securityId',
                         'fuquan']

    for _, security_item in get_security_list().iterrows():
        for fuquan in (True, False):
            legacy_dir = get_kdata_dir_old(security_item, fuquan)
            if not os.path.exists(legacy_dir):
                continue

            json_files = []
            for name in os.listdir(legacy_dir):
                full_path = os.path.join(legacy_dir, name)
                if 'all' not in name and 'json' in name and os.path.isfile(full_path):
                    json_files.append(full_path)

            tag = 'hfq' if fuquan else 'bfq'
            for json_file in json_files:
                parts = os.path.basename(json_file).split('_')
                csv_path = get_kdata_path(security_item, parts[0], parts[1], tag)
                if os.path.exists(csv_path):
                    continue

                df = pd.read_json(json_file, dtype={'code': str})
                logger.info("{} to {}".format(json_file, csv_path))

                if fuquan:
                    df = df.loc[:, fq_source_columns]
                    df.columns = KDATA_COLUMN_SINA_FQ
                else:
                    df = df.loc[:, KDATA_COLUMN_SINA]

                df.to_csv(csv_path, index=False)
Пример #2
0
def legacy_kdata_to_csv():
    """Export the legacy JSON k-data of all securities as CSV.

    For each security the legacy directory is visited once per ``fuquan``
    value (True, then False).  Every JSON file whose name lacks ``all`` is
    written to the path returned by ``get_kdata_path`` for the first two
    ``_``-separated name parts and the tag ``'hfq'``/``'bfq'``, unless that
    CSV is already present.
    """
    for _, security_item in get_security_list().iterrows():
        for fuquan in (True, False):
            source_dir = get_kdata_dir_old(security_item, fuquan)
            if not os.path.exists(source_dir):
                continue

            candidates = [os.path.join(source_dir, name) for name in os.listdir(source_dir)]
            json_paths = [
                path for path in candidates
                if 'all' not in os.path.basename(path)
                and 'json' in os.path.basename(path)
                and os.path.isfile(path)
            ]

            for json_path in json_paths:
                name_parts = os.path.basename(json_path).split('_')
                csv_path = get_kdata_path(security_item, name_parts[0], name_parts[1],
                                          'hfq' if fuquan else 'bfq')
                if os.path.exists(csv_path):
                    continue

                df = pd.read_json(json_path, dtype={'code': str})
                logger.info("{} to {}".format(json_path, csv_path))

                if fuquan:
                    # Select the legacy columns, then rename them to the 'hfq' contract.
                    df = df.loc[:,
                         ['timestamp', 'code', 'low', 'open', 'close', 'high', 'volume', 'turnover',
                          'securityId',
                          'fuquan']]
                    df.columns = KDATA_COLUMN_FQ
                else:
                    df = df.loc[:, KDATA_COLUMN]

                df.to_csv(csv_path, index=False)
Пример #3
0
    def yield_request(self, item, trading_dates=None, fuquan=None):
        """Yield download requests for the quarterly sina k-data of ``item``.

        :param item: security record; ``item['code']`` is always read and
            ``item['listDate']`` is read when no ``trading_dates`` are given.
        :param trading_dates: optional dates.  When non-empty, only the
            quarters returned by ``get_year_quarter`` for these dates are
            requested and they are downloaded even if data already exists.
        :param fuquan: a single variant (``'bfq'`` or ``'hfq'``) to request;
            when falsy both variants are requested.
        """
        if trading_dates:
            force_download = True
            the_quarters = {get_year_quarter(the_date) for the_date in trading_dates}
        else:
            force_download = False
            the_quarters = set(get_quarters(item['listDate']))

        fuquans = [fuquan] if fuquan else ['bfq', 'hfq']

        # get day k data
        for year, quarter in the_quarters:
            # A separate loop name keeps the ``fuquan`` argument intact.
            for the_fuquan in fuquans:
                data_path = get_kdata_path(item, source='sina', year=year, quarter=quarter, fuquan=the_fuquan)
                data_exist = os.path.exists(data_path) or kdata_exist(item, year, quarter, the_fuquan,
                                                                      source='sina')

                if not data_exist or force_download:
                    url = self.get_k_data_url(item['code'], year, quarter, the_fuquan)
                    yield Request(url=url, headers=DEFAULT_KDATA_HEADER,
                                  meta={'path': data_path, 'item': item, 'fuquan': the_fuquan},
                                  callback=self.download_day_k_data)
Пример #4
0
def check_convert_result():
    """Check that every merged day k-data CSV matches the concatenation of its parts.

    For each security and each of ``'bfq'``/``'hfq'``: if the merged file
    exists, all CSV files of the k-data directory whose name does not contain
    ``day`` are concatenated and compared with the merged file via
    ``assert_df``.
    """
    for _, security_item in get_security_list().iterrows():
        for fuquan in ('bfq', 'hfq'):
            dayk_path = get_kdata_path(security_item, fuquan=fuquan)
            if not os.path.exists(dayk_path):
                continue
            df_result = pd.read_csv(dayk_path)

            kdata_dir = get_kdata_dir(security_item, fuquan=fuquan)
            if not os.path.exists(kdata_dir):
                continue

            if fuquan == 'hfq':
                columns = data_contract.KDATA_COLUMN_FQ
            else:
                columns = data_contract.KDATA_COLUMN

            # Start from an empty frame so the expected column order comes first.
            frames = [pd.DataFrame(columns=columns)]
            for name in os.listdir(kdata_dir):
                part_path = os.path.join(kdata_dir, name)
                if 'day' not in name and 'csv' in name and os.path.isfile(part_path):
                    frames.append(pd.read_csv(part_path))

            # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
            df = pd.concat(frames, ignore_index=True)
            assert_df(df, df_result)
            logger.info("{} merge as one ok".format(
                security_item['code']))
Пример #5
0
def get_kdata(security_item, the_date=None, start_date=None, end_date=None, fuquan='bfq', dtype=None, source='163',
              level='day'):
    """Load the stored k-data of a security as a DataFrame indexed by timestamp.

    :param security_item: security record, or a string; a string containing
        ``'stock'`` is looked up as an id, any other string as a code.
    :param the_date: when given, return ``df.loc[the_date]`` if the date is in
        the index and an empty DataFrame otherwise.
    :param start_date: start of the returned slice; defaults to
        ``security_item['listDate']``.
    :param end_date: end of the returned slice; defaults to today.
    :param fuquan: forwarded to ``files_contract.get_kdata_path``.
    :param dtype: forwarded to ``pd.read_csv``; defaults to ``{"code": str}``.
    :param source: forwarded to ``files_contract.get_kdata_path``.
    :param level: not used by this function.
    :return: the selected data, or an empty DataFrame when the file is missing.
    """
    if type(security_item) is str:
        lookup = {'id': security_item} if 'stock' in security_item else {'code': security_item}
        security_item = get_security_item(**lookup)

    the_path = files_contract.get_kdata_path(security_item, source=source, fuquan=fuquan)
    if not os.path.isfile(the_path):
        return pd.DataFrame()

    kdata = pd.read_csv(the_path, dtype=dtype if dtype else {"code": str})
    # Index by the parsed timestamp but keep the original column as well.
    kdata = kdata.set_index(kdata['timestamp'], drop=False)
    kdata.index = pd.to_datetime(kdata.index)
    kdata = kdata.sort_index()

    if the_date:
        if the_date not in kdata.index:
            return pd.DataFrame()
        return kdata.loc[the_date]

    if not start_date:
        start_date = security_item['listDate']
    if not end_date:
        end_date = datetime.datetime.today()

    return kdata.loc[start_date:end_date]
Пример #6
0
    def yield_request(self, item, start_date=None, end_date=None):
        """Yield at most one request for the 163 day k-data of ``item``.

        :param item: security record; ``listDate``, ``exchange`` and ``code``
            are read.
        :param start_date: first day to request; defaults to the list date
            with the dashes removed.
        :param end_date: last day to request; defaults to today.

        Nothing is yielded when the data file already exists and neither date
        was given.
        """
        data_path = get_kdata_path(item, source='163')

        start = start_date.strftime('%Y%m%d') if start_date else item['listDate'].replace('-', '')
        end = end_date.strftime('%Y%m%d') if end_date else datetime.today().strftime('%Y%m%d')

        if os.path.exists(data_path) and not start_date and not end_date:
            return

        # The url helper takes 0 for the 'sh' exchange and 1 for any other.
        exchange_flag = 0 if item['exchange'] == 'sh' else 1
        url = self.get_k_data_url(exchange_flag, item['code'], start, end)
        request_meta = {'path': data_path, 'item': item}
        yield Request(url=url, meta=request_meta, callback=self.download_day_k_data)
Пример #7
0
    def yield_request(self, item, trading_dates=None, fuquan=None):
        """Yield download requests for the quarterly sina k-data of ``item``.

        :param item: security record; ``item['code']`` is always read and
            ``item['listDate']`` is read when no ``trading_dates`` are given.
        :param trading_dates: optional dates.  When non-empty, only the
            quarters returned by ``get_year_quarter`` for these dates are
            requested and they are downloaded even if data already exists.
        :param fuquan: a single variant (``'bfq'`` or ``'hfq'``) to request;
            when falsy both variants are requested.
        """
        force_download = bool(trading_dates)
        if force_download:
            the_quarters = {get_year_quarter(the_date) for the_date in trading_dates}
        else:
            the_quarters = set(get_quarters(item['listDate']))

        fuquans = [fuquan] if fuquan else ['bfq', 'hfq']

        # get day k data
        for year, quarter in the_quarters:
            # A separate loop name keeps the ``fuquan`` argument intact.
            for the_fuquan in fuquans:
                data_path = get_kdata_path(item, source='sina', year=year, quarter=quarter, fuquan=the_fuquan)
                data_exist = os.path.exists(data_path) or kdata_exist(item, year, quarter, the_fuquan,
                                                                      source='sina')

                if force_download or not data_exist:
                    url = self.get_k_data_url(item['code'], year, quarter, the_fuquan)
                    yield Request(url=url, headers=DEFAULT_KDATA_HEADER,
                                  meta={'path': data_path, 'item': item, 'fuquan': the_fuquan},
                                  callback=self.download_day_k_data)
 def spider_closed(self, spider, reason):
     """Complete ``self.df_pe`` and write it to the security's k-data file.

     The ``close`` column is copied from ``self.df_close``; ``code``,
     ``securityId`` and ``name`` are filled from ``self.security_item``.  The
     frame is saved without its index and the closing reason is logged.
     """
     self.df_pe['close'] = self.df_close['close']
     security = self.security_item
     # (target column, key in the security record)
     for column, key in (('code', 'code'), ('securityId', 'id'), ('name', 'name')):
         self.df_pe[column] = security[key]

     target_path = get_kdata_path(self.security_item)
     self.df_pe.to_csv(target_path, index=False)
     spider.logger.info('Spider closed: %s,%s\n', spider.name, reason)
Пример #9
0
 def spider_closed(self, spider, reason):
     """Persist the collected frame when the spider is closed.

     ``self.df_pe`` receives the ``close`` column of ``self.df_close`` and the
     code, id (as ``securityId``) and name of ``self.security_item``; it is
     then written, without index, to the path from ``get_kdata_path``.
     """
     frame = self.df_pe
     frame['close'] = self.df_close['close']
     frame['code'] = self.security_item['code']
     frame['securityId'] = self.security_item['id']
     frame['name'] = self.security_item['name']

     out_path = get_kdata_path(self.security_item)
     self.df_pe.to_csv(out_path, index=False)
     spider.logger.info('Spider closed: %s,%s\n', spider.name, reason)
    def add_factor_to_163(security_item):
        """Copy the ``factor`` column of the sina ``hfq`` k-data into the 163 ``bfq`` file.

        Returns early when the 163 file already has a ``factor`` column
        without missing values.  Otherwise the sina ``hfq`` file is read, rows
        with a duplicated index are dropped (first one kept) and its
        ``factor`` column is assigned to the 163 frame, which is then written
        back without its index.

        :param security_item: security record passed to ``get_kdata_path``;
            ``security_item['code']`` is used in the log message.
        """
        path_163 = get_kdata_path(security_item, source='163', fuquan='bfq')
        df_163 = pd_read_csv(path_163)

        if 'factor' in df_163.columns and not df_163['factor'].isna().any():
            # Format up front: logging only interpolates %-style placeholders,
            # so a "{}" message with a separate argument fails to render.
            logger.info("{} 163 factor is ok".format(security_item['code']))
            return

        path_sina = get_kdata_path(security_item, source='sina', fuquan='hfq')
        df_sina = pd_read_csv(path_sina)

        df_sina = df_sina[~df_sina.index.duplicated(keep='first')]
        df_163['factor'] = df_sina['factor']
        df_163.to_csv(path_163, index=False)
Пример #11
0
def add_factor_to_163(security_item):
    """Copy the ``factor`` column of the sina ``hfq`` k-data into the 163 ``bfq`` file.

    Both files are read with every column as ``str`` and passed through
    ``time_index_df``.  Nothing is written when the 163 file already has a
    ``factor`` column without missing values.

    :param security_item: security record passed to ``get_kdata_path``;
        ``security_item['code']`` is used in the log message.
    """
    path_163 = get_kdata_path(security_item, source='163', fuquan='bfq')
    df_163 = pd.read_csv(path_163, dtype=str)
    df_163 = time_index_df(df_163)

    if 'factor' in df_163.columns and not df_163['factor'].isna().any():
        # Format up front: logging only interpolates %-style placeholders,
        # so a "{}" message with a separate argument fails to render.
        logger.info("{} 163 factor is ok".format(security_item['code']))
        return

    path_sina = get_kdata_path(security_item, source='sina', fuquan='hfq')
    df_sina = pd.read_csv(path_sina, dtype=str)
    df_sina = time_index_df(df_sina)

    # Assignment aligns the sina factors with the 163 rows by index.
    df_163['factor'] = df_sina['factor']
    df_163.to_csv(path_163, index=False)
Пример #12
0
def add_factor_to_163(security_item):
    """Fill the ``factor`` column of the 163 ``bfq`` k-data from the sina ``hfq`` data.

    The 163 file is left untouched when it already contains a ``factor``
    column with no missing value; otherwise the column is taken from the sina
    ``hfq`` file and the 163 file is rewritten without its index.

    :param security_item: security record passed to ``get_kdata_path``;
        ``security_item['code']`` is used in the log message.
    """
    path_163 = get_kdata_path(security_item, source='163', fuquan='bfq')
    df_163 = time_index_df(pd.read_csv(path_163, dtype=str))

    factor_complete = 'factor' in df_163.columns and not df_163['factor'].isna().any()
    if factor_complete:
        # The message is built with str.format because logging would try to
        # %-interpolate a separately passed argument and fail on "{}".
        logger.info("{} 163 factor is ok".format(security_item['code']))
        return

    path_sina = get_kdata_path(security_item, source='sina', fuquan='hfq')
    df_sina = time_index_df(pd.read_csv(path_sina, dtype=str))

    # Assignment aligns the sina factors with the 163 rows by index.
    df_163['factor'] = df_sina['factor']
    df_163.to_csv(path_163, index=False)
Пример #13
0
def fetch_kdata(exchange_str='bitstamp'):
    """Fetch daily OHLCV data for the configured pairs of one exchange through ccxt.

    Securities whose name is not in ``CRYPTOCURRENCY_PAIR`` are skipped, as
    are securities whose latest downloaded date is already past yesterday.
    Candles dated today are ignored.  The new rows are added to the frame
    returned by ``get_latest_download_trading_date`` and saved with
    ``kdata_df_save``.  A failure for one security is logged and does not stop
    the others.

    :param exchange_str: name of the exchange class exposed by ``ccxt``.
    """
    # Attribute lookup gives the same object as the former eval() for a valid
    # name, without executing an arbitrary string as code.
    ccxt_exchange = getattr(ccxt, exchange_str)()
    if not ccxt_exchange.has['fetchOHLCV']:
        logger.warning("exchange:{} not support fetchOHLCV".format(exchange_str))
        return

    for _, security_item in get_security_list(security_type='cryptocurrency', exchanges=[exchange_str]).iterrows():
        try:
            if security_item['name'] not in CRYPTOCURRENCY_PAIR:
                continue

            start_date, df = get_latest_download_trading_date(security_item)
            today = pd.Timestamp.today()
            # daily k-lines are only fetched up to yesterday
            end_date = today - pd.DateOffset(1)

            if start_date and (start_date > end_date):
                logger.info("{} kdata is ok".format(security_item['code']))
                continue

            try:
                kdatas = ccxt_exchange.fetch_ohlcv(security_item['name'], timeframe='1d')
            except Exception:
                # logger.exception already records the active exception.
                logger.exception("fetch_kdata for {} {} failed".format(exchange_str, security_item['name']))
                continue
            # for rateLimit
            time.sleep(5)

            rows = []
            for kdata in kdatas:
                timestamp = pd.Timestamp.fromtimestamp(int(kdata[0] / 1000))
                if is_same_date(timestamp, today):
                    continue
                rows.append({
                    'timestamp': to_time_str(timestamp),
                    'code': security_item['code'],
                    'name': security_item['name'],
                    'open': kdata[1],
                    'high': kdata[2],
                    'low': kdata[3],
                    'close': kdata[4],
                    'volume': kdata[5],
                    'securityId': security_item['id'],
                    'preClose': None,
                    'change': None,
                    'changePct': None
                })
            if rows:
                # One concat instead of one DataFrame.append per candle.
                df = pd.concat([df, pd.DataFrame(rows)], ignore_index=True)

            if not df.empty:
                df = df.loc[:, KDATA_COMMON_COL]
                kdata_df_save(df, get_kdata_path(security_item), calculate_change=True)
                logger.info(
                    "fetch_kdata for exchange:{} security:{} success".format(exchange_str, security_item['name']))
        except Exception:
            # Log with traceback; the failure used to be reported at info
            # level with the exception silently dropped from the message.
            logger.exception(
                "fetch_kdata for exchange:{} security:{} failed".format(exchange_str, security_item['name']))
Пример #14
0
def restore_kdata():
    """Rewrite the stored k-data CSV files of a fixed range of securities in place.

    The securities come from ``get_security_list(start_code='600000',
    end_code='600017')``.  For the 163 ``bfq`` file and the sina ``hfq`` and
    ``bfq`` files of each one: an ``id`` column is dropped if present, only
    the first row of every duplicated index entry is kept, ``timestamp`` is
    normalised with ``to_time_str`` and the file is written back without its
    index.
    """
    # (source, fuquan) of every file to restore, in processing order.
    targets = (('163', 'bfq'), ('sina', 'hfq'), ('sina', 'bfq'))

    for _, security_item in get_security_list(start_code='600000', end_code='600017').iterrows():
        for source, fuquan in targets:
            path = get_kdata_path(security_item, source=source, fuquan=fuquan)
            df = pd.read_csv(path, dtype=str)
            df = time_index_df(df)

            if 'id' in df.columns:
                df = df.drop(['id'], axis=1)
            df = df[~df.index.duplicated(keep='first')]
            # apply() returns a new Series; it only takes effect when assigned
            # back (the 163 file used to skip this assignment).
            df.timestamp = df.timestamp.apply(to_time_str)
            df.to_csv(path, index=False)
Пример #15
0
def merge_to_current_kdata(security_item, df, fuquan='bfq'):
    """Merge new rows into the stored sina k-data file of a security.

    :param security_item: security record forwarded to ``get_kdata`` and
        ``files_contract.get_kdata_path``.
    :param df: new rows; must contain a ``timestamp`` column.
    :param fuquan: which sina file to merge into.

    When a timestamp occurs in both the stored and the new rows, the new row
    wins (``keep='last'``).  The result is sorted by the time index and
    written without the index.
    """
    df = df.set_index(df['timestamp'], drop=False)
    df.index = pd.to_datetime(df.index)
    df = df.sort_index()

    current = get_kdata(security_item, source='sina', fuquan=fuquan, dtype=str)
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    merged = pd.concat([current, df])

    merged = merged.drop_duplicates(subset='timestamp', keep='last')
    merged = merged.sort_index()

    the_path = files_contract.get_kdata_path(security_item, source='sina', fuquan=fuquan)
    merged.to_csv(the_path, index=False)
Пример #16
0
def merge_to_current_kdata(security_item, df, fuquan='bfq'):
    """Merge ``df`` into the sina k-data already stored for ``security_item``.

    :param security_item: security record forwarded to ``get_kdata`` and
        ``files_contract.get_kdata_path``.
    :param df: rows to merge; must contain a ``timestamp`` column.
    :param fuquan: which sina file to merge into.

    Rows sharing a ``timestamp`` are de-duplicated in favour of ``df``
    (``keep='last'``).  The merged frame is sorted by its time index and
    saved without the index.
    """
    incoming = df.set_index(df['timestamp'], drop=False)
    incoming.index = pd.to_datetime(incoming.index)
    incoming = incoming.sort_index()

    stored = get_kdata(security_item, source='sina', fuquan=fuquan, dtype=str)
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    merged = (pd.concat([stored, incoming])
              .drop_duplicates(subset='timestamp', keep='last')
              .sort_index())

    the_path = files_contract.get_kdata_path(security_item, source='sina', fuquan=fuquan)
    merged.to_csv(the_path, index=False)
Пример #17
0
def check_result():
    """Log a warning for every security with missing day k-data or tick CSV files.

    The security directory is logged once for each of the ``'bfq'``/``'hfq'``
    day k-data files that does not exist, and once more when the tick
    directory exists but holds no file whose name contains ``csv``.
    """
    for _, security_item in get_security_list().iterrows():
        for fuquan in ('bfq', 'hfq'):
            dayk_path = get_kdata_path(security_item, fuquan=fuquan)
            if not os.path.exists(dayk_path):
                # logger.warn is a deprecated alias of logger.warning.
                logger.warning(get_security_dir(security_item))

        tick_dir = get_tick_dir(security_item)
        if os.path.exists(tick_dir):
            has_csv = any(
                'csv' in name and os.path.isfile(os.path.join(tick_dir, name))
                for name in os.listdir(tick_dir)
            )
            if not has_csv:
                logger.warning(get_security_dir(security_item))
Пример #18
0
def check_result():
    """Report securities whose day k-data or tick CSV files are missing.

    For each security the security directory is logged as a warning for
    every missing ``'bfq'``/``'hfq'`` day k-data file, and again when the
    tick directory exists without any file whose name contains ``csv``.
    """
    for _, security_item in get_security_list().iterrows():
        missing_variants = [
            fuquan for fuquan in ('bfq', 'hfq')
            if not os.path.exists(get_kdata_path(security_item, fuquan=fuquan))
        ]
        for _ in missing_variants:
            # logger.warn is a deprecated alias of logger.warning.
            logger.warning(get_security_dir(security_item))

        tick_dir = get_tick_dir(security_item)
        if not os.path.exists(tick_dir):
            continue

        tick_files = [
            os.path.join(tick_dir, name) for name in os.listdir(tick_dir)
            if 'csv' in name and os.path.isfile(os.path.join(tick_dir, name))
        ]
        if not tick_files:
            logger.warning(get_security_dir(security_item))
    def merge_kdata_to_one(security_item=None, replace=False, fuquan='bfq'):
        """Merge the partial sina k-data files of securities into one day k-data file.

        :param security_item: a single security record to process; when
            ``None`` every security from ``get_security_list`` is processed.
        :param replace: when true the merged frame overwrites the day k-data
            file, otherwise it is merged into the existing data with
            ``merge_to_current_kdata``.
        :param fuquan: the variant to merge; when falsy both ``'bfq'`` and
            ``'hfq'`` are merged.

        Every file of the k-data directory whose name does not contain
        ``dayk.csv`` is read, merged and then removed.  After an ``'hfq'``
        merge the factors are copied to the 163 data.
        """
        # The former ``type(...) != 'NoneType'`` compared a type with a string
        # and was always true, so the "all securities" branch was unreachable.
        if security_item is not None:
            items = pd.DataFrame([security_item]).iterrows()
        else:
            items = get_security_list().iterrows()

        fuquans = [fuquan] if fuquan else ['bfq', 'hfq']

        for _, the_item in items:
            for the_fuquan in fuquans:
                dayk_path = get_kdata_path(the_item,
                                           source='sina',
                                           fuquan=the_fuquan)
                if the_fuquan == 'hfq':
                    columns = data_contract.KDATA_COLUMN_SINA_FQ
                else:
                    columns = data_contract.KDATA_COLUMN_SINA

                the_dir = get_kdata_dir(the_item, fuquan=the_fuquan)

                # Reset per iteration so a missing directory neither raises
                # NameError nor reuses the previous iteration's files.
                files = []
                if os.path.exists(the_dir):
                    files = [
                        os.path.join(the_dir, f) for f in os.listdir(the_dir)
                        if ('dayk.csv' not in f
                            and os.path.isfile(os.path.join(the_dir, f)))
                    ]

                # One concat instead of repeated DataFrame.append calls.
                frames = [pd.DataFrame(columns=columns)]
                frames.extend(pd.read_csv(f, dtype=str) for f in files)
                df = pd.concat(frames, ignore_index=True)

                if df.size > 0:
                    df = df.set_index(df['timestamp'])
                    df.index = pd.to_datetime(df.index)
                    df = df.sort_index()
                    logger.info("{} to {}".format(the_item['code'],
                                                  dayk_path))
                    if replace:
                        df.to_csv(dayk_path, index=False)
                    else:
                        StockKDataSinaSpider.merge_to_current_kdata(
                            the_item, df, fuquan=the_fuquan)

                for f in files:
                    logger.info("remove {}".format(f))
                    os.remove(f)

                if the_fuquan == 'hfq':
                    StockKDataSinaSpider.add_factor_to_163(the_item)
    def yield_request(self, item, the_years=None):
        """Yield one 163 k-data request per year for ``item``.

        :param item: security record; ``item['code']`` and, when
            ``the_years`` is not given, ``item['listDate']`` are read.
        :param the_years: years to request.  When falsy the range runs up to
            and including the current year, starting at
            ``max(int(item['listDate']), 2002)`` or at 2005 when the list
            date is missing.
        """
        data_path = get_kdata_path(item, source='163')

        if not the_years:
            if pd.isna(item['listDate']):
                first_year = 2005
            else:
                # 163 only provides data from 2002 onwards
                first_year = max(int(item['listDate']), 2002)
            the_years = range(first_year, pd.Timestamp.today().year + 1)

        for the_year in the_years:
            yield Request(url=self.get_k_data_url(the_year, item['code']),
                          meta={'path': data_path, 'item': item},
                          callback=self.download_day_k_data)
Пример #21
0
    def merge_to_current_kdata(security_item, df, fuquan='bfq'):
        """Merge new rows into the stored sina k-data file of a security.

        :param security_item: security record forwarded to ``get_kdata`` and
            ``files_contract.get_kdata_path``.
        :param df: new rows; must contain a ``timestamp`` column.
        :param fuquan: which sina file to merge into; ``'hfq'`` selects the
            ``KDATA_COLUMN_SINA_FQ`` columns for the output, anything else
            ``KDATA_COLUMN_SINA``.

        For a timestamp present in both the stored and the new rows the new
        row is kept.  The result is written without its index.
        """
        df = df.set_index(df['timestamp'], drop=False)
        df.index = pd.to_datetime(df.index)
        df = df.sort_index()

        current = get_kdata(security_item, source='sina', fuquan=fuquan)
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        merged = pd.concat([current, df])

        merged = merged.drop_duplicates(subset='timestamp', keep='last')
        merged = merged.sort_index()

        the_path = files_contract.get_kdata_path(security_item, source='sina', fuquan=fuquan)
        if fuquan == 'hfq':
            merged = merged.loc[:, data_contract.KDATA_COLUMN_SINA_FQ]
        else:
            merged = merged.loc[:, data_contract.KDATA_COLUMN_SINA]
        merged.to_csv(the_path, index=False)
    def start_requests(self):
        """Yield a ths day k-data request for every security and both variants.

        The url flag is 2 for ``'hfq'`` and 0 for ``'bfq'``.
        """
        # NOTE(review): downloads are always forced at the moment, so the
        # "kdata existed" branch below never runs - confirm this is intended.
        force_download = True

        for _, item in get_security_list().iterrows():
            for fuquan in ['hfq', 'bfq']:
                data_path = get_kdata_path(item, fuquan=fuquan, source='ths')
                data_exist = os.path.isfile(data_path)
                if data_exist and not force_download:
                    self.logger.info("{} kdata existed".format(item['code']))
                    continue

                # get day k data
                flag = 2 if fuquan == 'hfq' else 0
                url = self.get_k_data_url(item['code'], flag)
                yield Request(url=url, headers=TONGHUASHUN_KDATA_HEADER,
                              meta={'path': data_path, 'item': item, 'fuquan': fuquan},
                              callback=self.download_day_k_data)
Пример #23
0
    def start_requests(self):
        """Request the ths day k-data of every security, ``'hfq'`` first, then ``'bfq'``.

        ``'hfq'`` is requested with url flag 2, ``'bfq'`` with flag 0.
        """
        # url flag per variant
        flags = {'hfq': 2, 'bfq': 0}
        # NOTE(review): the download is currently unconditional, which makes
        # the "kdata existed" branch unreachable - confirm this is intended.
        always_download = True

        for _, item in get_security_list().iterrows():
            for fuquan in ['hfq', 'bfq']:
                data_path = get_kdata_path(item, fuquan=fuquan, source='ths')
                data_exist = os.path.isfile(data_path)
                if always_download or not data_exist:
                    # get day k data
                    url = self.get_k_data_url(item['code'], flags[fuquan])
                    request_meta = {'path': data_path, 'item': item, 'fuquan': fuquan}
                    yield Request(url=url, headers=TONGHUASHUN_KDATA_HEADER,
                                  meta=request_meta,
                                  callback=self.download_day_k_data)
                else:
                    self.logger.info("{} kdata existed".format(item['code']))
Пример #24
0
def merge_kdata_to_one(security_item=None, replace=False, fuquan='bfq'):
    """Merge the partial sina k-data files of securities into one day k-data file.

    :param security_item: a single security record to process; when ``None``
        every security from ``get_security_list`` is processed.
    :param replace: when true the merged frame overwrites the day k-data
        file, otherwise it is merged into the existing data with
        ``merge_to_current_kdata``.
    :param fuquan: the variant to merge; when falsy both ``'bfq'`` and
        ``'hfq'`` are merged.

    Every file of the k-data directory whose name does not contain
    ``dayk.csv`` is read, merged and then removed.  After an ``'hfq'`` merge
    the factors are copied to the 163 data with ``add_factor_to_163``.
    """
    # The former ``type(...) != 'NoneType'`` compared a type with a string and
    # was always true, so the "all securities" branch was unreachable.
    if security_item is not None:
        items = pd.DataFrame([security_item]).iterrows()
    else:
        items = get_security_list().iterrows()

    fuquans = [fuquan] if fuquan else ['bfq', 'hfq']

    for _, the_item in items:
        for the_fuquan in fuquans:
            dayk_path = get_kdata_path(the_item, source='sina', fuquan=the_fuquan)
            if the_fuquan == 'hfq':
                columns = data_contract.KDATA_COLUMN_FQ
            else:
                columns = data_contract.KDATA_COLUMN

            the_dir = get_kdata_dir(the_item, fuquan=the_fuquan)

            # Reset per iteration so a missing directory neither raises
            # NameError nor reuses the previous iteration's files.
            files = []
            if os.path.exists(the_dir):
                files = [os.path.join(the_dir, f) for f in os.listdir(the_dir) if
                         ('dayk.csv' not in f and os.path.isfile(os.path.join(the_dir, f)))]

            # One concat instead of repeated DataFrame.append calls.
            frames = [pd.DataFrame(columns=columns)]
            frames.extend(pd.read_csv(f, dtype=str) for f in files)
            df = pd.concat(frames, ignore_index=True)

            if df.size > 0:
                df = df.set_index(df['timestamp'])
                df.index = pd.to_datetime(df.index)
                df = df.sort_index()
                logger.info("{} to {}".format(the_item['code'], dayk_path))
                if replace:
                    df.to_csv(dayk_path, index=False)
                else:
                    merge_to_current_kdata(the_item, df, fuquan=the_fuquan)

            for f in files:
                logger.info("remove {}".format(f))
                os.remove(f)

            if the_fuquan == 'hfq':
                add_factor_to_163(the_item)
    def yield_request(self, item, the_years=None):
        """Yield a 163 k-data request for each requested year of ``item``.

        :param item: security record; ``item['code']`` and, when
            ``the_years`` is not given, ``item['listDate']`` are read.
        :param the_years: years to request.  When falsy, every year up to and
            including the current one is requested, starting at
            ``max(int(item['listDate']), 2002)`` or at 2005 when the list
            date is missing.
        """
        data_path = get_kdata_path(item, source='163')

        if not the_years:
            list_date = item['listDate']
            if not pd.isna(list_date):
                # 163 only provides data from 2002 onwards
                start_year = max(int(list_date), 2002)
            else:
                start_year = 2005
            the_years = range(start_year, pd.Timestamp.today().year + 1)

        for the_year in the_years:
            url = self.get_k_data_url(the_year, item['code'])
            request_meta = {'path': data_path, 'item': item}
            yield Request(url=url, meta=request_meta, callback=self.download_day_k_data)
Пример #26
0
    def yield_request(self, item, start_date=None, end_date=None):
        """Yield at most one request for the 163 day k-data of ``item``.

        :param item: security record; ``listDate``, ``exchange`` and ``code``
            are read.
        :param start_date: first day to request; defaults to the list date
            with the dashes removed.
        :param end_date: last day to request; defaults to today.

        A request is produced when the data file is missing or when either
        date was passed explicitly.
        """
        date_format = '%Y%m%d'
        data_path = get_kdata_path(item, source='163')

        if start_date:
            start = start_date.strftime(date_format)
        else:
            start = item['listDate'].replace('-', '')

        end = (end_date if end_date else datetime.today()).strftime(date_format)

        needs_download = not os.path.exists(data_path) or start_date or end_date
        if needs_download:
            # The url helper takes 0 for the 'sh' exchange and 1 for any other.
            exchange_flag = 0 if item['exchange'] == 'sh' else 1
            url = self.get_k_data_url(exchange_flag, item['code'], start, end)
            yield Request(url=url, meta={'path': data_path, 'item': item},
                          callback=self.download_day_k_data)
Пример #27
0
def get_kdata(security_item, the_date=None, start_date=None, end_date=None, fuquan='bfq', dtype=None, source='163',
              level='day'):
    """Load the stored k-data of a security as a DataFrame indexed by timestamp.

    :param security_item: security record, or a string; a string containing
        ``'stock'`` is looked up as an id, any other string as a code.
    :param the_date: when given, return ``df.loc[the_date]`` if the date is in
        the index and an empty DataFrame otherwise.
    :param start_date: start of the returned slice; defaults to the list date
        of the security, or ``'2002-01-01'`` when the list date is NaN.
    :param end_date: end of the returned slice; defaults to today.
    :param fuquan: forwarded to ``files_contract.get_kdata_path``.
    :param dtype: forwarded to ``pd.read_csv``; defaults to reading ``code``
        and ``timestamp`` as ``str``.
    :param source: forwarded to ``files_contract.get_kdata_path``.
    :param level: not used by this function.
    :return: the selected data, or an empty DataFrame when the file is missing.
    """
    if type(security_item) is str:
        if 'stock' in security_item:
            security_item = get_security_item(id=security_item)
        else:
            security_item = get_security_item(code=security_item)

    the_path = files_contract.get_kdata_path(security_item, source=source, fuquan=fuquan)
    if not os.path.isfile(the_path):
        return pd.DataFrame()

    read_dtype = dtype if dtype else {"code": str, 'timestamp': str}
    kdata = pd.read_csv(the_path, dtype=read_dtype)

    # Normalise the timestamp text before it becomes the (parsed) index.
    kdata.timestamp = kdata.timestamp.apply(to_time_str)
    kdata = kdata.set_index(kdata['timestamp'], drop=False)
    kdata.index = pd.to_datetime(kdata.index)
    kdata = kdata.sort_index()

    if the_date:
        if the_date not in kdata.index:
            return pd.DataFrame()
        return kdata.loc[the_date]

    if not start_date:
        list_date_missing = type(security_item['listDate']) != str and np.isnan(security_item['listDate'])
        start_date = '2002-01-01' if list_date_missing else security_item['listDate']
    if not end_date:
        end_date = datetime.datetime.today()

    if not (start_date and end_date):
        return kdata
    return kdata.loc[start_date:end_date]
Пример #28
0
def check_convert_result():
    """Verify each merged day k-data CSV against the concatenation of its part files.

    For every security and each of ``'bfq'``/``'hfq'`` with an existing merged
    file, the CSV files of the k-data directory whose name does not contain
    ``day`` are concatenated and compared with the merged file via
    ``assert_df``.
    """
    for _, security_item in get_security_list().iterrows():
        for fuquan in ('bfq', 'hfq'):
            dayk_path = get_kdata_path(security_item, fuquan=fuquan)
            if not os.path.exists(dayk_path):
                continue
            df_result = pd.read_csv(dayk_path)

            kdata_dir = get_kdata_dir(security_item, fuquan=fuquan)
            if not os.path.exists(kdata_dir):
                continue

            columns = data_contract.KDATA_COLUMN_FQ if fuquan == 'hfq' else data_contract.KDATA_COLUMN
            part_paths = [os.path.join(kdata_dir, name) for name in os.listdir(kdata_dir) if
                          ('day' not in name and 'csv' in name and os.path.isfile(os.path.join(kdata_dir, name)))]

            # pd.concat replaces DataFrame.append, which pandas 2.0 removed;
            # the empty frame first keeps the expected column order.
            frames = [pd.DataFrame(columns=columns)]
            frames.extend(pd.read_csv(path) for path in part_paths)
            df = pd.concat(frames, ignore_index=True)

            assert_df(df, df_result)
            logger.info("{} merge as one ok".format(security_item['code']))
Пример #29
0
 def spider_closed(self, spider, reason):
     """Reduce ``self.current_df`` to the index k-data columns, print and save it.

     The frame is restricted to ``KDATA_COLUMN_INDEX``, printed, and written
     without its index to the k-data path of ``self.security_item``.
     """
     selected = self.current_df.loc[:, KDATA_COLUMN_INDEX]
     self.current_df = selected
     print(selected)

     target_path = get_kdata_path(item=self.security_item)
     self.current_df.to_csv(target_path, index=False)
     spider.logger.info('Spider closed: %s,%s\n', spider.name, reason)
Пример #30
0
def parse_shfe_data(force_parse=False):
    """Unpack the cached SHFE archives and merge their rows into per-contract k-data files.

    Every ``.zip`` of the SHFE cache directory that has no sibling ``.xls``
    yet is extracted; when the archive holds exactly one ``.xls`` it is moved
    next to the archive and queued for parsing.  Each queued sheet is split by
    contract, contracts missing from the SHFE future security list are added
    to it, and the rows are appended to the contract's k-data file through
    ``kdata_df_save``.

    :param force_parse: when true, parse every ``.xls`` of the cache directory
        instead of only the freshly extracted ones.
    """
    the_dir = get_exchange_cache_dir(security_type='future', exchange='shfe')

    need_parse_files = []

    for the_zip_file in [
            os.path.join(the_dir, f) for f in os.listdir(the_dir)
            if f.endswith('.zip')
    ]:
        dst_file = the_zip_file.replace('.zip', ".xls")

        if not os.path.exists(dst_file):
            dst_dir = the_zip_file.replace('.zip', "")
            # The directory may be left over from an earlier, incomplete run.
            os.makedirs(dst_dir, exist_ok=True)

            unzip(the_zip_file, dst_dir)
            files = [
                os.path.join(dst_dir, f) for f in os.listdir(dst_dir)
                if f.endswith('.xls')
            ]
            if len(files) == 1:
                os.rename(files[0], dst_file)
                need_parse_files.append(dst_file)
            else:
                # dst_file was not created, so queueing it would only make
                # read_excel fail on a missing file.
                logger.warning("skip {}: expected exactly one xls, found {}".format(
                    the_zip_file, len(files)))

    if force_parse:
        need_parse_files = [
            os.path.join(the_dir, f) for f in os.listdir(the_dir)
            if f.endswith('.xls')
        ]
    for the_file in need_parse_files:
        logger.info("parse {}".format(the_file))

        # ``skipfooter`` is the current spelling of the removed ``skip_footer``.
        df = pd.read_excel(the_file,
                           skiprows=2,
                           skipfooter=4,
                           index_col='合约',
                           converters={'日期': str})
        # Forward-fill the contract column used as index
        # (presumably it is only set on the first row of each contract - TODO confirm).
        df.index = pd.Series(df.index).ffill()
        df = df.loc[:, [
            '日期', '前收盘', '前结算', '开盘价', '最高价', '最低价', '收盘价', '结算价', '涨跌1',
            '涨跌2', '成交量', '成交金额', '持仓量'
        ]]
        df.columns = [
            'timestamp', 'preClose', 'preSettlement', 'open', 'high', 'low',
            'close', 'settlement', 'change', 'change1', 'volume', 'turnover',
            'openInterest'
        ]

        # unify the date format to ease importing into es
        # df.timestamp = df.timestamp.apply(lambda x: to_time_str(x))

        unique_index = df.index.drop_duplicates()

        security_list = get_security_list(security_type='future',
                                          exchanges=['shfe'])

        for the_contract in unique_index:
            logger.info("start handling {} in {}".format(
                the_contract, the_file))
            security_item = {
                'code': the_contract,
                'name': get_future_name(the_contract),
                'id': 'future_{}_{}'.format('shfe', the_contract),
                'exchange': 'shfe',
                'type': 'future'
            }
            # check whether the contract meta needs to be saved
            if (not security_list.empty) and ('code' in security_list.columns):
                security_list = security_list.set_index(security_list['code'],
                                                        drop=False)
            if the_contract not in security_list.index:
                security_list = pd.concat(
                    [security_list, pd.DataFrame([security_item])],
                    ignore_index=True)
                security_list = security_list.sort_index()
                security_list.to_csv(get_security_list_path('future', 'shfe'),
                                     index=False)

            # List indexer: always a DataFrame, even for a single row; the
            # copy makes the column assignments below act on an own frame.
            the_df = df.loc[[the_contract]].copy()
            the_df['code'] = the_contract
            the_df['name'] = security_item['name']
            the_df['securityId'] = security_item['id']
            the_df['changePct'] = the_df['change'] / the_df['preClose']
            the_df['changePct1'] = the_df['change1'] / the_df['preSettlement']

            kdata_path = get_kdata_path(item=security_item, source='exchange')
            # TODO: this logic should be handled in one place
            kdata_dir = get_kdata_dir(item=security_item)
            os.makedirs(kdata_dir, exist_ok=True)

            if os.path.exists(kdata_path):
                saved_df = pd.read_csv(kdata_path, dtype=str)
            else:
                saved_df = pd.DataFrame()

            # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
            saved_df = pd.concat([saved_df, the_df], ignore_index=True)
            saved_df = saved_df.loc[:, KDATA_FUTURE_COL]

            if not saved_df.empty:
                kdata_df_save(saved_df, kdata_path)

            logger.info("end handling {} in {}".format(the_contract, the_file))
Пример #31
0
def _build_shfe_day_kdata(the_data, the_date, code, name, security_id):
    """Build one day-kdata row (a dict) from a single SHFE instrument record."""
    # Blank/missing quote fields are stored as 0.
    low_price = the_data['LOWESTPRICE'] or 0
    open_price = the_data['OPENPRICE'] or 0
    close_price = the_data['CLOSEPRICE'] or 0
    high_price = the_data['HIGHESTPRICE'] or 0
    volume = the_data['VOLUME'] or 0

    # A string value in the change fields is treated as "no change".
    change = 0 if isinstance(the_data['ZD1_CHG'], str) else the_data['ZD1_CHG']
    change1 = 0 if isinstance(the_data['ZD2_CHG'], str) else the_data['ZD2_CHG']

    pre_close = close_price - change
    pre_settlement = the_data['PRESETTLEMENTPRICE']

    # First trading day: there is no previous price to divide by.
    change_pct = change / pre_close if pre_close != 0 else 0
    # Truthiness (not ``!= 0``) so a blank previous settlement does not
    # raise TypeError in the division.
    change_pct1 = change1 / pre_settlement if pre_settlement else 0

    return {
        "timestamp": to_time_str(the_date),
        "code": code,
        "name": name,
        "low": low_price,
        "open": open_price,
        "close": close_price,
        "high": high_price,
        "volume": volume,
        # Turnover is an estimate: mean of the four prices times volume.
        # The whole sum is divided by 4, not just the high price.
        "turnover":
        (low_price + open_price + close_price + high_price) / 4 * volume,
        "securityId": security_id,
        "preClose": pre_close,
        "change": change,
        "changePct": change_pct,
        "openInterest": the_data['OPENINTEREST'],
        "settlement": the_data['SETTLEMENTPRICE'],
        "preSettlement": the_data['PRESETTLEMENTPRICE'],
        "change1": change1,
        "changePct1": change_pct1
    }


def parse_shfe_day_data(force_parse=False):
    """
    Parse the cached SHFE day-kdata JSON files of the current year into
    per-contract kdata CSV files.

    Every file in the cache directory except the 'parsed' marker file is
    treated as one day of data. For each record whose DELIVERYMONTH starts
    with four digits, the contract is added to the SHFE future security
    list if it is not there yet, and a row is appended to the contract's
    kdata CSV. Names of handled files are recorded in the 'parsed' file.

    Parameters
    ----------
    force_parse : bool
        when True, files already listed in the 'parsed' file are handled
        again; default: False

    Returns
    -------
    None

    """
    # date, code, name, low, open, close, high, volume (lots), turnover (yuan),
    # unique id, previous close, change, change pct, open interest,
    # settlement, previous settlement, change (by settlement),
    # change pct (by settlement)
    KDATA_COLUMN_FUTURE = [
        'timestamp', 'code', 'name', 'low', 'open', 'close', 'high',
        'volume', 'turnover', 'securityId', 'preClose', 'change',
        'changePct', 'openInterest', 'settlement', 'preSettlement',
        'change1', 'changePct1'
    ]

    cache_dir = get_exchange_cache_dir(security_type='future',
                                       exchange='shfe',
                                       the_year=datetime.datetime.today().year,
                                       data_type="day_kdata")
    the_parsed_path = os.path.join(cache_dir, 'parsed')
    the_parsed = []
    if os.path.exists(the_parsed_path):
        with open(the_parsed_path) as data_file:
            the_parsed = json.load(data_file)

    the_dates = [
        f for f in os.listdir(cache_dir)
        if f != 'parsed' and (force_parse or f not in the_parsed)
    ]

    for the_date in the_dates:
        the_path = os.path.join(cache_dir, the_date)
        logger.info("start handling {}".format(the_path))

        with open(the_path, 'r', encoding='UTF8') as f:
            the_datas = json.load(f)['o_curinstrument']

        for the_data in the_datas:
            # Example record:
            # {'CLOSEPRICE': 11480,
            #  'DELIVERYMONTH': '1809',
            #  'HIGHESTPRICE': 11555,
            #  'LOWESTPRICE': 11320,
            #  'OPENINTEREST': 425692,
            #  'OPENINTERESTCHG': 3918,
            #  'OPENPRICE': 11495,
            #  'ORDERNO': 0,
            #  'PRESETTLEMENTPRICE': 11545,
            #  'PRODUCTID': 'ru_f    ',
            #  'PRODUCTNAME': '天然橡胶            ',
            #  'PRODUCTSORTNO': 100,
            #  'SETTLEMENTPRICE': 11465,
            #  'VOLUME': 456574,
            #  'ZD1_CHG': -65,
            #  'ZD2_CHG': -80}

            # Skip records whose delivery month is not a 4-digit value.
            if not re.match(r"\d{4}", the_data['DELIVERYMONTH']):
                continue

            product_id = the_data['PRODUCTID']
            code = "{}{}".format(product_id[:product_id.index('_')],
                                 the_data['DELIVERYMONTH'])
            logger.info("start handling {} for {}".format(code, the_date))

            name = get_future_name(code)
            security_id = "future_shfe_{}".format(code)
            security_item = {
                'code': code,
                'name': name,
                'id': security_id,
                'exchange': 'shfe',
                'type': 'future'
            }

            # Check whether the contract meta needs to be saved.
            security_list = get_security_list(security_type='future',
                                              exchanges=['shfe'])
            if security_list is None:
                # No list yet: start from an empty one instead of failing
                # on the index lookup below.
                security_list = pd.DataFrame()
            if 'code' in security_list.columns:
                security_list = security_list.set_index(
                    security_list['code'], drop=False)
            if code not in security_list.index:
                security_list = security_list.append(security_item,
                                                     ignore_index=True)
                security_list.to_csv(get_security_list_path(
                    'future', 'shfe'),
                                     index=False)

            kdata_path = get_kdata_path(item=security_item,
                                        source='exchange')
            # TODO: this logic should be handled in one common place
            kdata_dir = get_kdata_dir(item=security_item)
            if not os.path.exists(kdata_dir):
                os.makedirs(kdata_dir)

            if os.path.exists(kdata_path):
                saved_df = pd.read_csv(kdata_path, dtype=str)
                saved_df = saved_df.set_index(saved_df['timestamp'],
                                              drop=False)
            else:
                saved_df = pd.DataFrame()

            # NOTE(review): the index holds stored timestamp strings while
            # the_date is the cache file name - confirm both use the same
            # format; duplicates are dropped below either way.
            if saved_df.empty or the_date not in saved_df.index:
                kdata_row = _build_shfe_day_kdata(the_data, the_date, code,
                                                  name, security_id)
                saved_df = saved_df.append(kdata_row, ignore_index=True)
                saved_df = saved_df.loc[:, KDATA_COLUMN_FUTURE]
                saved_df = saved_df.drop_duplicates(subset='timestamp',
                                                    keep='last')
                saved_df = saved_df.set_index(saved_df['timestamp'],
                                              drop=False)
                saved_df.index = pd.to_datetime(saved_df.index)
                saved_df = saved_df.sort_index()
                saved_df.to_csv(kdata_path, index=False)

                logger.info("end handling {} for {}".format(
                    code, the_date))

                if the_date not in the_parsed:
                    the_parsed.append(the_date)

        # Persist progress after every file so a later failure does not
        # lose the dates already handled.
        if the_parsed:
            result_list = drop_duplicate(the_parsed)
            result_list = sorted(result_list)

            with open(the_parsed_path, 'w') as outfile:
                json.dump(result_list, outfile)
        logger.info("end handling {}".format(the_path))
Пример #32
0
def get_kdata(security_item,
              exchange=None,
              the_date=None,
              start_date=None,
              end_date=None,
              fuquan='bfq',
              dtype=None,
              source=None,
              level='day'):
    """
    get kdata.

    Parameters
    ----------
    security_item : SecurityItem or str
        the security item,id or code

    exchange : str
        the exchange,set this for cryptocurrency

    the_date : TimeStamp str or TimeStamp
        get the kdata for the exact date
    start_date : TimeStamp str or TimeStamp
        start date
    end_date : TimeStamp str or TimeStamp
        end date
    fuquan : str
        {"qfq","hfq","bfq"},default:"bfq"
    dtype : type
        the data type for the csv column,default: None
    source : str
        the data source,{'163','sina','exchange'},just used for internal merge
    level : str or int
        the kdata level,{1,5,15,30,60,'day','week','month'},default : 'day'
        (not used by this implementation)

    Returns
    -------
    DataFrame
        the kdata indexed by timestamp; an empty DataFrame when the kdata
        file does not exist; None when `the_date` is given but is not in
        the data. For 163 stock data with a 'factor' column, hfq*/qfq*
        price columns are added.

    """

    # Cryptocurrencies trade on too many exchanges, so the exchange must be
    # specified for them.
    security_item = to_security_item(security_item, exchange)

    source = adjust_source(security_item, source)

    # 163 data is already merged and carries the adjustment factor; it is all
    # stored under the 'bfq' directory, so read from that one place and
    # convert as needed.
    the_path = files_contract.get_kdata_path(
        security_item,
        source=source,
        fuquan='bfq' if source == '163' else fuquan)

    if not os.path.isfile(the_path):
        return pd.DataFrame()

    if not dtype:
        dtype = {"code": str, 'timestamp': str}
    df = pd.read_csv(the_path, dtype=dtype)

    # Always bound, so the adjustment step below never depends on this
    # branch having run.
    latest_factor = None
    if 'factor' in df.columns and source == '163' and security_item[
            'type'] == 'stock':
        df_kdata_has_factor = df[df['factor'].notna()]
        if df_kdata_has_factor.shape[0] > 0:
            # Last non-null factor in file order (taken before sorting).
            latest_factor = df_kdata_has_factor['factor'].iat[-1]

    df['timestamp'] = df['timestamp'].apply(to_time_str)
    df = df.set_index(df['timestamp'], drop=False)
    df.index = pd.to_datetime(df.index)
    df = df.sort_index()

    if the_date:
        if the_date not in df.index:
            return None
        # Select through the datetime index, the same key the membership
        # test used, so a Timestamp `the_date` matches as well as a string.
        df = df.loc[df.index == pd.to_datetime(the_date)]
    else:
        if not start_date and not pd.isna(security_item['listDate']):
            start_date = security_item['listDate']
        if not end_date:
            end_date = datetime.datetime.today()

        if start_date and end_date:
            df = df.loc[start_date:end_date]

    # Price adjustment handling.
    if source == '163' and security_item['type'] == 'stock':
        if 'factor' in df.columns:
            # Backward-adjusted (hfq) prices never change.
            df['hfqClose'] = df.close * df.factor
            df['hfqOpen'] = df.open * df.factor
            df['hfqHigh'] = df.high * df.factor
            df['hfqLow'] = df.low * df.factor

            # Forward-adjusted (qfq) prices are computed back from the
            # latest factor, leaving the current price unchanged.
            if latest_factor:
                df['qfqClose'] = df.hfqClose / latest_factor
                df['qfqOpen'] = df.hfqOpen / latest_factor
                df['qfqHigh'] = df.hfqHigh / latest_factor
                df['qfqLow'] = df.hfqLow / latest_factor
            else:
                # logger.exception is only meaningful inside an except
                # block; use error to log at the same level without a
                # bogus traceback.
                logger.error("missing latest factor for {}".format(
                    security_item['id']))
    return df
Пример #33
0
def get_kdata(security_item,
              exchange=None,
              the_date=None,
              start_date=None,
              end_date=None,
              fuquan='bfq',
              source=None,
              level='day',
              generate_id=False):
    """
    get kdata.

    Parameters
    ----------
    security_item : SecurityItem or str
        the security item,id or code

    exchange : str
        the exchange,set this for cryptocurrency

    the_date : TimeStamp str or TimeStamp
        get the kdata for the exact date
    start_date : TimeStamp str or TimeStamp
        start date
    end_date : TimeStamp str or TimeStamp
        end date
    fuquan : str
        {"qfq","hfq","bfq"},default:"bfq"
    source : str
        the data source,{'163','sina','exchange'},just used for internal merge
    level : str or int
        the kdata level,{1,5,15,30,60,'day','week','month'},default : 'day'
        (not used by this implementation)
    generate_id : bool
        forwarded to pd_utils.pd_read_csv,default: False

    Returns
    -------
    DataFrame
        the kdata; an empty DataFrame when the kdata file does not exist;
        None when `the_date` is given but is not in the data. For 163 stock
        data with a 'factor' column, hfq*/qfq* price columns are added.

    """

    # Cryptocurrencies trade on too many exchanges, so the exchange must be
    # specified for them.
    security_item = to_security_item(security_item, exchange)

    source = adjust_source(security_item, source)

    # 163 data is already merged and carries the adjustment factor; it is all
    # stored under the 'bfq' directory, so read from that one place and
    # convert as needed.
    the_path = files_contract.get_kdata_path(
        security_item,
        source=source,
        fuquan='bfq' if source == '163' else fuquan)

    if not os.path.isfile(the_path):
        return pd.DataFrame()

    df = pd_utils.pd_read_csv(the_path, generate_id=generate_id)

    # Always bound, so the adjustment step below never depends on this
    # branch having run.
    latest_factor = None
    if 'factor' in df.columns and source == '163' and security_item[
            'type'] == 'stock':
        df_kdata_has_factor = df[df['factor'].notna()]
        if df_kdata_has_factor.shape[0] > 0:
            latest_factor = df_kdata_has_factor['factor'].iat[-1]

    if the_date:
        if the_date not in df.index:
            return None
        # Slice (rather than scalar lookup) so the result stays a DataFrame.
        df = df.loc[the_date:the_date, :]
    elif start_date or end_date:
        df = df_for_date_range(df,
                               start_date=start_date,
                               end_date=end_date)

    # Price adjustment handling.
    if source == '163' and security_item['type'] == 'stock':
        if 'factor' in df.columns:
            # Backward-adjusted (hfq) prices never change.
            df['hfqClose'] = df.close * df.factor
            df['hfqOpen'] = df.open * df.factor
            df['hfqHigh'] = df.high * df.factor
            df['hfqLow'] = df.low * df.factor

            # Forward-adjusted (qfq) prices are computed back from the
            # latest factor, leaving the current price unchanged.
            if latest_factor:
                df['qfqClose'] = df.hfqClose / latest_factor
                df['qfqOpen'] = df.hfqOpen / latest_factor
                df['qfqHigh'] = df.hfqHigh / latest_factor
                df['qfqLow'] = df.hfqLow / latest_factor
            else:
                # logger.exception is only meaningful inside an except
                # block; use error to log at the same level without a
                # bogus traceback.
                logger.error("missing latest factor for {}".format(
                    security_item['id']))

    return df
Пример #34
0
def get_kdata(security_item,
              the_date=None,
              start_date=None,
              end_date=None,
              fuquan='bfq',
              dtype=None,
              source='163',
              level='day'):
    """
    get kdata.

    Parameters
    ----------
    security_item : SecurityItem or str
        the security item,id or code
    the_date : TimeStamp str or TimeStamp
        get the kdata for the exact date
    start_date : TimeStamp str or TimeStamp
        start date
    end_date : TimeStamp str or TimeStamp
        end date
    fuquan : str
        {"qfq","hfq","bfq"},default:"bfq"
    dtype : type
        the data type for the csv column,default: None
    source : str
        the data source,{'163','sina'},default: '163'
    level : str or int
        the kdata level,{1,5,15,30,60,'day','week','month'},default : 'day'
        (not used by this implementation)

    Returns
    -------
    DataFrame
        the kdata indexed by timestamp; an empty DataFrame when the kdata
        file does not exist or `the_date` is not in the data. When
        `the_date` is found, the result of ``df.loc[the_date]`` is returned
        as is. For 163 stock data with 'hfq'/'qfq', the
        open/close/high/low columns are adjusted in the returned frame.

    """

    security_item = to_security_item(security_item)

    # 163 data is already merged and carries the adjustment factor; it is all
    # stored under the 'bfq' directory, so read from that one place and
    # convert as needed.
    the_path = files_contract.get_kdata_path(
        security_item,
        source=source,
        fuquan='bfq' if source == '163' else fuquan)

    if not os.path.isfile(the_path):
        return pd.DataFrame()

    if not dtype:
        dtype = {"code": str, 'timestamp': str}
    df = pd.read_csv(the_path, dtype=dtype)

    df['timestamp'] = df['timestamp'].apply(to_time_str)
    df = df.set_index(df['timestamp'], drop=False)
    df.index = pd.to_datetime(df.index)
    df = df.sort_index()

    if the_date:
        if the_date in df.index:
            return df.loc[the_date]
        return pd.DataFrame()

    if not start_date:
        if security_item['type'] == 'stock':
            list_date = security_item['listDate']
            # A missing list date is a non-string NaN; fall back to a
            # fixed early start date.
            if not isinstance(list_date, str) and np.isnan(list_date):
                start_date = '2002-01-01'
            else:
                start_date = list_date
        else:
            # Non-stock securities default to the last 30 days.
            start_date = datetime.datetime.today() - datetime.timedelta(
                days=30)
    if not end_date:
        end_date = datetime.datetime.today()

    if start_date and end_date:
        df = df.loc[start_date:end_date]

    if source == '163' and security_item['type'] == 'stock':
        if fuquan == 'bfq':
            return df
        # An empty slice has no "current" factor; skip the adjustment
        # instead of failing with IndexError on the last-row lookup.
        if 'factor' in df.columns and not df.empty:
            current_factor = df['factor'].iat[-1]
            price_columns = ('close', 'open', 'high', 'low')
            # Backward-adjusted (hfq) prices never change.
            for column in price_columns:
                df[column] = df[column] * df['factor']
            if fuquan == 'qfq':
                # Forward-adjusted (qfq) prices are computed back from the
                # most recent factor in the selected range.
                for column in price_columns:
                    df[column] = df[column] / current_factor
    return df
Пример #35
0
 def spider_closed(self, spider, reason):
     """Trim the collected frame to KDATA_INDEX_COL, print it, write it to
     the k-data CSV of ``self.security_item`` and log why the spider closed.
     """
     trimmed_df = self.current_df.loc[:, KDATA_INDEX_COL]
     self.current_df = trimmed_df
     print(trimmed_df)

     csv_path = get_kdata_path(item=self.security_item)
     trimmed_df.to_csv(csv_path, index=False)

     spider.logger.info('Spider closed: %s,%s\n', spider.name, reason)