Exemplo n.º 1
0
def init_env():
    """Create the on-disk directory layout for stocks, indices and futures.

    If ``FOOLTRADER_STORE_PATH`` does not exist, print a warning and do
    nothing; otherwise create the per-security data directories.
    """
    if not os.path.exists(FOOLTRADER_STORE_PATH):
        # BUG FIX: the original printed the literal "{}" — the placeholder
        # was never filled with the offending path.
        print("{} is a wrong path".format(FOOLTRADER_STORE_PATH))
        print("please set env FOOLTRADER_STORE_PATH to working path or set it in settings.py")
    else:
        # initialize the stock directories
        for _, item in get_security_list(exchanges=EXCHANGE_LIST_COL).iterrows():
            mkdir_for_stock(item)

        # initialize the index directories
        for _, item in get_security_list(security_type='index', exchanges=['sh', 'sz', 'nasdaq']).iterrows():
            kdata_dir = get_kdata_dir(item)
            if not os.path.exists(kdata_dir):
                os.makedirs(kdata_dir)
        # initialize the future directories
        for exchange in ['shfe', 'dce', 'zce']:
            exchange_cache_dir = get_exchange_cache_dir(security_type='future', exchange=exchange)
            if not os.path.exists(exchange_cache_dir):
                os.makedirs(exchange_cache_dir)

            # NOTE(review): exchange='shfe' is hard-coded here even though we
            # are inside the per-exchange loop — looks like a copy-paste slip,
            # but kept as-is pending confirmation that only shfe needs the
            # yearly day_kdata cache directory.
            exchange_cache_dir = get_exchange_cache_dir(security_type='future', exchange='shfe',
                                                        the_year=datetime.today().year,
                                                        data_type="day_kdata")
            if not os.path.exists(exchange_cache_dir):
                os.makedirs(exchange_cache_dir)

            exchange_dir = get_exchange_dir(security_type='future', exchange=exchange)
            if not os.path.exists(exchange_dir):
                os.makedirs(exchange_dir)
Exemplo n.º 2
0
    def start_requests(self):
        """Yield download requests for SHFE day kdata.

        With configured ``trading_dates``, fetch one day file per date;
        otherwise fall back to the yearly history archives (2009 .. last year).
        """
        self.trading_dates = self.settings.get("trading_dates")

        if not self.trading_dates:
            # no explicit dates: grab the yearly statistics archives instead
            for the_year in range(2009, datetime.today().year):
                base_dir = get_exchange_cache_dir(security_type='future',
                                                  exchange='shfe')
                zip_path = os.path.join(
                    base_dir, "{}_shfe_history_data.zip".format(the_year))

                if os.path.exists(zip_path):
                    continue
                yield Request(
                    url=self.get_year_k_data_url(the_year=the_year),
                    meta={'the_year': the_year, 'the_path': zip_path},
                    callback=self.download_shfe_history_data)
        else:
            # one request per configured trading day
            for the_date in self.trading_dates:
                cache_path = get_exchange_cache_path(
                    security_type='future',
                    exchange='shfe',
                    the_date=to_timestamp(the_date),
                    data_type='day_kdata')

                yield Request(
                    url=self.get_day_kdata_url(the_date=the_date),
                    meta={'the_date': the_date, 'the_path': cache_path},
                    callback=self.download_shfe_data_by_date)
Exemplo n.º 3
0
    def start_requests(self):
        """Yield download requests according to the configured ``dataType``.

        - 'inventory': daily inventory files for roughly the last ten years
        - 'day_kdata': daily kdata files since 2020-01-01
        - otherwise:   yearly history archives (2009 .. last year)
        """
        self.dataType = self.settings.get("dataType")
        if self.dataType == 'inventory':
            today = pd.Timestamp.today()
            for date in pd.date_range(start=today.date() -
                                      pd.Timedelta(weeks=520),
                                      end=today):
                the_dir = get_exchange_cache_path(
                    security_type='future',
                    exchange='shfe',
                    the_date=to_timestamp(date),
                    data_type='inventory') + '.json'
                # only weekdays, and only when not already cached
                if date.dayofweek < 5 and not os.path.exists(the_dir):
                    yield Request(url=self.get_day_inventory_url(
                        the_date=date.strftime('%Y%m%d')),
                                  meta={
                                      'the_date': date,
                                      'the_path': the_dir
                                  },
                                  callback=self.download_shfe_data_by_date)

        # BUG FIX: this was a bare `if`, so running in 'inventory' mode also
        # fell through to the yearly-archive `else` branch below.
        elif self.dataType == 'day_kdata':

            daterange = pd.date_range(start='2020-01-01',
                                      end=pd.Timestamp.today())
            # trading happens on weekdays only
            daterange = daterange[daterange.dayofweek < 5]
            # per-day data
            for the_date in daterange:
                the_path = get_exchange_cache_path(
                    security_type='future',
                    exchange='shfe',
                    the_date=to_timestamp(the_date),
                    data_type='day_kdata')

                if not os.path.exists(the_path):
                    yield Request(url=self.get_day_kdata_url(
                        the_date=the_date.strftime('%Y%m%d')),
                                  meta={
                                      'the_date': the_date,
                                      'the_path': the_path
                                  },
                                  callback=self.download_shfe_data_by_date)
        else:
            # fall back to the yearly statistics archives
            for the_year in range(2009, datetime.today().year):
                the_dir = get_exchange_cache_dir(security_type='future',
                                                 exchange='shfe')
                the_path = os.path.join(
                    the_dir, "{}_shfe_history_data.zip".format(the_year))

                if not os.path.exists(the_path):
                    yield Request(
                        url=self.get_year_k_data_url(the_year=the_year),
                        meta={
                            'the_year': the_year,
                            'the_path': the_path
                        },
                        callback=self.download_shfe_history_data)
Exemplo n.º 4
0
def scrawl_tick():
    """Fetch tick data for every exchange contract traded since 2016-11-30.

    Uses the cached day-kdata file names as the trading calendar and fans the
    per-contract downloads out to a small worker pool per exchange.
    """
    agg = agg_future_dayk()
    logging.info("start filter existed symbols")

    cache_dir = get_exchange_cache_dir(security_type='future',
                                       exchange='ine',
                                       the_year='2020',
                                       data_type='day_kdata')

    trading_dates = sorted(os.listdir(cache_dir))

    # map each trading day to the previous one (the first day has none)
    tdates = {
        datetime.strptime(cur, '%Y%m%d'): datetime.strptime(prev, '%Y%m%d')
        for prev, cur in zip(trading_dates, trading_dates[1:])
    }
    path = TICK_PATH
    # keep only days strictly after the cutoff
    cutoff = datetime(2016, 11, 30, 0, 0)
    filteredTradingDates = sorted(
        d for d in (datetime.strptime(s, '%Y%m%d') for s in trading_dates)
        if d > cutoff)
    logging.info("complete filter existed symbols")
    exchanges = ["shfe", "cffex", "dce", "czce", "ine"]
    logging.info("start getting tick data")
    for ex in exchanges:
        logging.info(ex + ": start getting tick")
        currentYearData = agg.getCurrentYearData(ex)
        currentYearData = currentYearData[currentYearData['date'].isin(
            filteredTradingDates)]
        # (date-symbol label, symbol, naive-UTC datetime) per row
        pathpair = [
            (row[1].strftime('%Y%m%d') + "-" + row[0], row[0],
             datetime.utcfromtimestamp(row[1].timestamp()))
            for row in currentYearData[['symbol', 'date']].values
        ]
        pool = Pool(2)
        for entry in pathpair:
            # sc/nr symbols do not belong to shfe — skip them there
            if ex == "shfe" and (entry[1].startswith("sc")
                                 or entry[1].startswith("nr")):
                continue
            pool.apply_async(scrawl_single_tick, args=(entry, path, ex, tdates))

        pool.close()
        pool.join()
        logging.info(ex + ": complete getting tick")
Exemplo n.º 5
0
 def download_czce_history_data(self, response):
     """Queue a download for every zip archive linked on the CZCE page."""
     the_dir = get_exchange_cache_dir(security_type='future',
                                      exchange='czce')
     for filepath in response.xpath('//a[contains(@href,"zip")]').xpath(
             '@href').extract():
         # the parent directory name prefixes the file name, except the
         # generic "exchange" directory which adds no prefix
         parts = filepath.split("/")
         prefix = "" if parts[-2] == "exchange" else parts[-2]
         yield Request(
             url="http://www.czce.com.cn/" + filepath,
             meta={'filename': os.path.join(the_dir, prefix + parts[-1])},
             callback=self.download_czce_history_data_file)
Exemplo n.º 6
0
    def download_chinaclear_data_by_date(self, response):
        """Cache the chinaclear table for the date carried in response.meta."""
        cache_file = os.path.join(
            get_exchange_cache_dir(security_type='future',
                                   exchange='chinaclear'),
            response.meta['the_date'])
        cells = response.css('td[width="40%"]').xpath(
            'p/span/text()').extract()

        # persist as one pipe-separated line: <date>|<cell>|<cell>|...
        line = response.meta['the_date'] + "|" + "|".join(cells) + '\n'
        with open(cache_file, "wb") as f:
            f.write(bytes(line, encoding="utf8"))
            f.flush()
Exemplo n.º 7
0
def crawl_shfe_quote():
    """Crawl SHFE quotes: full history first, then only the missing days."""
    # grab the yearly history archives first
    process_crawl(FutureShfeSpider, {})
    # then this year's trading calendar
    process_crawl(ShfeTradingCalendarSpider, {})
    # incremental pass: only fetch trading days without a cached kdata file
    cache_dir = get_exchange_cache_dir(security_type='future', exchange='shfe', the_year=datetime.today().year,
                                       data_type="day_kdata")

    # IDIOM: os.listdir already returns a list; the wrapping list
    # comprehension in the original was a redundant copy.
    saved_kdata_dates = os.listdir(cache_dir)
    trading_dates = get_trading_calendar(security_type='future', exchange='shfe')

    the_dates = set(trading_dates) - set(saved_kdata_dates)

    process_crawl(FutureShfeSpider, {
        "trading_dates": the_dates})
Exemplo n.º 8
0
def parse_shfe_data(force_parse=False):
    """Parse SHFE yearly history archives (zip -> xls) into per-contract kdata CSVs.

    Unzips any new yearly archive in the cache dir, then parses each xls,
    registers previously unseen contracts in the security list, and appends
    their rows to the per-contract kdata files.

    :param force_parse: when True, re-parse every .xls in the cache dir even
                        if it was extracted before.
    """
    the_dir = get_exchange_cache_dir(security_type='future', exchange='shfe')

    need_parse_files = []

    # extract any zip that has not yet been turned into a same-named .xls
    for the_zip_file in [
            os.path.join(the_dir, f) for f in os.listdir(the_dir)
            if f.endswith('.zip')
    ]:
        dst_file = the_zip_file.replace('.zip', ".xls")

        if not os.path.exists(dst_file):
            dst_dir = the_zip_file.replace('.zip', "")
            os.makedirs(dst_dir)

            unzip(the_zip_file, dst_dir)
            files = [
                os.path.join(dst_dir, f) for f in os.listdir(dst_dir)
                if f.endswith('.xls')
            ]
            # when the archive contains exactly one xls, rename it after the
            # archive; otherwise the extracted names are left untouched
            if len(files) == 1:
                os.rename(files[0], dst_file)
            need_parse_files.append(dst_file)

    if force_parse:
        need_parse_files = [
            os.path.join(the_dir, f) for f in os.listdir(the_dir)
            if f.endswith('.xls')
        ]
    for the_file in need_parse_files:
        logger.info("parse {}".format(the_file))

        # NOTE(review): skip_footer was replaced by skipfooter in newer
        # pandas — confirm the pinned pandas version still accepts it
        df = pd.read_excel(the_file,
                           skiprows=2,
                           skip_footer=4,
                           index_col='合约',
                           converters={'日期': str})
        # the contract id is only written on its first row; forward-fill it
        df.index = pd.Series(df.index).fillna(method='ffill')
        df = df.loc[:, [
            '日期', '前收盘', '前结算', '开盘价', '最高价', '最低价', '收盘价', '结算价', '涨跌1',
            '涨跌2', '成交量', '成交金额', '持仓量'
        ]]
        df.columns = [
            'timestamp', 'preClose', 'preSettlement', 'open', 'high', 'low',
            'close', 'settlement', 'change', 'change1', 'volume', 'turnover',
            'openInterest'
        ]

        # unify the date format to ease importing into es
        # df.timestamp = df.timestamp.apply(lambda x: to_time_str(x))

        unique_index = df.index.drop_duplicates()

        security_list = get_security_list(security_type='future',
                                          exchanges=['shfe'])

        for the_contract in unique_index:
            logger.info("start handling {} in {}".format(
                the_contract, the_file))
            security_item = {
                'code': the_contract,
                'name': get_future_name(the_contract),
                'id': 'future_{}_{}'.format('shfe', the_contract),
                'exchange': 'shfe',
                'type': 'future'
            }
            # check whether the contract meta needs to be saved
            if (not security_list.empty) and ('code' in security_list.columns):
                security_list = security_list.set_index(security_list['code'],
                                                        drop=False)
            if the_contract not in security_list.index:
                security_list = security_list.append(security_item,
                                                     ignore_index=True)
                security_list = security_list.sort_index()
                security_list.to_csv(get_security_list_path('future', 'shfe'),
                                     index=False)

            # NOTE(review): this slice is a view; the assignments below may
            # trigger pandas' SettingWithCopyWarning
            the_df = df.loc[the_contract, ]
            the_df['code'] = the_contract
            the_df['name'] = get_future_name(the_contract)
            the_df['securityId'] = 'future_{}_{}'.format('shfe', the_contract)
            the_df['changePct'] = the_df['change'] / the_df['preClose']
            the_df['changePct1'] = the_df['change1'] / the_df['preSettlement']

            kdata_path = get_kdata_path(item=security_item, source='exchange')
            # TODO: this logic should be handled in one place
            kdata_dir = get_kdata_dir(item=security_item)
            if not os.path.exists(kdata_dir):
                os.makedirs(kdata_dir)

            if os.path.exists(kdata_path):
                saved_df = pd.read_csv(kdata_path, dtype=str)
            else:
                saved_df = pd.DataFrame()

            # merge the new rows into whatever is already on disk
            saved_df = saved_df.append(the_df, ignore_index=True)
            saved_df = saved_df.loc[:, KDATA_FUTURE_COL]

            if not saved_df.empty:
                kdata_df_save(saved_df, kdata_path)

            logger.info("end handling {} in {}".format(the_contract, the_file))
Exemplo n.º 9
0
def parse_shfe_day_data(force_parse=False):
    """Parse cached SHFE per-day kdata JSON files into per-contract CSVs.

    Reads every not-yet-parsed date file from this year's cache directory,
    registers any new contract in the security list, appends the day's bar to
    the contract's kdata CSV, and records the date in the 'parsed' marker file.

    :param force_parse: when True, re-parse every cached date file regardless
                        of the 'parsed' marker.
    """
    cache_dir = get_exchange_cache_dir(security_type='future',
                                       exchange='shfe',
                                       the_year=datetime.datetime.today().year,
                                       data_type="day_kdata")
    # 'parsed' holds a JSON list of dates that were already processed
    the_parsed_path = os.path.join(cache_dir, 'parsed')
    the_parsed = []
    if os.path.exists(the_parsed_path):
        with open(the_parsed_path) as data_file:
            the_parsed = json.load(data_file)

    if force_parse:
        the_dates = [f for f in os.listdir(cache_dir) if f != 'parsed' and f]
    else:
        the_dates = [
            f for f in os.listdir(cache_dir)
            if f != 'parsed' and f not in the_parsed
        ]

    for the_date in the_dates:
        the_path = os.path.join(cache_dir, the_date)
        logger.info("start handling {}".format(the_path))

        with open(the_path, 'r', encoding='UTF8') as f:
            tmp_str = f.read()
            the_json = json.loads(tmp_str)
            the_datas = the_json['o_curinstrument']
            # date, code, name, low, open, close, high, volume (lots),
            # turnover (yuan), unique id, pre-close, change, change pct (%),
            # open interest, settlement, pre-settlement,
            # change (vs settlement), change pct (vs settlement)
            KDATA_COLUMN_FUTURE = [
                'timestamp', 'code', 'name', 'low', 'open', 'close', 'high',
                'volume', 'turnover', 'securityId', 'preClose', 'change',
                'changePct', 'openInterest', 'settlement', 'preSettlement',
                'change1', 'changePct1'
            ]
            for the_data in the_datas:
                # {'CLOSEPRICE': 11480,
                #  'DELIVERYMONTH': '1809',
                #  'HIGHESTPRICE': 11555,
                #  'LOWESTPRICE': 11320,
                #  'OPENINTEREST': 425692,
                #  'OPENINTERESTCHG': 3918,
                #  'OPENPRICE': 11495,
                #  'ORDERNO': 0,
                #  'PRESETTLEMENTPRICE': 11545,
                #  'PRODUCTID': 'ru_f    ',
                #  'PRODUCTNAME': '天然橡胶            ',
                #  'PRODUCTSORTNO': 100,
                #  'SETTLEMENTPRICE': 11465,
                #  'VOLUME': 456574,
                #  'ZD1_CHG': -65,
                #  'ZD2_CHG': -80}

                # skip aggregate/total rows whose delivery month is not 4 digits
                if not re.match("\d{4}", the_data['DELIVERYMONTH']):
                    continue

                # contract code = product prefix (before '_') + delivery month
                code = "{}{}".format(
                    the_data['PRODUCTID'][:the_data['PRODUCTID'].index('_')],
                    the_data['DELIVERYMONTH'])
                logger.info("start handling {} for {}".format(code, the_date))

                name = get_future_name(code)
                security_id = "future_shfe_{}".format(code)

                security_list = get_security_list(security_type='future',
                                                  exchanges=['shfe'])

                logger.info("start handling {} for {}".format(code, the_date))
                security_item = {
                    'code': code,
                    'name': name,
                    'id': security_id,
                    'exchange': 'shfe',
                    'type': 'future'
                }
                # check whether the contract meta needs to be saved
                if security_list is not None and 'code' in security_list.columns:
                    security_list = security_list.set_index(
                        security_list['code'], drop=False)
                if code not in security_list.index:
                    security_list = security_list.append(security_item,
                                                         ignore_index=True)
                    security_list.to_csv(get_security_list_path(
                        'future', 'shfe'),
                                         index=False)

                kdata_path = get_kdata_path(item=security_item,
                                            source='exchange')
                # TODO: this logic should be handled in one place
                kdata_dir = get_kdata_dir(item=security_item)
                if not os.path.exists(kdata_dir):
                    os.makedirs(kdata_dir)

                if os.path.exists(kdata_path):
                    saved_df = pd.read_csv(kdata_path, dtype=str)
                    saved_df = saved_df.set_index(saved_df['timestamp'],
                                                  drop=False)
                else:
                    saved_df = pd.DataFrame()

                # only append when this date's bar is not on disk yet
                if saved_df.empty or the_date not in saved_df.index:
                    # the feed uses empty values for untraded fields; coerce to 0
                    low_price = the_data['LOWESTPRICE']
                    if not low_price:
                        low_price = 0
                    open_price = the_data['OPENPRICE']
                    if not open_price:
                        open_price = 0
                    close_price = the_data['CLOSEPRICE']
                    if not close_price:
                        close_price = 0
                    high_price = the_data['HIGHESTPRICE']
                    if not high_price:
                        high_price = 0
                    volume = the_data['VOLUME']
                    if not volume:
                        volume = 0

                    # string ZD*_CHG values mean "no change data" -> 0
                    if type(the_data['ZD1_CHG']) == str:
                        change = 0
                    else:
                        change = the_data['ZD1_CHG']

                    if type(the_data['ZD2_CHG']) == str:
                        change1 = 0
                    else:
                        change1 = the_data['ZD2_CHG']

                    pre_close = close_price - change
                    pre_settlement = the_data['PRESETTLEMENTPRICE']

                    # first trading day: no previous close/settlement to divide by
                    if pre_close != 0:
                        change_pct = change / pre_close
                    else:
                        change_pct = 0
                    if pre_settlement != 0:
                        change_pct1 = change1 / pre_settlement
                    else:
                        change_pct1 = 0

                    the_json = {
                        "timestamp":
                        to_time_str(the_date),
                        "code":
                        code,
                        "name":
                        name,
                        "low":
                        low_price,
                        "open":
                        open_price,
                        "close":
                        close_price,
                        "high":
                        high_price,
                        "volume":
                        volume,
                        # turnover is an estimate from an average price
                        # NOTE(review): operator precedence makes this
                        # (low + open + close + high/4) * volume — probably
                        # meant ((low+open+close+high) / 4) * volume; confirm
                        "turnover":
                        (low_price + open_price + close_price + high_price / 4)
                        * volume,
                        "securityId":
                        security_id,
                        "preClose":
                        pre_close,
                        "change":
                        change,
                        "changePct":
                        change_pct,
                        "openInterest":
                        the_data['OPENINTEREST'],
                        "settlement":
                        the_data['SETTLEMENTPRICE'],
                        "preSettlement":
                        the_data['PRESETTLEMENTPRICE'],
                        "change1":
                        change1,
                        "changePct1":
                        change_pct1
                    }
                    saved_df = saved_df.append(the_json, ignore_index=True)
                    saved_df = saved_df.loc[:, KDATA_COLUMN_FUTURE]
                    saved_df = saved_df.drop_duplicates(subset='timestamp',
                                                        keep='last')
                    saved_df = saved_df.set_index(saved_df['timestamp'],
                                                  drop=False)
                    saved_df.index = pd.to_datetime(saved_df.index)
                    saved_df = saved_df.sort_index()
                    saved_df.to_csv(kdata_path, index=False)

                    logger.info("end handling {} for {}".format(
                        code, the_date))

                    if the_date not in the_parsed:
                        the_parsed.append(the_date)
        # rewrite the 'parsed' marker after each date file is handled
        if the_parsed:
            result_list = drop_duplicate(the_parsed)
            result_list = sorted(result_list)

            with open(the_parsed_path, 'w') as outfile:
                json.dump(result_list, outfile)
        logger.info("end handling {}".format(the_path))
Exemplo n.º 10
0
 def download_dce_history_data(self, response):
     """Queue a download for every history file advertised on the DCE page."""
     cache_dir = get_exchange_cache_dir(security_type='future', exchange='dce')
     for rel_path in response.css('input').xpath('@rel').extract():
         # cache under the bare file name, dropping the remote directory part
         target = os.path.join(cache_dir, rel_path.split("/")[-1])
         yield Request(url="http://www.dce.com.cn/" + rel_path,
                       meta={'filename': target},
                       callback=self.download_dce_history_data_file)