def start_requests(self):
    """Yield requests for the CFFEX files missing from the local cache.

    ``self.dataType`` picks the data set:

    * ``None`` or ``'dayk'``: one ``<date>_1.csv`` file per weekday.
    * ``'inventory'``: one file per weekday for each of the codes
      IF, IC, IH, T and TF.

    Any other value produces no requests.  The date range runs from
    2006-06-30 to today with Saturdays and Sundays removed, and a request
    is only emitted when the cache file does not exist yet.  Every
    response is handled by ``download_cffex_history_data_file``, which
    receives the cache path as ``meta['filename']``.
    """
    mode = self.dataType
    daily_mode = mode is None or mode == 'dayk'
    if not daily_mode and not mode == 'inventory':
        return

    calendar_days = pd.date_range(start='2006-06-30',
                                  end=pd.Timestamp.today())
    weekdays = calendar_days[calendar_days.dayofweek < 5]

    if daily_mode:
        for day in weekdays:
            cache_file = get_exchange_cache_path(
                security_type='future',
                exchange='cffex',
                data_type='day_kdata',
                the_date=to_timestamp(day)) + ".csv"
            if os.path.exists(cache_file):
                continue
            remote = ("http://www.cffex.com.cn/sj/hqsj/rtj/"
                      + day.strftime("%Y%m/%d/%Y%m%d") + "_1.csv")
            yield Request(url=remote,
                          callback=self.download_cffex_history_data_file,
                          meta={'filename': cache_file})
        return

    codes = ['IF', 'IC', 'IH', 'T', 'TF']
    for day in weekdays:
        for code in codes:
            cache_file = get_exchange_cache_path(
                security_type='future',
                exchange='cffex',
                data_type='inventory',
                the_date=to_timestamp(day)) + code + ".csv"
            if os.path.exists(cache_file):
                continue
            remote = ("http://www.cffex.com.cn/sj/ccpm/"
                      + day.strftime("%Y%m/%d/") + code + "_1.csv")
            yield Request(url=remote,
                          callback=self.download_cffex_history_data_file,
                          meta={'filename': cache_file})
def start_requests(self):
    """Yield requests for the CZCE files missing from the local cache.

    The mode is read from the ``dataType`` setting and stored on
    ``self.dataType``:

    * ``None`` or ``'day_kdata'``: ``FutureDataDaily.xls`` for every
      weekday since 2020-01-01.
    * ``'historyk'``: a single request for the history index page,
      handled by ``download_czce_history_data``.
    * ``'inventory'``: ``FutureDataHolding.xls`` for every weekday since
      2020-01-01.

    Any other value produces no requests.  Daily files are requested only
    when the cache file does not exist; the cache path is passed to
    ``download_czce_kline_data`` as ``meta['filename']``.
    """
    self.dataType = self.settings.get("dataType")
    mode = self.dataType

    if mode is None or mode == 'day_kdata':
        cache_type, remote_name = 'day_kdata', "/FutureDataDaily.xls"
    elif mode == 'historyk':
        yield Request(
            url="http://www.czce.com.cn/cn/jysj/lshqxz/H770319index_1.htm",
            callback=self.download_czce_history_data)
        return
    elif mode == 'inventory':
        cache_type, remote_name = 'inventory', "/FutureDataHolding.xls"
    else:
        return

    now = pd.Timestamp.today()
    for day in pd.date_range(start='20200101', end=now):
        # The cache path is resolved for every calendar day, weekends
        # included, exactly as before.
        cache_file = get_exchange_cache_path(
            security_type='future',
            exchange='czce',
            the_date=to_timestamp(day),
            data_type=cache_type) + '.xls'
        if day.dayofweek >= 5 or os.path.exists(cache_file):
            continue
        remote = ("http://www.czce.com.cn/cn/DFSStaticFiles/Future/"
                  + day.strftime("%Y/%Y%m%d") + remote_name)
        yield Request(url=remote,
                      callback=self.download_czce_kline_data,
                      meta={'filename': cache_file})
def start_requests(self):
    """Yield requests for the SHFE files missing from the local cache.

    The mode is read from the ``dataType`` setting and stored on
    ``self.dataType``:

    * ``'inventory'``: one JSON file per weekday of the last 520 weeks.
    * ``'day_kdata'``: one file per weekday since 2020-01-01.
    * anything else: the yearly history archives from 2009 up to, but not
      including, the current year.

    The three modes are mutually exclusive.  Previously the second test
    was a plain ``if``, so ``'inventory'`` also fell through to the yearly
    branch and queued the history archives as well.

    A request is only emitted when its cache file does not exist.
    """
    self.dataType = self.settings.get("dataType")

    if self.dataType == 'inventory':
        today = pd.Timestamp.today()
        for date in pd.date_range(
                start=today.date() - pd.Timedelta(weeks=520), end=today):
            the_dir = get_exchange_cache_path(
                security_type='future',
                exchange='shfe',
                the_date=to_timestamp(date),
                data_type='inventory') + '.json'
            if date.dayofweek < 5 and not os.path.exists(the_dir):
                yield Request(
                    url=self.get_day_inventory_url(
                        the_date=date.strftime('%Y%m%d')),
                    meta={'the_date': date, 'the_path': the_dir},
                    callback=self.download_shfe_data_by_date)
    elif self.dataType == 'day_kdata':
        daterange = pd.date_range(start='2020-01-01',
                                  end=pd.Timestamp.today())
        daterange = daterange[daterange.dayofweek < 5]
        # Daily data.
        for the_date in daterange:
            the_path = get_exchange_cache_path(
                security_type='future',
                exchange='shfe',
                the_date=to_timestamp(the_date),
                data_type='day_kdata')
            if not os.path.exists(the_path):
                yield Request(
                    url=self.get_day_kdata_url(
                        the_date=the_date.strftime('%Y%m%d')),
                    meta={'the_date': the_date, 'the_path': the_path},
                    callback=self.download_shfe_data_by_date)
    else:
        # Fetch the yearly statistics archives directly.
        for the_year in range(2009, datetime.today().year):
            the_dir = get_exchange_cache_dir(security_type='future',
                                             exchange='shfe')
            the_path = os.path.join(
                the_dir, "{}_shfe_history_data.zip".format(the_year))
            if not os.path.exists(the_path):
                yield Request(
                    url=self.get_year_k_data_url(the_year=the_year),
                    meta={'the_year': the_year, 'the_path': the_path},
                    callback=self.download_shfe_history_data)
def consume_topic_with_func(self, topic, func):
    """Replay ``topic`` from ``self.start_timestamp`` into a handler method.

    Nothing is consumed when ``self.start_timestamp`` is not set, when the
    topic is empty, or when no offset exists at or after the start
    timestamp.  Otherwise each message value is passed to the method of
    ``self`` named ``func`` until a message is newer than
    ``self.end_timestamp`` (if one is set).

    The consumer is now closed on every exit path; previously it was only
    closed when the end timestamp was reached.  Log messages and the
    latest-record lookup now use the ``topic`` argument rather than
    ``self.quote_topic``, so they describe the topic actually inspected.

    :param topic: Kafka topic name; only partition 0 is read.
    :param func: name of the method on ``self`` called with each message
        value.
    """
    consumer = KafkaConsumer(
        topic,
        client_id='fooltrader',
        group_id=self.bot_name,
        value_deserializer=lambda m: json.loads(m.decode('utf8')),
        bootstrap_servers=[KAFKA_HOST])
    try:
        if not self.start_timestamp:
            return

        topic_partition = TopicPartition(topic=topic, partition=0)
        # Kafka timestamps are expressed in milliseconds.
        start_timestamp = int(self.start_timestamp.timestamp() * 1000)

        end_offset = consumer.end_offsets([topic_partition])[topic_partition]
        if end_offset == 0:
            self.logger.warning("topic:{} end offset:{}".format(
                topic, end_offset))
            self.logger.error(
                "the topic:{} has no data,but you want to backtest".format(
                    topic))
            return

        # Find the first offset at or after start_timestamp.
        offsets = consumer.offsets_for_times(
            {topic_partition: start_timestamp})
        if not offsets:
            return

        offset_and_timestamp = offsets[topic_partition]
        if not offset_and_timestamp:
            latest_timestamp, _ = get_latest_timestamp_order_from_topic(
                topic)
            self.logger.warning(
                "start:{} is after the last record:{}".format(
                    self.start_timestamp, latest_timestamp))
            return

        # The partition is only assigned after a poll; seek works after it.
        consumer.poll(5, 1)
        consumer.seek(topic_partition, offset_and_timestamp.offset)

        handler = getattr(self, func)
        for message in consumer:
            # Prefer the timestamp in the payload over the Kafka one.
            if 'timestamp' in message.value:
                message_time = to_timestamp(message.value['timestamp'])
            else:
                message_time = to_timestamp(message.timestamp)

            if self.end_timestamp and message_time > self.end_timestamp:
                break

            handler(message.value)
    finally:
        consumer.close()
def start_requests(self):
    """Yield SHFE requests for explicit dates or for the yearly archives.

    The ``trading_dates`` setting is stored on ``self.trading_dates``.
    When it is non-empty, one daily request is yielded per date, without
    consulting the cache.  Otherwise the yearly history archives from 2009
    up to, but not including, the current year are requested, skipping
    those whose zip file already exists.
    """
    self.trading_dates = self.settings.get("trading_dates")

    if not self.trading_dates:
        # Fetch the yearly statistics archives directly.
        for year in range(2009, datetime.today().year):
            cache_dir = get_exchange_cache_dir(security_type='future',
                                               exchange='shfe')
            archive = os.path.join(
                cache_dir, "{}_shfe_history_data.zip".format(year))
            if os.path.exists(archive):
                continue
            yield Request(url=self.get_year_k_data_url(the_year=year),
                          meta={'the_year': year, 'the_path': archive},
                          callback=self.download_shfe_history_data)
        return

    # Daily data.
    for trading_date in self.trading_dates:
        target = get_exchange_cache_path(
            security_type='future',
            exchange='shfe',
            the_date=to_timestamp(trading_date),
            data_type='day_kdata')
        yield Request(url=self.get_day_kdata_url(the_date=trading_date),
                      meta={'the_date': trading_date, 'the_path': target},
                      callback=self.download_shfe_data_by_date)
def on_event(self, event_item):
    """Compute the change against the previous close and check subscriptions.

    When ``self.last_date`` is unset, or ``is_same_date`` reports that it
    differs from ``self.current_time``, the previous day's kdata is loaded
    (fetched once if it is missing), ``self.last_close`` is refreshed from
    it and ``update_today_triggered`` is called.

    Changes from the original:

    * The missing-kdata case is logged with ``logger.error``;
      ``logger.exception`` is meant for use inside an exception handler.
    * If no last close has ever been obtained, the event is skipped
      instead of failing in the division below.

    :param event_item: mapping with at least ``'timestamp'`` and
        ``'price'``.
    """
    self.logger.debug(event_item)

    # NOTE(review): last_date is set to the day *before* the event, so this
    # comparison with current_time looks like it is true on every event;
    # confirm whether the reload per event is intended.
    if not self.last_date or not is_same_date(self.last_date,
                                              self.current_time):
        self.last_date = to_timestamp(
            event_item['timestamp']) - timedelta(days=1)
        last_date_str = to_time_str(self.last_date)

        self.last_kdata = get_kdata(self.security_item,
                                    the_date=last_date_str)
        if self.last_kdata is None:
            # Not available locally: fetch the exchange data and retry once.
            fetch_kdata(exchange_str=self.security_item['exchange'])
            self.last_kdata = get_kdata(self.security_item,
                                        the_date=last_date_str)

        if self.last_kdata is not None:
            self.last_close = self.last_kdata.loc[last_date_str, 'close']
        else:
            self.logger.error("could not get last close for:{}".format(
                self.last_date))

        self.update_today_triggered()

    last_close = getattr(self, 'last_close', None)
    if last_close is None:
        # Without a reference close there is nothing to compare against.
        self.logger.warning(
            "skip event, no last close available for:{}".format(
                self.last_date))
        return

    price = event_item['price']
    change_pct = (price - last_close) / last_close

    self.logger.info(
        "{} last day close is:{},now price is:{},the change_pct is:{}".
        format(self.security_item['id'], last_close, price, change_pct))
    self.check_subscription(current_price=price, change_pct=change_pct)
def init_new_computing_interval(self, event_timestamp):
    """Start a new computing interval anchored at ``event_timestamp``.

    Sets ``self.last_timestamp`` to the converted event time and
    ``self.kdata_timestamp`` to the same instant with its seconds and
    microseconds removed.  The truncated time is also stored formatted as
    ``self.last_day_time_str`` (default format of ``to_time_str``) and
    ``self.last_mirco_time_str`` (``TIME_FORMAT_MICRO``).
    """
    moment = to_timestamp(event_timestamp)
    self.last_timestamp = moment

    # Subtract the sub-minute part to land on the start of the minute.
    sub_minute = timedelta(seconds=moment.second,
                           microseconds=moment.microsecond)
    self.kdata_timestamp = moment - sub_minute

    self.last_day_time_str = to_time_str(self.kdata_timestamp)
    self.last_mirco_time_str = to_time_str(self.kdata_timestamp,
                                           time_fmt=TIME_FORMAT_MICRO)
def on_init(self):
    """Load the latest persisted state and derive the start timestamp.

    Looks up the newest kdata timestamp and the newest statistic record
    (ordered by ``updateTimestamp``) for the RAM-EOS security.  The record,
    if any, is wrapped in a ``CommonStatistic`` and kept on
    ``self.latest_statistic_record``; a warning is logged when
    ``is_same_time`` reports that the two timestamps differ.  When both
    exist, ``self.start_timestamp`` becomes the earlier of the two.
    """
    super().on_init()

    self.security_id = 'cryptocurrency_contract_RAM-EOS'
    query = {"term": {"securityId": self.security_id}}

    # Latest kdata timestamp.
    latest_kdata_timestamp = es_get_latest_timestamp(index=kdata_index_name,
                                                     query=query)
    # Latest statistic record.
    stored = es_get_latest_record(index=statistic_index_name,
                                  query=query,
                                  time_field='updateTimestamp')

    if not stored:
        self.latest_statistic_record = None
    else:
        self.latest_statistic_record = CommonStatistic(
            meta={'id': stored['id'], 'index': statistic_index_name},
            **stored)
        statistic_time = self.latest_statistic_record['updateTimestamp']
        if not is_same_time(latest_kdata_timestamp, statistic_time):
            self.logger.warning(
                "latest_kdata_timestamp:{},latest_statistic_timestamp:{}".format(
                    latest_kdata_timestamp,
                    self.latest_statistic_record['updateTimestamp']))

    if latest_kdata_timestamp and self.latest_statistic_record:
        statistic_timestamp = to_timestamp(
            self.latest_statistic_record['updateTimestamp'])
        self.start_timestamp = min(latest_kdata_timestamp,
                                   statistic_timestamp)
def on_event(self, event_item):
    """Buffer an event and flush the user statistics once per minute.

    Events are collected in ``self.item_list``.  When an event falls in a
    different calendar minute than the current interval, the buffered
    items are turned into ``self.df``, ``generate_user_statistic`` runs,
    any pending ``self.es_actions`` are bulk-indexed, and a new interval
    starts at the event's timestamp.  The triggering event is then
    buffered for the new interval.

    The rollover compares year, month, day, hour and minute.  Previously
    only the minute-of-hour was compared, so two events exactly N hours
    apart (for example 10:05 and 11:05 with nothing in between) were
    merged into one interval.

    :param event_item: mapping with at least a ``'timestamp'`` entry.
    """
    if not self.computing_start:
        self.computing_start = datetime.now()
    if not self.last_timestamp:
        self.init_new_computing_interval(event_item['timestamp'])

    current_timestamp = to_timestamp(event_item['timestamp'])
    last_timestamp = self.last_timestamp

    current_minute = (current_timestamp.year, current_timestamp.month,
                      current_timestamp.day, current_timestamp.hour,
                      current_timestamp.minute)
    last_minute = (last_timestamp.year, last_timestamp.month,
                   last_timestamp.day, last_timestamp.hour,
                   last_timestamp.minute)

    # The previous minute is complete: compute and persist it.
    if current_minute != last_minute:
        self.df = pd.DataFrame(self.item_list)
        self.generate_user_statistic()

        if self.es_actions:
            resp = elasticsearch.helpers.bulk(es_client, self.es_actions)
            self.logger.info("index success:{} failed:{}".format(
                resp[0], len(resp[1])))
            if resp[1]:
                self.logger.error("error:{}".format(resp[1]))

        self.init_new_computing_interval(event_item['timestamp'])
        self.es_actions = []
        self.item_list = []

        self.logger.info("using computing time:{}".format(
            datetime.now() - self.computing_start))
        self.computing_start = datetime.now()

    self.item_list.append(event_item)
def on_init(self):
    """Load the latest user statistic record and reset the working state.

    The newest user statistic record (ordered by ``updateTimestamp``) for
    the RAM-EOS security is wrapped in an ``EosUserStatistic`` and kept on
    ``self.latest_eos_user_statistic_record`` (``None`` when there is
    none).  When a record exists its ``updateTimestamp`` becomes
    ``self.start_timestamp``.  The per-user caches and the pending
    Elasticsearch action list are initialised empty.
    """
    super().on_init()

    self.security_id = 'cryptocurrency_contract_RAM-EOS'
    query = {"term": {"securityId": self.security_id}}

    # Latest user statistic record.
    stored = es_get_latest_record(index=user_statistic_index_name,
                                  query=query,
                                  time_field='updateTimestamp')
    if stored:
        self.latest_eos_user_statistic_record = EosUserStatistic(
            meta={'id': stored['id'], 'index': user_statistic_index_name},
            **stored)
    else:
        self.latest_eos_user_statistic_record = None

    latest = self.latest_eos_user_statistic_record
    if latest:
        self.start_timestamp = to_timestamp(latest['updateTimestamp'])

    self.user_map_latest_user_statistic = {}
    self.user_map_latest_user_daily_statistic = {}
    self.es_actions = []
def update_user_statistic(self, user_id, record, update_timestamp):
    """Apply ``record`` to the running statistic document of one user.

    The document is taken from ``self.user_map_latest_user_statistic``,
    else loaded through ``es_get_user_statistic``, else created fresh;
    loaded and created documents are stored in the cache.  Nothing is
    updated when an existing document's ``updateTimestamp`` is not older
    than ``self.kdata_timestamp``.

    :param user_id: user whose statistic is updated.
    :param record: values forwarded to ``update_statistic_doc``.
    :param update_timestamp: forwarded to ``update_statistic_doc``.
    """
    cache = self.user_map_latest_user_statistic
    statistic = cache.get(user_id)

    if not statistic:
        doc_id = '{}_{}'.format(user_id, self.security_id)
        stored = es_get_user_statistic(user_id=user_id)
        if stored:
            statistic = EosUserStatistic(
                meta={'id': doc_id, 'index': user_statistic_index_name},
                **stored)
            cache[user_id] = statistic

    if statistic:
        # Skip statistics that were already computed for this interval.
        if self.kdata_timestamp <= to_timestamp(
                statistic['updateTimestamp']):
            return
    else:
        # doc_id is always bound here: this branch is only reachable when
        # the lookup above ran and found nothing.
        statistic = EosUserStatistic(
            meta={'id': doc_id, 'index': user_statistic_index_name},
            id=doc_id,
            userId=user_id,
            timestamp=self.last_day_time_str,
            securityId=self.security_id,
            code=self.security_item['code'],
            name=self.security_item['name'])
        cache[user_id] = statistic

    # Update the user statistic.
    self.update_statistic_doc(statistic, record, update_timestamp)
def start_requests(self):
    """Yield one chinaclear request per weekly step since 2015-05-22.

    Dates are generated with ``freq='W'`` up to today.  Each request
    carries the date formatted as ``YYYYMMDD`` in ``meta['the_date']`` and
    is handled by ``download_chinaclear_data_by_date``.
    """
    first_day = to_timestamp('2015-05-22')
    last_day = pd.Timestamp.today()
    base_url = ("http://www.chinaclear.cn/cms-search/view.action"
                "?action=china&dateStr=")

    for week in pd.date_range(start=first_day, end=last_day, freq='W'):
        yield Request(url=base_url + week.strftime('%Y.%m.%d'),
                      meta={'the_date': week.strftime('%Y%m%d')},
                      callback=self.download_chinaclear_data_by_date)
def start_requests(self):
    """Yield requests for the CZCE files missing from the local cache.

    ``self.dataType`` picks the data set:

    * ``None``: ``FutureDataDaily.xls`` for every weekday since January 1
      of the current year.
    * ``'historyk'``: a single request for the history index page,
      handled by ``download_czce_history_data``.
    * ``'inventory'``: ``FutureDataHolding.xls`` for every weekday of the
      last 450 weeks.

    Any other value produces no requests.  Daily files are requested only
    when the cache file does not exist; the cache path is passed to
    ``download_czce_kline_data`` as ``meta['filename']``.
    """
    mode = self.dataType

    if mode is None:
        today = pd.Timestamp.today()
        lookback = pd.Timedelta(days=today.dayofyear - 1)
        cache_type, remote_name = 'day_kdata', "/FutureDataDaily.xls"
    elif mode == 'historyk':
        yield Request(
            url="http://www.czce.com.cn/portal/jysj/qhjysj/lshqxz/A09112017index_1.htm",
            callback=self.download_czce_history_data)
        return
    elif mode == 'inventory':
        today = pd.Timestamp.today()
        lookback = pd.Timedelta(weeks=450)
        cache_type, remote_name = 'inventory', "/FutureDataHolding.xls"
    else:
        return

    for day in pd.date_range(start=today.date() - lookback, end=today):
        # The cache path is resolved for every calendar day, weekends
        # included, exactly as before.
        cache_file = get_exchange_cache_path(
            security_type='future',
            exchange='czce',
            the_date=to_timestamp(day),
            data_type=cache_type) + '.xls'
        if day.dayofweek >= 5 or os.path.exists(cache_file):
            continue
        remote = ("http://www.czce.com.cn/portal/DFSStaticFiles/Future/"
                  + day.strftime("%Y/%Y%m%d") + remote_name)
        yield Request(url=remote,
                      callback=self.download_czce_kline_data,
                      meta={'filename': cache_file})
def after_init(self):
    """Default the start timestamp and reset the interval state.

    When no start timestamp has been set, the security's ``listDate`` is
    used.  The interval bookkeeping fields are cleared, the event buffer
    is emptied and ``self.df`` becomes an empty DataFrame.
    """
    super().after_init()

    if not self.start_timestamp:
        list_date = self.security_item['listDate']
        self.start_timestamp = to_timestamp(list_date)

    # Last timestamp of the computing interval and its string forms.
    self.last_timestamp = None
    self.last_day_time_str = None
    self.last_mirco_time_str = None

    self.df = pd.DataFrame()
    self.item_list = []
    self.computing_start = None
def generate_eos_daily_statistic(self):
    """Fold the current interval's rows in ``self.df`` into the statistic.

    Returns without doing anything when the stored record's
    ``updateTimestamp`` is not older than ``self.kdata_timestamp``.  A new
    ``CommonStatistic`` document is started when there is no record yet or
    when ``is_same_date`` reports that the record's date differs from the
    first row's timestamp.

    Rows with ``direction == 1`` count as flow in and rows with
    ``direction == -1`` as flow out.  Each side is split by turnover into
    big (``>= BIG_ORDER``), middle (``>= MIDDLE_ORDER`` and
    ``< BIG_ORDER``) and small (``< MIDDLE_ORDER``).  The sums are handed
    to ``update_statistic_doc``.
    """
    record = self.latest_statistic_record

    # Ignore statistics that were already computed.
    if record and self.kdata_timestamp <= to_timestamp(
            record['updateTimestamp']):
        return

    # Start a fresh document when needed.
    if (not record) or (not is_same_date(record['timestamp'],
                                         self.df['timestamp'][0])):
        doc_id = "{}_{}".format(self.security_id, self.last_day_time_str)
        self.latest_statistic_record = CommonStatistic(
            meta={'id': doc_id, 'index': statistic_index_name},
            id=doc_id,
            timestamp=self.last_day_time_str,
            securityId=self.security_id,
            code=self.security_item['code'],
            name=self.security_item['name'])

    frame = self.df
    turnover = frame['turnover']
    direction = frame['direction']

    inflow = direction == 1
    outflow = direction == -1
    is_big = turnover >= self.BIG_ORDER
    is_middle = (turnover >= self.MIDDLE_ORDER) & (turnover < self.BIG_ORDER)
    is_small = turnover < self.MIDDLE_ORDER

    totals = {
        'volume': frame['volume'].sum(),
        'turnover': turnover.sum(),
        'flow': (turnover * direction).sum(),
        'flowIn': turnover[inflow].sum(),
        'flowOut': turnover[outflow].sum(),
        'bigFlowIn': turnover[inflow & is_big].sum(),
        'middleFlowIn': turnover[inflow & is_middle].sum(),
        'smallFlowIn': turnover[inflow & is_small].sum(),
        'bigFlowOut': turnover[outflow & is_big].sum(),
        'middleFlowOut': turnover[outflow & is_middle].sum(),
        'smallFlowOut': turnover[outflow & is_small].sum(),
    }

    self.update_statistic_doc(self.latest_statistic_record,
                              totals,
                              updateTimestamp=self.last_mirco_time_str)
def crawl_rollYield_And_Spread():
    """Download the fushare spot-price table for every past calendar date.

    For each date of ``fushare.cons.get_calendar()`` that is not after
    today, the spot prices are fetched and written to
    ``<cache path>spotPrice<date>.csv``.  Failures for a single date are
    reported on stdout and the loop continues.

    Changes from the original:

    * The cache check now also tests the CSV file that is actually
      written.  Before, only the bare cache path was tested, which this
      function never creates, so every run downloaded everything again.
    * ``except Exception`` replaces ``except BaseException`` so that
      Ctrl-C and interpreter shutdown are no longer swallowed.
    """
    # The result was never used; the call is kept in case it has side
    # effects such as creating the directory -- TODO confirm and drop.
    get_exchange_cache_dir(security_type='future',
                           exchange='shfe',
                           data_type="day_kdata")

    today = pd.Timestamp.today()
    past_dates = [
        day for day in fushare.cons.get_calendar()
        if datetime.strptime(day, '%Y%m%d') <= today
    ]

    for date in past_dates:
        the_dir = get_exchange_cache_path(security_type='future',
                                          exchange='shfe',
                                          the_date=to_timestamp(date),
                                          data_type='misc')
        target = the_dir + 'spotPrice' + date + '.csv'
        # The original test on the bare path is kept so that anything
        # skipped before is still skipped.
        if os.path.exists(the_dir) or os.path.exists(target):
            continue

        try:
            spdf = fushare.get_spotPrice(date)
            spdf.to_csv(target)
        except Exception:
            print("not downloaded for " + date)
def on_event(self, event_item):
    """Buffer an event and emit kdata and statistics once per minute.

    Events are collected in ``self.item_list``.  When an event falls in a
    different calendar minute than the current interval, the buffered
    items are turned into ``self.df``, ``generate_1min_kdata`` and
    ``generate_eos_daily_statistic`` run, and a new interval starts at the
    event's timestamp.  The triggering event is then buffered for the new
    interval.

    The rollover compares year, month, day, hour and minute.  Previously
    only the minute-of-hour was compared, so two events exactly N hours
    apart (for example 10:05 and 11:05 with nothing in between) were
    merged into one interval.

    :param event_item: mapping with at least a ``'timestamp'`` entry.
    """
    if not self.computing_start:
        self.computing_start = datetime.now()
    if not self.last_timestamp:
        self.init_new_computing_interval(event_item['timestamp'])

    current_timestamp = to_timestamp(event_item['timestamp'])
    last_timestamp = self.last_timestamp

    current_minute = (current_timestamp.year, current_timestamp.month,
                      current_timestamp.day, current_timestamp.hour,
                      current_timestamp.minute)
    last_minute = (last_timestamp.year, last_timestamp.month,
                   last_timestamp.day, last_timestamp.hour,
                   last_timestamp.minute)

    # The previous minute is complete: compute it.
    if current_minute != last_minute:
        self.df = pd.DataFrame(self.item_list)
        self.generate_1min_kdata()
        self.generate_eos_daily_statistic()

        self.init_new_computing_interval(event_item['timestamp'])
        self.item_list = []

        self.logger.info("using computing time:{}".format(
            datetime.now() - self.computing_start))
        self.computing_start = datetime.now()

    self.item_list.append(event_item)
def get_latest_timestamp_order_from_topic(topic):
    """Return ``(timestamp, order)`` of the last record in ``topic``.

    Only partition 0 is inspected.  The timestamp is taken from the
    record's ``'timestamp'`` field; ``order`` is the record's ``'order'``
    field or ``None`` when absent.

    Changes from the original: the consumer is always closed, and a poll
    that returns no records for the partition yields ``(None, None)``
    instead of raising ``KeyError``.

    :param topic: Kafka topic name.
    :return: ``(timestamp, order)``, or ``(None, None)`` when the topic is
        empty or the last record could not be read in time.
    """
    consumer = KafkaConsumer(
        topic,
        value_deserializer=lambda m: json.loads(m.decode('utf8')),
        bootstrap_servers=[KAFKA_HOST])
    try:
        topic_partition = TopicPartition(topic=topic, partition=0)
        end_offset = consumer.end_offsets([topic_partition])[topic_partition]
        if end_offset <= 0:
            return None, None

        # The partition is only assigned after a poll; seek works after it.
        consumer.poll(5, 1)
        consumer.seek(topic_partition, end_offset - 1)

        polled = consumer.poll(10000, 500)
        # poll() returns an empty mapping when nothing arrived in time.
        msgs = polled.get(topic_partition, [])
        if not msgs:
            return None, None

        record = msgs[-1]
        timestamp = to_timestamp(record.value['timestamp'])
        order = record.value.get('order')
        return timestamp, order
    finally:
        consumer.close()
def eos_ram_to_kafka():
    """Continuously copy RAM trades from MongoDB to the Kafka tick topic.

    Publishing resumes after the latest timestamp/order already recorded
    for the security, or from the earliest stored trade when nothing has
    been published.  Trades are read in time windows produced by
    ``evaluate_time_range``.  Once the latest published record is less
    than five minutes old, the loop switches to polling every two seconds
    for trades with a ``global_seq`` above the last published order.

    This function loops forever; it only returns early when there is
    nothing to start from.

    Changes from the original:

    * In polling mode ``end_date`` is ``None``.  Assigning it to
      ``latest_timestamp`` made ``datetime.now() - latest_timestamp``
      raise ``TypeError`` whenever a poll found no new trade.  The
      previous value is now kept.
    * An empty collection with no previously published record is logged
      and the function returns instead of failing on
      ``earliest_record['block_time']``.
    """
    ram_trade = db.ram_trade

    logger.info("collection:{}".format(ram_trade))

    earliest_record = ram_trade.find_one({
        "$query": {},
        "$orderby": {"global_seq": 1}
    })
    latest_record = ram_trade.find_one({
        "$query": {},
        "$orderby": {"global_seq": -1}
    })

    logger.info("earliest_record:{},latest_record:{}".format(
        earliest_record, latest_record))

    security_id = 'cryptocurrency_contract_RAM-EOS'
    latest_timestamp, latest_order = get_latest_timestamp_order(security_id)
    topic = get_kafka_tick_topic(security_id)

    if not latest_timestamp:
        if earliest_record is None:
            logger.warning("no record in collection:{}".format(ram_trade))
            return
        latest_timestamp = earliest_record['block_time']

    start_date, end_date = evaluate_time_range(latest_timestamp)

    while True:
        window = start_date and end_date
        if latest_order and window:
            condition = {
                "block_time": {"$gte": start_date, "$lt": end_date},
                "global_seq": {"$gt": latest_order}
            }
        elif window:
            condition = {"block_time": {"$gte": start_date, "$lt": end_date}}
        elif latest_order:
            condition = {"global_seq": {"$gt": latest_order}}
        # NOTE(review): with no window and no order the condition of the
        # previous iteration is reused, as in the original.

        logger.info("start_date:{},end_date:{},order:{}".format(
            start_date, end_date, latest_order))

        # An empty window still advances time to its end; polling mode has
        # no window end, so the last known timestamp is kept.
        if end_date is not None:
            latest_timestamp = end_date

        for item in ram_trade.find(condition):
            tick = to_tick(item)

            record_meta = producer.send(
                topic,
                bytes(json.dumps(tick, ensure_ascii=False), encoding='utf8'),
                key=bytes(security_id, encoding='utf8'),
                timestamp_ms=int(item['block_time'].timestamp() * 1000))
            # Wait up to 10 seconds for the broker acknowledgement.
            record = record_meta.get(10)

            latest_timestamp = to_timestamp(record.timestamp)
            latest_order = tick['order']

            logger.debug("tick_to_kafka {}".format(tick))

        if datetime.now() - latest_timestamp < timedelta(minutes=5):
            # Caught up: poll by sequence number only.
            time.sleep(2)
            logger.info("record latest_timestamp:{},now is:{}".format(
                latest_timestamp, datetime.now()))
            start_date = None
            end_date = None
        else:
            start_date, end_date = evaluate_time_range(latest_timestamp)
def request_inventory_data(self):
    """Build the DCE batch-export requests missing from the local cache.

    One ``FormRequest`` is created per weekday since 2020-01-01 whose
    ``.zip`` cache file does not exist.  The form sends the month as
    ``date.month - 1``.  The cache path is passed to
    ``download_dce_kline_data`` as ``meta['filename']``.

    :return: list of ``FormRequest`` objects, possibly empty.
    """
    export_url = ("http://www.dce.com.cn/publicweb/quotesdata/"
                  "exportMemberDealPosiQuotesBatchData.html")
    now = pd.Timestamp.today()
    pending = []

    for day in pd.date_range(start='20200101', end=now):
        archive = get_exchange_cache_path(
            security_type='future',
            exchange='dce',
            the_date=to_timestamp(day),
            data_type="day_inventory") + '.zip'
        if day.dayofweek >= 5 or os.path.exists(archive):
            continue

        form = {
            'batchExportFlag': 'batch',
            'contract.contract_id': 'all',
            'contract.variety_id': 'a',
            'year': str(day.year),
            'month': str(day.month - 1),
            'day': str(day.day),
            'memberDealPosiQuotes.trade_type': '0',
            'memberDealPosiQuotes.variety': 'all',
        }
        pending.append(FormRequest(url=export_url,
                                   formdata=form,
                                   callback=self.download_dce_kline_data,
                                   meta={'filename': archive}))

    return pending
def consume_topic_with_func(self, topic, func):
    """Drive the bot from a timer or by replaying a Kafka topic.

    Without a topic, ``on_timer`` is called in an endless loop while
    ``self.current_time`` advances by ``self.time_step``; the loop sleeps
    for one step whenever ``is_same_date`` reports that the current time
    is on today's date.  That loop never exits.

    With a topic, partition 0 is replayed from ``self.start_date`` and
    each message value is passed to the method of ``self`` named ``func``.
    When ``self.end_date`` is set, consumption stops at the first message
    newer than it or at the last offset known when consumption started.

    Changes from the original:

    * ``offsets_for_times`` expects milliseconds; the start time was given
      in seconds, which resolved to the beginning of the topic.
    * The record timestamp in the "start after last record" warning is in
      milliseconds and is now divided by 1000 before conversion.
    * An empty poll result in that warning path no longer raises.
    * The unreachable ``if False:`` branch was removed.

    :param topic: Kafka topic name, or a falsy value for timer mode.
    :param func: name of the method on ``self`` called with each message
        value.
    """
    if not topic:
        while True:
            self.on_timer({"timestamp": self.current_time})

            if is_same_date(self.current_time, pd.Timestamp.now()):
                time.sleep(self.time_step.total_seconds())

            self.current_time += self.time_step

    consumer = KafkaConsumer(
        topic,
        value_deserializer=lambda m: json.loads(m.decode('utf8')),
        bootstrap_servers=[KAFKA_HOST])
    topic_partition = TopicPartition(topic=topic, partition=0)
    # Kafka timestamps are expressed in milliseconds.
    start_timestamp = int(self.start_date.timestamp() * 1000)

    end_offset = consumer.end_offsets([topic_partition])[topic_partition]
    if end_offset == 0:
        self.logger.warning("topic:{} end offset:{}".format(
            topic, end_offset))
        # Nothing can be decided until the topic has data: wait for the
        # first message, then rewind to the beginning.
        for message in consumer:
            self.logger.info("first message:{} to topic:{}".format(
                message, topic))
            break
        consumer.poll(5, 1)
        consumer.seek(topic_partition, 0)

    # Find the first offset at or after start_timestamp.
    partition_map_offset_and_timestamp = consumer.offsets_for_times(
        {topic_partition: start_timestamp})

    if partition_map_offset_and_timestamp:
        offset_and_timestamp = partition_map_offset_and_timestamp[
            topic_partition]

        if offset_and_timestamp:
            # The partition is only assigned after a poll; seek works
            # after it.
            consumer.poll(5, 1)
            consumer.seek(topic_partition, offset_and_timestamp.offset)
            # Largest offset at this moment.
            end_offset = consumer.end_offsets(
                [topic_partition])[topic_partition]

            for message in consumer:
                if 'timestamp' in message.value:
                    message_time = to_timestamp(message.value['timestamp'])
                else:
                    message_time = to_timestamp(message.timestamp)

                # With an end date set, stop when it is passed or when
                # Kafka has no more data.
                # NOTE(review): the offset test fires before the handler
                # runs, so the final message is not dispatched -- confirm
                # this is intended.
                if self.end_date and (message_time > self.end_date
                                      or message.offset + 1 == end_offset):
                    consumer.close()
                    break

                self.current_time = message_time
                getattr(self, func)(message.value)
        else:
            # No offset at or after the start: report the last record.
            consumer.poll(5, 1)
            consumer.seek(
                topic_partition,
                consumer.end_offsets([topic_partition])[topic_partition] - 1)
            message = consumer.poll(5000, 1)
            records = message.get(topic_partition)
            kafka_end_date = None
            if records:
                kafka_end_date = datetime.fromtimestamp(
                    records[0].timestamp / 1000).strftime(TIME_FORMAT_DAY)
            self.logger.warning(
                "start:{} is after the last record:{}".format(
                    self.start_date, kafka_end_date))
def request_currentyear_kdata(self):
    """Build the DCE daily-quote export requests missing from the cache.

    One ``FormRequest`` is created per weekday since 2020-01-01 whose
    ``.xls`` cache file does not exist.  The form sends the month as
    ``date.month - 1``.  The cache path is passed to
    ``download_dce_kline_data`` as ``meta['filename']``.

    :return: list of ``FormRequest`` objects, possibly empty.
    """
    export_url = ("http://www.dce.com.cn/publicweb/quotesdata/"
                  "exportDayQuotesChData.html")
    now = pd.Timestamp.today()
    pending = []

    for day in pd.date_range(start='20200101', end=now):
        sheet = get_exchange_cache_path(
            security_type='future',
            exchange='dce',
            the_date=to_timestamp(day),
            data_type="day_kdata") + '.xls'
        if day.dayofweek >= 5 or os.path.exists(sheet):
            continue

        form = {
            'year': str(day.year),
            'month': str(day.month - 1),
            'day': str(day.day),
            'dayQuotes.trade_type': '0',
            'dayQuotes.variety': 'all',
            'exportType': 'excel',
        }
        pending.append(FormRequest(url=export_url,
                                   formdata=form,
                                   callback=self.download_dce_kline_data,
                                   meta={'filename': sheet}))

    return pending
def es_get_latest_timestamp(index, time_field='timestamp', query=None):
    """Return the ``time_field`` value of the latest record in ``index``.

    The record is looked up with ``es_get_latest_record`` and the value of
    its ``time_field`` entry is converted with ``to_timestamp``.

    The original always read the ``'timestamp'`` entry, even when another
    ``time_field`` was used to select the record.  Behavior with the
    default ``time_field`` is unchanged.

    :param index: index to query.
    :param time_field: name of the time field used for the lookup and read
        from the record.
    :param query: optional query passed to ``es_get_latest_record``.
    :return: the converted timestamp, or ``None`` when no record exists.
    """
    latest_record = es_get_latest_record(index, time_field, query)
    if not latest_record:
        return None
    return to_timestamp(latest_record[time_field])