def load_data(self):
    """Fetch this reader's data into ``self.data_df`` and notify listeners.

    Queries by the explicit ``security_list`` when one is set, otherwise
    by ``codes``.  A non-empty result is re-indexed by
    (category, timestamp) before every registered listener's
    ``on_data_loaded`` is invoked.
    """
    query_kwargs = dict(data_schema=self.data_schema,
                        provider=self.provider,
                        columns=self.columns,
                        start_timestamp=self.start_timestamp,
                        end_timestamp=self.end_timestamp,
                        filters=self.filters,
                        level=self.level)

    if self.security_list:
        self.data_df = get_data(security_list=self.security_list, **query_kwargs)
    else:
        self.data_df = get_data(codes=self.codes, **query_kwargs)

    if df_is_not_null(self.data_df):
        # normalize to a (category, timestamp) MultiIndex before fan-out
        self.data_df = index_df_with_category_time(self.data_df,
                                                   category=self.category_field)

    for listener in self.data_listeners:
        listener.on_data_loaded(self.data_df)
def common_data(data_schema, security_id=None, codes=None, level=None,
                provider='eastmoney', columns=None, start_timestamp=None,
                end_timestamp=None, filters=None, session=None, order=None,
                limit=None):
    """Return a list of DataFrames, one per security.

    A single ``security_id`` yields a one-element list; otherwise each
    code in ``codes`` is queried on its own.  When neither is given the
    function returns None (implicitly), matching the historical contract.
    """
    shared = dict(data_schema=data_schema, level=level, provider=provider,
                  columns=columns, return_type='df',
                  start_timestamp=start_timestamp, end_timestamp=end_timestamp,
                  filters=filters, session=session, order=order, limit=limit)

    if security_id:
        return [get_data(security_id=security_id, codes=None, **shared)]

    if codes:
        # one independent query per code so each frame stays per-security
        return [get_data(security_id=None, codes=[code], **shared)
                for code in codes]
def get_manager_trading(provider='eastmoney', security_id=None, codes=None,
                        columns=None, return_type='df', session=None,
                        start_timestamp=None, end_timestamp=None, filters=None,
                        order=None, limit=None):
    """Query ManagerTrading records; a thin pass-through to ``get_data``."""
    return get_data(data_schema=ManagerTrading, security_id=security_id,
                    codes=codes, level=None, provider=provider,
                    columns=columns, return_type=return_type,
                    start_timestamp=start_timestamp,
                    end_timestamp=end_timestamp, filters=filters,
                    session=session, order=order, limit=limit)
def evaluate_start_end_size_timestamps(self, security_item):
    """
    evaluate the size for recording data

    :param security_item: the security being recorded
    :type security_item: str
    :return: the (start, end, size, timestamps) to record;
             size == 0 means recording is already finished
    :rtype: (pd.Timestamp, pd.Timestamp, int, list)
    """
    # the most recent record persisted for this security, if any
    records = get_data(security_id=security_item.id,
                       provider=self.provider,
                       data_schema=self.data_schema,
                       order=self.data_schema.timestamp.desc(),
                       limit=1,
                       return_type='domain',
                       session=self.session)

    latest_timestamp = records[0].timestamp if records else security_item.timestamp

    if not latest_timestamp:
        return None, None, self.default_size, None

    return latest_timestamp, None, self.default_size, None
def get_kdata(security_id, level=TradingLevel.LEVEL_1DAY.value,
              provider='eastmoney', columns=None, return_type='df',
              start_timestamp=None, end_timestamp=None, filters=None,
              session=None, order=None, limit=None):
    """Fetch kdata (candlestick) records for one security at the given level."""
    # the schema to hit depends on the security type encoded in the id;
    # exchange and code are not needed here
    security_type, _exchange, _code = decode_security_id(security_id)
    data_schema = get_kdata_schema(security_type, level=level)

    return get_data(data_schema=data_schema, security_id=security_id,
                    level=level, provider=provider, columns=columns,
                    return_type=return_type, start_timestamp=start_timestamp,
                    end_timestamp=end_timestamp, filters=filters,
                    session=session, order=order, limit=limit)
def get_account(trader_name=None, return_type='df', start_timestamp=None,
                end_timestamp=None, filters=None, session=None, order=None,
                limit=None):
    """Load SimAccount records, optionally narrowed to a single trader."""
    if trader_name:
        trader_filter = [SimAccount.trader_name == trader_name]
        # extend a copy rather than the caller's list
        filters = (filters + trader_filter) if filters else trader_filter

    return get_data(data_schema=SimAccount, security_id=None, codes=None,
                    level=None, provider='zvt', columns=None,
                    return_type=return_type, start_timestamp=start_timestamp,
                    end_timestamp=end_timestamp, filters=filters,
                    session=session, order=order, limit=limit)
def evaluate_start_end_size_timestamps(self, security_item):
    """Decide the recording window from the pre-computed time series.

    Uses ``self.security_timestamps_map`` (filled by ``init_timestamps``)
    and drops every timestamp at or before the latest persisted record.

    :param security_item: the security being recorded
    :return: (start, end, size, timestamps); size == 0 means nothing new
    :raises Exception: when no time series can be obtained for the security
    """
    the_timestamps = self.security_timestamps_map.get(security_item.id)
    if not the_timestamps:
        self.init_timestamps(security_item)
        the_timestamps = self.security_timestamps_map.get(security_item.id)

    if not the_timestamps:
        # FIX: was `assert False`, which is stripped under `python -O`
        # (and logger.exception outside an except block logs a bogus
        # "NoneType: None" traceback) — log and raise explicitly instead
        self.logger.error("could not get time series for:{}".format(security_item.id))
        raise Exception("could not get time series for:{}".format(security_item.id))

    timestamps = [to_pd_timestamp(t) for t in the_timestamps]
    timestamps.sort()

    self.logger.info(
        'security_id:{},init timestamps start:{},end:{}'.format(
            security_item.id, timestamps[0], timestamps[-1]))

    # the newest persisted record decides where to resume
    latest_record = get_data(security_id=security_item.id,
                             provider=self.provider,
                             data_schema=self.data_schema,
                             order=self.data_schema.timestamp.desc(),
                             limit=1,
                             return_type='domain',
                             session=self.session)

    if latest_record:
        self.logger.info('latest record timestamp:{}'.format(latest_record[0].timestamp))
        # keep only timestamps strictly after what is already stored
        timestamps = [t for t in timestamps if t > latest_record[0].timestamp]
        if timestamps:
            return timestamps[0], timestamps[-1], len(timestamps), timestamps
        return None, None, 0, None

    return timestamps[0], timestamps[-1], len(timestamps), timestamps
def generate_domain(self, security_item, original_data):
    """Build (or refresh) a ``data_schema`` instance from one recorded dict.

    :param security_item: the security the record belongs to
    :param original_data: raw dict produced by the recorder
    :return: a populated domain instance, or None when the record was
             already saved and ``force_update`` is off
    """
    the_id = self.generate_domain_id(security_item, original_data)

    existing = get_data(data_schema=self.data_schema, session=self.session,
                        provider=self.provider, security_id=security_item.id,
                        filters=[self.data_schema.id == the_id],
                        return_type='domain')

    if existing and not self.force_update:
        self.logger.info('ignore the data {}:{} saved before'.format(self.data_schema, the_id))
        return None

    if existing:
        domain_item = existing[0]
    else:
        timestamp_str = original_data[self.get_timestamp_field()]
        timestamp = None
        try:
            timestamp = to_pd_timestamp(timestamp_str)
        except Exception as e:
            # an unparsable timestamp is logged, not fatal
            self.logger.exception(e)
        domain_item = self.data_schema(id=the_id,
                                       code=security_item.code,
                                       security_id=security_item.id,
                                       timestamp=timestamp)

    fill_domain_from_dict(domain_item, original_data, self.get_data_map())
    return domain_item
def get_trader(trader_name=None, return_type='df', start_timestamp=None,
               end_timestamp=None, filters=None, session=None, order=None,
               limit=None) -> List[business.Trader]:
    """Load Trader records, optionally narrowed to one ``trader_name``.

    NOTE(review): the return annotation is only accurate for
    return_type='domain'; the default 'df' yields a DataFrame — confirm
    callers' expectations before tightening it.
    """
    if trader_name:
        name_filter = [business.Trader.trader_name == trader_name]
        # extend a copy rather than the caller's list
        filters = (filters + name_filter) if filters else name_filter

    return get_data(data_schema=business.Trader, security_id=None, codes=None,
                    level=None, provider='zvt', columns=None,
                    return_type=return_type, start_timestamp=start_timestamp,
                    end_timestamp=end_timestamp, filters=filters,
                    session=session, order=order, limit=limit)
def get_top_ten_tradable_holder(provider='eastmoney', security_id=None,
                                codes=None, columns=None, return_type='df',
                                session=None, start_timestamp=None,
                                end_timestamp=None, filters=None, order=None,
                                limit=None):
    """Query TopTenTradableHolder records; a thin pass-through to ``get_data``."""
    return get_data(data_schema=TopTenTradableHolder, security_id=security_id,
                    codes=codes, level=None, provider=provider,
                    columns=columns, return_type=return_type,
                    start_timestamp=start_timestamp,
                    end_timestamp=end_timestamp, filters=filters,
                    session=session, order=order, limit=limit)
def __init__(self, security_type=SecurityType.stock, exchanges=None, codes=None,
             the_timestamp=None, window=None, window_func='mean',
             start_timestamp=None, end_timestamp=None, keep_all_timestamp=False,
             fill_method='ffill', columns=None, filters=None,
             provider='eastmoney') -> None:
    """Load the schema's data and optionally aggregate it over a rolling window.

    :param window: a timedelta; when set, ``window_func`` ('mean' or
                   'count') is applied per security over that many days
    :param columns: schema columns to select; security_id and timestamp
                    are always added so the frame can be indexed
    """
    # FIX: exchanges/columns used to be mutable defaults (['sh','sz'] / []),
    # which Python shares across all calls; use None sentinels instead.
    if exchanges is None:
        exchanges = ['sh', 'sz']
    if columns is None:
        columns = []

    super().__init__(security_type, exchanges, codes, the_timestamp, window,
                     window_func, start_timestamp, end_timestamp,
                     keep_all_timestamp, fill_method)

    # always keep the index columns alongside whatever the caller asked for
    self.columns = set(columns) | {
        self.data_schema.security_id, self.data_schema.timestamp
    }
    self.factors = [item.key for item in columns]
    self.provider = provider

    self.original_df = get_data(data_schema=self.data_schema,
                                provider=self.provider,
                                codes=self.codes,
                                columns=self.columns,
                                start_timestamp=self.fetch_start_timestamp,
                                end_timestamp=self.end_timestamp,
                                filters=filters)
    self.original_df = index_df_with_security_time(self.original_df)

    self.logger.info(self.original_df)

    if self.window:
        self.data_df = self.original_df.reset_index(level='timestamp')
        # TODO:better way to handle window function
        if self.window_func == 'mean':
            self.data_df = self.data_df.groupby(level=0).rolling(
                window='{}D'.format(self.window.days), on='timestamp').mean()
        elif self.window_func == 'count':
            self.data_df = self.data_df.groupby(level=0).rolling(
                window='{}D'.format(self.window.days), on='timestamp').count()
        self.data_df = self.data_df.reset_index(level=0, drop=True)
        self.data_df = self.data_df.set_index('timestamp', append=True)
        # FIX: debug print() replaced with the instance logger
        self.logger.info(self.data_df)
    else:
        self.data_df = self.original_df

    # narrow the (security, timestamp) index to the requested time span
    self.data_df = self.data_df.loc[(
        slice(None), slice(self.start_timestamp, self.end_timestamp)), :]

    self.logger.info(self.data_df)
def df_to_db(df, data_schema, provider):
    """Append the rows of *df* to ``data_schema``'s table, skipping stored ids.

    :param df: DataFrame with an 'id' column matching the schema's id
    :param data_schema: the declarative schema class to write to
    :param provider: data provider whose database engine is used
    """
    store_category = get_store_category(data_schema)
    db_engine = get_db_engine(provider, store_category=store_category)

    # FIX: query the same provider we are about to write to, so the
    # duplicate-id check runs against the right database (previously
    # get_data fell back to its default provider)
    current = get_data(data_schema=data_schema, columns=[data_schema.id],
                       provider=provider)

    # FIX: on a brand-new/empty table there is nothing to exclude;
    # guard before indexing into the result
    if df_is_not_null(current):
        df = df[~df['id'].isin(current['id'])]

    df.to_sql(data_schema.__tablename__, db_engine, index=False,
              if_exists='append')
def move_on(self, to_timestamp, touching_timestamp):
    """Block until new kdata up to *to_timestamp* is available, then append it.

    For each security already held in ``original_df``, waits (sleeping while
    *touching_timestamp* is still in the future) until records newer than the
    last recorded timestamp appear, appends them to ``original_df`` and fires
    ``on_data_added``.  Gives up on a security half a bar after
    *touching_timestamp* has passed.

    :param to_timestamp: fetch data up to this timestamp
    :param touching_timestamp: the time at which the data is expected to
        become available upstream — TODO confirm exact semantics with caller
    """
    # last recorded timestamp per security, from the in-memory frame
    df = self.original_df.reset_index(level='timestamp')
    recorded_timestamps = df.groupby(level=0)['timestamp'].max()

    self.logger.info('current_timestamps:\n{}'.format(recorded_timestamps))

    for security_id, recorded_timestamp in recorded_timestamps.iteritems():
        while True:
            now_timestamp = now_pd_timestamp()
            if touching_timestamp > now_timestamp:
                # data cannot exist yet; sleep until the touching time
                delta = (touching_timestamp - now_timestamp).seconds
                self.logger.info(
                    'want to get {} {} kdata for {},now is:{},waiting:{}sencods'
                    .format(to_timestamp, touching_timestamp, security_id,
                            now_timestamp, delta))
                time.sleep(delta)

            added = get_data(data_schema=self.data_schema,
                             provider=self.provider,
                             security_id=security_id,
                             columns=self.columns,
                             start_timestamp=recorded_timestamp,
                             end_timestamp=to_timestamp,
                             filters=self.filters,
                             level=self.level)

            if (added is not None) and not added.empty:
                # drop the boundary row we already hold
                would_added = added[
                    added['timestamp'] != recorded_timestamp]
                if not would_added.empty:
                    would_added = index_df_with_security_time(would_added)
                    self.logger.info(
                        'would_added:\n{}'.format(would_added))

                    self.original_df = self.original_df.append(would_added)
                    self.original_df = self.original_df.sort_index(
                        level=[0, 1])
                    self.on_data_added(security_id=security_id,
                                       size=len(would_added))
                # got data for this security — move to the next one
                break
            else:
                self.logger.info(
                    'touching_timestamp:{} now_timestamp:{} kdata for {} not ready'
                    .format(touching_timestamp, now_pd_timestamp(), security_id))
                # give up after waiting half a bar past the touching time
                if now_timestamp > touching_timestamp + pd.Timedelta(
                        seconds=self.level.to_second() / 2):
                    self.logger.warning(
                        'now_timestamp:{},still could not get {} {} kdata for {}'
                        .format(now_timestamp, to_timestamp,
                                touching_timestamp, security_id))
                    break
def evaluate_start_end_size_timestamps(self, security_item):
    """
    evaluate the size for recording data

    :param security_item: the security being recorded
    :type security_item: str
    :return: the (start, end, size, timestamps) to record;
             size == 0 means recording is already finished
    :rtype: (pd.Timestamp, pd.Timestamp, int, list)
    """
    # the most recent record persisted at this level, if any
    latest_record = get_data(security_id=security_item.id,
                             provider=self.provider,
                             data_schema=self.data_schema,
                             level=self.level.value,
                             order=self.data_schema.timestamp.desc(),
                             limit=1,
                             return_type='domain',
                             session=self.session)

    latest_timestamp = latest_record[0].timestamp if latest_record else security_item.timestamp

    if not latest_timestamp:
        return latest_timestamp, None, self.default_size, None

    current_time = pd.Timestamp.now()
    time_delta = current_time - latest_timestamp

    # daily level: one bar per elapsed day
    if self.level == TradingLevel.LEVEL_1DAY:
        if is_same_date(current_time, latest_timestamp):
            return latest_timestamp, None, 0, None
        return latest_timestamp, None, time_delta.days + 1, None

    close_hour, close_minute = get_close_time(security_item.id)

    # same day: nothing more once the last bar is the closing bar
    if time_delta.days == 0 and latest_timestamp.hour == close_hour \
            and latest_timestamp.minute == close_minute:
        return latest_timestamp, None, 0, None

    # intraday levels share the same arithmetic, differing only in the
    # number of minutes per bar
    bar_minutes = {TradingLevel.LEVEL_5MIN: 5,
                   TradingLevel.LEVEL_1HOUR: 60}.get(self.level)
    if bar_minutes is not None:
        if time_delta.days > 0:
            minutes = (time_delta.days + 1) * get_one_day_trading_minutes(security_item.id)
            size = int(math.ceil(minutes / bar_minutes)) + 1
        else:
            size = int(math.ceil(time_delta.total_seconds() / (bar_minutes * 60))) + 1
        return latest_timestamp, None, size, None
def finance_score(data_schema, security_id=None, codes=None,
                  provider='eastmoney', fields=None, timestamp=None,
                  report_count=20):
    """Score securities by the quantile rank of their mean finance fields.

    Averages each requested field over the last ``report_count`` report
    dates per security, then maps every value to the 0.1/0.3/0.5/0.7/0.9
    quantile bucket it exceeds.

    :param fields: list of schema columns to score (required)
    :param timestamp: only reports up to this time are used; defaults to
        "now".  FIX: the old default ``timestamp=now_pd_timestamp()`` was
        evaluated once at import time, freezing the cutoff for the whole
        process lifetime — it is now resolved per call.
    :param report_count: number of latest report dates to average over
    """
    if timestamp is None:
        timestamp = now_pd_timestamp()

    fields = fields + ['security_id', 'timestamp', 'report_date']

    data_df = get_data(data_schema=data_schema, security_id=security_id,
                       codes=codes, provider=provider, columns=fields,
                       end_timestamp=timestamp)

    # the last `report_count` distinct report dates
    time_series = data_df['report_date'].drop_duplicates()
    time_series = time_series[-report_count:]

    data_df = index_df_with_security_time(data_df)

    idx = pd.IndexSlice
    df = data_df.loc[idx[:, time_series], ]
    print(df)
    # mean of each field per security over the selected reports
    df = df.groupby(df['security_id']).mean()
    print(df)

    quantile = df.quantile([0.1, 0.3, 0.5, 0.7, 0.9])

    def evaluate_score(s, column):
        # map a value to the highest quantile threshold it exceeds
        the_column = column
        if s > quantile.loc[0.9, the_column]:
            return 0.9
        if s > quantile.loc[0.7, the_column]:
            return 0.7
        if s > quantile.loc[0.5, the_column]:
            return 0.5
        if s > quantile.loc[0.3, the_column]:
            return 0.3
        if s > quantile.loc[0.1, the_column]:
            return 0.1
        return 0

    for item in quantile.columns:
        df[item] = df[item].apply(lambda x: evaluate_score(x, item))

    print(df)
def get_securities(security_list: List[str] = None,
                   security_type: Union[SecurityType, str] = 'stock',
                   exchanges: List[str] = None,
                   codes: List[str] = None,
                   columns: List = None,
                   return_type: str = 'df',
                   session: Session = None,
                   start_timestamp: Union[str, pd.Timestamp] = None,
                   end_timestamp: Union[str, pd.Timestamp] = None,
                   filters: List = None,
                   order: object = None,
                   limit: int = None,
                   provider: Union[str, Provider] = 'eastmoney',
                   index: str = 'code',
                   index_is_time: bool = False) -> object:
    """Query security (meta) records for the given type.

    :param exchanges: optional exchange codes used as an extra filter
    :param index: column to index the resulting frame by
    :return: records in the shape selected by ``return_type``
    """
    data_schema = get_security_schema(security_type)

    if not order:
        order = data_schema.code.asc()

    if exchanges:
        exchange_filter = [data_schema.exchange.in_(exchanges)]
        # FIX: build a new list instead of filters.append(...), which
        # mutated the caller's `filters` argument (the sibling getters
        # already use the non-mutating form)
        filters = (filters + exchange_filter) if filters else exchange_filter

    return get_data(data_schema=data_schema, security_list=security_list,
                    security_id=None, codes=codes, level=None,
                    provider=provider, columns=columns,
                    return_type=return_type, start_timestamp=start_timestamp,
                    end_timestamp=end_timestamp, filters=filters,
                    session=session, order=order, limit=limit, index=index,
                    index_is_time=index_is_time)
def get_cash_flow_statement(provider='eastmoney', security_id=None, codes=None,
                            columns=None, return_type='df', session=None,
                            start_timestamp=None, end_timestamp=None,
                            filters=None, order=None, limit=None):
    """Query CashFlowStatement records; a thin pass-through to ``get_data``."""
    return get_data(data_schema=CashFlowStatement, security_id=security_id,
                    codes=codes, level=None, provider=provider,
                    columns=columns, return_type=return_type,
                    start_timestamp=start_timestamp,
                    end_timestamp=end_timestamp, filters=filters,
                    session=session, order=order, limit=limit)
def move_on(self, to_timestamp: Union[str, pd.Timestamp] = None,
            timeout: int = 20) -> bool:
    """
    get the data happened before to_timestamp,if not set,get all the data which means to now

    Parameters
    ----------
    to_timestamp :
    timeout : the time waiting the data ready in seconds

    Returns
    -------
    whether got data
    """
    # nothing loaded yet: do the initial load and report "no new data"
    if not df_is_not_null(self.data_df):
        self.load_data()
        return False

    df = self.data_df.reset_index(level='timestamp')
    # newest timestamp already held, per category
    recorded_timestamps = df.groupby(level=0)['timestamp'].max()

    self.logger.info('level:{},current_timestamps:\n{}'.format(
        self.level, recorded_timestamps))

    changed = False
    # FIXME:we suppose history data should be there at first
    start_time = time.time()
    for category, recorded_timestamp in recorded_timestamps.iteritems():
        # poll until newer rows for this category arrive or the shared
        # timeout budget is exhausted
        while True:
            category_filter = [self.category_column == category]
            if self.filters:
                filters = self.filters + category_filter
            else:
                filters = category_filter

            added = get_data(data_schema=self.data_schema,
                             provider=self.provider,
                             columns=self.columns,
                             start_timestamp=recorded_timestamp,
                             end_timestamp=to_timestamp,
                             filters=filters,
                             level=self.level)

            if df_is_not_null(added):
                # drop the boundary row we already hold
                would_added = added[
                    added['timestamp'] != recorded_timestamp].copy()
                if not would_added.empty:
                    added = index_df_with_category_time(
                        would_added, category=self.category_field)
                    self.logger.info('category:{},added:\n{}'.format(
                        category, added))

                    self.data_df = self.data_df.append(added)
                    self.data_df = self.data_df.sort_index(level=[0, 1])

                    for listener in self.data_listeners:
                        listener.on_category_data_added(category=category,
                                                        added_data=added)
                    changed = True
                # if got data,just move to another category
                break

            # NOTE: the timeout is measured from the start of the whole
            # call, so late categories get less waiting time
            cost_time = time.time() - start_time
            if cost_time > timeout:
                self.logger.warning(
                    'category:{} level:{} getting data timeout,to_timestamp:{},now:{}'
                    .format(category, self.level, to_timestamp,
                            now_pd_timestamp()))
                break

    if changed:
        for listener in self.data_listeners:
            listener.on_data_changed(self.data_df)

    return changed
def __init__(self, data_schema, security_list=None,
             security_type=SecurityType.stock, exchanges=None, codes=None,
             the_timestamp=None, start_timestamp=None, end_timestamp=None,
             keep_all_timestamp=False, fill_method='ffill', columns=None,
             filters=None, provider='eastmoney',
             level=TradingLevel.LEVEL_1DAY, effective_number=10) -> None:
    """Load the factor's source data, indexed by (security, timestamp).

    :param columns: schema columns to select; security_id and timestamp
        are always added when any are given
    :raises Exception: when no data exists for the requested span
    """
    # FIX: exchanges/columns used to be mutable defaults (['sh','sz'] / []),
    # which Python shares across all calls; use None sentinels instead.
    # (`if columns:` below already treated [] and None identically.)
    if exchanges is None:
        exchanges = ['sh', 'sz']

    super().__init__(security_list, security_type, exchanges, codes,
                     the_timestamp, start_timestamp, end_timestamp,
                     keep_all_timestamp, fill_method, effective_number)

    self.data_schema = data_schema

    if columns:
        # always keep the index columns alongside the caller's selection
        self.columns = set(columns) | {
            self.data_schema.security_id, self.data_schema.timestamp
        }
        self.factors = [item.key for item in columns]
    else:
        self.columns = None

    self.provider = provider
    self.level = level
    self.filters = filters

    # use security_list if possible
    if self.security_list:
        self.original_df = get_data(data_schema=self.data_schema,
                                    security_list=self.security_list,
                                    provider=self.provider,
                                    columns=self.columns,
                                    start_timestamp=self.start_timestamp,
                                    end_timestamp=self.end_timestamp,
                                    filters=self.filters,
                                    level=self.level)
    else:
        self.original_df = get_data(data_schema=self.data_schema,
                                    codes=self.codes,
                                    provider=self.provider,
                                    columns=self.columns,
                                    start_timestamp=self.start_timestamp,
                                    end_timestamp=self.end_timestamp,
                                    filters=self.filters,
                                    level=self.level)

    if self.original_df is None or self.original_df.empty:
        raise Exception(
            'no data for: {} {} level:{} from: {} to: {}'.format(
                self.security_list, self.codes, self.level,
                self.start_timestamp, self.end_timestamp))

    self.original_df = index_df_with_security_time(self.original_df)

    self.logger.info('factor:{},original_df:\n{}'.format(
        self.factor_name, self.original_df))
def get_finance_factor(provider='eastmoney', security_id=None, codes=None,
                       columns=None, return_type='df', session=None,
                       start_timestamp=None, end_timestamp=None, filters=None,
                       order=None, limit=None):
    """Query FinanceFactor records; a thin pass-through to ``get_data``."""
    return get_data(data_schema=FinanceFactor, security_id=security_id,
                    codes=codes, level=None, provider=provider,
                    columns=columns, return_type=return_type,
                    start_timestamp=start_timestamp,
                    end_timestamp=end_timestamp, filters=filters,
                    session=session, order=order, limit=limit)
def evaluate_start_end_size_timestamps(self, security_item):
    """
    evaluate the size for recording data

    :param security_item: the security being recorded
    :type security_item: str
    :return: the (start, end, size, timestamps) to record;
             size == 0 means recording is already finished
    :rtype: (pd.Timestamp, pd.Timestamp, int, list)
    """
    # get latest record
    latest_record = get_data(security_id=security_item.id,
                             provider=self.provider,
                             data_schema=self.data_schema,
                             level=self.level.value,
                             order=self.data_schema.timestamp.desc(),
                             limit=1,
                             return_type='domain',
                             session=self.session)

    if latest_record:
        latest_timestamp = latest_record[0].timestamp
    else:
        latest_timestamp = security_item.timestamp

    if not latest_timestamp:
        return latest_timestamp, None, self.default_size, None

    current_time = pd.Timestamp.now()
    time_delta = current_time - latest_timestamp

    if self.level == TradingLevel.LEVEL_1DAY:
        if is_same_date(current_time, latest_timestamp):
            return latest_timestamp, None, 0, None
        return latest_timestamp, None, time_delta.days + 1, None

    close_hour, close_minute = get_close_time(security_item.id)

    # to today,check closing time
    # (0,0) means never stop, e.g. coin markets.
    # FIX: the old test used `and` (close_hour != 0 and close_minute != 0),
    # so any real closing time with minute == 0 (e.g. 15:00) was wrongly
    # treated as "never stops" and the finished-for-today check was skipped.
    if (close_hour != 0 or close_minute != 0) and time_delta.days == 0:
        if latest_timestamp.hour == close_hour and latest_timestamp.minute == close_minute:
            return latest_timestamp, None, 0, None

    # when bars are keyed by their begin time, the next bar only exists
    # one full level-interval after the recorded timestamp
    if self.kdata_use_begin_time:
        touching_timestamp = latest_timestamp + pd.Timedelta(
            seconds=self.level.to_second())
    else:
        touching_timestamp = latest_timestamp

    waiting_seconds, size = self.level.count_from_timestamp(
        touching_timestamp,
        one_day_trading_minutes=get_one_day_trading_minutes(security_item.id))

    if not self.one_shot and waiting_seconds and (waiting_seconds > 30):
        # sleep half the waiting time so the provider is not hammered
        t = waiting_seconds / 2
        self.logger.info(
            'level:{},recorded_time:{},touching_timestamp:{},current_time:{},next_ok_time:{},just sleep:{} seconds'
            .format(
                self.level.value, latest_timestamp, touching_timestamp,
                current_time,
                touching_timestamp + pd.Timedelta(seconds=self.level.to_second()),
                t))
        time.sleep(t)

    return latest_timestamp, None, size, None
def get_rights_issue_detail(provider='eastmoney', security_id=None, codes=None,
                            columns=None, return_type='df', session=None,
                            start_timestamp=None, end_timestamp=None,
                            filters=None, order=None, limit=None):
    """Query RightsIssueDetail records; a thin pass-through to ``get_data``."""
    return get_data(data_schema=RightsIssueDetail, security_id=security_id,
                    codes=codes, level=None, provider=provider,
                    columns=columns, return_type=return_type,
                    start_timestamp=start_timestamp,
                    end_timestamp=end_timestamp, filters=filters,
                    session=session, order=order, limit=limit)