def on_finish_entity(self, entity):
    """Back-fill the publish timestamp for records whose timestamp still equals
    their report_date.

    Records are initially stored with ``timestamp == report_date``; once the real
    publish date is known it should replace the placeholder. For ``FinanceFactor``
    every record is filled via joinquant directly; for other finance schemas we
    first try to copy timestamps already resolved on ``FinanceFactor`` rows with
    matching report_date, and fall back to the jq fill for the rest.
    """
    super().on_finish_entity(entity)

    if not self.fetch_jq_timestamp:
        return

    # Candidate records: timestamp still equals report_date (i.e. not yet filled),
    # limited to 2005 onwards, ordered by timestamp ascending.
    unfilled = get_data(
        data_schema=self.data_schema,
        provider=self.provider,
        entity_id=entity.id,
        order=self.data_schema.timestamp.asc(),
        return_type='domain',
        session=self.session,
        filters=[self.data_schema.timestamp == self.data_schema.report_date,
                 self.data_schema.timestamp >= to_pd_timestamp('2005-01-01')])

    if not unfilled:
        return

    if self.data_schema == FinanceFactor:
        # FinanceFactor itself is the reference schema — fill straight from jq.
        for record in unfilled:
            self.fill_timestamp_with_jq(entity, record)
        return

    # Look up already-resolved publish dates on FinanceFactor for the same
    # report_date range (timestamp != report_date means it was filled before).
    resolved_df = get_finance_factor(
        entity_id=entity.id,
        columns=[FinanceFactor.timestamp, FinanceFactor.report_date, FinanceFactor.id],
        filters=[FinanceFactor.timestamp != FinanceFactor.report_date,
                 FinanceFactor.timestamp >= to_pd_timestamp('2005-01-01'),
                 FinanceFactor.report_date >= unfilled[0].report_date,
                 FinanceFactor.report_date <= unfilled[-1].report_date])

    if pd_is_not_null(resolved_df):
        # index by report_date so records can be matched by label below
        index_df(resolved_df, index='report_date', time_field='report_date')

    for record in unfilled:
        has_match = (resolved_df is not None
                     and not resolved_df.empty
                     and record.report_date in resolved_df.index)
        if has_match:
            # copy the resolved publish timestamp from FinanceFactor
            record.timestamp = resolved_df.at[record.report_date, 'timestamp']
            self.logger.info(
                'db fill {} {} timestamp:{} for report_date:{}'.format(
                    self.data_schema, entity.id, record.timestamp, record.report_date))
            self.session.commit()
        else:
            # no resolved FinanceFactor row — ask jq for the publish date
            self.fill_timestamp_with_jq(entity, record)
def get_data(data_schema,
             entity_ids: List[str] = None,
             entity_id: str = None,
             codes: List[str] = None,
             level: Union[IntervalLevel, str] = None,
             provider: str = None,
             columns: List = None,
             return_type: str = 'df',
             start_timestamp: Union[pd.Timestamp, str] = None,
             end_timestamp: Union[pd.Timestamp, str] = None,
             filters: List = None,
             session: Session = None,
             order=None,
             limit: int = None,
             index: str = 'timestamp',
             index_is_time: bool = True,
             time_field: str = 'timestamp'):
    """Query records of ``data_schema`` from ``provider``'s database.

    :param data_schema: declarative model class to query; must not be None
    :param entity_ids: filter on ``entity_id in entity_ids``
    :param entity_id: filter on a single entity_id
    :param codes: filter on ``code in codes``
    :param level: optional level filter; ignored if the schema has no level column
    :param provider: data provider name; must be in ``global_providers``
    :param columns: columns to select — model attributes or their string names;
        the time column is always appended so results can be indexed by it
    :param return_type: 'df' (indexed DataFrame), 'domain' (model instances)
        or 'dict' (list of instance ``__dict__``)
    :param start_timestamp: inclusive lower bound on ``time_field``
    :param end_timestamp: inclusive upper bound on ``time_field``
    :param filters: extra SQLAlchemy filter expressions
    :param session: optional session to reuse; when omitted a local session is
        created and closed before returning
    :param order: SQLAlchemy ordering expression
    :param limit: max number of rows
    :param index: column to index the DataFrame by (df return type only)
    :param index_is_time: whether ``index`` is a time column
    :param time_field: name of the schema's time column
    :return: DataFrame, list of model instances, or list of dicts per
        ``return_type``
    """
    assert data_schema is not None
    assert provider is not None
    assert provider in global_providers

    # Open a session only when the caller did not supply one; we are then
    # responsible for closing it in the finally block.
    local_session = False
    if not session:
        session = get_db_session(provider=provider, data_schema=data_schema)
        local_session = True

    try:
        # getattr instead of eval: same attribute lookup without executing
        # dynamically-built code
        time_col = getattr(data_schema, time_field)

        if columns:
            # support columns given as string names
            if isinstance(columns[0], str):
                columns = [getattr(data_schema, col) for col in columns]
            # always select the time column so the result can be indexed by it
            if time_col not in columns:
                columns.append(time_col)
            query = session.query(*columns)
        else:
            query = session.query(data_schema)

        if entity_id:
            query = query.filter(data_schema.entity_id == entity_id)
        if codes:
            query = query.filter(data_schema.code.in_(codes))
        if entity_ids:
            query = query.filter(data_schema.entity_id.in_(entity_ids))

        # we always store different level in different schema,the level param is not useful now
        if level:
            try:
                # some schema has no level,just ignore it
                data_schema.level
                if isinstance(level, IntervalLevel):
                    level = level.value
                query = query.filter(data_schema.level == level)
            except Exception:
                pass

        query = common_filter(query,
                              data_schema=data_schema,
                              start_timestamp=start_timestamp,
                              end_timestamp=end_timestamp,
                              filters=filters,
                              order=order,
                              limit=limit,
                              time_field=time_field)

        if return_type == 'df':
            df = pd.read_sql(query.statement, query.session.bind)
            if df_is_not_null(df):
                return index_df(df, drop=False, index=index, index_is_time=index_is_time)
            return df
        elif return_type == 'domain':
            return query.all()
        elif return_type == 'dict':
            return [item.__dict__ for item in query.all()]
    finally:
        # the no-op `except Exception: raise` wrapper was removed — `finally`
        # alone guarantees cleanup whether or not the query raised
        if local_session:
            session.close()
def normalize_result_df(self, df):
    """Normalize a result DataFrame: reset then rebuild the index and sort
    rows by ('score', 'entity_id'). A null/empty input is returned unchanged.
    """
    if not pd_is_not_null(df):
        return df
    return index_df(df.reset_index()).sort_values(by=['score', 'entity_id'])