def on_finish_entity(self, entity):
        """Hook run after an entity finishes recording.

        If joinquant timestamp fetching is enabled, backfill the real
        report-published timestamp for rows whose ``timestamp`` still
        equals ``report_date`` (i.e. the publish date was never resolved).
        """
        super().on_finish_entity(entity)

        # nothing to do unless joinquant timestamp fetching is enabled
        if not self.fetch_jq_timestamp:
            return

        # fill the timestamp for report published date:
        # rows with timestamp == report_date are the unresolved ones;
        # data before 2005-01-01 is ignored
        the_data_list = get_data(
            data_schema=self.data_schema,
            provider=self.provider,
            entity_id=entity.id,
            order=self.data_schema.timestamp.asc(),
            return_type='domain',
            session=self.session,
            filters=[
                self.data_schema.timestamp == self.data_schema.report_date,
                self.data_schema.timestamp >= to_pd_timestamp('2005-01-01')
            ])
        if the_data_list:
            if self.data_schema == FinanceFactor:
                # FinanceFactor is the reference schema — resolve each row
                # directly from joinquant
                for the_data in the_data_list:
                    self.fill_timestamp_with_jq(entity, the_data)
            else:
                # for other schemas, reuse timestamps already resolved on
                # FinanceFactor rows covering the same report_date range
                df = FinanceFactor.query_data(
                    entity_id=entity.id,
                    columns=[
                        FinanceFactor.timestamp, FinanceFactor.report_date,
                        FinanceFactor.id
                    ],
                    filters=[
                        FinanceFactor.timestamp != FinanceFactor.report_date,
                        FinanceFactor.timestamp >=
                        to_pd_timestamp('2005-01-01'),
                        FinanceFactor.report_date >=
                        the_data_list[0].report_date,
                        FinanceFactor.report_date <=
                        the_data_list[-1].report_date,
                    ])

                if pd_is_not_null(df):
                    # NOTE(review): the return value of index_df is ignored
                    # here — assumes it re-indexes df in place; verify
                    index_df(df, index='report_date', time_field='report_date')

                for the_data in the_data_list:
                    # matched in the db: copy the resolved timestamp over
                    if (df is not None) and (
                            not df.empty) and the_data.report_date in df.index:
                        the_data.timestamp = df.at[the_data.report_date,
                                                   'timestamp']
                        self.logger.info(
                            'db fill {} {} timestamp:{} for report_date:{}'.
                            format(self.data_schema, entity.id,
                                   the_data.timestamp, the_data.report_date))
                        # commit per row so partial progress is persisted
                        self.session.commit()
                    else:
                        # not resolvable from the db — fall back to joinquant
                        # self.logger.info(
                        #     'waiting jq fill {} {} timestamp:{} for report_date:{}'.format(self.data_schema,
                        #                                                                    security_item.id,
                        #                                                                    the_data.timestamp,
                        #                                                                    the_data.report_date))

                        self.fill_timestamp_with_jq(entity, the_data)
Пример #2
0
    def run(self):
        """Combine filter factors (logical AND) and score factors (sum),
        then select entities passing the filter and/or the score threshold.

        The selection lands in ``self.result_df`` after ``index_df``.
        Assumes at least one of ``filter_factors`` / ``score_factors``
        yields a non-null result.
        """
        if self.filter_factors:
            musts = self._single_score_frames(self.filter_factors, "and")
            # AND all filter frames; accumulate keeps partial reductions,
            # only the final one is needed
            self.must_result = list(accumulate(musts,
                                               func=operator.__and__))[-1]

        if self.score_factors:
            scores = self._single_score_frames(self.score_factors, "mean")
            # sum all score frames
            self.score_result = list(accumulate(scores,
                                                func=operator.__add__))[-1]

        if df_is_not_null(self.must_result) and df_is_not_null(
                self.score_result):
            result1 = self.must_result[self.must_result.score]
            result2 = self.score_result[
                self.score_result.score >= self.threshold]
            # entities must satisfy both the filter and the score threshold
            result = result2.loc[result1.index, :]
        elif df_is_not_null(self.score_result):
            result = self.score_result[
                self.score_result.score >= self.threshold]
        else:
            # NOTE(review): if must_result is also null this raises —
            # original behavior preserved
            result = self.must_result[self.must_result.score]

        self.result_df = result.reset_index()

        self.result_df = index_df(self.result_df)

    def _single_score_frames(self, factors, agg_func):
        """Collapse each factor's result df to a single ``score`` column.

        Multi-column frames are aggregated row-wise with *agg_func*
        ("and" for boolean filters, "mean" for scores); single-column
        frames are simply renamed.
        """
        frames = []
        for factor in factors:
            df = factor.get_result_df()
            if len(df.columns) > 1:
                s = df.agg(agg_func, axis="columns")
                s.name = 'score'
                frames.append(s.to_frame(name='score'))
            else:
                df.columns = ['score']
                frames.append(df)
        return frames
Пример #3
0
    # build the TSI factor over daily coin data, in memory only
    factor = TSIFactor(entity_schema=Coin, entity_ids=entity_ids, provider='ccxt', level=IntervalLevel.LEVEL_1DAY,
                       start_timestamp=start_date, need_persist=False)
    df = factor.result_df
    musts = []
    # collapse the factor frame to a single 'score' column
    if len(df.columns) > 1:
        s = df.agg("and", axis="columns")
        s.name = 'score'
        musts.append(s.to_frame(name='score'))
    else:
        df.columns = ['score']
        musts.append(df)

    # only signals within the last N days count
    signal_in_last_n_day_num = 14
    # NOTE(review): filter_result is computed but never used below — verify
    filter_result = list(accumulate(musts, func=operator.__and__))[-1]
    # keep rows whose signal fired, then restrict to the recent window
    long_result = df[df.score == True]
    long_result = long_result.reset_index()
    long_result = index_df(long_result)
    long_result = long_result.sort_values(by=['score', 'entity_id'])
    long_result = long_result[long_result.timestamp > target_date - timedelta(signal_in_last_n_day_num)]
    # NOTE(review): longdf is assigned but not used in this fragment
    longdf = factor.factor_df[factor.factor_df['entity_id'].isin(long_result['entity_id'].tolist())]
    good_coins = set(long_result['entity_id'].tolist())
    coins = get_entities(provider='ccxt', entity_schema=Coin, entity_ids=good_coins,
                         return_type='domain')
    # translate to tradingview symbol codes and log the selection
    codeList = []
    for coin in coins:
        codeList.append(to_tradingview_code(coin.code, coin.exchange))
    info = [f'{coin}' for coin in codeList]
    msg = '选币:' + ' '.join(info) + '\n'
    logger.info(msg)
    # add_list_to_group(codeList, group_id=19580865, entity_type='coin')
Пример #4
0
def get_data(data_schema,
             security_list=None,
             security_id=None,
             codes=None,
             level=None,
             provider='eastmoney',
             columns=None,
             return_type='df',
             start_timestamp=None,
             end_timestamp=None,
             filters=None,
             session=None,
             order=None,
             limit=None,
             index='timestamp',
             index_is_time=True):
    """Query rows of *data_schema* from the given provider's store.

    :param data_schema: declarative schema class to query
    :param security_list: restrict to these security ids
    :param security_id: restrict to a single security id
    :param codes: restrict to these codes
    :param level: trading level filter (silently skipped for schemas
        without a ``level`` column)
    :param provider: data provider name
    :param columns: columns to select; the timestamp column is always added
    :param return_type: 'df', 'domain' (ORM objects) or 'dict'
    :param start_timestamp: inclusive lower bound on timestamp
    :param end_timestamp: inclusive upper bound on timestamp
    :param filters: extra SQLAlchemy filter expressions
    :param session: existing db session; when omitted a local one is
        created and closed before returning
    :param order: SQLAlchemy order clause
    :param limit: max number of rows
    :param index: index column of the returned DataFrame
    :param index_is_time: whether the index column holds timestamps
    :return: result according to *return_type*; None for an empty df or
        an unknown return_type
    """
    local_session = False
    if not session:
        store_category = get_store_category(data_schema)
        session = get_db_session(provider=provider,
                                 store_category=store_category)
        local_session = True

    try:
        if columns:
            # make sure the timestamp column is always selected
            if data_schema.timestamp not in columns:
                columns.append(data_schema.timestamp)
            query = session.query(*columns)
        else:
            query = session.query(data_schema)

        if security_id:
            query = query.filter(data_schema.security_id == security_id)
        if codes:
            query = query.filter(data_schema.code.in_(codes))
        if security_list:
            query = query.filter(data_schema.security_id.in_(security_list))

        # we always store different level in different schema,the level param is not useful now
        if level:
            try:
                # probe the attribute: some schema has no level, in which
                # case the level filter is silently skipped
                data_schema.level
                if isinstance(level, TradingLevel):
                    level = level.value
                query = query.filter(data_schema.level == level)
            except Exception:
                pass

        query = common_filter(query,
                              data_schema=data_schema,
                              start_timestamp=start_timestamp,
                              end_timestamp=end_timestamp,
                              filters=filters,
                              order=order,
                              limit=limit)

        if return_type == 'df':
            df = pd.read_sql(query.statement, query.session.bind)
            if df_is_not_null(df):
                return index_df(df,
                                drop=False,
                                index=index,
                                index_is_time=index_is_time)
        elif return_type == 'domain':
            return query.all()
        elif return_type == 'dict':
            return [item.__dict__ for item in query.all()]
    finally:
        # only close sessions this function created itself
        if local_session:
            session.close()
Пример #5
0
 def normalize_result_df(self, df):
     """Reset the index, re-apply the standard index and order rows by
     score then entity id; null/empty frames pass through unchanged."""
     if not pd_is_not_null(df):
         return df
     normalized = index_df(df.reset_index())
     return normalized.sort_values(by=['score', 'entity_id'])
Пример #6
0
def get_data(data_schema,
             ids: List[str] = None,
             entity_ids: List[str] = None,
             entity_id: str = None,
             codes: List[str] = None,
             code: str = None,
             level: Union[IntervalLevel, str] = None,
             provider: str = None,
             columns: List = None,
             col_label: dict = None,
             return_type: str = 'df',
             start_timestamp: Union[pd.Timestamp, str] = None,
             end_timestamp: Union[pd.Timestamp, str] = None,
             filters: List = None,
             session: Session = None,
             order=None,
             limit: int = None,
             index: Union[str, list] = None,
             time_field: str = 'timestamp'):
    """Query *data_schema* rows from *provider*.

    :param columns: columns to select (objects or plain name strings);
        the time column is always added
    :param col_label: dict mapping column name -> label to apply
    :param return_type: 'df', 'domain' (ORM objects) or 'dict'
    :param index: index field name(s) for the returned DataFrame
    :param time_field: name of the schema's time column
    :return: result according to *return_type*; None for an unknown
        return_type
    """
    assert data_schema is not None
    assert provider is not None
    assert provider in zvt_context.providers

    if not session:
        session = get_db_session(provider=provider, data_schema=data_schema)

    # getattr instead of eval: same attribute lookup without executing code
    time_col = getattr(data_schema, time_field)

    if columns:
        # support plain column-name strings
        if isinstance(columns[0], str):
            columns_ = []
            for col in columns:
                assert isinstance(col, str)
                columns_.append(getattr(data_schema, col))
            columns = columns_

        # make sure get timestamp
        if time_col not in columns:
            columns.append(time_col)

        if col_label:
            columns_ = []
            for col in columns:
                if col.name in col_label:
                    columns_.append(col.label(col_label.get(col.name)))
                else:
                    columns_.append(col)
            columns = columns_

        query = session.query(*columns)
    else:
        query = session.query(data_schema)

    if entity_id:
        query = query.filter(data_schema.entity_id == entity_id)
    if entity_ids:
        query = query.filter(data_schema.entity_id.in_(entity_ids))
    if code:
        query = query.filter(data_schema.code == code)
    if codes:
        query = query.filter(data_schema.code.in_(codes))
    if ids:
        query = query.filter(data_schema.id.in_(ids))

    # we always store different level in different schema,the level param is not useful now
    if level:
        try:
            # probe the attribute: some schema has no level, just ignore
            # the level filter in that case
            data_schema.level
            if isinstance(level, IntervalLevel):
                level = level.value
            query = query.filter(data_schema.level == level)
        except Exception:
            pass

    query = common_filter(query,
                          data_schema=data_schema,
                          start_timestamp=start_timestamp,
                          end_timestamp=end_timestamp,
                          filters=filters,
                          order=order,
                          limit=limit,
                          time_field=time_field)

    if return_type == 'df':
        df = pd.read_sql(query.statement, query.session.bind)
        if pd_is_not_null(df):
            if index:
                df = index_df(df, index=index, time_field=time_field)
        return df
    elif return_type == 'domain':
        return query.all()
    elif return_type == 'dict':
        return [item.__dict__ for item in query.all()]
Пример #7
0
def get_data(region: Region,
             data_schema,
             ids: List[str] = None,
             entity_ids: List[str] = None,
             entity_id: str = None,
             codes: List[str] = None,
             code: str = None,
             level: Union[IntervalLevel, str] = None,
             provider: Provider = Provider.Default,
             columns: List = None,
             col_label: dict = None,
             return_type: str = 'df',
             start_timestamp: Union[pd.Timestamp, str] = None,
             end_timestamp: Union[pd.Timestamp, str] = None,
             filters: List = None,
             session: Session = None,
             order=None,
             limit: int = None,
             index: Union[str, list] = None,
             time_field: str = 'timestamp',
             fun=None):
    """Query *data_schema* rows for *region* from *provider*.

    :param columns: columns to select (objects or plain name strings);
        the time column is always added
    :param col_label: dict mapping column name -> label to apply
    :param return_type: 'df', 'domain' (ORM objects), 'dict', or 'func'
        (scalar result of *fun*)
    :param fun: SQLAlchemy expression queried instead of columns/schema
    :return: result according to *return_type*; None for an unknown
        return_type
    """
    assert data_schema is not None
    assert provider.value is not None
    assert provider in zvt_context.providers[region]

    step1 = time.time()
    precision_str = '{' + ':>{},.{}f'.format(8, 4) + '}'
    # read the debug switch once; level 2 enables query-cost logging
    debug_timing = zvt_config['debug'] == 2

    if not session:
        session = get_db_session(region=region,
                                 provider=provider,
                                 data_schema=data_schema)

    # getattr instead of eval: same attribute lookup without executing code
    time_col = getattr(data_schema, time_field)

    if fun is not None:
        query = session.query(fun)
    elif columns:
        # support plain column-name strings
        if isinstance(columns[0], str):
            columns_ = []
            for col in columns:
                assert isinstance(col, str)
                columns_.append(getattr(data_schema, col))
            columns = columns_

        # make sure get timestamp
        if time_col not in columns:
            columns.append(time_col)

        if col_label:
            columns_ = []
            for col in columns:
                if col.name in col_label:
                    columns_.append(col.label(col_label.get(col.name)))
                else:
                    columns_.append(col)
            columns = columns_

        query = session.query(*columns)
    else:
        query = session.query(data_schema)

    if debug_timing:
        cost = precision_str.format(time.time() - step1)
        logger.debug("get_data query column: {}".format(cost))

    if entity_id is not None:
        query = query.filter(data_schema.entity_id == entity_id)
    if entity_ids is not None:
        query = query.filter(data_schema.entity_id.in_(entity_ids))
    if code is not None:
        query = query.filter(data_schema.code == code)
    if codes is not None:
        query = query.filter(data_schema.code.in_(codes))
    if ids is not None:
        query = query.filter(data_schema.id.in_(ids))

    # we always store different level in different schema,the level param
    # is not useful now, so it is deliberately not applied here

    query = common_filter(query,
                          data_schema=data_schema,
                          start_timestamp=start_timestamp,
                          end_timestamp=end_timestamp,
                          filters=filters,
                          order=order,
                          limit=limit,
                          time_field=time_field)

    if debug_timing:
        cost = precision_str.format(time.time() - step1)
        logger.debug("get_data query common: {}".format(cost))

    if return_type == 'func':
        result = query.scalar()
        return result

    elif return_type == 'df':
        df = pd.read_sql(query.statement, query.session.bind, index_col=['id'])
        if pd_is_not_null(df):
            if index:
                df = index_df(df, index=index, time_field=time_field)

        if debug_timing:
            cost = precision_str.format(time.time() - step1)
            logger.debug("get_data do query cost: {} type: {} size: {}".format(
                cost, return_type, len(df)))
        return df

    elif return_type == 'domain':
        # profile the ORM materialization only when debugging
        if debug_timing:
            with profiled():
                result = query.all()
        else:
            result = query.all()

        if debug_timing:
            cost = precision_str.format(time.time() - step1)
            res_cnt = len(result) if result else 0
            logger.debug(
                "get_data do query cost: {} type: {} limit: {} size: {}".
                format(cost, return_type, limit, res_cnt))

        return result

    elif return_type == 'dict':
        if debug_timing:
            with profiled():
                result = [item.__dict__ for item in query.all()]
        else:
            result = [item.__dict__ for item in query.all()]

        if debug_timing:
            cost = precision_str.format(time.time() - step1)
            res_cnt = len(result) if result else 0
            logger.debug(
                "get_data do query cost: {} type: {} limit: {} size: {}".
                format(cost, return_type, limit, res_cnt))

        return result
Пример #8
0
 def normalize_result_df(self, df):
     """Reset the index, re-apply the standard index and order rows by
     score then security id."""
     normalized = index_df(df.reset_index())
     return normalized.sort_values(by=['score', 'security_id'])
Пример #9
0
def get_data(
    data_schema: Type[Mixin],
    ids: List[str] = None,
    entity_ids: List[str] = None,
    entity_id: str = None,
    codes: List[str] = None,
    code: str = None,
    level: Union[IntervalLevel, str] = None,
    provider: str = None,
    columns: List = None,
    col_label: dict = None,
    return_type: str = "df",
    start_timestamp: Union[pd.Timestamp, str] = None,
    end_timestamp: Union[pd.Timestamp, str] = None,
    filters: List = None,
    session: Session = None,
    order=None,
    limit: int = None,
    index: Union[str, list] = None,
    drop_index_col=False,
    time_field: str = "timestamp",
):
    """
    query data by the arguments

    :param data_schema:
    :param ids:
    :param entity_ids:
    :param entity_id:
    :param codes:
    :param code:
    :param level:
    :param provider: defaults to the schema's first registered provider
    :param columns: columns to select (objects or plain name strings);
        the time column is always added
    :param col_label: dict with key(column), value(label)
    :param return_type: df, domain or dict. default is df
    :param start_timestamp:
    :param end_timestamp:
    :param filters:
    :param session:
    :param order:
    :param limit:
    :param index: index field name, str for single index, str list for multiple index
    :param drop_index_col: whether drop the col if it's in index, default False
    :param time_field:
    :return: results basing on return_type.
    """
    if "providers" not in data_schema.__dict__:
        # bug fix: logging uses %-style lazy formatting, "{}" was never
        # interpolated
        logger.error("no provider registered for: %s", data_schema)
    if not provider:
        provider = data_schema.providers[0]

    if not session:
        session = get_db_session(provider=provider, data_schema=data_schema)

    # getattr instead of eval: same attribute lookup without executing code
    time_col = getattr(data_schema, time_field)

    if columns:
        # copy so the caller's list is not mutated by the edits below
        columns = list(columns)
        # support str column names
        for i, col in enumerate(columns):
            if isinstance(col, str):
                columns[i] = getattr(data_schema, col)

        # make sure get timestamp
        if time_col not in columns:
            columns.append(time_col)

        if col_label:
            columns_ = []
            for col in columns:
                if col.name in col_label:
                    columns_.append(col.label(col_label.get(col.name)))
                else:
                    columns_.append(col)
            columns = columns_

        query = session.query(*columns)
    else:
        query = session.query(data_schema)

    if entity_id:
        query = query.filter(data_schema.entity_id == entity_id)
    if entity_ids:
        query = query.filter(data_schema.entity_id.in_(entity_ids))
    if code:
        query = query.filter(data_schema.code == code)
    if codes:
        query = query.filter(data_schema.code.in_(codes))
    if ids:
        query = query.filter(data_schema.id.in_(ids))

    # we always store different level in different schema,the level param is not useful now
    if level:
        try:
            #: some schema has no level,just ignore it
            data_schema.level
            if isinstance(level, IntervalLevel):
                level = level.value
            query = query.filter(data_schema.level == level)
        except Exception:
            pass

    query = common_filter(
        query,
        data_schema=data_schema,
        start_timestamp=start_timestamp,
        end_timestamp=end_timestamp,
        filters=filters,
        order=order,
        limit=limit,
        time_field=time_field,
    )

    if return_type == "df":
        df = pd.read_sql(query.statement, query.session.bind)
        if pd_is_not_null(df):
            if index:
                df = index_df(df,
                              index=index,
                              drop=drop_index_col,
                              time_field=time_field)
        return df
    elif return_type == "domain":
        return query.all()
    elif return_type == "dict":
        return [item.__dict__ for item in query.all()]