def pre_compute(self):
    """
    Replace ``self.pipe_df`` with the rolling mean of ``net_main_inflows``.

    After the parent's ``pre_compute`` has run, the ``net_main_inflows``
    column is grouped by index level 1 and averaged over a rolling window
    of ``self.window`` rows. The result is then re-indexed through
    ``normal_index_df`` and stored back on ``self.pipe_df``.
    """
    super().pre_compute()

    grouped_inflows = self.pipe_df.groupby(level=1)['net_main_inflows']
    rolling_mean = grouped_inflows.rolling(window=self.window).mean()

    # Drop index level 0 of the rolling result (presumably the group key
    # that groupby-rolling prepends), then move the remaining index levels
    # into columns so normal_index_df can rebuild the index.
    without_group_key = rolling_mean.reset_index(level=0, drop=True)
    flat_df = without_group_key.reset_index()

    self.pipe_df = normal_index_df(flat_df)
def score(self, input_df):
    """
    Turn raw factor values into quantile-based scores.

    For every key of index level 1 (named ``self.time_field`` below, so
    presumably the timestamp), the quantiles listed in ``self.score_levels``
    are computed for each column of ``input_df``. Each value is then mapped
    to a score:

    * ``0`` if the value is below the threshold of the lowest level;
    * otherwise the highest level whose threshold the value reaches;
    * ``None`` when no comparison succeeds (e.g. the value is NaN).

    Side effect: ``self.score_levels`` is sorted in place, descending.

    :param input_df: DataFrame with a two-level index that has a level
        named ``entity_id``; every column is treated as a factor.
    :return: DataFrame re-indexed by ``normal_index_df``, restricted to the
        columns in ``self.factors``, with duplicated index entries removed
        (first occurrence kept).
    """
    # The threshold lookup below relies on descending order.
    self.score_levels.sort(reverse=True)

    quantile_df = input_df.groupby(level=1).quantile(self.score_levels)
    quantile_df.index.names = [self.time_field, 'score']

    self.logger.info('factor:{},quantile:\n{}'.format(
        self.factor_name, quantile_df))

    result_df = input_df.copy()
    result_df.reset_index(inplace=True, level='entity_id')
    result_df['quantile'] = None

    # Attach to every row the {factor: {level: threshold}} mapping of its
    # timestamp so the row-wise scorer can look the thresholds up directly.
    for timestamp in quantile_df.index.levels[0]:
        rows = result_df.index == timestamp
        thresholds = quantile_df.loc[timestamp].to_dict()
        result_df.loc[rows, 'quantile'] = [thresholds] * int(rows.sum())

    self.logger.info('factor:{},df with quantile:\n{}'.format(
        self.factor_name, result_df))

    def calculate_score(row, factor_name, quantile):
        """Return the score of ``row[factor_name]`` given its thresholds."""
        original_value = row[factor_name]
        score_map = quantile.get(factor_name)
        min_score = self.score_levels[-1]

        if original_value < score_map.get(min_score):
            return 0

        # Iterate over *all* levels, including the lowest one: a value that
        # reaches only the lowest threshold must score as that level
        # instead of falling through without a score.
        for level in self.score_levels:
            if original_value >= score_map.get(level):
                return level
        return None

    for factor in input_df.columns.to_list():
        result_df[factor] = result_df.apply(
            lambda row: calculate_score(row, factor, row['quantile']),
            axis=1)

    result_df = result_df.reset_index()
    result_df = normal_index_df(result_df)
    result_df = result_df.loc[:, self.factors]
    result_df = result_df.loc[~result_df.index.duplicated(keep='first')]

    self.logger.info('factor:{},df:\n{}'.format(self.factor_name, result_df))

    return result_df
def normalize(self):
    """
    Bring ``self.data_df`` into the normal layout and split it per entity.

    Target layout::

                                col1    col2    col3
        entity_id    timestamp

    Nothing happens when ``self.data_df`` is null. Otherwise the frame is
    re-indexed with ``normal_index_df`` if ``is_normal_df`` rejects it,
    ``self.entity_ids`` is set to the values of index level 0, and each
    entity's sub-frame is appended to ``self.df_list`` and stored in
    ``self.entity_map_df``. When ``self.fill_index`` is set and more than
    one frame was collected, ``self.df_list`` is replaced by the result of
    ``fill_with_same_index``.
    """
    if not pd_is_not_null(self.data_df):
        return

    if not is_normal_df(self.data_df):
        self.data_df = normal_index_df(
            self.data_df, self.category_field, self.time_field)

    self.entity_ids = self.data_df.index.levels[0].to_list()

    for current_id in self.entity_ids:
        entity_df = self.data_df.loc[(current_id, )]
        self.df_list.append(entity_df)
        self.entity_map_df[current_id] = entity_df

    needs_alignment = len(self.df_list) > 1 and self.fill_index
    if needs_alignment:
        self.df_list = fill_with_same_index(df_list=self.df_list)