def _random_forest_segmentation(self, dataset) -> List[List[Tuple[str, int]]]:
    """Build several candidate trees and return the segments of the best one.

    One tree is built per iteration, ``self._parameters.segment_num_trees``
    times in total. Each tree's segments are scored with
    ``self._evaluate_segmentation`` and the segments with the highest gain
    seen so far are kept.

    Args:
        dataset: DataFrame with at least a ``feedback`` column. A
            ``norm_feedback`` column is written to it in place.

    Returns:
        The segments of the best-scoring tree, or an empty list when no tree
        reaches a gain strictly greater than ``0.0``.
    """
    preview = dataset.head().to_string()
    log.debug(f'Dataset before running tree algorithm:\n{preview}\n')

    best_gain = 0.0
    best_segments: List[List[Tuple[str, int]]] = []

    log.info('Creating segmentation...')
    for _ in range(self._parameters.segment_num_trees):
        # NOTE(review): this looks loop-invariant, but it is kept inside the
        # loop because ``_segment_dataframe`` may touch ``dataset`` - confirm
        # before hoisting.
        dataset.loc[:, 'norm_feedback'] = min_max_normalize(
            dataset['feedback'])

        # The builder gets the data without the raw ``feedback`` column.
        builder_input = dataset.drop('feedback', axis=1)
        builder = TreeSegmentationBuilder(builder_input, self._parameters)
        candidate_segments = _get_segments(
            builder.build_tree(self._has_experian))

        scored_df = self._segment_dataframe(dataset, candidate_segments)
        candidate_gain = self._evaluate_segmentation(scored_df)

        # Strict comparison: ties keep the earlier tree.
        if candidate_gain > best_gain:
            best_gain, best_segments = candidate_gain, candidate_segments

    log.debug(
        f'Expected gain of the best tree vs no action = {best_gain}\n')
    log.info(f'Segments: {best_segments}')
    return best_segments
def _log_event_stats(unprepared_data, influx_exporter):
    """Export per-date event counts and debug-log the counts per PL week.

    Args:
        unprepared_data: DataFrame with an ``assigned_date`` column whose
            values, once passed through ``str``, parse as ``%Y%m%d``.
        influx_exporter: Object exposing ``send_event_stats``; it receives the
            per-date counts (a Series indexed by parsed date).
    """
    # ``apply`` returns a new Series and never mutates its input, so no
    # defensive copy of the whole frame is needed.
    assigned_dates = unprepared_data.assigned_date.apply(
        lambda value: datetime.strptime(str(value), '%Y%m%d'))
    events_per_date = assigned_dates.value_counts()
    influx_exporter.send_event_stats(events_per_date)

    # Name the index explicitly before resetting it. Older pandas leaves the
    # ``value_counts`` index unnamed (so it became an 'index' column), while
    # pandas >= 2.0 names it after the source Series, which would make the
    # 'index' lookups below raise ``KeyError``.
    events_per_week = events_per_date.rename_axis('index').reset_index(
        name='count')

    # Replace the column outright instead of writing through ``.loc``: the
    # column holds datetimes and the converted values may be of another type.
    events_per_week['index'] = events_per_week['index'].apply(
        plds.datetime.datetime_to_pl_week)

    log.debug('Events per pl week:')
    log.debug(events_per_week.groupby('index').sum().sort_index().to_string())
def prepare_output(self, country, model_id, prediction_date, action_desc):
    """Stamp run metadata onto ``self._output_df`` and finalise its layout.

    Adds the constant metadata columns, restricts the frame to
    ``FINAL_MODEL_OUTPUT_COLUMNS``, drops rows whose ``GIVER_ID`` is missing,
    calls ``self._set_types()`` and finally orders the columns by name.

    Args:
        country: Value stored in the ``COUNTRY`` column.
        model_id: Value stored in the ``MODEL_ID`` column.
        prediction_date: Value stored in the ``ACTION_GENERATED_DATE`` column.
        action_desc: Value stored in the ``ACTION_DESC`` column.
    """
    # (column, value) pairs, applied in this order.
    metadata = (
        (PROJECT, project_parameters.project_name),
        (MODEL_VERSION, project_parameters.model_version),
        (COUNTRY, country),
        (MODEL_ID, model_id),
        (ACTION_GENERATED_DATE, prediction_date),
        (ACTION_DESC, action_desc),
    )
    for column_name, column_value in metadata:
        self._output_df[column_name] = column_value

    self._output_df = self._output_df[FINAL_MODEL_OUTPUT_COLUMNS]
    self._output_df = self._output_df.dropna(subset=[GIVER_ID])
    self._set_types()

    # axis=1 sorts the column labels, not the rows.
    self._output_df.sort_index(axis=1, inplace=True)
    log.debug(f'model_output.columns = {self._output_df.columns}')
def fit(self, df: pd.DataFrame):
    """One-hot encode the categorical features and remember the result columns.

    Each categorical column is first passed through
    ``self._filter_categories``, then all of them are expanded with
    ``pd.get_dummies``. The resulting column index is stored in
    ``self._feature_columns``.

    Args:
        df: Input frame containing a column for every
            ``feature.short_name`` in ``self._categorical_features``.

    Returns:
        A new DataFrame: ``df`` without the original categorical columns,
        concatenated column-wise with the encoded columns.
    """
    short_names = [
        feature.short_name for feature in self._categorical_features
    ]

    # Work on a copy so the caller's frame is not modified by the filtering.
    encoded = df[short_names].copy()
    for feature in self._categorical_features:
        log.debug(f'Transforming feature: {feature.original_name}')
        column = feature.short_name
        encoded.loc[:, column] = self._filter_categories(encoded[column])

    encoded = pd.get_dummies(encoded, columns=short_names)
    encoded = encoded.fillna(0)
    self._feature_columns = encoded.columns

    remaining = df.drop(short_names, axis=1)
    return pd.concat([remaining, encoded], axis=1)
def _evaluate_segmentation(self, segmentation_df):
    """Return the expected gain of a segmentation over always taking action 0.

    For every segment the feedback is aggregated per action with
    ``self._parameters.reward_agg_func`` and the best action's value is kept.
    Those per-segment values are averaged, weighted by each segment's share of
    the feedback samples. The same aggregation over the rows whose
    ``action_code`` equals ``0.0`` is then subtracted.

    Args:
        segmentation_df: DataFrame with ``segment``, ``action_code`` and
            ``feedback`` columns.

    Returns:
        Weighted expected value of the segmentation minus the aggregated
        feedback of the zero action.
    """
    reward_agg = self._parameters.reward_agg_func

    # Aggregated feedback for each (segment, action) pair ...
    per_action = segmentation_df[['segment', 'action_code', 'feedback']] \
        .groupby(['segment', 'action_code']) \
        .agg(reward_agg) \
        .reset_index()
    # ... reduced to the best action's value within each segment.
    best_per_segment = per_action[['segment', 'feedback']] \
        .groupby('segment') \
        .max()

    segment_sizes = segmentation_df[['segment', 'feedback']] \
        .groupby(['segment']) \
        .count() \
        .rename(columns={'feedback': 'num_samples'})
    sample_total = np.sum(segment_sizes.values)
    segment_weights = segment_sizes.values / sample_total

    # Both frames are grouped by ``segment``; the dot product pairs their
    # rows by position. It yields a 1x1 array, hence the [0][0].
    expected_value = np.dot(best_per_segment.values.transpose(),
                            segment_weights)[0][0]
    log.debug(f'Expected value of segmentation = {expected_value}')

    # Expected value of zero action
    zero_action_rows = segmentation_df[segmentation_df.action_code == 0.0]
    zero_action_value = zero_action_rows['feedback'].agg(reward_agg)
    log.debug(f'Expected value of zero action = {zero_action_value}')

    gain = expected_value - zero_action_value
    log.debug(
        f'Expected gain of the tree compared to zero action = {gain}'
    )
    return gain
def select_split_feature(self, total_num_samples, total_num_features):
    """Randomly pick a feature to split this segment on.

    Feature gains are computed for the segment, each feature is flagged as a
    candidate when it is both Chernoff-confident and sample-confident, and
    one candidate is drawn with probabilities given by the softmax of the
    candidates' gains.

    Args:
        total_num_samples: Forwarded to ``self._is_chernoff_confident``.
        total_num_features: Forwarded to ``self._is_chernoff_confident``.

    Returns:
        The ``FEATURE`` value of the drawn candidate.

    Raises:
        NoSplitCandidatesException: If no feature qualifies as a candidate.
    """
    num_rows = len(self._df)
    gains = self._calculate_feature_gains(num_rows)
    gains[CHERNOFF_CONFIDENT] = self._is_chernoff_confident(
        gains, num_rows, total_num_samples, total_num_features)
    # NOTE(review): the flag is written via CHERNOFF_CONFIDENT but read via
    # attribute access, which assumes the constant equals
    # 'chernoff_confident' - confirm.
    gains['candidate_split'] = (
        gains.chernoff_confident & gains.sample_confident)

    log.debug('Split feature candidates:')
    candidates = gains[gains.candidate_split]
    log.debug(list(candidates[FEATURE]))

    if candidates.empty:
        raise NoSplitCandidatesException

    # A single multinomial trial gives a one-hot vector; argmax recovers the
    # position of the drawn candidate.
    probabilities = _softmax(candidates[FEATURE_GAIN].values)
    one_hot_draw = np.random.multinomial(1, probabilities)
    drawn_position = np.argmax(one_hot_draw)
    return candidates[FEATURE].values[drawn_position]
def _sample_actions(self, df: pd.DataFrame) -> pd.DataFrame:
    """Sample an action for every row and attach its code and log-probability.

    The ``GIVER_ID`` and ``SEGMENT`` columns of ``df`` are left-joined with
    ``self._eps_greedy_distribution`` on ``SEGMENT``. Three columns are then
    added: ``ACTION_IDX`` (via ``_select_action`` per row), ``ACTION_CODE``
    (lookup of the index in ``self._available_actions``) and ``LOGPROB``
    (via ``self._logprob`` per row).

    Args:
        df: Frame containing at least ``GIVER_ID`` and ``SEGMENT`` columns.

    Returns:
        The merged frame with the three added columns.
    """
    def _to_action_code(action_idx):
        # Translate a sampled index into the corresponding action code.
        return self._available_actions[action_idx]

    customers = df[[GIVER_ID, SEGMENT]]
    sampled = customers.merge(self._eps_greedy_distribution,
                              how='left',
                              on=SEGMENT)

    log.debug('Sampling actions from customer distributions...')
    sampled.loc[:, ACTION_IDX] = sampled.apply(_select_action, axis=1)

    log.debug('Getting action codes...')
    sampled.loc[:, ACTION_CODE] = sampled[ACTION_IDX].apply(_to_action_code)

    log.debug('Computing logprobs...')
    sampled.loc[:, LOGPROB] = sampled.apply(self._logprob, axis=1)
    return sampled
def build_tree(self, has_experian=False) -> SegmentNode:
    """Grow the segmentation tree and return its root.

    Segments waiting to be split are taken from ``self._leaves_to_explore``
    in first-in, first-out order. Each is split on a feature chosen by
    ``select_split_feature``, and both children are queued again when they
    may still be split. A segment for which no split candidate exists is
    left as it is.

    Args:
        has_experian: When truthy, every segment whose ``depth`` is 1 is
            split on ``HAS_EXPERIAN`` instead of a selected feature.

    Returns:
        The root ``SegmentNode`` of the finished tree.
    """
    root = SegmentNode(None, None, 0, [], self._universe_df, self._parameters)
    pending = self._leaves_to_explore
    pending.append(root)

    while pending:
        log.debug('Number of segments so far:')
        log.debug(len(_get_segments(root)))
        log.debug('Number of unfinished segments:')
        log.debug(len(pending))
        log.debug(f'Unfinished segments: {pending}')

        current: SegmentNode = pending.popleft()
        log.debug(f'Trying to split segment {current}...')

        if has_experian and current.depth == 1:
            chosen_feature = HAS_EXPERIAN
        else:
            try:
                chosen_feature = current.select_split_feature(
                    self._total_num_samples, self._total_num_features)
            except NoSplitCandidatesException:
                # Nothing to split on: this segment is not explored further.
                log.debug(
                    f'No more confident _features for segment {current}'
                )
                continue

        log.debug(
            f'Splitting segment {current} on feature {chosen_feature}...'
        )
        left, right = current.split_segment(chosen_feature)

        # Only the left child's depth is checked against the limit.
        may_grow = (left.depth < self._parameters.tree_max_depth
                    and left.has_features
                    and right.has_features)
        if may_grow:
            pending.extend((left, right))

    log.debug('No more confident features for any of the segments.')
    log.debug('Segmentation done!')
    return root