def search(self, df, num_examples_per_instance, minimum_data=None, gap=None, drop_empty=True, label_type=None, verbose=True, *args, **kwargs):
    """Search the data and calculate labels.

    Args:
        df (DataFrame): Data frame to search and extract labels.
        num_examples_per_instance (int or dict): The expected number of examples to return from each entity group.
            A dictionary can be used to further specify the expected number of examples to return from each label.
        minimum_data (str): Minimum data before starting search. Default value is first time of index.
        gap (str or int): Time between examples. Default value is window size.
            If an integer, search will start on the first event after the minimum data.
        drop_empty (bool): Whether to drop empty slices. Default value is True.
        label_type (str): The label type can be "continuous" or "categorical".
            Default value is the inferred label type. This argument is not read anywhere in this method body.
        verbose (bool): Whether to render progress bar. Default value is True.
        *args: Positional arguments for labeling function.
        **kwargs: Keyword arguments for labeling function.

    Returns:
        lt (LabelTimes): Calculated labels with cutoff times.

    Raises:
        AssertionError: If no labeling function is set on the instance.
    """
    assert self.labeling_function, 'missing labeling function(s)'
    self._check_example_count(num_examples_per_instance, gap)

    # A missing window size falls back to the number of rows in the data frame.
    # The attribute is always re-assigned, exactly as the one-line `or` form does.
    self.window_size = self.window_size or len(df)
    gap_offset = to_offset(gap or self.window_size)

    # A dict requests per-label counts; anything else requests per-instance counts.
    if isinstance(num_examples_per_instance, dict):
        strategy = LabelSearch(num_examples_per_instance)
    else:
        strategy = ExampleSearch(num_examples_per_instance)

    records = self._run_search(
        *args,
        df=df,
        search=strategy,
        gap=gap_offset,
        min_data=minimum_data,
        drop_empty=drop_empty,
        verbose=verbose,
        **kwargs,
    )

    # Record how the search was configured alongside the results.
    search_settings = {
        'num_examples_per_instance': num_examples_per_instance,
        'minimum_data': str(minimum_data),
        'window_size': str(self.window_size),
        'gap': str(gap_offset),
    }

    return LabelTimes(
        data=records,
        target_columns=list(self.labeling_function),
        target_entity=self.target_entity,
        search_settings=search_settings,
    )
def test_describe_empty(capsys):
    """Check what ``describe`` prints for a ``LabelTimes`` built with no arguments."""
    expected_lines = [
        'Settings',
        '--------',
        'No settings',
        '',
        '',
        'Transforms',
        '----------',
        'No transforms applied',
        '',
        '',
    ]
    expected = '\n'.join(expected_lines)

    LabelTimes().describe()
    printed = capsys.readouterr().out

    assert printed == expected
def test_describe_no_transforms(capsys):
    """Check what ``describe`` prints for ``LabelTimes`` built from a single target column."""
    expected_lines = [
        'Settings',
        '--------',
        'target_column target',
        'target_entity None',
        'target_type continuous',
        '',
        '',
        'Transforms',
        '----------',
        'No transforms applied',
        '',
        '',
    ]
    expected = '\n'.join(expected_lines)

    label_times = LabelTimes({'target': range(3)})
    label_times.describe()
    printed = capsys.readouterr().out

    assert printed == expected
def search(self, df, num_examples_per_instance, minimum_data=None, gap=None, drop_empty=True, label_type=None, verbose=True, *args, **kwargs):
    """Search the data and calculate labels.

    Args:
        df (DataFrame) : Data frame to search and extract labels.
        num_examples_per_instance (int) : Number of examples per unique instance of target entity.
            Values that are not greater than -1, or infinity, are treated as unbounded when sizing the progress bar.
        minimum_data (str) : Minimum data before starting search. Default value is first time of index.
        gap (str or int) : Time between examples. Default value is window size.
            If an integer, search will start on the first event after the minimum data.
        drop_empty (bool) : Whether to drop empty slices. Default value is True.
        label_type (str) : The label type can be "continuous" or "categorical".
            Default value is the inferred label type.
        verbose (bool) : Whether to render progress bar. Default value is True.
        *args : Positional arguments for labeling function.
        **kwargs : Keyword arguments for labeling function.

    Returns:
        LabelTimes : Calculated labels with cutoff times.
    """
    bar_format = "Elapsed: {elapsed} | Remaining: {remaining} | "
    bar_format += "Progress: {l_bar}{bar}| "
    bar_format += self.target_entity + ": {n}/{total} "

    # With a bounded example count the bar tracks examples; otherwise it tracks instances.
    total = len(df.groupby(self.target_entity))
    finite_examples_per_instance = num_examples_per_instance > -1 and num_examples_per_instance != float('inf')
    if finite_examples_per_instance:
        total *= num_examples_per_instance

    progress_bar = tqdm(total=total, bar_format=bar_format, disable=not verbose, file=stdout)

    # The bar is closed in `finally` so that an exception raised while slicing
    # or inside the labeling function does not leave it open.
    try:
        slices = self.slice(
            df=df,
            num_examples_per_instance=num_examples_per_instance,
            minimum_data=minimum_data,
            gap=gap,
            drop_empty=drop_empty,
            verbose=False,
        )

        name = self.labeling_function.__name__
        records, instance = [], 0

        for data_slice in slices:
            label = self.labeling_function(data_slice, *args, **kwargs)

            # Null labels are dropped; everything else becomes one output row.
            if not pd.isnull(label):
                records.append({
                    self.target_entity: data_slice.context.target_instance,
                    'cutoff_time': data_slice.context.window[0],
                    name: label,
                })

            first_slice_for_instance = data_slice.context.slice_number == 1

            if finite_examples_per_instance:
                progress_bar.update(n=1)

                # update skipped examples for previous instance
                # NOTE(review): the bar was already advanced above, so this
                # correction moves the count to
                # (instance - 1) * num_examples_per_instance and the current
                # slice is not counted -- confirm this is intended.
                if first_slice_for_instance:
                    instance += 1
                    skipped_examples = instance - 1
                    skipped_examples *= num_examples_per_instance
                    skipped_examples -= progress_bar.n
                    progress_bar.update(n=skipped_examples)

            if not finite_examples_per_instance and first_slice_for_instance:
                progress_bar.update(n=1)

        # Fill whatever is left so a completed search always ends at the total.
        total -= progress_bar.n
        progress_bar.update(n=total)
    finally:
        progress_bar.close()

    labels = LabelTimes(data=records, name=name, target_entity=self.target_entity, label_type=label_type)
    labels = labels.rename_axis('id', axis=0)

    if labels.empty:
        return labels

    if labels.is_discrete:
        labels[labels.name] = labels[labels.name].astype('category')

    labels.settings.update({
        'labeling_function': name,
        'num_examples_per_instance': num_examples_per_instance,
        'minimum_data': str(minimum_data),
        'window_size': self.window_size,
        'gap': gap,
    })

    return labels