Пример #1
0
    def search(self,
               df,
               num_examples_per_instance,
               minimum_data=None,
               gap=None,
               drop_empty=True,
               label_type=None,
               verbose=True,
               *args,
               **kwargs):
        """Search the data frame and calculate labels.

        Args:
            df (DataFrame): Data frame to search and extract labels.
            num_examples_per_instance (int or dict): The expected number of examples to return from each entity group.
                When a dictionary is given, a ``LabelSearch`` is used instead of an ``ExampleSearch``.
            minimum_data (str): Minimum data before starting search. Forwarded to the search as ``min_data``.
            gap (str or int): Time between examples. Falls back to the window size when falsy.
            drop_empty (bool): Whether to drop empty slices. Default value is True.
            label_type (str): Accepted for interface compatibility; not referenced in this method's body.
            verbose (bool): Whether to render progress bar. Default value is True.
            *args: Positional arguments for labeling function.
            **kwargs: Keyword arguments for labeling function.

        Returns:
            lt (LabelTimes): Calculated labels with cutoff times.

        Raises:
            AssertionError: If no labeling function is set.
        """
        assert self.labeling_function, 'missing labeling function(s)'
        self._check_example_count(num_examples_per_instance, gap)

        # The window size is always written back, falling back to the number
        # of rows in the data frame when none was configured.
        window = self.window_size
        if not window:
            window = len(df)
        self.window_size = window

        gap = to_offset(gap if gap else self.window_size)

        # A dict of per-label counts selects the label-based search strategy.
        if isinstance(num_examples_per_instance, dict):
            strategy_type = LabelSearch
        else:
            strategy_type = ExampleSearch
        strategy = strategy_type(num_examples_per_instance)

        found = self._run_search(
            *args,
            df=df,
            search=strategy,
            gap=gap,
            min_data=minimum_data,
            drop_empty=drop_empty,
            verbose=verbose,
            **kwargs,
        )

        settings = {
            'num_examples_per_instance': num_examples_per_instance,
            'minimum_data': str(minimum_data),
            'window_size': str(self.window_size),
            'gap': str(gap),
        }

        return LabelTimes(
            data=found,
            target_columns=list(self.labeling_function),
            target_entity=self.target_entity,
            search_settings=settings,
        )
Пример #2
0
def test_describe_empty(capsys):
    """Check the text printed by ``describe`` on a ``LabelTimes`` built with no arguments."""
    LabelTimes().describe()
    printed = capsys.readouterr().out

    # Each section is a title, an underline, one message line and two
    # trailing empty entries.
    settings_section = ['Settings', '--------', 'No settings', '', '']
    transforms_section = ['Transforms', '----------', 'No transforms applied', '', '']
    expected = '\n'.join(settings_section + transforms_section)

    assert printed == expected
Пример #3
0
def test_describe_no_transforms(capsys):
    """Check the text printed by ``describe`` on a ``LabelTimes`` built from a single ``target`` column."""
    LabelTimes({'target': range(3)}).describe()
    printed = capsys.readouterr().out

    settings_section = [
        'Settings',
        '--------',
        'target_column        target',
        'target_entity          None',
        'target_type      continuous',
        '',
        '',
    ]
    transforms_section = [
        'Transforms',
        '----------',
        'No transforms applied',
        '',
        '',
    ]
    expected = '\n'.join(settings_section + transforms_section)

    assert printed == expected
Пример #4
0
    def search(self,
               df,
               num_examples_per_instance,
               minimum_data=None,
               gap=None,
               drop_empty=True,
               label_type=None,
               verbose=True,
               *args,
               **kwargs):
        """Searches the data to calculate labels.

        The data frame is cut into slices with ``self.slice`` and the labeling
        function is applied to each slice. Slices whose label is null are
        skipped. A progress bar is written to stdout unless ``verbose`` is False.

        Args:
            df (DataFrame) : Data frame to search and extract labels.
            num_examples_per_instance (int) : Number of examples per unique instance of target entity.
                A negative value or infinity is treated as unbounded, in which case progress is
                counted per instance instead of per example.
            minimum_data (str) : Minimum data before starting search. Default value is first time of index.
            gap (str or int) : Time between examples. Default value is window size.
                If an integer, search will start on the first event after the minimum data.
            drop_empty (bool) : Whether to drop empty slices. Default value is True.
            label_type (str) : The label type can be "continuous" or "categorical". Default value is the inferred label type.
            verbose (bool) : Whether to render progress bar. Default value is True.
            *args : Positional arguments for labeling function.
            **kwargs : Keyword arguments for labeling function.

        Returns:
            LabelTimes : Calculated labels with cutoff times. When no labels were found,
                the empty result is returned before the search settings are recorded on it.
        """
        bar_format = "Elapsed: {elapsed} | Remaining: {remaining} | "
        bar_format += "Progress: {l_bar}{bar}| "
        bar_format += self.target_entity + ": {n}/{total} "
        total = len(df.groupby(self.target_entity))
        finite_examples_per_instance = (
            num_examples_per_instance > -1
            and num_examples_per_instance != float('inf')
        )

        # With a finite example count the bar tracks examples; otherwise it
        # tracks one tick per instance.
        if finite_examples_per_instance:
            total *= num_examples_per_instance

        progress_bar = tqdm(total=total,
                            bar_format=bar_format,
                            disable=not verbose,
                            file=stdout)

        # Close the bar even when slicing or the labeling function raises.
        try:
            slices = self.slice(
                df=df,
                num_examples_per_instance=num_examples_per_instance,
                minimum_data=minimum_data,
                gap=gap,
                drop_empty=drop_empty,
                verbose=False)

            name = self.labeling_function.__name__
            labels, instance = [], 0

            for data_slice in slices:
                label = self.labeling_function(data_slice, *args, **kwargs)

                if not pd.isnull(label):
                    labels.append({
                        self.target_entity: data_slice.context.target_instance,
                        'cutoff_time': data_slice.context.window[0],
                        name: label
                    })

                first_slice_for_instance = data_slice.context.slice_number == 1

                if finite_examples_per_instance:
                    # Catch up on examples the previous instances did not
                    # produce BEFORE counting the current slice; doing it the
                    # other way round cancels out the current slice's tick.
                    if first_slice_for_instance:
                        instance += 1
                        skipped_examples = instance - 1
                        skipped_examples *= num_examples_per_instance
                        skipped_examples -= progress_bar.n
                        progress_bar.update(n=skipped_examples)

                    progress_bar.update(n=1)
                elif first_slice_for_instance:
                    progress_bar.update(n=1)

            # Fill whatever remains so the bar ends at its total.
            progress_bar.update(n=total - progress_bar.n)
        finally:
            progress_bar.close()

        label_times = LabelTimes(data=labels,
                                 name=name,
                                 target_entity=self.target_entity,
                                 label_type=label_type)
        label_times = label_times.rename_axis('id', axis=0)

        if label_times.empty:
            return label_times

        if label_times.is_discrete:
            label_times[label_times.name] = label_times[label_times.name].astype('category')

        label_times.settings.update({
            'labeling_function': name,
            'num_examples_per_instance': num_examples_per_instance,
            'minimum_data': str(minimum_data),
            'window_size': self.window_size,
            'gap': gap,
        })

        return label_times