示例#1
0
    def __init__(self, recursive: bool):
        """Instantiate empty class.

        :param recursive: flag to indicate recursion
        :type recursive: bool
        """
        self.filtered = dict()
        self.artifacts = dict()
        self.pipeline = Pipeline()
        self.dir_name = None
        self.dir_children = None
        # Populated by load_dir(); None until a directory is loaded.
        # (The original assigned `self.files = []` first, which was dead code —
        # it was unconditionally overwritten by None a few lines later.)
        self.files = None
        self.recursive = recursive
示例#2
0
    def __init__(
        self,
        source: str,
        recursive: bool,
        use_predefined_pipeline: bool = False,
        out_dir: str = None,
    ):
        """Initialize new tabular file group and a data processing pipeline.

        :param source: The path to the folder where the tabular data (csv or json files) are stored.
        :type source: str
        :param recursive: If True, Traverse the folder recursively to find files.
        :type recursive: bool
        :param use_predefined_pipeline: If True, use a predefined pipeline: load -> concat -> dedup -> export.
        :type use_predefined_pipeline: bool
        :param out_dir: The output directory.
        :type out_dir: str
        """
        super().__init__(recursive)
        self.pipeline = Pipeline()
        self.artifacts["df_concatenated"] = None
        self.load_dir(source, TabularFile, recursive)
        if use_predefined_pipeline:
            if out_dir is not None:

                def save():
                    self.save(out_dir=out_dir)

                self.configure_pipeline(
                    ["FilterDuplicateFiles", "FilterDuplicateRowsEachDataframe", save]
                )
            else:
                # BUG FIX: this call previously ran unconditionally, which
                # reset the pipeline and silently discarded the `save` step
                # configured above whenever out_dir was provided.
                self.configure_pipeline(
                    ["FilterDuplicateFiles", "FilterDuplicateRowsEachDataframe"]
                )
示例#3
0
def test_add_component():
    """Adding one component should grow the pipeline to length 1."""
    p = Pipeline()
    p.add_component(PipelineComponent())
    assert len(p.components) == 1
示例#4
0
def test_init():
    """A freshly constructed Pipeline starts with no components."""
    assert len(Pipeline().components) == 0
示例#5
0
def test_str():
    """String form reads "Empty" when bare and names components once added."""
    p = Pipeline()
    assert "Empty" in str(p)
    p.add_component(PipelineComponent())
    assert "PipelineComponent" in str(p)
示例#6
0
def test_remove_component():
    """Removing the sole component by index leaves the pipeline empty."""
    p = Pipeline()
    p.add_component(PipelineComponent())
    p.remove_component(0)
    assert not p.components
示例#7
0
class FileGroup:
    """Class for reading and performing general operations on groups of files."""

    # Externally defined classes and methods
    from kaishi.core.misc import CollapseChildren
    from kaishi.core.filters.duplicate_files import FilterDuplicateFiles
    from kaishi.core.filters.by_regex import FilterByRegex
    from kaishi.core.filters.subsample import FilterSubsample
    from kaishi.core.filters.by_label import FilterByLabel
    from kaishi.core.labelers.validation_and_test import LabelerValidationAndTest

    def __init__(self, recursive: bool):
        """Instantiate empty class.

        :param recursive: flag to indicate recursion
        :type recursive: bool
        """
        self.filtered = dict()
        self.artifacts = dict()
        self.pipeline = Pipeline()
        self.dir_name = None
        self.dir_children = None
        # Populated by load_dir(); None until a directory is loaded.
        # (A dead `self.files = []` assignment that was immediately
        # overwritten by None has been removed.)
        self.files = None
        self.recursive = recursive

    def __getitem__(self, key):
        """Get a specific file object.

        :param key: ``repr()`` of the desired file object
        :return: the matching file object
        :raises KeyError: if no loaded file matches ``key``
        """
        for fobj in self.files:
            if repr(fobj) == key:
                return fobj
        raise KeyError(key + " not a valid file")

    def load_dir(self, source: str, file_initializer, recursive: bool):
        """Read file names in a directory.

        :param source: Directory to load from
        :type source: str
        :param file_initializer: Data file class to initialize each file with
        :type file_initializer: kaishi file initializer class (e.g. :class:`kaishi.core.file.File`)
        :param recursive: If True, traverse the directory tree recursively
        :type recursive: bool
        """
        self.dir_name, self.dir_children, self.files = load_files_by_walk(
            source, file_initializer, recursive)

    def get_pipeline_options(self):
        """Returns available pipeline options for this dataset.

        :return: list of uninitialized pipeline component objects
        :rtype: list
        """
        options = []
        # Group by component type, preserving the original ordering:
        # all filters first, then labelers, transforms, and aggregators.
        for prefix in ("Filter", "Labeler", "Transform", "Aggregator"):
            options.extend(
                method for method in dir(self) if method.startswith(prefix))
        return options

    def configure_pipeline(self, choices: list = None, verbose: bool = False):
        """Configures the sequence of components in the data processing pipeline.

        :param choices: list of pipeline choices (component-name strings,
            integer indexes into :meth:`get_pipeline_options`, or callables);
            if None, the user is prompted interactively
        :type choices: list
        :param verbose: flag to indicate verbosity
        :type verbose: bool
        """
        options = self.get_pipeline_options()
        if choices is None:  # Prompt for choices if not provided

            def fix_choice_string(choice_string, options):
                # Parse a comma-separated selection; returns None for
                # out-of-range indexes so the caller re-prompts.
                choices = np.array(choice_string.split(",")).astype(np.int64)
                if np.any(choices < 0) or np.max(choices) > len(options) - 1:
                    choices = None
                return choices

            verbose = True
            print("Pipeline options: ")
            for i, option in enumerate(options):
                print(repr(i) + ": " + option)
            print("")

            choice_string = input(
                "To configure, enter a comma separated list of integers: ")
            choices = None
            while choices is None:  # Keep trying until a valid string is entered
                try:
                    choices = fix_choice_string(choice_string, options)
                except ValueError:  # Non-integer tokens in the input
                    choices = None
                if choices is None:
                    # BUG FIX: re-parsing happened here OUTSIDE the try block,
                    # so a second malformed entry raised an uncaught
                    # ValueError. Now the loop re-parses under the try above.
                    choice_string = input(
                        "Error parsing string, please re-enter a list of the above options: "
                    )
        self.pipeline.reset()
        for choice in choices:  # Use the configuration specified to construct pipeline
            if isinstance(choice, str):
                try:
                    self.pipeline.add_component(
                        self.__getattribute__(choice)())
                except AttributeError:
                    warnings.warn(
                        choice +
                        " is an invalid pipeline component, skipping...")
            elif isinstance(choice, (int, np.integer)):
                # Accept plain Python ints as well as numpy integers (the
                # interactive path yields np.int64; callers may pass int).
                self.pipeline.add_component(
                    self.__getattribute__(options[choice])())
            elif callable(choice):
                self.pipeline.add_component(choice)

        if verbose:
            print(repr(self.pipeline))

    def file_report(self, max_file_entries=16, max_filter_entries=10):
        """Show a report of valid and invalid data.

        :param max_file_entries: max number of entries to print of file list
        :type max_file_entries: int
        :param max_filter_entries: max number of entries to print per filter category (e.g. duplicates, similar, etc.)
        :type max_filter_entries: int
        """
        # BUG FIX: `self.files` is None (not []) before load_dir() runs, so
        # the original `== []` guard never matched and the method crashed at
        # enumerate(None). Truthiness covers both None and empty.
        if not self.files and not self.filtered:
            print("No data loaded to report on.")
            return

        pp = pprint.PrettyPrinter()
        print("Current file list:")
        table = PrettyTable()
        table.field_names = ["Index", "File Name", "Children", "Labels"]
        table.align["Children"] = "l"
        for i, fobj in enumerate(self.files or []):
            if should_print_row(i, max_file_entries, len(self.files)) == 1:
                # Multi-line child listings continue on subsequent table rows.
                children_text = pp.pformat(fobj.children).split("\n")
                table.add_row([
                    i,
                    repr(fobj),
                    "    " + children_text[0],
                    repr([label.name for label in fobj.labels]),
                ])
                if len(children_text) > 1:
                    for child_line in children_text[1:]:
                        table.add_row([" ", " ", "\t" + child_line, " "])
            elif should_print_row(i, max_file_entries, len(self.files)) == 2:
                table.add_row(["...", " ", " ", " "])
        print(table)

        print("Filtered files:")
        table = PrettyTable()
        table.field_names = ["File Name", "Filter Reason"]
        for k in self.filtered:
            for i, fobj in enumerate(self.filtered[k]):
                if should_print_row(i, max_filter_entries,
                                    len(self.filtered[k])) == 1:
                    table.add_row([repr(fobj), k])
                elif (should_print_row(i, max_filter_entries,
                                       len(self.filtered[k])) == 2):
                    table.add_row(["...", " "])
        print(table)

    def run_pipeline(self, verbose: bool = False):
        """Run the pipeline as configured.

        :param verbose: flag to indicate verbosity
        :type verbose: bool
        """
        self.pipeline(self, verbose=verbose)
        if verbose:
            print("Pipeline completed")