示例#1
0
    def _collect(self, *args, **kwargs) -> Iterator[Any]:
        # pylint: disable = no-self-use, unused-argument
        """
        Iterate over the documents stored in the conllu files of a directory.

        Each file is read completely and then split into documents: a new
        document begins at every line that, once stripped of surrounding
        whitespace, starts with ``# newdoc``. Any lines that come before the
        first such marker form a document of their own.

        Args:
            args: args[0] is the directory to the conllu files.
            kwargs: unused.

        Yields:
            For each document of each conllu file, the list of its lines
            exactly as returned by ``readlines()``. An empty file yields
            nothing.
        """
        conll_dir_path = args[0]

        for file_path in dataset_path_iterator(conll_dir_path, "conllu"):
            # The whole file is loaded here, so close it before handing
            # control to the consumer; a suspended or abandoned generator
            # then never keeps a file descriptor open.
            with open(file_path, "r", encoding="utf8") as file:
                lines = file.readlines()

            doc_lines = []
            for line in lines:
                # A "# newdoc" marker closes the document collected so far.
                # The very first line never closes anything (nothing has
                # been collected yet).
                if doc_lines and line.strip().startswith("# newdoc"):
                    yield doc_lines
                    doc_lines = []
                doc_lines.append(line)

            # Flush the final document of the file.
            if doc_lines:
                yield doc_lines
示例#2
0
    def _collect(self, conll_directory: str) -> Iterator[Any]:  # type: ignore
        """
        Collect the inputs for this reader from ``conll_directory``.

        The work is delegated to ``dataset_path_iterator``, called with the
        directory and the extension string ``"gold_conll"``.

        Args:
            conll_directory: path to the directory containing the files.

        Returns: the iterator produced by ``dataset_path_iterator``
            (presumably paths of the ``gold_conll`` files found under the
            directory -- confirm against that helper).
        """
        gold_conll_paths = dataset_path_iterator(conll_directory, "gold_conll")
        return gold_conll_paths
示例#3
0
    def _collect(self, conll_directory) -> Iterator[Any]:  # type: ignore
        """
        Collect the inputs for this reader from ``conll_directory``.

        An info-level message naming the directory is logged first; the
        lookup itself is delegated to ``dataset_path_iterator`` with the
        extension string ``"conll"``.

        Args:
            conll_directory: directory to the conll files.

        Returns: the iterator produced by ``dataset_path_iterator``
            (presumably paths of the ``conll`` files found under the
            directory -- confirm against that helper).
        """
        logging.info("Reading .conll from %s", conll_directory)
        conll_paths = dataset_path_iterator(conll_directory, "conll")
        return conll_paths
示例#4
0
    def _collect(self, text_directory) -> Iterator[Any]:  # type: ignore
        """
        Collect the inputs for this reader from ``text_directory``.

        The work is delegated to ``dataset_path_iterator``, called with the
        directory and the extension string ``".txt"``.

        Args:
            text_directory: path to a folder containing txt files.

        Returns: the iterator produced by ``dataset_path_iterator``
            (presumably paths of the ``.txt`` files found under the
            directory -- confirm against that helper).
        """
        txt_paths = dataset_path_iterator(text_directory, ".txt")
        return txt_paths
示例#5
0
    def _collect(self, json_directory) -> Iterator[Any]:  # type: ignore
        """
        Collect the inputs for this reader from ``json_directory``.

        The work is delegated to ``dataset_path_iterator``, called with the
        directory and an empty extension string.

        Args:
            json_directory: path to a folder containing json files.

        Returns: the iterator produced by ``dataset_path_iterator``.
        """
        # NOTE(review): the extension passed is "", not ".json"; presumably
        # this makes the helper apply no extension filter, so non-json files
        # in the directory may be returned too -- confirm against the helper.
        json_paths = dataset_path_iterator(json_directory, "")
        return json_paths
示例#6
0
 def _collect(self, text_directory: str) -> Iterator[Any]:  # type: ignore
     """
     Collect the inputs for this reader from ``text_directory``.

     The work is delegated to ``dataset_path_iterator``, called with the
     directory and an empty extension string.

     Args:
         text_directory: path to the directory to search.

     Returns: the iterator produced by ``dataset_path_iterator``.
     """
     # NOTE(review): an empty extension presumably means "no filtering by
     # extension" -- confirm against the helper.
     found_paths = dataset_path_iterator(text_directory, "")
     return found_paths