def _collect(self, *args, **kwargs) -> Iterator[Any]:
    # pylint: disable = unused-argument
    r"""Yield the lines of each document found in the conllu files.

    Every conllu file located through ``dataset_path_iterator`` is read in
    full and cut into documents: a new document begins at each line that
    starts with ``# newdoc`` once surrounding whitespace is stripped.

    Args:
        args: args[0] is the directory to the conllu files.
        kwargs: Not used.

    Returns:
        An iterator yielding, for each document of each conllu file, the
        list of its raw lines.
    """
    for conllu_path in dataset_path_iterator(args[0], "conllu"):
        with open(conllu_path, "r", encoding="utf8") as conllu_file:
            pending = []
            for raw_line in conllu_file.readlines():
                starts_new_doc = raw_line.strip().startswith("# newdoc")
                # A "# newdoc" marker closes the document collected so far;
                # nothing is pending when the marker is the very first line.
                if starts_new_doc and pending:
                    yield pending
                    pending = []
                pending.append(raw_line)
            # The end of the file closes the last document.
            if pending:
                yield pending
def _collect(self, conll_directory: str) -> Iterator[Any]:  # type: ignore
    r"""Locate the ``gold_conll`` files of the data source.

    Args:
        conll_directory: Path to the directory containing the files.

    Returns:
        The iterator produced by ``dataset_path_iterator`` for the
        ``gold_conll`` extension.
    """
    gold_conll_extension = "gold_conll"
    return dataset_path_iterator(conll_directory, gold_conll_extension)
def _collect(self, json_directory) -> Iterator[Any]:  # type: ignore
    r"""Collect the file paths found under ``json_directory``.

    Should be called with param ``json_directory``, a path to a folder
    containing json files.

    Args:
        json_directory: Directory containing the json files.

    Returns:
        The iterator produced by ``dataset_path_iterator``.
    """
    # NOTE(review): an empty extension string is passed on; presumably this
    # means no filtering by extension happens -- confirm against
    # ``dataset_path_iterator``.
    any_extension = ""
    return dataset_path_iterator(json_directory, any_extension)
def _collect(self, text_directory) -> Iterator[Any]:  # type: ignore
    r"""Collect the text files found under ``text_directory``.

    The extension to look for is read from ``self.configs.file_ext``.

    Args:
        text_directory: Text directory containing the files.

    Returns:
        The iterator produced by ``dataset_path_iterator`` for the
        configured extension.
    """
    configured_extension = self.configs.file_ext
    return dataset_path_iterator(text_directory, configured_extension)
def _collect(self, conll_directory) -> Iterator[Any]:  # type: ignore
    r"""Locate the ``conll`` files of the data source.

    The directory being read is reported through ``logging.info`` first.

    Args:
        conll_directory: Directory to the conll files.

    Returns:
        The iterator produced by ``dataset_path_iterator`` for the
        ``conll`` extension.
    """
    logging.info("Reading .conll from %s", conll_directory)
    conll_paths = dataset_path_iterator(conll_directory, "conll")
    return conll_paths
def _collect(self, text_directory) -> Iterator[Any]:
    r"""Collect the xml files found under ``text_directory``.

    Args:
        text_directory: Text directory containing the xml files.

    Returns:
        The iterator produced by ``dataset_path_iterator`` for the
        ``.xml`` extension.
    """
    xml_extension = '.xml'
    return dataset_path_iterator(text_directory, xml_extension)
def _collect(self, *args, **kwargs) -> Iterator[str]:
    # pylint: disable = unused-argument
    r"""Locate the ``txt`` files of the movie data source.

    The directory being read is reported through ``logging.info`` first.

    Args:
        args: args[0] is the directory to the pos/neg movie files.
        kwargs: Not used.

    Returns:
        The iterator produced by ``dataset_path_iterator`` for the
        ``txt`` extension.
    """
    source_dir: str = args[0]
    logging.info("Reading .txt from %s", source_dir)
    txt_paths = dataset_path_iterator(source_dir, "txt")
    return txt_paths
def _collect(self, *args, **kwargs) -> Iterator[Any]:
    # pylint: disable = unused-argument
    r"""Locate the SemEval Task8 dataset files.

    args[0] should be the folder where the SemEval Task8 dataset is
    stored. The extension to look for is read from
    ``self.configs.sem_eval_task8_file_extension``.

    Args:
        args: args[0] is the directory to the dataset.
        kwargs: Not used.

    Returns:
        The iterator produced by ``dataset_path_iterator`` for the
        configured extension.
    """
    dataset_dir: str = args[0]
    task8_extension = self.configs.sem_eval_task8_file_extension
    return dataset_path_iterator(dataset_dir, task8_extension)
def _collect(self, *args, **kwargs) -> Iterator[str]:
    # pylint: disable = unused-argument
    r"""Locate the open IE files of the data source.

    The extension to look for is read from
    ``self.configs.oie_file_extension``; the directory and the extension
    are reported through ``logging.info`` before the lookup.

    Args:
        args: args[0] is the directory to the open ie files.
        kwargs: Not used.

    Returns:
        The iterator produced by ``dataset_path_iterator`` for the
        configured extension.
    """
    source_dir: str = args[0]
    extension: str = self.configs.oie_file_extension
    logging.info(
        "Reading dataset from %s with extension %s", source_dir, extension
    )
    oie_paths = dataset_path_iterator(source_dir, extension)
    return oie_paths
def _collect(self, *args, **kwargs) -> Iterator[str]:
    # pylint: disable = unused-argument
    r"""Locate the QA files of the data source.

    The extension to look for is read from
    ``self.configs.qa_file_extension``; the directory and the extension
    are reported through ``logging.info`` before the lookup.

    Args:
        args: args[0] is the directory to the .qa files.
        kwargs: Not used.

    Returns:
        The iterator produced by ``dataset_path_iterator`` for the
        configured extension.
    """
    source_dir: str = args[0]
    extension: str = self.configs.qa_file_extension
    logging.info(
        "Reading dataset from %s with extension %s", source_dir, extension
    )
    qa_paths = dataset_path_iterator(source_dir, extension)
    return qa_paths
def _collect(self, content) -> Iterator[str]:  # type: ignore
    r"""Turn ``content`` into an iterator of file paths or raw strings.

    Could be called with a directory, a particular file location, a single
    string or a list of strings:

    - a directory path: ``self.init_with_fileloc`` is set and the ``.html``
      files located by ``dataset_path_iterator`` are returned;
    - a path to an existing file: ``self.init_with_fileloc`` is set and the
      path itself is the only item returned;
    - any other string: it is handled as a one-element list of strings;
    - a list: ``self.init_with_html`` is set and its items are returned
      unchanged, in order.

    Args:
        content: Either a string, or a list of strings.

    Returns:
        Iterator over the content based on the type of input.

    Raises:
        TypeError: If ``content`` is neither a string nor a list.
    """
    if isinstance(content, str):
        if os.path.isdir(content):
            self.init_with_fileloc = True
            # TODO: maybe extend it to .txt also if need be?
            return dataset_path_iterator(content, ".html")

        if os.path.isfile(content):
            # A single file: hand back just that path.
            self.init_with_fileloc = True
            return iter([content])

        # Neither a directory nor a file: treat it as a raw string.
        content = [content]

    if isinstance(content, list):
        # Must be a list of strings now.
        self.init_with_html = True
        return iter(content)

    raise TypeError(
        "HTMLReader supports only strings and list of"
        " strings, Please make sure your inputs are"
        " correct! "
        f"Found {type(content)} instead!"
    )
def _collect(self, text_directory: str) -> Iterator[Any]:  # type: ignore
    r"""Collect the file paths found under ``text_directory``.

    Args:
        text_directory: Directory containing the files.

    Returns:
        The iterator produced by ``dataset_path_iterator``.
    """
    # NOTE(review): an empty extension string is passed on; presumably this
    # means no filtering by extension happens -- confirm against
    # ``dataset_path_iterator``.
    any_extension = ''
    return dataset_path_iterator(text_directory, any_extension)