Пример #1
0
    def test_tokenize(self):
        """Check tokenization output for the janome and sudachi tokenizers.

        First pass: download the kind "F" data, parse "company.history" and
        tokenize with janome; the returned directory, its "2018"
        sub-directory and "2018/documents.csv" must exist, and at least one
        "*company_history_tokenized.txt" file must be present in "2018/docs".

        Second pass: parse "business.risks" restricted to sec_code "1376"
        and tokenize with sudachi; "2018/documents.csv" must contain exactly
        two lines and exactly one "*business_risks_tokenized.txt" file must
        be present in "2018/docs".
        """
        storage = Storage(self.ROOT)
        # The results of download/parse are not inspected here; only the
        # path returned by tokenize() is checked.
        self._download(kind="F")
        storage.parse("company.history")
        path = storage.tokenize(tokenizer="janome")
        self.assertTrue(path.exists())
        self.assertTrue(path.joinpath("2018").exists())
        self.assertTrue(path.joinpath("2018/documents.csv").exists())
        history_files = list(
            path.joinpath("2018/docs").glob("*company_history_tokenized.txt"))
        self.assertGreater(len(history_files), 0)

        storage.parse("business.risks", sec_code="1376")
        path = storage.tokenize(tokenizer="sudachi")
        with path.joinpath("2018/documents.csv").open(encoding="utf-8") as f:
            # NOTE(review): presumably one header line plus one row for the
            # single selected company -- confirm against the CSV writer.
            # assertEqual replaces the assertEquals alias, which no longer
            # exists in unittest as of Python 3.12.
            self.assertEqual(len(f.readlines()), 2)
        risk_files = list(
            path.joinpath("2018/docs").glob("*business_risks_tokenized.txt"))
        self.assertEqual(len(risk_files), 1)
Пример #2
0
class CoARiJ(object):
    """
    Front-end for managing the CoARiJ dataset.

    Every public method delegates to an internal ``Storage`` instance that
    is created for the requested dataset version.
    """

    def __init__(self, version="v1.0"):
        self._storage = Storage(version=version)

    def download(self, directory="", kind="F", year="", force=False):
        """Download the {kind} {year} dataset to {directory}.

        Args:
            directory (str): Directory the dataset is downloaded to.
            kind (str): 'F': raw file data, 'E': text extracted data.
            year (str): Financial year of the dataset.
            force (bool): When True, overwrite data that already exists.

        Returns:
            Path to the downloaded directory, exactly as returned by the
            underlying storage.

        """
        storage = self._storage
        # The storage call takes these four values positionally.
        return storage.download(directory, kind, year, force)

    def extract(self,
                aspect_element,
                year="",
                edinet_code="",
                sec_code="",
                jcn="",
                source_directory="",
                target_directory="",
                normalized=True):
        """
        Extract {aspect_element} from files in {source_directory}{year} and
        save it in {target_directory}{year} as txt/html file.

        Args:
            aspect_element (str): Target aspect.element (ex: company.history).
            year (str): Target financial year.
            edinet_code (str): EDINET code to specify a company.
            sec_code (str): SEC code to specify a company.
            jcn (str): JCN code to specify a company.
            source_directory (str): Source directory that includes XBRL files.
            target_directory (str): Directory the txt/html files are saved to.
            normalized (bool): True: extract text, False: save raw xml(html).

        Returns:
            Path to the extracted files directory, exactly as returned by
            the underlying storage.

        """
        # The storage layer names the first option ``aspect_dot_element``;
        # all other options are forwarded under their own names.
        forwarded = {
            "aspect_dot_element": aspect_element,
            "year": year,
            "edinet_code": edinet_code,
            "sec_code": sec_code,
            "jcn": jcn,
            "source_directory": source_directory,
            "target_directory": target_directory,
            "normalized": normalized,
        }
        return self._storage.extract(**forwarded)

    def tokenize(self,
                 tokenizer="janome",
                 mode="",
                 dictionary="",
                 dictionary_type="",
                 year="",
                 edinet_code="",
                 sec_code="",
                 jcn="",
                 aspect_element="",
                 source_directory="",
                 target_directory=""):
        """
        Tokenize files in {source_directory}{year} with {tokenizer} and
        save the result in {target_directory}{year}.

        Args:
            tokenizer (str): Japanese tokenizer ('janome' or 'sudachi').
            mode (str): Sudachi tokenizer mode.
            dictionary (str): Dictionary path for Janome.
            dictionary_type (str): Dictionary type for Janome.
            year (str): Target financial year.
            edinet_code (str): EDINET code to specify a company.
            sec_code (str): SEC code to specify a company.
            jcn (str): JCN code to specify a company.
            aspect_element (str): Target aspect.element (ex: company.history).
            source_directory (str): Directory holding the files to tokenize.
            target_directory (str): Directory the tokenized files are saved to.

        Returns:
            Path to the tokenized files directory, exactly as returned by
            the underlying storage.

        """
        # Same renaming as in extract(): ``aspect_element`` is passed on as
        # ``aspect_dot_element``.
        forwarded = {
            "tokenizer": tokenizer,
            "mode": mode,
            "dictionary": dictionary,
            "dictionary_type": dictionary_type,
            "year": year,
            "edinet_code": edinet_code,
            "sec_code": sec_code,
            "jcn": jcn,
            "aspect_dot_element": aspect_element,
            "source_directory": source_directory,
            "target_directory": target_directory,
        }
        return self._storage.tokenize(**forwarded)