Exemplo n.º 1
0
    def test_collect(self):
        """Check that a downloaded ledger can collect data for one company.

        Downloads the 2018 "XF" dataset, fetches the ledger into
        ``{ROOT}/processed``, verifies the ledger has data and that its file
        exists on disk, then collects by EDINET code.
        """
        storage = Storage(self.ROOT)
        # The returned path is not used by this test; presumably collect()
        # relies on the data fetched here (verify against Storage).
        storage.download(kind="XF", year=2018)
        ledger = storage.download_ledger(directory=f"{self.ROOT}/processed")
        self.assertGreater(len(ledger.data), 1)
        # exists() must be *called*: the bare bound method is always truthy,
        # which would make this assertion pass even for a missing file.
        self.assertTrue(Path(ledger.path).exists())

        loaded = ledger.collect(edinet_code="E00021")
        self.assertGreater(len(loaded), 0)
Exemplo n.º 2
0
 def test_download_extracted(self):
     """Download the 2018 "XE" dataset and check the resulting layout.

     The returned directory must exist, be named "2018", and contain a
     ``documents.csv`` file plus a ``docs`` directory holding more than one
     ``.txt`` file.
     """
     target = f"{self.ROOT}/rawe"
     year_dir = Storage(self.ROOT).download(directory=target,
                                            kind="XE",
                                            year=2018)

     self.assertTrue(year_dir.exists())
     self.assertEqual(year_dir.name, "2018")

     index_file = year_dir.joinpath("documents.csv")
     self.assertTrue(index_file.exists())

     docs_dir = year_dir.joinpath("docs")
     self.assertTrue(docs_dir.exists())

     text_files = list(docs_dir.glob("*.txt"))
     self.assertGreater(len(text_files), 1)
Exemplo n.º 3
0
    def test_parse(self):
        """Check that parsing writes per-year output for an aspect.element.

        First parses "company.history" without a filter and verifies the
        2018 output directory, its ``documents.csv`` and at least one
        ``*company_history.txt`` file. Then parses "business.risks"
        restricted to one SEC code and verifies the narrowed output.
        """
        storage = Storage(self.ROOT)
        # Called for its side effect only; the returned path was never used
        # before being overwritten by the parse() result below.
        self._download(kind="F")
        path = storage.parse("company.history")
        self.assertTrue(path.exists())
        self.assertTrue(path.joinpath("2018").exists())
        self.assertTrue(path.joinpath("2018/documents.csv").exists())
        self.assertGreater(
            len(list(path.joinpath("2018/docs").glob("*company_history.txt"))),
            0)

        path = storage.parse("business.risks", sec_code="1376")
        with path.joinpath("2018/documents.csv").open(encoding="utf-8") as f:
            # Two lines expected; presumably a header plus one matching
            # company row (TODO confirm against the CSV format).
            self.assertEqual(len(f.readlines()), 2)
        # assertEqual replaces the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.
        self.assertEqual(
            len(list(path.joinpath("2018/docs").glob("*business_risks.txt"))),
            1)
Exemplo n.º 4
0
 def __init__(self, version="v1.0"):
     """Create the backing Storage for the given dataset version.

     Args:
         version (str): Version passed through to ``Storage``.
     """
     storage = Storage(version=version)
     self._storage = storage
Exemplo n.º 5
0
class CoARiJ(object):
    """Data management tool for the CoARiJ dataset.

    Every public method delegates to the ``Storage`` instance created in
    the constructor and returns that call's result unchanged.
    """

    def __init__(self, version="v1.0"):
        """Create the facade.

        Args:
            version (str): Dataset version passed through to ``Storage``.
        """
        self._storage = Storage(version=version)

    def download(self, directory="", kind="F", year="", force=False):
        """Download the {kind} {year} dataset to {directory}.

        Args:
            directory (str): Directory the dataset is downloaded to.
            kind (str): 'F': raw file data, 'E': text extracted data.
            year (str): Financial year of the dataset.
            force (bool): When True, overwrite data if it exists.

        Returns:
            str: Path to the downloaded directory.

        """
        # Forwarded positionally, in signature order.
        positional = (directory, kind, year, force)
        return self._storage.download(*positional)

    def extract(self,
                aspect_element,
                year="",
                edinet_code="",
                sec_code="",
                jcn="",
                source_directory="",
                target_directory="",
                normalized=True):
        """
        Extract {aspect_element} from files in {source_directory}{year} and
        save it in {target_directory}{year} as txt/html files.

        Args:
            aspect_element (str): Target aspect.element (ex: company.history).
            year (str): Target financial year.
            edinet_code (str): EDINET code to specify the company.
            sec_code (str): SEC code to specify the company.
            jcn (str): JCN code to specify the company.
            source_directory (str): Source directory that includes XBRL files.
            target_directory (str): Directory where txt/html files are saved.
            normalized (bool): True: extract text, False: save raw xml(html).

        Returns:
            str: Path to the directory of extracted files.

        """
        # The storage layer names the first option ``aspect_dot_element``.
        options = {
            "aspect_dot_element": aspect_element,
            "year": year,
            "edinet_code": edinet_code,
            "sec_code": sec_code,
            "jcn": jcn,
            "source_directory": source_directory,
            "target_directory": target_directory,
            "normalized": normalized,
        }
        return self._storage.extract(**options)

    def tokenize(self,
                 tokenizer="janome",
                 mode="",
                 dictionary="",
                 dictionary_type="",
                 year="",
                 edinet_code="",
                 sec_code="",
                 jcn="",
                 aspect_element="",
                 source_directory="",
                 target_directory=""):
        """
        Tokenize files in {source_directory}{year} with {tokenizer} and
        save the result in {target_directory}{year} as txt/html files.

        Args:
            tokenizer (str): Japanese tokenizer ('janome' or 'sudachi').
            mode (str): Sudachi tokenizer mode.
            dictionary (str): Dictionary path for Janome.
            dictionary_type (str): Dictionary type for Janome.
            year (str): Target financial year.
            edinet_code (str): EDINET code to specify the company.
            sec_code (str): SEC code to specify the company.
            jcn (str): JCN code to specify the company.
            aspect_element (str): Target aspect.element (ex: company.history).
            source_directory (str): Source directory that includes XBRL files.
            target_directory (str): Directory where txt/html files are saved.

        Returns:
            str: Path to the directory of tokenized files.

        """
        # The storage layer names the aspect option ``aspect_dot_element``.
        options = {
            "tokenizer": tokenizer,
            "mode": mode,
            "dictionary": dictionary,
            "dictionary_type": dictionary_type,
            "year": year,
            "edinet_code": edinet_code,
            "sec_code": sec_code,
            "jcn": jcn,
            "aspect_dot_element": aspect_element,
            "source_directory": source_directory,
            "target_directory": target_directory,
        }
        return self._storage.tokenize(**options)
Exemplo n.º 6
0
 def _download(self, kind="F"):
     """Download the 2018 dataset of the given kind under ``self.ROOT``.

     Args:
         kind (str): Kind suffix; it is prefixed with "X" before being
             passed to ``Storage.download``.

     Returns:
         The value returned by ``Storage.download``.
     """
     prefixed_kind = "X" + kind
     return Storage(self.ROOT).download(kind=prefixed_kind, year=2018)
Exemplo n.º 7
0
 def test_download_ledger(self):
     """Check that the ledger downloads with data and exists on disk.

     The ledger is fetched into ``{ROOT}/processed``; it must hold more
     than one data entry and its ``path`` must point at an existing file.
     """
     storage = Storage(self.ROOT)
     ledger = storage.download_ledger(directory=f"{self.ROOT}/processed")
     self.assertGreater(len(ledger.data), 1)
     # exists() must be *called*: the bare bound method is always truthy,
     # which would make this assertion pass even for a missing file.
     self.assertTrue(Path(ledger.path).exists())
Exemplo n.º 8
0
 def __init__(self):
     """Create the backing Storage using its default arguments."""
     storage = Storage()
     self._storage = storage