Python Storage.tokenize примеры использования

Язык программирования: Python

Пространство имен/Пакет: coarij.storage

Класс/Тип: Storage

Метод/Функция: tokenize

Примеров на hotexamples.com: 2

Python Storage.tokenize - 2 примера найдено. Это лучшие примеры Python кода для coarij.storage.Storage.tokenize, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

Storage(7)

download(4)

download_ledger(2)

extract(2)

parse(2)

tokenize(2)

get_ledger(1)

Пример #1

Показать файл

    def test_tokenize(self):
        """Check the files produced by ``Storage.tokenize`` for two tokenizers.

        First pass: parse ``company.history`` and tokenize with ``janome``,
        then verify that the returned directory contains a ``2018`` folder,
        a ``2018/documents.csv`` file and at least one
        ``*company_history_tokenized.txt`` file under ``2018/docs``.

        Second pass: parse ``business.risks`` restricted to ``sec_code="1376"``
        and tokenize with ``sudachi``, then verify that ``documents.csv`` has
        exactly two lines and that exactly one
        ``*business_risks_tokenized.txt`` file exists.
        """
        storage = Storage(self.ROOT)

        # Only the side effects of these two steps matter here; their return
        # values were previously bound to ``path`` and immediately overwritten.
        self._download(kind="F")
        storage.parse("company.history")

        path = storage.tokenize(tokenizer="janome")
        self.assertTrue(path.exists())
        self.assertTrue(path.joinpath("2018").exists())
        self.assertTrue(path.joinpath("2018/documents.csv").exists())
        history_files = list(
            path.joinpath("2018/docs").glob("*company_history_tokenized.txt"))
        self.assertGreater(len(history_files), 0)

        storage.parse("business.risks", sec_code="1376")
        path = storage.tokenize(tokenizer="sudachi")
        with path.joinpath("2018/documents.csv").open(encoding="utf-8") as f:
            # NOTE(review): presumably a header line plus one row for the
            # single selected company -- confirm against the CSV writer.
            # ``assertEqual`` replaces the ``assertEquals`` alias, which was
            # deprecated and then removed from unittest in Python 3.12.
            self.assertEqual(len(f.readlines()), 2)
        risk_files = list(
            path.joinpath("2018/docs").glob("*business_risks_tokenized.txt"))
        self.assertEqual(len(risk_files), 1)

Пример #2

Показать файл

class CoARiJ(object):
    """
    Data management tool for the CoARiJ dataset.

    Each public method forwards its arguments to the ``Storage`` instance
    created in the constructor and returns that call's result unchanged.
    """

    def __init__(self, version="v1.0"):
        """Create the backing ``Storage`` object.

        Args:
            version (str): Dataset version handed to ``Storage``.

        """
        self._storage = Storage(version=version)

    def download(self, directory="", kind="F", year="", force=False):
        """Download the {kind} {year} dataset to {directory}.

        Args:
            directory (str): Directory the dataset is downloaded to.
            kind (str): 'F': raw file data, 'E': text extracted data.
            year (str): Financial year of the dataset.
            force (bool): When True, overwrite data if it already exists.

        Returns:
            str: Path to the downloaded directory

        """
        # Arguments are forwarded positionally, in declaration order.
        backend = self._storage
        return backend.download(directory, kind, year, force)

    def extract(self,
                aspect_element,
                year="",
                edinet_code="",
                sec_code="",
                jcn="",
                source_directory="",
                target_directory="",
                normalized=True):
        """
        Extract {aspect_element} from files in {source_directory}{year} and
        save it in {target_directory}{year} as txt/html file.

        Args:
            aspect_element (str): Target aspect.element (ex: company.history).
            year (str): Target financial year.
            edinet_code (str): EDINET code to specify a company.
            sec_code (str): SEC code to specify a company.
            jcn (str): JCN code to specify a company.
            source_directory (str): Source directory that holds XBRL files.
            target_directory (str): Directory where txt/html files are saved.
            normalized (bool): True: extract text, False: save raw xml(html).

        Returns:
            str: Path to the directory of extracted files

        """
        # The storage layer names the first argument ``aspect_dot_element``.
        forwarded = {
            "aspect_dot_element": aspect_element,
            "year": year,
            "edinet_code": edinet_code,
            "sec_code": sec_code,
            "jcn": jcn,
            "source_directory": source_directory,
            "target_directory": target_directory,
            "normalized": normalized,
        }
        return self._storage.extract(**forwarded)

    def tokenize(self,
                 tokenizer="janome",
                 mode="",
                 dictionary="",
                 dictionary_type="",
                 year="",
                 edinet_code="",
                 sec_code="",
                 jcn="",
                 aspect_element="",
                 source_directory="",
                 target_directory=""):
        """
        Tokenize files in {source_directory}{year} with {tokenizer} and
        save the result in {target_directory}{year}.

        Args:
            tokenizer (str): Japanese tokenizer ('janome' or 'sudachi').
            mode (str): Sudachi tokenizer mode.
            dictionary (str): Dictionary path for Janome.
            dictionary_type (str): Dictionary type for Janome.
            year (str): Target financial year.
            edinet_code (str): EDINET code to specify a company.
            sec_code (str): SEC code to specify a company.
            jcn (str): JCN code to specify a company.
            aspect_element (str): Target aspect.element (ex: company.history).
            source_directory (str): Directory that holds the source files.
            target_directory (str): Directory where the results are saved.

        Returns:
            Path to the directory of tokenized files, as returned by the
            underlying ``Storage.tokenize`` call.

        """
        # The storage layer names ``aspect_element`` ``aspect_dot_element``.
        forwarded = {
            "tokenizer": tokenizer,
            "mode": mode,
            "dictionary": dictionary,
            "dictionary_type": dictionary_type,
            "year": year,
            "edinet_code": edinet_code,
            "sec_code": sec_code,
            "jcn": jcn,
            "aspect_dot_element": aspect_element,
            "source_directory": source_directory,
            "target_directory": target_directory,
        }
        return self._storage.tokenize(**forwarded)