Example No. 1
    def fill_cache(self, file_name: str) -> None:
        """Construct mapping from words to identifiers of sentences
        containing them."""
        log("fill word cache")
        size = len(self.links)
        for index, id_ in enumerate(self.links.keys()):
            id_ = int(id_)
            progress_bar(index, size)
            word: str = ""
            sentence_words: Set[str] = set()
            sentence: str = self.sentence_db.get_sentence(
                self.language_2, id_
            ).text
            # Split the sentence into words: any symbol outside the language
            # alphabet terminates the current word.
            for symbol in sentence.lower():
                if self.language_2.has_symbol(symbol):
                    word += symbol
                else:
                    if word:
                        sentence_words.add(word)
                    word = ""
            if word:
                sentence_words.add(word)
            for word in sentence_words:
                if word not in self.cache:
                    self.cache[word] = []
                self.cache[word].append(id_)
        progress_bar(-1, size)
        with open(file_name, "w+") as output_file:
            log("writing word cache")
            json.dump(self.cache, output_file)
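The cache built above maps each lowercased word to the list of identifiers of sentences containing it. A minimal sketch of loading and querying such an index, assuming only the JSON file written by `fill_cache`; the helper names here are illustrative, not part of the project:

import json
from typing import Dict, List

def load_word_cache(file_name: str) -> Dict[str, List[int]]:
    """Load a word-to-sentence-identifiers index written by `fill_cache`."""
    with open(file_name) as input_file:
        return json.load(input_file)

def sentences_with_word(cache: Dict[str, List[int]], word: str) -> List[int]:
    """Return identifiers of all sentences containing the word."""
    return cache.get(word.lower(), [])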
Example No. 2
    def create(self, language: Language, cache_path: Path) -> None:
        table_id: str = f"{language.language.part1}_sentences"
        file_path = cache_path / f"{language.get_part3()}_sentences.tsv"

        if not file_path.exists():
            zip_path: Path = (cache_path /
                              f"{language.get_part3()}_sentences.tsv.bz2")
            # FIXME: remove zip file.
            if not zip_path.is_file():
                download(
                    f"https://downloads.tatoeba.org/exports/per_language/"
                    f"{language.get_part3()}/{language.get_part3()}"
                    f"_sentences.tsv.bz2",
                    zip_path,
                )
            with bz2.open(zip_path) as zip_file:
                with file_path.open("wb+") as cache_file:
                    log(f"unzipping sentences for {language.get_name()}")
                    cache_file.write(zip_file.read())

        self.cursor.execute(
            f"CREATE TABLE {table_id} (id integer primary key, sentence text)")
        print(f"Reading {table_id}...")
        with file_path.open() as input_file:
            for line in input_file:
                id_, _, sentence = line[:-1].split("\t")
                self.cursor.execute(f"INSERT INTO {table_id} VALUES (?,?)",
                                    (id_, sentence))
        self.connection.commit()
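A minimal sketch of reading one sentence back from the table created above, assuming an open sqlite3 connection and the same `<part1>_sentences` table naming; the project's own accessor (`get_sentence` in Example No. 1) may differ:

import sqlite3

def get_sentence_text(
    connection: sqlite3.Connection, table_id: str, id_: int
) -> str:
    """Fetch the text of one sentence by its Tatoeba identifier."""
    cursor = connection.execute(
        f"SELECT sentence FROM {table_id} WHERE id = ?", (id_,)
    )
    row = cursor.fetchone()
    if row is None:
        raise KeyError(f"no sentence {id_} in table {table_id}")
    return row[0]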
Example No. 3
File: core.py Project: enzet/Emmio
    def write(self) -> None:
        """Serialize learning process to a file."""
        log(f"saving learning process to {self.file_path}")
        structure = {"log": [], "config": self.config}
        for record in self.records:
            structure["log"].append(record.to_structure())
        with self.file_path.open("w+") as output_file:
            json.dump(structure, output_file, ensure_ascii=False, indent=4)
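Reading the file back is the symmetric operation. A minimal sketch, assuming only the JSON layout written above, where `log` holds raw record structures and `config` the configuration mapping:

import json
from pathlib import Path
from typing import Any

def read_learning_structure(file_path: Path) -> dict[str, Any]:
    """Load a serialized learning process written by `write`."""
    with file_path.open() as input_file:
        return json.load(input_file)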
Example No. 4
    def __init__(
        self,
        cache_directory_name: Path,
        interface: Interface,
        user_data: UserData,
        sentence_db: SentenceDatabase,
        frequency_db: FrequencyDatabase,
        learning: Learning,
        lexicon: Lexicon,
        get_dictionaries=None,
    ) -> None:
        self.interface: Interface = interface
        self.user_data: UserData = user_data
        self.known_language: Language = learning.language

        self.learning_language: Optional[Language]
        try:
            self.learning_language = construct_language(learning.subject)
        except KeyError:
            self.learning_language = None

        self.max_for_day: int = learning.ratio
        self.learning: Learning = learning
        # Guard against the default `None` callback, assuming `Dictionaries`
        # accepts an empty list.
        self.dictionaries: Dictionaries = Dictionaries(
            get_dictionaries(self.learning_language)
            if get_dictionaries is not None
            else []
        )

        self.lexicon: Lexicon = lexicon

        self.sentences: Sentences = Sentences(
            cache_directory_name,
            sentence_db,
            self.known_language,
            self.learning_language,
        )

        self.words: list[tuple[int, str]] = []
        log("getting words")
        for frequency_list_id in learning.frequency_list_ids:
            for index, word, _ in frequency_db.get_words(frequency_list_id):
                if (
                    not self.learning.check_lexicon
                    or not self.lexicon
                    or not self.lexicon.has(word)
                    or self.lexicon.get(word) == LexiconResponse.DONT
                ):
                    self.words.append((index, word))

        self.skip = set()

        self.stop_after_answer: bool = False
Example No. 5
    def write(self) -> None:
        """
        Write lexicon to a JSON file using string writing. Should be faster than
        `write_json` but less accurate.
        """
        log(f"writing lexicon to {self.file_path}")

        structure: list[dict[str, Any]] = []

        for lexicon_log in self.logs.values():
            structure.append(lexicon_log.serialize())

        with self.file_path.open("w+") as output:
            json.dump(structure, output, indent=4, ensure_ascii=False)
Example No. 6
    def __init__(
        self,
        cache_path: Path,
        sentence_db: SentenceDatabase,
        language_1: Language,
        language_2: Language,
    ) -> None:
        self.sentence_db: SentenceDatabase = sentence_db
        self.language_1: Language = language_1
        self.language_2: Language = language_2

        links_cache_path: Path = (cache_path /
                                  f"links_{self.language_1.get_part3()}_"
                                  f"{self.language_2.get_part3()}.json")

        self.links: Dict[int, Set[int]] = {}

        if links_cache_path.is_file():
            self.read_link_sets(links_cache_path)
        else:
            self.read_links(cache_path)
            log("writing link cache")
            with open(links_cache_path, "w+") as output_file:
                content = {}
                for key in self.links:
                    assert isinstance(key, int)
                    content[key] = list(self.links[key])
                json.dump(content, output_file)

        self.cache: Dict[str, List[int]] = {}

        word_cache_path: Path = (
            cache_path / f"cache_{self.language_2.get_part3()}.json"
        )
        # FIXME: remove cache file.

        if word_cache_path.is_file():
            log("reading word cache")
            with word_cache_path.open() as input_file:
                self.cache = json.load(input_file)
        else:
            self.fill_cache(str(word_cache_path))
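Both caches above follow the same pattern: reuse the file if it exists, otherwise build the content and persist it. A generic sketch of that pattern, independent of the class; names are illustrative:

import json
from pathlib import Path
from typing import Any, Callable

def cached_json(path: Path, build: Callable[[], Any]) -> Any:
    """Return JSON content from `path`, building and writing it first if
    the file does not exist yet."""
    if path.is_file():
        with path.open() as input_file:
            return json.load(input_file)
    content: Any = build()
    with path.open("w+") as output_file:
        json.dump(content, output_file)
    return content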
Example No. 7
    def read_links(self, cache_path: Path) -> None:
        file_path: Path = cache_path / "links.csv"

        if not file_path.exists():
            zip_path: Path = cache_path / "links.tar.bz2"
            # FIXME: remove zip file.
            if not zip_path.exists():
                download(
                    "https://downloads.tatoeba.org/exports/links.tar.bz2",
                    zip_path,
                )
            with bz2.open(zip_path) as zip_file:
                with file_path.open("wb+") as cache_file:
                    log("unzipping links")
                    cache_file.write(zip_file.read())

        log("reading links")
        with file_path.open() as input_file:
            lines = input_file.readlines()

        links: Dict[int, Set[int]] = {}

        size = len(lines)

        log(f"construct cache links for {self.language_1.get_name()} and "
            f"{self.language_2.get_name()}")
        for index, line in enumerate(lines):
            progress_bar(index, size)

            try:
                id_1, id_2 = map(int, line[:-1].split("\t"))
            except ValueError:
                continue

            # A linked pair shares one set object, so identifiers added
            # through either sentence of the pair stay visible through both.
            if id_1 not in links and id_2 not in links:
                set_ = {id_1, id_2}
                links[id_1] = set_
                links[id_2] = set_

            if id_1 in links:
                set_ = links[id_1]
                set_.add(id_2)
                links[id_2] = set_

            if id_2 in links:
                set_ = links[id_2]
                set_.add(id_1)
                links[id_1] = set_

        progress_bar(-1, size)

        sentences_1: dict[int, Sentence] = self.sentence_db.get_sentences(
            self.language_1, cache_path)
        sentences_2: dict[int, Sentence] = self.sentence_db.get_sentences(
            self.language_2, cache_path)

        # Keep only sentences of language 2 that have at least one
        # translation into language 1.
        for id_2 in sentences_2:
            assert isinstance(id_2, int)
            self.links[id_2] = set()
            if id_2 in links:
                for id_1 in links[id_2]:
                    if id_1 in sentences_1:
                        self.links[id_2].add(id_1)
            if not self.links[id_2]:
                self.links.pop(id_2)
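The merging loop earlier stores one shared set object under both identifiers of a pair, so a link added through either key becomes visible through the other. A standalone illustration of that aliasing:

links: dict[int, set[int]] = {}

set_ = {1, 2}
links[1] = set_
links[2] = set_

links[1].add(3)  # mutate through one key...
links[3] = links[1]

assert links[2] == {1, 2, 3}  # ...and observe through the other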
Example No. 8
    def read_link_sets(self, file_name: Path) -> None:
        log("reading link cache")
        with file_name.open() as input_file:
            # JSON stores integer keys as strings and sets as lists: restore
            # the declared `Dict[int, Set[int]]` shape after loading.
            self.links = {
                int(key): set(values)
                for key, values in json.load(input_file).items()
            }
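JSON objects allow only string keys and have no set type, so `json.dump` coerces the integer identifiers to strings and a plain `json.load` returns lists; the comprehension above restores the declared `Dict[int, Set[int]]` shape. A standalone illustration of the round trip:

import json

content = {608: [1276, 2481]}
restored = json.loads(json.dumps(content))
assert list(restored) == ["608"]  # integer keys came back as strings

links = {int(key): set(values) for key, values in restored.items()}
assert links == {608: {1276, 2481}}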