def fill_cache(self, file_name: str) -> None:
    """Construct dictionary from words to sentences."""
    log("fill word cache")
    size = len(self.links)

    for index, id_ in enumerate(self.links.keys()):
        id_ = int(id_)
        progress_bar(index, size)

        word: str = ""
        sentence_words: Set[str] = set()
        sentence: str = self.sentence_db.get_sentence(
            self.language_2, id_
        ).text
        for symbol in sentence.lower():
            if self.language_2.has_symbol(symbol):
                word += symbol
            else:
                if word:
                    sentence_words.add(word)
                word = ""
        if word:
            sentence_words.add(word)

        for word in sentence_words:
            if word not in self.cache:
                self.cache[word] = []
            self.cache[word].append(id_)

    progress_bar(-1, size)

    with open(file_name, "w+") as output_file:
        log("writing word cache")
        json.dump(self.cache, output_file)
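# A minimal, self-contained sketch of the word-splitting step in
# `fill_cache` above.  The real code relies on `Language.has_symbol`,
# which is not shown here, so this hypothetical helper stands in with a
# plain `str.isalpha` check; it exists only for illustration.
def split_into_words(sentence: str) -> set:
    """Collect the unique lowercase words of a sentence."""
    words: set = set()
    word: str = ""
    for symbol in sentence.lower():
        if symbol.isalpha():  # Stand-in for `language.has_symbol(symbol)`.
            word += symbol
        else:
            if word:
                words.add(word)
            word = ""
    if word:
        words.add(word)
    return words


# Example: split_into_words("Let's go, let's go!") == {"let", "s", "go"}.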
def create(self, language: Language, cache_path: Path):
    table_id: str = f"{language.language.part1}_sentences"
    file_path = cache_path / f"{language.get_part3()}_sentences.tsv"

    if not file_path.exists():
        zip_path: Path = (
            cache_path / f"{language.get_part3()}_sentences.tsv.bz2"
        )
        # FIXME: remove zip file.
        if not zip_path.is_file():
            download(
                f"https://downloads.tatoeba.org/exports/per_language/"
                f"{language.get_part3()}/{language.get_part3()}"
                f"_sentences.tsv.bz2",
                zip_path,
            )
        with bz2.open(zip_path) as zip_file:
            with file_path.open("wb+") as cache_file:
                log(f"unzipping sentences for {language.get_name()}")
                cache_file.write(zip_file.read())

    self.cursor.execute(
        f"CREATE TABLE {table_id} (id integer primary key, sentence text)"
    )
    print(f"Reading {table_id}...")

    with file_path.open() as input_file:
        for line in input_file.readlines():
            id_, _, sentence = line[:-1].split("\t")
            self.cursor.execute(
                f"INSERT INTO {table_id} VALUES (?,?)", (id_, sentence)
            )
    self.connection.commit()
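# Hypothetical companion to `create` above, sketching how a single row
# could be read back from the per-language table with a plain SQLite
# cursor.  The table name and schema follow the CREATE TABLE statement
# above; the helper itself is not part of the original code.
def get_sentence_text(cursor, table_id: str, sentence_id: int):
    """Return the text of one sentence, or None if the id is unknown."""
    cursor.execute(
        f"SELECT sentence FROM {table_id} WHERE id = ?", (sentence_id,)
    )
    row = cursor.fetchone()
    return row[0] if row else None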
def write(self) -> None:
    """Serialize learning process to a file."""
    log(f"saving learning process to {self.file_path}")

    structure = {"log": [], "config": self.config}
    for record in self.records:
        structure["log"].append(record.to_structure())

    with self.file_path.open("w+") as output_file:
        json.dump(structure, output_file, ensure_ascii=False, indent=4)
def __init__(
    self,
    cache_directory_name: Path,
    interface: Interface,
    user_data: UserData,
    sentence_db: SentenceDatabase,
    frequency_db: FrequencyDatabase,
    learning: Learning,
    lexicon: Lexicon,
    get_dictionaries=None,
) -> None:
    self.interface: Interface = interface
    self.user_data: UserData = user_data

    self.known_language: Language = learning.language
    self.learning_language: Optional[Language]
    try:
        self.learning_language = construct_language(learning.subject)
    except KeyError:
        self.learning_language = None

    self.max_for_day: int = learning.ratio
    self.learning: Learning = learning
    self.dictionaries: Dictionaries = Dictionaries(
        get_dictionaries(self.learning_language) if get_dictionaries else []
    )
    self.lexicon: Lexicon = lexicon
    self.sentences: Sentences = Sentences(
        cache_directory_name,
        sentence_db,
        self.known_language,
        self.learning_language,
    )
    self.words: list[tuple[int, str]] = []

    log("getting words")
    for frequency_list_id in learning.frequency_list_ids:
        frequency_list_id: str
        for index, word, _ in frequency_db.get_words(frequency_list_id):
            index: int
            word: str
            # Keep the word unless the lexicon already marks it as known.
            if (
                not self.learning.check_lexicon
                or not self.lexicon
                or not self.lexicon.has(word)
                or self.lexicon.get(word) == LexiconResponse.DONT
            ):
                self.words.append((index, word))

    self.skip = set()
    self.stop_after_answer: bool = False
def write(self) -> None:
    """Write lexicon to a JSON file."""
    log(f"writing lexicon to {self.file_path}")

    structure: list[dict[str, Any]] = []
    for lexicon_log_id in self.logs:
        structure.append(self.logs[lexicon_log_id].serialize())

    with self.file_path.open("w+") as output:
        json.dump(structure, output, indent=4, ensure_ascii=False)
def __init__(
    self,
    cache_path: Path,
    sentence_db: SentenceDatabase,
    language_1: Language,
    language_2: Language,
):
    self.sentence_db: SentenceDatabase = sentence_db
    self.language_1: Language = language_1
    self.language_2: Language = language_2

    links_cache_path: Path = (
        cache_path / f"links_{self.language_1.get_part3()}_"
        f"{self.language_2.get_part3()}.json"
    )
    self.links: Dict[int, Set[int]] = {}

    if links_cache_path.is_file():
        self.read_link_sets(links_cache_path)
    else:
        self.read_links(cache_path)
        log("writing link cache")
        with open(links_cache_path, "w+") as output_file:
            content = {}
            for key in self.links:
                assert isinstance(key, int)
                content[key] = list(self.links[key])
            json.dump(content, output_file)

    self.cache: Dict[str, List[int]] = {}

    words_cache_path: Path = (
        cache_path / f"cache_{self.language_2.get_part3()}.json"
    )
    # FIXME: remove cache file.
    if words_cache_path.is_file():
        log("reading word cache")
        with words_cache_path.open() as input_file:
            self.cache = json.load(input_file)
    else:
        self.fill_cache(words_cache_path)
def read_links(self, cache_path: Path):
    file_path: Path = cache_path / "links.csv"

    if not file_path.exists():
        zip_path: Path = cache_path / "links.tar.bz2"
        # FIXME: remove zip file.
        if not zip_path.exists():
            download(
                "https://downloads.tatoeba.org/exports/links.tar.bz2",
                zip_path,
            )
        with bz2.open(zip_path) as zip_file:
            with file_path.open("wb+") as cache_file:
                log("unzipping links")
                cache_file.write(zip_file.read())

    log("reading links")
    with file_path.open() as input_1:
        lines = input_1.readlines()

    links: Dict[int, Set[int]] = {}
    size = len(lines)

    log(
        f"construct cache links for {self.language_1.get_name()} and "
        f"{self.language_2.get_name()}"
    )
    for index, line in enumerate(lines):
        progress_bar(index, size)
        try:
            id_1, id_2 = map(int, line[:-1].split("\t"))
        except ValueError:
            continue
        if id_1 not in links and id_2 not in links:
            set_ = {id_1, id_2}
            links[id_1] = set_
            links[id_2] = set_
        elif id_1 in links:
            set_ = links[id_1]
            set_.add(id_2)
            links[id_2] = set_
        elif id_2 in links:
            set_ = links[id_2]
            set_.add(id_1)
            links[id_1] = set_
    progress_bar(-1, size)

    sentences_1: dict[int, Sentence] = self.sentence_db.get_sentences(
        self.language_1, cache_path
    )
    sentences_2: dict[int, Sentence] = self.sentence_db.get_sentences(
        self.language_2, cache_path
    )

    # Keep only links between sentences that exist in both languages.
    for id_1 in sentences_2:
        assert isinstance(id_1, int)
        self.links[id_1] = set()
        if id_1 in links:
            for id_2 in links[id_1]:
                if id_2 in sentences_1:
                    self.links[id_1].add(id_2)
        if not self.links[id_1]:
            self.links.pop(id_1)
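# A standalone sketch of the pair-grouping loop in `read_links` above,
# assuming the same input shape: one pair of linked sentence identifiers
# per item.  `group_links` is a hypothetical helper for illustration and
# is not part of the original module.
def group_links(pairs):
    """Map every sentence id to the set of ids it is linked with."""
    links: dict = {}
    for id_1, id_2 in pairs:
        if id_1 not in links and id_2 not in links:
            set_ = {id_1, id_2}
            links[id_1] = set_
            links[id_2] = set_
        elif id_1 in links:
            set_ = links[id_1]
            set_.add(id_2)
            links[id_2] = set_
        else:
            set_ = links[id_2]
            set_.add(id_1)
            links[id_1] = set_
    return links


# Example: group_links([(1, 2), (2, 3)]) maps 1, 2, and 3 to {1, 2, 3}.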
def read_link_sets(self, file_name: Path):
    log("reading link cache")
    with file_name.open() as input_file:
        self.links = json.load(input_file)
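# JSON stores integer keys as strings and sets as lists, so a link cache
# written by the constructor above comes back reshaped after `json.load`;
# `fill_cache` compensates with `int(id_)`.  This hedged sketch shows one
# way to restore the `Dict[int, Set[int]]` shape eagerly instead; the
# helper name is an assumption, not part of the original code.
def load_link_sets(file_name: Path) -> Dict[int, Set[int]]:
    """Read a link cache file, restoring integer keys and set values."""
    with file_name.open() as input_file:
        raw: dict = json.load(input_file)
    return {int(key): set(value) for key, value in raw.items()}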