def nextPair(self):
    if not self._file:
        raise StopIteration
    line = self._file.readline()
    if not line:
        raise StopIteration
    line = line.strip()  # This also removes the trailing newline
    if not line:
        return
    ###
    word, tab, defi = line.partition('\t')
    if not tab:
        log.error('Warning: line starting with "%s" has no tab!' % line[:10])
        return
    ###
    if self._glos.getPref('enable_alts', True):
        word = splitByBarUnescapeNTB(word)
        if len(word) == 1:
            word = word[0]
    else:
        word = unescapeNTB(word, bar=True)
    ###
    defi = unescapeNTB(defi)
    ###
    return word, defi

def nextPair(self) -> "Tuple[str, str]":
    if not self._file:
        raise StopIteration
    line = self.readline()
    if not line:
        raise StopIteration
    line = line.rstrip("\n")
    if not line:
        return
    ###
    word, tab, defi = line.partition("\t")
    if not tab:
        log.error(f"Warning: line starting with {line[:10]!r} has no tab!")
        return
    ###
    if self._glos.getConfig("enable_alts", True):
        word = splitByBarUnescapeNTB(word)
        if len(word) == 1:
            word = word[0]
    else:
        word = unescapeNTB(word, bar=False)
    ###
    defi = unescapeNTB(defi)
    ###
    return word, defi

def nextPair(self) -> Tuple[str, str]:
    if not self._file:
        raise StopIteration
    line = self._file.readline()
    if not line:
        raise StopIteration
    line = line.strip()  # This also removes the trailing newline
    if not line:
        return
    ###
    word, tab, defi = line.partition("\t")
    if not tab:
        log.error(
            "Warning: line starting with \"%s\" has no tab!" % line[:10]
        )
        return
    ###
    if self._glos.getPref("enable_alts", True):
        word = splitByBarUnescapeNTB(word)
        if len(word) == 1:
            word = word[0]
    else:
        word = unescapeNTB(word, bar=True)
    ###
    defi = unescapeNTB(defi)
    ###
    return word, defi

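# All three nextPair variants above lean on two helpers from pyglossary's
# text utilities. Below is a minimal sketch of what they do, assuming the
# usual backslash-escape convention (NTB = Newline, Tab, Backslash); the
# exact implementations in the source may differ.
import re

def unescapeNTB(st: str, bar: bool = False) -> str:
    # Undo backslash-escaping of newline, tab and backslash
    # (plus "|" when bar=True); \x00 guards escaped backslashes
    # so that "\\n" does not later turn into a newline.
    st = st.replace("\\\\", "\x00")
    st = st.replace("\\n", "\n")
    st = st.replace("\\t", "\t")
    if bar:
        st = st.replace("\\|", "|")
    return st.replace("\x00", "\\")

def splitByBarUnescapeNTB(st: str) -> "list[str]":
    # Split "headword|alt1|alt2" on unescaped bars, then unescape each part
    # (the lookbehind ignores the rare "\\|" corner case, for brevity).
    return [
        unescapeNTB(part, bar=True)
        for part in re.split(r"(?<!\\)\|", st)
    ]
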
def __next__(self):
    if not self._nextPath:
        log.error('iterating over a reader which is not open')
        raise StopIteration
    if self._nextPath == 'END':
        if self._pos != self._len:
            log.warning('%s words found, wordCount in info.json was %s' % (self._pos, self._len))
            self._len = self._pos
        raise StopIteration
    ###
    self._pos += 1
    ###
    with open(join(self._filename, self._nextPath), 'r', encoding=self._encoding) as fp:
        self._nextPath = fp.readline().rstrip()
        word = fp.readline().rstrip()
        defi = fp.read().rstrip()
    ###
    if self._glos.getPref('enable_alts', True):
        word = splitByBarUnescapeNTB(word)
        if len(word) == 1:
            word = word[0]
    else:
        word = unescapeNTB(word, bar=True)
    ###
    #defi = unescapeNTB(defi)
    ###
    return Entry(word, defi)

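# __next__ above walks a linked list of small files on disk: each entry
# lives in its own file whose first line names the next entry's path
# (later revisions store a "prevPath nextPath" pair there instead), the
# second line is the headword, and the rest of the file is the definition.
# A hypothetical two-entry chain, just to illustrate the layout this
# reader expects (file names and contents are made up):
import os

def writeToyEdlinDir(dirPath: str) -> None:
    os.makedirs(dirPath, exist_ok=True)
    with open(os.path.join(dirPath, "a"), "w", encoding="utf-8") as fp:
        fp.write("b\n")                 # path of the next entry's file
        fp.write("apple\n")             # headword (alternates joined with "|")
        fp.write("a round fruit\n")     # definition = rest of the file
    with open(os.path.join(dirPath, "b"), "w", encoding="utf-8") as fp:
        fp.write("END\n")               # sentinel: no more entries
        fp.write("banana\n")
        fp.write("a long fruit\n")
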
def __iter__(self) -> Iterator[BaseEntry]:
    if not self._rootPath:
        log.error("iterating over a reader which is not open")
        return  # PEP 479: raising StopIteration inside a generator becomes a RuntimeError
    wordCount = 0
    nextPath = self._rootPath
    while nextPath != "END":
        wordCount += 1
        # before or after reading word and defi
        # (and skipping empty entry)? FIXME
        with open(
            join(self._filename, nextPath),
            "r",
            encoding=self._encoding,
        ) as fromFile:
            header = fromFile.readline().rstrip()
            if self._havePrevLink:
                self._prevPath, nextPath = header.split(" ")
            else:
                nextPath = header
            word = fromFile.readline()
            if not word:
                yield None  # update progressbar
                continue
            defi = fromFile.read()
            if not defi:
                log.warning(
                    f"Edlin Reader: no definition for word {word!r}"
                    f", skipping"
                )
                yield None  # update progressbar
                continue
            word = word.rstrip()
            defi = defi.rstrip()
        if self._glos.getPref("enable_alts", True):
            word = splitByBarUnescapeNTB(word)
            if len(word) == 1:
                word = word[0]
        else:
            word = unescapeNTB(word, bar=True)
        # defi = unescapeNTB(defi)
        yield self._glos.newEntry(word, defi)
    if wordCount != self._wordCount:
        log.warning(
            f"{wordCount} words found, "
            f"wordCount in info.json was {self._wordCount}"
        )
        self._wordCount = wordCount
    resDir = self._resDir
    for fname in self._resFileNames:
        with open(join(resDir, fname), "rb") as fromFile:
            yield self._glos.newDataEntry(
                fname,
                fromFile.read(),
            )

def __next__(self):
    if not self._nextPath:
        log.error('iterating over a reader which is not open')
        raise StopIteration
    if self._nextPath == 'END':
        if self._pos != self._len:
            log.warning(
                '%s words found, wordCount in info.json was %s'
                % (self._pos, self._len)
            )
            self._len = self._pos
        raise StopIteration
    ###
    self._pos += 1
    ## before or after reading word and defi (and skipping empty entry)? FIXME
    ###
    with open(join(self._filename, self._nextPath), 'r', encoding=self._encoding) as fp:
        header = fp.readline().rstrip()
        if self._havePrevLink:
            self._prevPath, self._nextPath = header.split(' ')
        else:
            self._nextPath = header
        word = fp.readline()
        if not word:
            return
        defi = fp.read()
        if not defi:
            log.warning(
                'Edlin Reader: no definition for word "%s", skipping' % word
            )
            return
        word = word.rstrip()
        defi = defi.rstrip()
    ###
    if self._glos.getPref('enable_alts', True):
        word = splitByBarUnescapeNTB(word)
        if len(word) == 1:
            word = word[0]
    else:
        word = unescapeNTB(word, bar=True)
    ###
    #defi = unescapeNTB(defi)
    ###
    return Entry(word, defi)

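# The variant above (and the __iter__ generators) can keep a doubly linked
# chain: when self._havePrevLink is set, the header line carries two
# space-separated paths instead of one. Illustration of the two header
# shapes that the split(' ') above assumes (paths are made up):
withPrev = "entries/0007 entries/0009"   # "prevPath nextPath"
withoutPrev = "entries/0009"             # just nextPath
prevPath, nextPath = withPrev.split(" ")
assert (prevPath, nextPath) == ("entries/0007", "entries/0009")
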
def fixLinks(self, linkTargetSet):
    import gc
    from cachetools import LRUCache

    gc.collect()
    dirn = self._filename
    filenameList = self._filenameList

    fileByWord = {}
    for line in open(join(dirn, "index.txt"), encoding="utf-8"):
        line = line.rstrip("\n")
        if not line:
            continue
        entryIndex, wordEsc, filename, _ = line.split("\t")
        entryIndex = int(entryIndex)
        # entryId = f"entry{entryIndex}"
        word = unescapeNTB(wordEsc)
        if word not in linkTargetSet:
            continue
        if word in fileByWord:
            # log.info(f'fileByWord[{word}]={fileByWord[word]}, filename={filename}')
            fileByWord[word].append((filename, entryIndex))
        else:
            fileByWord[word] = [(filename, entryIndex)]

    linksByFile = LRUCache(maxsize=100)

    # with open(join(dirn, "fileByWord.json"), "w") as fileByWordFile:
    #     json.dump(fileByWord, fileByWordFile, ensure_ascii=False, indent="\t")

    def getLinksByFile(fileIndex):
        _file = linksByFile.get(fileIndex)
        if _file is not None:
            return _file
        _file = open(
            join(dirn, f"links{fileIndex}"),
            mode="a",
            encoding="utf-8",
        )
        linksByFile[fileIndex] = _file
        return _file

    log.info("")
    for line in open(join(dirn, "links.txt"), encoding="utf-8"):
        line = line.rstrip("\n")
        if not line:
            continue
        target, fileIndex, x_start, x_size = line.split("\t")
        target = unescapeNTB(target)
        if target not in fileByWord:
            targetNew = ""
        else:
            targetFilename, targetEntryIndex = fileByWord[target][0]
            # note: `filename` still holds the last value from the index.txt loop above
            if targetFilename == filename:
                continue
            targetNew = f"{targetFilename}#entry{targetEntryIndex}"
        _file = getLinksByFile(int(fileIndex))
        _file.write(f"{x_start}\t{x_size}\t{targetNew}\n")
        _file.flush()

    for _, _file in linksByFile.items():
        _file.close()
    del linksByFile
    linkTargetSet.clear()
    del fileByWord, linkTargetSet
    gc.collect()

    entry_url_fmt = self._glos.getInfo("entry_url")

    re_href = re.compile(
        b' href="[^<>"]*?"',
        re.I,
    )

    for fileIndex, filename in enumerate(filenameList):
        if not isfile(join(dirn, f"links{fileIndex}")):
            continue
        with open(join(dirn, filename), mode="rb") as inFile:
            with open(join(dirn, f"{filename}.new"), mode="wb") as outFile:
                for linkLine in open(join(dirn, f"links{fileIndex}"), "rb"):
                    outFile.flush()
                    linkLine = linkLine.rstrip(b"\n")
                    x_start, x_size, target = linkLine.split(b"\t")
                    # offsets/sizes are stored as hex; copy the bytes up to the link
                    outFile.write(
                        inFile.read(int(x_start, 16) - inFile.tell())
                    )
                    curLink = inFile.read(int(x_size, 16))
                    if target:
                        outFile.write(
                            re_href.sub(
                                b' href="./' + target + b'"',
                                curLink,
                            )
                        )
                        continue
                    if not entry_url_fmt:
                        outFile.write(
                            curLink.replace(
                                b' href="#',
                                b' class="broken" href="#',
                            )
                        )
                        continue
                    _st = curLink.decode("utf-8")
                    i = _st.find('href="#')
                    j = _st.find('"', i + 7)
                    word = _st[i + 7:j]
                    url = entry_url_fmt.format(word=word)
                    outFile.write(
                        (
                            _st[:i]
                            + f'class="broken" href="{url}"'
                            + _st[j + 1:]
                        ).encode("utf-8")
                    )
                outFile.write(inFile.read())
        os.rename(join(dirn, f"{filename}.new"), join(dirn, filename))
        os.remove(join(dirn, f"links{fileIndex}"))

def fixLinks(self, linkTargetSet):
    import gc
    from cachetools import LRUCache

    gc.collect()
    dirn = self._filename
    filenameList = self._filenameList

    fileByWord = {}
    for line in open(join(dirn, "index.txt"), encoding="utf-8"):
        line = line.rstrip("\n")
        if not line:
            continue
        word, filename, _ = line.split("\t")
        word = unescapeNTB(word)
        if word not in linkTargetSet:
            continue
        fileByWord[word] = filename

    linksByFile = LRUCache(maxsize=100)

    def getLinksByFile(fileIndex):
        _file = linksByFile.get(fileIndex)
        if _file is not None:
            return _file
        _file = open(
            join(dirn, f"links{fileIndex}"),
            mode="a",
            encoding="utf-8",
        )
        linksByFile[fileIndex] = _file
        return _file

    log.info("")
    for line in open(join(dirn, "links.txt"), encoding="utf-8"):
        line = line.rstrip("\n")
        if not line:
            continue
        target, fileIndex, x_start, x_size = line.split("\t")
        target = unescapeNTB(target)
        if target not in fileByWord:
            targetFilename = ""
        else:
            targetFilename = fileByWord[target]
            # note: `filename` is left over from the index.txt loop above
            if targetFilename == filename:
                continue
        _file = getLinksByFile(int(fileIndex))
        _file.write(
            f"{x_start}\t{x_size}\t{targetFilename}\n"
        )
        _file.flush()

    for _, _file in linksByFile.items():
        _file.close()
    del linksByFile
    linkTargetSet.clear()
    del fileByWord, linkTargetSet
    gc.collect()

    entry_url_fmt = self._glos.getInfo("entry_url")

    for fileIndex, filename in enumerate(filenameList):
        with open(join(dirn, filename), mode="rb") as inFile:
            with open(join(dirn, f"{filename}.new"), mode="wb") as outFile:
                for linkLine in open(join(dirn, f"links{fileIndex}"), "rb"):
                    outFile.flush()
                    linkLine = linkLine.rstrip(b"\n")
                    x_start, x_size, targetFilename = linkLine.split(b"\t")
                    outFile.write(inFile.read(
                        int(x_start, 16) - inFile.tell()
                    ))
                    curLink = inFile.read(int(x_size, 16))
                    if targetFilename:
                        outFile.write(curLink.replace(
                            b' href="#',
                            b' href="./' + targetFilename + b'#',
                        ))
                        continue
                    if not entry_url_fmt:
                        outFile.write(curLink.replace(
                            b' href="#',
                            b' class="broken" href="#',
                        ))
                        continue
                    _st = curLink.decode("utf-8")
                    i = _st.find('href="#')
                    j = _st.find('"', i + 7)
                    word = _st[i + 7:j]
                    url = entry_url_fmt.format(word=word)
                    outFile.write((
                        _st[:i]
                        + f'class="broken" href="{url}"'
                        + _st[j + 1:]
                    ).encode("utf-8"))
                outFile.write(inFile.read())
        os.rename(join(dirn, f"{filename}.new"), join(dirn, filename))
        os.remove(join(dirn, f"links{fileIndex}"))

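# Both fixLinks variants consume two tab-separated intermediate files written
# earlier by the writer. A hypothetical illustration of the record layouts
# the parsing above assumes (field names and values are made up; offsets and
# sizes are hex, per int(x_start, 16)):
#
# index.txt: one record per written entry
#   newer variant: entryIndex \t escapedWord \t filename \t ...
#   older variant: escapedWord \t filename \t ...
# links.txt: one record per intra-glossary href
#   escapedTarget \t fileIndex \t hexByteOffset \t hexByteSize
indexLine = "42\tapple\t000.html\t\n"
linkLine = "apple\t3\t1f40\t2a\n"  # link at byte 0x1f40, 0x2a bytes long
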
def __iter__(self):
    if not self._rootPath:
        log.error("iterating over a reader which is not open")
        return  # PEP 479: raising StopIteration inside a generator becomes a RuntimeError
    wordCount = 0
    nextPath = self._rootPath
    while nextPath != "END":
        wordCount += 1
        # before or after reading word and defi
        # (and skipping empty entry)? FIXME
        with open(
            join(self._filename, nextPath),
            "r",
            encoding=self._encoding,
        ) as fromFile:
            header = fromFile.readline().rstrip()
            if self._havePrevLink:
                self._prevPath, nextPath = header.split(" ")
            else:
                nextPath = header
            word = fromFile.readline()
            if not word:
                yield None  # update progressbar
                continue
            defi = fromFile.read()
            if not defi:
                log.warning(
                    "Edlin Reader: no definition for word %r" % word
                    + ", skipping"
                )
                yield None  # update progressbar
                continue
            word = word.rstrip()
            defi = defi.rstrip()
        if self._glos.getPref("enable_alts", True):
            word = splitByBarUnescapeNTB(word)
            if len(word) == 1:
                word = word[0]
        else:
            word = unescapeNTB(word, bar=True)
        # defi = unescapeNTB(defi)
        yield self._glos.newEntry(word, defi)
    if wordCount != self._wordCount:
        log.warning(
            "%s words found, " % wordCount
            + "wordCount in info.json was %s" % self._wordCount
        )
        self._wordCount = wordCount
    resDir = self._resDir
    for fname in self._resFileNames:
        with open(join(resDir, fname), "rb") as fromFile:
            yield self._glos.newDataEntry(
                fname,
                fromFile.read(),
            )

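# A hypothetical consumer for either __iter__ variant, showing the contract
# the generators assume: None entries exist only to advance a progressbar,
# while real entries carry word and definition. The name is mine, not from
# the source.
def countEntries(reader) -> int:
    # Skip the None placeholders; count everything else.
    return sum(1 for entry in reader if entry is not None)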