def writeGramGroups(
	self,
	hf: "lxml.etree.htmlfile",
	gramGrpList: "List[lxml.etree.htmlfile]",
):
	"""Render each grammar group as a colored <font> element followed by <br>.

	Every child of a group is normalized through normalizeGramGrpChild;
	empty results are dropped. The surviving parts are joined with a comma
	separator — when auto-RTL is enabled, the comma character is taken from
	the writing system detected on the first part.
	"""
	from lxml import etree as ET

	rtl_enabled = self._auto_rtl
	font_color = self._gram_color
	for group in gramGrpList:
		pieces = [
			normalized
			for normalized in (
				self.normalizeGramGrpChild(child)
				for child in group.iterchildren()
			)
			if normalized
		]
		if not pieces:
			continue
		separator = ", "
		if rtl_enabled:
			writing_system = getWritingSystemFromText(pieces[0])
			if writing_system:
				separator = writing_system.comma + " "
		with hf.element("font", color=font_color):
			hf.write(separator.join(pieces))
		hf.write(ET.Element("br"))
def __iter__(self):
	"""Yield a glossary entry for every row of WordsTable.

	Alternate headwords are first gathered from the Keys table, then each
	word row is turned into an HTML definition: "|" becomes <br>, the root
	(if any) is linked via bword://, and RTL text is wrapped in a dir="rtl"
	div.
	"""
	from pyglossary.langs.writing_system import getWritingSystemFromText

	alternates = {}
	self._cur.execute("select wordkey, searchwordkey from Keys")
	for wordkey, searchwordkey in self._cur.fetchall():
		alternates.setdefault(wordkey, []).append(searchwordkey)

	self._cur.execute(
		"select word, searchword, root, meaning from WordsTable"
		" order by id")
	# NOTE: iterating self._cur directly stops after a single entry
	# (and fetchone() then returns None), so fetchall() is used instead.
	for word, searchword, root, meaning in self._cur.fetchall():
		definition = meaning.replace("|", "<br>")
		if root:
			definition += (
				f'<br>Root: <a href="bword://{html.escape(root)}">{root}</a>'
			)
		ws = getWritingSystemFromText(meaning)
		if ws and ws.direction == "rtl":
			definition = f'<div dir="rtl">{definition}</div>'
		yield self._glos.newEntry(
			[word, searchword] + alternates.get(word, []),
			definition,
			defiFormat="h",
		)
def write(self) -> "Generator[None, BaseEntry, None]":
	"""Consume entries sent into this generator and write glossary
	statistics to self._file as pretty-printed JSON.

	Collected statistics: word and bword:// counts, definition-format
	distribution, HTML tag usage (first tag of each "h" defi and all tags),
	inline-style usage per tag, and the writing system (script) detected
	for each headword. The loop ends when None is sent.
	"""
	import re
	from collections import Counter, OrderedDict

	from pyglossary.json_utils import dataToPrettyJson
	from pyglossary.langs.writing_system import getWritingSystemFromText

	glos = self._glos

	# Matches the start of an HTML tag such as "<div ", "<br/>", "<h1>".
	re_possible_html = re.compile(
		r"<[a-z1-6]+[ />]",
		re.I,
	)
	# Captures the tag name of any element carrying an inline style attribute.
	re_style = re.compile(
		r"<([a-z1-6]+)[^<>]* style=",
		re.I | re.DOTALL,
	)

	wordCount = 0
	bwordCount = 0

	styleByTagCounter = Counter()
	defiFormatCounter = Counter()
	firstTagCounter = Counter()
	allTagsCounter = Counter()
	sourceScriptCounter = Counter()

	while True:
		entry = yield
		if entry is None:
			break
		defi = entry.defi

		wordCount += 1
		bwordCount += defi.count("bword://")

		for m in re_style.finditer(defi):
			styleByTagCounter[m.group(1)] += 1

		entry.detectDefiFormat()
		defiFormat = entry.defiFormat
		defiFormatCounter[defiFormat] += 1
		if defiFormat == "m":
			if re_possible_html.match(defi):
				# FIX: log.warn is a deprecated alias of log.warning
				log.warning(f"undetected html defi: {defi}")
		elif defiFormat == "h":
			match = re_possible_html.search(defi)
			if match is not None:
				tag = match.group().strip("< />").lower()
				firstTagCounter[tag] += 1
			for tag in re_possible_html.findall(defi):
				tag = tag.strip("< />").lower()
				allTagsCounter[tag] += 1

		ws = getWritingSystemFromText(entry.s_word)
		if ws:
			wsName = ws.name
		else:
			log.debug(f"No script detected for word: {entry.s_word}")
			wsName = "None"
		sourceScriptCounter[wsName] += 1

	# Data entries (format "b") are reported separately, not as a defi format.
	data_entry_count = defiFormatCounter["b"]
	del defiFormatCounter["b"]  # Counter.__delitem__ is a no-op for missing keys

	info = OrderedDict()
	for key, value in glos.iterInfo():
		info[key] = value
	info["word_count"] = wordCount
	info["bword_count"] = bwordCount
	info["data_entry_count"] = data_entry_count
	info["defi_format"] = ", ".join(
		f"{defiFormat}={count}"
		for defiFormat, count in sorted(defiFormatCounter.items())
	)
	info["defi_tag"] = ", ".join(
		f"{tag}={count}"
		for tag, count in allTagsCounter.most_common()
	)
	info["defi_first_tag"] = ", ".join(
		f"{tag}={count}"
		for tag, count in firstTagCounter.most_common()
	)
	info["style"] = ", ".join(
		f"{tag}={count}"
		for tag, count in styleByTagCounter.most_common()
	)
	info["source_script"] = ", ".join(
		f"{script}={count}"
		for script, count in sourceScriptCounter.most_common()
	)
	self._file.write(dataToPrettyJson(info) + "\n")
def getTitleTag(self, sample: str) -> str:
	"""Return the HTML tag for wrapping a title, chosen from the sample's
	writing system; fall back to "b" when no system is detected."""
	writing_system = getWritingSystemFromText(sample)
	return writing_system.titleTag if writing_system else "b"
def getCommaSep(self, sample: str):
	"""Return the separator for comma-joined text.

	When auto-RTL is enabled and a writing system is detected for the
	sample, use that system's comma character; otherwise use ", ".
	"""
	if not self._auto_rtl:
		return ", "
	writing_system = getWritingSystemFromText(sample)
	if writing_system:
		return writing_system.comma + " "
	return ", "