Exemplo n.º 1
0
    def write(self,
              filename: str,
              encoding: str = "utf-8",
              havePrevLink: bool = True) -> None:
        """
        Write the received entries into a newly created directory.

        This is a generator: entries are sent in through ``yield`` one at
        a time, and ``None`` marks the end of the input.

        The directory ``filename`` and its ``res`` sub-directory are created
        first. Data entries (``entry.isData()``) received after the first
        entry are saved into ``res``; every other entry is passed to
        ``self.saveEntry`` with its own hash and the hashes of its previous
        and next neighbours (``None`` at either end of the chain).
        Finally ``info.json`` is written into the directory.

        Raises ValueError if ``filename`` already exists, or if ``None`` is
        received before any entry.
        """
        from collections import OrderedDict as odict
        from pyglossary.json_utils import dataToPrettyJson

        # never write into a path that is already taken
        if exists(filename):
            raise ValueError(f"directory {filename!r} already exists")

        self._filename = filename
        self._encoding = encoding
        self._havePrevLink = havePrevLink
        self._resDir = join(filename, "res")
        os.makedirs(filename)
        os.mkdir(self._resDir)

        current = yield
        if current is None:
            raise ValueError("glossary is empty")

        rootHash = currentHash = self.getEntryHash(current)
        previousHash = None
        wordCount = 1

        # An entry can only be saved once the hash of its successor is
        # known, so saving always runs one entry behind the input.
        incoming = yield
        while incoming is not None:
            if incoming.isData():
                incoming.save(self._resDir)
            else:
                incomingHash = self.getEntryHash(incoming)
                self.saveEntry(
                    current,
                    currentHash,
                    previousHash,
                    incomingHash,
                )
                current = incoming
                previousHash, currentHash = currentHash, incomingHash
                wordCount += 1
            incoming = yield
        # the last entry has no successor
        self.saveEntry(current, currentHash, previousHash, None)

        infoPath = join(self._filename, "info.json")
        with open(infoPath, "w", encoding=self._encoding) as infoFile:
            info = odict([
                ("name", self._glos.getInfo("name")),
                ("root", self.hashToPath(rootHash)),
                ("havePrevLink", self._havePrevLink),
                ("wordCount", wordCount),
            ])

            # the keys set above are passed to getExtraInfos
            reservedKeys = ("name", "root", "havePrevLink", "wordCount")
            extraInfos = self._glos.getExtraInfos(reservedKeys)
            for extraKey, extraValue in extraInfos.items():
                info[extraKey] = extraValue

            infoFile.write(dataToPrettyJson(info))
Exemplo n.º 2
0
    def write(self, filename: str) -> Generator[None, "BaseEntry", None]:
        """
        Collect statistics about the received entries and save them as JSON.

        This is a generator: entries are sent in through ``yield`` one at
        a time, and ``None`` marks the end of the input.

        For every entry the definition format is detected and counted.
        For "h" (html) definitions the first tag and all tags matched by
        the tag pattern are counted as well. The glossary info plus these
        counters are written to ``filename`` as pretty-printed JSON.
        """
        import re
        from collections import Counter, OrderedDict
        from pyglossary.json_utils import dataToPrettyJson

        glos = self._glos
        re_possible_html = re.compile(r"<[a-zA-Z]+[ />]")

        defiFormatCounter = Counter()
        firstTagCounter = Counter()
        allTagsCounter = Counter()
        wordCount = 0
        while True:
            entry = yield
            if entry is None:
                break
            entry.detectDefiFormat()
            defiFormat = entry.defiFormat
            wordCount += 1
            defiFormatCounter[defiFormat] += 1
            defi = entry.defi
            if defiFormat == "m":
                if re_possible_html.match(defi):
                    log.warning(f"undetected html defi: {defi}")
            elif defiFormat == "h":
                # A definition detected as html may still contain nothing
                # the pattern matches (e.g. digits in the tag name), so an
                # empty result must not be treated as an error.
                tags = [
                    rawTag.strip("< />").lower()
                    for rawTag in re_possible_html.findall(defi)
                ]
                if tags:
                    firstTagCounter[tags[0]] += 1
                    allTagsCounter.update(tags)

        # data entries are reported separately from the format counter
        data_entry_count = defiFormatCounter["b"]
        del defiFormatCounter["b"]
        info = OrderedDict()
        for key, value in glos.iterInfo():
            info[key] = value
        info["word_count"] = wordCount
        info["data_entry_count"] = data_entry_count
        info["defi_format_counter"] = ", ".join(
            f"{defiFormat}={count}"
            for defiFormat, count in sorted(defiFormatCounter.items()))
        info["defi_tag_counter"] = ", ".join(
            f"{tag}={count}"
            for tag, count in allTagsCounter.most_common())
        info["defi_first_tag_counter"] = ", ".join(
            f"{tag}={count}"
            for tag, count in firstTagCounter.most_common())
        with open(filename, mode="w", encoding="utf-8") as _file:
            _file.write(dataToPrettyJson(info))
Exemplo n.º 3
0
    def write(self) -> "Generator[None, BaseEntry, None]":
        """
        Save the received entries as a linked chain and write info.json.

        This is a generator: entries are sent in through ``yield`` one at
        a time, and ``None`` marks the end of the input.

        Data entries (``entry.isData()``) received after the first entry
        are saved into ``self._resDir``. Every other entry is passed to
        ``self.saveEntry`` with its own hash and the hashes of its previous
        and next neighbours (``None`` at either end of the chain).

        Raises ValueError if ``None`` is received before any entry.
        """
        from collections import OrderedDict as odict
        from pyglossary.json_utils import dataToPrettyJson

        thisEntry = yield
        if thisEntry is None:
            raise ValueError("glossary is empty")

        count = 1
        rootHash = thisHash = self.getEntryHash(thisEntry)
        prevHash = None

        # An entry can only be saved once the hash of its successor is
        # known, so saving always runs one entry behind the input.
        while True:
            nextEntry = yield
            if nextEntry is None:
                break
            if nextEntry.isData():
                nextEntry.save(self._resDir)
                continue
            nextHash = self.getEntryHash(nextEntry)
            self.saveEntry(thisEntry, thisHash, prevHash, nextHash)
            thisEntry = nextEntry
            prevHash, thisHash = thisHash, nextHash
            count += 1
        # the last entry has no successor
        self.saveEntry(thisEntry, thisHash, prevHash, None)

        # Build the metadata and its JSON text before opening the file, so
        # an error raised while collecting it cannot leave a truncated
        # info.json behind.
        info = odict()
        info["name"] = self._glos.getInfo("name")
        info["root"] = self.hashToPath(rootHash)
        info["havePrevLink"] = self._havePrevLink
        info["wordCount"] = count
        # info["modified"] =

        for key, value in self._glos.getExtraInfos((
                "name",
                "root",
                "havePrevLink",
                "wordCount",
        )).items():
            info[key] = value

        infoJson = dataToPrettyJson(info)
        with open(
                join(self._filename, "info.json"),
                "w",
                encoding=self._encoding,
        ) as toFile:
            toFile.write(infoJson)
Exemplo n.º 4
0
 def saveConfig(self):
     """
     Write the valid part of ``self.config`` to ``confJsonFile`` as JSON.

     Only keys listed in ``self.configDefDict`` are considered. A key that
     is missing from ``self.config`` is skipped with a warning, and a value
     rejected by its option's ``validate`` is skipped with an error.
     """
     from pyglossary.json_utils import dataToPrettyJson

     validConfig = OrderedDict()
     for name, optionDef in self.configDefDict.items():
         if name not in self.config:
             log.warning(f"saveConfig: missing key {name!r}")
         else:
             current = self.config[name]
             if optionDef.validate(current):
                 validConfig[name] = current
             else:
                 log.error(f"saveConfig: invalid {name}={current!r}")

     serialized = dataToPrettyJson(validConfig)
     with open(confJsonFile, mode="wt", encoding="utf-8") as confFile:
         confFile.write(serialized)
     log.info(f"saved {confJsonFile!r}")
Exemplo n.º 5
0
	def write(self):
		"""
		Write every non-data entry to its own text file, then info.json.

		This is a generator: entries are sent in through ``yield`` one at
		a time, and ``None`` marks the end of the input.

		Each file is placed under ``self._filename`` at the relative path
		given by ``self.filePathFromWord`` (with the compression name added
		as extension when compression is enabled) and contains the escaped
		word, a newline and the definition. Data entries are skipped.

		Raises ValueError if no opener is available for
		``self._compression``.
		"""
		from collections import OrderedDict as odict
		from pyglossary.json_utils import dataToPrettyJson

		filename = self._filename

		wordCount = 0
		compression = self._compression
		c_open = compressionOpenFunc(compression)
		if not c_open:
			# the message must name the local variable; the previous code
			# referenced an undefined name here and raised NameError
			raise ValueError(f"invalid compression {compression!r}")
		while True:
			entry = yield
			if entry is None:
				break
			if entry.isData():
				continue
			fpath = join(filename, self.filePathFromWord(entry.b_word))
			if compression:
				fpath = f"{fpath}.{compression}"
			parentDir = dirname(fpath)
			if not isdir(parentDir):
				makedirs(parentDir)
			if isfile(fpath):
				# avoid overwriting: make the path distinct by appending
				# a short hash of the definition
				log.warning(f"file exists: {fpath}")
				fpath += f"-{sha1(entry.b_defi).hexdigest()[:4]}"
			with c_open(fpath, "wt", encoding="utf-8") as _file:
				_file.write(
					f"{escapeNTB(entry.s_word)}\n{entry.defi}"
				)
			wordCount += 1

		with open(
			join(filename, "info.json"),
			mode="w",
			encoding="utf-8",
		) as infoFile:
			info = odict()
			info["name"] = self._glos.getInfo("name")
			info["wordCount"] = wordCount
			for key, value in self._glos.getExtraInfos((
				"name",
				"wordCount",
			)).items():
				info[key] = value
			infoFile.write(dataToPrettyJson(info))
Exemplo n.º 6
0
    def write(self):
        """
        Save all non-data entries as a linked chain and write info.json.

        Entries come from ``self._iterNonDataEntries()``. Each one is passed
        to ``self.saveEntry`` with its own hash and the hashes of its
        previous and next neighbours (``None`` at either end of the chain).

        Raises ValueError if there are no entries.
        """
        from collections import OrderedDict as odict
        from pyglossary.json_utils import dataToPrettyJson

        entries = iter(self._iterNonDataEntries())
        try:
            current = next(entries)
        except StopIteration:
            raise ValueError("glossary is empty")

        rootHash = currentHash = self.getEntryHash(current)
        previousHash = None
        wordCount = 1
        # An entry can only be saved once the hash of its successor is
        # known, so saving always runs one entry behind the iteration.
        for upcoming in entries:
            upcomingHash = self.getEntryHash(upcoming)
            self.saveEntry(current, currentHash, previousHash, upcomingHash)
            previousHash = currentHash
            current, currentHash = upcoming, upcomingHash
            wordCount += 1
        # the last entry has no successor
        self.saveEntry(current, currentHash, previousHash, None)

        infoPath = join(self._filename, "info.json")
        with open(infoPath, "w", encoding=self._encoding) as infoFile:
            info = odict([
                ("name", self._glos.getInfo("name")),
                ("root", self.hashToPath(rootHash)),
                ("havePrevLink", self._havePrevLink),
                ("wordCount", wordCount),
            ])

            # the keys set above are passed to getExtraInfos
            reservedKeys = ("name", "root", "havePrevLink", "wordCount")
            extraInfos = self._glos.getExtraInfos(reservedKeys)
            for extraKey, extraValue in extraInfos.items():
                info[extraKey] = extraValue

            infoFile.write(dataToPrettyJson(info))
Exemplo n.º 7
0
	def write(self):
		"""
		Save all non-data entries as a linked chain and write info.json.

		Entries come from ``self._iterNonDataEntries()``. Each one is passed
		to ``self.saveEntry`` with its own hash and the hashes of its
		previous and next neighbours (``None`` at either end of the chain).

		Raises ValueError if there are no entries.
		"""
		from collections import OrderedDict as odict
		from pyglossary.json_utils import dataToPrettyJson

		remaining = iter(self._iterNonDataEntries())
		try:
			pending = next(remaining)
		except StopIteration:
			raise ValueError("glossary is empty")

		rootHash = pendingHash = self.getEntryHash(pending)
		olderHash = None
		# the first entry is already counted; numbering of the following
		# entries therefore starts at 2
		count = 1
		for count, newer in enumerate(remaining, start=2):
			newerHash = self.getEntryHash(newer)
			self.saveEntry(pending, pendingHash, olderHash, newerHash)
			olderHash, pendingHash = pendingHash, newerHash
			pending = newer
		# the last entry has no successor
		self.saveEntry(pending, pendingHash, olderHash, None)

		infoPath = join(self._filename, "info.json")
		with open(infoPath, "w", encoding=self._encoding) as infoFile:
			info = odict()
			info["name"] = self._glos.getInfo("name")
			info["root"] = self.hashToPath(rootHash)
			info["havePrevLink"] = self._havePrevLink
			info["wordCount"] = count

			# the keys set above are passed to getExtraInfos
			ownKeys = ("name", "root", "havePrevLink", "wordCount")
			for extraKey, extraValue in (
				self._glos.getExtraInfos(ownKeys).items()
			):
				info[extraKey] = extraValue

			infoFile.write(dataToPrettyJson(info))
Exemplo n.º 8
0
    def write(self):
        """
        Save all glossary entries as a linked chain inside a new directory.

        Entries are taken by iterating ``self._glos``. The directory
        ``self._filename`` is created once the first entry is available.
        Each entry is passed to ``self.saveEntry`` with its own hash and the
        hashes of its previous and next neighbours (``None`` at either end
        of the chain). Finally ``info.json`` is written into the directory.

        Raises ValueError if the glossary yields no entries.
        """
        from collections import OrderedDict as odict
        from pyglossary.json_utils import dataToPrettyJson

        entryIter = iter(self._glos)
        try:
            entry = next(entryIter)
        except StopIteration:
            raise ValueError('glossary is empty')

        os.makedirs(self._filename)
        total = 1
        rootHash = entryHash = self.getEntryHash(entry)
        beforeHash = None
        # An entry can only be saved once the hash of its successor is
        # known, so saving always runs one entry behind the iteration.
        for following in entryIter:
            followingHash = self.getEntryHash(following)
            self.saveEntry(entry, entryHash, beforeHash, followingHash)
            beforeHash = entryHash
            entry = following
            entryHash = followingHash
            total += 1
        # the last entry has no successor
        self.saveEntry(entry, entryHash, beforeHash, None)

        infoPath = join(self._filename, 'info.json')
        with open(infoPath, 'w', encoding=self._encoding) as infoFile:
            info = odict()
            info['name'] = self._glos.getInfo('name')
            info['root'] = self.hashToPath(rootHash)
            info['havePrevLink'] = self._havePrevLink
            info['wordCount'] = total

            # the keys set above are passed to getExtraInfos
            ownKeys = ('name', 'root', 'havePrevLink', 'wordCount')
            extraInfos = self._glos.getExtraInfos(ownKeys)
            for extraKey, extraValue in extraInfos.items():
                info[extraKey] = extraValue

            infoFile.write(dataToPrettyJson(info))
Exemplo n.º 9
0
def write(glos: GlossaryType, filename: str) -> bool:
	"""
	Collect statistics about the glossary entries and save them as JSON.

	For every entry of ``glos`` the definition format is detected and
	counted. For "h" (html) definitions the first tag matched by the tag
	pattern is counted as well. The glossary info plus these counters are
	written to ``filename`` as pretty-printed JSON.

	NOTE(review): the signature is annotated ``-> bool`` but the function
	has no return statement and therefore returns None; this is kept
	as-is because callers are not visible here.
	"""
	import re
	from collections import Counter, OrderedDict
	from pyglossary.json_utils import dataToPrettyJson

	possible_html_re = re.compile(r"<[a-zA-Z]+[ />]")

	defiFormatCounter = Counter()
	firstTagCounter = Counter()
	for entry in glos:
		entry.detectDefiFormat()
		defiFormat = entry.getDefiFormat()
		defiFormatCounter[defiFormat] += 1
		defi = entry.getDefi()
		if defiFormat == "m":
			if possible_html_re.match(defi):
				log.warning(f"undetected html defi: {defi}")
		elif defiFormat == "h":
			# A definition detected as html may still contain nothing the
			# pattern matches (e.g. digits in the tag name); search() then
			# returns None and there is no first tag to count.
			firstMatch = possible_html_re.search(defi)
			if firstMatch is not None:
				tag = firstMatch.group().strip("< />").lower()
				firstTagCounter[tag] += 1

	# data entries are reported separately from the format counter
	data_entry_count = defiFormatCounter["b"]
	del defiFormatCounter["b"]
	info = OrderedDict()
	for key, value in glos.iterInfo():
		info[key] = value
	info["data_entry_count"] = data_entry_count
	info["defi_format_counter"] = ", ".join(
		f"{defiFormat}={count}"
		for defiFormat, count in
		sorted(defiFormatCounter.items())
	)
	info["defi_first_tag_counter"] = ", ".join(
		f"{tag}={count}"
		for tag, count in
		firstTagCounter.most_common()
	)
	with open(filename, mode="w", encoding="utf-8") as _file:
		_file.write(dataToPrettyJson(info))
Exemplo n.º 10
0
    def write(self):
        """
        Save all glossary entries as a linked chain inside a new directory.

        Entries are taken by iterating ``self._glos``. The directory
        ``self._filename`` is created once the first entry is available.
        Each entry is passed to ``self.saveEntry`` with its own hash and the
        hashes of its previous and next neighbours (``None`` at either end
        of the chain). Finally ``info.json`` is written into the directory.

        Raises ValueError if the glossary yields no entries.
        """
        from collections import OrderedDict as odict
        from pyglossary.json_utils import dataToPrettyJson

        source = iter(self._glos)
        try:
            head = next(source)
        except StopIteration:
            raise ValueError('glossary is empty')

        os.makedirs(self._filename)
        rootHash = headHash = self.getEntryHash(head)
        behindHash = None
        # the first entry is already counted; numbering of the following
        # entries therefore starts at 2
        wordCount = 1
        for wordCount, ahead in enumerate(source, start=2):
            aheadHash = self.getEntryHash(ahead)
            self.saveEntry(head, headHash, behindHash, aheadHash)
            behindHash, headHash = headHash, aheadHash
            head = ahead
        # the last entry has no successor
        self.saveEntry(head, headHash, behindHash, None)

        infoPath = join(self._filename, 'info.json')
        with open(infoPath, 'w', encoding=self._encoding) as infoFile:
            info = odict([
                ('name', self._glos.getInfo('name')),
                ('root', self.hashToPath(rootHash)),
                ('havePrevLink', self._havePrevLink),
                ('wordCount', wordCount),
            ])

            # the keys set above are passed to getExtraInfos
            reservedKeys = ('name', 'root', 'havePrevLink', 'wordCount')
            extraInfos = self._glos.getExtraInfos(reservedKeys)
            for extraKey, extraValue in extraInfos.items():
                info[extraKey] = extraValue

            infoFile.write(dataToPrettyJson(info))
Exemplo n.º 11
0
    def write(self):
        """
        Save all glossary entries as a forward-linked chain in a new directory.

        Entries are taken by iterating ``self._glos``. The directory
        ``self._filename`` is created once the first entry is available.
        Each entry is passed to ``self.saveEntry`` with its own hash and the
        hash of the next entry (``None`` for the last one). Finally
        ``info.json`` is written, holding the name, root path and word count
        followed by the remaining items of ``self._glos.info``.

        Raises ValueError if the glossary yields no entries.
        """
        from collections import OrderedDict as odict
        from pyglossary.json_utils import dataToPrettyJson

        entryIter = iter(self._glos)
        try:
            current = next(entryIter)
        except StopIteration:
            raise ValueError('glossary is empty')

        os.makedirs(self._filename)
        wordCount = 1
        rootHash = currentHash = self.getEntryHash(current)
        # An entry can only be saved once the hash of its successor is
        # known, so saving always runs one entry behind the iteration.
        for upcoming in entryIter:
            upcomingHash = self.getEntryHash(upcoming)
            self.saveEntry(current, currentHash, upcomingHash)
            current, currentHash = upcoming, upcomingHash
            wordCount += 1
        # the last entry has no successor
        self.saveEntry(current, currentHash, None)

        infoPath = join(self._filename, 'info.json')
        with open(infoPath, 'w', encoding=self._encoding) as infoFile:
            info = odict([
                ('name', self._glos.getInfo('name')),
                ('root', self.hashToPath(rootHash)),
                ('wordCount', wordCount),
            ])

            # copy the glossary info without the keys already set above,
            # so the values computed here are not overwritten
            remainingInfo = self._glos.info.copy()
            for reservedKey in ('name', 'root', 'wordCount'):
                remainingInfo.pop(reservedKey, None)
            info.update(remainingInfo)

            infoFile.write(dataToPrettyJson(info))
Exemplo n.º 12
0
    def write(self) -> "Generator[None, BaseEntry, None]":
        """
        Collect statistics about the received entries and write them as JSON.

        This is a generator: entries are sent in through ``yield`` one at
        a time, and ``None`` marks the end of the input.

        Counted per entry: ``bword://`` occurrences, tags carrying a
        ``style=`` attribute, the detected definition format, html tags
        (first tag and all tags, for "h" definitions) and the writing system
        detected for the word. The glossary info plus these counters are
        written to ``self._file`` as pretty-printed JSON with a trailing
        newline.
        """
        import re
        from collections import Counter, OrderedDict
        from pyglossary.json_utils import dataToPrettyJson
        from pyglossary.langs.writing_system import getWritingSystemFromText

        def joinCounts(pairs):
            # render (key, count) pairs as "key=count, key=count, ..."
            return ", ".join(f"{key}={count}" for key, count in pairs)

        glos = self._glos

        htmlTagPattern = re.compile(r"<[a-z1-6]+[ />]", re.I)
        styledTagPattern = re.compile(
            r"<([a-z1-6]+)[^<>]* style=",
            re.I | re.DOTALL,
        )

        wordCount = 0
        bwordCount = 0

        styleByTag = Counter()
        byFormat = Counter()
        firstTags = Counter()
        allTags = Counter()
        byScript = Counter()

        entry = yield
        while entry is not None:
            # the definition is read before format detection runs
            defi = entry.defi

            wordCount += 1
            bwordCount += defi.count("bword://")

            styleByTag.update(
                styled.group(1)
                for styled in styledTagPattern.finditer(defi)
            )

            entry.detectDefiFormat()
            defiFormat = entry.defiFormat
            byFormat[defiFormat] += 1
            if defiFormat == "m":
                if htmlTagPattern.match(defi):
                    log.warn(f"undetected html defi: {defi}")
            elif defiFormat == "h":
                tags = [
                    rawTag.strip("< />").lower()
                    for rawTag in htmlTagPattern.findall(defi)
                ]
                # an html definition may contain no tag the pattern matches
                if tags:
                    firstTags[tags[0]] += 1
                    allTags.update(tags)

            script = getWritingSystemFromText(entry.s_word)
            if not script:
                log.debug(f"No script detected for word: {entry.s_word}")
                scriptName = "None"
            else:
                scriptName = script.name
            byScript[scriptName] += 1

            entry = yield

        # data entries are reported separately from the format counter
        dataEntryCount = byFormat.pop("b", 0)

        info = OrderedDict()
        for infoKey, infoValue in glos.iterInfo():
            info[infoKey] = infoValue
        info["word_count"] = wordCount
        info["bword_count"] = bwordCount
        info["data_entry_count"] = dataEntryCount
        info["defi_format"] = joinCounts(sorted(byFormat.items()))
        info["defi_tag"] = joinCounts(allTags.most_common())
        info["defi_first_tag"] = joinCounts(firstTags.most_common())
        info["style"] = joinCounts(styleByTag.most_common())
        info["source_script"] = joinCounts(byScript.most_common())
        self._file.write(dataToPrettyJson(info) + "\n")