def analyze_path(path):
    by_tag = collections.defaultdict(collections.Counter)
    by_class = collections.defaultdict(collections.Counter)
    pnum_classes = {}

    for infile in path.glob('**/*.html'):
        doc = html.parse(str(infile), encoding='utf8')
        for e in doc.getroot().cssselect('[class]'):
            for class_ in e.attrib['class'].split():
                by_tag[e.tag][class_] += 1
                by_class[class_][e.tag] += 1
                if 'id' in e.attrib and not e.text_content():
                    by_class[class_]['pnum'] += 1

    defaults = {}
    for class_, counter in by_class.items():
        pnum_count = counter['pnum']
        if pnum_count:
            del counter['pnum']
        tag, count = counter.most_common(1)[0]
        defaults[class_] = tag
        if pnum_count > count / 2:
            pnum_classes[class_] = pnum_count

    return {
        'defaults': defaults,
        'by_tag': {tag: dict(val) for tag, val in by_tag.items()},
        'by_class': {class_: dict(val) for class_, val in by_class.items()},
        'pnum_classes': pnum_classes
    }
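# A minimal usage sketch for analyze_path (assumes the imports it relies on,
# collections and lxml.html as html, are in place; the directory path below is
# a made-up example):
import json
import pathlib

stats = analyze_path(pathlib.Path('text/en'))
print(json.dumps(stats['defaults'], indent=2, ensure_ascii=False))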
def get_all_pali_words():
    if "pali_words" not in cache:
        words = Counter()
        for file in (sc.text_dir / 'pi' / 'su' / 'mn').glob('**/*.html'):
            doc = html.parse(str(file))
            root = doc.getroot()
            for e in root.cssselect('#metaarea'):
                e.drop_tree()
            text = root.text_content()
            text = regex.sub(r'[\xad”’]', '', text)
            words_from_text = regex.findall(r'\p{alpha}+', text)
            words.update(words_from_text)
            words.update(word.rstrip('ṃ')
                         for word in words_from_text
                         if word.endswith('ṃ'))

        result = {}
        for word, count in words.most_common():
            asc_word = asciify(word)
            if asc_word not in result:
                result[asc_word] = ((word, count),)
            else:
                result[asc_word] = result[asc_word] + ((word, count),)
        cache["pali_words"] = result
    return cache["pali_words"]
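# A usage sketch (assumes the module-level cache dict, asciify helper, sc,
# regex, Counter and lxml.html bindings that the function above relies on):
pali_words = get_all_pali_words()
for asc_word, variants in list(pali_words.items())[:5]:
    # Each key is an ASCII-folded form; the value is a tuple of
    # (diacritical spelling, count) pairs, most frequent first.
    print(asc_word, variants)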
def process(self, srcfo):
    skipped = []
    found = []
    pinyin_match = regex.compile(r'Pinyin: (.*)').match
    # The source file is not a true XML tree so we need to jump
    # through some hoops. Yay, hoops!
    with tempfile.NamedTemporaryFile('w+') as truetree:
        truetree.write('<root>')
        truetree.writelines(srcfo)
        truetree.write('</root>')
        truetree.flush()
        truetree.seek(0)
        root = html.parse(truetree.name).getroot()
    self.root = root
    entries = []
    for entry in root.iter('entry'):
        head = pinyin = meaning = None
        try:
            head = entry.text.strip()
            for e in entry:
                m = pinyin_match(e.text_content())
                if m:
                    pinyin = m[1]
                    break
            meaning = entry.select_one('b').tail.lstrip(': ')
            if not head or not pinyin or not meaning:
                logging.warning('Incomplete buddhdic entry: {!s}'.format(entry))
            if self.existing.issuperset(head):
                entries.append('"{}": {}'.format(head, [pinyin, meaning]))
                self.seen.update(head)
                found.append((head, meaning))
            else:
                skipped.append((head, meaning))
        except Exception:
            print(head, pinyin, meaning)
            print(str(entry))
    if skipped:
        logging.info('{} entries do and {} entries do not appear '
                     'in SuttaCentral texts'.format(len(found), len(skipped)))
        if self.args.verbose:
            logging.info('Entries which do not appear: ')
            logging.info(', '.join('{}: {}'.format(head, meaning)
                                   for head, meaning in skipped))
    return 'sc.lzh2enData = {\n' + ',\n'.join(entries) + '\n}'
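# A tiny standalone illustration of the wrap-in-a-root-element trick used
# above (assumes lxml is available; the fragment content is made up):
from lxml import html as lxml_html

fragment = '<entry>foo</entry><entry>bar</entry>'
wrapped = lxml_html.fromstring('<root>' + fragment + '</root>')
print([e.text for e in wrapped.iter('entry')])  # ['foo', 'bar']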
def build(self, lang_dir, force=False):
    # The pagenumbinator should be scoped because it uses
    # a large chunk of memory which should be gc'd.
    # But it shouldn't be created at all if we don't need it.
    # So we use a getter, and delete it when we are done.
    self._ppn = None

    codepoints = set()
    bold_codepoints = set()
    italic_codepoints = set()

    lang_uid = lang_dir.stem

    all_files = sorted(lang_dir.glob('**/*.html'),
                       key=lambda f: sc.util.numericsortkey(f.stem))
    files = ([f for f in all_files if f.stem == 'metadata'] +
             [f for f in all_files if f.stem != 'metadata'])
    for i, htmlfile in enumerate(files):
        try:
            if not self._should_process_file(htmlfile, force):
                continue
            logger.info('Adding file: {!s}'.format(htmlfile))
            uid = htmlfile.stem
            root = html.parse(str(htmlfile)).getroot()

            # Set codepoint data
            _stack = [root]
            while _stack:
                e = _stack.pop()
                if self.is_bold(lang_uid, e):
                    bold_codepoints.update(e.text_content())
                elif self.is_italic(lang_uid, e):
                    italic_codepoints.update(e.text_content())
                else:
                    _stack.extend(e)
            codepoints.update(root.text_content())

            # Set the previous and next uids, using explicit data
            # if available, otherwise making a safe guess.
            # The safe guess relies on comparing uids, and will not
            # capture relationships such as the order of patimokha
            # rules.
            prev_uid = root.get('data-prev')
            next_uid = root.get('data-next')
            if not (prev_uid or next_uid):
                if i > 0:
                    prev_uid = files[i - 1].stem
                    if not self.uids_are_related(uid, prev_uid):
                        prev_uid = None
                if i + 1 < len(files):
                    next_uid = files[i + 1].stem
                    if not self.uids_are_related(uid, next_uid):
                        next_uid = None

            path = htmlfile.relative_to(sc.text_dir)
            author = self._get_author(root, lang_uid, uid)

            if uid == 'metadata':
                if author is None:
                    raise ValueError('Metadata file {} does not define author'.format(path))
                self.add_metadata(path, author, root)
                continue

            if author is None:
                metadata = self.get_metadata(path)
                if metadata:
                    author = metadata['author']
            if author is None:
                metadata = root.select_one('#metaarea')
                if metadata:
                    metadata_text = metadata.text_content()
                    m = regex.match(r'.{,80}\.', metadata_text)
                    if not m:
                        m = regex.match(r'.{,80}(?=\s)', metadata_text)
                    if m:
                        author = m[0]
            if author is None:
                logger.warning('Could not determine author for {}/{}'.format(lang_uid, uid))
                author = ''

            name = self._get_name(root, lang_uid, uid)
            volpage = self._get_volpage(root, lang_uid, uid)
            embedded = self._get_embedded_uids(root, lang_uid, uid)

            fstat = htmlfile.stat()
            cdate = self.datestr(fstat.st_ctime)
            mdate = self.datestr(fstat.st_mtime)

            textinfo = TextInfo(uid=uid, lang=lang_uid, path=path,
                                name=name, author=author, volpage=volpage,
                                prev_uid=prev_uid, next_uid=next_uid,
                                cdate=cdate, mdate=mdate, file_uid=uid)
            self.add_text_info(lang_uid, uid, textinfo)

            for child in embedded:
                child.path = path
                child.author = author
                child.file_uid = uid
                self.add_text_info(lang_uid, child.uid, child)

            m = regex.match(r'(.*?)(\d+)-(\d+)$', uid)
            if m:
                range_textinfo = TextInfo(uid=uid + '#', lang=lang_uid,
                                          path=path, name=name,
                                          author=author, volpage=volpage,
                                          file_uid=uid)
                start = int(m[2])
                end = int(m[3]) + 1
                for i in range(start, end):
                    iuid = m[1] + str(i)
                    if self.exists(iuid, lang_uid):
                        continue
                    self.add_text_info(lang_uid, iuid, range_textinfo)
        except Exception:
            print('An exception occurred: {!s}'.format(htmlfile))
            raise

    self._codepoints[lang_uid] = {
        'normal': codepoints,
        'bold': bold_codepoints,
        'italic': italic_codepoints
    }

    del self._ppn
    # (Continuation of a loop over rows of the replacement mapping; the
    # enclosing loop and the `line` / `original` bindings are not shown here.)
    replacement = line[2]
    if not replacement:
        continue
    if args.hyphenate:
        replacement = hyphenate(replacement, args.hyphenate)
    mapping[original] = replacement

def replace_word_from_mapping(m):
    word = m[0]
    if not args.no_act and word in mapping:
        return mapping[word]
    return word

def process_text(text):
    if not text:
        return text
    return word_rex.sub(replace_word_from_mapping, text)

if args.source.is_dir():
    files = sorted(args.source.glob('**/*.html'), key=numericsortkey)
else:
    files = [args.source]

for file in files:
    doc = html.parse(str(file))
    root = doc.getroot()
    process_node(root, process_text)
    if not args.no_act:
        doc.write(str(file), method='html', encoding='utf8')
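# A tiny self-contained sketch of the callback-based substitution used by
# process_text above (assumes the third-party regex module; the pattern and
# mapping are made-up examples):
import regex

demo_rex = regex.compile(r'\p{alpha}+')
demo_mapping = {'colour': 'color'}
print(demo_rex.sub(lambda m: demo_mapping.get(m[0], m[0]), 'colour chart'))  # color chart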
def build(self, force=False):
    # The pagenumbinator should be scoped because it uses
    # a large chunk of memory which should be gc'd.
    # But it shouldn't be created at all if we don't need it.
    # So we use a getter, and delete it when we are done.
    self._ppn = None

    file_i = 0
    file_of_total_i = 0
    percent = 0
    file_count = sum(1 for _ in sc.text_dir.glob('**/*.html'))

    for lang_dir in sc.text_dir.glob('*'):
        lang_uid = lang_dir.stem
        files = sorted(lang_dir.glob('**/*.html'),
                       key=lambda f: sc.util.numericsortkey(f.stem))
        for i, htmlfile in enumerate(files):
            try:
                if not self._should_process_file(htmlfile, force):
                    continue
                logger.info('Adding file: {!s}'.format(htmlfile))
                uid = htmlfile.stem
                root = html.parse(str(htmlfile)).getroot()

                # Set the previous and next uids, using explicit data
                # if available, otherwise making a safe guess.
                # The safe guess relies on comparing uids, and will not
                # capture relationships such as the order of patimokha
                # rules.
                prev_uid = root.get('data-prev')
                next_uid = root.get('data-next')
                if not (prev_uid or next_uid):
                    if i > 0:
                        prev_uid = files[i - 1].stem
                        if not self.uids_are_related(uid, prev_uid):
                            prev_uid = None
                    if i + 1 < len(files):
                        next_uid = files[i + 1].stem
                        if not self.uids_are_related(uid, next_uid):
                            next_uid = None

                path = htmlfile.relative_to(sc.text_dir)
                author = self._get_author(root, lang_uid, uid)
                name = self._get_name(root, lang_uid, uid)
                volpage = self._get_volpage(root, lang_uid, uid)
                embedded = self._get_embedded_uids(root, lang_uid, uid)

                fstat = htmlfile.stat()
                cdate = self.datestr(fstat.st_ctime)
                mdate = self.datestr(fstat.st_mtime)

                textinfo = TextInfo(uid=uid, lang=lang_uid, path=path,
                                    name=name, author=author, volpage=volpage,
                                    prev_uid=prev_uid, next_uid=next_uid,
                                    cdate=cdate, mdate=mdate)
                self.add_text_info(lang_uid, uid, textinfo)

                for child in embedded:
                    child.path = path
                    child.author = author
                    self.add_text_info(lang_uid, child.uid, child)

                m = regex.match(r'(.*?)(\d+)-(\d+)$', uid)
                if m:
                    range_textinfo = TextInfo(uid=uid + '#', lang=lang_uid,
                                              path=path, name=name,
                                              author=author, volpage=volpage)
                    start = int(m[2])
                    end = int(m[3]) + 1
                    for i in range(start, end):
                        iuid = m[1] + str(i)
                        if self.exists(iuid, lang_uid):
                            continue
                        self.add_text_info(lang_uid, iuid, range_textinfo)

                file_i += 1
                if (file_i % self.FILES_N) == 0:
                    self._on_n_files()

                file_of_total_i += 1
                new_percent = int(0.5 + 100 * file_of_total_i / file_count)
                if new_percent > percent:
                    percent = new_percent
                    self.build_process(percent)
            except Exception:
                print('An exception occurred: {!s}'.format(htmlfile))
                raise

    if (file_i % self.FILES_N) != 0:
        self._on_n_files()
    del self._ppn