def load_publications(db: Database, sc_bilara_data_dir: Path) -> None:
    """Reload the ``publications`` collection from ``_publication.json``.

    Args:
        db: target ArangoDB database handle
        sc_bilara_data_dir: root of the sc-bilara-data checkout
    """
    source_file = sc_bilara_data_dir / '_publication.json'
    raw: Dict[str, dict] = json_load(source_file)
    records = []
    for pub_id, publication in raw.items():
        # the publication id doubles as the document key
        records.append({'_key': pub_id, **publication})
    print(f'{len(records)} publications added or updated')
    collection = db['publications']
    collection.truncate()
    collection.import_bulk(records)
def process_extra_info_file(
        extra_info_file: Path) -> Dict[str, Dict[str, str]]:
    """Index super_extra_info.json / text_extra_info.json entries by uid.

    Args:
        extra_info_file: path to the JSON file to parse

    Returns:
        mapping from each entry's ``uid`` to the entry itself
    """
    return {entry['uid']: entry for entry in json_load(extra_info_file)}
def _process_tree_files(tree_files: List[Path]) -> List[Dict[str, str]]:
    """Flatten every file under sc-data/structure/tree into edge dicts.

    Args:
        tree_files: paths of the tree JSON files to process

    Returns:
        concatenated edges produced by ``_parse_tree_recursive``
    """
    all_edges: List[Dict[str, str]] = []
    for path in tqdm(tree_files):
        all_edges += _parse_tree_recursive(json_load(path))
    return all_edges
def _process_super_tree_file(super_tree_file: Path) -> List[Dict[str, str]]:
    """Flatten super-tree.json into a single list of edges.

    Args:
        super_tree_file: path to the super-tree.json file

    Returns:
        edges from every top-level division, in file order
    """
    divisions: List[Dict[str, list]] = json_load(super_tree_file)
    return [
        edge
        for division in divisions
        for edge in _parse_tree_recursive(division)
    ]
def get(self, uid, target):
    """Return the Pali text for *uid* transliterated into *target* script.

    Args:
        uid: sutta uid to look up
        target: destination script name passed to ``transliterate.process``

    Returns:
        dict of segment-id -> transliterated text, or
        ``({'error': 'Not Found'}, 404)`` when the uid is unknown.
    """
    db = get_db()
    results = db.aql.execute(SUTTA_SINGLE_PALI_TEXT, bind_vars={'uid': uid})
    # Use next() with a default: an empty cursor previously raised
    # StopIteration here, making the 404 branch below unreachable.
    result = next(results, None)
    if not result:
        return {'error': 'Not Found'}, 404
    sutta_texts = {k: json_load(v) for k, v in result.items()}
    # transliterate every segment of the requested sutta in place
    for key, value in sutta_texts[uid].items():
        sutta_texts[uid][key] = transliterate.process('ISO', target, value)
    return sutta_texts[uid]
def get(self, uid, author_uid=''):
    """Return the segmented sutta texts for *uid* by *author_uid*.

    Args:
        uid: sutta uid to look up
        author_uid: optional author/translation identifier

    Returns:
        ``(data, 200)`` where ``data`` maps text kinds to parsed JSON and
        carries a ``keys_order`` list mirroring ``html_text``'s key order,
        or ``({'error': 'Not Found'}, 404)`` when nothing matches.
    """
    db = get_db()
    results = db.aql.execute(SEGMENTED_SUTTA_VIEW, bind_vars={
        'uid': uid,
        'author_uid': author_uid
    })
    # Use next() with a default: an empty cursor previously raised
    # StopIteration here, making the 404 branch below unreachable.
    result = next(results, None)
    if not result:
        return {'error': 'Not Found'}, 404
    data = {k: json_load(v) for k, v in result.items()}
    # expose the segment ordering of the html text to the client
    data.update({'keys_order': list(data['html_text'].keys())})
    return data, 200
def load_names(db: Database, sc_bilara_data_dir: Path, languages_file: Path) -> None:
    """Rebuild the ``names`` collection from bilara name files.

    Args:
        db: target ArangoDB database handle
        sc_bilara_data_dir: root of the sc-bilara-data checkout
        languages_file: JSON file describing the known languages
    """
    languages: Dict[str, str] = process_languages(languages_file)
    # index of the path component that carries the language code
    lang_idx = len(sc_bilara_data_dir.parts) + 1
    docs = []
    for name_file in sc_bilara_data_dir.glob('**/name/**/*.json'):
        content: Dict[str, str] = json_load(name_file)
        docs.extend(
            parse_name_file_content(
                content,
                'root' in name_file.parts,
                name_file.parts[lang_idx],
                languages,
            )
        )
    print(f'{len(docs)} names added or updated')
    names_collection = db['names']
    names_collection.truncate()
    names_collection.import_bulk(docs)
def _hyphenate_modern_text(text_file: Path, hyphenator: Hyphenator) -> None:
    """Hyphenate every non-empty string in *text_file*, saving only on change.

    Args:
        text_file: JSON file mapping segment ids to text strings
        hyphenator: object whose ``hyphenate`` rewrites a single word
    """
    file_content: Dict[str, str] = json_load(text_file)
    dirty = False
    for key, original in file_content.items():
        if not original:
            continue
        hyphenated = ' '.join(
            hyphenator.hyphenate(word) for word in original.split())
        stripped = original.strip()
        if stripped != hyphenated:
            dirty = True
            # substitute the stripped core so surrounding whitespace survives
            file_content[key] = original.replace(stripped, hyphenated)
    if dirty:
        json_save(file_content, text_file)
def __init__(self, segments_file: Path):
    """Build hyphenation regexes from the segment list in *segments_file*.

    Args:
        segments_file: JSON file with a top-level ``segments`` list
    """
    lowercase_vowels = 'aioueāīū'
    self.segments = set(json_load(segments_file)['segments'])
    # consonant-cluster pattern: aspirates, bare stops, sonorants
    self.cons = '(?:br|[kgcjtṭdḍbp]h|[kgcjtṭdḍp](?!h)|[mnyrlvshṅṇṃṁñḷ]|b(?![rh]))'
    self.vowel_pattern = '[' + lowercase_vowels.lower() + ']'
    # longest segments first so alternation prefers the longest match;
    # each segment's trailing vowel is relaxed to match any vowel
    revoweled = []
    for segment in sorted(self.segments, key=len, reverse=True):
        revoweled.append(
            regex.sub(self.vowel_pattern + '$', self.vowel_pattern,
                      segment, flags=regex.I))
    alternation = '|'.join(revoweled)
    self.segment_rex = regex.compile(f'({alternation})', flags=regex.I)
    self.alpha_rex = regex.compile(r'\p{alpha}+')
def load_blurbs(db: Database, sc_bilara_data_dir: Path) -> None:
    """Rebuild the ``blurbs`` collection from bilara blurb files.

    Args:
        db: target ArangoDB database handle
        sc_bilara_data_dir: root of the sc-bilara-data checkout
    """
    # strip everything up to the first colon from prefixed keys
    prefix_rex = re.compile(r'^.*?:(.*?)$')
    docs = []
    for blurb_file in sc_bilara_data_dir.glob('**/blurb/*.json'):
        # language code is the grandparent directory name
        lang = blurb_file.parent.parent.name
        content: Dict[str, str] = json_load(blurb_file)
        for prefix, blurb in content.items():
            match = prefix_rex.match(prefix)
            uid = match.group(1) if match else prefix
            docs.append({
                '_key': '_'.join((uid, lang)),
                'uid': uid,
                'lang': lang,
                'blurb': blurb
            })
    print(f'{len(docs)} blurbs added or updated')
    blurbs_collection = db['blurbs']
    blurbs_collection.truncate()
    blurbs_collection.import_bulk(docs)
def _process_names_files(
        names_files: List[Path], root_languages: Dict[str, str],
        super_extra_info: Dict[str, Dict[str, str]],
        text_extra_info: Dict[str, Dict[str, str]]) -> List[dict]:
    """Parse every sc-data/structure/name file into document dicts.

    Args:
        names_files: name-file Path objects (sorted in place, shallowest
            paths first, so parents are processed before descendants)
        root_languages: parsed data from super_root_lang.json
        super_extra_info: parsed data from super_extra_info.json
        text_extra_info: parsed data from text_extra_info.json

    Returns:
        list of processed documents
    """
    names_files.sort(key=lambda path: len(path.parts))
    parsed: List[dict] = []
    for name_file in tqdm(names_files):
        file_entries: Dict[str, str] = json_load(name_file)
        parsed += _parse_name_file_entries(
            file_entries, root_languages, super_extra_info, text_extra_info)
    return parsed