Example #1
def load_publications(db: Database, sc_bilara_data_dir: Path) -> None:
    publication_file = sc_bilara_data_dir / '_publication.json'
    publications: Dict[str, dict] = json_load(publication_file)

    docs = [{'_key': pub_id, **publication} for pub_id, publication in publications.items()]

    print(f'{len(docs)} publications added or updated')
    db['publications'].truncate()
    db['publications'].import_bulk(docs)
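A standalone sketch of the document-shaping step above, using hypothetical publication fields (the real _publication.json carries more keys):

publications = {'scpub1': {'author_uid': 'sujato', 'text_uid': 'dn'}}
docs = [{'_key': pub_id, **publication} for pub_id, publication in publications.items()]
assert docs == [{'_key': 'scpub1', 'author_uid': 'sujato', 'text_uid': 'dn'}]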
Example #2
def process_extra_info_file(
        extra_info_file: Path) -> Dict[str, Dict[str, str]]:
    """
    Method to process super_extra_info.json and text_extra_info.json files

    Args:
        extra_info_file - path to the file
    """
    info = json_load(extra_info_file)
    data = {item['uid']: item for item in info}
    return data
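A minimal sketch of the reshaping this performs, with hypothetical entries (each JSON item is assumed to carry a uid field):

info = [{'uid': 'dn', 'acronym': 'DN'}, {'uid': 'mn', 'acronym': 'MN'}]
data = {item['uid']: item for item in info}
assert data['dn']['acronym'] == 'DN'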
Example #3
def _process_tree_files(tree_files: List[Path]) -> List[Dict[str, str]]:
    """
    Method for processing tree files from tree sc-data/structure/tree folder

    Args:
        tree_files - list of Paths to the tree files
    """
    edges = []
    for tree_file in tqdm(tree_files):
        content = json_load(tree_file)
        edges.extend(_parse_tree_recursive(content))
    return edges
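The helper _parse_tree_recursive is not shown in these examples. Below is one plausible sketch, an assumption rather than the project's actual implementation, treating each node as either a leaf uid string or a dict mapping a parent uid to a list of child nodes; the same helper serves example #4.

def _parse_tree_recursive(node, parent=None):
    # Emit one edge dict per parent->child relation in the nested tree.
    edges = []
    if isinstance(node, dict):
        for uid, children in node.items():
            if parent is not None:
                edges.append({'_from': parent, '_to': uid})
            for child in children:
                edges.extend(_parse_tree_recursive(child, parent=uid))
    elif parent is not None:
        edges.append({'_from': parent, '_to': node})  # leaf uid string
    return edges


assert _parse_tree_recursive({'dn': ['dn1', {'dn2': ['dn2.1']}]}) == [
    {'_from': 'dn', '_to': 'dn1'},
    {'_from': 'dn', '_to': 'dn2'},
    {'_from': 'dn2', '_to': 'dn2.1'},
]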
Example #4
def _process_super_tree_file(super_tree_file: Path) -> List[Dict[str, str]]:
    """
    Method for super-tree.json file processing

    Args:
        super_tree_file - path to the super-tree.json file
    """
    content: List[Dict[str, list]] = json_load(super_tree_file)
    data = []
    for division in content:
        data.extend(_parse_tree_recursive(division))
    return data
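Reusing the hypothetical _parse_tree_recursive sketch from example #3, the super-tree file differs only in shape: it is assumed to be a list of division trees rather than a single tree.

content = [{'sutta': [{'dn': ['dn1']}]}, {'vinaya': ['pli-tv-bu-vb']}]
data = []
for division in content:
    data.extend(_parse_tree_recursive(division))
assert {'_from': 'sutta', '_to': 'dn'} in data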
Example #5
    def get(self, uid, target):
        db = get_db()

        results = db.aql.execute(SUTTA_SINGLE_PALI_TEXT,
                                 bind_vars={'uid': uid})
        result = next(results, None)  # None instead of StopIteration on an empty cursor
        if not result:
            return {'error': 'Not Found'}, 404

        sutta_texts = {k: json_load(v) for k, v in result.items()}
        for key, value in sutta_texts[uid].items():
            sutta_texts[uid][key] = transliterate.process('ISO', target, value)

        return sutta_texts[uid]
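The transliterate.process(source, target, text) call matches the aksharamukha package's API; assuming that package, a standalone sketch of the conversion step (ISO is the ISO 15919 romanization scheme):

from aksharamukha import transliterate

print(transliterate.process('ISO', 'Devanagari', 'evaṁ me sutaṁ'))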
Example #6
    def get(self, uid, author_uid=''):
        db = get_db()
        results = db.aql.execute(SEGMENTED_SUTTA_VIEW,
                                 bind_vars={
                                     'uid': uid,
                                     'author_uid': author_uid
                                 })
        result = next(results, None)  # None instead of StopIteration on an empty cursor
        if not result:
            return {'error': 'Not Found'}, 404

        data = {k: json_load(v) for k, v in result.items()}
        data.update({'keys_order': list(data['html_text'].keys())})

        return data, 200
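A sketch of the response shaping, with hypothetical segment data; keys_order simply records the key order of the html_text dict:

data = {
    'html_text': {'mn1:1.1': '<p>{}</p>', 'mn1:1.2': '<p>{}</p>'},
    'translation_text': {'mn1:1.1': 'Thus have I heard.'},
}
data.update({'keys_order': list(data['html_text'].keys())})
assert data['keys_order'] == ['mn1:1.1', 'mn1:1.2']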
Example #7
def load_names(db: Database, sc_bilara_data_dir: Path, languages_file: Path) -> None:
    names = []
    lang_folder_idx = len(sc_bilara_data_dir.parts) + 1

    languages: Dict[str, str] = process_languages(languages_file)

    for name_file in sc_bilara_data_dir.glob('**/name/**/*.json'):
        is_root = 'root' in name_file.parts
        lang = name_file.parts[lang_folder_idx]
        file_content: Dict[str, str] = json_load(name_file)
        names.extend(parse_name_file_content(file_content, is_root, lang, languages))

    print(f'{len(names)} names added or updated')
    db['names'].truncate()
    db['names'].import_bulk(names)
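A sketch of the language-folder lookup, assuming a layout of <base>/<root|translation>/<lang>/.../name/...; the file name is hypothetical, and PurePosixPath keeps the example platform-independent:

from pathlib import PurePosixPath

sc_bilara_data_dir = PurePosixPath('/data/sc-bilara-data')
name_file = sc_bilara_data_dir / 'root/pli/name/sutta/dn-name_root-pli-ms.json'
lang_folder_idx = len(sc_bilara_data_dir.parts) + 1
assert 'root' in name_file.parts
assert name_file.parts[lang_folder_idx] == 'pli'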
Example #8
def _hyphenate_modern_text(text_file: Path, hyphenator: Hyphenator) -> None:
    has_changes = False
    file_content: Dict[str, str] = json_load(text_file)
    for key, string in file_content.items():
        if not string:
            continue
        hyphenated_words = (hyphenator.hyphenate(word)
                            for word in string.split())
        hyphenated_string = ' '.join(hyphenated_words)
        if string.strip() != hyphenated_string:
            has_changes = True
            file_content[key] = string.replace(string.strip(),
                                               hyphenated_string)

    if has_changes:
        json_save(file_content, text_file)
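A sketch of the whitespace-preserving update at the bottom, with a hard-coded stand-in for the hyphenator's output (hyphenators of this kind typically insert the soft hyphen U+00AD):

string = ' saddhammapatirūpakaṁ '
hyphenated_string = 'saddhamma\u00adpatirūpakaṁ'
updated = string.replace(string.strip(), hyphenated_string)
assert updated == ' saddhamma\u00adpatirūpakaṁ '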
Example #9
    def __init__(self, segments_file: Path):
        vowel_chars = 'aioueāīū'

        self.segments = set(json_load(segments_file)['segments'])
        # consonant pattern for romanized Pāli (aspirates matched before plain stops)
        self.cons = '(?:br|[kgcjtṭdḍbp]h|[kgcjtṭdḍp](?!h)|[mnyrlvshṅṇṃṁñḷ]|b(?![rh]))'
        self.vowel_pattern = '[' + vowel_chars.lower() + ']'

        # Rewrite each segment's final vowel into the vowel class, so the
        # segment still matches when sandhi has altered its last vowel.
        segments_revoweled = [
            regex.sub(self.vowel_pattern + '$',
                      self.vowel_pattern,
                      segment,
                      flags=regex.I)
            for segment in sorted(self.segments, key=len, reverse=True)
        ]

        self.segment_rex = regex.compile(
            '({})'.format('|'.join(segments_revoweled)), flags=regex.I)
        self.alpha_rex = regex.compile(r'\p{alpha}+')
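A sketch of the re-voweling step inside the constructor: each known segment's final vowel becomes the vowel character class, so the segment still matches after sandhi changes its last vowel. Hypothetical segment:

import regex

vowel_pattern = '[aioueāīū]'
revoweled = regex.sub(vowel_pattern + '$', vowel_pattern, 'dhamma', flags=regex.I)
assert revoweled == 'dhamm[aioueāīū]'
assert regex.fullmatch(revoweled, 'dhammo', flags=regex.I)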
Example #10
def load_blurbs(db: Database, sc_bilara_data_dir: Path) -> None:
    blurbs = []
    pattern = r'^.*?:(.*?)$'

    for blurb_file in sc_bilara_data_dir.glob('**/blurb/*.json'):
        lang = blurb_file.parent.parent.name
        file_content: Dict[str, str] = json_load(blurb_file)
        for prefix, blurb in file_content.items():
            match = re.match(pattern, prefix)
            uid = match.group(1) if match else prefix
            blurbs.append({
                '_key': '_'.join((uid, lang)),
                'uid': uid,
                'lang': lang,
                'blurb': blurb
            })

    print(f'{len(blurbs)} blurbs added or updated')
    db['blurbs'].truncate()
    db['blurbs'].import_bulk(blurbs)
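A sketch of the uid extraction above: when a blurb key has a prefix before a colon, everything after the first colon becomes the uid; keys without a colon are used verbatim. Hypothetical keys:

import re

pattern = r'^.*?:(.*?)$'
for prefix in ('dn-blurbs:dn1', 'dn1'):
    match = re.match(pattern, prefix)
    uid = match.group(1) if match else prefix
    assert uid == 'dn1'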
Example #11
def _process_names_files(
        names_files: List[Path], root_languages: Dict[str, str],
        super_extra_info: Dict[str, Dict[str, str]],
        text_extra_info: Dict[str, Dict[str, str]]) -> List[dict]:
    """
    Method for processing name files from sc-data/structure/name

    Args:
        names_files - list of name Path objects to files from name folder
        root_languages - parsed data from super_root_lang.json
        super_extra_info - parsed data from super_extra_info.json
        text_extra_info - parsed data from text_extra_info.json

    Returns:
        list of processed data
    """
    docs = []
    names_files.sort(key=lambda path: len(path.parts))
    for name_file in tqdm(names_files):
        entries: Dict[str, str] = json_load(name_file)
        docs.extend(
            _parse_name_file_entries(entries, root_languages, super_extra_info,
                                     text_extra_info))
    return docs
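A sketch of the depth ordering: sorting by len(path.parts) processes shallower (more general) name files before deeper ones. Hypothetical paths:

from pathlib import PurePosixPath

names_files = [
    PurePosixPath('structure/name/sutta/mn/mn-name.json'),
    PurePosixPath('structure/name/sutta/sutta-name.json'),
]
names_files.sort(key=lambda path: len(path.parts))
assert names_files[0].name == 'sutta-name.json'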