# Standard-library imports used by the functions below. Project-level names
# such as WORKING_DIR, REPO_DIR, get_db, get_rules, problemsLog,
# publications_file_name, humansortkey, json_load, get_uid_and_muids,
# invert_meta, save_state, load_state and the module-level index globals are
# assumed to be defined elsewhere in this module.
import json
import logging
import pathlib
import time
from concurrent.futures import ThreadPoolExecutor, wait, FIRST_COMPLETED
from time import monotonic


def add_site_to_history():
    db = get_db()
    # Site strings live under root/en/site plus each translation's site folder.
    site_dirs = [WORKING_DIR / 'root/en/site'] + list(
        WORKING_DIR.glob('**/translation/*/site'))
    docs = {}
    for folder in site_dirs:
        lang = folder.parts[-2]
        for file in folder.glob('**/*.json'):
            file_uid = file.stem.split('_')[0]
            with file.open() as f:
                entries = json.load(f)
            for k, v in entries.items():
                context = f'{file_uid}_{k}'
                if context in docs:
                    doc = docs[context]
                else:
                    doc = {
                        '_key': f'bilara_{context}',
                        'context': context,
                        'origin': 'bilara',
                        'strings': {},
                    }
                    docs[context] = doc
                # Collect one document per context, with one string per language.
                doc['strings'][lang] = v
    errors = db['historic'].import_bulk(
        docs.values(), on_duplicate='replace', halt_on_error=False)
    return errors, docs
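# A minimal sketch of the get_db() helper assumed above. The functions in this
# module subscript the database like a mapping (db['historic'], db['strings']),
# so get_db() presumably returns either an object supporting that directly or
# a thin wrapper over python-arango, roughly like this. The host, credentials
# and database name are placeholders, not the project's real configuration.
from arango import ArangoClient


class _MappingDB:
    """Expose python-arango collections via db['name'] subscripting."""

    def __init__(self, db):
        self._db = db

    def __getitem__(self, name):
        return self._db.collection(name)


def get_db():
    client = ArangoClient(hosts='http://localhost:8529')
    return _MappingDB(client.db('bilara', username='root', password=''))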
def make_special_uid_mapping():
    # Map uids that appear inside root files but have no file of their own
    # to the uid of the file that contains them.
    uid_mapping = {}
    for file in WORKING_DIR.glob("root/**/*.json"):
        if "blurbs" in str(file):
            continue
        with file.open() as f:
            data = json.load(f)
        for k in data:
            uid = k.split(":")[0]
            if uid not in _uid_index and uid not in uid_mapping:
                uid_mapping[uid] = file.name.split("_")[0]
    return uid_mapping
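# Illustrative result (file and segment names hypothetical): if a root file
# named dhp1-20_root-pli-ms.json contains keys such as 'dhp1:1' and no file
# named dhp1_* exists, the mapping gains {'dhp1': 'dhp1-20'}, i.e. the
# segment uid points at the uid of its containing file.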
def load_data():
    # On my PC using a ThreadPoolExecutor cuts the import time to one-third
    executor = ThreadPoolExecutor(max_workers=4)
    limit = 4  # maximum number of insert batches allowed in flight at once
    futures = set()
    db = get_db()
    strings_coll = db['strings']
    strings_coll.truncate()
    start = monotonic()
    for folder in sorted(WORKING_DIR.glob('*')):
        if folder.name not in {'root', 'translation', 'comment'}:
            continue
        print(f'\nProcessing: {folder.name}')
        files = list(folder.glob('**/*.json'))
        docs = []
        for i, file in enumerate(files):
            print(f'{i} of {len(files)}', end=' \r')
            with file.open() as f:
                try:
                    data = json.load(f)
                except json.JSONDecodeError:
                    logging.error(
                        f'Could not parse JSON, skipping: {file.relative_to(REPO_DIR)}'
                    )
                    continue
            uid, muids = file.stem.split('_')
            for segment_id, string in data.items():
                doc = {
                    '_key': f'{muids}:{segment_id}',
                    'muids': muids,
                    'segment_id': segment_id,
                    'string': string,
                }
                docs.append(doc)
            # Flush in batches; cap the backlog of pending inserts so the
            # producer cannot run unboundedly ahead of the database writers.
            if len(docs) > 10000:
                if len(futures) > limit:
                    completed, futures = wait(futures, return_when=FIRST_COMPLETED)
                # Submit a copy so the worker's batch is unaffected by clear().
                futures.add(
                    executor.submit(strings_coll.insert_many, docs.copy()))
                docs.clear()
        if docs:
            futures.add(executor.submit(strings_coll.insert_many, docs.copy()))
    completed, futures = wait(futures)
    executor.shutdown()
    print(f'\nComplete in {monotonic() - start} seconds')
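# The bounded-backlog pattern used above, shown in isolation: block once more
# than `limit` batches are pending, then submit. A sketch only; `insert_batch`
# is a hypothetical stand-in for strings_coll.insert_many.
def submit_bounded(executor, futures, limit, insert_batch, batch):
    # wait() returns (done, not_done); keeping not_done as the new working
    # set is what lets the caller track only in-flight batches.
    while len(futures) > limit:
        _done, futures = wait(futures, return_when=FIRST_COMPLETED)
    futures.add(executor.submit(insert_batch, list(batch)))
    return futures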
def validate_permissions(rules=None):
    if not rules:
        rules = get_rules()
    files = WORKING_DIR.glob('**/*.json')
    # Ignore anything inside hidden directories such as .git
    files = [
        str(file.relative_to(WORKING_DIR))
        for file in files
        if not any(part.startswith('.') for part in file.parts)
    ]
    for user, user_permissions in rules.items():
        if user.startswith('_'):
            continue  # Not a valid Github ID, used for bilara
        for paths in user_permissions.values():
            for path in paths:
                if path == '*':
                    continue
                # for/else: the else clause runs only when no file matched.
                for file in files:
                    if file.startswith(path):
                        break
                else:
                    problemsLog.add(
                        file=publications_file_name,
                        msg=f"No files match path: {path}")
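# Illustrative shape of the rules mapping consumed above, inferred from how it
# is iterated (user names and paths are hypothetical):
#
#   {
#       'sujato': {
#           'translation': ['translation/en/sujato/'],
#           'comment': ['*'],
#       },
#       '_bilara': {...},   # keys starting with '_' are skipped above
#   }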
def make_file_index(force=False):
    global _tree_index
    global _uid_index
    global _muid_index
    global _file_index
    global _meta_definitions
    global _special_uid_mapping
    global _legal_ids

    if state_build_lock_file.exists():
        # We arrived here because another process started the build,
        # let that process do the work.
        for i in range(0, 100):
            time.sleep(1)
            if not state_build_lock_file.exists():
                if load_state():
                    _build_complete.set()
                    return
        # Should not normally reach here, but if so fall through and do the
        # build regardless after 100 seconds of waiting.

    try:
        state_build_lock_file.touch()
        _muid_index = muid_index = {}
        _uid_index = uid_index = {}
        _file_index = file_index = {}
        _legal_ids = set()
        for file in sorted(WORKING_DIR.glob('root/**/*.json')):
            with file.open() as f:
                data = json.load(f)
            _legal_ids.update(data.keys())

        def recurse(folder, meta_definitions=None, depth=0):
            subtree = {}
            meta_definitions = meta_definitions.copy()
            metafiles = set(folder.glob("_*.json"))
            if metafiles:
                for metafile in sorted(metafiles, key=humansortkey):
                    file_data = json_load(metafile)
                    if isinstance(file_data, dict):
                        meta_definitions.update(file_data)
                        for k, v in file_data.items():
                            if k not in _meta_definitions:
                                _meta_definitions[k] = v
            for file in sorted(folder.glob("*"), key=humansortkey):
                if file.name.startswith("."):
                    continue
                if file in metafiles:
                    continue
                long_id = file.stem
                meta = {}
                for part in file.parts:
                    if part.endswith(".json"):
                        part = part[:-5]
                    if part in meta_definitions:
                        meta[part] = meta_definitions[part]
                if file.is_dir():
                    subtree[file.name] = recurse(
                        file, meta_definitions=meta_definitions, depth=depth + 1)
                    subtree[file.name]["_meta"] = meta
                elif file.suffix == ".json":
                    mtime = file.stat().st_mtime_ns
                    path = str(file.relative_to(WORKING_DIR))
                    obj = subtree[long_id] = {
                        "path": path,
                        "mtime": mtime,
                        "_meta": meta,
                    }
                    if "_" in long_id:
                        uid, muids = get_uid_and_muids(file)
                    else:
                        uid = file.name if file.is_dir() else file.stem
                        muids = None
                    obj["uid"] = uid
                    if uid not in uid_index:
                        uid_index[uid] = set()
                    uid_index[uid].add(long_id)
                    if long_id in file_index:
                        logging.error(f"{str(file)} not unique")
                    file_index[long_id] = obj
                    if muids:
                        for muid in muids:
                            if muid not in muid_index:
                                muid_index[muid] = set()
                            muid_index[muid].add(long_id)

                        # Create Virtual Files
                        if 'translation' in muids:
                            uid, muids = long_id.split('_')
                            _add_virtual_comment_file(
                                uid, muids, file, uid_index, muid_index,
                                file_index, meta_definitions)
            if depth == 0:
                _add_virtual_project_files(
                    uid_index, muid_index, file_index, subtree,
                    _meta_definitions)
            return subtree

        _meta_definitions = {}
        _tree_index = recurse(WORKING_DIR, {})
        _uid_index = uid_index
        _muid_index = muid_index
        _file_index = file_index
        _special_uid_mapping = make_special_uid_mapping()
        for v in file_index.values():
            v["_meta"] = invert_meta(v["_meta"])
        print("File Index Built")
        save_state()
        _build_complete.set()
    finally:
        state_build_lock_file.unlink()
    stats_calculator.reset()
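# _add_virtual_comment_file is not defined in this section. The variant of
# make_file_index below inlines the same logic, so a sketch consistent with
# that inline code would look roughly like this; treat the exact signature
# as an assumption.
def _add_virtual_comment_file(uid, muids, file, uid_index, muid_index,
                              file_index, meta_definitions):
    # For every translation file, register a "virtual" comment counterpart so
    # the index can offer one even before a comment file exists on disk.
    muids = muids.replace('translation', 'comment')
    comment_stem = f'{uid}_{muids}'
    if comment_stem in file_index:
        return  # a real comment file with this stem is already indexed
    parent = pathlib.Path('comment') / file.relative_to(
        WORKING_DIR / 'translation').parent
    virtual_file = parent / (comment_stem + '.json')
    meta = {
        part: meta_definitions[part]
        for part in muids.split('-')
        if part in meta_definitions
    }
    file_index[comment_stem] = {
        'uid': uid,
        'path': str(virtual_file),
        'mtime': None,  # no file on disk yet
        '_meta': meta,
    }
    uid_index[uid].add(comment_stem)
    for muid in muids.split('-'):
        muid_index.setdefault(muid, set()).add(comment_stem)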
def make_file_index(force=False):
    _build_started.set()
    global _tree_index
    global _uid_index
    global _muid_index
    global _file_index
    global _meta_definitions
    global _special_uid_mapping
    global _legal_ids

    if not force:
        # Restore any previously saved state before rebuilding.
        load_state()
    print("Building file index")
    _muid_index = muid_index = {}
    _uid_index = uid_index = {}
    _file_index = file_index = {}
    _legal_ids = set()
    for file in sorted(WORKING_DIR.glob('root/**/*.json')):
        with file.open() as f:
            data = json.load(f)
        _legal_ids.update(data.keys())

    def recurse(folder, meta_definitions=None):
        subtree = {}
        meta_definitions = meta_definitions.copy()
        metafiles = set(folder.glob("_*.json"))
        if metafiles:
            for metafile in sorted(metafiles, key=humansortkey):
                file_data = json_load(metafile)
                meta_definitions.update(file_data)
                for k, v in file_data.items():
                    if k not in _meta_definitions:
                        _meta_definitions[k] = v
        for file in sorted(folder.glob("*"), key=humansortkey):
            if file.name.startswith("."):
                continue
            if file in metafiles:
                continue
            long_id = file.stem
            meta = {}
            for part in file.parts:
                if part.endswith(".json"):
                    part = part[:-5]
                if part in meta_definitions:
                    meta[part] = meta_definitions[part]
            if file.is_dir():
                subtree[file.name] = recurse(file, meta_definitions=meta_definitions)
                subtree[file.name]["_meta"] = meta
            elif file.suffix == ".json":
                mtime = file.stat().st_mtime_ns
                path = str(file.relative_to(WORKING_DIR))
                obj = subtree[long_id] = {
                    "path": path,
                    "mtime": mtime,
                    "_meta": meta,
                }
                if "_" in long_id:
                    uid, muids = get_uid_and_muids(file)
                else:
                    uid = file.name if file.is_dir() else file.stem
                    muids = None
                obj["uid"] = uid
                if uid not in uid_index:
                    uid_index[uid] = set()
                uid_index[uid].add(long_id)
                if long_id in file_index:
                    logging.error(f"{str(file)} not unique")
                file_index[long_id] = obj
                if muids:
                    for muid in muids:
                        if muid not in muid_index:
                            muid_index[muid] = set()
                        muid_index[muid].add(long_id)

                    # Create Virtual Files
                    if 'translation' in muids:
                        uid, muids = long_id.split('_')
                        muids = muids.replace('translation', 'comment')
                        comment_stem = f"{uid}_{muids}"
                        # Skip if a real comment file with this stem was
                        # already indexed (the comment folder sorts first).
                        if comment_stem in file_index:
                            continue
                        parent = pathlib.Path('comment') / file.relative_to(
                            WORKING_DIR / 'translation').parent
                        virtual_file = parent / (comment_stem + '.json')
                        meta = {
                            part: meta_definitions[part]
                            for part in muids.split('-')
                            if part in meta_definitions
                        }
                        obj = {
                            "uid": uid,
                            "path": str(virtual_file),
                            "mtime": None,
                            "_meta": meta,
                        }
                        uid_index[uid].add(comment_stem)
                        file_index[comment_stem] = obj
                        for muid in muids.split('-'):
                            muid_index.setdefault(muid, set()).add(comment_stem)
        return subtree

    _meta_definitions = {}
    _tree_index = recurse(WORKING_DIR, {})
    _uid_index = uid_index
    _muid_index = muid_index
    _file_index = file_index
    _special_uid_mapping = make_special_uid_mapping()
    for v in file_index.values():
        v["_meta"] = invert_meta(v["_meta"])
    print("File Index Built")
    save_state()
    _build_complete.set()
    stats_calculator.reset()
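# save_state() and load_state() are not shown in this section. A plausible
# sketch, assuming the module-level indexes are pickled to a file inside
# WORKING_DIR; the file name and the exact set of persisted globals are
# assumptions, not the project's real implementation.
import pickle

_state_file = WORKING_DIR / '.index_state.pickle'  # hypothetical location


def save_state():
    state = {
        'tree': _tree_index,
        'uid': _uid_index,
        'muid': _muid_index,
        'file': _file_index,
        'meta': _meta_definitions,
        'special': _special_uid_mapping,
        'legal': _legal_ids,
    }
    with _state_file.open('wb') as f:
        pickle.dump(state, f)


def load_state():
    # Returns True only when a saved index was restored successfully,
    # matching how make_file_index() uses its result above.
    global _tree_index, _uid_index, _muid_index, _file_index
    global _meta_definitions, _special_uid_mapping, _legal_ids
    try:
        with _state_file.open('rb') as f:
            state = pickle.load(f)
    except (FileNotFoundError, pickle.UnpicklingError, EOFError):
        return False
    _tree_index = state['tree']
    _uid_index = state['uid']
    _muid_index = state['muid']
    _file_index = state['file']
    _meta_definitions = state['meta']
    _special_uid_mapping = state['special']
    _legal_ids = state['legal']
    return True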