def check_schemas(data_root, schemas_dir, verbose=False):
    schemas = ('category.json', 'video.json')
    all_file_paths = get_json_files(data_root)
    error_count = 0
    for schema, file_paths in zip(schemas, all_file_paths):
        schema_path = os.path.join(schemas_dir, schema)
        with open(schema_path, encoding='UTF-8') as fp:
            schema_blob = json.load(fp)
        for file_path in file_paths:
            with open(file_path, encoding='UTF-8') as fp:
                try:
                    blob = json.load(fp)
                except json.decoder.JSONDecodeError as e:
                    print('\nError JSON-decoding {}'.format(file_path), flush=True)
                    if verbose:
                        print(e, flush=True)
                    error_count += 1
                    continue
            try:
                jsonschema.validate(blob, schema_blob)
            except jsonschema.exceptions.ValidationError as e:
                print(file_path, flush=True)
                if verbose:
                    print(e, flush=True)
                error_count += 1
    return error_count

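# The checks in this file rely on a get_json_files() helper that is not shown
# here.  A minimal sketch of what it might look like, assuming the repository
# layout implied by check_slugs_unique() below (a category.json at the top of
# each category directory, videos under <category>/videos/*.json); the
# project's real helper may differ.
import glob
import os


def get_json_files(data_root):
    """Return (category_paths, video_paths) for all JSON files under data_root."""
    category_paths = sorted(
        glob.glob(os.path.join(data_root, '*', 'category.json')))
    video_paths = sorted(
        glob.glob(os.path.join(data_root, '*', 'videos', '*.json')))
    return category_paths, video_paths
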
def check_render_rest(data_root, verbose=False):
    _, video_paths = get_json_files(data_root)
    fields = ('description', 'summary')
    valid = True
    for file_path in video_paths:
        with open(file_path, encoding='UTF-8') as fp:
            blob = json.load(fp)
        for field in fields:
            # A description or summary may be None.
            # Ensure text is a string.
            text = blob.get(field) or ''
            error, level = validate_rest(text)
            if error and level >= INVALID_ERROR_LEVEL:
                valid = False
            if error and verbose:
                msg = 'ReST validation error:\n\tFile:{}\n\tKey:{}'
                print(msg.format(file_path, field), flush=True)
                print('\t', error, sep='', flush=True)
    if not valid:
        sys.exit(1)

def check_render_rest(data_root, verbose=False):
    _, video_paths = get_json_files(data_root)
    fields = ('description', 'summary')
    valid = True
    for file_path in video_paths:
        with open(file_path, encoding='UTF-8') as fp:
            blob = json.load(fp)
        for field in fields:
            # A description or summary may be None.
            # Ensure text is a string.
            text = blob.get(field) or ''
            error, level = validate_rest(text)
            if error and level >= INVALID_ERROR_LEVEL:
                valid = False
            if error:
                msg = ('ReST validation error (level {level}):\n'
                       '\tFile: {fp}\n\tKey: {key}\n\tError:\n{error}')
                print(msg.format(fp=file_path, key=field, level=level,
                                 error=textwrap.indent(error, '\t\t')),
                      flush=True)
    if not valid:
        sys.exit(1)

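# check_render_rest() depends on validate_rest() and INVALID_ERROR_LEVEL,
# neither of which is shown above.  A minimal sketch of one possible
# implementation using docutils; the function name is taken from the calls
# above, but the settings and the error-level convention (docutils uses
# 1=info, 2=warning, 3=error, 4=severe) are assumptions, not the project's
# actual helper.
import docutils.core
import docutils.utils

INVALID_ERROR_LEVEL = 3  # treat docutils "error" and "severe" as invalid


def validate_rest(text):
    """Return (error_message, level); ('', 0) when the text renders cleanly."""
    try:
        docutils.core.publish_doctree(
            text,
            settings_overrides={'halt_level': 2, 'report_level': 5})
    except docutils.utils.SystemMessage as exc:
        return str(exc), exc.level
    return '', 0
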
def main(): """Fill id field in json video files""" logging.basicConfig(level=logging.WARNING) # WARNING or DEBUG parser = argparse.ArgumentParser() parser.add_argument("path", help="path to data repository") parser.add_argument('--db', default='/tmp/db.json', help="path to tinydb file") args = parser.parse_args() _, video_paths = get_json_files(args.path) # Retrieve data tb_video = [get_json_data(file_name) for file_name in sorted(video_paths)] # Query max id all_id = collections.Counter(video['id'] for video in tb_video if 'id' in video.keys()) most_common, times_duplicate = all_id.most_common(1)[0] if times_duplicate > 1: raise ValueError('Duplicate id: {}'.format(most_common)) max_id = max(all_id) logging.debug('Max id: {}'.format(max_id)) # Update files video_without_id = [video for video in tb_video if 'id' not in video.keys()] for video_id, video in enumerate(video_without_id, max_id + 1): update_id(video, video_id)
def main(): """Fill id field in json video files""" logging.basicConfig(level=logging.WARNING) # WARNING or DEBUG parser = argparse.ArgumentParser() parser.add_argument("path", help="path to data repository") parser.add_argument('--db', default='/tmp/db.json', help="path to tinydb file") args = parser.parse_args() _, video_paths = get_json_files(args.path) # Retrieve data tb_video = [get_json_data(file_name) for file_name in sorted(video_paths)] # Query max id all_id = collections.Counter(video['id'] for video in tb_video if 'id' in video.keys()) most_common, times_duplicate = all_id.most_common(1)[0] if times_duplicate > 1: raise ValueError('Duplicate id: {}'.format(most_common)) max_id = max(all_id) logging.debug('Max id: {}'.format(max_id)) # Update files video_without_id = [ video for video in tb_video if 'id' not in video.keys() ] for video_id, video in enumerate(video_without_id, max_id + 1): update_id(video, video_id)
def check_schemas(data_root, schemas_dir, verbose=False):
    schemas = ('category.json', 'video.json')
    all_file_paths = get_json_files(data_root)
    error_count = 0
    for schema, file_paths in zip(schemas, all_file_paths):
        schema_path = os.path.join(schemas_dir, schema)
        with open(schema_path, encoding='UTF-8') as fp:
            schema_blob = json.load(fp)
        for file_path in file_paths:
            with open(file_path, encoding='UTF-8') as fp:
                try:
                    blob = json.load(fp)
                except json.decoder.JSONDecodeError as e:
                    print('\nError JSON-decoding {}'.format(file_path), flush=True)
                    if verbose:
                        print(e, flush=True)
                    error_count += 1
                    continue
            try:
                jsonschema.validate(
                    blob, schema_blob,
                    format_checker=jsonschema.FormatChecker())
            except jsonschema.exceptions.ValidationError as e:
                print(file_path, flush=True)
                if verbose:
                    print(e, flush=True)
                error_count += 1
    return error_count

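# This variant passes format_checker=jsonschema.FormatChecker() so that any
# "format" keywords in the schemas are actually enforced rather than silently
# ignored.  A hypothetical fragment of the kind of constraint this enables;
# the real schemas/category.json and schemas/video.json may look nothing like
# this, and some formats are only checked when the matching optional
# dependency for jsonschema is installed.
EXAMPLE_VIDEO_SCHEMA_FRAGMENT = {
    "type": "object",
    "properties": {
        "recorded": {"type": "string", "format": "date"},
        "thumbnail_url": {"type": "string", "format": "uri"},
    },
}
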
def main(): """Convert json file(s) to the project format standards""" logging.basicConfig(level=logging.WARNING) # WARNING or DEBUG parser = argparse.ArgumentParser() parser.add_argument("path", help="path to data repository") args = parser.parse_args() category_paths, video_paths = get_json_files(args.path) print('\n# Category statistics') print(markdown_statistics(category_paths)) print('\n# Video statistics') print(markdown_statistics(video_paths))
def check_languages(data_root, verbose=False):
    """Check that every video's language is one of the known language names."""
    _, video_paths = get_json_files(data_root)
    bad_lang_by_path = {}
    for file_path in video_paths:
        with open(file_path, encoding='UTF-8') as fp:
            blob = json.load(fp)
        lang = blob.get('language')
        if lang and lang not in VIDEO_LANGUAGE_NAMES:
            bad_lang_by_path[file_path] = lang
    if bad_lang_by_path:
        print('Incorrect languages found:')
        for path, lang in bad_lang_by_path.items():
            print('{} {}'.format(lang, path))
        sys.exit(1)

def main(): """Convert json file(s) to the project format standards""" logging.basicConfig(level=logging.WARNING) parser = argparse.ArgumentParser() parser.add_argument("path", help="path to file(s) to reserialize") parser.add_argument("-a", "--all", action="store_true", help="reserialize all JSON files under path") args = parser.parse_args() if args.all: category_paths, video_paths = get_json_files(args.path) paths = category_paths + video_paths for path in paths: reserialize(path) else: reserialize(args.path)
def main(): """Pull related urls from summary and description of video JSON""" logging.basicConfig(level=logging.WARNING) parser = argparse.ArgumentParser() parser.add_argument("path", help="path to file(s) to reserialize") parser.add_argument("-a", "--all", action="store_true", help="reserialize all JSON files under path") args = parser.parse_args() if args.all: category_paths, video_paths = get_json_files(args.path) paths = video_paths for path in paths: pull_links_from_file(path) else: pull_links_from_file(args.path)
def check_ids_unique(data_root, verbose=False):
    _, video_paths = get_json_files(data_root)
    paths_by_id = defaultdict(list)
    for file_path in video_paths:
        with open(file_path, encoding='UTF-8') as fp:
            blob = json.load(fp)
        id_ = blob.get('id')
        if id_:
            paths_by_id[id_].append(file_path)
    keys = list(paths_by_id.keys())
    for key in keys:
        if len(paths_by_id[key]) <= 1:
            del paths_by_id[key]
    if paths_by_id:
        print('Duplicate IDs found:')
        for id_, paths in paths_by_id.items():
            print('ID {}'.format(id_))
            for path in paths:
                print('\t', path)
        sys.exit(1)

def check_serialization(data_root, verbose=False):
    """Check that every JSON file is serialized in the canonical format."""
    category_paths, video_paths = get_json_files(data_root)
    file_paths = category_paths + video_paths
    error_by_path = {}
    for file_path in file_paths:
        with open(file_path, encoding='UTF-8') as fp:
            serialized_blob = fp.read()
        re_serialized_blob = json.dumps(
            json.loads(serialized_blob), **JSON_FORMAT_KWARGS
        )
        if serialized_blob.strip() != re_serialized_blob.strip():
            error_by_path[file_path] = (serialized_blob, re_serialized_blob)
    if error_by_path:
        for path, blobs in error_by_path.items():
            print('Incorrect serialization order in {}'.format(path), flush=True)
            blobs = tuple(blob.splitlines(keepends=True) for blob in blobs)
            if verbose:
                print(''.join(difflib.ndiff(*blobs)), end="")
        sys.exit(1)

def check_schemas(data_root, schemas_dir, verbose=False):
    schemas = ('category.json', 'video.json')
    all_file_paths = get_json_files(data_root)
    error_count = 0
    for schema, file_paths in zip(schemas, all_file_paths):
        schema_path = os.path.join(schemas_dir, schema)
        with open(schema_path, encoding='UTF-8') as fp:
            schema_blob = json.load(fp)
        for file_path in file_paths:
            with open(file_path, encoding='UTF-8') as fp:
                blob = json.load(fp)
            try:
                jsonschema.validate(blob, schema_blob)
            except jsonschema.exceptions.ValidationError as e:
                print(file_path, flush=True)
                if verbose:
                    print(e, flush=True)
                error_count += 1
    return error_count

def check_slugs_unique(data_root, verbose=False):
    category_paths, _ = get_json_files(data_root)
    paths_by_combo = defaultdict(list)
    for category_path in category_paths:
        with open(category_path, encoding='UTF-8') as fp:
            category_blob = json.load(fp)
        # Slugs will be generated from titles, so titles can be used
        # as a stand-in for slugs when testing unique constraints.
        category_title = category_blob.get('title')
        head, _ = os.path.split(category_path)
        video_pattern = os.path.join(head, 'videos/*.json')
        for video_path in glob.iglob(video_pattern):
            with open(video_path, encoding='UTF-8') as fp:
                video_blob = json.load(fp)
            video_slug = video_blob.get('slug')
            if not video_slug:
                video_slug = slugify(video_blob.get('title'))
            combo = (category_title, video_slug)
            paths_by_combo[combo].append(video_path)
    keys = list(paths_by_combo.keys())
    for key in keys:
        if len(paths_by_combo[key]) <= 1:
            del paths_by_combo[key]
    if paths_by_combo:
        print('Duplicate slug combinations found:')
        for combo, paths in paths_by_combo.items():
            print('Combination {}'.format(combo))
            for path in paths:
                print('\t', path)
        sys.exit(1)

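# check_slugs_unique() falls back to slugify() when a video has no explicit
# slug.  A minimal sketch of a title-to-slug conversion, assuming the usual
# lowercase/hyphenate behaviour; the project may instead rely on a library
# such as python-slugify, so treat this as illustrative only.
import re


def slugify(title):
    """Lowercase the title and collapse runs of non-alphanumerics to hyphens."""
    slug = re.sub(r'[^a-z0-9]+', '-', (title or '').lower())
    return slug.strip('-')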