def process_file(fjson_storage_path):
    """
    Read an fjson file from build media storage and parse it into a dict.

    The file is opened through the storage class named by
    ``settings.RTD_BUILD_MEDIA_STORAGE`` and decoded as JSON.

    :param fjson_storage_path: path of the fjson file inside the storage
        backend.
    :returns: a dict with the keys ``path``, ``title``, ``sections`` and
        ``domain_data``. A key keeps its empty default (``''``, ``[]`` or
        ``{}``) when the matching entry is absent from the file.
    :raises RuntimeError: if ``settings.RTD_BUILD_MEDIA_STORAGE`` is not set.
    :raises IOError: re-raised, after logging, if the file cannot be read.
    :raises ValueError: propagated from ``json.loads`` if the content is not
        valid JSON.
    """
    if not settings.RTD_BUILD_MEDIA_STORAGE:
        # Build the message once so the log entry and the exception agree.
        message = (
            'RTD_BUILD_MEDIA_STORAGE is missing - '
            'Not updating intersphinx data'
        )
        log.warning(message)
        raise RuntimeError(message)

    storage = get_storage_class(settings.RTD_BUILD_MEDIA_STORAGE)()

    log.debug('Processing JSON file for indexing: %s', fjson_storage_path)

    try:
        with storage.open(fjson_storage_path, mode='r') as f:
            file_contents = f.read()
    except IOError:
        log.info('Unable to read file: %s', fjson_storage_path)
        raise

    data = json.loads(file_contents)

    # Defaults returned for any piece of information the file lacks.
    sections = []
    path = ''
    title = ''
    domain_data = {}

    if 'current_page_name' in data:
        path = data['current_page_name']
    else:
        log.info('Unable to index file due to no name %s', fjson_storage_path)

    if data.get('body'):
        body = PyQuery(data['body'])
        # Each helper receives its own clone of the parsed body --
        # presumably because the helpers modify the tree they are given
        # (TODO confirm against their implementations).
        sections.extend(
            generate_sections_from_pyquery(body.clone(), fjson_storage_path)
        )
        domain_data = generate_domains_data_from_pyquery(
            body.clone(),
            fjson_storage_path,
        )
    else:
        log.info('Unable to index content for: %s', fjson_storage_path)

    if 'title' in data:
        # Reduce the title markup to plain text and drop the pilcrow
        # character before trimming surrounding whitespace.
        title = PyQuery(data['title']).text().replace('¶', '').strip()
    else:
        log.info('Unable to index title for: %s', fjson_storage_path)

    return {
        'path': path,
        'title': title,
        'sections': sections,
        'domain_data': domain_data,
    }
def pq_remove_nodes(
        pq: PyQuery,
        css_remove: Union[str, list],
) -> PyQuery:
    """
    Return a clone of ``pq`` after calling ``remove`` for each selector.

    The object passed in is never touched: all removals are applied to a
    clone, and that clone is what gets returned.

    :param pq: the PyQuery object to clone.
    :param css_remove: a single selector string, or a list of selectors;
        a lone string is handled as a one-element list.
    :returns: the clone, after ``remove`` has been called on it once per
        selector.
    """
    selectors = [css_remove] if isinstance(css_remove, str) else css_remove
    pruned = pq.clone()
    for selector in selectors:
        # The return value is ignored, as in the original; this relies on
        # ``remove`` modifying ``pruned`` in place.
        pruned.remove(selector)
    return pruned