def get_redirect(url):
    # simple case: it's already in the dict
    url = absolute_from_relative_url(url)
    if url in redirects:
        return redirects[url]

    # Try looking it up without the fragment
    defrag_url = uritools.uridefrag(url).uri
    fragment = uritools.uridefrag(url).fragment
    if fragment:
        if defrag_url in redirects:
            return uritools.urijoin(redirects[defrag_url], '#' + fragment)

    # Try fixing http/https to match the TOC
    url_parts = uritools.urisplit(url)
    base_url_parts = uritools.urisplit(redirects[base_url])
    fixed_scheme_url = uritools.uriunsplit(
        list(base_url_parts)[:1] + list(url_parts)[1:])
    if fixed_scheme_url in redirects:
        return redirects[fixed_scheme_url]

    # if same domain, try scraping it
    if url_parts.host == base_url_parts.host:
        try:
            print(f"Scraping url for get_redirect: {url}")
            scraper_result = scraper.scrape(
                url, wait_for_selector=config['post_body_selector'])
            redirects[url] = scraper_result['final_url']
            # TODO: Maybe store this scraped result in the book as well?
            return redirects[url]
        except (urllib.error.URLError, ssl.SSLError):
            return url  # TODO: Could return '' or something, but for now leaving it seems fine

    # else, couldn't find it, so leave it alone.
    return url
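A minimal sketch of the fragment fallback above in isolation, assuming a hypothetical `redirects` cache like the one this module keeps: `uridefrag` strips the fragment so the bare URL can be looked up, and `urijoin` reattaches the fragment to the redirect target.

import uritools

# hypothetical redirect cache, standing in for the module-level `redirects`
redirects = {'http://example.com/post': 'http://example.com/2020/01/post'}

url = 'http://example.com/post#section-2'
defrag = uritools.uridefrag(url)
if defrag.fragment and defrag.uri in redirects:
    resolved = uritools.urijoin(redirects[defrag.uri], '#' + defrag.fragment)
    # resolved == 'http://example.com/2020/01/post#section-2'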
def test_uridefrag(self):
    cases = [
        ('http://python.org#frag', 'http://python.org', 'frag'),
        ('http://python.org', 'http://python.org', None),
        ('http://python.org/#frag', 'http://python.org/', 'frag'),
        ('http://python.org/', 'http://python.org/', None),
        ('http://python.org/?q#frag', 'http://python.org/?q', 'frag'),
        ('http://python.org/?q', 'http://python.org/?q', None),
        ('http://python.org/p#frag', 'http://python.org/p', 'frag'),
        ('http://python.org/p?q', 'http://python.org/p?q', None),
        ('http://python.org#', 'http://python.org', ''),
        ('http://python.org/#', 'http://python.org/', ''),
        ('http://python.org/?q#', 'http://python.org/?q', ''),
        ('http://python.org/p?q#', 'http://python.org/p?q', ''),
    ]

    def encode(s):
        return s.encode() if s is not None else None

    # Repeat every case with bytes input as well
    cases += [tuple(map(encode, case)) for case in cases]

    for uri, base, fragment in cases:
        defrag = uridefrag(uri)
        self.assertEqual(defrag, (base, fragment))
        self.assertEqual(defrag.uri, base)
        self.assertEqual(defrag.fragment, fragment)
        self.assertEqual(uri, defrag.geturi())
def test_getfragment(self):
    self.assertEqual(uridefrag('').getfragment(), None)
    self.assertEqual(uridefrag(b'').getfragment(), None)
    self.assertEqual(uridefrag('#').getfragment(), '')
    self.assertEqual(uridefrag(b'#').getfragment(), '')
    self.assertEqual(uridefrag('#foo').getfragment(), 'foo')
    self.assertEqual(uridefrag(b'#foo').getfragment(), 'foo')
    self.assertEqual(uridefrag('#foo%20bar').getfragment(), 'foo bar')
    self.assertEqual(uridefrag(b'#foo%20bar').getfragment(), 'foo bar')
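For reference, a small sketch of the behaviour these two tests pin down: `uridefrag` returns a `DefragResult` named tuple whose `uri` and `fragment` fields are the raw split, `geturi()` reassembles the original, and `getfragment()` percent-decodes the fragment.

from uritools import uridefrag

d = uridefrag('http://python.org/p?q#foo%20bar')
print(d.uri)            # 'http://python.org/p?q'
print(d.fragment)       # 'foo%20bar'
print(d.geturi())       # 'http://python.org/p?q#foo%20bar'
print(d.getfragment())  # 'foo bar'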
def _traverse_dict(schemaURI, j, jp="", fragment=None):
    # Pre-processing
    newPartialSchemaURI = j.get('$id')
    if newPartialSchemaURI:
        # Computing the absolute schema URI
        if uritools.isabsuri(schemaURI):
            newSchemaURI, uriFragment = uritools.uridefrag(uritools.urijoin(schemaURI, newPartialSchemaURI))
        else:
            newSchemaURI, uriFragment = uritools.uridefrag(newPartialSchemaURI)
    else:
        newSchemaURI = schemaURI

    # Are we jumping to a different place?
    if newSchemaURI == schemaURI:
        theId = id(j)
        theIdStr = str(theId)

        # Does the dictionary contain a '$ref'?
        isRef = REF_FEATURE in j

        for k, v in j.items():
            # Following JSON reference standards, we have to
            # ignore other keys when there is a $ref one
            # https://tools.ietf.org/html/draft-pbryan-zyp-json-ref-03#section-3
            if isRef and (k != REF_FEATURE):
                continue

            elemId = theIdStr + ':' + k
            elemPath = jp + '/' + k
            jp2val[elemPath] = elemId

            # Is the key among the "special ones"?
            if k in keySet:
                # Saving the correspondence from Python address
                # to unique id of the feature
                id2ElemId.setdefault(theId, {})[k] = [elemId]
                keyRefs.setdefault(k, []).append(FeatureLoc(schemaURI=schemaURI, fragment=fragment, path=elemPath, context=j, id=elemId))

            if isinstance(v, dict):
                # Fragment must not be propagated to children
                _traverse_dict(schemaURI, v, jp=elemPath)
            elif isinstance(v, list):
                _traverse_list(schemaURI, v, jp=elemPath)
    else:
        traverseJSONSchema(j, schemaURI=newSchemaURI, fragment=uriFragment, keys=keys, refSchemaListSet=refSchemaListSet)
def translate_uri(self, uri):
    parts = uritools.uridefrag(uri)
    try:
        feed = self.backend.feeds[parts.uri]
    except Exception as e:
        logger.error('Error retrieving %s: %s', parts.uri, e)
    else:
        return feed.getstreamuri(parts.getfragment())
def validate(self, validator, fp_def, value, schema):
    if fp_def and isinstance(fp_def, str):
        fp_loc_id = id(schema)

        # Getting the absolute schema id and the route
        if uritools.isabsuri(self.schemaURI):
            abs_ref_schema_id, rel_json_pointer = uritools.uridefrag(
                uritools.urijoin(self.schemaURI, fp_def))
        else:
            abs_ref_schema_id, rel_json_pointer = uritools.uridefrag(fp_def)

        fpDef = self.FPWorld.setdefault(abs_ref_schema_id, {}).get(fp_loc_id)

        # And getting the foreign property definition
        if fpDef is None:
            fpDef = FPDef(schemaURI=self.schemaURI,
                          refSchemaURI=abs_ref_schema_id,
                          path='(unknown {})'.format(fp_loc_id),
                          refPath=rel_json_pointer,
                          values=list())
            self.FPWorld[abs_ref_schema_id][fp_loc_id] = fpDef

        obtainedValues = [(value, )]

        isAtomicValue = len(obtainedValues) == 1 and len(
            obtainedValues[0]) == 1 and isinstance(
                obtainedValues[0][0], ALLOWED_ATOMIC_VALUE_TYPES)

        if isAtomicValue:
            theValues = [obtainedValues[0][0]]
        else:
            theValues = UniqueKey.GenKeyStrings(obtainedValues)

        fpVals = fpDef.values

        # Second pass will do the validation
        for theValue in theValues:
            fpVals.append(FPVal(where=self.currentJSONFile, value=theValue))
def bootstrap(self, refSchemaTuple=tuple()):
    (id2ElemId, keyRefs, refSchemaCache) = refSchemaTuple

    keyList = keyRefs[self.triggerAttribute]
    errors = []
    # Saving the unique locations
    # based on information from FeatureLoc elems
    for loc in keyList:
        fp_def = loc.context[self.triggerAttribute]
        fp_loc_id = id(loc.context)

        # Getting the absolute schema id and the route
        if uritools.isabsuri(self.schemaURI):
            abs_ref_schema_id, rel_json_pointer = uritools.uridefrag(
                uritools.urijoin(self.schemaURI, fp_def))
        else:
            abs_ref_schema_id, rel_json_pointer = uritools.uridefrag(fp_def)

        if abs_ref_schema_id not in refSchemaCache:
            errors.append({
                'reason': 'fp_no_schema',
                'description': "No schema with {0} id, required by {1} ({2})".format(
                    abs_ref_schema_id, self.jsonSchemaSource, self.schemaURI)
            })

        fpDefH = self.FPWorld.setdefault(abs_ref_schema_id, {})

        # This control is here for same primary key referenced from multiple cases
        fpDefH[fp_loc_id] = FPDef(schemaURI=self.schemaURI,
                                  refSchemaURI=abs_ref_schema_id,
                                  path=loc.path,
                                  refPath=rel_json_pointer,
                                  values=list())

    return errors
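Both methods above share the same resolution idiom: join a possibly relative foreign-key target against the current schema's URI, then split off the JSON Pointer fragment. A minimal sketch with made-up URIs:

import uritools

base = 'http://example.com/schemas/sample.json'  # hypothetical schema id
ref = 'other.json#/definitions/Key'              # hypothetical foreign key target

if uritools.isabsuri(base):
    abs_id, pointer = uritools.uridefrag(uritools.urijoin(base, ref))
else:
    abs_id, pointer = uritools.uridefrag(ref)

# abs_id  == 'http://example.com/schemas/other.json'
# pointer == '/definitions/Key'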
def root_directory(self):
    root = self.__browse_root
    if not root:
        return None
    elif root.startswith(('file:', 'http:', 'https:')):
        uri = uritools.uridefrag('podcast+' + root).uri
    elif os.path.isabs(root):
        uri = uritools.uricompose('podcast+file', '', root)
    elif self.__config_dir:
        uri = uritools.uricompose('podcast+file', '',
                                  os.path.join(self.__config_dir, root))
    else:
        return None
    return models.Ref.directory(name='Podcasts', uri=uri)
def lookup(self, uri):
    # pop from __tracks since cached tracks shouldn't live too long
    try:
        track = self.__tracks.pop(uri)
    except KeyError:
        logger.debug('Lookup cache miss: %s', uri)
    else:
        return [track]
    try:
        feed = self.backend.feeds[uritools.uridefrag(uri).uri]
    except Exception as e:
        logger.error('Error retrieving %s: %s', uri, e)  # TODO: raise?
    else:
        return self.__lookup(feed, uri)
    return []  # FIXME: hide errors from clients
def custom_patterns(parser, results):
    a_tags = []
    elements = cxpath_href(parser.doc)
    for element in elements:
        href = element.get('href')

        # skip links to invalid hrefs
        if not href:
            continue
        if href.startswith('javascript:'):
            continue

        # canonicalize the href
        href = uritools.urijoin(parser.url, href)
        href = uritools.uridefrag(href).uri
        href = clean_url(href)
        try:
            href = url_normalize.url_normalize(href)
        except Exception as e:
            logging.debug('failed to normalize url ' + href)

        a_tag = {'href': href}

        # extract the rel
        if element.get('rel') is not None:
            rel = element.get('rel').strip()
            if len(rel) > 0:
                a_tag['rel'] = rel

        # extract the text_content
        if element.text_content() is not None:
            text_content = element.text_content().strip()
            if len(text_content) > 0:
                a_tag['text_content'] = text_content

        # add to list
        a_tags.append(a_tag)

    # remove duplicate tags
    a_tags.sort(key=lambda x: x['href'])
    a_tags_dedup = []
    prev_href = None
    for a_tag in a_tags:
        if a_tag['href'] != prev_href:
            prev_href = a_tag['href']
            a_tags_dedup.append(a_tag)

    results.append({'value': a_tags_dedup})
def root_directory(self):
    root = self.__browse_root
    if not root:
        return None
    elif root.startswith(('file:', 'http:', 'https:')):
        uri = uritools.uridefrag('podcast+' + root).uri
        return models.Ref.directory(name='Podcasts', uri=uri)
    elif os.path.isabs(root):
        uri = uritools.uricompose('podcast+file', '', root)
        return models.Ref.directory(name='Podcasts', uri=uri)
    elif self.__config_dir:
        path = os.path.join(self.__config_dir, root)
        uri = uritools.uricompose('podcast+file', '', path)
        return models.Ref.directory(name='Podcasts', uri=uri)
    else:
        logger.error('Cannot retrieve Podcast root directory')
        return None
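A quick sketch of the two URI shapes these `root_directory` variants produce (the `podcast+` prefix is the extension's own scheme convention; the paths here are made up):

import uritools

# Remote root: prefix the scheme, dropping any fragment
print(uritools.uridefrag('podcast+' + 'http://example.com/feeds#x').uri)
# -> 'podcast+http://example.com/feeds'

# Local root: compose a podcast+file URI from an absolute path
print(uritools.uricompose('podcast+file', '', '/var/lib/podcasts'))
# -> 'podcast+file:///var/lib/podcasts'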
def find_linked_extras(posts):
    extra_page_urls = []
    for post in posts:
        for body_soup in post['body_soups']:
            for element in body_soup.select('[href]'):
                full_href = uritools.urijoin(
                    post['final_url'], element['href'])
                defragged_href = uritools.uridefrag(full_href).uri
                if not url_is_included(defragged_href):
                    href_parts = uritools.urisplit(full_href)
                    base_url_parts = uritools.urisplit(redirects[base_url])
                    # Never try to include linked pages from other domains
                    if href_parts.host == base_url_parts.host:
                        # TODO: defragged, or full? Uniqueness or is the fragment important?
                        if defragged_href not in extra_page_urls:
                            extra_page_urls.append(defragged_href)
    return extra_page_urls
def key(uri):
    return uritools.uridefrag(uri).uri
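This one-liner makes a convenient deduplication key that ignores fragments. A usage sketch with made-up URLs:

urls = [
    'http://example.com/page#intro',
    'http://example.com/page#details',
    'http://example.com/other',
]
unique_pages = {key(u) for u in urls}
# {'http://example.com/page', 'http://example.com/other'}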
def url_is_included(url):
    return uritools.uridefrag(url).uri in included_scraped_urls
def uri(feedurl, guid=None, safe=uritools.SUB_DELIMS+b':@/?'):
    uri = uritools.uridefrag('podcast+' + feedurl).uri
    if guid:
        return uri + '#' + uritools.uriencode(guid, safe=safe)
    else:
        return uri
def mark_url_included(url):
    included_scraped_urls.add(uritools.uridefrag(url).uri)
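`url_is_included` and `mark_url_included` above work as a pair over a module-level set; because both defragment first, every fragment variant of a marked page counts as included. A minimal sketch, assuming the set starts empty:

included_scraped_urls = set()

mark_url_included('http://example.com/post#comments')
print(url_is_included('http://example.com/post'))        # True
print(url_is_included('http://example.com/post#intro'))  # True
print(url_is_included('http://example.com/other'))       # False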
def loadJSONSchemas(self, *args, verbose=None):
    p_schemaHash = self.schemaHash
    # Schema validation stats
    numDirOK = 0
    numDirFail = 0
    numFileOK = 0
    numFileIgnore = 0
    numFileFail = 0

    if verbose:
        print("PASS 0.a: JSON schema loading and cache generation")

    jsonSchemaPossibles = list(args)
    jsonSchemaNext = []
    refSchemaCache = self.refSchemaCache = {}
    refSchemaFile = {}
    refSchemaSet = self.refSchemaSet = {}
    inlineCounter = 0
    for jsonSchemaPossible in jsonSchemaPossibles:
        schemaObj = None

        if isinstance(jsonSchemaPossible, dict):
            schemaObj = jsonSchemaPossible
            errors = schemaObj.get('errors')
            if errors is None:
                if verbose:
                    print("\tIGNORE: cached schema does not have the mandatory 'errors' attribute, so it cannot be processed")
                numFileIgnore += 1
                continue

            jsonSchema = schemaObj.get('schema')
            if jsonSchema is None:
                if verbose:
                    print("\tIGNORE: cached schema does not have the mandatory 'schema' attribute, so it cannot be processed")
                errors.append({
                    'reason': 'unexpected',
                    'description': "The cached schema is missing"
                })
                numFileIgnore += 1
                continue

            schemaObj['schema_hash'] = self.GetNormalizedJSONHash(jsonSchema)

            if 'file' not in schemaObj:
                schemaObj['file'] = '(inline schema {})'.format(inlineCounter)
                inlineCounter += 1
            jsonSchemaFile = schemaObj['file']
        elif os.path.isdir(jsonSchemaPossible):
            # It's a possible JSON Schema directory, not a JSON Schema file
            jsonSchemaDir = jsonSchemaPossible
            try:
                for relJsonSchemaFile in os.listdir(jsonSchemaDir):
                    if relJsonSchemaFile[0] == '.':
                        continue

                    newJsonSchemaFile = os.path.join(jsonSchemaDir, relJsonSchemaFile)
                    if os.path.isdir(newJsonSchemaFile) or '.json' in relJsonSchemaFile:
                        jsonSchemaPossibles.append(newJsonSchemaFile)
                numDirOK += 1
            except IOError as ioe:
                if verbose:
                    print("FATAL ERROR: Unable to open JSON schema directory {0}. Reason: {1}\n".format(jsonSchemaDir, ioe.strerror), file=sys.stderr)
                numDirFail += 1

            continue
        else:
            jsonSchemaFile = jsonSchemaPossible
            if verbose:
                print("* Loading schema {0}".format(jsonSchemaFile))
            try:
                with open(jsonSchemaFile, mode="r", encoding="utf-8") as sHandle:
                    jsonSchema = json.load(sHandle)
            except IOError as ioe:
                if verbose:
                    print("FATAL ERROR: Unable to open schema file {0}. Reason: {1}".format(jsonSchemaFile, ioe.strerror), file=sys.stderr)
                numFileFail += 1
                continue
            else:
                errors = []
                schemaObj = {
                    'schema': jsonSchema,
                    'schema_hash': self.GetNormalizedJSONHash(jsonSchema),
                    'file': jsonSchemaFile,
                    'errors': errors
                }

        schemaValId = jsonSchema.get(self.SCHEMA_KEY)
        if schemaValId is None:
            if verbose:
                print("\tIGNORE: {0} does not have the mandatory '{1}' attribute, so it cannot be validated".format(jsonSchemaFile, self.SCHEMA_KEY))
            errors.append({
                'reason': 'no_schema',
                'description': "JSON Schema attribute '{}' is missing".format(self.SCHEMA_KEY)
            })
            numFileIgnore += 1
            continue

        if PLAIN_VALIDATOR_MAPPER.get(schemaValId) is None:
            if verbose:
                print("\tIGNORE/FIXME: The JSON Schema id {0} is not being acknowledged by this validator".format(schemaValId))
            errors.append({
                'reason': 'schema_unknown',
                'description': "'$schema' id {0} is not being acknowledged by this validator".format(schemaValId)
            })
            numFileIgnore += 1
            continue

        # Getting the JSON Schema URI, needed by this
        idKey = '$id' if '$id' in jsonSchema else 'id'
        jsonSchemaURI = jsonSchema.get(idKey)
        if jsonSchemaURI is not None:
            if jsonSchemaURI in refSchemaFile:
                if verbose:
                    print("\tERROR: schema in {0} and schema in {1} have the same id".format(jsonSchemaFile, refSchemaFile[jsonSchemaURI]), file=sys.stderr)
                errors.append({
                    'reason': 'dup_id',
                    'description': "schema in {0} and schema in {1} have the same id".format(jsonSchemaFile, refSchemaFile[jsonSchemaURI])
                })
                numFileFail += 1
                continue
            else:
                refSchemaCache[jsonSchemaURI] = jsonSchema
                refSchemaFile[jsonSchemaURI] = jsonSchemaFile
        else:
            numFileIgnore += 1
            if verbose:
                print("\tIGNORE: Schema in {0} has no id attribute".format(jsonSchemaFile), file=sys.stderr)
            if self.doNotValidateNoId:
                errors.append({
                    'reason': 'no_id',
                    'description': "JSON Schema attributes '$id' (Draft06 onward) and 'id' (Draft04) are missing in {}".format(jsonSchemaFile)
                })
                numFileIgnore += 1
                continue

        # We need to store these before creating the validators
        # in order to build the RefSchema cache
        jsonSchemaNext.append(schemaObj)

    if verbose:
        print("PASS 0.b: JSON schema validation")

    refSchemaListSet = {}
    for schemaObj in jsonSchemaNext:
        jsonSchema = schemaObj['schema']
        jsonSchemaFile = schemaObj['file']
        errors = schemaObj['errors']

        # Errors related to these are captured in the previous loop
        schemaValId = jsonSchema.get(self.SCHEMA_KEY)
        plain_validator = PLAIN_VALIDATOR_MAPPER.get(schemaValId)

        # Getting the JSON Schema URI, needed by this
        idKey = '$id' if '$id' in jsonSchema else 'id'
        jsonSchemaURI = jsonSchema.get(idKey)

        validator, customFormatInstances = extendValidator(jsonSchemaURI, plain_validator, self.customTypes, self.customValidators, config=self.config, jsonSchemaSource=jsonSchemaFile)

        schemaObj['customFormatInstances'] = customFormatInstances
        schemaObj['validator'] = validator

        # Validate the extended JSON schema properly
        metaSchema = validator.META_SCHEMA
        if len(customFormatInstances) > 0:
            metaSchema = metaSchema.copy()
            metaSchema['properties'] = metaProps = metaSchema['properties'].copy()

            for customFormatInstance in customFormatInstances:
                for kF, vF in customFormatInstance.triggerJSONSchemaDef.items():
                    if kF in metaProps:
                        # Multiple declarations
                        vM = metaProps[kF].copy()
                        if 'anyOf' not in vM:
                            newDecl = {
                                'anyOf': [
                                    vM
                                ]
                            }
                            vM = metaProps[kF] = newDecl
                        else:
                            metaProps[kF] = vM

                        vM['anyOf'].append(vF)
                    else:
                        metaProps[kF] = vF

        # We need to shadow the original schema
        localRefSchemaCache = refSchemaCache.copy()
        localRefSchemaCache[jsonSchemaURI] = metaSchema
        cachedSchemasResolver = JSV.RefResolver(base_uri=jsonSchemaURI, referrer=metaSchema, store=localRefSchemaCache)

        valErrors = [valError for valError in validator(metaSchema, resolver=cachedSchemasResolver).iter_errors(jsonSchema)]
        if len(valErrors) > 0:
            if verbose:
                print("\t- ERRORS:\n" + "\n".join(map(lambda se: "\t\tPath: {0} . Message: {1}".format("/" + "/".join(map(lambda e: str(e), se.path)), se.message), valErrors)) + "\n")
            for valError in valErrors:
                errors.append({
                    'reason': 'schema_error',
                    'description': "Path: {0} . Message: {1}".format("/" + "/".join(map(lambda e: str(e), valError.path)), valError.message)
                })
            numFileFail += 1
        elif jsonSchemaURI is not None:
            # Getting the JSON Pointer object instance of the augmented schema
            # my $jsonSchemaP = $v->schema($jsonSchema)->schema;
            # This step is done, so we fetch a complete schema
            # $jsonSchema = $jsonSchemaP->data;
            if jsonSchemaURI in p_schemaHash:
                if verbose:
                    print("\tERROR: validated, but schema in {0} and schema in {1} have the same id".format(jsonSchemaFile, p_schemaHash[jsonSchemaURI]['file']), file=sys.stderr)
                errors.append({
                    'reason': 'dup_id',
                    'description': "JSON Schema validated, but schema in {0} and schema in {1} have the same id".format(jsonSchemaFile, p_schemaHash[jsonSchemaURI]['file'])
                })
                numFileFail += 1
            else:
                if verbose:
                    print("\t- Validated {0}".format(jsonSchemaURI))

                # Reverse mappings, needed later
                triggeringFeatures = []
                for cFI in customFormatInstances:
                    for triggerAttribute, _ in cFI.getValidators():
                        triggeringFeatures.append(triggerAttribute)

                traverseJSONSchema(jsonSchema, schemaURI=jsonSchemaURI, keys=triggeringFeatures, refSchemaListSet=refSchemaListSet)

                p_schemaHash[jsonSchemaURI] = schemaObj
                numFileOK += 1
        else:
            # This is here to capture cases where we wanted to validate an
            # unidentified schema for its correctness
            if verbose:
                print("\tIGNORE: validated, but schema in {0} has no id attribute".format(jsonSchemaFile), file=sys.stderr)
            errors.append({
                'reason': 'no_id',
                'description': "JSON Schema attributes '$id' (Draft06 onward) and 'id' (Draft04) are missing"
            })
            numFileIgnore += 1

    if verbose:
        print("\nSCHEMA VALIDATION STATS: loaded {0} schemas from {1} directories, ignored {2} schemas, failed {3} schemas and {4} directories".format(numFileOK, numDirOK, numFileIgnore, numFileFail, numDirFail))

        print("\nPASS 0.c: JSON schema set consistency checks")

    # Circular references check is based on having two levels,
    # one unmodified, another being built from the first, taking
    # into account already visited schemas
    refSchemaSetBase = {}
    for jsonSchemaURI, traverseListSet in refSchemaListSet.items():
        # Time to implode each one of the elements from refSchemaListSet
        # for further usage
        refSchemaSetBase[jsonSchemaURI] = flattenTraverseListSet(traverseListSet)

    for jsonSchemaURI, jsonSchemaSet in refSchemaSetBase.items():
        id2ElemId, keyRefs, jp2val = jsonSchemaSet

        # referenced schemas id2ElemId and keyRefs
        if REF_FEATURE in keyRefs:
            # Unlinking references on keyRefs
            keyRefs_augmented = {}
            for featName, featList in keyRefs.items():
                keyRefs_augmented[featName] = list(featList)

            # Unlinking references on id2ElemId
            id2ElemId_augmented = {}
            for i2e_k, featDict in id2ElemId.items():
                id2ElemId_augmented[i2e_k] = {}
                for featName, l_uniqId in featDict.items():
                    id2ElemId_augmented[i2e_k][featName] = list(l_uniqId)

            # And on the $ref case
            refList = keyRefs_augmented[REF_FEATURE]

            # Initializing the visitedURIs through
            # $ref fetching
            visitedURIs = set([jsonSchemaURI])

            # This $ref list can be increased through the process
            for fLoc in refList:
                theRef = fLoc.context[REF_FEATURE]
                # Computing the absolute schema URI
                if uritools.isabsuri(jsonSchemaURI):
                    abs_ref_schema_id, _ = uritools.uridefrag(uritools.urijoin(jsonSchemaURI, theRef))
                else:
                    abs_ref_schema_id, _ = uritools.uridefrag(theRef)

                # Circular references detection check
                if abs_ref_schema_id in visitedURIs:
                    continue

                visitedURIs.add(abs_ref_schema_id)

                # Now, time to get the referenced, gathered data
                refSet = refSchemaSetBase.get(abs_ref_schema_id)
                if refSet is not None:
                    ref_id2ElemId, ref_keyRefs, ref_jp2val = refSet

                    # TODO: properly augment refSchemaSet id2ElemId and keyRefs with
                    # This is needed to have a proper bootstrap
                    for ref_pAddr_k, ref_pAddr_v in ref_id2ElemId.items():
                        featDict = id2ElemId_augmented.setdefault(ref_pAddr_k, {})
                        for ref_feat_k, ref_feat_v in ref_pAddr_v.items():
                            featDict.setdefault(ref_feat_k, []).extend(ref_feat_v)

                    for ref_kR_k, ref_kR_v in ref_keyRefs.items():
                        keyRefs_augmented.setdefault(ref_kR_k, []).extend(ref_kR_v)
                else:
                    # TODO: error handling
                    print("UNHANDLED ERROR", file=sys.stderr)

            # Recomposing the tuple
            jsonSchemaSet = (id2ElemId_augmented, keyRefs_augmented, jp2val)

        refSchemaSet[jsonSchemaURI] = jsonSchemaSet

    # Last, bootstrapping the extensions
    # Now, we check whether the declared foreign keys are pointing to loaded JSON schemas
    numSchemaConsistent = 0
    numSchemaInconsistent = 0
    for jsonSchemaURI, p_schema in p_schemaHash.items():
        jsonSchemaFile = p_schema['file']
        if verbose:
            print("* Checking {0}".format(jsonSchemaFile))
        customFormatInstances = p_schema['customFormatInstances']
        isValid = True
        if len(customFormatInstances) > 0:
            (id2ElemId, keyRefs, jp2val) = refSchemaSet[jsonSchemaURI]

            for cFI in customFormatInstances:
                if cFI.needsBootstrapping:
                    doBootstrap = False
                    for triggerAttribute, _ in cFI.getValidators():
                        if triggerAttribute in keyRefs:
                            doBootstrap = True
                            break

                    if doBootstrap:
                        # Bootstrapping the schema
                        # By default this is a no-op
                        errors = cFI.bootstrap(refSchemaTuple=(id2ElemId, keyRefs, self.refSchemaCache))
                        if errors:
                            if verbose:
                                for error in errors:
                                    print("\t- ERROR: {}".format(error['description']), file=sys.stderr)
                            p_schema['errors'].extend(errors)
                            isValid = False

        if isValid:
            if verbose:
                print("\t- Consistent!")
            numSchemaConsistent += 1
        else:
            numSchemaInconsistent += 1

    if verbose:
        print("\nSCHEMA CONSISTENCY STATS: {0} schemas right, {1} with inconsistencies".format(numSchemaConsistent, numSchemaInconsistent))

    return len(self.schemaHash.keys())
def getfeeduri(cls, url):
    return uritools.uridefrag(Extension.ext_name + '+' + url).uri
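The same prefix-then-defragment trick, with the scheme built from the extension's name. A quick sketch, assuming `Extension.ext_name` is 'podcast' as in Mopidy-Podcast:

import uritools

ext_name = 'podcast'  # hypothetical stand-in for Extension.ext_name
url = 'http://example.com/feed.xml#stale-fragment'
print(uritools.uridefrag(ext_name + '+' + url).uri)
# -> 'podcast+http://example.com/feed.xml'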
def refresh(self, uri=None):
    if uri:
        self.backend.feeds.pop(uritools.uridefrag(uri).uri, None)
    else:
        self.backend.feeds.clear()
    self.__tracks.clear()
""" replace hrefs in TOC to use new ids """ for link in toc_links: tag = link['tag'] tag['href'] = '#' + final_url_to_id[get_redirect(tag['href'])] """ replace hrefs from links in posts to use new ids """ for (post, element) in post_select_iter('[href]'): full_href = uritools.urijoin(post['final_url'], element['href']) defragged_href = uritools.uridefrag(full_href).uri subsection_id = uritools.urisplit(full_href).fragment final_defrag_url = get_redirect(defragged_href) if final_defrag_url in final_url_to_id: chap_id = '#' + final_url_to_id[final_defrag_url] # TODO: Display a warning if subsection ID can't be found. We're currently assuming they line up. Should point to just the chapter if broken. if subsection_id: final_href = chap_id + '_' + subsection_id print( f"Replacing an internal subsection link: {full_href} {final_href}") else: final_href = chap_id print( f"Replacing an internal chapter link: {full_href} {final_href}") element['href'] = final_href
def traverseJSONSchema(jsonObj, schemaURI=None, keys=set(), fragment=None, refSchemaListSet={}):
    # Should we try getting it?
    if schemaURI is None:
        if isinstance(jsonObj, dict):
            startingSchemaURI = jsonObj.get('$id')
            if startingSchemaURI is None:
                startingSchemaURI = jsonObj.get('id')

            # End / fail fast
            if startingSchemaURI is None:
                return None

            schemaURI, fragment = uritools.uridefrag(startingSchemaURI)
        else:
            # End / fail fast
            return None

    # Dictionary from name of the feature
    # to be captured to arrays of FeatureLoc named tuples
    keyRefs = {}

    # Dictionary from Python address
    # to dictionaries containing the features
    # to the features they contain
    # It's a dictionary of dictionaries of unique ids
    # First level: python address
    # Second level: name of the feature
    # Third level: unique ids
    id2ElemId = {}

    # Dictionary from JSON Pointer
    # to unique ids
    jp2val = {}

    refSchemaListSet.setdefault(schemaURI, []).append((id2ElemId, keyRefs, jp2val))

    # Translating it into a set
    keySet = keys if isinstance(keys, set) else set(keys)

    # And adding the '$ref' feature
    keySet.add(REF_FEATURE)

    def _traverse_dict(schemaURI, j, jp="", fragment=None):
        # Pre-processing
        newPartialSchemaURI = j.get('$id')
        if newPartialSchemaURI:
            # Computing the absolute schema URI
            if uritools.isabsuri(schemaURI):
                newSchemaURI, uriFragment = uritools.uridefrag(uritools.urijoin(schemaURI, newPartialSchemaURI))
            else:
                newSchemaURI, uriFragment = uritools.uridefrag(newPartialSchemaURI)
        else:
            newSchemaURI = schemaURI

        # Are we jumping to a different place?
        if newSchemaURI == schemaURI:
            theId = id(j)
            theIdStr = str(theId)

            # Does the dictionary contain a '$ref'?
            isRef = REF_FEATURE in j

            for k, v in j.items():
                # Following JSON reference standards, we have to
                # ignore other keys when there is a $ref one
                # https://tools.ietf.org/html/draft-pbryan-zyp-json-ref-03#section-3
                if isRef and (k != REF_FEATURE):
                    continue

                elemId = theIdStr + ':' + k
                elemPath = jp + '/' + k
                jp2val[elemPath] = elemId

                # Is the key among the "special ones"?
                if k in keySet:
                    # Saving the correspondence from Python address
                    # to unique id of the feature
                    id2ElemId.setdefault(theId, {})[k] = [elemId]
                    keyRefs.setdefault(k, []).append(FeatureLoc(schemaURI=schemaURI, fragment=fragment, path=elemPath, context=j, id=elemId))

                if isinstance(v, dict):
                    # Fragment must not be propagated to children
                    _traverse_dict(schemaURI, v, jp=elemPath)
                elif isinstance(v, list):
                    _traverse_list(schemaURI, v, jp=elemPath)
        else:
            traverseJSONSchema(j, schemaURI=newSchemaURI, fragment=uriFragment, keys=keys, refSchemaListSet=refSchemaListSet)

    def _traverse_list(schemaURI, j, jp=""):
        theIdStr = str(id(j))
        for vi, v in enumerate(j):
            str_vi = str(vi)
            elemId = theIdStr + ':' + str_vi
            elemPath = jp + '/' + str_vi
            jp2val[elemPath] = elemId

            if isinstance(v, dict):
                _traverse_dict(schemaURI, v, jp=elemPath)
            elif isinstance(v, list):
                _traverse_list(schemaURI, v, jp=elemPath)

    if isinstance(jsonObj, dict):
        _traverse_dict(schemaURI, jsonObj, fragment=fragment)
    elif isinstance(jsonObj, list):
        _traverse_list(schemaURI, jsonObj)

    return refSchemaListSet