Example #1
def get_redirect(url):
    # simple case: it's already in the dict
    url = absolute_from_relative_url(url)
    if url in redirects:
        return redirects[url]

    # Try looking it up without the fragment
    defrag_url = uritools.uridefrag(url).uri
    fragment = uritools.uridefrag(url).fragment
    if fragment:
        if defrag_url in redirects:
            return uritools.urijoin(redirects[defrag_url], '#'+fragment)

    # Try fixing http/https to match the TOC
    url_parts = uritools.urisplit(url)
    base_url_parts = uritools.urisplit(redirects[base_url])
    fixed_scheme_url = uritools.uriunsplit(
        list(base_url_parts)[:1] + list(url_parts)[1:])
    if fixed_scheme_url in redirects:
        return redirects[fixed_scheme_url]

    # if same domain, try scraping it
    if url_parts.host == base_url_parts.host:
        try:
            print(f"Scraping url for get_redirect: {url}")
            scraper_result = scraper.scrape(
                url, wait_for_selector=config['post_body_selector'])
            redirects[url] = scraper_result['final_url']
            # TODO: Maybe store this scraped result in the book as well?
            return redirects[url]
        except (urllib.error.URLError, ssl.SSLError):
            return url  # TODO: Could return '' or something but for now leaving it seems fine
    # else, couldn't find it, so leave it alone.

    return url
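
Everything in get_redirect above hangs off the two attributes of the DefragResult that uritools.uridefrag() returns. A minimal sketch of that object (the example.com URL is illustrative):

import uritools

result = uritools.uridefrag('https://example.com/page#section-2')
print(result.uri)       # 'https://example.com/page'
print(result.fragment)  # 'section-2'
print(result.geturi())  # 'https://example.com/page#section-2'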
Example #2
    def test_uridefrag(self):
        cases = [
            ('http://python.org#frag', 'http://python.org', 'frag'),
            ('http://python.org', 'http://python.org', None),
            ('http://python.org/#frag', 'http://python.org/', 'frag'),
            ('http://python.org/', 'http://python.org/', None),
            ('http://python.org/?q#frag', 'http://python.org/?q', 'frag'),
            ('http://python.org/?q', 'http://python.org/?q', None),
            ('http://python.org/p#frag', 'http://python.org/p', 'frag'),
            ('http://python.org/p?q', 'http://python.org/p?q', None),
            ('http://python.org#', 'http://python.org', ''),
            ('http://python.org/#', 'http://python.org/', ''),
            ('http://python.org/?q#', 'http://python.org/?q', ''),
            ('http://python.org/p?q#', 'http://python.org/p?q', ''),
        ]

        def encode(s):
            return s.encode() if s is not None else None
        cases += [tuple(map(encode, case)) for case in cases]

        for uri, base, fragment in cases:
            defrag = uridefrag(uri)
            self.assertEqual(defrag, (base, fragment))
            self.assertEqual(defrag.uri, base)
            self.assertEqual(defrag.fragment, fragment)
            self.assertEqual(uri, defrag.geturi())
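
The encode step above doubles the case table with bytes inputs: uridefrag() accepts both str and bytes and returns matching types. A quick sketch of what that exercises:

import uritools

uritools.uridefrag(b'http://python.org#frag')
# roughly: DefragResult(uri=b'http://python.org', fragment=b'frag')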
Example #3
 def test_getfragment(self):
     self.assertEqual(uridefrag('').getfragment(), None)
     self.assertEqual(uridefrag(b'').getfragment(), None)
     self.assertEqual(uridefrag('#').getfragment(), '')
     self.assertEqual(uridefrag(b'#').getfragment(), '')
     self.assertEqual(uridefrag('#foo').getfragment(), 'foo')
     self.assertEqual(uridefrag(b'#foo').getfragment(), 'foo')
     self.assertEqual(uridefrag('#foo%20bar').getfragment(), 'foo bar')
     self.assertEqual(uridefrag(b'#foo%20bar').getfragment(), 'foo bar')
Example #4
 def test_getfragment(self):
     self.assertEqual(uridefrag('').getfragment(), None)
     self.assertEqual(uridefrag(b'').getfragment(), None)
     self.assertEqual(uridefrag('#').getfragment(), '')
     self.assertEqual(uridefrag(b'#').getfragment(), '')
     self.assertEqual(uridefrag('#foo').getfragment(), 'foo')
     self.assertEqual(uridefrag(b'#foo').getfragment(), 'foo')
     self.assertEqual(uridefrag('#foo%20bar').getfragment(), 'foo bar')
     self.assertEqual(uridefrag(b'#foo%20bar').getfragment(), 'foo bar')
Example #5
	def _traverse_dict(schemaURI, j, jp="", fragment=None):
		# Pre-processing
		newPartialSchemaURI = j.get('$id')
		if newPartialSchemaURI:
			# Computing the absolute schema URI
			if uritools.isabsuri(schemaURI):
				newSchemaURI , uriFragment = uritools.uridefrag(uritools.urijoin(schemaURI,newPartialSchemaURI))
			else:
				newSchemaURI , uriFragment = uritools.uridefrag(newPartialSchemaURI)
		else:
			newSchemaURI = schemaURI
		
		# Are we jumping to a different place?
		if newSchemaURI == schemaURI:
			theId = id(j)
			theIdStr = str(theId)
			
			# Does the dictionary contain a '$ref'?
			isRef = REF_FEATURE in j
			
			for k,v in j.items():
				# Following JSON reference standards, we have to
				# ignore other keys when there is a $ref one
				# https://tools.ietf.org/html/draft-pbryan-zyp-json-ref-03#section-3
				if isRef and (k != REF_FEATURE):
					continue
				
				elemId = theIdStr + ':' + k
				
				elemPath = jp + '/' + k
				jp2val[elemPath] = elemId
				
				# Is the key among the "special ones"?
				if k in keySet:
					# Saving the correspondence from Python address
					# to unique id of the feature
					id2ElemId.setdefault(theId,{})[k] = [ elemId ]
					keyRefs.setdefault(k,[]).append(FeatureLoc(schemaURI=schemaURI,fragment=fragment,path=elemPath,context=j,id=elemId))
				
				if isinstance(v,dict):
					# Fragment must not be propagated to children
					_traverse_dict(schemaURI,v,jp=elemPath)
				elif isinstance(v,list):
					_traverse_list(schemaURI,v,jp=elemPath)
		else:
			traverseJSONSchema(j,schemaURI=newSchemaURI,fragment=uriFragment,keys=keys,refSchemaListSet=refSchemaListSet)
Example #6
 def translate_uri(self, uri):
     parts = uritools.uridefrag(uri)
     try:
         feed = self.backend.feeds[parts.uri]
     except Exception as e:
         logger.error('Error retrieving %s: %s', parts.uri, e)
     else:
         return feed.getstreamuri(parts.getfragment())
Example #7
 def translate_uri(self, uri):
     parts = uritools.uridefrag(uri)
     try:
         feed = self.backend.feeds[parts.uri]
     except Exception as e:
         logger.error('Error retrieving %s: %s', parts.uri, e)
     else:
         return feed.getstreamuri(parts.getfragment())
Example #8
    def validate(self, validator, fp_def, value, schema):
        if fp_def and isinstance(fp_def, str):
            fp_loc_id = id(schema)

            # Getting the absolute schema id and the route
            if uritools.isabsuri(self.schemaURI):
                abs_ref_schema_id, rel_json_pointer = uritools.uridefrag(
                    uritools.urijoin(self.schemaURI, fp_def))
            else:
                abs_ref_schema_id, rel_json_pointer = uritools.uridefrag(
                    fp_def)
            fpDef = self.FPWorld.setdefault(abs_ref_schema_id,
                                            {}).get(fp_loc_id)

            # And getting the foreign property definition
            if fpDef is None:
                fpDef = FPDef(schemaURI=self.schemaURI,
                              refSchemaURI=abs_ref_schema_id,
                              path='(unknown {})'.format(fp_loc_id),
                              refPath=rel_json_pointer,
                              values=list())
                self.FPWorld[abs_ref_schema_id][fp_loc_id] = fpDef

            obtainedValues = [(value, )]

            isAtomicValue = len(obtainedValues) == 1 and len(
                obtainedValues[0]) == 1 and isinstance(
                    obtainedValues[0][0], ALLOWED_ATOMIC_VALUE_TYPES)

            if isAtomicValue:
                theValues = [obtainedValues[0][0]]
            else:
                theValues = UniqueKey.GenKeyStrings(obtainedValues)

            fpVals = fpDef.values

            # Second pass will do the validation
            for theValue in theValues:
                fpVals.append(FPVal(where=self.currentJSONFile,
                                    value=theValue))
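
The isabsuri/urijoin/uridefrag sequence above recurs in several of the following examples; reduced to a minimal sketch with illustrative URIs, it computes an absolute schema id plus a relative JSON pointer:

import uritools

schema_uri = 'https://example.com/schemas/sample'  # assumed base URI
fp_def = 'other#/definitions/Thing'                # assumed reference
if uritools.isabsuri(schema_uri):
    abs_id, pointer = uritools.uridefrag(uritools.urijoin(schema_uri, fp_def))
else:
    abs_id, pointer = uritools.uridefrag(fp_def)
# abs_id  == 'https://example.com/schemas/other'
# pointer == '/definitions/Thing'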
Example #9
    def bootstrap(self, refSchemaTuple=tuple()):
        (id2ElemId, keyRefs, refSchemaCache) = refSchemaTuple

        keyList = keyRefs[self.triggerAttribute]
        errors = []
        # Saving the unique locations
        # based on information from FeatureLoc elems
        for loc in keyList:
            fp_def = loc.context[self.triggerAttribute]
            fp_loc_id = id(loc.context)

            # Getting the absolute schema id and the route
            if uritools.isabsuri(self.schemaURI):
                abs_ref_schema_id, rel_json_pointer = uritools.uridefrag(
                    uritools.urijoin(self.schemaURI, fp_def))
            else:
                abs_ref_schema_id, rel_json_pointer = uritools.uridefrag(
                    fp_def)

            if abs_ref_schema_id not in refSchemaCache:
                errors.append({
                    'reason':
                    'fp_no_schema',
                    'description':
                    "No schema with {0} id, required by {1} ({2})".format(
                        abs_ref_schema_id, self.jsonSchemaSource,
                        self.schemaURI)
                })

            fpDefH = self.FPWorld.setdefault(abs_ref_schema_id, {})

            # This control is here for same primary key referenced from multiple cases
            fpDefH[fp_loc_id] = FPDef(schemaURI=self.schemaURI,
                                      refSchemaURI=abs_ref_schema_id,
                                      path=loc.path,
                                      refPath=rel_json_pointer,
                                      values=list())

        return errors
Example #10
 def root_directory(self):
     root = self.__browse_root
     if not root:
         return None
     elif root.startswith(('file:', 'http:', 'https:')):
         uri = uritools.uridefrag('podcast+' + root).uri
     elif os.path.isabs(root):
         uri = uritools.uricompose('podcast+file', '', root)
     elif self.__config_dir:
         uri = uritools.uricompose('podcast+file', '',
                                   os.path.join(self.__config_dir, root))
     else:
         return None
     return models.Ref.directory(name='Podcasts', uri=uri)
Example #11
 def lookup(self, uri):
     # pop from __tracks since cached tracks shouldn't live too long
     try:
         track = self.__tracks.pop(uri)
     except KeyError:
         logger.debug('Lookup cache miss: %s', uri)
     else:
         return [track]
     try:
         feed = self.backend.feeds[uritools.uridefrag(uri).uri]
     except Exception as e:
         logger.error('Error retrieving %s: %s', uri, e)  # TODO: raise?
     else:
         return self.__lookup(feed, uri)
     return []  # FIXME: hide errors from clients
Example #12
 def lookup(self, uri):
     # pop from __tracks since cached tracks shouldn't live too long
     try:
         track = self.__tracks.pop(uri)
     except KeyError:
         logger.debug('Lookup cache miss: %s', uri)
     else:
         return [track]
     try:
         feed = self.backend.feeds[uritools.uridefrag(uri).uri]
     except Exception as e:
         logger.error('Error retrieving %s: %s', uri, e)  # TODO: raise?
     else:
         return self.__lookup(feed, uri)
     return []  # FIXME: hide errors from clients
Example #13
    def custom_patterns(parser, results):
        a_tags = []
        elements = cxpath_href(parser.doc)
        for element in elements:
            href = element.get('href')

            # skip links to invalid hrefs
            if not href:
                continue
            if href.startswith('javascript:'):
                continue

            # canonicalize the href
            href = uritools.urijoin(parser.url, href)
            href = uritools.uridefrag(href).uri
            href = clean_url(href)
            try:
                href = url_normalize.url_normalize(href)
            except Exception as e:
                logging.debug('failed to normalize url ' + href)
            a_tag = {'href': href}

            # extract the rel
            if element.get('rel') is not None:
                rel = element.get('rel').strip()
                if len(rel) > 0:
                    a_tag['rel'] = rel

            # extract the text_content
            if element.text_content() is not None:
                text_content = element.text_content().strip()
                if len(text_content) > 0:
                    a_tag['text_content'] = text_content

            # add to list
            a_tags.append(a_tag)

        # remove duplicate tags
        a_tags.sort(key=lambda x: x['href'])
        a_tags_dedup = []
        prev_href = None
        for a_tag in a_tags:
            if a_tag['href'] != prev_href:
                prev_href = a_tag['href']
                a_tags_dedup.append(a_tag)

        results.append({'value': a_tags_dedup})
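
For reference, the canonicalization pipeline above applied to a single relative href (page URL and href are made up; the clean_url and url_normalize steps are omitted):

import uritools

page_url = 'https://example.com/blog/post'
href = uritools.urijoin(page_url, '../about#team')  # 'https://example.com/about#team'
href = uritools.uridefrag(href).uri                 # 'https://example.com/about'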
Example #14
 def root_directory(self):
     root = self.__browse_root
     if not root:
         return None
     elif root.startswith(('file:', 'http:', 'https:')):
         uri = uritools.uridefrag('podcast+' + root).uri
         return models.Ref.directory(name='Podcasts', uri=uri)
     elif os.path.isabs(root):
         uri = uritools.uricompose('podcast+file', '', root)
         return models.Ref.directory(name='Podcasts', uri=uri)
     elif self.__config_dir:
         path = os.path.join(self.__config_dir, root)
         uri = uritools.uricompose('podcast+file', '', path)
         return models.Ref.directory(name='Podcasts', uri=uri)
     else:
         logger.error('Cannot retrieve Podcast root directory')
         return None
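
uricompose() assembles the podcast+file URI here from its scheme, authority and path components; a quick illustration with a made-up path:

import uritools

uritools.uricompose('podcast+file', '', '/media/podcasts')
# 'podcast+file:///media/podcasts'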
Example #15
    def find_linked_extras(posts):
        extra_page_urls = []
        for post in posts:
            for body_soup in post['body_soups']:
                for element in body_soup.select('[href]'):
                    full_href = uritools.urijoin(
                        post['final_url'], element['href'])
                    defragged_href = uritools.uridefrag(full_href).uri

                    if not url_is_included(defragged_href):
                        href_parts = uritools.urisplit(full_href)
                        base_url_parts = uritools.urisplit(redirects[base_url])
                        if href_parts.host == base_url_parts.host:  # Never try to include linked pages from other domains
                            if defragged_href not in extra_page_urls:
                                # TODO: defragged, or full? Uniqueness or is the fragment important?
                                extra_page_urls.append(defragged_href)
        return extra_page_urls
Example #16
 def key(uri):
     return uritools.uridefrag(uri).uri
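
Used as a cache key, this collapses every fragment variant of a URI onto one entry, e.g. (feed URL illustrative):

key('podcast+http://example.com/feed.xml#ep1')  # 'podcast+http://example.com/feed.xml'
key('podcast+http://example.com/feed.xml#ep2')  # same key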
Example #17
    def test_uridefrag(self):
        cases = [
            ('http://python.org#frag', 'http://python.org', 'frag'),
            ('http://python.org', 'http://python.org', None),
            ('http://python.org/#frag', 'http://python.org/', 'frag'),
            ('http://python.org/', 'http://python.org/', None),
            ('http://python.org/?q#frag', 'http://python.org/?q', 'frag'),
            ('http://python.org/?q', 'http://python.org/?q', None),
            ('http://python.org/p#frag', 'http://python.org/p', 'frag'),
            ('http://python.org/p?q', 'http://python.org/p?q', None),
            ('http://python.org#', 'http://python.org', ''),
            ('http://python.org/#', 'http://python.org/', ''),
            ('http://python.org/?q#', 'http://python.org/?q', ''),
            ('http://python.org/p?q#', 'http://python.org/p?q', ''),
        ]

        def encode(s):
            return s.encode() if s is not None else None

        cases += [tuple(map(encode, case)) for case in cases]

        for uri, base, fragment in cases:
            defrag = uridefrag(uri)
            self.assertEqual(defrag, (base, fragment))
            self.assertEqual(defrag.uri, base)
            self.assertEqual(defrag.fragment, fragment)
            self.assertEqual(uri, defrag.geturi())

    def test_getfragment(self):
        self.assertEqual(uridefrag('').getfragment(), None)
        self.assertEqual(uridefrag(b'').getfragment(), None)
        self.assertEqual(uridefrag('#').getfragment(), '')
        self.assertEqual(uridefrag(b'#').getfragment(), '')
        self.assertEqual(uridefrag('#foo').getfragment(), 'foo')
        self.assertEqual(uridefrag(b'#foo').getfragment(), 'foo')
        self.assertEqual(uridefrag('#foo%20bar').getfragment(), 'foo bar')
        self.assertEqual(uridefrag(b'#foo%20bar').getfragment(), 'foo bar')
Example #18
def url_is_included(url):
    return uritools.uridefrag(url).uri in included_scraped_urls
Example #19
def uri(feedurl, guid=None, safe=uritools.SUB_DELIMS+b':@/?'):
    uri = uritools.uridefrag('podcast+' + feedurl).uri
    if guid:
        return uri + '#' + uritools.uriencode(guid, safe=safe)
    else:
        return uri
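
A hypothetical round trip through a URI built by uri() above, splitting it back into the feed part and the percent-decoded guid (feed URL illustrative):

import uritools

stream_uri = 'podcast+http://example.com/feed.xml#episode%201'
parts = uritools.uridefrag(stream_uri)
parts.uri            # 'podcast+http://example.com/feed.xml'
parts.getfragment()  # 'episode 1'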
Example #20
def mark_url_included(url):
    included_scraped_urls.add(uritools.uridefrag(url).uri)
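
Paired with url_is_included() from Example #18, membership checks become fragment-insensitive; a small usage sketch (URL illustrative):

mark_url_included('https://example.com/post#comments')
url_is_included('https://example.com/post#intro')  # True
url_is_included('https://example.com/post')        # True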
Example #21
	def loadJSONSchemas(self,*args,verbose=None):
		p_schemaHash = self.schemaHash
		# Schema validation stats
		numDirOK = 0
		numDirFail = 0
		numFileOK = 0
		numFileIgnore = 0
		numFileFail = 0
		
		if verbose:
			print("PASS 0.a: JSON schema loading and cache generation")
		jsonSchemaPossibles = list(args)
		jsonSchemaNext = []
		refSchemaCache = self.refSchemaCache = {}
		refSchemaFile = {}
		refSchemaSet = self.refSchemaSet = {}
		inlineCounter = 0
		for jsonSchemaPossible in jsonSchemaPossibles:
			schemaObj = None
			
			if isinstance(jsonSchemaPossible,dict):
				schemaObj = jsonSchemaPossible
				errors = schemaObj.get('errors')
				if errors is None:
					if verbose:
						print("\tIGNORE: cached schema does not have the mandatory 'errors' attribute, so it cannot be processed")
					numFileIgnore += 1
					continue
				
				jsonSchema = schemaObj.get('schema')
				if jsonSchema is None:
					if verbose:
						print("\tIGNORE: cached schema does not have the mandatory 'schema' attribute, so it cannot be processed")
					errors.append({
						'reason': 'unexpected',
						'description': "The cached schema is missing"
					})
					numFileIgnore += 1
					continue
				
				schemaObj['schema_hash'] = self.GetNormalizedJSONHash(jsonSchema)
				
				if 'file' not in schemaObj:
					schemaObj['file'] = '(inline schema {})'.format(inlineCounter)
					inlineCounter += 1
				jsonSchemaFile = schemaObj['file']
			elif os.path.isdir(jsonSchemaPossible):
				jsonSchemaDir = jsonSchemaPossible
				# It's a possible JSON Schema directory, not a JSON Schema file
				try:
					for relJsonSchemaFile in os.listdir(jsonSchemaDir):
						if relJsonSchemaFile[0]=='.':
							continue
						
						newJsonSchemaFile = os.path.join(jsonSchemaDir,relJsonSchemaFile)
						if os.path.isdir(newJsonSchemaFile) or '.json' in relJsonSchemaFile:
							jsonSchemaPossibles.append(newJsonSchemaFile)
					numDirOK += 1
				except IOError as ioe:
					if verbose:
						print("FATAL ERROR: Unable to open JSON schema directory {0}. Reason: {1}\n".format(jsonSchemaDir,ioe.strerror),file=sys.stderr)
					numDirFail += 1
				
				continue
			else:
				jsonSchemaFile = jsonSchemaPossible
				if verbose:
					print("* Loading schema {0}".format(jsonSchemaFile))
				try:
					with open(jsonSchemaFile,mode="r",encoding="utf-8") as sHandle:
						jsonSchema = json.load(sHandle)
				except IOError as ioe:
					if verbose:
						print("FATAL ERROR: Unable to open schema file {0}. Reason: {1}".format(jsonSchemaFile,ioe.strerror),file=sys.stderr)
					numFileFail += 1
					continue
				else:
					errors = []
					schemaObj = {
						'schema': jsonSchema,
						'schema_hash': self.GetNormalizedJSONHash(jsonSchema),
						'file': jsonSchemaFile,
						'errors': errors
					}
			
			schemaValId = jsonSchema.get(self.SCHEMA_KEY)
			if schemaValId is None:
				if verbose:
					print("\tIGNORE: {0} does not have the mandatory '{1}' attribute, so it cannot be validated".format(jsonSchemaFile,self.SCHEMA_KEY))
				errors.append({
					'reason': 'no_schema',
					'description': "JSON Schema attribute '{}' is missing".format(self.SCHEMA_KEY)
				})
				numFileIgnore += 1
				continue
			
			if PLAIN_VALIDATOR_MAPPER.get(schemaValId) is None:
				if verbose:
					print("\tIGNORE/FIXME: The JSON Schema id {0} is not being acknowledged by this validator".format(schemaValId))
				errors.append({
					'reason': 'schema_unknown',
					'description': "'$schema' id {0} is not being acknowledged by this validator".format(schemaValId)
				})
				numFileIgnore += 1
				continue
			
			# Getting the JSON Schema URI, needed by this
			idKey = '$id'  if '$id' in jsonSchema else 'id'
			jsonSchemaURI = jsonSchema.get(idKey)
			if jsonSchemaURI is not None:
				if jsonSchemaURI in refSchemaFile:
					if verbose:
						print("\tERROR: schema in {0} and schema in {1} have the same id".format(jsonSchemaFile,refSchemaFile[jsonSchemaURI]),file=sys.stderr)
					errors.append({
						'reason': 'dup_id',
						'description': "schema in {0} and schema in {1} have the same id".format(jsonSchemaFile,refSchemaFile[jsonSchemaURI])
					})
					numFileFail += 1
					continue
				else:
					refSchemaCache[jsonSchemaURI] = jsonSchema
					refSchemaFile[jsonSchemaURI] = jsonSchemaFile
			else:
				numFileIgnore += 1
				if verbose:
					print("\tIGNORE: Schema in {0} has no id attribute".format(jsonSchemaFile),file=sys.stderr)
				if self.doNotValidateNoId:
					errors.append({
						'reason': 'no_id',
						'description': "JSON Schema attributes '$id' (Draft06 onward) and 'id' (Draft04) are missing in {}".format(jsonSchemaFile)
					})
					numFileIgnore += 1
					continue
			
			# We need to store these before creating the validators
			# in order to build the RefSchema cache
			jsonSchemaNext.append(schemaObj)
		
		
		if verbose:
			print("PASS 0.b: JSON schema validation")
		
		refSchemaListSet = {}
		for schemaObj in jsonSchemaNext:
			jsonSchema = schemaObj['schema']
			jsonSchemaFile = schemaObj['file']
			errors = schemaObj['errors']
			
			# Errors related to these are captured in the previous loop
			schemaValId = jsonSchema.get(self.SCHEMA_KEY)
			plain_validator = PLAIN_VALIDATOR_MAPPER.get(schemaValId)
			
			# Getting the JSON Schema URI, needed by this
			idKey = '$id'  if '$id' in jsonSchema else 'id'
			jsonSchemaURI = jsonSchema.get(idKey)
			
			validator , customFormatInstances = extendValidator(jsonSchemaURI, plain_validator, self.customTypes, self.customValidators, config=self.config, jsonSchemaSource=jsonSchemaFile)
			
			schemaObj['customFormatInstances'] = customFormatInstances
			schemaObj['validator'] = validator
			
			# Validate the extended JSON schema properly
			metaSchema = validator.META_SCHEMA
			if len(customFormatInstances) > 0:
				metaSchema = metaSchema.copy()
				metaSchema['properties'] = metaProps = metaSchema['properties'].copy()
				
				for customFormatInstance in customFormatInstances:
					for kF, vF in customFormatInstance.triggerJSONSchemaDef.items():
						if kF in metaProps:
							# Multiple declarations
							vM = metaProps[kF].copy()
							if 'anyOf' not in vM:
								newDecl = {
									'anyOf': [
										vM
									]
								}
								vM = metaProps[kF] = newDecl
							else:
								metaProps[kF] = vM
							
							vM['anyOf'].append(vF)
						else:
							metaProps[kF] = vF
			
			# We need to shadow the original schema
			localRefSchemaCache = refSchemaCache.copy()
			localRefSchemaCache[jsonSchemaURI] = metaSchema
			cachedSchemasResolver = JSV.RefResolver(base_uri=jsonSchemaURI, referrer=metaSchema, store=localRefSchemaCache)
			
			valErrors = [ valError  for valError in validator(metaSchema,resolver = cachedSchemasResolver).iter_errors(jsonSchema) ]
			if len(valErrors) > 0:
				if verbose:
					print("\t- ERRORS:\n"+"\n".join(map(lambda se: "\t\tPath: {0} . Message: {1}".format("/"+"/".join(map(lambda e: str(e),se.path)),se.message) , valErrors))+"\n")
				for valError in valErrors:
					errors.append({
						'reason': 'schema_error',
						'description': "Path: {0} . Message: {1}".format("/"+"/".join(map(lambda e: str(e),valError.path)),valError.message)
					})
				numFileFail += 1
			elif jsonSchemaURI is not None:
				# Getting the JSON Pointer object instance of the augmented schema
				# my $jsonSchemaP = $v->schema($jsonSchema)->schema;
				# This step is done, so we fetch a complete schema
				# $jsonSchema = $jsonSchemaP->data;
				
				if jsonSchemaURI in p_schemaHash:
					if verbose:
						print("\tERROR: validated, but schema in {0} and schema in {1} have the same id".format(jsonSchemaFile,p_schemaHash[jsonSchemaURI]['file']),file=sys.stderr)
					errors.append({
						'reason': 'dup_id',
						'description': "JSON Schema validated, but schema in {0} and schema in {1} have the same id".format(jsonSchemaFile,p_schemaHash[jsonSchemaURI]['file'])
					})
					numFileFail += 1
				else:
					if verbose:
						print("\t- Validated {0}".format(jsonSchemaURI))
					
					# Reverse mappings, needed later
					triggeringFeatures = []
					for cFI in customFormatInstances:
						for triggerAttribute,_ in cFI.getValidators():
							triggeringFeatures.append(triggerAttribute)
					
					traverseJSONSchema(jsonSchema,schemaURI=jsonSchemaURI,keys=triggeringFeatures,refSchemaListSet=refSchemaListSet)
					
					p_schemaHash[jsonSchemaURI] = schemaObj
					numFileOK += 1
			else:
				# This is here to capture cases where we wanted to validate an
				# unidentified schema for its correctness
				if verbose:
					print("\tIGNORE: validated, but schema in {0} has no id attribute".format(jsonSchemaFile),file=sys.stderr)
				errors.append({
					'reason': 'no_id',
					'description': "JSON Schema attributes '$id' (Draft06 onward) and 'id' (Draft04) are missing"
				})
				numFileIgnore += 1
		
		
		if verbose:
			print("\nSCHEMA VALIDATION STATS: loaded {0} schemas from {1} directories, ignored {2} schemas, failed {3} schemas and {4} directories".format(numFileOK,numDirOK,numFileIgnore,numFileFail,numDirFail))
		
			print("\nPASS 0.c: JSON schema set consistency checks")
		
		# Circular references check is based on having two levels
		# one unmodified, another being built from the first, taking
		# into account already visited schemas
		refSchemaSetBase = {}
		for jsonSchemaURI, traverseListSet in refSchemaListSet.items():
			# Time to implode each one of the elements from refSchemaListSet
			# for further usage
			refSchemaSetBase[jsonSchemaURI] = flattenTraverseListSet(traverseListSet)
			
		for jsonSchemaURI, jsonSchemaSet in refSchemaSetBase.items():
			id2ElemId , keyRefs , jp2val = jsonSchemaSet
			
			# referenced schemas id2ElemId and keyRefs
			if REF_FEATURE in keyRefs:
				# Unlinking references on keyRefs
				keyRefs_augmented = {}
				for featName , featList in keyRefs.items():
					keyRefs_augmented[featName] = list(featList)
				
				# Unlinking references on id2ElemId
				id2ElemId_augmented = {}
				for i2e_k , featDict in  id2ElemId.items():
					id2ElemId_augmented[i2e_k] = {}
					for featName , l_uniqId in featDict.items():
						id2ElemId_augmented[i2e_k][featName] = list(l_uniqId)
				
				# And on the $ref case
				refList = keyRefs_augmented[REF_FEATURE]
				
				# Initializing the visitedURIs through
				# $ref fetching
				visitedURIs = set([jsonSchemaURI])
				
				# This $ref list can be increased through the process
				for fLoc in refList:
					theRef = fLoc.context[REF_FEATURE]
					# Computing the absolute schema URI
					if uritools.isabsuri(jsonSchemaURI):
						abs_ref_schema_id , _ = uritools.uridefrag(uritools.urijoin(jsonSchemaURI,theRef))
					else:
						abs_ref_schema_id , _ = uritools.uridefrag(theRef)
					
					# Circular references detection check
					if abs_ref_schema_id in visitedURIs:
						continue
					
					visitedURIs.add(abs_ref_schema_id)
					
					# Now, time to get the referenced, gathered data
					refSet = refSchemaSetBase.get(abs_ref_schema_id)
					if refSet is not None:
						ref_id2ElemId , ref_keyRefs , ref_jp2val = refSet
						
						# TODO: properly augment refSchemaSet id2ElemId and keyRefs with
						# This is needed to have a proper bootstrap
						
						for ref_pAddr_k, ref_pAddr_v in ref_id2ElemId.items():
							featDict = id2ElemId_augmented.setdefault(ref_pAddr_k,{})
							for ref_feat_k , ref_feat_v in ref_pAddr_v.items():
								featDict.setdefault(ref_feat_k,[]).extend(ref_feat_v)
						
						for ref_kR_k , ref_kR_v in ref_keyRefs.items():
							keyRefs_augmented.setdefault(ref_kR_k,[]).extend(ref_kR_v)
					else:
						# TODO: error handling
						print("UNHANDLED ERROR",file=sys.stderr)
				
				# Recomposing the tuple
				jsonSchemaSet = (id2ElemId_augmented,keyRefs_augmented,jp2val)
			
			refSchemaSet[jsonSchemaURI] = jsonSchemaSet
		
		# Last, bootstrapping the extensions
		# Now, we check whether the declared foreign keys are pointing to loaded JSON schemas
		numSchemaConsistent = 0
		numSchemaInconsistent = 0
		for jsonSchemaURI , p_schema in p_schemaHash.items():
			jsonSchemaFile = p_schema['file']
			if verbose:
				print("* Checking {0}".format(jsonSchemaFile))
			customFormatInstances = p_schema['customFormatInstances']
			isValid = True
			if len(customFormatInstances) > 0:
				(id2ElemId , keyRefs , jp2val) = refSchemaSet[jsonSchemaURI]
				
				for cFI in customFormatInstances:
					if cFI.needsBootstrapping:
						doBootstrap = False
						for triggerAttribute,_ in cFI.getValidators():
							if triggerAttribute in keyRefs:
								doBootstrap = True
								break
						
						if doBootstrap:
							# Bootstrapping the schema
							# By default this is a no-op
							errors = cFI.bootstrap(refSchemaTuple=(id2ElemId , keyRefs , self.refSchemaCache))
							if errors:
								if verbose:
									for error in errors:
										print("\t- ERROR: {}".format(error['description']),file=sys.stderr)
								
								p_schema['errors'].extend(errors)
								isValid = False
			
			if isValid:
				if verbose:
					print("\t- Consistent!")
				numSchemaConsistent += 1
			else:
				numSchemaInconsistent += 1
		
		if verbose:
			print("\nSCHEMA CONSISTENCY STATS: {0} schemas right, {1} with inconsistencies".format(numSchemaConsistent,numSchemaInconsistent))
		
		return len(self.schemaHash.keys())
Example #22
 def getfeeduri(cls, url):
     return uritools.uridefrag(Extension.ext_name + '+' + url).uri
Example #23
 def refresh(self, uri=None):
     if uri:
         self.backend.feeds.pop(uritools.uridefrag(uri).uri, None)
     else:
         self.backend.feeds.clear()
     self.__tracks.clear()
Example #24
 def refresh(self, uri=None):
     if uri:
         self.backend.feeds.pop(uritools.uridefrag(uri).uri, None)
     else:
         self.backend.feeds.clear()
     self.__tracks.clear()
Example #25
 def key(uri):
     return uritools.uridefrag(uri).uri
Example #26
"""
replace hrefs in TOC to use new ids
"""

for link in toc_links:
    tag = link['tag']
    tag['href'] = '#' + final_url_to_id[get_redirect(tag['href'])]

"""
replace hrefs from links in posts to use new ids
"""

for (post, element) in post_select_iter('[href]'):
    full_href = uritools.urijoin(post['final_url'], element['href'])
    defragged_href = uritools.uridefrag(full_href).uri
    subsection_id = uritools.urisplit(full_href).fragment
    final_defrag_url = get_redirect(defragged_href)

    if final_defrag_url in final_url_to_id:
        chap_id = '#' + final_url_to_id[final_defrag_url]
        # TODO: Display a warning if subsection ID can't be found. We're currently assuming they line up. Should point to just the chapter if broken.
        if subsection_id:
            final_href = chap_id + '_' + subsection_id
            print(
                f"Replacing an internal subsection link: {full_href} {final_href}")
        else:
            final_href = chap_id
            print(
                f"Replacing an internal chapter link: {full_href} {final_href}")
        element['href'] = final_href
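
The pieces pulled from each href above, shown once on an illustrative URL:

full_href = 'https://example.com/post#part-3'
uritools.uridefrag(full_href).uri      # 'https://example.com/post' (defragged_href)
uritools.urisplit(full_href).fragment  # 'part-3' (subsection_id)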
Example #27
def traverseJSONSchema(jsonObj, schemaURI=None, keys=set(), fragment=None, refSchemaListSet={}):
	# Should we try getting it?
	if schemaURI is None:
		if isinstance(jsonObj,dict):
			startingSchemaURI = jsonObj.get('$id')
			if startingSchemaURI is None:
				startingSchemaURI = jsonObj.get('id')
			
			# End / fail fast
			if startingSchemaURI is None:
				return None
			
			schemaURI , fragment = uritools.uridefrag(startingSchemaURI)
		else:
			# End / fail fast
			return None
	
	# Dictionary from name of the feature
	# to be captured to arrays of FeatureLoc named tuples
	keyRefs = {}
	
	# Dictionary from Python address
	# to dictionaries containing the features
	# to the features they contain
	# It's a dictionary of dictionaries of unique ids
	# First level: python address
	# Second level: name of the feature
	# Third level: unique ids
	id2ElemId = {}
	
	# Dictionary from JSON Pointer
	# to unique ids
	jp2val = {}
	
	refSchemaListSet.setdefault(schemaURI,[]).append( (id2ElemId , keyRefs , jp2val) )
	
	# Translating it into an set
	keySet = keys  if isinstance(keys,set)  else set(keys)
	
	# And adding the '$ref' feature
	keySet.add(REF_FEATURE)
	
	def _traverse_dict(schemaURI, j, jp="", fragment=None):
		# Pre-processing
		newPartialSchemaURI = j.get('$id')
		if newPartialSchemaURI:
			# Computing the absolute schema URI
			if uritools.isabsuri(schemaURI):
				newSchemaURI , uriFragment = uritools.uridefrag(uritools.urijoin(schemaURI,newPartialSchemaURI))
			else:
				newSchemaURI , uriFragment = uritools.uridefrag(newPartialSchemaURI)
		else:
			newSchemaURI = schemaURI
		
		# Are we jumping to a different place?
		if newSchemaURI == schemaURI:
			theId = id(j)
			theIdStr = str(theId)
			
			# Does the dictionary contain a '$ref'?
			isRef = REF_FEATURE in j
			
			for k,v in j.items():
				# Following JSON reference standards, we have to
				# ignore other keys when there is a $ref one
				# https://tools.ietf.org/html/draft-pbryan-zyp-json-ref-03#section-3
				if isRef and (k != REF_FEATURE):
					continue
				
				elemId = theIdStr + ':' + k
				
				elemPath = jp + '/' + k
				jp2val[elemPath] = elemId
				
				# Is the key among the "special ones"?
				if k in keySet:
					# Saving the correspondence from Python address
					# to unique id of the feature
					id2ElemId.setdefault(theId,{})[k] = [ elemId ]
					keyRefs.setdefault(k,[]).append(FeatureLoc(schemaURI=schemaURI,fragment=fragment,path=elemPath,context=j,id=elemId))
				
				if isinstance(v,dict):
					# Fragment must not be propagated to children
					_traverse_dict(schemaURI,v,jp=elemPath)
				elif isinstance(v,list):
					_traverse_list(schemaURI,v,jp=elemPath)
		else:
			traverseJSONSchema(j,schemaURI=newSchemaURI,fragment=uriFragment,keys=keys,refSchemaListSet=refSchemaListSet)
	
	def _traverse_list(schemaURI, j, jp=""):
		theIdStr = str(id(j))
		for vi, v in enumerate(j):
			str_vi = str(vi)
			elemId = theIdStr + ':' + str_vi
			
			elemPath = jp + '/' + str_vi
			jp2val[elemPath] = elemId
			
			if isinstance(v,dict):
				_traverse_dict(schemaURI,v,jp=elemPath)
			elif isinstance(v,list):
				_traverse_list(schemaURI,v,jp=elemPath)
	
	if isinstance(jsonObj,dict):
		_traverse_dict(schemaURI,jsonObj, fragment=fragment)
	elif isinstance(jsonObj,list):
		_traverse_list(schemaURI,jsonObj)
	
	return refSchemaListSet
Example #28
 def getfeeduri(cls, url):
     return uritools.uridefrag(Extension.ext_name + '+' + url).uri