def testURI(self):
    """Check that URI fields passed to the constructor read back unchanged."""
    # NOTE(review): a second `testURI` definition is visible further down in
    # this source; if both live in the same class the later one shadows this
    # one -- confirm.
    expected_fields = {
        "transport": "http",
        "host": "google.com",
        "path": "/index",
        "query": "q=hi",
        "fragment": "anchor1",
    }
    uri = rdf_standard.URI(**expected_fields)

    # Each constructor argument must be exposed as an attribute of equal value.
    for field_name, field_value in expected_fields.items():
        self.assertEqual(getattr(uri, field_name), field_value)
def testHumanReadable(self):
    """Check parsing a human-readable URL and serializing it back."""
    url = "http://google.com:443/search?query=hi#anchor2"
    parsed = rdf_standard.URI()
    parsed.ParseFromHumanReadable(url)

    # Expected components after parsing; the port is expected to stay part of
    # the host field.
    expected_components = (
        ("transport", "http"),
        ("host", "google.com:443"),
        ("path", "/search"),
        ("query", "query=hi"),
        ("fragment", "anchor2"),
    )
    for component, value in expected_components:
        self.assertEqual(getattr(parsed, component), value)

    # Serializing must reproduce the original URL text.
    self.assertEqual(parsed.SerializeToHumanReadable(), url)
def testByteString(self):
    """Check parsing a UTF-8 encoded, non-ASCII URI and a bytes round trip."""
    raw_uri = "http://gógiel.pl:1337/znajdź?frazę=🦋#nagłówek"
    encoded_uri = raw_uri.encode("utf-8")

    uri = rdf_standard.URI()
    uri.ParseFromBytes(encoded_uri)

    # Non-ASCII characters are expected to survive in every component.
    expected_components = (
        ("transport", "http"),
        ("host", "gógiel.pl:1337"),
        ("path", "/znajdź"),
        ("query", "frazę=🦋"),
        ("fragment", "nagłówek"),
    )
    for component, value in expected_components:
        self.assertEqual(getattr(uri, component), value)

    # Serializing to bytes and reading back must give an equal URI.
    serialized = uri.SerializeToBytes()
    restored = uri.FromSerializedBytes(serialized)
    self.assertEqual(restored, uri)
def testURI(self):
    """Check URI field access and string serialization."""
    # NOTE(review): another `testURI` definition is visible earlier in this
    # source; if both live in the same class this one shadows the earlier
    # one -- confirm.
    expected_fields = {
        "transport": "http",
        "host": "google.com",
        "path": "/index",
        "query": "q=hi",
        "fragment": "anchor1",
    }
    uri = rdf_standard.URI(**expected_fields)

    # Each constructor argument must be exposed as an attribute of equal value.
    for field_name, field_value in expected_fields.items():
        self.assertEqual(getattr(uri, field_name), field_value)

    # The components must be assembled into the expected URL string.
    expected_url = "http://google.com/index?q=hi#anchor1"
    self.assertEqual(uri.SerializeToString(), expected_url)
def Parse(self, stat, file_obj, unused_knowledge_base):
    """Extract the valid URIs found in a file.

    Args:
      stat: Object whose `pathspec.path` gives the name of the parsed file.
      file_obj: File object handed to `self.FindPotentialURIs`.
      unused_knowledge_base: Not used by this method.

    Yields:
      A single `rdf_protodict.AttributedDict` with the keys `filename` and
      `uris` (the list of accepted `rdf_standard.URI` objects).
    """
    accepted = []
    for candidate in self.FindPotentialURIs(file_obj):
        parsed = rdf_standard.URI()
        parsed.ParseFromString(candidate)

        # Without a transport the candidate was not actually a valid URL.
        if not parsed.transport:
            continue
        # A valid URL also needs at least a host or a path.
        if not (parsed.host or parsed.path):
            continue
        accepted.append(parsed)

    yield rdf_protodict.AttributedDict(
        filename=stat.pathspec.path,
        uris=accepted,
    )
def ParseFile(self, knowledge_base, pathspec, filedesc):
    """Extract the valid URIs found in a file.

    Args:
      knowledge_base: Not used by this method.
      pathspec: Object whose `path` gives the name of the parsed file.
      filedesc: File object handed to `self.FindPotentialURIs`.

    Yields:
      A single `rdf_protodict.AttributedDict` with the keys `filename` and
      `uris` (the list of accepted `rdf_standard.URI` objects).
    """
    del knowledge_base  # Unused.

    accepted = []
    for candidate in self.FindPotentialURIs(filedesc):
        parsed = rdf_standard.URI()
        parsed.ParseFromHumanReadable(candidate)

        # Skip candidates with no transport (not actually a valid URL) and
        # those with neither a host nor a path.
        if not parsed.transport or not (parsed.host or parsed.path):
            continue
        accepted.append(parsed)

    yield rdf_protodict.AttributedDict(
        filename=pathspec.path,
        uris=accepted,
    )
def GenerateSample(self, number=0):
    """Build a sample `rdf_standard.URI`.

    Args:
      number: Value formatted into the host as `<number>.example.com`.

    Returns:
      A URI with transport "http" and the generated host.
    """
    sample_host = "%s.example.com" % number
    return rdf_standard.URI(transport="http", host=sample_host)