def testParseFromString(self):
    """Parse a full URL string and check every component plus the round trip."""
    raw_url = "http://google.com:443/search?query=hi#anchor2"

    uri = rdf_standard.URI()
    uri.ParseFromString(raw_url)

    # Per the expected values below, the port stays part of the host field.
    expected_components = (
        ("transport", "http"),
        ("host", "google.com:443"),
        ("path", "/search"),
        ("query", "query=hi"),
        ("fragment", "anchor2"),
    )
    for field_name, expected_value in expected_components:
        self.assertEqual(getattr(uri, field_name), expected_value)

    # Serializing the parsed value must reproduce the input exactly.
    self.assertEqual(uri.SerializeToString(), raw_url)
def testURI(self):
    """Build a URI from keyword fields and check the fields and serialization."""
    uri = rdf_standard.URI(
        transport="http",
        host="google.com",
        path="/index",
        query="q=hi",
        fragment="anchor1",
    )

    # Each constructor argument must be readable back as an attribute.
    expected_components = (
        ("transport", "http"),
        ("host", "google.com"),
        ("path", "/index"),
        ("query", "q=hi"),
        ("fragment", "anchor1"),
    )
    for field_name, expected_value in expected_components:
        self.assertEqual(getattr(uri, field_name), expected_value)

    expected_url = "http://google.com/index?q=hi#anchor1"
    self.assertEqual(uri.SerializeToString(), expected_url)
def Parse(self, stat, file_obj, unused_knowledge_base):
    """Yield one AttributedDict with the file's path and its parsed URIs.

    Args:
      stat: Object whose `pathspec.path` is reported as the filename.
      file_obj: Passed to `self.FindPotentialURIs` to obtain candidate strings.
      unused_knowledge_base: Not used.

    Yields:
      A single `rdf_protodict.AttributedDict` with keys "filename" and "uris".
    """
    accepted = []
    for candidate in self.FindPotentialURIs(file_obj):
        parsed = rdf_standard.URI()
        parsed.ParseFromString(candidate)

        # Without a transport the candidate wasn't actually a valid URL.
        if not parsed.transport:
            continue
        # Either host or path also has to exist for this to be a valid URL.
        if not (parsed.host or parsed.path):
            continue

        accepted.append(parsed)

    yield rdf_protodict.AttributedDict(
        filename=stat.pathspec.path, uris=accepted)
def GenerateSample(self, number=0):
    """Return an http URI whose host is "<number>.example.com".

    Args:
      number: Value interpolated into the host name; defaults to 0.
    """
    sample_host = "%s.example.com" % number
    return rdf_standard.URI(transport="http", host=sample_host)