def pcurl(url):
    """Download ``url`` and write a Turtle provenance record next to it.

    The URL is fetched with ``getResponse``; while the response status is
    3xx the ``location`` header is followed, and every hop is recorded as
    an ``irw_redirectsTo`` link between ``Work`` resources.  The body of
    the final response is saved in the current directory under the last
    non-empty segment of the final URL's path, and the provenance graph
    is serialized as Turtle to ``<filename>.prov.ttl``.

    Args:
        url: The URL to retrieve.

    Raises:
        Exception: ``"Redirect loop"`` if a redirect target was already
            visited, or the response reason if the final status is not 200.
        IOError: If no local filename can be derived from the final URL
            (its path has no non-empty segment).
    """
    ns.register(workurl=url + '#')
    pStore = Store(reader="rdflib", writer="rdflib", rdflib_store='IOMemory')
    pSession = Session(pStore)
    Work = pSession.get_class(ns.FRBR['Work'])
    Agent = pSession.get_class(ns.PROV['Agent'])
    controller = getController(Agent)

    work = Work(url)
    works = set([url])  # every URL visited so far, for loop detection
    response = getResponse(url)
    content = response.read()
    while 300 <= response.status < 400:
        # NOTE(review): a 3xx response without a Location header raises
        # KeyError here.
        newURL = response.msg.dict['location']
        if newURL in works:
            raise Exception("Redirect loop")
        works.add(newURL)
        newWork = Work(newURL)
        newWork.save()
        work.irw_redirectsTo.append(newWork)
        work.save()
        work = newWork
        response = getResponse(work.subject)
        content = response.read()
    if response.status != 200:
        raise Exception(response.reason)
    pSession.commit()

    # From here on ``work`` is the final redirect target, not the URL
    # originally requested.
    workURI = str(work.subject)
    FileHash = work.session.get_class(ns.NFO['FileHash'])
    Response = work.session.get_class(ns.FRIR['HTTP_1_1_Response'])
    Get = work.session.get_class(ns.FRIR['HTTP_1_1_GET'])

    # Use the last non-empty path segment so that URLs ending in "/" still
    # yield a usable filename.
    segments = [part for part in urlparse(workURI).path.split("/") if part]
    if not segments:
        raise IOError("cannot derive a local filename from " + workURI)
    filename = segments[-1]

    with open(filename, "wb+") as out:
        out.write(content)

    # The handle is closed as soon as fstack returns.
    # NOTE(review): assumes fstack does not keep the handle for later use.
    with open(filename, 'rb+') as payload:
        pStore, localItem = fstack(payload, filename, workURI, pStore,
                                   response.msg.dict['content-type'])

    itemHashValue = createItemHash(url, response, content)
    item = Response(ns.PITEM['-'.join(itemHashValue)])
    item.frir_hasHeader = ''.join(response.msg.headers)
    item.nfo_hasHash.append(createHashInstance(itemHashValue, FileHash))
    item.dcterms_date = dateutil.parser.parse(response.msg.dict['date'])
    item.frbr_exemplarOf = localItem.frbr_exemplarOf
    localItem.frbr_reproductionOf.append(item)

    getPE = Get()
    getPE.dcterms_date = localItem.dcterms_date
    getPE.prov_used.append(ns.FRIR['HTTP_1_1_GET'])
    getPE.prov_wasControlledBy = controller
    getPE.prov_used.append(item)
    localItem.prov_wasGeneratedBy = getPE

    item.save()
    localItem.save()
    getPE.save()
    pSession.commit()

    bindPrefixes(pStore.reader.graph)
    # Open the provenance file only once there is something to write, and
    # close it deterministically.
    with open(filename + ".prov.ttl", "wb+") as provF:
        provF.write(pStore.reader.graph.serialize(format="turtle"))
def pcurl(url):
    """Download ``url`` and write a Turtle provenance record next to it.

    The URL is fetched with ``getResponse``; while the response status is
    3xx the ``location`` header is followed, and every hop is recorded as
    an ``irw_redirectsTo`` link between ``Work`` resources.  The body of
    the final response is saved in the current directory under the last
    non-empty segment of the final URL's path.  The response (with one
    ``ResponseHeader`` per header field) and the GET request are described
    with the ``ns.HTTP`` classes, and the resulting graph is serialized as
    Turtle to ``<filename>.prov.ttl``.

    Args:
        url: The URL to retrieve.

    Raises:
        Exception: ``"Redirect loop"`` if a redirect target was already
            visited, or the response reason if the final status is not 200.
        IndexError: If no local filename can be derived from the final URL
            (its path has no non-empty segment).
    """
    ns.register(workurl=url + '#')
    pStore = Store(reader="rdflib", writer="rdflib", rdflib_store='IOMemory')
    pSession = Session(pStore)
    Work = pSession.get_class(ns.FRBR['Work'])
    Agent = pSession.get_class(ns.PROV['Agent'])
    controller = getController(Agent)

    work = Work(url)
    works = set([url])  # every URL visited so far, for loop detection
    response = getResponse(url)
    content = response.read()
    while 300 <= response.status < 400:
        # NOTE(review): a 3xx response without a Location header raises
        # KeyError here.
        newURL = response.msg.dict['location']
        if newURL in works:
            raise Exception("Redirect loop")
        works.add(newURL)
        newWork = Work(newURL)
        newWork.save()
        work.irw_redirectsTo.append(newWork)
        work.save()
        work = newWork
        response = getResponse(work.subject)
        content = response.read()
    if response.status != 200:
        raise Exception(response.reason)
    pSession.commit()

    # From here on ``work`` is the final redirect target, not the URL
    # originally requested.
    workURI = str(work.subject)
    FileHash = work.session.get_class(ns.NFO['FileHash'])
    Request = work.session.get_class(ns.HTTP['Request'])
    Response = work.session.get_class(ns.HTTP['Response'])
    ResponseHeader = work.session.get_class(ns.HTTP['ResponseHeader'])
    Method = work.session.get_class(ns.HTTP["Method"])
    GET = Method(ns.METHOD["GET"])

    # Last non-empty path segment, so URLs ending in "/" still work.
    segments = [part for part in urlparse(workURI).path.split("/") if part]
    if not segments:
        raise IndexError("cannot derive a local filename from " + workURI)
    filename = segments[-1]

    with open(filename, "wb+") as out:
        out.write(content)

    mimetype = response.msg.dict['content-type']
    # The handle is closed as soon as fstack returns.
    # NOTE(review): assumes fstack does not keep the handle for later use.
    with open(filename, 'rb+') as payload:
        pStore, localItem = fstack(payload, filename, workURI, pStore,
                                   mimetype)

    itemHashValue = createItemHash(url, response, content)
    # Only the first two components of the hash value go into the item URI.
    item = Response(ns.PITEM['-'.join(itemHashValue[:2])])
    item.http_httpVersion = '1.1'
    for field, value in response.msg.dict.items():
        header = ResponseHeader()
        header.http_fieldName = field
        header.http_fieldValue = value
        header.http_hdrName = ns.HEADER[field.lower()]
        header.save()
        item.http_headers.append(header)
    item.nfo_hasHash.append(createHashInstance(itemHashValue, FileHash))
    item.dcterms_date = dateutil.parser.parse(response.msg.dict['date'])
    item.frbr_exemplarOf = localItem.frbr_exemplarOf
    localItem.frbr_reproductionOf.append(item)

    getPE = Request()
    # NOTE(review): the W3C "HTTP Vocabulary in RDF" appears to name this
    # property ``mthd``; ``methd`` may be a typo.  Left unchanged because
    # fixing it alters the emitted RDF -- confirm before renaming.
    getPE.http_methd = GET
    getPE.http_requestURI = workURI
    getPE.dcterms_date = localItem.dcterms_date
    getPE.prov_hadRecipe.append(GET)
    getPE.prov_wasControlledBy = controller
    getPE.prov_used.append(item)
    getPE.http_resp = item
    localItem.prov_wasGeneratedBy = getPE

    item.save()
    localItem.save()
    getPE.save()
    pSession.commit()

    bindPrefixes(pStore.reader.graph)
    # Open the provenance file only once there is something to write, and
    # close it deterministically.
    with open(filename + ".prov.ttl", "wb+") as provF:
        provF.write(pStore.reader.graph.serialize(format="turtle"))
def pcurl(url):
    """Download ``url`` and write a Turtle provenance record next to it.

    The URL is fetched with ``getResponse``; while the response status is
    3xx the ``location`` header is followed, and every hop is recorded as
    an ``irw_redirectsTo`` link between ``Work`` resources.  The body of
    the final response is saved in the current directory under the last
    non-empty segment of the final URL's path.  The response (with one
    ``ResponseHeader`` per header field) and the GET request are described
    with the ``ns.HTTP`` classes, and the resulting graph is serialized as
    Turtle to ``<filename>.prov.ttl``.

    Args:
        url: The URL to retrieve.

    Raises:
        Exception: ``"Redirect loop"`` if a redirect target was already
            visited, or the response reason if the final status is not 200.
        IndexError: If no local filename can be derived from the final URL
            (its path has no non-empty segment).
    """
    ns.register(workurl=url + '#')
    pStore = Store(reader="rdflib", writer="rdflib", rdflib_store='IOMemory')
    pSession = Session(pStore)
    Work = pSession.get_class(ns.FRBR['Work'])
    Agent = pSession.get_class(ns.PROV['Agent'])
    controller = getController(Agent)

    work = Work(url)
    works = set([url])  # every URL visited so far, for loop detection
    response = getResponse(url)
    content = response.read()
    while 300 <= response.status < 400:
        # NOTE(review): a 3xx response without a Location header raises
        # KeyError here.
        newURL = response.msg.dict['location']
        if newURL in works:
            raise Exception("Redirect loop")
        works.add(newURL)
        newWork = Work(newURL)
        newWork.save()
        work.irw_redirectsTo.append(newWork)
        work.save()
        work = newWork
        response = getResponse(work.subject)
        content = response.read()
    if response.status != 200:
        raise Exception(response.reason)
    pSession.commit()

    # From here on ``work`` is the final redirect target, not the URL
    # originally requested.
    workURI = str(work.subject)
    FileHash = work.session.get_class(ns.NFO['FileHash'])
    Request = work.session.get_class(ns.HTTP['Request'])
    Response = work.session.get_class(ns.HTTP['Response'])
    ResponseHeader = work.session.get_class(ns.HTTP['ResponseHeader'])
    Method = work.session.get_class(ns.HTTP["Method"])
    GET = Method(ns.METHOD["GET"])
    GET.rdfs_label = "HTTP 1.1 GET"

    # Last non-empty path segment, so URLs ending in "/" still work.
    segments = [part for part in urlparse(workURI).path.split("/") if part]
    if not segments:
        raise IndexError("cannot derive a local filename from " + workURI)
    filename = segments[-1]

    with open(filename, "wb+") as out:
        out.write(content)

    mimetype = response.msg.dict['content-type']
    # The handle is closed as soon as fstack returns.
    # NOTE(review): assumes fstack does not keep the handle for later use.
    with open(filename, 'rb+') as payload:
        pStore, localItem = fstack(payload, filename, workURI, pStore,
                                   mimetype)

    itemHashValue = createItemHash(url, response, content)
    # Only the first two components of the hash value go into the item URI.
    item = Response(ns.PITEM['-'.join(itemHashValue[:2])])
    item.http_httpVersion = '1.1'
    for field, value in response.msg.dict.items():
        header = ResponseHeader()
        header.http_fieldName = field
        header.http_fieldValue = value
        header.http_hdrName = ns.HEADER[field.lower()]
        header.save()
        item.http_headers.append(header)
    item.nfo_hasHash.append(createHashInstance(itemHashValue, FileHash))
    item.dcterms_date = dateutil.parser.parse(response.msg.dict['date'])
    item.frbr_exemplarOf = localItem.frbr_exemplarOf
    localItem.frbr_reproductionOf.append(item)

    getPE = Request()
    # NOTE(review): the W3C "HTTP Vocabulary in RDF" appears to name this
    # property ``mthd``; ``methd`` may be a typo.  Left unchanged because
    # fixing it alters the emitted RDF -- confirm before renaming.
    getPE.http_methd = GET
    getPE.http_requestURI = workURI
    getPE.dcterms_date = localItem.dcterms_date
    getPE.prov_hadPlan.append(GET)
    getPE.prov_wasAttributedTo = controller
    getPE.prov_used.append(item)
    getPE.http_resp = item
    localItem.prov_wasGeneratedBy = getPE

    item.save()
    localItem.save()
    getPE.save()
    pSession.commit()

    bindPrefixes(pStore.reader.graph)
    # Open the provenance file only once there is something to write, and
    # close it deterministically.
    with open(filename + ".prov.ttl", "wb+") as provF:
        provF.write(pStore.reader.graph.serialize(format="turtle"))