Exemplo n.º 1
0
def pcurl(url):
    """Download *url* and record the provenance of the retrieval as RDF.

    The function follows HTTP 3xx redirects, linking each visited Work
    resource to the next through its ``irw_redirectsTo`` attribute. The
    body of the final response is written to a file named after the last
    non-empty path segment of the final URL (relative to the current
    working directory). The store's graph is then serialized as Turtle
    into ``<filename>.prov.ttl``.

    Raises:
        Exception: "Redirect loop" when a redirect points at a URL that
            was already visited, or the response reason when the final
            status is not 200.
    """
    ns.register(workurl=url+'#')
    pStore = Store(reader="rdflib", writer="rdflib",
                   rdflib_store='IOMemory')
    pSession = Session(pStore)
    Work = pSession.get_class(ns.FRBR['Work'])
    Agent = pSession.get_class(ns.PROV['Agent'])

    controller = getController(Agent)

    work = Work(url)
    works = set([url])  # every URL seen so far, used to detect loops
    response = getResponse(url)
    content = response.read()

    # Walk the redirect chain, recording one Work per hop.
    while 300 <= response.status < 400:
        newURL = response.msg.dict['location']
        if newURL in works:
            raise Exception("Redirect loop")
        works.add(newURL)
        newWork = Work(newURL)
        newWork.save()
        work.irw_redirectsTo.append(newWork)
        work.save()
        work = newWork
        response = getResponse(work.subject)
        content = response.read()
    if response.status != 200:
        raise Exception(response.reason)
    pSession.commit()

    # From here on ``work`` is the final (post-redirect) Work.
    workURI = str(work.subject)
    FileHash = work.session.get_class(ns.NFO['FileHash'])
    Response = work.session.get_class(ns.FRIR['HTTP_1_1_Response'])
    Get = work.session.get_class(ns.FRIR['HTTP_1_1_GET'])

    # Name the local copy after the last non-empty path segment. URLs such
    # as "http://host/" have none, which previously produced an empty file
    # name and made open() fail; fall back to a fixed name instead.
    segments = [part for part in urlparse(workURI).path.split("/") if part]
    filename = segments[-1] if segments else "index.html"

    with open(filename, "wb+") as f:
        f.write(content)

    # Close the handle once fstack has returned instead of leaking it.
    with open(filename, 'rb+') as localFile:
        pStore, localItem = fstack(localFile, filename, workURI, pStore,
                                   response.msg.dict['content-type'])

    itemHashValue = createItemHash(url, response, content)

    # ``item`` describes the HTTP response the local copy was made from.
    item = Response(ns.PITEM['-'.join(itemHashValue)])
    item.frir_hasHeader = ''.join(response.msg.headers)
    item.nfo_hasHash.append(createHashInstance(itemHashValue, FileHash))
    item.dcterms_date = dateutil.parser.parse(response.msg.dict['date'])
    item.frbr_exemplarOf = localItem.frbr_exemplarOf

    localItem.frbr_reproductionOf.append(item)

    # The GET execution that used the response and generated the local item.
    getPE = Get()
    getPE.dcterms_date = localItem.dcterms_date
    getPE.prov_used.append(ns.FRIR['HTTP_1_1_GET'])
    getPE.prov_wasControlledBy = controller
    getPE.prov_used.append(item)
    localItem.prov_wasGeneratedBy = getPE

    item.save()
    localItem.save()
    getPE.save()

    pSession.commit()
    bindPrefixes(pStore.reader.graph)

    # Open the provenance file only once the graph is complete, so a failure
    # above does not leave a truncated/empty file behind, and make sure it is
    # flushed and closed.
    with open(filename + ".prov.ttl", "wb+") as provF:
        provF.write(pStore.reader.graph.serialize(format="turtle"))
Exemplo n.º 2
0
def pcurl(url):
    """Download *url* and record the provenance of the retrieval as RDF.

    The function follows HTTP 3xx redirects, linking each visited Work
    resource to the next through its ``irw_redirectsTo`` attribute. The
    body of the final response is written to a file named after the last
    non-empty path segment of the final URL (relative to the current
    working directory). The exchange is described with ``ns.HTTP``
    Request/Response resources, with one ResponseHeader per response
    header field, and the store's graph is serialized as Turtle into
    ``<filename>.prov.ttl``.

    Raises:
        Exception: "Redirect loop" when a redirect points at a URL that
            was already visited, or the response reason when the final
            status is not 200.
    """
    ns.register(workurl=url+'#')
    pStore = Store(reader="rdflib", writer="rdflib",
                   rdflib_store='IOMemory')
    pSession = Session(pStore)
    Work = pSession.get_class(ns.FRBR['Work'])
    Agent = pSession.get_class(ns.PROV['Agent'])

    controller = getController(Agent)

    work = Work(url)
    works = set([url])  # every URL seen so far, used to detect loops
    response = getResponse(url)
    content = response.read()

    # Walk the redirect chain, recording one Work per hop.
    while 300 <= response.status < 400:
        newURL = response.msg.dict['location']
        if newURL in works:
            raise Exception("Redirect loop")
        works.add(newURL)
        newWork = Work(newURL)
        newWork.save()
        work.irw_redirectsTo.append(newWork)
        work.save()
        work = newWork
        response = getResponse(work.subject)
        content = response.read()
    if response.status != 200:
        raise Exception(response.reason)
    pSession.commit()

    # From here on ``work`` is the final (post-redirect) Work.
    workURI = str(work.subject)
    FileHash = work.session.get_class(ns.NFO['FileHash'])
    Request = work.session.get_class(ns.HTTP['Request'])
    Response = work.session.get_class(ns.HTTP['Response'])
    ResponseHeader = work.session.get_class(ns.HTTP['ResponseHeader'])
    Method = work.session.get_class(ns.HTTP["Method"])
    GET = Method(ns.METHOD["GET"])

    # Name the local copy after the last non-empty path segment. URLs such
    # as "http://host/" have none, which previously raised IndexError; fall
    # back to a fixed name instead.
    segments = [part for part in urlparse(workURI).path.split("/") if part]
    filename = segments[-1] if segments else "index.html"

    with open(filename, "wb+") as f:
        f.write(content)

    mimetype = response.msg.dict['content-type']
    # Close the handle once fstack has returned instead of leaking it.
    with open(filename, 'rb+') as localFile:
        pStore, localItem = fstack(localFile, filename, workURI, pStore,
                                   mimetype)

    itemHashValue = createItemHash(url, response, content)

    # ``item`` describes the HTTP response the local copy was made from.
    item = Response(ns.PITEM['-'.join(itemHashValue[:2])])
    item.http_httpVersion = '1.1'
    for field, value in response.msg.dict.items():
        header = ResponseHeader()
        header.http_fieldName = field
        header.http_fieldValue = value
        header.http_hdrName = ns.HEADER[field.lower()]
        header.save()
        item.http_headers.append(header)
    item.nfo_hasHash.append(createHashInstance(itemHashValue, FileHash))
    item.dcterms_date = dateutil.parser.parse(response.msg.dict['date'])
    item.frbr_exemplarOf = localItem.frbr_exemplarOf

    localItem.frbr_reproductionOf.append(item)

    # The GET request that used the response and generated the local item.
    getPE = Request()
    # The HTTP-in-RDF property linking a request to its method is spelled
    # "mthd"; the previous "http_methd" attribute produced a predicate that
    # the vocabulary does not define.
    getPE.http_mthd = GET
    getPE.http_requestURI = workURI
    getPE.dcterms_date = localItem.dcterms_date
    getPE.prov_hadRecipe.append(GET)
    getPE.prov_wasControlledBy = controller
    getPE.prov_used.append(item)
    getPE.http_resp = item
    localItem.prov_wasGeneratedBy = getPE

    item.save()
    localItem.save()
    getPE.save()

    pSession.commit()
    bindPrefixes(pStore.reader.graph)

    # Open the provenance file only once the graph is complete, so a failure
    # above does not leave a truncated/empty file behind, and make sure it is
    # flushed and closed.
    with open(filename + ".prov.ttl", "wb+") as provF:
        provF.write(pStore.reader.graph.serialize(format="turtle"))
Exemplo n.º 3
0
def pcurl(url):
    """Download *url* and record the provenance of the retrieval as RDF.

    The function follows HTTP 3xx redirects, linking each visited Work
    resource to the next through its ``irw_redirectsTo`` attribute. The
    body of the final response is written to a file named after the last
    non-empty path segment of the final URL (relative to the current
    working directory). The exchange is described with ``ns.HTTP``
    Request/Response resources, with one ResponseHeader per response
    header field, and the store's graph is serialized as Turtle into
    ``<filename>.prov.ttl``.

    Raises:
        Exception: "Redirect loop" when a redirect points at a URL that
            was already visited, or the response reason when the final
            status is not 200.
    """
    ns.register(workurl=url + '#')
    pStore = Store(reader="rdflib", writer="rdflib", rdflib_store='IOMemory')
    pSession = Session(pStore)
    Work = pSession.get_class(ns.FRBR['Work'])
    Agent = pSession.get_class(ns.PROV['Agent'])

    controller = getController(Agent)

    work = Work(url)
    works = set([url])  # every URL seen so far, used to detect loops
    response = getResponse(url)
    content = response.read()

    # Walk the redirect chain, recording one Work per hop.
    while 300 <= response.status < 400:
        newURL = response.msg.dict['location']
        if newURL in works:
            raise Exception("Redirect loop")
        works.add(newURL)
        newWork = Work(newURL)
        newWork.save()
        work.irw_redirectsTo.append(newWork)
        work.save()
        work = newWork
        response = getResponse(work.subject)
        content = response.read()
    if response.status != 200:
        raise Exception(response.reason)
    pSession.commit()

    # From here on ``work`` is the final (post-redirect) Work.
    workURI = str(work.subject)
    FileHash = work.session.get_class(ns.NFO['FileHash'])
    Request = work.session.get_class(ns.HTTP['Request'])
    Response = work.session.get_class(ns.HTTP['Response'])
    ResponseHeader = work.session.get_class(ns.HTTP['ResponseHeader'])
    Method = work.session.get_class(ns.HTTP["Method"])
    GET = Method(ns.METHOD["GET"])
    GET.rdfs_label = "HTTP 1.1 GET"

    # Name the local copy after the last non-empty path segment. URLs such
    # as "http://host/" have none, which previously raised IndexError; fall
    # back to a fixed name instead.
    segments = [part for part in urlparse(workURI).path.split("/") if part]
    filename = segments[-1] if segments else "index.html"

    with open(filename, "wb+") as f:
        f.write(content)

    mimetype = response.msg.dict['content-type']
    # Close the handle once fstack has returned instead of leaking it.
    with open(filename, 'rb+') as localFile:
        pStore, localItem = fstack(localFile, filename, workURI,
                                   pStore, mimetype)

    itemHashValue = createItemHash(url, response, content)

    # ``item`` describes the HTTP response the local copy was made from.
    item = Response(ns.PITEM['-'.join(itemHashValue[:2])])
    item.http_httpVersion = '1.1'
    for field, value in response.msg.dict.items():
        header = ResponseHeader()
        header.http_fieldName = field
        header.http_fieldValue = value
        header.http_hdrName = ns.HEADER[field.lower()]
        header.save()
        item.http_headers.append(header)
    item.nfo_hasHash.append(createHashInstance(itemHashValue, FileHash))
    item.dcterms_date = dateutil.parser.parse(response.msg.dict['date'])
    item.frbr_exemplarOf = localItem.frbr_exemplarOf

    localItem.frbr_reproductionOf.append(item)

    # The GET request that used the response and generated the local item.
    getPE = Request()
    # The HTTP-in-RDF property linking a request to its method is spelled
    # "mthd"; the previous "http_methd" attribute produced a predicate that
    # the vocabulary does not define.
    getPE.http_mthd = GET
    getPE.http_requestURI = workURI
    getPE.dcterms_date = localItem.dcterms_date
    getPE.prov_hadPlan.append(GET)
    # NOTE(review): PROV-O defines wasAttributedTo for entities; since getPE
    # is used as an activity here (prov_used / prov_wasGeneratedBy),
    # wasAssociatedWith may have been intended -- confirm before changing.
    getPE.prov_wasAttributedTo = controller
    getPE.prov_used.append(item)
    getPE.http_resp = item
    localItem.prov_wasGeneratedBy = getPE

    item.save()
    localItem.save()
    getPE.save()

    pSession.commit()
    bindPrefixes(pStore.reader.graph)

    # Open the provenance file only once the graph is complete, so a failure
    # above does not leave a truncated/empty file behind, and make sure it is
    # flushed and closed.
    with open(filename + ".prov.ttl", "wb+") as provF:
        provF.write(pStore.reader.graph.serialize(format="turtle"))