예제 #1
0
 def testXmlPrintLxmlPrettyPrintFalse(self):
     # With pretty_print=False the tree passed as kwarg 'lxmlNode' must reach
     # the observer as kwarg 'data', serialized exactly as the input string.
     observable = Observable()
     xmlprintlxml = XmlPrintLxml(fromKwarg='lxmlNode', toKwarg="data", pretty_print=False)
     observer = CallTrace('observer', emptyGeneratorMethods=['someMessage'])
     xmlprintlxml.addObserver(observer)
     observable.addObserver(xmlprintlxml)
     list(compose(observable.all.someMessage(lxmlNode=parse(StringIO('<a><b>“c</b></a>')))))
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(['someMessage'], observer.calledMethodNames())
     # list(...) keeps the comparison valid where keys() returns a view
     # instead of a list.
     self.assertEqual(['data'], list(observer.calledMethods[0].kwargs.keys()))
     self.assertEqual('''<a><b>“c</b></a>''', observer.calledMethods[0].kwargs['data'])
예제 #2
0
    def testTransparency(self):
        # Chain parse -> print -> parse -> print and check that both traced
        # observers receive the same document on kwarg 'data'; the expected
        # text is the input without its XML declaration.
        firstTrace = CallTrace('lxml')
        secondTrace = CallTrace('lxml2')
        observable = be(
            (Observable(),
                (XmlParseLxml(fromKwarg='data', toKwarg='lxmlNode'),
                    (XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data'),
                        (firstTrace,),
                        (XmlParseLxml(fromKwarg='data', toKwarg='lxmlNode'),
                            (XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data'),
                                (secondTrace,),
                            )
                        )
                    )
                )
            )
        )

        observable.do.something(
            identifier='identifier',
            partname='partName',
            data='<?xml version="1.0"?><a><b>c</b></a>')

        # Same assertion for the observer after the first print and the one
        # after the second print.
        for trace in (firstTrace, secondTrace):
            printed = trace.calledMethods[0].kwargs['data'].decode()
            self.assertEqualsWS('<a><b>c</b></a>', printed)
예제 #3
0
    def testMissingFromKwargDoesNothing(self):
        # The tree is passed as third positional argument, not as kwarg
        # 'lxmlNode', so the observer must still see an lxml tree there.
        observer = CallTrace()
        dna = (Observable(),
            (XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data'),
                (observer,),
            )
        )
        observable = be(dna)

        observable.do.something('identifier', 'partname', parse(StringIO('<a/>')))

        calls = observer.calledMethods
        self.assertEqual(1, len(calls))
        thirdArgType = type(calls[0].args[2])
        self.assertEqual("<class 'lxml.etree._ElementTree'>", str(thirdArgType))
예제 #4
0
 def testToKwargDefaultsToFromKwarg(self):
     # Only fromKwarg is given; the printed XML is expected back under the
     # same kwarg name ('data').
     observer = CallTrace()
     observable = be(
         (Observable(),
             (XmlPrintLxml(fromKwarg='data'),
                 (observer,),
             )
         )
     )
     observable.do.something('identifier', 'partname', data=parse(StringIO('<someXml/>')))
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual("something('identifier', 'partname', data='<someXml/>\n')", str(observer.calledMethods[0]))
예제 #5
0
    def testRenameKwargOnConvert(self):
        # The kwarg 'lxmlNode' must arrive at the observer printed and renamed
        # to 'dataString'; calls without 'lxmlNode' must pass through as-is.
        observer = CallTrace()
        observable = be(
            (Observable(),
                (XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='dataString'),
                    (observer,),
                )
            )
        )
        observable.do.something('identifier', 'partname', lxmlNode=parse(StringIO('<someXml/>')))
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual("something('identifier', 'partname', dataString='<someXml/>\n')", str(observer.calledMethods[0]))

        # Second call: no 'lxmlNode' kwarg present, other kwargs are untouched.
        observable.do.something('identifier', 'partname', someKwarg=1)
        self.assertEqual("something('identifier', 'partname', someKwarg=1)", str(observer.calledMethods[1]))
예제 #6
0
def uploadHelix(lucene, termNumerator, storageComponent, drilldownFields,
                fieldRegistry):
    """Build the nested-tuple observer tree rooted at SruRecordUpdate.

    Layout of the returned tree:

    * 'delete' messages are passed to lucene and storageComponent.
    * 'add' messages go through Xml2Fields; field names are reduced to the
      part after the first '.', then offered to the shared indexing branch
      three ways: every field whose name lacks 'fieldHier', 'intfield1'
      prefixed with SORTED_PREFIX, and 'field2'/'field3' prefixed with
      UNTOKENIZED_PREFIX.  FieldHier feeds the same indexing branch.
    * XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data') sits in front of
      storageComponent, outside both message filters.

    drilldownFields is accepted but not referenced in this function.
    """
    # One shared branch object, reused at every indexing leaf below.
    indexingBranch = (
        Fields2LuceneDoc('record', fieldRegistry=fieldRegistry),
        (termNumerator,),
        (lucene,),
    )

    def afterFirstDot(name):
        # 'a.b.c' -> 'b.c'; names without a dot are returned unchanged.
        return name.split('.', 1)[-1]

    def isNotHierarchyField(name):
        return 'fieldHier' not in name

    def isSortField(name):
        return name == 'intfield1'

    def isUntokenizedField(name):
        return name in ['field2', 'field3']

    def addSortedPrefix(name):
        return SORTED_PREFIX + name

    def addUntokenizedPrefix(name):
        return UNTOKENIZED_PREFIX + name

    return (
        SruRecordUpdate(),
        (
            TransactionScope('record'),
            (
                Venturi(
                    should=[{'partname': 'record', 'xpath': '.'}],
                    namespaces={'doc': 'http://meresco.org/namespace/example'},
                ),
                (
                    FilterMessages(allowed=['delete']),
                    (lucene,),
                    (storageComponent,),
                ),
                (
                    FilterMessages(allowed=['add']),
                    (
                        Xml2Fields(),
                        (
                            RenameField(afterFirstDot),
                            (FilterField(isNotHierarchyField), indexingBranch),
                            (
                                FilterField(isSortField),
                                (RenameField(addSortedPrefix), indexingBranch),
                            ),
                            (
                                FilterField(isUntokenizedField),
                                (RenameField(addUntokenizedPrefix), indexingBranch),
                            ),
                        ),
                    ),
                    (FieldHier(), indexingBranch),
                ),
                (
                    XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data'),
                    (storageComponent,),
                ),
            ),
        ),
    )
예제 #7
0
def createDownloadHelix(reactor, periodicDownload, oaiDownload,
                        storageComponent, oaiJazz):
    """Return the nested-tuple observer tree rooted at periodicDownload.

    The downloaded 'data' is parsed to 'lxmlNode', handed to oaiDownload and
    UpdateAdapterFromOaiDownloadProcessor, and then split by message name:

    * 'delete': forwarded to storageComponent and oaiJazz, followed by
      WriteTombstone in front of storageComponent.
    * 'add': the "normdoc", "record" and "meta" document parts are selected
      with XmlXPath, renamed with RewritePartname and sent to
      storageComponent; OaiAddRecord is placed in front of storageComponent
      and oaiJazz, and ResurrectTombstone in front of storageComponent.

    reactor is accepted but not referenced in this function.
    """
    return \
    (periodicDownload, # Scheduled connection to a remote (response / request)...
        (XmlParseLxml(fromKwarg="data", toKwarg="lxmlNode", parseOptions=dict(huge_tree=True, remove_blank_text=True)), # Convert from plain text to lxml-object.
            (oaiDownload, # Implementation/Protocol of a PeriodicDownload...
                (UpdateAdapterFromOaiDownloadProcessor(), # Turns an SRU update/delete message (lxmlNode) into the relevant message: a 'delete' or an 'add' message.
                    (FilterMessages(['delete']), # Filters delete messages
                        # (LogComponent("Delete Update"),),
                        (storageComponent,), # Delete from storage
                        (oaiJazz,), # Delete from OAI-pmh repo
                        # Write a 'deleted' part to the storage, that holds the (Record)uploadId.
                        (WriteTombstone(),
                            (storageComponent,),
                        )
                    ),
                    (FilterMessages(allowed=['add']),
                        # (LogComponent("ADD"),),

                        (XmlXPath(['//document:document/document:part[@name="normdoc"]/text()'], fromKwarg='lxmlNode', toKwarg='data', namespaces=NAMESPACEMAP),
                            # (LogComponent("NORMDOC"),),
                            (XmlParseLxml(fromKwarg='data', toKwarg='lxmlNode'),
                                (RewritePartname(NL_DIDL_NORMALISED_PREFIX), # Renames the partname. NOTE(review): the original comment said 'record' -> "metadata", but the argument here is NL_DIDL_NORMALISED_PREFIX.
                                    (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                        (storageComponent,) # Writes oai:metadata (=original) to storage. NOTE(review): this branch selects the "normdoc" part - confirm this comment is still accurate.
                                    )
                                )
                            )
                        ),

                        (XmlXPath(['//document:document/document:part[@name="record"]/text()'], fromKwarg='lxmlNode', toKwarg='data', namespaces=NAMESPACEMAP),
                            (XmlParseLxml(fromKwarg='data', toKwarg='lxmlNode'),
                                # TODO: Check that, when conversions fail, the meta and header parts are not written to storage either: preferably no part at all...
                                # Write the 'header' partname to storage:
                                (XmlXPath(['/oai:record/oai:header'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP),
                                    (RewritePartname("header"),
                                        (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                            (storageComponent,) # Writes the OAI header to storage.
                                        )
                                    )
                                ),
                                # Write the 'metadata' partname to storage:
                                # On gharvester21 this does not work properly: there the root element in the 'metadata' part is <metadata> instead of <DIDL>.
                                # A child::node() would be preferable here, but that syntax seems to fail in combination with the XmlXPath component??
                                (XmlXPath(['/oai:record/oai:metadata/didl:DIDL'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP),
                                    # (LogComponent("METADATA_PART"),),
                                    (RewritePartname("metadata"),
                                        (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                            (storageComponent,) # Writes the metadata to storage.
                                        )
                                    )
                                )
                            )
                        ),

                        (XmlXPath(['//document:document/document:part[@name="record"]/text()'], fromKwarg='lxmlNode',
                               toKwarg='data', namespaces=NAMESPACEMAP),
                            (XmlParseLxml(fromKwarg='data', toKwarg='lxmlNode'),
                                (NlDidlCombined(nsMap=NAMESPACEMAP, fromKwarg='lxmlNode'),
                                    # Create combined format from stored metadataPart and normalized part.
                                    (XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data'),  # Convert it to plaintext
                                        (RewritePartname(NL_DIDL_COMBINED_PREFIX),  # Rename combined partName
                                            (storageComponent,)  # Write combined partName to storage
                                        )
                                    )
                                )
                            )
                        ),

                        (XmlXPath(['//document:document/document:part[@name="meta"]/text()'], fromKwarg='lxmlNode', toKwarg='data', namespaces=NAMESPACEMAP),
                            (RewritePartname("meta"),
                                (storageComponent,) # Writes the harvester 'meta' data to storage.
                            )
                        ),

                        (OaiAddRecord(metadataPrefixes=[('metadata', 'http://standards.iso.org/ittf/PubliclyAvailableStandards/MPEG-21_schema_files/did/didmodel.xsd', 'urn:mpeg:mpeg21:2002:02-DIDL-NS'),
                                    (NL_DIDL_NORMALISED_PREFIX, '', NAMESPACEMAP.gmhnorm),
                                    (NL_DIDL_COMBINED_PREFIX, '', NAMESPACEMAP.gmhcombined)]), #[(partname, schema, namespace)]
                            # (LogComponent("OaiAddRecord:"),),
                            (storageComponent,),
                            (oaiJazz,) # Assert partNames header and meta are available from storage!
                        ),

                        (ResurrectTombstone(),
                            (storageComponent,),
                        ),

                    ),

                    # (FilterMessages(allowed=['add']),
                    #     # (LogComponent("UnDelete"),),
                    #     (ResurrectTombstone(),
                    #         (storageComponent,),
                    #     )
                    # )
                )
            )
        )
    )
예제 #8
0
def main(reactor, port, statePath, **ignored):
    """Return the nested-tuple server tree behind an ObservableHttpServer.

    Creates an OaiJazz at join(statePath, 'oai') (observed by a
    SuspendRegister) and a StorageComponent at join(statePath, 'store')
    using Md5HashDistributeStrategy.  The HTTP tree has three path branches:

    * '/oaix' (excluding '/oaix/info'), behind IpFilter(['127.0.0.1']):
      OaiPmh backed by oaiJazz, the suspend register and the storage.
    * '/oaix/info', behind the same IpFilter: OaiInfo backed by oaiJazz.
    * '/update': SruRecordUpdate; 'delete' goes to the storage and oaiJazz,
      'add' is normalised, printed, renamed to NORMALISED_DOC_NAME and
      stored, and registered in oaiJazz.

    Extra keyword arguments are accepted and ignored.
    """

    oaiSuspendRegister = SuspendRegister()
    oaiJazz = be((OaiJazz(join(statePath, 'oai')), (oaiSuspendRegister, )))

    # WST:
    # strategie = HashDistributeStrategy() # filename (=partname) is also hashed: difficult to read by human eye...
    strategie = Md5HashDistributeStrategy()

    storeComponent = StorageComponent(
        join(statePath, 'store'),
        strategy=strategie,
        partsRemovedOnDelete=[NORMALISED_DOC_NAME])

    return \
    (Observable(),
        # (scheduledCommitPeriodicCall,),
        # (DebugPrompt(reactor=reactor, port=port+1, globals=locals()),),
        (ObservableHttpServer(reactor=reactor, port=port),
            (BasicHttpHandler(),
                (IpFilter(allowedIps=['127.0.0.1']),
                    (PathFilter('/oaix', excluding=['/oaix/info']),
                        (OaiPmh(repositoryName='Gateway',
                                adminEmail='*****@*****.**',
                                supportXWait=True,
                                batchSize=2000 # Override default batch size of 200.
                            ),
                            (oaiJazz,),
                            (oaiSuspendRegister,),
                            (StorageAdapter(),
                                (storeComponent,),
                            ),
                        )
                    ),
                    (PathFilter('/oaix/info'),
                        # NOTE(review): oaiPath is '/oai' while the OaiPmh branch above is filtered on '/oaix' - confirm this is intended.
                        (OaiInfo(reactor=reactor, oaiPath='/oai'),
                            (oaiJazz,),
                        )
                    ),
                ),
                (PathFilter('/update'),
                    (SruRecordUpdate(sendRecordData=False, logErrors=True,),
                        (FilterMessages(allowed=['delete']),
                            (storeComponent,),
                            (oaiJazz,),
                        ),
                        (FilterMessages(allowed=['add']),

                            # Does not work? See comments in component...
                            # (AddMetadataFormat(fromKwarg="lxmlNode", name='md_format'),
                            #     (LogComponent("AddMetadataFormat"),),
                            # ),
                            (XmlXPath(['srw:recordData/*'], fromKwarg='lxmlNode'), # Forwards EVERY matching node in a new message.
                                # (LogComponent("TO LONG CONVERTER:"),),
                                (AddMetadataNamespace(dateformat="%Y-%m-%dT%H:%M:%SZ", fromKwarg='lxmlNode'), # Adds metadataNamespace to meta part in the message.
                                    (NormaliseOaiRecord(fromKwarg='lxmlNode'), # Normalises record to: long & original parts. Raises ValidationException if no 'known' metadataformat
                                        (XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data', pretty_print=False),
                                            (RewritePartname(NORMALISED_DOC_NAME), # Rename converted part.
                                                (storeComponent,), # Store converted/renamed part.
                                            )
                                        )
                                    ),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[NORMALISED_DOC_NAME]),
                                        (oaiJazz,),
                                    )
                                )
                            )
                        )
                    )
                )
            )
        )
    )
예제 #9
0
def createDownloadHelix(reactor, periodicDownload, oaiDownload,
                        storageComponent, oaiJazz, oai_oa_cerifJazz):
    """Return the nested-tuple observer tree rooted at periodicDownload.

    The downloaded 'data' is parsed to 'lxmlNode', handed to oaiDownload and
    UpdateAdapterFromOaiDownloadProcessor, and then split by message name:

    * 'delete': forwarded to storageComponent, oaiJazz and oai_oa_cerifJazz,
      followed by WriteTombstone in front of storageComponent.
    * 'add': the "record" part is parsed and, per FilterWcpCollection value,
      converted with an XsltCrosswalk (cerif-*.xsl) and stored under
      OPENAIRE_PARTNAME; the original metadata, 'knaw_long', 'knaw_short',
      'oai_dc' and 'header' parts are stored; records are registered in
      oaiJazz (prefix "oai_dc") and oai_oa_cerifJazz (prefix
      OPENAIRE_PARTNAME) with collection-specific setSpecs.  The "meta" part
      is stored separately.
    * a second 'add' filter puts ResurrectTombstone in front of
      storageComponent.

    reactor is accepted but not referenced in this function.
    """
    return \
    (periodicDownload, # Scheduled connection to a remote (response / request)...
        (XmlParseLxml(fromKwarg="data", toKwarg="lxmlNode", parseOptions=dict(huge_tree=True, remove_blank_text=True)), # Convert from plain text to lxml-object.
            (oaiDownload, # Implementation/Protocol of a PeriodicDownload...
                (UpdateAdapterFromOaiDownloadProcessor(), # Turns an SRU update/delete message (lxmlNode) into the relevant message: a 'delete' or an 'add' message.
                    (FilterMessages(['delete']), # Filters delete messages
                        # (LogComponent("Delete Update"),),
                        (storageComponent,), # Delete from storage
                        (oaiJazz,), # Delete from OAI-pmh repo
                        (oai_oa_cerifJazz,),
                        # Write a 'deleted' part to the storage, that holds the (Record)uploadId.
                        (WriteTombstone(),
                            (storageComponent,),
                        )
                    ),
                    (FilterMessages(allowed=['add']),
                        (XmlXPath(['/oai:record/oai:metadata/document:document/document:part[@name="record"]/text()'], fromKwarg='lxmlNode', toKwarg='data', namespaces=NAMESPACEMAP),
                            (XmlParseLxml(fromKwarg='data', toKwarg='lxmlNode'),

                                (FilterWcpCollection(allowed=['research']),
                                    (XmlXPath(['/oai:record/oai:metadata/norm:md_original/child::*'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Original 'metadata' format
                                        (XsltCrosswalk([join(dirname(abspath(__file__)), '..', '..', 'xslt', 'cerif-project.xsl')], fromKwarg="lxmlNode"),
                                            (RewritePartname(OPENAIRE_PARTNAME),
                                                (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                                    (storageComponent,)
                                                )
                                            )
                                        )
                                    )
                                ),

                                (FilterWcpCollection(allowed=['person']),
                                    (XmlXPath(['/oai:record/oai:metadata/norm:md_original/child::*'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Original 'metadata' format
                                        (XsltCrosswalk([join(dirname(abspath(__file__)), '..', '..', 'xslt', 'cerif-person.xsl')], fromKwarg="lxmlNode"),
                                            (RewritePartname(OPENAIRE_PARTNAME),
                                                (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                                    (storageComponent,)
                                                )
                                            )
                                        )
                                    )
                                ),
                                (FilterWcpCollection(allowed=['organisation']),
                                    (XmlXPath(['/oai:record/oai:metadata/norm:md_original/child::*'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Original 'metadata' format
                                        (XsltCrosswalk([join(dirname(abspath(__file__)), '..', '..', 'xslt', 'cerif-orgunit.xsl')], fromKwarg="lxmlNode"),
                                            (RewritePartname(OPENAIRE_PARTNAME),
                                                (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                                    (storageComponent,)
                                                )
                                            )
                                        )
                                    )
                                ),
                                (FilterWcpCollection(allowed=['dataset']), # START CERIF CONVERSION FOR DATASET COLLECTION: cerif-dataset / cerif-software.
                                    (XmlXPath(['/oai:record/oai:metadata/norm:normalized/long:knaw_long'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Normalised 'long' format.
                                        (XsltCrosswalk([join(dirname(abspath(__file__)), '..', '..', 'xslt', 'cerif-product.xsl')], fromKwarg="lxmlNode"),
                                            (RewritePartname(OPENAIRE_PARTNAME),
                                                (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                                    (storageComponent,)
                                                )
                                            )
                                        )
                                    )
                                ),
                                (FilterWcpCollection(allowed=['publication']), # START CERIF CONVERSION FOR PUBLICATIONS COLLECTION
                                    (XmlXPath(['/oai:record/oai:metadata/norm:normalized/long:knaw_long'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Normalised 'long' format.
                                        (FilterKnawLongGenre(allowed=['patent']), # START PATENTS CONVERSION
                                            (XsltCrosswalk([join(dirname(abspath(__file__)), '..', '..', 'xslt', 'cerif-patent.xsl')], fromKwarg="lxmlNode"),
                                                (RewritePartname(OPENAIRE_PARTNAME),
                                                    (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                                        (storageComponent,)
                                                    )
                                                )
                                            )
                                        ),
                                        (FilterKnawLongGenre(disallowed=['patent']), # START Publication CONVERSION
                                            (XsltCrosswalk([join(dirname(abspath(__file__)), '..', '..', 'xslt', 'cerif-publication.xsl')], fromKwarg="lxmlNode"),
                                                (RewritePartname(OPENAIRE_PARTNAME),
                                                    (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                                        (storageComponent,)
                                                    )
                                                )
                                            )
                                        )
                                    )
                                ),
                                (XmlXPath(['/oai:record/oai:metadata/norm:md_original/child::*'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Original 'metadata' format
                                    (RewritePartname("metadata"), # Renames the partname from 'record' to "metadata".
                                        (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                            (storageComponent,) # Writes oai:metadata (=original) to storage.
                                        )
                                    )
                                ),
                                (XmlXPath(['/oai:record/oai:metadata/norm:normalized/long:knaw_long'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Normalised 'long' format.
                                    (RewritePartname("knaw_long"), # Renames the partname from 'record' to "knaw_long".
                                        (FilterWcpCollection(disallowed=['person', 'research', 'organisation']),
                                            (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                                (storageComponent,), # Writes 'long' (=norm:normdoc) to storage.
                                            )
                                        ),
                                        (ShortConverter(fromKwarg='lxmlNode'), # Creates the 'knaw_short' subset format.
                                            (RewritePartname("knaw_short"),
                                                (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                                    (storageComponent,) # Writes 'short' to storage.
                                                )
                                            )
                                        ),
                                        (FilterWcpCollection(disallowed=['person', 'research', 'organisation']),
                                            (DcConverter(fromKwarg='lxmlNode'), # Rename the partname from 'record' to "oai_dc" (done by the RewritePartname below).
                                                (RewritePartname("oai_dc"),
                                                    (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                                        (storageComponent,) # Writes 'oai_dc' to storage.
                                                    )
                                                )
                                            )
                                        )
                                    )
                                ),
                                # TODO: Check that, when conversions fail, the meta and header parts are not written to storage either: preferably not a single part...
                                # Write the 'header' partname to storage:
                                (XmlXPath(['/oai:record/oai:header'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP),
                                    (RewritePartname("header"),
                                        (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                            (storageComponent,) # Writes the OAI header to storage.
                                        )
                                    )
                                ),
                                (FilterWcpCollection(allowed=['publication']),
                                    # (LogComponent("PUBLICATION"),),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['publication']),
                                        (oaiJazz,),
                                    ),
                                    # NOTE(review): the XmlXPath calls in this branch pass namespaceMap=, while the others in this tree pass namespaces= - confirm both keywords are accepted.
                                    (XmlXPath(["//long:knaw_long[long:accessRights ='openAccess']"], fromKwarg='lxmlNode', namespaceMap=NAMESPACEMAP),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['oa_publication', 'openaire']),
                                            (oaiJazz,),
                                        )
                                    ),
                                    (XmlXPath(["//long:knaw_long/long:metadata[long:genre ='doctoralthesis']"], fromKwarg='lxmlNode', namespaceMap=NAMESPACEMAP),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['thesis']),
                                            (oaiJazz,),
                                        )
                                    ),
                                    (XmlXPath(['//long:knaw_long/long:metadata/long:grantAgreements/long:grantAgreement[long:code[contains(.,"greement/EC/") or contains(.,"greement/ec/")]][1]'], fromKwarg='lxmlNode', namespaceMap=NAMESPACEMAP),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['ec_fundedresources', 'openaire']),
                                            (oaiJazz,),
                                        )
                                    ),
                                    (XmlXPath(["//long:knaw_long/long:metadata[long:genre ='patent']"], fromKwarg='lxmlNode', namespaceMap=NAMESPACEMAP),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[OPENAIRE_PARTNAME], setSpecs=["openaire_cris_patents"]),
                                            (oai_oa_cerifJazz,),
                                        )
                                    ),
                                    (XmlXPath(["//long:knaw_long/long:metadata[long:genre !='patent']"], fromKwarg='lxmlNode', namespaceMap=NAMESPACEMAP),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[OPENAIRE_PARTNAME], setSpecs=["openaire_cris_publications"]),
                                            (oai_oa_cerifJazz,),
                                        )
                                    ),
                                ),
                                (FilterWcpCollection(allowed=['dataset']),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['dataset']),
                                        (oaiJazz,),
                                    ),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[OPENAIRE_PARTNAME], setSpecs=["openaire_cris_products"]),
                                        (oai_oa_cerifJazz,),
                                    )
                                ),
                                # Add NOD OpenAIRE Cerif to OpenAIRE-PMH repo.
                                (FilterWcpCollection(allowed=['research']),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[OPENAIRE_PARTNAME], setSpecs=["openaire_cris_projects"]),
                                        (oai_oa_cerifJazz,),
                                    )
                                ),
                                (FilterWcpCollection(allowed=['person']),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[OPENAIRE_PARTNAME], setSpecs=['openaire_cris_persons']),
                                        (oai_oa_cerifJazz,),
                                    )
                                ),
                                (FilterWcpCollection(allowed=['organisation']),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[OPENAIRE_PARTNAME], setSpecs=['openaire_cris_orgunits']),
                                        (oai_oa_cerifJazz,),
                                    )
                                )
                            )
                        ), # Write the 'meta' partname to storage:
                        (XmlXPath(['/oai:record/oai:metadata/document:document/document:part[@name="meta"]/text()'], fromKwarg='lxmlNode', toKwarg='data', namespaces=NAMESPACEMAP),
                            (RewritePartname("meta"),
                                (storageComponent,) # Writes the harvester 'meta' data to storage.
                            )
                        )
                    ),
                    (FilterMessages(allowed=['add']),
                        # (LogComponent("UnDelete"),),
                        (ResurrectTombstone(),
                            (storageComponent,),
                        )
                    )
                )
            )
        )
    )
예제 #10
0
def createUploadHelix(storageComponent, oaiJazz, loggerComponent):
    """Assemble the nested component tuple that processes uploaded records.

    The tree is rooted at a 'batch' TransactionScope that wraps a 'record'
    TransactionScope and a Venturi configured with the 'header', 'meta' and
    'metadata' parts.  Below the Venturi there is one branch filtered on
    'delete' messages and one branch filtered on 'add' messages.

    Args:
        storageComponent: observer placed at every point where a part is
            written to, or removed from, storage.
        oaiJazz: observer placed under the 'delete' filter and under
            OaiAddRecordWithDefaults.
        loggerComponent: observer attached to both normalisation components.

    Returns:
        A nested tuple in which every tuple holds a component followed by
        the tuples of its observers.
    """
    # Order DOES matter: the first part goes first!
    venturiParts = [
        {
            'partname': 'header',
            'xpath': '/document:document/document:part[@name="header"]/text()',
            'asString': False,
        },
        {
            'partname': 'meta',
            'xpath': '/document:document/document:part[@name="meta"]/text()',
            'asString': False,
        },
        {
            'partname': 'metadata',
            'xpath': '/document:document/document:part[@name="metadata"]/text()',
            'asString': False,
        },
    ]

    # DIDL and MODS parts are validated against their xsd-schema.
    xsdChecks = [
        ('DIDL container', '//didl:DIDL', 'didl.xsd'),
        ('MODS metadata', '//mods:mods', 'mods-3-6.xsd'),
    ]

    # Metadata formats handed to OaiAddRecordWithDefaults.
    oaiMetadataFormats = [
        (
            'metadata',
            'http://standards.iso.org/ittf/PubliclyAvailableStandards/MPEG-21_schema_files/did/didmodel.xsd',
            'urn:mpeg:mpeg21:2002:02-DIDL-NS',
        ),
        (NL_DIDL_NORMALISED_PREFIX, '', 'http://gh.kb-dans.nl/normalised/v0.9/'),
        (NL_DIDL_COMBINED_PREFIX, '', 'http://gh.kb-dans.nl/combined/v0.9/'),
    ]

    uploadDna = (
        TransactionScope('batch'),
        (
            TransactionScope('record'),
            (
                Venturi(should=venturiParts, namespaceMap=namespacesMap),
                # Remove all delete messages from storage and OAI.
                (
                    FilterMessages(allowed=['delete']),
                    (storageComponent,),
                    (oaiJazz,),
                ),
                (
                    FilterMessages(allowed=['add']),
                    # Write the harvest date (= now()) to the meta part
                    # (OAI provenance).
                    (
                        FilterPartByName(included=['meta']),
                        (AddHarvestDateToMetaPart(verbose=False),),
                    ),
                    # Store ALL (original) parts retrieved by Venturi:
                    # header, meta and metadata.
                    (
                        XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data'),
                        (storageComponent,),
                    ),
                    # Normalise the 'metadata' part.
                    (
                        FilterPartByName(included=['metadata']),
                        (
                            Validate(xsdChecks, nsMap=namespacesMap),
                            (
                                # Normalise DIDL in the metadata part.
                                Normalize_nl_DIDL(nsMap=namespacesMap),
                                (loggerComponent,),
                                (
                                    # Normalise MODS in the metadata part.
                                    Normalize_nl_MODS(nsMap=namespacesMap),
                                    (loggerComponent,),
                                    # Convert from etree.ElementTree to
                                    # plain text, rename the part from
                                    # 'metadata' to the normalised prefix
                                    # and write it to storage.
                                    (
                                        XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data'),
                                        (
                                            RewritePartname(NL_DIDL_NORMALISED_PREFIX),
                                            (storageComponent,),
                                        ),
                                    ),
                                    # Create the combined format, convert
                                    # it to plain text, rename the part
                                    # and write it to storage.
                                    (
                                        NL_DIDL_combined(nsMap=namespacesMap),
                                        (
                                            XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data'),
                                            (
                                                RewritePartname(NL_DIDL_COMBINED_PREFIX),
                                                (storageComponent,),
                                            ),
                                        ),
                                    ),
                                    # Add the parts to the OAI
                                    # repository/index.  Assert partNames
                                    # header and meta are available from
                                    # storage!
                                    (
                                        OaiAddRecordWithDefaults(metadataFormats=oaiMetadataFormats),
                                        (storageComponent,),
                                        (oaiJazz,),
                                    ),
                                ),
                            ),
                        ),
                    ),
                ),
            ),
        ),
    )
    return uploadDna
예제 #11
0
def createDownloadHelix(reactor, periodicDownload, oaiDownload,
                        storageComponent, oaiJazz):
    """Return the nested component tuple for the periodic OAI download.

    The tree is rooted at ``periodicDownload``; its data is parsed to an
    lxml node and handed to ``oaiDownload``.  Below
    UpdateAdapterFromOaiDownloadProcessor there are three branches: one
    filtered on 'delete' messages and two filtered on 'add' messages.

    Args:
        reactor: not referenced anywhere in this function body.
        periodicDownload: root component of the returned tree.
        oaiDownload: component placed directly under the XML parser.
        storageComponent: observer placed wherever a part is written to,
            or deleted from, storage.
        oaiJazz: observer placed under the 'delete' filter and under every
            OaiAddDeleteRecordWithPrefixesAndSetSpecs.

    Returns:
        A nested tuple in which every tuple holds a component followed by
        the tuples of its observers.
    """
    return \
    (periodicDownload, # Scheduled connection to a remote (response / request)...
        (XmlParseLxml(fromKwarg="data", toKwarg="lxmlNode", parseOptions=dict(huge_tree=True, remove_blank_text=True)), # Convert from plain text to lxml-object.
            (oaiDownload, # Implementation/Protocol of a PeriodicDownload...
                (UpdateAdapterFromOaiDownloadProcessor(), # Turns an SRU update/delete message (lxmlNode) into the relevant message: a 'delete' or an 'add' message.
                    (FilterMessages(['delete']), # Filters delete messages
                        # (LogComponent("Delete Update"),),
                        (storageComponent,), # Delete from storage
                        (oaiJazz,), # Delete from OAI-pmh repo
                        # Write a 'deleted' part to the storage, that holds the (Record)uploadId.
                        (WriteTombstone(),
                            (storageComponent,),
                        )
                    ),
                    (FilterMessages(allowed=['add']),
                        # TODO: the toKwarg='data' below can be removed. Then the next line can go as well :-)
                        (XmlXPath(['/oai:record/oai:metadata/document:document/document:part[@name="record"]/text()'], fromKwarg='lxmlNode', toKwarg='data', namespaces=NAMESPACEMAP),
                            (XmlParseLxml(fromKwarg='data', toKwarg='lxmlNode'),
                                (XmlXPath(['/oai:record/oai:metadata/norm:md_original/child::*'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Original 'metadata' format
                                    (RewritePartname("metadata"), # Renames partname from 'record' to "metadata".
                                        (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                            (storageComponent,) # Writes oai:metadata (=original) to storage.
                                        )
                                    )
                                ),
                                (XmlXPath(['/oai:record/oai:metadata/norm:normalized/long:knaw_long'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Normalised 'long' format.
                                    (RewritePartname("knaw_long"), # Renames partname from 'record' to "knaw_long".
                                        (FilterWcpCollection(disallowed=['person', 'research', "organisation"]),
                                            (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                                (storageComponent,), # Writes 'long' (=norm:normdoc) to storage.
                                            )
                                        ),
                                        (ShortConverter(fromKwarg='lxmlNode'), # Create the 'knaw_short' subset format.
                                            (RewritePartname("knaw_short"),
                                                (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                                    (storageComponent,) # Writes 'short' to storage.
                                                )
                                            )
                                        ),
                                        (FilterWcpCollection(disallowed=['person', 'research', "organisation"]),
                                            # NOTE(review): the trailing comment below describes the rename, which is done by RewritePartname("oai_dc") on the next line.
                                            (DcConverter(fromKwarg='lxmlNode'), # Rename partname from 'record' to "oai_dc".
                                                (RewritePartname("oai_dc"),
                                                    (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                                        (storageComponent,) # Writes 'oai_dc' to storage.
                                                    )
                                                )
                                            )
                                        )
                                    )
                                ),
                                # TODO: Check that when conversions fail, the meta and header parts are not written to storage either: preferably not a single part...
                                # Write the 'header' partname to storage:
                                (XmlXPath(['/oai:record/oai:header'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP),
                                    (RewritePartname("header"),
                                        (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                            (storageComponent,) # Writes the OAI-header to storage.
                                        )
                                    )
                                ),
                                (FilterWcpCollection(allowed=['publication']),
                                    # (LogComponent("PUBLICATION"),),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['publication'], name='NARCISPORTAL'), #TODO: Skip name='NARCISPORTAL'
                                        (oaiJazz,),
                                    ),
                                    # NOTE(review): the three XmlXPath components below pass namespaceMap= whereas every other XmlXPath in this function passes namespaces= -- confirm that XmlXPath accepts both keywords.
                                    (XmlXPath(["//long:knaw_long[long:accessRights ='openAccess']"], fromKwarg='lxmlNode', namespaceMap=NAMESPACEMAP),
                                        # (LogComponent("OPENACCESS"),),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['oa_publication', 'openaire'], name='NARCISPORTAL'),
                                            (oaiJazz,),
                                        )
                                    ),
                                    (XmlXPath(["//long:knaw_long/long:metadata[long:genre ='doctoralthesis']"], fromKwarg='lxmlNode', namespaceMap=NAMESPACEMAP),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['thesis'], name='NARCISPORTAL'),
                                            (oaiJazz,),
                                        )
                                    ),
                                    (XmlXPath(['//long:knaw_long/long:metadata/long:grantAgreements/long:grantAgreement[long:code[contains(.,"greement/EC/") or contains(.,"greement/ec/")]][1]'], fromKwarg='lxmlNode', namespaceMap=NAMESPACEMAP),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['ec_fundedresources', 'openaire'], name='NARCISPORTAL'),
                                            (oaiJazz,),
                                        )
                                    )
                                ),
                                (FilterWcpCollection(allowed=['dataset']),
                                    # (LogComponent("DATASET"),),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['dataset'], name='NARCISPORTAL'),
                                        (oaiJazz,),
                                    )
                                )
                            )
                        ), # Write the 'meta' partname to storage:
                        (XmlXPath(['/oai:record/oai:metadata/document:document/document:part[@name="meta"]/text()'], fromKwarg='lxmlNode', toKwarg='data', namespaces=NAMESPACEMAP),
                            (RewritePartname("meta"),
                                (storageComponent,) # Writes the harvester 'meta' data to storage.
                            )
                        )
                    ),
                    (FilterMessages(allowed=['add']), # TODO: Remove this line.
                        # (LogComponent("UnDelete"),),
                        (ResurrectTombstone(),
                            (storageComponent,),
                        )
                    )
                )
            )
        )
    )
예제 #12
0
def main(reactor, port, statePath, **ignored):
    """Assemble the nested component tuple for the gateway HTTP server.

    Creates the OAI index, the normalisation logger and the storage
    component below ``statePath`` and wires them into a tree rooted at an
    Observable with an ObservableHttpServer underneath.  The tree serves
    '/oaix' and '/oaix/info' behind an IpFilter for 127.0.0.1, and
    '/update' outside that filter.

    Args:
        reactor: passed to ObservableHttpServer and OaiInfo.
        port: passed to ObservableHttpServer.
        statePath: directory joined with 'oai', 'normlogger' and 'store'
            for the stateful components.
        **ignored: any further keyword arguments; not used.

    Returns:
        A nested tuple in which every tuple holds a component followed by
        the tuples of its observers.
    """
    suspendRegister = SuspendRegister()
    oaiJazz = be(
        (
            OaiJazz(
                join(statePath, 'oai'),
                alwaysDeleteInPrefixes=[NORMALISED_DOC_NAME],
            ),
            (suspendRegister,),
        )
    )

    normalisationLogger = Logger(join(statePath, 'normlogger'))

    # Md5HashDistributeStrategy is used instead of HashDistributeStrategy:
    # with the latter the filename (= partname) is hashed as well, which is
    # difficult to read by the human eye.
    distributeStrategy = Md5HashDistributeStrategy()

    storage = StorageComponent(
        join(statePath, 'store'),
        strategy=distributeStrategy,
        partsRemovedOnDelete=[NORMALISED_DOC_NAME],
    )

    # DIDL and MODS entries handed to Validate.
    xsdChecks = [
        ('DIDL container', '//didl:DIDL', 'didl.xsd'),
        ('MODS metadata', '//mods:mods', 'mods-3-6.xsd'),
    ]

    serverDna = (
        Observable(),
        (
            ObservableHttpServer(reactor=reactor, port=port),
            (
                BasicHttpHandler(),
                (
                    IpFilter(allowedIps=['127.0.0.1']),
                    (
                        PathFilter('/oaix', excluding=['/oaix/info']),
                        (
                            OaiPmh(
                                repositoryName='Gateway',
                                adminEmail='*****@*****.**',
                                supportXWait=True,
                                # Override default batch size of 200.
                                batchSize=2000,
                            ),
                            (oaiJazz,),
                            (suspendRegister,),
                            (
                                StorageAdapter(),
                                (storage,),
                            ),
                        ),
                    ),
                    (
                        PathFilter('/oaix/info'),
                        (
                            OaiInfo(reactor=reactor, oaiPath='/oai'),
                            (oaiJazz,),
                        ),
                    ),
                ),
                (
                    PathFilter('/update'),
                    (
                        SruRecordUpdate(sendRecordData=False, logErrors=True),
                        (
                            FilterMessages(allowed=['delete']),
                            (storage,),
                            (oaiJazz,),
                        ),
                        (
                            FilterMessages(allowed=['add']),
                            (
                                Validate(xsdChecks),
                                (
                                    AddMetadataDocumentPart(partName='normdoc', fromKwarg='lxmlNode'),
                                    (
                                        # Normalise DIDL in partname=normdoc metadata.
                                        NormaliseDIDL(nsMap=namespacesMap, fromKwarg='lxmlNode'),
                                        (normalisationLogger,),
                                        (
                                            # Normalise MODS in partname=normdoc metadata.
                                            NormaliseMODS(nsMap=namespacesMap, fromKwarg='lxmlNode'),
                                            (normalisationLogger,),
                                            (
                                                XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data'),
                                                (
                                                    # Rename the converted part, then store it.
                                                    RewritePartname(NORMALISED_DOC_NAME),
                                                    (storage,),
                                                ),
                                            ),
                                            (
                                                OaiAddDeleteRecordWithPrefixesAndSetSpecs(
                                                    metadataPrefixes=[NORMALISED_DOC_NAME]
                                                ),
                                                (oaiJazz,),
                                            ),
                                        ),
                                    ),
                                ),
                            ),
                        ),
                    ),
                ),
            ),
        ),
    )
    return serverDna