def testXmlPrintLxmlPrettyPrintFalse(self):
    """With pretty_print=False the node is forwarded as the single kwarg 'data'.

    The observer must receive exactly one 'someMessage' call whose only
    keyword argument is 'data', holding the serialized XML without any
    added whitespace.
    """
    observable = Observable()
    xmlprintlxml = XmlPrintLxml(fromKwarg='lxmlNode', toKwarg="data", pretty_print=False)
    observer = CallTrace('observer', emptyGeneratorMethods=['someMessage'])
    xmlprintlxml.addObserver(observer)
    observable.addObserver(xmlprintlxml)
    # Exhaust the generator so the message is actually delivered.
    list(compose(observable.all.someMessage(lxmlNode=parse(StringIO('<a><b>“c</b></a>')))))
    # assertEqual instead of the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual(['someMessage'], observer.calledMethodNames())
    # keys() is a view on Python 3 and never compares equal to a list; materialize it.
    self.assertEqual(['data'], list(observer.calledMethods[0].kwargs.keys()))
    self.assertEqual('''<a><b>“c</b></a>''', observer.calledMethods[0].kwargs['data'])
def testTransparency(self):
    """Parsing and printing twice in a row yields the same XML at both taps."""
    firstTap = CallTrace('lxml')
    secondTap = CallTrace('lxml2')
    expectedXml = '<a><b>c</b></a>'
    tree = (Observable(),
        (XmlParseLxml(fromKwarg='data', toKwarg='lxmlNode'),
            (XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data'),
                (firstTap,),
                (XmlParseLxml(fromKwarg='data', toKwarg='lxmlNode'),
                    (XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data'),
                        (secondTap,),
                    ),
                ),
            ),
        ),
    )
    root = be(tree)
    root.do.something(
        identifier='identifier',
        partname='partName',
        data='<?xml version="1.0"?><a><b>c</b></a>')
    # Both taps sit behind an XmlPrintLxml, so both receive printed data.
    for tap in (firstTap, secondTap):
        received = tap.calledMethods[0].kwargs['data']
        self.assertEqualsWS(expectedXml, received.decode())
def testMissingFromKwargDoesNothing(self):
    """A call without the 'lxmlNode' kwarg is passed on with its args untouched."""
    trace = CallTrace()
    printer = XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data')
    root = be((Observable(), (printer, (trace,))))
    # The tree is passed positionally, so there is no 'lxmlNode' kwarg to convert.
    root.do.something('identifier', 'partname', parse(StringIO('<a/>')))
    received = trace.calledMethods
    self.assertEqual(1, len(received))
    thirdArgType = type(received[0].args[2])
    self.assertEqual("<class 'lxml.etree._ElementTree'>", str(thirdArgType))
def testToKwargDefaultsToFromKwarg(self):
    """When toKwarg is omitted the printed XML is delivered under fromKwarg."""
    observer = CallTrace()
    observable = be(
        (Observable(),
            (XmlPrintLxml(fromKwarg='data'),
                (observer,),
            )
        )
    )
    observable.do.something('identifier', 'partname', data=parse(StringIO('<someXml/>')))
    # assertEqual instead of the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual("something('identifier', 'partname', data='<someXml/>\n')", str(observer.calledMethods[0]))
def testRenameKwargOnConvert(self):
    """The converted value moves from fromKwarg to toKwarg; other calls pass through."""
    observer = CallTrace()
    observable = be(
        (Observable(),
            (XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='dataString'),
                (observer,),
            )
        )
    )
    # Call carrying 'lxmlNode': expected to arrive as 'dataString'.
    observable.do.something('identifier', 'partname', lxmlNode=parse(StringIO('<someXml/>')))
    # assertEqual instead of the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual("something('identifier', 'partname', dataString='<someXml/>\n')", str(observer.calledMethods[0]))
    # Call without 'lxmlNode': expected to arrive unchanged.
    observable.do.something('identifier', 'partname', someKwarg=1)
    self.assertEqual("something('identifier', 'partname', someKwarg=1)", str(observer.calledMethods[1]))
def uploadHelix(lucene, termNumerator, storageComponent, drilldownFields, fieldRegistry):
    """Build the nested component tuple that handles SRU record updates.

    The returned tuple starts with SruRecordUpdate and nests its sub-tuples
    below it: 'delete' messages go to ``lucene`` and ``storageComponent``;
    'add' messages are turned into fields for indexing, and the record is
    also printed to 'data' and handed to ``storageComponent``.

    ``drilldownFields`` is accepted but not referenced in this body.
    NOTE(review): presumably the caller wires the result up with be() --
    not visible here.
    """
    # Shared indexing sub-tree; the same tuple object is reused in several branches below.
    indexHelix = (Fields2LuceneDoc('record', fieldRegistry=fieldRegistry),
        (termNumerator, ),
        (lucene, ))
    return \
    (SruRecordUpdate(),
        (TransactionScope('record'),
            (Venturi(should=[{'partname': 'record', 'xpath': '.'}], namespaces={'doc': 'http://meresco.org/namespace/example'}),
                (FilterMessages(allowed=['delete']),
                    (lucene,),
                    (storageComponent,)
                ),
                (FilterMessages(allowed=['add']),
                    (Xml2Fields(),
                        # Keep only what follows the first '.' in the field name (name unchanged if no '.').
                        (RenameField(lambda name: name.split('.', 1)[-1]),
                            # Every field whose name does not contain 'fieldHier' is indexed as-is.
                            (FilterField(lambda name: 'fieldHier' not in name),
                                indexHelix,
                            ),
                            # 'intfield1' is additionally indexed under SORTED_PREFIX + name.
                            (FilterField(lambda name: name == 'intfield1'),
                                (RenameField(lambda name: SORTED_PREFIX + name),
                                    indexHelix,
                                )
                            ),
                            # 'field2' and 'field3' are additionally indexed under UNTOKENIZED_PREFIX + name.
                            (FilterField(lambda name: name in ['field2', 'field3']),
                                (RenameField(lambda name: UNTOKENIZED_PREFIX + name),
                                    indexHelix,
                                )
                            ),
                        )
                    ),
                    (FieldHier(),
                        indexHelix,
                    )
                ),
                # Sibling of both FilterMessages branches: lxmlNode is printed to 'data' for storage.
                (XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data'),
                    (storageComponent,)
                )
            )
        )
    )
def createDownloadHelix(reactor, periodicDownload, oaiDownload, storageComponent, oaiJazz):
    """Build the nested component tuple for the periodic OAI download chain.

    The tuple is rooted at ``periodicDownload``. Downloaded data is parsed
    to lxml, passed through ``oaiDownload`` and
    UpdateAdapterFromOaiDownloadProcessor, and then split into a 'delete'
    branch and an 'add' branch that write parts to ``storageComponent``
    and register records with ``oaiJazz``.

    ``reactor`` is accepted but not referenced in this body.
    NOTE(review): presumably the caller wires the result up with be() --
    not visible here.
    """
    return \
    (periodicDownload, # Scheduled connection to a remote (response / request)...
        (XmlParseLxml(fromKwarg="data", toKwarg="lxmlNode", parseOptions=dict(huge_tree=True, remove_blank_text=True)), # Convert from plain text to lxml-object.
            (oaiDownload, # Implementation/Protocol of a PeriodicDownload...
                (UpdateAdapterFromOaiDownloadProcessor(), # Turns an SRU update/delete message (lxmlNode) into the relevant message: a 'delete' or an 'add' message.
                    (FilterMessages(['delete']), # Filters delete messages
                        # (LogComponent("Delete Update"),),
                        (storageComponent,), # Delete from storage
                        (oaiJazz,), # Delete from OAI-pmh repo
                        # Write a 'deleted' part to the storage, that holds the (Record)uploadId.
                        (WriteTombstone(),
                            (storageComponent,),
                        )
                    ),
                    (FilterMessages(allowed=['add']),
                        # (LogComponent("ADD"),),
                        (XmlXPath(['//document:document/document:part[@name="normdoc"]/text()'], fromKwarg='lxmlNode', toKwarg='data', namespaces=NAMESPACEMAP),
                            # (LogComponent("NORMDOC"),),
                            (XmlParseLxml(fromKwarg='data', toKwarg='lxmlNode'),
                                (RewritePartname(NL_DIDL_NORMALISED_PREFIX), # Renames the partname to NL_DIDL_NORMALISED_PREFIX.
                                    (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                        (storageComponent,) # Writes the printed 'normdoc' part to storage.
                                    )
                                )
                            )
                        ),
                        (XmlXPath(['//document:document/document:part[@name="record"]/text()'], fromKwarg='lxmlNode', toKwarg='data', namespaces=NAMESPACEMAP),
                            (XmlParseLxml(fromKwarg='data', toKwarg='lxmlNode'),
                                # TODO: Check that when conversions fail, the meta and header parts are not written to storage either: preferably no part at all...
                                # Write 'header' partname to storage:
                                (XmlXPath(['/oai:record/oai:header'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP),
                                    (RewritePartname("header"),
                                        (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                            (storageComponent,) # Writes the OAI header to storage.
                                        )
                                    )
                                ),
                                # Write 'metadata' partname to storage:
                                # On gharvester21 this goes wrong: there the root element in the 'metadata' part is <metadata> instead of <DIDL>.
                                # A child::node() would be preferable here, but that syntax seems to fail in combination with the XmlXPath component??
                                (XmlXPath(['/oai:record/oai:metadata/didl:DIDL'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP),
                                    # (LogComponent("METADATA_PART"),),
                                    (RewritePartname("metadata"),
                                        (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                            (storageComponent,) # Writes metadata to storage.
                                        )
                                    )
                                )
                            )
                        ),
                        # Same 'record' part is selected a second time, now to build the combined format.
                        (XmlXPath(['//document:document/document:part[@name="record"]/text()'], fromKwarg='lxmlNode', toKwarg='data', namespaces=NAMESPACEMAP),
                            (XmlParseLxml(fromKwarg='data', toKwarg='lxmlNode'),
                                (NlDidlCombined(nsMap=NAMESPACEMAP, fromKwarg='lxmlNode'), # Create combined format from stored metadataPart and normalized part.
                                    (XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data'), # Convert it to plaintext
                                        (RewritePartname(NL_DIDL_COMBINED_PREFIX), # Rename combined partName
                                            (storageComponent,) # Write combined partName to storage
                                        )
                                    )
                                )
                            )
                        ),
                        (XmlXPath(['//document:document/document:part[@name="meta"]/text()'], fromKwarg='lxmlNode', toKwarg='data', namespaces=NAMESPACEMAP),
                            (RewritePartname("meta"),
                                (storageComponent,) # Writes the harvester 'meta' data to storage.
                            )
                        ),
                        (OaiAddRecord(metadataPrefixes=[('metadata', 'http://standards.iso.org/ittf/PubliclyAvailableStandards/MPEG-21_schema_files/did/didmodel.xsd', 'urn:mpeg:mpeg21:2002:02-DIDL-NS'), (NL_DIDL_NORMALISED_PREFIX, '', NAMESPACEMAP.gmhnorm), (NL_DIDL_COMBINED_PREFIX, '', NAMESPACEMAP.gmhcombined)]), #[(partname, schema, namespace)]
                            # (LogComponent("OaiAddRecord:"),),
                            (storageComponent,),
                            (oaiJazz,) # Assert partNames header and meta are available from storage!
                        ),
                        (ResurrectTombstone(),
                            (storageComponent,),
                        ),
                    ),
                    # (FilterMessages(allowed=['add']),
                    #     # (LogComponent("UnDelete"),),
                    #     (ResurrectTombstone(),
                    #         (storageComponent,),
                    #     )
                    # )
                )
            )
        )
    )
def main(reactor, port, statePath, **ignored):
    """Build the nested component tuple for the gateway HTTP server.

    Creates an OaiJazz (under ``statePath``/oai, observed by a
    SuspendRegister) and a StorageComponent (under ``statePath``/store),
    then returns a tuple rooted at Observable that serves:

    * '/oaix' (excluding '/oaix/info') and '/oaix/info' behind an IpFilter
      allowing only 127.0.0.1;
    * '/update', an SruRecordUpdate endpoint that is a sibling of the
      IpFilter tuple (so not nested under it).

    Extra keyword arguments are collected in ``ignored`` and not used.
    """
    oaiSuspendRegister = SuspendRegister()
    oaiJazz = be((OaiJazz(join(statePath, 'oai')),
        (oaiSuspendRegister, )))
    # WST:
    # strategie = HashDistributeStrategy() # filename (=partname) is also hashed: difficult to read by human eye...
    strategie = Md5HashDistributeStrategy()
    storeComponent = StorageComponent(
        join(statePath, 'store'),
        strategy=strategie,
        partsRemovedOnDelete=[NORMALISED_DOC_NAME])
    return \
    (Observable(),
        # (scheduledCommitPeriodicCall,),
        # (DebugPrompt(reactor=reactor, port=port+1, globals=locals()),),
        (ObservableHttpServer(reactor=reactor, port=port),
            (BasicHttpHandler(),
                (IpFilter(allowedIps=['127.0.0.1']),
                    (PathFilter('/oaix', excluding=['/oaix/info']),
                        (OaiPmh(repositoryName='Gateway',
                                adminEmail='*****@*****.**',
                                supportXWait=True,
                                batchSize=2000 # Override default batch size of 200.
                            ),
                            (oaiJazz,),
                            (oaiSuspendRegister,),
                            (StorageAdapter(),
                                (storeComponent,),
                            ),
                        )
                    ),
                    (PathFilter('/oaix/info'),
                        # NOTE(review): oaiPath is '/oai' while the OAI endpoint above is filtered on '/oaix' -- confirm this is intended.
                        (OaiInfo(reactor=reactor, oaiPath='/oai'),
                            (oaiJazz,),
                        )
                    ),
                ),
                (PathFilter('/update'),
                    (SruRecordUpdate(sendRecordData=False, logErrors=True,),
                        (FilterMessages(allowed=['delete']),
                            (storeComponent,),
                            (oaiJazz,),
                        ),
                        (FilterMessages(allowed=['add']),
                            # Does not work? See comments in component...
                            # (AddMetadataFormat(fromKwarg="lxmlNode", name='md_format'),
                            #     (LogComponent("AddMetadataFormat"),),
                            # ),
                            (XmlXPath(['srw:recordData/*'], fromKwarg='lxmlNode'), # Forwards EVERY matching node in a new message.
                                # (LogComponent("TO LONG CONVERTER:"),),
                                (AddMetadataNamespace(dateformat="%Y-%m-%dT%H:%M:%SZ", fromKwarg='lxmlNode'), # Adds metadataNamespace to meta part in the message.
                                    (NormaliseOaiRecord(fromKwarg='lxmlNode'), # Normalises record to: long & original parts. Raises ValidationException if no 'known' metadataformat
                                        (XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data', pretty_print=False),
                                            (RewritePartname(NORMALISED_DOC_NAME), # Rename converted part.
                                                (storeComponent,), # Store converted/renamed part.
                                            )
                                        )
                                    ),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[NORMALISED_DOC_NAME]),
                                        (oaiJazz,),
                                    )
                                )
                            )
                        )
                    )
                )
            )
        )
    )
def createDownloadHelix(reactor, periodicDownload, oaiDownload, storageComponent, oaiJazz, oai_oa_cerifJazz):
    """Build the nested component tuple for the periodic download chain with CERIF output.

    The tuple is rooted at ``periodicDownload``. 'delete' messages go to
    ``storageComponent``, ``oaiJazz`` and ``oai_oa_cerifJazz`` and write a
    tombstone. 'add' messages are split per WCP collection: each collection
    gets its own XSLT crosswalk stored under OPENAIRE_PARTNAME, the
    original/long/short/oai_dc/header/meta parts are stored, and records
    are registered in ``oaiJazz`` and/or ``oai_oa_cerifJazz`` with
    collection-specific setSpecs.

    ``reactor`` is accepted but not referenced in this body.
    NOTE(review): presumably the caller wires the result up with be() --
    not visible here.
    """
    return \
    (periodicDownload, # Scheduled connection to a remote (response / request)...
        (XmlParseLxml(fromKwarg="data", toKwarg="lxmlNode", parseOptions=dict(huge_tree=True, remove_blank_text=True)), # Convert from plain text to lxml-object.
            (oaiDownload, # Implementation/Protocol of a PeriodicDownload...
                (UpdateAdapterFromOaiDownloadProcessor(), # Turns an SRU update/delete message (lxmlNode) into the relevant message: a 'delete' or an 'add' message.
                    (FilterMessages(['delete']), # Filters delete messages
                        # (LogComponent("Delete Update"),),
                        (storageComponent,), # Delete from storage
                        (oaiJazz,), # Delete from OAI-pmh repo
                        (oai_oa_cerifJazz,),
                        # Write a 'deleted' part to the storage, that holds the (Record)uploadId.
                        (WriteTombstone(),
                            (storageComponent,),
                        )
                    ),
                    (FilterMessages(allowed=['add']),
                        (XmlXPath(['/oai:record/oai:metadata/document:document/document:part[@name="record"]/text()'], fromKwarg='lxmlNode', toKwarg='data', namespaces=NAMESPACEMAP),
                            (XmlParseLxml(fromKwarg='data', toKwarg='lxmlNode'),
                                (FilterWcpCollection(allowed=['research']),
                                    (XmlXPath(['/oai:record/oai:metadata/norm:md_original/child::*'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Original 'metadata' format
                                        (XsltCrosswalk([join(dirname(abspath(__file__)), '..', '..', 'xslt', 'cerif-project.xsl')], fromKwarg="lxmlNode"),
                                            (RewritePartname(OPENAIRE_PARTNAME),
                                                (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                                    (storageComponent,)
                                                )
                                            )
                                        )
                                    )
                                ),
                                (FilterWcpCollection(allowed=['person']),
                                    (XmlXPath(['/oai:record/oai:metadata/norm:md_original/child::*'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Original 'metadata' format
                                        (XsltCrosswalk([join(dirname(abspath(__file__)), '..', '..', 'xslt', 'cerif-person.xsl')], fromKwarg="lxmlNode"),
                                            (RewritePartname(OPENAIRE_PARTNAME),
                                                (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                                    (storageComponent,)
                                                )
                                            )
                                        )
                                    )
                                ),
                                (FilterWcpCollection(allowed=['organisation']),
                                    (XmlXPath(['/oai:record/oai:metadata/norm:md_original/child::*'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Original 'metadata' format
                                        (XsltCrosswalk([join(dirname(abspath(__file__)), '..', '..', 'xslt', 'cerif-orgunit.xsl')], fromKwarg="lxmlNode"),
                                            (RewritePartname(OPENAIRE_PARTNAME),
                                                (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                                    (storageComponent,)
                                                )
                                            )
                                        )
                                    )
                                ),
                                (FilterWcpCollection(allowed=['dataset']), # START CERIF CONVERSION FOR DATASET COLLECTION: cerif-dataset / cerif-software.
                                    (XmlXPath(['/oai:record/oai:metadata/norm:normalized/long:knaw_long'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Normalised 'long' format.
                                        (XsltCrosswalk([join(dirname(abspath(__file__)), '..', '..', 'xslt', 'cerif-product.xsl')], fromKwarg="lxmlNode"),
                                            (RewritePartname(OPENAIRE_PARTNAME),
                                                (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                                    (storageComponent,)
                                                )
                                            )
                                        )
                                    )
                                ),
                                (FilterWcpCollection(allowed=['publication']), # START CERIF CONVERSION FOR PUBLICATIONS COLLECTION
                                    (XmlXPath(['/oai:record/oai:metadata/norm:normalized/long:knaw_long'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Normalised 'long' format.
                                        (FilterKnawLongGenre(allowed=['patent']), # START PATENTS CONVERSION
                                            (XsltCrosswalk([join(dirname(abspath(__file__)), '..', '..', 'xslt', 'cerif-patent.xsl')], fromKwarg="lxmlNode"),
                                                (RewritePartname(OPENAIRE_PARTNAME),
                                                    (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                                        (storageComponent,)
                                                    )
                                                )
                                            )
                                        ),
                                        (FilterKnawLongGenre(disallowed=['patent']), # START Publication CONVERSION
                                            (XsltCrosswalk([join(dirname(abspath(__file__)), '..', '..', 'xslt', 'cerif-publication.xsl')], fromKwarg="lxmlNode"),
                                                (RewritePartname(OPENAIRE_PARTNAME),
                                                    (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                                        (storageComponent,)
                                                    )
                                                )
                                            )
                                        )
                                    )
                                ),
                                (XmlXPath(['/oai:record/oai:metadata/norm:md_original/child::*'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Original 'metadata' format
                                    (RewritePartname("metadata"), # Renames partname from 'record' to "metadata".
                                        (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                            (storageComponent,) # Writes oai:metadata (=original) to storage.
                                        )
                                    )
                                ),
                                (XmlXPath(['/oai:record/oai:metadata/norm:normalized/long:knaw_long'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Normalised 'long' format.
                                    (RewritePartname("knaw_long"), # Renames partname from 'record' to "knaw_long".
                                        (FilterWcpCollection(disallowed=['person', 'research', 'organisation']),
                                            (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                                (storageComponent,), # Writes 'long' (=norm:normdoc) to storage.
                                            )
                                        ),
                                        (ShortConverter(fromKwarg='lxmlNode'), # Create the 'knaw_short' subset format.
                                            (RewritePartname("knaw_short"),
                                                (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                                    (storageComponent,) # Writes 'short' to storage.
                                                )
                                            )
                                        ),
                                        (FilterWcpCollection(disallowed=['person', 'research', 'organisation']),
                                            (DcConverter(fromKwarg='lxmlNode'), # The partname is renamed to "oai_dc" by the RewritePartname below.
                                                (RewritePartname("oai_dc"),
                                                    (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                                        (storageComponent,) # Writes 'oai_dc' to storage.
                                                    )
                                                )
                                            )
                                        )
                                    )
                                ),
                                # TODO: Check that when conversions fail, the meta and header parts are not written to storage either: preferably not a single part...
                                # Write 'header' partname to storage:
                                (XmlXPath(['/oai:record/oai:header'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP),
                                    (RewritePartname("header"),
                                        (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                            (storageComponent,) # Writes the OAI header to storage.
                                        )
                                    )
                                ),
                                (FilterWcpCollection(allowed=['publication']),
                                    # (LogComponent("PUBLICATION"),),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['publication']),
                                        (oaiJazz,),
                                    ),
                                    # NOTE(review): the XmlXPath components in this branch pass namespaceMap=, while the ones above pass namespaces= -- confirm both keywords are supported and equivalent.
                                    (XmlXPath(["//long:knaw_long[long:accessRights ='openAccess']"], fromKwarg='lxmlNode', namespaceMap=NAMESPACEMAP),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['oa_publication', 'openaire']),
                                            (oaiJazz,),
                                        )
                                    ),
                                    (XmlXPath(["//long:knaw_long/long:metadata[long:genre ='doctoralthesis']"], fromKwarg='lxmlNode', namespaceMap=NAMESPACEMAP),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['thesis']),
                                            (oaiJazz,),
                                        )
                                    ),
                                    (XmlXPath(['//long:knaw_long/long:metadata/long:grantAgreements/long:grantAgreement[long:code[contains(.,"greement/EC/") or contains(.,"greement/ec/")]][1]'], fromKwarg='lxmlNode', namespaceMap=NAMESPACEMAP),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['ec_fundedresources', 'openaire']),
                                            (oaiJazz,),
                                        )
                                    ),
                                    (XmlXPath(["//long:knaw_long/long:metadata[long:genre ='patent']"], fromKwarg='lxmlNode', namespaceMap=NAMESPACEMAP),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[OPENAIRE_PARTNAME], setSpecs=["openaire_cris_patents"]),
                                            (oai_oa_cerifJazz,),
                                        )
                                    ),
                                    (XmlXPath(["//long:knaw_long/long:metadata[long:genre !='patent']"], fromKwarg='lxmlNode', namespaceMap=NAMESPACEMAP),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[OPENAIRE_PARTNAME], setSpecs=["openaire_cris_publications"]),
                                            (oai_oa_cerifJazz,),
                                        )
                                    ),
                                ),
                                (FilterWcpCollection(allowed=['dataset']),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['dataset']),
                                        (oaiJazz,),
                                    ),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[OPENAIRE_PARTNAME], setSpecs=["openaire_cris_products"]),
                                        (oai_oa_cerifJazz,),
                                    )
                                ),
                                # Add NOD OpenAIRE Cerif to OpenAIRE-PMH repo.
                                (FilterWcpCollection(allowed=['research']),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[OPENAIRE_PARTNAME], setSpecs=["openaire_cris_projects"]),
                                        (oai_oa_cerifJazz,),
                                    )
                                ),
                                (FilterWcpCollection(allowed=['person']),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[OPENAIRE_PARTNAME], setSpecs=['openaire_cris_persons']),
                                        (oai_oa_cerifJazz,),
                                    )
                                ),
                                (FilterWcpCollection(allowed=['organisation']),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[OPENAIRE_PARTNAME], setSpecs=['openaire_cris_orgunits']),
                                        (oai_oa_cerifJazz,),
                                    )
                                )
                            )
                        ),
                        # Write 'meta' partname to storage:
                        (XmlXPath(['/oai:record/oai:metadata/document:document/document:part[@name="meta"]/text()'], fromKwarg='lxmlNode', toKwarg='data', namespaces=NAMESPACEMAP),
                            (RewritePartname("meta"),
                                (storageComponent,) # Writes the harvester 'meta' data to storage.
                            )
                        )
                    ),
                    # Second 'add' filter, placed after the one above: only resurrects the tombstone.
                    (FilterMessages(allowed=['add']),
                        # (LogComponent("UnDelete"),),
                        (ResurrectTombstone(),
                            (storageComponent,),
                        )
                    )
                )
            )
        )
    )
def createUploadHelix(storageComponent, oaiJazz, loggerComponent):
    """Build the nested component tuple that processes uploaded documents.

    The tuple is rooted at a 'batch' TransactionScope wrapping a 'record'
    TransactionScope and a Venturi that extracts the 'header', 'meta' and
    'metadata' parts. 'delete' messages go to ``storageComponent`` and
    ``oaiJazz``. For 'add' messages every part is printed and stored, and
    the 'metadata' part is additionally validated, normalised (DIDL, then
    MODS, each observed by ``loggerComponent``), stored in normalised and
    combined form, and registered with ``oaiJazz``.

    NOTE(review): presumably the caller wires the result up with be() --
    not visible here.
    """
    return \
    (TransactionScope('batch'),
        (TransactionScope('record'),
            (Venturi(
                should=[ # Order DOES matter: First part goes first!
                    {'partname':'header', 'xpath':'/document:document/document:part[@name="header"]/text()', 'asString':False},
                    {'partname':'meta', 'xpath':'/document:document/document:part[@name="meta"]/text()', 'asString':False},
                    {'partname':'metadata', 'xpath':'/document:document/document:part[@name="metadata"]/text()', 'asString':False}
                ],
                namespaceMap=namespacesMap),
                # Remove all delete msgs from storage and OAI:
                (FilterMessages(allowed=['delete']),
                    #(DNADebug(enabled=False, prefix='DELETE'),
                    (storageComponent,),
                    (oaiJazz,)
                    #)
                ),
                (FilterMessages(allowed=['add']),
                    ## Write harvestdate (=now()) to meta part (OAI provenance)
                    (FilterPartByName(included=['meta']),
                        (AddHarvestDateToMetaPart(verbose=False),)
                    ),
                    # Store ALL (original)parts retrieved by Venturi (required ('should') and optional ('could') parts).
                    # Write all uploadParts to storage (header, meta & metadata)
                    (XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data'),
                        (storageComponent,)
                    ),
                    (FilterPartByName(included=['metadata']), # Normalize 'metadata' part:
                        #(DNADebug(enabled=False, prefix='add metadata'),
                        # Validate DIDL and MODS part against their xsd-schema:
                        (Validate([('DIDL container','//didl:DIDL', 'didl.xsd'), ('MODS metadata', '//mods:mods', 'mods-3-6.xsd')], nsMap=namespacesMap),
                            (Normalize_nl_DIDL(nsMap=namespacesMap), # Normalize DIDL in metadataPart
                                (loggerComponent,),
                                (Normalize_nl_MODS(nsMap=namespacesMap), # Normalize MODS in metadataPart.
                                    (loggerComponent,),
                                    (XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data'), # Convert it from etree.ElementTree to plaintext
                                        (RewritePartname(NL_DIDL_NORMALISED_PREFIX), # Rename normalized partName from 'metadata' to 'nl_didl_norm'
                                            #(DNADebug(enabled=False, prefix='to storage'),
                                            (storageComponent,) # Write normalized partName to storage
                                            #)
                                        )
                                    ),
                                    # Create and store Combined format:
                                    (NL_DIDL_combined(nsMap=namespacesMap), # Create combined format from stored metadataPart and normalized part.
                                        (XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data'), # Convert it to plaintext
                                            (RewritePartname(NL_DIDL_COMBINED_PREFIX), # Rename combined partName
                                                (storageComponent,) # Write combined partName to storage
                                            )
                                        )
                                    ),
                                    # Add parts to OAI repository/index
                                    #(DNADebug(enabled=False, prefix='ADD2OAI'),
                                    (OaiAddRecordWithDefaults(metadataFormats=[('metadata', 'http://standards.iso.org/ittf/PubliclyAvailableStandards/MPEG-21_schema_files/did/didmodel.xsd', 'urn:mpeg:mpeg21:2002:02-DIDL-NS'),
                                                                                (NL_DIDL_NORMALISED_PREFIX, '', 'http://gh.kb-dans.nl/normalised/v0.9/'),
                                                                                (NL_DIDL_COMBINED_PREFIX, '', 'http://gh.kb-dans.nl/combined/v0.9/')]),
                                        (storageComponent,),
                                        (oaiJazz,) # Assert partNames header and meta are available from storage!
                                    ) #! OaiAddRecord
                                    #) #!Debug
                                )
                            )
                        )
                        #) #Debug
                    ) #!FilterPartNames(allowed=['metadata']
                ) # !FilterMessages(allowed=['add']
            ) # !venturi
        ) # !record
    ) # !batch
def createDownloadHelix(reactor, periodicDownload, oaiDownload, storageComponent, oaiJazz):
    """Build the nested component tuple for the periodic OAI download chain.

    The tuple is rooted at ``periodicDownload``. 'delete' messages go to
    ``storageComponent`` and ``oaiJazz`` and write a tombstone. 'add'
    messages store the 'metadata', 'knaw_long', 'knaw_short', 'oai_dc',
    'header' and 'meta' parts, and register publication and dataset
    records with ``oaiJazz`` under collection-specific setSpecs.

    ``reactor`` is accepted but not referenced in this body.
    NOTE(review): presumably the caller wires the result up with be() --
    not visible here.
    """
    return \
    (periodicDownload, # Scheduled connection to a remote (response / request)...
        (XmlParseLxml(fromKwarg="data", toKwarg="lxmlNode", parseOptions=dict(huge_tree=True, remove_blank_text=True)), # Convert from plain text to lxml-object.
            (oaiDownload, # Implementation/Protocol of a PeriodicDownload...
                (UpdateAdapterFromOaiDownloadProcessor(), # Turns an SRU update/delete message (lxmlNode) into the relevant message: a 'delete' or an 'add' message.
                    (FilterMessages(['delete']), # Filters delete messages
                        # (LogComponent("Delete Update"),),
                        (storageComponent,), # Delete from storage
                        (oaiJazz,), # Delete from OAI-pmh repo
                        # Write a 'deleted' part to the storage, that holds the (Record)uploadId.
                        (WriteTombstone(),
                            (storageComponent,),
                        )
                    ),
                    (FilterMessages(allowed=['add']),
                        # TODO: the toKwarg='data' below can be removed. Then the next line can go as well :-)
                        (XmlXPath(['/oai:record/oai:metadata/document:document/document:part[@name="record"]/text()'], fromKwarg='lxmlNode', toKwarg='data', namespaces=NAMESPACEMAP),
                            (XmlParseLxml(fromKwarg='data', toKwarg='lxmlNode'),
                                (XmlXPath(['/oai:record/oai:metadata/norm:md_original/child::*'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Original 'metadata' format
                                    (RewritePartname("metadata"), # Renames partname from 'record' to "metadata".
                                        (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                            (storageComponent,) # Writes oai:metadata (=original) to storage.
                                        )
                                    )
                                ),
                                (XmlXPath(['/oai:record/oai:metadata/norm:normalized/long:knaw_long'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Normalised 'long' format.
                                    (RewritePartname("knaw_long"), # Renames partname from 'record' to "knaw_long".
                                        (FilterWcpCollection(disallowed=['person', 'research', "organisation"]),
                                            (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                                (storageComponent,), # Writes 'long' (=norm:normdoc) to storage.
                                            )
                                        ),
                                        (ShortConverter(fromKwarg='lxmlNode'), # Create the 'knaw_short' subset format.
                                            (RewritePartname("knaw_short"),
                                                (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                                    (storageComponent,) # Writes 'short' to storage.
                                                )
                                            )
                                        ),
                                        (FilterWcpCollection(disallowed=['person', 'research', "organisation"]),
                                            (DcConverter(fromKwarg='lxmlNode'), # The partname is renamed to "oai_dc" by the RewritePartname below.
                                                (RewritePartname("oai_dc"),
                                                    (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                                        (storageComponent,) # Writes 'oai_dc' to storage.
                                                    )
                                                )
                                            )
                                        )
                                    )
                                ),
                                # TODO: Check that when conversions fail, the meta and header parts are not written to storage either: preferably not a single part...
                                # Write 'header' partname to storage:
                                (XmlXPath(['/oai:record/oai:header'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP),
                                    (RewritePartname("header"),
                                        (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                            (storageComponent,) # Writes the OAI header to storage.
                                        )
                                    )
                                ),
                                (FilterWcpCollection(allowed=['publication']),
                                    # (LogComponent("PUBLICATION"),),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['publication'], name='NARCISPORTAL'), #TODO: Skip name='NARCISPORTAL'
                                        (oaiJazz,),
                                    ),
                                    # NOTE(review): the XmlXPath components in this branch pass namespaceMap=, while the ones above pass namespaces= -- confirm both keywords are supported and equivalent.
                                    (XmlXPath(["//long:knaw_long[long:accessRights ='openAccess']"], fromKwarg='lxmlNode', namespaceMap=NAMESPACEMAP),
                                        # (LogComponent("OPENACCESS"),),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['oa_publication', 'openaire'], name='NARCISPORTAL'),
                                            (oaiJazz,),
                                        )
                                    ),
                                    (XmlXPath(["//long:knaw_long/long:metadata[long:genre ='doctoralthesis']"], fromKwarg='lxmlNode', namespaceMap=NAMESPACEMAP),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['thesis'], name='NARCISPORTAL'),
                                            (oaiJazz,),
                                        )
                                    ),
                                    (XmlXPath(['//long:knaw_long/long:metadata/long:grantAgreements/long:grantAgreement[long:code[contains(.,"greement/EC/") or contains(.,"greement/ec/")]][1]'], fromKwarg='lxmlNode', namespaceMap=NAMESPACEMAP),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['ec_fundedresources', 'openaire'], name='NARCISPORTAL'),
                                            (oaiJazz,),
                                        )
                                    )
                                ),
                                (FilterWcpCollection(allowed=['dataset']),
                                    # (LogComponent("DATASET"),),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['dataset'], name='NARCISPORTAL'),
                                        (oaiJazz,),
                                    )
                                )
                            )
                        ),
                        # Write 'meta' partname to storage:
                        (XmlXPath(['/oai:record/oai:metadata/document:document/document:part[@name="meta"]/text()'], fromKwarg='lxmlNode', toKwarg='data', namespaces=NAMESPACEMAP),
                            (RewritePartname("meta"),
                                (storageComponent,) # Writes the harvester 'meta' data to storage.
                            )
                        )
                    ),
                    (FilterMessages(allowed=['add']), # TODO: Remove this line.
                        # (LogComponent("UnDelete"),),
                        (ResurrectTombstone(),
                            (storageComponent,),
                        )
                    )
                )
            )
        )
    )
def main(reactor, port, statePath, **ignored):
    """Build the nested component tuple for the normalising gateway HTTP server.

    Creates an OaiJazz (under ``statePath``/oai, observed by a
    SuspendRegister), a Logger (under ``statePath``/normlogger) and a
    StorageComponent (under ``statePath``/store), then returns a tuple
    rooted at Observable that serves:

    * '/oaix' (excluding '/oaix/info') and '/oaix/info' behind an IpFilter
      allowing only 127.0.0.1;
    * '/update', an SruRecordUpdate endpoint that is a sibling of the
      IpFilter tuple (so not nested under it). Added records are
      validated, normalised (DIDL, then MODS), stored under
      NORMALISED_DOC_NAME and registered with OaiJazz.

    Extra keyword arguments are collected in ``ignored`` and not used.
    """
    oaiSuspendRegister = SuspendRegister()
    oaiJazz = be((OaiJazz(join(statePath, 'oai'), alwaysDeleteInPrefixes=[NORMALISED_DOC_NAME]),
        (oaiSuspendRegister, )))
    normLogger = Logger(join(statePath, 'normlogger'))
    # strategie = HashDistributeStrategy() # filename (=partname) is also hashed: difficult to read by human eye...
    strategie = Md5HashDistributeStrategy()
    storeComponent = StorageComponent(
        join(statePath, 'store'),
        strategy=strategie,
        partsRemovedOnDelete=[NORMALISED_DOC_NAME])
    return \
    (Observable(),
        # (scheduledCommitPeriodicCall,),
        # (DebugPrompt(reactor=reactor, port=port+1, globals=locals()),),
        (ObservableHttpServer(reactor=reactor, port=port),
            (BasicHttpHandler(),
                (IpFilter(allowedIps=['127.0.0.1']),
                    (PathFilter('/oaix', excluding=['/oaix/info']),
                        (OaiPmh(repositoryName='Gateway',
                                adminEmail='*****@*****.**',
                                supportXWait=True,
                                batchSize=2000 # Override default batch size of 200.
                            ),
                            (oaiJazz,),
                            (oaiSuspendRegister,),
                            (StorageAdapter(),
                                (storeComponent,),
                            ),
                        )
                    ),
                    (PathFilter('/oaix/info'),
                        # NOTE(review): oaiPath is '/oai' while the OAI endpoint above is filtered on '/oaix' -- confirm this is intended.
                        (OaiInfo(reactor=reactor, oaiPath='/oai'),
                            (oaiJazz,),
                        )
                    ),
                ),
                (PathFilter('/update'),
                    (SruRecordUpdate(sendRecordData=False, logErrors=True,),
                        (FilterMessages(allowed=['delete']),
                            (storeComponent,),
                            (oaiJazz,),
                        ),
                        (FilterMessages(allowed=['add']),
                            # (LogComponent("LXML:"),),
                            (Validate([('DIDL container','//didl:DIDL', 'didl.xsd'), ('MODS metadata', '//mods:mods', 'mods-3-6.xsd')]),
                                # (LogComponent("VALIDATED:"),),
                                (AddMetadataDocumentPart(partName='normdoc', fromKwarg='lxmlNode'),
                                    (NormaliseDIDL(nsMap=namespacesMap, fromKwarg='lxmlNode'), # Normalise DIDL in partname=normdoc metadata
                                        (normLogger,),
                                        (NormaliseMODS(nsMap=namespacesMap, fromKwarg='lxmlNode'), # Normalise MODS in partname=normdoc metadata
                                            (normLogger,),
                                            (XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data'),
                                                (RewritePartname(NORMALISED_DOC_NAME), # Rename converted part.
                                                    (storeComponent,), # Store converted/renamed part.
                                                )
                                            ),
                                            (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[NORMALISED_DOC_NAME]),
                                                (oaiJazz,),
                                            )
                                        )
                                    )
                                )
                            )
                        )
                    )
                )
            )
        )
    )