def testNamespaces(self):
    """Check that a namespaced xpath forwards the selected node with a stable nsmap.

    The forwarded tree must serialise to only the ``bNode`` element, and the
    ``nsmap`` of its root must equal the ``nsmap`` found after re-parsing that
    serialisation.
    """
    prefixes = {'a': 'aNamespace', 'b': 'bNamespace'}
    xmlXPath = XmlXPath(['/a:aNode/b:bNode'], fromKwarg='lxmlNode', namespaces=prefixes)
    document = '<aNode xmlns="aNamespace"><bNode xmlns="bNamespace">ccc</bNode></aNode>'
    inputTree = parse(StringIO(document))

    # Wire: observable -> xmlXPath -> observer (the call recorder).
    observer = CallTrace('Observer')
    observable = Observable()
    observable.addObserver(xmlXPath)
    xmlXPath.addObserver(observer)

    observable.do.message(lxmlNode=inputTree)

    firstCall = observer.calledMethods[0]
    self.assertEqual('message', firstCall.name)

    forwarded = firstCall.kwargs['lxmlNode']
    self.assertEqualsWS('<bNode xmlns="bNamespace">ccc</bNode>', lxmltostring(forwarded))

    forwardedNsmap = forwarded.getroot().nsmap
    reparsed = parse(StringIO(lxmltostring(forwarded)))
    self.assertEqual(reparsed.getroot().nsmap, forwardedNsmap)
def testXPathReturnsString(self):
    """Check that an xpath ending in ``text()`` forwards a plain string.

    The special characters in the input document are written as XML
    entities so that the document is well-formed; the value recorded by the
    observer is expected to hold the unescaped characters.
    """
    xpath = XmlXPath(['/a/t/text()'], fromKwarg="lxmlNode")
    # A raw '&' or '<' inside element text is not well-formed XML, so the
    # entities must be spelled out in the source document.
    inputNode = parse(StringIO('<a><t>some text &amp; some &lt;entities&gt;</t></a>'))
    observable = Observable()
    observer = CallTrace('observer')
    observable.addObserver(xpath)
    xpath.addObserver(observer)

    observable.do.aMethod(lxmlNode=inputNode)

    self.assertEqual(1, len(observer.calledMethods))
    result = observer.calledMethods[0].kwargs
    self.assertEqual({'lxmlNode': 'some text & some <entities>'}, result)
def testTailTakenCareOfWithoutAffectingOriginal(self):
    """Check that the forwarded node carries no tail text and the input stays intact.

    The selected ``myns:path`` element is followed by whitespace (its tail)
    in the source document. The forwarded node must serialise without that
    tail, while the original tree must still serialise to the full input.
    """
    observer = CallTrace('observer', methods={'test': lambda *args, **kwargs: (x for x in [])})
    observable = be(
        (Observable(),
            (XmlXPath(
                    ['/myns:root/myns:path'],
                    fromKwarg='lxmlNode',
                    namespaces={'myns': 'http://myns.org/'}
                ),
                (observer, ),
            )
        )
    )

    # NOTE(review): the whitespace inside these literals is significant and was
    # reconstructed; the expected fragment must repeat the input's inner
    # indentation exactly and stop right after </myns:path> (no tail).
    XML = (
        '<root xmlns:myns="http://myns.org/" xmlns="http://myns.org/">\n'
        '    <myns:path>\n'
        '        <to>me</to>\n'
        '    </myns:path>\n'
        '\n'
        '</root>'
    )
    expectedPath = (
        '<myns:path xmlns:myns="http://myns.org/" xmlns="http://myns.org/">\n'
        '        <to>me</to>\n'
        '    </myns:path>'
    )

    lxmlNode = parse(StringIO(XML))
    self.assertEqual(XML, lxmltostring(lxmlNode))

    list(compose(observable.all.test('een tekst', lxmlNode=lxmlNode)))

    self.assertEqual(1, len(observer.calledMethods))
    method = observer.calledMethods[0]
    self.assertEqual('test', method.name)
    self.assertEqualsWS('<myns:path xmlns:myns="http://myns.org/" xmlns="http://myns.org/"><to>me</to></myns:path>', lxmltostring(method.kwargs['lxmlNode']))
    self.assertEqual(expectedPath, lxmltostring(method.kwargs['lxmlNode']))

    # The source tree must be untouched by the extraction.
    self.assertEqual(XML, lxmltostring(lxmlNode))
def testNamespaces(self):
    """Check that a namespaced xpath forwards the selected node with a stable nsmap.

    The forwarded tree must serialise to only the ``bNode`` element, and the
    ``nsmap`` of its root must equal the ``nsmap`` found after re-parsing that
    serialisation.
    """
    xmlXPath = XmlXPath(['/a:aNode/b:bNode'], fromKwarg='lxmlNode', namespaces={'a':'aNamespace', 'b':'bNamespace' })
    lxmlNode = parse(StringIO('<aNode xmlns="aNamespace"><bNode xmlns="bNamespace">ccc</bNode></aNode>'))
    observer = CallTrace('Observer')
    observable = Observable()
    observable.addObserver(xmlXPath)
    xmlXPath.addObserver(observer)

    observable.do.message(lxmlNode=lxmlNode)

    message = observer.calledMethods[0]
    # assertEqual is used instead of the deprecated assertEquals alias.
    self.assertEqual('message', message.name)
    newNode = message.kwargs['lxmlNode']
    self.assertEqualsWS('<bNode xmlns="bNamespace">ccc</bNode>', lxmltostring(newNode))

    newNamespaces = newNode.getroot().nsmap
    nameSpacesAfterParsing = parse(StringIO(lxmltostring(newNode))).getroot().nsmap
    self.assertEqual(nameSpacesAfterParsing, newNamespaces)
def createXmlXPath(self, xpathList, nsMap):
    """Assemble a parse -> xpath -> observer chain and keep it on the instance.

    Sets ``self.observer`` to a CallTrace that ignores the 'start' attribute
    and ``self.observable`` to the assembled tree.

    xpathList -- xpath expressions handed to XmlXPath.
    nsMap     -- passed to XmlXPath as its ``namespaces`` argument.
    """
    self.observer = CallTrace('observer', ignoredAttributes=['start'])

    # Components are created in the same order as they appear in the chain.
    root = Observable()
    parser = XmlParseLxml(fromKwarg='data', toKwarg='lxmlNode')
    selector = XmlXPath(xpathList, fromKwarg='lxmlNode', namespaces=nsMap)

    self.observable = be(
        (root,
            (parser,
                (selector,
                    (self.observer, ),
                )
            )
        )
    )
def testDoNotChangeOriginal(self):
    """Check that running an xpath over a tree leaves that tree's serialisation unchanged."""
    xmlXPath = XmlXPath(['/a'], fromKwarg='lxmlNode')
    original = parse(StringIO('<a>a</a>'))

    # Drain the generator so the component actually processes the message.
    calls = compose(xmlXPath.all_unknown('message', lxmlNode=original))
    list(calls)

    self.assertEqual('<a>a</a>', lxmltostring(original))
def createDownloadHelix(reactor, periodicDownload, oaiDownload, storageComponent, oaiJazz):
    """Return the nested-tuple component tree rooted at ``periodicDownload``.

    The tree routes 'delete' messages to storage, the OAI repository and a
    tombstone writer, and routes 'add' messages through several xpath
    branches that each end in ``storageComponent`` (normalised document,
    header, metadata, combined format, meta), followed by an OaiAddRecord
    registration and a ResurrectTombstone step.

    reactor          -- accepted but not referenced in this function.
    periodicDownload -- root of the returned tree.
    oaiDownload      -- placed below the XML parser.
    storageComponent -- leaf of every storing branch.
    oaiJazz          -- leaf of the delete branch and of the OaiAddRecord branch.

    The result is a plain nested tuple; it is not passed through ``be()`` here.
    """
    return \
    (periodicDownload, # Scheduled connection to a remote (response / request)...
        (XmlParseLxml(fromKwarg="data", toKwarg="lxmlNode", parseOptions=dict(huge_tree=True, remove_blank_text=True)), # Convert from plain text to lxml-object.
            (oaiDownload, # Implementation/Protocol of a PeriodicDownload...
                (UpdateAdapterFromOaiDownloadProcessor(), # Turns an SRU update/delete message (lxmlNode) into the relevant 'delete' or 'add' message.
                    (FilterMessages(['delete']), # Filters delete messages
                        # (LogComponent("Delete Update"),),
                        (storageComponent,), # Delete from storage
                        (oaiJazz,), # Delete from OAI-pmh repo
                        # Write a 'deleted' part to the storage, that holds the (Record)uploadId.
                        (WriteTombstone(),
                            (storageComponent,),
                        )
                    ),
                    (FilterMessages(allowed=['add']),
                        # (LogComponent("ADD"),),
                        (XmlXPath(['//document:document/document:part[@name="normdoc"]/text()'], fromKwarg='lxmlNode', toKwarg='data', namespaces=NAMESPACEMAP),
                            # (LogComponent("NORMDOC"),),
                            (XmlParseLxml(fromKwarg='data', toKwarg='lxmlNode'),
                                (RewritePartname(NL_DIDL_NORMALISED_PREFIX), # Rewrite the partname to NL_DIDL_NORMALISED_PREFIX.
                                    (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                        (storageComponent,) # Write the normalised part to storage.
                                    )
                                )
                            )
                        ),
                        (XmlXPath(['//document:document/document:part[@name="record"]/text()'], fromKwarg='lxmlNode', toKwarg='data', namespaces=NAMESPACEMAP),
                            (XmlParseLxml(fromKwarg='data', toKwarg='lxmlNode'),
                                # TODO: when conversions fail, make sure the meta and header parts are not
                                # written to storage either: preferably no part at all.
                                # Write the 'header' part to storage:
                                (XmlXPath(['/oai:record/oai:header'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP),
                                    (RewritePartname("header"),
                                        (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                            (storageComponent,) # Writes the OAI header to storage.
                                        )
                                    )
                                ),
                                # Write the 'metadata' part to storage:
                                # This goes wrong on gharvester21: there the root element of the 'metadata'
                                # part is <metadata> instead of <DIDL>.
                                # A child::node() would be preferable here, but that syntax seems to fail
                                # in combination with the XmlXPath component??
                                (XmlXPath(['/oai:record/oai:metadata/didl:DIDL'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP),
                                    # (LogComponent("METADATA_PART"),),
                                    (RewritePartname("metadata"),
                                        (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                            (storageComponent,) # Writes metadata to storage.
                                        )
                                    )
                                )
                            )
                        ),
                        (XmlXPath(['//document:document/document:part[@name="record"]/text()'], fromKwarg='lxmlNode', toKwarg='data', namespaces=NAMESPACEMAP),
                            (XmlParseLxml(fromKwarg='data', toKwarg='lxmlNode'),
                                (NlDidlCombined(nsMap=NAMESPACEMAP, fromKwarg='lxmlNode'), # Create combined format from stored metadataPart and normalized part.
                                    (XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data'), # Convert it to plaintext
                                        (RewritePartname(NL_DIDL_COMBINED_PREFIX), # Rename combined partName
                                            (storageComponent,) # Write combined partName to storage
                                        )
                                    )
                                )
                            )
                        ),
                        (XmlXPath(['//document:document/document:part[@name="meta"]/text()'], fromKwarg='lxmlNode', toKwarg='data', namespaces=NAMESPACEMAP),
                            (RewritePartname("meta"),
                                (storageComponent,) # Writes harvester 'meta' data to storage.
                            )
                        ),
                        (OaiAddRecord(metadataPrefixes=[('metadata', 'http://standards.iso.org/ittf/PubliclyAvailableStandards/MPEG-21_schema_files/did/didmodel.xsd', 'urn:mpeg:mpeg21:2002:02-DIDL-NS'), (NL_DIDL_NORMALISED_PREFIX, '', NAMESPACEMAP.gmhnorm), (NL_DIDL_COMBINED_PREFIX, '', NAMESPACEMAP.gmhcombined)]), # [(partname, schema, namespace)]
                            # (LogComponent("OaiAddRecord:"),),
                            (storageComponent,),
                            (oaiJazz,) # Assert partNames header and meta are available from storage!
                        ),
                        (ResurrectTombstone(),
                            (storageComponent,),
                        ),
                    ),
                )
            )
        )
    )
def main(reactor, port, statePath, **ignored):
    """Return the component tree of the gateway HTTP server.

    Creates an OAI repository under ``<statePath>/oai`` and a storage under
    ``<statePath>/store``, then returns a nested tuple rooted at an
    ``Observable`` with an ``ObservableHttpServer`` on ``port``. In that
    tree the '/oaix' and '/oaix/info' path filters sit below
    ``IpFilter(allowedIps=['127.0.0.1'])``, while the '/update' path filter
    sits directly below ``BasicHttpHandler``.

    reactor   -- passed to ObservableHttpServer and OaiInfo.
    port      -- passed to ObservableHttpServer.
    statePath -- base directory for the 'oai' and 'store' sub-directories.
    ignored   -- extra keyword arguments are accepted and not used.

    The result is a plain nested tuple; it is not passed through ``be()`` here.
    """
    suspendRegister = SuspendRegister()
    oaiRepository = be(
        (OaiJazz(join(statePath, 'oai')),
            (suspendRegister, )
        )
    )

    # WST: HashDistributeStrategy was not used because it also hashes the
    # filename (=partname), which is difficult to read by human eye.
    distributeStrategy = Md5HashDistributeStrategy()
    storage = StorageComponent(
        join(statePath, 'store'),
        strategy=distributeStrategy,
        partsRemovedOnDelete=[NORMALISED_DOC_NAME])

    return (
        Observable(),
        (ObservableHttpServer(reactor=reactor, port=port),
            (BasicHttpHandler(),
                (IpFilter(allowedIps=['127.0.0.1']),
                    (PathFilter('/oaix', excluding=['/oaix/info']),
                        (OaiPmh(repositoryName='Gateway',
                                adminEmail='*****@*****.**',
                                supportXWait=True,
                                batchSize=2000  # Override default batch size of 200.
                            ),
                            (oaiRepository,),
                            (suspendRegister,),
                            (StorageAdapter(),
                                (storage,),
                            ),
                        )
                    ),
                    (PathFilter('/oaix/info'),
                        (OaiInfo(reactor=reactor, oaiPath='/oai'),
                            (oaiRepository,),
                        )
                    ),
                ),
                (PathFilter('/update'),
                    (SruRecordUpdate(sendRecordData=False, logErrors=True,),
                        (FilterMessages(allowed=['delete']),
                            (storage,),
                            (oaiRepository,),
                        ),
                        (FilterMessages(allowed=['add']),
                            # An AddMetadataFormat(fromKwarg="lxmlNode", name='md_format') step was
                            # tried here; it does not seem to work (see comments in that component).
                            (XmlXPath(['srw:recordData/*'], fromKwarg='lxmlNode'),  # Forwards EVERY matching node in a new message.
                                (AddMetadataNamespace(dateformat="%Y-%m-%dT%H:%M:%SZ", fromKwarg='lxmlNode'),  # Adds metadataNamespace to meta part in the message.
                                    (NormaliseOaiRecord(fromKwarg='lxmlNode'),  # Normalises record to: long & original parts. Raises ValidationException if no 'known' metadataformat
                                        (XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data', pretty_print=False),
                                            (RewritePartname(NORMALISED_DOC_NAME),  # Rename converted part.
                                                (storage,),  # Store converted/renamed part.
                                            )
                                        )
                                    ),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[NORMALISED_DOC_NAME]),
                                        (oaiRepository,),
                                    )
                                )
                            )
                        )
                    )
                )
            )
        )
    )
def testDoNotChangeOriginal(self):
    """Check that running an xpath over a tree leaves that tree's serialisation unchanged."""
    xmlXPath = XmlXPath(['/a'], fromKwarg='lxmlNode')
    lxmlNode = parse(StringIO('<a>a</a>'))

    # Consume the generator so the component really handles the message.
    list(compose(xmlXPath.all_unknown('message', lxmlNode=lxmlNode)))

    # assertEqual is used instead of the deprecated assertEquals alias.
    self.assertEqual('<a>a</a>', lxmltostring(lxmlNode))
def createDownloadHelix(reactor, periodicDownload, oaiDownload, storageComponent, oaiJazz, oai_oa_cerifJazz):
    """Return the nested-tuple component tree rooted at ``periodicDownload``.

    'delete' messages are sent to ``storageComponent``, ``oaiJazz``,
    ``oai_oa_cerifJazz`` and a tombstone writer. For 'add' messages the
    'record' part is parsed and then fed to:

    * per-collection CERIF crosswalks (stored under OPENAIRE_PARTNAME),
    * the original 'metadata', 'knaw_long', 'knaw_short', 'oai_dc' and
      'header' storing branches,
    * OAI registrations in ``oaiJazz`` (oai_dc sets) and in
      ``oai_oa_cerifJazz`` (openaire_cris_* sets).

    The 'meta' part is stored separately and a ResurrectTombstone step runs
    for every 'add'.

    reactor          -- accepted but not referenced in this function.
    periodicDownload -- root of the returned tree.
    oaiDownload      -- placed below the XML parser.
    storageComponent -- leaf of every storing branch.
    oaiJazz          -- OAI repository for the oai_dc sets.
    oai_oa_cerifJazz -- OAI repository for the OPENAIRE_PARTNAME sets.

    The result is a plain nested tuple; it is not passed through ``be()`` here.
    """
    xsltDirectory = join(dirname(abspath(__file__)), '..', '..', 'xslt')

    def cerifCrosswalk(xsltFilename):
        # One CERIF chain: crosswalk with the given stylesheet, rewrite the
        # partname to OPENAIRE_PARTNAME, serialise and store. It is called inline
        # in the tree below so components are still created in tree order.
        return \
            (XsltCrosswalk([join(xsltDirectory, xsltFilename)], fromKwarg="lxmlNode"),
                (RewritePartname(OPENAIRE_PARTNAME),
                    (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                        (storageComponent,)
                    )
                )
            )

    # NOTE(review): every XmlXPath below uses the ``namespaces`` keyword; a few
    # calls previously used ``namespaceMap``. This assumes XmlXPath treats both
    # identically -- confirm against the installed meresco-components.
    return \
    (periodicDownload, # Scheduled connection to a remote (response / request)...
        (XmlParseLxml(fromKwarg="data", toKwarg="lxmlNode", parseOptions=dict(huge_tree=True, remove_blank_text=True)), # Convert from plain text to lxml-object.
            (oaiDownload, # Implementation/Protocol of a PeriodicDownload...
                (UpdateAdapterFromOaiDownloadProcessor(), # Turns an SRU update/delete message (lxmlNode) into the relevant 'delete' or 'add' message.
                    (FilterMessages(['delete']), # Filters delete messages
                        # (LogComponent("Delete Update"),),
                        (storageComponent,), # Delete from storage
                        (oaiJazz,), # Delete from OAI-pmh repo
                        (oai_oa_cerifJazz,),
                        # Write a 'deleted' part to the storage, that holds the (Record)uploadId.
                        (WriteTombstone(),
                            (storageComponent,),
                        )
                    ),
                    (FilterMessages(allowed=['add']),
                        (XmlXPath(['/oai:record/oai:metadata/document:document/document:part[@name="record"]/text()'], fromKwarg='lxmlNode', toKwarg='data', namespaces=NAMESPACEMAP),
                            (XmlParseLxml(fromKwarg='data', toKwarg='lxmlNode'),
                                (FilterWcpCollection(allowed=['research']),
                                    (XmlXPath(['/oai:record/oai:metadata/norm:md_original/child::*'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Original 'metadata' format
                                        cerifCrosswalk('cerif-project.xsl')
                                    )
                                ),
                                (FilterWcpCollection(allowed=['person']),
                                    (XmlXPath(['/oai:record/oai:metadata/norm:md_original/child::*'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Original 'metadata' format
                                        cerifCrosswalk('cerif-person.xsl')
                                    )
                                ),
                                (FilterWcpCollection(allowed=['organisation']),
                                    (XmlXPath(['/oai:record/oai:metadata/norm:md_original/child::*'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Original 'metadata' format
                                        cerifCrosswalk('cerif-orgunit.xsl')
                                    )
                                ),
                                (FilterWcpCollection(allowed=['dataset']), # START CERIF CONVERSION FOR DATASET COLLECTION: cerif-dataset / cerif-software.
                                    (XmlXPath(['/oai:record/oai:metadata/norm:normalized/long:knaw_long'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Normalised 'long' format.
                                        cerifCrosswalk('cerif-product.xsl')
                                    )
                                ),
                                (FilterWcpCollection(allowed=['publication']), # START CERIF CONVERSION FOR PUBLICATIONS COLLECTION
                                    (XmlXPath(['/oai:record/oai:metadata/norm:normalized/long:knaw_long'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Normalised 'long' format.
                                        (FilterKnawLongGenre(allowed=['patent']), # START PATENTS CONVERSION
                                            cerifCrosswalk('cerif-patent.xsl')
                                        ),
                                        (FilterKnawLongGenre(disallowed=['patent']), # START Publication CONVERSION
                                            cerifCrosswalk('cerif-publication.xsl')
                                        )
                                    )
                                ),
                                (XmlXPath(['/oai:record/oai:metadata/norm:md_original/child::*'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Original 'metadata' format
                                    (RewritePartname("metadata"), # Renames partname from 'record' to "metadata".
                                        (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                            (storageComponent,) # Writes oai:metadata (=original) to storage.
                                        )
                                    )
                                ),
                                (XmlXPath(['/oai:record/oai:metadata/norm:normalized/long:knaw_long'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Normalised 'long' format.
                                    (RewritePartname("knaw_long"), # Renames partname from 'record' to "knaw_long".
                                        (FilterWcpCollection(disallowed=['person', 'research', 'organisation']),
                                            (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                                (storageComponent,), # Writes 'long' (=norm:normdoc) to storage.
                                            )
                                        ),
                                        (ShortConverter(fromKwarg='lxmlNode'), # Create the 'knaw_short' subset format.
                                            (RewritePartname("knaw_short"),
                                                (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                                    (storageComponent,) # Writes 'short' to storage.
                                                )
                                            )
                                        ),
                                        (FilterWcpCollection(disallowed=['person', 'research', 'organisation']),
                                            (DcConverter(fromKwarg='lxmlNode'),
                                                (RewritePartname("oai_dc"), # Renames partname to "oai_dc".
                                                    (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                                        (storageComponent,) # Writes 'oai_dc' to storage.
                                                    )
                                                )
                                            )
                                        )
                                    )
                                ),
                                # TODO: when conversions fail, make sure the meta and header parts are not
                                # written to storage either: preferably not a single part.
                                # Write the 'header' part to storage:
                                (XmlXPath(['/oai:record/oai:header'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP),
                                    (RewritePartname("header"),
                                        (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                            (storageComponent,) # Writes the OAI header to storage.
                                        )
                                    )
                                ),
                                (FilterWcpCollection(allowed=['publication']),
                                    # (LogComponent("PUBLICATION"),),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['publication']),
                                        (oaiJazz,),
                                    ),
                                    (XmlXPath(["//long:knaw_long[long:accessRights ='openAccess']"], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['oa_publication', 'openaire']),
                                            (oaiJazz,),
                                        )
                                    ),
                                    (XmlXPath(["//long:knaw_long/long:metadata[long:genre ='doctoralthesis']"], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['thesis']),
                                            (oaiJazz,),
                                        )
                                    ),
                                    (XmlXPath(['//long:knaw_long/long:metadata/long:grantAgreements/long:grantAgreement[long:code[contains(.,"greement/EC/") or contains(.,"greement/ec/")]][1]'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['ec_fundedresources', 'openaire']),
                                            (oaiJazz,),
                                        )
                                    ),
                                    (XmlXPath(["//long:knaw_long/long:metadata[long:genre ='patent']"], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[OPENAIRE_PARTNAME], setSpecs=["openaire_cris_patents"]),
                                            (oai_oa_cerifJazz,),
                                        )
                                    ),
                                    (XmlXPath(["//long:knaw_long/long:metadata[long:genre !='patent']"], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[OPENAIRE_PARTNAME], setSpecs=["openaire_cris_publications"]),
                                            (oai_oa_cerifJazz,),
                                        )
                                    ),
                                ),
                                (FilterWcpCollection(allowed=['dataset']),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['dataset']),
                                        (oaiJazz,),
                                    ),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[OPENAIRE_PARTNAME], setSpecs=["openaire_cris_products"]),
                                        (oai_oa_cerifJazz,),
                                    )
                                ),
                                # Add NOD OpenAIRE Cerif to OpenAIRE-PMH repo.
                                (FilterWcpCollection(allowed=['research']),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[OPENAIRE_PARTNAME], setSpecs=["openaire_cris_projects"]),
                                        (oai_oa_cerifJazz,),
                                    )
                                ),
                                (FilterWcpCollection(allowed=['person']),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[OPENAIRE_PARTNAME], setSpecs=['openaire_cris_persons']),
                                        (oai_oa_cerifJazz,),
                                    )
                                ),
                                (FilterWcpCollection(allowed=['organisation']),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[OPENAIRE_PARTNAME], setSpecs=['openaire_cris_orgunits']),
                                        (oai_oa_cerifJazz,),
                                    )
                                )
                            )
                        ),
                        # Write the 'meta' part to storage:
                        (XmlXPath(['/oai:record/oai:metadata/document:document/document:part[@name="meta"]/text()'], fromKwarg='lxmlNode', toKwarg='data', namespaces=NAMESPACEMAP),
                            (RewritePartname("meta"),
                                (storageComponent,) # Writes harvester 'meta' data to storage.
                            )
                        )
                    ),
                    (FilterMessages(allowed=['add']),
                        # (LogComponent("UnDelete"),),
                        (ResurrectTombstone(),
                            (storageComponent,),
                        )
                    )
                )
            )
        )
    )
def createDownloadHelix(reactor, periodicDownload, oaiDownload, storageComponent, oaiJazz):
    """Return the nested-tuple component tree rooted at ``periodicDownload``.

    'delete' messages are sent to ``storageComponent``, ``oaiJazz`` and a
    tombstone writer. For 'add' messages the 'record' part is parsed and
    then fed to the 'metadata', 'knaw_long', 'knaw_short', 'oai_dc' and
    'header' storing branches and to the oai_dc set registrations in
    ``oaiJazz``. The 'meta' part is stored separately and a
    ResurrectTombstone step runs for every 'add'.

    reactor          -- accepted but not referenced in this function.
    periodicDownload -- root of the returned tree.
    oaiDownload      -- placed below the XML parser.
    storageComponent -- leaf of every storing branch.
    oaiJazz          -- OAI repository receiving deletes and set registrations.

    The result is a plain nested tuple; it is not passed through ``be()`` here.
    """
    # NOTE(review): every XmlXPath below uses the ``namespaces`` keyword; a few
    # calls previously used ``namespaceMap``. This assumes XmlXPath treats both
    # identically -- confirm against the installed meresco-components.
    return \
    (periodicDownload, # Scheduled connection to a remote (response / request)...
        (XmlParseLxml(fromKwarg="data", toKwarg="lxmlNode", parseOptions=dict(huge_tree=True, remove_blank_text=True)), # Convert from plain text to lxml-object.
            (oaiDownload, # Implementation/Protocol of a PeriodicDownload...
                (UpdateAdapterFromOaiDownloadProcessor(), # Turns an SRU update/delete message (lxmlNode) into the relevant 'delete' or 'add' message.
                    (FilterMessages(['delete']), # Filters delete messages
                        # (LogComponent("Delete Update"),),
                        (storageComponent,), # Delete from storage
                        (oaiJazz,), # Delete from OAI-pmh repo
                        # Write a 'deleted' part to the storage, that holds the (Record)uploadId.
                        (WriteTombstone(),
                            (storageComponent,),
                        )
                    ),
                    (FilterMessages(allowed=['add']),
                        # TODO: the toKwarg='data' below can go; then the next line (the re-parse) can go as well.
                        (XmlXPath(['/oai:record/oai:metadata/document:document/document:part[@name="record"]/text()'], fromKwarg='lxmlNode', toKwarg='data', namespaces=NAMESPACEMAP),
                            (XmlParseLxml(fromKwarg='data', toKwarg='lxmlNode'),
                                (XmlXPath(['/oai:record/oai:metadata/norm:md_original/child::*'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Original 'metadata' format
                                    (RewritePartname("metadata"), # Renames partname from 'record' to "metadata".
                                        (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                            (storageComponent,) # Writes oai:metadata (=original) to storage.
                                        )
                                    )
                                ),
                                (XmlXPath(['/oai:record/oai:metadata/norm:normalized/long:knaw_long'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Normalised 'long' format.
                                    (RewritePartname("knaw_long"), # Renames partname from 'record' to "knaw_long".
                                        (FilterWcpCollection(disallowed=['person', 'research', "organisation"]),
                                            (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                                (storageComponent,), # Writes 'long' (=norm:normdoc) to storage.
                                            )
                                        ),
                                        (ShortConverter(fromKwarg='lxmlNode'), # Create the 'knaw_short' subset format.
                                            (RewritePartname("knaw_short"),
                                                (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                                    (storageComponent,) # Writes 'short' to storage.
                                                )
                                            )
                                        ),
                                        (FilterWcpCollection(disallowed=['person', 'research', "organisation"]),
                                            (DcConverter(fromKwarg='lxmlNode'),
                                                (RewritePartname("oai_dc"), # Renames partname to "oai_dc".
                                                    (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                                        (storageComponent,) # Writes 'oai_dc' to storage.
                                                    )
                                                )
                                            )
                                        )
                                    )
                                ),
                                # TODO: when conversions fail, make sure the meta and header parts are not
                                # written to storage either: preferably not a single part.
                                # Write the 'header' part to storage:
                                (XmlXPath(['/oai:record/oai:header'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP),
                                    (RewritePartname("header"),
                                        (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                            (storageComponent,) # Writes the OAI header to storage.
                                        )
                                    )
                                ),
                                (FilterWcpCollection(allowed=['publication']),
                                    # (LogComponent("PUBLICATION"),),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['publication'], name='NARCISPORTAL'), # TODO: Skip name='NARCISPORTAL'
                                        (oaiJazz,),
                                    ),
                                    (XmlXPath(["//long:knaw_long[long:accessRights ='openAccess']"], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP),
                                        # (LogComponent("OPENACCESS"),),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['oa_publication', 'openaire'], name='NARCISPORTAL'),
                                            (oaiJazz,),
                                        )
                                    ),
                                    (XmlXPath(["//long:knaw_long/long:metadata[long:genre ='doctoralthesis']"], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['thesis'], name='NARCISPORTAL'),
                                            (oaiJazz,),
                                        )
                                    ),
                                    (XmlXPath(['//long:knaw_long/long:metadata/long:grantAgreements/long:grantAgreement[long:code[contains(.,"greement/EC/") or contains(.,"greement/ec/")]][1]'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['ec_fundedresources', 'openaire'], name='NARCISPORTAL'),
                                            (oaiJazz,),
                                        )
                                    )
                                ),
                                (FilterWcpCollection(allowed=['dataset']),
                                    # (LogComponent("DATASET"),),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['dataset'], name='NARCISPORTAL'),
                                        (oaiJazz,),
                                    )
                                )
                            )
                        ),
                        # Write the 'meta' part to storage:
                        (XmlXPath(['/oai:record/oai:metadata/document:document/document:part[@name="meta"]/text()'], fromKwarg='lxmlNode', toKwarg='data', namespaces=NAMESPACEMAP),
                            (RewritePartname("meta"),
                                (storageComponent,) # Writes harvester 'meta' data to storage.
                            )
                        )
                    ),
                    (FilterMessages(allowed=['add']), # TODO: Remove this line.
                        # (LogComponent("UnDelete"),),
                        (ResurrectTombstone(),
                            (storageComponent,),
                        )
                    )
                )
            )
        )
    )
def writerMain(writerReactor, statePath, luceneserverPort, gatewayPort, quickCommit=False):
    """Return the component tree of the index writer.

    Builds an HTTP request helper, a Lucene writer configuration, a periodic
    download from the gateway on ``localhost:gatewayPort`` and a periodic
    'commit' call towards the Lucene server on ``localhost:luceneserverPort``.
    The returned tree sends 'add' messages through NormdocToFieldsList and
    FieldsListToLuceneDocument to the Lucene writer and sends 'delete'
    messages to the Lucene writer directly.

    writerReactor    -- reactor handed to SocketPool, PeriodicDownload and PeriodicCall.
    statePath        -- base directory; the harvester state lives in
                        ``<statePath>/harvesterstate/gateway``.
    luceneserverPort -- passed to luceneAndReaderConfig and LuceneCommit.
    gatewayPort      -- port used by the PeriodicDownload.
    quickCommit      -- when true, both the download period and the commit
                        period are 1 instead of 10 and 300.

    The result is a plain nested tuple; it is not passed through ``be()`` here.
    """
    httpRequest = be(
        (HttpRequest1_1(),
            (SocketPool(reactor=writerReactor, unusedTimeout=5, limits=dict(totalSize=100, destinationSize=10)),),
        )
    )

    luceneSettings = LuceneSettings(
        commitTimeout=30,
        readonly=False,
    )
    luceneWriter = luceneAndReaderConfig(luceneSettings, httpRequest, luceneserverPort)

    # WST: the download period is the interval in seconds before a new request is
    # sent to the GATEWAY in case of an error while processing batch records.
    # IntegrationTests need 1 second, otherwise tests will fail!
    downloadPeriod = 1 if quickCommit else 10
    # WST: the commit flushes data from memory to disk.
    # IntegrationTests need 1 second, otherwise tests will fail! (API).
    commitPeriod = 1 if quickCommit else 300

    gatewayDownload = PeriodicDownload(
        writerReactor,
        host='localhost',
        port=gatewayPort,
        schedule=Schedule(period=downloadPeriod),
        name='index',
        autoStart=True)

    oaiProcessor = OaiDownloadProcessor(
        path='/oaix',
        metadataPrefix=NORMALISED_DOC_NAME,
        workingDirectory=join(statePath, 'harvesterstate', 'gateway'),
        userAgentAddition='idx-server',
        xWait=True,
        name='index',
        autoCommit=False)

    # Post commit to the Lucene server:
    scheduledCommit = be(
        (PeriodicCall(writerReactor, message='commit', name='Scheduled commit', schedule=Schedule(period=commitPeriod), initialSchedule=Schedule(period=1)),
            (AllToDo(),  # broadcast message to all components, despite of what kind of message...
                # WST: gatewayDownload is deliberately not attached here; it does not
                # appear to do anything with a 'commit' message.
                (LuceneCommit(host='localhost', port=luceneserverPort,),  # 'commit' message results in http post to /commit/ to Lucene server.
                    # (LogComponent("PERIODIC"),),
                    (httpRequest,),
                )
            )
        )
    )

    return (
        Observable(),
        (scheduledCommit,),  # Periodically send a 'commit' to the Lucene server...
        (gatewayDownload,  # Connect (periodically) to the Gateway server...
            (XmlParseLxml(fromKwarg="data", toKwarg="lxmlNode", parseOptions=dict(huge_tree=True, remove_blank_text=True)),
                (oaiProcessor,  # Fetch the OAI records from the Gateway...
                    (UpdateAdapterFromOaiDownloadProcessor(),  # Turns an SRU update/delete message (lxmlNode) into the relevant 'delete' or 'add' message.
                        # (LogComponent("SRU harvest van GATEWAY"),),
                        (FilterMessages(allowed=['add']),
                            (XmlXPath(['/oai:record/oai:metadata/document:document'], fromKwarg='lxmlNode'),
                                # (LogComponent("NormdocToFieldsList"),),
                                (NormdocToFieldsList(),  # Flat list of field names and values...
                                    (RecordPidToAuthNid(),),
                                    (FieldsListToLuceneDocument(  # Creates the addDocument message + creates the facet/drilldown fields whose values are truncated to at most 256 chars.
                                            fieldRegistry=luceneWriter.settings.fieldRegistry,  # among others the drilldown fields definition
                                            untokenizedFieldnames=untokenizedFieldnames,  # untokenized fields
                                            indexFieldFactory=DcFields,  # Creates an "__all__", fieldname and optionally "untokenized.fieldname"...
                                            # rewriteIdentifier=(lambda idee: idee.split(':', 1)[-1])  # meresco:record:1' => 'record:1'
                                        ),
                                        # (LogComponent("FieldsListToLuceneDocument"),),
                                        (luceneWriter,),
                                    )
                                )
                            )
                        ),
                        (FilterMessages(allowed=['delete']),
                            (luceneWriter,),
                        )
                    )
                )
            )
        )
    )