Пример #1
0
    def testNamespaces(self):
        """Check an XPath with namespace prefixes against namespaced input.

        The node forwarded to the observer must serialize to just the
        ``bNode`` element, and its namespace map must equal the map obtained
        by re-parsing that serialization.
        """
        prefixes = {'a': 'aNamespace', 'b': 'bNamespace'}
        xmlXPath = XmlXPath(['/a:aNode/b:bNode'], fromKwarg='lxmlNode', namespaces=prefixes)
        source = '<aNode xmlns="aNamespace"><bNode xmlns="bNamespace">ccc</bNode></aNode>'
        inputTree = parse(StringIO(source))
        tracer = CallTrace('Observer')
        root = Observable()
        root.addObserver(xmlXPath)
        xmlXPath.addObserver(tracer)

        root.do.message(lxmlNode=inputTree)

        # Only the first traced call is inspected.
        firstCall = tracer.calledMethods[0]
        self.assertEqual('message', firstCall.name)
        selected = firstCall.kwargs['lxmlNode']
        self.assertEqualsWS('<bNode xmlns="bNamespace">ccc</bNode>', lxmltostring(selected))

        # Round-trip through serialization and compare the namespace maps.
        reparsed = parse(StringIO(lxmltostring(selected)))
        self.assertEqual(reparsed.getroot().nsmap, selected.getroot().nsmap)
    def testXPathReturnsString(self):
        """Check that an XPath ending in ``text()`` forwards a plain string.

        The observer must be called exactly once, with the text content of
        ``<t>`` in which the XML entities have been unescaped.
        """
        xpath = XmlXPath(['/a/t/text()'], fromKwarg="lxmlNode")
        inputNode = parse(StringIO('<a><t>some text &amp; some &lt;entities&gt;</t></a>'))

        observable = Observable()
        observer = CallTrace('observer')
        observable.addObserver(xpath)
        xpath.addObserver(observer)

        observable.do.aMethod(lxmlNode=inputNode)
        # assertEqual is used instead of the assertEquals alias, which was
        # deprecated in Python 3.2 and removed in Python 3.12.
        self.assertEqual(1, len(observer.calledMethods))
        result = observer.calledMethods[0].kwargs
        self.assertEqual({'lxmlNode': 'some text & some <entities>'}, result)
    def testTailTakenCareOfWithoutAffectingOriginal(self):
        observer = CallTrace('observer', methods={'test': lambda *args, **kwargs: (x for x in [])})
        observable = be(
            (Observable(),
                (XmlXPath(
                        ['/myns:root/myns:path'],
                        fromKwarg='lxmlNode',
                        namespaces={'myns': 'http://myns.org/'}
                    ),
                    (observer, ),
                )
            )
        )

        XML = """\
<root xmlns:myns="http://myns.org/" xmlns="http://myns.org/">
    <myns:path>
        <to>me</to>
    </myns:path>\n
</root>"""

        lxmlNode = parse(StringIO(XML))
        self.assertEquals(XML, lxmltostring(lxmlNode))
        list(compose(observable.all.test('een tekst', lxmlNode=lxmlNode)))

        self.assertEquals(1, len(observer.calledMethods))
        method = observer.calledMethods[0]
        self.assertEquals('test', method.name)
        self.assertEqualsWS('<myns:path xmlns:myns="http://myns.org/" xmlns="http://myns.org/"><to>me</to></myns:path>', lxmltostring(method.kwargs['lxmlNode']))
        self.assertEquals("""\
<myns:path xmlns:myns="http://myns.org/" xmlns="http://myns.org/">
        <to>me</to>
    </myns:path>""", lxmltostring(method.kwargs['lxmlNode']))

        self.assertEquals(XML, lxmltostring(lxmlNode))
Пример #4
0
    def testNamespaces(self):
        """Check an XPath with namespace prefixes against namespaced input.

        The node forwarded to the observer must serialize to just the
        ``bNode`` element, and its namespace map must equal the map obtained
        by re-parsing that serialization.
        """
        xmlXPath = XmlXPath(['/a:aNode/b:bNode'], fromKwarg='lxmlNode', namespaces={'a':'aNamespace', 'b':'bNamespace' })
        lxmlNode = parse(StringIO('<aNode xmlns="aNamespace"><bNode xmlns="bNamespace">ccc</bNode></aNode>'))
        observer = CallTrace('Observer')
        observable = Observable()
        observable.addObserver(xmlXPath)
        xmlXPath.addObserver(observer)

        observable.do.message(lxmlNode=lxmlNode)

        message = observer.calledMethods[0]
        # assertEqual is used instead of the assertEquals alias, which was
        # deprecated in Python 3.2 and removed in Python 3.12.
        self.assertEqual('message', message.name)
        newNode = message.kwargs['lxmlNode']
        self.assertEqualsWS('<bNode xmlns="bNamespace">ccc</bNode>', lxmltostring(newNode))

        # Round-trip through serialization and compare the namespace maps.
        newNamespaces = newNode.getroot().nsmap
        nameSpacesAfterParsing = parse(StringIO(lxmltostring(newNode))).getroot().nsmap
        self.assertEqual(nameSpacesAfterParsing, newNamespaces)
Пример #5
0
 def createXmlXPath(self, xpathList, nsMap):
     """Build the Observable -> XmlParseLxml -> XmlXPath -> observer chain.

     Stores a CallTrace on ``self.observer`` and the result of ``be(...)``
     on ``self.observable``. ``xpathList`` and ``nsMap`` are passed on to
     XmlXPath.
     """
     self.observer = CallTrace('observer', ignoredAttributes=['start'])
     # Components are created in the same order as they appear in the chain.
     root = Observable()
     parser = XmlParseLxml(fromKwarg='data', toKwarg='lxmlNode')
     selector = XmlXPath(xpathList, fromKwarg='lxmlNode', namespaces=nsMap)
     selectorBranch = (selector, (self.observer, ))
     parserBranch = (parser, selectorBranch)
     self.observable = be((root, parserBranch))
Пример #6
0
 def testDoNotChangeOriginal(self):
     """Check that running an XPath leaves the input tree's serialization unchanged."""
     original = '<a>a</a>'
     component = XmlXPath(['/a'], fromKwarg='lxmlNode')
     tree = parse(StringIO(original))
     # Exhaust the composed generator so the whole call chain runs.
     for _ in compose(component.all_unknown('message', lxmlNode=tree)):
         pass
     self.assertEqual(original, lxmltostring(tree))
Пример #7
0
def createDownloadHelix(reactor, periodicDownload, oaiDownload,
                        storageComponent, oaiJazz):
    """Return the nested-tuple component tree rooted at ``periodicDownload``.

    The tree chains XmlParseLxml, ``oaiDownload`` and
    UpdateAdapterFromOaiDownloadProcessor, and then splits into two branches:

    * ``FilterMessages(['delete'])``: ``storageComponent``, ``oaiJazz`` and
      a WriteTombstone that has ``storageComponent`` below it.
    * ``FilterMessages(allowed=['add'])``: several XmlXPath selections on
      the ``document:part`` elements ("normdoc", "record", "meta"), each
      ending in ``storageComponent``, followed by OaiAddRecord and
      ResurrectTombstone.

    ``reactor`` is accepted but not referenced in this function.
    """
    return \
    (periodicDownload, # Scheduled connection to a remote (response / request)...
        (XmlParseLxml(fromKwarg="data", toKwarg="lxmlNode", parseOptions=dict(huge_tree=True, remove_blank_text=True)), # Convert from plain text to lxml-object.
            (oaiDownload, # Implementation/Protocol of a PeriodicDownload...
                (UpdateAdapterFromOaiDownloadProcessor(), # Turns an SRU update/delete message (lxmlNode) into the relevant message: a 'delete' or an 'add' message.
                    (FilterMessages(['delete']), # Filters delete messages
                        # (LogComponent("Delete Update"),),
                        (storageComponent,), # Delete from storage
                        (oaiJazz,), # Delete from OAI-pmh repo
                        # Write a 'deleted' part to the storage, that holds the (Record)uploadId.
                        (WriteTombstone(),
                            (storageComponent,),
                        )
                    ),
                    (FilterMessages(allowed=['add']),
                        # (LogComponent("ADD"),),

                        (XmlXPath(['//document:document/document:part[@name="normdoc"]/text()'], fromKwarg='lxmlNode', toKwarg='data', namespaces=NAMESPACEMAP),
                            # (LogComponent("NORMDOC"),),
                            (XmlParseLxml(fromKwarg='data', toKwarg='lxmlNode'),
                                (RewritePartname(NL_DIDL_NORMALISED_PREFIX), # Renames the partname from 'record' to "metadata". NOTE(review): the argument is NL_DIDL_NORMALISED_PREFIX; confirm this comment is still accurate.
                                    (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                        (storageComponent,) # Writes oai:metadata (=original) to storage.
                                    )
                                )
                            )
                        ),

                        (XmlXPath(['//document:document/document:part[@name="record"]/text()'], fromKwarg='lxmlNode', toKwarg='data', namespaces=NAMESPACEMAP),
                            (XmlParseLxml(fromKwarg='data', toKwarg='lxmlNode'),
                                # TODO: Check that when conversions fail, the meta and header parts are not written to storage either: preferably no part at all...
                                # Write the 'header' partname to storage:
                                (XmlXPath(['/oai:record/oai:header'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP),
                                    (RewritePartname("header"),
                                        (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                            (storageComponent,) # Writes the OAI header to storage.
                                        )
                                    )
                                ),
                                # Write the 'metadata' partname to storage:
                                # This goes wrong on gharvester21: there the root element in the 'metadata' part is <metadata> instead of <DIDL>.
                                # A child::node() would be preferable here, but that syntax fails in combination with the XmlXPath component??
                                (XmlXPath(['/oai:record/oai:metadata/didl:DIDL'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP),
                                    # (LogComponent("METADATA_PART"),),
                                    (RewritePartname("metadata"),
                                        (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                            (storageComponent,) # Writes metadata to storage.
                                        )
                                    )
                                )
                            )
                        ),

                        (XmlXPath(['//document:document/document:part[@name="record"]/text()'], fromKwarg='lxmlNode',
                               toKwarg='data', namespaces=NAMESPACEMAP),
                            (XmlParseLxml(fromKwarg='data', toKwarg='lxmlNode'),
                                (NlDidlCombined(nsMap=NAMESPACEMAP, fromKwarg='lxmlNode'),
                                    # Create combined format from stored metadataPart and normalized part.
                                    (XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data'),  # Convert it to plaintext
                                        (RewritePartname(NL_DIDL_COMBINED_PREFIX),  # Rename combined partName
                                            (storageComponent,)  # Write combined partName to storage
                                        )
                                    )
                                )
                            )
                        ),

                        (XmlXPath(['//document:document/document:part[@name="meta"]/text()'], fromKwarg='lxmlNode', toKwarg='data', namespaces=NAMESPACEMAP),
                            (RewritePartname("meta"),
                                (storageComponent,) # Writes the harvester 'meta' data to storage.
                            )
                        ),

                        (OaiAddRecord(metadataPrefixes=[('metadata', 'http://standards.iso.org/ittf/PubliclyAvailableStandards/MPEG-21_schema_files/did/didmodel.xsd', 'urn:mpeg:mpeg21:2002:02-DIDL-NS'),
                                    (NL_DIDL_NORMALISED_PREFIX, '', NAMESPACEMAP.gmhnorm),
                                    (NL_DIDL_COMBINED_PREFIX, '', NAMESPACEMAP.gmhcombined)]), #[(partname, schema, namespace)]
                            # (LogComponent("OaiAddRecord:"),),
                            (storageComponent,),
                            (oaiJazz,) # Assert partNames header and meta are available from storage!
                        ),

                        (ResurrectTombstone(),
                            (storageComponent,),
                        ),

                    ),

                    # (FilterMessages(allowed=['add']),
                    #     # (LogComponent("UnDelete"),),
                    #     (ResurrectTombstone(),
                    #         (storageComponent,),
                    #     )
                    # )
                )
            )
        )
    )
Пример #8
0
def main(reactor, port, statePath, **ignored):
    """Return the nested-tuple component tree for the HTTP server.

    Creates an OaiJazz under ``<statePath>/oai`` (with a SuspendRegister
    below it) and a StorageComponent under ``<statePath>/store`` that uses
    Md5HashDistributeStrategy. These are placed in a tree rooted at
    ``Observable()`` with an ObservableHttpServer on ``port``:

    * an ``IpFilter(allowedIps=['127.0.0.1'])`` branch containing the
      ``/oaix`` (OaiPmh) and ``/oaix/info`` (OaiInfo) path filters;
    * a ``/update`` path filter with SruRecordUpdate, split into 'delete'
      and 'add' FilterMessages branches.

    Extra keyword arguments are collected in ``ignored`` and not used.
    """

    oaiSuspendRegister = SuspendRegister()
    oaiJazz = be((OaiJazz(join(statePath, 'oai')), (oaiSuspendRegister, )))

    # WST:
    # strategie = HashDistributeStrategy() # filename (=partname) is also hashed: difficult to read by human eye...
    strategie = Md5HashDistributeStrategy()

    storeComponent = StorageComponent(
        join(statePath, 'store'),
        strategy=strategie,
        partsRemovedOnDelete=[NORMALISED_DOC_NAME])

    return \
    (Observable(),
        # (scheduledCommitPeriodicCall,),
        # (DebugPrompt(reactor=reactor, port=port+1, globals=locals()),),
        (ObservableHttpServer(reactor=reactor, port=port),
            (BasicHttpHandler(),
                (IpFilter(allowedIps=['127.0.0.1']),
                    (PathFilter('/oaix', excluding=['/oaix/info']),
                        (OaiPmh(repositoryName='Gateway',
                                adminEmail='*****@*****.**',
                                supportXWait=True,
                                batchSize=2000 # Override default batch size of 200.
                            ),
                            (oaiJazz,),
                            (oaiSuspendRegister,),
                            (StorageAdapter(),
                                (storeComponent,),
                            ),
                        )
                    ),
                    (PathFilter('/oaix/info'),
                        # NOTE(review): oaiPath is '/oai' while the path filters here use '/oaix' - confirm this is intended.
                        (OaiInfo(reactor=reactor, oaiPath='/oai'),
                            (oaiJazz,),
                        )
                    ),
                ),
                (PathFilter('/update'),
                    (SruRecordUpdate(sendRecordData=False, logErrors=True,),
                        (FilterMessages(allowed=['delete']),
                            (storeComponent,),
                            (oaiJazz,),
                        ),
                        (FilterMessages(allowed=['add']),

                            # Does not work? See comments in component...
                            # (AddMetadataFormat(fromKwarg="lxmlNode", name='md_format'),
                            #     (LogComponent("AddMetadataFormat"),),
                            # ),
                            (XmlXPath(['srw:recordData/*'], fromKwarg='lxmlNode'), # Forwards EVERY matching node in a new message.
                                # (LogComponent("TO LONG CONVERTER:"),),
                                (AddMetadataNamespace(dateformat="%Y-%m-%dT%H:%M:%SZ", fromKwarg='lxmlNode'), # Adds metadataNamespace to meta part in the message.
                                    (NormaliseOaiRecord(fromKwarg='lxmlNode'), # Normalises record to: long & original parts. Raises ValidationException if no 'known' metadataformat
                                        (XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data', pretty_print=False),
                                            (RewritePartname(NORMALISED_DOC_NAME), # Rename converted part.
                                                (storeComponent,), # Store converted/renamed part.
                                            )
                                        )
                                    ),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[NORMALISED_DOC_NAME]),
                                        (oaiJazz,),
                                    )
                                )
                            )
                        )
                    )
                )
            )
        )
    )
Пример #9
0
 def testDoNotChangeOriginal(self):
     """Check that running an XPath leaves the input tree's serialization unchanged."""
     xmlXPath = XmlXPath(['/a'], fromKwarg='lxmlNode')
     lxmlNode = parse(StringIO('<a>a</a>'))
     # Exhaust the composed generator so the whole call chain runs.
     list(compose(xmlXPath.all_unknown('message', lxmlNode=lxmlNode)))
     # assertEqual is used instead of the assertEquals alias, which was
     # deprecated in Python 3.2 and removed in Python 3.12.
     self.assertEqual('<a>a</a>', lxmltostring(lxmlNode))
Пример #10
0
def createDownloadHelix(reactor, periodicDownload, oaiDownload,
                        storageComponent, oaiJazz, oai_oa_cerifJazz):
    """Return the nested-tuple component tree rooted at ``periodicDownload``.

    The tree chains XmlParseLxml, ``oaiDownload`` and
    UpdateAdapterFromOaiDownloadProcessor, and then splits into:

    * ``FilterMessages(['delete'])``: ``storageComponent``, ``oaiJazz``,
      ``oai_oa_cerifJazz`` and a WriteTombstone that has
      ``storageComponent`` below it.
    * ``FilterMessages(allowed=['add'])``: the "record" part is selected and
      re-parsed. Per FilterWcpCollection value it goes through an
      XsltCrosswalk (cerif-*.xsl) and is stored under OPENAIRE_PARTNAME.
      The "metadata", "knaw_long", "knaw_short", "oai_dc" and "header"
      parts are written to ``storageComponent``, and records are
      registered with ``oaiJazz`` / ``oai_oa_cerifJazz`` through
      OaiAddDeleteRecordWithPrefixesAndSetSpecs. The "meta" part is stored
      separately.
    * a second ``FilterMessages(allowed=['add'])`` with ResurrectTombstone.

    ``reactor`` is accepted but not referenced in this function.
    """
    return \
    (periodicDownload, # Scheduled connection to a remote (response / request)...
        (XmlParseLxml(fromKwarg="data", toKwarg="lxmlNode", parseOptions=dict(huge_tree=True, remove_blank_text=True)), # Convert from plain text to lxml-object.
            (oaiDownload, # Implementation/Protocol of a PeriodicDownload...
                (UpdateAdapterFromOaiDownloadProcessor(), # Turns an SRU update/delete message (lxmlNode) into the relevant message: a 'delete' or an 'add' message.
                    (FilterMessages(['delete']), # Filters delete messages
                        # (LogComponent("Delete Update"),),
                        (storageComponent,), # Delete from storage
                        (oaiJazz,), # Delete from OAI-pmh repo
                        (oai_oa_cerifJazz,),
                        # Write a 'deleted' part to the storage, that holds the (Record)uploadId.
                        (WriteTombstone(),
                            (storageComponent,),
                        )
                    ),
                    (FilterMessages(allowed=['add']),
                        (XmlXPath(['/oai:record/oai:metadata/document:document/document:part[@name="record"]/text()'], fromKwarg='lxmlNode', toKwarg='data', namespaces=NAMESPACEMAP),
                            (XmlParseLxml(fromKwarg='data', toKwarg='lxmlNode'),

                                (FilterWcpCollection(allowed=['research']),
                                    (XmlXPath(['/oai:record/oai:metadata/norm:md_original/child::*'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Original 'metadata' format
                                        (XsltCrosswalk([join(dirname(abspath(__file__)), '..', '..', 'xslt', 'cerif-project.xsl')], fromKwarg="lxmlNode"),
                                            (RewritePartname(OPENAIRE_PARTNAME),
                                                (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                                    (storageComponent,)
                                                )
                                            )
                                        )
                                    )
                                ),

                                (FilterWcpCollection(allowed=['person']),
                                    (XmlXPath(['/oai:record/oai:metadata/norm:md_original/child::*'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Original 'metadata' format
                                        (XsltCrosswalk([join(dirname(abspath(__file__)), '..', '..', 'xslt', 'cerif-person.xsl')], fromKwarg="lxmlNode"),
                                            (RewritePartname(OPENAIRE_PARTNAME),
                                                (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                                    (storageComponent,)
                                                )
                                            )
                                        )
                                    )
                                ),
                                (FilterWcpCollection(allowed=['organisation']),
                                    (XmlXPath(['/oai:record/oai:metadata/norm:md_original/child::*'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Original 'metadata' format
                                        (XsltCrosswalk([join(dirname(abspath(__file__)), '..', '..', 'xslt', 'cerif-orgunit.xsl')], fromKwarg="lxmlNode"),
                                            (RewritePartname(OPENAIRE_PARTNAME),
                                                (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                                    (storageComponent,)
                                                )
                                            )
                                        )
                                    )
                                ),
                                (FilterWcpCollection(allowed=['dataset']), # START CERIF CONVERSION FOR DATASET COLLECTION: cerif-dataset / cerif-software.
                                    (XmlXPath(['/oai:record/oai:metadata/norm:normalized/long:knaw_long'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Normalised 'long' format.
                                        (XsltCrosswalk([join(dirname(abspath(__file__)), '..', '..', 'xslt', 'cerif-product.xsl')], fromKwarg="lxmlNode"),
                                            (RewritePartname(OPENAIRE_PARTNAME),
                                                (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                                    (storageComponent,)
                                                )
                                            )
                                        )
                                    )
                                ),
                                (FilterWcpCollection(allowed=['publication']), # START CERIF CONVERSION FOR PUBLICATIONS COLLECTION
                                    (XmlXPath(['/oai:record/oai:metadata/norm:normalized/long:knaw_long'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Normalised 'long' format.
                                        (FilterKnawLongGenre(allowed=['patent']), # START PATENTS CONVERSION
                                            (XsltCrosswalk([join(dirname(abspath(__file__)), '..', '..', 'xslt', 'cerif-patent.xsl')], fromKwarg="lxmlNode"),
                                                (RewritePartname(OPENAIRE_PARTNAME),
                                                    (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                                        (storageComponent,)
                                                    )
                                                )
                                            )
                                        ),
                                        (FilterKnawLongGenre(disallowed=['patent']), # START Publication CONVERSION
                                            (XsltCrosswalk([join(dirname(abspath(__file__)), '..', '..', 'xslt', 'cerif-publication.xsl')], fromKwarg="lxmlNode"),
                                                (RewritePartname(OPENAIRE_PARTNAME),
                                                    (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                                        (storageComponent,)
                                                    )
                                                )
                                            )
                                        )
                                    )
                                ),
                                (XmlXPath(['/oai:record/oai:metadata/norm:md_original/child::*'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Original 'metadata' format
                                    (RewritePartname("metadata"), # Renames the partname from 'record' to "metadata".
                                        (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                            (storageComponent,) # Writes oai:metadata (=original) to storage.
                                        )
                                    )
                                ),
                                (XmlXPath(['/oai:record/oai:metadata/norm:normalized/long:knaw_long'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Normalised 'long' format.
                                    (RewritePartname("knaw_long"), # Renames the partname from 'record' to "knaw_long".
                                        (FilterWcpCollection(disallowed=['person', 'research', 'organisation']),
                                            (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                                (storageComponent,), # Writes 'long' (=norm:normdoc) to storage.
                                            )
                                        ),
                                        (ShortConverter(fromKwarg='lxmlNode'), # Create the 'knaw_short' subset format.
                                            (RewritePartname("knaw_short"),
                                                (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                                    (storageComponent,) # Writes 'short' to storage.
                                                )
                                            )
                                        ),
                                        (FilterWcpCollection(disallowed=['person', 'research', 'organisation']),
                                            (DcConverter(fromKwarg='lxmlNode'), # Rename the partname from 'record' to "oai_dc".
                                                (RewritePartname("oai_dc"),
                                                    (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                                        (storageComponent,) # Writes 'oai_dc' to storage.
                                                    )
                                                )
                                            )
                                        )
                                    )
                                ),
                                # TODO: Check that when conversions fail, the meta and header parts are not written to storage either: preferably no part at all...
                                # Write the 'header' partname to storage:
                                (XmlXPath(['/oai:record/oai:header'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP),
                                    (RewritePartname("header"),
                                        (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                            (storageComponent,) # Writes the OAI header to storage.
                                        )
                                    )
                                ),
                                (FilterWcpCollection(allowed=['publication']),
                                    # (LogComponent("PUBLICATION"),),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['publication']),
                                        (oaiJazz,),
                                    ),
                                    # NOTE(review): the XmlXPath calls in this branch pass `namespaceMap=` whereas the other XmlXPath calls in this function pass `namespaces=` - confirm XmlXPath accepts both keywords.
                                    (XmlXPath(["//long:knaw_long[long:accessRights ='openAccess']"], fromKwarg='lxmlNode', namespaceMap=NAMESPACEMAP),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['oa_publication', 'openaire']),
                                            (oaiJazz,),
                                        )
                                    ),
                                    (XmlXPath(["//long:knaw_long/long:metadata[long:genre ='doctoralthesis']"], fromKwarg='lxmlNode', namespaceMap=NAMESPACEMAP),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['thesis']),
                                            (oaiJazz,),
                                        )
                                    ),
                                    (XmlXPath(['//long:knaw_long/long:metadata/long:grantAgreements/long:grantAgreement[long:code[contains(.,"greement/EC/") or contains(.,"greement/ec/")]][1]'], fromKwarg='lxmlNode', namespaceMap=NAMESPACEMAP),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['ec_fundedresources', 'openaire']),
                                            (oaiJazz,),
                                        )
                                    ),
                                    (XmlXPath(["//long:knaw_long/long:metadata[long:genre ='patent']"], fromKwarg='lxmlNode', namespaceMap=NAMESPACEMAP),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[OPENAIRE_PARTNAME], setSpecs=["openaire_cris_patents"]),
                                            (oai_oa_cerifJazz,),
                                        )
                                    ),
                                    (XmlXPath(["//long:knaw_long/long:metadata[long:genre !='patent']"], fromKwarg='lxmlNode', namespaceMap=NAMESPACEMAP),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[OPENAIRE_PARTNAME], setSpecs=["openaire_cris_publications"]),
                                            (oai_oa_cerifJazz,),
                                        )
                                    ),
                                ),
                                (FilterWcpCollection(allowed=['dataset']),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['dataset']),
                                        (oaiJazz,),
                                    ),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[OPENAIRE_PARTNAME], setSpecs=["openaire_cris_products"]),
                                        (oai_oa_cerifJazz,),
                                    )
                                ),
                                # Add NOD OpenAIRE Cerif to OpenAIRE-PMH repo.
                                (FilterWcpCollection(allowed=['research']),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[OPENAIRE_PARTNAME], setSpecs=["openaire_cris_projects"]),
                                        (oai_oa_cerifJazz,),
                                    )
                                ),
                                (FilterWcpCollection(allowed=['person']),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[OPENAIRE_PARTNAME], setSpecs=['openaire_cris_persons']),
                                        (oai_oa_cerifJazz,),
                                    )
                                ),
                                (FilterWcpCollection(allowed=['organisation']),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[OPENAIRE_PARTNAME], setSpecs=['openaire_cris_orgunits']),
                                        (oai_oa_cerifJazz,),
                                    )
                                )
                            )
                        ), # Write the 'meta' partname to storage:
                        (XmlXPath(['/oai:record/oai:metadata/document:document/document:part[@name="meta"]/text()'], fromKwarg='lxmlNode', toKwarg='data', namespaces=NAMESPACEMAP),
                            (RewritePartname("meta"),
                                (storageComponent,) # Writes the harvester 'meta' data to storage.
                            )
                        )
                    ),
                    (FilterMessages(allowed=['add']),
                        # (LogComponent("UnDelete"),),
                        (ResurrectTombstone(),
                            (storageComponent,),
                        )
                    )
                )
            )
        )
    )
Пример #11
0
def createDownloadHelix(reactor, periodicDownload, oaiDownload,
                        storageComponent, oaiJazz):
    """Build the observer tree that processes records downloaded from the gateway.

    The tree is rooted at `periodicDownload`; downloaded data is parsed to an
    lxml node, handed to `oaiDownload` and turned into 'add' / 'delete'
    messages, which are then routed as follows:

    * 'delete': forwarded to `storageComponent` and `oaiJazz`, after which
      WriteTombstone writes to `storageComponent`.
    * 'add': the 'record' part is split into the parts "metadata",
      "knaw_long", "knaw_short", "oai_dc" and "header", which are written to
      `storageComponent`; 'publication' and 'dataset' collections are also
      registered in `oaiJazz` with their set specs. The "meta" part is
      written to `storageComponent`, and finally ResurrectTombstone is
      invoked on `storageComponent`.

    :param reactor: not referenced inside this function.
    :param periodicDownload: root component of the returned tree.
    :param oaiDownload: OAI download processor placed below the XML parser.
    :param storageComponent: component that receives every part to be stored.
    :param oaiJazz: OAI-PMH repository component receiving adds and deletes.
    :returns: nested tuple describing the tree (not wrapped in `be`).
    """
    return \
    (periodicDownload, # Scheduled connection to a remote (response / request)...
        (XmlParseLxml(fromKwarg="data", toKwarg="lxmlNode", parseOptions=dict(huge_tree=True, remove_blank_text=True)), # Convert from plain text to lxml-object.
            (oaiDownload, # Implementation/Protocol of a PeriodicDownload...
                (UpdateAdapterFromOaiDownloadProcessor(), # Turns an SRU update/delete message (lxmlNode) into the relevant message: a 'delete' or an 'add' message.
                    (FilterMessages(['delete']), # Filters delete messages
                        # (LogComponent("Delete Update"),),
                        (storageComponent,), # Delete from storage
                        (oaiJazz,), # Delete from OAI-pmh repo
                        # Write a 'deleted' part to the storage, that holds the (Record)uploadId.
                        (WriteTombstone(),
                            (storageComponent,),
                        )
                    ),
                    (FilterMessages(allowed=['add']),
                        # TODO: the toKwarg='data' below can be removed; then the next line can go as well :-)
                        (XmlXPath(['/oai:record/oai:metadata/document:document/document:part[@name="record"]/text()'], fromKwarg='lxmlNode', toKwarg='data', namespaces=NAMESPACEMAP),
                            (XmlParseLxml(fromKwarg='data', toKwarg='lxmlNode'),
                                (XmlXPath(['/oai:record/oai:metadata/norm:md_original/child::*'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Original 'metadata' format
                                    (RewritePartname("metadata"), # Renames partname from 'record' to "metadata".
                                        (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                            (storageComponent,) # Writes oai:metadata (= original) to storage.
                                        )
                                    )
                                ),
                                (XmlXPath(['/oai:record/oai:metadata/norm:normalized/long:knaw_long'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP), # Normalized 'long' format.
                                    (RewritePartname("knaw_long"), # Renames partname from 'record' to "knaw_long".
                                        (FilterWcpCollection(disallowed=['person', 'research', "organisation"]),
                                            (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                                (storageComponent,), # Writes 'long' (= norm:normdoc) to storage.
                                            )
                                        ),
                                        (ShortConverter(fromKwarg='lxmlNode'), # Create the 'knaw_short' subset format.
                                            (RewritePartname("knaw_short"),
                                                (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                                    (storageComponent,) # Writes 'short' to storage.
                                                )
                                            )
                                        ),
                                        (FilterWcpCollection(disallowed=['person', 'research', "organisation"]),
                                            (DcConverter(fromKwarg='lxmlNode'), # NOTE(review): presumably converts the 'long' node to oai_dc - confirm.
                                                (RewritePartname("oai_dc"), # Renames partname to "oai_dc".
                                                    (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=True),
                                                        (storageComponent,) # Writes 'oai_dc' to storage.
                                                    )
                                                )
                                            )
                                        )
                                    )
                                ),
                                # TODO: check that, when conversions fail, the meta and header parts are not written to storage either: preferably not a single part...
                                # Write the 'header' partname to storage:
                                (XmlXPath(['/oai:record/oai:header'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP),
                                    (RewritePartname("header"),
                                        (XmlPrintLxml(fromKwarg="lxmlNode", toKwarg="data", pretty_print=False),
                                            (storageComponent,) # Writes the OAI header to storage.
                                        )
                                    )
                                ),
                                (FilterWcpCollection(allowed=['publication']),
                                    # (LogComponent("PUBLICATION"),),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['publication'], name='NARCISPORTAL'), #TODO: Skip name='NARCISPORTAL'
                                        (oaiJazz,),
                                    ),
                                    # Only open-access publications end up in the 'oa_publication' / 'openaire' sets.
                                    (XmlXPath(["//long:knaw_long[long:accessRights ='openAccess']"], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP),
                                        # (LogComponent("OPENACCESS"),),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['oa_publication', 'openaire'], name='NARCISPORTAL'),
                                            (oaiJazz,),
                                        )
                                    ),
                                    # Only doctoral theses end up in the 'thesis' set.
                                    (XmlXPath(["//long:knaw_long/long:metadata[long:genre ='doctoralthesis']"], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['thesis'], name='NARCISPORTAL'),
                                            (oaiJazz,),
                                        )
                                    ),
                                    # Only the first grant agreement whose code contains "greement/EC/" (or lowercase "ec") is selected.
                                    (XmlXPath(['//long:knaw_long/long:metadata/long:grantAgreements/long:grantAgreement[long:code[contains(.,"greement/EC/") or contains(.,"greement/ec/")]][1]'], fromKwarg='lxmlNode', namespaces=NAMESPACEMAP),
                                        (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['ec_fundedresources', 'openaire'], name='NARCISPORTAL'),
                                            (oaiJazz,),
                                        )
                                    )
                                ),
                                (FilterWcpCollection(allowed=['dataset']),
                                    # (LogComponent("DATASET"),),
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=["oai_dc"], setSpecs=['dataset'], name='NARCISPORTAL'),
                                        (oaiJazz,),
                                    )
                                )
                            )
                        ), # Write the 'meta' partname to storage:
                        (XmlXPath(['/oai:record/oai:metadata/document:document/document:part[@name="meta"]/text()'], fromKwarg='lxmlNode', toKwarg='data', namespaces=NAMESPACEMAP),
                            (RewritePartname("meta"),
                                (storageComponent,) # Writes the harvester 'meta' data to storage.
                            )
                        )
                    ),
                    (FilterMessages(allowed=['add']), # TODO: Remove this line.
                        # (LogComponent("UnDelete"),),
                        (ResurrectTombstone(),
                            (storageComponent,),
                        )
                    )
                )
            )
        )
    )
Пример #12
0
def writerMain(writerReactor, statePath, luceneserverPort, gatewayPort, quickCommit=False):
    """Assemble the observable tree of the index writer.

    The tree periodically downloads records from the gateway on localhost,
    turns 'add' messages into Lucene documents, forwards 'delete' messages to
    the Lucene writer, and periodically emits a 'commit' message that is
    posted to the Lucene server.

    :param writerReactor: reactor handed to SocketPool, PeriodicDownload and
        PeriodicCall.
    :param statePath: base directory; the harvester state is kept in
        `<statePath>/harvesterstate/gateway`.
    :param luceneserverPort: port of the Lucene server on localhost.
    :param gatewayPort: port of the gateway on localhost.
    :param quickCommit: when true, both the download schedule and the commit
        schedule use a period of 1 second instead of 10 and 300.
    :returns: nested tuple describing the tree (not wrapped in `be`).
    """
    # WST: interval in seconds before a new request is sent to the GATEWAY in case of an error while
    # processing batch records. IntegrationTests need 1 second! Otherwise tests will fail!
    downloadPeriod = 1 if quickCommit else 10
    # WST: a commit flushes data from memory to disk. IntegrationTests need 1 second! Otherwise tests will fail! (API).
    commitPeriod = 1 if quickCommit else 300

    httpRequest = be(
        (HttpRequest1_1(),
            (SocketPool(reactor=writerReactor, unusedTimeout=5, limits={'totalSize': 100, 'destinationSize': 10}),),
        )
    )

    luceneSettings = LuceneSettings(commitTimeout=30, readonly=False)
    luceneWriter = luceneAndReaderConfig(luceneSettings, httpRequest, luceneserverPort)

    gatewayDownload = PeriodicDownload(
        writerReactor,
        host='localhost',
        port=gatewayPort,
        schedule=Schedule(period=downloadPeriod),
        name='index',
        autoStart=True)

    gatewayOaiProcessor = OaiDownloadProcessor(
        path='/oaix',
        metadataPrefix=NORMALISED_DOC_NAME,
        workingDirectory=join(statePath, 'harvesterstate', 'gateway'),
        userAgentAddition='idx-server',
        xWait=True,
        name='index',
        autoCommit=False)

    # Post commits to the Lucene server.
    # WST: periodicDownload is deliberately not an observer here; it does nothing with a 'commit' message.
    commitScheduler = be(
        (PeriodicCall(writerReactor, message='commit', name='Scheduled commit', schedule=Schedule(period=commitPeriod), initialSchedule=Schedule(period=1)),
            (AllToDo(),  # Broadcast the message to all components, regardless of the kind of message.
                (LuceneCommit(host='localhost', port=luceneserverPort),  # A 'commit' message results in an HTTP POST to /commit/ on the Lucene server.
                    (httpRequest,),
                )
            )
        )
    )

    return \
    (Observable(),
        (commitScheduler,),  # Periodically send a 'commit' to the Lucene server.
        (gatewayDownload,  # Connect (periodically) to the gateway server.
            (XmlParseLxml(fromKwarg="data", toKwarg="lxmlNode", parseOptions={'huge_tree': True, 'remove_blank_text': True}),
                (gatewayOaiProcessor,  # Fetch the OAI records from the gateway.
                    (UpdateAdapterFromOaiDownloadProcessor(),  # Turns an SRU update/delete message (lxmlNode) into the relevant message: 'delete' or 'add'.
                        (FilterMessages(allowed=['add']),
                            (XmlXPath(['/oai:record/oai:metadata/document:document'], fromKwarg='lxmlNode'),
                                (NormdocToFieldsList(),  # Flat list of field names and values.
                                    (RecordPidToAuthNid(),),
                                    (FieldsListToLuceneDocument(  # Creates the addDocument message and the facet/drilldown fields, whose values are truncated to at most 256 chars.
                                            fieldRegistry=luceneWriter.settings.fieldRegistry,  # Among others, the drilldown fields definition.
                                            untokenizedFieldnames=untokenizedFieldnames,  # Untokenized fields.
                                            indexFieldFactory=DcFields,  # Creates an "__all__" field, the field name and optionally "untokenized.<fieldname>".
                                        ),
                                        (luceneWriter,),
                                    )
                                )
                            )
                        ),
                        (FilterMessages(allowed=['delete']),
                            (luceneWriter,),
                        )
                    )
                )
            )
        )
    )