def listIdentifiers(baseUrl, metadataPrefix, set=None, from_=None, until=None,
                    cursor=0, batch_size=10):
    """Return a list of Headers with the given parameters from the specified
    OAI-PMH server.

    Issues a ``ListIdentifiers`` request against ``baseUrl`` and keeps
    requesting further pages for as long as a response carries a non-empty
    ``resumptionToken``.

    Args:
        baseUrl: URL of the OAI-PMH endpoint; the query string is appended
            after a ``?``.
        metadataPrefix: value sent as the ``metadataPrefix`` argument.
        set: optional value for the ``set`` argument (omitted when None).
        from_: optional lower bound, sent as ``from`` after ``str()``.
        until: optional upper bound, sent as ``until`` after ``str()``.
        cursor: accepted for signature compatibility; not used here.
        batch_size: accepted for signature compatibility; not used here.

    Returns:
        A list holding the result of ``headerFromLxmlElement`` for every
        ``oai:header`` element found across all fetched pages.

    Raises:
        Whatever ``etree.fromstring`` raises for an unparsable response; the
        URL and the raw response are written to stderr first.
    """
    namespaces = {'oai': NS_OAIPMH}

    query = {'verb': "ListIdentifiers", 'metadataPrefix': metadataPrefix}
    if set is not None:
        query['set'] = set
    if from_ is not None:
        query['from'] = str(from_)
    if until is not None:
        query['until'] = str(until)

    url = "{0}?{1}".format(baseUrl, urllib.urlencode(query))
    page = fetch_data(url)
    headers = []
    while page is not None:
        try:
            tree = etree.fromstring(page)
        except:
            # Dump the request and the offending payload before re-raising so
            # the failure can be diagnosed from the error stream.
            for text in (url, page):
                sys.stderr.write(text + '\n')
            sys.stderr.flush()
            raise

        headers.extend(
            headerFromLxmlElement(header_el)
            for header_el in tree.xpath('//oai:header', namespaces=namespaces)
        )

        token = tree.xpath('string(//oai:resumptionToken)',
                           namespaces=namespaces)
        if not token:
            # No (or empty) resumption token: this was the last page.
            break

        # Follow-up requests carry only the verb and the resumption token.
        follow_up = urllib.urlencode({'verb': "ListIdentifiers",
                                      'resumptionToken': token})
        url = "{0}?{1}".format(baseUrl, follow_up)
        page = fetch_data(url)
    return headers
def listRecords(baseUrl, metadataPrefix, set=None, from_=None, until=None,
                cursor=0, batch_size=10):
    """Return a list of (Header, metadata, about) tuples for records which
    match the given parameters from the specified OAI-PMH server.

    Issues a ``ListRecords`` request against ``baseUrl`` and follows
    ``resumptionToken`` values until either ``batch_size`` tuples have been
    collected or the server stops returning a token.

    Args:
        baseUrl: URL of the OAI-PMH endpoint; the query string is appended
            after a ``?``.
        metadataPrefix: value sent as the ``metadataPrefix`` argument.
        set: optional value for the ``set`` argument (omitted when None).
        from_: optional lower bound, sent as ``from`` after ``str()``.
        until: optional upper bound, sent as ``until`` after ``str()``.
        cursor: number of leading records (counting only records that have a
            metadata payload) to skip before collecting.
        batch_size: the function returns as soon as this many tuples have
            been collected; the check runs after each tuple is appended.

    Returns:
        A list of ``(header, record, None)`` tuples, where ``header`` comes
        from ``headerFromLxmlElement`` and ``record`` is an ``LxmlRecord``
        wrapping the first child element of the record's ``oai:metadata``.

    Raises:
        Whatever ``etree.fromstring`` raises for an unparsable response; the
        URL and the raw response are written to stderr first.
        IndexError: if a record with metadata has no ``oai:header`` child.
    """
    namespaces = {'oai': NS_OAIPMH}

    args = {'verb': "ListRecords", 'metadataPrefix': metadataPrefix}
    if set is not None:
        args['set'] = set
    if from_ is not None:
        args['from'] = str(from_)
    if until is not None:
        args['until'] = str(until)

    url = "{0}?{1}".format(baseUrl, urllib.urlencode(args))
    data = fetch_data(url)
    records = []
    seen = 0
    while data is not None:
        try:
            tree = etree.fromstring(data)
        except Exception:
            # Report on stderr, like the sibling request helpers, so the
            # diagnostics do not get mixed into regular program output.
            sys.stderr.write(url + '\n')
            sys.stderr.write(data + '\n')
            sys.stderr.flush()
            raise

        for recEl in tree.xpath('//oai:record', namespaces=namespaces):
            # Paths are relative to this record; an absolute '//' path would
            # always select the first header/metadata of the whole document.
            mdEls = recEl.xpath('./oai:metadata/*', namespaces=namespaces)
            if not mdEls:
                # A record without a metadata payload (e.g. one flagged as
                # deleted) has nothing to wrap in an LxmlRecord; skip it.
                continue
            if seen < cursor:
                seen += 1
                continue
            hEl = recEl.xpath('./oai:header', namespaces=namespaces)[0]
            header = headerFromLxmlElement(hEl)
            mdEl = mdEls[0]
            recString = etree.tostring(mdEl)
            rec = LxmlRecord(mdEl,
                             xml=recString,
                             docId=header.identifier(),
                             byteCount=len(recString))
            records.append((header, rec, None))
            seen += 1
            if len(records) >= batch_size:
                return records

        resTok = tree.xpath('string(//oai:resumptionToken)',
                            namespaces=namespaces)
        if not resTok:
            break
        # resumptionToken is an exclusive argument: the follow-up request
        # must carry only the verb and the token, not the original filters.
        params = urllib.urlencode({'verb': "ListRecords",
                                   'resumptionToken': resTok})
        url = "{0}?{1}".format(baseUrl, params)
        data = fetch_data(url)
    return records
def getRecord(baseUrl, metadataPrefix, identifier):
    """Return (Header, metadata, about) tuple of record with specified
    identifier from the specified OAI-PMH server.

    Args:
        baseUrl: URL of the OAI-PMH endpoint; the query string is appended
            after a ``?``.
        metadataPrefix: value sent as the ``metadataPrefix`` argument.
        identifier: value sent as the ``identifier`` argument; also used as
            the ``docId`` of the returned record.

    Returns:
        A ``(header, record, None)`` tuple built from the first
        ``oai:record`` element in the response.

    Raises:
        Whatever ``etree.fromstring`` raises for an unparsable response; the
        URL and the raw response are written to stderr first.
        IndexError: if the response has no record header or no element
            inside the record's ``oai:metadata``.
    """
    query = urllib.urlencode({
        'verb': "GetRecord",
        'metadataPrefix': metadataPrefix,
        'identifier': identifier,
    })
    url = "{0}?{1}".format(baseUrl, query)
    data = fetch_data(url)
    try:
        tree = etree.fromstring(data)
    except:
        # Dump the request and the offending payload before re-raising.
        for text in (url, data):
            sys.stderr.write(text + '\n')
        sys.stderr.flush()
        raise

    header_el = tree.xpath('//oai:record[1]/oai:header',
                           namespaces={'oai': NS_OAIPMH})[0]
    header = headerFromLxmlElement(header_el)

    metadata_el = tree.xpath('//oai:record[1]/oai:metadata/*',
                             namespaces={'oai': NS_OAIPMH})[0]
    serialized = etree.tostring(metadata_el)
    record = LxmlRecord(metadata_el,
                        xml=serialized,
                        docId=identifier,
                        byteCount=len(serialized))
    return (header, record, None)
def listRecords(baseUrl, metadataPrefix, set=None, from_=None, until=None,
                cursor=0, batch_size=10):
    """Return a list of (Header, metadata, about) tuples for records which
    match the given parameters from the specified OAI-PMH server.

    Sends a ``ListRecords`` request to ``baseUrl`` and keeps following
    ``resumptionToken`` values until ``batch_size`` tuples have been
    gathered or the server no longer supplies a token.

    Args:
        baseUrl: URL of the OAI-PMH endpoint; the query string is appended
            after a ``?``.
        metadataPrefix: value sent as the ``metadataPrefix`` argument.
        set: optional value for the ``set`` argument (omitted when None).
        from_: optional lower bound, sent as ``from`` after ``str()``.
        until: optional upper bound, sent as ``until`` after ``str()``.
        cursor: number of leading records (counting only records that have a
            metadata payload) to skip before collecting.
        batch_size: the function returns as soon as this many tuples have
            been collected; the check runs after each tuple is appended.

    Returns:
        A list of ``(header, record, None)`` tuples, where ``header`` comes
        from ``headerFromLxmlElement`` and ``record`` is an ``LxmlRecord``
        wrapping the first child element of the record's ``oai:metadata``.

    Raises:
        Whatever ``etree.fromstring`` raises for an unparsable response; the
        URL and the raw response are written to stderr first.
        IndexError: if a record with metadata has no ``oai:header`` child.
    """
    ns = {'oai': NS_OAIPMH}

    args = {'verb': "ListRecords", 'metadataPrefix': metadataPrefix}
    if set is not None:
        args['set'] = set
    if from_ is not None:
        args['from'] = str(from_)
    if until is not None:
        args['until'] = str(until)

    url = "{0}?{1}".format(baseUrl, urllib.urlencode(args))
    data = fetch_data(url)
    records = []
    i = 0
    while data is not None:
        try:
            tree = etree.fromstring(data)
        except Exception:
            # Diagnostics go to stderr, matching the other request helpers,
            # rather than being printed into regular program output.
            sys.stderr.write(url + '\n')
            sys.stderr.write(data + '\n')
            sys.stderr.flush()
            raise

        for recEl in tree.xpath('//oai:record', namespaces=ns):
            # Look up children relative to this record: an absolute '//'
            # path would return the document's first header/metadata for
            # every record.
            mdEls = recEl.xpath('./oai:metadata/*', namespaces=ns)
            if not mdEls:
                # No metadata payload (e.g. a record flagged as deleted):
                # there is nothing to wrap in an LxmlRecord, so skip it.
                continue
            if i < cursor:
                i += 1
                continue
            hEl = recEl.xpath('./oai:header', namespaces=ns)[0]
            header = headerFromLxmlElement(hEl)
            mdEl = mdEls[0]
            recString = etree.tostring(mdEl)
            rec = LxmlRecord(mdEl,
                             xml=recString,
                             docId=header.identifier(),
                             byteCount=len(recString))
            records.append((header, rec, None))
            i += 1
            if len(records) >= batch_size:
                return records

        resTok = tree.xpath('string(//oai:resumptionToken)', namespaces=ns)
        if not resTok:
            break
        # A resumptionToken request must not repeat the original filter
        # arguments; send only the verb and the token.
        params = urllib.urlencode({'verb': "ListRecords",
                                   'resumptionToken': resTok})
        url = "{0}?{1}".format(baseUrl, params)
        data = fetch_data(url)
    return records
def listIdentifiers(baseUrl, metadataPrefix, set=None, from_=None, until=None,
                    cursor=0, batch_size=10):
    """Return a list of Headers with the given parameters from the specified
    OAI-PMH server.

    Sends a ``ListIdentifiers`` request to ``baseUrl`` and continues paging
    while each response contains a non-empty ``resumptionToken``.

    Args:
        baseUrl: URL of the OAI-PMH endpoint; the query string is appended
            after a ``?``.
        metadataPrefix: value sent as the ``metadataPrefix`` argument.
        set: optional value for the ``set`` argument (omitted when None).
        from_: optional lower bound, sent as ``from`` after ``str()``.
        until: optional upper bound, sent as ``until`` after ``str()``.
        cursor: accepted for signature compatibility; not used here.
        batch_size: accepted for signature compatibility; not used here.

    Returns:
        A list holding the result of ``headerFromLxmlElement`` for every
        ``oai:header`` element found across all fetched pages.

    Raises:
        Whatever ``etree.fromstring`` raises for an unparsable response; the
        URL and the raw response are written to stderr first.
    """
    ns = {'oai': NS_OAIPMH}

    request_args = {'verb': "ListIdentifiers",
                    'metadataPrefix': metadataPrefix}
    if set is not None:
        request_args['set'] = set
    if from_ is not None:
        request_args['from'] = str(from_)
    if until is not None:
        request_args['until'] = str(until)

    url = "{0}?{1}".format(baseUrl, urllib.urlencode(request_args))
    response = fetch_data(url)
    collected = []
    while response is not None:
        try:
            tree = etree.fromstring(response)
        except:
            # Record which request failed and what came back, then re-raise.
            for chunk in (url, response):
                sys.stderr.write(chunk + '\n')
            sys.stderr.flush()
            raise

        header_elements = tree.xpath('//oai:header', namespaces=ns)
        collected.extend(headerFromLxmlElement(el) for el in header_elements)

        resumption = tree.xpath('string(//oai:resumptionToken)', namespaces=ns)
        if not resumption:
            # An absent or empty token marks the final page.
            break

        # Subsequent pages are requested with just the verb and the token.
        next_query = urllib.urlencode({'verb': "ListIdentifiers",
                                       'resumptionToken': resumption})
        url = "{0}?{1}".format(baseUrl, next_query)
        response = fetch_data(url)
    return collected
def getRecord(baseUrl, metadataPrefix, identifier):
    """Return (Header, metadata, about) tuple of record with specified
    identifier from the specified OAI-PMH server.

    Args:
        baseUrl: URL of the OAI-PMH endpoint; the query string is appended
            after a ``?``.
        metadataPrefix: value sent as the ``metadataPrefix`` argument.
        identifier: value sent as the ``identifier`` argument; also used as
            the ``docId`` of the returned record.

    Returns:
        A ``(header, record, None)`` tuple built from the first
        ``oai:record`` element in the response.

    Raises:
        Whatever ``etree.fromstring`` raises for an unparsable response; the
        URL and the raw response are written to stderr first.
        IndexError: if the response has no record header or no element
            inside the record's ``oai:metadata``.
    """
    request_args = {
        'verb': "GetRecord",
        'metadataPrefix': metadataPrefix,
        'identifier': identifier,
    }
    url = "{0}?{1}".format(baseUrl, urllib.urlencode(request_args))
    response = fetch_data(url)
    try:
        tree = etree.fromstring(response)
    except:
        # Record which request failed and what came back, then re-raise.
        for chunk in (url, response):
            sys.stderr.write(chunk + '\n')
        sys.stderr.flush()
        raise

    header_matches = tree.xpath('//oai:record[1]/oai:header',
                                namespaces={'oai': NS_OAIPMH})
    header = headerFromLxmlElement(header_matches[0])

    payload_matches = tree.xpath('//oai:record[1]/oai:metadata/*',
                                 namespaces={'oai': NS_OAIPMH})
    payload = payload_matches[0]
    payload_xml = etree.tostring(payload)
    return (
        header,
        LxmlRecord(payload,
                   xml=payload_xml,
                   docId=identifier,
                   byteCount=len(payload_xml)),
        None,
    )