Пример #1
0
def listIdentifiers(baseUrl, metadataPrefix, set=None, from_=None, until=None, cursor=0, batch_size=10):
    """Harvest every record header from an OAI-PMH server via ListIdentifiers.

    Args:
        baseUrl: Base URL of the OAI-PMH endpoint; the query string is
            appended after a '?'.
        metadataPrefix: Value sent as the 'metadataPrefix' argument.
        set: Optional value sent as the 'set' argument.
        from_: Optional lower bound; sent as str(from_) in 'from'.
        until: Optional upper bound; sent as str(until) in 'until'.
        cursor: Accepted for signature parity; not used by this function.
        batch_size: Accepted for signature parity; not used by this function.

    Returns:
        A list with one headerFromLxmlElement() result per oai:header
        element found across all response pages.

    Raises:
        Any error raised while parsing a response is re-raised after the
        request URL and the raw response have been written to stderr.
    """
    oai_ns = {'oai': NS_OAIPMH}
    query = {'verb': "ListIdentifiers", 'metadataPrefix': metadataPrefix}
    if set is not None:
        query['set'] = set
    if from_ is not None:
        query['from'] = str(from_)
    if until is not None:
        query['until'] = str(until)
    url = "{0}?{1}".format(baseUrl, urllib.urlencode(query))
    collected = []
    data = fetch_data(url)
    while data is not None:
        try:
            tree = etree.fromstring(data)
        except:
            # Dump the failing request and payload before propagating.
            sys.stderr.write(url + '\n')
            sys.stderr.write(data + '\n')
            sys.stderr.flush()
            raise
        collected.extend(
            headerFromLxmlElement(el)
            for el in tree.xpath('//oai:header', namespaces=oai_ns)
        )
        token = tree.xpath('string(//oai:resumptionToken)', namespaces=oai_ns)
        if not token:
            # No (or empty) resumption token: this was the last page.
            break
        # Follow-up requests carry only the verb and the resumption token.
        follow_up = urllib.urlencode({'verb': "ListIdentifiers",
                                      'resumptionToken': token})
        url = "{0}?{1}".format(baseUrl, follow_up)
        data = fetch_data(url)
    return collected
Пример #2
0
def listRecords(baseUrl,
                metadataPrefix,
                set=None,
                from_=None,
                until=None,
                cursor=0,
                batch_size=10):
    """Return a batch of (Header, metadata, about) tuples via ListRecords.

    Args:
        baseUrl: Base URL of the OAI-PMH endpoint; the query string is
            appended after a '?'.
        metadataPrefix: Value sent as the 'metadataPrefix' argument.
        set: Optional value sent as the 'set' argument.
        from_: Optional lower bound; sent as str(from_) in 'from'.
        until: Optional upper bound; sent as str(until) in 'until'.
        cursor: Number of leading records (across pages) to skip.
        batch_size: Collection stops as soon as this many tuples have
            been gathered.

    Returns:
        A list of (header, record, None) tuples. `record` is an
        LxmlRecord built from the first child of the record's own
        oai:metadata element, or None when the record carries no
        metadata payload (e.g. a deleted record).

    Raises:
        Any error raised while parsing a response is re-raised after the
        request URL and the raw response have been written to stderr.
    """
    namespaces = {'oai': NS_OAIPMH}
    args = {'verb': "ListRecords", 'metadataPrefix': metadataPrefix}
    if set is not None:
        args['set'] = set
    if from_ is not None:
        args['from'] = str(from_)
    if until is not None:
        args['until'] = str(until)
    params = urllib.urlencode(args)
    url = "{0}?{1}".format(baseUrl, params)
    data = fetch_data(url)
    records = []
    i = 0
    while data is not None:
        try:
            tree = etree.fromstring(data)
        except Exception:
            # Same diagnostics as the sibling harvesting functions.
            sys.stderr.write(url + '\n')
            sys.stderr.write(data + '\n')
            sys.stderr.flush()
            raise
        for recEl in tree.xpath('//oai:record', namespaces=namespaces):
            if i < cursor:
                i += 1
                continue
            # Paths must be relative to recEl: a leading '//' searches the
            # whole document and would always yield the first record's data.
            hEl = recEl.xpath('./oai:header', namespaces=namespaces)[0]
            header = headerFromLxmlElement(hEl)
            mdEls = recEl.xpath('./oai:metadata/*', namespaces=namespaces)
            if mdEls:
                mdEl = mdEls[0]
                recString = etree.tostring(mdEl)
                rec = LxmlRecord(mdEl,
                                 xml=recString,
                                 docId=header.identifier(),
                                 byteCount=len(recString))
            else:
                # Header-only record (e.g. deleted): no metadata to wrap.
                rec = None
            records.append((header, rec, None))
            i += 1
            if len(records) >= batch_size:
                return records

        resTok = tree.xpath('string(//oai:resumptionToken)',
                            namespaces=namespaces)
        if not resTok:
            break
        # A resumption request carries only the verb and the token, the
        # same way listIdentifiers builds its follow-up requests.
        params = urllib.urlencode({
            'verb': "ListRecords",
            'resumptionToken': resTok
        })
        url = "{0}?{1}".format(baseUrl, params)
        data = fetch_data(url)

    return records
Пример #3
0
def getRecord(baseUrl, metadataPrefix, identifier):
    """Fetch a single record from an OAI-PMH server via GetRecord.

    Args:
        baseUrl: Base URL of the OAI-PMH endpoint; the query string is
            appended after a '?'.
        metadataPrefix: Value sent as the 'metadataPrefix' argument.
        identifier: Value sent as the 'identifier' argument; also used as
            the docId of the returned record.

    Returns:
        A (header, record, None) tuple built from the first oai:record
        element in the response.

    Raises:
        Any error raised while parsing the response is re-raised after
        the request URL and the raw response have been written to stderr.
        IndexError: if the response has no header or no metadata child
            in its first oai:record.
    """
    query = urllib.urlencode({
        'verb': "GetRecord",
        'metadataPrefix': metadataPrefix,
        'identifier': identifier
    })
    url = "{0}?{1}".format(baseUrl, query)
    data = fetch_data(url)
    try:
        tree = etree.fromstring(data)
    except:
        # Dump the failing request and payload before propagating.
        for text in (url, data):
            sys.stderr.write(text + '\n')
        sys.stderr.flush()
        raise
    header_matches = tree.xpath('//oai:record[1]/oai:header',
                                namespaces={'oai': NS_OAIPMH})
    header = headerFromLxmlElement(header_matches[0])
    payload_matches = tree.xpath('//oai:record[1]/oai:metadata/*',
                                 namespaces={'oai': NS_OAIPMH})
    payload = payload_matches[0]
    serialized = etree.tostring(payload)
    record = LxmlRecord(payload,
                        xml=serialized,
                        docId=identifier,
                        byteCount=len(serialized))
    return (header, record, None)
Пример #4
0
def listRecords(baseUrl, metadataPrefix, set=None, from_=None, until=None, cursor=0, batch_size=10):
    """Return a batch of (Header, metadata, about) tuples via ListRecords.

    Args:
        baseUrl: Base URL of the OAI-PMH endpoint; the query string is
            appended after a '?'.
        metadataPrefix: Value sent as the 'metadataPrefix' argument.
        set: Optional value sent as the 'set' argument.
        from_: Optional lower bound; sent as str(from_) in 'from'.
        until: Optional upper bound; sent as str(until) in 'until'.
        cursor: Number of leading records (across pages) to skip.
        batch_size: Collection stops as soon as this many tuples have
            been gathered.

    Returns:
        A list of (header, record, None) tuples. `record` is an
        LxmlRecord built from the first child of the record's own
        oai:metadata element, or None when the record carries no
        metadata payload (e.g. a deleted record).

    Raises:
        Any error raised while parsing a response is re-raised after the
        request URL and the raw response have been written to stderr.
    """
    namespaces = {'oai': NS_OAIPMH}
    args = {'verb': "ListRecords",
            'metadataPrefix': metadataPrefix
            }
    if set is not None:
        args['set'] = set
    if from_ is not None:
        args['from'] = str(from_)
    if until is not None:
        args['until'] = str(until)
    params = urllib.urlencode(args)
    url = "{0}?{1}".format(baseUrl, params)
    data = fetch_data(url)
    records = []
    i = 0
    while data is not None:
        try:
            tree = etree.fromstring(data)
        except Exception:
            # Same diagnostics as the sibling harvesting functions.
            sys.stderr.write(url + '\n')
            sys.stderr.write(data + '\n')
            sys.stderr.flush()
            raise
        for recEl in tree.xpath('//oai:record', namespaces=namespaces):
            if i < cursor:
                i += 1
                continue
            # Paths must be relative to recEl: a leading '//' searches the
            # whole document and would always yield the first record's data.
            hEl = recEl.xpath('./oai:header', namespaces=namespaces)[0]
            header = headerFromLxmlElement(hEl)
            mdEls = recEl.xpath('./oai:metadata/*', namespaces=namespaces)
            if mdEls:
                mdEl = mdEls[0]
                recString = etree.tostring(mdEl)
                rec = LxmlRecord(mdEl, xml=recString, docId=header.identifier(), byteCount=len(recString))
            else:
                # Header-only record (e.g. deleted): no metadata to wrap.
                rec = None
            records.append((header, rec, None))
            i += 1
            if len(records) >= batch_size:
                return records

        resTok = tree.xpath('string(//oai:resumptionToken)', namespaces=namespaces)
        if not resTok:
            break
        # A resumption request carries only the verb and the token, the
        # same way listIdentifiers builds its follow-up requests.
        params = urllib.urlencode({'verb': "ListRecords",
                                   'resumptionToken': resTok})
        url = "{0}?{1}".format(baseUrl, params)
        data = fetch_data(url)

    return records
Пример #5
0
def listIdentifiers(baseUrl,
                    metadataPrefix,
                    set=None,
                    from_=None,
                    until=None,
                    cursor=0,
                    batch_size=10):
    """Collect all record headers an OAI-PMH server lists for a query.

    Args:
        baseUrl: Base URL of the OAI-PMH endpoint; the query string is
            appended after a '?'.
        metadataPrefix: Value sent as the 'metadataPrefix' argument.
        set: Optional value sent as the 'set' argument.
        from_: Optional lower bound; sent as str(from_) in 'from'.
        until: Optional upper bound; sent as str(until) in 'until'.
        cursor: Accepted for signature parity; not used by this function.
        batch_size: Accepted for signature parity; not used by this function.

    Returns:
        A list with one headerFromLxmlElement() result per oai:header
        element found across all response pages.

    Raises:
        Any error raised while parsing a response is re-raised after the
        request URL and the raw response have been written to stderr.
    """
    request_args = {'verb': "ListIdentifiers", 'metadataPrefix': metadataPrefix}
    if set is not None:
        request_args['set'] = set
    # Date bounds are optional and always sent in string form.
    for key, bound in (('from', from_), ('until', until)):
        if bound is not None:
            request_args[key] = str(bound)
    url = "{0}?{1}".format(baseUrl, urllib.urlencode(request_args))
    data = fetch_data(url)
    found = []
    while data is not None:
        try:
            doc = etree.fromstring(data)
        except:
            # Dump the failing request and payload before propagating.
            sys.stderr.write(url + '\n')
            sys.stderr.write(data + '\n')
            sys.stderr.flush()
            raise
        header_els = doc.xpath('//oai:header', namespaces={'oai': NS_OAIPMH})
        for header_el in header_els:
            found.append(headerFromLxmlElement(header_el))

        next_token = doc.xpath('string(//oai:resumptionToken)',
                               namespaces={'oai': NS_OAIPMH})
        if not next_token:
            # No (or empty) resumption token: this was the last page.
            break
        # Follow-up requests carry only the verb and the resumption token.
        url = "{0}?{1}".format(
            baseUrl,
            urllib.urlencode({
                'verb': "ListIdentifiers",
                'resumptionToken': next_token
            }))
        data = fetch_data(url)
    return found
Пример #6
0
def getRecord(baseUrl, metadataPrefix, identifier):
    """Retrieve one record by identifier from an OAI-PMH server.

    Args:
        baseUrl: Base URL of the OAI-PMH endpoint; the query string is
            appended after a '?'.
        metadataPrefix: Value sent as the 'metadataPrefix' argument.
        identifier: Value sent as the 'identifier' argument; also used as
            the docId of the returned record.

    Returns:
        A (header, record, None) tuple built from the first oai:record
        element in the response.

    Raises:
        Any error raised while parsing the response is re-raised after
        the request URL and the raw response have been written to stderr.
        IndexError: if the response has no header or no metadata child
            in its first oai:record.
    """
    request_args = {'verb': "GetRecord",
                    'metadataPrefix': metadataPrefix,
                    'identifier': identifier}
    url = "{0}?{1}".format(baseUrl, urllib.urlencode(request_args))
    data = fetch_data(url)
    try:
        doc = etree.fromstring(data)
    except:
        # Dump the failing request and payload before propagating.
        sys.stderr.write(url + '\n')
        sys.stderr.write(data + '\n')
        sys.stderr.flush()
        raise
    header_el = doc.xpath('//oai:record[1]/oai:header', namespaces={'oai': NS_OAIPMH})[0]
    header = headerFromLxmlElement(header_el)
    metadata_el = doc.xpath('//oai:record[1]/oai:metadata/*', namespaces={'oai': NS_OAIPMH})[0]
    xml_text = etree.tostring(metadata_el)
    return (
        header,
        LxmlRecord(metadata_el, xml=xml_text, docId=identifier, byteCount=len(xml_text)),
        None,
    )