def import_bnf_ean(ean):
    """Import a record from the BNF server given an ISBN-13 (EAN) without dashes.

    :param ean: ISBN-13 (EAN) string, without dashes.
    :return: Flask JSON response with the converted record under
        ``metadata``; ``404`` when no record matches, ``500`` on any
        other failure.
    """
    bnf_url = current_app.config['RERO_ILS_APP_IMPORT_BNF_EAN']
    try:
        with urlopen(bnf_url % ean) as response:
            if response.status != 200:
                # NOTE(review): abort() raises an HTTPException, which is
                # caught by the broad `except Exception` below and turned
                # into a 500 — confirm whether 502 should propagate.
                abort(502)
            # read the xml data from the HTTP response
            xml_data = response.read()
            # create an xml file in memory
            xml_file = six.BytesIO()
            xml_file.write(xml_data)
            xml_file.seek(0)
            # get the record in xml if it exists
            # note: the request should return one record max
            xml_record = next(split_stream(xml_file))
            # convert xml to marc json
            json_data = create_record(xml_record)
            # convert marc json to the local json format
            record = unimarctojson.do(json_data)
            response = {'metadata': record}
            return jsonify(response)
    # no record found!
    except StopIteration:
        response = {'record': {}}
        return jsonify(response), 404
    # other errors: log instead of silently swallowing the exception
    except Exception:
        current_app.logger.exception(
            'BNF import failed for EAN %s' % ean)
        response = {'record': {}}
        return jsonify(response), 500
def load(source):
    """Load MARC XML and return Python dict.

    :param source: file-like object containing MARC XML.
    :return: generator of record dicts.
    """
    for chunk in split_stream(source):
        yield create_record(chunk)
def test_marc21_split_stream():
    """Test MARC21 split_stream()."""
    collection = u'<collection>{0}{1}</collection>'.format(
        RECORD, RECORD_SIMPLE)
    stream = split_stream(BytesIO(collection.encode('utf-8')))
    for expected in (RECORD, RECORD_SIMPLE):
        rendered = etree.tostring(
            next(stream), method='html').decode('utf-8')
        assert rendered == expected
def test_marc21_split_stream():
    """Test MARC21 split_stream()."""
    xml = u'<collection>{0}{1}</collection>'.format(RECORD, RECORD_SIMPLE)
    records = split_stream(BytesIO(xml.encode('utf-8')))
    first = etree.tostring(next(records), method='html').decode('utf-8')
    second = etree.tostring(next(records), method='html').decode('utf-8')
    assert first == RECORD
    assert second == RECORD_SIMPLE
def import_bnf_ean(ean):
    """Import a record from the BNF server given an ISBN-13 (EAN) without dashes.

    :param ean: ISBN-13 (EAN) string, without dashes.
    :return: Flask JSON response carrying the record plus a flash-style
        message (``type``/``content``/``title``); ``404`` when no record
        matches, ``500`` on any other failure.
    """
    bnf_url = current_app.config['RERO_ILS_APP_IMPORT_BNF_EAN']
    try:
        with urlopen(bnf_url % ean) as response:
            if response.status != 200:
                # NOTE(review): abort() raises an HTTPException, which is
                # caught by the broad `except Exception` below and turned
                # into the generic 500 response — confirm this is intended.
                abort(500)
            # read the xml data from the HTTP response
            xml_data = response.read()
            # create an xml file in memory
            xml_file = six.BytesIO()
            xml_file.write(xml_data)
            xml_file.seek(0)
            # get the record in xml if it exists
            # note: the request should return one record max
            xml_record = next(split_stream(xml_file))
            # convert xml to marc json
            json_data = create_record(xml_record)
            # convert marc json to the local json format
            record = unimarctojson.do(json_data)
            response = {
                'record': record,
                'type': 'success',
                'content': _('The record has been imported.'),
                'title': _('Success:')
            }
            return jsonify(response)
    # no record found!
    except StopIteration:
        response = {
            'record': {},
            'type': 'warning',
            'content': _('EAN (%(ean)s) not found on the BNF server.',
                         ean=ean),
            'title': _('Warning:')
        }
        return jsonify(response), 404
    # other errors: log the traceback instead of the debug print leftover
    except Exception:
        current_app.logger.exception(
            'BNF import failed for EAN %s' % ean)
        response = {
            'record': {},
            'type': 'danger',
            'content': _('An error occured on the BNF server.'),
            'title': _('Error:')
        }
        return jsonify(response), 500
def test_marc21_records_over_single_line():
    """Test records over single line."""
    records = (u'<record>foo</record>',
               u'<record>会意字</record>',
               u'<record>>&<</record>')
    collection = u'<collection>{0}</collection>'.format(u''.join(records))
    stream = split_stream(BytesIO(collection.encode('utf-8')))
    for expected in records:
        produced = etree.tostring(
            next(stream), encoding='utf-8', method='xml')
        assert produced == expected.encode('utf-8')
def file_nusl_data_generator(start, filename, cache_dir):
    """Yield MARC record elements parsed from a gzipped XML file, forever.

    The gzip stream is wrapped in a synthetic ``<root>`` element so that
    split_stream can parse the concatenated ``<record>`` elements.
    ``start`` is a running counter echoed to stderr every 1000 records.
    """
    opening = b'<root xmlns="http://www.loc.gov/MARC21/slim">'
    closing = b'</root>'
    while True:
        with gzip.open(filename, 'rb') as gz:
            wrapped = chain_streams(
                [BytesIO(opening), gz, BytesIO(closing)])
            for record in split_stream(wrapped):
                if not start % 1000:
                    print('\r%08d' % start, end='', file=sys.stderr)
                yield record
                start += 1
def test_rules_3(app, db):
    """Run grantor/keyword/language fixes over the keywords_pipe fixture.

    Loads the XML fixture, applies the fix-up rules, transforms each
    record with old_nusl and round-trips it through the marshmallow
    schema.
    """
    # NOTE(review): absolute developer-specific path — should be made
    # relative to the test module instead.
    fixture_path = (
        '/home/semtex/Projekty/nusl/invenio-initial-theses-conversion/tests'
        '/xml_files/keywords_pipe.xml')
    # Use a context manager so the fixture file is actually closed
    # (the original leaked the file handle).
    with open(fixture_path, 'rb') as source:
        array = [create_record(data) for data in split_stream(source)]
    for field in array:
        rec = fix_grantor(field)
        rec = fix_keywords(rec)
        rec = fix_language(rec)
        transformed = old_nusl.do(rec)
        schema = ThesisMetadataSchemaV1()
        marshmallowed = schema.load(transformed).data
        marshmallowed = schema.dump(marshmallowed).data
        print(marshmallowed)
def marcxml_marshmallow_loader():
    """Marshmallow loader for MARCXML requests.

    The method converts only one record, otherwise it returns a bad
    request.

    :return: converted marc21 json record.
    """
    converted = {}
    seen = 0
    for marcxml_record in split_stream(BytesIO(request.data)):
        converted = marc21.do(create_record(marcxml_record))
        # converted records are considered as draft
        converted['_draft'] = True
        # more than one record in the payload -> reject the request
        if seen > 0:
            abort(400)
        seen += 1
    return converted
def url_nusl_data_generator(start, url, cache_dir, oai=False):
    """Yield MARC record elements fetched page by page from the NUSL server.

    :param start: index of the first record to fetch (running counter).
    :param url: base URL of the NUSL server.
    :param cache_dir: directory used by fetch_nusl_data to cache responses.
    :param oai: when True, use the OAI-PMH endpoint with resumption tokens.
    :return: generator of parsed record elements.
    """
    ses = session()
    # resumption token for OAI harvesting; None on the first request,
    # "" when the server signals there are no more pages
    token = None
    while True:
        # print the current record number (start) to stderr as progress
        print('\r%08d' % start, end='', file=sys.stderr)
        sys.stderr.flush()
        if not oai:
            # return response of one record from server and parsed response
            resp, parsed, token = fetch_nusl_data(
                url, start, cache_dir, ses)
            stream_generator = split_stream(BytesIO(resp))
        else:
            resp, parsed, token = fetch_nusl_data(url, start, cache_dir,
                                                  ses, oai=True,
                                                  res_token=token)
            stream_generator = split_stream_oai_nusl(BytesIO(resp))
        # return number of records from one response
        count = len(
            list(parsed.iter('{http://www.loc.gov/MARC21/slim}record')))
        if count:
            for data in stream_generator:
                yield data
        start += count
        # empty token means the OAI server has no further pages
        if token == "":
            break
def load(source):
    """Load MARC XML and return Python dict.

    :param source: file-like object containing MARC XML.
    :return: generator of record dicts.
    """
    yield from map(create_record, split_stream(source))
from dojson.contrib.marc21.utils import create_record, split_stream
from scoap3.dojson.hep.model import hep
from invenio_records import Record
from invenio_db import db
from invenio_indexer.api import RecordIndexer
from scoap3.modules.pidstore.minters import scoap3_recid_minter

# Convert every MARC XML record in the export to a HEP JSON record.
recs = [hep.do(create_record(data))
        for data in split_stream(open('../data/scoap3export.xml', 'r'))]

for i, obj in enumerate(recs, start=1):
    print("Creating record {}/{}".format(i, len(recs)))
    # BUG FIX: the original passed `data` (the comprehension variable,
    # which in Python 2 leaks and always holds the LAST parsed record)
    # instead of the current record `obj`.
    record = Record.create(obj, id_=None)
    print(record)

    # Create persistent identifier.
    pid = scoap3_recid_minter(str(record.id), record)
    print(pid.object_uuid)

    # Commit any changes to record
    record.commit()

    # Commit to DB before indexing
    db.session.commit()

    # Index record
    indexer = RecordIndexer()
    indexer.index_by_id(pid.object_uuid)
def import_documents(institution, pages):
    """Import documents from RERO doc.

    institution: String institution filter for retrieving documents
    pages: Number of pages to import (10 records per page)
    """
    url = current_app.config.get('SONAR_DOCUMENTS_RERO_DOC_URL')

    click.secho('Importing {pages} pages of records for "{institution}" '
                'from {url}'.format(pages=pages,
                                    institution=institution,
                                    url=url))

    # Get institution record from database
    institution_record = InstitutionRecord.get_record_by_pid(institution)

    if not institution_record:
        raise ClickException('Institution record not found in database')

    # $ref link stored on every imported document
    institution_ref_link = InstitutionRecord.get_ref_link(
        'institutions', institution_record['pid'])

    # mapping between institution key and RERO doc filter
    institution_map = current_app.config.get(
        'SONAR_DOCUMENTS_INSTITUTIONS_MAP')

    if not institution_map:
        raise ClickException('Institution map not found in configuration')

    if institution not in institution_map:
        raise ClickException(
            'Institution map for "{institution}" not found in configuration, '
            'keys available {keys}'.format(institution=institution,
                                           keys=institution_map.keys()))

    key = institution_map[institution]
    current_page = 1

    indexer = RecordIndexer()

    # Pages are fetched 10 records at a time (jrec is 1-based).
    while (current_page <= pages):
        click.echo('Importing records {start} to {end}... '.format(
            start=(current_page * 10 - 9),
            end=(current_page * 10)), nl=False)

        # Read Marc21 data for current page
        response = requests.get(
            '{url}?of=xm&jrec={first_record}&c=NAVSITE.{institution}'.format(
                url=url,
                first_record=(current_page * 10 - 9),
                institution=key.upper()),
            stream=True)

        if response.status_code != 200:
            raise ClickException('Request to "{url}" failed'.format(url=url))

        # allow split_stream to read the raw (decoded) HTTP body
        response.raw.decode_content = True

        ids = []

        for data in split_stream(response.raw):
            # Convert from Marc XML to JSON
            record = create_record(data)

            # Transform JSON
            record = marc21tojson.do(record)

            # Add institution
            record['institution'] = {'$ref': institution_ref_link}

            # Register record to DB
            db_record = DocumentRecord.create(record)
            db.session.commit()

            # Add ID for bulk index in elasticsearch
            ids.append(str(db_record.id))

        # index and process queue in elasticsearch
        indexer.bulk_index(ids)
        indexer.process_bulk_queue()

        current_page += 1

        click.secho('Done', fg='green', nl=True)

    click.secho('Finished', fg='green')
from dojson.contrib.marc21.utils import create_record, split_stream
from scoap3.hep.model import hep
from invenio_records import Record
from invenio_db import db
from invenio_indexer.api import RecordIndexer
from scoap3.modules.pidstore.minters import scoap3_recid_minter

# Convert every MARC XML record in the export to a HEP JSON record.
recs = [
    hep.do(create_record(data))
    for data in split_stream(open('../data/scoap3export.xml', 'r'))
]

for i, obj in enumerate(recs, start=1):
    print("Creating record {}/{}".format(i, len(recs)))
    # BUG FIX: the original passed `data` (the comprehension variable,
    # which in Python 2 leaks and always holds the LAST parsed record)
    # instead of the current record `obj`.
    record = Record.create(obj, id_=None)
    print(record)

    # Create persistent identifier.
    pid = scoap3_recid_minter(str(record.id), record)
    print(pid.object_uuid)

    # Commit any changes to record
    record.commit()

    # Commit to DB before indexing
    db.session.commit()

    # Index record
    indexer = RecordIndexer()
    indexer.index_by_id(pid.object_uuid)