def import_bnf_ean(ean):
    """Import a record from the BNF server given an ISBN-13 (EAN) without dashes.

    :param ean: ISBN-13 (EAN) string, without dashes.
    :return: Flask JSON response with the converted record under
        ``metadata``; ``404`` when no record matches, ``500`` on any
        other failure.
    """
    bnf_url = current_app.config['RERO_ILS_APP_IMPORT_BNF_EAN']
    try:
        with urlopen(bnf_url % ean) as response:
            if response.status != 200:
                # NOTE(review): abort() raises an HTTPException, which is
                # caught by the broad `except Exception` below and turned
                # into a 500 — confirm whether 502 should propagate.
                abort(502)
            # read the xml data from the HTTP response
            xml_data = response.read()
            # create an xml file in memory
            xml_file = six.BytesIO()
            xml_file.write(xml_data)
            xml_file.seek(0)
            # get the record in xml if it exists
            # note: the request should return one record max
            xml_record = next(split_stream(xml_file))
            # convert xml to marc json
            json_data = create_record(xml_record)
            # convert marc json to the local json format
            record = unimarctojson.do(json_data)
            response = {'metadata': record}
            return jsonify(response)
    # no record found!
    except StopIteration:
        response = {'record': {}}
        return jsonify(response), 404
    # other errors: log instead of silently swallowing the exception
    except Exception:
        current_app.logger.exception(
            'BNF import failed for EAN %s' % ean)
        response = {'record': {}}
        return jsonify(response), 500
def load(source):
    """Load MARC XML and return Python dict.

    :param source: file-like object containing MARC XML.
    :return: generator of record dicts.
    """
    for chunk in split_stream(source):
        yield create_record(chunk)
def test_marc21_split_stream():
    """Test MARC21 split_stream()."""
    collection = u'<collection>{0}{1}</collection>'.format(
        RECORD, RECORD_SIMPLE)
    stream = split_stream(BytesIO(collection.encode('utf-8')))
    for expected in (RECORD, RECORD_SIMPLE):
        rendered = etree.tostring(
            next(stream), method='html').decode('utf-8')
        assert rendered == expected
def test_marc21_split_stream():
    """Test MARC21 split_stream()."""
    xml = u'<collection>{0}{1}</collection>'.format(RECORD, RECORD_SIMPLE)
    records = split_stream(BytesIO(xml.encode('utf-8')))
    first = etree.tostring(next(records), method='html').decode('utf-8')
    second = etree.tostring(next(records), method='html').decode('utf-8')
    assert first == RECORD
    assert second == RECORD_SIMPLE
def import_bnf_ean(ean):
    """Import a record from the BNF server given an ISBN-13 (EAN) without dashes.

    :param ean: ISBN-13 (EAN) string, without dashes.
    :return: Flask JSON response carrying the record plus a flash-style
        message (``type``/``content``/``title``); ``404`` when no record
        matches, ``500`` on any other failure.
    """
    bnf_url = current_app.config['RERO_ILS_APP_IMPORT_BNF_EAN']
    try:
        with urlopen(bnf_url % ean) as response:
            if response.status != 200:
                # NOTE(review): abort() raises an HTTPException, which is
                # caught by the broad `except Exception` below and turned
                # into the generic 500 response — confirm this is intended.
                abort(500)
            # read the xml data from the HTTP response
            xml_data = response.read()
            # create an xml file in memory
            xml_file = six.BytesIO()
            xml_file.write(xml_data)
            xml_file.seek(0)
            # get the record in xml if it exists
            # note: the request should return one record max
            xml_record = next(split_stream(xml_file))
            # convert xml to marc json
            json_data = create_record(xml_record)
            # convert marc json to the local json format
            record = unimarctojson.do(json_data)
            response = {
                'record': record,
                'type': 'success',
                'content': _('The record has been imported.'),
                'title': _('Success:')
            }
            return jsonify(response)
    # no record found!
    except StopIteration:
        response = {
            'record': {},
            'type': 'warning',
            'content': _('EAN (%(ean)s) not found on the BNF server.',
                         ean=ean),
            'title': _('Warning:')
        }
        return jsonify(response), 404
    # other errors: log the traceback instead of the debug print leftover
    except Exception:
        current_app.logger.exception(
            'BNF import failed for EAN %s' % ean)
        response = {
            'record': {},
            'type': 'danger',
            'content': _('An error occured on the BNF server.'),
            'title': _('Error:')
        }
        return jsonify(response), 500
def test_marc21_records_over_single_line():
    """Test records over single line."""
    records = (u'<record>foo</record>',
               u'<record>会意字</record>',
               u'<record>>&<</record>')
    collection = u'<collection>{0}</collection>'.format(u''.join(records))
    stream = split_stream(BytesIO(collection.encode('utf-8')))
    for expected in records:
        produced = etree.tostring(
            next(stream), encoding='utf-8', method='xml')
        assert produced == expected.encode('utf-8')
def file_nusl_data_generator(start, filename, cache_dir):
    """Yield MARC record elements parsed from a gzipped XML file, forever.

    The gzip stream is wrapped in a synthetic ``<root>`` element so that
    split_stream can parse the concatenated ``<record>`` elements.
    ``start`` is a running counter echoed to stderr every 1000 records.
    """
    opening = b'<root xmlns="http://www.loc.gov/MARC21/slim">'
    closing = b'</root>'
    while True:
        with gzip.open(filename, 'rb') as gz:
            wrapped = chain_streams(
                [BytesIO(opening), gz, BytesIO(closing)])
            for record in split_stream(wrapped):
                if not start % 1000:
                    print('\r%08d' % start, end='', file=sys.stderr)
                yield record
                start += 1
def test_rules_3(app, db):
    """Run grantor/keyword/language fixes over the keywords_pipe fixture.

    Loads the XML fixture, applies the fix-up rules, transforms each
    record with old_nusl and round-trips it through the marshmallow
    schema.
    """
    # NOTE(review): absolute developer-specific path — should be made
    # relative to the test module instead.
    fixture_path = (
        '/home/semtex/Projekty/nusl/invenio-initial-theses-conversion/tests'
        '/xml_files/keywords_pipe.xml')
    # Use a context manager so the fixture file is actually closed
    # (the original leaked the file handle).
    with open(fixture_path, 'rb') as source:
        array = [create_record(data) for data in split_stream(source)]
    for field in array:
        rec = fix_grantor(field)
        rec = fix_keywords(rec)
        rec = fix_language(rec)
        transformed = old_nusl.do(rec)
        schema = ThesisMetadataSchemaV1()
        marshmallowed = schema.load(transformed).data
        marshmallowed = schema.dump(marshmallowed).data
        print(marshmallowed)
def marcxml_marshmallow_loader():
    """Marshmallow loader for MARCXML requests.

    The method converts only one record, otherwise it returns a bad
    request.

    :return: converted marc21 json record.
    """
    converted = {}
    seen = 0
    for marcxml_record in split_stream(BytesIO(request.data)):
        converted = marc21.do(create_record(marcxml_record))
        # converted records are considered as draft
        converted['_draft'] = True
        # more than one record in the payload -> reject the request
        if seen > 0:
            abort(400)
        seen += 1
    return converted
def url_nusl_data_generator(start, url, cache_dir, oai=False):
    """Yield MARC record elements fetched page by page from the NUSL server.

    :param start: index of the first record to fetch (running counter).
    :param url: base URL of the NUSL server.
    :param cache_dir: directory used by fetch_nusl_data to cache responses.
    :param oai: when True, use the OAI-PMH endpoint with resumption tokens.
    :return: generator of parsed record elements.
    """
    ses = session()
    # resumption token for OAI harvesting; None on the first request,
    # "" when the server signals there are no more pages
    token = None
    while True:
        # print the current record number (start) to stderr as progress
        print('\r%08d' % start, end='', file=sys.stderr)
        sys.stderr.flush()
        if not oai:
            # return response of one record from server and parsed response
            resp, parsed, token = fetch_nusl_data(
                url, start, cache_dir, ses)
            stream_generator = split_stream(BytesIO(resp))
        else:
            resp, parsed, token = fetch_nusl_data(url, start, cache_dir,
                                                  ses, oai=True,
                                                  res_token=token)
            stream_generator = split_stream_oai_nusl(BytesIO(resp))
        # return number of records from one response
        count = len(
            list(parsed.iter('{http://www.loc.gov/MARC21/slim}record')))
        if count:
            for data in stream_generator:
                yield data
        start += count
        # empty token means the OAI server has no further pages
        if token == "":
            break
def load(source):
    """Load MARC XML and return Python dict.

    :param source: file-like object containing MARC XML.
    :return: generator of record dicts.
    """
    yield from map(create_record, split_stream(source))
from dojson.contrib.marc21.utils import create_record, split_stream
from scoap3.dojson.hep.model import hep
from invenio_records import Record
from invenio_db import db
from invenio_indexer.api import RecordIndexer
from scoap3.modules.pidstore.minters import scoap3_recid_minter

# Convert every MARC XML record in the export to a HEP JSON record.
recs = [hep.do(create_record(data))
        for data in split_stream(open('../data/scoap3export.xml', 'r'))]

for i, obj in enumerate(recs, start=1):
    print("Creating record {}/{}".format(i, len(recs)))
    # BUG FIX: the original passed `data` (the comprehension variable,
    # which in Python 2 leaks and always holds the LAST parsed record)
    # instead of the current record `obj`.
    record = Record.create(obj, id_=None)
    print(record)

    # Create persistent identifier.
    pid = scoap3_recid_minter(str(record.id), record)
    print(pid.object_uuid)

    # Commit any changes to record
    record.commit()

    # Commit to DB before indexing
    db.session.commit()

    # Index record
    indexer = RecordIndexer()
    indexer.index_by_id(pid.object_uuid)
def import_documents(institution, pages):
    """Import documents from RERO doc.

    institution: String institution filter for retrieving documents
    pages: Number of pages to import (10 records per page)
    """
    url = current_app.config.get('SONAR_DOCUMENTS_RERO_DOC_URL')

    click.secho('Importing {pages} pages of records for "{institution}" '
                'from {url}'.format(pages=pages,
                                    institution=institution,
                                    url=url))

    # Get institution record from database
    institution_record = InstitutionRecord.get_record_by_pid(institution)

    if not institution_record:
        raise ClickException('Institution record not found in database')

    # $ref link stored on every imported document
    institution_ref_link = InstitutionRecord.get_ref_link(
        'institutions', institution_record['pid'])

    # mapping between institution key and RERO doc filter
    institution_map = current_app.config.get(
        'SONAR_DOCUMENTS_INSTITUTIONS_MAP')

    if not institution_map:
        raise ClickException('Institution map not found in configuration')

    if institution not in institution_map:
        raise ClickException(
            'Institution map for "{institution}" not found in configuration, '
            'keys available {keys}'.format(institution=institution,
                                           keys=institution_map.keys()))

    key = institution_map[institution]
    current_page = 1

    indexer = RecordIndexer()

    # Pages are fetched 10 records at a time (jrec is 1-based).
    while (current_page <= pages):
        click.echo('Importing records {start} to {end}... '.format(
            start=(current_page * 10 - 9),
            end=(current_page * 10)), nl=False)

        # Read Marc21 data for current page
        response = requests.get(
            '{url}?of=xm&jrec={first_record}&c=NAVSITE.{institution}'.format(
                url=url,
                first_record=(current_page * 10 - 9),
                institution=key.upper()),
            stream=True)

        if response.status_code != 200:
            raise ClickException('Request to "{url}" failed'.format(url=url))

        # allow split_stream to read the raw (decoded) HTTP body
        response.raw.decode_content = True

        ids = []

        for data in split_stream(response.raw):
            # Convert from Marc XML to JSON
            record = create_record(data)

            # Transform JSON
            record = marc21tojson.do(record)

            # Add institution
            record['institution'] = {'$ref': institution_ref_link}

            # Register record to DB
            db_record = DocumentRecord.create(record)
            db.session.commit()

            # Add ID for bulk index in elasticsearch
            ids.append(str(db_record.id))

        # index and process queue in elasticsearch
        indexer.bulk_index(ids)
        indexer.process_bulk_queue()

        current_page += 1

        click.secho('Done', fg='green', nl=True)

    click.secho('Finished', fg='green')
from dojson.contrib.marc21.utils import create_record, split_stream
from scoap3.hep.model import hep
from invenio_records import Record
from invenio_db import db
from invenio_indexer.api import RecordIndexer
from scoap3.modules.pidstore.minters import scoap3_recid_minter

# Convert every MARC XML record in the export to a HEP JSON record.
recs = [
    hep.do(create_record(data))
    for data in split_stream(open('../data/scoap3export.xml', 'r'))
]

for i, obj in enumerate(recs, start=1):
    print("Creating record {}/{}".format(i, len(recs)))
    # BUG FIX: the original passed `data` (the comprehension variable,
    # which in Python 2 leaks and always holds the LAST parsed record)
    # instead of the current record `obj`.
    record = Record.create(obj, id_=None)
    print(record)

    # Create persistent identifier.
    pid = scoap3_recid_minter(str(record.id), record)
    print(pid.object_uuid)

    # Commit any changes to record
    record.commit()

    # Commit to DB before indexing
    db.session.commit()

    # Index record
    indexer = RecordIndexer()
    indexer.index_by_id(pid.object_uuid)