class TestElasticIndex(object):
    """Integration tests for ``ElasticIndex`` run against the ``test`` index.

    All tests share a single index instance and are order dependent: the
    document counts asserted in the later tests are the sum of the documents
    written by the earlier ones.
    """

    def setup_class(self):
        """Create the shared index handle once for the whole class."""
        self.index = ElasticIndex('test')

    def teardown_class(self):
        """Drop the index after the last test of the class."""
        self.index.delete_index()

    def test_scroll(self):
        """Index four small documents; every scroll batch must be a list."""
        flags = (True, False, True, False)
        for doc_id, flag in enumerate(flags, start=1):
            self.index.index_into({'test': flag}, doc_id)

        for batch in self.index.scroll():
            assert isinstance(batch, list)

    def test_index_into(self):
        """``index_into`` reports whether the document was accepted."""
        accepted = self.index.index_into(
            {'test': True, 'object': "This is a string"}, 5)
        assert accepted

        # 'object' now holds a nested object instead of a string; the test
        # expects this document to be refused.
        refused = self.index.index_into(
            {'test': True, 'object': {'sub-object': "another string"}}, 6)
        assert not refused

        # Identifiers do not have to be numeric.
        accepted = self.index.index_into({'test': False}, 'HAN000827182')
        assert accepted

    def test_search(self):
        """Bulk-index one more document, then search the whole index."""
        data = [{'id': '1234', 'test': True}]
        self.index.bulk(data=data, identifier_key='id')

        hits = self.index.search()
        # Presumably 4 (scroll test) + 2 (accepted above) + 1 (bulk).
        assert len(hits) == 7

    def test_search_not_unpack(self):
        """Searching with ``unpack=False`` yields the same number of hits."""
        hits = self.index.search(unpack=False)
        assert len(hits) == 7

    def test_alias(self):
        """An alias can be added to the index and removed again."""
        indices = self.index.instance.indices

        self.index.add_to_alias('test1')
        assert indices.get_alias('test1')

        self.index.remove_from_alias('test1')
        with pytest.raises(NotFoundError):
            indices.get_alias('test1')

    def test_count(self):
        """``count`` agrees with the number of documents indexed so far."""
        total = self.index.count()
        assert total == 7
class TransformSruExport(DataTransformation):
    """Turn a MARC record from an SRU export into a flat result dictionary.

    ``transform`` wraps the raw record in a ``MARCMapper`` and then runs a
    fixed sequence of ``parse_*`` steps which all write into
    ``self.marc.result``. ``pre_filter`` and ``post_filter`` decide which
    records are dropped before and after the transformation.
    """

    # Formats that are not measured in pages. They get a fixed extent of one
    # object in ``parse_number_of_pages`` and are removed by ``post_filter``.
    _NON_PAGE_FORMATS = (
        'Objekt', 'Diverse Tonformate', 'Schallplatte',
        'Diverse Filmformate', 'Datenbank',
    )

    def __init__(self, database, config, logger=logging.getLogger(__name__)):
        """Store the configuration and open the digidata index.

        :param database: name of the source database; the code
            special-cases ``'dsv01'`` and ``'dsv05'``.
        :param config: mapping with the keys ``'digidata'`` (keyword
            arguments for ``ElasticIndex``) and ``'page-conversions'``
            (conversion rate per unit name).
        :param logger: logger handed on to ``DataTransformation``.
        """
        super().__init__(logger)
        self._database = database
        self._config = config
        self.marc = None
        self.digidata_index = ElasticIndex(**config['digidata'])
        self.page_conversion_rates = config['page-conversions']

    def transform(self, value: str) -> dict:
        """Transform one raw record and return the collected result.

        Do not reorder this function: the later steps read values that the
        earlier steps have written into ``self.marc.result``.
        """
        self.marc = MARCMapper(value)
        self.marc.add_value('database', self._database)
        self.marc.identifier()

        # 024 fields with first indicator 7 name their identifier scheme in
        # subfield 2 and carry the identifier itself in subfield a.
        for field in self.marc.get_fields('024'):
            if field.indicator1 == '7':
                if 'a' in field and '2' in field:
                    self.marc.add_identifier(field['2'], field['a'])

        self.marc.add_identifier('swissbib', self.marc['001'].value())

        if self._database == 'dsv01':
            for _035 in self.marc.get_fields('035'):
                if _035['a'] is not None:
                    if _035['a'].startswith('(IDSBB)'):
                        # Keep only the part after the '(IDSBB)' prefix.
                        self.marc.add_identifier('dsv01',
                                                 _035['a'].split(')')[1])
        elif self._database == 'dsv05':
            # The first three characters of field 001 are cut off.
            self.marc.add_identifier('dsv05', self.marc['001'].value()[3:])

        # Do not re-order these!
        self.enrich()
        self.parse_record_type()
        self.parse_date()
        self.parse_format_codes()
        self.parse_number_of_pages()
        self.parse_call_number()
        self.parse_additional_information()

        return self.marc.result

    def enrich(self):
        """Enriching the metadata from other data sources."""
        self.enrich_digidata()

    def enrich_digidata(self):
        """Loads data from the digidata elastic index.

        No live updates, as the digidata repository is on
        Afrikaportal-elastic, which is only accessible on localhost. To
        update, run the digispace-producer & digispace-consumer.

        TODO: Load live data instead of a copy. To do this, direct access to
        Afrikaportal is necessary.
        """
        query = {
            "query": {
                "term": {
                    "system_number":
                        self.marc.result['identifiers'][self._database]
                }
            }
        }
        result = self.digidata_index.search(query=query)
        if len(result) > 0:
            self.marc.add_value('is_digitized', True)
            # Only the first hit is consulted for the image count.
            if 'number_of_images' in result[0]:
                self.marc.add_value('number_of_images',
                                    result[0]['number_of_images'])
        else:
            self.marc.add_value('is_digitized', False)

    def parse_date(self):
        """Parsing the date from the various possible fields.

        Stores where the information was taken from. Field 008 wins over
        field 046, which wins over field 264. Without any usable date the
        record gets the error tag ``_no_valid_date``.
        """
        # Both parsers run unconditionally, as in the original code; the 264
        # parser only runs when neither of them reported a date.
        _008_date = self.marc.parse_date_from_008()
        _046_date = self.marc.parse_date_from_046()

        dates = self.marc.result
        if _008_date:
            self._store_final_year(dates['dates']['date']['year'], '008')
        elif _046_date:
            self._store_final_year(dates['dates']['exact']['year'], '046')
        elif self.marc.parse_date_from_264():
            self._store_final_year(dates['dates']['parsed_264_year'], '264')
        else:
            self.marc.add_value_sub('source', 'year', 'None')
            self.marc.add_error_tag('_no_valid_date')

    def _store_final_year(self, year, source):
        """Write year, century and decade to ``final`` and note the source."""
        self.marc.add_value_sub('final', 'year', int(year))
        self.marc.append_value_sub('final', 'century', int(year / 100) + 1)
        self.marc.append_value_sub('final', 'decade', year - year % 10)
        self.marc.add_value_sub('source', 'year', source)

    def parse_number_of_pages(self):
        """Figure out the number of pages!

        First source: digidata number of images.
        Second source: coverage
        Third source: estimates.

        :raises ValueError: if no unit could be determined at all.
        """
        self.marc.parse_field_to_subfield('300', 'a', 'extent', 'coverage')

        pages = 0
        name = Units.No

        # This will be filtered anyway.
        if self.marc.result['c-format'] in self._NON_PAGE_FORMATS:
            pages = 1
            name = Units.Gegenstand
            self.marc.add_value_sub('source', 'pages', 'format')

        if name == Units.No:
            pages, name = self.parse_coverage_field()

        if name == Units.No:
            raise ValueError('Name should not be None here: {}. {}'.format(
                self.marc.result['identifier'], pages))

        if name != Units.Seiten:
            # Anything that is not counted in pages is converted with the
            # configured rate for its unit.
            self.marc.add_value_sub('source', 'pages', 'estimate')
            self.marc.add_value_sub('source', 'estimate', name.value)
            pages = pages * self.page_conversion_rates[name.value]
        else:
            self.marc.add_value_sub('source', 'pages', 'coverage')

        self.marc.add_value_sub('extent', 'pages', pages)

        # The image count from digidata overrides every other source.
        if 'number_of_images' in self.marc.result:
            pages = self.marc.result['number_of_images']
            self.marc.add_value_sub('source', 'pages', 'digidata')
            if 'estimate' in self.marc.result['source']:
                del self.marc.result['source']['estimate']

        self.marc.add_value_sub('final', 'pages', pages)

    def parse_coverage_field(self) -> Tuple[Union[float, int], Units]:
        """Parses various values from the coverage field and returns them as
        tuple: (number of unit, name of unit)

        The parser is chosen by the record's ``c-format``. Unknown formats
        are logged and counted as a single page.
        """
        # 'extent' is only read when it exists, so a record without any
        # extent information is treated like one without coverage.
        coverage = None
        if 'extent' in self.marc.result:
            if 'coverage' in self.marc.result['extent']:
                coverage = self.marc.result['extent']['coverage']

        swissbib_format = self.marc.result['c-format']

        if swissbib_format in ['Klavierauszug', 'Partitur', 'Noten']:
            return self.parse_partituren(coverage)
        elif swissbib_format in ['Atlas', 'Karte', 'Diverse Kartenformate']:
            return self.parse_maps(coverage)
        elif swissbib_format in ['Brief', 'Briefsammlung']:
            return self.parse_letters(coverage)
        elif swissbib_format in ['Diverse Bildformate', 'Fotografie']:
            return self.parse_fotos(coverage)
        elif swissbib_format in [
                'Gesamtwerk', 'Buch', 'Verfassung / Gesetz', 'Artikel'
        ]:
            return self.parse_books(coverage, swissbib_format)
        elif swissbib_format in ['Handschrift']:
            return self.parse_manuscript(coverage)
        elif swissbib_format in ['Dossier']:
            return self.parse_dossier(coverage)
        elif swissbib_format in ['Zeitung', 'Zeitschrift / Schriftenreihe']:
            # TODO: Better implementation for periodicals.
            if coverage is None:
                return 1, Units.Periodikum

            num, name = parse_volumes(coverage, Units.Band)
            if num > 0:
                return num, name

            # Fall back to the run of years: one volume per year.
            year = None
            to = None
            if 'dates' in self.marc.result:
                if 'date' in self.marc.result['dates']:
                    if 'year' in self.marc.result['dates']['date']:
                        year = self.marc.result['dates']['date']['year']
                    if 'to' in self.marc.result['dates']['date']:
                        to = self.marc.result['dates']['date']['to']

            if year is not None and to is not None:
                # The former ``year - to`` could not be positive if 'to' is
                # the end year, which produced a negative extent. Use the
                # length of the span and never report less than one volume.
                return max(abs(to - year), 1), Units.Band
            elif year is not None:
                return 1, Units.Band
            else:
                return 1, Units.Periodikum
        else:
            logging.error(
                'Could not parse %s, with coverage %s and format %s.',
                self.marc.result['identifier'], coverage, swissbib_format)
            return 1, Units.Seiten

    def parse_partituren(
            self, coverage: Optional[str]) -> Tuple[Union[float, int], Units]:
        """Estimate the extent of sheet music.

        Tries pages first, then parts ("Stimmen"), then volumes and similar
        containers, then meters. Falls back to a single score.
        """
        if coverage is None or empty.fullmatch(coverage):
            return 1, Units.Partitur

        num, name = parse_pages(coverage)
        if num > 0:
            return num, name

        # The plural has to be tested first: 'Stimme' is a prefix of
        # 'Stimmen', so testing the singular first hid the plural branch.
        if re.match('Stimmen', coverage):
            return 3, Units.Stimmen
        if re.match('Stimme', coverage):
            return 1, Units.Stimmen

        num, name = parse_volumes(coverage, Units.Partitur)

        # With a single group, findall yields plain strings; indexing them
        # would only read the first digit. Each part counts half.
        for count in re.findall(r'(\d+) Stimme[n]', coverage):
            num += int(count) / 2

        containers = re.findall(
            r'(\d+) (Abt|B|C|H$|He|K|[Pp]art|Ser|T|[Vv]ol)', coverage)
        for match in containers:
            num += int(match[0])

        if num > 0:
            return num, Units.Partitur

        num, name = parse_meters(coverage)
        if num > 0:
            return num, name

        return 1, Units.Partitur

    def parse_maps(
            self, coverage: Optional[str]) -> Tuple[Union[float, int], Units]:
        """Estimate the extent of maps and atlases.

        Tries pages, then single maps, then atlases, then map folders.
        Falls back to four maps.
        """
        if coverage is None or empty.fullmatch(coverage):
            return 4, Units.Karten

        num, name = parse_pages(coverage)
        # Formerly ``name == 'Seiten'``, which compares a Units member with
        # a string. Use the same test as the other parsers.
        if num > 0:
            return num, name

        maps_matches = re.findall(
            r'(\d+) ([Kc]arte[n]?|Pl[äa]n[e]?|Vogel|Ansicht|Panorama|Manuskript)',
            coverage)
        maps = sum(int(match[0]) for match in maps_matches)
        if maps > 0:
            return maps, Units.Karten

        atlas_matches = re.findall(r'(\d+) (Atlas)', coverage)
        atlas = sum(int(match[0]) for match in atlas_matches)
        if atlas > 0:
            return atlas, Units.Band

        folders, name = parse_folders(coverage, Units.Kartenmappen)
        if folders > 0:
            return folders, name

        return 4, Units.Karten

    def parse_letters(
            self, coverage: Optional[str]) -> Tuple[Union[float, int], Units]:
        """Estimate the extent of letters and letter collections.

        Tries pages and cards, then letters, then volumes, then folders.
        Falls back to two letters.
        """
        if coverage is None or empty.fullmatch(coverage):
            return 2, Units.Briefe

        pages, _ = parse_pages(coverage)

        # Every counted card adds that many pages.
        cards = re.findall(
            r'(\d+) (Karte|Briefkarte|Postkarte|Ansichtskarte|Visitenkarte)',
            coverage)
        for match in cards:
            pages += int(match[0])

        # A coverage that starts with a card term counts as one more page.
        if re.match('Briefkarte|Postkarte|Zettel|Karte|Visitenkarte',
                    coverage):
            pages += 1

        if pages > 0:
            return pages, Units.Seiten

        # This is the module-level helper, not this method.
        letters, name = parse_letters(coverage)
        if letters > 0:
            return letters, name

        volumes, name = parse_volumes(coverage, Units.Briefband)
        if volumes > 0:
            return volumes, name

        folders, name = parse_folders(coverage, Units.Briefmappen)
        if folders > 0:
            return folders, name

        return 2, Units.Briefe

    def parse_fotos(self, coverage: Optional[str]) -> Tuple[int, Units]:
        """Estimate the extent of pictures and photographs.

        Tries pages and counted pictures, then photo folders. Falls back to
        a single page.
        """
        if coverage is None or empty.fullmatch(coverage):
            return 1, Units.Seiten

        pages, _ = parse_pages(coverage)

        pictures = re.findall(
            r'(\d+) (Kupferstich|Litho|Foto|Zeichnung|Repro|Holzschnitt|Schattenriss'
            r'|Aquarell|Druckgrafik(en)?|Physionotrace|Bild|Stück|Radierung)',
            coverage)
        for match in pictures:
            pages += int(match[0])

        if pages > 0:
            return pages, Units.Seiten

        folders, name = parse_folders(coverage, Units.Fotomappen)
        if folders > 0:
            return folders, name

        return 1, Units.Seiten

    def parse_books(self, coverage: Optional[str],
                    swissbib_format: str) -> Tuple[int, Units]:
        """Estimate the extent of books and articles.

        Tries pages, then volumes. Falls back to one article for the format
        ``'Artikel'`` and to one volume for everything else.
        """
        if swissbib_format == 'Artikel':
            return_type = Units.Artikel
        else:
            return_type = Units.Band

        if coverage is None or empty.fullmatch(coverage):
            return 1, return_type

        num, name = parse_pages(coverage)
        if num > 0:
            return num, name

        volumes, name = parse_volumes(coverage, return_type)
        if volumes > 0:
            return volumes, name

        return 1, return_type

    def parse_manuscript(
            self, coverage: Optional[str]) -> Tuple[Union[float, int], Units]:
        """Estimate the extent of manuscripts.

        Tries pages, then volumes, then folders and similar containers,
        then boxes, then letters. Falls back to a single fascicle.
        """
        if coverage is None or empty.fullmatch(coverage):
            return 1, Units.Faszikel

        num, name = parse_pages(coverage)
        if num > 0:
            return num, name

        volumes, name = parse_volumes(coverage, Units.Manuskriptband)
        if volumes > 0:
            return volumes, name

        folders, name = parse_folders(coverage, Units.Faszikel)
        containers = re.findall(
            r'(\d+) (Stücke|Papiertüte[n]?|Faszikel|Dossier|Broschüre|Zeichenbuch|'
            r'Heft(e|chen)?|Schuber|Bündel|Konvolut|Schulheft|Umschläge|Büchlein|Umschlag|Predigten)',
            coverage)
        # These counts used to be added to ``volumes``, which is not read
        # again, so they were lost. They belong to the fascicle count.
        for match in containers:
            folders += int(match[0])

        if folders > 0:
            return folders, Units.Faszikel

        num, name = parse_boxes(coverage)
        if num > 0:
            return num, name

        letters, name = parse_letters(coverage)
        if letters > 0:
            return letters, name

        return 1, Units.Faszikel

    def parse_dossier(
            self, coverage: Optional[str]) -> Tuple[Union[int, float], Units]:
        """Estimate the extent of dossiers.

        Tries, in this order: pages, volumes, boxes, folders, meters,
        letters and archive units. Falls back to a single archive unit.
        """
        if coverage is None or empty.fullmatch(coverage):
            return 1, Units.Archiveinheit

        pages, name = parse_pages(coverage)
        if pages > 0:
            return pages, name

        volumes, name = parse_volumes(coverage, Units.Band)
        if volumes > 0:
            return volumes, name

        boxes, name = parse_boxes(coverage)
        if boxes > 0:
            return boxes, name

        folders, name = parse_folders(coverage, Units.Mappen)
        if folders > 0:
            return folders, name

        lfm, name = parse_meters(coverage)
        if lfm > 0:
            return lfm, name

        letters, name = parse_letters(coverage)
        if letters > 0:
            return letters, name

        archives, name = parse_archive(coverage, Units.Archiveinheit)
        if archives > 0:
            return archives, name

        return 1, Units.Archiveinheit

    def parse_record_type(self):
        """Defines a general type for the record.

        This is used to distinguish between prints and hand written
        manuscripts. Everything from ``dsv01`` is a print. Otherwise the
        type follows from 245$h, and a record without 245$h is a manuscript.
        """
        self.marc.parse_field('245', 'h', 'print_material')

        if self._database == 'dsv01':
            self.marc.add_value_sub('final', 'type', 'print')
        elif 'print_material' in self.marc.result:
            material = self.marc.result['print_material']
            if material in [
                    'Noten', 'Bildmaterial', 'Druckschrift', 'Kartenmaterial'
            ]:
                self.marc.add_value_sub('final', 'type', 'print')
            elif material in [
                    'Ton', 'Mikroform', 'Gegenstand', 'Filmmaterial'
            ]:
                self.marc.add_value_sub('final', 'type', 'other')
            elif material in ['Manuskript', 'Notenmanuskript']:
                self.marc.add_value_sub('final', 'type', 'manuscript')
            else:
                self.marc.add_value_sub('final', 'type', 'other')
                self.marc.add_error_tag('_unknown_print_material')
                logging.warning('Unknown print material: %s in %s.',
                                self.marc.result['print_material'],
                                self.marc.result['identifier'])
        else:
            self.marc.add_value_sub('final', 'type', 'manuscript')

    def parse_call_number(self):
        """Parses the call numbers of this record.

        Adds the library it belongs to as well. The call number is further
        indexed in parts to create facets.

        Only books from A100 & A125 are used. Books older than 1920 are very
        rare in A140 (UB Medizin). The books in A130 (Altertum) are ignored,
        because there are not that many, and it would be necessary to
        further filter the books from UBH.

        TODO: Implement a way to process all the call numbers, since one
        title can have many of them. Currently a single one is chosen by
        ``create_call_number_filter``. Books can have multiple call numbers
        for two reasons:
        1. The library owns more than one item.
        2. The bibliographic record describes multiple parts of one title.
        """
        for field in self.marc.get_fields('949'):
            if field['F'] in ['A100', 'A125']:
                self.marc.append_value('library', field['F'])
                if field['j'] != '':
                    self.marc.append_value('call_number', field['j'])

        if 'call_number' in self.marc.result:
            results = self.create_call_number_filter()
            if results is not None:
                self.marc.add_value_sub('filter', 'prefix', results[0])
                if results[1] is not None:
                    self.marc.add_value_sub('filter', 'base', results[1])
                if results[2] is not None:
                    self.marc.add_value_sub('filter', 'second', results[2])
                self.marc.add_value_sub('filter', 'number', results[3])

    def create_call_number_filter(
            self) -> Optional[Tuple[str, Optional[str], Optional[str], str]]:
        """Split the chosen call number into facet parts.

        A single call number is used as it is. Out of several, the last one
        starting with ``UBH`` is used. For ``dsv05`` the prefix ``HAN`` is
        prepended.

        :return: ``(prefix, base, second, number)``, where ``base`` and
            ``second`` may be ``None``; or ``None`` if the call number is
            empty or does not start with ``UBH``/``HAN``. An empty call
            number is also removed from the result.
        """
        call_number = ''
        candidates = self.marc.result['call_number']
        if len(candidates) == 1:
            # A missing subfield may have been stored as None.
            call_number = candidates[0] or ''
        else:
            for call_n in candidates:
                if call_n is not None and call_n.startswith('UBH'):
                    call_number = call_n

        # Collapse every run of whitespace into a single blank.
        call_number = re.sub(r'\s+', ' ', call_number.strip())

        database = self.marc.result['database']
        if database == 'dsv05' and call_number != '':
            call_number = 'HAN ' + call_number

        if call_number == '':
            # remove call number if it is empty.
            del self.marc.result['call_number']
            return None

        if not re.match('(UBH|HAN)', call_number) or re.fullmatch(
                'UBH', call_number):
            # ignore anything which does not comply with convention.
            return None

        # The patterns are tried from the most to the least specific shape.
        simple = re.fullmatch(r'(\w+) ([\w\-*.]+) (\d+)(.*)?', call_number)
        if simple:
            return simple.group(1), simple.group(2), None, (
                simple.group(3) + simple.group(4)).strip()

        word_roman = re.fullmatch(
            r'(\w+) (\w+) ([MCLXVI]+[ ]?[a-z]?) (\d+)(.*)?', call_number)
        if word_roman:
            return word_roman.group(1), \
                word_roman.group(2), \
                word_roman.group(3), \
                (word_roman.group(4) + word_roman.group(5)).strip()

        double_word_roman = re.fullmatch(
            r'(\w+) ([\w\-*]+) ([\w\-*]+) ([MCLXVI]+[ ]?[a-z]?) (\d+)(.*)?',
            call_number)
        if double_word_roman:
            # Keep whatever follows the number, as the branch above does;
            # it used to be dropped here.
            return double_word_roman.group(1), \
                double_word_roman.group(2) + ' ' + double_word_roman.group(3), \
                double_word_roman.group(4), \
                (double_word_roman.group(5)
                 + double_word_roman.group(6)).strip()

        three_word = re.fullmatch(
            r'(\w+) ([\w\-*]+) ([\w\-*]+) ([A-Za-z\-*]+)(.*)?', call_number)
        if three_word:
            return three_word.group(1), \
                three_word.group(2) + ' ' + three_word.group(3), \
                three_word.group(4), \
                three_word.group(5).strip()

        double_word = re.fullmatch(r'(\w+) ([\w\-*]+) ([\w\-*]+)(.*)?',
                                   call_number)
        if double_word:
            return double_word.group(1), double_word.group(
                2), double_word.group(3), double_word.group(4).strip()

        rest_han = re.fullmatch('(HAN) (.*)', call_number)
        if rest_han:
            return rest_han.group(1), None, None, rest_han.group(2).strip()

        rest_ubh = re.fullmatch('(UBH) (.*)', call_number)
        if rest_ubh:
            return rest_ubh.group(1), None, None, rest_ubh.group(2).strip()

        return None

    def parse_format_codes(self):
        """Parse the format codes and replace them with human readable forms.

        The c-format, the most condensed value, is used as format.
        """
        for subfield in ('a', 'b', 'c'):
            key = subfield + '-format'
            self.marc.parse_field('898', subfield, key)
            if key in self.marc.result:
                self.marc.result[key] = format_dict[self.marc.result[key]]

        if 'c-format' in self.marc.result:
            self.marc.add_value_sub('final', 'format',
                                    self.marc.result['c-format'])

    def parse_additional_information(self):
        """Information which might be interesting in the future, but not
        needed for current analysis."""
        self.marc.parse_leader()
        self.marc.parse_cat_date()
        self.marc.parse_rest_008()

        self.marc.parse_field('245', 'a', 'title')
        self.marc.parse_field('245', 'b', 'subtitle')
        self.marc.parse_field('245', 'c', 'author')

        self.marc.parse_field_to_subfield('264', 'a', 'production', 'place')
        self.marc.parse_field_to_subfield('264', 'b', 'production',
                                          'publisher')
        self.marc.parse_field_to_subfield('264', 'c', 'production', 'date')

        self.marc.parse_field_to_subfield('300', 'b', 'extent',
                                          'physical_attributes')
        self.marc.parse_field_to_subfield('300', 'c', 'extent',
                                          'size_and_format')
        self.marc.parse_field_to_subfield('300', 'e', 'extent',
                                          'additional_content')

        self.marc.parse_field_append_to_subfield('336', 'a', 'extent',
                                                 'content')
        self.marc.parse_field_append_to_subfield('337', 'a', 'extent',
                                                 'media')
        self.marc.parse_field_append_to_subfield('338', 'a', 'extent',
                                                 'carrier')
        self.marc.parse_field_to_subfield('348', 'a', 'extent', 'music')

        self.marc.parse_field('351', 'c', 'classification')
        self.marc.parse_field('250', 'a', 'version')
        self.marc.parse_field_to_subfield('340', 'a', 'extent', 'carrier')

        self.marc.parse_field_list([
            '600', '610', '611', '630', '648', '650', '651', '653', '655',
            '690', '691'
        ], {
            'a': 'title',
            '2': 'source',
            '0': 'identifier'
        }, 'subject_headings')

        self.marc.parse_field('856', 'u', 'link')
        self.marc.parse_field_to_subfield('908', 'a', 'extent', 'format')
        self.marc.parse_field('909', 'a', 'archive_tag')

        # The production date doubles as the display date. 'production' is
        # only read when it exists.
        if 'production' in self.marc.result:
            if 'date' in self.marc.result['production']:
                self.marc.result['final']['display_date'] = self.marc.result[
                    'production']['date']

    def pre_filter(self, message: str) -> bool:
        """Keep only records which belong to Universitätsbibliothek Basel.

        :return: ``False`` if the raw message mentions library A100 or A125,
            ``True`` otherwise.
        """
        return re.search('{"F": "(A100|A125)"},', message) is None

    def post_filter(self, transformed_message: dict) -> bool:
        """Tell whether a transformed record has to be removed.

        :return: ``True`` for records newer than 1920 and for records of a
            special format, ``False`` otherwise.
        """
        final = transformed_message['final']

        # Remove any record which is newer than 1920.
        if 'year' in final:
            if int(final['year']) > 1920:
                return True

        # Remove records of special formats.
        if final['format'] in self._NON_PAGE_FORMATS:
            return True

        return False