def __init__(self, config, config_class=ElasticConsumerConfig, logger=logging.getLogger(__name__)):
    """Initialise the consumer and the Elasticsearch index it uses.

    :param config: Configuration passed unchanged to the parent constructor.
    :param config_class: Class the parent uses to interpret ``config``.
    :param logger: Logger forwarded to the parent constructor.
    """
    super().__init__(config, config_class, logger=logger)
    # The parent constructor provides ``self.configuration``.
    elastic_settings = self.configuration.elastic_settings
    self._index = ElasticIndex(**elastic_settings)
    self._key = self.configuration.key
def __init__(self, database, config, logger=logging.getLogger(__name__)):
    """Store the database name and config and open the digidata index.

    :param database: Name of the source database.
    :param config: Mapping that must contain the keys ``'digidata'``
        (keyword arguments for ``ElasticIndex``) and ``'page-conversions'``.
    :param logger: Logger forwarded to the parent constructor.
    """
    super().__init__(logger)
    self.marc = None
    self._database = database
    self._config = config
    digidata_settings = config['digidata']
    self.digidata_index = ElasticIndex(**digidata_settings)
    self.page_conversion_rates = config['page-conversions']
def setup_class(self):
    """Create the Kafka admin client, a seeded test index, producer and consumer."""
    self.admin = KafkaAdminClient(bootstrap_servers='localhost:9092')
    self.index = ElasticIndex('test-elastic-producer', 'doc')
    # Seed the documents {'test': 1} .. {'test': 5} under the ids 0 .. 4.
    for doc_id in range(5):
        self.index.index_into({'test': doc_id + 1}, doc_id)
    self.producer = ElasticProducer('configs/elastic/test_elastic_producer_producer.yml')
    self.consumer = SimpleConsumer('configs/elastic/test_elastic_producer_consumer.yml')
class TestElasticProducer(object):
    """Integration test: documents of an elastic index are produced to Kafka."""

    def setup_class(self):
        """Create the Kafka admin client, a seeded test index, producer and consumer."""
        self.admin = KafkaAdminClient(bootstrap_servers='localhost:9092')
        self.index = ElasticIndex('test-elastic-producer', 'doc')
        # Seed the documents {'test': 1} .. {'test': 5} under the ids 0 .. 4.
        for doc_id in range(5):
            self.index.index_into({'test': doc_id + 1}, doc_id)
        self.producer = ElasticProducer('configs/elastic/test_elastic_producer_producer.yml')
        self.consumer = SimpleConsumer('configs/elastic/test_elastic_producer_consumer.yml')

    def teardown_class(self):
        """Close the consumer, delete the test topic and delete the test index."""
        self.consumer.close()
        self.admin.delete_topics(['test-elastic-producer'])
        self.admin.close()
        self.index.delete()

    #@pytest.mark.skip()
    def test_produce(self):
        """The first consumed message is the document stored under id 0."""
        self.producer.process()
        consumed = self.consumer.consume()
        key, message = consumed
        assert key == '0'
        assert message == '{"test": 1}'
def enrich(
        index: ElasticIndex, system_number: str
) -> Tuple[Dict[str, int], Optional[str], Optional[List[str]]]:
    """Look up the hit counts and DOI stored for ``system_number``.

    :param index: Index queried through ``scan_index``.
    :param system_number: Document ``_id`` to look up.
    :return: ``(hits, doi, error_tags)``. When exactly one document is found
        its ``'hits'`` value and its ``'doi'`` (``None`` if absent) are
        returned; otherwise an all-zero table for 2012-2018 plus ``'total'``
        and ``None`` for the DOI. The tag list is always empty.
    """
    results = index.scan_index(query={
        '_source': ['hits.*', 'doi'],
        'query': {'term': {'_id': {'value': system_number}}},
    })

    if len(results) != 1:
        zeroes = {str(year): 0 for year in range(2012, 2019)}
        zeroes['total'] = 0
        return zeroes, None, []

    logging.debug(results)
    record = results[0]
    doi = record['doi'] if 'doi' in record else None
    return record['hits'], doi, []
def enrich(
        index: ElasticIndex, system_number: str
) -> Tuple[Dict[str, Dict[str, int]], Union[List[str], None]]:
    """Look up the ``bau`` and ``swa`` counters stored for ``system_number``.

    :param index: Index queried through ``scan_index``.
    :param system_number: Document ``_id`` to look up.
    :return: ``(data, error_tags)``. ``data`` is the single matching document,
        or an all-zero ``bau``/``swa`` table for 2016-2018 plus ``'total'``
        when the query does not return exactly one document. The tag list is
        always empty.
    """
    def zero_table() -> Dict[str, int]:
        # A fresh dict per call so the two sub-tables never share state.
        return {'2016': 0, '2017': 0, '2018': 0, 'total': 0}

    results = index.scan_index(query={
        '_source': ['bau.*', 'swa.*'],
        'query': {'term': {'_id': {'value': system_number}}},
    })

    if len(results) != 1:
        return {'bau': zero_table(), 'swa': zero_table()}, []
    return results[0], []
def enrich(
        index: ElasticIndex,
        system_number: str) -> Tuple[Dict[str, int], Union[List[str], None]]:
    """Count the index entries recorded for ``system_number``.

    :param index: Index queried through ``scan_index``.
    :param system_number: Numeric system number as a string; it is converted
        with ``int`` after the query has run.
    :return: ``({'total': <number of results>}, error_tags)`` where the tag
        list is ``['_opac_dual_hit']`` for system numbers below 320000 and
        empty otherwise.
    """
    matches = index.scan_index(
        query={'query': {'term': {'system_number': {'value': system_number}}}})
    summary = {'total': len(matches)}
    tags = ['_opac_dual_hit'] if int(system_number) < 320000 else []
    return summary, tags
class SimpleElasticConsumer(AbstractBaseConsumer):
    """
    A KafkaConsumer which consumes messages and indexes them into a
    ElasticIndex one by one.

    Requires the following configs:

        Consumer:
          bootstrap_servers: localhost:9092
          client_id: test
          group_id: elastic-consumer-test
          auto_offset_reset: earliest
        Topics:
          - test
        ElasticIndex:
          index: name-of-index
          doc_type: _doc (default value for elasticsearch 6)
          url: http://localhost:9200
          timeout: 300
    """

    def __init__(self, config, config_class=ElasticConsumerConfig, logger=logging.getLogger(__name__)):
        """Initialise the consumer and open the target index.

        :param config: Configuration passed unchanged to the parent constructor.
        :param config_class: Class the parent uses to interpret ``config``.
        :param logger: Logger forwarded to the parent constructor.
        """
        super().__init__(config, config_class, logger=logger)
        elastic_settings = self.configuration.elastic_settings
        self._index = ElasticIndex(**elastic_settings)

    def consume(self) -> bool:
        """
        Consumes a single message from the subscribed topic and indexes it
        into the elasticsearch index.

        A payload that is not valid JSON is indexed as a document holding the
        raw text under 'message' and the parser error under 'error'.

        Returns True if successful, False otherwise.
        """
        message = next(self._consumer)
        key = message.key.decode('utf-8')
        payload = message.value.decode('utf-8')
        try:
            document = json.loads(payload)
        except JSONDecodeError as ex:
            document = {'message': payload, 'error': str(ex)}

        result = self._index.index_into(document, key)
        if not result:
            return result

        # Commit only those partitions whose position moved past the
        # committed offset.
        for partition in self._consumer.assignment():
            position = self._consumer.position(partition)
            if position == self._consumer.committed(partition):
                continue
            self._consumer.commit({partition: OffsetAndMetadata(position, "")})
        # self._time_logger.info("Consumed and indexed one message.")
        return result
def upload_to_elastic(self, what: str, identifier='identifier'):
    """
    Uploads a harvest to a elastic search index.

    Every file below the harvest directory is parsed as XML, each child of
    './ListRecords/record/metadata' is converted to JSON and cleaned, and the
    records are bulk-indexed into an index named after the harvest and
    today's date.

    :param what:        Which harvest it should upload 'ach', 'proj', 'person', 'org', 'pub'
    :param identifier:  What the identifier inside the data is called (default 'identifier')
    """
    records = []
    harvest_directory = self.harvester_info[what][2]
    for directory, _, file_names in os.walk(harvest_directory):
        for file_name in file_names:
            document = ElementTree.parse(directory + '/' + file_name)
            root = purge_namespaces(document.getroot())
            records.extend(
                json.loads(xml2json(element, 'parker'))
                for element in root.findall('./ListRecords/record/metadata/'))

    for record in records:
        clean_data(record, identifier)

    index_name = self.elastic_index + what + '_' + date.today().isoformat()
    ElasticIndex(index_name, 'publication', self.elastic_url).bulk(records, identifier)
def transform_affiliated_publication(self, element, parent, edoc_tag, index, doc_type, url):
    """Transform affiliated publications in projects.

    Looks up the mcss id held in ``element.text`` in the given elastic index
    and appends one ``item`` child with the matching eprints id to the
    ``edoc_tag`` child of ``parent`` (created when missing).

    When several documents match, all eprints ids are added and an error is
    logged; de-duplication has to be resolved manually. When nothing matches
    the mcss id is ignored and an error is logged.
    (TODO: also notify a maintainer in that case?)
    """
    target = parent.find('./' + edoc_tag)
    if target is None:
        target = ET.SubElement(parent, edoc_tag)

    es = ElasticIndex(index, doc_type, url=url)
    matches = es.scan_index({
        '_source': ['eprintid'],
        'query': {'term': {'mcss_id': {'value': int(element.text)}}},
    })

    if len(matches) == 0:
        logging.error(
            'Found no eprints ID for the following mcss_id: %s for project %s, %s.',
            element.text, self.current_id, self.current_title)
        return

    for match in matches:
        ET.SubElement(target, 'item').text = str(match['eprintid'])
    if len(matches) > 1:
        logging.error(
            'Found multiple results with mcss_id %s for project %s %s.',
            element.text, self.current_id, self.current_title)
class ElasticProducer(AbstractBaseProducer):
    """Producer that sends every document of an elastic index as a message."""

    def __init__(self, config: str):
        """Load the producer configuration and open the source index.

        :param config: Configuration passed to the parent constructor, which
            parses it with ``ElasticProducerConfig``.
        """
        super().__init__(config, config_parser=ElasticProducerConfig)
        elastic_settings = self.configuration.elastic_settings
        self._index = ElasticIndex(**elastic_settings)

    def process(self):
        """Scroll through the index and send each record, then flush and close.

        The message key is the record's ``_id`` and the value is its
        ``_source`` serialised as JSON, both UTF-8 encoded.
        """
        scroll_settings = self.configuration.scroll
        for batch in self._index.scroll(**scroll_settings):
            for record in batch:
                key: bytes = record['_id'].encode('utf-8')
                value: bytes = json.dumps(record['_source']).encode('utf-8')
                self.send(key, value)
        self.flush()
        self.close()
def enrich(index: ElasticIndex, system_number: str,
           database: str) -> Tuple[Dict[str, Dict[str, int]], Union[List[str], None]]:
    """Look up the reservation and loan counters stored for ``system_number``.

    :param index: Index queried through ``scan_index``.
    :param system_number: Document ``_id`` to look up.
    :param database: Source database; only ``'dsv01'`` triggers a lookup.
    :return: ``(data, error_tags)``. ``data`` is the single matching document
        with an empty tag list, or the module-level ``placeholder`` with the
        tag ``'_no_aleph_data'`` when the database is not ``'dsv01'`` or the
        query does not return exactly one document.
    """
    if database != 'dsv01':
        # place holder values for scripted fields.
        return placeholder, ['_no_aleph_data']

    results = index.scan_index(query={
        '_source': ['reservations.*', 'loans.*'],
        'query': {'term': {'_id': {'value': system_number}}},
    })
    if len(results) == 1:
        return results[0], []
    return placeholder, ['_no_aleph_data']
def import_data(plattform: str, year: str):
    """Load one exported JSON file into its elastic index.

    Reads ``emanus-<plattform>-<year>.json`` from the working directory. The
    entries of its ``"data"`` key each get an ``"identifier"`` copied from
    ``dimensions.pagestem`` and are bulk-indexed; all other top-level keys are
    stored together as one metadata document under id 0.

    :param plattform: Platform name used in the file and index names.
    :param year: Year used in the file and index names.
    """
    index = ElasticIndex("emanus-{}-data-base-{}".format(plattform, year), "doc")
    with open("emanus-{}-{}.json".format(plattform, year), 'r') as fp:
        content = json.load(fp)

    metadata = {}
    records = []
    for key, value in content.items():
        if key != "data":
            metadata[key] = value
            continue
        for entry in value:
            # The entry is extended in place, not copied.
            entry["identifier"] = entry["dimensions"]["pagestem"]
            records.append(entry)

    index.bulk(records, identifier_key="identifier")
    index.index_into(metadata, 0)
def enrich(
        system_number: str
) -> Tuple[Dict[str, Dict[str, int]], Union[List[str], None]]:
    """Count the log entries that mention ``system_number``.

    Queries the yearly ``sru-<year>`` index (2018) and the yearly
    ``swissbib-<source>-<year>`` indexes (2017 and 2018) for the sources
    ``green``, ``jus`` and ``bb`` on ``swissbib_host``.

    :param system_number: System number to search for.
    :return: ``(hits, error_tags)``. ``hits`` maps ``'sru'`` and each source
        to a dict of per-year counts plus ``'total'``, and has a top-level
        ``'total'`` with the sum over everything. The tag list is always
        empty.
    """
    total = 0
    hits = dict()

    # The per-source dict is created once, before the year loop, so that
    # adding further years does not discard the counts of earlier ones.
    hits['sru'] = dict()
    total_sub = 0
    for year in range(2018, 2019):
        sru = ElasticIndex('sru-{}'.format(year),
                           doc_type='logs',
                           url=swissbib_host)
        query = {'query': {'match': {'requestparams': system_number}}}
        num = len(sru.scan_index(query=query))
        hits['sru'][str(year)] = num
        total += num
        total_sub += num
    hits['sru']['total'] = total_sub

    for source in ['green', 'jus', 'bb']:
        hits[source] = dict()
        total_sub = 0
        for year in range(2017, 2019):
            swissbib = ElasticIndex('swissbib-{}-{}'.format(source, year),
                                    doc_type='logs',
                                    url=swissbib_host)
            query = {
                'query': {
                    'term': {
                        'request_middle.keyword': {
                            'value': system_number
                        }
                    }
                }
            }
            num = len(swissbib.scan_index(query=query))
            hits[source][str(year)] = num
            total += num
            total_sub += num
        hits[source]['total'] = total_sub

    hits['total'] = total
    return hits, []
from simple_elastic import ElasticIndex from kafkaflows.digi.user_data import swissbib, aleph, e_codices, e_manuscripta, e_rara, opac host = 'http://localhost:9200' e_codices_index = ElasticIndex('e-codices-data', 'hits', url=host) e_manuscripta_index = ElasticIndex('e-manuscripta-data', 'hits', url=host) e_rara_index = ElasticIndex('e-rara-data', 'hits', url=host) aleph_index = ElasticIndex('aleph-dsv01-data', 'hits', url=host) opac_index = ElasticIndex('opac-access', 'log', url=host) def enrich_user_data(config): for index in config['indexes']: instance = ElasticIndex(**index['index']) query = {'query': {'match_all': {}}} for results in instance.scroll(query=query): for item in results: identifier = item['identifier'] database = item['database'] sys_number = item['identifiers'][database] if 'error_tags' in item: item['error_tags'] = set(item['error_tags']) total = 0 # swissbib
from simple_elastic import ElasticIndex from collections import Counter from roman import fromRoman, InvalidRomanNumeralError, romanNumeralPattern import json import re find_roman_numeral = re.compile('([MCLXVI]+)[^a-z]') roman_numeral = re.compile('^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$') if __name__ == '__main__': index = ElasticIndex('kafka*', 'record') c = Counter() alt_c = Counter() query = { '_source': ['extent.coverage', 'c-format'], 'query': { 'exists': { 'field': 'extent.coverage' } } } missing = open('missing.txt', 'w') find_all_result = open('find_all.txt', 'w') find_all = list() descriptive_coverage = list()
def setup_class(self):
    """Open the elastic index named 'test' that the tests work against."""
    test_index = ElasticIndex('test')
    self.index = test_index
class TestElasticIndex(object):
    """Integration tests for ElasticIndex against the index named 'test'.

    The tests share one index and depend on the documents earlier tests
    created (the expected count of 7 is the sum of them).
    """

    def setup_class(self):
        """Open the index the tests work against."""
        self.index = ElasticIndex('test')

    def teardown_class(self):
        """Delete the test index."""
        self.index.delete_index()

    def test_scroll(self):
        """scroll() yields lists."""
        for doc_id, flag in enumerate((True, False, True, False), start=1):
            self.index.index_into({'test': flag}, doc_id)
        for batch in self.index.scroll():
            assert isinstance(batch, list)

    def test_index_into(self):
        """index_into() reports whether the document was accepted."""
        accepted = self.index.index_into(
            {'test': True, 'object': "This is a string"}, 5)
        assert accepted
        # 'object' was indexed as a string above; an object value is expected
        # to be rejected.
        rejected = self.index.index_into(
            {'test': True, 'object': {'sub-object': "another string"}}, 6)
        assert not rejected
        accepted = self.index.index_into({'test': False}, 'HAN000827182')
        assert accepted

    def test_search(self):
        """search() returns every document after one more bulk insert."""
        documents = [{'id': '1234', 'test': True}]
        self.index.bulk(data=documents, identifier_key='id')
        assert len(self.index.search()) == 7

    def test_search_not_unpack(self):
        """search(unpack=False) returns the same number of entries."""
        assert len(self.index.search(unpack=False)) == 7

    def test_alias(self):
        """An alias can be added and removed again."""
        indices = self.index.instance.indices
        self.index.add_to_alias('test1')
        assert indices.get_alias('test1')
        self.index.remove_from_alias('test1')
        with pytest.raises(NotFoundError):
            indices.get_alias('test1')

    def test_count(self):
        """count() reports all seven documents."""
        assert self.index.count() == 7
from openpyxl import load_workbook from simple_elastic import ElasticIndex if __name__ == '__main__': wb = load_workbook( 'data/VERZ_DSV01-Ausleihen-Vormerkungen_20180802_bmt.xlsx') ws = wb['vor-1900-vormerkungen'] index = ElasticIndex('aleph-dsv01-data', 'hits') all_data = dict() for row in ws.iter_rows(min_row=2, min_col=2, max_col=4): doc = dict() system_number = str(row[0].value) while len(system_number) != 9: system_number = '0' + system_number if system_number not in all_data: all_data[system_number] = dict() if 'reservations' not in all_data[system_number]: all_data[system_number]['reservations'] = dict() all_data[system_number]['reservations'][str( row[2].value)] = row[1].value ws = wb['vor-1900-ausleihen'] for row in ws.iter_rows(min_row=2, min_col=2, max_col=4): doc = dict() system_number = str(row[0].value)
def transform_dni_to_contributor(self, element, parent, edoc_tag, index='', doc_type='', url='',
                                 fdb_index='', fdb_doc_type='', fdb_url=''):
    """Uses the given DNI to add a full contributor.

    The DNI in ``element.text`` is first searched in the index given by
    ``index``/``doc_type``/``url``. If documents are found, every contributor
    of the first document whose DNI equals the given one is copied into a new
    ``contributor/item`` below ``parent``.

    If that search turns up empty, the persons index given by
    ``fdb_index``/``fdb_doc_type``/``fdb_url`` is searched. A single match is
    copied over; no match or several matches are only logged.

    A missing DNI (``element.text`` is None) is logged as an error.
    """
    dni = element.text
    if dni is None:
        self.logger.error('A DNI in element %s is None.', self.current_id)
        return

    def new_contributor_item():
        # Reuse the existing <contributor> container or create it on demand.
        container = parent.find('./contributor')
        if container is None:
            container = ET.SubElement(parent, 'contributor')
        return ET.SubElement(container, 'item')

    def add_text(node, tag, text):
        ET.SubElement(node, tag).text = text

    es = ElasticIndex(index, doc_type, url=url)
    results = es.scan_index({
        '_source': ['contributors'],
        'query': {'term': {'contributors.dni.keyword': {'value': int(dni)}}},
    })

    if len(results) > 0:
        # returns all contributors. Only add the one with the right DNI.
        for contrib in results[0]['contributors']:
            if 'dni' not in contrib or str(contrib['dni']) != dni:
                continue
            item = new_contributor_item()
            add_text(item, 'dni', str(contrib['dni']).strip())
            name = ET.SubElement(item, 'name')
            add_text(name, 'given', str(contrib['name']['given']).strip())
            add_text(name, 'family', str(contrib['name']['family']).strip())
            # Optional identifiers, emitted in this fixed order.
            for key in ('id', 'orcid', 'unibasChPublicId'):
                if key in contrib:
                    add_text(item, key, str(contrib[key]).strip())
        return

    # len(results) == 0
    # try to search it in RDB persons database.
    fdb = ElasticIndex(fdb_index, fdb_doc_type, url=fdb_url)
    fdb_results = fdb.scan_index({
        'query': {'term': {'dni.keyword': {'value': int(dni)}}},
    })

    if len(fdb_results) == 0:
        self.logger.error('Could not find an author with dni %s.', dni)
        return
    if len(fdb_results) > 1:
        # Should never happen...
        self.logger.critical(
            'Found several persons with DNI %s in RDB.', dni)
        return

    # in case of a single find => add the contributor to the list.
    item = new_contributor_item()
    person = fdb_results[0]
    add_text(item, 'id', person['email'].strip())
    # The source key is spelled differently from the emitted tag.
    if 'unibasCHpublicId' in person:
        add_text(item, 'unibasChPublicId', person['unibasCHpublicId'])
    if 'orcid' in person:
        add_text(item, 'orcid', person['orcid'].strip())
    add_text(item, 'dni', str(person['dni']).strip())
    name = ET.SubElement(item, 'name')
    add_text(name, 'given', person['firstname'].strip())
    add_text(name, 'family', person['lastname'].strip())
class TransformSruExport(DataTransformation):
    """Transforms MARC records of an SRU export into flat result dicts.

    Each record is wrapped in a ``MARCMapper``; the ``parse_*`` methods then
    add identifiers, dates, formats, page estimates and call numbers to
    ``self.marc.result``. ``pre_filter`` and ``post_filter`` return True for
    records that are to be dropped.
    """

    def __init__(self, database, config, logger=logging.getLogger(__name__)):
        """Store the database name and config and open the digidata index.

        :param database: Name of the source database ('dsv01' and 'dsv05'
            get special handling).
        :param config: Mapping with the keys 'digidata' (keyword arguments
            for ``ElasticIndex``) and 'page-conversions' (pages per unit,
            keyed by unit value).
        :param logger: Logger forwarded to the parent constructor.
        """
        super().__init__(logger)
        self._database = database
        self._config = config
        self.marc = None
        self.digidata_index = ElasticIndex(**config['digidata'])
        self.page_conversion_rates = config['page-conversions']

    def transform(self, value: str) -> dict:
        """Transform one record and return the collected result dict."""
        # Do not reorder this function!
        self.marc = MARCMapper(value)
        self.marc.add_value('database', self._database)
        self.marc.identifier()

        for field in self.marc.get_fields('024'):
            if field.indicator1 == '7':
                if 'a' in field and '2' in field:
                    self.marc.add_identifier(field['2'], field['a'])

        self.marc.add_identifier('swissbib', self.marc['001'].value())

        if self._database == 'dsv01':
            for _035 in self.marc.get_fields('035'):
                if _035['a'] is not None:
                    if _035['a'].startswith('(IDSBB)'):
                        self.marc.add_identifier('dsv01',
                                                 _035['a'].split(')')[1])
        elif self._database == 'dsv05':
            self.marc.add_identifier('dsv05', self.marc['001'].value()[3:])

        # Do not re-order these!
        self.enrich()
        self.parse_record_type()
        self.parse_date()
        self.parse_format_codes()
        self.parse_number_of_pages()
        self.parse_call_number()
        self.parse_additional_information()

        return self.marc.result

    def enrich(self):
        """Enriching the metadata from other data sources."""
        self.enrich_digidata()

    def enrich_digidata(self):
        """Loads data from the digidata elastic index.

        No live updates, as the digidata repository is on
        Afrikaportal-elastic, which is only on localhost accessible. To
        update run the digispace-producer & digispace-consumer.

        TODO: Load live data instead of copy. To do this direct access to
        Afrikaportal is necessary.
        """
        query = {
            "query": {
                "term": {
                    "system_number":
                    self.marc.result['identifiers'][self._database]
                }
            }
        }
        result = self.digidata_index.search(query=query)
        if len(result) > 0:
            self.marc.add_value('is_digitized', True)
            if 'number_of_images' in result[0]:
                self.marc.add_value('number_of_images',
                                    result[0]['number_of_images'])
        else:
            self.marc.add_value('is_digitized', False)

    def _store_final_year(self, year, source: str):
        """Store year, century and decade and remember which field supplied them."""
        self.marc.add_value_sub('final', 'year', int(year))
        self.marc.append_value_sub('final', 'century', int(year / 100) + 1)
        self.marc.append_value_sub('final', 'decade', year - year % 10)
        self.marc.add_value_sub('source', 'year', source)

    def parse_date(self):
        """Parsing the date from the various possible fields.

        Field 008 wins over 046, which wins over 264. Stores where the
        information was taken from; records without any usable date get the
        error tag '_no_valid_date'.
        """
        # Both parsers run unconditionally; 264 is only tried as a last resort.
        _008_date = self.marc.parse_date_from_008()
        _046_date = self.marc.parse_date_from_046()

        if _008_date:
            self._store_final_year(
                self.marc.result['dates']['date']['year'], '008')
        elif _046_date:
            self._store_final_year(
                self.marc.result['dates']['exact']['year'], '046')
        elif self.marc.parse_date_from_264():
            self._store_final_year(
                self.marc.result['dates']['parsed_264_year'], '264')
        else:
            self.marc.add_value_sub('source', 'year', 'None')
            self.marc.add_error_tag('_no_valid_date')

    def parse_number_of_pages(self):
        """Figure out the number of pages!

        First source: digidata number of images.
        Second source: coverage
        Third source: estimates.
        """
        self.marc.parse_field_to_subfield('300', 'a', 'extent', 'coverage')

        pages = 0
        name = Units.No

        # This will be filtered anyway.
        if self.marc.result['c-format'] in [
                'Objekt', 'Diverse Tonformate', 'Schallplatte',
                'Diverse Filmformate', 'Datenbank'
        ]:
            pages = 1
            name = Units.Gegenstand
            # NOTE(review): this source is overwritten by 'estimate' below
            # because Units.Gegenstand is not Units.Seiten.
            self.marc.add_value_sub('source', 'pages', 'format')

        if name == Units.No:
            pages, name = self.parse_coverage_field()

        if name == Units.No:
            raise ValueError('Name should not be None here: {}. {}'.format(
                self.marc.result['identifier'], pages))

        if name != Units.Seiten:
            # Anything that is not counted in pages is converted with the
            # configured pages-per-unit rate.
            self.marc.add_value_sub('source', 'pages', 'estimate')
            self.marc.add_value_sub('source', 'estimate', name.value)
            pages = pages * self.page_conversion_rates[name.value]
        else:
            self.marc.add_value_sub('source', 'pages', 'coverage')

        self.marc.add_value_sub('extent', 'pages', pages)

        if 'number_of_images' in self.marc.result:
            # The digidata image count overrides parsed and estimated values.
            pages = self.marc.result['number_of_images']
            self.marc.add_value_sub('source', 'pages', 'digidata')
            if 'estimate' in self.marc.result['source']:
                del self.marc.result['source']['estimate']

        self.marc.add_value_sub('final', 'pages', pages)

    def parse_coverage_field(self) -> Tuple[Union[float, int], Units]:
        """Parses various values from the coverage field and returns them as
        tuple: (number of unit, name of unit)

        The parser is chosen by the record's 'c-format'. Unknown formats are
        logged and counted as one page.
        """
        if 'coverage' in self.marc.result['extent']:
            coverage = self.marc.result['extent']['coverage']
        else:
            coverage = None

        swissbib_format = self.marc.result['c-format']

        if swissbib_format in ['Klavierauszug', 'Partitur', 'Noten']:
            return self.parse_partituren(coverage)
        elif swissbib_format in ['Atlas', 'Karte', 'Diverse Kartenformate']:
            return self.parse_maps(coverage)
        elif swissbib_format in ['Brief', 'Briefsammlung']:
            return self.parse_letters(coverage)
        elif swissbib_format in ['Diverse Bildformate', 'Fotografie']:
            return self.parse_fotos(coverage)
        elif swissbib_format in [
                'Gesamtwerk', 'Buch', 'Verfassung / Gesetz', 'Artikel'
        ]:
            return self.parse_books(coverage, swissbib_format)
        elif swissbib_format in ['Handschrift']:
            return self.parse_manuscript(coverage)
        elif swissbib_format in ['Dossier']:
            return self.parse_dossier(coverage)
        elif swissbib_format in ['Zeitung', 'Zeitschrift / Schriftenreihe']:
            # TODO: Better implementation for periodicals.
            if coverage is None:
                return 1, Units.Periodikum

            num, name = parse_volumes(coverage, Units.Band)
            if num > 0:
                return num, name

            year = None
            to = None
            if 'dates' in self.marc.result:
                if 'date' in self.marc.result['dates']:
                    if 'year' in self.marc.result['dates']['date']:
                        year = self.marc.result['dates']['date']['year']
                    if 'to' in self.marc.result['dates']['date']:
                        to = self.marc.result['dates']['date']['to']

            if year is not None and to is not None:
                # One volume per year of the span. abs() keeps the count
                # non-negative whichever end of the range is larger.
                return abs(year - to), Units.Band
            elif year is not None:
                return 1, Units.Band
            else:
                return 1, Units.Periodikum
        else:
            logging.error(
                'Could not parse %s, with coverage %s and format %s.',
                self.marc.result['identifier'], coverage, swissbib_format)
            return 1, Units.Seiten

    def parse_partituren(self,
                         coverage: str) -> Tuple[Union[float, int], Units]:
        """Estimate the extent of sheet music; defaults to one Partitur."""
        if coverage is None or empty.fullmatch(coverage):
            return 1, Units.Partitur

        num, name = parse_pages(coverage)
        if num > 0:
            return num, name

        # The plural has to be tested first: 'Stimme' is a prefix of
        # 'Stimmen' and would otherwise always win.
        if re.match('Stimmen', coverage):
            return 3, Units.Stimmen
        if re.match('Stimme', coverage):
            return 1, Units.Stimmen

        num, name = parse_volumes(coverage, Units.Partitur)

        # With a single group findall yields plain strings, so the whole
        # match is the number (indexing it would only take the first digit).
        for count in re.findall(r'(\d+) Stimme[n]', coverage):
            num += int(count) / 2

        results = re.findall(
            r'(\d+) (Abt|B|C|H$|He|K|[Pp]art|Ser|T|[Vv]ol)', coverage)
        for result in results:
            num += int(result[0])

        if num > 0:
            return num, Units.Partitur

        num, name = parse_meters(coverage)
        if num > 0:
            return num, name

        return 1, Units.Partitur

    def parse_maps(self, coverage: str) -> Tuple[Union[float, int], Units]:
        """Estimate the extent of maps and atlases; defaults to four maps."""
        if coverage is None or empty.fullmatch(coverage):
            return 4, Units.Karten

        num, name = parse_pages(coverage)
        # NOTE(review): this compares the unit with the string 'Seiten' while
        # every other check in this class compares against Units members;
        # confirm whether Units.Seiten was intended before changing it.
        if name == 'Seiten':
            return num, name

        maps_matches = re.findall(
            r'(\d+) ([Kc]arte[n]?|Pl[äa]n[e]?|Vogel|Ansicht|Panorama|Manuskript)',
            coverage)
        maps = 0
        for matches in maps_matches:
            maps += int(matches[0])
        if maps > 0:
            return maps, Units.Karten

        atlas_matches = re.findall(r'(\d+) (Atlas)', coverage)
        atlas = 0
        for match in atlas_matches:
            atlas += int(match[0])
        if atlas > 0:
            return atlas, Units.Band

        folders, name = parse_folders(coverage, Units.Kartenmappen)
        if folders > 0:
            return folders, name

        return 4, Units.Karten

    def parse_letters(self, coverage: str) -> Tuple[Union[float, int], Units]:
        """Estimate the extent of letters; defaults to two letters."""
        if coverage is None or empty.fullmatch(coverage):
            return 2, Units.Briefe

        pages, name = parse_pages(coverage)

        results = re.findall(
            r'(\d+) (Karte|Briefkarte|Postkarte|Ansichtskarte|Visitenkarte)',
            coverage)
        for result in results:
            pages += int(result[0])

        # A coverage that starts with a card type counts as one more page.
        result = re.match('Briefkarte|Postkarte|Zettel|Karte|Visitenkarte',
                          coverage)
        if result:
            pages += 1

        if pages > 0:
            return pages, Units.Seiten

        # This is the module-level parse_letters helper, not this method.
        letters, name = parse_letters(coverage)
        if letters > 0:
            return letters, name

        volumes, name = parse_volumes(coverage, Units.Briefband)
        if volumes > 0:
            return volumes, name

        folders, name = parse_folders(coverage, Units.Briefmappen)
        if folders > 0:
            return folders, name

        return 2, Units.Briefe

    def parse_fotos(self, coverage: str) -> Tuple[int, Units]:
        """Estimate the extent of pictures; defaults to one page."""
        if coverage is None or empty.fullmatch(coverage):
            return 1, Units.Seiten

        pages, name = parse_pages(coverage)

        results = re.findall(
            r'(\d+) (Kupferstich|Litho|Foto|Zeichnung|Repro|Holzschnitt|Schattenriss'
            r'|Aquarell|Druckgrafik(en)?|Physionotrace|Bild|Stück|Radierung)',
            coverage)
        for result in results:
            pages += int(result[0])

        if pages > 0:
            return pages, Units.Seiten

        folders, name = parse_folders(coverage, Units.Fotomappen)
        if folders > 0:
            return folders, name

        return 1, Units.Seiten

    def parse_books(self, coverage: str,
                    swissbib_format: str) -> Tuple[int, Units]:
        """Estimate the extent of books and articles; defaults to one unit."""
        if swissbib_format == 'Artikel':
            return_type = Units.Artikel
        else:
            return_type = Units.Band

        if coverage is None or empty.fullmatch(coverage):
            return 1, return_type

        num, name = parse_pages(coverage)
        if num > 0:
            return num, name

        volumes, name = parse_volumes(coverage, return_type)
        if volumes > 0:
            return volumes, name

        return 1, return_type

    def parse_manuscript(self,
                         coverage: str) -> Tuple[Union[float, int], Units]:
        """Estimate the extent of manuscripts; defaults to one Faszikel."""
        if coverage is None or empty.fullmatch(coverage):
            return 1, Units.Faszikel

        num, name = parse_pages(coverage)
        if num > 0:
            return num, name

        volumes, name = parse_volumes(coverage, Units.Manuskriptband)
        if volumes > 0:
            return volumes, name

        folders, name = parse_folders(coverage, Units.Faszikel)

        results = re.findall(
            r'(\d+) (Stücke|Papiertüte[n]?|Faszikel|Dossier|Broschüre|Zeichenbuch|'
            r'Heft(e|chen)?|Schuber|Bündel|Konvolut|Schulheft|Umschläge|Büchlein|Umschlag|Predigten)',
            coverage)
        for result in results:
            # These counts belong to the Faszikel total checked below; the
            # previous code added them to 'volumes', which was never read
            # again.
            folders += int(result[0])

        if folders > 0:
            return folders, Units.Faszikel

        num, name = parse_boxes(coverage)
        if num > 0:
            return num, name

        letters, name = parse_letters(coverage)
        if letters > 0:
            return letters, name

        return 1, Units.Faszikel

    def parse_dossier(self, coverage: str) -> Tuple[Union[int, float], Units]:
        """Estimate the extent of dossiers; defaults to one Archiveinheit."""
        if coverage is None or empty.fullmatch(coverage):
            return 1, Units.Archiveinheit

        pages, name = parse_pages(coverage)
        if pages > 0:
            return pages, name

        volumes, name = parse_volumes(coverage, Units.Band)
        if volumes > 0:
            return volumes, name

        boxes, name = parse_boxes(coverage)
        if boxes > 0:
            return boxes, name

        folders, name = parse_folders(coverage, Units.Mappen)
        if folders > 0:
            return folders, name

        lfm, name = parse_meters(coverage)
        if lfm > 0:
            return lfm, name

        letters, name = parse_letters(coverage)
        if letters > 0:
            return letters, name

        archives, name = parse_archive(coverage, Units.Archiveinheit)
        if archives > 0:
            return archives, name

        return 1, Units.Archiveinheit

    def parse_record_type(self):
        """Defines a general type for the record.

        This is used to distinguish between prints and hand written
        manuscripts. Records of database 'dsv01' are always 'print'; records
        without a 245$h value are treated as 'manuscript'.
        """
        self.marc.parse_field('245', 'h', 'print_material')

        if self._database == 'dsv01':
            self.marc.add_value_sub('final', 'type', 'print')
        elif 'print_material' in self.marc.result:
            if self.marc.result['print_material'] in [
                    'Noten', 'Bildmaterial', 'Druckschrift', 'Kartenmaterial'
            ]:
                self.marc.add_value_sub('final', 'type', 'print')
            elif self.marc.result['print_material'] in [
                    'Ton', 'Mikroform', 'Gegenstand', 'Filmmaterial'
            ]:
                self.marc.add_value_sub('final', 'type', 'other')
            elif self.marc.result['print_material'] in [
                    'Manuskript', 'Notenmanuskript'
            ]:
                self.marc.add_value_sub('final', 'type', 'manuscript')
            else:
                self.marc.add_value_sub('final', 'type', 'other')
                self.marc.add_error_tag('_unknown_print_material')
                logging.warning('Unknown print material: %s in %s.',
                                self.marc.result['print_material'],
                                self.marc.result['identifier'])
        else:
            self.marc.add_value_sub('final', 'type', 'manuscript')

    def parse_call_number(self):
        """Parses the call number of this record has. Adds the library it
        belongs to as well.

        The call number is further indexed in parts to create facets.

        Only books from A100 & A125 are used. Books older than 1920 are very
        rare in A140 (UB Medizin). The books in A130 (Altertum) are ignored,
        because there are not that many, and it would be necessary to further
        filter the books from UBH.

        # TODO: Implement a way to process all the call numbers, since one
        #       title can have many of them.
        # currently just picks the first one.
        # books can have multiple call numbers for two reasons:
        # 1. The library owns more than one item.
        # 2. The bibliographic record describes multiple parts of one title.
        """
        for field in self.marc.get_fields('949'):
            if field['F'] in ['A100', 'A125']:
                self.marc.append_value('library', field['F'])
                if field['j'] != '':
                    self.marc.append_value('call_number', field['j'])

        if 'call_number' in self.marc.result:
            results = self.create_call_number_filter()
            if results is not None:
                self.marc.add_value_sub('filter', 'prefix', results[0])
                if results[1] is not None:
                    self.marc.add_value_sub('filter', 'base', results[1])
                if results[2] is not None:
                    self.marc.add_value_sub('filter', 'second', results[2])
                self.marc.add_value_sub('filter', 'number', results[3])

    def create_call_number_filter(
            self) -> Optional[Tuple[str, Optional[str], Optional[str], str]]:
        """Split the record's call number into (prefix, base, second, number).

        A single stored call number is used as is; with several, the last one
        starting with 'UBH' is used. Records of database 'dsv05' get the
        prefix 'HAN'. An empty call number removes 'call_number' from the
        result.

        :return: The four parts, or None when the call number is empty, does
            not start with 'UBH'/'HAN', is exactly 'UBH', or matches none of
            the known patterns.
        """
        call_number = ''
        if len(self.marc.result['call_number']) == 1:
            # The single entry may be None (subfield missing); treat that
            # like an empty call number instead of failing on .strip().
            call_number = self.marc.result['call_number'][0] or ''
        else:
            for call_n in self.marc.result['call_number']:
                if call_n is not None:
                    if call_n.startswith('UBH'):
                        call_number = call_n

        call_number = re.sub(r'\s+', ' ', call_number.strip())

        database = self.marc.result['database']
        if database == 'dsv05' and call_number != '':
            call_number = 'HAN ' + call_number

        if call_number == '':
            # remove call number if it is empty.
            del self.marc.result['call_number']
            return None

        if not re.match('(UBH|HAN)', call_number) or re.fullmatch(
                'UBH', call_number):
            # ignore anything which does not comply with convention.
            return None

        simple = re.fullmatch(r'(\w+) ([\w\-*.]+) (\d+)(.*)?', call_number)
        if simple:
            return simple.group(1), simple.group(2), None, (
                simple.group(3) + simple.group(4)).strip()

        word_roman = re.fullmatch(
            r'(\w+) (\w+) ([MCLXVI]+[ ]?[a-z]?) (\d+)(.*)?', call_number)
        if word_roman:
            return word_roman.group(1), \
                   word_roman.group(2), \
                   word_roman.group(3), \
                   (word_roman.group(4) + word_roman.group(5)).strip()

        double_word_roman = re.fullmatch(
            r'(\w+) ([\w\-*]+) ([\w\-*]+) ([MCLXVI]+[ ]?[a-z]?) (\d+)(.*)?',
            call_number)
        if double_word_roman:
            # NOTE(review): unlike the branches above, the trailing group (6)
            # is not appended to the number here; confirm whether that is
            # intended.
            return double_word_roman.group(1), \
                   double_word_roman.group(2) + ' ' + double_word_roman.group(3), \
                   double_word_roman.group(4), \
                   double_word_roman.group(5)

        three_word = re.fullmatch(
            r'(\w+) ([\w\-*]+) ([\w\-*]+) ([A-Za-z\-*]+)(.*)?', call_number)
        if three_word:
            return three_word.group(1), \
                   three_word.group(2) + ' ' + three_word.group(3), \
                   three_word.group(4), \
                   three_word.group(5).strip()

        double_word = re.fullmatch(r'(\w+) ([\w\-*]+) ([\w\-*]+)(.*)?',
                                   call_number)
        if double_word:
            return double_word.group(1), double_word.group(
                2), double_word.group(3), double_word.group(4).strip()

        rest_han = re.fullmatch('(HAN) (.*)', call_number)
        if rest_han:
            return rest_han.group(1), None, None, rest_han.group(2).strip()

        rest_ubh = re.fullmatch('(UBH) (.*)', call_number)
        if rest_ubh:
            return rest_ubh.group(1), None, None, rest_ubh.group(2).strip()

        # No known pattern matched.
        return None

    def parse_format_codes(self):
        """Parse the format codes and replace them with human readable forms.

        The c-format, the most condensed value is used as format.
        """
        for subfield in ('a', 'b', 'c'):
            key = subfield + '-format'
            self.marc.parse_field('898', subfield, key)
            if key in self.marc.result:
                self.marc.result[key] = format_dict[self.marc.result[key]]

        if 'c-format' in self.marc.result:
            self.marc.add_value_sub('final', 'format',
                                    self.marc.result['c-format'])

    def parse_additional_information(self):
        """Information which might be interesting in the future, but not
        needed for current analysis."""
        self.marc.parse_leader()
        self.marc.parse_cat_date()
        self.marc.parse_rest_008()

        self.marc.parse_field('245', 'a', 'title')
        self.marc.parse_field('245', 'b', 'subtitle')
        self.marc.parse_field('245', 'c', 'author')

        self.marc.parse_field_to_subfield('264', 'a', 'production', 'place')
        self.marc.parse_field_to_subfield('264', 'b', 'production',
                                          'publisher')
        self.marc.parse_field_to_subfield('264', 'c', 'production', 'date')

        self.marc.parse_field_to_subfield('300', 'b', 'extent',
                                          'physical_attributes')
        self.marc.parse_field_to_subfield('300', 'c', 'extent',
                                          'size_and_format')
        self.marc.parse_field_to_subfield('300', 'e', 'extent',
                                          'additional_content')

        self.marc.parse_field_append_to_subfield('336', 'a', 'extent',
                                                 'content')
        self.marc.parse_field_append_to_subfield('337', 'a', 'extent', 'media')
        self.marc.parse_field_append_to_subfield('338', 'a', 'extent',
                                                 'carrier')
        self.marc.parse_field_to_subfield('348', 'a', 'extent', 'music')

        self.marc.parse_field('351', 'c', 'classification')
        self.marc.parse_field('250', 'a', 'version')
        self.marc.parse_field_to_subfield('340', 'a', 'extent', 'carrier')

        self.marc.parse_field_list([
            '600', '610', '611', '630', '648', '650', '651', '653', '655',
            '690', '691'
        ], {
            'a': 'title',
            '2': 'source',
            '0': 'identifier'
        }, 'subject_headings')

        self.marc.parse_field('856', 'u', 'link')
        self.marc.parse_field_to_subfield('908', 'a', 'extent', 'format')
        self.marc.parse_field('909', 'a', 'archive_tag')

        if 'date' in self.marc.result['production']:
            self.marc.result['final']['display_date'] = self.marc.result[
                'production']['date']

    def pre_filter(self, message: str) -> bool:
        """Keep only records which belong to Universitätsbibliothek Basel.

        Returns False (keep) when the raw message contains a library code
        A100 or A125, True (drop) otherwise.
        """
        if re.search('{"F": "(A100|A125)"},', message):
            return False
        else:
            return True

    def post_filter(self, transformed_message: dict) -> bool:
        """Return True for transformed records that are to be dropped."""
        # Remove any record which is newer than 1920.
        if 'year' in transformed_message['final']:
            if int(transformed_message['final']['year']) > 1920:
                return True

        # Remove records of special formats.
        if transformed_message['final']['format'] in [
                'Objekt', 'Diverse Tonformate', 'Schallplatte',
                'Diverse Filmformate', 'Datenbank'
        ]:
            return True
        return False
from simple_elastic import ElasticIndex from collections import Counter from multiprocessing import Pool import re import json if __name__ == '__main__': url_counter = Counter() domain_counter = Counter() counter = Counter() hold = Counter() copy = Counter() index = ElasticIndex('swissbib-*', 'logs') query = { '_source': ['search_params.trackurl'], 'query': { 'exists': { 'field': 'search_params.trackurl' } } } with open('common-urls.json', 'r') as fp: common_urls = json.load(fp) pool = Pool(processes=4) process_results = list() for results in index.scroll(query=query, size=10000):
from simple_elastic import ElasticIndex
import json

if __name__ == '__main__':
    # Dump the 'e-codices-data' hit index; "." is handed to dump() as its
    # target (presumably the output directory, i.e. the working directory).
    ElasticIndex('e-codices-data', 'hits').dump(".")
with open('suppl/accepted-dois.json', 'w') as fp: json.dump(accepted_dois, fp, indent=4, ensure_ascii=False) with open('suppl/rejected-dois.json', 'w') as fp: json.dump(rejected_dois, fp, indent=4, ensure_ascii=False) with open('suppl/call-numbers.json', 'w') as fp: json.dump(sorted(list(call_numbers_encoded)), fp, indent=4, ensure_ascii=False) with open('suppl/output.json', 'w') as fp: json.dump(collect_stems, fp, indent=2, ensure_ascii=False) target = ElasticIndex('e-codices-data', 'hits') index = ElasticIndex('kafka-dsv05-*', 'record') for key in collect_stems: if key == '0001': # TODO: A combined manuscript. continue item = dict() cn = transform_call_number(key) query = { '_source': ['call_number', 'identifiers.*'], 'query': { 'term': { 'call_number.keyword': cn }
for v in mapping[sys_number]['vlids']: for y in ['2016', '2017', '2018']: if sys_number not in result: result[sys_number] = dict() if p not in result[sys_number]: result[sys_number][p] = dict() if v in vlids[p][y]: if y not in result[sys_number]: result[sys_number][p][y] = vlids[p][y][v]['page-views'] else: result[sys_number][p][y] += vlids[p][y][v][ 'page-views'] else: if y not in result[sys_number][p]: result[sys_number][p][y] = 0 elastic_data = list() for sys_number in result: item = dict() item['bau'] = dict() total = 0 for y in result[sys_number]['erara-bau']: item['bau'][y] = result[sys_number]['erara-bau'][y] total += item['bau'][y] item['bau']['total'] = total item['identifier'] = sys_number elastic_data.append(item) index = ElasticIndex('e-rara-data', 'hits') index.bulk(elastic_data, 'identifier')
class BulkElasticConsumer(AbstractBaseConsumer):
    """Collect a batch of Kafka messages and bulk index them into Elasticsearch.

    Every call to :meth:`consume` polls the consumer once. The poll returns
    after a short wait or as soon as 10'000 messages were collected,
    whichever happens first.

    Example configuration::

        Consumer:
          bootstrap_servers: localhost:9092
          client_id: test
          group_id: elastic-consumer-test
          auto_offset_reset: earliest
        Topics:
          - test
        ElasticIndex:
          index: name-of-index
          doc_type: _doc (default value for elasticsearch 6)
          url: http://localhost:9200
          timeout: 300
        IdentifierKey: name-of-key-value (optional, if not specified the Kafka key value will be used.)
    """

    def __init__(self, config, config_class=ElasticConsumerConfig,
                 logger=logging.getLogger(__name__)):
        super().__init__(config, config_class, logger=logger)
        self._index = ElasticIndex(**self.configuration.elastic_settings)
        self._key = self.configuration.key

    @property
    def configuration(self) -> ElasticConsumerConfig:
        """Return the base class configuration, typed as ElasticConsumerConfig."""
        return super().configuration

    def consume(self) -> bool:
        """Poll once, bulk index the received messages and commit on success.

        Returns:
            The result of the bulk request, or ``False`` when the poll
            delivered no messages. Offsets are committed only when the bulk
            request reported success, so a failed batch is delivered again.
        """
        data = list()
        # Positional arguments: poll timeout in milliseconds, maximum number
        # of records returned by a single poll.
        messages = self._consumer.poll(100, 10000)
        if messages:
            # poll() groups the records by partition. Every group has to be
            # indexed: the offsets of all assigned partitions are committed
            # below, so records of a skipped partition would be lost.
            for records in messages.values():
                for message in records:
                    data.append(self._message_to_document(message))

        if not data:
            return False

        started = time.time()
        result = self._index.bulk(data,
                                  self._key,
                                  op_type=self.configuration.op_type,
                                  upsert=self.configuration.upsert)
        elapsed = time.time() - started

        if result:
            self._time_logger.info(
                "Success! Indexed {} messages to {} in {} seconds.".format(
                    len(data), self._index.index, elapsed))
            self._commit_consumed_offsets()
        else:
            # Nothing is committed here, the batch will be consumed again.
            self._error_logger.error(
                "Failed to bulk index {} messages to {}.".format(
                    len(data), self._index.index))
        return result

    def _message_to_document(self, message):
        """Turn one Kafka record into the document which is sent to the index."""
        key = message.key.decode('utf-8')
        raw_value = message.value.decode('utf-8')
        try:
            value = json.loads(raw_value)
        except JSONDecodeError as ex:
            self._error_logger.error(
                "Failed to JSONDecode message: {}.".format(raw_value))
            # Index the broken payload together with the error instead of
            # dropping it.
            value = {'message': raw_value, 'error': '{}'.format(ex)}
        # Fall back to the Kafka key when the document does not contain the
        # configured identifier field.
        if self._key not in value:
            value['_key'] = key
        return value

    def _commit_consumed_offsets(self):
        """Commit the current position of every assigned partition."""
        for assignment in self._consumer.assignment():
            pos = self._consumer.position(assignment)
            # Skip the round trip when the position is already committed.
            if pos != self._consumer.committed(assignment):
                self._consumer.commit(
                    {assignment: OffsetAndMetadata(pos, "")})
from simple_elastic import ElasticIndex
from datetime import date

# System numbers are indexed as zero-padded strings of this many digits.
_SYSTEM_NUMBER_DIGITS = 9

# Index one document per CSV row (publication year, system number).
dsv01_full_export = ElasticIndex('dsv01-sys-numbers-before-1900', 'record')
with open('data/dsv01_system_numbers_vor_1900_arc_export_20180802.csv',
          'r',
          encoding='utf-16') as file:
    for line in file:
        # Blank lines (e.g. at the end of the export) carry no record.
        if not line.strip():
            continue
        year, sys_number = line.split(',')
        doc = dict()
        # Strip the line ending BEFORE padding, so that every row, including
        # a last row without a newline, ends up with the same width. rjust
        # leaves longer values untouched instead of looping forever.
        doc['system_number'] = sys_number.strip().rjust(
            _SYSTEM_NUMBER_DIGITS, '0')
        doc['publication_date'] = year
        doc['index_date'] = date.today().isoformat()
        dsv01_full_export.index_into(doc, doc['system_number'])
def _add_doi(identifiers, doi):
    """Store doi under identifiers['doi'] without creating duplicates."""
    if 'doi' not in identifiers:
        identifiers['doi'] = doi
        return
    known = identifiers['doi']
    if isinstance(known, list):
        if doi not in known:
            known.append(doi)
    elif known != doi:
        identifiers['doi'] = [known, doi]


def enrich_user_data(config):
    """Add the usage hits of all tracked platforms to every record.

    All documents of each index listed in ``config['indexes']`` are scrolled,
    enriched with the hits reported by swissbib, the OPAC access logs and
    Aleph, and written back under their ``identifier``. The sum of all
    platform totals is stored in ``item['hits']['total']``.

    Args:
        config: Mapping with an ``indexes`` list. Each entry holds the keyword
            arguments for ``ElasticIndex`` under its ``index`` key.
    """
    for index in config['indexes']:
        instance = ElasticIndex(**index['index'])
        query = {'query': {'match_all': {}}}
        for results in instance.scroll(query=query):
            for item in results:
                identifier = item['identifier']
                database = item['database']
                sys_number = item['identifiers'][database]
                # Not every record carries 'error_tags' yet; start from an
                # empty set instead of failing on those records.
                tags = set(item.get('error_tags', ()))
                total = 0

                # swissbib
                hits, error_tags = swissbib.enrich(identifier)
                item['hits']['swissbib'] = hits
                total += hits['total']
                tags.update(error_tags)

                # opac
                hits, error_tags = opac.enrich(opac_index, sys_number)
                item['hits']['opac-access'] = hits
                total += hits['total']
                tags.update(error_tags)

                # aleph
                hits, error_tags = aleph.enrich(aleph_index, sys_number,
                                                database)
                item['hits']['aleph'] = hits
                total += hits['loans']['total']
                tags.update(error_tags)

                # NOTE(review): the digital platforms below are only queried
                # for dsv05 records -- confirm that this is the intended scope.
                if database == 'dsv05':
                    # e-rara
                    hits, error_tags = e_rara.enrich(e_rara_index, sys_number)
                    item['hits']['e-rara'] = hits
                    total += hits['bau']['total']
                    tags.update(error_tags)

                    # e-manuscripta
                    hits, error_tags = e_manuscripta.enrich(
                        e_manuscripta_index, sys_number)
                    item['hits']['e-manuscripta'] = hits
                    total += hits['bau']['total']
                    total += hits['swa']['total']
                    tags.update(error_tags)

                    # e-codices
                    hits, doi, error_tags = e_codices.enrich(
                        e_codices_index, sys_number)
                    item['hits']['e-codices'] = hits
                    total += hits['total']
                    tags.update(error_tags)
                    if doi is not None:
                        # Also keeps the DOI when the record had none so far
                        # and stays stable when the enrichment runs again.
                        _add_doi(item['identifiers'], doi)

                    # e-mails dsv05
                    # TODO

                # Sets cannot be serialised, store the tags as a list again.
                item['error_tags'] = list(tags)
                item['hits']['total'] = total
                instance.index_into(item, item['identifier'])
index_object = captures index_object['identifier'] = identifier identifier += 1 date = re.match( '(?P<day>\d+)\/(?P<month>\w+)\/(?P<year>\d+):' '(?P<hour>\d+):(?P<minute>\d+):(?P<second>\d+) .*$', captures['timestamp']) if date: dates = date.groupdict() index_object['date_parts'] = dates else: logging.warning('Could not parse date time in line "%s".', line) else: logging.warning('Could not parse line "%s".', line) if index_object is not None: index_objects.append(index_object) count += 1 if count == 500: index = ElasticIndex('opac-access', 'log') index.bulk(index_objects, identifier_key='identifier') count = 0 index_objects.clear() index = ElasticIndex('opac-access', 'log') index.bulk(index_objects, identifier_key='identifier')
from simple_elastic import ElasticIndex from collections import Counter import json if __name__ == '__main__': c = Counter() index = ElasticIndex('kafka*', 'record') with open('data/collected-hits-e-plattforms.json', 'r') as fp: data = json.load(fp) for sys_number in data: query = { 'query': { 'bool': { 'should': [{ 'match': { 'identifiers.dsv05': sys_number } }, { 'match': { 'identifiers.dsv01': sys_number } }], 'minimum_should_match': 1 } } } results = index.scan_index(query=query) if len(results) == 1: