import json

from simple_elastic import ElasticIndex


def import_data(plattform: str, year: str):
    """Import a locally stored JSON export into an Elasticsearch index."""
    index = ElasticIndex("emanus-{}-data-base-{}".format(plattform, year), "doc")
    with open("emanus-{}-{}.json".format(plattform, year), 'r') as fp:
        text = json.load(fp)

    metadata = dict()
    data = list()
    for key in text:
        if key == "data":
            for item in text[key]:
                result = item
                result["identifier"] = item["dimensions"]["pagestem"]
                data.append(result)
        else:
            metadata[key] = text[key]

    # Bulk-index the data items; everything else is stored as one metadata document.
    index.bulk(data, identifier_key="identifier")
    index.index_into(metadata, 0)
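
# A minimal usage sketch; the platform name and year below are hypothetical
# and assume a file like "emanus-ub-2018.json" exists in the working directory.
if __name__ == '__main__':
    import_data("ub", "2018")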
import json
import logging
from json import JSONDecodeError

from kafka.structs import OffsetAndMetadata
from simple_elastic import ElasticIndex

# AbstractBaseConsumer and ElasticConsumerConfig are provided by the surrounding project.


class SimpleElasticConsumer(AbstractBaseConsumer):
    """
    A KafkaConsumer which consumes messages and indexes them into an ElasticIndex one by one.

    Requires the following configs:

        Consumer:
            bootstrap_servers: localhost:9092
            client_id: test
            group_id: elastic-consumer-test
            auto_offset_reset: earliest
        Topics:
            - test
        ElasticIndex:
            index: name-of-index
            doc_type: _doc (default value for elasticsearch 6)
            url: http://localhost:9200
            timeout: 300
    """

    def __init__(self, config, config_class=ElasticConsumerConfig, logger=logging.getLogger(__name__)):
        super().__init__(config, config_class, logger=logger)
        self._index = ElasticIndex(**self.configuration.elastic_settings)

    def consume(self) -> bool:
        """
        Consumes a single message from the subscribed topic and indexes it
        into the elasticsearch index.

        Returns True if successful, False otherwise.
        """
        message = next(self._consumer)
        key = message.key.decode('utf-8')
        try:
            value = json.loads(message.value.decode('utf-8'))
        except JSONDecodeError as ex:
            # Keep messages with broken JSON instead of dropping them.
            value = {
                'message': message.value.decode('utf-8'),
                'error': '{}'.format(ex)
            }
        result = self._index.index_into(value, key)
        if result:
            # Commit the consumer position only after the document was indexed.
            for assignment in self._consumer.assignment():
                pos = self._consumer.position(assignment)
                if pos != self._consumer.committed(assignment):
                    self._consumer.commit(
                        {assignment: OffsetAndMetadata(pos, "")})
        # self._time_logger.info("Consumed and indexed one message.")
        return result
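
# A minimal consume loop, sketched under the assumption that a YAML config as
# described in the class docstring lives at the (hypothetical) path below.
if __name__ == '__main__':
    consumer = SimpleElasticConsumer('configs/elastic/simple_consumer.yml')
    while True:
        consumer.consume()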
from kafka.admin import KafkaAdminClient
from simple_elastic import ElasticIndex

# ElasticProducer and SimpleConsumer are provided by the surrounding project.


class TestElasticProducer(object):

    def setup_class(self):
        self.admin = KafkaAdminClient(bootstrap_servers='localhost:9092')
        self.index = ElasticIndex('test-elastic-producer', 'doc')
        self.index.index_into({'test': 1}, 0)
        self.index.index_into({'test': 2}, 1)
        self.index.index_into({'test': 3}, 2)
        self.index.index_into({'test': 4}, 3)
        self.index.index_into({'test': 5}, 4)
        self.producer = ElasticProducer('configs/elastic/test_elastic_producer_producer.yml')
        self.consumer = SimpleConsumer('configs/elastic/test_elastic_producer_consumer.yml')

    def teardown_class(self):
        self.consumer.close()
        self.admin.delete_topics(['test-elastic-producer'])
        self.admin.close()
        self.index.delete()

    # @pytest.mark.skip()
    def test_produce(self):
        self.producer.process()
        key, message = self.consumer.consume()
        assert key == '0'
        assert message == '{"test": 1}'
import pytest
from elasticsearch.exceptions import NotFoundError

from simple_elastic import ElasticIndex


class TestElasticIndex(object):

    def setup_class(self):
        self.index = ElasticIndex('test')

    def teardown_class(self):
        self.index.delete_index()

    def test_scroll(self):
        self.index.index_into({'test': True}, 1)
        self.index.index_into({'test': False}, 2)
        self.index.index_into({'test': True}, 3)
        self.index.index_into({'test': False}, 4)
        for i in self.index.scroll():
            assert isinstance(i, list)

    def test_index_into(self):
        result = self.index.index_into({'test': True, 'object': "This is a string"}, 5)
        assert result
        # This one fails: 'object' was dynamically mapped as text above,
        # so a nested object for the same field is rejected.
        result = self.index.index_into({'test': True, 'object': {'sub-object': "another string"}}, 6)
        assert not result
        result = self.index.index_into({'test': False}, 'HAN000827182')
        assert result

    def test_search(self):
        data = list()
        data.append({'id': '1234', 'test': True})
        self.index.bulk(data=data, identifier_key='id')
        result = self.index.search()
        # 7 documents were indexed successfully so far (document 6 was rejected).
        assert len(result) == 7

    def test_search_not_unpack(self):
        result = self.index.search(unpack=False)
        assert len(result) == 7

    def test_alias(self):
        self.index.add_to_alias('test1')
        assert self.index.instance.indices.get_alias('test1')
        self.index.remove_from_alias('test1')
        with pytest.raises(NotFoundError):
            self.index.instance.indices.get_alias('test1')

    def test_count(self):
        result = self.index.count()
        assert result == 7
index = ElasticIndex('kafka-dsv05-*', 'record')

for key in collect_stems:
    if key == '0001':
        # TODO: A combined manuscript.
        continue
    item = dict()
    cn = transform_call_number(key)
    query = {
        '_source': ['call_number', 'identifiers.*'],
        'query': {
            'term': {
                'call_number.keyword': cn
            }
        }
    }
    results = index.scan_index(query=query)
    if len(results) == 1:
        result = results[0]
        item['hits'] = collect_stems[key]
        if 'doi' not in result['identifiers']:
            if key in dois:
                item['doi'] = dois[key]
        target.index_into(item, result['identifiers']['dsv05'])
    else:
        # No unambiguous match for this call number.
        print(key, cn, results)
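
# The names used above are defined elsewhere in the script. A hypothetical
# sketch of their shape, to make the snippet easier to follow:
#   collect_stems = {'0002': {'total': 12}}   # pagestem -> usage hits
#   dois = {'0002': '10.1234/example'}        # pagestem -> DOI (example value)
#   target = ElasticIndex('usage-data', 'doc')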
def enrich_user_data(config):
    for index in config['indexes']:
        instance = ElasticIndex(**index['index'])
        query = {'query': {'match_all': {}}}
        for results in instance.scroll(query=query):
            for item in results:
                identifier = item['identifier']
                database = item['database']
                sys_number = item['identifiers'][database]

                if 'error_tags' in item:
                    item['error_tags'] = set(item['error_tags'])
                else:
                    # Without this, item['error_tags'].add(tag) below raises a KeyError.
                    item['error_tags'] = set()

                total = 0

                # swissbib
                hits, error_tags = swissbib.enrich(identifier)
                item['hits']['swissbib'] = hits
                total += hits['total']
                for tag in error_tags:
                    item['error_tags'].add(tag)

                # opac
                hits, error_tags = opac.enrich(opac_index, sys_number)
                item['hits']['opac-access'] = hits
                total += hits['total']
                for tag in error_tags:
                    item['error_tags'].add(tag)

                # aleph
                hits, error_tags = aleph.enrich(aleph_index, sys_number, database)
                item['hits']['aleph'] = hits
                total += hits['loans']['total']
                for tag in error_tags:
                    item['error_tags'].add(tag)

                if database == 'dsv05':
                    # e-rara
                    hits, error_tags = e_rara.enrich(e_rara_index, sys_number)
                    item['hits']['e-rara'] = hits
                    total += hits['bau']['total']
                    for tag in error_tags:
                        item['error_tags'].add(tag)

                    # e-manuscripta
                    hits, error_tags = e_manuscripta.enrich(
                        e_manuscripta_index, sys_number)
                    item['hits']['e-manuscripta'] = hits
                    total += hits['bau']['total']
                    total += hits['swa']['total']
                    for tag in error_tags:
                        item['error_tags'].add(tag)

                    # e-codices
                    hits, doi, error_tags = e_codices.enrich(
                        e_codices_index, sys_number)
                    item['hits']['e-codices'] = hits
                    total += hits['total']
                    for tag in error_tags:
                        item['error_tags'].add(tag)

                    if doi is not None:
                        if 'doi' in item['identifiers']:
                            if isinstance(item['identifiers']['doi'], list):
                                item['identifiers']['doi'].append(doi)
                            else:
                                item['identifiers']['doi'] = [
                                    item['identifiers']['doi'],
                                    doi
                                ]
                        else:
                            # Record the DOI when none was present yet.
                            item['identifiers']['doi'] = doi

                    # e-mails dsv05
                    # TODO

                item['error_tags'] = list(item['error_tags'])
                item['hits']['total'] = total
                instance.index_into(item, item['identifier'])
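
# A minimal sketch of the expected config shape. The index name and doc type
# are hypothetical; the keys mirror the ElasticIndex settings shown in the
# SimpleElasticConsumer docstring, and the enrichment modules (swissbib, opac,
# aleph, ...) must be importable in the real script.
config = {
    'indexes': [
        {'index': {'index': 'user-data', 'doc_type': 'doc'}}
    ]
}
enrich_user_data(config)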
from datetime import date

from simple_elastic import ElasticIndex

dsv01_full_export = ElasticIndex('dsv01-sys-numbers-before-1900', 'record')

with open('data/dsv01_system_numbers_vor_1900_arc_export_20180802.csv', 'r', encoding='utf-16') as file:
    for line in file:
        year, sys_number = line.split(',')
        doc = dict()
        # Strip the trailing newline before padding, then zero-pad the Aleph
        # system number to nine digits, matching the loans/reservations import.
        doc['system_number'] = sys_number.strip().zfill(9)
        doc['publication_date'] = year
        doc['index_date'] = date.today().isoformat()
        dsv01_full_export.index_into(doc, doc['system_number'])
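
# For example, a CSV line "1885,123456\n" is indexed as:
#   {'system_number': '000123456', 'publication_date': '1885',
#    'index_date': <today's ISO date>}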
from openpyxl import load_workbook
from simple_elastic import ElasticIndex

# TODO: Rework this and think about what shape the data is needed in.
wb = load_workbook('data/VERZ_DSV01-Ausleihen-Vormerkungen_20180802_bmt.xlsx')

index = ElasticIndex('reservations', 'record')

# First pass: reservations ("Vormerkungen").
ws = wb['vor-1900-vormerkungen']
for row in ws.iter_rows(min_row=2, min_col=2, max_col=4):
    doc = dict()
    # Zero-pad the Aleph system number to nine digits.
    doc['system_number'] = str(row[0].value).zfill(9)
    doc['reservations'] = row[1].value
    doc['year'] = str(row[2].value)
    index.index_into(doc, doc['system_number'] + doc['year'])

# Second pass: loans ("Ausleihen"), merged with the reservations indexed above.
ws = wb['vor-1900-ausleihen']
for row in ws.iter_rows(min_row=2, min_col=2, max_col=4):
    doc = dict()
    doc['system_number'] = str(row[0].value).zfill(9)
    doc['loans'] = row[1].value
    doc['year'] = str(row[2].value)
    record = index.get(doc['system_number'] + doc['year'])
    if record is not None:
        doc['reservations'] = record['reservations']
    index.index_into(doc, doc['system_number'] + doc['year'])
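
# After both passes, a system number appearing in both sheets ends up as one
# document per year, e.g. (values hypothetical):
#   {'system_number': '000123456', 'loans': 3, 'reservations': 1, 'year': '1885'}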