def import_data(plattform: str, year: str):
    """Load one ``emanus`` JSON export and index its content.

    Reads ``emanus-<plattform>-<year>.json`` from the current working
    directory. Every entry of the top-level ``"data"`` collection gets an
    ``"identifier"`` field copied from ``item["dimensions"]["pagestem"]`` and
    is bulk-indexed into the index ``emanus-<plattform>-data-base-<year>``.
    All other top-level keys are gathered into one metadata document that is
    indexed under id ``0``.

    Args:
        plattform: Platform name; part of both the file name and index name.
        year: Year; part of both the file name and index name.

    Raises:
        OSError: If the export file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If a data entry has no ``dimensions``/``pagestem`` value.
    """
    index = ElasticIndex("emanus-{}-data-base-{}".format(plattform, year),
                         "doc")

    # JSON is read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding. The file is closed before any indexing
    # starts, because the handle is not needed during the network calls.
    with open("emanus-{}-{}.json".format(plattform, year), 'r',
              encoding='utf-8') as fp:
        text = json.load(fp)

    # The entries are tagged in place (no copy), as before.
    data = list()
    for item in text.get("data", []):
        item["identifier"] = item["dimensions"]["pagestem"]
        data.append(item)

    # Everything that is not the data collection is treated as metadata.
    metadata = {key: value for key, value in text.items() if key != "data"}

    index.bulk(data, identifier_key="identifier")
    index.index_into(metadata, 0)
class SimpleElasticConsumer(AbstractBaseConsumer):
    """
    Kafka consumer that takes messages one at a time and writes each of them
    into an ElasticIndex.

    Expected configuration:

        Consumer:
          bootstrap_servers: localhost:9092
          client_id: test
          group_id: elastic-consumer-test
          auto_offset_reset: earliest
        Topics:
          - test
        ElasticIndex:
          index: name-of-index
          doc_type: _doc (default value for elasticsearch 6)
          url: http://localhost:9200
          timeout: 300

    """
    def __init__(self,
                 config,
                 config_class=ElasticConsumerConfig,
                 logger=logging.getLogger(__name__)):
        """
        Initialise the base consumer and create the target ElasticIndex from
        the ``elastic_settings`` of the loaded configuration.
        """
        super().__init__(config, config_class, logger=logger)
        elastic_settings = self.configuration.elastic_settings
        self._index = ElasticIndex(**elastic_settings)

    def consume(self) -> bool:
        """
        Fetch the next message and index it under its (UTF-8 decoded) key.

        A value that is not valid JSON is not dropped: it is indexed as a
        document holding the raw text under 'message' and the parser error
        under 'error'.

        When indexing succeeded, the current position of every assigned
        partition is committed unless it is already the committed offset.

        Returns the result of the indexing call (truthy on success).
        """
        record = next(self._consumer)

        doc_id = record.key.decode('utf-8')
        raw_value = record.value.decode('utf-8')
        try:
            document = json.loads(raw_value)
        except JSONDecodeError as error:
            document = {
                'message': raw_value,
                'error': str(error)
            }
        indexed = self._index.index_into(document, doc_id)

        # Nothing is committed for a failed indexing attempt.
        if not indexed:
            return indexed

        for partition in self._consumer.assignment():
            position = self._consumer.position(partition)
            if position == self._consumer.committed(partition):
                continue
            offsets = {partition: OffsetAndMetadata(position, "")}
            self._consumer.commit(offsets)
        return indexed
示例#3
0
class TestElasticProducer(object):
    """
    Integration test for ElasticProducer. It needs a Kafka broker on
    localhost:9092 plus whatever Elasticsearch instance ElasticIndex connects
    to by default.
    """

    def setup_class(self):
        """Seed the source index and create the producer/consumer pair."""
        self.admin = KafkaAdminClient(bootstrap_servers='localhost:9092')
        self.index = ElasticIndex('test-elastic-producer', 'doc')
        # Documents {'test': 1} .. {'test': 5} are stored under ids 0 .. 4.
        for doc_id in range(5):
            self.index.index_into({'test': doc_id + 1}, doc_id)

        producer_config = 'configs/elastic/test_elastic_producer_producer.yml'
        consumer_config = 'configs/elastic/test_elastic_producer_consumer.yml'
        self.producer = ElasticProducer(producer_config)
        self.consumer = SimpleConsumer(consumer_config)

    def teardown_class(self):
        """Close the consumer, then remove the test topic and the index."""
        self.consumer.close()
        self.admin.delete_topics(['test-elastic-producer'])
        self.admin.close()
        # NOTE(review): another test class in this file removes its index via
        # delete_index(); confirm delete() is the intended call here.
        self.index.delete()

    def test_produce(self):
        """The first consumed message is document 0 with its JSON body."""
        self.producer.process()
        consumed_key, consumed_message = self.consumer.consume()
        assert consumed_key == '0'
        assert consumed_message == '{"test": 1}'
示例#4
0
class TestElasticIndex(object):
    """
    Integration tests for ElasticIndex against the index 'test'.

    The tests share one index and build on each other: the expected document
    count of 7 in the search/count tests relies on the documents written by
    the tests defined before them.
    """

    def setup_class(self):
        """Create the shared index handle."""
        self.index = ElasticIndex('test')

    def teardown_class(self):
        """Drop the shared index."""
        self.index.delete_index()

    def test_scroll(self):
        """Every batch yielded by scroll() is a list."""
        for doc_id, flag in ((1, True), (2, False), (3, True), (4, False)):
            self.index.index_into({'test': flag}, doc_id)
        for batch in self.index.scroll():
            assert isinstance(batch, list)

    def test_index_into(self):
        """index_into() reports success or failure through its return value."""
        plain_doc = {'test': True, 'object': "This is a string"}
        assert self.index.index_into(plain_doc, 5)

        # 'object' was first indexed as a string, so a nested object in the
        # same field is expected to be rejected.
        nested_doc = {'test': True, 'object': {'sub-object': "another string"}}
        assert not self.index.index_into(nested_doc, 6)

        assert self.index.index_into({'test': False}, 'HAN000827182')

    def test_search(self):
        """After one more bulk-indexed document, search() returns 7 hits."""
        documents = [{'id': '1234', 'test': True}]
        self.index.bulk(data=documents, identifier_key='id')
        hits = self.index.search()
        assert len(hits) == 7

    def test_search_not_unpack(self):
        """search(unpack=False) returns the same number of entries."""
        hits = self.index.search(unpack=False)
        assert len(hits) == 7

    def test_alias(self):
        """An alias can be added and removed again."""
        indices = self.index.instance.indices
        self.index.add_to_alias('test1')
        assert indices.get_alias('test1')
        self.index.remove_from_alias('test1')
        with pytest.raises(NotFoundError):
            indices.get_alias('test1')

    def test_count(self):
        """count() agrees with the number of documents found by search()."""
        total = self.index.count()
        assert total == 7
示例#5
0
    # NOTE: this is the tail of a function whose header is not part of this
    # excerpt. `collect_stems`, `dois`, `target` and `transform_call_number`
    # are defined outside the visible code.
    #
    # For every stem in `collect_stems`, look up the matching record by call
    # number and, if exactly one record is found, index the stem's hits (and
    # a DOI, if the record has none) into `target`.
    index = ElasticIndex('kafka-dsv05-*', 'record')
    for key in collect_stems:
        if key == '0001':
            # TODO: A combined manuscript.
            continue

        item = dict()

        # Exact (term) match on the keyword sub-field of the call number;
        # only the call number and the identifiers are requested as source.
        cn = transform_call_number(key)
        query = {
            '_source': ['call_number', 'identifiers.*'],
            'query': {
                'term': {
                    'call_number.keyword': cn
                }
            }
        }

        results = index.scan_index(query=query)

        # Only an unambiguous match (exactly one record) is written.
        if len(results) == 1:
            result = results[0]
            item['hits'] = collect_stems[key]
            # A DOI is only supplied when the record does not have one yet
            # and `dois` has an entry for this stem.
            if 'doi' not in result['identifiers']:
                if key in dois:
                    item['doi'] = dois[key]

            # The document is stored under the record's dsv05 identifier.
            target.index_into(item, result['identifiers']['dsv05'])
        else:
            # Zero or several matches: print stem, call number and results.
            print(key, cn, results)
示例#6
0
def enrich_user_data(config):
    """Enrich every document of the configured indexes with usage hits.

    For each entry of ``config['indexes']`` an ElasticIndex is created from
    ``entry['index']`` and all of its documents are scrolled through. Each
    document is passed to the enrichment sources (swissbib, opac, aleph and,
    for ``dsv05`` records, e-rara, e-manuscripta and e-codices). The hits of
    every source are stored under ``item['hits'][<source>]``, the sum under
    ``item['hits']['total']``, and the error tags reported by the sources are
    merged into ``item['error_tags']``. The document is then written back
    under its own ``identifier``.

    The enrichment modules and the ``*_index`` objects used here are not
    defined in this function; they are expected at module level.

    Args:
        config: Mapping with an ``'indexes'`` list; each element carries the
            ElasticIndex keyword arguments under ``'index'``.

    Raises:
        KeyError: If a document lacks ``identifier``, ``database`` or the
            identifier for its database, or if a source returns hits without
            the expected total fields.
    """
    for index in config['indexes']:
        instance = ElasticIndex(**index['index'])

        query = {'query': {'match_all': {}}}

        for results in instance.scroll(query=query):
            for item in results:
                identifier = item['identifier']
                database = item['database']
                sys_number = item['identifiers'][database]

                # Documents without 'error_tags' or 'hits' used to raise a
                # KeyError further down; start from empty containers instead.
                tags = set(item.get('error_tags', ()))
                source_hits = item.setdefault('hits', {})

                total = 0

                # swissbib
                hits, new_tags = swissbib.enrich(identifier)
                source_hits['swissbib'] = hits
                total += hits['total']
                tags.update(new_tags)

                # opac
                hits, new_tags = opac.enrich(opac_index, sys_number)
                source_hits['opac-access'] = hits
                total += hits['total']
                tags.update(new_tags)

                # aleph
                hits, new_tags = aleph.enrich(aleph_index, sys_number,
                                              database)
                source_hits['aleph'] = hits
                total += hits['loans']['total']
                tags.update(new_tags)

                if database == 'dsv05':
                    # e-rara
                    hits, new_tags = e_rara.enrich(e_rara_index, sys_number)
                    source_hits['e-rara'] = hits
                    total += hits['bau']['total']
                    tags.update(new_tags)

                    # e-manuscripta
                    hits, new_tags = e_manuscripta.enrich(
                        e_manuscripta_index, sys_number)
                    source_hits['e-manuscripta'] = hits
                    total += hits['bau']['total']
                    total += hits['swa']['total']
                    tags.update(new_tags)

                    # e-codices
                    hits, doi, new_tags = e_codices.enrich(
                        e_codices_index, sys_number)
                    source_hits['e-codices'] = hits
                    total += hits['total']
                    tags.update(new_tags)

                    # NOTE(review): a DOI is only merged when the record
                    # already has one; a DOI for a record without any is
                    # dropped. Confirm whether that is intended.
                    identifiers = item['identifiers']
                    if doi is not None and 'doi' in identifiers:
                        existing = identifiers['doi']
                        if isinstance(existing, list):
                            existing.append(doi)
                        else:
                            identifiers['doi'] = [existing, doi]

                # e-mails dsv05
                # TODO

                # Sets are not JSON-serialisable; store the tags as a list.
                item['error_tags'] = list(tags)

                source_hits['total'] = total

                instance.index_into(item, item['identifier'])
from simple_elastic import ElasticIndex

from datetime import date

# Index every "<year>,<system number>" row of the DSV01 CSV export into the
# 'dsv01-sys-numbers-before-1900' index, one document per system number.
dsv01_full_export = ElasticIndex('dsv01-sys-numbers-before-1900', 'record')

with open('data/dsv01_system_numbers_vor_1900_arc_export_20180802.csv',
          'r',
          encoding='utf-16') as file:
    # All documents of one run share the same index date.
    index_date = date.today().isoformat()
    for line in file:
        # A blank line (e.g. at the end of the file) carries no record and
        # would otherwise break the two-field unpacking below.
        if not line.strip():
            continue
        year, sys_number = line.split(',')
        # Strip the line ending first, then left-pad to nine digits. The
        # previous loop padded the raw field (newline included) to length 10,
        # which produced ten digits for a final line without a newline and
        # never terminated for an over-long field.
        sys_number = sys_number.strip().rjust(9, '0')
        doc = dict()
        doc['system_number'] = sys_number
        doc['publication_date'] = year
        doc['index_date'] = index_date
        dsv01_full_export.index_into(doc, doc['system_number'])
# TODO: Rework how this works and think about how I need the data?

# Import reservations and loans per system number and year from the workbook
# into the 'reservations' index. Documents are stored under the id
# "<9-digit system number><year>".
wb = load_workbook('data/VERZ_DSV01-Ausleihen-Vormerkungen_20180802_bmt.xlsx')
ws = wb['vor-1900-vormerkungen']

index = ElasticIndex('reservations', 'record')

# Sheet 1: reservations. Columns 2-4 are read as system number, count, year;
# the first row is skipped.
for row in ws.iter_rows(min_row=2, min_col=2, max_col=4):
    doc = dict()
    # Left-pad to nine characters. Unlike the former `while len(...) != 9`
    # loop, this cannot spin forever on a value longer than nine characters.
    doc['system_number'] = str(row[0].value).rjust(9, '0')
    doc['reservations'] = row[1].value
    doc['year'] = str(row[2].value)
    index.index_into(doc, doc['system_number'] + doc['year'])

ws = wb['vor-1900-ausleihen']

# Sheet 2: loans, same column layout. A loan row is merged with the
# reservation document written above for the same system number and year.
for row in ws.iter_rows(min_row=2, min_col=2, max_col=4):
    doc = dict()
    doc['system_number'] = str(row[0].value).rjust(9, '0')
    doc['loans'] = row[1].value
    doc['year'] = str(row[2].value)
    record = index.get(doc['system_number'] + doc['year'])
    # NOTE(review): loans without a matching reservation document are not
    # indexed at all. Confirm whether that is intended.
    if record is not None:
        doc['reservations'] = record['reservations']
        index.index_into(doc, doc['system_number'] + doc['year'])