Пример #1
0
def save(media):
    """Index a media record in Elasticsearch under its media type.

    Does nothing when ``media`` is empty or has no ``'id'`` key.  When the
    ``'mediatype'`` value is missing or empty, a message is printed and the
    record is skipped.

    Args:
        media: dict-like record; read for ``'id'`` and ``'mediatype'`` and
            serialized as a whole with ``json.dumps``.
    """
    if not media or 'id' not in media:
        return

    # .get() so a record without the key is handled like one with an empty value
    media_type = media.get('mediatype')
    if not media_type:
        # TODO: ignore for now, but this should send an email notification that
        # there is missing data so that the classifications.py file can be updated
        # One-element tuple keeps the formatting correct even if the id is a tuple;
        # the call form of print works on both Python 2 and Python 3.
        print("%s is missing a media type, ignoring for now" % (media['id'],))
        return

    elasticsearch_connection.add_or_update_item(
        media['id'], json.dumps(media), media_type)
Пример #2
0
def save(object):
    """Index an object record in Elasticsearch under its classification.

    Does nothing when ``object`` is empty or has no ``'id'`` key.  When the
    ``'classification'`` value is missing or empty, a message is printed and
    the record is skipped.

    Args:
        object: dict-like record; read for ``'id'`` and ``'classification'``
            and serialized as a whole with ``json.dumps``.  (The name shadows
            the builtin but is kept so keyword callers keep working.)
    """
    if not object or 'id' not in object:
        return

    # .get() so a record without the key is handled like one with an empty value
    classification = object.get('classification')
    if not classification:
        # TODO: ignore for now, but this should send an email notification that
        # there is missing data so that the classifications.py file can be updated
        # One-element tuple keeps the formatting correct even if the id is a tuple;
        # the call form of print works on both Python 2 and Python 3.
        print("%s is missing a classification, ignoring for now" % (object['id'],))
        return

    elasticsearch_connection.add_or_update_item(
        object['id'], json.dumps(object), classification)
Пример #3
0
def save(constituent):
    """Index a constituent record in Elasticsearch under its type.

    Does nothing when ``constituent`` is empty or has no ``'id'`` key.  When
    the ``'type'`` value is missing or empty, a message is printed and the
    record is skipped.

    Args:
        constituent: dict-like record; read for ``'id'`` and ``'type'`` and
            serialized as a whole with ``json.dumps``.
    """
    if not constituent or 'id' not in constituent:
        return

    # .get() so a record without the key is handled like one with an empty value
    constituent_type = constituent.get('type')
    if not constituent_type:
        # One-element tuple keeps the formatting correct even if the id is a tuple;
        # the call form of print works on both Python 2 and Python 3.
        print("%s is missing a type, ignoring for now" % (constituent['id'],))
        return

    elasticsearch_connection.add_or_update_item(
        constituent['id'], json.dumps(constituent), constituent_type)
Пример #4
0
def save(object):
    """Index an object record in Elasticsearch under its classification.

    Does nothing when ``object`` is empty or has no ``'id'`` key.  When the
    ``'classification'`` value is missing or empty, a message is printed and
    the record is skipped.

    Args:
        object: dict-like record; read for ``'id'`` and ``'classification'``
            and serialized as a whole with ``json.dumps``.  (The name shadows
            the builtin but is kept so keyword callers keep working.)
    """
    if not object or 'id' not in object:
        return

    # .get() so a record without the key is handled like one with an empty value
    classification = object.get('classification')
    if not classification:
        # TODO: ignore for now, but this should send an email notification that
        # there is missing data so that the classifications.py file can be updated
        # One-element tuple keeps the formatting correct even if the id is a tuple;
        # the call form of print works on both Python 2 and Python 3.
        print("%s is missing a classification, ignoring for now" % (
            object['id'],))
        return

    elasticsearch_connection.add_or_update_item(object['id'],
                                                json.dumps(object),
                                                classification)
Пример #5
0
def save_manifest(manifest, id):
    """Store a manifest as a 'manifest' document in the IIIF index.

    Nothing is written when ``manifest`` is empty or has no ``'id'`` key.
    Note that the guard inspects the manifest's own ``'id'`` key, while the
    document is stored under the separately supplied ``id`` argument.

    Args:
        manifest: dict-like manifest, serialized with ``json.dumps``.
        id: identifier the document is stored under.
    """
    if not manifest or 'id' not in manifest:
        return

    payload = json.dumps(manifest)
    elasticsearch_connection.add_or_update_item(
        id, payload, 'manifest', ELASTICSEARCH_IIIF_INDEX)
Пример #6
0
def save(site):
    """Store a site record as a 'sites' document in the main index.

    Nothing is written when ``site`` is empty or has no ``'id'`` key.

    Args:
        site: dict-like record; its ``'id'`` is the document id and the whole
            record is serialized with ``json.dumps``.
    """
    if not site or 'id' not in site:
        return

    doc_id = site['id']
    payload = json.dumps(site)
    elasticsearch_connection.add_or_update_item(
        doc_id, payload, 'sites', ELASTICSEARCH_INDEX)
Пример #7
0
def save(site):
    """Store a site record as a 'sites' document.

    Nothing is written when ``site`` is empty or has no ``'id'`` key.

    Args:
        site: dict-like record; its ``'id'`` is the document id and the whole
            record is serialized with ``json.dumps``.
    """
    if not site or 'id' not in site:
        return

    doc_id = site['id']
    payload = json.dumps(site)
    elasticsearch_connection.add_or_update_item(doc_id, payload, 'sites')
Пример #8
0
def save(pub):
    """Store a publication record as a 'pubdocs' document in the main index.

    Nothing is written when ``pub`` is empty or has no ``'id'`` key.

    Args:
        pub: dict-like record; its ``'id'`` is the document id and the whole
            record is serialized with ``json.dumps``.
    """
    if not pub or 'id' not in pub:
        return

    doc_id = pub['id']
    payload = json.dumps(pub)
    elasticsearch_connection.add_or_update_item(
        doc_id, payload, 'pubdocs', ELASTICSEARCH_INDEX)
Пример #9
0
def create_library():
    """Rebuild the per-author 'library' documents from the 'pubdocs' documents.

    Steps:
      1. Delete the existing 'library' documents returned by a single search
         (at most 500, the ``size`` of that search).
      2. Page through all 'pubdocs' documents, ``size`` at a time.
      3. For every document with a non-empty ``'pdf'``, append an entry to the
         'library' document of each of its authors (or of 'No Author' when the
         document lists none) and re-save that author document.
    """
    print("Creating Digital Library...")
    # for some reason the library isn't always fully populated. see if a time delay helps
    time.sleep(3)

    # A set, because this is only ever used for membership tests.
    author_ids = set()
    size = 20
    results_from = 0
    es = elasticsearch_connection.get_connection()
    es_index = ELASTICSEARCH_INDEX

    # delete library
    results = es.search(index=es_index,
                        doc_type='library',
                        body={
                            "size": 500,
                            "stored_fields": ["_id", "name"],
                            "query": {
                                "match_all": {}
                            }
                        })['hits']['hits']
    for r in results:
        elasticsearch_connection.delete(r['_id'], 'library',
                                        ELASTICSEARCH_INDEX)

    total = es.search(index=es_index,
                      doc_type='pubdocs',
                      body={
                          "size": 0,
                          "query": {
                              "match_all": {}
                          }
                      })['hits']['total']
    # Elasticsearch 7+ reports hits.total as {"value": N, "relation": ...}
    # rather than a plain integer; accept both shapes.
    if isinstance(total, dict):
        total = total['value']

    while results_from < total:
        results = es.search(index=es_index,
                            doc_type='pubdocs',
                            body={
                                "size": size,
                                "from": results_from,
                                "query": {
                                    "match_all": {}
                                }
                            })
        for r in results['hits']['hits']:
            result = r['_source']
            if 'pdf' not in result or result['pdf'] == '':
                continue

            # if this doc has no authors (missing, None or empty), file it
            # under 'No Author' and proceed -- without mutating the hit itself
            authors = result.get('authors') or ['No Author']

            # Identical for every author of this document, so build it once.
            if result['notes'] is not None:
                sorttext = result['notes']
            else:
                sorttext = result['title']

            for author in authors:
                author_id = author.replace(' ', '')
                sortauthor = author.lower().strip()
                # Decompose accented characters and drop the non-ASCII
                # remainder to get an ASCII-only sort key.
                sortauthor = str(
                    unicodedata.normalize('NFD', sortauthor).encode(
                        'ascii', 'ignore').decode("utf-8"))
                # see if this author already exists
                if author_id in author_ids:
                    # NOTE(review): presumably returns the stored author dict
                    # including its 'docs' list -- confirm against get_item.
                    author_data = elasticsearch_connection.get_item(
                        author_id, 'library', ELASTICSEARCH_INDEX)
                else:
                    author_ids.add(author_id)
                    author_data = {
                        'name': author,
                        'sortname': sortauthor,
                        'docs': [],
                    }

                author_data['docs'].append({
                    'displaytext': result['boilertext'],
                    'sorttext': sorttext,
                    'format': result['format'],
                    # add file size
                    'url': result['pdf'],
                })
                author_data['docs'].sort(key=operator.itemgetter('sorttext'))

                data = json.dumps(author_data)
                elasticsearch_connection.add_or_update_item(
                    author_id, data, 'library', ELASTICSEARCH_INDEX)

        results_from += size
    print("Finished Digital Library...")
Пример #10
0
def save(constituent):
    """Index a constituent record in Elasticsearch under its type.

    Does nothing when ``constituent`` is empty or has no ``"id"`` key.  When
    the ``"type"`` value is missing or empty, a message is printed and the
    record is skipped.

    Args:
        constituent: dict-like record; read for ``"id"`` and ``"type"`` and
            serialized as a whole with ``json.dumps``.
    """
    if not constituent or "id" not in constituent:
        return

    # .get() so a record without the key is handled like one with an empty value
    constituent_type = constituent.get("type")
    if not constituent_type:
        # One-element tuple keeps the formatting correct even if the id is a tuple;
        # the call form of print works on both Python 2 and Python 3.
        print("%s is missing a type, ignoring for now" % (constituent["id"],))
        return

    elasticsearch_connection.add_or_update_item(
        constituent["id"], json.dumps(constituent), constituent_type)
Пример #11
0
def save(pub):
    """Store a publication record as a 'pubdocs' document.

    Nothing is written when ``pub`` is empty or has no ``'id'`` key.

    Args:
        pub: dict-like record; its ``'id'`` is the document id and the whole
            record is serialized with ``json.dumps``.
    """
    if not pub or 'id' not in pub:
        return

    doc_id = pub['id']
    payload = json.dumps(pub)
    elasticsearch_connection.add_or_update_item(doc_id, payload, 'pubdocs')
Пример #12
0
def create_library():
    """Rebuild the per-author 'library' documents from the 'pubdocs' documents.

    Steps:
      1. Delete the existing 'library' documents returned by a single search
         (at most 500, the ``size`` of that search).
      2. Page through all 'pubdocs' documents, ``size`` at a time.
      3. For every document with a non-empty ``'pdf'``, append an entry to the
         'library' document of each of its authors (or of 'No Author' when the
         document lists none) and re-save that author document.
    """
    # Single-argument call form of print behaves the same on Python 2 and 3.
    print("Creating Digital Library...")

    # A set, because this is only ever used for membership tests.
    author_ids = set()
    size = 20
    results_from = 0
    es = elasticsearch_connection.get_connection()
    es_index = elasticsearch_connection.ELASTICSEARCH_INDEX

    # delete library
    results = es.search(index=es_index, doc_type='library', body={
        "size": 500,
        "fields": ["_id", "name"],
        "query": {
            "match_all": {}
        }
    })['hits']['hits']
    for r in results:
        elasticsearch_connection.delete(r['_id'], 'library')

    total = es.search(index=es_index, doc_type='pubdocs', body={
        "size": 0,
        "query": {
            "match_all": {}
        }
    })['hits']['total']

    while results_from < total:
        results = es.search(index=es_index, doc_type='pubdocs', body={
            "size": size,
            "from": results_from,
            "query": {
                "match_all": {}
            }
        })
        for r in results['hits']['hits']:
            result = r['_source']
            if 'pdf' not in result or result['pdf'] == '':
                continue

            # if this doc has no authors (missing, None or empty), file it
            # under 'No Author' and proceed -- without mutating the hit itself
            authors = result.get('authors') or ['No Author']

            for author in authors:
                author_id = author.replace(' ', '')
                # see if this author already exists
                if author_id in author_ids:
                    # NOTE(review): presumably returns the stored author dict
                    # including its 'docs' list -- confirm against get_item.
                    author_data = elasticsearch_connection.get_item(author_id, 'library')
                else:
                    author_ids.add(author_id)
                    author_data = {
                        'name': author,
                        'docs': [],
                    }

                author_data['docs'].append({
                    'displaytext': result['boilertext'],
                    'format': result['format'],
                    # add file size
                    'url': result['pdf'],
                })
                author_data['docs'].sort(key=operator.itemgetter('displaytext'))

                data = json.dumps(author_data)
                elasticsearch_connection.add_or_update_item(author_id, data, 'library')

        results_from += size
    print("Finished Digital Library...")
Пример #13
0
def save(manifest):
    """Store a manifest as a 'manifest' document in the main index.

    Nothing is written when ``manifest`` is empty or has no ``'id'`` key.

    Args:
        manifest: dict-like manifest; its ``'id'`` is the document id and the
            whole manifest is serialized with ``json.dumps``.
    """
    if not manifest or 'id' not in manifest:
        return

    doc_id = manifest['id']
    payload = json.dumps(manifest)
    elasticsearch_connection.add_or_update_item(
        doc_id, payload, 'manifest', ELASTICSEARCH_INDEX)