Example #1
    def __init__(self, args):

        # read args, route verb to verb handler
        self.verb_routes = {
            'GetRecord': self._GetRecord,
            'Identify': self._Identify,
            'ListIdentifiers': self._ListIdentifiers,
            'ListMetadataFormats': self._ListMetadataFormats,
            'ListRecords': self._ListRecords,
            'ListSets': self._ListSets
        }

        # debug
        logger.debug(args)

        self.args = args.copy()
        self.request_timestamp = datetime.datetime.now()
        self.request_timestamp_string = self.request_timestamp.strftime(
            '%Y-%m-%dT%H:%M:%SZ')
        self.record_nodes = []

        # published dataframe slice parameters
        self.start = 0
        self.chunk_size = settings.OAI_RESPONSE_SIZE
        self.publish_set_id = self.args.get('set', None)

        # get instance of Published model
        self.published = models.PublishedRecords()

        # begin scaffolding
        self.scaffold()
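
The constructor above routes each OAI-PMH verb to a handler method through the verb_routes dict. A minimal, hypothetical sketch of how such a dispatch table is typically consumed follows; the retrieve() method name and the badVerb handling are illustrative assumptions, not part of the original class:

    def retrieve(self):

        # hypothetical dispatcher: look up the requested OAI-PMH verb and call its handler
        verb = self.args.get('verb')

        if verb in self.verb_routes:
            # delegate to the bound handler, e.g. self._Identify or self._ListRecords
            self.verb_routes[verb]()
        else:
            # unknown or missing verb: an OAI-PMH server responds with a badVerb error
            logger.warning('unsupported OAI-PMH verb: %s', verb)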
Example #2
def job_publish(ct_id):

	# get CombineTask (ct) outside the try/except so the error handler below can always reference it
	ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id))
	logger.info('using %s' % ct)

	try:

		# get CombineJob
		cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id']))

		# publish job
		publish_results = cjob.job.publish(publish_set_id=ct.task_params['publish_set_id'])

		# add publish_set_id to published subsets if present
		for published_subset in ct.task_params['in_published_subsets']:
			logger.debug('adding publish_set_id to Published Subset: %s' % published_subset)
			pr = models.PublishedRecords(subset=published_subset)
			pr.add_publish_set_id_to_subset(publish_set_id=ct.task_params['publish_set_id'])

		# REEVALUATE SUBSET HIERARCHY:
		# if the Org or Record Group exists in any published subset, re-evaluate that list of jobs

		# remove from published subsets
		cjob.job.remove_from_published_precounts()

		# save export output to Combine Task output
		ct.refresh_from_db()
		ct.task_output_json = json.dumps({
			'job_id':ct.task_params['job_id'],
			'publish_results':publish_results
		})
		ct.save()

	except Exception as e:

		logger.info(str(e))

		# attempt to capture error and return for task
		ct.task_output_json = json.dumps({
			'error':str(e)
		})
		ct.save()
Example #3
def job_publish(ct_id):
    # get CombineTask (ct) outside the try/except so the error handler below can always reference it
    ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id))
    LOGGER.info('using %s', ct)

    try:

        # get CombineJob
        cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id']))

        # publish job
        publish_results = cjob.job.publish(
            publish_set_id=ct.task_params['publish_set_id'])

        # remove from published subsets
        cjob.job.remove_from_published_precounts()

        # add publish_set_id to published subsets if present, and remove precount
        for published_subset in ct.task_params['in_published_subsets']:
            LOGGER.debug('adding publish_set_id to Published Subset: %s',
                         published_subset)
            pr = models.PublishedRecords(subset=published_subset)
            pr.add_publish_set_id_to_subset(
                publish_set_id=ct.task_params['publish_set_id'])

        # save export output to Combine Task output
        ct.refresh_from_db()
        ct.task_output_json = json.dumps({
            'job_id': ct.task_params['job_id'],
            'publish_results': publish_results
        })
        ct.save()

    except Exception as e:

        LOGGER.info(str(e))

        # attempt to capture error and return for task
        ct.task_output_json = json.dumps({'error': str(e)})
        ct.save()
Example #4
def export_documents(ct_id):

	'''
	- submit livy job and poll until complete
		- use livy session from cjob (works, but awkward way to get this)
	- add wrapper element to file parts
	- rename file parts
	- tar/zip together
	'''

	# get CombineBackgroundTask
	ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id))
	logger.info('using %s' % ct)

	# generate temporary output path
	output_path = '/tmp/%s' % str(uuid.uuid4())

	# handle single Job
	if 'job_id' in ct.task_params.keys():

		# get CombineJob
		cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id']))

		# set archive filename of loose XML files
		archive_filename_root = 'j_%s_documents' % cjob.job.id

		# build job_dictionary
		job_dict = {'j%s' % cjob.job.id: [cjob.job.id]}
		logger.info(job_dict)

	# handle published records
	if 'published' in ct.task_params.keys():

		# set archive filename of loose XML files
		archive_filename_root = 'published_documents'

		# get anonymous CombineJob
		cjob = models.CombineJob()

		# get published records to determine sets
		pr = models.PublishedRecords(subset=ct.task_params['subset'])

		# init job dictionary
		job_dict = {}

		# handle published jobs with publish set ids
		for publish_id, jobs in pr.sets.items():
			job_dict[publish_id] = [ job.id for job in jobs ]

		# handle "loose" Jobs
		job_dict['no_publish_set_id'] = [job.id for job in pr.published_jobs.filter(publish_set_id='')]

		# debug
		logger.info(job_dict)

	# update task params
	ct.refresh_from_db()
	ct.update_task_params({
		'output_path':output_path,
		'archive_filename_root':archive_filename_root,
		'job_dict':job_dict
	})

	# prepare spark code
	spark_code = "import math,uuid\nfrom console import *\nexport_records_as_xml(spark, %d)" % (int(ct_id))
	logger.info(spark_code)

	try:

		# check for livy session
		_check_livy_session()

		# submit to livy
		logger.info('submitting code to Spark')
		submit = models.LivyClient().submit_job(cjob.livy_session.session_id, {'code':spark_code})

		# poll until complete
		logger.info('polling for Spark job to complete...')
		results = polling.poll(lambda: models.LivyClient().job_status(submit.headers['Location']).json(), check_success=spark_job_done, step=5, poll_forever=True)
		logger.info(results)

		# handle s3 bucket
		if ct.task_params.get('s3_export', False):

			if ct.task_params.get('s3_export_type') == 'archive':

				logger.debug('writing archive file to S3')

				# create single archive file
				ct = _create_export_documents_archive(ct)

				# upload to s3
				s3 = boto3.resource('s3',
					aws_access_key_id=settings.AWS_ACCESS_KEY_ID,
					aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY)
				with open(ct.task_params['export_output_archive'], 'rb') as archive_file:
					s3.Object(ct.task_params['s3_bucket'], ct.task_params['s3_key']).put(Body=archive_file)

				# delete all traces from local output
				shutil.rmtree(ct.task_params['output_path'])

			elif ct.task_params.get('s3_export_type') == 'spark_df':
				logger.debug('s3 export type was spark_df, nothing to cleanup or do')

			# save export output to Combine Task output
			ct.refresh_from_db()
			ct.task_output_json = json.dumps({
				's3_export_type':ct.task_params['s3_export_type'],
				'export_output':'s3://%s/%s' % (ct.task_params['s3_bucket'], ct.task_params['s3_key'].lstrip('/')),
			})
			ct.save()
			logger.info(ct.task_output_json)

		# handle local filesystem
		else:

			# create single archive file
			ct = _create_export_documents_archive(ct)

			# save export output to Combine Task output
			ct.refresh_from_db()
			ct.task_output_json = json.dumps({
				'export_output':ct.task_params['export_output_archive'],
				'name':ct.task_params['export_output_archive'].split('/')[-1],
				'content_type':ct.task_params['content_type'],
				'export_dir':"/".join(ct.task_params['export_output_archive'].split('/')[:-1])
			})
			ct.save()
			logger.info(ct.task_output_json)

	except Exception as e:

		logger.info(str(e))

		# attempt to capture error and return for task
		ct.task_output_json = json.dumps({
			'error':str(e)
		})
		ct.save()
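
Example #4 polls Livy through polling.poll, passing a spark_job_done callable (defined elsewhere in the module) as check_success. A minimal sketch of what such a predicate could look like, assuming the Livy statement-status JSON exposes a 'state' field that reads 'available' once the statement has finished; the project's actual helper may differ:

def spark_job_done(response):

	'''
	check_success callable for polling.poll(): receives the decoded Livy
	statement-status JSON and returns True once the statement has completed
	'''

	# Livy statement states include 'waiting', 'running' and 'available';
	# 'available' indicates the statement finished and its output can be read
	return response.get('state') == 'available'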
Example #5
def export_mapped_fields(ct_id):

	# get CombineTask (ct)
	ct = models.CombineBackgroundTask.objects.get(pk=int(ct_id))

	try:

		# JSON export
		if ct.task_params['mapped_fields_export_type'] == 'json':

			# handle single Job
			if 'job_id' in ct.task_params.keys():

				# get CombineJob
				cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id']))

				# set output filename
				output_path = '/tmp/%s' % uuid.uuid4().hex
				os.mkdir(output_path)
				export_output = '%s/job_%s_mapped_fields.json' % (output_path, cjob.job.id)

				# build command list
				cmd = [
					"elasticdump",
					"--input=http://%s:9200/j%s" % (settings.ES_HOST, cjob.job.id),
					"--output=%s" % export_output,
					"--type=data",
					"--sourceOnly",
					"--ignore-errors",
					"--noRefresh"
				]

			# handle published records
			if 'published' in ct.task_params.keys():

				# set output filename
				output_path = '/tmp/%s' % uuid.uuid4().hex
				os.mkdir(output_path)
				export_output = '%s/published_mapped_fields.json' % (output_path)

				# get list of jobs ES indices to export
				pr = models.PublishedRecords(subset=ct.task_params['subset'])
				es_list = ','.join(['j%s' % job.id for job in pr.published_jobs])

				# build command list
				cmd = [
					"elasticdump",
					"--input=http://%s:9200/%s" % (settings.ES_HOST, es_list),
					"--output=%s" % export_output,
					"--type=data",
					"--sourceOnly",
					"--ignore-errors",
					"--noRefresh"
				]

			# if fields provided, limit
			if ct.task_params['mapped_field_include']:
				logger.info('specific fields selected, adding to elasticdump command:')
				searchBody = {
					"_source":ct.task_params['mapped_field_include']
				}
				cmd.append("--searchBody='%s'" % json.dumps(searchBody))


		# CSV export
		if ct.task_params['mapped_fields_export_type'] == 'csv':

			# handle single Job
			if 'job_id' in ct.task_params.keys():

				# get CombineJob
				cjob = models.CombineJob.get_combine_job(int(ct.task_params['job_id']))

				# set output filename
				output_path = '/tmp/%s' % uuid.uuid4().hex
				os.mkdir(output_path)
				export_output = '%s/job_%s_mapped_fields.csv' % (output_path, cjob.job.id)

				# build command list
				cmd = [
					"es2csv",
					"-u http://%s:9200" % settings.ES_HOST,
					"-q '*'",
					"-i 'j%s'" % cjob.job.id,
					"-D 'record'",
					"-o '%s'" % export_output
				]

			# handle published records
			if 'published' in ct.task_params.keys():

				# set output filename
				output_path = '/tmp/%s' % uuid.uuid4().hex
				os.mkdir(output_path)
				export_output = '%s/published_mapped_fields.csv' % (output_path)

				# get list of jobs ES indices to export
				pr = models.PublishedRecords(subset=ct.task_params['subset'])
				es_list = ','.join(['j%s' % job.id for job in pr.published_jobs])

				# build command list
				cmd = [
					"es2csv",
					"-u http://%s:9200" % settings.ES_HOST,
					"-q '*'",
					"-i '%s'" % es_list,
					"-D 'record'",
					"-o '%s'" % export_output
				]

			# handle kibana style
			if ct.task_params['kibana_style']:
				cmd.append('-k')
				cmd.append("-kd '|'")

			# if fields provided, limit
			if ct.task_params['mapped_field_include']:
				logger.info('specific fields selected, adding to es2csv command:')
				cmd.append('-f ' + " ".join(["'%s'" % field for field in ct.task_params['mapped_field_include']]))

		# execute compiled command
		logger.info(cmd)
		os.system(" ".join(cmd))

		# handle compression
		if ct.task_params['archive_type'] == 'none':
			logger.info('uncompressed export file requested, continuing')

		elif ct.task_params['archive_type'] == 'zip':

			logger.info('creating compressed zip archive')
			content_type = 'application/zip'

			# establish output archive file
			export_output_archive = '%s/%s.zip' % (output_path, export_output.split('/')[-1])

			with zipfile.ZipFile(export_output_archive, 'w', zipfile.ZIP_DEFLATED) as zip_file:
				zip_file.write(export_output, export_output.split('/')[-1])

			# set export output to archive file
			export_output = export_output_archive

		# tar.gz
		elif ct.task_params['archive_type'] == 'targz':

			logger.info('creating compressed tar archive')
			content_type = 'application/gzip'

			# establish output archive file
			export_output_archive = '%s/%s.tar.gz' % (output_path, export_output.split('/')[-1])

			with tarfile.open(export_output_archive, 'w:gz') as tar:
				tar.add(export_output, arcname=export_output.split('/')[-1])

			# set export output to archive file
			export_output = export_output_archive

		# handle s3 bucket
		if ct.task_params.get('s3_export', False):

			logger.debug('writing archive file to S3')

			# upload to s3
			s3 = boto3.resource('s3',
				aws_access_key_id=settings.AWS_ACCESS_KEY_ID,
				aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY)
			with open(export_output, 'rb') as export_file:
				s3.Object(ct.task_params['s3_bucket'], ct.task_params['s3_key']).put(Body=export_file)

			# delete all traces from local output
			shutil.rmtree(output_path)

			# save export output to Combine Task output
			ct.refresh_from_db()
			ct.task_output_json = json.dumps({
				's3_export_type':ct.task_params['s3_export_type'],
				'export_output':'s3://%s/%s' % (ct.task_params['s3_bucket'], ct.task_params['s3_key'].lstrip('/')),
			})
			ct.save()
			logger.info(ct.task_output_json)

		# handle local filesystem
		else:

			# save export output to Combine Task output
			ct.refresh_from_db()
			ct.task_output_json = json.dumps({
				'export_output':export_output,
				'name':export_output.split('/')[-1],
				'export_dir':"/".join(export_output.split('/')[:-1])
			})
			ct.save()

	except Exception as e:

		logger.info(str(e))

		# attempt to capture error and return for task
		ct.task_output_json = json.dumps({
			'error':str(e)
		})
		ct.save()
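
The command lists in Example #5 are joined into a single string and executed through os.system, so the embedded quotes (for instance "-q '*'") are resolved by the shell. A minimal sketch of the same es2csv invocation using subprocess.run without a shell is shown below, reusing the settings, cjob and export_output names from the example; flags and values become separate list items, the literal quotes are dropped, and a non-zero exit code raises an exception. This is an illustrative alternative under those assumptions, not how the project currently runs the export:

import subprocess

# hypothetical shell-free es2csv invocation: every flag and value is its own
# argument, so no quoting layer is needed and failures surface as exceptions
cmd = [
	'es2csv',
	'-u', 'http://%s:9200' % settings.ES_HOST,
	'-q', '*',
	'-i', 'j%s' % cjob.job.id,
	'-D', 'record',
	'-o', export_output,
]
subprocess.run(cmd, check=True)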