def prepare(cls, username, **kwargs): """ Take an arbitrary set of keyword arguments and return an instance of a BackgroundJob, or fail with a suitable exception :param kwargs: arbitrary keyword arguments pertaining to this task type :return: a BackgroundJob instance representing this task """ write = kwargs.get("write", True) prepall = kwargs.get("prepall", False) if not write and prepall: raise BackgroundException( "'prepall' must be used with the 'write' parameter set to True (why prep but not save?)" ) params = {} cls.set_param(params, "write", write) cls.set_param(params, "prepall", prepall) # first prepare a job record job = models.BackgroundJob() job.user = username job.action = cls.__action__ job.params = params if prepall: job.add_audit_message( "'prepall' arg set. 'unchanged' articles will also have their indexes refreshed." ) return job
def prepare(cls, username, **kwargs): """ Take an arbitrary set of keyword arguments and return an instance of a BackgroundJob, or fail with a suitable exception :param kwargs: arbitrary keyword arguments pertaining to this task type :return: a BackgroundJob instance representing this task """ params = {} cls.set_param( params, 'clean', False if "clean" not in kwargs else kwargs["clean"] if kwargs["clean"] is not None else False) cls.set_param( params, "prune", False if "prune" not in kwargs else kwargs["prune"] if kwargs["prune"] is not None else False) cls.set_param( params, "types", "all" if "types" not in kwargs else kwargs["types"] if kwargs["types"] in ["all", "journal", "article"] else "all") container = app.config.get("STORE_PUBLIC_DATA_DUMP_CONTAINER") if container is None: raise BackgroundException( "You must set STORE_PUBLIC_DATA_DUMP_CONTAINER in the config") # first prepare a job record job = models.BackgroundJob() job.user = username job.action = cls.__action__ job.params = params return job
def run(self): """ Execute the task as specified by the background_job :return: """ job = self.background_job params = job.params ids = self.get_param(params, 'ids') if not self._job_parameter_check(params): raise BackgroundException( u"{}.run run without sufficient parameters".format( self.__class__.__name__)) # repeat the estimations and log what they were at the time the job ran, in addition to what the user saw # when requesting the job in journal_bulk_delete_manage estimates = self.estimate_delete_counts( json.loads(job.reference['selection_query'])) job.add_audit_message( u"About to delete an estimated {} journals with {} articles associated with their ISSNs." .format(estimates['journals-to-be-deleted'], estimates['articles-to-be-deleted'])) journal_delete_q_by_ids = models.Journal.make_query( should_terms={'_id': ids}, consistent_order=False) models.Journal.delete_selected(query=journal_delete_q_by_ids, articles=True, snapshot_journals=True, snapshot_articles=True) job.add_audit_message( u"Deleted {} journals and all articles associated with their ISSNs." .format(len(ids)))
def prepare(cls, username, **kwargs): """ Take an arbitrary set of keyword arguments and return an instance of a BackgroundJob, or fail with a suitable exception :param kwargs: arbitrary keyword arguments pertaining to this task type :return: a BackgroundJob instance representing this task """ super(JournalBulkDeleteBackgroundTask, cls).prepare(username, **kwargs) # first prepare a job record job = models.BackgroundJob() job.user = username job.action = cls.__action__ job.reference = {'selection_query': json.dumps(kwargs['selection_query'])} params = {} cls.set_param(params, 'ids', kwargs['ids']) if not cls._job_parameter_check(params): raise BackgroundException("{}.prepare run without sufficient parameters".format(cls.__name__)) job.params = params # now ensure that we have the locks for all the records, if they are lockable # will raise an exception if this fails lock.batch_lock('journal', kwargs['ids'], username, timeout=app.config.get("BACKGROUND_TASK_LOCK_TIMEOUT", 3600)) return job
def run(self): """ Execute the task as specified by the background_job :return: """ job = self.background_job params = job.params ids = self.get_param(params, 'ids') if not self._job_parameter_check(params): raise BackgroundException( u"{}.run run without sufficient parameters".format( self.__class__.__name__)) batches_count = len(ids) / self.BATCH_SIZE + ( 0 if len(ids) % self.BATCH_SIZE == 0 else 1) job.add_audit_message( u"About to delete {} articles in {} batches".format( len(ids), batches_count)) for batch_num, batch in enumerate(batch_up(ids, self.BATCH_SIZE), start=1): article_delete_q_by_ids = models.Article.make_query( should_terms={'_id': batch}, consistent_order=False) models.Article.delete_selected(query=article_delete_q_by_ids, snapshot=True) job.add_audit_message( u"Deleted {} articles in batch {} of {}".format( len(batch), batch_num, batches_count)) job.add_audit_message(u"Deleted {} articles".format(len(ids)))
def prepare(cls, username, **kwargs): """ Take an arbitrary set of keyword arguments and return an instance of a BackgroundJob, or fail with a suitable exception :param kwargs: arbitrary keyword arguments pertaining to this task type :return: a BackgroundJob instance representing this task """ super(ArticleBulkDeleteBackgroundTask, cls).prepare(username, **kwargs) # first prepare a job record job = models.BackgroundJob() job.user = username job.action = cls.__action__ refs = {} cls.set_reference(refs, "selection_query", json.dumps(kwargs['selection_query'])) job.reference = refs params = {} cls.set_param(params, 'ids', kwargs['ids']) if not cls._job_parameter_check(params): raise BackgroundException( u"{}.prepare run without sufficient parameters".format( cls.__name__)) job.params = params return job
def prepare(cls, username, **kwargs): """ Take an arbitrary set of keyword arguments and return an instance of a BackgroundJob, or fail with a suitable exception :param kwargs: arbitrary keyword arguments pertaining to this task type :return: a BackgroundJob instance representing this task """ upload_dir = app.config.get("UPLOAD_DIR") if upload_dir is None: raise BackgroundException("UPLOAD_DIR is not set in configuration") f = kwargs.get("upload_file") schema = kwargs.get("schema") url = kwargs.get("url") previous = kwargs.get("previous", []) if f is None and url is None: raise BackgroundException( "You must specify one of 'upload_file' or 'url' as keyword arguments" ) if schema is None: raise BackgroundException( "You must specify 'schema' in the keyword arguments") file_upload_id = None if f is not None and f.filename != "": file_upload_id = cls._file_upload(username, f, schema, previous) elif url is not None and url != "": file_upload_id = cls._url_upload(username, url, schema, previous) if file_upload_id is None: raise BackgroundException("No file upload record was created") # first prepare a job record job = models.BackgroundJob() job.user = username job.action = cls.__action__ params = {} cls.set_param(params, "file_upload_id", file_upload_id) cls.set_param(params, "attempts", 0) job.params = params return job
def prepare(cls, username, **kwargs): """ Take an arbitrary set of keyword arguments and return an instance of a BackgroundJob, or fail with a suitable exception :param kwargs: arbitrary keyword arguments pertaining to this task type :return: a BackgroundJob instance representing this task """ super(JournalBulkEditBackgroundTask, cls).prepare(username, **kwargs) # first prepare a job record job = models.BackgroundJob() job.user = username job.action = cls.__action__ refs = {} cls.set_reference(refs, "selection_query", json.dumps(kwargs['selection_query'])) job.reference = refs params = {} # get the named parameters we know may be there cls.set_param(params, 'ids', kwargs['ids']) if "note" in kwargs and kwargs[ "note"] is not None and kwargs["note"] != "": cls.set_param(params, 'note', kwargs.get('note', '')) # get the metadata overwrites if "replacement_metadata" in kwargs: metadata = {} for k, v in kwargs["replacement_metadata"].items(): if v is not None and v != "": metadata[k] = v if len(metadata.keys()) > 0: cls.set_param(params, 'replacement_metadata', json.dumps(metadata)) if not cls._job_parameter_check(params): raise BackgroundException( "{}.prepare run without sufficient parameters".format( cls.__name__)) job.params = params # now ensure that we have the locks for all the journals # will raise an exception if this fails lock.batch_lock("journal", kwargs['ids'], username, timeout=app.config.get("BACKGROUND_TASK_LOCK_TIMEOUT", 3600)) return job
def run(self): """ Execute the task as specified by the background_jon :return: """ job = self.background_job params = job.params if params is None: raise BackgroundException( u"IngestArticleBackgroundTask.run run without sufficient parameters" ) file_upload_id = self.get_param(params, "file_upload_id") if file_upload_id is None: raise BackgroundException( u"IngestArticleBackgroundTask.run run without sufficient parameters" ) file_upload = models.FileUpload.pull(file_upload_id) if file_upload is None: raise BackgroundException( u"IngestArticleBackgroundTask.run unable to find file upload with id {x}" .format(x=file_upload_id)) try: # if the file "exists", this means its a remote file that needs to be downloaded, so do that if file_upload.status == "exists": job.add_audit_message( u"Downloading file for file upload {x}, job {y}".format( x=file_upload_id, y=job.id)) self._download(file_upload) # if the file is validated, which will happen if it has been uploaded, or downloaded successfully, process it. if file_upload.status == "validated": job.add_audit_message( u"Importing file for file upload {x}, job {y}".format( x=file_upload_id, y=job.id)) self._process(file_upload) finally: file_upload.save()
def prepare(cls, username, **kwargs): """ Take an arbitrary set of keyword arguments and return an instance of a BackgroundJob, or fail with a suitable exception :param kwargs: arbitrary keyword arguments pertaining to this task type :return: a BackgroundJob instance representing this task """ base_url = app.config.get("BASE_URL") if base_url is None: raise BackgroundException("BASE_URL must be set in configuration before we can generate a sitemap") cdir = app.config.get("CACHE_DIR") if cdir is None: raise BackgroundException("You must set CACHE_DIR in the config") # first prepare a job record job = models.BackgroundJob() job.user = username job.action = cls.__action__ return job
def prepare(cls, username, **kwargs): """ Take an arbitrary set of keyword arguments and return an instance of a BackgroundJob, or fail with a suitable exception :param username: user who called this job :param kwargs: arbitrary keyword arguments pertaining to this task type :return: a BackgroundJob instance representing this task """ if not app.config.get("ENABLE_EMAIL", False): raise BackgroundException("Email has been disabled in config. Set ENABLE_EMAIL to True to run this task.") # first prepare a job record job = models.BackgroundJob() job.user = username job.action = cls.__action__ return job
def run(self): """ Execute the task as specified by the background_job :return: """ job = self.background_job params = job.params if not self._job_parameter_check(params): raise BackgroundException(u"{}.run run without sufficient parameters".format(self.__class__.__name__)) # get the parameters for the job ids = self.get_param(params, 'ids') note = self.get_param(params, 'note') metadata = json.loads(self.get_param(params, 'replacement_metadata', "{}")) # if there is metadata, validate it if (len(metadata.keys()) > 0): formdata = MultiDict(metadata) fc = formcontext.JournalFormFactory.get_form_context( role="bulk_edit", form_data=formdata ) if not fc.validate(): raise BackgroundException("Unable to validate replacement metadata: " + json.dumps(metadata)) for journal_id in ids: updated = False j = models.Journal.pull(journal_id) if j is None: job.add_audit_message(u"Journal with id {} does not exist, skipping".format(journal_id)) continue fc = formcontext.JournalFormFactory.get_form_context(role="admin", source=j) # turn on the "all fields optional" flag, so that bulk tasks don't cause errors that the user iterface # would allow you to bypass fc.form.make_all_fields_optional.data = True if "editor_group" in metadata: fc.form.editor.data = None elif j.editor_group is not None: # FIXME: this is a bit of a stop-gap, pending a more substantial referential-integrity-like solution # if the editor group is not being changed, validate that the editor is actually in the editor group, # and if not, unset them eg = models.EditorGroup.pull_by_key("name", j.editor_group) if eg is not None: all_eds = eg.associates + [eg.editor] if j.editor not in all_eds: fc.form.editor.data = None else: # if we didn't find the editor group, this is broken anyway, so reset the editor data anyway fc.form.editor.data = None if "contact_email" in metadata: fc.form.confirm_contact_email.data = metadata["contact_email"] for k, v in metadata.iteritems(): job.add_audit_message(u"Setting {f} to {x} for journal {y}".format(f=k, x=v, y=journal_id)) fc.form[k].data = v updated = True if note: job.add_audit_message(u"Adding note to for journal {y}".format(y=journal_id)) fc.form.notes.append_entry( {'date': datetime.now().strftime(app.config['DEFAULT_DATE_FORMAT']), 'note': note} ) updated = True if updated: if fc.validate(): try: fc.finalise() except formcontext.FormContextException as e: job.add_audit_message(u"Form context exception while bulk editing journal {} :\n{}".format(journal_id, e.message)) else: data_submitted = {} for affected_field_name in fc.form.errors.keys(): affected_field = getattr(fc.form, affected_field_name, ' Field {} does not exist on form. '.format(affected_field_name)) if isinstance(affected_field, basestring): # ideally this should never happen, an error should not be reported on a field that is not present on the form data_submitted[affected_field_name] = affected_field continue data_submitted[affected_field_name] = affected_field.data job.add_audit_message( u"Data validation failed while bulk editing journal {} :\n" u"{}\n\n" u"The data from the fields with the errors is:\n{}".format( journal_id, json.dumps(fc.form.errors), json.dumps(data_submitted) ) )
def run(self): """ Execute the task as specified by the background_job :return: """ job = self.background_job params = job.params clean = self.get_param(params, 'clean') prune = self.get_param(params, 'prune') types = self.get_param(params, 'types') tmpStore = StoreFactory.tmp() mainStore = StoreFactory.get("public_data_dump") container = app.config.get("STORE_PUBLIC_DATA_DUMP_CONTAINER") if clean: mainStore.delete_container(container) job.add_audit_message("Deleted existing data dump files") job.save() # create dir with today's date day_at_start = dates.today() # Do the search and save it page_size = app.config.get("DISCOVERY_BULK_PAGE_SIZE", 1000) records_per_file = app.config.get('DISCOVERY_RECORDS_PER_FILE', 100000) if types == 'all': types = ['article', 'journal'] else: types = [types] urls = {"article": None, "journal": None} sizes = {"article": None, "journal": None} # Scroll for article and/or journal for typ in types: job.add_audit_message(dates.now() + ": Starting export of " + typ) job.save() out_dir = tmpStore.path(container, "doaj_" + typ + "_data_" + day_at_start, create_container=True, must_exist=False) out_name = os.path.basename(out_dir) zipped_name = out_name + ".tar.gz" zip_dir = os.path.dirname(out_dir) zipped_path = os.path.join(zip_dir, zipped_name) tarball = tarfile.open(zipped_path, "w:gz") file_num = 1 out_file, path, filename = self._start_new_file( tmpStore, container, typ, day_at_start, file_num) first_in_file = True count = 0 for result in DiscoveryApi.scroll(typ, None, None, page_size, scan=True): if not first_in_file: out_file.write(",\n") else: first_in_file = False out_file.write(json.dumps(result)) count += 1 if count >= records_per_file: file_num += 1 self._finish_file(tmpStore, container, filename, path, out_file, tarball) job.save() out_file, path, filename = self._start_new_file( tmpStore, container, typ, day_at_start, file_num) first_in_file = True count = 0 if count > 0: self._finish_file(tmpStore, container, filename, path, out_file, tarball) job.save() tarball.close() # Copy the source directory to main store try: filesize = self._copy_on_complete(mainStore, tmpStore, container, zipped_path) job.save() except Exception as e: tmpStore.delete_container(container) raise BackgroundException( "Error copying {0} data on complete {1}\n".format( typ, str(e))) store_url = mainStore.url(container, zipped_name) urls[typ] = store_url sizes[typ] = filesize if prune: self._prune_container(mainStore, container, day_at_start, types) job.save() self.background_job.add_audit_message( "Removing temp store container {x}".format(x=container)) tmpStore.delete_container(container) # finally update the cache cache.Cache.cache_public_data_dump(urls["article"], sizes["article"], urls["journal"], sizes["journal"]) job.add_audit_message(dates.now() + ": done")
def __fail(record, previous, error):
    message = 'The URL could not be accessed; ' + error
    record.failed(message)
    record.save()
    previous.insert(0, record)
    raise BackgroundException(message)
def _file_upload(cls, username, f, schema, previous):
    # prep a record to go into the index, to record this upload
    record = models.FileUpload()
    record.upload(username, f.filename)
    record.set_id()

    # the file path that we are going to write to
    xml = os.path.join(app.config.get("UPLOAD_DIR", "."), record.local_filename)

    # it's critical here that no errors cause files to get left behind unrecorded
    try:
        # write the incoming file out to the XML file
        f.save(xml)

        # save the index entry
        record.save()
    except:
        # if we can't record either of these things, we need to back right off
        try:
            file_failed(xml)
        except:
            pass
        try:
            record.delete()
        except:
            pass

        raise BackgroundException("Failed to upload file - please contact an administrator")

    xwalk_name = app.config.get("ARTICLE_CROSSWALKS", {}).get(schema)
    xwalk = plugin.load_class(xwalk_name)()

    # now we have the record in the index and on disk, we can attempt to validate it
    try:
        with open(xml) as handle:
            xwalk.validate_file(handle)
        record.validated(schema)
        record.save()
        previous.insert(0, record)
        return record.id

    except IngestException as e:
        record.failed(e.message, e.inner_message)
        try:
            file_failed(xml)
        except:
            pass
        record.save()
        previous.insert(0, record)
        raise BackgroundException("Failed to upload file: " + e.message + "; " + str(e.inner_message))

    except Exception as e:
        record.failed("File system error when reading file")
        try:
            file_failed(xml)
        except:
            pass
        record.save()
        previous.insert(0, record)
        raise BackgroundException("Failed to upload file - please contact an administrator")
def run(self): """ Execute the task as specified by the background_job :return: """ job = self.background_job params = job.params ids = self.get_param(params, 'ids') editor_group = self.get_param(params, 'editor_group') note = self.get_param(params, 'note') application_status = self.get_param(params, 'application_status') if not self._job_parameter_check(params): raise BackgroundException( "{}.run run without sufficient parameters".format( self.__class__.__name__)) for suggestion_id in ids: updated = False s = models.Suggestion.pull(suggestion_id) if s is None: job.add_audit_message( "Suggestion with id {} does not exist, skipping".format( suggestion_id)) continue fc = formcontext.ApplicationFormFactory.get_form_context( role="admin", source=s) if editor_group: job.add_audit_message( "Setting editor_group to {x} for suggestion {y}".format( x=str(editor_group), y=suggestion_id)) # set the editor group f = fc.form.editor_group f.data = editor_group # clear the editor ed = fc.form.editor ed.data = None updated = True if note: job.add_audit_message( "Adding note to for suggestion {y}".format( y=suggestion_id)) fc.form.notes.append_entry({ 'date': datetime.now().strftime(app.config['DEFAULT_DATE_FORMAT']), 'note': note }) updated = True if application_status: job.add_audit_message( "Setting application_status to {x} for suggestion {y}". format(x=str(editor_group), y=suggestion_id)) f = fc.form.application_status f.data = application_status updated = True if updated: if fc.validate(): try: fc.finalise() except formcontext.FormContextException as e: job.add_audit_message( "Form context exception while bulk editing suggestion {} :\n{}" .format(suggestion_id, str(e))) else: data_submitted = {} for affected_field_name in list(fc.form.errors.keys()): affected_field = getattr( fc.form, affected_field_name, ' Field {} does not exist on form. '.format( affected_field_name)) if isinstance( affected_field, str ): # ideally this should never happen, an error should not be reported on a field that is not present on the form data_submitted[ affected_field_name] = affected_field continue data_submitted[ affected_field_name] = affected_field.data job.add_audit_message( "Data validation failed while bulk editing suggestion {} :\n" "{}\n\n" "The data from the fields with the errors is:\n{}". format(suggestion_id, json.dumps(fc.form.errors), json.dumps(data_submitted)))