Example #1
    def prepare(cls, username, **kwargs):
        """
        Take an arbitrary set of keyword arguments and return an instance of a BackgroundJob,
        or fail with a suitable exception

        :param kwargs: arbitrary keyword arguments pertaining to this task type
        :return: a BackgroundJob instance representing this task
        """

        write = kwargs.get("write", True)
        prepall = kwargs.get("prepall", False)

        if not write and prepall:
            raise BackgroundException(
                "'prepall' must be used with the 'write' parameter set to True (why prep but not save?)"
            )

        params = {}
        cls.set_param(params, "write", write)
        cls.set_param(params, "prepall", prepall)

        # first prepare a job record
        job = models.BackgroundJob()
        job.user = username
        job.action = cls.__action__
        job.params = params
        if prepall:
            job.add_audit_message(
                "'prepall' arg set. 'unchanged' articles will also have their indexes refreshed."
            )
        return job
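
The guard at the top of this prepare is the part worth noting: optional flags are pulled out of **kwargs with defaults, and an inconsistent combination is rejected before any job record is built. A minimal self-contained sketch of the same check (with BackgroundException stubbed as a plain exception):

    class BackgroundException(Exception):
        """Stand-in for the task framework's exception class."""

    def check_flags(**kwargs):
        write = kwargs.get("write", True)
        prepall = kwargs.get("prepall", False)
        if prepall and not write:
            raise BackgroundException("'prepall' requires 'write' to be True")
        return write, prepall

    assert check_flags() == (True, False)
    assert check_flags(write=True, prepall=True) == (True, True)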
Example #2
    def prepare(cls, username, **kwargs):
        """
        Take an arbitrary set of keyword arguments and return an instance of a BackgroundJob,
        or fail with a suitable exception

        :param kwargs: arbitrary keyword arguments pertaining to this task type
        :return: a BackgroundJob instance representing this task
        """
        params = {}
        clean = kwargs.get("clean")
        prune = kwargs.get("prune")
        types = kwargs.get("types")

        cls.set_param(params, "clean", clean if clean is not None else False)
        cls.set_param(params, "prune", prune if prune is not None else False)
        cls.set_param(params, "types", types if types in ["all", "journal", "article"] else "all")

        container = app.config.get("STORE_PUBLIC_DATA_DUMP_CONTAINER")
        if container is None:
            raise BackgroundException(
                "You must set STORE_PUBLIC_DATA_DUMP_CONTAINER in the config")

        # first prepare a job record
        job = models.BackgroundJob()
        job.user = username
        job.action = cls.__action__
        job.params = params
        return job
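
The defaulting here follows one rule: a missing key and an explicit None both fall back to the default, and types is additionally checked against a whitelist. The same rule spelled out as a small helper with checks:

    def param_or_default(kwargs, key, default):
        """Treat a missing key and an explicit None the same way."""
        value = kwargs.get(key)
        return default if value is None else value

    assert param_or_default({}, "clean", False) is False
    assert param_or_default({"clean": None}, "clean", False) is False
    assert param_or_default({"clean": True}, "clean", False) is True

    # the whitelist check for types: anything unrecognised collapses to "all"
    types = param_or_default({"types": "book"}, "types", "all")
    if types not in ("all", "journal", "article"):
        types = "all"
    assert types == "all"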
Example #3
    def run(self):
        """
        Execute the task as specified by the background_job
        :return:
        """
        job = self.background_job
        params = job.params

        ids = self.get_param(params, 'ids')

        if not self._job_parameter_check(params):
            raise BackgroundException(
                u"{}.run run without sufficient parameters".format(
                    self.__class__.__name__))

        # repeat the estimations and log what they were at the time the job ran, in addition to what the user saw
        # when requesting the job in journal_bulk_delete_manage
        estimates = self.estimate_delete_counts(
            json.loads(job.reference['selection_query']))
        job.add_audit_message(
            u"About to delete an estimated {} journals with {} articles associated with their ISSNs."
            .format(estimates['journals-to-be-deleted'],
                    estimates['articles-to-be-deleted']))

        journal_delete_q_by_ids = models.Journal.make_query(
            should_terms={'_id': ids}, consistent_order=False)
        models.Journal.delete_selected(query=journal_delete_q_by_ids,
                                       articles=True,
                                       snapshot_journals=True,
                                       snapshot_articles=True)
        job.add_audit_message(
            u"Deleted {} journals and all articles associated with their ISSNs."
            .format(len(ids)))
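
make_query with should_terms={'_id': ids} presumably expands to an Elasticsearch bool query with a terms clause over the document ids; the shape below is an assumption about the helper's output, shown only for orientation (the real helper may add sorting and other defaults):

    ids = ["journal-1", "journal-2"]

    # hypothetical output of models.Journal.make_query(should_terms={'_id': ids})
    journal_delete_q_by_ids = {
        "query": {
            "bool": {
                "should": [
                    {"terms": {"_id": ids}}
                ]
            }
        }
    }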
Example #4
    def prepare(cls, username, **kwargs):
        """
        Take an arbitrary set of keyword arguments and return an instance of a BackgroundJob,
        or fail with a suitable exception

        :param kwargs: arbitrary keyword arguments pertaining to this task type
        :return: a BackgroundJob instance representing this task
        """

        super(JournalBulkDeleteBackgroundTask, cls).prepare(username, **kwargs)

        # first prepare a job record
        job = models.BackgroundJob()
        job.user = username
        job.action = cls.__action__
        job.reference = {'selection_query': json.dumps(kwargs['selection_query'])}

        params = {}
        cls.set_param(params, 'ids', kwargs['ids'])

        if not cls._job_parameter_check(params):
            raise BackgroundException("{}.prepare run without sufficient parameters".format(cls.__name__))

        job.params = params

        # now ensure that we have the locks for all the records, if they are lockable
        # will raise an exception if this fails
        lock.batch_lock('journal', kwargs['ids'], username, timeout=app.config.get("BACKGROUND_TASK_LOCK_TIMEOUT", 3600))

        return job
Example #5
    def run(self):
        """
        Execute the task as specified by the background_job
        :return:
        """
        job = self.background_job
        params = job.params

        ids = self.get_param(params, 'ids')

        if not self._job_parameter_check(params):
            raise BackgroundException(
                u"{}.run run without sufficient parameters".format(
                    self.__class__.__name__))

        batches_count = len(ids) // self.BATCH_SIZE + (
            0 if len(ids) % self.BATCH_SIZE == 0 else 1)
        job.add_audit_message(
            u"About to delete {} articles in {} batches".format(
                len(ids), batches_count))

        for batch_num, batch in enumerate(batch_up(ids, self.BATCH_SIZE),
                                          start=1):
            article_delete_q_by_ids = models.Article.make_query(
                should_terms={'_id': batch}, consistent_order=False)
            models.Article.delete_selected(query=article_delete_q_by_ids,
                                           snapshot=True)
            job.add_audit_message(
                u"Deleted {} articles in batch {} of {}".format(
                    len(batch), batch_num, batches_count))

        job.add_audit_message(u"Deleted {} articles".format(len(ids)))
Example #6
    def prepare(cls, username, **kwargs):
        """
        Take an arbitrary set of keyword arguments and return an instance of a BackgroundJob,
        or fail with a suitable exception

        :param kwargs: arbitrary keyword arguments pertaining to this task type
        :return: a BackgroundJob instance representing this task
        """

        super(ArticleBulkDeleteBackgroundTask, cls).prepare(username, **kwargs)

        # first prepare a job record
        job = models.BackgroundJob()
        job.user = username
        job.action = cls.__action__
        refs = {}
        cls.set_reference(refs, "selection_query",
                          json.dumps(kwargs['selection_query']))
        job.reference = refs

        params = {}
        cls.set_param(params, 'ids', kwargs['ids'])

        if not cls._job_parameter_check(params):
            raise BackgroundException(
                u"{}.prepare run without sufficient parameters".format(
                    cls.__name__))

        job.params = params

        return job
Example #7
    def prepare(cls, username, **kwargs):
        """
        Take an arbitrary set of keyword arguments and return an instance of a BackgroundJob,
        or fail with a suitable exception

        :param kwargs: arbitrary keyword arguments pertaining to this task type
        :return: a BackgroundJob instance representing this task
        """

        upload_dir = app.config.get("UPLOAD_DIR")
        if upload_dir is None:
            raise BackgroundException("UPLOAD_DIR is not set in configuration")

        f = kwargs.get("upload_file")
        schema = kwargs.get("schema")
        url = kwargs.get("url")
        previous = kwargs.get("previous", [])

        if f is None and url is None:
            raise BackgroundException(
                "You must specify one of 'upload_file' or 'url' as keyword arguments"
            )
        if schema is None:
            raise BackgroundException(
                "You must specify 'schema' in the keyword arguments")

        file_upload_id = None
        if f is not None and f.filename != "":
            file_upload_id = cls._file_upload(username, f, schema, previous)
        elif url is not None and url != "":
            file_upload_id = cls._url_upload(username, url, schema, previous)

        if file_upload_id is None:
            raise BackgroundException("No file upload record was created")

        # first prepare a job record
        job = models.BackgroundJob()
        job.user = username
        job.action = cls.__action__

        params = {}
        cls.set_param(params, "file_upload_id", file_upload_id)
        cls.set_param(params, "attempts", 0)
        job.params = params

        return job
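
Note the dispatch order: when both a file and a URL are supplied, the file branch wins because it is tested first and the URL is silently ignored. That precedence, condensed to its skeleton:

    def choose_upload(filename, url):
        """Mirror the branch order above: a named file beats a URL."""
        if filename:   # file present with a non-empty name
            return "file"
        if url:
            return "url"
        return None

    assert choose_upload("articles.xml", "http://example.com/a.xml") == "file"
    assert choose_upload("", "http://example.com/a.xml") == "url"
    assert choose_upload("", "") is None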
Example #8
    def prepare(cls, username, **kwargs):
        """
        Take an arbitrary set of keyword arguments and return an instance of a BackgroundJob,
        or fail with a suitable exception

        :param kwargs: arbitrary keyword arguments pertaining to this task type
        :return: a BackgroundJob instance representing this task
        """

        super(JournalBulkEditBackgroundTask, cls).prepare(username, **kwargs)

        # first prepare a job record
        job = models.BackgroundJob()
        job.user = username
        job.action = cls.__action__

        refs = {}
        cls.set_reference(refs, "selection_query",
                          json.dumps(kwargs['selection_query']))
        job.reference = refs

        params = {}

        # get the named parameters we know may be there
        cls.set_param(params, 'ids', kwargs['ids'])
        if "note" in kwargs and kwargs[
                "note"] is not None and kwargs["note"] != "":
            cls.set_param(params, 'note', kwargs.get('note', ''))

        # get the metadata overwrites
        if "replacement_metadata" in kwargs:
            metadata = {}
            for k, v in kwargs["replacement_metadata"].items():
                if v is not None and v != "":
                    metadata[k] = v
            if len(metadata.keys()) > 0:
                cls.set_param(params, 'replacement_metadata',
                              json.dumps(metadata))

        if not cls._job_parameter_check(params):
            raise BackgroundException(
                "{}.prepare run without sufficient parameters".format(
                    cls.__name__))

        job.params = params

        # now ensure that we have the locks for all the journals
        # will raise an exception if this fails
        lock.batch_lock("journal",
                        kwargs['ids'],
                        username,
                        timeout=app.config.get("BACKGROUND_TASK_LOCK_TIMEOUT",
                                               3600))

        return job
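
The replacement-metadata handling keeps only keys whose values are neither None nor empty, and stores the surviving pairs as JSON in the job parameters. The same filter as a comprehension, with an illustrative input:

    import json

    replacement_metadata = {"publisher": "New Press", "country": "", "subject": None}
    metadata = {k: v for k, v in replacement_metadata.items() if v is not None and v != ""}
    assert metadata == {"publisher": "New Press"}
    param_value = json.dumps(metadata)  # what set_param ends up storing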
Example #9
    def run(self):
        """
        Execute the task as specified by the background_job
        :return:
        """
        job = self.background_job
        params = job.params

        if params is None:
            raise BackgroundException(
                u"IngestArticleBackgroundTask.run run without sufficient parameters"
            )

        file_upload_id = self.get_param(params, "file_upload_id")
        if file_upload_id is None:
            raise BackgroundException(
                u"IngestArticleBackgroundTask.run run without sufficient parameters"
            )

        file_upload = models.FileUpload.pull(file_upload_id)
        if file_upload is None:
            raise BackgroundException(
                u"IngestArticleBackgroundTask.run unable to find file upload with id {x}"
                .format(x=file_upload_id))

        try:
            # if the file "exists", this means its a remote file that needs to be downloaded, so do that
            if file_upload.status == "exists":
                job.add_audit_message(
                    u"Downloading file for file upload {x}, job {y}".format(
                        x=file_upload_id, y=job.id))
                self._download(file_upload)

            # if the file is validated, which happens once it has been uploaded or downloaded successfully, process it
            if file_upload.status == "validated":
                job.add_audit_message(
                    u"Importing file for file upload {x}, job {y}".format(
                        x=file_upload_id, y=job.id))
                self._process(file_upload)
        finally:
            file_upload.save()
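
This run method is a small state machine over file_upload.status: a remote file starts as "exists" and is downloaded, which (assuming a successful _download leaves the record "validated", as the second comment implies) lets processing happen in the same run; the finally block guarantees the record is saved either way. The flow in miniature:

    class FakeFileUpload:
        """Toy stand-in for models.FileUpload, tracking only the status field."""
        def __init__(self, status):
            self.status = status

    def download(fu):
        fu.status = "validated"   # assumed post-condition of a successful _download

    def process(fu):
        fu.status = "processed"

    fu = FakeFileUpload("exists")
    if fu.status == "exists":
        download(fu)
    if fu.status == "validated":  # reached in the same pass after a successful download
        process(fu)
    assert fu.status == "processed"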
Example #10
    def prepare(cls, username, **kwargs):
        """
        Take an arbitrary set of keyword arguments and return an instance of a BackgroundJob,
        or fail with a suitable exception

        :param kwargs: arbitrary keyword arguments pertaining to this task type
        :return: a BackgroundJob instance representing this task
        """

        base_url = app.config.get("BASE_URL")
        if base_url is None:
            raise BackgroundException("BASE_URL must be set in configuration before we can generate a sitemap")

        cdir = app.config.get("CACHE_DIR")
        if cdir is None:
            raise BackgroundException("You must set CACHE_DIR in the config")

        # first prepare a job record
        job = models.BackgroundJob()
        job.user = username
        job.action = cls.__action__
        return job
Example #11
    def prepare(cls, username, **kwargs):
        """
        Take an arbitrary set of keyword arguments and return an instance of a BackgroundJob,
        or fail with a suitable exception
        :param username: user who called this job
        :param kwargs: arbitrary keyword arguments pertaining to this task type
        :return: a BackgroundJob instance representing this task
        """

        if not app.config.get("ENABLE_EMAIL", False):
            raise BackgroundException("Email has been disabled in config. Set ENABLE_EMAIL to True to run this task.")

        # first prepare a job record
        job = models.BackgroundJob()
        job.user = username
        job.action = cls.__action__
        return job
Example #12
    def run(self):
        """
        Execute the task as specified by the background_job
        :return:
        """
        job = self.background_job
        params = job.params

        if not self._job_parameter_check(params):
            raise BackgroundException(u"{}.run run without sufficient parameters".format(self.__class__.__name__))

        # get the parameters for the job
        ids = self.get_param(params, 'ids')
        note = self.get_param(params, 'note')
        metadata = json.loads(self.get_param(params, 'replacement_metadata', "{}"))

        # if there is metadata, validate it
        if len(metadata) > 0:
            formdata = MultiDict(metadata)
            fc = formcontext.JournalFormFactory.get_form_context(
                role="bulk_edit",
                form_data=formdata
            )
            if not fc.validate():
                raise BackgroundException("Unable to validate replacement metadata: " + json.dumps(metadata))

        for journal_id in ids:
            updated = False

            j = models.Journal.pull(journal_id)

            if j is None:
                job.add_audit_message(u"Journal with id {} does not exist, skipping".format(journal_id))
                continue

            fc = formcontext.JournalFormFactory.get_form_context(role="admin", source=j)

            # turn on the "all fields optional" flag, so that bulk tasks don't cause errors that the user iterface
            # would allow you to bypass
            fc.form.make_all_fields_optional.data = True

            if "editor_group" in metadata:
                fc.form.editor.data = None
            elif j.editor_group is not None:
                # FIXME: this is a bit of a stop-gap, pending a more substantial referential-integrity-like solution
                # if the editor group is not being changed, validate that the editor is actually in the editor group,
                # and if not, unset them
                eg = models.EditorGroup.pull_by_key("name", j.editor_group)
                if eg is not None:
                    all_eds = eg.associates + [eg.editor]
                    if j.editor not in all_eds:
                        fc.form.editor.data = None
                else:
                    # if we didn't find the editor group, this is broken anyway, so reset the editor data anyway
                    fc.form.editor.data = None

            if "contact_email" in metadata:
                fc.form.confirm_contact_email.data = metadata["contact_email"]

            for k, v in metadata.items():
                job.add_audit_message(u"Setting {f} to {x} for journal {y}".format(f=k, x=v, y=journal_id))
                fc.form[k].data = v
                updated = True

            if note:
                job.add_audit_message(u"Adding note to for journal {y}".format(y=journal_id))
                fc.form.notes.append_entry(
                    {'date': datetime.now().strftime(app.config['DEFAULT_DATE_FORMAT']), 'note': note}
                )
                updated = True
            
            if updated:
                if fc.validate():
                    try:
                        fc.finalise()
                    except formcontext.FormContextException as e:
                        job.add_audit_message(u"Form context exception while bulk editing journal {} :\n{}".format(journal_id, e.message))
                else:
                    data_submitted = {}
                    for affected_field_name in fc.form.errors.keys():
                        affected_field = getattr(fc.form, affected_field_name,
                                                 ' Field {} does not exist on form. '.format(affected_field_name))
                        if isinstance(affected_field, str):  # ideally this should never happen; an error should not be reported on a field that is not present on the form
                            data_submitted[affected_field_name] = affected_field
                            continue

                        data_submitted[affected_field_name] = affected_field.data
                    job.add_audit_message(
                        u"Data validation failed while bulk editing journal {} :\n"
                        u"{}\n\n"
                        u"The data from the fields with the errors is:\n{}".format(
                            journal_id, json.dumps(fc.form.errors), json.dumps(data_submitted)
                        )
                    )
Example #13
    def run(self):
        """
        Execute the task as specified by the background_job
        :return:
        """
        job = self.background_job
        params = job.params

        clean = self.get_param(params, 'clean')
        prune = self.get_param(params, 'prune')
        types = self.get_param(params, 'types')

        tmpStore = StoreFactory.tmp()
        mainStore = StoreFactory.get("public_data_dump")
        container = app.config.get("STORE_PUBLIC_DATA_DUMP_CONTAINER")

        if clean:
            mainStore.delete_container(container)
            job.add_audit_message("Deleted existing data dump files")
            job.save()

        # create dir with today's date
        day_at_start = dates.today()

        # Do the search and save it
        page_size = app.config.get("DISCOVERY_BULK_PAGE_SIZE", 1000)
        records_per_file = app.config.get('DISCOVERY_RECORDS_PER_FILE', 100000)

        if types == 'all':
            types = ['article', 'journal']
        else:
            types = [types]

        urls = {"article": None, "journal": None}
        sizes = {"article": None, "journal": None}

        # Scroll for article and/or journal
        for typ in types:
            job.add_audit_message(dates.now() + ": Starting export of " + typ)
            job.save()

            out_dir = tmpStore.path(container,
                                    "doaj_" + typ + "_data_" + day_at_start,
                                    create_container=True,
                                    must_exist=False)
            out_name = os.path.basename(out_dir)
            zipped_name = out_name + ".tar.gz"
            zip_dir = os.path.dirname(out_dir)
            zipped_path = os.path.join(zip_dir, zipped_name)
            tarball = tarfile.open(zipped_path, "w:gz")

            file_num = 1
            out_file, path, filename = self._start_new_file(
                tmpStore, container, typ, day_at_start, file_num)

            first_in_file = True
            count = 0
            for result in DiscoveryApi.scroll(typ,
                                              None,
                                              None,
                                              page_size,
                                              scan=True):
                if not first_in_file:
                    out_file.write(",\n")
                else:
                    first_in_file = False
                out_file.write(json.dumps(result))
                count += 1

                if count >= records_per_file:
                    file_num += 1
                    self._finish_file(tmpStore, container, filename, path,
                                      out_file, tarball)
                    job.save()
                    out_file, path, filename = self._start_new_file(
                        tmpStore, container, typ, day_at_start, file_num)
                    first_in_file = True
                    count = 0

            if count > 0:
                self._finish_file(tmpStore, container, filename, path,
                                  out_file, tarball)
                job.save()

            tarball.close()

            # Copy the source directory to main store
            try:
                filesize = self._copy_on_complete(mainStore, tmpStore,
                                                  container, zipped_path)
                job.save()
            except Exception as e:
                tmpStore.delete_container(container)
                raise BackgroundException(
                    "Error copying {0} data on complete {1}\n".format(
                        typ, str(e)))

            store_url = mainStore.url(container, zipped_name)
            urls[typ] = store_url
            sizes[typ] = filesize

        if prune:
            self._prune_container(mainStore, container, day_at_start, types)
            job.save()

        self.background_job.add_audit_message(
            "Removing temp store container {x}".format(x=container))
        tmpStore.delete_container(container)

        # finally update the cache
        cache.Cache.cache_public_data_dump(urls["article"], sizes["article"],
                                           urls["journal"], sizes["journal"])

        job.add_audit_message(dates.now() + ": done")
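
The core of the export is the rotation logic: records are streamed out comma-separated, and after records_per_file of them the current file is finished and a fresh one started, with a final flush for the short tail. A self-contained sketch of that bookkeeping (files modelled as strings; the real _start_new_file/_finish_file also manage paths and the tarball):

    import json

    def export_rotated(records, records_per_file):
        """Serialize records into chunks, rotating when the size limit is hit."""
        files, current = [], []
        for record in records:
            current.append(json.dumps(record))
            if len(current) >= records_per_file:
                files.append(",\n".join(current))
                current = []
        if current:  # flush the final, possibly short, file
            files.append(",\n".join(current))
        return files

    files = export_rotated([{"id": i} for i in range(5)], records_per_file=2)
    assert len(files) == 3  # 2 + 2 + 1 records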
Example #14
    def __fail(record, previous, error):
        message = 'The URL could not be accessed; ' + error
        record.failed(message)
        record.save()
        previous.insert(0, record)
        raise BackgroundException(message)
Example #15
    def _file_upload(cls, username, f, schema, previous):
        # prep a record to go into the index, to record this upload
        record = models.FileUpload()
        record.upload(username, f.filename)
        record.set_id()

        # the file path that we are going to write to
        xml = os.path.join(app.config.get("UPLOAD_DIR", "."),
                           record.local_filename)

        # it's critical here that no errors cause files to get left behind unrecorded
        try:
            # write the incoming file out to the XML file
            f.save(xml)

            # save the index entry
            record.save()
        except:
            # if we can't record either of these things, we need to back right off
            try:
                file_failed(xml)
            except:
                pass
            try:
                record.delete()
            except:
                pass

            raise BackgroundException(
                "Failed to upload file - please contact an administrator")

        xwalk_name = app.config.get("ARTICLE_CROSSWALKS", {}).get(schema)
        xwalk = plugin.load_class(xwalk_name)()

        # now we have the record in the index and on disk, we can attempt to
        # validate it
        try:
            with open(xml) as handle:
                xwalk.validate_file(handle)
            record.validated(schema)
            record.save()
            previous.insert(0, record)
            return record.id

        except IngestException as e:
            record.failed(e.message, e.inner_message)
            try:
                file_failed(xml)
            except:
                pass
            record.save()
            previous.insert(0, record)
            raise BackgroundException("Failed to upload file: " + e.message +
                                      "; " + str(e.inner_message))
        except Exception as e:
            record.failed("File system error when reading file")
            try:
                file_failed(xml)
            except:
                pass
            record.save()
            previous.insert(0, record)
            raise BackgroundException(
                "Failed to upload file - please contact an administrator")
Example #16
    def run(self):
        """
        Execute the task as specified by the background_job
        :return:
        """
        job = self.background_job
        params = job.params

        ids = self.get_param(params, 'ids')
        editor_group = self.get_param(params, 'editor_group')
        note = self.get_param(params, 'note')
        application_status = self.get_param(params, 'application_status')

        if not self._job_parameter_check(params):
            raise BackgroundException(
                "{}.run run without sufficient parameters".format(
                    self.__class__.__name__))

        for suggestion_id in ids:
            updated = False

            s = models.Suggestion.pull(suggestion_id)

            if s is None:
                job.add_audit_message(
                    "Suggestion with id {} does not exist, skipping".format(
                        suggestion_id))
                continue

            fc = formcontext.ApplicationFormFactory.get_form_context(
                role="admin", source=s)

            if editor_group:
                job.add_audit_message(
                    "Setting editor_group to {x} for suggestion {y}".format(
                        x=str(editor_group), y=suggestion_id))

                # set the editor group
                f = fc.form.editor_group
                f.data = editor_group

                # clear the editor
                ed = fc.form.editor
                ed.data = None

                updated = True

            if note:
                job.add_audit_message(
                    "Adding note for suggestion {y}".format(y=suggestion_id))
                fc.form.notes.append_entry({
                    'date': datetime.now().strftime(app.config['DEFAULT_DATE_FORMAT']),
                    'note': note
                })
                updated = True

            if application_status:
                job.add_audit_message(
                    "Setting application_status to {x} for suggestion {y}".format(
                        x=str(application_status), y=suggestion_id))
                f = fc.form.application_status
                f.data = application_status
                updated = True

            if updated:
                if fc.validate():
                    try:
                        fc.finalise()
                    except formcontext.FormContextException as e:
                        job.add_audit_message(
                            "Form context exception while bulk editing suggestion {} :\n{}"
                            .format(suggestion_id, str(e)))
                else:
                    data_submitted = {}
                    for affected_field_name in list(fc.form.errors.keys()):
                        affected_field = getattr(
                            fc.form, affected_field_name,
                            ' Field {} does not exist on form. '.format(affected_field_name))
                        # ideally this should never happen: an error should not be
                        # reported on a field that is not present on the form
                        if isinstance(affected_field, str):
                            data_submitted[affected_field_name] = affected_field
                            continue

                        data_submitted[affected_field_name] = affected_field.data
                    job.add_audit_message(
                        "Data validation failed while bulk editing suggestion {} :\n"
                        "{}\n\n"
                        "The data from the fields with the errors is:\n{}".
                        format(suggestion_id, json.dumps(fc.form.errors),
                               json.dumps(data_submitted)))