def do_activity(self, data=None):
    """
    Look up the date of the article version handled by this run.

    Reads 'version' and 'filename_last_element' from the session for
    data['run'], asks self.get_version for the version date and, when no
    error is reported, stores that date in the session as 'update_date'.
    Progress and failures are reported through self.emit_monitor_event.

    :param data: activity input; its 'run' key selects the session values
    :returns: activity.activity.ACTIVITY_SUCCESS once the date is stored,
        activity.activity.ACTIVITY_PERMANENT_FAILURE on any error
    """
    # Bind everything the except handler reads before entering the try, so
    # an early failure (bad input, session lookup, file name parsing) is
    # reported as a failure instead of raising NameError from the handler.
    run = None
    version = None
    article_id = None

    try:
        run = data['run']
        session = Session(self.settings)
        version = session.get_value(run, 'version')
        filename = session.get_value(run, 'filename_last_element')

        article_structure = ArticleInfo(filename)
        article_id = article_structure.article_id

        version_date, error = self.get_version(self.settings, article_structure, article_id, version)

        if error is not None:
            self.logger.error(error)
            self.emit_monitor_event(self.settings, article_id, version, run,
                                    self.pretty_name, "error",
                                    " ".join(("Error Looking up version article", article_id,
                                              "message:", error)))
            return activity.activity.ACTIVITY_PERMANENT_FAILURE

        # Store the result before announcing the end, so an "end" event is
        # never followed by an "error" event for the same activity.
        session.store_value(run, 'update_date', version_date)

        self.emit_monitor_event(self.settings, article_id, version, run,
                                self.pretty_name, "end",
                                " ".join(("Finished Version Lookup for article", article_id,
                                          "version:", version)))

        return activity.activity.ACTIVITY_SUCCESS

    except Exception as e:
        self.logger.exception("Exception when trying to Lookup next version")
        # A monitor event is keyed on the article id; when the id was never
        # determined there is nothing to attach the event to, so only log.
        if article_id is not None:
            self.emit_monitor_event(self.settings, article_id, version, run, self.pretty_name,
                                    "error", " ".join(("Error looking up version for article",
                                                      str(article_id), "message:", str(e))))
        return activity.activity.ACTIVITY_PERMANENT_FAILURE
Пример #2
0
    def do_activity(self, data=None):
        """
        Look up the date of the article version handled by this run.

        Reads 'version' and 'filename_last_element' from the session for
        data['run'], asks self.get_version for the version date and, when no
        error is reported, stores that date in the session as 'update_date'.
        Progress and failures are reported through self.emit_monitor_event.

        :param data: activity input; its 'run' key selects the session values
        :returns: activity.activity.ACTIVITY_SUCCESS once the date is stored,
            activity.activity.ACTIVITY_PERMANENT_FAILURE on any error
        """
        # These names are read by the except handler; binding them first
        # keeps the handler from raising NameError when the failure happens
        # before the article has been identified.
        run = None
        version = None
        article_id = None

        try:
            run = data['run']
            session = Session(self.settings)
            version = session.get_value(run, 'version')
            filename = session.get_value(run, 'filename_last_element')

            article_structure = ArticleInfo(filename)
            article_id = article_structure.article_id

            version_date, error = self.get_version(
                self.settings, article_structure, article_id, version)

            if error is not None:
                self.logger.error(error)
                self.emit_monitor_event(
                    self.settings, article_id, version, run,
                    self.pretty_name, "error", " ".join(
                        ("Error Looking up version article",
                         article_id, "message:", error)))
                return activity.activity.ACTIVITY_PERMANENT_FAILURE

            # Persist the date first: the "end" event should only be sent
            # once the activity's work has actually been completed.
            session.store_value(run, 'update_date', version_date)

            self.emit_monitor_event(
                self.settings, article_id, version, run,
                self.pretty_name, "end", " ".join(
                    ("Finished Version Lookup for article",
                     article_id, "version:", version)))

            return activity.activity.ACTIVITY_SUCCESS

        except Exception as e:
            self.logger.exception(
                "Exception when trying to Lookup next version")
            # Without an article id there is no article to report against,
            # so the failure is only logged.
            if article_id is not None:
                self.emit_monitor_event(
                    self.settings, article_id, version, run,
                    self.pretty_name, "error", " ".join(
                        ("Error looking up version for article",
                         str(article_id), "message:", str(e))))
            return activity.activity.ACTIVITY_PERMANENT_FAILURE
Пример #3
0
    def do_activity(self, data=None):
        """
        Determine the version of the incoming article and record it.

        The last path element of the notified file name is stored in the
        session as 'filename_last_element'; the version returned by
        self.get_version is stored as 'version'. Start/end monitor events and
        the 'article-id' and 'publication-status' monitor properties are
        emitted along the way.

        :param data: activity input; the keys 'run' and
            'version_lookup_function' are read here
        :returns: activity.activity.ACTIVITY_SUCCESS, or
            activity.activity.ACTIVITY_PERMANENT_FAILURE if anything raises
        """
        try:
            notification = S3NotificationInfo.from_dict(data)
            # keep only what follows the final '/', i.e. the bare file name
            filename = notification.file_name.rsplit('/', 1)[-1]

            session = Session(self.settings)
            run = data['run']
            session.store_value(run, 'filename_last_element', filename)

            article_structure = ArticleInfo(filename)
            article_id = article_structure.article_id
            if article_id is None:
                self.logger.error("Name '%s' did not match expected pattern for article id" % filename)
                raise RuntimeError("article_structure.article_id is None. File pattern problem.")

            version = self.get_version(self.settings, article_structure,
                                       data['version_lookup_function'])
            session.store_value(run, 'version', version)

            start_message = " ".join(("Version Lookup for article", article_id, "version:", version))
            self.emit_monitor_event(self.settings, article_id, version, run,
                                    self.pretty_name, "start", start_message)

            self.set_monitor_property(self.settings, article_id, "article-id", article_id, "text")
            self.set_monitor_property(self.settings, article_id, "publication-status",
                                      "publication in progress", "text", version=version)

            end_message = " ".join(("Finished Version Lookup for article", article_id,
                                    "version:", version))
            self.emit_monitor_event(self.settings, article_id, version, run,
                                    self.pretty_name, "end", end_message)
        except Exception as e:
            self.logger.exception("Exception when trying to Lookup Version. Error: " + str(e))
            return activity.activity.ACTIVITY_PERMANENT_FAILURE

        return activity.activity.ACTIVITY_SUCCESS
    def do_activity(self, data=None):
        """
        Expand the notified article zip into the expanded bucket.

        The zip is downloaded into the activity tmp dir, extracted, and every
        top-level regular file whose name does not start with '.' or '_' is
        uploaded under '<article_id>.<version>/<run>'. The session values
        'article_id', 'file_name', 'article_version_id', 'run', 'status' and
        'expanded_folder' are stored for the run.

        :param data: activity input; 'run' is read directly, the rest is
            parsed by S3NotificationInfo.from_dict
        :returns: True on success,
            activity.activity.ACTIVITY_PERMANENT_FAILURE when the status in
            the file name is not 'vor'/'poa' or when expansion raises
        """
        run = data['run']

        if self.logger:
            self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4))
        info = S3NotificationInfo.from_dict(data)

        storage_context = StorageContext(self.settings)
        session = Session(self.settings)

        filename_last_element = session.get_value(run, 'filename_last_element')
        # zip name contains version information for previously archived zip files
        article_structure = ArticleInfo(filename_last_element)
        article_id = article_structure.article_id
        session.store_value(run, 'article_id', article_id)
        session.store_value(run, 'file_name', info.file_name)

        if self.logger:
            self.logger.info("Expanding file %s" % info.file_name)

        version = session.get_value(run, 'version')

        status = article_structure.status
        if status not in ('vor', 'poa'):
            # status could not be determined, exit workflow.
            if self.logger:
                self.logger.error("Name '%s' did not match expected pattern for status" %
                                  filename_last_element)
            return activity.activity.ACTIVITY_PERMANENT_FAILURE

        article_version_id = article_id + '.' + version
        session.store_value(run, 'article_version_id', article_version_id)
        session.store_value(run, 'run', run)
        session.store_value(run, 'status', status)
        self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "start",
                                "Starting expansion of article " + article_id)

        try:
            # download zip to temp folder
            tmp = self.get_tmp_dir()
            storage_resource_origin = (self.settings.storage_provider + "://" +
                                       info.bucket_name + "/" + info.file_name)
            local_zip_file = self.open_file_from_tmp_dir(filename_last_element, mode='wb')
            try:
                storage_context.get_resource_to_file(storage_resource_origin, local_zip_file)
            finally:
                # close the handle even when the download fails
                local_zip_file.close()

            # extract zip contents
            content_folder = path.join(tmp, article_version_id, run)
            makedirs(content_folder)
            with ZipFile(path.join(tmp, filename_last_element)) as zf:
                zf.extractall(content_folder)

            # only top-level regular files, skipping hidden ('.') and '_' entries
            upload_filenames = [f for f in listdir(content_folder)
                                if isfile(join(content_folder, f))
                                and not f.startswith(('.', '_'))]

            bucket_folder_name = article_version_id + '/' + run
            storage_dest_prefix = (self.settings.storage_provider + "://" +
                                   self.settings.publishing_buckets_prefix +
                                   self.settings.expanded_bucket + "/" +
                                   bucket_folder_name + '/')
            for filename in upload_filenames:
                storage_context.set_resource_from_filename(storage_dest_prefix + filename,
                                                           path.join(content_folder, filename))

            self.clean_tmp_dir()

            session.store_value(run, 'expanded_folder', bucket_folder_name)
            self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article",
                                    "end", "Finished expansion of article " + article_id +
                                    " for version " + version + " run " + str(run) +
                                    " into " + bucket_folder_name)
        except Exception as e:
            if self.logger:
                self.logger.exception("Exception when expanding article")
            # str(e) rather than e.message: the latter is deprecated and is
            # empty for many exception types (e.g. IOError), losing the cause.
            self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article",
                                    "error", "Error expanding article " + article_id +
                                    " message:" + str(e))
            return activity.activity.ACTIVITY_PERMANENT_FAILURE

        return True
    def do_activity(self, data=None):
        """
        Convert the expanded article XML to EIF JSON and upload it.

        Session values ('version', 'article_id', 'run', 'expanded_folder' and
        the optional 'update_date') are read using self.get_workflowId() as
        the session key. The XML key returned by self.get_article_xml_key is
        scraped with jats_scraper.scrape and the JSON is written to the EIF
        bucket under '<article_id>.<version>/<run>/'. 'eif_filename' and
        'article_path' are stored in the session afterwards.

        :param data: activity input; only logged here
        :returns: True on success, False when the article XML cannot be found
            or the conversion raises
        """
        workflow_id = self.get_workflowId()
        session = Session(self.settings)
        version = session.get_value(workflow_id, 'version')
        article_id = session.get_value(workflow_id, 'article_id')
        article_version_id = article_id + '.' + version
        run = session.get_value(workflow_id, 'run')

        self.emit_monitor_event(self.settings, article_id, version, run, "Convert JATS", "start",
                                "Starting conversion of article xml to EIF for " + article_id)

        try:
            if self.logger:
                self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4))
            expanded_folder_name = session.get_value(workflow_id, 'expanded_folder')
            expanded_folder_bucket = self.settings.publishing_buckets_prefix + self.settings.expanded_bucket
            # previously a bare debug print; routed through the logger instead
            if self.logger:
                self.logger.info("Expanded folder %s" % expanded_folder_name)

            conn = S3Connection(self.settings.aws_access_key_id, self.settings.aws_secret_access_key)
            bucket = conn.get_bucket(expanded_folder_bucket)

            (xml_key, xml_filename) = self.get_article_xml_key(bucket, expanded_folder_name)
            if xml_key is None:
                if self.logger:
                    self.logger.error("Article XML path not found")
                return False

            if self.logger:
                self.logger.info("Converting file %s" % xml_filename)

            xml = xml_key.get_contents_as_string()
            if self.logger:
                self.logger.info("Downloaded contents of file %s" % xml_filename)

            json_output = jats_scraper.scrape(xml, article_version=version)

            # The update date is optional: a failed session lookup is treated
            # the same as "no update date". Only Exception is caught so that
            # KeyboardInterrupt / SystemExit still propagate.
            try:
                update_date = session.get_value(workflow_id, 'update_date')
            except Exception:
                update_date = None
            if update_date:
                json_output = self.add_update_date_to_json(json_output, update_date, xml_filename)

            if self.logger:
                self.logger.info("Scraped file %s" % xml_filename)

            output_bucket = self.settings.publishing_buckets_prefix + self.settings.eif_bucket
            output_key = (article_version_id + '/' + run + '/' +
                          xml_filename.replace('.xml', '.json'))
            destination_key = Key(conn.get_bucket(output_bucket))
            destination_key.key = output_key
            destination_key.set_contents_from_string(json_output)

            if self.logger:
                self.logger.info("Uploaded key %s to %s" % (output_key, output_bucket))

            self.set_dashboard_properties(json_output, article_id, version)

            session.store_value(workflow_id, "eif_filename", output_key)
            eif_object = json.loads(json_output)
            session.store_value(workflow_id, 'article_path', eif_object.get('path'))
            self.emit_monitor_event(self.settings, article_id, version, run, "Convert JATS", "end",
                                    "XML converted to EIF for article " + article_id + " to " + output_key)

        except Exception as e:
            if self.logger:
                self.logger.exception("Exception when converting article XML to EIF")
            # str(e) rather than the deprecated e.message, which is empty for
            # many exception types and would hide the cause.
            self.emit_monitor_event(self.settings, article_id, version, run, "Convert JATS", "error",
                                    "Error in conversion of article xml to EIF for " + article_id +
                                    " message:" + str(e))
            return False

        return True
    def do_activity(self, data=None):
        """
        Expand the notified article zip into the expanded bucket.

        The article id, version and status are parsed from the notified file
        name ('elife-<id>-<status>-...'; an optional '-v<digits>' gives the
        version, otherwise self.get_next_version is asked). The zip is
        downloaded to the tmp dir, extracted, and every top-level regular
        file whose name does not start with '.' or '_' is uploaded under
        '<article_id>.<version>/<run>', where run is a fresh uuid4 string.
        Session values are stored using self.get_workflowId() as the key.

        :param data: activity input, parsed by S3NotificationInfo.from_dict
        :returns: True on success, False when the file name cannot be parsed,
            the version cannot be determined, or expansion raises
        """
        if self.logger:
            self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4))
        info = S3NotificationInfo.from_dict(data)

        # set up required connections
        conn = S3Connection(self.settings.aws_access_key_id, self.settings.aws_secret_access_key)
        source_bucket = conn.get_bucket(info.bucket_name)
        dest_bucket = conn.get_bucket(self.settings.publishing_buckets_prefix + self.settings.expanded_bucket)
        session = Session(self.settings)
        workflow_id = self.get_workflowId()

        article_id_match = re.match(r'elife-(.*?)-', info.file_name)
        if article_id_match is None:
            # no article id means no event can be emitted; exit workflow.
            if self.logger:
                self.logger.error("Name '%s' did not match expected pattern for article id" %
                                  info.file_name)
            return False
        article_id = article_id_match.group(1)
        session.store_value(workflow_id, 'article_id', article_id)

        if self.logger:
            self.logger.info("Expanding file %s" % info.file_name)

        # zip name contains version information for previously archived zip files.
        # At least one digit is required, and the terminator is '.' or '-'
        # ('|' inside a character class is a literal, not an alternation).
        version_match = re.search(r'-v([0-9]+)[.-]', info.file_name)
        if version_match is not None:
            version = version_match.group(1)
        else:
            version = self.get_next_version(article_id)
        if version == '-1':
            return False  # version could not be determined, exit workflow. Can't emit event as no version.

        # status is the third '-'-separated element of the file name
        status_match = re.search(r'.*?-.*?-(.*?)-', info.file_name)
        status = status_match.group(1) if status_match is not None else None
        if status is None:
            return False  # status could not be determined, exit workflow.

        run = str(uuid.uuid4())
        # store version for other activities in this workflow execution
        session.store_value(workflow_id, 'version', version)

        # TODO : extract and store updated date if supplied

        article_version_id = article_id + '.' + version
        session.store_value(workflow_id, 'article_version_id', article_version_id)
        session.store_value(workflow_id, 'run', run)
        session.store_value(workflow_id, 'status', status)
        self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "start",
                                "Starting expansion of article " + article_id)
        self.set_monitor_property(self.settings, article_id, "article_id", article_id, "text")

        try:
            # download zip to temp folder
            tmp = self.get_tmp_dir()
            key = Key(source_bucket)
            key.key = info.file_name
            local_zip_file = self.open_file_from_tmp_dir(info.file_name, mode='wb')
            try:
                key.get_contents_to_file(local_zip_file)
            finally:
                # close the handle even when the download fails
                local_zip_file.close()

            bucket_folder_name = article_version_id + '/' + run

            # extract zip contents
            content_folder = path.join(tmp, article_version_id, run)
            makedirs(content_folder)
            with ZipFile(path.join(tmp, info.file_name)) as zf:
                zf.extractall(content_folder)

            # TODO : rename files (versions!)

            # TODO : edit xml and rename references

            # only top-level regular files, skipping hidden ('.') and '_' entries
            upload_filenames = [f for f in listdir(content_folder)
                                if isfile(join(content_folder, f))
                                and not f.startswith(('.', '_'))]

            for filename in upload_filenames:
                k = Key(dest_bucket)
                k.key = bucket_folder_name + '/' + filename
                k.set_contents_from_filename(path.join(content_folder, filename))

            session.store_value(workflow_id, 'expanded_folder', bucket_folder_name)
            self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "end",
                                    "Finished expansion of article " + article_id +
                                    " for version " + version + " run " + str(run) + " into " + bucket_folder_name)
        except Exception as e:
            if self.logger:
                self.logger.exception("Exception when expanding article")
            # str(e) rather than the deprecated e.message, which is empty for
            # many exception types (e.g. IOError) and would hide the cause.
            self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "error",
                                    "Error expanding article " + article_id + " message:" + str(e))
            return False

        return True
Пример #7
0
    def do_activity(self, data=None):
        """
        Convert the expanded article XML to EIF JSON and upload it.

        Session values ('version', 'article_id', 'expanded_folder' and
        'update_date') are read for data['run']. The XML key returned by
        self.get_article_xml_key is scraped with jats_scraper.scrape and the
        JSON is written to the EIF bucket under
        '<article_id>.<version>/<run>/'. 'eif_location' and 'article_path'
        are stored in the session afterwards.

        :param data: activity input; its 'run' key selects the session values
        :returns: True on success, False when the article XML cannot be found
            or the conversion raises
        """
        run = data['run']
        session = Session(self.settings)
        version = session.get_value(run, 'version')
        article_id = session.get_value(run, 'article_id')
        article_version_id = article_id + '.' + version

        self.emit_monitor_event(
            self.settings, article_id, version, run, "Convert JATS", "start",
            "Starting conversion of article xml to EIF for " + article_id)

        try:
            if self.logger:
                self.logger.info('data: %s' %
                                 json.dumps(data, sort_keys=True, indent=4))
            expanded_folder_name = session.get_value(run, 'expanded_folder')
            expanded_folder_bucket = (self.settings.publishing_buckets_prefix +
                                      self.settings.expanded_bucket)

            conn = S3Connection(self.settings.aws_access_key_id,
                                self.settings.aws_secret_access_key)
            bucket = conn.get_bucket(expanded_folder_bucket)

            (xml_key, xml_filename) = self.get_article_xml_key(
                bucket, expanded_folder_name)
            if xml_key is None:
                if self.logger:
                    self.logger.error("Article XML path not found")
                return False

            if self.logger:
                self.logger.info("Converting file %s" % xml_filename)

            xml = xml_key.get_contents_as_string()
            if self.logger:
                self.logger.info("Downloaded contents of file %s" %
                                 xml_filename)

            json_output = jats_scraper.scrape(xml, article_version=version)

            # Add update date if it is in the session
            update_date = session.get_value(run, 'update_date')
            if update_date:
                json_output = self.add_update_date_to_json(
                    json_output, update_date, xml_filename)

            if self.logger:
                self.logger.info("Scraped file %s" % xml_filename)

            output_bucket = (self.settings.publishing_buckets_prefix +
                             self.settings.eif_bucket)
            output_key = (article_version_id + '/' + run + '/' +
                          xml_filename.replace('.xml', '.json'))
            destination_key = Key(conn.get_bucket(output_bucket))
            destination_key.key = output_key
            destination_key.set_contents_from_string(json_output)

            if self.logger:
                self.logger.info("Uploaded key %s to %s" %
                                 (output_key, output_bucket))

            self.set_dashboard_properties(json_output, article_id, version)

            session.store_value(run, "eif_location", output_key)
            eif_object = json.loads(json_output)
            session.store_value(run, 'article_path', eif_object.get('path'))
            self.emit_monitor_event(
                self.settings, article_id, version, run, "Convert JATS", "end",
                "XML converted to EIF for article " + article_id + " to " +
                output_key)

        except Exception as e:
            if self.logger:
                self.logger.exception(
                    "Exception when converting article XML to EIF")
            # str(e) rather than the deprecated e.message, which is empty for
            # many exception types and would hide the cause.
            self.emit_monitor_event(
                self.settings, article_id, version, run, "Convert JATS",
                "error", "Error in conversion of article xml to EIF for " +
                article_id + " message:" + str(e))
            return False

        return True
Пример #8
0
    def do_activity(self, data=None):
        """
        Expand the notified article zip into the expanded bucket.

        The zip is downloaded into the activity tmp dir, extracted, and every
        top-level regular file whose name does not start with '.' or '_' is
        passed to self.check_filenames and then uploaded under
        '<article_id>.<version>/<run>'. The session values 'article_id',
        'file_name', 'article_version_id', 'run', 'status' and
        'expanded_folder' are stored for the run.

        :param data: activity input; 'run' is read directly, the rest is
            parsed by S3NotificationInfo.from_dict
        :returns: True on success,
            activity.activity.ACTIVITY_PERMANENT_FAILURE when the status in
            the file name is not 'vor'/'poa' or when expansion raises
        """
        run = data['run']

        if self.logger:
            self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4))
        info = S3NotificationInfo.from_dict(data)

        storage_context = StorageContext(self.settings)
        session = Session(self.settings)

        filename_last_element = session.get_value(run, 'filename_last_element')
        # zip name contains version information for previously archived zip files
        article_structure = ArticleInfo(filename_last_element)
        article_id = article_structure.article_id
        session.store_value(run, 'article_id', article_id)
        session.store_value(run, 'file_name', info.file_name)

        if self.logger:
            self.logger.info("Expanding file %s" % info.file_name)

        version = session.get_value(run, 'version')

        status = article_structure.status
        if status not in ('vor', 'poa'):
            # status could not be determined, exit workflow.
            if self.logger:
                self.logger.error("Name '%s' did not match expected pattern for status" %
                                  filename_last_element)
            return activity.activity.ACTIVITY_PERMANENT_FAILURE

        article_version_id = article_id + '.' + version
        session.store_value(run, 'article_version_id', article_version_id)
        session.store_value(run, 'run', run)
        session.store_value(run, 'status', status)
        self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "start",
                                "Starting expansion of article " + article_id)

        try:
            # download zip to temp folder
            tmp = self.get_tmp_dir()
            storage_resource_origin = (self.settings.storage_provider + "://" +
                                       info.bucket_name + "/" + info.file_name)
            local_zip_file = self.open_file_from_tmp_dir(filename_last_element, mode='wb')
            try:
                storage_context.get_resource_to_file(storage_resource_origin, local_zip_file)
            finally:
                # close the handle even when the download fails
                local_zip_file.close()

            # extract zip contents
            content_folder = path.join(tmp, article_version_id, run)
            makedirs(content_folder)
            with ZipFile(path.join(tmp, filename_last_element)) as zf:
                zf.extractall(content_folder)

            # only top-level regular files, skipping hidden ('.') and '_' entries
            upload_filenames = [f for f in listdir(content_folder)
                                if isfile(join(content_folder, f))
                                and not f.startswith(('.', '_'))]
            self.check_filenames(upload_filenames)

            bucket_folder_name = article_version_id + '/' + run
            storage_dest_prefix = (self.settings.storage_provider + "://" +
                                   self.settings.publishing_buckets_prefix +
                                   self.settings.expanded_bucket + "/" +
                                   bucket_folder_name + '/')
            for filename in upload_filenames:
                storage_context.set_resource_from_filename(storage_dest_prefix + filename,
                                                           path.join(content_folder, filename))

            self.clean_tmp_dir()

            session.store_value(run, 'expanded_folder', bucket_folder_name)
            self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article",
                                    "end", "Finished expansion of article " + article_id +
                                    " for version " + version + " run " + str(run) +
                                    " into " + bucket_folder_name)
        except Exception as e:
            if self.logger:
                self.logger.exception("Exception when expanding article")
            # str(e) rather than e.message: the latter is deprecated and is
            # empty for many exception types (e.g. IOError), losing the cause.
            self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article",
                                    "error", "Error expanding article " + article_id +
                                    " message:" + str(e))
            return activity.activity.ACTIVITY_PERMANENT_FAILURE

        return True
Пример #9
0
    def do_activity(self, data=None):
        """
        Convert the expanded article XML to EIF JSON and upload it.

        Session values ('version', 'article_id', 'run' and 'expanded_folder')
        are read using self.get_workflowId() as the session key. The XML key
        returned by self.get_article_xml_key is scraped with
        jats_scraper.scrape and the JSON is written to the EIF bucket under
        '<article_id>.<version>/<run>/'. 'eif_filename' and 'article_path'
        are stored in the session afterwards.

        :param data: activity input; only logged here
        :returns: True on success, False when the article XML cannot be found
            or the conversion raises
        """
        workflow_id = self.get_workflowId()
        session = Session(self.settings)
        version = session.get_value(workflow_id, "version")
        article_id = session.get_value(workflow_id, "article_id")
        article_version_id = article_id + "." + version
        run = session.get_value(workflow_id, "run")

        self.emit_monitor_event(
            self.settings,
            article_id,
            version,
            run,
            "Convert JATS",
            "start",
            "Starting conversion of article xml to EIF for " + article_id,
        )

        try:
            if self.logger:
                self.logger.info("data: %s" % json.dumps(data, sort_keys=True, indent=4))
            expanded_folder_name = session.get_value(workflow_id, "expanded_folder")
            expanded_folder_bucket = self.settings.publishing_buckets_prefix + self.settings.expanded_bucket
            # previously a bare debug print; routed through the logger instead
            if self.logger:
                self.logger.info("Expanded folder %s" % expanded_folder_name)

            conn = S3Connection(self.settings.aws_access_key_id, self.settings.aws_secret_access_key)
            bucket = conn.get_bucket(expanded_folder_bucket)

            (xml_key, xml_filename) = self.get_article_xml_key(bucket, expanded_folder_name)
            if xml_key is None:
                if self.logger:
                    self.logger.error("Article XML path not found")
                return False

            if self.logger:
                self.logger.info("Converting file %s" % xml_filename)

            xml = xml_key.get_contents_as_string()
            if self.logger:
                self.logger.info("Downloaded contents of file %s" % xml_filename)

            json_output = jats_scraper.scrape(xml)

            if self.logger:
                self.logger.info("Scraped file %s" % xml_filename)

            output_bucket = self.settings.publishing_buckets_prefix + self.settings.eif_bucket
            output_key = article_version_id + "/" + run + "/" + xml_filename.replace(".xml", ".json")
            destination_key = Key(conn.get_bucket(output_bucket))
            destination_key.key = output_key
            destination_key.set_contents_from_string(json_output)

            if self.logger:
                self.logger.info("Uploaded key %s to %s" % (output_key, output_bucket))

            session.store_value(workflow_id, "eif_filename", output_key)
            eif_object = json.loads(json_output)
            session.store_value(workflow_id, "article_path", eif_object.get("path"))
            # Reported as "Convert JATS"/"end" so the completion event pairs
            # with the "Convert JATS" start and error events of this activity
            # (it was previously sent under "Post EIF"/"success").
            self.emit_monitor_event(
                self.settings,
                article_id,
                version,
                run,
                "Convert JATS",
                "end",
                "XML converted to EIF for article " + article_id + " to " + output_key,
            )

        except Exception as e:
            if self.logger:
                self.logger.exception("Exception when converting article XML to EIF")
            # str(e) rather than the deprecated e.message, which is empty for
            # many exception types and would hide the cause.
            self.emit_monitor_event(
                self.settings,
                article_id,
                version,
                run,
                "Convert JATS",
                "error",
                "Error in conversion of article xml to EIF for " + article_id + " message:" + str(e),
            )
            return False

        return True