def do_activity(self, data=None):
    try:
        session = Session(self.settings)
        version = session.get_value(data['run'], 'version')
        filename = session.get_value(data['run'], 'filename_last_element')
        article_structure = ArticleInfo(filename)
        version_date, error = self.get_version(
            self.settings, article_structure, article_structure.article_id, version)
        if error is not None:
            self.logger.error(error)
            self.emit_monitor_event(
                self.settings, article_structure.article_id, version, data['run'],
                self.pretty_name, "error",
                " ".join(("Error looking up version for article",
                          article_structure.article_id, "message:", error)))
            return activity.activity.ACTIVITY_PERMANENT_FAILURE
        self.emit_monitor_event(
            self.settings, article_structure.article_id, version, data['run'],
            self.pretty_name, "end",
            " ".join(("Finished Version Lookup for article",
                      article_structure.article_id, "version:", version)))
        session.store_value(data['run'], 'update_date', version_date)
        return activity.activity.ACTIVITY_SUCCESS
    except Exception as e:
        self.logger.exception("Exception when trying to look up next version")
        self.emit_monitor_event(
            self.settings, article_structure.article_id, version, data['run'],
            self.pretty_name, "error",
            " ".join(("Error looking up version for article",
                      article_structure.article_id, "message:", str(e))))
        return activity.activity.ACTIVITY_PERMANENT_FAILURE
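# --- Illustrative aside, not part of the activity code above ---
# A minimal sketch of the run-scoped key/value contract these activities assume
# from Session: values are stored and fetched per run identifier via
# store_value(run, key, value) and get_value(run, key). The real Session is
# constructed with the bot settings and backed by persistent storage; this
# in-memory stand-in only mirrors the interface for illustration.
class SessionSketch(object):
    def __init__(self):
        self._store = {}

    def store_value(self, run, key, value):
        # namespace every value by the workflow run it belongs to
        self._store[(run, key)] = value

    def get_value(self, run, key):
        return self._store.get((run, key))

session = SessionSketch()
session.store_value('example-run', 'version', '1')
print(session.get_value('example-run', 'version'))  # '1'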
def do_activity(self, data=None):
    try:
        info = S3NotificationInfo.from_dict(data)
        filename = info.file_name[info.file_name.rfind('/') + 1:]
        session = Session(self.settings)
        session.store_value(data['run'], 'filename_last_element', filename)
        article_structure = ArticleInfo(filename)
        if article_structure.article_id is None:
            self.logger.error(
                "Name '%s' did not match expected pattern for article id" % filename)
            raise RuntimeError(
                "article_structure.article_id is None. File pattern problem.")
        version = self.get_version(
            self.settings, article_structure, data['version_lookup_function'])
        session.store_value(data['run'], 'version', version)
        article_id = article_structure.article_id
        self.emit_monitor_event(
            self.settings, article_id, version, data['run'], self.pretty_name,
            "start",
            " ".join(("Version Lookup for article", article_id,
                      "version:", version)))
        self.set_monitor_property(
            self.settings, article_id, "article-id", article_id, "text")
        self.set_monitor_property(
            self.settings, article_id, "publication-status",
            "publication in progress", "text", version=version)
        self.emit_monitor_event(
            self.settings, article_id, version, data['run'], self.pretty_name,
            "end",
            " ".join(("Finished Version Lookup for article", article_id,
                      "version:", version)))
        return activity.activity.ACTIVITY_SUCCESS
    except Exception as e:
        self.logger.exception(
            "Exception when trying to look up version. Error: " + str(e))
        return activity.activity.ACTIVITY_PERMANENT_FAILURE
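# --- Illustrative aside, not part of the activity code above ---
# A hedged sketch of the zip-filename convention ArticleInfo is assumed to parse,
# reconstructed from the regexes used in the older boto-based expander later in
# this section; parse_zip_filename is a hypothetical helper, not the real class.
import re

def parse_zip_filename(filename):
    # article id: the digits following the 'elife-' prefix
    id_match = re.match(r'elife-(.*?)-', filename)
    article_id = id_match.group(1) if id_match else None
    # status: the third dash-delimited token, expected to be 'vor' or 'poa'
    status_match = re.search(r'.*?-.*?-(.*?)-', filename)
    status = status_match.group(1) if status_match else None
    # version: a '-v<digits>' token before a dot or dash, present in archived zips
    version_match = re.search(r'-v([0-9]*?)[\.|-]', filename)
    version = version_match.group(1) if version_match else None
    return article_id, status, version

print(parse_zip_filename('elife-00353-vor-v1.zip'))  # ('00353', 'vor', '1')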
def do_activity(self, data=None): """ Do the work """ run = data['run'] if self.logger: self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4)) info = S3NotificationInfo.from_dict(data) storage_context = StorageContext(self.settings) session = Session(self.settings) filename_last_element = session.get_value(run, 'filename_last_element') # zip name contains version information for previously archived zip files article_structure = ArticleInfo(filename_last_element) article_id = article_structure.article_id session.store_value(run, 'article_id', article_id) session.store_value(run, 'file_name', info.file_name) if self.logger: self.logger.info("Expanding file %s" % info.file_name) version = session.get_value(run, 'version') status = article_structure.status if status is None or (status != 'vor' and status != 'poa'): self.logger.error("Name '%s' did not match expected pattern for status" % filename_last_element) return activity.activity.ACTIVITY_PERMANENT_FAILURE # status could not be determined, exit workflow. article_version_id = article_id + '.' + version session.store_value(run, 'article_version_id', article_version_id) session.store_value(run, 'run', run) session.store_value(run, 'status', status) self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "start", "Starting expansion of article " + article_id) try: # download zip to temp folder tmp = self.get_tmp_dir() local_zip_file = self.open_file_from_tmp_dir(filename_last_element, mode='wb') storage_resource_origin = self.settings.storage_provider + "://" + info.bucket_name + "/" + info.file_name storage_context.get_resource_to_file(storage_resource_origin, local_zip_file) local_zip_file.close() # extract zip contents folder_name = path.join(article_version_id, run) content_folder = path.join(tmp, folder_name) makedirs(content_folder) with ZipFile(path.join(tmp, filename_last_element)) as zf: zf.extractall(content_folder) upload_filenames = [] for f in listdir(content_folder): if isfile(join(content_folder, f)) and f[0] != '.' and not f[0] == '_': upload_filenames.append(f) bucket_folder_name = article_version_id + '/' + run for filename in upload_filenames: source_path = path.join(content_folder, filename) dest_path = bucket_folder_name + '/' + filename storage_resource_dest = self.settings.storage_provider + "://" + self.settings.publishing_buckets_prefix + \ self.settings.expanded_bucket + "/" + dest_path storage_context.set_resource_from_filename(storage_resource_dest, source_path) self.clean_tmp_dir() session.store_value(run, 'expanded_folder', bucket_folder_name) self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "end", "Finished expansion of article " + article_id + " for version " + version + " run " + str(run) + " into " + bucket_folder_name) except Exception as e: self.logger.exception("Exception when expanding article") self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "error", "Error expanding article " + article_id + " message:" + e.message) return activity.activity.ACTIVITY_PERMANENT_FAILURE return True
def do_activity(self, data=None): """ Do the work """ session = Session(self.settings) version = session.get_value(self.get_workflowId(), 'version') article_id = session.get_value(self.get_workflowId(), 'article_id') article_version_id = article_id + '.' + version run = session.get_value(self.get_workflowId(), 'run') self.emit_monitor_event(self.settings, article_id, version, run, "Convert JATS", "start", "Starting conversion of article xml to EIF for " + article_id) try: if self.logger: self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4)) expanded_folder_name = session.get_value(self.get_workflowId(), 'expanded_folder') expanded_folder_bucket = self.settings.publishing_buckets_prefix + self.settings.expanded_bucket print expanded_folder_name conn = S3Connection(self.settings.aws_access_key_id, self.settings.aws_secret_access_key) bucket = conn.get_bucket(expanded_folder_bucket) bucket_folder_name = expanded_folder_name (xml_key, xml_filename) = self.get_article_xml_key(bucket, bucket_folder_name) if xml_key is None: self.logger.error("Article XML path not found") return False if self.logger: self.logger.info("Converting file %s" % xml_filename) xml = xml_key.get_contents_as_string() if self.logger: self.logger.info("Downloaded contents of file %s" % xml_filename) json_output = jats_scraper.scrape(xml, article_version=version) # Add update date if it is in the session update_date = None try: update_date = session.get_value(self.get_workflowId(), 'update_date') except: update_date = None if update_date: json_output = self.add_update_date_to_json(json_output, update_date, xml_filename) if self.logger: self.logger.info("Scraped file %s" % xml_filename) output_folder = article_version_id + '/' + run output_name = xml_filename.replace('.xml', '.json') output_bucket = self.settings.publishing_buckets_prefix + self.settings.eif_bucket output_path = output_folder + '/' + output_name destination = conn.get_bucket(output_bucket) destination_key = Key(destination) output_key = output_path destination_key.key = output_key destination_key.set_contents_from_string(json_output) if self.logger: self.logger.info("Uploaded key %s to %s" % (output_path, output_bucket)) self.set_dashboard_properties(json_output, article_id, version) session.store_value(self.get_workflowId(), "eif_filename", output_key) eif_object = json.loads(json_output) session.store_value(self.get_workflowId(), 'article_path', eif_object.get('path')) self.emit_monitor_event(self.settings, article_id, version, run, "Convert JATS", "end", "XML converted to EIF for article " + article_id + " to " + output_key) except Exception as e: self.logger.exception("Exception when converting article XML to EIF") self.emit_monitor_event(self.settings, article_id, version, run, "Convert JATS", "error", "Error in conversion of article xml to EIF for " + article_id + " message:" + e.message) return False return True
def do_activity(self, data=None): """ Do the work """ if self.logger: self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4)) info = S3NotificationInfo.from_dict(data) # set up required connections conn = S3Connection(self.settings.aws_access_key_id, self.settings.aws_secret_access_key) source_bucket = conn.get_bucket(info.bucket_name) dest_bucket = conn.get_bucket(self.settings.publishing_buckets_prefix + self.settings.expanded_bucket) session = Session(self.settings) article_id_match = re.match(ur'elife-(.*?)-', info.file_name) article_id = article_id_match.group(1) session.store_value(self.get_workflowId(), 'article_id', article_id) if self.logger: self.logger.info("Expanding file %s" % info.file_name) # extract any doi, version and updated date information from the filename version = None # zip name contains version information for previously archived zip files m = re.search(ur'-v([0-9]*?)[\.|-]', info.file_name) if m is not None: version = m.group(1) if version is None: version = self.get_next_version(article_id) if version == '-1': return False # version could not be determined, exit workflow. Can't emit event as no version. sm = re.search(ur'.*?-.*?-(.*?)-', info.file_name) if sm is not None: status = sm.group(1) if status is None: return False # version could not be determined, exit workflow. Can't emit event as no version. run = str(uuid.uuid4()) # store version for other activities in this workflow execution session.store_value(self.get_workflowId(), 'version', version) # TODO : extract and store updated date if supplied article_version_id = article_id + '.' + version session.store_value(self.get_workflowId(), 'article_version_id', article_version_id) session.store_value(self.get_workflowId(), 'run', run) session.store_value(self.get_workflowId(), 'status', status) self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "start", "Starting expansion of article " + article_id) self.set_monitor_property(self.settings, article_id, "article_id", article_id, "text") try: # download zip to temp folder tmp = self.get_tmp_dir() key = Key(source_bucket) key.key = info.file_name local_zip_file = self.open_file_from_tmp_dir(info.file_name, mode='wb') key.get_contents_to_file(local_zip_file) local_zip_file.close() bucket_folder_name = article_version_id + '/' + run folder_name = path.join(article_version_id, run) # extract zip contents content_folder = path.join(tmp, folder_name) makedirs(content_folder) with ZipFile(path.join(tmp, info.file_name)) as zf: zf.extractall(content_folder) # TODO : rename files (versions!) # TODO : edit xml and rename references upload_filenames = [] for f in listdir(content_folder): if isfile(join(content_folder, f)) and f[0] != '.' 
and not f[0] == '_': upload_filenames.append(f) for filename in upload_filenames: source_path = path.join(content_folder, filename) dest_path = bucket_folder_name + '/' + filename k = Key(dest_bucket) k.key = dest_path k.set_contents_from_filename(source_path) session.store_value(self.get_workflowId(), 'expanded_folder', bucket_folder_name) self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "end", "Finished expansion of article " + article_id + " for version " + version + " run " + str(run) + " into " + bucket_folder_name) except Exception as e: self.logger.exception("Exception when expanding article") self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "error", "Error expanding article " + article_id + " message:" + e.message) return False return True
def do_activity(self, data=None): """ Do the work """ run = data['run'] session = Session(self.settings) version = session.get_value(run, 'version') article_id = session.get_value(run, 'article_id') article_version_id = article_id + '.' + version self.emit_monitor_event( self.settings, article_id, version, run, "Convert JATS", "start", "Starting conversion of article xml to EIF for " + article_id) try: if self.logger: self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4)) expanded_folder_name = session.get_value(run, 'expanded_folder') expanded_folder_bucket = (self.settings.publishing_buckets_prefix + self.settings.expanded_bucket) conn = S3Connection(self.settings.aws_access_key_id, self.settings.aws_secret_access_key) bucket = conn.get_bucket(expanded_folder_bucket) bucket_folder_name = expanded_folder_name (xml_key, xml_filename) = self.get_article_xml_key(bucket, bucket_folder_name) if xml_key is None: self.logger.error("Article XML path not found") return False if self.logger: self.logger.info("Converting file %s" % xml_filename) xml = xml_key.get_contents_as_string() if self.logger: self.logger.info("Downloaded contents of file %s" % xml_filename) json_output = jats_scraper.scrape(xml, article_version=version) # Add update date if it is in the session update_date = session.get_value(run, 'update_date') if update_date: json_output = self.add_update_date_to_json( json_output, update_date, xml_filename) if self.logger: self.logger.info("Scraped file %s" % xml_filename) output_folder = article_version_id + '/' + run output_name = xml_filename.replace('.xml', '.json') output_bucket = self.settings.publishing_buckets_prefix + self.settings.eif_bucket output_path = output_folder + '/' + output_name destination = conn.get_bucket(output_bucket) destination_key = Key(destination) output_key = output_path destination_key.key = output_key destination_key.set_contents_from_string(json_output) if self.logger: self.logger.info("Uploaded key %s to %s" % (output_path, output_bucket)) self.set_dashboard_properties(json_output, article_id, version) session.store_value(run, "eif_location", output_key) eif_object = json.loads(json_output) session.store_value(run, 'article_path', eif_object.get('path')) self.emit_monitor_event( self.settings, article_id, version, run, "Convert JATS", "end", "XML converted to EIF for article " + article_id + " to " + output_key) except Exception as e: self.logger.exception( "Exception when converting article XML to EIF") self.emit_monitor_event( self.settings, article_id, version, run, "Convert JATS", "error", "Error in conversion of article xml to EIF for " + article_id + " message:" + e.message) return False return True
def do_activity(self, data=None): """ Do the work """ run = data['run'] if self.logger: self.logger.info('data: %s' % json.dumps(data, sort_keys=True, indent=4)) info = S3NotificationInfo.from_dict(data) storage_context = StorageContext(self.settings) session = Session(self.settings) filename_last_element = session.get_value(run, 'filename_last_element') # zip name contains version information for previously archived zip files article_structure = ArticleInfo(filename_last_element) article_id = article_structure.article_id session.store_value(run, 'article_id', article_id) session.store_value(run, 'file_name', info.file_name) if self.logger: self.logger.info("Expanding file %s" % info.file_name) version = session.get_value(run, 'version') status = article_structure.status if status is None or (status != 'vor' and status != 'poa'): self.logger.error("Name '%s' did not match expected pattern for status" % filename_last_element) return activity.activity.ACTIVITY_PERMANENT_FAILURE # status could not be determined, exit workflow. article_version_id = article_id + '.' + version session.store_value(run, 'article_version_id', article_version_id) session.store_value(run, 'run', run) session.store_value(run, 'status', status) self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "start", "Starting expansion of article " + article_id) try: # download zip to temp folder tmp = self.get_tmp_dir() local_zip_file = self.open_file_from_tmp_dir(filename_last_element, mode='wb') storage_resource_origin = self.settings.storage_provider + "://" + info.bucket_name + "/" + info.file_name storage_context.get_resource_to_file(storage_resource_origin, local_zip_file) local_zip_file.close() # extract zip contents folder_name = path.join(article_version_id, run) content_folder = path.join(tmp, folder_name) makedirs(content_folder) with ZipFile(path.join(tmp, filename_last_element)) as zf: zf.extractall(content_folder) upload_filenames = [] for f in listdir(content_folder): if isfile(join(content_folder, f)) and f[0] != '.' and not f[0] == '_': upload_filenames.append(f) self.check_filenames(upload_filenames) bucket_folder_name = article_version_id + '/' + run for filename in upload_filenames: source_path = path.join(content_folder, filename) dest_path = bucket_folder_name + '/' + filename storage_resource_dest = self.settings.storage_provider + "://" + self.settings.publishing_buckets_prefix + \ self.settings.expanded_bucket + "/" + dest_path storage_context.set_resource_from_filename(storage_resource_dest, source_path) self.clean_tmp_dir() session.store_value(run, 'expanded_folder', bucket_folder_name) self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "end", "Finished expansion of article " + article_id + " for version " + version + " run " + str(run) + " into " + bucket_folder_name) except Exception as e: self.logger.exception("Exception when expanding article") self.emit_monitor_event(self.settings, article_id, version, run, "Expand Article", "error", "Error expanding article " + article_id + " message:" + e.message) return activity.activity.ACTIVITY_PERMANENT_FAILURE return True
def do_activity(self, data=None): """ Do the work """ session = Session(self.settings) version = session.get_value(self.get_workflowId(), "version") article_id = session.get_value(self.get_workflowId(), "article_id") article_version_id = article_id + "." + version run = session.get_value(self.get_workflowId(), "run") self.emit_monitor_event( self.settings, article_id, version, run, "Convert JATS", "start", "Starting conversion of article xml to EIF for " + article_id, ) try: if self.logger: self.logger.info("data: %s" % json.dumps(data, sort_keys=True, indent=4)) expanded_folder_name = session.get_value(self.get_workflowId(), "expanded_folder") expanded_folder_bucket = self.settings.publishing_buckets_prefix + self.settings.expanded_bucket print expanded_folder_name conn = S3Connection(self.settings.aws_access_key_id, self.settings.aws_secret_access_key) bucket = conn.get_bucket(expanded_folder_bucket) bucket_folder_name = expanded_folder_name (xml_key, xml_filename) = self.get_article_xml_key(bucket, bucket_folder_name) if xml_key is None: self.logger.error("Article XML path not found") return False if self.logger: self.logger.info("Converting file %s" % xml_filename) xml = xml_key.get_contents_as_string() if self.logger: self.logger.info("Downloaded contents of file %s" % xml_filename) json_output = jats_scraper.scrape(xml) if self.logger: self.logger.info("Scraped file %s" % xml_filename) output_folder = article_version_id + "/" + run output_name = xml_filename.replace(".xml", ".json") output_bucket = self.settings.publishing_buckets_prefix + self.settings.eif_bucket output_path = output_folder + "/" + output_name destination = conn.get_bucket(output_bucket) destination_key = Key(destination) output_key = output_path destination_key.key = output_key destination_key.set_contents_from_string(json_output) if self.logger: self.logger.info("Uploaded key %s to %s" % (output_path, output_bucket)) session.store_value(self.get_workflowId(), "eif_filename", output_key) eif_object = json.loads(json_output) session.store_value(self.get_workflowId(), "article_path", eif_object.get("path")) self.emit_monitor_event( self.settings, article_id, version, run, "Post EIF", "success", "XML converted to EIF for article " + article_id + " to " + output_key, ) except Exception as e: self.logger.exception("Exception when converting article XML to EIF") self.emit_monitor_event( self.settings, article_id, version, run, "Convert JATS", "error", "Error in conversion of article xml to EIF for " + article_id + " message:" + e.message, ) return False return True