Example #1
0
    def test_get_s3_key_names_from_bucket(self):
        "simple tests for coverage"
        fake_bucket = FakeBucket()
        fake_bucket.items += self.fake_s3_keys
        fake_bucket.items += self.fake_s3_prefixes
        self.assertEqual(len(s3lib.get_s3_key_names_from_bucket(fake_bucket)), 3)
        self.assertEqual(len(s3lib.get_s3_key_names_from_bucket(
            fake_bucket, file_extensions=['.xml'])), 1)
        self.assertEqual(len(s3lib.get_s3_key_names_from_bucket(
            fake_bucket, file_extensions=['.xml', '.pdf'])), 2)
        self.assertEqual(len(s3lib.get_s3_key_names_from_bucket(
            fake_bucket, key_type='prefix')), 1)
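
Note: the definition of s3lib.get_s3_key_names_from_bucket is not shown on this
page; the calls above suggest it accepts a bucket plus optional key_type,
prefix and file_extensions arguments. A minimal sketch of test doubles that
would satisfy this test (the class and attribute names below are assumptions,
not the project's actual fixtures):

    class FakeKey(object):
        "stand-in for a boto Key; only a name attribute is assumed"
        def __init__(self, name):
            self.name = name

    class FakePrefix(object):
        "stand-in for a boto Prefix (a pseudo-folder entry)"
        def __init__(self, name):
            self.name = name

    class FakeBucket(object):
        "holds fake keys and prefixes in a plain items list"
        def __init__(self):
            self.items = []

    # three keys and one prefix, matching the counts asserted above
    fake_s3_keys = [FakeKey('01.xml'), FakeKey('02.pdf'), FakeKey('03.mp4')]
    fake_s3_prefixes = [FakePrefix('folder/')]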
Example #2
0
    def rename_article_s3_objects(self, bucket_folder_name, version):
        """
        Main function to rename article objects on S3
        and apply the renamed file names to the article XML file
        """

        # Connect to S3 and bucket
        s3_conn = S3Connection(self.settings.aws_access_key_id, self.settings.aws_secret_access_key,
                               host=self.settings.s3_hostname)
        bucket = s3_conn.lookup(self.expanded_bucket_name)

        # bucket object list
        s3_key_names = s3lib.get_s3_key_names_from_bucket(
            bucket=bucket,
            prefix=bucket_folder_name + "/")

        # Get the old name to new name map
        file_name_map = self.build_file_name_map(s3_key_names, version)

        # log file names for reference
        if self.logger:
            self.logger.info('file_name_map: %s' %
                             json.dumps(file_name_map, sort_keys=True, indent=4))

        # rename_s3_objects(old_name_new_name_dict)
        self.rename_s3_objects(bucket, self.expanded_bucket_name, bucket_folder_name, file_name_map)

        # rewrite_and_upload_article_xml()
        xml_filename = self.find_xml_filename_in_map(file_name_map)
        self.download_file_from_bucket(bucket, bucket_folder_name, xml_filename)
        self.rewrite_xml_file(xml_filename, file_name_map)
        self.upload_file_to_bucket(bucket, bucket_folder_name, xml_filename)
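
Note: build_file_name_map is defined elsewhere; from how it is used above,
file_name_map is presumably a dict mapping old key names to new ones, along
these lines (the values here are invented for illustration):

    file_name_map = {
        'elife00013.xml': 'elife-00013-v1.xml',
        'elife00013.pdf': 'elife-00013-v1.pdf',
    }
    # rename_s3_objects then copies each old key to its new name, and the
    # article XML is downloaded, rewritten with the new names, and re-uploaded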
Example #3
0
    def zip_revision_number(self, fid):
        """
        Look at previously supplied files and determine the
        next revision number
        """
        revision = None

        bucket_name = self.publish_bucket
        prefix = self.published_zip_folder + '/'

        # Connect to S3 and bucket
        s3_conn = S3Connection(self.settings.aws_access_key_id, self.settings.aws_secret_access_key)
        bucket = s3_conn.lookup(bucket_name)

        s3_key_names = s3lib.get_s3_key_names_from_bucket(
            bucket=bucket,
            prefix=prefix)

        s3_key_name = s3lib.latest_pmc_zip_revision(fid, s3_key_names)

        if s3_key_name:
            # Found an existing PMC zip file, look for a revision number
            revision_match = re.match(r'.*r(.*)\.zip$', s3_key_name)
            if revision_match is None:
                # There is a zip but no revision number, use 1
                revision = 1
            else:
                # Use the latest revision plus 1
                revision = int(revision_match.group(1)) + 1

        return revision
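
A quick check of the revision regex in isolation (the zip names below are
invented for illustration):

    import re

    for name in ('pmc/zip/elife-05-00013.zip', 'pmc/zip/elife-05-00013-r2.zip'):
        match = re.match(r'.*r(.*)\.zip$', name)
        if match is None:
            # a zip with no revision suffix: next revision is 1
            print('%s -> next revision 1' % name)
        else:
            # use the captured revision number plus 1
            print('%s -> next revision %d' % (name, int(match.group(1)) + 1))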
Example #4
0
    def download_pmc_zip_from_s3(self, doi_id, workflow):
        """
        Simple download of PMC zip file from the live bucket
        """
        bucket_name = self.pmc_zip_bucket
        prefix = self.pmc_zip_folder

        # Connect to S3 and bucket
        s3_conn = S3Connection(self.settings.aws_access_key_id, self.settings.aws_secret_access_key)
        bucket = s3_conn.lookup(bucket_name)
        
        s3_key_names = s3lib.get_s3_key_names_from_bucket(
            bucket=bucket,
            prefix=prefix)
        
        s3_key_name = s3lib.latest_pmc_zip_revision(doi_id, s3_key_names)
        
        if s3_key_name:
            
            # Download
            s3_key = bucket.get_key(s3_key_name)

            filename = s3_key_name.split("/")[-1]

            filename_plus_path = (self.get_tmp_dir() + os.sep +
                                  self.INPUT_DIR + os.sep + filename)
            mode = "wb"
            f = open(filename_plus_path, mode)
            s3_key.get_contents_to_file(f)
            f.close()
            
            return True
        else:
            return False
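
Note: boto Key objects also provide get_contents_to_filename, which opens and
closes the file itself, so the explicit open/write/close above could be
condensed (a behavior-preserving sketch, not how this project writes it):

    s3_key = bucket.get_key(s3_key_name)
    s3_key.get_contents_to_filename(filename_plus_path)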
Example #5
0
    def zip_revision_number(self, fid):
        """
        Look at previously supplied files and determine the
        next revision number
        """
        revision = None
        
        bucket_name = self.publish_bucket
        prefix = self.published_zip_folder + '/'

        # Connect to S3 and bucket
        s3_conn = S3Connection(self.settings.aws_access_key_id, self.settings.aws_secret_access_key)
        bucket = s3_conn.lookup(bucket_name)
        
        s3_key_names = s3lib.get_s3_key_names_from_bucket(
            bucket=bucket,
            prefix=prefix)
        
        s3_key_name = s3lib.latest_pmc_zip_revision(fid, s3_key_names)
        
        if s3_key_name:
            # Found an existing PMC zip file, look for a revision number
            revision_match = re.match(r'.*r(.*)\.zip$', s3_key_name)
            if revision_match is None:
                # There is a zip but no revision number, use 1
                revision = 1
            else:
                # Use the latest revision plus 1
                revision = int(revision_match.group(1)) + 1
        
        return revision
Example #6
0
    def get_outbox_s3_key_names(self, force=None):
        """
        Separately get a list of S3 key names from the outbox
        for reporting purposes, excluding the outbox folder itself
        """

        # Return cached values if available
        if self.outbox_s3_key_names and not force:
            return self.outbox_s3_key_names

        bucket_name = self.publish_bucket

        # Connect to S3 and bucket
        s3_conn = S3Connection(self.settings.aws_access_key_id, self.settings.aws_secret_access_key)
        bucket = s3_conn.lookup(bucket_name)

        s3_key_names = s3lib.get_s3_key_names_from_bucket(
            bucket=bucket,
            prefix=self.outbox_folder)

        # Remove the outbox_folder from the list, if present
        try:
            s3_key_names.remove(self.outbox_folder)
        except ValueError:
            pass

        self.outbox_s3_key_names = s3_key_names

        return self.outbox_s3_key_names
Example #7
0
    def download_files_from_s3_outbox(self):
        """
        Connect to the S3 bucket, and from the outbox folder,
        download the .xml files to be bundled.
        """
        file_extensions = []
        file_extensions.append(".xml")
        
        bucket_name = self.publish_bucket
        
        # Connect to S3 and bucket
        s3_conn = S3Connection(self.settings.aws_access_key_id, self.settings.aws_secret_access_key)
        bucket = s3_conn.lookup(bucket_name)
        
        s3_key_names = s3lib.get_s3_key_names_from_bucket(
            bucket=bucket,
            prefix=self.outbox_folder,
            file_extensions=file_extensions)
        
        for name in s3_key_names:
            # Download objects from S3 and save to disk
            s3_key = bucket.get_key(name)

            filename = name.split("/")[-1]

            # Only .xml files are requested above, so dirname is always set
            if re.search(r".*\.xml$", name):
                dirname = self.elife_poa_lib.settings.STAGING_TO_HW_DIR

            filename_plus_path = dirname + os.sep + filename
            mode = "wb"
            f = open(filename_plus_path, mode)
            s3_key.get_contents_to_file(f)
            f.close()
Example #8
0
    def get_outbox_s3_key_names(self, force=None):
        """
        Separately get a list of S3 key names from the outbox
        for reporting purposes, excluding the outbox folder itself
        """

        # Return cached values if available
        if self.outbox_s3_key_names and not force:
            return self.outbox_s3_key_names

        bucket_name = self.publish_bucket

        # Connect to S3 and bucket
        s3_conn = S3Connection(self.settings.aws_access_key_id,
                               self.settings.aws_secret_access_key)
        bucket = s3_conn.lookup(bucket_name)

        s3_key_names = s3lib.get_s3_key_names_from_bucket(
            bucket=bucket, prefix=self.outbox_folder)

        # Remove the outbox_folder from the list, if present
        try:
            s3_key_names.remove(self.outbox_folder)
        except ValueError:
            pass

        self.outbox_s3_key_names = s3_key_names

        return self.outbox_s3_key_names
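
Usage sketch of the cached listing above (activity is a hypothetical instance
of the enclosing class):

    names = activity.get_outbox_s3_key_names()             # lists the bucket
    names_cached = activity.get_outbox_s3_key_names()      # returns the cache
    names_fresh = activity.get_outbox_s3_key_names(force=True)  # re-lists S3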
Example #9
0
    def download_files_from_s3_outbox(self):
        """
        Connect to the S3 bucket, and from the outbox folder,
        download the .xml files to be bundled.
        """
        file_extensions = []
        file_extensions.append(".xml")

        bucket_name = self.publish_bucket

        # Connect to S3 and bucket
        s3_conn = S3Connection(self.settings.aws_access_key_id,
                               self.settings.aws_secret_access_key)
        bucket = s3_conn.lookup(bucket_name)

        s3_key_names = s3lib.get_s3_key_names_from_bucket(
            bucket=bucket,
            prefix=self.outbox_folder,
            file_extensions=file_extensions)

        for name in s3_key_names:
            # Download objects from S3 and save to disk
            s3_key = bucket.get_key(name)

            filename = name.split("/")[-1]

            # Only .xml files are requested above, so dirname is always set
            if re.search(r".*\.xml$", name):
                dirname = self.elife_poa_lib.settings.STAGING_TO_HW_DIR

            filename_plus_path = dirname + os.sep + filename
            mode = "wb"
            f = open(filename_plus_path, mode)
            s3_key.get_contents_to_file(f)
            f.close()
Example #10
0
    def download_pmc_zip_from_s3(self, doi_id, workflow):
        """
        Simple download of PMC zip file from the live bucket
        """
        bucket_name = self.pmc_zip_bucket
        prefix = self.pmc_zip_folder

        # Connect to S3 and bucket
        s3_conn = S3Connection(self.settings.aws_access_key_id,
                               self.settings.aws_secret_access_key)
        bucket = s3_conn.lookup(bucket_name)

        s3_key_names = s3lib.get_s3_key_names_from_bucket(bucket=bucket,
                                                          prefix=prefix)

        s3_key_name = s3lib.latest_pmc_zip_revision(doi_id, s3_key_names)

        if s3_key_name:

            # Download
            s3_key = bucket.get_key(s3_key_name)

            filename = s3_key_name.split("/")[-1]

            filename_plus_path = (self.get_tmp_dir() + os.sep +
                                  self.INPUT_DIR + os.sep + filename)
            mode = "wb"
            f = open(filename_plus_path, mode)
            s3_key.get_contents_to_file(f)
            f.close()

            return True
        else:
            return False
Example #11
0
    def rename_article_s3_objects(self, bucket_folder_name, version):
        """
        Main function to rename article objects on S3
        and apply the renamed file names to the article XML file
        """

        # Connect to S3 and bucket
        s3_conn = S3Connection(self.settings.aws_access_key_id,
                               self.settings.aws_secret_access_key,
                               host=self.settings.s3_hostname)
        bucket = s3_conn.lookup(self.expanded_bucket_name)

        # bucket object list
        s3_key_names = s3lib.get_s3_key_names_from_bucket(
            bucket=bucket, prefix=bucket_folder_name + "/")

        # Get the old name to new name map
        file_name_map = self.build_file_name_map(s3_key_names, version)

        # log file names for reference
        if self.logger:
            self.logger.info(
                'file_name_map: %s' %
                json.dumps(file_name_map, sort_keys=True, indent=4))

        # rename_s3_objects(old_name_new_name_dict)
        self.rename_s3_objects(bucket, self.expanded_bucket_name,
                               bucket_folder_name, file_name_map)

        # rewrite_and_upload_article_xml()
        xml_filename = self.find_xml_filename_in_map(file_name_map)
        self.download_file_from_bucket(bucket, bucket_folder_name,
                                       xml_filename)
        self.rewrite_xml_file(xml_filename, file_name_map)
        self.upload_file_to_bucket(bucket, bucket_folder_name, xml_filename)
Example #12
0
    def archive_zip_file_name(self, article, status='vor'):
        """
        Get the file name of the most recent archive zip from the archive bucket
        """
        zip_file_name = None
        bucket_name = self.archive_bucket

        # Connect to S3 and bucket
        s3_conn = S3Connection(self.settings.aws_access_key_id,
                               self.settings.aws_secret_access_key)
        bucket = s3_conn.lookup(bucket_name)

        s3_key_names = s3lib.get_s3_key_names_from_bucket(bucket=bucket)

        return self.latest_archive_zip_revision(article.doi_id, s3_key_names, self.journal, status)
Example #13
0
    def get_s3_key_names_from_bucket(self, bucket_name, prefix, file_extensions):
        """
        Use live s3 bucket connection to get the s3 key names
        from the bucket. This is so functions that rely on the data
        can use test data when running automated tests
        """
        s3_key_names = None
        # Connect to S3 and bucket
        s3_conn = S3Connection(self.settings.aws_access_key_id, self.settings.aws_secret_access_key)
        bucket = s3_conn.lookup(bucket_name)

        s3_key_names = s3lib.get_s3_key_names_from_bucket(
            bucket=bucket,
            key_type="key",
            prefix=prefix,
            file_extensions=file_extensions)

        return s3_key_names
Example #14
0
    def get_folder_names_from_bucket(self, bucket_name, prefix):
        """
        Use live s3 bucket connection to get the folder names
        from the bucket. This is so functions that rely on the data
        can use test data when running automated tests
        """
        folder_names = None
        # Connect to S3 and bucket
        s3_conn = S3Connection(self.settings.aws_access_key_id, self.settings.aws_secret_access_key)
        bucket = s3_conn.lookup(bucket_name)

        # Step one, get all the subfolder names
        folder_names = s3lib.get_s3_key_names_from_bucket(
            bucket=bucket,
            key_type="prefix",
            prefix=prefix)

        return folder_names
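
Hypothetical illustration of the two key_type modes, inferred from the
wrappers above (the bucket layout and names are invented for the example):

    # bucket contents:
    #   outbox/elife00013.xml
    #   outbox/elife00013.pdf
    #   published/2014-01-01/

    s3lib.get_s3_key_names_from_bucket(
        bucket=bucket, key_type="key", prefix="outbox/")
    # expected: ['outbox/elife00013.xml', 'outbox/elife00013.pdf']

    s3lib.get_s3_key_names_from_bucket(
        bucket=bucket, key_type="prefix", prefix="published/")
    # expected: ['published/2014-01-01/']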
Example #15
0
    def get_s3_key_names_from_bucket(self, bucket_name, prefix, file_extensions):
        """
        Use live s3 bucket connection to get the s3 key names
        from the bucket. This is so functions that rely on the data
        can use test data when running automated tests
        """
        s3_key_names = None
        # Connect to S3 and bucket
        s3_conn = S3Connection(self.settings.aws_access_key_id,
                               self.settings.aws_secret_access_key)
        bucket = s3_conn.lookup(bucket_name)

        s3_key_names = s3lib.get_s3_key_names_from_bucket(
            bucket=bucket,
            key_type="key",
            prefix=prefix,
            file_extensions=file_extensions)

        return s3_key_names
Example #16
0
    def get_folder_names_from_bucket(self, bucket_name, prefix):
        """
        Use live s3 bucket connection to get the folder names
        from the bucket. This is so functions that rely on the data
        can use test data when running automated tests
        """
        folder_names = None
        # Connect to S3 and bucket
        s3_conn = S3Connection(self.settings.aws_access_key_id,
                               self.settings.aws_secret_access_key)
        bucket = s3_conn.lookup(bucket_name)

        # Step one, get all the subfolder names
        folder_names = s3lib.get_s3_key_names_from_bucket(
            bucket=bucket,
            key_type="prefix",
            prefix=prefix)

        return folder_names
Example #17
0
    def download_files_from_s3_outbox(self):
        """
        Connect to the S3 bucket, and from the outbox folder,
        download the .xml to be processed
        """
        filenames = []

        file_extensions = []
        file_extensions.append(".xml")

        bucket_name = self.publish_bucket

        # Connect to S3 and bucket
        s3_conn = S3Connection(self.settings.aws_access_key_id,
                               self.settings.aws_secret_access_key)
        bucket = s3_conn.lookup(bucket_name)

        s3_key_names = s3lib.get_s3_key_names_from_bucket(
            bucket=bucket,
            prefix=self.outbox_folder,
            file_extensions=file_extensions)

        for name in s3_key_names:
            # Download objects from S3 and save to disk
            s3_key = bucket.get_key(name)

            filename = name.split("/")[-1]

            # Download to the activity temp directory
            dirname = self.get_tmp_dir()

            filename_plus_path = dirname + os.sep + filename

            mode = "wb"
            f = open(filename_plus_path, mode)
            s3_key.get_contents_to_file(f)
            f.close()

            filenames.append(filename_plus_path)

        return filenames
Example #18
0
    def download_files_from_s3_outbox(self):
        """
        Connect to the S3 bucket, and from the outbox folder,
        download the .xml to be processed
        """
        filenames = []

        file_extensions = []
        file_extensions.append(".xml")

        bucket_name = self.publish_bucket

        # Connect to S3 and bucket
        s3_conn = S3Connection(self.settings.aws_access_key_id,
                               self.settings.aws_secret_access_key)
        bucket = s3_conn.lookup(bucket_name)

        s3_key_names = s3lib.get_s3_key_names_from_bucket(
            bucket=bucket,
            prefix=self.outbox_folder,
            file_extensions=file_extensions)

        for name in s3_key_names:
            # Download objects from S3 and save to disk
            s3_key = bucket.get_key(name)

            filename = name.split("/")[-1]

            # Download to the activity temp directory
            dirname = self.get_tmp_dir()

            filename_plus_path = dirname + os.sep + filename

            mode = "wb"
            f = open(filename_plus_path, mode)
            s3_key.get_contents_to_file(f)
            f.close()

            filenames.append(filename_plus_path)

        return filenames
Example #19
0
    def does_source_zip_exist_from_s3(self, doi_id):
        """
        Check whether a PMC zip file exists in the bucket for this doi_id
        """
        bucket_name = self.pmc_zip_bucket
        prefix = self.pmc_zip_folder

        # Connect to S3 and bucket
        s3_conn = S3Connection(self.settings.aws_access_key_id,
                               self.settings.aws_secret_access_key)
        bucket = s3_conn.lookup(bucket_name)

        s3_key_names = s3lib.get_s3_key_names_from_bucket(bucket=bucket,
                                                          prefix=prefix)

        s3_key_name = s3lib.latest_pmc_zip_revision(doi_id, s3_key_names)

        if s3_key_name:
            return True
        else:
            return False
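
Note: the closing if/else above could be written as a single boolean
expression (a behavior-preserving sketch):

    return bool(s3lib.latest_pmc_zip_revision(doi_id, s3_key_names))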
Example #20
0
    def download_files_from_s3(self):
        """
        Connect to the S3 bucket, and from the outbox folder,
        download the .xml and .pdf files to be bundled.
        """

        file_extensions = []
        file_extensions.append(".xml")
        file_extensions.append(".pdf")
        file_extensions.append(".zip")

        bucket_name = self.input_bucket

        # Connect to S3 and bucket
        s3_conn = S3Connection(self.settings.aws_access_key_id,
                               self.settings.aws_secret_access_key)
        bucket = s3_conn.lookup(bucket_name)

        s3_key_names = s3lib.get_s3_key_names_from_bucket(
            bucket=bucket,
            prefix=self.outbox_folder,
            file_extensions=file_extensions)
        self.outbox_s3_key_names = s3_key_names

        for name in s3_key_names:
            # Download objects from S3 and save to disk
            s3_key = bucket.get_key(name)

            filename = name.split("/")[-1]

            filename_plus_path = self.INPUT_DIR + os.sep + filename

            if self.logger:
                self.logger.info('PublishFinalPOA downloading: %s' %
                                 filename_plus_path)

            mode = "wb"
            f = open(filename_plus_path, mode)
            s3_key.get_contents_to_file(f)
            f.close()
Example #21
0
    def does_source_zip_exist_from_s3(self, doi_id):
        """
        Check whether a PMC zip file exists in the bucket for this doi_id
        """
        bucket_name = self.pmc_zip_bucket
        prefix = self.pmc_zip_folder

        # Connect to S3 and bucket
        s3_conn = S3Connection(self.settings.aws_access_key_id,
                               self.settings.aws_secret_access_key)
        bucket = s3_conn.lookup(bucket_name)

        s3_key_names = s3lib.get_s3_key_names_from_bucket(
            bucket=bucket,
            prefix=prefix)

        s3_key_name = s3lib.latest_pmc_zip_revision(doi_id, s3_key_names)

        if s3_key_name:
            return True
        else:
            return False
Example #22
0
    def next_revision_number(self, doi_id, status='poa'):
        """
        From the bucket, get a list of zip files
        and determine the next revision number to use
        """
        next_revision_number = 1

        bucket_name = self.publish_bucket

        file_extensions = []
        file_extensions.append(".zip")

        # Connect to S3 and bucket
        s3_conn = S3Connection(self.settings.aws_access_key_id,
                               self.settings.aws_secret_access_key)
        bucket = s3_conn.lookup(bucket_name)

        s3_key_names = s3lib.get_s3_key_names_from_bucket(bucket=bucket)

        max_revision_number = 0
        for key_name in s3_key_names:

            name_prefix = 'elife-' + str(doi_id).zfill(5) + '-' + str(
                status) + '-r'
            if key_name.startswith(name_prefix):
                # Attempt to get a revision number from the matching files
                try:
                    part = key_name.replace(name_prefix, '')
                    revision = int(part.split('.')[0])
                except (IndexError, ValueError):
                    revision = None
                if revision and revision > max_revision_number:
                    max_revision_number = revision

        if max_revision_number > 0:
            next_revision_number = max_revision_number + 1

        return next_revision_number
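
Worked example of the prefix matching above (the key name is invented):

    doi_id, status = 13, 'poa'
    name_prefix = 'elife-' + str(doi_id).zfill(5) + '-' + str(status) + '-r'
    # name_prefix is 'elife-00013-poa-r'

    key_name = 'elife-00013-poa-r3.zip'
    part = key_name.replace(name_prefix, '')   # '3.zip'
    revision = int(part.split('.')[0])         # 3
    # so next_revision_number would return 4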
Example #23
0
    def check_published_folder_exists(self):

        if not self.published_folder_name:
            return None

        bucket_name = self.input_bucket

        # Connect to S3 and bucket
        s3_conn = S3Connection(self.settings.aws_access_key_id, self.settings.aws_secret_access_key)
        bucket = s3_conn.lookup(bucket_name)

        # Strip the trailing slash from the folder name if present
        published_folder_prefix = self.published_folder_name.rstrip('/')

        s3_key_names = s3lib.get_s3_key_names_from_bucket(
            bucket=bucket,
            key_type='prefix',
            prefix=published_folder_prefix)

        if len(s3_key_names) > 0:
            return True
        else:
            return False
Example #24
0
    def download_files_from_s3(self):
        """
        Connect to the S3 bucket, and from the outbox folder,
        download the .xml and .pdf files to be bundled.
        """

        file_extensions = []
        file_extensions.append(".xml")
        file_extensions.append(".pdf")
        file_extensions.append(".zip")

        bucket_name = self.input_bucket

        # Connect to S3 and bucket
        s3_conn = S3Connection(self.settings.aws_access_key_id, self.settings.aws_secret_access_key)
        bucket = s3_conn.lookup(bucket_name)

        s3_key_names = s3lib.get_s3_key_names_from_bucket(bucket=bucket,
                                                          prefix=self.outbox_folder,
                                                          file_extensions=file_extensions)
        self.outbox_s3_key_names = s3_key_names

        for name in s3_key_names:
            # Download objects from S3 and save to disk
            s3_key = bucket.get_key(name)

            filename = name.split("/")[-1]

            filename_plus_path = self.INPUT_DIR + os.sep + filename

            if self.logger:
                self.logger.info('PublishFinalPOA downloading: %s' % filename_plus_path)

            mode = "wb"
            f = open(filename_plus_path, mode)
            s3_key.get_contents_to_file(f)
            f.close()
Example #25
0
    def next_revision_number(self, doi_id, status='poa'):
        """
        From the bucket, get a list of zip files
        and determine the next revision number to use
        """
        next_revision_number = 1

        bucket_name = self.publish_bucket

        file_extensions = []
        file_extensions.append(".zip")

        # Connect to S3 and bucket
        s3_conn = S3Connection(self.settings.aws_access_key_id, self.settings.aws_secret_access_key)
        bucket = s3_conn.lookup(bucket_name)

        s3_key_names = s3lib.get_s3_key_names_from_bucket(bucket=bucket)

        max_revision_number = 0
        for key_name in s3_key_names:

            name_prefix = 'elife-' + str(doi_id).zfill(5) + '-' + str(status) + '-r'
            if key_name.startswith(name_prefix):
                # Attempt to get a revision number from the matching files
                try:
                    part = key_name.replace(name_prefix, '')
                    revision = int(part.split('.')[0])
                except (IndexError, ValueError):
                    revision = None
                if revision and revision > max_revision_number:
                    max_revision_number = revision

        if max_revision_number > 0:
            next_revision_number = max_revision_number + 1

        return next_revision_number