Example #1
    def needs_downloading(self, own_processor_id=None) -> bool:
        """Determine if a file needs to be downloaded.

        This is true if the file was downloaded previously but was lost
        before it could be processed.
        """
        # If the file is downloaded and the file actually exists on disk,
        # then it doesn't need to be downloaded.
        if self.absolute_file_path and os.path.exists(self.absolute_file_path):
            # ok a file exists, if this file has an SHA1 ensure that it's the same
            existing_file_sha1 = calculate_sha1(self.absolute_file_path)
            if self.sha1 and self.sha1 != existing_file_sha1:
                return True
            # Otherwise the SHA1 matches (or there is no stored SHA1 to check),
            # so the file doesn't need to be downloaded.
            return False

        unstarted_downloader_jobs = self.downloader_jobs.filter(
            start_time__isnull=True, success__isnull=True, retried=False
        )

        # If the file has a downloader job that hasn't even started yet,
        # then it doesn't need another.
        if unstarted_downloader_jobs.count() > 0:
            return False

        # Do an extra check for blocking jobs for transcriptome indices.
        # This is necessary because needs_processing() won't check
        # the blocking jobs for them because they're supposed to
        # have multiple processor jobs. However if the file does need to
        # be redownloaded, we only want one downloader job to be recreated.
        if self.has_blocking_jobs(own_processor_id):
            return False

        # If this file has been processed, then it doesn't need to be downloaded again.
        return self.needs_processing(own_processor_id)
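
A hypothetical usage sketch of needs_downloading, assuming an OriginalFile-like instance named original_file and a requeue_downloader_job helper (both names are illustrative, not taken from the examples):

# Re-queue a downloader job only when the file is actually missing and
# no unstarted or blocking job is already going to handle it.
if original_file.needs_downloading():
    requeue_downloader_job(original_file)  # hypothetical helper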
Example #2
def _upload(job_context: Dict) -> Dict:
    """ Uploads the result file to S3 and notifies user. """

    # If there has already been a failure, don't try to upload anything.
    if not job_context.get("output_file", None):
        logger.error(
            "Was told to upload a smash result without an output_file.")
        return job_context

    try:
        if job_context.get("upload", True) and settings.RUNNING_IN_CLOUD:
            s3_client = boto3.client('s3')

            # Note that file expiry is handled by the S3 object lifecycle,
            # managed by terraform.
            s3_client.upload_file(job_context["output_file"],
                                  RESULTS_BUCKET,
                                  job_context["output_file"].split('/')[-1],
                                  ExtraArgs={'ACL': 'public-read'})
            result_url = ("https://s3.amazonaws.com/" + RESULTS_BUCKET + "/" +
                          job_context["output_file"].split('/')[-1])

            job_context["result_url"] = result_url

            logger.debug("Result uploaded!",
                         result_url=job_context["result_url"])

            job_context["dataset"].s3_bucket = RESULTS_BUCKET
            job_context["dataset"].s3_key = job_context["output_file"].split(
                '/')[-1]
            job_context["dataset"].size_in_bytes = calculate_file_size(
                job_context["output_file"])
            job_context["dataset"].sha1 = calculate_sha1(
                job_context["output_file"])

            job_context["dataset"].save()

            # File is uploaded, we can delete the local.
            try:
                os.remove(job_context["output_file"])
            except OSError:
                pass

    except Exception as e:
        logger.exception("Failed to upload smash result file.",
                         file=job_context["output_file"])
        job_context['job'].success = False
        job_context['job'].failure_reason = "Failure reason: " + str(e)
        # Delay failing this pipeline until the failure notification has been sent.
        # job_context['success'] = False

    return job_context
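
The upload step above relies on calculate_file_size and calculate_sha1 helpers that the examples call but never define. A minimal sketch of calculate_file_size, assuming it simply reports the on-disk size in bytes (the real helper may add error handling or logging):

import os


def calculate_file_size(file_path: str) -> int:
    """Return the size of the file at file_path in bytes."""
    return os.path.getsize(file_path)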
Example #3
    def sync_from_s3(self, force=False, path=None):
        """ Downloads a file from S3 to the local file system.
        Returns the absolute file path.
        """
        path = path if path is not None else self.absolute_file_path

        if not settings.RUNNING_IN_CLOUD and not force:
            if os.path.exists(path):
                return path
            else:
                # If the file doesn't exist at path and we're not
                # running in the cloud, then the file is almost
                # certainly at its absolute_file_path because it never got deleted.
                if os.path.exists(self.absolute_file_path):
                    shutil.copyfile(self.absolute_file_path, path)
                    return path
                else:
                    # We don't have the file :(
                    return None

        target_directory = os.path.dirname(path)
        os.makedirs(target_directory, exist_ok=True)

        if not self.s3_bucket or not self.s3_key:
            raise ValueError(
                "Tried to download a computed file with no s3_bucket or s3_key"
            )

        try:
            S3.download_file(self.s3_bucket, self.s3_key, path)

            # Verify sync integrity
            synced_sha1 = calculate_sha1(path)

            if self.sha1 != synced_sha1:
                raise AssertionError(
                    "SHA1 of downloaded ComputedFile doesn't match database SHA1!"
                )

            return path
        except Exception as e:
            logger.exception(e, computed_file_id=self.pk)
            return None
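
sync_from_s3 calls download_file on a module-level S3 object that is not shown here. A minimal sketch of that client, assuming it is a plain boto3 S3 client (the real module may configure credentials, region, or retry behavior differently):

import boto3

# Module-level S3 client used by sync_from_s3; download_file(bucket, key, path)
# streams the object to the local path.
S3 = boto3.client("s3")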
Example #4
def _update_result_objects(job_context: Dict) -> Dict:
    """Closes out the dataset object."""
    dataset = job_context["dataset"]

    dataset.s3_bucket = RESULTS_BUCKET
    dataset.s3_key = job_context["output_file"].split("/")[-1]
    dataset.size_in_bytes = calculate_file_size(job_context["output_file"])
    dataset.sha1 = calculate_sha1(job_context["output_file"])
    dataset.is_processing = False
    dataset.is_processed = True
    dataset.is_available = True
    dataset.expires_on = timezone.now() + timedelta(days=7)
    dataset.save()

    if settings.RUNNING_IN_CLOUD and job_context.get("upload", True):
        # File is uploaded and the metadata is updated, can delete the local.
        try:
            os.remove(job_context["output_file"])
        except OSError:
            pass

    job_context["success"] = True

    return job_context
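
The seven-day expiry set above depends on Django's timezone utilities and the standard library's timedelta. A minimal, self-contained sketch of that calculation, assuming the same one-week retention window as the example:

from datetime import timedelta

from django.utils import timezone

# The dataset stays available for one week after processing completes.
expires_on = timezone.now() + timedelta(days=7)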
Example #5
    def calculate_sha1(self) -> str:
        """ Calculate, store, and return the SHA1 value of this file. """
        self.sha1 = calculate_sha1(self.absolute_file_path)
        return self.sha1
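
All five examples lean on a calculate_sha1 utility that is never shown. A minimal sketch of it, assuming it returns the hex digest and reads the file in chunks so large files don't need to fit in memory (the real utility may differ in chunk size or error handling):

import hashlib


def calculate_sha1(file_path: str) -> str:
    """Return the hexadecimal SHA1 digest of the file at file_path."""
    hasher = hashlib.sha1()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            hasher.update(chunk)
    return hasher.hexdigest()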