def needs_downloading(self, own_processor_id=None) -> bool:
    """Determine if a file needs to be downloaded.

    This is true if the file has already been downloaded and then lost
    without getting processed.
    """
    # If the file has been downloaded and actually exists on disk,
    # then it doesn't need to be downloaded.
    if self.absolute_file_path and os.path.exists(self.absolute_file_path):
        # A file exists; if this record has a SHA1, ensure that it matches.
        existing_file_sha1 = calculate_sha1(self.absolute_file_path)
        if self.sha1 and self.sha1 != existing_file_sha1:
            return True

        # Otherwise the SHA1 matches and the file doesn't need to be downloaded.
        return False

    unstarted_downloader_jobs = self.downloader_jobs.filter(
        start_time__isnull=True, success__isnull=True, retried=False
    )

    # If the file has a downloader job that hasn't even started yet,
    # then it doesn't need another.
    if unstarted_downloader_jobs.count() > 0:
        return False

    # Do an extra check for blocking jobs for transcriptome indices.
    # This is necessary because needs_processing() won't check
    # the blocking jobs for them because they're supposed to
    # have multiple processor jobs. However, if the file does need to
    # be redownloaded, we only want one downloader job to be recreated.
    if self.has_blocking_jobs(own_processor_id):
        return False

    # If this file has been processed, then it doesn't need to be downloaded again.
    return self.needs_processing(own_processor_id)

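# needs_downloading() relies on a module-level calculate_sha1() helper that is
# not shown in this section. A minimal sketch of what such a helper might look
# like, assuming a chunked hashlib-based implementation (the name and signature
# mirror the calls above, but the body is illustrative, not the project's
# actual implementation):
import hashlib


def calculate_sha1(file_path: str) -> str:
    """Return the hex SHA1 digest of the file at file_path, read in chunks."""
    hasher = hashlib.sha1()
    with open(file_path, "rb") as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            hasher.update(chunk)
    return hasher.hexdigest()
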
def _upload(job_context: Dict) -> Dict:
    """Uploads the result file to S3 and notifies the user."""

    # There has been a failure already, don't try to upload anything.
    if not job_context.get("output_file", None):
        logger.error("Was told to upload a smash result without an output_file.")
        return job_context

    try:
        if job_context.get("upload", True) and settings.RUNNING_IN_CLOUD:
            s3_client = boto3.client("s3")

            # Note that file expiry is handled by the S3 object lifecycle,
            # managed by terraform.
            s3_client.upload_file(
                job_context["output_file"],
                RESULTS_BUCKET,
                job_context["output_file"].split("/")[-1],
                ExtraArgs={"ACL": "public-read"},
            )

            result_url = (
                "https://s3.amazonaws.com/"
                + RESULTS_BUCKET
                + "/"
                + job_context["output_file"].split("/")[-1]
            )
            job_context["result_url"] = result_url

            logger.debug("Result uploaded!", result_url=job_context["result_url"])

            job_context["dataset"].s3_bucket = RESULTS_BUCKET
            job_context["dataset"].s3_key = job_context["output_file"].split("/")[-1]
            job_context["dataset"].size_in_bytes = calculate_file_size(job_context["output_file"])
            job_context["dataset"].sha1 = calculate_sha1(job_context["output_file"])
            job_context["dataset"].save()

            # The file is uploaded, so we can delete the local copy.
            try:
                os.remove(job_context["output_file"])
            except OSError:
                pass
    except Exception as e:
        logger.exception(
            "Failed to upload smash result file.", file=job_context["output_file"]
        )
        job_context["job"].success = False
        job_context["job"].failure_reason = "Failure reason: " + str(e)
        # Delay failing this pipeline until the failure notification has been sent.
        # job_context['success'] = False

    return job_context

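# _upload() also assumes a module-level calculate_file_size() helper that is
# not shown here. A minimal sketch under that assumption (the name and
# signature mirror the call above; the body is illustrative, not the
# project's code):
import os


def calculate_file_size(file_path: str) -> int:
    """Return the size of the file at file_path in bytes."""
    return os.path.getsize(file_path)
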
def sync_from_s3(self, force=False, path=None):
    """Downloads a file from S3 to the local file system.

    Returns the absolute file path.
    """
    path = path if path is not None else self.absolute_file_path

    if not settings.RUNNING_IN_CLOUD and not force:
        if os.path.exists(path):
            return path
        else:
            # If the file doesn't exist at path and we're not
            # running in the cloud, then the file is almost
            # certainly at its absolute_file_path because it never got deleted.
            if os.path.exists(self.absolute_file_path):
                shutil.copyfile(self.absolute_file_path, path)
                return path
            else:
                # We don't have the file :(
                return None

    target_directory = os.path.dirname(path)
    os.makedirs(target_directory, exist_ok=True)

    if not self.s3_bucket or not self.s3_key:
        raise ValueError("Tried to download a computed file with no s3_bucket or s3_key")

    try:
        S3.download_file(self.s3_bucket, self.s3_key, path)

        # Verify the integrity of the synced file.
        synced_sha1 = calculate_sha1(path)
        if self.sha1 != synced_sha1:
            raise AssertionError("SHA1 of downloaded ComputedFile doesn't match database SHA1!")

        return path
    except Exception as e:
        logger.exception(e, computed_file_id=self.pk)
        return None

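# sync_from_s3() calls a module-level S3 client that is not defined in this
# section. Presumably it is a shared boto3 client created once at import time;
# a minimal sketch of that assumption:
import boto3

S3 = boto3.client("s3")

# Hypothetical usage (the variable name is illustrative only): force the S3
# path even outside the cloud and get back None if the download or SHA1
# verification fails.
#
#     local_path = computed_file.sync_from_s3(force=True, path="/tmp/computed_file.tsv")
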
def _update_result_objects(job_context: Dict) -> Dict:
    """Closes out the dataset object."""
    dataset = job_context["dataset"]
    dataset.s3_bucket = RESULTS_BUCKET
    dataset.s3_key = job_context["output_file"].split("/")[-1]
    dataset.size_in_bytes = calculate_file_size(job_context["output_file"])
    dataset.sha1 = calculate_sha1(job_context["output_file"])
    dataset.is_processing = False
    dataset.is_processed = True
    dataset.is_available = True
    dataset.expires_on = timezone.now() + timedelta(days=7)
    dataset.save()

    if settings.RUNNING_IN_CLOUD and job_context.get("upload", True):
        # The file is uploaded and the metadata is updated, so we can delete the local copy.
        try:
            os.remove(job_context["output_file"])
        except OSError:
            pass

    job_context["success"] = True

    return job_context

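# The smasher steps above all take and return the same job_context dict, so a
# hedged sketch of how the final stages might be chained together; the
# run_final_steps() helper name is hypothetical and not part of the project:
from typing import Dict


def run_final_steps(job_context: Dict) -> Dict:
    """Upload the smash result, then close out the dataset object."""
    job_context = _upload(job_context)
    job_context = _update_result_objects(job_context)
    return job_context
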
def calculate_sha1(self) -> str:
    """Calculate and store the SHA1 value of this file."""
    self.sha1 = calculate_sha1(self.absolute_file_path)
    return self.sha1

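# Hypothetical usage of the model method above, assuming a saved instance
# whose absolute_file_path points at a file on disk (the variable name below
# is illustrative, not from the project):
#
#     computed_file.calculate_sha1()                # computes and stores the hex digest
#     computed_file.save(update_fields=["sha1"])    # persist it to the database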