def run(self):
    """Download one package with wget, recording any failure."""
    global packages, LOCALURL, REMOTEURL, failed_files, VERBOSE
    local_path = os.path.join(LOCALURL, packages[self._args[0]]["location"])
    remote_url = urlpath.join(REMOTEURL, packages[self._args[0]]["location"])
    # prnt(" ".join(["wget", "-O", "\"" + local_path + "\"", remote_url]))
    # proc = subprocess.Popen(["wget", "-O", "\"" + local_path + "\"", remote_url])
    # proc.wait()
    if VERBOSE:
        prnt(" ".join(["wget", "-O", local_path, remote_url]))
    retcode = subprocess.call(
        [" ".join(["wget", "-O", local_path, remote_url])],
        shell=True,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        env=os.environ)
    if retcode:
        prnt("ERROR: Failed to download", remote_url, file=sys.stderr)
        failed_files.append(remote_url)
def generate_id(cls, commit):
    """Calculate action ID."""
    host = os.environ.get('RENKU_DOMAIN') or 'localhost'

    # always set the id by the identifier
    return urllib.parse.urljoin(
        'https://{host}'.format(host=host),
        posixpath.join(
            '/activities', 'commit/{commit.hexsha}'.format(commit=commit)
        )
    )
def generate_id(cls, commitsha):
    """Calculate action ID."""
    host = 'localhost'
    if hasattr(cls, 'client'):
        host = cls.client.remote.get('host') or host
    # RENKU_DOMAIN overrides the host; otherwise keep the value found above
    host = os.environ.get('RENKU_DOMAIN') or host

    # always set the id by the identifier
    return urllib.parse.urljoin(
        'https://{host}'.format(host=host),
        posixpath.join(
            '/activities', 'commit/{commit}'.format(commit=commitsha)
        )
    )
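# A minimal, hypothetical sketch (not part of the original module) showing what
# the generate_id construction above yields from urljoin + posixpath.join; the
# commit sha and host below are made-up placeholders.
import posixpath
import urllib.parse


def _example_activity_id(commitsha, host='localhost'):
    # mirrors the ID layout above: https://<host>/activities/commit/<sha>
    return urllib.parse.urljoin(
        'https://{host}'.format(host=host),
        posixpath.join('/activities', 'commit/{0}'.format(commitsha)))


# _example_activity_id('deadbeef') -> 'https://localhost/activities/commit/deadbeef'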
def _migrate_broken_dataset_paths(client):
    """Ensure all paths are using correct directory structure."""
    for dataset in client.datasets.values():
        dataset_path = Path(dataset.path)
        expected_path = (
            client.path / client.renku_datasets_path /
            Path(quote(dataset.identifier, safe=''))
        )

        # migrate the refs
        ref = LinkReference.create(
            client=client,
            name='datasets/{0}'.format(dataset.short_name),
            force=True,
        )
        ref.set_reference(expected_path / client.METADATA)

        if not dataset_path.exists():
            dataset_path = (
                client.path / client.renku_datasets_path /
                uuid.UUID(dataset.identifier).hex
            )

        if not expected_path.exists():
            shutil.move(str(dataset_path), str(expected_path))
            dataset.path = expected_path
            dataset.__reference__ = expected_path / client.METADATA

        for file_ in dataset.files:
            file_path = Path(file_.path)
            if not file_path.exists() and file_.path.startswith('..'):
                new_path = (
                    client.path / client.renku_datasets_path / dataset.uid /
                    file_path
                ).resolve().relative_to(client.path)
                file_.path = new_path

                _, commit, _ = client.resolve_in_submodules(
                    client.find_previous_commit(file_.path, revision='HEAD'),
                    file_.path,
                )

                host = client.remote.get('host') or 'localhost'
                host = os.environ.get('RENKU_DOMAIN') or host

                # always set the id by the identifier
                file_._id = urllib.parse.urljoin(
                    'https://{host}'.format(host=host),
                    posixpath.join('/blob/{hexsha}/{path}'.format(
                        hexsha=commit.hexsha, path=new_path))
                )
                file_._label = '{}@{}'.format(new_path, commit.hexsha)

        dataset.to_yaml()
def process_image(*args, s3client, patientcache):
    """Processing images from the raw dump.

    Takes a single image, downloads it into temporary storage
    and extracts its metadata.

    The metadata is then uploaded here, except if the file already exists.

    If the image file already exists at the correct location, it's
    not passed on to the next step.

    Parameters
    ----------
    task, key, _ : tuple[str, str, None]
        A task name (only handling "process" tasks), and an object to act on.
    s3client : S3Client
        The service that handles S3 data access
    patientcache :
        The cache that stores the assignments of patients to groups

    Yields
    ------
    tuple[str, str, str or pydicom.FileDataset]
        Tuple containing the task name ("copy" or "metadata"), and other
        parameters depending on the task. "copy" passes on the original
        object and new location. "metadata" passes on the target metadata
        location and the image data to extract from.
    """
    # check file type
    task, key, _ = args
    image_path = Path(key)
    if task != "process" or image_path.suffix.lower() != ".dcm":
        # not an image, don't do anything with it
        yield bonobo.constants.NOT_MODIFIED
        # Stop here for processing
        return

    image_uuid = image_path.stem

    # download the image
    image_data = PartialDicom(s3client, key).download()
    if image_data is None:
        # we couldn't read the image data correctly
        logger.warning(
            f"Object '{key}' couldn't be loaded as a DICOM file, skipping!"
        )
        return

    # extract the required data from the image
    patient_id = image_data.PatientID
    study_id = image_data.StudyInstanceUID
    series_id = image_data.SeriesInstanceUID
    group = patientcache.get_group(patient_id)
    if group is not None:
        training_set = group == "training"
    else:
        logger.error(
            f"Image without patient data: {key}; "
            + f"included patient ID: {patient_id}; "
            + "skipping!"
        )
        return

    prefix = (
        constants.TRAINING_PREFIX if training_set else constants.VALIDATION_PREFIX
    )
    image_type = constants.MODALITY.get(
        image_data["Modality"].value, "unknown"
    )
    date = helpers.get_date_from_key(key)
    if date:
        # the location of the new files
        new_key = posixpath.join(
            prefix,
            image_type,
            patient_id,
            study_id,
            series_id,
            image_path.name,
        )
        metadata_key = posixpath.join(
            prefix,
            f"{image_type}-metadata",
            patient_id,
            study_id,
            series_id,
            f"{image_uuid}.json",
        )
        # send off to copy or upload steps
        if not s3client.object_exists(new_key):
            yield "copy", key, new_key
        if not s3client.object_exists(metadata_key):
            yield "metadata", metadata_key, image_data
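# Illustrative sketch only (the prefix, modality and ID values below are
# assumptions, not the project's real constants): given the fields extracted
# from the DICOM header, the keys built above look roughly like this.
import posixpath

prefix, image_type = "training", "ct"
patient_id, study_id, series_id = "P001", "S001", "SE001"
new_key = posixpath.join(prefix, image_type, patient_id, study_id, series_id,
                         "1234.dcm")
metadata_key = posixpath.join(prefix, f"{image_type}-metadata", patient_id,
                              study_id, series_id, "1234.json")
# new_key      -> 'training/ct/P001/S001/SE001/1234.dcm'
# metadata_key -> 'training/ct-metadata/P001/S001/SE001/1234.json'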
log.warning("Tool not found: {}".format(path)) continue img = elem.find( ".//div[@class='views-field views-field-field-image']//img") if img is None: log.warning("No image for {}".format(path)) continue sourcename = urlparse(img.attrib["src"]).path.split("/")[-1] basename = unquote(sourcename.split(".")[0]).strip() if " " in basename: name = " ".join([part for part in basename.split(" ")[:-1]]) else: name = basename filename = "{} 300.png".format(name) filepath = posixpath.join(images_path, filename) blob_image = None if not posixpath.exists(filepath): log.warning( "{}: Image file not found: {} ({}). Attempting download". format(path, filepath, sourcename)) with TemporaryDirectory(prefix="euphorieimage") as tmpdir: urllib.request.urlretrieve( "{}{}".format(BASE_URL, img.attrib["src"]), filepath) try: with open(filepath, "rb") as imagefile: blob_image = NamedBlobImage(data=imagefile.read(), filename=filename) except Exception as e: log.warning( "Unable to download image from website. Error: {}".
# Is it one or many objects to install as libraries?
if os.path.isdir(args.library_path):
    # Directory path specified, grab all files of type args.objective
    # TODO: Decide if this should be recursive or not?
    all_packages = [
        p for p in os.listdir(args.library_path)
        if os.path.splitext(p)[1] == '.' + args.objective
    ]
else:
    all_packages = [args.library_path]

# Get the Jar's name and its destination folder
# Replace the job.json's content
job_def["libraries"] = [{
    args.objective: posixpath.join(args.cloud_path, package)
} for package in all_packages]

# If it's an egg, we use spark_python_task, otherwise it's spark_jar_task
objective_task_name = (
    "spark_python_task" if args.objective == "egg" else "spark_jar_task"
)
if args.objective == "egg":
    # You need a python_file to run the app
    job_def[objective_task_name] = {"python_file": args.python_file}
else:
    # You need a main_class_name to run the app
    job_def[objective_task_name] = {"main_class_name": args.main_class}

# Parameters is an attribute across egg and jar tasks
if args.parameters:
    job_def[objective_task_name].update({"parameters": args.parameters})
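# Hypothetical illustration (all argument values below are made up): for an
# egg upload, the logic above produces a job definition shaped roughly like
# the dictionary shown in the trailing comment.
from argparse import Namespace
import posixpath

args = Namespace(objective="egg", cloud_path="dbfs:/libraries",
                 python_file="dbfs:/apps/main.py", parameters=["--env", "dev"])
all_packages = ["myapp-0.1.0-py3.7.egg"]

job_def = {}
job_def["libraries"] = [{args.objective: posixpath.join(args.cloud_path, p)}
                        for p in all_packages]
job_def["spark_python_task"] = {"python_file": args.python_file,
                                "parameters": args.parameters}
# job_def == {
#     "libraries": [{"egg": "dbfs:/libraries/myapp-0.1.0-py3.7.egg"}],
#     "spark_python_task": {"python_file": "dbfs:/apps/main.py",
#                           "parameters": ["--env", "dev"]},
# }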
if VERBOSE: print_log("Remote Base URL:", BASEURL) print_log("Local Base path:", LOCALPATH) # Download repodata base = urllib.parse.urlparse(BASEURL) print_log("Creating Connection") if VERBOSE: print_log("Scheme: ", base.scheme, ", hostname:", base.netloc) httpAgent = (http.client.HTTPSConnection if base.scheme == "https" else http.client.HTTPConnection)(base.netloc) print_log("Sending request for repomd.xml") if VERBOSE: print_log("GET", path.join(base.path, "repodata", "repomd.xml")) repomd_response = sendHttpRequest( httpAgent, "GET", path.join(base.path, "repodata", "repomd.xml")) if VERBOSE: print_log("Returned HTTP Status:", repomd_response.status) if repomd_response.status != 200: print_err("HTTP Status " + str(repomd_response.status)) exit(1) print_log("Parsing repomd.xml") repomd_rawcontent = repomd_response.read( int(repomd_response.getheader("Content-Length"))).decode() repomd_content = {} repomd_regex = {} repomd_regex["datatype"] = re.compile(r"\<data type\=\"([^\"]*)\"\>")
def process_image(*args, keycache, config, patientcache):
    """Processing images from the raw dump.

    Takes a single image, downloads it into temporary storage
    and extracts its metadata.

    The metadata is then uploaded here, except if the file already exists.

    If the image file already exists at the correct location, it's
    not passed on to the next step.

    :param obj: the object in question
    :type obj: boto3.resource('s3').ObjectSummary
    :param keycache: the key cache service (provided by bonobo)
    :type keycache: Keycache
    :return: a task name, the original object, and a new key where it
        should be copied within the bucket
    :rtype: (string, boto3.resource('s3').ObjectSummary, string)
    """
    # check file type
    task, obj, _ = args
    if task != "process" or Path(obj.key).suffix.lower() != ".dcm":
        # not an image, don't do anything with it
        return bonobo.constants.NOT_MODIFIED

    # check if work is already done
    image_in_cache = keycache.exists(obj.key)
    image_uuid = Path(obj.key).stem
    metadata_in_cache = keycache.exists(f"{image_uuid}.json")
    if metadata_in_cache and image_in_cache:
        # files exist, nothing to do here
        return

    # download the image
    image_data = PartialDicom(obj.Object()).download()
    if image_data is None:
        # we couldn't read the image data correctly
        logger.warning(
            f"Object '{obj.key}' couldn't be loaded as a DICOM file, skipping!"
        )
        return

    # extract the required data from the image
    patient_id = image_data.PatientID
    study_id = image_data.StudyInstanceUID
    series_id = image_data.SeriesInstanceUID
    group = patientcache.get_group(patient_id)
    if group is not None:
        training_set = group == "training"
    else:
        logger.error(f"Image without patient data: {obj.key}; "
                     + f"included patient ID: {patient_id}; "
                     + "skipping!")
        return

    prefix = (constants.TRAINING_PREFIX
              if training_set else constants.VALIDATION_PREFIX)
    image_type = constants.MODALITY.get(image_data["Modality"].value,
                                        "unknown")
    date = get_date_from_key(obj.key)
    if date:
        # the location of the new files
        new_key = posixpath.join(
            prefix,
            image_type,
            patient_id,
            study_id,
            series_id,
            Path(obj.key).name,
        )
        metadata_key = posixpath.join(
            prefix,
            f"{image_type}-metadata",
            patient_id,
            study_id,
            series_id,
            f"{image_uuid}.json",
        )
        # send off to copy or upload steps
        if not object_exists(new_key):
            yield "copy", obj, new_key
        if not object_exists(metadata_key):
            yield "metadata", metadata_key, image_data