Example #1
    def run(self):
        global packages, LOCALURL, REMOTEURL, failed_files, VERBOSE
        # prnt(" ".join(["wget", "-O", "\""+ os.path.join(LOCALURL, packages[self._args[0]]["location"]) +"\"", urlpath.join(REMOTEURL, packages[self._args[0]]["location"])]))
        # proc = subprocess.Popen(["wget", "-O", "\""+ os.path.join(LOCALURL, packages[self._args[0]]["location"]) +"\"", urlpath.join(REMOTEURL, packages[self._args[0]]["location"])])
        # proc.wait()
        if VERBOSE:
            prnt(" ".join(["wget", "-O", os.path.join(LOCALURL, packages[self._args[0]]["location"]), urlpath.join(REMOTEURL, packages[self._args[0]]["location"])]))
        retcode = subprocess.call([" ".join(["wget", "-O", os.path.join(LOCALURL, packages[self._args[0]]["location"]), urlpath.join(REMOTEURL, packages[self._args[0]]["location"])])], shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, env=os.environ)
        if retcode:
            prnt("ERROR: Failed to download", urlpath.join(REMOTEURL, packages[self._args[0]]["location"]), file=sys.stderr)
            failed_files.append(urlpath.join(REMOTEURL, packages[self._args[0]]["location"]))
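A minimal sketch (not from the original script; the helper name download_package and the use of posixpath.join in place of the urlpath helper are assumptions) of the same wget invocation without shell=True, which sidesteps the quoting problem the commented-out lines were working around:

import os
import posixpath
import subprocess

def download_package(remote_base, local_base, location):
    # Build the local destination path and the remote URL for one package.
    dest = os.path.join(local_base, location)
    url = posixpath.join(remote_base, location)
    # Passing an argument list (no shell) means spaces in paths need no quoting.
    return subprocess.call(["wget", "-O", dest, url],
                           stdout=subprocess.DEVNULL,
                           stderr=subprocess.DEVNULL)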
Example #2
    def generate_id(cls, commit):
        """Calculate action ID."""
        host = os.environ.get('RENKU_DOMAIN') or 'localhost'

        # always set the id by the identifier
        return urllib.parse.urljoin(
            'https://{host}'.format(host=host),
            posixpath.join('/activities',
                           'commit/{commit.hexsha}'.format(commit=commit)))
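For illustration (the host and commit hash are hypothetical), this is how the two joins compose into the final activity ID; urljoin against a bare https://{host} base simply attaches the absolute path built with posixpath.join:

import posixpath
import urllib.parse

host = 'renkulab.io'                                   # hypothetical RENKU_DOMAIN
path = posixpath.join('/activities', 'commit/{}'.format('abc123'))
print(urllib.parse.urljoin('https://{}'.format(host), path))
# https://renkulab.io/activities/commit/abc123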
Example #3
    def generate_id(cls, commitsha):
        """Calculate action ID."""
        host = 'localhost'
        if hasattr(cls, 'client'):
            host = cls.client.remote.get('host') or host
        host = os.environ.get('RENKU_DOMAIN') or host

        # always set the id by the identifier
        return urllib.parse.urljoin(
            'https://{host}'.format(host=host),
            posixpath.join(
                '/activities', 'commit/{commit}'.format(commit=commitsha)
            )
        )
Example #4
def _migrate_broken_dataset_paths(client):
    """Ensure all paths are using correct directory structure."""
    for dataset in client.datasets.values():
        dataset_path = Path(dataset.path)

        expected_path = (client.path / client.renku_datasets_path /
                         Path(quote(dataset.identifier, safe='')))

        # migrate the refs
        ref = LinkReference.create(
            client=client,
            name='datasets/{0}'.format(dataset.short_name),
            force=True,
        )
        ref.set_reference(expected_path / client.METADATA)

        if not dataset_path.exists():
            dataset_path = (client.path / client.renku_datasets_path /
                            uuid.UUID(dataset.identifier).hex)

        if not expected_path.exists():
            shutil.move(str(dataset_path), str(expected_path))
            dataset.path = expected_path
            dataset.__reference__ = expected_path / client.METADATA

        for file_ in dataset.files:
            file_path = Path(file_.path)
            if not file_path.exists() and file_.path.startswith('..'):
                new_path = (client.path / client.renku_datasets_path /
                            dataset.uid / file_path).resolve().relative_to(
                                client.path)

                file_.path = new_path

                _, commit, _ = client.resolve_in_submodules(
                    client.find_previous_commit(file_.path, revision='HEAD'),
                    file_.path,
                )
                host = client.remote.get('host') or 'localhost'
                host = os.environ.get('RENKU_DOMAIN') or host

                # always set the id by the identifier
                file_._id = urllib.parse.urljoin(
                    'https://{host}'.format(host=host),
                    posixpath.join('/blob/{hexsha}/{path}'.format(
                        hexsha=commit.hexsha, path=new_path)))
                file_._label = '{}@{}'.format(new_path, commit.hexsha)

        dataset.to_yaml()
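A small aside (the identifier value and the .renku/datasets layout are assumed for illustration) on why the expected path quotes the identifier with safe='': any '/' inside the identifier would otherwise be treated as a directory separator:

from pathlib import Path
from urllib.parse import quote

identifier = '10.5281/zenodo.1234'        # hypothetical dataset identifier
print(Path('.renku/datasets') / quote(identifier, safe=''))
# .renku/datasets/10.5281%2Fzenodo.1234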
Example #5
def process_image(*args, s3client, patientcache):
    """Processing images from the raw dump

    Takes a single image, downloads it into temporary storage
    and extracts its metadata.

    The metadata is then uploaded at this step, unless the file already exists.

    If the image file already exists at the correct location, it's not passed
    on to the next step.

    Parameters
    ----------
    task, key, _ : tuple[str, str, None]
        A task name (only handling "process" tasks), and an object to act on.
    s3client : S3Client
        The service that handles S3 data access
    patientcache:
        The cache that stores the assignments of patients to groups

    Yields
    ------
    tuple[str, str, str or pydicom.FileDataset]
        Tuple containing the task name ("copy" or "metadata") and other parameters
        depending on the task. "copy" passes on the original object and new location;
        "metadata" passes on the target metadata location and the image data to extract from.
    """
    # check file type
    task, key, _ = args
    image_path = Path(key)
    if task != "process" or image_path.suffix.lower() != ".dcm":
        # not an image, don't do anything with it
        yield bonobo.constants.NOT_MODIFIED
        # Stop here for processing
        return

    image_uuid = image_path.stem

    # download the image
    image_data = PartialDicom(s3client, key).download()
    if image_data is None:
        # we couldn't read the image data correctly
        logger.warning(
            f"Object '{key}' couldn't be loaded as a DICOM file, skipping!"
        )
        return

    # extract the required data from the image
    patient_id = image_data.PatientID
    study_id = image_data.StudyInstanceUID
    series_id = image_data.SeriesInstanceUID
    group = patientcache.get_group(patient_id)
    if group is not None:
        training_set = group == "training"
    else:
        logger.error(
            f"Image without patient data: {key}; "
            + f"included patient ID: {patient_id}; "
            + "skipping!"
        )
        return
    prefix = (
        constants.TRAINING_PREFIX
        if training_set
        else constants.VALIDATION_PREFIX
    )
    image_type = constants.MODALITY.get(
        image_data["Modality"].value, "unknown"
    )

    date = helpers.get_date_from_key(key)
    if date:
        # the location of the new files
        new_key = posixpath.join(
            prefix,
            image_type,
            patient_id,
            study_id,
            series_id,
            image_path.name,
        )
        metadata_key = posixpath.join(
            prefix,
            f"{image_type}-metadata",
            patient_id,
            study_id,
            series_id,
            f"{image_uuid}.json",
        )
        # send off to copy or upload steps
        if not s3client.object_exists(new_key):
            yield "copy", key, new_key
        if not s3client.object_exists(metadata_key):
            yield "metadata", metadata_key, image_data
Example #6
            log.warning("Tool not found: {}".format(path))
            continue

        img = elem.find(
            ".//div[@class='views-field views-field-field-image']//img")
        if img is None:
            log.warning("No image for {}".format(path))
            continue
        sourcename = urlparse(img.attrib["src"]).path.split("/")[-1]
        basename = unquote(sourcename.split(".")[0]).strip()
        if " " in basename:
            name = " ".join([part for part in basename.split(" ")[:-1]])
        else:
            name = basename
        filename = "{} 300.png".format(name)
        filepath = posixpath.join(images_path, filename)
        blob_image = None
        if not posixpath.exists(filepath):
            log.warning(
                "{}: Image file not found: {} ({}). Attempting download".
                format(path, filepath, sourcename))
            with TemporaryDirectory(prefix="euphorieimage") as tmpdir:
                urllib.request.urlretrieve(
                    "{}{}".format(BASE_URL, img.attrib["src"]), filepath)
                try:
                    with open(filepath, "rb") as imagefile:
                        blob_image = NamedBlobImage(data=imagefile.read(),
                                                    filename=filename)
                except Exception as e:
                    log.warning(
                        "Unable to download image from website. Error: {}".
Example #7
    # Is it one or many objects to install as libraries?
    if os.path.isdir(args.library_path):
        # Directory path specified, grab all files of type args.objective
        # TODO: Decide if this should be recursive or not?
        all_packages = [
            p for p in os.listdir(args.library_path)
            if os.path.splitext(p)[1] == '.' + args.objective
        ]
    else:
        all_packages = [args.library_path]

    # Get the Jar's name and its destination folder
    # Replace the job.json's content
    job_def["libraries"] = [{
        args.objective:
        posixpath.join(args.cloud_path, package)
    } for package in all_packages]

    # If it's an egg, we use spark_python_task, otherwise it's spark_jar_task
    objective_task_name = "spark_python_task" if args.objective == "egg" else "spark_jar_task"
    if args.objective == "egg":
        # You need a python_file to run the app
        job_def[objective_task_name] = {"python_file": args.python_file}
    else:
        # You need a main_class_name to run the app
        job_def[objective_task_name] = {"main_class_name": args.main_class}

    # Parameters is an attribute across egg and jar tasks
    if args.parameters:
        job_def[objective_task_name].update({"parameters": args.parameters})
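A hedged illustration (the cloud path and package names are invented) of what the libraries list ends up containing for a jar objective:

import posixpath

objective, cloud_path = 'jar', 'dbfs:/mnt/libs'        # hypothetical arguments
all_packages = ['app.jar', 'deps.jar']
print([{objective: posixpath.join(cloud_path, p)} for p in all_packages])
# [{'jar': 'dbfs:/mnt/libs/app.jar'}, {'jar': 'dbfs:/mnt/libs/deps.jar'}]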
Example #8
if VERBOSE:
    print_log("Remote Base URL:", BASEURL)
    print_log("Local Base path:", LOCALPATH)

# Download repodata
base = urllib.parse.urlparse(BASEURL)
print_log("Creating Connection")
if VERBOSE:
    print_log("Scheme: ", base.scheme, ", hostname:", base.netloc)
httpAgent = (http.client.HTTPSConnection if base.scheme == "https" else
             http.client.HTTPConnection)(base.netloc)

print_log("Sending request for repomd.xml")
if VERBOSE:
    print_log("GET", path.join(base.path, "repodata", "repomd.xml"))
repomd_response = sendHttpRequest(
    httpAgent, "GET", path.join(base.path, "repodata", "repomd.xml"))

if VERBOSE:
    print_log("Returned HTTP Status:", repomd_response.status)
if repomd_response.status != 200:
    print_err("HTTP Status " + str(repomd_response.status))
    exit(1)

print_log("Parsing repomd.xml")
repomd_rawcontent = repomd_response.read(
    int(repomd_response.getheader("Content-Length"))).decode()
repomd_content = {}
repomd_regex = {}
repomd_regex["datatype"] = re.compile(r"\<data type\=\"([^\"]*)\"\>")
Example #9
def process_image(*args, keycache, config, patientcache):
    """ Processing images from the raw dump

    Takes a single image, downloads it into temporary storage
    and extracts its metadata.

    The metadata is then uploaded at this step, unless the file already exists.

    If the image file already exists at the correct location, it's not passed
    on to the next step.

    :param obj: the object in question
    :type obj: boto3.resource('s3').ObjectSummary
    :param keycache: the key cache service (provided by bonobo)
    :type keycache: Keycache
    :return: a task name, the original object, and a new key where it should be copied within the bucket
    :rtype: (string, boto3.resource('s3').ObjectSummary, string)
    """
    # check file type
    task, obj, _ = args
    if task != "process" or Path(obj.key).suffix.lower() != ".dcm":
        # not an image, don't do anything with it
        yield bonobo.constants.NOT_MODIFIED
        return

    # check if work is already done
    image_in_cache = keycache.exists(obj.key)
    image_uuid = Path(obj.key).stem
    metadata_in_cache = keycache.exists(f"{image_uuid}.json")
    if metadata_in_cache and image_in_cache:
        # files exist, nothing to do here
        return

    # download the image
    image_data = PartialDicom(obj.Object()).download()
    if image_data is None:
        # we couldn't read the image data correctly
        logger.warning(
            f"Object '{obj.key}' couldn't be loaded as a DICOM file, skipping!"
        )
        return

    # extract the required data from the image
    patient_id = image_data.PatientID
    study_id = image_data.StudyInstanceUID
    series_id = image_data.SeriesInstanceUID
    group = patientcache.get_group(patient_id)
    if group is not None:
        training_set = group == "training"
    else:
        logger.error(f"Image without patient data: {obj.key}; " +
                     f"included patient ID: {patient_id}; " + "skipping!")
        return
    prefix = (constants.TRAINING_PREFIX
              if training_set else constants.VALIDATION_PREFIX)
    image_type = constants.MODALITY.get(image_data["Modality"].value,
                                        "unknown")

    date = get_date_from_key(obj.key)
    if date:
        # the location of the new files
        new_key = posixpath.join(
            prefix,
            image_type,
            patient_id,
            study_id,
            series_id,
            Path(obj.key).name,
        )
        metadata_key = posixpath.join(
            prefix,
            f"{image_type}-metadata",
            patient_id,
            study_id,
            series_id,
            f"{image_uuid}.json",
        )
        # send off to copy or upload steps
        if not object_exists(new_key):
            yield "copy", obj, new_key
        if not object_exists(metadata_key):
            yield "metadata", metadata_key, image_data