Example No. 1
def update_dataset(zenodo_dataset, conp_dataset, token):
    # When updating a dataset we don't know which files have changed,
    # so we remove all existing files and re-download everything from
    # the latest version of the Zenodo dataset.

    dataset_dir = conp_dataset["directory"]
    dats_dir = os.path.join(dataset_dir, "DATS.json")
    zenodo_tracker_path = os.path.join(dataset_dir,
                                       ".conp-zenodo-crawler.json")

    # Remove all data and DATS.json files
    for file_name in os.listdir(dataset_dir):
        if file_name[0] == "." or file_name == "README.md":
            continue
        api.remove(os.path.join(dataset_dir, file_name), check=False)

    d = api.Dataset(dataset_dir)

    for bucket in zenodo_dataset["files"]:
        download_file(bucket, d, dataset_dir)

    # If DATS.json isn't in downloaded files, create new DATS.json
    if not os.path.isfile(dats_dir):
        create_new_dats(dataset_dir, dats_dir, zenodo_dataset)

    # Add/update .conp-zenodo-crawler.json tracker file
    create_zenodo_tracker(zenodo_tracker_path, zenodo_dataset)

    # Save all changes and push to github
    d.save()
    d.publish(to="github")
Example No. 2
    def check_comparison_dir(self):
        cmpr_path = self.data.comparison_dir
        dl_dset = datalad.Dataset(str(self.data.tests_data_dir))
        if not cmpr_path.exists():
            raise ValueError(
                "The following path does not exist but is required to "
                f"perform a test:{cmpr_path}.\n You may wish to run the "
                "test with the --create_sample_output flag or generate "
                "output for future test sessions with "
                "--save_sample_output. ")
        cmpr_files = list(cmpr_path.glob("**/*"))
        cmpr_files_rel = [f.relative_to(cmpr_path) for f in cmpr_files]

        files_required = [
            f.relative_to(self.data.outdir) for f in self.file_list
        ]
        missing_files = []
        for f in files_required:
            if f not in cmpr_files_rel:
                missing_files.append(str(cmpr_path / f))
        if missing_files:
            m_str = " ".join(missing_files)
            raise ValueError(
                "The following files are missing and are required to "
                f"fully complete the test: {m_str} ")

        need_data = any(p.is_symlink() and not p.exists() for p in cmpr_files)
        if need_data:
            dm.try_data_download([cmpr_path], dl_dset.path, self.data.logger)
Example No. 3
def run_tests(tests_dir, **args_dict):

    check_git_config()
    test_data = datalad.Dataset(str(tests_dir / "afni_ci_test_data"))
    if test_data.repo:
        check_test_data_repo(
            test_data, ignore_dirty_data=args_dict.get("ignore_dirty_data")
        )

    cmd_args = get_test_cmd_args(**args_dict)
    cmd_args = configure_parallelism(cmd_args, args_dict.get("use_all_cores"))
    cmd_args = configure_for_coverage(tests_dir, cmd_args, **args_dict)
    if args_dict.get("build_dir"):
        cmd = generate_cmake_command_as_required(tests_dir, args_dict)
        cmd += f""";ARGS='{' '.join(x for x in cmd_args)}' ninja pytest"""
    else:
        cmd = f"""{sys.executable} -m pytest {' '.join(x for x in cmd_args)}"""

    if args_dict.get("coverage"):
        # append gcovr to assemble coverage report for C code
        cmd += f"; gcovr -s --xml -o {tests_dir}/gcovr_output.xml -r {args_dict['build_dir']}/src"
        # append command for compiling and uploading codecov report
        #
        # apparently there is a security issue with codecov that we must
        # investigate; this is currently NOT being run in the CircleCI
        # tests. It probably should not be used, either, but we should
        # hold a static version of the script that is reliable (which
        # would require occasional checks for updates)

        cmd += "; bash -c 'bash <(curl -s https://codecov.io/bash)'"

    print(f"Executing: {cmd}")
    res = subprocess.run(cmd, shell=True, env=os.environ.copy())
    sys.exit(res.returncode)
Example No. 4
def run_tests(tests_dir, **args_dict):

    check_git_config()
    test_data = datalad.Dataset(str(tests_dir / "afni_ci_test_data"))
    if test_data.repo:
        check_test_data_repo(
            test_data, ignore_dirty_data=args_dict.get("ignore_dirty_data")
        )

    cmd_args = get_test_cmd_args(**args_dict)
    cmd_args = configure_parallelism(cmd_args, args_dict.get("use_all_cores"))
    cmd_args = configure_for_coverage(tests_dir, cmd_args, **args_dict)
    if args_dict.get("build_dir"):
        cmd = generate_cmake_command_as_required(tests_dir, args_dict)
        cmd += f""";ARGS='{' '.join(x for x in cmd_args)}' ninja pytest"""
    else:
        cmd = f"""{sys.executable} -m pytest {' '.join(x for x in cmd_args)}"""

    if args_dict.get("coverage"):
        # append gcovr to assemble coverage report for C code
        cmd += f"; gcovr -s --xml -o {tests_dir}/gcovr_output.xml -r {args_dict['build_dir']}/src"
        # append command for compiling and uploading codecov report
        cmd += "; bash -c 'bash <(curl -s https://codecov.io/bash)'"

    print(f"Executing: {cmd}")
    res = subprocess.run(cmd, shell=True, env=os.environ.copy())
    sys.exit(res.returncode)
Example No. 5
def unlock():
    repo = Repo()
    project: str = project_name2env(repo.working_dir.split("/")[-1])
    token: (str | None) = os.getenv(project + "_ZENODO_TOKEN", None)

    if not token:
        raise Exception(
            f"{project}_ZENODO_TOKEN not found. Cannot inject the Zenodo token into the git-annex urls."
        )

    annex = repo.git.annex
    if repo.active_branch.name != "master":
        raise Exception("Dataset repository not set to branch 'master'")

    if not os.path.isfile(".conp-zenodo-crawler.json"):
        raise Exception("'.conp-zenodo-crawler.json file not found")

    with open(".conp-zenodo-crawler.json", "r") as f:
        metadata = json.load(f)

    # Ensure correct data
    if not metadata["restricted"]:
        raise Exception("Dataset not restricted, no need to unlock")
    if len(metadata["private_files"]["archive_links"]) == 0 and len(
            metadata["private_files"]["files"]) == 0:
        raise Exception("No restricted files to unlock")

    # Set token in archive link URLs
    if len(metadata["private_files"]["archive_links"]) > 0:
        repo.git.checkout("git-annex")
        changes = False
        for link in metadata["private_files"]["archive_links"]:
            for dir_name, dirs, files in os.walk("."):
                for file_name in files:
                    file_path = os.path.join(dir_name, file_name)
                    if ".git" in file_path:
                        continue
                    with open(file_path, "r") as f:
                        s = f.read()
                    if link in s and "access_token" not in s:
                        changes = True
                        s = s.replace(link, link + "?access_token=" + token)
                        with open(file_path, "w") as f:
                            f.write(s)
        if changes:
            repo.git.add(".")
            repo.git.commit("-m", "Unlock dataset")
        repo.git.checkout("master")

    # Set token in non-archive link URLs
    if len(metadata["private_files"]["files"]) > 0:
        datalad = api.Dataset(".")
        for file in metadata["private_files"]["files"]:
            annex("rmurl", file["name"], file["link"])
            annex("addurl", file["link"] + "?access_token=" + token, "--file",
                  file["name"], "--relaxed")
            datalad.save()

    print("Done")
Example No. 6
def create_new_dataset(dataset, token):
    dataset_dir = os.path.join("projects", dataset["title"])
    d = api.Dataset(dataset_dir)
    d.create()
    d.create_sibling_github(("conp-dataset-" + dataset["title"])[0:100],
                            github_login=token,
                            github_passwd=token)
    for file_url in dataset["files"]:
        d.download_url(file_url, archive=True)
        d.publish(to="github")
Example No. 7
def base_dataset(tmpdir_factory):
    skipif.no_datalad()
    import datalad.api as dl
    path = str(tmpdir_factory.mktemp("dataset"))
    ds = dl.Dataset(path).create(force=True)

    create_tree(ds.path, {"foo": "foo",
                          "bar": "bar",
                          "d": {"in": "content\n"}})
    ds.add(".")
    ds.repo.tag("root")
    return ds
Example No. 8
def container_dataset(tmpdir_factory):
    skipif.no_datalad()
    skipif.no_network()

    if "datalad_container" not in external_versions:
        pytest.skip("datalad-container not installed")

    import datalad.api as dl
    path = str(tmpdir_factory.mktemp("container_dataset"))
    ds = dl.Dataset(path).create(force=True)
    ds.containers_add("dc", url="shub://datalad/datalad-container:testhelper")
    return ds
Example No. 9
def get_tests_data_dir(config_obj):
    """Get the path to the test data directory. If the test data directory
    does not exist or is not populated, install with datalad.
    """
    logger = logging.getLogger("Test data setup")

    tests_data_dir = get_test_data_path(config_obj)

    # remote should be configured or something is badly amiss...
    dl_dset = datalad.Dataset(str(tests_data_dir))
    if (
        dl_dset.is_installed()
        and "remote.afni_ci_test_data.url" not in dl_dset.config.keys()
    ):
        for f in dl_dset.pathobj.glob("**/*"):
            try:
                f.chmod(0o700)
            except FileNotFoundError:
                # missing symlink, nothing to worry about
                pass
        logger.warn("Not sure about test data, perhaps you should try removing...")
        raise ValueError("Not sure about test data, perhaps you should try removing...")
        # shutil.rmtree(dl_dset.pathobj)

    # datalad is required and the datalad repository is used for data.
    if not (tests_data_dir / ".datalad").exists():
        try:
            global dl_lock
            dl_lock.acquire()
            if not (tests_data_dir / ".datalad").exists():
                logger.warn("Installing test data")
                datalad.install(
                    str(tests_data_dir),
                    "https://gin.g-node.org/leej3/afni_ci_test_data",
                    recursive=True,
                    on_failure="stop",
                )
        finally:
            dl_lock.release()
    # Needs to be user writeable:
    some_files = [".git/logs/HEAD"]
    for f in some_files:
        data_file = tests_data_dir / f
        if not data_file.exists():
            raise ValueError(
                f"{f} does not exist (parent exists: {data_file.parent.exists()})"
            )
        if not os.access(data_file, os.W_OK):
            raise ValueError(f"{f} is not user writeable ({os.getuid()})")
    return tests_data_dir
Example No. 10
    def download_datalad_repo(self, git_ref="master", ignore_dirty_data=False):
        """
        Makes sure datalad repository is downloaded. If a commit is
        provided this should be checked out. Dirty data (data in the
        repository that has not been committed is ignored if
        ignore_dirty_data is set to True.
        """

        if not self.params["data"].get("url"):
            raise ValueError("A value for url must be provided if the data "
                             "type is datalad_repo ")
        # Get directory name for repository
        dl_dset = datalad.Dataset(str(self.params['data']['location']))
        get_tests_data_dir(dl_dset, dset_url=self.params['data']['url'])
Example No. 11
def create_new_dataset(dataset, token, force, username):
    repo_title = ("conp-dataset-" + dataset["title"])[0:100]
    full_repository = "{}/{}".format(username, repo_title)

    # Check for existing github repo with same name
    if not verify_repository(username, full_repository, token, dataset, force):
        return ""

    dataset_dir = os.path.join("projects", dataset["title"])
    d = api.Dataset(dataset_dir)
    d.create()
    d.no_annex("DATS.json")
    d.no_annex("README.md")
    d.no_annex(".conp-zenodo-crawler.json")
    d.config.add("datalad.log.timestamp", "true")
    d.save()

    r = d.create_sibling_github(repo_title,
                                name="github",
                                github_login=token,
                                github_passwd=token)

    for bucket in dataset["files"]:
        download_file(bucket, d, dataset_dir)

    # Create DATS.json if it doesn't exist
    if not os.path.isfile(os.path.join(dataset_dir, "DATS.json")):
        create_new_dats(dataset_dir, os.path.join(dataset_dir, "DATS.json"),
                        dataset)

    # Create README.md if doesn't exist
    if not os.path.isfile(os.path.join(dataset_dir, "README.md")):
        create_readme(dataset, dataset_dir)

    # Add .conp-zenodo-crawler.json tracker file
    create_zenodo_tracker(
        os.path.join(dataset_dir, ".conp-zenodo-crawler.json"), dataset)

    # Save all changes and push to github
    d.save()
    d.publish(to="github")

    # Add description to Github repo
    add_description(token, repo_title, username, dataset)

    update_gitmodules(dataset_dir, r[0][1].replace(token + "@", ""))

    return d.path
Example No. 12
def try_data_download(file_fetch_list, test_data_dir, logger):
    try:
        global dl_lock
        dl_lock.acquire(poll_intervall=1)
        dl_dset = datalad.Dataset(str(test_data_dir))
        # Fetching the data
        process_for_fetching_data = Process(
            target=dl_dset.get, kwargs={"path": [str(p) for p in file_fetch_list]}
        )

        # attempts should be timed-out to deal with unpredictable stalls.
        process_for_fetching_data.start()
        # logger.debug(f"Fetching data for {test_data_dir}")
        process_for_fetching_data.join(timeout=60)
        if process_for_fetching_data.is_alive():
            # terminate the process.
            process_for_fetching_data.terminate()
            # logger.warn(f"Data fetching timed out for {file_fetch_list}")
            return False
        elif process_for_fetching_data.exitcode != 0:
            # logger.warn(f"Data fetching failed for {file_fetch_list}")
            return False
        else:
            return True
    except (
        IncompleteResultsError,
        ValueError,
        CommandError,
        TimeoutError,
        Timeout,
    ) as err:
        logger.warn(
            f"Datalad download failure ({type(err)}) for {test_data_dir}. Will try again"
        )

        return False

    finally:
        # make sure datalad repo wasn't updated to git annex version 8. Not sure why this is happening
        git_config_file = Path(test_data_dir) / ".git" / "config"
        git_config_file.write_text(
            git_config_file.read_text().replace("version = 8", "version = 7")
        )
        dl_lock.release()
        sleep(random.randint(1, 10))
Example No. 13
def process_path_obj(path_obj, test_data_dir):
    """
    Convert paths to the pathlib Path type and get the data for test_data_dir,
    a datalad repository.

    Args: path_obj (str/pathlib.Path or iterable): Paths as
        strings/pathlib.Path  or non-str iterables with elements of these
        types can be passed as arguments for conversion to Path objects

        test_data_dir (pathlib.Path): An existing datalad repository containing the test data.
    Returns:
        Path or iterable of Paths: path_obj appropriately converted to pathlib Paths
        objects with files in test_data_dir data fetched as required.
    """
    dl_dset = datalad.Dataset(str(test_data_dir))
    if isinstance(path_obj, str):
        path_obj = Path(path_obj)

    if isinstance(path_obj, Path):
        check_file_exists(path_obj, test_data_dir)
        file_fetch_list = generate_fetch_list(path_obj, test_data_dir)
        dl_dset.get(path=file_fetch_list)
        return test_data_dir / path_obj
    elif hasattr(path_obj, "__iter__"):
        file_fetch_list = []
        for input_file in path_obj:
            input_file = Path(input_file)
            check_file_exists(input_file, test_data_dir)
            file_fetch_list = file_fetch_list + generate_fetch_list(
                input_file, test_data_dir
            )

        dl_dset.get(path=file_fetch_list)

        return [test_data_dir / p for p in path_obj]
    else:

        raise TypeError(
            "data_paths must contain values that are of type str or a "
            "non-str iterable type. i.e. list, tuple... "
        )
Example No. 14
def try_data_download(file_fetch_list, test_data_dir, logger):
    try:
        global dl_lock
        dl_lock.acquire(poll_intervall=1)
        dl_dset = datalad.Dataset(str(test_data_dir))
        # Fetching the data
        process_for_fetching_data = Process(
            target=dl_dset.get,
            kwargs={"path": [str(p) for p in file_fetch_list]})

        # attempts should be timed-out to deal with unpredictable stalls.
        process_for_fetching_data.start()
        # logger.debug(f"Fetching data for {test_data_dir}")
        process_for_fetching_data.join(timeout=60)
        if process_for_fetching_data.is_alive():
            # terminate the process.
            process_for_fetching_data.terminate()
            # logger.warn(f"Data fetching timed out for {file_fetch_list}")
            return False
        elif process_for_fetching_data.exitcode != 0:
            # logger.warn(f"Data fetching failed for {file_fetch_list}")
            return False
        else:
            return True
    except (
            IncompleteResultsError,
            ValueError,
            CommandError,
            TimeoutError,
            Timeout,
    ) as err:
        logger.warn(
            f"Datalad download failure ({type(err)}) for {test_data_dir}. Will try again"
        )

        return False

    finally:
        dl_lock.release()
        sleep(random.randint(1, 10))
Example No. 15
def try_data_download(file_fetch_list, test_data_dir):
    global lock
    dl_dset = datalad.Dataset(str(test_data_dir))
    attempt_count = 0
    lock.acquire()
    while attempt_count < 2:
        try:
            # Fetching the data
            process_for_fetching_data = Process(
                target=dl_dset.get,
                kwargs={"path": [str(p) for p in file_fetch_list]})

            # attempts should be timed out to deal with unpredictable stalls.
            process_for_fetching_data.start()
            process_for_fetching_data.join(timeout=30)
            if process_for_fetching_data.is_alive():
                # terminate the process.
                process_for_fetching_data.terminate()
                raise IncompleteResultsError(
                    f"Data fetching timed out for {file_fetch_list}")
            elif process_for_fetching_data.exitcode != 0:
                raise ValueError(f"Data fetching failed for {file_fetch_list}")
            else:
                lock.release()
                return
        except (IncompleteResultsError, CommandError) as e:
            # Try another loop
            attempt_count += 1
            # make sure datalad repo wasn't updated to git annex version 8. Not sure why this is happening
            git_config_file = Path(test_data_dir) / ".git" / "config"
            git_config_file.write_text(git_config_file.read_text().replace(
                "version = 8", "version = 7"))
            continue

    # datalad download attempts failed; release the lock before exiting
    lock.release()
    pytest.exit(
        "Datalad download failed after repeated attempts; you may not be "
        "connected to the internet"
    )
Example No. 16
def test_orc_datalad_run_container(tmpdir, container_dataset, shell,
                                   orc_class):
    import datalad.api as dl
    # Avoid the dataset fixture because the subdataset will make its simplistic
    # cleanup fail.
    ds = dl.Dataset(op.join(str(tmpdir), "ds")).create()
    ds.install(path="subds", source=container_dataset)
    if orc_class == orcs.DataladLocalRunOrchestrator:
        # We need to have the image locally in order to copy it to the
        # non-dataset remote.
        ds.get(op.join("subds", ".datalad", "environments"))
    with chpwd(ds.path):
        orc = orc_class(
            shell, submission_type="local",
            job_spec={"root_directory": op.join(str(tmpdir), "nm-run"),
                      "outputs": ["out"],
                      "container": "subds/dc",
                      "command_str": 'sh -c "ls / >out"'})
        orc.prepare_remote()
        orc.submit()
        orc.follow()
        orc.fetch()
        assert ds.repo.file_has_content("out")
        assert "singularity" in open("out").read()
Example No. 17
    file_paths = []
    for data_file_response in specific_data:
        assert data_file_response[
            'status'] == 'ok', "Requires an 'ok' status, received %s" % (
                data_file_response['status'])
        if data_file_response['type'] == 'file':
            file_paths.append(data_file_response['path'])
    return file_paths


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("dataset_dir",
                        help="Directory to add datalad repository into")
    parser.add_argument("dataset_name", help="Name of datalad repository")
    parser.add_argument("--get_data",
                        nargs='?',
                        default=None,
                        help="Flag indicating whether to get the data")
    args = parser.parse_args()
    print(args)
    ds_name = os.path.basename(args.dataset_name)
    ds_path = os.path.join(args.dataset_dir, ds_name)
    if not os.path.exists(ds_path):
        ds, _ignore = install_dataset(args.dataset_name, args.dataset_dir)
    else:
        ds = api.Dataset(ds_path)
    subjects, datatypes = get_type_neuro_data(ds_path)
    if args.get_data is not None:
        file_paths = get_dataset_data(ds, args.get_data)
Example No. 18
import sys
import datalad.api as dl

ds = dl.Dataset(sys.argv[1])
repo = ds.repo

# the actual repository should be separately accessible
# (via webserver) and is therefore placed into a dedicated
# subdataset, which we can publish individually. we can also
# further subdivide it for large repos
# do not register immediate in the superdataset, but consolidate
# in a single commit at the end
dl.create(ds.pathobj / 'repo')

# destination for the reprepro config
(ds.pathobj / 'conf').mkdir()
# we want the config to be in git
repo.call_annex(['config', '--set', 'annex.largefiles', 'exclude=conf/*'])
# establish basic config for repository and reprepro behavior
(ds.pathobj / 'conf' / 'options').write_text("""\
# we want repropro to ask for a key passphrase and not just die
ask-passphrase
# tell reprepro where the repository root is (root of the subdataset)
outdir +b/repo
""")
# the DB files written and read by reprepro need special handling
# we need to keep them unlocked (for reprepro to function normally
# without datalad), but we also do not want them in git, and we also
# cannot fully ignore them: make sure the anything in db/ is tracked
# but always unlocked
repo.call_annex(['config', '--set', 'annex.addunlocked', 'include=db/*'])
Example No. 19
## Datalad has a Python API!
One particularly nice aspect of datalad is that it has a Python API, which means that anything you can do with datalad on the command line can also be done in Python. See the details of the datalad [Python API](http://docs.datalad.org/en/latest/modref.html).

For example, suppose you would like to clone a data repository, such as the Localizer dataset. You can run `dl.clone(source=url, path=location)`. Make sure you set `localizer_path` to the location where you would like the Localizer repository installed.

import os
import glob
import datalad.api as dl
import pandas as pd

localizer_path = '/Users/lukechang/Dropbox/Dartbrains/data/Localizer'

dl.clone(source='https://gin.g-node.org/ljchang/Localizer', path=localizer_path)


We can now create a dataset instance using `dl.Dataset(path_to_data)`.

ds = dl.Dataset(localizer_path)

How much of the dataset have we downloaded?  We can check the status of the annex using `ds.status(annex='all')`.

results = ds.status(annex='all')

Looks like it's empty, which makes sense since we only cloned the dataset. 
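
Each record returned by `ds.status` is a plain dictionary, so a quick way to inspect them (a minimal sketch, assuming the `results` variable above and `pandas` imported as `pd`) is to load the records into a DataFrame:

# Sketch: tabulate the status records to see per-file annex information
status_df = pd.DataFrame(results)
status_df.head()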

Now we need to get some data. Let's start with something small to play with first.

Let's use `glob` to find all of the tab-delimited confound data generated by fmriprep. 

file_list = glob.glob(os.path.join(localizer_path, '*', 'fmriprep', '*', 'func', '*tsv'))
file_list.sort()
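
The `glob` call only finds the paths; in a freshly cloned dataset the annexed files are not yet present locally, so their content still has to be fetched before they can be read. A minimal sketch, using the `ds` and `file_list` defined above:

# Fetch the annexed content for the confound files, then load one with pandas
ds.get(file_list)
confounds = pd.read_csv(file_list[0], sep='\t')
confounds.head()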
Example No. 20
def ProcessFiles(graph, scan_type, output_directory, project_location, args):
    '''
    This function will essentially cycle through the acquisition objects in the NIDM file loaded into graph
    and depending on the scan_type will try and copy the image to the output_directory
    '''

    if scan_type == Constants.NIDM_MRI_DIFFUSION_TENSOR.uri:
        bids_ext = 'dwi'
    elif scan_type == Constants.NIDM_MRI_ANATOMIC_SCAN.uri:
        bids_ext = 'anat'
    elif scan_type == Constants.NIDM_MRI_FUNCTION_SCAN.uri:
        bids_ext = 'func'

    # query NIDM document for acquisition entity "subjects" with predicate nidm:hasImageUsageType and object scan_type
    for acq in graph.subjects(predicate=URIRef(
            Constants.NIDM_IMAGE_USAGE_TYPE.uri),
                              object=URIRef(scan_type)):
        # first see if file exists locally.  Get nidm:Project prov:Location and append the nfo:Filename of the image
        # from the acq acquisition entity.  If that file doesn't exist try the prov:Location in the func acq
        # entity and see if we can download it from the cloud

        # get acquisition uuid from entity uuid
        temp = graph.objects(subject=acq,
                             predicate=Constants.PROV['wasGeneratedBy'])
        for item in temp:
            activity = item
        # get participant ID with sio:Subject role in anat_acq qualified association
        part_id = GetParticipantIDFromAcquisition(
            nidm_file_list=[args.rdf_file], acquisition=activity)

        # make BIDS sub directory
        if 'sub' in (part_id['ID'].values)[0]:
            sub_dir = join(output_directory, (part_id['ID'].values)[0])
        else:
            sub_dir = join(output_directory,
                           "sub-" + (part_id['ID'].values)[0])
        sub_filename_base = "sub-" + (part_id['ID'].values)[0]
        if not os.path.exists(sub_dir):
            os.makedirs(sub_dir)

        # make BIDS scan type directory (bids_ext) directory
        if not os.path.exists(join(sub_dir, bids_ext)):
            os.makedirs(join(sub_dir, bids_ext))

        for filename in graph.objects(subject=acq,
                                      predicate=URIRef(
                                          Constants.NIDM_FILENAME.uri)):
            # check if file exists
            for location in project_location:
                # if MRI exists in this location then copy and rename
                if isfile((location[0] + filename).lstrip("file:")):
                    # copy and rename file to be BIDS compliant
                    copyfile((location[0] + filename).lstrip("file:"),
                             join(sub_dir, bids_ext,
                                  sub_filename_base + splitext(filename)[1]))
                    continue
            # if the file wasn't accessible locally, try with the prov:Location in the acq
            for location in graph.objects(subject=acq,
                                          predicate=URIRef(
                                              Constants.PROV['Location'])):
                # try to download the file and rename
                ret = GetImageFromURL(location)
                if ret == -1:
                    print(
                        "ERROR! Can't download file: %s from url: %s, trying to copy locally...."
                        % (filename, location))
                    if "file" in location:
                        location = str(location).lstrip("file:")
                        print("Trying to copy file from %s" % (location))
                        try:
                            copyfile(
                                location,
                                join(output_directory, sub_dir, bids_ext,
                                     basename(filename)))

                        except:
                            print(
                                "ERROR! Failed to find file %s on filesystem..."
                                % location)
                            if not args.no_downloads:
                                try:
                                    print(
                                        "Running datalad get command on dataset: %s"
                                        % location)
                                    dl.Dataset(os.path.dirname(location)).get(
                                        recursive=True, jobs=1)

                                except:
                                    print(
                                        "ERROR! Datalad returned error: %s for dataset %s."
                                        % (sys.exc_info()[0], location))
                                    GetImageFromAWS(location=location,
                                                    output_file=join(
                                                        output_directory,
                                                        sub_dir, bids_ext,
                                                        basename(filename)),
                                                    args=args)

                else:
                    # copy temporary file to BIDS directory
                    copyfile(
                        ret,
                        join(output_directory, sub_dir, bids_ext,
                             basename(filename)))

                # if we were able to copy the image file then add the json sidecar file with additional metadata
                # available in the NIDM file
                if isfile(
                        join(output_directory, sub_dir, bids_ext,
                             basename(filename))):
                    # get rest of metadata for this acquisition and store in sidecar file
                    if "gz" in basename(filename):
                        image_filename = splitext(
                            splitext(basename(filename))[0])[0]
                    else:
                        image_filename = splitext(basename(filename))[0]
                    AddMetadataToImageSidecar(graph_entity=acq,
                                              graph=graph,
                                              output_directory=join(
                                                  output_directory, sub_dir,
                                                  bids_ext),
                                              image_filename=image_filename)

            # if this is a DWI scan then we should copy over the b-value and b-vector files
            if bids_ext == 'dwi':
                # search for entity uuid with rdf:type nidm:b-value that was generated by activity
                query = """
                    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
                    PREFIX prov: <http://www.w3.org/ns/prov#>
                    PREFIX nidm: <http://purl.org/nidash/nidm#>
    
                    SELECT DISTINCT ?entity
                        WHERE {
                            ?entity rdf:type <http://purl.org/nidash/nidm#b-value> ;
                                prov:wasGeneratedBy <%s> .
                        }""" % activity
                # print(query)
                qres = graph.query(query)

                for row in qres:
                    bval_entity = str(row[0])

                # if the file wasn't accessible locally, try with the prov:Location in the acq
                for location in graph.objects(subject=URIRef(bval_entity),
                                              predicate=URIRef(
                                                  Constants.PROV['Location'])):
                    # try to download the file and rename
                    ret = GetImageFromURL(location)
                    if ret == -1:
                        print(
                            "ERROR! Can't download file: %s from url: %s, trying to copy locally...."
                            % (filename, location))
                        if "file" in location:
                            location = str(location).lstrip("file:")
                            print("Trying to copy file from %s" % (location))
                            try:
                                copyfile(
                                    location,
                                    join(output_directory, sub_dir, bids_ext,
                                         basename(location)))
                            except:
                                print(
                                    "ERROR! Failed to find file %s on filesystem..."
                                    % location)
                                if not args.no_downloads:
                                    try:
                                        print(
                                            "Running datalad get command on dataset: %s"
                                            % location)
                                        dl.Dataset(
                                            os.path.dirname(location)).get(
                                                recursive=True, jobs=1)

                                    except:
                                        print(
                                            "ERROR! Datalad returned error: %s for dataset %s."
                                            % (sys.exc_info()[0], location))
                                        GetImageFromAWS(
                                            location=location,
                                            output_file=join(
                                                output_directory, sub_dir,
                                                bids_ext, basename(location)),
                                            args=args)
                # search for entity uuid with rdf:type nidm:b-vector that was generated by activity
                query = """
                    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
                    PREFIX prov: <http://www.w3.org/ns/prov#>
                    PREFIX nidm: <http://purl.org/nidash/nidm#>

                    SELECT DISTINCT ?entity
                        WHERE {
                            ?entity rdf:type <http://purl.org/nidash/nidm#b-vector> ;
                                prov:wasGeneratedBy <%s> .
                        }""" % activity
                # print(query)
                qres = graph.query(query)

                for row in qres:
                    bvec_entity = str(row[0])

                # if the file wasn't accessible locally, try with the prov:Location in the acq
                for location in graph.objects(subject=URIRef(bvec_entity),
                                              predicate=URIRef(
                                                  Constants.PROV['Location'])):
                    # try to download the file and rename
                    ret = GetImageFromURL(location)
                    if ret == -1:
                        print(
                            "ERROR! Can't download file: %s from url: %s, trying to copy locally...."
                            % (filename, location))
                        if "file" in location:
                            location = str(location).lstrip("file:")
                            print("Trying to copy file from %s" % (location))
                            try:
                                copyfile(
                                    location,
                                    join(output_directory, sub_dir, bids_ext,
                                         basename(location)))
                            except:
                                print(
                                    "ERROR! Failed to find file %s on filesystem..."
                                    % location)
                                if not args.no_downloads:
                                    try:
                                        print(
                                            "Running datalad get command on dataset: %s"
                                            % location)
                                        dl.Dataset(
                                            os.path.dirname(location)).get(
                                                recursive=True, jobs=1)

                                    except:
                                        print(
                                            "ERROR! Datalad returned error: %s for dataset %s."
                                            % (sys.exc_info()[0], location))
                                        GetImageFromAWS(
                                            location=location,
                                            output_file=join(
                                                output_directory, sub_dir,
                                                bids_ext, basename(location)),
                                            args=args)
### Datalad has a Python API!
One particularly nice aspect of datalad is that it has a Python API, which means that anything you can do with datalad on the command line can also be done in Python. See the details of the datalad [Python API](http://docs.datalad.org/en/latest/modref.html).

For example, suppose you would like to clone a data repository, such as the Sherlock dataset. You can run `dl.clone(source=url, path=location)`. Make sure you set `sherlock_path` to the location where you would like the Sherlock repository installed.

import os
import glob
import datalad.api as dl
import pandas as pd

sherlock_path = '/Users/lukechang/Downloads/Sherlock'

dl.clone(source='https://gin.g-node.org/ljchang/Sherlock', path=sherlock_path)


We can now create a dataset instance using `dl.Dataset(path_to_data)`.

ds = dl.Dataset(sherlock_path)

How much of the dataset have we downloaded?  We can check the status of the annex using `ds.status(annex='all')`.

results = ds.status(annex='all')

Looks like it's empty, which makes sense since we only cloned the dataset. 

Now we need to get some data. Let's start with something small to play with first.

Let's use `glob` to find all of the tab-delimited confound data generated by fmriprep. 

file_list = glob.glob(os.path.join(sherlock_path, 'fmriprep', '*', 'func', '*tsv'))
file_list.sort()
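
As above, `glob` only locates the annexed paths; content must be fetched with `ds.get` before use, and it can be dropped again afterwards to reclaim disk space. A minimal sketch using the `ds` and `file_list` defined above:

# Fetch one confound file, then drop its content again to free local space
ds.get(file_list[0])
ds.drop(file_list[0])
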
%matplotlib inline

### Data
This tutorial will be using the **Sherlock** dataset and will require downloading the cropped and denoised **hdf5** files, the annotations file `Sherlock_Segments_1000_NN_2017.xlsx`, and the preprocessed video text file `video_text.npy`.

You will want to change `datadir` to wherever you have installed the Sherlock datalad repository (e.g. `~/data`). We will initialize a datalad dataset instance and get the files we need for this tutorial. If you've already downloaded everything, this cell should execute quickly. See the [Download Data Tutorial](http://naturalistic-data.org/features/notebooks/Download_Data.html) for more information about how to install and use datalad.

datadir = '/Volumes/Engram/Data/Sherlock'

# If dataset hasn't been installed, clone from GIN repository
if not os.path.exists(datadir):
    dl.clone(source='https://gin.g-node.org/ljchang/Sherlock', path=datadir)

# Initialize dataset
ds = dl.Dataset(datadir)

# Get Cropped & Denoised HDF5 Files
result = ds.get(lsdir(os.path.join(datadir, 'fmriprep', '*', 'func', '*crop*hdf5')))

# Get Annotation File
result = ds.get(os.path.join(datadir, 'onsets', 'Sherlock_Segments_1000_NN_2017.xlsx'))

# Get Preprocessed Video Text
result = ds.get(os.path.join(datadir, 'stimuli', 'video_text.npy'))

## ROI responses while viewing Sherlock

Following the [functional alignment tutorial](http://naturalistic-data.org/features/notebooks/Functional_Alignment.html), we'll select out voxels in early visual cortex from the *Sherlock* dataset.  We'll also examine primary auditory cortex and motor cortex responses.  Then we'll apply the HyperTools pipeline to the dataset and visualize the responses within each ROI as a 3D image. Note you could also work with the Average ROI csv files as we did with the Dynamic Correlation tutorial. Here, we will load the full dataset and manually extract ROIs.

mask = Brain_Data('https://neurovault.org/media/images/8423/k50_2mm.nii.gz')
data_dir_paranoia = '/Volumes/Engram/Data/Paranoia/'
data_dir_sherlock = '/Volumes/Engram/Data/Sherlock/'

paranoia_audio = os.path.join(data_dir_paranoia, 'stimuli', 'stimuli_story1_audio.wav')
sherlock_video = os.path.join(data_dir_sherlock, 'stimuli','stimuli_Sherlock.m4v')

# If datasets haven't been installed, clone from GIN repository
if not os.path.exists(data_dir_paranoia):
    dl.clone(source='https://gin.g-node.org/ljchang/Paranoia', path=data_dir_paranoia)

if not os.path.exists(data_dir_sherlock):
    dl.clone(source='https://gin.g-node.org/ljchang/Sherlock', path=data_dir_sherlock)

# Initialize dataset
ds_paranoia = dl.Dataset(data_dir_paranoia)
ds_sherlock = dl.Dataset(data_dir_sherlock)

# Get Paranoia story
result = ds_paranoia.get(paranoia_audio)

# Get Sherlock video
result = ds_sherlock.get(sherlock_video)



## Getting Started

The best way to see what *pliers* can offer is to jump right into an example. 

### Example 1: Audio RMS
plt.rc('xtick', labelsize=smallsize); plt.rc('ytick', labelsize=smallsize); plt.rc('legend', fontsize=mediumsize)
plt.rc('figure', titlesize=largesize); plt.rc('axes', labelsize=mediumsize); plt.rc('axes', titlesize=mediumsize)

### Data
This tutorial will be using the **Sherlock** dataset and will require downloading the cropped & denoised **.nii.gz** files. The tutorial will be mostly working with spatial patterns within the angular gyrus, so if you would like to get started with the tutorial right away without waiting for all of the nifti files to load, you can download the masked data as a `.npy` file from [figshare: Sherlock data for OHBM](https://figshare.com/articles/Sherlock_data_for_OHBM/12436955). 

You will want to change `data_dir` to wherever you have installed the Sherlock datalad repository. We will initialize a datalad dataset instance and get the files we need for this tutorial. If you've already downloaded everything, you can skip this cell. See the [Download Data Tutorial](http://naturalistic-data.org/features/notebooks/Download_Data.html) for more information about how to install and use datalad.

data_dir = '/Volumes/Emily_MyPassport2TB/Sherlock/'

# If dataset hasn't been installed, clone from GIN repository
if not os.path.exists(data_dir):
    dl.clone(source='https://gin.g-node.org/ljchang/Sherlock', path=data_dir)

# Initialize dataset
ds = dl.Dataset(data_dir)

# Get Denoised nifti Files
result = ds.get(glob.glob(os.path.join(data_dir, 'fmriprep', '*', 'func', f'*denoise*nii.gz')))

### 0. Load Angular Gyrus data

From the angular gyrus (area PG from (Eickhoff et al., 2005)), we'll load movie data from all subjects, and recall data from one subject. Subjects were watching the first hour of [A Study in Pink](https://en.wikipedia.org/wiki/A_Study_in_Pink) (here we are loading only the first half of this data), and then freely recalled the narrative. Please refer to [Chen et al. (2017)](https://doi.org/10.1038/nn.4450) to learn more about this dataset.

We can load this data from the nii files by applying an angular gyrus mask, which we then cache into a numpy file to speed up loading in the future. If you'd like to skip this nii-loading step (which can be slow), you can download the npy files from [figshare: Sherlock data for OHBM](https://figshare.com/articles/Sherlock_data_for_OHBM/12436955).

mask = Brain_Data('https://neurovault.org/media/images/8423/AG_mask.nii.gz').to_nifti()

if (not os.path.exists(data_dir + 'Sherlock_AG_movie.npy') or
    not os.path.exists(data_dir + 'Sherlock_AG_recall.npy')):
    movie = []
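    # A hedged sketch (not the original notebook code) of the caching loop
    # described above, assuming nltools' Brain_Data, numpy imported as np,
    # and the fmriprep file layout used earlier in this tutorial:
    for f in sorted(glob.glob(os.path.join(data_dir, 'fmriprep', '*', 'func', '*denoise*nii.gz'))):
        sub = Brain_Data(f)                      # load one denoised run
        movie.append(sub.apply_mask(mask).data)  # keep only angular gyrus voxels
    np.save(data_dir + 'Sherlock_AG_movie.npy', np.array(movie))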
Example No. 25
import os
import glob
import datalad.api as dl
import pandas as pd

localizer_path = '/Users/lukechang/Dropbox/Dartbrains/data/Localizer'

dl.clone(source='https://gin.g-node.org/ljchang/Localizer', path=localizer_path)


# We can now create a dataset instance using `dl.Dataset(path_to_data)`.

# In[6]:


ds = dl.Dataset(localizer_path)


# How much of the dataset have we downloaded?  We can check the status of the annex using `ds.status(annex='all')`.

# In[12]:


results = ds.status(annex='all')


# Looks like it's empty, which makes sense since we only cloned the dataset. 
# 
# Now we need to get some data. Let's start with something small to play with first.
# 
# Let's use `glob` to find all of the tab-delimited confound data generated by fmriprep. 
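
# A minimal sketch of that step (mirroring Example No. 19 above; the glob
# pattern and the ds.get call are assumptions based on that listing):

file_list = glob.glob(os.path.join(localizer_path, '*', 'fmriprep', '*', 'func', '*tsv'))
file_list.sort()
ds.get(file_list)  # fetch the annexed content before reading the files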
Example No. 26
    def prepare_remote(self):
        """Prepare dataset sibling on remote.
        """
        if not self.ds.repo.get_active_branch():
            # publish() fails when HEAD is detached.
            raise OrchestratorError(
                "You must be on a branch to use the {} orchestrator".format(
                    self.name))
        if not self.session.exists(self.root_directory):
            self.session.mkdir(self.root_directory, parents=True)

        resource = self.resource
        session = self.session

        inputs = list(self.get_inputs())
        if isinstance(session, SSHSession):
            if resource.key_filename:
                dl_version = external_versions["datalad"]
                if dl_version < "0.11.3":
                    # Connecting will probably fail because `key_filename` is
                    # set, but we have no way to tell DataLad about it.
                    lgr.warning(
                        "DataLad version %s detected. "
                        "0.11.3 or greater is required to use an "
                        "identity file not specified in ~/.ssh/config",
                        dl_version)
                # Make the identity file available to 'datalad sshrun' even if
                # it is not configured in .ssh/config. This is particularly
                # important for AWS keys.
                os.environ["DATALAD_SSH_IDENTITYFILE"] = resource.key_filename
                from datalad import cfg
                cfg.reload(force=True)

            sshurl = _format_ssh_url(
                resource.user,
                # AWS resource does not have host attribute.
                getattr(resource, "host", None) or session.connection.host,
                getattr(resource, "port", None),
                self.working_directory)

            # TODO: Add one level deeper with reckless clone per job to deal
            # with concurrent jobs?
            if not session.exists(self.working_directory):
                remotes = self.ds.repo.get_remotes()
                if resource.name in remotes:
                    raise OrchestratorError(
                        "Remote '{}' unexpectedly exists. "
                        "Either delete remote or rename resource.".format(
                            resource.name))

                self.ds.create_sibling(sshurl,
                                       name=resource.name,
                                       recursive=True)
                since = None  # Avoid since="" for non-existing repo.
            else:
                remote_branch = "{}/{}".format(
                    resource.name, self.ds.repo.get_active_branch())
                if self.ds.repo.commit_exists(remote_branch):
                    since = ""
                else:
                    # If the remote branch doesn't exist yet, publish will fail
                    # with since="".
                    since = None

            from datalad.support.exceptions import IncompleteResultsError
            try:
                self.ds.publish(to=resource.name, since=since, recursive=True)
            except IncompleteResultsError:
                raise OrchestratorError(
                    "'datalad publish' failed. Try running "
                    "'datalad update -s {} --merge --recursive' first".format(
                        resource.name))

            self._fix_up_dataset()

            if inputs:
                lgr.info("Making inputs available")
                try:
                    # TODO: Whether we try this `get` should be configurable.
                    self._execute_in_wdir("datalad get {}".format(
                        # FIXME: This should use something like
                        # execute_command_batch.
                        " ".join(map(shlex_quote, inputs))))
                except OrchestratorError:
                    # Should use --since for existing repo, but it doesn't seem
                    # to sync wrt content.
                    self.ds.publish(to=resource.name,
                                    path=inputs,
                                    recursive=True)
        elif resource.type == "shell":
            import datalad.api as dl
            if not session.exists(self.working_directory):
                dl.install(self.working_directory, source=self.ds.path)

            self.session.execute_command("git push '{}' HEAD:{}-base".format(
                self.working_directory, self.job_refname))
            self._checkout_target()

            if inputs:
                installed_ds = dl.Dataset(self.working_directory)
                installed_ds.get(inputs)
        else:
            # TODO: Handle more types?
            raise OrchestratorError("Unsupported resource type {}".format(
                resource.type))

        if not session.exists(self.meta_directory):
            session.mkdir(self.meta_directory, parents=True)
Example No. 27
#!/usr/bin/env python3

import os.path as op
import sys
import xml.dom.minidom

import datalad.api as dl
from datalad.support.network import download_url

ds = dl.Dataset(op.dirname(op.dirname(op.realpath(__file__))))

if 'datalad' not in ds.repo.get_remotes():
    from datalad.customremotes.base import init_datalad_remote
    init_datalad_remote(ds.repo, 'datalad', autoenable=True)

# doc = xml.dom.minidom.parse('/tmp/outi-7T.xml')
topurl = 'https://db.humanconnectome.org/data/archive/projects/HCP_Resources/resources/7T_Movies/'
doc = xml.dom.minidom.parseString(download_url(topurl))

files = [{f: e.getAttribute(f)
          for f in ('ID', 'URI', 'digest', 'name')}
         for e in doc.getElementsByTagName("cat:entry")]
# from pprint import pprint
# pprint(files)
added = list(
    ds.addurls(files, topurl + 'files/{URI}', '{URI}', fast=False, save=False))
print(f"Processed {len(added)} entries")