Example #1
def test_yield_entities_from_filter(dataservice_setup):
    n = 2
    populate_data(n)

    si = 1
    filter = {"study_id": f"SD_{si}1111111"}

    # Get all participants from one study
    endpoint = "participants"
    for ps in [
            list(yield_entities_from_filter(DATASERVICE_URL, endpoint,
                                            filter)),
            list(yield_entities(DATASERVICE_URL, endpoint, filter)),
    ]:
        assert len(ps) == n
        for p in ps:
            assert p["kf_id"].startswith(f"PT_{si}")

    # Get all biospecimens from one study
    endpoint = "biospecimens"
    for bs in [
            list(yield_entities_from_filter(DATASERVICE_URL, endpoint,
                                            filter)),
            list(yield_entities(DATASERVICE_URL, endpoint, filter)),
    ]:
        assert len(bs) == (n * n)
        for b in bs:
            assert b["kf_id"].startswith(f"BS_{si}")
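A minimal standalone sketch of the same scrape outside the test harness; the import path, host url, and study KFID below are assumptions rather than values taken from the example:

# Hypothetical direct usage; all values are placeholders.
from kf_utils.dataservice.scrape import (  # assumed import path
    yield_entities,
    yield_entities_from_filter,
)

api_url = "https://kf-api-dataservice.kidsfirstdrc.org"  # assumed host
study_filter = {"study_id": "SD_12345678"}               # hypothetical study KFID

# The test above asserts that these two calls yield the same participants
participants = list(
    yield_entities_from_filter(api_url, "participants", study_filter)
)
same_participants = list(yield_entities(api_url, "participants", study_filter))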
Example #2
def entities_dict(self, endpoint, filt):
    # Key each scraped entity by its KFID; the trailing True enables the
    # progress display on yield_entities
    return {
        e["kf_id"]: e
        for e in yield_entities(
            self.api_url, endpoint, filt, True
        )
    }
Example #3
def find_descendants_by_filter(
    api_url,
    endpoint,
    filter,
    ignore_gfs_with_hidden_external_contribs,
    kfids_only=True,
    db_url=None,
):
    """
    Similar to find_descendants_by_kfids but starts with an API endpoint filter
    instead of a list of endpoint KFIDs.
    """
    things = list(yield_entities(api_url, endpoint, filter,
                                 show_progress=True))
    if kfids_only:
        things = [t["kf_id"] for t in things]

    descendants = find_descendants_by_kfids(
        db_url or api_url,
        endpoint,
        things,
        ignore_gfs_with_hidden_external_contribs,
        kfids_only=kfids_only,
    )
    return descendants
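A hedged usage sketch for this entry point; the host url and study KFID are placeholders:

# Hypothetical call: gather the KFIDs of everything descended from one
# study's participants. The url and filter values are placeholders.
descendants = find_descendants_by_filter(
    "https://kf-api-dataservice.kidsfirstdrc.org",
    "participants",
    {"study_id": "SD_12345678"},
    ignore_gfs_with_hidden_external_contribs=True,  # per the docstring, use True before unhiding
    kfids_only=True,
)
# The result maps each endpoint to a set of discovered KFIDs
for endpoint, kfids in descendants.items():
    print(endpoint, len(kfids))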
Example #4
def test_yield_entities_from_kfids(dataservice_setup):
    n = 2
    populate_data(n)

    kfid_set = {"SD_11111111", "PT_11111111", "BS_11111111"}
    for es in [
            list(yield_entities_from_kfids(DATASERVICE_URL, kfid_set)),
            list(yield_entities(DATASERVICE_URL, None, kfid_set)),
    ]:
        assert len(es) == len(kfid_set)
        found_kfids = {e["kf_id"] for e in es}
        assert kfid_set == found_kfids
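Outside the test, the same KFID-based scrape might look like this, assuming the scrape functions are in scope as above; the host url and KFIDs are placeholders:

# Hypothetical KFIDs spanning several entity types; as the test asserts,
# yield_entities(host, None, kfids) behaves like yield_entities_from_kfids.
api_url = "https://kf-api-dataservice.kidsfirstdrc.org"  # assumed host
kfids = {"SD_12345678", "PT_12345678", "BS_12345678"}
entities_by_kfid = {
    e["kf_id"]: e for e in yield_entities_from_kfids(api_url, kfids)
}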
Example #5
def merge_s3_and_kf_gfs(ds_url,
                        study_kfid,
                        study_bucket,
                        exclude_s3_keypaths=None):
    """Return file data from S3 and the Kids First dataservice merged together
    on external_id to see which S3 files have been loaded into the data service
    and which loaded files no longer exist.

    Note: You must be able to query both S3 and the dataservice
        (VPN + chopaws if running locally)

    :param ds_url: dataservice API host url
    :type ds_url: string
    :param study_kfid: Dataservice KFID of the study
    :type study_kfid: string
    :param study_bucket: Amazon S3 bucket containing study files
    :type study_bucket: string
    :param exclude_s3_keypaths: S3 keys starting with these strings will be
        excluded, optional, defaults to None
    :type exclude_s3_keypaths: string or iterable of strings
    :return: list of dicts
    """
    # Files from the dataservice
    # We use the API because direct DB queries won't give us the gen3 fields
    kf = {
        e["external_id"]: {
            f"kf_{k.lower()}": v
            for k, v in e.items()
            if k not in ["_links", "access_urls", "urls"]
        }
        for e in yield_entities(
            ds_url,
            "genomic-files",
            {"study_id": study_kfid},
            show_progress=True,
        )
    }

    # Files from S3
    s3 = {
        "s3://" + o["Bucket"] + "/" + o["Key"]:
        {f"s3_{k.lower()}": v
         for k, v in o.items()}
        for o in fetch_bucket_obj_info(
            study_bucket,
            drop_folders=True,
        )
    }

    # Sadly it's muuuuch harder to exclude paths on the S3 request side because
    # the S3 API doesn't support it. So we're stuck for now waiting for
    # potentially thousands of pagination requests that we don't care about,
    # and then we remove them here.
    if exclude_s3_keypaths:
        if isinstance(exclude_s3_keypaths, str):
            exclude_s3_keypaths = (exclude_s3_keypaths, )
        elif exclude_s3_keypaths is not None:
            exclude_s3_keypaths = tuple(exclude_s3_keypaths)

        s3 = {
            k: v
            for k, v in s3.items()
            if not v["s3_key"].startswith(exclude_s3_keypaths)
        }

    # Merge them together
    s3kf = defaultdict(dict, s3)
    for k, v in kf.items():
        s3kf[k].update(v)

    return list(s3kf.values())
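A hedged follow-on sketch showing one way the merged records could be partitioned; the kf_kf_id and s3_key keys follow from the kf_/s3_ prefixing above, while the url, study KFID, bucket, and key prefix are placeholders:

# Hypothetical partitioning of the merged output
merged = merge_s3_and_kf_gfs(
    "https://kf-api-dataservice.kidsfirstdrc.org",  # assumed dataservice host
    "SD_12345678",                                  # hypothetical study KFID
    "my-study-bucket",                              # hypothetical bucket
    exclude_s3_keypaths="source/scratch/",          # hypothetical key prefix
)
in_s3_not_loaded = [m for m in merged if "kf_kf_id" not in m]
loaded_missing_from_s3 = [m for m in merged if "s3_key" not in m]
loaded_and_present = [m for m in merged if "kf_kf_id" in m and "s3_key" in m]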
Example #6
def find_descendants_by_kfids(
    api_or_db_url,
    parent_endpoint,
    parents,
    ignore_gfs_with_hidden_external_contribs,
    kfids_only=True,
):
    """
    Given a set of KFIDs from a specified endpoint, find the KFIDs of all
    descendant entities.

    Given a family kfid, the result will be all participants in that family,
    all of the participants' biospecimens/outcomes/phenotypes/etc, all of
    their biospecimens' resultant genomic files, and all of the genomic files'
    sequencing experiments and read groups.

    Given a set of genomic file kfids, the result will be just their sequencing
    experiments and read groups.

    If you plan to make the discovered descendants visible, you should set
    ignore_gfs_with_hidden_external_contribs=True so that you don't accidentally
    unhide a genomic file that has hidden contributing biospecimens.

    If you plan to make the discovered descendants hidden, you should set
    ignore_gfs_with_hidden_external_contribs=False so that everything linked to
    the hidden biospecimens also gets hidden.

    Special performance note: a database connect url runs MUCH faster than a
    dataservice api host

    :param api_or_db_url: dataservice api host _or_ database connect url
        e.g. "https://kf-api-dataservice.kidsfirstdrc.org" or
        "postgres://<USERNAME>:<PASSWORD>@kf-dataservice-postgres-prd.kids-first.io:5432/kfpostgresprd"
    :param parent_endpoint: endpoint of the starting kfids being passed in
    :param parents: iterable of starting kfids or entities associated with the
        parent_endpoint
    :param ignore_gfs_with_hidden_external_contribs: whether to ignore
        genomic files (and their descendants) that contain information from
        hidden biospecimens unrelated to the given parents.
    :param kfids_only: only return KFIDs, not entire entities
    :returns: dict mapping endpoints to their sets of discovered kfids
    """
    use_api = api_or_db_url.startswith(("http:", "https:"))

    if use_api:
        parent_type = parent_endpoint
    else:
        endpoint_to_table = {
            "studies": "study",
            "participants": "participant",
            "family-relationships": "family_relationship",
            "outcomes": "outcome",
            "phenotypes": "phenotype",
            "diagnoses": "diagnosis",
            "biospecimens": "biospecimen",
            "families": "family",
            "biospecimen-genomic-files": "biospecimen_genomic_file",
            "biospecimen-diagnoses": "biospecimen_diagnosis",
            "genomic-files": "genomic_file",
            "read-group-genomic-files": "read_group_genomic_file",
            "sequencing-experiment-genomic-files":
            "sequencing_experiment_genomic_file",
            "read-groups": "read_group",
            "sequencing-experiments": "sequencing_experiment",
        }
        table_to_endpoint = {v: k for k, v in endpoint_to_table.items()}
        parent_type = endpoint_to_table[parent_endpoint]

    if use_api:
        descendancy = _api_descendancy
    else:
        descendancy = _db_descendancy
        db_conn = psycopg2.connect(api_or_db_url)
        db_cur = db_conn.cursor(cursor_factory=psycopg2.extras.DictCursor)

    if isinstance(parents, str):
        parents = [parents]

    if isinstance(next(iter(parents), None), dict):
        parent_kfids = set(p["kf_id"] for p in parents)
        descendants = {parent_type: {p["kf_id"]: p for p in parents}}
    else:
        parent_kfids = set(parents)
        if use_api:
            descendants = {
                parent_type: {
                    e["kf_id"]: e
                    for e in yield_entities(api_or_db_url, None, parent_kfids)
                }
            }
        else:
            query = f"select distinct * from {parent_type} where kf_id in %s"
            db_cur.execute(query, (tuple(parent_kfids | {None}), ))
            descendants = {
                parent_type: {p["kf_id"]: dict(p)
                              for p in db_cur.fetchall()}
            }

    done = set()
    for t in descendancy.keys():
        if t != parent_type:
            done.add(t)
        else:
            break

    def _inner(parent_type, parent_kfids, descendants):
        if parent_type in done:
            return
        done.add(parent_type)
        for (child_type, link_on_parent,
             link_on_child) in descendancy.get(parent_type, []):
            if use_api:
                with ThreadPoolExecutor() as tpex:
                    futures = [
                        tpex.submit(
                            _accumulate,
                            yield_entities,
                            api_or_db_url,
                            child_type,
                            {link_on_child: k},
                            show_progress=True,
                        ) for k in parent_kfids
                    ]
                    children = {
                        e["kf_id"]: e
                        for f in as_completed(futures) for e in f.result()
                    }
            else:
                # special case for getting to families from studies
                if parent_type == "study" and child_type == "family":
                    query = (
                        "select distinct family.* from family join participant"
                        " on participant.family_id = family.kf_id join study on"
                        " participant.study_id = study.kf_id where study.kf_id "
                        "in %s")
                else:
                    query = (
                        f"select distinct {child_type}.* from {child_type} join {parent_type}"
                        f" on {child_type}.{link_on_child} = {parent_type}.{link_on_parent}"
                        f" where {parent_type}.kf_id in %s")
                db_cur.execute(query, (tuple(parent_kfids | {None}), ))
                children = {c["kf_id"]: dict(c) for c in db_cur.fetchall()}

            if children:
                descendants[child_type] = descendants.get(child_type, dict())
                descendants[child_type].update(children)

            if (child_type == "genomic_file"
                    and ignore_gfs_with_hidden_external_contribs):
                # Ignore multi-specimen genomic files that have hidden
                # contributing specimens which are not in the descendants
                extra_contrib_gfs = find_gfs_with_extra_contributors(
                    api_or_db_url,
                    descendants["biospecimen"],
                    descendants["genomic_file"],
                )
                to_remove = (extra_contrib_gfs["hidden"]
                             | extra_contrib_gfs["mixed_visibility"])
                descendants["genomic_file"] = {
                    k: v
                    for k, v in descendants["genomic_file"].items()
                    if k not in to_remove
                }
        for (child_type, _, _) in descendancy.get(parent_type, []):
            if descendants.get(child_type):
                _inner(child_type, descendants[child_type].keys(), descendants)

    _inner(parent_type, parent_kfids, descendants)

    if not use_api:
        descendants = {table_to_endpoint[k]: v for k, v in descendants.items()}

    if kfids_only:
        for k, v in descendants.items():
            descendants[k] = set(descendants[k])

    return descendants
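A hedged usage sketch for the KFID-based entry point; the database connect url and family KFID are placeholders, and the database form is shown because of the performance note in the docstring:

# Hypothetical call over a direct database connection; the connection string
# and family KFID are placeholders.
descendants = find_descendants_by_kfids(
    "postgres://user:password@localhost:5432/kfpostgres",
    "families",
    ["FM_12345678"],
    ignore_gfs_with_hidden_external_contribs=False,  # per the docstring, use False before hiding
    kfids_only=True,
)
# Keys are endpoints, values are sets of KFIDs, e.g.
# descendants.get("participants") holds the family's participant KFIDs
print({endpoint: len(kfids) for endpoint, kfids in descendants.items()})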