Exemplo n.º 1
0
def create_fake_pedigree(
    n: int,
    sample_list: List[str],
    exclude_real_probands: bool = False,
    max_tries: int = 10,
    real_pedigree: Optional[hl.Pedigree] = None,
) -> hl.Pedigree:
    """
    Generate a pedigree made of trios created by sampling 3 random samples in the sample list.

    - If `real_pedigree` is given, a fake trio is rejected when its proband is a real proband
      and its two parents are exactly the real parents of that proband.
    - Each sample can be used only once as a proband in the resulting trios.
    - Sex of probands in fake trios is random.

    :param n: Number of fake trios desired in the pedigree
    :param sample_list: List of samples
    :param exclude_real_probands: If set, then fake trios probands cannot be in the real trios probands.
    :param max_tries: Maximum number of consecutive rejected samplings before bailing out (preventing infinite loop if `n` is too large w.r.t. the number of samples)
    :param real_pedigree: Optional pedigree to exclude children from
    :return: Fake pedigree
    :raises ValueError: Propagated from `random.sample` if `sample_list` holds fewer than 3 samples and a draw is attempted.
    """
    # Real trios indexed by proband ID, for O(1) lookup while sampling.
    real_trios = (
        {trio.s: trio for trio in real_pedigree.trios}
        if real_pedigree is not None
        else {}
    )

    # Test membership rather than comparing counts: the real pedigree may contain
    # probands that are not in `sample_list` (and vice versa), so equal sizes do
    # not imply that every sample is a real proband.
    if exclude_real_probands and all(s in real_trios for s in sample_list):
        logger.warning(
            "All samples are in the real probands list; cannot create any fake pedigrees with exclude_real_probands=True. Returning an empty Pedigree."
        )
        return hl.Pedigree([])

    fake_trios = {}
    tries = 0
    while len(fake_trios) < n and tries < max_tries:
        s, mat_id, pat_id = random.sample(sample_list, 3)

        real_trio = real_trios.get(s)
        clashes_with_real_trio = real_trio is not None and (
            exclude_real_probands
            or {mat_id, pat_id} == {real_trio.mat_id, real_trio.pat_id}
        )

        if clashes_with_real_trio or s in fake_trios:
            tries += 1
            continue

        # A successful draw resets the counter: `max_tries` bounds *consecutive* failures.
        tries = 0
        fake_trios[s] = hl.Trio(
            s=s,
            pat_id=pat_id,
            mat_id=mat_id,
            fam_id=f"fake_{len(fake_trios)}",
            is_female=bool(random.getrandbits(1)),
        )

    if tries == max_tries:
        logger.warning(
            "Only returning %d fake trios; random trio sampling stopped after reaching the maximum %d iterations",
            len(fake_trios),
            max_tries,
        )

    return hl.Pedigree(list(fake_trios.values()))
Exemplo n.º 2
0
def pandas_to_ped(ped_pd: pd.DataFrame):
    """
    Build a Hail Pedigree from a DataFrame holding one trio per row.

    The DataFrame is expected to expose the columns 'fam_id', 's', 'is_female',
    'pat_id' and 'mat_id'. The family ID is converted to a string.

    :param DataFrame ped_pd: Input DataFrame
    :return: Pedigree
    :rtype: Pedigree
    """
    trios = []
    for record in ped_pd.itertuples():
        trio = hl.Trio(
            s=record.s,
            is_female=record.is_female,
            pat_id=record.pat_id,
            mat_id=record.mat_id,
            fam_id=str(record.fam_id),
        )
        trios.append(trio)

    return hl.Pedigree(trios)
Exemplo n.º 3
0
def create_fake_pedigree(n: int,
                         sample_list: List[str],
                         real_pedigree: hl.Pedigree = None) -> hl.Pedigree:
    """

    Generates a "fake" pedigree made of trios created by sampling 3 random samples in the sample list.
    If `real_pedigree` is given, then children from the real pedigrees won't be used as probands.
    This function ensures that:
    - All probands are unique
    - All individuals in a trio are different

    Duplicate IDs in `sample_list` are ignored (each distinct sample is considered once).
    All fake probands are created with `is_female=True`.

    :param int n: Number of fake trios desired in the pedigree
    :param list of str sample_list: List of samples
    :param Pedigree real_pedigree: Optional pedigree to exclude children from
    :return: Fake pedigree
    :rtype: Pedigree
    :raises ValueError: If every sample is a proband in the real pedigree, if fewer than
        3 distinct samples are given while `n > 0`, or if `n` exceeds the number of samples
        that can still be used as probands.
    """
    # De-duplicate (keeping the caller's order) so that a repeated ID can neither be
    # drawn as both parents of a trio nor skew the "all samples are probands" check.
    samples = list(dict.fromkeys(sample_list))

    probands = set()
    if real_pedigree is not None:
        probands = {trio.s
                    for trio in real_pedigree.trios}.intersection(samples)
        if len(probands) == len(samples):
            raise ValueError(
                "Full sample list for fake trios generation needs to include samples that aren't probands in the real trios."
            )

    if n > 0 and len(samples) < 3:
        raise ValueError(
            "At least 3 distinct samples are needed to generate a fake trio.")

    # Samples that may still be used as a proband; shrinks as trios are created.
    available = [s for s in samples if s not in probands]

    fake_trios = []
    for i in range(n):
        if not available:
            # Previously this situation made the rejection-sampling loop spin forever.
            raise ValueError(
                f"Cannot generate {n} fake trios: no sample left to use as a proband after {i} trio(s)."
            )

        # Draw the parents first, then the proband among the remaining eligible samples.
        # If the drawn parents happen to cover every eligible proband, redraw the parents;
        # with >= 3 distinct samples and a non-empty `available`, a valid draw always exists.
        while True:
            mat_id, pat_id = random.sample(samples, 2)
            candidates = [s for s in available if s not in (mat_id, pat_id)]
            if candidates:
                break

        s = random.choice(candidates)
        available.remove(s)

        fake_trios.append(
            hl.Trio(s=s,
                    pat_id=pat_id,
                    mat_id=mat_id,
                    fam_id=str(i + 1),
                    is_female=True))

    return hl.Pedigree(fake_trios)
Exemplo n.º 4
0
    def get_trios(
        fam_id: str,
        parent_child_pairs: List[Tuple[str, str]],
        related_pairs: Dict[Tuple[str, str], str],
    ) -> List[hl.Trio]:
        """
        Generates trios based from the list of parent-child pairs in the family and
        all related pairs in the data. Only complete parent/offspring trios are included in the results.

        The trios are assembled as follows:
        1. All pairs of unrelated samples with different sexes within the family are extracted as possible parent pairs
        2. For each possible parent pair, a list of all children is constructed (each child in the list has a parent-offspring pair with each parent)
        3. If there are multiple children for a given parent pair, all children should be siblings with each other
        4. Check that each child was only assigned a single pair of parents. If a child is found to have multiple parent pairs, they are ALL discarded.

        Relies on `sex`, `SIBLINGS` and `logger` from the enclosing scope. `sex` is read with
        `.get()` and compared to `True` / `False` (the final trios use female as mother,
        male as father).

        :param fam_id: The family ID
        :param parent_child_pairs: The parent-child pairs for this family
        :param related_pairs: All related sample pairs in the data, keyed by the sorted pair of sample IDs
        :return: List of trios in the family
        """
        def get_possible_parents(samples: List[str]) -> List[Tuple[str, str]]:
            """
            1. All pairs of unrelated samples with different sexes within the family are extracted as possible parent pairs

            :param samples: All samples in the family
            :return: Possible parent pairs, each in (father, mother) order
            """
            possible_parents = []
            for i, s1 in enumerate(samples):
                for s2 in samples[i + 1:]:
                    # Any recorded relationship between the two rules them out as a couple.
                    if related_pairs.get(tuple(sorted([s1, s2]))) is not None:
                        continue
                    # Samples with unknown sex (None / missing) never match either branch.
                    if sex.get(s1) is False and sex.get(s2) is True:
                        possible_parents.append((s1, s2))
                    elif sex.get(s1) is True and sex.get(s2) is False:
                        possible_parents.append((s2, s1))
            return possible_parents

        def get_children(possible_parents: Tuple[str, str]) -> List[str]:
            """
            2. For a given possible parent pair, a list of all children is constructed (each child in the list has a parent-offspring pair with each parent)

            :param possible_parents: A pair of possible parents
            :return: The list of all children (if any) corresponding to the possible parents
            """
            # sample -> set of parents (among `possible_parents`) it forms a parent-child pair with
            possible_offsprings = defaultdict(set)
            for pair in parent_child_pairs:
                # Only the first matching parent/position is recorded for a given pair.
                for parent in possible_parents:
                    if parent == pair[0]:
                        possible_offsprings[pair[1]].add(parent)
                        break
                    if parent == pair[1]:
                        possible_offsprings[pair[0]].add(parent)
                        break

            return [
                s for s, parents in possible_offsprings.items()
                if len(parents) == 2
            ]

        def check_sibs(children: List[str]) -> bool:
            """
            3. If there are multiple children for a given parent pair, all children should be siblings with each other

            :param children: List of all children for a given parent pair
            :return: Whether all children in the list are siblings
            """
            for i, c1 in enumerate(children):
                for c2 in children[i + 1:]:
                    # `.get`: two children with no recorded relationship are simply
                    # "not siblings"; a plain lookup would raise KeyError here.
                    if related_pairs.get(tuple(sorted([c1, c2]))) != SIBLINGS:
                        return False
            return True

        def discard_multi_parents_children(trios: List[hl.Trio]):
            """
            4. Check that each child was only assigned a single pair of parents. If a child is found to have multiple parent pairs, they are ALL discarded.

            :param trios: All trios formed for this family
            :return: The list of trios for which each child has a single parents pair.
            """
            children_trios = defaultdict(list)
            for trio in trios:
                children_trios[trio.s].append(trio)

            for s, s_trios in children_trios.items():
                if len(s_trios) > 1:
                    logger.warning(
                        "Discarded duplicated child {0} found multiple in trios: {1}"
                        .format(s, ", ".join([str(trio) for trio in s_trios])))

            return [
                s_trios[0] for s_trios in children_trios.values()
                if len(s_trios) == 1
            ]

        # Get all possible pairs of parents in (father, mother) order
        all_possible_parents = get_possible_parents(
            list({s
                  for pair in parent_child_pairs for s in pair}))

        trios = []
        for possible_parents in all_possible_parents:
            father, mother = possible_parents
            children = get_children(possible_parents)
            if check_sibs(children):
                trios.extend([
                    hl.Trio(
                        s=s,
                        fam_id=fam_id,
                        pat_id=father,
                        mat_id=mother,
                        is_female=sex.get(s),
                    ) for s in children
                ])
            else:
                # The pair is (father, mother): pass the mother first to match the labels.
                logger.warning(
                    "Discarded family with same parents, and multiple offspring that weren't siblings:"
                    "\nMother: {}\nFather:{}\nChildren:{}".format(
                        mother, father, ", ".join(children)))

        return discard_multi_parents_children(trios)
Exemplo n.º 5
0
def infer_families(
        kin_ht: hl.Table,  # the kinship hail table
        sex: Dict[str, bool],  # the dictionary of sexes
        i_col: str = 'i',  # the rest of these are defaults that can be set to something else if needed
        j_col: str = 'j',
        pi_hat_col: str = 'pi_hat',
        ibd2_col: str = 'ibd2',
        ibd1_col: str = 'ibd1',
        ibd0_col: str = 'ibd0',
        first_degree_threshold: Tuple[float, float] = (0.4, 0.75),
        second_degree_threshold: Tuple[float, float] = (0.195, 0.3),
        ibd1_second_degree_threshold: float = 0.40,
        ibd2_parent_offspring_threshold: float = 0.30,
        ibd1_parent_offspring_threshold: float = 0.70,
        ibd0_parent_offspring_threshold: float = 0.15
) -> Tuple[hl.Pedigree, List[List[str]], Dict[str, Union[str, Tuple[str, str],
                                                         None]]]:
    """
    Infers familial relationships from pairwise relatedness estimates (pi_hat, ibd0, ibd1, ibd2) and sex information.

    Family ID can be the same for multiple trios if one or more members of the trios are related
    (e.g. sibs, multi-generational family). Trios are ordered by family ID.
    Note that this function only returns complete trios defined as:
    one child, one father and one mother (sex is required for both parents)

    :param Table kin_ht: Table of pairwise relatedness estimates
    :param dict of str -> bool sex: A dict containing the sex for each sample. True = female, False = male, None = unknown
    :param str i_col: Column containing the 1st sample id in the ibd table
    :param str j_col: Column containing the 2nd sample id in the ibd table
    :param str pi_hat_col: Column containing the pi_hat in the ibd table
    :param str ibd2_col: Column containing ibd2 in the ibd table
    :param str ibd1_col: Column containing ibd1 in the ibd table
    :param str ibd0_col: Column containing ibd0 in the ibd table
    :param (float, float) first_degree_threshold: Inclusive lower/upper bounds on pi_hat for 1st degree relatives
    :param (float, float) second_degree_threshold: Lower (inclusive) / upper (exclusive) bounds on pi_hat for 2nd degree relatives
    :param float ibd1_second_degree_threshold: Lower bound (inclusive) on ibd1 for 2nd degree relatives
    :param float ibd2_parent_offspring_threshold: Upper bound (inclusive) on ibd2 for a parent/offspring
    :param float ibd1_parent_offspring_threshold: Lower bound (inclusive) on ibd1 for a parent/offspring
    :param float ibd0_parent_offspring_threshold: Upper bound (inclusive) on ibd0 for a parent/offspring
    :return: Tuple of (pedigree containing all inferred trios,
        list of sorted [sample, relative] pairs for samples with exactly one parent/offspring-like relative,
        dict mapping each processed sample to that single relative, to its (father, mother) tuple, or to None)
    :rtype: (Pedigree, list of list of str, dict)
    """
    def get_fam_samples(
        sample: str,
        fam: Set[str],
        samples_rel: Dict[str, Set[str]],
    ) -> Set[str]:
        """
        Given a sample, its known family and a dict that links samples with their relatives, outputs the set of
        samples that constitute this sample family.
        :param str sample: sample
        :param set of str fam: sample known family (mutated in place and returned)
        :param dict of str -> set of str samples_rel: dict(sample -> set(sample_relatives))
        :return: Family including the sample
        :rtype: set of str
        """
        fam.add(sample)
        for s2 in samples_rel[sample]:  # iterate through the sample's relatives
            if s2 not in fam:
                # Recurse to pull in relatives-of-relatives not directly linked to `sample`.
                fam = get_fam_samples(s2, fam, samples_rel)
        return fam

    def get_indexed_ibd(
        pc_relate_rows: List[hl.Struct]
    ) -> Tuple[Dict[Tuple[str, str], float], Dict[Tuple[str, str], float],
               Dict[Tuple[str, str], float]]:
        """
        Given rows from the relatedness table, creates dicts with:
        keys: Pairs of individuals, lexically ordered
        values: ibd2, ibd1, ibd0
        :param list of hl.Struct pc_relate_rows: Rows from the relatedness table
        :return: Three dicts (ibd2, ibd1, ibd0) of lexically ordered pairs of individuals -> value
        :rtype: tuple of three dicts of (str, str) -> float
        """
        ibd2 = dict()
        ibd1 = dict()
        ibd0 = dict()
        for row in pc_relate_rows:
            pair = tuple(sorted((row[i_col], row[j_col])))
            ibd2[pair] = row[ibd2_col]
            ibd1[pair] = row[ibd1_col]
            ibd0[pair] = row[ibd0_col]

        return ibd2, ibd1, ibd0

    def get_parents(possible_parents: List[str],
                    relative_pairs: Set[Tuple[str, str]],
                    sex: Dict[str, bool]) -> Union[Tuple[str, str], None]:
        """
        Given a list of possible parents for a sample,
        looks for a single pair of samples that are unrelated with different sexes.
        If a single pair is found, return the pair (father, mother)
        :param list of str possible_parents: Possible parents (consumed: entries are popped from this list)
        :param set of (str, str) relative_pairs: Sorted pairs of relatives, used to check that parents aren't related with each other
        :param dict of str -> bool sex: Dict mapping samples to their sex (True = female, False = male, None or missing = unknown)
        :return: (father, mother) if found, `None` otherwise
        :rtype: (str, str) or None
        """

        parents = []
        logging.info(f"You have {len(possible_parents)} possible parent(s)")
        while len(possible_parents) > 1:
            # Take one candidate and pair it with every candidate still in the list.
            p1 = possible_parents.pop()

            for p2 in possible_parents:
                pair = tuple(sorted((p1, p2)))
                logging.info(str(pair) + '\n')

                if pair not in relative_pairs:
                    logging.info(
                        "your potential parent's don't appear to be relatives\n"
                    )
                    logging.info("SEX p1: " + str(sex.get(p1)) + '\n')
                    logging.info("SEX p2: " + str(sex.get(p2)) + '\n')

                    if sex.get(p1) is False and sex.get(p2):
                        parents.append((p1, p2))
                        logging.info("found in order 1\n")
                    elif sex.get(p1) and sex.get(p2) is False:
                        parents.append((p2, p1))
                        logging.info("found in order 2\n")
                else:
                    logging.info("Your Parents are Related!!!\n\n")

        # Only an unambiguous answer (exactly one valid couple) is accepted.
        if len(parents) == 1:
            logging.info("Found your parents!\n")
            return parents[0]

        return None

    # Duplicated samples to remove (If not provided, this function won't work as it assumes that each child has exactly two parents)
    duplicated_samples = set()
    try:
        dups = hl.literal(duplicated_samples)
    except Exception:
        # Fall back to an explicitly typed empty collection when the literal cannot be built
        # (presumably because the type of an empty set cannot be inferred).
        dups = hl.empty_array(hl.tstr)

    first_degree_pairs = kin_ht.filter(
        (kin_ht[pi_hat_col] >= first_degree_threshold[0])
        & (kin_ht[pi_hat_col] <= first_degree_threshold[1])
        & ~dups.contains(kin_ht[i_col]) &
        ~dups.contains(kin_ht[j_col])  # so not including any duplicate samples
    ).collect()

    # sample -> set of samples it is first-degree related to
    first_degree_relatives = defaultdict(set)
    for row in first_degree_pairs:
        first_degree_relatives[row[i_col]].add(row[j_col])
        first_degree_relatives[row[j_col]].add(row[i_col])

    # Add second degree relatives for those samples
    # This is needed to distinguish grandparent - child - parent from child - mother, father down the line
    second_degree_samples = kin_ht.filter((
        (kin_ht[pi_hat_col] >= first_degree_threshold[0])
        & (kin_ht[pi_hat_col] <= first_degree_threshold[1])) | (
            (kin_ht[pi_hat_col] >= second_degree_threshold[0])
            & (kin_ht[ibd1_col] >= ibd1_second_degree_threshold)
            & (kin_ht[pi_hat_col] < second_degree_threshold[1]))).collect()

    ibd2, ibd1, ibd0 = get_indexed_ibd(second_degree_samples)

    # Built once: membership tests in `get_parents` are O(1) on a set.
    related_pair_keys = set(ibd2)

    fam_id = 1
    trios = []
    duos = []
    decisions = {}
    while len(first_degree_relatives) > 0:
        # Each pass consumes one whole connected family from `first_degree_relatives`.
        s_fam = get_fam_samples(
            list(first_degree_relatives)[0], set(), first_degree_relatives)
        for s in s_fam:
            logging.info(f"Processing sample: {s}")
            s_rel = first_degree_relatives.pop(s)
            possible_parents = []
            for rel in s_rel:
                pair = tuple(sorted((s, rel)))
                # Parent/offspring-like pair: low ibd2, high ibd1, low ibd0.
                if (ibd2[pair] <= ibd2_parent_offspring_threshold
                        and ibd1[pair] >= ibd1_parent_offspring_threshold
                        and ibd0[pair] <= ibd0_parent_offspring_threshold):
                    possible_parents.append(rel)

            # these will be the proband-offspring only pairs
            if len(possible_parents) == 1:
                duos.append(sorted((s, possible_parents[0])))
                decisions[s] = possible_parents[0]
            else:
                parents = get_parents(possible_parents, related_pair_keys, sex)

                decisions[s] = parents

                if parents is not None:
                    trios.append(
                        hl.Trio(s=s,
                                fam_id=str(fam_id),
                                pat_id=parents[0],
                                mat_id=parents[1],
                                is_female=sex.get(s)))

        fam_id += 1

    return hl.Pedigree(trios), duos, decisions
Exemplo n.º 6
0
def infer_families(
        kin_ht: hl.Table,
        sex: Dict[str, bool],
        duplicated_samples: Set[str],
        i_col: str = 'i',
        j_col: str = 'j',
        kin_col: str = 'kin',
        ibd2_col: str = 'ibd2',
        first_degree_threshold: Tuple[float, float] = (0.2, 0.4),
        second_degree_threshold: Tuple[float, float] = (0.05, 0.16),
        ibd2_parent_offspring_threshold: float = 0.2) -> hl.Pedigree:
    """

    Infers familial relationships from the results of pc_relate and sex information.
    Note that both kinship and ibd2 are needed in the pc_relate output.

    This function returns a pedigree containing trios inferred from the data. Family ID can be the same for multiple
    trios if one or more members of the trios are related (e.g. sibs, multi-generational family). Trios are ordered by family ID.

    Note that this function only returns complete trios defined as:
    one child, one father and one mother (sex is required for both parents)

    :param Table kin_ht: pc_relate output table
    :param dict of str -> bool sex: A dict containing the sex for each sample. True = female, False = male, None = unknown
    :param set of str duplicated_samples: Duplicated samples to remove (If not provided, this function won't work as it assumes that each child has exactly two parents). May be empty.
    :param str i_col: Column containing the 1st sample id in the pc_relate table
    :param str j_col: Column containing the 2nd sample id in the pc_relate table
    :param str kin_col: Column containing the kinship in the pc_relate table
    :param str ibd2_col: Column containing ibd2 in the pc_relate table
    :param (float, float) first_degree_threshold: Lower/upper bounds (both exclusive) for kin for 1st degree relatives
    :param (float, float) second_degree_threshold: Lower/upper bounds for kin for 2nd degree relatives (only the lower bound is used, exclusive)
    :param float ibd2_parent_offspring_threshold: Upper bound (exclusive) on ibd2 for a parent/offspring
    :return: Pedigree containing all trios in the data
    :rtype: Pedigree
    """
    def get_fam_samples(
        sample: str,
        fam: Set[str],
        samples_rel: Dict[str, Set[str]],
    ) -> Set[str]:
        """
        Given a sample, its known family and a dict that links samples with their relatives, outputs the set of
        samples that constitute this sample family.

        :param str sample: sample
        :param set of str fam: sample known family (mutated in place and returned)
        :param dict of str -> set of str samples_rel: dict(sample -> set(sample_relatives))
        :return: Family including the sample
        :rtype: set of str
        """
        fam.add(sample)
        for s2 in samples_rel[sample]:
            if s2 not in fam:
                fam = get_fam_samples(s2, fam, samples_rel)
        return fam

    def get_indexed_ibd2(
            pc_relate_rows: List[hl.Struct]) -> Dict[Tuple[str, str], float]:
        """
        Given rows from a pc_relate table, creates a dict with:
        keys: Pairs of individuals, lexically ordered
        values: ibd2

        :param list of hl.Struct pc_relate_rows: Rows from a pc_relate table
        :return: Dict of lexically ordered pairs of individuals -> ibd2
        :rtype: dict of (str, str) -> float
        """
        return {
            tuple(sorted((row[i_col], row[j_col]))): row[ibd2_col]
            for row in pc_relate_rows
        }

    def get_parents(possible_parents: List[str],
                    indexed_kinship: Dict[Tuple[str, str], Tuple[float,
                                                                 float]],
                    sex: Dict[str, bool]) -> Tuple[str, str]:
        """
        Given a list of possible parents for a sample (first degree relatives with low ibd2),
        looks for a single pair of samples that are unrelated with different sexes.
        If a single pair is found, return the pair (father, mother)

        :param list of str possible_parents: Possible parents (consumed: entries are popped from this list)
        :param dict of (str, str) -> value indexed_kinship: Dict keyed by sorted pairs of related individuals; only key membership is used, to check that the two parents are not related to each other
        :param dict of str -> bool sex: Dict mapping samples to their sex (True = female, False = male, None or missing = unknown)
        :return: (father, mother) if exactly one valid pair is found, `None` otherwise
        :rtype: (str, str) or None
        """

        parents = []
        while len(possible_parents) > 1:
            # Take one candidate and pair it with every candidate still in the list.
            p1 = possible_parents.pop()
            for p2 in possible_parents:
                if tuple(sorted((p1, p2))) not in indexed_kinship:
                    if sex.get(p1) is False and sex.get(p2):
                        parents.append((p1, p2))
                    elif sex.get(p1) and sex.get(p2) is False:
                        parents.append((p2, p1))

        # Only an unambiguous answer (exactly one valid couple) is accepted.
        if len(parents) == 1:
            return parents[0]

        return None

    # Get first degree relatives - exclude duplicate samples
    # `hl.literal` cannot infer the element type of an empty set, so build a typed
    # empty set explicitly when there are no duplicates to exclude.
    dups = (hl.literal(duplicated_samples)
            if duplicated_samples else hl.empty_set(hl.tstr))
    first_degree_pairs = kin_ht.filter(
        (kin_ht[kin_col] > first_degree_threshold[0])
        & (kin_ht[kin_col] < first_degree_threshold[1])
        & ~dups.contains(kin_ht[i_col])
        & ~dups.contains(kin_ht[j_col])).collect()
    first_degree_relatives = defaultdict(set)
    for row in first_degree_pairs:
        first_degree_relatives[row[i_col]].add(row[j_col])
        first_degree_relatives[row[j_col]].add(row[i_col])

    # Without first-degree pairs there is no trio to infer; returning here also avoids
    # building a literal from an empty set below.
    if not first_degree_relatives:
        return hl.Pedigree([])

    # Add second degree relatives for those samples
    # This is needed to distinguish grandparent - child - parent from child - mother, father down the line
    first_degree_samples = hl.literal(set(first_degree_relatives.keys()))
    second_degree_samples = kin_ht.filter(
        (first_degree_samples.contains(kin_ht[i_col])
         | first_degree_samples.contains(kin_ht[j_col]))
        & (kin_ht[kin_col] > second_degree_threshold[0])
        & (kin_ht[kin_col] < first_degree_threshold[1])).collect()

    ibd2 = get_indexed_ibd2(second_degree_samples)

    fam_id = 1
    trios = []
    while len(first_degree_relatives) > 0:
        # Each pass consumes one whole connected family from `first_degree_relatives`.
        s_fam = get_fam_samples(
            list(first_degree_relatives)[0], set(), first_degree_relatives)
        for s in s_fam:
            s_rel = first_degree_relatives.pop(s)
            # First-degree relatives with low ibd2 are parent/offspring candidates
            # (as opposed to siblings).
            possible_parents = [
                rel for rel in s_rel
                if ibd2[tuple(sorted((s, rel)))] < ibd2_parent_offspring_threshold
            ]

            parents = get_parents(possible_parents, ibd2, sex)

            if parents is not None:
                trios.append(
                    hl.Trio(s=s,
                            fam_id=str(fam_id),
                            pat_id=parents[0],
                            mat_id=parents[1],
                            is_female=sex.get(s)))

        fam_id += 1

    return hl.Pedigree(trios)