def create_fake_pedigree(
    n: int,
    sample_list: List[str],
    exclude_real_probands: bool = False,
    max_tries: int = 10,
    real_pedigree: Optional[hl.Pedigree] = None,
) -> hl.Pedigree:
    """
    Generate a pedigree made of trios created by sampling 3 random samples in the sample list.

    - If `real_pedigree` is given, a sample that is a proband in the real pedigree is never
      combined with its two real parents in a fake trio.
    - Each sample can be used only once as a proband in the resulting trios.
    - The three members of a fake trio are always distinct samples (duplicated IDs in
      `sample_list` are ignored).
    - Sex of probands in fake trios is random.

    :param n: Number of fake trios desired in the pedigree
    :param sample_list: List of samples
    :param exclude_real_probands: If set, then fake trios probands cannot be in the real trios probands.
    :param max_tries: Maximum number of consecutive rejected samplings before bailing out
        (preventing infinite loop if `n` is too large w.r.t. the number of samples)
    :param real_pedigree: Optional pedigree to exclude children from
    :return: Fake pedigree (possibly with fewer than `n` trios, in which case a warning is logged)
    :raises ValueError: If trios have to be sampled but fewer than 3 distinct samples are available
    """
    real_trios = (
        {trio.s: trio for trio in real_pedigree.trios}
        if real_pedigree is not None
        else {}
    )

    # De-duplicate while keeping the input order: `random.sample` draws by position, so a
    # repeated ID could otherwise end up as both proband and parent of the same trio.
    samples = list(dict.fromkeys(sample_list))

    # Compare membership, not counts: the sample list and the real probands may have the
    # same size without being the same samples (and vice versa).
    if exclude_real_probands and set(samples).issubset(real_trios):
        logger.warning(
            "All samples are in the real probands list; cannot create any fake pedigrees with exclude_real_probands=True. Returning an empty Pedigree."
        )
        return hl.Pedigree([])

    if n > 0 and max_tries > 0 and len(samples) < 3:
        raise ValueError(
            f"At least 3 distinct samples are needed to create a fake trio, got {len(samples)}."
        )

    fake_trios = {}
    tries = 0
    while len(fake_trios) < n and tries < max_tries:
        s, mat_id, pat_id = random.sample(samples, 3)
        real_trio = real_trios.get(s)
        clashes_with_real = real_trio is not None and (
            exclude_real_probands
            or {mat_id, pat_id} == {real_trio.mat_id, real_trio.pat_id}
        )
        if clashes_with_real or s in fake_trios:
            tries += 1
            continue

        # A successful draw resets the counter: `max_tries` bounds consecutive failures.
        tries = 0
        fake_trios[s] = hl.Trio(
            s=s,
            pat_id=pat_id,
            mat_id=mat_id,
            fam_id=f"fake_{len(fake_trios)}",
            is_female=bool(random.getrandbits(1)),
        )

    if len(fake_trios) < n:
        logger.warning(
            "Only returning %d fake trios; random trio sampling stopped after reaching the maximum %d iterations",
            len(fake_trios),
            max_tries,
        )

    return hl.Pedigree(list(fake_trios.values()))
def pandas_to_ped(ped_pd: pd.DataFrame):
    """
    Build a Hail Pedigree from a DataFrame holding one trio per row.

    The DataFrame must provide the columns 'fam_id', 's', 'is_female', 'pat_id' and 'mat_id'.
    The family ID is converted to a string; all other values are passed through unchanged.

    :param DataFrame ped_pd: Input DataFrame
    :return: Pedigree
    :rtype: Pedigree
    """
    trios = []
    for record in ped_pd.itertuples():
        trio = hl.Trio(
            s=record.s,
            is_female=record.is_female,
            pat_id=record.pat_id,
            mat_id=record.mat_id,
            fam_id=str(record.fam_id),
        )
        trios.append(trio)
    return hl.Pedigree(trios)
def create_fake_pedigree(n: int,
                         sample_list: List[str],
                         real_pedigree: hl.Pedigree = None) -> hl.Pedigree:
    """
    Generates a "fake" pedigree made of trios created by sampling 3 random samples in the sample list.
    If `real_pedigree` is given, then children from the real pedigrees won't be used as probands.

    This function ensures that:
    - All probands are unique
    - All individuals in a trio are different (duplicated IDs in `sample_list` are ignored)

    Family IDs are "1" to "n" and every fake proband is created with `is_female=True`.

    :param int n: Number of fake trios desired in the pedigree
    :param list of str sample_list: List of samples
    :param Pedigree real_pedigree: Optional pedigree to exclude children from
    :return: Fake pedigree
    :rtype: Pedigree
    :raises ValueError: If every sample is a proband in the real pedigree, if `n` is larger than the
        number of samples that can still be used as probands, or if fewer than 3 distinct samples
        are available
    """
    # De-duplicate while keeping the input order so that positional sampling cannot return the
    # same individual twice.
    samples = list(dict.fromkeys(sample_list))

    probands = set()
    if real_pedigree is not None:
        probands = {trio.s for trio in real_pedigree.trios}.intersection(samples)
        if len(probands) == len(samples):
            raise ValueError(
                "Full sample list for fake trios generation needs to include samples that aren't probands in the real trios."
            )

    # Samples that may still be used as a proband. Checking the request against this pool up
    # front replaces the rejection loop that could never terminate when `n` was too large.
    available = set(samples) - probands
    if n > len(available):
        raise ValueError(
            f"Cannot create {n} fake trios: only {len(available)} samples can be used as probands."
        )
    if n > 0 and len(samples) < 3:
        raise ValueError(
            "At least 3 distinct samples are needed to create a fake trio."
        )

    fake_trios = []
    for i in range(n):
        mat_id, pat_id = random.sample(samples, 2)
        # Redraw the parents if they are the only samples left that could be a proband.
        while available <= {mat_id, pat_id}:
            mat_id, pat_id = random.sample(samples, 2)

        # At least one eligible proband exists here, so this rejection loop terminates.
        s = random.choice(samples)
        while s not in available or s in (mat_id, pat_id):
            s = random.choice(samples)
        available.discard(s)

        fake_trios.append(
            hl.Trio(s=s,
                    pat_id=pat_id,
                    mat_id=mat_id,
                    fam_id=str(i + 1),
                    is_female=True))

    return hl.Pedigree(fake_trios)
def get_trios(
    fam_id: str,
    parent_child_pairs: List[Tuple[str, str]],
    related_pairs: Dict[Tuple[str, str], str],
) -> List[hl.Trio]:
    """
    Generates trios from the list of parent-child pairs in the family and all related pairs in the data.
    Only complete parent/offspring trios are included in the results.

    The trios are assembled as follows:
    1. All pairs of unrelated samples with different sexes within the family are extracted as possible parent pairs
    2. For each possible parent pair, a list of all children is constructed (each child in the list has a parent-offspring pair with each parent)
    3. If there are multiple children for a given parent pair, all children should be siblings with each other
    4. Check that each child was only assigned a single pair of parents. If a child is found to have multiple parent pairs, they are ALL discarded.

    NOTE(review): `sex` and `SIBLINGS` are not parameters of this function; they are resolved
    from an enclosing or module-level scope that is not visible here. `sex` is used as a mapping
    sample -> bool where `False` marks a possible father and `True` a possible mother.

    :param fam_id: The family ID
    :param parent_child_pairs: The parent-child pairs for this family
    :param related_pairs: All related sample pairs in the data, keyed by sorted sample pair
    :return: List of trios in the family
    """

    def get_possible_parents(samples: List[str]) -> List[Tuple[str, str]]:
        """
        1. All pairs of unrelated samples with different sexes within the family are extracted as possible parent pairs

        :param samples: All samples in the family
        :return: Possible parent pairs, each in (father, mother) order
        """
        possible_parents = []
        for i, first in enumerate(samples):
            for second in samples[i + 1:]:
                # Only samples without any recorded relationship can be a parent pair
                if related_pairs.get(tuple(sorted([first, second]))) is not None:
                    continue
                if sex.get(first) is False and sex.get(second) is True:
                    possible_parents.append((first, second))
                elif sex.get(first) is True and sex.get(second) is False:
                    possible_parents.append((second, first))
        return possible_parents

    def get_children(possible_parents: Tuple[str, str]) -> List[str]:
        """
        2. For a given possible parent pair, a list of all children is constructed (each child in the list has a parent-offspring pair with each parent)

        :param possible_parents: A pair of possible parents
        :return: The list of all children (if any) corresponding to the possible parents
        """
        # sample -> set of parents in `possible_parents` for which (sample, parent) is found in `parent_child_pairs`
        possible_offsprings = defaultdict(set)
        for pair in parent_child_pairs:
            if possible_parents[0] == pair[0]:
                possible_offsprings[pair[1]].add(possible_parents[0])
            elif possible_parents[0] == pair[1]:
                possible_offsprings[pair[0]].add(possible_parents[0])
            elif possible_parents[1] == pair[0]:
                possible_offsprings[pair[1]].add(possible_parents[1])
            elif possible_parents[1] == pair[1]:
                possible_offsprings[pair[0]].add(possible_parents[1])

        return [
            s for s, parents in possible_offsprings.items()
            if len(parents) == 2
        ]

    def check_sibs(children: List[str]) -> bool:
        """
        3. If there are multiple children for a given parent pair, all children should be siblings with each other

        :param children: List of all children for a given parent pair
        :return: Whether all children in the list are siblings
        """
        for i, first in enumerate(children):
            for second in children[i + 1:]:
                # `.get`: two children without any recorded relationship are not siblings;
                # a plain lookup would raise KeyError instead of rejecting the family.
                if related_pairs.get(tuple(sorted([first, second]))) != SIBLINGS:
                    return False
        return True

    def discard_multi_parents_children(trios: List[hl.Trio]):
        """
        4. Check that each child was only assigned a single pair of parents. If a child is found to have multiple parent pairs, they are ALL discarded.

        :param trios: All trios formed for this family
        :return: The list of trios for which each child has a single parents pair.
        """
        children_trios = defaultdict(list)
        for trio in trios:
            children_trios[trio.s].append(trio)

        for s, s_trios in children_trios.items():
            if len(s_trios) > 1:
                logger.warning(
                    "Discarded duplicated child {0} found multiple in trios: {1}"
                    .format(s, ", ".join([str(trio) for trio in s_trios])))

        return [
            s_trios[0] for s_trios in children_trios.values()
            if len(s_trios) == 1
        ]

    # Get all possible pairs of parents in (father, mother) order
    all_possible_parents = get_possible_parents(
        list({s for pair in parent_child_pairs for s in pair}))

    trios = []
    for possible_parents in all_possible_parents:
        children = get_children(possible_parents)
        if check_sibs(children):
            trios.extend([
                hl.Trio(
                    s=s,
                    fam_id=fam_id,
                    pat_id=possible_parents[0],
                    mat_id=possible_parents[1],
                    is_female=sex.get(s),
                ) for s in children
            ])
        else:
            # `possible_parents` is (father, mother): index 1 is the mother, index 0 the father.
            logger.warning(
                "Discarded family with same parents, and multiple offspring that weren't siblings:"
                "\nMother: {}\nFather:{}\nChildren:{}".format(
                    possible_parents[1], possible_parents[0],
                    ", ".join(children)))

    return discard_multi_parents_children(trios)
def infer_families(
        kin_ht: hl.Table,
        sex: Dict[str, bool],
        i_col: str = 'i',
        j_col: str = 'j',
        pi_hat_col: str = 'pi_hat',
        ibd2_col: str = 'ibd2',
        ibd1_col: str = 'ibd1',
        ibd0_col: str = 'ibd0',
        first_degree_threshold: Tuple[float, float] = (0.4, 0.75),
        second_degree_threshold: Tuple[float, float] = (0.195, 0.3),
        ibd1_second_degree_threshold: float = 0.40,
        ibd2_parent_offspring_threshold: float = 0.30,
        ibd1_parent_offspring_threshold: float = 0.70,
        ibd0_parent_offspring_threshold: float = 0.15
) -> Tuple[hl.Pedigree, List[List[str]], Dict[str, Union[str, Tuple[str, str], None]]]:
    """
    Infers familial relationships from pairwise relatedness estimates (pi_hat, ibd0, ibd1, ibd2) and sex information.

    Samples are grouped into families through their first degree relationships. For each sample,
    the first degree relatives that look like a parent (low ibd2, high ibd1, low ibd0) are collected:
    - exactly one such relative: the pair is reported as a duo
    - otherwise: a trio is reported if exactly one pair of unrelated candidates with different
      sexes (one male, one female) is found

    Family ID can be the same for multiple trios if one or more members of the trios are related
    (e.g. sibs, multi-generational family). Trios are ordered by family ID.

    :param Table kin_ht: Table with one row per pair of samples and the relatedness columns named below
    :param dict of str -> bool sex: A dict containing the sex for each sample. True = female, False = male, None = unknown
    :param str i_col: Column containing the 1st sample id in the table
    :param str j_col: Column containing the 2nd sample id in the table
    :param str pi_hat_col: Column containing pi_hat in the table
    :param str ibd2_col: Column containing ibd2 in the table
    :param str ibd1_col: Column containing ibd1 in the table
    :param str ibd0_col: Column containing ibd0 in the table
    :param (float, float) first_degree_threshold: Inclusive lower/upper bounds on pi_hat for 1st degree relatives
    :param (float, float) second_degree_threshold: Lower (inclusive) / upper (exclusive) bounds on pi_hat for 2nd degree relatives
    :param float ibd1_second_degree_threshold: Lower bound on ibd1 for 2nd degree relatives
    :param float ibd2_parent_offspring_threshold: Upper bound on ibd2 for a parent/offspring
    :param float ibd1_parent_offspring_threshold: Lower bound on ibd1 for a parent/offspring
    :param float ibd0_parent_offspring_threshold: Upper bound on ibd0 for a parent/offspring
    :return: Tuple of (pedigree containing all inferred trios, list of duos as sorted sample pairs,
        dict sample -> inferred single parent, (father, mother) tuple, or None)
    :rtype: (Pedigree, list of list of str, dict)
    """

    def get_fam_samples(
            sample: str,
            fam: Set[str],
            samples_rel: Dict[str, Set[str]],
    ) -> Set[str]:
        """
        Given a sample, its known family and a dict that links samples with their relatives, outputs the set of
        samples that constitute this sample family.

        :param str sample: sample
        :param set of str fam: sample known family (updated in place)
        :param dict of str -> set of str samples_rel: dict(sample -> set(sample_relatives))
        :return: Family including the sample
        :rtype: set of str
        """
        fam.add(sample)
        for s2 in samples_rel[sample]:
            # Relatives of relatives belong to the same family even if they are not
            # directly related to `sample`.
            if s2 not in fam:
                fam = get_fam_samples(s2, fam, samples_rel)
        return fam

    def get_indexed_ibd(
            pc_relate_rows: List[hl.Struct]
    ) -> Tuple[Dict[Tuple[str, str], float], Dict[Tuple[str, str], float], Dict[Tuple[str, str], float]]:
        """
        Given rows from the relatedness table, creates dicts with:
        keys: Pairs of individuals, lexically ordered
        values: ibd2, ibd1, ibd0

        :param list of hl.Struct pc_relate_rows: Rows from the relatedness table
        :return: Three dicts (ibd2, ibd1, ibd0) of lexically ordered pairs of individuals -> value
        :rtype: (dict, dict, dict)
        """
        ibd2 = dict()
        ibd1 = dict()
        ibd0 = dict()
        for row in pc_relate_rows:
            pair = tuple(sorted((row[i_col], row[j_col])))
            ibd2[pair] = row[ibd2_col]
            ibd1[pair] = row[ibd1_col]
            ibd0[pair] = row[ibd0_col]
        return ibd2, ibd1, ibd0

    def get_parents(
            possible_parents: List[str],
            relative_pairs: Set[Tuple[str, str]],
            sex: Dict[str, bool]) -> Union[Tuple[str, str], None]:
        """
        Given a list of possible parents for a sample (first degree relatives with low ibd2), looks for a single pair
        of samples that are unrelated with different sexes. If a single pair is found, return the pair (father, mother)

        :param list of str possible_parents: Possible parents (not modified)
        :param set of (str, str) relative_pairs: Sorted pairs of relatives, used to check that parents aren't related with each other
        :param dict of str -> bool sex: Dict mapping samples to their sex (True = female, False = male, None or missing = unknown)
        :return: (father, mother) if found, `None` otherwise
        :rtype: (str, str) or None
        """
        parents = []
        logging.info(f"You have {len(possible_parents)} possible parent(s)")
        # Work on a copy so that the caller's list is left untouched
        candidates = list(possible_parents)
        while len(candidates) > 1:
            p1 = candidates.pop()
            for p2 in candidates:
                pair = tuple(sorted((p1, p2)))
                logging.info(str(pair) + '\n')
                if pair not in relative_pairs:
                    logging.info(
                        "your potential parent's don't appear to be relatives\n"
                    )
                    logging.info("SEX p1: " + str(sex.get(p1)) + '\n')
                    logging.info("SEX p2: " + str(sex.get(p2)) + '\n')
                    if sex.get(p1) is False and sex.get(p2):
                        parents.append((p1, p2))
                        logging.info("found in order 1\n")
                    elif sex.get(p1) and sex.get(p2) is False:
                        parents.append((p2, p1))
                        logging.info("found in order 2\n")
                else:
                    logging.info("Your Parents are Related!!!\n\n")

        if len(parents) == 1:
            logging.info("Found your parents!\n")
            return parents[0]

        return None

    # This variant takes no list of duplicated samples, so nothing is excluded here.
    # An explicitly typed empty set replaces `hl.literal(set())` wrapped in a bare `except`.
    dups = hl.empty_set(hl.tstr)
    first_degree_pairs = kin_ht.filter(
        (kin_ht[pi_hat_col] >= first_degree_threshold[0])
        & (kin_ht[pi_hat_col] <= first_degree_threshold[1])
        & ~dups.contains(kin_ht[i_col])
        & ~dups.contains(kin_ht[j_col])).collect()

    # sample -> set of samples it is related to by first degree
    first_degree_relatives = defaultdict(set)
    for row in first_degree_pairs:
        first_degree_relatives[row[i_col]].add(row[j_col])
        first_degree_relatives[row[j_col]].add(row[i_col])

    # Add second degree relatives
    # This is needed to distinguish grandparent - child - parent from child - mother, father down the line
    second_degree_samples = kin_ht.filter(
        ((kin_ht[pi_hat_col] >= first_degree_threshold[0])
         & (kin_ht[pi_hat_col] <= first_degree_threshold[1]))
        | ((kin_ht[pi_hat_col] >= second_degree_threshold[0])
           & (kin_ht[ibd1_col] >= ibd1_second_degree_threshold)
           & (kin_ht[pi_hat_col] < second_degree_threshold[1]))).collect()

    ibd2, ibd1, ibd0 = get_indexed_ibd(second_degree_samples)
    # Built once so that each "are these two related?" check is a constant-time lookup
    related_pairs = set(ibd2)

    fam_id = 1
    trios = []
    duos = []
    decisions = {}
    while len(first_degree_relatives) > 0:
        # Every member of the family is popped below, so index 0 is always an unprocessed sample
        s_fam = get_fam_samples(
            list(first_degree_relatives)[0], set(), first_degree_relatives)
        for s in s_fam:
            logging.info(f"Processing sample: {s}")
            s_rel = first_degree_relatives.pop(s)
            possible_parents = []
            for rel in s_rel:
                pair = tuple(sorted((s, rel)))
                if (ibd2[pair] <= ibd2_parent_offspring_threshold
                        and ibd1[pair] >= ibd1_parent_offspring_threshold
                        and ibd0[pair] <= ibd0_parent_offspring_threshold):
                    possible_parents.append(rel)

            if len(possible_parents) == 1:
                # A single parent-like relative: report the pair as a duo
                duos.append(sorted((s, possible_parents[0])))
                decisions[s] = possible_parents[0]
            else:
                parents = get_parents(possible_parents, related_pairs, sex)
                decisions[s] = parents
                if parents is not None:
                    trios.append(
                        hl.Trio(s=s,
                                fam_id=str(fam_id),
                                pat_id=parents[0],
                                mat_id=parents[1],
                                is_female=sex.get(s)))

        fam_id += 1

    return hl.Pedigree(trios), duos, decisions
def infer_families(
        kin_ht: hl.Table,
        sex: Dict[str, bool],
        duplicated_samples: Set[str],
        i_col: str = 'i',
        j_col: str = 'j',
        kin_col: str = 'kin',
        ibd2_col: str = 'ibd2',
        first_degree_threshold: Tuple[float, float] = (0.2, 0.4),
        second_degree_threshold: Tuple[float, float] = (0.05, 0.16),
        ibd2_parent_offspring_threshold: float = 0.2) -> hl.Pedigree:
    """
    Infers familial relationships from the results of pc_relate and sex information.
    Note that both kinship and ibd2 are needed in the pc_relate output.

    This function returns a pedigree containing trios inferred from the data. Family ID can be the same for multiple
    trios if one or more members of the trios are related (e.g. sibs, multi-generational family). Trios are ordered by family ID.

    Note that this function only returns complete trios defined as:
    one child, one father and one mother (sex is required for both parents)

    :param Table kin_ht: pc_relate output table
    :param dict of str -> bool sex: A dict containing the sex for each sample. True = female, False = male, None = unknown
    :param set of str duplicated_samples: Duplicated samples to remove (may be empty). The inference assumes that each
        child has exactly two parents, so duplicates left in the data can prevent trios from being found.
    :param str i_col: Column containing the 1st sample id in the pc_relate table
    :param str j_col: Column containing the 2nd sample id in the pc_relate table
    :param str kin_col: Column containing the kinship in the pc_relate table
    :param str ibd2_col: Column containing ibd2 in the pc_relate table
    :param (float, float) first_degree_threshold: Exclusive lower/upper bounds for kin for 1st degree relatives
    :param (float, float) second_degree_threshold: Lower/upper bounds for kin for 2nd degree relatives
        (only the lower bound is used by this function)
    :param float ibd2_parent_offspring_threshold: Upper bound (exclusive) on ibd2 for a parent/offspring
    :return: Pedigree containing all trios in the data
    :rtype: Pedigree
    """

    def get_fam_samples(
            sample: str,
            fam: Set[str],
            samples_rel: Dict[str, Set[str]],
    ) -> Set[str]:
        """
        Given a sample, its known family and a dict that links samples with their relatives, outputs the set of
        samples that constitute this sample family.

        :param str sample: sample
        :param set of str fam: sample known family (updated in place)
        :param dict of str -> set of str samples_rel: dict(sample -> set(sample_relatives))
        :return: Family including the sample
        :rtype: set of str
        """
        fam.add(sample)
        for s2 in samples_rel[sample]:
            if s2 not in fam:
                fam = get_fam_samples(s2, fam, samples_rel)
        return fam

    def get_indexed_ibd2(
            pc_relate_rows: List[hl.Struct]) -> Dict[Tuple[str, str], float]:
        """
        Given rows from a pc_relate table, creates a dict with:
        keys: Pairs of individuals, lexically ordered
        values: ibd2

        :param list of hl.Struct pc_relate_rows: Rows from a pc_relate table
        :return: Dict of lexically ordered pairs of individuals -> ibd2
        :rtype: dict of (str, str) -> float
        """
        return {
            tuple(sorted((row[i_col], row[j_col]))): row[ibd2_col]
            for row in pc_relate_rows
        }

    def get_parents(
            possible_parents: List[str],
            indexed_kinship: Dict[Tuple[str, str], float],
            sex: Dict[str, bool]) -> Optional[Tuple[str, str]]:
        """
        Given a list of possible parents for a sample (first degree relatives with low ibd2), looks for a single pair
        of samples that are unrelated with different sexes. If a single pair is found, return the pair (father, mother)

        :param list of str possible_parents: Possible parents (not modified)
        :param dict of (str, str) -> float indexed_kinship: Dict keyed by sorted pairs of related individuals;
            only key membership is used, to check that parents aren't related with each other
        :param dict of str -> bool sex: Dict mapping samples to their sex (True = female, False = male, None or missing = unknown)
        :return: (father, mother) if exactly one valid pair is found, `None` otherwise
        :rtype: (str, str) or None
        """
        parents = []
        # Work on a copy so that the caller's list is left untouched
        candidates = list(possible_parents)
        while len(candidates) > 1:
            p1 = candidates.pop()
            for p2 in candidates:
                if tuple(sorted((p1, p2))) not in indexed_kinship:
                    if sex.get(p1) is False and sex.get(p2):
                        parents.append((p1, p2))
                    elif sex.get(p1) and sex.get(p2) is False:
                        parents.append((p2, p1))

        if len(parents) == 1:
            return parents[0]

        return None

    # Get first degree relatives - exclude duplicate samples
    # An empty collection is given an explicit type here instead of being passed to
    # `hl.literal`, which is assumed to be unable to infer an element type from it.
    dups = (hl.literal(duplicated_samples)
            if duplicated_samples else hl.empty_set(hl.tstr))
    first_degree_pairs = kin_ht.filter(
        (kin_ht[kin_col] > first_degree_threshold[0])
        & (kin_ht[kin_col] < first_degree_threshold[1])
        & ~dups.contains(kin_ht[i_col])
        & ~dups.contains(kin_ht[j_col])).collect()

    first_degree_relatives = defaultdict(set)
    for row in first_degree_pairs:
        first_degree_relatives[row[i_col]].add(row[j_col])
        first_degree_relatives[row[j_col]].add(row[i_col])

    # Without any first degree pair there is no trio to infer (and no literal to build below)
    if not first_degree_relatives:
        return hl.Pedigree([])

    # Add second degree relatives for those samples
    # This is needed to distinguish grandparent - child - parent from child - mother, father down the line
    first_degree_samples = hl.literal(set(first_degree_relatives.keys()))
    second_degree_samples = kin_ht.filter(
        (first_degree_samples.contains(kin_ht[i_col])
         | first_degree_samples.contains(kin_ht[j_col]))
        & (kin_ht[kin_col] > second_degree_threshold[0])
        & (kin_ht[kin_col] < first_degree_threshold[1])).collect()

    ibd2 = get_indexed_ibd2(second_degree_samples)

    fam_id = 1
    trios = []
    while len(first_degree_relatives) > 0:
        # Every member of the family is popped below, so index 0 is always an unprocessed sample
        s_fam = get_fam_samples(
            list(first_degree_relatives)[0], set(), first_degree_relatives)
        for s in s_fam:
            s_rel = first_degree_relatives.pop(s)
            possible_parents = [
                rel for rel in s_rel
                if ibd2[tuple(sorted((s, rel)))] < ibd2_parent_offspring_threshold
            ]

            parents = get_parents(possible_parents, ibd2, sex)

            if parents is not None:
                trios.append(
                    hl.Trio(s=s,
                            fam_id=str(fam_id),
                            pat_id=parents[0],
                            mat_id=parents[1],
                            is_female=sex.get(s)))

        fam_id += 1

    return hl.Pedigree(trios)