def parse_siblings(dir_data, logger=None, f_mids='mid.csv'):
    """
    Parse mixed-gender sibling pairs by referencing member ID LUT and relationship matrix.

    Siblings RID is 2. Every sibling pair whose two members carry the same gender
    label (brother-brother or sister-sister) is dropped, so only brother-sister
    pairs are returned.

    :param dir_data: Directory containing folders of FIDs (e.g., F0001/, ..., F????/).
    :param logger: Logger used to report problems. When None, a module-level
        logger obtained from ``logging.getLogger(__name__)`` is used instead.
    :param f_mids: File name forwarded to ``db.load_mids`` as ``f_csv``.
    :return: List of ``db.Pair`` objects of kind 'siblings'; the member IDs of
        each pair are the relationship-matrix indices plus one.
    """
    import logging

    # The signature advertises logger=None, so make that default usable instead
    # of failing with AttributeError on the first logger call.
    if logger is None:
        logger = logging.getLogger(__name__)

    kind = 'siblings'
    # family directories
    dirs_fid, fid_list = load_fids(dir_data)

    print("{} families are being processed".format(len(fid_list)))
    # Load MID LUT for all FIDs.
    df_mids = db.load_mids(dirs_fid, f_csv=f_mids)

    siblings = []
    for i, fid in enumerate(fid_list):
        rel_mat = db.parse_relationship_matrices(df_mids[i])
        genders = list(df_mids[i].Gender)

        success, genders = helpers.check_gender_label(genders)
        if not success:
            # Reported but not fatal: the family is still processed.
            logger.error("Gender notation incorrect for {}".format(fid))

        # Cells holding RID 2 mark sibling relationships.
        sibling_ids = np.where(rel_mat == 2)

        if not helpers.check_npairs(len(sibling_ids[1]), kind, fid):
            continue

        candidates = db.get_unique_pairs(sibling_ids)
        mixed_pairs = list(set(candidates))
        for pair in candidates:
            first, second = genders[pair[0]], genders[pair[1]]
            # NOTE(review): the substring test assumes labels were normalised by
            # helpers.check_gender_label (e.g. 'm'/'f'); a raw 'female' label
            # would also match 'm' - confirm against that helper.
            same_gender = ('m' in first and 'm' in second) or \
                          ('f' in first and 'f' in second)
            # The membership guard keeps a pair repeated in `candidates` from
            # raising ValueError on its second removal attempt.
            if same_gender and pair in mixed_pairs:
                # remove if brother or sister pair
                print("Removing", pair)
                mixed_pairs.remove(pair)

        for idx, pair in enumerate(mixed_pairs):
            print((idx, pair))
            # Matrix indices are zero-based; member IDs are index + 1.
            mids = list(np.array(pair) + 1)
            siblings.append(db.Pair(mids=mids, fid=fid, kind=kind))
    return siblings
def parse_sisters(dir_data, logger=None, f_mids='mid.csv'):
    """
    Parse sister pairs by referencing member ID LUT and relationship matrix.

    Siblings RID is 2 and sisters, of course, are Females. Thus, these are the
    factors we use to identify pairs.

    :param dir_data: Directory containing folders of FIDs (e.g., F0001/, ..., F????/).
    :param logger: Logger used for progress and error messages. When None, a
        module-level logger obtained from ``logging.getLogger(__name__)`` is used.
    :param f_mids: File name forwarded to ``db.load_mids`` as ``f_csv``.
    :return: The pair list accumulated through ``db.set_pairs`` across all families.
    """
    import logging

    # The signature advertises logger=None, so make that default usable instead
    # of failing with AttributeError on the first logger call.
    if logger is None:
        logger = logging.getLogger(__name__)

    kind = 'sisters'
    # family directories
    dirs_fid, fid_list = load_fids(dir_data)

    logger.info("{} families are being processed".format(len(fid_list)))
    # Load MID LUT for all FIDs.
    df_mids = db.load_mids(dirs_fid, f_csv=f_mids)

    sisters = []
    for i, fid in enumerate(fid_list):
        rel_mat = db.parse_relationship_matrices(df_mids[i])
        genders = list(df_mids[i].Gender)

        success, genders = helpers.check_gender_label(genders)
        if not success:
            # Reported but not fatal: the family is still processed.
            logger.error("Gender notation incorrect for {}".format(fid))

        # presumably restricts the matrix to female members so that only
        # sister-sister cells survive - confirm against db.specify_gender
        rel_mat = db.specify_gender(rel_mat, genders, 'f')
        # Cells holding RID 2 mark sibling relationships.
        sister_ids = np.where(rel_mat == 2)

        if not helpers.check_npairs(len(sister_ids[1]), kind, fid):
            continue

        # add to list of sisters
        sisters = db.set_pairs(sisters, sister_ids, kind, fid)
    return sisters
def parse_grandparents(dir_data, logger=None, f_mids='mid.csv'):
    """
    Parse grandparent-grandchild pairs by referencing member ID LUT and relationship matrix.

    Cells of the relationship matrix equal to 6 are taken as (grandparent,
    grandchild) index pairs. Each pair is filed by the gender labels of its two
    members into one of four groups: grandfather-granddaughter (gfgd),
    grandfather-grandson (gfgs), grandmother-granddaughter (gmgd) and
    grandmother-grandson (gmgs). A label containing 'f' is treated as female,
    anything else as male.

    :param dir_data: Directory containing folders of FIDs (e.g., F0001/, ..., F????/).
    :param logger: Logger used for progress, warning and error messages. When
        None, a module-level logger from ``logging.getLogger(__name__)`` is used.
    :param f_mids: File name forwarded to ``db.load_mids`` as ``f_csv``.
    :return: Tuple ``(gfgd, gfgs, gmgd, gmgs)`` of lists of ``db.Pair``; each
        pair's ``kind`` equals the name of the list that holds it and its member
        IDs are the matrix indices plus one.
    """
    import logging

    # The signature advertises logger=None, so make that default usable instead
    # of failing with AttributeError on the first logger call.
    if logger is None:
        logger = logging.getLogger(__name__)

    # family directories
    dirs_fid, fid_list = load_fids(dir_data)

    logger.info("{} families are being processed".format(len(fid_list)))
    # Load MID LUT for all FIDs.
    df_mids = db.load_mids(dirs_fid, f_csv=f_mids)

    # One bucket per pair kind; keyed by the same string stored in Pair.kind so
    # the list a pair lands in can never disagree with its label.
    pairs = {'gfgd': [], 'gfgs': [], 'gmgd': [], 'gmgs': []}

    # NOTE(review): this label is only handed to helpers.check_npairs; it reads
    # like a leftover from a parent-child parser - confirm before renaming.
    kind = 'parent-child'
    for i, fid in enumerate(fid_list):
        rel_mat = db.parse_relationship_matrices(df_mids[i])
        genders = list(df_mids[i].Gender)

        success, genders = helpers.check_gender_label(genders)
        if not success:
            # Reported but not fatal: the family is still processed.
            logger.error("Gender notation incorrect for {}".format(fid))

        # RID 3 / RID 6 cells; presumably the two directions of the same
        # grandchild/grandparent relation, hence the count comparison below.
        c_ids = np.where(rel_mat == 3)
        p_ids = np.where(rel_mat == 6)
        if len(c_ids[0]) != len(p_ids[0]):
            # Logger.warn is deprecated (and removed in recent Python releases).
            logger.warning("Number of children and parents are different.")

        if not helpers.check_npairs(len(c_ids[0]), kind, fid):
            continue

        for p in zip(p_ids[0], p_ids[1]):
            print(p)
            # Matrix indices are zero-based; member IDs are index + 1.
            gp_mid, gc_mid = np.array(p) + 1
            gp_gender = genders[p[0]]
            gc_gender = genders[p[1]]

            # 'gm'/'gf' = grandmother/grandfather, 'gd'/'gs' = granddaughter/grandson.
            pair_kind = ('gm' if 'f' in gp_gender else 'gf') + \
                        ('gd' if 'f' in gc_gender else 'gs')
            pairs[pair_kind].append(
                db.Pair(mids=(gp_mid, gc_mid), fid=fid, kind=pair_kind))
    return pairs['gfgd'], pairs['gfgs'], pairs['gmgd'], pairs['gmgs']