def __init__(self, pandas_obj):
    """Validate the wrapped pandas object and store it on the accessor.

    Parameters
    ----------
    pandas_obj
        The object being wrapped. It must provide ``dtypes`` and ``items()``
        (i.e. behave like a DataFrame).

    Raises
    ------
    AttributeError
        If no column has a GenotypeDtype, or if two or more genotype columns
        share the same variant ID.
    """
    if not pandas_obj.dtypes.apply(lambda dt: GenotypeDtype.is_dtype(dt)).any():
        raise AttributeError(
            "Incompatible datatypes: at least one column must be a GenotypeDtype."
        )

    # Count how many genotype columns use each variant ID.
    # `items()` is used because `iteritems()` was removed in pandas 2.0.
    id_counts = Counter(
        s.genomics.variant.id
        for _, s in pandas_obj.items()
        if GenotypeDtype.is_dtype(s)
    )

    # Any ID counted more than once is a duplicate; checking the counter
    # directly avoids depending on a second, separate count of genotype columns.
    duplicates = [(k, v) for k, v in id_counts.items() if v >= 2]
    if duplicates:
        raise AttributeError(
            "Duplicate Variant IDs. Column names may differ from variant IDs, but variant IDs must be unique.\n\tDuplicates: "
            + ", ".join([f"{dupe} ({count:,})" for dupe, count in duplicates])
        )

    self._obj = pandas_obj
def generate_random_gt(
    variant: Variant,
    alt_allele_freq: Union[List[float], float],
    n: int = 1000,
    random_seed: int = 1855,
) -> GenotypeArray:
    """
    Simulate random genotypes according to the provided allele frequencies

    Parameters
    ----------
    variant: Variant
    alt_allele_freq: float or List[float]
        Allele frequencies for each alternate allele in the variant
        (Biallelic variants may specify a single number). Any sequence of
        numbers is accepted and converted to a list.
    n: int, default 1000
        How many genotypes to simulate
    random_seed: int, default 1855
        Seed for the private random number generator used for the simulation.
        The global NumPy random state is left untouched.

    Returns
    -------
    GenotypeArray

    Raises
    ------
    ValueError
        If the number of frequencies does not match the number of alternate
        alleles, or if the frequencies sum to more than 1.0.
    """
    # Normalize the frequencies to a plain list of floats so that a single
    # number (including the ints 0 and 1), a tuple or an array all work
    if isinstance(alt_allele_freq, (int, float)):
        alt_allele_freq = [float(alt_allele_freq)]
    else:
        alt_allele_freq = [float(freq) for freq in alt_allele_freq]

    # Validate frequencies
    if len(alt_allele_freq) != len(variant.alleles) - 1:
        raise ValueError(
            f"The number of provided frequencies ({len(alt_allele_freq)}) doesn't match"
            f" the number of alternate alleles in the variant ({len(variant.alleles)-1})."
        )
    # Allow for floating-point rounding when the frequencies add up to 1.0;
    # the tolerance is well inside the one np.random.choice applies to `p`
    rounding_tolerance = 1e-9
    total_alt_freq = sum(alt_allele_freq)
    if total_alt_freq > 1.0 + rounding_tolerance:
        raise ValueError(
            f"The provided frequencies must not sum to > 1.0 (sum was {total_alt_freq:.3e})"
        )

    # Set remaining odds to the reference allele (never below zero)
    allele_freq = [max(0.0, 1 - total_alt_freq)] + alt_allele_freq

    # Choose gts with a private generator: this yields the same draws as seeding
    # the global generator, without changing the caller's random state
    rng = np.random.RandomState(random_seed)
    genotypes = rng.choice(
        range(len(variant.alleles)), p=allele_freq, size=(n, variant.ploidy)
    )

    # Create GenotypeArray representation of the data
    dtype = GenotypeDtype(variant)
    # Every simulated genotype gets MISSING_IDX in its second record field
    scores = np.ones(n) * MISSING_IDX
    data = np.array(list(zip(genotypes, scores)), dtype=dtype._record_type)
    gt_array = GenotypeArray(values=data, dtype=dtype)
    return gt_array
def dtype():
    """Return a GenotypeDtype for a fixed variant (rs12345 on chr1 with two alternate alleles)."""
    variant_fields = dict(
        chromosome="chr1",
        position=123456,
        id="rs12345",
        ref="A",
        alt=["T", "G"],
        score=30,
    )
    return GenotypeDtype(variant=Variant(**variant_fields))
def encode_dominant(self) -> pd.DataFrame:
    """Dominant encoding of genotypes.

    See :meth:`GenotypeArray.encode_dominant`

    Columns that are not a GenotypeDtype are passed through unchanged.

    Returns
    -------
    pd.DataFrame
    """
    # `items()` is used because `iteritems()` was removed in pandas 2.0
    return pd.concat(
        [
            s.genomics.encode_dominant() if GenotypeDtype.is_dtype(s) else s
            for _, s in self._obj.items()
        ],
        axis=1,
    )
def _get_snp2_gt_array(self, gt_table_idxs):
    """Build the SNP2 GenotypeArray by looking up each genotype table index."""
    snp2_dtype = GenotypeDtype(self.snp2)
    # Nine-entry lookup table: indices 0-2 map to (0, 0), 3-5 to (0, 1) and
    # 6-8 to (1, 1); every entry carries MISSING_IDX as its second field
    lookup = tuple(
        (allele_idxs, MISSING_IDX)
        for allele_idxs in ((0, 0), (0, 1), (1, 1))
        for _ in range(3)
    )
    records = [lookup[table_idx] for table_idx in gt_table_idxs]
    values = np.array(records, dtype=snp2_dtype._record_type)
    return GenotypeArray(values=values, dtype=snp2_dtype)
def encode_edge(self, encoding_info: pd.DataFrame) -> pd.DataFrame:
    """EDGE (weighted) encoding of genotypes.

    See :meth:`GenotypeArray.encode_edge`

    Columns that are not a GenotypeDtype are passed through unchanged.
    Genotype columns that cannot be encoded are left out of the result and
    reported with a printed message.

    Parameters
    ----------
    encoding_info: pd.DataFrame
        columns:
            - Variant ID - used to match variants
            - Alpha Value - used for heterozygous genotypes
            - Ref Allele - which allele is considered reference
            - Alt Allele - which allele is considered alternate
            - Minor Allele Frequency - MAF of data used during calculation of alpha values

    Returns
    -------
    pd.DataFrame

    Raises
    ------
    ValueError
        If a required column is missing from `encoding_info` or if it
        contains duplicate variant IDs.
    """
    # Validate the input DataFrame
    for required_col in [
        "Variant ID",
        "Alpha Value",
        "Ref Allele",
        "Alt Allele",
        "Minor Allele Frequency",
    ]:
        if required_col not in list(encoding_info):
            raise ValueError(
                f"Missing one or more required columns in the encoding info: `{required_col}`"
            )
    id_counts = encoding_info["Variant ID"].value_counts()
    duplicated_ids = id_counts[id_counts > 1].index
    if len(duplicated_ids) > 0:
        # IDs are converted with str() so that non-string IDs can be reported too
        raise ValueError(
            f"Duplicate IDs: {', '.join(str(v) for v in duplicated_ids)}"
        )

    # Rename the columns to match parameter names for simplicity
    encoding_info = encoding_info.rename(
        columns={
            "Alpha Value": "alpha_value",
            "Ref Allele": "ref_allele",
            "Alt Allele": "alt_allele",
            "Minor Allele Frequency": "minor_allele_freq",
        }
    )

    # Convert the encoding info into a Dict("Variant ID" = {param names : param values})
    encoding_info = {
        d["Variant ID"]: {k: v for k, v in d.items() if k != "Variant ID"}
        for d in encoding_info.to_dict(orient="records")
    }

    # Messages for variants that could not be encoded, keyed by variant ID
    failures = dict()

    # Process each variant
    # `items()` is used because `iteritems()` was removed in pandas 2.0
    results = []
    for _, s in self._obj.items():
        if not GenotypeDtype.is_dtype(s):
            results.append(s)
            continue
        variant_id = s.array.variant.id
        info = encoding_info.get(variant_id, None)
        if info is None:
            failures[variant_id] = "No matching information found in the encoding data"
            continue
        elif (s.genomics.maf / info["minor_allele_freq"]) > 10e30:
            # TODO: replace this with a reasonable comparison to the data MAF. For now it is an always-pass criteria
            failures[variant_id] = (
                f"Large MAF Difference: {s.genomics.maf} in sample, {info['minor_allele_freq']} in encoding data"
            )
            continue
        else:
            # Best effort: a failure for one variant is recorded and reported
            # below rather than aborting the whole encoding
            try:
                results.append(s.genomics.encode_edge(**info))
            except Exception as e:
                failures[variant_id] = str(e)

    # Print Warnings
    if len(failures) > 0:
        print(f"{len(failures):,} Variables failed encoding")
        for var, warning in failures.items():
            print(f"\t{var}: {warning}")

    # Concatenate results
    return pd.concat(results, axis=1)
def calculate_edge_alphas(
    genotypes: Union[pd.Series, pd.DataFrame],
    data: pd.DataFrame,
    outcome_variable: str,
    covariates: Optional[List[str]] = None,
):
    """
    Calculate alpha values to be used in EDGE encoding

    Parameters
    ----------
    genotypes:
        A GenotypeArray Series or DataFrame
    data:
        Data to be used in the regression, including the outcome and covariates
    outcome_variable:
        The variable to be used as the output (y) of the regression
    covariates:
        Other variables to be included in the regression formula

    Returns
    -------
    pd.DataFrame
        One row per variant with a usable regression result, with columns:
            - Variant ID
            - Alpha Value - used for heterozygous genotypes
            - Ref Allele - which allele is considered reference
            - Alt Allele - which allele is considered alternate
            - Minor Allele Frequency - MAF of data used during calculation of alpha values

    Raises
    ------
    ValueError
        If the parameters are invalid, the outcome is neither continuous nor
        binary, outcome values are missing, names collide between `genotypes`
        and `data`, the indexes cannot be fully merged, or no variant
        produced a result.
    AttributeError
        If any column of `genotypes` is not a GenotypeDtype.

    Notes
    -----
    See [1]_ for more information about EDGE encoding.

    References
    ----------
    .. [1] Hall, Molly A., et al.
           "Novel EDGE encoding method enhances ability to identify genetic interactions."
           PLoS genetics 17.6 (2021): e1009534.
    """
    # Validate parameters
    if covariates is None:
        covariates = []
    # Covariates must be a list
    if not isinstance(covariates, list):
        raise ValueError("'covariates' must be specified as a list or set to None")

    # Convert Series to a DataFrame for simpler processing later on
    if isinstance(genotypes, pd.Series):
        genotypes = pd.DataFrame(genotypes)

    # Extract specific data
    if isinstance(data, pd.Series):
        if data.name != outcome_variable:
            raise ValueError(
                "The data is a Series but it's name doesn't match the outcome variable"
            )
        # Ensure data is a DataFrame from here on for simplicity
        data = pd.DataFrame(data)
    else:
        try:
            data = data[[outcome_variable] + covariates]
        except KeyError as e:
            raise ValueError(f"Missing variable in provided data: {e}") from e

    # Check Types to determine which kind of regression to run
    dtypes = _get_types(data)
    outcome_type = dtypes.get(outcome_variable)
    if outcome_type == "continuous":
        family = sm.families.Gaussian(link=sm.families.links.identity())
        use_t = True
    elif outcome_type == "binary":
        # Use the order according to the categorical
        counts = data[outcome_variable].value_counts().to_dict()
        categories = data[outcome_variable].cat.categories
        codes, categories = zip(*enumerate(categories))
        # Assign the result back explicitly: an inplace method called on a
        # selected column is chained assignment and does not reliably update `data`
        data[outcome_variable] = data[outcome_variable].replace(categories, codes)
        print(
            f"Binary Outcome (family = Binomial): '{outcome_variable}'\n"
            f"\t{counts[categories[0]]:,} occurrences of '{categories[0]}' coded as 0\n"
            f"\t{counts[categories[1]]:,} occurrences of '{categories[1]}' coded as 1"
        )
        family = sm.families.Binomial(link=sm.families.links.logit())
        use_t = False
    else:
        # Without this branch `family` and `use_t` would be undefined when the
        # regressions are run below
        raise ValueError(
            f"The outcome variable '{outcome_variable}' has an unsupported type"
            f" ('{outcome_type}'): it must be continuous or binary"
        )

    # Check for missing outcomes
    na_outcome_count = data[outcome_variable].isna().sum()
    if na_outcome_count > 0:
        raise ValueError(f"{na_outcome_count} samples are missing an outcome value")

    # Ensure genotypes data is actually all genotypes
    is_genotype = genotypes.dtypes.apply(lambda dt: GenotypeDtype.is_dtype(dt))
    if not is_genotype.all():
        incorrect = genotypes.dtypes[~is_genotype]
        raise AttributeError(
            f"Incompatible datatypes: all columns must be a GenotypeDtype: {incorrect}"
        )

    # Merge genotypes and data
    gt_col_names = list(genotypes)
    for col in list(data):
        if col in gt_col_names:
            raise ValueError(
                f"Outcome and covariate names should not exist in `genotypes`: Check '{col}'"
            )
    merged = genotypes.merge(data, how="inner", left_index=True, right_index=True)
    if len(merged) == 0:
        raise ValueError(
            "Unable to merge the genotypes with the data.  Check the index values."
        )
    elif len(merged) < len(genotypes):
        raise ValueError(
            f"Only {len(merged):,} of {len(genotypes):,} genotypes were merged to the data.  Check the index values."
        )

    # Run regressions
    results = []
    for gt in gt_col_names:
        result = {
            "Variant ID": merged[gt].genomics.variant.id,
            "Alpha Value": np.nan,
            "Ref Allele": merged[gt].genomics.variant.ref,
            "Alt Allele": merged[gt].genomics.variant.alt,
            "Minor Allele Frequency": merged[gt].genomics.maf,
        }
        encoded = merged[gt].genomics.encode_codominant()
        df = pd.concat([data, encoded], axis=1)
        formula = f"Q('{outcome_variable}') ~ Q('{gt}')"
        if len(covariates) > 0:
            formula += " + "
            formula += " + ".join([f"Q('{c}')" for c in covariates])
        y, X = patsy.dmatrices(formula, df, return_type="dataframe", NA_action="drop")
        y = fix_names(y)
        X = fix_names(X)
        # Drop the intercept column
        # This can be done in the formula, but causes issues with the dummy variable encoding
        X = X.drop(columns=["Intercept"])
        # Run Regression
        est = sm.GLM(y, X, family=family).fit(use_t=use_t)
        # Save results if the regression converged
        if est.converged:
            if est.params[f"{gt}[T.Hom]"] == 0:
                # Alpha is the Het beta divided by the Hom beta, so a Hom beta
                # of zero gives no usable value
                print(f"No results for {gt}: The homozygous alternate beta value was 0")
                continue
            else:
                result["Alpha Value"] = (
                    est.params[f"{gt}[T.Het]"] / est.params[f"{gt}[T.Hom]"]
                )
        else:
            print(f"No results for {gt}: Regression did not converge")
            continue
        results.append(result)

    if len(results) == 0:
        raise ValueError("No results (see printed errors)")
    else:
        result = pd.DataFrame(results)
        return result
def test_size(input_str, size):
    """Check the itemsize of a GenotypeDtype constructed from a string."""
    assert GenotypeDtype.construct_from_string(input_str).itemsize == size
def test_from_str(input_str, variant):
    """Check that a GenotypeDtype built from a string carries the expected variant."""
    parsed_variant = GenotypeDtype.construct_from_string(input_str).variant
    assert parsed_variant == variant