def __init__(self, pandas_obj):
    """Validate ``pandas_obj`` and keep a reference to it as ``self._obj``.

    Parameters
    ----------
    pandas_obj
        The object being wrapped.  It must expose ``dtypes`` and ``items()``
        the way a pandas DataFrame does, and at least one of its columns must
        be a GenotypeDtype column.

    Raises
    ------
    AttributeError
        If no column has a GenotypeDtype, or if two or more genotype columns
        carry the same variant ID.
    """
    if not pandas_obj.dtypes.apply(
            lambda dt: GenotypeDtype.is_dtype(dt)).any():
        raise AttributeError(
            "Incompatible datatypes: at least one column must be a GenotypeDtype."
        )

    # Count how many genotype columns carry each variant ID.
    # ``DataFrame.iteritems()`` was removed in pandas 2.0; ``items()`` yields
    # the same (label, Series) pairs on every pandas version.
    id_counts = Counter(
        s.genomics.variant.id for _, s in pandas_obj.items()
        if GenotypeDtype.is_dtype(s))

    # Any ID seen more than once is a duplicate (column names are not checked)
    duplicates = [(k, v) for k, v in id_counts.items() if v >= 2]
    if duplicates:
        raise AttributeError(
            "Duplicate Variant IDs.  Column names may differ from variant IDs, but variant IDs must be unique.\n\tDuplicates: "
            + ", ".join(f"{dupe} ({count:,})" for dupe, count in duplicates))

    self._obj = pandas_obj
示例#2
0
def generate_random_gt(
    variant: Variant,
    alt_allele_freq: Union[List[float], float],
    n: int = 1000,
    random_seed: int = 1855,
) -> GenotypeArray:
    """
    Simulate random genotypes according to the provided allele frequencies

    Parameters
    ----------
    variant: Variant
        The variant being simulated.  Its ``alleles`` determine how many
        frequencies are required and its ``ploidy`` determines how many
        alleles are drawn per genotype.
    alt_allele_freq: float or List[float]
        Allele frequencies for each alternate allele in the variant
        (biallelic variants may specify a single scalar value)
    n: int, default 1000
        How many genotypes to simulate
    random_seed: int, default 1855
        Seed for the random draws.  A private generator is used, so the
        global NumPy random state is not modified.

    Returns
    -------
    GenotypeArray

    Raises
    ------
    ValueError
        If the number of frequencies does not match the number of alternate
        alleles, if any frequency is negative, or if the frequencies sum to
        more than 1.0.
    """
    # Normalise the frequencies into a plain list of floats.  Any scalar is
    # accepted (e.g. an int 0/1 or a NumPy scalar), not only a Python float.
    if np.isscalar(alt_allele_freq):
        alt_freqs = [float(alt_allele_freq)]
    else:
        alt_freqs = [float(freq) for freq in alt_allele_freq]

    # Validate frequencies
    n_alt_alleles = len(variant.alleles) - 1
    if len(alt_freqs) != n_alt_alleles:
        raise ValueError(
            f"The number of provided frequencies ({len(alt_freqs)}) doesn't match"
            f" the number of alternate alleles in the variant ({n_alt_alleles})."
        )
    if any(freq < 0 for freq in alt_freqs):
        raise ValueError("The provided frequencies must not be negative")
    total_alt_freq = sum(alt_freqs)
    # Tolerate floating point rounding when the frequencies sum to exactly 1
    rounding_tolerance = 1e-9
    if total_alt_freq > 1.0 + rounding_tolerance:
        raise ValueError(
            f"The provided frequencies must not sum to > 1.0 (sum was {total_alt_freq:.3e})"
        )

    # Set remaining odds to the reference allele (never below zero)
    allele_freq = [max(0.0, 1.0 - total_alt_freq)] + alt_freqs

    # Choose gts.  RandomState(seed) produces the same stream as seeding the
    # global generator, without clobbering the caller's random state.
    rng = np.random.RandomState(random_seed)
    genotypes = rng.choice(range(len(variant.alleles)),
                           p=allele_freq,
                           size=(n, variant.ploidy))

    # Create GenotypeArray representation of the data
    dtype = GenotypeDtype(variant)
    # Every simulated genotype gets the MISSING_IDX placeholder as its score
    scores = np.ones(n) * MISSING_IDX
    data = np.array(list(zip(genotypes, scores)), dtype=dtype._record_type)
    gt_array = GenotypeArray(values=data, dtype=dtype)

    return gt_array
示例#3
0
def dtype():
    """Return a GenotypeDtype built around a fixed example variant.

    The variant is rs12345 on chr1 at position 123456, with reference
    allele "A", alternate alleles "T" and "G", and a score of 30.
    """
    variant_fields = dict(
        chromosome="chr1",
        position=123456,
        id="rs12345",
        ref="A",
        alt=["T", "G"],
        score=30,
    )
    example_variant = Variant(**variant_fields)
    return GenotypeDtype(variant=example_variant)
    def encode_dominant(self) -> pd.DataFrame:
        """Dominant encoding of genotypes.

        Every GenotypeDtype column of the wrapped object is replaced by its
        dominant encoding; all other columns are passed through unchanged.

        See :meth:`GenotypeArray.encode_dominant`

        Returns
        -------
        pd.DataFrame
        """
        # ``DataFrame.iteritems()`` was removed in pandas 2.0; ``items()``
        # yields the same (label, Series) pairs on every pandas version.
        encoded_columns = [
            s.genomics.encode_dominant() if GenotypeDtype.is_dtype(s) else s
            for _, s in self._obj.items()
        ]
        return pd.concat(encoded_columns, axis=1)
 def _get_snp2_gt_array(self, gt_table_idxs):
     """Build the SNP2 GenotypeArray that corresponds to genotype table indices.

     Each index in ``gt_table_idxs`` selects one entry of a fixed nine-entry
     lookup; the selected records are packed into an array typed for
     ``self.snp2``.
     """
     snp2_dtype = GenotypeDtype(self.snp2)
     # Nine entries: each allele-index pair (0,0), (0,1), (1,1) repeated three
     # times in a row, always paired with the MISSING_IDX placeholder.
     # NOTE(review): presumably rows of a 3x3 SNP1 x SNP2 table - confirm.
     lookup = tuple(
         (allele_idxs, MISSING_IDX)
         for allele_idxs in ((0, 0), (0, 1), (1, 1))
         for _ in range(3)
     )
     selected = [lookup[table_idx] for table_idx in gt_table_idxs]
     records = np.array(selected, dtype=snp2_dtype._record_type)
     return GenotypeArray(values=records, dtype=snp2_dtype)
    def encode_edge(self, encoding_info: pd.DataFrame) -> pd.DataFrame:
        """EDGE (weighted) encoding of genotypes.

        Genotype columns are encoded using the matching row of
        ``encoding_info``; non-genotype columns are passed through unchanged.
        Genotype columns that have no matching row, or whose encoding raises,
        are left out of the result and reported on stdout.

        See :meth:`GenotypeArray.encode_edge`

        Parameters
        ----------
        encoding_info: pd.DataFrame
            columns:
                - Variant ID - used to match variants
                - Alpha Value - used for heterozygous genotypes
                - Ref Allele - which allele is considered reference
                - Alt Allele - which allele is considered alternate
                - Minor Allele Frequency - MAF of data used during calculation of alpha values

        Returns
        -------
        pd.DataFrame

        Raises
        ------
        ValueError
            If a required column is missing from ``encoding_info`` or if it
            contains duplicate variant IDs.
        """
        # Validate the input DataFrame
        for required_col in (
                "Variant ID",
                "Alpha Value",
                "Ref Allele",
                "Alt Allele",
                "Minor Allele Frequency",
        ):
            if required_col not in encoding_info.columns:
                raise ValueError(
                    f"Missing one or more required columns in the encoding info: `{required_col}`"
                )
        id_counts = encoding_info["Variant ID"].value_counts()
        duplicate_ids = id_counts[id_counts > 1].index
        if len(duplicate_ids) > 0:
            # str() so that non-string IDs can't turn this into a TypeError
            raise ValueError(
                f"Duplicate IDs: {', '.join(str(v) for v in duplicate_ids)}")

        # Rename the columns to match parameter names for simplicity
        encoding_info = encoding_info.rename(
            columns={
                "Alpha Value": "alpha_value",
                "Ref Allele": "ref_allele",
                "Alt Allele": "alt_allele",
                "Minor Allele Frequency": "minor_allele_freq",
            })

        # Convert the encoding info into a Dict("Variant ID" = {param names : param values})
        encoding_info = {
            d["Variant ID"]: {k: v
                              for k, v in d.items() if k != "Variant ID"}
            for d in encoding_info.to_dict(orient="records")
        }

        # Messages for variants that could not be encoded, keyed by variant ID
        # (not named `warnings`, to avoid shadowing the stdlib module name)
        failures = {}

        # Process each variant
        results = []
        # ``DataFrame.iteritems()`` was removed in pandas 2.0; ``items()``
        # yields the same (label, Series) pairs on every pandas version.
        for _, s in self._obj.items():
            if not GenotypeDtype.is_dtype(s):
                results.append(s)
                continue
            variant_id = s.array.variant.id
            info = encoding_info.get(variant_id)
            if info is None:
                failures[
                    variant_id] = "No matching information found in the encoding data"
                continue
            sample_maf = s.genomics.maf
            if (sample_maf / info["minor_allele_freq"]) > 10e30:
                # TODO: replace this with a reasonable comparison to the data MAF.  For now it is an always-pass criteria
                failures[
                    variant_id] = f"Large MAF Difference: {sample_maf} in sample, {info['minor_allele_freq']} in encoding data"
                continue
            try:
                results.append(s.genomics.encode_edge(**info))
            except Exception as e:
                # Deliberately best-effort: record the failure and keep going
                failures[variant_id] = str(e)
        # Print Warnings
        if failures:
            print(f"{len(failures):,} Variables failed encoding")
            for var, message in failures.items():
                print(f"\t{var}: {message}")
        # Concatenate results
        return pd.concat(results, axis=1)
示例#7
0
def calculate_edge_alphas(
    genotypes: Union[pd.Series, pd.DataFrame],
    data: pd.DataFrame,
    outcome_variable: str,
    covariates: Optional[List[str]] = None,
):
    """
    Calculate alpha values to be used in EDGE encoding

    A separate GLM is fit for each genotype column, regressing the outcome on
    the codominant-encoded genotype plus any covariates.  The alpha value is
    the heterozygous coefficient divided by the homozygous coefficient.
    Variants whose regression does not converge, or whose homozygous
    coefficient is exactly 0, are skipped with a printed message.

    Parameters
    ----------
    genotypes:
        A GenotypeArray Series or DataFrame
    data:
        Data to be used in the regression, including the outcome and covariates
    outcome_variable:
        The variable to be used as the output (y) of the regression
    covariates:
        Other variables to be included in the regression formula

    Returns
    -------
    pd.DataFrame
        One row per successfully processed variant, with columns:

        - Variant ID
        - Alpha Value - used for heterozygous genotypes
        - Ref Allele - which allele is considered reference
        - Alt Allele - which allele is considered alternate
        - Minor Allele Frequency - MAF of data used during calculation of alpha values

    Raises
    ------
    ValueError
        If the parameters are invalid, the outcome is missing or has an
        unsupported type, names collide between ``genotypes`` and ``data``,
        the indexes cannot be fully merged, or no variant produced a result.
    AttributeError
        If any column of ``genotypes`` is not a GenotypeDtype.

    Notes
    -----
    See [1]_ for more information about EDGE encoding.

    References
    ----------
    .. [1] Hall, Molly A., et al.
           "Novel EDGE encoding method enhances ability to identify genetic interactions."
           PLoS genetics 17.6 (2021): e1009534.
    """
    # Validate parameters
    if covariates is None:
        covariates = []
    # Covariates must be a list
    if not isinstance(covariates, list):
        raise ValueError(
            "'covariates' must be specified as a list or set to None")

    # Convert Series to a DataFrame for simpler processing later on
    if isinstance(genotypes, pd.Series):
        genotypes = pd.DataFrame(genotypes)

    # Extract specific data
    if isinstance(data, pd.Series):
        if data.name != outcome_variable:
            raise ValueError(
                "The data is a Series but it's name doesn't match the outcome variable"
            )
        data = pd.DataFrame(
            data)  # Ensure data is a DataFrame from here on for simplicity
    else:
        try:
            data = data[[outcome_variable] + covariates]
        except KeyError as e:
            raise ValueError(f"Missing variable in provided data: {e}") from e

    # Check Types to determine which kind of regression to run
    dtypes = _get_types(data)

    outcome_type = dtypes.get(outcome_variable)
    if outcome_type == "continuous":
        family = sm.families.Gaussian(link=sm.families.links.identity())
        use_t = True
    elif outcome_type == "binary":
        # Use the order according to the categorical
        counts = data[outcome_variable].value_counts().to_dict()
        categories = tuple(data[outcome_variable].cat.categories)
        codes = list(range(len(categories)))
        # Relabel each category with its positional code.  This is done on a
        # copy with an explicit assignment: a chained `inplace=True` replace
        # has no effect under pandas copy-on-write and could otherwise leak
        # into the caller's data.
        data = data.copy()
        data[outcome_variable] = data[
            outcome_variable].cat.rename_categories(codes)
        print(
            f"Binary Outcome (family = Binomial): '{outcome_variable}'\n"
            f"\t{counts[categories[0]]:,} occurrences of '{categories[0]}' coded as 0\n"
            f"\t{counts[categories[1]]:,} occurrences of '{categories[1]}' coded as 1"
        )
        family = sm.families.Binomial(link=sm.families.links.logit())
        use_t = False
    else:
        # Previously this fell through and failed later with an unbound
        # `family`; fail early with a clear message instead.
        raise ValueError(
            f"The outcome variable '{outcome_variable}' must be continuous or binary"
            f" (detected type: {outcome_type})")

    # Check for missing outcomes
    na_outcome_count = data[outcome_variable].isna().sum()
    if na_outcome_count > 0:
        raise ValueError(
            f"{na_outcome_count} samples are missing an outcome value")

    # Ensure genotypes data is actually all genotypes
    is_genotype = genotypes.dtypes.apply(
        lambda dt: GenotypeDtype.is_dtype(dt))
    if not is_genotype.all():
        incorrect = genotypes.dtypes[~is_genotype]
        raise AttributeError(
            f"Incompatible datatypes: all columns must be a GenotypeDtype: {incorrect}"
        )

    # Merge genotypes and data
    gt_col_names = list(genotypes)
    for col in list(data):
        if col in gt_col_names:
            raise ValueError(
                f"Outcome and covariate names should not exist in `genotypes`: Check '{col}'"
            )
    merged = genotypes.merge(data,
                             how="inner",
                             left_index=True,
                             right_index=True)
    if len(merged) == 0:
        raise ValueError(
            "Unable to merge the genotypes with the data.  Check the index values."
        )
    elif len(merged) < len(genotypes):
        raise ValueError(
            f"Only {len(merged):,} of {len(genotypes):,} genotypes were merged to the data.  Check the index values."
        )

    # Run regressions
    results = []
    for gt in gt_col_names:
        result = {
            "Variant ID": merged[gt].genomics.variant.id,
            "Alpha Value": np.nan,
            "Ref Allele": merged[gt].genomics.variant.ref,
            "Alt Allele": merged[gt].genomics.variant.alt,
            "Minor Allele Frequency": merged[gt].genomics.maf,
        }
        encoded = merged[gt].genomics.encode_codominant()
        df = pd.concat([data, encoded], axis=1)
        formula = f"Q('{outcome_variable}') ~ Q('{gt}')"
        if covariates:
            formula += " + "
            formula += " + ".join([f"Q('{c}')" for c in covariates])

        y, X = patsy.dmatrices(formula,
                               df,
                               return_type="dataframe",
                               NA_action="drop")
        # NOTE(review): fix_names presumably strips the Q('...') wrappers so
        # the coefficients can be looked up by plain column name - confirm.
        y = fix_names(y)
        X = fix_names(X)
        # Drop the intercept column
        # This can be done in the formula, but causes issues with the dummy variable encoding
        X = X.drop(columns=["Intercept"])

        # Run Regression
        est = sm.GLM(y, X, family=family).fit(use_t=use_t)
        # Save results if the regression converged
        if not est.converged:
            print(f"No results for {gt}: Regression did not converge")
            continue
        hom_beta = est.params[f"{gt}[T.Hom]"]
        if hom_beta == 0:
            # The ratio is undefined when the homozygous beta is zero
            print(
                f"No results for {gt}: The homozygous alternate beta value was 0"
            )
            continue
        result["Alpha Value"] = est.params[f"{gt}[T.Het]"] / hom_beta
        results.append(result)

    if not results:
        raise ValueError("No results (see printed errors)")
    return pd.DataFrame(results)
示例#8
0
def test_size(input_str, size):
    """Check the itemsize of a GenotypeDtype constructed from a string."""
    parsed_dtype = GenotypeDtype.construct_from_string(input_str)
    actual_size = parsed_dtype.itemsize
    assert actual_size == size
示例#9
0
def test_from_str(input_str, variant):
    """Check that a GenotypeDtype built from a string has the expected variant."""
    parsed_dtype = GenotypeDtype.construct_from_string(input_str)
    parsed_variant = parsed_dtype.variant
    assert parsed_variant == variant