Пример #1
0
class _MissingColumnError(AttributeError, KeyError):
    """Raised when the input file lacks a required column.

    Derives from ``AttributeError`` (the type the column validation reports)
    and from ``KeyError`` (the type pandas raises for a missing column
    label), so ``except`` clauses written for either one catch it.
    """

    def __str__(self):
        # KeyError.__str__ shows the repr() of a single argument; return the
        # plain message instead so it reads cleanly in tracebacks and logs.
        return str(self.args[0]) if self.args else ""


def read_file_to_gpmap(
    input_file_name,
    wildtype=None,
):
    """Read the input file for GPSeer and build a GenotypePhenotypeMap.

    The file is parsed with ``pandas.read_csv``. It must contain the columns
    ``genotypes`` and ``phenotypes``. The columns ``stdeviations`` and
    ``n_replicates`` are optional; any that are absent are added and filled
    with ``None`` before the map is built.

    Parameters
    ----------
    input_file_name : str
        Path (or anything ``pandas.read_csv`` accepts) of the CSV file.

    wildtype : str, optional
        Wildtype genotype. When falsy (the default ``None``), the genotype
        in the row labeled 0 of the input file is used.

    Returns
    -------
    gpm : GenotypePhenotypeMap
        Map built by ``GenotypePhenotypeMap.read_dataframe``.

    Raises
    ------
    AttributeError, KeyError
        If a required column is missing. The raised exception is an instance
        of both types.
    """
    df = pd.read_csv(input_file_name)
    required_columns = ["genotypes", "phenotypes"]
    optional_columns = ["stdeviations", "n_replicates"]

    # Validate explicitly: indexing a missing column raises KeyError, so an
    # ``except AttributeError`` around ``df[c]`` would never trigger.
    for c in required_columns:
        if c not in df.columns:
            err = "input file ({}) must contain a column labeled '{}'".format(
                input_file_name, c)
            raise _MissingColumnError(err)

    # If wildtype is not given, use the first genotype in the input file.
    if not wildtype:
        wildtype = df.loc[0, 'genotypes']

    # Fill in missing columns for the GenotypePhenotypeMap
    for col in optional_columns:
        if col not in df.columns:
            df[col] = None

    gpm = GenotypePhenotypeMap.read_dataframe(df, wildtype)
    return gpm
Пример #2
0
    def fit_transform(self, X=None, y=None, **kwargs):
        """Fit the model, then build a map from the rows predicted as 1.

        Parameters
        ----------
        X, y :
            Forwarded to ``self.fit``; ``X`` is also forwarded to
            ``self.predict``.

        **kwargs :
            Extra keyword arguments passed through to ``self.fit``.

        Returns
        -------
        GenotypePhenotypeMap
            Map read from the subset of ``self.gpm.data`` selected by the
            predictions, with the wildtype and mutations of ``self.gpm``.
        """
        self.fit(X=X, y=y, **kwargs)
        predicted = self.predict(X=X)

        # Keep only the entries whose prediction equals 1.
        keep = predicted == 1
        selected_rows = self.gpm.data[keep]

        return GenotypePhenotypeMap.read_dataframe(
            dataframe=selected_rows,
            wildtype=self.gpm.wildtype,
            mutations=self.gpm.mutations,
        )
Пример #3
0
    def fit_transform(self, X=None, y=None, **kwargs):
        """Fit the model and return a map carrying the transformed phenotypes.

        Parameters
        ----------
        X, y :
            Forwarded to both ``self.fit`` and ``self.transform``.

        **kwargs :
            Extra keyword arguments passed through to ``self.fit``.

        Returns
        -------
        GenotypePhenotypeMap
            Map read from ``self.gpm.data`` (same wildtype and mutations as
            ``self.gpm``) whose ``phenotypes`` column has been overwritten
            with the output of ``self.transform``.
        """
        self.fit(X=X, y=y, **kwargs)
        transformed = self.transform(X=X, y=y)

        # Rebuild a map from the current data before swapping phenotypes in.
        map_kwargs = {
            "dataframe": self.gpm.data,
            "wildtype": self.gpm.wildtype,
            "mutations": self.gpm.mutations,
        }
        new_map = GenotypePhenotypeMap.read_dataframe(**map_kwargs)

        # NOTE(review): whether read_dataframe copies the frame is not visible
        # here; if it does not, this assignment also touches self.gpm.data.
        new_map.data['phenotypes'] = transformed
        return new_map
Пример #4
0
def split_gpm(gpm, idx=None, nobs=None, fraction=None):
    """Split a GenotypePhenotypeMap into a training map and a test map.

    The map's ``data`` is divided by ``split_data``; each part is then read
    back into its own GenotypePhenotypeMap using the wildtype and mutations
    of the input map.

    Parameters
    ----------
    gpm : GenotypePhenotypeMap
        full map to split.

    idx : list
        List of indices to include in training set

    nobs : int
        number of observations in training.

    fraction : float
        fraction in training set.

    Returns
    -------
    train_gpm : GenotypePhenotypeMap
        training set.

    test_gpm : GenotypePhenotypeMap
        test set.
    """
    def _as_map(frame):
        # Wrap one part of the split, inheriting wildtype and mutations.
        return GenotypePhenotypeMap.read_dataframe(
            frame,
            wildtype=gpm.wildtype,
            mutations=gpm.mutations,
        )

    train_part, test_part = split_data(
        gpm.data,
        idx=idx,
        nobs=nobs,
        fraction=fraction,
    )

    return _as_map(train_part), _as_map(test_part)