def read_file_to_gpmap(
    input_file_name,
    wildtype=None,
):
    """Read a GPSeer input CSV file into a GenotypePhenotypeMap.

    Parameters
    ----------
    input_file_name : str
        CSV file to load (passed straight to ``pandas.read_csv``). It must
        contain the columns 'genotypes' and 'phenotypes'. The columns
        'stdeviations' and 'n_replicates' are optional; any that are absent
        are added and filled with ``None``.
    wildtype : str, optional
        Wildtype genotype. When not given (or falsy), the genotype in the
        first row of the file is used.

    Returns
    -------
    gpm : GenotypePhenotypeMap
        Map built from the file contents via
        ``GenotypePhenotypeMap.read_dataframe``.

    Raises
    ------
    KeyError
        If one of the required columns is missing from the file.
    """
    df = pd.read_csv(input_file_name)

    required_columns = ["genotypes", "phenotypes"]
    optional_columns = ["stdeviations", "n_replicates"]

    # Check membership explicitly and *raise*. Indexing a missing column
    # raises KeyError (not AttributeError), so a try/except on AttributeError
    # never fires; KeyError is kept as the type so existing callers that
    # catch it keep working, now with a descriptive message.
    for c in required_columns:
        if c not in df.columns:
            err = "input file ({}) must contain a column labeled '{}'".format(
                input_file_name, c)
            raise KeyError(err)

    # If wildtype is not given, use the first genotype in the input file.
    if not wildtype:
        wildtype = df.loc[0, 'genotypes']

    # Fill in missing columns for the GenotypePhenotypeMap
    for col in optional_columns:
        if col not in df.columns:
            df[col] = None

    gpm = GenotypePhenotypeMap.read_dataframe(df, wildtype)
    return gpm
def fit_transform(self, X=None, y=None, **kwargs):
    """Fit the model, then return a map holding only the rows predicted as 1.

    Parameters
    ----------
    X, y :
        Forwarded unchanged to ``self.fit``; ``X`` is also forwarded to
        ``self.predict``.
    **kwargs :
        Extra keyword arguments forwarded to ``self.fit``.

    Returns
    -------
    gpm : GenotypePhenotypeMap
        New map built from the rows of ``self.gpm.data`` whose prediction
        equals 1, with the wildtype and mutations of ``self.gpm``.
    """
    self.fit(X=X, y=y, **kwargs)
    predictions = self.predict(X=X)

    # Arguments for the filtered map: rows predicted as 1, same
    # wildtype/mutations as the map attached to this model.
    read_args = {
        "dataframe": self.gpm.data[predictions == 1],
        "wildtype": self.gpm.wildtype,
        "mutations": self.gpm.mutations,
    }
    return GenotypePhenotypeMap.read_dataframe(**read_args)
def fit_transform(self, X=None, y=None, **kwargs):
    """Fit the model, then return a map carrying the transformed phenotypes.

    Parameters
    ----------
    X, y :
        Forwarded unchanged to both ``self.fit`` and ``self.transform``.
    **kwargs :
        Extra keyword arguments forwarded to ``self.fit``.

    Returns
    -------
    gpm : GenotypePhenotypeMap
        Map built from ``self.gpm.data`` (same wildtype and mutations as
        ``self.gpm``) whose 'phenotypes' column is replaced by the output of
        ``self.transform``.
    """
    self.fit(X=X, y=y, **kwargs)
    transformed = self.transform(X=X, y=y)

    # Build the output map from the data of the map attached to this model.
    read_args = {
        "dataframe": self.gpm.data,
        "wildtype": self.gpm.wildtype,
        "mutations": self.gpm.mutations,
    }
    new_map = GenotypePhenotypeMap.read_dataframe(**read_args)

    # NOTE(review): whether read_dataframe copies the frame is not visible
    # here; if it does not, this assignment also changes self.gpm.data.
    new_map.data['phenotypes'] = transformed
    return new_map
def split_gpm(gpm, idx=None, nobs=None, fraction=None):
    """Split a GenotypePhenotypeMap into a training map and a test map.

    The rows of ``gpm.data`` are partitioned by ``split_data``; each part is
    then turned back into a GenotypePhenotypeMap that shares the wildtype and
    mutations of the input map.

    Parameters
    ----------
    gpm : GenotypePhenotypeMap
        Full map to split.
    idx : list
        List of indices to include in the training set.
    nobs : int
        Number of observations in the training set.
    fraction : float
        Fraction of the data in the training set.

    Returns
    -------
    train_gpm : GenotypePhenotypeMap
        Training set.
    test_gpm : GenotypePhenotypeMap
        Test set.
    """
    train_frame, test_frame = split_data(
        gpm.data, idx=idx, nobs=nobs, fraction=fraction
    )

    def _as_map(frame):
        # Rebuild a map around one partition, reusing the parent's
        # wildtype and mutations.
        return GenotypePhenotypeMap.read_dataframe(
            frame,
            wildtype=gpm.wildtype,
            mutations=gpm.mutations,
        )

    train_gpm = _as_map(train_frame)
    test_gpm = _as_map(test_frame)
    return train_gpm, test_gpm