def randomize_dataset(self, new_seed):
    """Replace ``self.df`` with a fresh sample drawn using ``new_seed``.

    The new frame comes from ``self._load_nb_examples(seed=new_seed)``. Its
    columns are then renamed with whatever ``symbol_map`` returns for them,
    and, if ``self.normalize`` is truthy, each column has its mean subtracted.

    Args:
        new_seed: Seed forwarded to ``self._load_nb_examples``.
    """
    self.df = self._load_nb_examples(seed=new_seed)

    column_renames = symbol_map(self.df.columns)
    self.df.rename(column_renames, axis="columns", inplace=True)

    if not self.normalize:
        return

    # Center every column on zero.
    column_means = self.df.mean(axis=0)
    self.df = self.df - column_means
def __init__(self, relabel_genes=True, datastore=None, randomize=False):
    """Load the graph, relabel its nodes and optionally randomize them.

    Args:
        relabel_genes: Accepted for interface compatibility.
            NOTE(review): this flag is never read here; the graph is always
            relabelled through ``symbol_map``.
        datastore: Directory handed to the loader via ``self.datastore``.
            When ``None``, the directory containing this source file is used.
        randomize: When truthy, the nodes are relabelled a second time with
            the mapping returned by ``randmap``.
    """
    if datastore is not None:
        self.datastore = datastore
    else:
        self.datastore = os.path.dirname(os.path.abspath(__file__))

    self.load_data()

    node_renames = symbol_map(self.nx_graph.nodes)
    self.nx_graph = nx.relabel.relabel_nodes(self.nx_graph, node_renames)

    self.randomize = randomize
    if not self.randomize:
        return

    print("Randomizing the graph")
    shuffled_names = randmap(self.nx_graph.nodes)
    self.nx_graph = nx.relabel.relabel_nodes(self.nx_graph, shuffled_names)
def load_data(self):
    """Open the HDF5 file at ``self.file_path`` and build ``self.df``.

    Sets ``self.hdf5``, ``self.expression_data``, ``self.nrows``,
    ``self.ncols``, ``self.genes`` and ``self.df``. With ``self.load_full``
    truthy the whole ``expression_data`` dataset is read into a DataFrame;
    otherwise the frame comes from ``self._load_nb_examples()``. Columns are
    renamed through ``symbol_map`` and, if ``self.normalize`` is truthy,
    mean-centered.
    """
    # The file handle is stored on the instance and is not closed here.
    self.hdf5 = h5py.File(name=self.file_path, mode='r')
    self.expression_data = self.hdf5['expression_data']
    self.nrows, self.ncols = self.expression_data.shape

    # Read every gene name into memory, decoding each entry to str.
    raw_gene_names = self.hdf5['gene_names'][()].tolist()
    self.genes = [raw_name.decode() for raw_name in raw_gene_names]

    if not self.load_full:
        self.df = self._load_nb_examples()
    else:
        full_matrix = self.expression_data[()]
        self.df = pd.DataFrame(data=full_matrix, columns=self.genes)

    column_renames = symbol_map(self.df.columns)
    self.df.rename(column_renames, axis="columns", inplace=True)

    if self.normalize:
        column_means = self.df.mean(axis=0)
        self.df = self.df - column_means
def load_data(self):
    """Load the dataset into ``self.df``, converting the CSV to HDF5 once.

    The gzipped, tab-separated CSV is located with
    ``at.get(self.at_hash, datastore=self.datastore)``. An HDF5 copy is
    written next to it the first time and read on every call. The frame's
    columns are renamed through ``symbol_map`` and each column is
    mean-centered.

    Sets ``self.df``, ``self.sample_names``, ``self.node_names``,
    ``self.nb_nodes`` and ``self.labels``.
    """
    csv_file = at.get(self.at_hash, datastore=self.datastore)

    # Strip only a trailing ".gz". Splitting on the first ".gz" anywhere in
    # the path would truncate it if a directory name happened to contain it.
    gz_suffix = ".gz"
    if csv_file.endswith(gz_suffix):
        base_path = csv_file[:-len(gz_suffix)]
    else:
        base_path = csv_file
    hdf_file = base_path + ".hdf5"

    if not os.path.isfile(hdf_file):
        print("We are converting a CSV dataset of TCGA to HDF5. Please wait a minute, this only happens the first "
              "time you use the TCGA dataset.")
        df = pd.read_csv(csv_file, compression="gzip", sep="\t")
        df = df.set_index('Sample')
        # After the transpose, the former 'Sample' index values become the
        # column labels.
        df = df.transpose()
        df.to_hdf(hdf_file, key="data", complevel=5)

    self.df = pd.read_hdf(hdf_file)
    self.df.rename(symbol_map(self.df.columns), axis="columns", inplace=True)

    # Only mean-centering is applied; no variance scaling is performed.
    self.df = self.df - self.df.mean(axis=0)

    self.sample_names = self.df.index.values.tolist()
    self.node_names = np.array(self.df.columns.values.tolist()).astype("str")
    self.nb_nodes = self.df.shape[1]
    # Every row gets the placeholder label 0.
    self.labels = [0] * self.df.shape[0]
def __init__(self, relabel_genes=True, datastore="./data"):
    """Store the datastore path, load the graph and relabel its nodes.

    Args:
        relabel_genes: Accepted for interface compatibility.
            NOTE(review): this flag is never read here; the graph is always
            relabelled through ``symbol_map``.
        datastore: Value stored on ``self.datastore`` before
            ``self.load_data()`` runs.
    """
    self.datastore = datastore
    self.load_data()

    node_renames = symbol_map(self.nx_graph.nodes)
    self.nx_graph = nx.relabel.relabel_nodes(self.nx_graph, node_renames)
def __init__(self, relabel_genes=True):
    """Load the graph and relabel its nodes through ``symbol_map``.

    Args:
        relabel_genes: Accepted for interface compatibility.
            NOTE(review): this flag is never read here; the graph is always
            relabelled.
    """
    self.load_data()

    node_renames = symbol_map(self.nx_graph.nodes)
    self.nx_graph = nx.relabel.relabel_nodes(self.nx_graph, node_renames)