示例#1
0
    def randomize_dataset(self, new_seed):
        """
        Replace self.df with a fresh draw from _load_nb_examples(seed=new_seed).

        The columns of the new frame are renamed through symbol_map and, when
        self.normalize is truthy, the per-column mean is subtracted.
        """
        self.df = self._load_nb_examples(seed=new_seed)

        # Rename in place on the frame that was just stored on self.
        column_mapper = symbol_map(self.df.columns)
        self.df.rename(column_mapper, axis="columns", inplace=True)

        if not self.normalize:
            return

        # Center every column on zero.
        column_means = self.df.mean(axis=0)
        self.df = self.df - column_means
示例#2
0
 def __init__(self, relabel_genes=True, datastore=None, randomize=False):
     """
     Load the graph, relabel its nodes via symbol_map, and optionally
     relabel them a second time via randmap.

     relabel_genes: accepted but not read by this method; the symbol_map
         relabeling below always runs.
     datastore: directory stored on self.datastore; when None, the
         directory containing this module is used.
     randomize: stored on self.randomize; when truthy the nodes are
         relabeled again with the mapping returned by randmap.
     """
     # Fall back to this module's own directory when no datastore is given.
     if datastore is not None:
         self.datastore = datastore
     else:
         self.datastore = os.path.dirname(os.path.abspath(__file__))

     # load_data() is expected to populate self.nx_graph (it is read right after).
     self.load_data()
     self.nx_graph = nx.relabel.relabel_nodes(
         self.nx_graph,
         symbol_map(self.nx_graph.nodes),
     )

     self.randomize = randomize
     if not self.randomize:
         return

     print("Randomizing the graph")
     self.nx_graph = nx.relabel.relabel_nodes(
         self.nx_graph,
         randmap(self.nx_graph.nodes),
     )
示例#3
0
    def load_data(self):
        """
        Open the HDF5 file at self.file_path and populate the dataset fields.

        Sets self.hdf5 (opened read-only and not closed here),
        self.expression_data, self.nrows / self.ncols, self.genes and self.df.
        self.df holds the full expression matrix when self.load_full is
        truthy, otherwise whatever _load_nb_examples() returns. Its columns
        are renamed through symbol_map, and it is mean-centered per column
        when self.normalize is truthy.
        """
        self.hdf5 = h5py.File(name=self.file_path, mode='r')
        self.expression_data = self.hdf5['expression_data']
        self.nrows, self.ncols = self.expression_data.shape

        # Gene names are read fully into memory and decoded to str.
        encoded_names = self.hdf5['gene_names'][()].tolist()
        self.genes = [name.decode() for name in encoded_names]

        if not self.load_full:
            self.df = self._load_nb_examples()
        else:
            self.df = pd.DataFrame(data=self.expression_data[()],
                                   columns=self.genes)

        column_mapper = symbol_map(self.df.columns)
        self.df.rename(column_mapper, axis="columns", inplace=True)

        if self.normalize:
            column_means = self.df.mean(axis=0)
            self.df = self.df - column_means
示例#4
0
 def load_data(self):
     """
     Load the dataset into self.df, converting the gzipped CSV to HDF5
     on first use.

     The CSV path comes from at.get(self.at_hash, datastore=self.datastore).
     If the sibling ".hdf5" file does not exist yet, the tab-separated CSV
     is read, indexed by its 'Sample' column, transposed and written under
     the key "data". The HDF5 file is then read back, its columns renamed
     through symbol_map and mean-centered. Also sets self.sample_names,
     self.node_names, self.nb_nodes and self.labels (all zeros, one per row).
     """
     csv_file = at.get(self.at_hash, datastore=self.datastore)
     path_stem = csv_file.split(".gz")[0]
     hdf_file = path_stem + ".hdf5"

     # One-time conversion: only runs while the HDF5 copy is missing.
     if not os.path.isfile(hdf_file):
         print("We are converting a CSV dataset of TCGA to HDF5. Please wait a minute, this only happens the first "
               "time you use the TCGA dataset.")
         converted = (
             pd.read_csv(csv_file, compression="gzip", sep="\t")
             .set_index('Sample')
             .transpose()
         )
         converted.to_hdf(hdf_file, key="data", complevel=5)

     self.df = pd.read_hdf(hdf_file)
     column_mapper = symbol_map(self.df.columns)
     self.df.rename(column_mapper, axis="columns", inplace=True)

     # Only mean-centering is applied here; no variance scaling.
     column_means = self.df.mean(axis=0)
     self.df = self.df - column_means

     n_samples, n_nodes = self.df.shape[0], self.df.shape[1]
     self.sample_names = self.df.index.values.tolist()
     column_names = self.df.columns.values.tolist()
     self.node_names = np.array(column_names).astype("str")
     self.nb_nodes = n_nodes
     self.labels = [0] * n_samples
示例#5
0
 def __init__(self, relabel_genes=True, datastore="./data"):
     """
     Store the datastore path, load the graph, and relabel its nodes
     with the mapping returned by symbol_map.

     relabel_genes: accepted but not read by this method; relabeling
         always runs.
     datastore: value stored on self.datastore before load_data() is called.
     """
     self.datastore = datastore

     # load_data() is expected to populate self.nx_graph (it is read right after).
     self.load_data()

     node_mapping = symbol_map(self.nx_graph.nodes)
     self.nx_graph = nx.relabel.relabel_nodes(self.nx_graph, node_mapping)
示例#6
0
 def __init__(self, relabel_genes=True):
     """
     Load the graph and relabel its nodes with the mapping returned
     by symbol_map.

     relabel_genes: accepted but not read by this method; relabeling
         always runs.
     """
     # load_data() is expected to populate self.nx_graph (it is read right after).
     self.load_data()

     node_mapping = symbol_map(self.nx_graph.nodes)
     self.nx_graph = nx.relabel.relabel_nodes(self.nx_graph, node_mapping)