def load(dataset):
    """Load ``<dataset>_allGenes.mat`` from the data directory as a OneDataset.

    The .mat file is expected to contain the variables 'ages', 'gene_names',
    'region_names' and 'expression', and optionally 'genders'.

    Processing steps:
      * a 2-D expression array gets a trailing axis of length 1, so the array
        is always indexed as (age, gene, region);
      * genes that appear more than once are collapsed into one entry whose
        expression is the mean over the duplicates, and gene names end up in
        sorted order;
      * samples are reordered so that ages are ascending;
      * exons are looked up through _get_exons when cfg.exon_level is truthy;
      * the dataset is restricted to pathway 'all' and, when
        cfg.sorted_regions has an entry for this dataset, to those regions.

    Args:
        dataset: dataset name, used to build the file name and stored as the
            ``name`` of the resulting OneDataset.

    Returns:
        The object returned by ``OneDataset(...).restrict_pathway('all')``,
        further passed through ``restrict_regions`` when applicable.
    """
    datadir = project_dirs.data_dir()
    filename = '{}_allGenes.mat'.format(dataset)
    path = join(datadir, filename)
    if cfg.verbosity > 0:
        # Single-argument call form: prints the same text as the Python 2
        # print statement and is also valid syntax on Python 3.
        print('Loading dataset {} from {}'.format(dataset, path))
    mat = loadmat(path)

    ages = np.array(mat['ages'].flat)
    gene_names = np.array(
        matlab_cell_array_to_list_of_strings(mat['gene_names']))
    region_names = np.array(
        matlab_cell_array_to_list_of_strings(mat['region_names']))
    if 'genders' in mat:
        genders = np.array(
            matlab_cell_array_to_list_of_strings(mat['genders']))
    else:
        genders = None

    expression = mat['expression']
    if expression.ndim == 2:
        # extend shape to represent a single region name; use a view with a
        # new trailing axis rather than assigning to .shape in place
        expression = expression[:, :, np.newaxis]

    # average expression for duplicate genes
    dct = defaultdict(list)  # gene_name -> list of indices where it appears
    for i, g in enumerate(gene_names):
        dct[g].append(i)
    new_gene_names = sorted(set(gene_names))
    new_expression = np.empty(
        [len(ages), len(new_gene_names), len(region_names)])
    for i, g in enumerate(new_gene_names):
        idx = dct[g]
        new_expression[:, i, :] = expression[:, idx, :].mean(axis=1)
    gene_names = np.array(new_gene_names)
    expression = new_expression

    # make sure ages are sorted (for colantuoni there are 2 datapoints that aren't)
    inds = np.argsort(ages)
    ages = ages[inds]
    genders = genders[inds] if genders is not None else None
    expression = expression[inds, :, :]

    exons = _get_exons(gene_names) if cfg.exon_level else None

    res = OneDataset(
        expression=expression,
        gene_names=gene_names,
        region_names=region_names,
        genders=genders,
        ages=ages,
        name=dataset,
        exons=exons,
    ).restrict_pathway('all')

    # reuse the value fetched by .get() instead of indexing the mapping again
    sorted_regions = cfg.sorted_regions.get(dataset)
    if sorted_regions is not None:
        res = res.restrict_regions(sorted_regions)
    return res
def load(dataset):
    """Load ``<dataset>_allGenes.mat`` from the data directory as a OneDataset.

    The .mat file is expected to contain the variables 'ages', 'gene_names',
    'region_names' and 'expression', and optionally 'genders'.

    Processing steps:
      * a 2-D expression array gets a trailing axis of length 1, so the array
        is always indexed as (age, gene, region);
      * genes that appear more than once are collapsed into one entry whose
        expression is the mean over the duplicates, and gene names end up in
        sorted order;
      * samples are reordered so that ages are ascending;
      * the dataset is restricted to pathway 'all' and, when
        cfg.sorted_regions has an entry for this dataset, to those regions.

    Args:
        dataset: dataset name, used to build the file name and stored as the
            ``name`` of the resulting OneDataset.

    Returns:
        The object returned by ``OneDataset(...).restrict_pathway('all')``,
        further passed through ``restrict_regions`` when applicable.
    """
    datadir = project_dirs.data_dir()
    filename = '{}_allGenes.mat'.format(dataset)
    path = join(datadir, filename)
    if cfg.verbosity > 0:
        # Single-argument call form: prints the same text as the Python 2
        # print statement and is also valid syntax on Python 3.
        print('Loading dataset {} from {}'.format(dataset, path))
    mat = loadmat(path)

    ages = np.array(mat['ages'].flat)
    gene_names = np.array(
        matlab_cell_array_to_list_of_strings(mat['gene_names']))
    region_names = np.array(
        matlab_cell_array_to_list_of_strings(mat['region_names']))
    if 'genders' in mat:
        genders = np.array(
            matlab_cell_array_to_list_of_strings(mat['genders']))
    else:
        genders = None

    expression = mat['expression']
    if expression.ndim == 2:
        # extend shape to represent a single region name; use a view with a
        # new trailing axis rather than assigning to .shape in place
        expression = expression[:, :, np.newaxis]

    # average expression for duplicate genes
    dct = defaultdict(list)  # gene_name -> list of indices where it appears
    for i, g in enumerate(gene_names):
        dct[g].append(i)
    new_gene_names = sorted(set(gene_names))
    new_expression = np.empty(
        [len(ages), len(new_gene_names), len(region_names)])
    for i, g in enumerate(new_gene_names):
        idx = dct[g]
        new_expression[:, i, :] = expression[:, idx, :].mean(axis=1)
    gene_names = np.array(new_gene_names)
    expression = new_expression

    # make sure ages are sorted (for colantuoni there are 2 datapoints that aren't)
    inds = np.argsort(ages)
    ages = ages[inds]
    genders = genders[inds] if genders is not None else None
    expression = expression[inds, :, :]

    res = OneDataset(
        expression=expression,
        gene_names=gene_names,
        region_names=region_names,
        genders=genders,
        ages=ages,
        name=dataset,
    ).restrict_pathway('all')

    # reuse the value fetched by .get() instead of indexing the mapping again
    sorted_regions = cfg.sorted_regions.get(dataset)
    if sorted_regions is not None:
        res = res.restrict_regions(sorted_regions)
    return res
def load_kang_tree_distances():
    """Read 'kangTreeDistances.mat' from the project data directory.

    Returns:
        A Bunch with three fields:
          regions     -- region names converted from the file's 'regions'
                         cell array
          dct_regions -- mapping from region name to its position in
                         ``regions``
          distances   -- the file's 'developingTreeDistances' variable,
                         passed through unchanged
    """
    mat_path = join(project_dirs.data_dir(), 'kangTreeDistances.mat')
    contents = loadmat(mat_path)

    region_list = matlab_cell_array_to_list_of_strings(contents['regions'])

    # region name -> index; a repeated name keeps its last position
    index_of_region = {}
    for position, region in enumerate(region_list):
        index_of_region[region] = position

    return Bunch(
        regions=region_list,
        dct_regions=index_of_region,
        distances=contents['developingTreeDistances'],
    )
def load_kang_tree_distances():
    """Load the Kang tree-distance data stored in 'kangTreeDistances.mat'.

    The file is looked up in the project data directory.

    Returns:
        A Bunch holding:
          regions     -- the 'regions' cell array converted to strings
          dct_regions -- dict mapping each region name to its index in
                         ``regions``
          distances   -- the 'developingTreeDistances' variable as loaded
    """
    data_dir = project_dirs.data_dir()
    loaded = loadmat(join(data_dir, 'kangTreeDistances.mat'))

    tree_distances = loaded['developingTreeDistances']
    names = matlab_cell_array_to_list_of_strings(loaded['regions'])

    # name -> index lookup; later duplicates overwrite earlier ones
    name_to_index = dict((name, k) for k, name in enumerate(names))

    return Bunch(
        regions=names,
        dct_regions=name_to_index,
        distances=tree_distances,
    )