def get_x(data: Data, genes=None, compounds=None) -> np.ndarray:
    """
    Get gene activations per compound for given dataset and gene list.

    One row is produced per (compound, dosage, replicate) combination, in that
    nesting order. Each row concatenates, gene after gene, the activation at
    every time index of ``data.header.times``.

    NOTE(review): this file contains a second, later definition of ``get_x``
    with the same signature, which appears to shadow this one - confirm and
    remove the duplicate.

    :param data: e. g. human data
    :param genes: genes to select; None or empty selects ``data.header.genes``
    :param compounds: compound names to select; None or empty selects
        ``data.header.compounds``
    :return: X, ``np.array`` built from the collected rows
    :raises SystemExit: with status 1337 if an activation cannot be looked up
    """
    # use all genes if no special gene set indicated
    if not genes:
        genes = data.header.genes
    hierarchical = get_hierarchical_data(data, genes)
    header = data.header
    if not compounds:
        compounds = header.compounds

    # Loop bounds do not change while iterating, so compute them once.
    n_dosages = len(header.dosages)
    n_replicates = len(header.replicates)
    n_times = len(header.times)

    X = []
    # Data formatted as data[gene][compound][dosage][replicate][time]
    #                        name  index     index   index      index
    for compound_name in compounds:
        c = header.compounds.index(compound_name)
        for d in range(n_dosages):
            for r in range(n_replicates):
                # collect activations for each replicate, so cross product can be used later on
                act = []
                for gene in genes:
                    try:
                        series = hierarchical[gene][c][d][r]
                        # Index explicitly (no slicing) so a too-short series is detected.
                        act.extend(series[t] for t in range(n_times))
                    except (LookupError, TypeError) as err:
                        # Only lookup failures are treated as "data missing";
                        # anything else (e.g. KeyboardInterrupt) propagates untouched.
                        print("Fix failed miserably again!",
                              f"get_x could not read gene={gene!r}, compound={compound_name!r}, "
                              f"dosage index={d}, replicate index={r}: {err!r}")
                        import sys
                        sys.exit(1337)
                X.append(np.array(act))
    return np.array(X)
def get_x(data: Data, genes=None, compounds=None) -> np.ndarray:
    """
    Extract gene activations for specified subsets of compounds and genes from the full data set for a given
    data domain.

    One row is produced per (compound, dosage, replicate) combination, in that
    nesting order. Each row concatenates, gene after gene, the activation at
    every time index of ``data.header.times``.

    NOTE(review): an earlier definition of ``get_x`` with the same signature
    exists in this file and is overridden by this one - confirm and remove the
    duplicate.

    :param data: e. g. human in vitro/rat in vitro/rat in vivo data
    :param genes: genes to select (preselected genesets - STEATOSIS, NAFLD etc., or other specified list of
        genes); None or empty selects ``data.header.genes``
    :param compounds: compound names to select; None or empty selects
        ``data.header.compounds``
    :return: X, ``np.array`` built from the collected rows
    :raises SystemExit: with status 1337 if an activation cannot be looked up
    """
    # use all genes if no special gene set indicated
    if not genes:
        genes = data.header.genes
    hierarchical = get_hierarchical_data(data, genes)
    header = data.header
    if not compounds:
        compounds = header.compounds

    # Loop bounds do not change while iterating, so compute them once.
    n_dosages = len(header.dosages)
    n_replicates = len(header.replicates)
    n_times = len(header.times)

    X = []
    # Data formatted as data[gene][compound][dosage][replicate][time]
    #                        name  index     index   index      index
    for compound_name in compounds:
        c = header.compounds.index(compound_name)
        for d in range(n_dosages):
            for r in range(n_replicates):
                # collect activations for each replicate, so cross product can be used later on
                act = []
                for gene in genes:
                    try:
                        series = hierarchical[gene][c][d][r]
                        # Index explicitly (no slicing) so a too-short series is detected.
                        act.extend(series[t] for t in range(n_times))
                    except (LookupError, TypeError) as err:
                        # Only lookup failures are treated as "data missing";
                        # anything else (e.g. KeyboardInterrupt) propagates untouched.
                        print("error reading data - read_x",
                              f"(gene={gene!r}, compound={compound_name!r}, "
                              f"dosage index={d}, replicate index={r}: {err!r})")
                        import sys
                        sys.exit(1337)
                X.append(np.array(act))
    return np.array(X)
def get_doubling_xy(first: Data, second: Data, genes_first=None, genes_second=None, compounds=None,
                    max_replicates=2) -> Tuple[List[List[float]], List[List[float]]]:
    """
    Create doubled X and Y like Kurts parserDoubling, for which genes + headers have to be equal.

    For every compound and dosage, one row per replicate is built for each
    dataset (the time series of all selected genes, concatenated). Every
    first-dataset row is then paired with every second-dataset row, so X and Y
    always have the same length.

    :param first: e. g. human data
    :param second: e. g. rat data
    :param genes_first: genes to select from first dataset, default is all
    :param genes_second: genes to select from second dataset, default is same as genes_first
    :param compounds: default is all compounds in ``first``. Realistically, should use
        CompoundLists.GENERAL_47
    :param max_replicates: how many replicates to use at most. For example, vivo has 5 replicates, but some
        are missing data. The first three are safe to use, but using more than two creates complexity in later
        interpreting the dataset (2x3x4 = 24 instances for each compound instead of 2x2x4 = 16)
    :return: X and Y
    :raises ValueError: if the dosages of the two headers differ
    :raises SystemExit: with status 1 if a time series is shorter than the number of
        time points in its header (previously this exited with status 0, i.e. "success")
    """
    # Local import: only needed on the error path below.
    import sys

    # Dan: this is the function that ACTUALLY reads the data; read_data only specifies which genes to read
    if first.header.dosages != second.header.dosages:
        raise ValueError("human and rat dosages are not the same")

    # use all genes if no special gene set indicated
    if genes_first is None:
        genes_first = first.header.genes
    if genes_second is None:
        genes_second = genes_first

    first_hierarchical = get_hierarchical_data(first, genes_first)  # Dan: contains selected genes
    second_hierarchical = get_hierarchical_data(second, genes_second)

    header = first.header
    if not compounds:
        compounds = header.compounds

    def replicate_rows(dataset, hierarchical, genes, compound_name, c, d):
        """Return one concatenated activation row per usable replicate of ``dataset``."""
        n_times = len(dataset.header.times)
        # Use at most the first max_replicates replicates; later ones may have missing data
        n_replicates = min(len(dataset.header.replicates), max_replicates)
        rows = []
        for r in range(n_replicates):
            row = []
            for gene in genes:
                series = hierarchical[gene][c][d][r]
                if len(series) < n_times:
                    print("Missing data! ", gene, compound_name, d, r)
                    # Non-zero status: incomplete data is an error, not a success.
                    sys.exit(1)
                row.extend(series)
            rows.append(row)
        return rows

    X = []
    Y = []
    # Data formatted as data[gene][compound][dosage][replicate][time]
    #                        name  index     index   index      index
    for compound_name in compounds:
        c1 = first.header.compounds.index(compound_name)
        c2 = second.header.compounds.index(compound_name)
        for d in range(len(header.dosages)):
            # add all time series for all replicates, then combine each pair
            first_activations = replicate_rows(first, first_hierarchical, genes_first, compound_name, c1, d)
            second_activations = replicate_rows(second, second_hierarchical, genes_second, compound_name, c2, d)

            # combine replicates (i.e. doubling)
            for h, r in itertools.product(first_activations, second_activations):
                X.append(h)
                Y.append(r)
    return X, Y
def get_doubling_xyz(first: Data, second: Data, third: Data, genes_first=None, genes_second=None,
                     genes_third=None, compounds=None,
                     max_replicates=2) -> Tuple[List[List[float]], List[List[float]], List[List[float]]]:
    """
    Create combined X, Y and Z from three datasets, analogous to the two-dataset doubling.

    For every compound and dosage, one row per replicate is built for each
    dataset (the time series of all selected genes, concatenated). Every
    combination of one row from each dataset is then emitted, so X, Y and Z
    always have the same length.

    NOTE(review): unlike ``get_doubling_xy``, the dosages of the three headers
    are not compared here; the dosage indices of ``first`` are applied to all
    three datasets - confirm that this is intended.

    :param first: first dataset; its header supplies the default compounds and the dosage indices
    :param second: second dataset
    :param third: third dataset
    :param genes_first: genes to select from first dataset, default is all
    :param genes_second: genes to select from second dataset, default is same as genes_first
    :param genes_third: genes to select from third dataset, default is same as genes_second
    :param compounds: compound names to use, default is all compounds in ``first``
    :param max_replicates: how many replicates to use at most per dataset
    :return: X, Y and Z
    :raises SystemExit: with status 1 if a time series is shorter than the number of
        time points in its header (previously this exited with status 0, i.e. "success")
    """
    # Local import: only needed on the error path below.
    import sys

    # use all genes if no special gene set indicated
    if genes_first is None:
        genes_first = first.header.genes
    if genes_second is None:
        genes_second = genes_first
    if genes_third is None:
        genes_third = genes_second

    first_hierarchical = get_hierarchical_data(first, genes_first)
    second_hierarchical = get_hierarchical_data(second, genes_second)
    third_hierarchical = get_hierarchical_data(third, genes_third)

    header = first.header
    if not compounds:
        compounds = header.compounds

    def replicate_rows(dataset, hierarchical, genes, compound_name, c, d):
        """Return one concatenated activation row per usable replicate of ``dataset``."""
        n_times = len(dataset.header.times)
        # Use at most the first max_replicates replicates; later ones may have missing data
        n_replicates = min(len(dataset.header.replicates), max_replicates)
        rows = []
        for r in range(n_replicates):
            row = []
            for gene in genes:
                series = hierarchical[gene][c][d][r]
                if len(series) < n_times:
                    print("Missing data! ", gene, compound_name, d, r)
                    # Non-zero status: incomplete data is an error, not a success.
                    sys.exit(1)
                row.extend(series)
            rows.append(row)
        return rows

    X = []
    Y = []
    Z = []
    # Data formatted as data[gene][compound][dosage][replicate][time]
    #                        name  index     index   index      index
    for compound_name in compounds:
        c1 = first.header.compounds.index(compound_name)
        c2 = second.header.compounds.index(compound_name)
        c3 = third.header.compounds.index(compound_name)
        for d in range(len(header.dosages)):
            # add all time series for all replicates, then combine each triple
            first_activations = replicate_rows(first, first_hierarchical, genes_first, compound_name, c1, d)
            second_activations = replicate_rows(second, second_hierarchical, genes_second, compound_name, c2, d)
            third_activations = replicate_rows(third, third_hierarchical, genes_third, compound_name, c3, d)

            # combine replicates across all three datasets
            for h, r, r2 in itertools.product(first_activations, second_activations, third_activations):
                X.append(h)
                Y.append(r)
                Z.append(r2)
    return X, Y, Z