def get_x(data: Data, genes=None, compounds=None) -> np.ndarray:
    """
    Get gene activations per compound for given dataset and gene list.

    NOTE(review): this file defines ``get_x`` a second time directly below this
    function. Python keeps only the later definition, so this body is not the
    one callers of the module actually run - consider deleting one of the two.

    :param data: e. g. human data; its ``header`` must provide ``genes``,
                 ``compounds``, ``dosages``, ``replicates`` and ``times``
    :param genes: genes to select; ``None`` or an empty selection means all
                  genes in ``data.header.genes``
    :param compounds: compound names to select; ``None`` or an empty selection
                      means all of ``data.header.compounds``. Every name is
                      looked up with ``data.header.compounds.index``.
    :return: X, one row per (compound, dosage, replicate) in that nesting order.
             Each row is the concatenation, gene by gene, of that gene's values
             for every time index.

    If a series cannot be read for every time index, a diagnostic is printed
    and the process is terminated with ``sys.exit(1337)``.
    """
    header = data.header

    # use all genes if no special gene set indicated
    if not genes:
        genes = header.genes

    hierarchical = get_hierarchical_data(data, genes)

    if not compounds:
        compounds = header.compounds

    # Loop-invariant sizes, computed once instead of on every iteration.
    n_dosages = len(header.dosages)
    n_replicates = len(header.replicates)
    n_times = len(header.times)

    # Data formatted as data[gene][compound][dosage][replicate][time]
    #                        name  index     index   index      index
    X = []
    for compound_name in compounds:
        c = header.compounds.index(compound_name)
        for d in range(n_dosages):
            for r in range(n_replicates):
                # collect activations for each replicate, so cross product can be used later on
                act = []
                for gene in genes:
                    try:
                        series = hierarchical[gene][c][d][r]
                        act.extend([series[t] for t in range(n_times)])
                    except (LookupError, TypeError) as err:
                        # Only lookup failures are treated as "data missing".
                        # Anything else (including KeyboardInterrupt) now
                        # propagates instead of being mislabelled.
                        import sys
                        print("Fix failed miserably again!")
                        print(
                            "get_x: cannot read {} time points for gene={!r}, compound={!r}, "
                            "dosage index={}, replicate index={}: {!r}".format(
                                n_times, gene, compound_name, d, r, err
                            )
                        )
                        sys.exit(1337)
                X.append(np.array(act))
    return np.array(X)
def get_x(data: Data, genes=None, compounds=None) -> np.ndarray:
    """
    Extract gene activations for specified subsets of compounds and genes from the full data set for a given data domain.

    NOTE(review): an earlier function with the same name precedes this one in
    the file; this later definition is the one that takes effect.

    :param data: e. g. human in vitro/rat in vitro/rat in vivo data; its
                 ``header`` must provide ``genes``, ``compounds``, ``dosages``,
                 ``replicates`` and ``times``
    :param genes: genes to select (preselected genesets - STEATOSIS, NAFLD etc.,
                  or other specified list of genes); ``None`` or an empty
                  selection means all genes in ``data.header.genes``
    :param compounds: compound names to select; ``None`` or an empty selection
                      means all of ``data.header.compounds``. Every name is
                      looked up with ``data.header.compounds.index``.
    :return: X, one row per (compound, dosage, replicate) in that nesting order.
             Each row is the concatenation, gene by gene, of that gene's values
             for every time index.

    If a series cannot be read for every time index, a diagnostic is printed
    and the process is terminated with ``sys.exit(1337)``.
    """
    header = data.header

    # use all genes if no special gene set indicated
    if not genes:
        genes = header.genes

    hierarchical = get_hierarchical_data(data, genes)

    if not compounds:
        compounds = header.compounds

    # Loop-invariant sizes, computed once instead of on every iteration.
    n_dosages = len(header.dosages)
    n_replicates = len(header.replicates)
    n_times = len(header.times)

    # Data formatted as data[gene][compound][dosage][replicate][time]
    #                        name  index     index   index      index
    X = []
    for compound_name in compounds:
        c = header.compounds.index(compound_name)
        for d in range(n_dosages):
            for r in range(n_replicates):
                # collect activations for each replicate, so cross product can be used later on
                act = []
                for gene in genes:
                    try:
                        series = hierarchical[gene][c][d][r]
                        act.extend([series[t] for t in range(n_times)])
                    except (LookupError, TypeError) as err:
                        # Only lookup failures are treated as a read error.
                        # Anything else (including KeyboardInterrupt) now
                        # propagates instead of being reported as bad data.
                        import sys
                        print("error reading data - read_x")
                        print(
                            "get_x: cannot read {} time points for gene={!r}, compound={!r}, "
                            "dosage index={}, replicate index={}: {!r}".format(
                                n_times, gene, compound_name, d, r, err
                            )
                        )
                        sys.exit(1337)
                X.append(np.array(act))
    return np.array(X)
def get_doubling_xy(first: Data, second: Data, genes_first=None, genes_second=None, compounds=None, max_replicates=2) -> Tuple[List[List[float]], List[List[float]]]:
    """
    Create doubled X and Y like Kurts parserDoubling, for which genes + headers have to be equal.

    For every compound and dosage, each used replicate of ``first`` is paired
    with each used replicate of ``second`` (cartesian product). One row is
    appended to X and one to Y per pair, so ``X[i]`` and ``Y[i]`` belong together.

    :param first: e. g. human data
    :param second: e. g. rat data
    :param genes_first: genes to select from first dataset, default is all
    :param genes_second: genes to select from second dataset, default is same as genes_first
    :param compounds: default is all compounds in ``first``. Realistically, should use CompoundLists.GENERAL_47.
                      Every name must be present in the compound lists of both datasets.
    :param max_replicates: how many replicates to use at most. For example, vivo has 5 replicates, but some are missing data.
                           the first three are safe to use, but using more than two creates complexity in later interpreting
                           the dataset (2x3x4 = 24 instances for each compound instead of 2x2x4 = 16)
    :return: X and Y. Each row is the concatenation, gene by gene, of one replicate's time series.
             The same row object is reused for every pair it takes part in, so copy a row
             before modifying it in place.
    :raises ValueError: if the two datasets do not have the same dosages

    If a time series is shorter than the number of time points in its header, a
    "Missing data!" line is printed and the process is terminated with ``sys.exit(1)``.
    """
    # Dan: this is the function that ACTUALLY reads the data; read_data only specifies which genes to read
    import sys  # local import, needed only for the fatal missing-data path

    if first.header.dosages != second.header.dosages:
        raise ValueError("human and rat dosages are not the same")

    # use all genes if no special gene set indicated
    if genes_first is None:
        genes_first = first.header.genes

    if genes_second is None:
        genes_second = genes_first

    first_hierarchical = get_hierarchical_data(first, genes_first)  # Dan: contains selected genes
    second_hierarchical = get_hierarchical_data(second, genes_second)

    header = first.header
    if not compounds:
        compounds = header.compounds

    # Loop-invariant sizes, computed once instead of per compound/dosage.
    n_dosages = len(header.dosages)
    times_first = len(first.header.times)
    times_second = len(second.header.times)
    # Use at most max_replicates replicates; later ones may have missing data (e. g. vivo)
    replicates_first = min(len(first.header.replicates), max_replicates)
    replicates_second = min(len(second.header.replicates), max_replicates)

    def collect_replicates(hierarchical, genes, c, d, n_replicates, n_times, compound_name):
        """Return one flat, gene-major activation list per replicate for compound index c and dosage index d."""
        per_replicate = []
        for r in range(n_replicates):
            act = []
            for gene in genes:
                series = hierarchical[gene][c][d][r]
                if len(series) < n_times:
                    print("Missing data! ", gene, compound_name, d, r)
                    # Missing data is a failure, so exit with a non-zero status.
                    # sys.exit is used because the bare exit() helper is only
                    # installed by the site module.
                    sys.exit(1)
                act.extend(series)
            per_replicate.append(act)
        return per_replicate

    X = []
    Y = []

    # Data formatted as data[gene][compound][dosage][replicate][time]
    #                        name  index     index   index      index
    for compound_name in compounds:
        c1 = first.header.compounds.index(compound_name)
        c2 = second.header.compounds.index(compound_name)
        for d in range(n_dosages):
            # add all time series for all replicates, then combine each pair
            first_activations = collect_replicates(
                first_hierarchical, genes_first, c1, d, replicates_first, times_first, compound_name
            )
            second_activations = collect_replicates(
                second_hierarchical, genes_second, c2, d, replicates_second, times_second, compound_name
            )

            # combine replicates (i.e. doubling)
            for h, r in itertools.product(first_activations, second_activations):
                X.append(h)
                Y.append(r)

    return X, Y
def get_doubling_xyz(first: Data, second: Data, third: Data, genes_first=None, genes_second=None, genes_third=None, 
    compounds=None, max_replicates=2) -> Tuple[List[List[float]], List[List[float]], List[List[float]]]:
    """
    Create aligned X, Y and Z from three datasets by combining their replicates.

    For every compound and dosage, each used replicate of ``first`` is combined
    with each used replicate of ``second`` and of ``third`` (cartesian product).
    One row is appended to X, Y and Z per combination, so ``X[i]``, ``Y[i]`` and
    ``Z[i]`` belong together.

    NOTE(review): dosage indices are taken from ``first.header.dosages`` and applied
    to all three datasets; no check is made that the datasets share the same dosages.

    :param first: first dataset
    :param second: second dataset
    :param third: third dataset
    :param genes_first: genes to select from first dataset, default is all
    :param genes_second: genes to select from second dataset, default is same as genes_first
    :param genes_third: genes to select from third dataset, default is same as genes_second
    :param compounds: compound names to use; ``None`` or an empty selection means all compounds
                      in ``first``. Every name must be present in the compound lists of all three datasets.
    :param max_replicates: how many replicates to use at most from each dataset
    :return: X, Y and Z. Each row is the concatenation, gene by gene, of one replicate's time series.
             The same row object is reused for every combination it takes part in, so copy a
             row before modifying it in place.

    If a time series is shorter than the number of time points in its header, a
    "Missing data!" line is printed and the process is terminated with ``sys.exit(1)``.
    """
    import sys  # local import, needed only for the fatal missing-data path

    # use all genes if no special gene set indicated
    if genes_first is None:
        genes_first = first.header.genes

    if genes_second is None:
        genes_second = genes_first

    if genes_third is None:
        genes_third = genes_second

    # NOTE(review): prints the selected gene lists on every call; looks like leftover debug output
    print(genes_first)
    print(genes_second)
    print(genes_third)

    first_hierarchical = get_hierarchical_data(first, genes_first)
    second_hierarchical = get_hierarchical_data(second, genes_second)
    third_hierarchical = get_hierarchical_data(third, genes_third)

    header = first.header
    if not compounds:
        compounds = header.compounds

    # Loop-invariant sizes, computed once instead of per compound/dosage.
    n_dosages = len(header.dosages)
    times_first = len(first.header.times)
    times_second = len(second.header.times)
    times_third = len(third.header.times)
    # Use at most max_replicates replicates from each dataset
    replicates_first = min(len(first.header.replicates), max_replicates)
    replicates_second = min(len(second.header.replicates), max_replicates)
    replicates_third = min(len(third.header.replicates), max_replicates)

    def collect_replicates(hierarchical, genes, c, d, n_replicates, n_times, compound_name):
        """Return one flat, gene-major activation list per replicate for compound index c and dosage index d."""
        per_replicate = []
        for r in range(n_replicates):
            act = []
            for gene in genes:
                series = hierarchical[gene][c][d][r]
                if len(series) < n_times:
                    print("Missing data! ", gene, compound_name, d, r)
                    # Missing data is a failure, so exit with a non-zero status.
                    # sys.exit is used because the bare exit() helper is only
                    # installed by the site module.
                    sys.exit(1)
                act.extend(series)
            per_replicate.append(act)
        return per_replicate

    X = []
    Y = []
    Z = []

    # Data formatted as data[gene][compound][dosage][replicate][time]
    #                        name  index     index   index      index
    for compound_name in compounds:
        c1 = first.header.compounds.index(compound_name)
        c2 = second.header.compounds.index(compound_name)
        c3 = third.header.compounds.index(compound_name)
        for d in range(n_dosages):
            # add all time series for all replicates, then combine each triple
            first_activations = collect_replicates(
                first_hierarchical, genes_first, c1, d, replicates_first, times_first, compound_name
            )
            second_activations = collect_replicates(
                second_hierarchical, genes_second, c2, d, replicates_second, times_second, compound_name
            )
            third_activations = collect_replicates(
                third_hierarchical, genes_third, c3, d, replicates_third, times_third, compound_name
            )

            # combine replicates across the three datasets
            for row_x, row_y, row_z in itertools.product(first_activations, second_activations, third_activations):
                X.append(row_x)
                Y.append(row_y)
                Z.append(row_z)

    return X, Y, Z