def test_get_bicluster(self):
    """Check that Bicluster.array() yields the rows x cols submatrix of the data."""
    data = np.arange(60).reshape(10, 6)
    # Entries of `data` at rows (4, 6, 9) crossed with columns (1, 3, 4).
    array = np.array([[25, 27, 28], [37, 39, 40], [55, 57, 58]])
    rows = (4, 6, 9)
    cols = (1, 3, 4)
    bicluster = Bicluster(rows, cols, data)
    # np.alltrue was removed in NumPy 2.0. array_equal also reports a shape
    # mismatch as a plain False instead of relying on broadcasting.
    self.assertTrue(np.array_equal(array, bicluster.array()))
def _read_result_file_(filename, data):
    """
    Read the biclusters and the run properties stored in a result file.

    The first line is a whitespace-separated header; its fields 5, 7, 11 and
    13 (0-based) are read as 'nstable', 'likelihood', 'nparams' and 'bic'.
    After the header, every line starting with "bicluster" opens a new
    bicluster and the line directly after it is skipped. Lines starting with
    "row" or "col" select whether the following indices are rows or columns.
    Any other non-blank line contributes its first field, converted from a
    1-based to a 0-based index.

    Args:
        * filename: path of the result file.
        * data: dataset handed to every Bicluster that is created.

    Returns:
        The tuple (biclusters, properties): a list of Bicluster objects and a
        dict with the keys 'nstable', 'likelihood', 'nparams' and 'bic'.
    """
    biclusters = []
    with open(filename, 'r') as f:
        rows = []
        cols = []
        header = f.readline().split()
        properties = dict(nstable=int(header[5]),
                          likelihood=float(header[7]),
                          nparams=int(float(header[11])),
                          bic=float(header[13]))
        target = rows
        # Track whether a bicluster section has already been opened. The
        # previous check (character after "bicluster" == '1') also matched
        # "bicluster10".."bicluster19", which merged those into their
        # predecessor instead of closing it.
        seen_bicluster = False
        for line in f:
            if line.startswith("bicluster"):
                if seen_bicluster:
                    biclusters.append(Bicluster(rows, cols, data=data))
                    rows = []
                    cols = []
                    # Never keep appending to the lists that were just
                    # handed to the finished bicluster.
                    target = rows
                seen_bicluster = True
                # Skip the line that follows the bicluster header. The
                # builtin next() works on Python 2.6+ and 3, and the default
                # avoids StopIteration when the header is the last line.
                next(f, None)
                continue
            elif line.startswith("row"):
                target = rows
                continue
            elif line.startswith("col"):
                target = cols
                continue
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no index.
                continue
            target.append(int(fields[0]) - 1)
        # Ensure we get the last bicluster.
        biclusters.append(Bicluster(rows, cols, data=data))
    return biclusters, properties
class TestValidation(unittest.TestCase):
    """
    Contains test cases for testing the validation functions in the
    'validation' module.

    Both fixtures are single 4x4 biclusters on a 10x10 dataset. list1 covers
    rows/columns 0-3 and list2 covers rows/columns 2-5, so they share
    2 rows, 2 columns and 4 cells; their union holds 16 + 16 - 4 = 28 cells.
    """
    data = numpy.random.randn(10, 10)
    list1 = [Bicluster([0, 1, 2, 3], [0, 1, 2, 3], data)]
    list2 = [Bicluster([2, 3, 4, 5], [2, 3, 4, 5], data)]

    # NOTE: expected values use float literals so that they do not collapse
    # to 0 under Python 2 integer division, and assertEqual replaces the
    # assertEquals alias that was removed in Python 3.12.

    def test_prelic(self):
        """Identical lists score 1; the overlapping lists score 1/3."""
        rel, rec = bb.prelic_list(self.list1, self.list1)
        self.assertEqual(rel, 1)
        self.assertEqual(rec, 1)

        rel, rec = bb.prelic_list(self.list1, self.list2)
        # 2 shared indices out of 6 distinct ones per dimension.
        self.assertAlmostEqual(rel, 1.0 / 3)
        self.assertAlmostEqual(rec, 1.0 / 3)

    def test_fmeasure(self):
        """Identical lists score 1; otherwise the F-measure of sens and spec."""
        rel, rec = bb.f_measure_list(self.list1, self.list1, modified=False)
        self.assertEqual(rel, 1)
        self.assertEqual(rec, 1)

        # 4 shared cells of the 16 in one bicluster.
        sens = 4.0 / 16
        # Cells outside both biclusters over cells outside one of them.
        spec = (100.0 - 28) / (100 - 16)
        expected = 2 * (sens * spec) / (sens + spec)
        rel, rec = bb.f_measure_list(self.list1, self.list2, modified=False)
        self.assertAlmostEqual(rel, expected)
        self.assertAlmostEqual(rec, expected)

    def test_modified_fmeasure(self):
        """Identical lists score 1 with the modified F-measure."""
        rel, rec = bb.f_measure_list(self.list1, self.list1, modified=True)
        self.assertEqual(rel, 1)
        self.assertEqual(rec, 1)

    def test_bicluster_jaccard(self):
        """Identical lists score 1; otherwise shared cells over union cells."""
        rel, rec = bb.jaccard_list(self.list1, self.list1)
        self.assertEqual(rel, 1)
        self.assertEqual(rec, 1)

        expected = 4.0 / 28
        rel, rec = bb.jaccard_list(self.list1, self.list2)
        self.assertAlmostEqual(rel, expected)
        self.assertAlmostEqual(rec, expected)

    def test_recovery_and_relevance(self):
        """Identical lists score 1; the overlapping lists score 4/16."""
        rel, rec = bb.recovery_relevance_list(self.list1, self.list1)
        self.assertEqual(rel, 1)
        self.assertEqual(rec, 1)

        rel, rec = bb.recovery_relevance_list(self.list1, self.list2)
        self.assertAlmostEqual(rel, 0.25)
        self.assertAlmostEqual(rec, 0.25)
def _read_result_file_(filename, data):
    """
    Parse one CPB output file into a single Bicluster.

    Expected file layout:

    ROWS
    [row index] [row score]
    ...
    [row index] [row score]
    COLS
    [col index] [col score]
    ...
    [col index] [col score]

    Only the first field of each entry line is used. Indices are collected
    as they appear and sorted before the Bicluster is built.
    """
    row_indices = []
    col_indices = []
    with open(filename, 'r') as handle:
        # Entries belong to the row section until a 'C...' line is seen.
        current = row_indices
        for entry in handle:
            marker = entry[0]
            if marker == 'R':
                continue
            if marker == 'C':
                current = col_indices
                continue
            current.append(int(entry.split()[0]))
    row_indices.sort()
    col_indices.sort()
    return Bicluster(row_indices, col_indices, data=data)
def _shuffle_(data, expected, new_rows=None, new_cols=None):
    """
    Shuffles the dataset while preserving biclusters.

    Args:
        * data: numpy.ndarray (two-dimensional).
        * expected: list of biclusters.
        * new_rows: Shuffled row indices; if None, randomly generated.
        * new_cols: Shuffled column indices; if None, randomly generated.

    Returns:
        The tuple (shuffled_data, shuffled_biclusters) where shuffled_data is
        a shuffled version of the input dataset, and shuffled_biclusters is a
        list of biclusters corresponding to the new biclusters in the
        shuffled dataset.

    Raises:
        ValueError: if a bicluster refers to a row or column index that does
            not occur in the permutation.
    """
    nrows, ncols = data.shape

    # Permutations are always materialised as lists:
    #  * random.shuffle() cannot shuffle a Python 3 range object in place;
    #  * a tuple used as a numpy index would be read as a multi-dimensional
    #    index instead of a row selection;
    #  * .index() below needs a real sequence.
    if new_rows is None:
        new_rows = list(range(nrows))
        random.shuffle(new_rows)
    else:
        new_rows = list(new_rows)

    if new_cols is None:
        new_cols = list(range(ncols))
        random.shuffle(new_cols)
    else:
        new_cols = list(new_cols)

    # Reorder rows, then reorder columns via the transpose.
    shuffled_data = data[new_rows].T[new_cols].T

    shuffled_biclusters = []
    for b in expected:
        # Old index r now lives at the position where r occurs in the
        # permutation.
        new_b_rows = [new_rows.index(r) for r in b.rows]
        new_b_cols = [new_cols.index(c) for c in b.cols]
        shuffled_biclusters.append(
            Bicluster(new_b_rows, new_b_cols, shuffled_data))

    return shuffled_data, shuffled_biclusters
def _extract_biclusters_(fact, thresZ=0.5, thresL=None):
    """
    Run R's 'extractBic' on a factorization and convert the result.

    Args:
        * fact: R object passed as first argument to 'extractBic'.
        * thresZ: forwarded as the 'thresZ' argument.
        * thresL: forwarded as 'thresL' only when it is not None.

    Returns:
        A list of Bicluster objects, one per row of the 'bic' matrix in the
        result, indexed against the 'X' matrix of the result.
    """
    extract_kwargs = {'thresZ': thresZ}
    if thresL is not None:
        extract_kwargs['thresL'] = thresL

    extract_bic = robjects.r['extractBic']
    extraction = extract_bic(fact, **extract_kwargs)

    r_matrix = extraction.rx('X')[0]
    dataset = numpy.array(r_matrix)
    # Map the R dimnames back to positional indices.
    row_lookup = util.make_index_map(list(r_matrix.names[0]))
    col_lookup = util.make_index_map(list(r_matrix.names[1]))

    # 'bic' is an R matrix; each of its rows describes one bicluster.
    bic_table = extraction.rx('bic')[0]
    found = []
    for offset in range(bic_table.nrow):
        # R matrices are 1-indexed.
        record = bic_table.rx(offset + 1, True)
        member_rows = list(record.rx('bixn')[0])
        member_cols = list(record.rx('biypn')[0])
        row_ids = [row_lookup[name] for name in member_rows]
        col_ids = [col_lookup[name] for name in member_cols]
        found.append(Bicluster(row_ids, col_ids, dataset))
    return found
def _run_biclust_(function_name, data, **kwargs):
    """Convenience function for the various methods implemented in 'biclust'.

    Performs biclustering on the dataset and returns a set of biclusters.

    Args:
        * function_name: name of the biclust method, passed to R as 'method'.
        * data: the dataset; forwarded to R and attached to every Bicluster.
        * kwargs: extra arguments for the method. Underscores in the keyword
          names are replaced by dots before they are handed to R.

    Returns:
        A list of Bicluster objects. The list is empty when R raises an
        error, which is logged and taken to mean that nothing was found.

    Raises:
        Exception: if the shapes of the returned membership matrices do not
            agree on the number of biclusters.
    """
    # Replace underscores with dots. A new dict is built because renaming
    # keys while iterating over kwargs.keys() raises RuntimeError on
    # Python 3, where keys() is a live view.
    r_kwargs = {key.replace("_", "."): value for key, value in kwargs.items()}

    robjects.r.library('biclust')

    # Run biclustering.
    biclust = robjects.r["biclust"]
    # NOTE(review): the looked-up object was never used in the original. The
    # lookup is kept because it presumably fails fast when function_name is
    # unknown to R -- confirm before removing it.
    robjects.r[function_name]
    try:
        result = biclust(data, method=function_name, **r_kwargs)
    except RRuntimeError as e:
        # Exceptions have no .message attribute on Python 3; formatting the
        # exception itself yields its text on both Python 2 and 3.
        logging.error(
            '{0} caught an R exception. Assuming no biclusters were found. Message: {1}'
            .format(function_name, e))
        return []

    # Get rowXnumber array.
    row_matrix = numpy.array(result.do_slot("RowxNumber"))

    # Get numberXcolumn array.
    col_matrix = numpy.array(result.do_slot("NumberxCol"))

    num_biclusters = row_matrix.shape[1]

    # A hack for Cheng and Church, which appears to sometimes get the
    # transpose of the column matrix.
    if not num_biclusters == col_matrix.shape[0]:
        if num_biclusters == col_matrix.shape[1] and \
                row_matrix.shape[0] == data.shape[0] and \
                col_matrix.shape[0] == data.shape[1]:
            col_matrix = col_matrix.T

    if not num_biclusters == col_matrix.shape[0]:
        raise Exception(
            'There is a problem with the results returned by {0}'.format(
                function_name))

    # Make list of biclusters: column i of row_matrix and row i of
    # col_matrix flag the members of bicluster i.
    biclusters = []
    for i in range(num_biclusters):
        rows_bools = row_matrix[:, i] != 0
        cols_bools = col_matrix[i, :] != 0
        rows = [index for index, elt in enumerate(rows_bools) if elt]
        cols = [index for index, elt in enumerate(cols_bools) if elt]
        biclusters.append(Bicluster(rows, cols, data=data))
    return biclusters
def _createBicluster_(geneLine, conditionLine, data):
    """
    Extracts the rows and columns of the bicluster from the given gene and
    condition lines.

    Args:
        * geneLine: whitespace-separated integer row indices.
        * conditionLine: whitespace-separated integer column indices.
        * data: dataset handed to the Bicluster.

    Returns:
        A Bicluster built from the parsed row and column index lists.

    Raises:
        ValueError: if a token is not an integer.
    """
    # Build real lists: on Python 3 map() is a one-shot iterator without
    # len() or indexing. split() without an argument also tolerates
    # repeated, leading and trailing whitespace, which split(" ") turned
    # into empty tokens that int() rejected.
    genes = [int(token) for token in geneLine.split()]
    conditions = [int(token) for token in conditionLine.split()]
    return Bicluster(genes, conditions, data)
def test__get_r_biclust_(self):
    """Check the membership matrices stored in the result of _get_r_biclust_."""
    # np.bool8 was removed in NumPy 2.0; np.bool_ is available in old and
    # new releases alike.
    # exp_rows[r, k] is set when row r belongs to bicluster k.
    exp_rows = np.array([[1, 0, 1], [1, 1, 0]], dtype=np.bool_)
    # exp_cols[c, k] is set when column c belongs to bicluster k.
    exp_cols = np.array([[1, 1, 0], [1, 0, 1]], dtype=np.bool_)
    data = np.random.randn(2, 2)
    biclusters = [
        Bicluster([0, 1], [0, 1], data),
        Bicluster([1], [0], data),
        Bicluster([0], [1], data)
    ]
    result = _get_r_biclust_(biclusters)
    rows = np.array(result.do_slot("RowxNumber"))
    cols = np.array(result.do_slot("NumberxCol"))
    # NumberxCol is bicluster x column; transpose it to column x bicluster
    # so it lines up with exp_cols.
    cols = cols.T
    self.assertTrue((rows == exp_rows).all())
    self.assertTrue((cols == exp_cols).all())
def test_get_row_col_matrices(self):
    """Check the membership matrices built for a single bicluster."""
    # One column per bicluster: rows 0 and 1 are members, only column 1 is.
    expected_rows = np.array([[1], [1], [0]])
    expected_cols = np.array([[0], [1], [0]])
    dataset = np.random.randn(3, 3)
    single = [Bicluster([0, 1], [1], dataset)]
    rowxnumber, colxnumber = get_row_col_matrices(single)
    rows_match = (rowxnumber == expected_rows).all()
    self.assertTrue(rows_match)
    cols_match = (colxnumber == expected_cols).all()
    self.assertTrue(cols_match)
def _make_expected_biclusters_(row_matrix, col_matrix, data):
    """
    Turn the membership matrices produced by _make_row_matrix_() and
    _make_col_matrix_() into a list of Biclusters.

    Column k of each matrix describes bicluster k; an index is a member
    when its entry is greater than zero.
    """
    n_row_clusters = row_matrix.shape[1]
    assert n_row_clusters == col_matrix.shape[1]

    def members(flags):
        # Positions whose entry is strictly positive.
        return list(numpy.where(flags > 0)[0])

    return [
        Bicluster(members(row_flags), members(col_flags), data)
        for row_flags, col_flags in zip(row_matrix.T, col_matrix.T)
    ]
def test_bicluster_eq(self):
    """Check Bicluster equality with and without attached data."""
    # assertEqual / assertNotEqual replace the assertEquals /
    # assertNotEquals aliases, which were removed in Python 3.12.
    bic_a = Bicluster([1, 2, 3], [1, 2, 3])
    bic_b = Bicluster([1, 2, 3], [1, 2, 3])
    self.assertEqual(bic_a, bic_b)

    # Data attached to only one of the two.
    data = np.arange(10)
    bic_b.data = data
    self.assertNotEqual(bic_a, bic_b)

    # The very same array attached to both.
    bic_a.data = data
    self.assertEqual(bic_a, bic_b)

    # An equal-valued but distinct array is expected to compare unequal.
    bic_b.data = np.arange(10)
    self.assertNotEqual(bic_a, bic_b)
def _parse_bicluster_(string, gene_dict, cond_dict, data):
    """
    Build a Bicluster from the text describing one bicluster.

    The text is cut at the gene header and then at the condition header.
    The gene part and the first line of the condition part are converted to
    row and column indices, and their counts are checked against the counts
    obtained from _get_expected_().
    """
    declared_genes = _get_expected_(string, _gene_regex_)
    declared_conds = _get_expected_(string, _cond_regex_)

    # Everything that follows the gene header.
    remainder = re.split(_gene_regex_, string)[1]

    # The condition header separates the gene listing from the conditions.
    gene_text, cond_text = re.split(_cond_regex_, remainder)
    # Only the first line after the condition header is used.
    cond_text = cond_text.partition('\n')[0]

    row_ids = _handle_gene_lines_(gene_text, gene_dict)
    col_ids = _handle_cond_lines_(cond_text, cond_dict)

    assert len(row_ids) == declared_genes
    assert len(col_ids) == declared_nconds if False else len(col_ids) == declared_conds

    return Bicluster(row_ids, col_ids, data)
def make_isa_data(nrows=300,
                  ncols=50,
                  nclusts=3,
                  nclustrows=None,
                  nclustcols=None,
                  noise=0,
                  bicluster_signals=None,
                  bicluster_noise=None,
                  noverlap_rows=0,
                  noverlap_cols=None,
                  shuffle=None):
    """
    Make ISA-style data.

    Generates a dataset using the 'isa.in.silico' function of the
    Bioconductor 'isa2' package. If an argument is None, it is not
    included, and isa2's defaults are used.

    Requires that 'isa2' be installed.

    Args:
        * nrows: Number of rows in the data matrix.
        * ncols: Number of columns in the data matrix.
        * nclusts: Number of biclusters.
        * nclustrows: Rows in each bicluster.
            Defaults to round(0.5 * num_rows/num_fact)
        * nclustcols: Cols in each bicluster.
            Defaults to round(0.5 * num_cols/num_fact)
        * noise: Standard deviation of normal noise in background.
        * bicluster_signals: List of base signals for each bicluster.
            Defaults to 1's.
        * bicluster_noise: List of noise standard deviations for each
            bicluster. Defaults to 0's.
        * noverlap_rows: Number of bicluster rows that overlap.
        * noverlap_cols: Number of bicluster columns that overlap.
            Defaults to 'overlap_row'.
        * shuffle: If True, shuffle rows and columns.

    Returns:
        The tuple (data, expected): the generated numpy array and the list
        of Bicluster objects planted in it.
    """
    # Python argument -> isa.in.silico argument. Spelled out explicitly
    # instead of going through locals() and dict.iteritems(), the latter of
    # which does not exist on Python 3.
    candidate_args = {
        'num_rows': nrows,
        'num_cols': ncols,
        'num_fact': nclusts,
        'mod_row_size': nclustrows,
        'mod_col_size': nclustcols,
        'noise': noise,
        'mod_signal': bicluster_signals,
        'mod_noise': bicluster_noise,
        'overlap_row': noverlap_rows,
        'overlap_col': noverlap_cols,
    }

    # Leave out unset arguments so that isa2 applies its own defaults.
    isa_args = {key: value
                for key, value in candidate_args.items()
                if value is not None}

    # The per-bicluster settings are passed as R numeric vectors.
    for key in ['mod_signal', 'mod_noise']:
        if key in isa_args:
            isa_args[key] = robjects.FloatVector(list(isa_args[key]))

    robjects.r.library('isa2')

    # Get data.
    func = robjects.r['isa.in.silico']
    result = func(**isa_args)

    # Convert to python: the data matrix plus the row and column membership
    # matrices, which hold one column per bicluster.
    data = numpy.array(robjects.Matrix(result[0])).copy()
    rows = numpy.array(robjects.Matrix(result[1])).copy()
    cols = numpy.array(robjects.Matrix(result[2])).copy()

    nbiclusters = rows.shape[1]
    row_list = [list(rows[:, i].nonzero()[0]) for i in range(nbiclusters)]
    col_list = [list(cols[:, i].nonzero()[0]) for i in range(nbiclusters)]

    expected = [Bicluster(r, c, data) for r, c in zip(row_list, col_list)]

    if shuffle:
        data, expected = _shuffle_(data, expected)

    return data, expected
def isa(data, thr_row=None, thr_col=None, no_seeds=100, direction=['updown', 'updown']):
    """
    ISA biclustering algorithm.

    Args:
        * data: numpy.ndarray.
        * thr_row: threshold value for rows.
        * thr_col: threshold value for cols.
        * no_seeds: number of seeds to generate biclusters.
        * direction: either 'up' for upregulated, 'down' for downregulated,
            'updown' for both (default).

    Returns:
        A list of biclusters.

    """
    # The isa2 library provides the 'isa' function used below.
    robjects.r.library('isa2')

    # R-side copy of the dataset.
    r_matrix = robjects.Matrix(data)

    def to_r_thresholds(value):
        # No threshold given: fall back to R's seq(1, 3, by=0.5).
        if value is None:
            return robjects.r['seq'](1, 3, by=0.5)
        # A single threshold is wrapped so it becomes a one-element vector.
        values = value if isiterable(value) else [value]
        return robjects.FloatVector(list(values))

    row_thresholds = to_r_thresholds(thr_row)
    col_thresholds = to_r_thresholds(thr_col)
    r_direction = robjects.StrVector(direction)

    # Run the biclustering in R.
    run_isa = robjects.r('isa')
    output = run_isa(r_matrix, row_thresholds, col_thresholds, no_seeds,
                     r_direction)

    # First element: rows x biclusters; second: columns x biclusters.
    row_matrix = numpy.array(robjects.Matrix(output[0]))
    col_matrix = numpy.array(robjects.Matrix(output[1]))

    num_biclusters = row_matrix.shape[1]
    assert num_biclusters == col_matrix.shape[1]

    # Every nonzero entry marks a member of the corresponding bicluster.
    found = []
    for row_scores, col_scores in zip(row_matrix.T, col_matrix.T):
        member_rows = [pos for pos, score in enumerate(row_scores) if score]
        member_cols = [pos for pos, score in enumerate(col_scores) if score]
        found.append(Bicluster(member_rows, member_cols, data=data))
    return found
def make_fabia_data(nrows,
                    ncols,
                    nclusts,
                    f1,
                    f2,
                    of1,
                    of2,
                    sd_noise,
                    sd_z_noise,
                    mean_z,
                    sd_z,
                    sd_l_noise,
                    mean_l,
                    sd_l,
                    shuffle=True,
                    pos=False):
    """
    Make FABIA-style data.

    An interface to the Bioconductor 'fabia' library's makeFabiaData
    functions. Requires that 'fabia' be installed.

    Args:
        * nrows: number of observations.
        * ncols: number of samples.
        * nclusts: number of biclusters.
        * f1: ncols/f1 max. additional samples are active in a bicluster.
        * f2: nrows/f2 max. additional observations that form a pattern in a
            bicluster.
        * of1: minimal active samples in a bicluster.
        * of2: minimal observations that form a pattern in a bicluster.
        * sd_noise: Gaussian zero mean noise std on data matrix.
        * sd_z_noise: Gaussian zero mean noise std for deactivated hidden
            factors.
        * mean_z: Gaussian mean for activated factors.
        * sd_z: Gaussian std for activated factors.
        * sd_l_noise: Gaussian zero mean noise std if no observation patterns
            are present.
        * mean_l: Gaussian mean for observation patterns.
        * sd_l: Gaussian std for observation patterns.
        * shuffle: If True, shuffle dataset (otherwise the 'Blocks' variant
            of the R function is used).
        * pos: Use the MakeFabiaDataPos functions.

    Returns:
        The tuple (noisy_data, biclusters): the first element of the R
        result as a numpy array, and the list of Bicluster objects defined
        on it.
    """
    robjects.r.library('fabia')

    # Assemble the R function name: makeFabiaData[Blocks][Pos].
    function = 'makeFabiaData'
    if not shuffle:
        function += "Blocks"
    if pos:
        function += "Pos"

    func = robjects.r[function]

    result = func(nrows, ncols, nclusts, f1, f2, of1, of2, sd_noise,
                  sd_z_noise, mean_z, sd_z, sd_l_noise, mean_l, sd_l)

    # Only the first element of the result is used as the dataset; the
    # second element was copied into an unused local before and is no
    # longer converted.
    noisy_data = numpy.array(result[0]).copy()

    cols_vector = result[2]
    rows_vector = result[3]

    # R indices are 1-based. Real lists are built because map() returns a
    # one-shot iterator on Python 3.
    rows = [[int(index) - 1 for index in r] for r in rows_vector]
    cols = [[int(index) - 1 for index in c] for c in cols_vector]

    biclusters = [Bicluster(r, c, noisy_data) for r, c in zip(rows, cols)]

    return noisy_data, biclusters