def compareRandom(num_trials, tensor_dimensions, matrix_data, cluster_dimensions,
                  maxit_ebc, jitter_max_ebc, objective_tolerance):
    """Compare EBC runs on the data against EBC runs on a shuffled copy.

    For every trial a fresh SparseMatrix is built from ``matrix_data``, a
    shuffled copy is taken with ``shuffle()`` (before normalising), and EBC is
    run first on the original and then on the shuffled matrix.

    A run whose iteration count equals ``maxit_ebc`` is counted as
    non-converged and its iteration count is not recorded.

    Returns:
        A 5-tuple ``(deltas, iterations_M, iterations_Mr, noconverge_M,
        noconverge_Mr)`` where ``deltas`` holds, per trial, the objective of
        the original matrix minus the objective of the shuffled matrix.
    """

    def run_ebc(matrix):
        # Normalise the matrix and run one EBC pass; only the objective and
        # the iteration count are needed by the caller.
        matrix.normalize()
        ebc = EBC(matrix, cluster_dimensions, maxit_ebc, jitter_max_ebc,
                  objective_tolerance)
        _, objective, iterations = ebc.run()
        return objective, iterations

    deltas = []
    iterations_M = []
    iterations_Mr = []
    noconverge_M = 0
    noconverge_Mr = 0

    for j in range(num_trials):
        # Single-argument print works identically on Python 2 and 3.
        print("Trial  %s" % (j,))

        M = SparseMatrix(tensor_dimensions)
        M.read_data(matrix_data)
        Mr = M.shuffle()  # could also be M.shuffle_old()

        objective_M, it_M = run_ebc(M)
        if it_M == maxit_ebc:
            noconverge_M += 1
        else:
            iterations_M.append(it_M)

        objective_Mr, it_Mr = run_ebc(Mr)
        if it_Mr == maxit_ebc:
            noconverge_Mr += 1
        else:
            iterations_Mr.append(it_Mr)

        deltas.append(objective_M - objective_Mr)

    return deltas, iterations_M, iterations_Mr, noconverge_M, noconverge_Mr
class TestMatrix(unittest.TestCase):
    """Tests for loading and shuffling a SparseMatrix built from a sample file."""

    def setUp(self):
        # Use a context manager so the fixture file is closed deterministically.
        with open('tests/sample-matrix-file.txt', 'r') as f:
            self.data = [line.split('\t') for line in f]
        self.matrix = SparseMatrix([2, 4, 9])
        self.matrix.read_data(self.data)

    def testMatrixInit(self):
        """Check the stored non-zero cells and the feature-name -> id maps."""
        expected_elements = [
            ((1, 3, 7), 2.0),
            ((0, 0, 0), 2.0),
            ((0, 0, 2), 2.0),
            ((1, 1, 5), 7.0),
            ((1, 1, 3), 3.0),
            ((1, 3, 6), 2.0),
            ((1, 3, 8), 2.0),
            ((0, 0, 1), 2.0),
            ((1, 1, 4), 2.0),
            ((1, 2, 5), 2.0),
        ]
        for coordinates, value in expected_elements:
            self.assertEqual(self.matrix.nonzero_elements[coordinates], value)
        self.assertEqual(len(self.matrix.nonzero_elements), 10)

        self.assertEqual(self.matrix.feature_ids[0], {'mice': 1, 'patient': 0})
        self.assertEqual(self.matrix.feature_ids[1],
                         {'R92Q': 1, 'R91W': 2, 'Val30Met': 0, 'R90W': 3})
        self.assertEqual(self.matrix.feature_ids[2], {
            'START_ENTITY|nmod|END_ENTITY': 1,
            'START_ENTITY|nummod|END_ENTITY': 5,
            'FAP|compound|END_ENTITY': 2,
            'expression|nmod|END_ENTITY': 8,
            '+|compound|END_ENTITY': 7,
            'mice|nummod|END_ENTITY': 3,
            'homozygous|nsubj|START_ENTITY': 6,
            'mutation|appos|END_ENTITY': 4,
            'START_ENTITY|nmod|FAP': 0,
        })

    def testShuffle(self):
        """Shuffling must keep the number of cells and the set of cell values."""
        shuffled_matrix = self.matrix.shuffle()
        self.assertEqual(len(shuffled_matrix.nonzero_elements),
                         len(self.matrix.nonzero_elements))
        self.assertEqual(set(shuffled_matrix.nonzero_elements.values()),
                         set(self.matrix.nonzero_elements.values()))
        # Format into one string so the output is the same on Python 2 and 3
        # (a two-argument print() call prints a tuple on Python 2).
        print("shuffled matrix elements:  %s" % (shuffled_matrix.nonzero_elements,))
def main():
    """Run EBC several times on a TSV file and write per-entity cluster ids.

    Positional command-line arguments (read from ``sys.argv``):
        1. data_file          tab-separated input file
        2. ebc_cols           comma-separated column indices fed to the matrix
        3. K                  comma-separated ints, passed to EBC as its
                              second argument
        4. N_runs             number of EBC runs
        5. output_file        path of the file to write
        6. jitter_max         float passed to EBC
        7. max_iterations_ebc int passed to EBC
        8. entity_cols        comma-separated column indices naming an entity
        9. object_toler       float passed to EBC

    Each output line holds the entity ids (comma-separated), the entity names
    (comma-separated) and then one comma-separated cluster assignment per run,
    all tab-separated.

    Raises:
        SystemExit: if fewer than nine arguments are given.
        ValueError: if the input file contains no rows.
    """
    if len(sys.argv) < 10:
        sys.exit("usage: %s data_file ebc_cols K N_runs output_file jitter_max "
                 "max_iterations_ebc entity_cols object_toler" % sys.argv[0])

    data_file = sys.argv[1]
    ebc_cols = [int(e) for e in sys.argv[2].split(",")]
    K = [int(e) for e in sys.argv[3].split(",")]
    N_runs = int(sys.argv[4])
    output_file = sys.argv[5]
    jitter_max = float(sys.argv[6])
    max_iterations_ebc = int(sys.argv[7])
    entity_cols = [int(e) for e in sys.argv[8].split(",")]
    object_toler = float(sys.argv[9])

    # get original data (lines are split as-is; trailing newlines are kept)
    with open(data_file, "r") as f:
        raw_data = [line.split("\t") for line in f]
    if not raw_data:
        raise ValueError("no data rows found in %s" % data_file)
    data = [[d[i] for i in ebc_cols] for d in raw_data]
    # the last selected column is not counted as a dimension
    data_dimensions = len(data[0]) - 1

    # get axis length for each dimension
    N = [len(set(d[dim] for d in data)) for dim in range(data_dimensions)]
    print(N)

    # set up matrix
    M = SparseMatrix(N)
    M.read_data(data)
    M.normalize()

    # set up entity map to ids
    entity_map = defaultdict(tuple)
    for d in raw_data:
        entity = tuple([d[i] for i in entity_cols])
        entity_ids = tuple([M.feature_ids[ebc_cols.index(i)][d[i]] for i in entity_cols])
        entity_map[entity_ids] = entity

    # figure out which ebc columns the entity columns correspond to
    entity_column_indices = [ebc_cols.index(c) for c in ebc_cols if c in entity_cols]

    # run EBC and get entity cluster assignments
    ebc_M = EBC(M, K, max_iterations_ebc, jitter_max, object_toler)
    clusters = defaultdict(list)
    for t in range(N_runs):
        print("run  %s" % (t,))
        cXY_M, objective_M, it_M = ebc_M.run()
        for e1 in entity_map:
            # NOTE(review): e1 is ordered by entity_cols but is indexed here by
            # EBC dimension index; this only lines up when the entity columns
            # are the leading EBC columns in the same order - confirm.
            c1_i = tuple([cXY_M[i][e1[i]] for i in entity_column_indices])
            clusters[e1].append(c1_i)

    # print assignments; the context manager flushes and closes the file
    with open(output_file, "w") as writer:
        for k in clusters:
            e1_name = entity_map[k]
            ids = ",".join(str(e) for e in k)
            names = ",".join(e1_name)
            runs = "\t".join(",".join(str(f) for f in assignment)
                             for assignment in clusters[k])
            writer.write(ids + "\t" + names + "\t" + runs + "\n")
def main():
    """ An example run of EBC. """
    rows = []
    with open("resources/matrix-ebc-paper-sparse.tsv", "r") as handle:
        for raw_line in handle:
            fields = raw_line.split("\t")
            # lines with fewer than five fields are headers and are skipped
            if len(fields) >= 5:
                rows.append([fields[0], fields[2], float(fields[4])])

    sparse = SparseMatrix([14052, 7272])
    sparse.read_data(rows)
    sparse.normalize()

    runner = EBC(sparse, [30, 125], 10, 1e-10, 0.01)
    cXY, objective, it = runner.run()
def testOldMatrix3d(self):
    """Run EBC on the dense 3-D paper matrix and check the cluster counts."""
    with open("resources/matrix-ebc-paper-dense-3d.tsv", "r") as f:
        data = []
        for line in f:
            sl = line.split("\t")
            data.append([sl[0], sl[1], sl[2], float(sl[3])])

    matrix = SparseMatrix([756, 996, 1232])
    matrix.read_data(data)
    matrix.normalize()

    ebc = EBC(matrix, [30, 30, 10], 100, 1e-10, 0.01)
    cXY, objective, it = ebc.run()
    # Single-argument print works identically on Python 2 and 3.
    print("objective:  %s" % (objective,))
    print("iterations:  %s" % (it,))

    self.assertEqual(len(ebc.pXY.nonzero_elements), 10007)
    # every requested cluster id must be used in each dimension
    self.assertEqual(len(set(ebc.cXY[0])), 30)
    self.assertEqual(len(set(ebc.cXY[1])), 30)
    self.assertEqual(len(set(ebc.cXY[2])), 10)
def test3DMatrix(self):
    """Check EBC on a 6x6x6 matrix made of three disjoint 2x2x2 blocks of ones.

    Starting from the block-aligned assignment, EBC must return that same
    assignment with a zero objective after one iteration. The 100 randomly
    initialised runs afterwards are only printed, not asserted.
    """
    data = [
        [0, 0, 0, 1.0], [0, 0, 1, 1.0], [0, 1, 0, 1.0], [0, 1, 1, 1.0],
        [1, 0, 0, 1.0], [1, 0, 1, 1.0], [1, 1, 0, 1.0], [1, 1, 1, 1.0],
        [2, 2, 2, 1.0], [2, 2, 3, 1.0], [2, 3, 2, 1.0], [3, 2, 2, 1.0],
        [2, 3, 3, 1.0], [3, 3, 2, 1.0], [3, 2, 3, 1.0], [3, 3, 3, 1.0],
        [4, 4, 4, 1.0], [4, 4, 5, 1.0], [4, 5, 4, 1.0], [4, 5, 5, 1.0],
        [5, 4, 4, 1.0], [5, 4, 5, 1.0], [5, 5, 4, 1.0], [5, 5, 5, 1.0],
    ]
    matrix = SparseMatrix([6, 6, 6])
    matrix.read_data(data)
    matrix.normalize()

    ebc = EBC(matrix, [3, 3, 3], 10, 1e-10, 0.01)
    assigned_C = [[0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 2, 2]]
    cXY, objective, it = ebc.run(assigned_C)
    self.assertEqual(cXY, assigned_C)
    self.assertAlmostEqual(objective, 0.0)
    self.assertEqual(it, 1)

    for i in range(100):
        cXY, objective, it = ebc.run()  # random initialization
        # Single-argument print works identically on Python 2 and 3.
        print("%s %s %s" % (cXY, objective, it))
class TestBenchmarkEBC(unittest.TestCase):
    """ Benchmark the EBC code as a unittest, using the sparse matrix data. """

    def setUp(self):
        with open("resources/matrix-ebc-paper-sparse.tsv", "r") as f:
            data = []
            for line in f:
                sl = line.split("\t")
                if len(sl) < 5:  # headers
                    continue
                data.append([sl[0], sl[2], float(sl[4])])

        self.matrix = SparseMatrix([14052, 7272])
        self.matrix.read_data(data)
        self.matrix.normalize()

    def testEbcOnSparseMatrix(self):
        """Run EBC once and check the cell count and the used cluster ids."""
        ebc = EBC(self.matrix, [30, 125], 10, 1e-10, 0.01)
        cXY, objective, it = ebc.run()
        # Single-argument print works identically on Python 2 and 3.
        print("objective:  %s" % (objective,))
        print("iterations:  %s" % (it,))
        self.assertEqual(len(ebc.pXY.nonzero_elements), 29456)
        self.assertEqual(len(set(ebc.cXY[0])), 30)
        self.assertEqual(len(set(ebc.cXY[1])), 125)
def setUp(self):
    """Build a 6x6x6 matrix of three disjoint 2x2x2 blocks and sanity-check EBC.

    Note that this variant constructs EBC with four arguments and unpacks two
    values from ``run()``; the matrix and the EBC object are kept local and
    are not stored on ``self``.
    """
    data = [
        [0, 0, 0, 1.0], [0, 0, 1, 1.0], [0, 1, 0, 1.0], [0, 1, 1, 1.0],
        [1, 0, 0, 1.0], [1, 0, 1, 1.0], [1, 1, 0, 1.0], [1, 1, 1, 1.0],
        [2, 2, 2, 1.0], [2, 2, 3, 1.0], [2, 3, 2, 1.0], [3, 2, 2, 1.0],
        [2, 3, 3, 1.0], [3, 3, 2, 1.0], [3, 2, 3, 1.0], [3, 3, 3, 1.0],
        [4, 4, 4, 1.0], [4, 4, 5, 1.0], [4, 5, 4, 1.0], [4, 5, 5, 1.0],
        [5, 4, 4, 1.0], [5, 4, 5, 1.0], [5, 5, 4, 1.0], [5, 5, 5, 1.0],
    ]
    matrix = SparseMatrix([6, 6, 6])
    matrix.read_data(data)
    matrix.normalize()

    ebc = EBC(matrix, [3, 3, 3], 10, 1e-10)
    assigned_C = [[0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 2, 2]]

    # starting from the block-aligned assignment must leave it unchanged
    cXY, objective = ebc.run(assigned_C)
    self.assertEqual(cXY, assigned_C)
    self.assertAlmostEqual(objective, 0.0)

    cXY, objective = ebc.run()  # random initialization
    self.assertAlmostEqual(objective, 0.0)
class TestEbc(unittest.TestCase):
    """Tests for data loading and EBC runs on small and paper-sized matrices."""

    def setUp(self):
        self.data = [
            ["0", "0", 0.05], ["0", "1", 0.05], ["0", "2", 0.05],
            ["0", "3", 0.00], ["0", "4", 0.00], ["0", "5", 0.00],
            ["1", "0", 0.05], ["1", "1", 0.05], ["1", "2", 0.05],
            ["1", "3", 0.00], ["1", "4", 0.00], ["1", "5", 0.00],
            ["2", "0", 0.00], ["2", "1", 0.00], ["2", "2", 0.00],
            ["2", "3", 0.05], ["2", "4", 0.05], ["2", "5", 0.05],
            ["3", "0", 0.00], ["3", "1", 0.00], ["3", "2", 0.00],
            ["3", "3", 0.05], ["3", "4", 0.05], ["3", "5", 0.05],
            ["4", "0", 0.04], ["4", "1", 0.04], ["4", "2", 0.00],
            ["4", "3", 0.04], ["4", "4", 0.04], ["4", "5", 0.04],
            ["5", "0", 0.04], ["5", "1", 0.04], ["5", "2", 0.04],
            ["5", "3", 0.00], ["5", "4", 0.04], ["5", "5", 0.04],
        ]
        self.matrix = SparseMatrix([6, 6])
        self.matrix.read_data(self.data)

    def testDataLoad(self):
        """Only the non-zero cells of the input must be stored."""
        self.assertEqual(
            sorted(self.matrix.nonzero_elements.items(), key=itemgetter(0)),
            [((0, 0), 0.05), ((0, 1), 0.05), ((0, 2), 0.05),
             ((1, 0), 0.05), ((1, 1), 0.05), ((1, 2), 0.05),
             ((2, 3), 0.05), ((2, 4), 0.05), ((2, 5), 0.05),
             ((3, 3), 0.05), ((3, 4), 0.05), ((3, 5), 0.05),
             ((4, 0), 0.04), ((4, 1), 0.04), ((4, 3), 0.04),
             ((4, 4), 0.04), ((4, 5), 0.04),
             ((5, 0), 0.04), ((5, 1), 0.04), ((5, 2), 0.04),
             ((5, 4), 0.04), ((5, 5), 0.04)])

    def testOldMatrix(self):
        """Run EBC on the dense 2-D paper matrix and check the cluster counts."""
        with open("resources/matrix-ebc-paper-dense.tsv", "r") as f:
            data = []
            for line in f:
                sl = line.split("\t")
                if len(sl) < 5:  # headers
                    continue
                data.append([sl[0], sl[2], float(sl[4])])

        matrix = SparseMatrix([3514, 1232])
        matrix.read_data(data)
        matrix.normalize()

        ebc = EBC(matrix, [30, 125], 10, 1e-10, 0.01)
        cXY, objective, it = ebc.run()
        # Single-argument print works identically on Python 2 and 3.
        print("objective:  %s" % (objective,))
        print("iterations:  %s" % (it,))
        self.assertEqual(len(ebc.pXY.nonzero_elements), 10007)
        self.assertEqual(len(set(ebc.cXY[0])), 30)
        self.assertEqual(len(set(ebc.cXY[1])), 125)

    def testOldMatrix3d(self):
        """Run EBC on the dense 3-D paper matrix and check the cluster counts."""
        with open("resources/matrix-ebc-paper-dense-3d.tsv", "r") as f:
            data = []
            for line in f:
                sl = line.split("\t")
                data.append([sl[0], sl[1], sl[2], float(sl[3])])

        matrix = SparseMatrix([756, 996, 1232])
        matrix.read_data(data)
        matrix.normalize()

        ebc = EBC(matrix, [30, 30, 10], 100, 1e-10, 0.01)
        cXY, objective, it = ebc.run()
        print("objective:  %s" % (objective,))
        print("iterations:  %s" % (it,))
        self.assertEqual(len(ebc.pXY.nonzero_elements), 10007)
        self.assertEqual(len(set(ebc.cXY[0])), 30)
        self.assertEqual(len(set(ebc.cXY[1])), 30)
        self.assertEqual(len(set(ebc.cXY[2])), 10)

    def test3DMatrix(self):
        """Check EBC on a 6x6x6 matrix of three disjoint 2x2x2 blocks of ones.

        The 100 randomly initialised runs at the end are only printed.
        """
        data = [
            [0, 0, 0, 1.0], [0, 0, 1, 1.0], [0, 1, 0, 1.0], [0, 1, 1, 1.0],
            [1, 0, 0, 1.0], [1, 0, 1, 1.0], [1, 1, 0, 1.0], [1, 1, 1, 1.0],
            [2, 2, 2, 1.0], [2, 2, 3, 1.0], [2, 3, 2, 1.0], [3, 2, 2, 1.0],
            [2, 3, 3, 1.0], [3, 3, 2, 1.0], [3, 2, 3, 1.0], [3, 3, 3, 1.0],
            [4, 4, 4, 1.0], [4, 4, 5, 1.0], [4, 5, 4, 1.0], [4, 5, 5, 1.0],
            [5, 4, 4, 1.0], [5, 4, 5, 1.0], [5, 5, 4, 1.0], [5, 5, 5, 1.0],
        ]
        matrix = SparseMatrix([6, 6, 6])
        matrix.read_data(data)
        matrix.normalize()

        ebc = EBC(matrix, [3, 3, 3], 10, 1e-10, 0.01)
        assigned_C = [[0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 2, 2], [0, 0, 1, 1, 2, 2]]
        cXY, objective, it = ebc.run(assigned_C)
        self.assertEqual(cXY, assigned_C)
        self.assertAlmostEqual(objective, 0.0)
        self.assertEqual(it, 1)

        for i in range(100):
            cXY, objective, it = ebc.run()  # random initialization
            print("%s %s %s" % (cXY, objective, it))
class TestSanityCheck(unittest.TestCase):
    """ Do a sanity check for the EBC code, using the data from the original ITCC paper. """

    # Expected approximate distribution, indexed as EXPECTED_APPROXIMATION[x][y];
    # both tests below assert these same 36 values.
    EXPECTED_APPROXIMATION = [
        [0.054, 0.054, 0.042, 0.0, 0.0, 0.0],
        [0.054, 0.054, 0.042, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 0.042, 0.054, 0.054],
        [0.0, 0.0, 0.0, 0.042, 0.054, 0.054],
        [0.036, 0.036, 0.028, 0.028, 0.036, 0.036],
        [0.036, 0.036, 0.028, 0.028, 0.036, 0.036],
    ]

    def setUp(self):
        with open("resources/matrix-itcc-paper-orig.tsv", "r") as f:
            data = [l.split('\t') for l in f]

        self.matrix = SparseMatrix([6, 6])
        self.matrix.read_data(data)
        self.matrix.normalize()

    def cartesian(self, arrays, out=None):
        """Return the Cartesian product of ``arrays`` as rows of a 2-D array.

        ``out`` is used internally by the recursion to fill a slice of the
        result in place; callers normally leave it as None.
        """
        arrays = [np.asarray(x) for x in arrays]
        dtype = arrays[0].dtype

        n = np.prod([x.size for x in arrays])
        if out is None:
            out = np.zeros([n, len(arrays)], dtype=dtype)

        # Floor division keeps m an integer on Python 3 as well; it is used
        # below as a repeat count and as a slice bound.
        m = n // arrays[0].size
        out[:, 0] = np.repeat(arrays[0], m)
        if arrays[1:]:
            self.cartesian(arrays[1:], out=out[0:m, 1:])
            # copy the first block of remaining columns into every other block
            for j in range(1, arrays[0].size):
                out[j * m:(j + 1) * m, 1:] = out[0:m, 1:]
        return out

    def _assert_expected_approximation(self, approx_distribution):
        """Assert every cell of ``approx_distribution`` against the expected table."""
        for x, expected_row in enumerate(self.EXPECTED_APPROXIMATION):
            for y, expected in enumerate(expected_row):
                self.assertAlmostEqual(approx_distribution[(x, y)], expected)

    def testEbcOnSparseMatrix(self):
        """Check the approximation EBC produces for a fixed cluster assignment."""
        ebc = EBC(self.matrix, [3, 2], 10, 1e-10, 0.01)
        cXY, objective, it = ebc.run(verbose=False)
        # Single-argument print works identically on Python 2 and 3.
        print("--> ebc")
        print("objective:  %s" % (objective,))
        print("iterations:  %s" % (it,))

        ebc = EBC(self.matrix, [3, 2], 10, 1e-10, 0.01)
        ebc.run(assigned_clusters=[[2, 0, 1, 1, 2, 2], [0, 0, 1, 0, 1, 1]], verbose=False)

        indices = [range(N_d) for N_d in ebc.pXY.N]
        index_list = self.cartesian(indices)
        approx_distribution = {}
        for location in index_list:
            q = 1.0
            c_location = []
            for i, index in enumerate(location):
                c_location.append(ebc.cXY[i][index])
                q *= ebc.qXxHat[i][index]
            q *= ebc.qXhatYhat.get(tuple(c_location))
            approx_distribution[tuple(location)] = q

        self._assert_expected_approximation(approx_distribution)

    def testEbc2dOnSparseMatrix(self):
        """Same check as testEbcOnSparseMatrix, using the EBC2D implementation."""
        with open("resources/matrix-itcc-paper-orig.tsv", "r") as f:
            data = [l.split('\t') for l in f]
        m = ebc2d.get_matrix_from_data(data)

        # run without assigned clusters
        ebc = EBC2D(m, [3, 2], 10, 1e-10, 0.01)
        cXY, objective, it = ebc.run(verbose=False)
        print("--> ebc2d")
        print("objective:  %s" % (objective,))
        print("iterations:  %s" % (it,))

        # run with assigned clusters
        ebc = EBC2D(m, [3, 2], 10, 1e-10, 0.01)
        cXY, objective, it = ebc.run(assigned_clusters=[[2, 0, 1, 1, 2, 2], [0, 0, 1, 0, 1, 1]],
                                     verbose=False)

        indices = [range(N_d) for N_d in ebc.pXY.shape]
        index_list = self.cartesian(indices)
        approx_distribution = {}
        qX_xhat = [ebc.qX_xhat, ebc.qY_yhat]
        for location in index_list:
            q = 1.0
            c_location = []
            for i, index in enumerate(location):
                c_location.append(cXY[i][index])
                q *= qX_xhat[i][index]
            q *= ebc.qXhatYhat[c_location[0], c_location[1]]
            approx_distribution[tuple(location)] = q

        self._assert_expected_approximation(approx_distribution)