def _leaf_order_one_based(clusters):
    """Return every integer found in ``str(clusters)``, in order, shifted by +1."""
    # +1 so the order matches R's 1-based indices.
    return [int(token) + 1 for token in re.findall(r'\d+', str(clusters))]


def _write_hcl_results(merge, height, order, suffix):
    """Write merge/height/order as ``temp/<name>_<suffix>.feather`` files."""
    for name, values in (("merge", merge), ("height", height),
                         ("order", order)):
        feather.write_dataframe(pd.DataFrame(values),
                                "temp/{}_{}.feather".format(name, suffix))


def hier_cluster2(doQuantum=True, mode=2, backendcode=0):
    """Hierarchical clustering on rows (samples) instead of columns.

    Reads the training data frame from the module-level ``filepath`` (formatted
    with ``'lumAB'`` and ``'train'`` when it contains ``"{}"``), uses columns
    2 onward as the feature matrix, builds the pairwise Euclidean distance
    matrix between rows (rounded to one decimal) and runs ``agglomerate`` on it.
    The merge, height and order results are printed and written to
    ``temp/*_c.feather`` (and ``temp/*_q.feather`` for the quantum run).

    Args:
        doQuantum: if true, additionally run ``agglomerate`` with ``mode`` and
            ``backendcode`` and print how the resulting tree compares with the
            classical one.
        mode: forwarded as ``mode=`` to ``agglomerate`` for the quantum run.
        backendcode: forwarded as ``backendcode=`` to ``agglomerate`` for the
            quantum run.

    Returns:
        ``[clusters_c, clusters_q]``. ``clusters_q`` is ``None`` when
        ``doQuantum`` is false (previously this case failed on the unassigned
        name instead of returning the classical result).
    """
    # TODO: merge hier_cluster2 into hier_cluster; the two are almost identical
    # except for the data pre-processing and the distance-matrix calculation.
    # Loading of the data should also be more customizable.
    if "{}" in filepath:
        # Which dataset to use; this and `filepath` itself are customizable.
        source_path = filepath.format('lumAB', 'train')
    else:
        source_path = filepath
    dataset_train = feather.read_dataframe(source_path)
    X_train = dataset_train.iloc[:, 2:].values

    nindices = len(X_train)
    indices = list(range(nindices))
    print("Distance matrix of ", nindices, "samples")
    dist_matrix = [[
        np.round(np.linalg.norm(X_train[i] - X_train[j]), 1) for i in indices
    ] for j in indices]

    if doQuantum:
        # Copies are taken before the classical run so the quantum run starts
        # from untouched inputs (presumably agglomerate mutates its arguments;
        # confirm against its definition).
        indices_q = indices.copy()
        dist_matrix_q = deepcopy(dist_matrix)
        # Need to map ntrial*(ntrial-1)/2 distances to the numbers
        # 0 to 2**nqubits-1:
        #   nqubits = int(np.ceil(num_qubits_from_samples(nindices)))
        # Use the same number of qubits to represent each index. Note that for
        # some ntrial several qubits can be saved with more complex mappings,
        # e.g. ntrial = 11: each index takes 4 qubits (2**4 > 11), or 7 qubits
        # in total can be used (2**7 > 11^2). Then get rid of the duplicate
        # half of the distances, since d(i,j) = d(j,i).

    start_time = time()
    # Third positional argument 0: presumably the classical mode of agglomerate.
    clusters_c, merge_c, height_c = agglomerate(indices, dist_matrix, 0)
    order_c = _leaf_order_one_based(clusters_c)
    tree_c = PhyloTree(str(clusters_c) + ";")
    print(clusters_c)
    print("merge_c ", merge_c)
    print("height_c ", height_c)
    print("Classical HCl takes ", round(time() - start_time, 4), "s\n")
    _write_hcl_results(merge_c, height_c, order_c, "c")

    # Defined up front so the return below is valid when doQuantum is false.
    clusters_q = None
    if doQuantum:
        start_time = time()
        clusters_q, merge_q, height_q = agglomerate(indices_q,
                                                    dist_matrix_q,
                                                    mode=mode,
                                                    backendcode=backendcode)
        order_q = _leaf_order_one_based(clusters_q)
        tree_q = PhyloTree(str(clusters_q) + ";")
        print(clusters_q)
        print("merge_q ", merge_q)
        print("height_q ", height_q)
        print("Quantum HCl takes ", acs.timestring(time() - start_time), "\n")
        qctree_sim = tree_q.compare(tree_c)['source_edges_in_ref']
        print("Similarity score, quantum to classic:", round(qctree_sim, 4))
        _write_hcl_results(merge_q, height_q, order_q, "q")

    return [clusters_c, clusters_q]
# Old file: stores first-generation HCl (hierarchical clustering) results from
# the classical and quantum runs as Newick strings and uses them to create ete3
# PhyloTree objects, to find the edges in common via the Robinson-Foulds metric.
# This metric has been incorporated into the main script.
from ete3 import PhyloTree
import re  # NOTE(review): not used in the visible part of this script

# top 44 genes of PC1 for lumAB
# Newick string of the classical clustering tree.
classical44 = "(((((((((0,143),(32,84)),(53,217)),(93,153)),(154,232)),(19,117)),(((((15,137),69),49),247),71)),((((((1,118),(189,193)),((215,233),241)),((24,((99,185),114)),((80,147),167))),(((12,110),((20,61),136)),(56,145))),(((8,160),(((25,163),((46,82),105)),180)),(((((17,(50,125)),(((23,55),(94,112)),((103,234),226))),64),((((18,128),(60,109)),(37,223)),(((62,230),221),(((73,227),130),146)))),(((36,74),((115,244),225)),(((111,220),192),(121,(149,169)))))))),(((((((((2,248),((33,63),59)),((3,(70,159)),240)),((((28,(212,(222,239))),(102,144)),(158,188)),(98,(172,173)))),(((67,85),165),((75,141),(92,101)))),((((((26,184),100),245),97),(48,176)),(((27,127),((40,122),87)),(135,162)))),(((((7,68),120),((((21,208),72),205),(((58,197),238),((140,210),155)))),(((10,(150,219)),131),224)),(((47,(((54,191),132),(190,(228,242)))),((79,83),(((129,204),166),243))),((126,174),(142,151))))),(((6,(45,206)),(((11,35),((14,124),231)),(171,214))),(((((29,235),31),(42,202)),(179,201)),(((44,199),(57,229)),(((78,106),90),((89,178),(168,216))))))),(((((4,196),157),((((((9,139),161),((30,39),164)),41),(108,(134,156))),((96,186),123))),236),((((((5,198),66),200),(((65,77),(133,246)),95)),(((((13,(81,107)),((86,(148,194)),170)),((22,203),104)),(34,(38,52))),(((((116,177),207),213),138),(152,(181,249))))),(((16,(43,91)),((((51,119),113),((76,209),175)),(182,218))),(((88,195),237),((183,187),211)))))));"
# Newick string of the quantum clustering tree.
quantum44 = "(((((((0,143),(53,217)),((93,153),160)),(19,117)),((((8,((17,32),24)),(154,232)),((15,137),69)),(71,247))),(((((((1,(50,125)),((99,114),185)),(((47,(94,112)),110),(84,((103,234),226)))),64),(6,((80,147),167))),((((36,206),(118,193)),((215,233),241)),((62,((73,227),130)),221))),(((12,((20,61),136)),(56,145)),(((((18,128),223),146),((((55,169),60),109),(121,149))),(((((25,46),163),(37,(74,(82,105)))),180),49))))),((((((((2,10),(168,216)),((44,199),(57,229))),(((27,127),((40,122),87)),(135,162))),((((((26,184),(100,204)),(48,176)),(141,245)),((111,(192,220)),((115,244),225))),(((((54,191),132),(190,((228,248),242))),(150,224)),(((79,129),166),243)))),((((((3,(70,159)),(97,240)),(((33,63),59),(98,(172,173)))),((((((28,222),(212,239)),158),(65,((102,144),249))),(34,(38,52))),((83,238),((126,188),174)))),((142,151),157)),((((7,68),120),((((21,208),72),((58,(197,219)),75)),(131,(((140,210),155),205)))),(((67,85),165),95)))),(((((11,35),((14,124),231)),45),(((23,(92,101)),(214,230)),(171,189))),(((((29,235),31),(42,202)),(179,201)),(((78,106),90),(89,178))))),((((4,196),((((((9,139),30),(39,164)),((43,91),(((51,119),((76,209),175)),108))),(41,((134,161),156))),(96,(123,186)))),236),((((((5,198),66),200),((88,195),237)),((((16,(86,113)),(182,218)),(138,152)),(77,(133,246)))),(((((13,((81,194),107)),(((22,181),(148,203)),170)),104),(((116,207),177),213)),((183,187),211))))));"

# Parse both Newick strings into ete3 trees.
ct44 = PhyloTree(classical44)
qt44 = PhyloTree(quantum44)
# Compare the quantum tree (source) against the classical tree (reference).
# 'source_edges_in_ref' is presumably the share of the quantum tree's edges that
# also occur in the classical tree; confirm against the ete3 documentation.
diff_44 = qt44.compare(ct44)['source_edges_in_ref']
# Report the score with four decimals.
print(
    "classical vs quantum HCl tree similarity on lumAB training set for top n pc1 genes:\n n=44: {:.4f}"
    .format(diff_44))