예제 #1
0
def hier_cluster2(doQuantum=True, mode=2, backendcode=0):
    """Hierarchical clustering on rows (samples) instead of columns.

    Reads the training dataframe located by the module-level ``filepath``,
    builds the full sample-by-sample distance matrix (vector norm of the row
    difference, rounded to one decimal) and runs ``agglomerate`` on it: once
    with ``0`` as third positional argument (the "classical" run) and, if
    requested, once more with ``mode`` / ``backendcode`` (the "quantum" run).
    The merge, height and order results of each run are written to
    ``temp/*.feather``.

    Args:
        doQuantum: when true, also perform the second ``agglomerate`` run and
            print a similarity score between the two resulting trees.
        mode: forwarded as ``mode`` to the second ``agglomerate`` call.
        backendcode: forwarded as ``backendcode`` to the second
            ``agglomerate`` call.

    Returns:
        ``[clusters_c, clusters_q]`` when ``doQuantum`` is true, otherwise
        ``None``.
    """
    # TODO: merge hier_cluster2 into hier_cluster -- the two are almost identical apart from the data
    # pre-processing and the calculation of the distance matrix. Loading of the data should also be
    # more customizable.

    def one_based_order(cluster_text):
        # Every integer found in the printed clusters; add 1 so order matches R indices.
        return [int(token) + 1 for token in re.findall(r'\d+', cluster_text)]

    # Which dataset to use: this part, like filepath itself, is customizable.
    if "{}" in filepath:
        source_path = filepath.format('lumAB', 'train')
    else:
        source_path = filepath
    frame = feather.read_dataframe(source_path)
    sample_rows = frame.iloc[:, 2:].values
    y_train = frame.iloc[:, 1].values  # extracted but not used further in this function

    nindices = len(sample_rows)
    indices = list(range(nindices))
    print("Distance matrix of ", nindices, "samples")
    dist_matrix = []
    for j in indices:
        row = []
        for i in indices:
            row.append(
                np.round(np.linalg.norm(sample_rows[i] - sample_rows[j]), 1))
        dist_matrix.append(row)

    if doQuantum:
        # Independent copies for the second run, taken before the first agglomerate call
        # (presumably agglomerate modifies its arguments in place -- confirm).
        indices_q = list(indices)
        dist_matrix_q = deepcopy(dist_matrix)

    # Notes on the quantum encoding: ntrial*(ntrial-1)/2 distances need to be mapped to the numbers
    # 0 to 2**nqubits-1, e.g. nqubits = int(np.ceil(num_qubits_from_samples(nindices))).
    # The same number of qubits is used to represent each index. For some ntrial several qubits can
    # be saved with more complex mappings, e.g. ntrial = 11: 4 qubits per index (2**4 > 11) versus
    # 7 qubits in total (2**7 > 11^2). Duplicate half distances can then be dropped, since
    # d(i,j) = d(j,i).
    start_time = time()
    clusters_c, merge_c, height_c = agglomerate(indices, dist_matrix, 0)
    order_c = one_based_order(str(clusters_c))
    tree_c = PhyloTree(str(clusters_c) + ";")

    print(clusters_c)
    print("merge_c ", merge_c)
    print("height_c ", height_c)
    elapsed_c = time() - start_time
    print("Classical HCl takes ", round(elapsed_c, 4), "s\n")

    for payload, target in ((merge_c, "temp/merge_c.feather"),
                            (height_c, "temp/height_c.feather"),
                            (order_c, "temp/order_c.feather")):
        feather.write_dataframe(pd.DataFrame(payload), target)

    if not doQuantum:
        # Classical-only run: nothing is returned.
        return None

    start_time = time()
    clusters_q, merge_q, height_q = agglomerate(indices_q,
                                                dist_matrix_q,
                                                mode=mode,
                                                backendcode=backendcode)
    order_q = one_based_order(str(clusters_q))
    tree_q = PhyloTree(str(clusters_q) + ";")

    print(clusters_q)
    print("merge_q ", merge_q)
    print("height_q ", height_q)
    print("Quantum HCl takes ", acs.timestring(time() - start_time), "\n")

    # Compare the quantum tree against the classical tree as reference.
    comparison = tree_q.compare(tree_c)
    qctree_sim = comparison['source_edges_in_ref']
    print("Similarity score, quantum to classic:", round(qctree_sim, 4))

    for payload, target in ((merge_q, "temp/merge_q.feather"),
                            (height_q, "temp/height_q.feather"),
                            (order_q, "temp/order_q.feather")):
        feather.write_dataframe(pd.DataFrame(payload), target)

    return [clusters_c, clusters_q]
예제 #2
0
# Old file: stores the first-generation hierarchical clustering (HCl) results
# from the classical and quantum runs and uses them to create ete3 PhyloTree
# objects, in order to find the edges the two trees have in common via the
# Robinson-Foulds metric. This metric has since been incorporated into the main script.

from ete3 import PhyloTree
import re

# Hierarchical clustering results for the top 44 genes of PC1 for lumAB, stored
# as nested, parenthesised, ';'-terminated tree strings with integer leaf labels.
# classical44 is the classical result and quantum44 the quantum one; both are
# handed to PhyloTree further down.

classical44 = "(((((((((0,143),(32,84)),(53,217)),(93,153)),(154,232)),(19,117)),(((((15,137),69),49),247),71)),((((((1,118),(189,193)),((215,233),241)),((24,((99,185),114)),((80,147),167))),(((12,110),((20,61),136)),(56,145))),(((8,160),(((25,163),((46,82),105)),180)),(((((17,(50,125)),(((23,55),(94,112)),((103,234),226))),64),((((18,128),(60,109)),(37,223)),(((62,230),221),(((73,227),130),146)))),(((36,74),((115,244),225)),(((111,220),192),(121,(149,169)))))))),(((((((((2,248),((33,63),59)),((3,(70,159)),240)),((((28,(212,(222,239))),(102,144)),(158,188)),(98,(172,173)))),(((67,85),165),((75,141),(92,101)))),((((((26,184),100),245),97),(48,176)),(((27,127),((40,122),87)),(135,162)))),(((((7,68),120),((((21,208),72),205),(((58,197),238),((140,210),155)))),(((10,(150,219)),131),224)),(((47,(((54,191),132),(190,(228,242)))),((79,83),(((129,204),166),243))),((126,174),(142,151))))),(((6,(45,206)),(((11,35),((14,124),231)),(171,214))),(((((29,235),31),(42,202)),(179,201)),(((44,199),(57,229)),(((78,106),90),((89,178),(168,216))))))),(((((4,196),157),((((((9,139),161),((30,39),164)),41),(108,(134,156))),((96,186),123))),236),((((((5,198),66),200),(((65,77),(133,246)),95)),(((((13,(81,107)),((86,(148,194)),170)),((22,203),104)),(34,(38,52))),(((((116,177),207),213),138),(152,(181,249))))),(((16,(43,91)),((((51,119),113),((76,209),175)),(182,218))),(((88,195),237),((183,187),211)))))));"
quantum44 = "(((((((0,143),(53,217)),((93,153),160)),(19,117)),((((8,((17,32),24)),(154,232)),((15,137),69)),(71,247))),(((((((1,(50,125)),((99,114),185)),(((47,(94,112)),110),(84,((103,234),226)))),64),(6,((80,147),167))),((((36,206),(118,193)),((215,233),241)),((62,((73,227),130)),221))),(((12,((20,61),136)),(56,145)),(((((18,128),223),146),((((55,169),60),109),(121,149))),(((((25,46),163),(37,(74,(82,105)))),180),49))))),((((((((2,10),(168,216)),((44,199),(57,229))),(((27,127),((40,122),87)),(135,162))),((((((26,184),(100,204)),(48,176)),(141,245)),((111,(192,220)),((115,244),225))),(((((54,191),132),(190,((228,248),242))),(150,224)),(((79,129),166),243)))),((((((3,(70,159)),(97,240)),(((33,63),59),(98,(172,173)))),((((((28,222),(212,239)),158),(65,((102,144),249))),(34,(38,52))),((83,238),((126,188),174)))),((142,151),157)),((((7,68),120),((((21,208),72),((58,(197,219)),75)),(131,(((140,210),155),205)))),(((67,85),165),95)))),(((((11,35),((14,124),231)),45),(((23,(92,101)),(214,230)),(171,189))),(((((29,235),31),(42,202)),(179,201)),(((78,106),90),(89,178))))),((((4,196),((((((9,139),30),(39,164)),((43,91),(((51,119),((76,209),175)),108))),(41,((134,161),156))),(96,(123,186)))),236),((((((5,198),66),200),((88,195),237)),((((16,(86,113)),(182,218)),(138,152)),(77,(133,246)))),(((((13,((81,194),107)),(((22,181),(148,203)),170)),104),(((116,207),177),213)),((183,187),211))))));"
# Build one tree object per clustering string (classical first, then quantum).
ct44, qt44 = PhyloTree(classical44), PhyloTree(quantum44)

# Compare the quantum tree against the classical tree as reference and keep the
# 'source_edges_in_ref' entry of the result as the similarity score.
comparison_44 = qt44.compare(ct44)
diff_44 = comparison_44['source_edges_in_ref']

report_44 = "classical vs quantum HCl tree similarity on lumAB training set for top n pc1 genes:\n n=44: {:.4f}"
print(report_44.format(diff_44))