def generateInitialDistanceMatrix(cls, test=False):
    """Populate ``cls.simMatrix`` with the initial n x n distance matrix.

    Three sources are tried, in this order:

    1. ``test`` is truthy: a fixed 5x5 symmetric sample matrix is installed.
    2. ``data/simMat_3.pkl`` exists: the matrix is unpickled from that file.
    3. Otherwise ``computeDistance`` is evaluated on the first sequence of
       every pair of clusters (diagonal included), the symmetric result is
       stored, and the matrix is pickled to ``data/simMat_3.pkl``.

    Args:
        test: When truthy, use the hard-coded sample matrix instead of the
            dataset.

    Returns:
        None. The matrix is stored on ``cls.simMatrix``.
    """
    # For testing purposes: small hand-written symmetric matrix.
    if test:
        cls.simMatrix = np.array(
            [[0, 9, 3, 6, 11], [9, 0, 7, 5, 10], [3, 7, 0, 9, 2],
             [6, 5, 9, 0, 8], [11, 10, 2, 8, 0]],
            dtype=float)
        return

    pickleFilePath = Path('data/simMat_3.pkl')
    if pickleFilePath.exists():
        # NOTE(review): pickle.load executes arbitrary code from the file, so
        # the cache must come from a trusted source. The cached matrix is
        # also not checked against cls.ClusterCount here.
        with open(pickleFilePath, 'rb') as file:
            cls.simMatrix = pickle.load(file)
        return

    # Make sure the cache directory exists *before* the expensive pairwise
    # computation, so a missing ``data/`` folder cannot discard the result.
    pickleFilePath.parent.mkdir(parents=True, exist_ok=True)

    clusterCount = cls.ClusterCount
    # Fetch the first sequence of each cluster once instead of once per pair.
    firstSequences = [
        cls.getClusterById(cID).sequences[0] for cID in range(clusterCount)
    ]

    # Bound to the class up front and filled in place.
    cls.simMatrix = simMatrix = np.ones((clusterCount, clusterCount))
    for cID, seqA in enumerate(firstSequences):
        # Only the upper triangle is computed; each value is mirrored.
        for _cID in range(cID, clusterCount):
            distance = computeDistance(seqA, firstSequences[_cID])
            simMatrix[cID, _cID] = distance
            simMatrix[_cID, cID] = distance

    # Save the matrix so later runs can skip the computation.
    with open(pickleFilePath, 'wb') as file:
        pickle.dump(simMatrix, file)
Exemplo n.º 2
0
def genSimilarityMatrix(data):
    """Return the pairwise matrix for ``data`` together with its key order.

    The matrix is cached in ``data/simMatrix_K_Mediods.pkl``. A cached matrix
    is reused only when its shape is ``(len(data), len(data))``; otherwise it
    is recomputed with ``computeDistance`` over every pair of values
    (diagonal included), printed, and written back to the cache.

    Args:
        data: Mapping whose values are passed pairwise to ``computeDistance``.

    Returns:
        A ``(simMatrix, indexes)`` tuple, where ``indexes`` is the list of
        keys of ``data`` in iteration order and row/column ``i`` of
        ``simMatrix`` corresponds to ``indexes[i]``.
    """
    indexes = list(data)
    ClusterCount = len(indexes)
    pickleFilePath = Path('data/simMatrix_K_Mediods.pkl')

    if pickleFilePath.exists():
        # NOTE(review): pickle.load executes arbitrary code from the file, so
        # the cache must come from a trusted source.
        with open(pickleFilePath, 'rb') as file:
            simMatrix = pickle.load(file)
        # A cache built for a different number of items cannot line up with
        # ``indexes``; in that case fall through and rebuild it.
        if np.shape(simMatrix) == (ClusterCount, ClusterCount):
            return simMatrix, indexes

    # Make sure the cache directory exists *before* the expensive pairwise
    # computation, so a missing ``data/`` folder cannot discard the result.
    pickleFilePath.parent.mkdir(parents=True, exist_ok=True)

    values = [data[key] for key in indexes]
    simMatrix = np.ones((ClusterCount, ClusterCount))
    for cID, strA in enumerate(values):
        # Only the upper triangle is computed; each value is mirrored.
        for _cID in range(cID, ClusterCount):
            similarity = computeDistance(strA, values[_cID])
            simMatrix[cID, _cID] = similarity
            simMatrix[_cID, cID] = similarity

    print(simMatrix)
    with open(pickleFilePath, 'wb') as file:
        pickle.dump(simMatrix, file)
    return simMatrix, indexes