def contactsTreeWalkGenerator(namesTrain, vectorsTrain, namesBase, vectorsBase, minYesThreshold=60, topPercent=5):
    """Yield (training subset, top similar base compounds) pairs.

    A tree is built from the distance matrix of the training set, and a
    training-to-base similarity matrix is computed once up front. Every
    subset produced by ``findYesClades`` for that tree is then paired with
    the result of ``getTopSimilarCompounds`` for the same subset.

    Args:
        namesTrain: Names of the training entries; zipped with
            ``vectorsTrain`` to form a name -> vector mapping.
        vectorsTrain: Vectors of the training entries, in the same order
            as ``namesTrain``.
        namesBase: Names of the base entries, forwarded to
            ``getTrainigToBaseSimilarityMatrix``.
        vectorsBase: Vectors of the base entries, forwarded to
            ``getTrainigToBaseSimilarityMatrix``.
        minYesThreshold: Threshold forwarded to ``findYesClades``
            (presumably a percentage -- confirm against that helper).
        topPercent: Forwarded to ``getTopSimilarCompounds`` as ``topPer``
            (presumably a percentage of the base set -- confirm).

    Yields:
        Tuples ``(subset, topNames)`` where ``subset`` comes from
        ``findYesClades`` and ``topNames`` is whatever
        ``getTopSimilarCompounds`` returns for it.
    """
    # The name -> vector mapping is built before the distance matrix, so the
    # inputs are consumed in the same order as before.
    trainVectorByName = dict(zip(namesTrain, vectorsTrain))

    trainDistanceMatrix = getDistanceMatrix(namesTrain, vectorsTrain)
    trainTree = distanceMatrixToTree(trainDistanceMatrix)

    # Computed once and reused for every subset below.
    trainToBase = getTrainigToBaseSimilarityMatrix(trainVectorByName, namesBase, vectorsBase)

    for subset in findYesClades(trainTree, minYesThreshold):
        yield subset, getTopSimilarCompounds(trainToBase, subset, topPer=topPercent)
def chemTreeWalkGenerator(namesTrain, vectorsTrain, namesBase, vectorsBase, minYesThreshold=75, topPercent=2):
    """Yield (training subset, top similar base compounds) pairs.

    A tree is built from the distance matrix of the training set, and a
    training-to-base similarity matrix is computed once up front. Every
    subset produced by ``findYesClades`` for that tree is then paired with
    the result of ``getTopSimilarCompounds`` for the same subset.

    Args:
        namesTrain: Names of the training entries; zipped with
            ``vectorsTrain`` to form a name -> vector mapping.
        vectorsTrain: Vectors of the training entries, in the same order
            as ``namesTrain``.
        namesBase: Names of the base entries, forwarded to
            ``getTrainigToBaseSimilarityMatrix``.
        vectorsBase: Vectors of the base entries, forwarded to
            ``getTrainigToBaseSimilarityMatrix``.
        minYesThreshold: Threshold forwarded to ``findYesClades``
            (presumably a percentage -- confirm against that helper).
        topPercent: Forwarded to ``getTopSimilarCompounds`` as ``topPer``
            (presumably a percentage of the base set -- confirm).

    Yields:
        Tuples ``(subset, topNames)`` where ``subset`` comes from
        ``findYesClades`` and ``topNames`` is whatever
        ``getTopSimilarCompounds`` returns for it.
    """
    # The name -> vector mapping is built before the distance matrix, so the
    # inputs are consumed in the same order as before.
    trainVectorByName = dict(zip(namesTrain, vectorsTrain))

    trainDistanceMatrix = getDistanceMatrix(namesTrain, vectorsTrain)
    trainTree = distanceMatrixToTree(trainDistanceMatrix)

    # Computed once and reused for every subset below.
    trainToBase = getTrainigToBaseSimilarityMatrix(trainVectorByName, namesBase, vectorsBase)

    for subset in findYesClades(trainTree, minYesThreshold):
        yield subset, getTopSimilarCompounds(trainToBase, subset, topPer=topPercent)