예제 #1
0
def calc_distance_matrix(mols):
    """
    Build the complete pairwise distance matrix for a collection of molecules.

    The distance of a pair is 1.0 minus its SuCOS score. A pair whose two molecules compare equal with ``==`` is
    assigned 0.0 directly, without being scored. (Presumably SuCOS scores lie in [0, 1], which would bound the
    distances by 1.0 - confirm against sucos.get_SucosScore.)
    :param mols: The molecules to compare. Features are generated once per molecule and reused for every pair
    :return: A list of N rows of N distances, N being the number of molecules; row i uses molecule i as the reference
    """

    # TODO - do we need to calculate both sides of the matrix? Tanimoto is supposed to be a symmetric distance measure,
    #  but the matrix that is generated does not seem to be symmetric.

    # Pair every molecule with its raw features up front so they are not regenerated for each comparison.
    annotated = [(mol, sucos.getRawFeatures(mol)) for mol in mols]

    def pair_distance(reference, query):
        ref_mol, ref_features = reference
        query_mol, query_features = query
        if ref_mol == query_mol:
            return 0.0
        # Only the combined SuCOS score contributes to the distance; the other two scores are discarded.
        sucos_score, _fm_score, _tani_score = sucos.get_SucosScore(
            ref_mol,
            query_mol,
            tani=True,
            ref_features=ref_features,
            query_features=query_features)
        return 1.0 - sucos_score

    return [[pair_distance(reference, query) for query in annotated]
            for reference in annotated]
예제 #2
0
def process(inputfilename, clusterfilenames, outputfilename, mode):
    """
    Score the molecules of an SD file against clustered reference molecules with SuCOS and write those that overlay.

    Every readable input molecule is compared with every reference molecule loaded from the cluster files. Molecules
    whose resulting SuCOS score is greater than zero are written to the output with the scores stored as properties
    (named ``Max_SuCOS_*`` or ``Cum_SuCOS_*`` depending on the mode); the others are logged and omitted. Molecules
    that cannot be read, or for which features cannot be generated, are logged and skipped.

    :param inputfilename: Path of the SD file containing the molecules to score
    :param clusterfilenames: Paths of the SD files containing the reference molecules, one file per cluster
    :param outputfilename: Path of the SD file the scored molecules are written to
    :param mode: 'max' to keep the scores of the best scoring reference molecule (also recording the name of its
        cluster file and its 1-based position among the molecules successfully loaded from that file), or 'cum' to
        sum the scores over all reference molecules
    :raises ValueError: If mode is neither 'max' nor 'cum'
    """

    # Fail fast: reject a bad mode before any file is read or the output file is created.
    if mode not in ('max', 'cum'):
        raise ValueError("Invalid mode: " + str(mode))

    all_clusters = {}
    for filename in clusterfilenames:
        cluster = []
        cluster_file = utils.open_file_for_reading(filename)
        try:
            suppl = Chem.ForwardSDMolSupplier(cluster_file)
            for i, mol in enumerate(suppl, start=1):
                if not mol:
                    utils.log("WARNING: failed to generate molecule", i,
                              "in cluster", filename)
                    continue
                try:
                    features = sucos.getRawFeatures(mol)
                except Exception:
                    # Best effort: a reference without features is skipped, but Ctrl-C / SystemExit still propagate.
                    utils.log("WARNING: failed to generate features for molecule",
                              i, "in cluster", filename)
                    continue
                cluster.append((mol, features))
        finally:
            cluster_file.close()
        all_clusters[filename] = cluster

    comparisons = 0
    output_file = None
    writer = None
    input_file = utils.open_file_for_reading(inputfilename)
    try:
        suppl = Chem.ForwardSDMolSupplier(input_file)
        output_file = utils.open_file_for_writing(outputfilename)
        writer = Chem.SDWriter(output_file)

        for mol_num, mol in enumerate(suppl, start=1):
            if not mol:
                utils.log("WARNING: failed to generate molecule", mol_num,
                          "in input")
                continue
            try:
                query_features = sucos.getRawFeatures(mol)
            except Exception:
                utils.log("WARNING: failed to generate features for molecule",
                          mol_num, "in input")
                continue

            # [SuCOS, feature map, protrude] - the best hit's scores in 'max' mode, running totals in 'cum' mode.
            scores = [0, 0, 0]
            cluster_name = None
            cluster_index = None
            for clusterfilename, cluster in all_clusters.items():
                for index, (hit, ref_features) in enumerate(cluster, start=1):
                    comparisons += 1
                    sucos_score, fm_score, vol_score = sucos.get_SucosScore(
                        hit,
                        mol,
                        tani=False,
                        ref_features=ref_features,
                        query_features=query_features)
                    if mode == 'max':
                        if sucos_score > scores[0]:
                            scores = [sucos_score, fm_score, vol_score]
                            cluster_name = clusterfilename
                            cluster_index = index
                    else:
                        scores[0] += sucos_score
                        scores[1] += fm_score
                        scores[2] += vol_score

            if scores[0] > 0:
                if mode == 'max':
                    # A positive max score implies a best hit was recorded, so cluster_name/index are set here.
                    cluster_file_name_only = os.path.basename(cluster_name)
                    mol.SetDoubleProp("Max_SuCOS_Score", scores[0])
                    mol.SetDoubleProp("Max_SuCOS_FeatureMap_Score", scores[1])
                    mol.SetDoubleProp("Max_SuCOS_Protrude_Score", scores[2])
                    mol.SetProp("Max_SuCOS_Cluster", cluster_file_name_only)
                    mol.SetIntProp("Max_SuCOS_Index", cluster_index)
                else:
                    mol.SetDoubleProp("Cum_SuCOS_Score", scores[0])
                    mol.SetDoubleProp("Cum_SuCOS_FeatureMap_Score", scores[1])
                    mol.SetDoubleProp("Cum_SuCOS_Protrude_Score", scores[2])

                writer.write(mol)
            else:
                utils.log("Molecule", mol_num,
                          "did not overlay. Omitting from results")

        writer.flush()
    finally:
        # Release everything that was opened, even when scoring or writing failed part-way through.
        if writer is not None:
            writer.close()
        if output_file is not None:
            output_file.close()
        input_file.close()

    utils.log("Completed", comparisons, "comparisons")
예제 #3
0
def process(inputfilename, clusterfilenames, outputfilename, filter_value,
            filter_field):
    """
    Score the molecules of an SD file against clustered reference molecules with SuCOS, recording both the best
    ('Max') and the summed ('Cum') scores, and write the molecules out, optionally filtered on one of their properties.

    Every readable input molecule is compared with every reference molecule loaded from the cluster files. The
    ``Max_SuCOS_*`` and ``Cum_SuCOS_*`` score properties are always set (values that are not greater than zero are
    stored as 0); ``Max_SuCOS_Cluster`` and ``Max_SuCOS_Index`` are only set when some reference scored above zero.
    Molecules that cannot be read, or for which features cannot be generated, are logged and skipped.

    :param inputfilename: Path of the SD file containing the molecules to score
    :param clusterfilenames: Paths of the SD files containing the reference molecules, one file per cluster
    :param outputfilename: Path of the SD file the scored molecules are written to
    :param filter_value: Threshold for filtering, or None for no filtering. A threshold of 0 is a valid filter
    :param filter_field: Name of the molecule property to filter on, or None/empty for no filtering. When filtering is
        active only molecules that have this property with a value strictly greater than filter_value are written
    """
    all_clusters = {}
    for filename in clusterfilenames:
        cluster = []
        cluster_file = utils.open_file_for_reading(filename)
        try:
            suppl = Chem.ForwardSDMolSupplier(cluster_file)
            for i, mol in enumerate(suppl, start=1):
                if not mol:
                    utils.log("WARNING: failed to generate molecule", i,
                              "in cluster", filename)
                    continue
                try:
                    features = sucos.getRawFeatures(mol)
                except Exception:
                    # Best effort: a reference without features is skipped, but Ctrl-C / SystemExit still propagate.
                    utils.log("WARNING: failed to generate features for molecule",
                              i, "in cluster", filename)
                    continue
                cluster.append((mol, features))
        finally:
            cluster_file.close()
        all_clusters[filename] = cluster

    # Compare against None so that a threshold of 0 / 0.0 is honoured rather than read as "no filter".
    filtering = filter_value is not None and bool(filter_field)

    comparisons = 0
    output_file = None
    writer = None
    input_file = utils.open_file_for_reading(inputfilename)
    try:
        suppl = Chem.ForwardSDMolSupplier(input_file)
        output_file = utils.open_file_for_writing(outputfilename)
        writer = Chem.SDWriter(output_file)

        for mol_num, mol in enumerate(suppl, start=1):
            if not mol:
                utils.log("WARNING: failed to generate molecule", mol_num,
                          "in input")
                continue
            try:
                query_features = sucos.getRawFeatures(mol)
            except Exception:
                utils.log("WARNING: failed to generate features for molecule",
                          mol_num, "in input")
                continue

            # Both lists hold [SuCOS, feature map, protrude] scores.
            scores_max = [0, 0, 0]
            scores_cum = [0, 0, 0]
            cluster_name = None
            cluster_index = None
            for clusterfilename, cluster in all_clusters.items():
                for index, (hit, ref_features) in enumerate(cluster, start=1):
                    comparisons += 1
                    sucos_score, fm_score, vol_score = sucos.get_SucosScore(
                        hit,
                        mol,
                        tani=False,
                        ref_features=ref_features,
                        query_features=query_features)

                    if sucos_score > scores_max[0]:
                        scores_max = [sucos_score, fm_score, vol_score]
                        cluster_name = clusterfilename
                        cluster_index = index

                    scores_cum[0] += sucos_score
                    scores_cum[1] += fm_score
                    scores_cum[2] += vol_score

            # Anything that is not strictly positive is stored as 0.
            mol.SetDoubleProp("Max_SuCOS_Score",
                              scores_max[0] if scores_max[0] > 0 else 0)
            mol.SetDoubleProp("Max_SuCOS_FeatureMap_Score",
                              scores_max[1] if scores_max[1] > 0 else 0)
            mol.SetDoubleProp("Max_SuCOS_Protrude_Score",
                              scores_max[2] if scores_max[2] > 0 else 0)

            if cluster_name:
                # cluster_name is only set together with cluster_index, when a reference scored above zero.
                cluster_file_name_only = os.path.basename(cluster_name)
                mol.SetProp("Max_SuCOS_Cluster", cluster_file_name_only)
                mol.SetIntProp("Max_SuCOS_Index", cluster_index)

            mol.SetDoubleProp("Cum_SuCOS_Score",
                              scores_cum[0] if scores_cum[0] > 0 else 0)
            mol.SetDoubleProp("Cum_SuCOS_FeatureMap_Score",
                              scores_cum[1] if scores_cum[1] > 0 else 0)
            mol.SetDoubleProp("Cum_SuCOS_Protrude_Score",
                              scores_cum[2] if scores_cum[2] > 0 else 0)

            if not filtering:
                writer.write(mol)
            elif mol.HasProp(filter_field):
                # Molecules lacking the filter property are dropped while filtering is active.
                if mol.GetDoubleProp(filter_field) > filter_value:
                    writer.write(mol)

        writer.flush()
    finally:
        # Release everything that was opened, even when scoring or writing failed part-way through.
        if writer is not None:
            writer.close()
        if output_file is not None:
            output_file.close()
        input_file.close()

    utils.log("Completed", comparisons, "comparisons")