def score_model(self, model_configuration: dict, fragments_file: str,
                descriptors_file: str, output_file: str):
    inputoutput_utils.create_parent_directory(output_file)
    decision_tree = tree.DecisionTreeClassifier()
    # Build the activity list: 1 for actives, 0 for inactives.
    actives = [1 for i in range(len(model_configuration["data"]["active"]))]
    inactives = [
        0 for i in range(len(model_configuration["data"]["inactive"]))
    ]
    activity = actives + inactives
    decision_tree.fit(
        model_configuration["data"]["active"]
        + model_configuration["data"]["inactive"],
        activity)
    test_descriptors = _extract_descriptors(descriptors_file)
    prediction = decision_tree.predict(test_descriptors)
    molecule_names = _extract_names(fragments_file)
    # Write output, one JSON object per line.
    first_line = True
    with open(output_file, "w", encoding="utf-8") as output_stream:
        for i in range(len(prediction)):
            score = {
                "name": molecule_names[i],
                "score": float(prediction[i])
            }
            if first_line:
                first_line = False
            else:
                output_stream.write("\n")
            json.dump(score, output_stream)
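# A minimal, self-contained sketch of the fit/predict pattern used above,
# with hypothetical toy descriptor vectors (not data from this project).
from sklearn import tree as sklearn_tree

toy_actives = [[1.0, 0.2], [0.9, 0.3]]      # descriptor vectors of actives
toy_inactives = [[0.1, 0.8], [0.2, 0.9]]    # descriptor vectors of inactives
toy_activity = [1] * len(toy_actives) + [0] * len(toy_inactives)
toy_tree = sklearn_tree.DecisionTreeClassifier()
toy_tree.fit(toy_actives + toy_inactives, toy_activity)
print(toy_tree.predict([[0.85, 0.25]]))     # -> [1]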
def extract_fragments(input_files: list, input_type: str, output_files: list,
                      extraction_options: dict):
    # Track per-file writer state and count extracted fragments.
    total_fragments = 0
    for file in output_files:
        inputoutput_utils.create_parent_directory(file)
    for file_num, path in enumerate(input_files):
        holder = {"first": True}
        with open(output_files[file_num], "w",
                  encoding="utf-8") as output_stream:
            for molecule in _LOAD_FUNCTIONS[input_type](path):
                item = {
                    "name": molecule.GetProp("_Name"),
                    "smiles": rdkit.Chem.MolToSmiles(molecule),
                    "fragments": _extract_fragments_from_molecule(
                        molecule, extraction_options["fragments"],
                        extraction_options)
                }
                total_fragments += len(item["fragments"])
                # Append to output.
                _append_object_to_jsonlines(output_stream, item, holder)
    logging.info("\tfragments total: %d", total_fragments)
def score_model(self, model_configuration: dict, fragments_file: str,
                descriptors_file: str, output_file: str):
    inputoutput_utils.create_parent_directory(output_file)
    first_line = True
    with open(output_file, "w", encoding="utf-8") as output_stream:
        with open(fragments_file, "r", encoding="utf-8") as input_stream:
            for new_line in input_stream:
                line = json.loads(new_line)
                test_active_indexes = []
                for fragment in line["fragments"]:
                    index = fragment["index"]
                    # Replace the index by its group representative,
                    # if it belongs to a group.
                    for group in model_configuration["configuration"][
                            "groups"]:
                        if index in group:
                            index = group[0]
                            break
                    if index not in test_active_indexes:
                        test_active_indexes.append(index)
                max_sim = max([
                    _compute_sim(item, test_active_indexes)
                    for item in model_configuration["data"]["active"]
                ])
                score = {"name": line["name"], "score": max_sim}
                if first_line:
                    first_line = False
                else:
                    output_stream.write("\n")
                json.dump(score, output_stream)
def print_graph(activity_files: list, directory: str, nicknames: list,
                input_type: str):
    input_values = []
    for file in activity_files:
        with open(file, "r", encoding="utf-8") as activity_file:
            for new_line in activity_file:
                line = json.loads(new_line)
                input_values.append(line[input_type.upper()])
    plt.plot(nicknames, input_values, marker="o")
    if input_type.upper() == "EF1":
        plt.ylabel("EF 1%")
    elif input_type.upper() == "EF5":
        plt.ylabel("EF 5%")
    else:
        plt.ylabel(input_type.upper())
    if directory != "":
        file_name = directory + "/" + input_type.upper() + ".png"
    else:
        file_name = input_type.upper() + ".png"
    inputoutput_utils.create_parent_directory(file_name)
    plt.xticks(rotation=90, fontsize="x-small")
    plt.tight_layout()
    plt.savefig(file_name, dpi=150)
    plt.figure()
def score_model(self, model_configuration: dict, fragments_file: str,
                descriptors_file: str, output_file: str):
    inputoutput_utils.create_parent_directory(output_file)
    first_line = True
    with open(output_file, "w", encoding="utf-8") as output_stream:
        with open(fragments_file, "r", encoding="utf-8") as input_stream:
            for new_line in input_stream:
                line = json.loads(new_line)
                test_active_indexes = []
                for fragment in line["fragments"]:
                    if fragment["index"] not in test_active_indexes:
                        test_active_indexes.append(fragment["index"])
                group_indexes = _make_groups(
                    test_active_indexes,
                    model_configuration["configuration"]["groups"])
                # Tanimoto-style similarity between the molecule's grouped
                # indexes and the model's active indexes.
                summary = 0
                for test_index in group_indexes:
                    if test_index in model_configuration["data"]["active"]:
                        summary += 1
                sim = summary / (len(model_configuration["data"]["active"])
                                 + len(test_active_indexes) - summary)
                score = {"name": line["name"], "score": sim}
                if first_line:
                    first_line = False
                else:
                    output_stream.write("\n")
                json.dump(score, output_stream)
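# A worked example of the similarity used above: for index sets A and B,
# sim = |A n B| / (|A| + |B| - |A n B|), the Tanimoto/Jaccard coefficient.
# The index values are made up for illustration.
set_a = {1, 5, 9, 12}
set_b = {5, 9, 30}
overlap = len(set_a & set_b)
print(overlap / (len(set_a) + len(set_b) - overlap))  # 2 / 5 = 0.4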
def score_model(self, model_configuration: dict, fragments_file: str,
                descriptors_file: str, output_file: str):
    inputoutput_utils.create_parent_directory(output_file)
    model_data = model_configuration["data"]
    active_molecules_ap = []
    nbits = model_configuration["configuration"]["nbits"]
    for active_molecule in model_data["active"]:
        molecule_smiles = active_molecule.strip("\"")
        molecule = Chem.MolFromSmiles(molecule_smiles)
        ap_fingerprint = Pairs.GetHashedAtomPairFingerprint(molecule,
                                                            nBits=nbits)
        active_molecules_ap.append(ap_fingerprint)
    first_line = True
    with open(output_file, "w", encoding="utf-8") as output_stream:
        with open(fragments_file, "r", encoding="utf-8") as input_stream:
            for new_line in input_stream:
                line = json.loads(new_line)
                test_molecule_input = line["smiles"]
                test_molecule_smiles = test_molecule_input.strip("\"")
                test_molecule = Chem.MolFromSmiles(test_molecule_smiles)
                test_mol_fingerprint = Pairs.GetHashedAtomPairFingerprint(
                    test_molecule, nBits=nbits)
                max_sim = max([
                    DataStructs.TanimotoSimilarity(test_mol_fingerprint,
                                                   fingerprint)
                    for fingerprint in active_molecules_ap
                ])
                score = {"name": line["name"], "score": max_sim}
                if first_line:
                    first_line = False
                else:
                    output_stream.write("\n")
                json.dump(score, output_stream)
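# A minimal sketch of the atom-pair comparison above, assuming RDKit is
# installed; the SMILES strings are arbitrary examples, not project data.
from rdkit import Chem, DataStructs
from rdkit.Chem.AtomPairs import Pairs

fp_a = Pairs.GetHashedAtomPairFingerprint(Chem.MolFromSmiles("CCO"),
                                          nBits=1024)
fp_b = Pairs.GetHashedAtomPairFingerprint(Chem.MolFromSmiles("CCN"),
                                          nBits=1024)
print(DataStructs.TanimotoSimilarity(fp_a, fp_b))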
def _print_histogram(baseline_val: float, input_file: str,
                     output_directory: str) -> list:
    inputoutput_utils.create_parent_directory(output_directory + "/0")
    textstr = "baseline AUC: " + str(float(round(baseline_val, 5)))
    auc = []
    with open(input_file, "r", encoding="utf-8") as input_stream:
        for new_line in input_stream:
            line = json.loads(new_line)
            auc.append(line["AUC"])
    # Build 10 equal-width bins spanning the observed AUC range.
    min_val = min(auc)
    max_val = max(auc)
    diff = max_val - min_val
    diff_step = diff / 10
    steps = []
    for i in range(10):
        steps.append(i * diff_step + min_val)
    steps.append(max_val)
    arr = plt.hist(auc, bins=steps, color="blue")
    plt.xticks(steps, rotation=90)
    for i in range(10):
        plt.text(arr[1][i] + diff_step / 2, arr[0][i], str(int(arr[0][i])),
                 horizontalalignment="center")
    props = dict(boxstyle="round")
    plt.text(steps[7], int(arr[0][0]), textstr, bbox=props)
    plt.tight_layout()
    plt.savefig(output_directory + "/AUC.png", dpi=1000)
    plt.figure()
    return auc
def _model_and_score_and_evaluate(active_fragments: str, test_fragments: str,
                                  test_activity: str, num: int,
                                  output_directory: str, maximal_num: int):
    inputoutput_utils.create_parent_directory(output_directory
                                              + "/scorefiles/0")
    inputoutput_utils.create_parent_directory(output_directory
                                              + "/activities/0")
    with open(output_directory + "/configurationfiles/configuration"
              + str(maximal_num) + "_" + str(num) + ".json", "r",
              encoding="utf-8") as input_file:
        for new_line in input_file:
            line = json.loads(new_line)
            # Skip configurations that were already evaluated.
            if os.path.isfile(output_directory + "/evaluations/"
                              + line["evaluation"]):
                continue
            new_model = model_factory.create_model(line["model_name"])
            model = new_model.create_model(active_fragments, "", "", "", line)
            new_model.score_model(model, test_fragments, "",
                                  output_directory + "/scorefiles/score"
                                  + line["evaluation"])
            # run add_activity
            activity = add_activity.read_activity(test_activity)
            add_activity.add_activity_and_write_to_json(
                output_directory + "/scorefiles/score" + line["evaluation"],
                activity,
                output_directory + "/activities/activity"
                + line["evaluation"])
            # run compute_evaluation
            score_act = compute_evaluation.read_file_with_score_and_activity(
                output_directory + "/activities/activity"
                + line["evaluation"])
            activity = compute_evaluation.sort_activity(score_act)
            compute_evaluation.evaluation(
                activity,
                output_directory + "/evaluations/" + line["evaluation"])
def score_model(self, model_configuration: dict, fragments_file: str,
                descriptors_file: str, output_file: str):
    inputoutput_utils.create_parent_directory(output_file)
    model_data = model_configuration["data"]
    radius = int(
        model_configuration["configuration"]["fragments"][0]["size"])
    active_molecules_tt = []
    for active_molecule in model_data["active"]:
        molecule_smiles = active_molecule.strip("\"")
        molecule = Chem.MolFromSmiles(molecule_smiles)
        tt_fingerprint = Torsions.GetTopologicalTorsionFingerprintAsIntVect(
            molecule, radius)
        active_molecules_tt.append(tt_fingerprint)
    first_line = True
    with open(output_file, "w", encoding="utf-8") as output_stream:
        with open(fragments_file, "r", encoding="utf-8") as input_stream:
            for new_line in input_stream:
                line = json.loads(new_line)
                test_molecule_input = line["smiles"]
                test_molecule_smiles = test_molecule_input.strip("\"")
                test_molecule = Chem.MolFromSmiles(test_molecule_smiles)
                test_mol_fingerprint = \
                    Torsions.GetTopologicalTorsionFingerprintAsIntVect(
                        test_molecule, radius)
                max_sim = max([
                    DataStructs.TanimotoSimilarity(test_mol_fingerprint,
                                                   fingerprint)
                    for fingerprint in active_molecules_tt
                ])
                score = {"name": line["name"], "score": max_sim}
                if first_line:
                    first_line = False
                else:
                    output_stream.write("\n")
                json.dump(score, output_stream)
def score_model(self, model_configuration: dict, fragments_file: str,
                descriptors_file: str, output_file: str):
    inputoutput_utils.create_parent_directory(output_file)
    nbit = int(model_configuration["configuration"]["nbits"])
    first_line = True
    with open(output_file, "w", encoding="utf-8") as output_stream:
        with open(fragments_file, "r", encoding="utf-8") as input_stream:
            for new_line in input_stream:
                line = json.loads(new_line)
                # Fold fragment indexes into an nbit-wide index space.
                test_indexes = []
                for item in line["fragments"]:
                    test_indexes.append(int(item["index"]) % nbit)
                max_sim = 0
                for active_indexes in model_configuration["data"]["active"]:
                    intersection = _intersection_of_two_arrays(active_indexes,
                                                               test_indexes)
                    sim = len(intersection) / (len(test_indexes)
                                               + len(active_indexes)
                                               - len(intersection))
                    if sim > max_sim:
                        max_sim = sim
                score = {"name": line["name"], "score": max_sim}
                if first_line:
                    first_line = False
                else:
                    output_stream.write("\n")
                json.dump(score, output_stream)
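# A tiny illustration of the folding above: fragment indexes are hashed
# into an nbit-wide space via modulo, so distinct indexes may collide.
# The values are made up for illustration.
nbit_demo = 1024
raw_indexes = [5, 1029, 2053]
print([i % nbit_demo for i in raw_indexes])  # -> [5, 5, 5] (collisions)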
def score_model(self, model_configuration: dict, fragments_file: str,
                descriptors_file: str, output_file: str):
    inputoutput_utils.create_parent_directory(output_file)
    active_parameter = int(
        model_configuration["configuration"]["active_parameter"])
    inactive_parameter = int(
        model_configuration["configuration"]["inactive_parameter"])
    with open(output_file, "w", encoding="utf-8") as output_stream:
        first = True
        with open(fragments_file, "r", encoding="utf-8") as input_stream:
            for line in input_stream:
                molecule = json.loads(line)
                name = molecule["name"]
                # Reward known active fragments, penalize the rest.
                total = 0
                for fragment in molecule["fragments"]:
                    found = False
                    for index in model_configuration["data"]:
                        if int(index) == int(fragment["index"]):
                            found = True
                            break
                    if found:
                        total += active_parameter
                    else:
                        total -= inactive_parameter
                sim = total / len(molecule["fragments"])
                score = {"name": name, "score": sim}
                if first:
                    first = False
                else:
                    output_stream.write("\n")
                json.dump(score, output_stream)
def _main():
    configuration = _read_configuration()
    inputoutput_utils.create_parent_directory(configuration["output"])
    values = [[], [], [], [], []]
    if configuration["type"] == "AUC":
        str1 = "EF1"
        str2 = "EF5"
    elif configuration["type"] == "EF1":
        str1 = "AUC"
        str2 = "EF5"
    elif configuration["type"] == "EF5":
        str1 = "AUC"
        str2 = "EF1"
    else:
        print("Wrong type!")
        print("It has to be: AUC, EF1 or EF5")
        exit(1)
    with open(configuration["input_file"], "r",
              encoding="utf-8") as input_stream:
        for new_line in input_stream:
            line = json.loads(new_line)
            values[0].append(line["groups"])
            values[1].append(line[configuration["type"]])
            values[2].append(line[str1])
            values[3].append(line[str2])
    if len(values[0]) < int(configuration["best"]):
        print("The input file does not contain that many results!")
        print("Number of results in input file: " + str(len(values[0])))
        print("Number you wanted to select: " + configuration["best"])
        exit(1)
    # Sort all columns in descending order of the selected metric
    # (bubble sort, swapping the parallel lists together).
    for i in range(len(values[1]) - 1):
        for j in range(len(values[1]) - i - 1):
            if values[1][j] < values[1][j + 1]:
                values[1][j], values[1][j + 1] = values[1][j + 1], values[1][j]
                values[0][j], values[0][j + 1] = values[0][j + 1], values[0][j]
                values[2][j], values[2][j + 1] = values[2][j + 1], values[2][j]
                values[3][j], values[3][j + 1] = values[3][j + 1], values[3][j]
    with open(configuration["output"], "w",
              encoding="utf-8") as output_stream:
        for i in range(int(configuration["best"])):
            model = {
                "groups": values[0][i],
                configuration["type"]: values[1][i],
                str1: values[2][i],
                str2: values[3][i]
            }
            json.dump(model, output_stream)
            output_stream.write("\n")
def _main():
    configuration = _read_configuration()
    with open(configuration["configuration"], "r",
              encoding="utf-8") as input_stream:
        model_configuration = json.load(input_stream)
    model_name = model_configuration["model_name"]
    new_model = model_factory.create_model(model_name)
    model = new_model.create_model(configuration["active_fragments"],
                                   configuration["inactive_fragments"],
                                   configuration["active_descriptors"],
                                   configuration["inactive_descriptors"],
                                   model_configuration)
    inputoutput_utils.create_parent_directory(configuration["output"])
    new_model.save_to_json_file(configuration["output"], model)
def add_activity_and_write_to_json(input_score: str, activity: list,
                                   output_file: str):
    inputoutput_utils.create_parent_directory(output_file)
    with open(output_file, "w", encoding="utf-8") as output_stream:
        with open(input_score, "r", encoding="utf-8") as stream:
            for num, line in enumerate(stream):
                score = json.loads(line)
                output = {
                    "name": score["name"],
                    "score": score["score"],
                    "activity": activity[num]
                }
                if num != 0:
                    output_stream.write("\n")
                json.dump(output, output_stream)
def _print_histogram(input_files: list, nicknames: list,
                     output_directory: str):
    all_auc = []
    inputoutput_utils.create_parent_directory(output_directory + "/0")
    for file in input_files:
        auc = []
        with open(file, "r", encoding="utf-8") as input_stream:
            for new_line in input_stream:
                line = json.loads(new_line)
                auc.append(line["AUC"])
        all_auc.append(auc)
    plt.hist(all_auc, stacked=True, density=False)
    plt.legend(nicknames, loc="upper right")
    plt.savefig(output_directory + "/AUC.png", dpi=1000)
    plt.figure()
def _make_configuration_files(input_file: str, output_directory: str,
                              model_name: str, cpu_counts: int,
                              cutoff_val: int) -> list:
    active_indexes = []
    inputoutput_utils.create_parent_directory(output_directory
                                              + "/configurationfiles/0")
    with open(input_file, "r", encoding="utf-8") as input_stream:
        for new_line in input_stream:
            line = json.loads(new_line)
            for item in line["fragments"]:
                if item["index"] not in active_indexes:
                    active_indexes.append(item["index"])
    # Build all unordered pairs of active indexes.
    pair_list = []
    for i in range(len(active_indexes) - 1):
        for j in range(i + 1, len(active_indexes)):
            pair_list.append([active_indexes[i], active_indexes[j]])
    # Split the pairs evenly across the available CPUs.
    number = len(pair_list) // cpu_counts
    ranges = []
    for i in range(cpu_counts):
        ranges.append(i * number)
    ranges.append(len(pair_list))
    for i in range(cpu_counts):
        output_file = (output_directory + "/configurationfiles/configuration"
                       + str(i) + ".json")
        first = True
        with open(output_file, "w", encoding="utf-8") as output_stream:
            for j in range(ranges[i], ranges[i + 1]):
                if cutoff_val == -1:
                    model = {
                        "model_name": model_name,
                        "groups": [pair_list[j]]
                    }
                else:
                    model = {
                        "model_name": model_name,
                        "cutoff": cutoff_val,
                        "groups": [pair_list[j]]
                    }
                if first:
                    first = False
                else:
                    output_stream.write("\n")
                json.dump(model, output_stream)
    return ranges
def evaluation(activity_arr: list, output_file: str):
    inputoutput_utils.create_parent_directory(output_file)
    auc = Scoring.CalcAUC(activity_arr, 0)
    ef1 = Scoring.CalcEnrichment(activity_arr, 0, [0.01])
    ef5 = Scoring.CalcEnrichment(activity_arr, 0, [0.05])
    rie = Scoring.CalcRIE(activity_arr, 0, 100)
    bedroc = Scoring.CalcBEDROC(activity_arr, 0, 100)
    output = {
        "AUC": auc,
        "EF1": ef1[0],
        "EF5": ef5[0],
        "RIE": rie,
        "BEDROC": bedroc
    }
    with open(output_file, "w", encoding="utf-8") as stream:
        json.dump(output, stream)
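# A small sketch of how the RDKit Scoring calls above behave, assuming
# rdkit.ML.Scoring is available. Each row holds the activity flag in
# column 0, and rows must already be sorted by descending model score.
from rdkit.ML.Scoring import Scoring

ranked = [[1], [1], [0], [1], [0], [0], [0], [0]]  # toy ranked activities
print(Scoring.CalcAUC(ranked, 0))
print(Scoring.CalcEnrichment(ranked, 0, [0.25])[0])  # EF at the top 25%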
def _print_graph(val: float, values: list, type1: str, output: str):
    inputoutput_utils.create_parent_directory(output)
    textstr = "baseline " + type1 + ": " + "%.6f" % val
    if type1 == "AUC":
        # Histogram over 10 equal-width bins spanning the observed range.
        min_val = min(values)
        max_val = max(values)
        diff = max_val - min_val
        diff_step = diff / 10
        steps = []
        for i in range(10):
            steps.append(i * diff_step + min_val)
        steps.append(max_val)
        arr = plt.hist(values, bins=steps, color="blue")
        plt.xticks(steps, rotation=90)
        for i in range(10):
            plt.text(arr[1][i], arr[0][i], str(int(arr[0][i])),
                     horizontalalignment="left")
        props = dict(boxstyle="round")
        plt.text(steps[7], int(arr[0][0]), textstr, bbox=props)
        plt.tight_layout()
        plt.savefig(output)
    else:
        # Count occurrences of each discrete value and plot the frequencies,
        # with the baseline marked by a vertical line.
        counts = {}
        for item in sorted(values):
            if item in counts:
                counts[item] += 1
            else:
                counts[item] = 1
        items = list(counts.keys())
        frequencies = list(counts.values())
        xval = [val - 2, val] + items
        plt.plot(items, frequencies, marker="o")
        plt.xticks(xval)
        plt.axvline(val, color="red")
        plt.savefig(output)
        # Debug output: smallest and largest frequency.
        print(min(frequencies))
        print(max(frequencies))
def score_model(self, model_configuration: dict, fragments_file: str,
                descriptors_file: str, output_file: str):
    name_num = _read_molecules(fragments_file)
    inputoutput_utils.create_parent_directory(output_file)
    active_parameter = int(
        model_configuration["configuration"]["active_parameter"])
    inactive_parameter = int(
        model_configuration["configuration"]["inactive_parameter"])
    with open(output_file, "w", encoding="utf-8") as output_stream:
        first_write = True
        with open(descriptors_file, "r", encoding="utf-8") as stream:
            next(stream)  # skip the CSV header
            counter = 0
            molecule_num = 0
            total = 0
            for line in stream:
                line_parts = line.split(",")
                parts = line_parts[2:]
                found = False
                for descriptors in model_configuration["data"]:
                    if descriptors == parts:
                        found = True
                        break
                if found:
                    total += active_parameter
                else:
                    total -= inactive_parameter
                counter += 1
                # Once all fragments of the current molecule have been
                # read, emit its averaged score.
                if counter == name_num[molecule_num]["fragments"]:
                    score = {
                        "name": name_num[molecule_num]["molecule"],
                        "score": total / counter
                    }
                    counter = 0
                    total = 0
                    molecule_num += 1
                    if first_write:
                        first_write = False
                    else:
                        output_stream.write("\n")
                    json.dump(score, output_stream)
def score_model(self, model_configuration: dict, fragments_file: str,
                descriptors_file: str, output_file: str):
    inputoutput_utils.create_parent_directory(output_file)
    first_line = True
    with open(output_file, "w", encoding="utf-8") as output_stream:
        with open(fragments_file, "r", encoding="utf-8") as input_stream:
            for new_line in input_stream:
                line = json.loads(new_line)
                test_active_indexes = _add_indexes(
                    line["fragments"],
                    model_configuration["configuration"]["groups"])
                max_sim = _compute_sim(model_configuration["data"]["active"],
                                       test_active_indexes)
                score = {"name": line["name"], "score": max_sim}
                if first_line:
                    first_line = False
                else:
                    output_stream.write("\n")
                json.dump(score, output_stream)
def score_model(self, model_configuration: dict, fragments_file: str,
                descriptors_file: str, output_file: str):
    inputoutput_utils.create_parent_directory(output_file)
    model_data = model_configuration["data"]
    diameter = int(
        model_configuration["configuration"]["fragments"][0]["size"])
    if diameter % 2 == 1:
        print("Incorrect input, size must be even!")
        exit(1)
    # ECFP sizes are diameters; RDKit's Morgan fingerprint takes the radius.
    radius = diameter // 2
    active_molecules_ecfp = []
    nbits = model_configuration["configuration"]["nbits"]
    for active_molecule in model_data["active"]:
        molecule_smiles = active_molecule.strip("\"")
        molecule = Chem.MolFromSmiles(molecule_smiles)
        ecfp_fingerprint = AllChem.GetHashedMorganFingerprint(molecule,
                                                              radius,
                                                              nBits=nbits)
        active_molecules_ecfp.append(ecfp_fingerprint)
    first_line = True
    with open(output_file, "w", encoding="utf-8") as output_stream:
        with open(fragments_file, "r", encoding="utf-8") as input_stream:
            for new_line in input_stream:
                line = json.loads(new_line)
                test_molecule_input = line["smiles"]
                test_molecule_smiles = test_molecule_input.strip("\"")
                test_molecule = Chem.MolFromSmiles(test_molecule_smiles)
                test_mol_fingerprint = AllChem.GetHashedMorganFingerprint(
                    test_molecule, radius, nBits=nbits)
                max_sim = max([
                    DataStructs.TanimotoSimilarity(test_mol_fingerprint,
                                                   fingerprint)
                    for fingerprint in active_molecules_ecfp
                ])
                score = {"name": line["name"], "score": max_sim}
                if first_line:
                    first_line = False
                else:
                    output_stream.write("\n")
                json.dump(score, output_stream)
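# A short sketch of the ECFP convention handled above: a fragment type like
# "ecfp.4" names the diameter 4, while RDKit's Morgan fingerprint expects
# the radius (here 2). Assumes RDKit is installed; the SMILES is arbitrary.
from rdkit import Chem
from rdkit.Chem import AllChem

diameter = 4
fp = AllChem.GetHashedMorganFingerprint(Chem.MolFromSmiles("c1ccccc1O"),
                                        diameter // 2, nBits=1024)
print(fp.GetTotalVal())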
def _main():
    configuration = _read_configuration()
    inputoutput_utils.create_parent_directory(configuration["output_file"])
    _print_graph(configuration["input_activity"],
                 configuration["output_file"])
def _main():
    configuration = _read_configuration()
    with open(configuration["model_configuration"], "r",
              encoding="utf-8") as input_stream:
        model_configuration = json.load(input_stream)
    try:
        new_model = model_factory.create_model(
            model_configuration["model_name"])
    except Exception:
        print("Model does not exist!")
        exit(1)
    if "kekule" not in model_configuration:
        model_configuration["kekule"] = False
    else:
        model_configuration["kekule"] = bool(model_configuration["kekule"])
    if "isomeric" not in model_configuration:
        model_configuration["isomeric"] = False
    else:
        model_configuration["isomeric"] = bool(
            model_configuration["isomeric"])
    if "fragments" not in model_configuration:
        model_configuration["fragments"] = "ecfp.6"
    parsed_types = []
    for item in model_configuration["fragments"].split(","):
        item_split = item.split(".")
        if item_split[0] != "ap":
            if not len(item_split) == 2:
                logging.error("Invalid fragment type: %s", item)
                logging.info("Expected format {TYPE}.{SIZE} or ap")
                exit(1)
            parsed_types.append({
                "name": item_split[0],
                "size": int(item_split[1])
            })
        else:
            parsed_types.append({
                "name": item_split[0],
            })
    model_configuration["fragments"] = parsed_types
    extraction_options = {
        "kekule": model_configuration["kekule"],
        "isomeric": model_configuration["isomeric"],
        "fragments": model_configuration["fragments"]
    }
    # run extract_fragments
    input_files = [configuration["input_actives"],
                   configuration["input_inactives"],
                   configuration["test"]]
    directory = configuration["directory"]
    fragments_output_files = [directory + "/fragmentsa.json",
                              directory + "/fragmentsi.json",
                              directory + "/fragmentst.json"]
    for file in fragments_output_files:
        inputoutput_utils.create_parent_directory(file)
    extract_fragments.extract_fragments(input_files,
                                        configuration["input_type"],
                                        fragments_output_files,
                                        extraction_options)
    # run extract_descriptors
    descriptors_output_files = [directory + "/descriptorsa.csv",
                                directory + "/descriptorsi.csv",
                                directory + "/descriptorst.csv"]
    for file in descriptors_output_files:
        inputoutput_utils.create_parent_directory(file)
    if (model_configuration["model_name"] == "descriptors_model") or \
            ((model_configuration["model_name"] == "linear_regression_model")
             and (int(model_configuration["molecules"]) == 0)):
        compute_descriptors.compute_descriptors(fragments_output_files,
                                                descriptors_output_files,
                                                True)
    else:
        compute_descriptors.compute_descriptors(fragments_output_files,
                                                descriptors_output_files,
                                                False)
    # run create_model and score_molecules
    model = new_model.create_model(directory + "/fragmentsa.json",
                                   directory + "/fragmentsi.json",
                                   directory + "/descriptorsa.csv",
                                   directory + "/descriptorsi.csv",
                                   model_configuration)
    new_model.score_model(model, directory + "/fragmentst.json",
                          directory + "/descriptorst.csv",
                          directory + "/score.json")
    # run add_activity
    activity = add_activity.read_activity(configuration["activity"])
    add_activity.add_activity_and_write_to_json(directory + "/score.json",
                                                activity,
                                                directory + "/activity.json")
    # run compute_evaluation
    score_act = compute_evaluation.read_file_with_score_and_activity(
        directory + "/activity.json")
    activity = compute_evaluation.sort_activity(score_act)
    compute_evaluation.evaluation(activity, configuration["output"])
def _main():
    configuration = _read_configuration()
    active_indexes = []
    with open(configuration["input_fragments"], "r",
              encoding="utf-8") as input_stream:
        for new_line in input_stream:
            line = json.loads(new_line)
            for fragment in line["fragments"]:
                if fragment["index"] not in active_indexes:
                    active_indexes.append(fragment["index"])
    pairs = []
    for i in range(len(active_indexes) - 1):
        for j in range(i + 1, len(active_indexes)):
            pairs.append([active_indexes[i], active_indexes[j]])
    auc = 0
    ef1 = 0
    ef5 = 0
    with open(configuration["baseline_output"], "r",
              encoding="utf-8") as input_stream:
        for new_line in input_stream:
            line = json.loads(new_line)
            auc = line["AUC"]
            ef1 = line["EF1"]
            ef5 = line["EF5"]
    inputoutput_utils.create_parent_directory(
        configuration["output_directory"] + "/0")
    _prepare_files(configuration["output_directory"])
    only_files = [
        f for f in listdir(configuration["input_directory"])
        if isfile(join(configuration["input_directory"], f))
    ]

    def _append(file_name: str, output: dict):
        # Append one JSON line to the named result file.
        with open(configuration["output_directory"] + "/" + file_name, "a",
                  encoding="utf-8") as output_stream:
            json.dump(output, output_stream)
            output_stream.write("\n")

    for file in only_files:
        with open(configuration["input_directory"] + "/" + file, "r",
                  encoding="utf-8") as input_stream:
            file_str = file.split(".")[0]
            num = int(file_str[10:])
            for new_line in input_stream:
                line = json.loads(new_line)
                output = {
                    "groups": [pairs[num]],
                    "AUC": line["AUC"],
                    "EF1": line["EF1"],
                    "EF5": line["EF5"]
                }
                # Sort the result into every category it beats the
                # baseline in.
                if (line["AUC"] > auc and line["EF1"] > ef1
                        and line["EF5"] > ef5):
                    _append("aucef1ef5.json", output)
                if line["AUC"] > auc and line["EF1"] > ef1:
                    _append("aucef1.json", output)
                if line["AUC"] > auc and line["EF5"] > ef5:
                    _append("aucef5.json", output)
                if line["EF1"] > ef1 and line["EF5"] > ef5:
                    _append("ef1ef5.json", output)
                if line["AUC"] > auc:
                    _append("auc.json", output)
                if line["EF5"] > ef5:
                    _append("ef5.json", output)
                if line["EF1"] > ef1:
                    _append("ef1.json", output)
                if (line["AUC"] > auc or line["EF1"] > ef1
                        or line["EF5"] > ef5):
                    _append("greater.json", output)
    with open(configuration["output_directory"] + "/baseline.json", "w",
              encoding="utf-8") as output_stream:
        output = {"AUC": auc, "EF1": ef1, "EF5": ef5}
        json.dump(output, output_stream)
def compute_descriptors(input_files: list, output_files: list,
                        use_fragments: bool, features_to_use=None):
    for output_file in output_files:
        inputoutput_utils.create_parent_directory(output_file)
    # Pick features to use.
    if not features_to_use:
        used_features_names = _NAMES
    else:
        used_features_names = features_to_use
    used_features_fnc = [
        _FUNCTIONS[_NAMES.index(name)] for name in used_features_names
    ]
    # Compute and write descriptors; do not kekulize molecules.
    sanitize_operation = rdkit.Chem.SanitizeFlags.SANITIZE_ALL ^ \
        rdkit.Chem.SanitizeFlags.SANITIZE_KEKULIZE
    number_of_invalid = 0
    # Gather data.
    count_molecules = 0
    num = -1
    for input_file in input_files:
        with open(input_file, "r", encoding="utf-8") as streami:
            num += 1
            with open(output_files[num], "w", encoding="utf-8") as stream:
                _write_header(stream, use_fragments, used_features_names)
                for line in streami:
                    molecule = json.loads(line)
                    smiles_list = []
                    index_list = []
                    position = 0
                    if use_fragments:
                        for fragment in molecule["fragments"]:
                            smiles_list.append(fragment["smiles"])
                            index_list.append(fragment["index"])
                    else:
                        smiles_list.append(molecule["smiles"])
                    for smiles in smiles_list:
                        # SMILES column.
                        stream.write("\"")
                        stream.write(smiles)
                        stream.write("\",")
                        if use_fragments:
                            stream.write(str(index_list[position]))
                            stream.write(",")
                            position += 1
                        count_molecules += 1
                        # Construct the molecule, then compute and write
                        # its properties.
                        molecule_obj = rdkit.Chem.MolFromSmiles(
                            str(smiles), sanitize=False)
                        if molecule_obj is None:
                            logging.error("Invalid molecule detected: %s",
                                          smiles)
                            number_of_invalid += 1
                            continue
                        rdkit.Chem.SanitizeMol(
                            molecule_obj, sanitizeOps=sanitize_operation)
                        stream.write(",".join([
                            str(fnc(molecule_obj))
                            for fnc in used_features_fnc
                        ]))
                        stream.write("\n")
    # Log a summary.
    logging.info("Invalid molecules: %d/%d", number_of_invalid,
                 count_molecules)
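# The _NAMES and _FUNCTIONS tables are defined elsewhere in this module.
# A hypothetical minimal pair, assuming plain RDKit descriptor functions
# (illustration only, not the project's actual feature set):
from rdkit import Chem
from rdkit.Chem import Descriptors

_EXAMPLE_NAMES = ["MolWt", "TPSA"]
_EXAMPLE_FUNCTIONS = [Descriptors.MolWt, Descriptors.TPSA]
demo_molecule = Chem.MolFromSmiles("CCO")  # arbitrary test molecule
print([fnc(demo_molecule) for fnc in _EXAMPLE_FUNCTIONS])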
def _make_configuration_files(group_file: str, output_directory: str,
                              model_name: str, cpu_counts: int,
                              cutoff_val: int) -> int:
    groups = []
    inputoutput_utils.create_parent_directory(output_directory
                                              + "/configurationfiles/0")
    inputoutput_utils.create_parent_directory(output_directory
                                              + "/evaluations/0")
    files = [f for f in listdir(output_directory + "/configurationfiles")
             if isfile(join(output_directory + "/configurationfiles", f))]
    num_of_config = 0
    for file in files:
        with open(output_directory + "/configurationfiles/" + file, "r",
                  encoding="utf-8") as input_stream:
            for new_line in input_stream:
                num_of_config += 1
    config_files = []
    for file in files:
        first_part = file.split("_")[0]
        config_files.append(int(first_part[13:]))
    if num_of_config == 0:
        maximal_num = 0
    else:
        maximal_num = max(config_files)
    evaluation_files = [f for f in listdir(output_directory + "/evaluations")
                        if isfile(join(output_directory + "/evaluations", f))]
    # If some configurations are still unevaluated, resume the previous run.
    if len(evaluation_files) != num_of_config:
        num_of_max_num = 0
        for item in config_files:
            if item == maximal_num:
                num_of_max_num += 1
        if num_of_max_num != cpu_counts:
            print("Please run the program as before, on "
                  + str(num_of_max_num) + " cores")
            exit(1)
        else:
            return maximal_num
    with open(group_file, "r", encoding="utf-8") as input_stream:
        for new_line in input_stream:
            line = json.loads(new_line)
            groups.append(line["groups"])
    group_list = []
    for file in files:
        with open(output_directory + "/configurationfiles/" + file, "r",
                  encoding="utf-8") as input_stream:
            for new_line in input_stream:
                line = json.loads(new_line)
                group_list.append(line["groups"])
    # Merge every pair of groupings; groups that share an index are united.
    for i in range(len(groups) - 1):
        for j in range(i + 1, len(groups)):
            groups1 = groups[i].copy()
            groups2 = groups[j].copy()
            is_intersected = False
            while _control_intersection(groups1, groups2):
                fin = False
                is_intersected = True
                for group1 in groups1:
                    for item in group1:
                        for group2 in groups2:
                            if item in group2:
                                groups1.remove(group1)
                                groups2.remove(group2)
                                groups1.append(_new_group(group1, group2))
                                fin = True
                                break
                        if fin:
                            break
                    if fin:
                        break
            if is_intersected:
                groups11 = _one_group_intersection(groups1)
                new_group = groups11.copy()
                if groups2 != []:
                    new_group.extend(groups2)
                if _control_groups(group_list, new_group) is False:
                    group_list.append(new_group)
            else:
                if _control_groups(group_list, groups1 + groups2) is False:
                    group_list.append(groups1 + groups2)
    # Drop groupings that already have a configuration file.
    for file in files:
        with open(output_directory + "/configurationfiles/" + file, "r",
                  encoding="utf-8") as input_stream:
            for new_line in input_stream:
                line = json.loads(new_line)
                existing_groups = line["groups"]
                for i in range(len(group_list)):
                    if existing_groups == group_list[i]:
                        group_list.remove(existing_groups)
                        break
    # Split the remaining groupings evenly across the available CPUs.
    number = len(group_list) // cpu_counts
    ranges = []
    for i in range(cpu_counts):
        ranges.append(i * number)
    ranges.append(len(group_list))
    maximal_num += 1
    for i in range(cpu_counts):
        output_file = (output_directory + "/configurationfiles/configuration"
                       + str(maximal_num) + "_" + str(i) + ".json")
        first = True
        with open(output_file, "w", encoding="utf-8") as output_stream:
            for j in range(ranges[i], ranges[i + 1]):
                if cutoff_val == -1:
                    model = {
                        "model_name": model_name,
                        "groups": group_list[j],
                        "evaluation": "evaluation" + str(maximal_num)
                                      + "_" + str(j) + ".json"
                    }
                else:
                    model = {
                        "model_name": model_name,
                        "cutoff": cutoff_val,
                        "groups": group_list[j],
                        "evaluation": "evaluation" + str(maximal_num)
                                      + "_" + str(j) + ".json"
                    }
                if first:
                    first = False
                else:
                    output_stream.write("\n")
                json.dump(model, output_stream)
    return maximal_num
def score_model(self, model_configuration: dict, fragments_file: str,
                descriptors_file: str, output_file: str):
    inputoutput_utils.create_parent_directory(output_file)
    reg = linear_model.LinearRegression()
    # Build the activity list: 1 for actives, 0 for inactives.
    actives = [1 for i in range(len(model_configuration["data"]["active"]))]
    inactives = [
        0 for i in range(len(model_configuration["data"]["inactive"]))
    ]
    activity = actives + inactives
    reg.fit(
        model_configuration["data"]["active"]
        + model_configuration["data"]["inactive"],
        activity)
    test_descriptors = extract_descriptors(
        descriptors_file, model_configuration["configuration"])
    molecule_file = int(model_configuration["configuration"]["molecules"])
    prediction = reg.predict(test_descriptors)
    if molecule_file == 1:
        # One descriptor row per molecule: write predictions directly.
        first_line = True
        with open(output_file, "w", encoding="utf-8") as output_stream:
            with open(fragments_file, "r",
                      encoding="utf-8") as input_stream:
                for num_line, new_line in enumerate(input_stream):
                    line = json.loads(new_line)
                    score = {
                        "name": line["name"],
                        "score": float(prediction[num_line])
                    }
                    if first_line:
                        first_line = False
                    else:
                        output_stream.write("\n")
                    json.dump(score, output_stream)
    else:
        # One descriptor row per fragment: average the fragment
        # predictions of each molecule.
        num_of_fragment = [0]
        names_of_molecules = []
        with open(fragments_file, "r",
                  encoding="utf-8") as fragments_stream:
            total = 0
            for new_line in fragments_stream:
                line = json.loads(new_line)
                total += len(line["fragments"])
                num_of_fragment.append(total)
                names_of_molecules.append(line["name"])
        first_line = True
        with open(output_file, "w", encoding="utf-8") as output_stream:
            for i in range(len(num_of_fragment) - 1):
                prediction_of_molecule = prediction[
                    num_of_fragment[i]:num_of_fragment[i + 1]]
                sim = (sum(prediction_of_molecule)
                       / len(prediction_of_molecule))
                score = {"name": names_of_molecules[i], "score": float(sim)}
                if first_line:
                    first_line = False
                else:
                    output_stream.write("\n")
                json.dump(score, output_stream)
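# A minimal sketch of the regression scoring above, on made-up toy data:
# fit on labeled descriptor vectors, then average per-fragment predictions
# to produce one score per molecule.
from sklearn import linear_model as sk_linear_model

reg_demo = sk_linear_model.LinearRegression()
reg_demo.fit([[0.9], [0.8], [0.1], [0.2]], [1, 1, 0, 0])
fragment_predictions = reg_demo.predict([[0.7], [0.3]])  # one molecule
print(float(sum(fragment_predictions)) / len(fragment_predictions))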