def test_standardizer_different_configs(self):
    """Check that two different parameter sets give different output.

    The same input frame is standardized twice: once with the
    ``standardization`` section of ``self.config`` and once with the
    parameters read from ``input/example_parameters_2.json``. The test
    passes only if the two resulting data frames are NOT equal.
    """
    df_smiles = read_csv(curDir / "input/test_standardizer.csv")
    outcols = ["canonical_smiles", "success", "error_message"]
    out_types = ["object", "bool", "object"]

    # Reference run, parameterized from the fixture configuration.
    ref_params = self.config["standardization"]
    st_ref = Standardizer(
        max_num_atoms=ref_params["max_num_atoms"],
        max_num_tautomers=ref_params["max_num_tautomers"],
        include_stereoinfo=ref_params["include_stereoinfo"],
        verbosity=0,
    )
    dt_ref = DfTransformer(
        st_ref,
        input_columns={"smiles": "smiles"},
        output_columns=outcols,
        output_types=out_types,
        success_column="success",
        nproc=4,
        verbosity=0,
    )
    response_ref = dt_ref.process_dataframe(df_smiles)[0]

    # The second configuration is loaded only after the reference run has
    # finished, exactly as before, in case loading it touches shared state.
    config_2 = ConfigDict(
        config_path=curDir / "input" / "example_parameters_2.json"
    ).get_parameters()
    tmp_params = config_2["standardization"]
    st_tmp = Standardizer(
        max_num_atoms=tmp_params["max_num_atoms"],
        max_num_tautomers=tmp_params["max_num_tautomers"],
        include_stereoinfo=tmp_params["include_stereoinfo"],
        verbosity=0,
    )
    dt_tmp = DfTransformer(
        st_tmp,
        input_columns={"smiles": "smiles"},
        output_columns=outcols,
        output_types=out_types,
        success_column="success",
        nproc=2,
        verbosity=0,
    )
    response_tmp = dt_tmp.process_dataframe(df_smiles)[0]

    # assert_frame_equal raises AssertionError when the frames differ, which
    # is the expected outcome here; equal frames make the test fail.
    with self.assertRaises(
        AssertionError,
        msg="different standardization configs produced identical output",
    ):
        assert_frame_equal(response_ref, response_tmp)
def test_standardizer_parameter_atom_count(self):
    """Check that changing ``max_num_atoms`` changes the output.

    The input frame is standardized once with the fixture configuration
    and once with ``max_num_atoms=5`` (all other parameters unchanged).
    The test passes only if the two resulting data frames are NOT equal.
    """
    df_smiles = read_csv(curDir / "input/test_standardizer.csv")
    outcols = ["canonical_smiles", "success", "error_message"]
    out_types = ["object", "bool", "object"]
    params = self.config["standardization"]

    # Reference run with the configured atom limit.
    st_ref = Standardizer(
        max_num_atoms=params["max_num_atoms"],
        max_num_tautomers=params["max_num_tautomers"],
        include_stereoinfo=params["include_stereoinfo"],
        verbosity=0,
    )
    dt_ref = DfTransformer(
        st_ref,
        input_columns={"smiles": "smiles"},
        output_columns=outcols,
        output_types=out_types,
        success_column="success",
        nproc=4,
        verbosity=0,
    )
    response_ref = dt_ref.process_dataframe(df_smiles)[0]

    # Test run: identical settings except for the atom limit of 5.
    st_tmp = Standardizer(
        max_num_atoms=5,
        max_num_tautomers=params["max_num_tautomers"],
        include_stereoinfo=params["include_stereoinfo"],
        verbosity=0,
    )
    dt_tmp = DfTransformer(
        st_tmp,
        input_columns={"smiles": "smiles"},
        output_columns=outcols,
        output_types=out_types,
        success_column="success",
        nproc=2,
        verbosity=0,
    )
    response_tmp = dt_tmp.process_dataframe(df_smiles)[0]

    # assert_frame_equal raises AssertionError when the frames differ, which
    # is the expected outcome here; equal frames make the test fail.
    with self.assertRaises(
        AssertionError,
        msg="max_num_atoms=5 produced the same output as the reference",
    ):
        assert_frame_equal(response_ref, response_tmp)
def prepare(args):
    """Set up the output directory and the three data frame transformers.

    Loads the configuration and the key from ``args``, creates the
    ``reference_set`` output directory, and wraps a standardizer, a
    scaffold fold assigner and a descriptor calculator in single-process
    ``DfTransformer`` objects.

    Returns:
        A tuple ``(output_dir, standardizer_transformer, fold_transformer,
        descriptor_transformer)``.
    """

    def _wrap(calculator, source_column, columns, dtypes):
        # Every transformer feeds one source column in as "smiles" and runs
        # quietly in a single process.
        return DfTransformer(
            calculator,
            input_columns={source_column: "smiles"},
            output_columns=columns,
            output_types=dtypes,
            success_column="success",
            nproc=1,
            verbosity=0,
        )

    load_config(args)
    load_key(args)
    overwrite_existing = True
    output_dir = make_dir(args, "reference_set", None, overwrite_existing)
    secret_key = SecretDict.get_secrets()["key"]

    # Standardization works on the raw "smiles" column.
    standardizer = Standardizer.from_param_dict(
        method_param_dict=ConfigDict.get_parameters()["standardization"],
        verbosity=0,
    )
    standardizer_transformer = _wrap(
        standardizer,
        "smiles",
        ["canonical_smiles", "success", "error_message"],
        ["object", "bool", "object"],
    )

    # Fold assignment reads the "canonical_smiles" column.
    fold_assigner = ScaffoldFoldAssign.from_param_dict(
        secret=secret_key,
        method_param_dict=ConfigDict.get_parameters()["scaffold_folding"],
        verbosity=0,
    )
    fold_transformer = _wrap(
        fold_assigner,
        "canonical_smiles",
        ["murcko_smiles", "sn_smiles", "fold_id", "success", "error_message"],
        ["object", "object", "int", "bool", "object"],
    )

    # Descriptor calculation also reads the "canonical_smiles" column.
    descriptor_calculator = DescriptorCalculator.from_param_dict(
        secret=secret_key,
        method_param_dict=ConfigDict.get_parameters()["fingerprint"],
        verbosity=0,
    )
    descriptor_transformer = _wrap(
        descriptor_calculator,
        "canonical_smiles",
        ["fp_feat", "fp_val", "success", "error_message"],
        ["object", "object", "bool", "object"],
    )

    return (
        output_dir,
        standardizer_transformer,
        fold_transformer,
        descriptor_transformer,
    )
def test_standardizer_calculate_single(self):
    """Standardize one ChEMBL SMILES and compare with the stored reference.

    The expected canonical SMILES is the reference output obtained on
    Feb 14th 2020.
    """
    input_smiles = (
        "Cc1ccc(cc1)S(=O)(=O)Nc2ccc(cc2)c3nc4ccc(NS(=O)(=O)c5ccc(C)cc5)cc4[nH]3"
    )
    expected_smiles = (
        "Cc1ccc(S(=O)(=O)Nc2ccc(-c3nc4cc(NS(=O)(=O)c5ccc(C)cc5)ccc4[nH]3)cc2)cc1"
    )

    standardizer = Standardizer.from_param_dict(
        method_param_dict=self.config["standardization"],
        verbosity=0,
    )
    # Only the first element of the returned result is compared.
    outcome = standardizer.calculate_single(input_smiles)
    self.assertEqual(outcome[0], expected_smiles)
def test_standardizer_pipeline(self):
    """Standardize a larger ChEMBL SMILES file and compare the output files.

    ``input/test_standardizer.csv`` is processed with ``nproc=2`` into
    temporary files under ``output/tmp``. Both the success file and the
    failure file must be byte-identical to the stored reference files in
    ``output``. The temporary files are removed even when the run or the
    comparison fails.
    """
    infile = os.path.join(curDir, "input", "test_standardizer.csv")
    outfile = os.path.join(curDir, "output", "sn_fold_output.OK.csv")
    errfile = os.path.join(curDir, "output", "sn_fold_output.failed.csv")
    tmp_dir = os.path.join(curDir, "output", "tmp")
    outfile_tmp = os.path.join(tmp_dir, "sn_fold_output.OK.csv")
    errfile_tmp = os.path.join(tmp_dir, "sn_fold_output.failed.csv")

    st = Standardizer.from_param_dict(
        method_param_dict=self.config["standardization"], verbosity=0)
    outcols = ["canonical_smiles", "success", "error_message"]
    out_types = ["object", "bool", "object"]
    dt = DfTransformer(
        st,
        input_columns={"smiles": "smiles"},
        output_columns=outcols,
        output_types=out_types,
        success_column="success",
        nproc=2,
        verbosity=0,
    )

    # NOTE: to regenerate the reference files, run
    # dt.process_file(infile, outfile, errfile) once.

    # An empty tmp directory may be missing from a fresh checkout.
    os.makedirs(tmp_dir, exist_ok=True)
    try:
        dt.process_file(infile, outfile_tmp, errfile_tmp)
        result = filecmp.cmp(outfile, outfile_tmp, shallow=False)
        error = filecmp.cmp(errfile, errfile_tmp, shallow=False)
    finally:
        # Clean up whatever was written, even if processing or comparing
        # raised part-way through.
        for tmp_path in (outfile_tmp, errfile_tmp):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    # Each file must match its reference. Comparing the two flags with each
    # other would also pass when both files are wrong.
    self.assertTrue(result, "standardized output differs from the reference file")
    self.assertTrue(error, "failed-records output differs from the reference file")
def test_standardizer_multiprocessing(self):
    """Standardization output must not depend on the ``nproc`` setting.

    The first 10 rows of the ChEMBL example file are processed with
    ``nproc=2`` and with ``nproc=4`` using the same standardizer; the two
    resulting data frames have to be equal.
    """
    df_smiles = read_csv(
        curDir / "input/chembl/chembl_23_example_T2.csv", nrows=10)
    params = self.config["standardization"]
    standardizer = Standardizer(
        max_num_atoms=params["max_num_atoms"],
        max_num_tautomers=params["max_num_tautomers"],
        include_stereoinfo=params["include_stereoinfo"],
        verbosity=0,
    )
    outcols = ["canonical_smiles", "success", "error_message"]
    out_types = ["object", "bool", "object"]

    # Build and run one transformer per worker count, in order: 2, then 4.
    frame_nproc2, frame_nproc4 = [
        DfTransformer(
            standardizer,
            input_columns={"smiles": "smiles"},
            output_columns=outcols,
            output_types=out_types,
            success_column="success",
            nproc=worker_count,
            verbosity=0,
        ).process_dataframe(df_smiles)[0]
        for worker_count in (2, 4)
    ]
    assert_frame_equal(frame_nproc2, frame_nproc4)