import os

import numpy as np
import pandas as pd
from scipy import sparse

# Assumed import: split_and_save_data is the function under test from the
# package's training utilities (the exact module path depends on the
# project layout).
from aizynthfinder.training.utils import split_and_save_data


def test_split_and_save_data_frame(mocker, tmpdir, default_config):
    default_config["output_path"] = str(tmpdir)
    default_config["file_prefix"] = "dummy"
    filename_train = str(
        tmpdir / "dummy" + default_config["file_postfix"]["training_library"]
    )
    filename_valid = str(
        tmpdir / "dummy" + default_config["file_postfix"]["validation_library"]
    )
    filename_test = str(
        tmpdir / "dummy" + default_config["file_postfix"]["testing_library"]
    )
    data = pd.DataFrame.from_dict({"one": np.zeros(100), "two": np.ones(100)})

    split_and_save_data(data, "library", default_config)

    assert os.path.exists(filename_train)
    assert os.path.exists(filename_valid)
    assert os.path.exists(filename_test)

    data_read = pd.read_csv(filename_train, header=None, names=["one", "two"])
    assert len(data_read) == 90
    data_read = pd.read_csv(filename_valid, header=None, names=["one", "two"])
    assert len(data_read) == 5
    data_read = pd.read_csv(filename_test, header=None, names=["one", "two"])
    assert len(data_read) == 5
def test_split_and_save_data_sparse(default_config, mocker, tmpdir):
    default_config["output_path"] = str(tmpdir)
    default_config["file_prefix"] = "dummy"
    filename_train = str(
        tmpdir / "dummy" + default_config["file_postfix"]["training_inputs"]
    )
    filename_valid = str(
        tmpdir / "dummy" + default_config["file_postfix"]["validation_inputs"]
    )
    filename_test = str(
        tmpdir / "dummy" + default_config["file_postfix"]["testing_inputs"]
    )
    data = sparse.csr_matrix(np.ones([100, 2]))

    split_and_save_data(data, "inputs", default_config)

    assert os.path.exists(filename_train)
    assert os.path.exists(filename_valid)
    assert os.path.exists(filename_test)

    data_read = sparse.load_npz(str(filename_train))
    assert data_read.shape[0] == 90
    data_read = sparse.load_npz(str(filename_valid))
    assert data_read.shape[0] == 5
    data_read = sparse.load_npz(str(filename_test))
    assert data_read.shape[0] == 5
def test_split_and_save_data_ndarray(mocker, tmpdir, default_config):
    default_config["output_path"] = str(tmpdir)
    default_config["file_prefix"] = "dummy"
    filename_train = str(
        tmpdir / "dummy" + default_config["file_postfix"]["training_inputs"]
    )
    filename_valid = str(
        tmpdir / "dummy" + default_config["file_postfix"]["validation_inputs"]
    )
    filename_test = str(
        tmpdir / "dummy" + default_config["file_postfix"]["testing_inputs"]
    )
    data = np.ones([100, 2])

    split_and_save_data(data, "inputs", default_config)

    assert os.path.exists(filename_train)
    assert os.path.exists(filename_valid)
    assert os.path.exists(filename_test)

    data_read = np.load(filename_train)["arr_0"]
    assert len(data_read) == 90
    data_read = np.load(filename_valid)["arr_0"]
    assert len(data_read) == 5
    data_read = np.load(filename_test)["arr_0"]
    assert len(data_read) == 5
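# The three tests above pin down one contract for split_and_save_data: a
# 90/5/5 row-wise split, written as CSV for a DataFrame, a .npz sparse matrix
# for a csr_matrix, and an "arr_0" entry in a .npz archive for an ndarray.
# The sketch below is a minimal illustration of that contract, not the
# project's actual implementation: the fixed train_frac default and the lack
# of shuffling are assumptions (the real function may read its split
# fractions from the config and permute the rows first).
def _split_and_save_data_sketch(data, data_type, config, train_frac=0.9):
    n_rows = data.shape[0]
    train_end = int(train_frac * n_rows)
    valid_end = train_end + (n_rows - train_end) // 2  # remainder split evenly
    subsets = {
        "training": slice(0, train_end),
        "validation": slice(train_end, valid_end),
        "testing": slice(valid_end, n_rows),
    }
    for subset, rows in subsets.items():
        postfix = config["file_postfix"][subset + "_" + data_type]
        filename = os.path.join(
            config["output_path"], config["file_prefix"] + postfix
        )
        chunk = data[rows]
        if isinstance(data, pd.DataFrame):
            chunk.to_csv(filename, header=False, index=False)
        elif isinstance(data, sparse.csr_matrix):
            sparse.save_npz(filename, chunk, compressed=True)
        else:  # plain ndarray, stored as arr_0 in a .npz archive
            np.savez(filename, chunk)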
def main(): """ Entry-point for the preprocess_recommender tool """ config = _get_config() filename = config.filename("library") dataset = pd.read_csv( filename, index_col=False, header=None, names=config["library_headers"], ) print("Dataset loaded, generating Labels...", flush=True) lb = LabelBinarizer(neg_label=0, pos_label=1, sparse_output=True) labels = lb.fit_transform(dataset["template_hash"]) split_and_save_data(labels, "labels", config) print("Labels created and splitted, generating Inputs...", flush=True) reactants = dataset["reactants"].to_numpy() inputs = np.apply_along_axis(reactants_to_fingerprint, 0, [reactants], config) inputs = sparse.lil_matrix(inputs.T).tocsr() split_and_save_data(inputs, "inputs", config) print("Inputs created and splitted, splitting Full Dataset...", flush=True) split_and_save_data(dataset, "library", config) print("Full Dataset splitted, creating unique template set", flush=True) _save_unique_templates(dataset, config)
def main() -> None: """Entry-point for the preprocess_expansion tool""" config = _get_config() if config["library_headers"][-1] != "template_code": config["library_headers"].append("template_code") filename = config.filename("library") if not os.path.exists(filename): dataset = _filter_dataset(config) else: dataset = pd.read_csv( filename, index_col=False, header=None, names=config["library_headers"], ) print("Dataset filtered/loaded, generating labels...", flush=True) labelb = LabelBinarizer(neg_label=0, pos_label=1, sparse_output=True) labels = labelb.fit_transform(dataset["template_hash"]) split_and_save_data(labels, "labels", config) print("Labels created and split, generating inputs...", flush=True) products = dataset["products"].to_numpy() inputs = np.apply_along_axis(smiles_to_fingerprint, 0, [products], config) inputs = sparse.lil_matrix(inputs.T).tocsr() split_and_save_data(inputs, "inputs", config) print("Inputs created and split, splitting full Dataset...", flush=True) split_and_save_data(dataset, "library", config) print("Full Dataset split, creating unique template set", flush=True) _save_unique_templates(dataset, config)
def main() -> None: """Entry-point for the preprocess_filter tool""" config = _get_config() true_dataset = pd.read_csv( config.filename("library"), index_col=False, header=None, names=config["library_headers"][:-1], ) true_dataset["true_product"] = 1 false_dataset = pd.read_csv( config.filename("false_library"), index_col=False, header=None, names=config["library_headers"][:-1], ) false_dataset["true_product"] = 0 dataset = true_dataset.append(false_dataset, sort=False) print("Dataset loaded, generating Labels...", flush=True) labels = dataset["true_product"].to_numpy() split_and_save_data(labels, "labels", config) print("Labels created and split, generating Inputs...", flush=True) products = dataset["products"].to_numpy() reactants = dataset["reactants"].to_numpy() inputs = np.apply_along_axis( reaction_to_fingerprints, 0, [products, reactants], config ).astype(np.int8) inputs = sparse.lil_matrix(inputs.T).tocsr() split_and_save_data(inputs, "inputs2", config) inputs = np.apply_along_axis(smiles_to_fingerprint, 0, [products], config).astype( np.int8 ) inputs = sparse.lil_matrix(inputs.T).tocsr() split_and_save_data(inputs, "inputs", config) print("Inputs created and split, splitting Full Dataset...", flush=True) split_and_save_data(dataset, "library", config)