def main(input_filename, map_filename, output_filename, column_indices=None): """ Get regression targets. Parameters ---------- input_filename : str PCBA data filename. map_filename : str ID->SMILES map filename. output_filename : str Output filename. column_indices : list, optional Column indices to include. If None, compounds are classified by activity. """ parser = PcbaParser(input_filename, map_filename, column_indices=column_indices) if column_indices is not None: print "Extracting data from the following columns:" for col in parser.get_column_names(): print '\t', col smiles, targets = parser.get_targets() # print the fraction of valid assay records that were found in the map total = np.count_nonzero(~np.isnan(parser.read_data().PUBCHEM_CID)) print '{}/{} records matched'.format(len(targets), total) # save SMILES and targets write_pickle({'smiles': smiles, 'targets': targets}, output_filename)
class TestPcbaParser(unittest.TestCase):
    """
    Tests for PcbaParser.
    """
    def setUp(self):
        """
        Set up tests: build a CID->SMILES map pickle in a temp dir and
        point a parser at the bundled PCBA data subset.
        """
        self.temp_dir = tempfile.mkdtemp()
        self.map = {
            'CID645443':
                'Cc1ccc(-n2c3c(cc(C(=O)Nc4cccc(C)n4)c2=O)C(=O)CCC3)cc1',
            'CID2997889': 'CC(C)(C)C(=O)Nc1ccc(-c2cn3ccsc3n2)cc1',
            'CID2244': 'CC(=O)Oc1ccccc1C(=O)O',
            'CID2662': 'Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1',
            'CID3672': 'CC(C)Cc1ccc(C(C)C(=O)O)cc1'}
        _, self.map_filename = tempfile.mkstemp(dir=self.temp_dir,
                                                suffix='.pkl')
        write_pickle(self.map, self.map_filename)

        # use a subset of AID588342
        # note that CID 654924 is duplicated
        test_dir = os.path.split(os.path.realpath(__file__))[0]
        self.data_filename = os.path.join(test_dir, 'data/test_pcba_data.csv')

        # set up parser
        self.engine = PcbaParser(self.data_filename, self.map_filename)

    def tearDown(self):
        """
        Clean up tests: remove the temporary directory.
        """
        shutil.rmtree(self.temp_dir)

    def test_get_targets_classification(self):
        """
        Test PcbaParser.get_targets with classification.
        """
        smiles, targets = self.engine.get_targets()
        assert len(smiles) == len(targets) == 2
        active_idx = np.where(smiles == self.map['CID2997889'])[0][0]
        assert targets[active_idx]  # marked Active
        inactive_idx = np.where(smiles == self.map['CID645443'])[0][0]
        assert not targets[inactive_idx]  # marked Inactive

    def test_get_targets_regression(self):
        """
        Test PcbaParser.get_targets with regression.
        """
        selected = [7, 8, 12, 14, 15, 20, 22, 23, 24, 25, 26]
        self.engine = PcbaParser(self.data_filename, self.map_filename,
                                 column_indices=selected)
        smiles, targets = self.engine.get_targets()
        complete_idx = np.where(smiles == self.map['CID2997889'])[0][0]
        assert not np.any(np.isnan(targets[complete_idx]))
        sparse_idx = np.where(smiles == self.map['CID645443'])[0][0]
        assert np.any(np.isnan(targets[sparse_idx]))  # will have NaNs
class TestPcbaParser(unittest.TestCase):
    """
    Tests for PcbaParser.
    """
    def setUp(self):
        """
        Set up tests: write a CID->SMILES map to a temp pickle and
        construct a parser for the bundled PCBA data subset.
        """
        self.temp_dir = tempfile.mkdtemp()
        self.map = {
            "CID645443":
                "Cc1ccc(-n2c3c(cc(C(=O)Nc4cccc(C)n4)c2=O)C(=O)CCC3)cc1",
            "CID2997889": "CC(C)(C)C(=O)Nc1ccc(-c2cn3ccsc3n2)cc1",
            "CID2244": "CC(=O)Oc1ccccc1C(=O)O",
            "CID2662": "Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1",
            "CID3672": "CC(C)Cc1ccc(C(C)C(=O)O)cc1",
        }
        _, self.map_filename = tempfile.mkstemp(dir=self.temp_dir,
                                                suffix=".pkl")
        write_pickle(self.map, self.map_filename)

        # use a subset of AID588342
        # note that CID 654924 is duplicated
        here = os.path.split(os.path.realpath(__file__))[0]
        self.data_filename = os.path.join(here, "data/test_pcba_data.csv")

        # set up parser
        self.engine = PcbaParser(self.data_filename, self.map_filename)

    def tearDown(self):
        """
        Clean up tests: delete the temporary directory tree.
        """
        shutil.rmtree(self.temp_dir)

    def test_get_targets_classification(self):
        """
        Test PcbaParser.get_targets with classification.
        """
        smiles, targets = self.engine.get_targets()
        assert len(smiles) == len(targets) == 2
        idx_active = np.where(smiles == self.map["CID2997889"])[0][0]
        assert targets[idx_active]  # marked Active
        idx_inactive = np.where(smiles == self.map["CID645443"])[0][0]
        assert not targets[idx_inactive]  # marked Inactive

    def test_get_targets_regression(self):
        """
        Test PcbaParser.get_targets with regression.
        """
        wanted_columns = [7, 8, 12, 14, 15, 20, 22, 23, 24, 25, 26]
        self.engine = PcbaParser(self.data_filename, self.map_filename,
                                 column_indices=wanted_columns)
        smiles, targets = self.engine.get_targets()
        idx = np.where(smiles == self.map["CID2997889"])[0][0]
        assert not np.any(np.isnan(targets[idx]))
        idx = np.where(smiles == self.map["CID645443"])[0][0]
        assert np.any(np.isnan(targets[idx]))  # will have NaNs
def test_read_data(self):
    """
    Test Nci60Parser.read_data.

    The fixed reader should report fewer non-NaN cells than a PcbaParser
    reading the same file without proper NaN handling.
    """
    fixed_df = self.engine.read_data()
    fixed_count = fixed_df.count().values.sum()  # count excluding NaNs

    # use PcbaParser to read data (w/o proper NaN handling)
    naive_engine = PcbaParser(self.data_filename, self.map_filename,
                              delimiter="\t", primary_key="NSC",
                              id_prefix="NSC")
    naive_df = naive_engine.read_data()
    broken_count = naive_df.count().values.sum()
    assert fixed_count < broken_count
def test_get_targets_regression(self):
    """
    Test PcbaParser.get_targets with regression.
    """
    selected_columns = [7, 8, 12, 14, 15, 20, 22, 23, 24, 25, 26]
    self.engine = PcbaParser(self.data_filename, self.map_filename,
                             column_indices=selected_columns)
    smiles, targets = self.engine.get_targets()

    # this compound has a complete record: no NaN targets expected
    complete = np.where(smiles == self.map['CID2997889'])[0][0]
    assert not np.any(np.isnan(targets[complete]))

    # this compound will have NaNs in its target vector
    sparse = np.where(smiles == self.map['CID645443'])[0][0]
    assert np.any(np.isnan(targets[sparse]))
def test_read_data(self):
    """
    Test Nci60Parser.read_data.

    Reading with proper NaN handling should yield a smaller non-NaN cell
    count than reading the same file with PcbaParser, which lacks it.
    """
    good_df = self.engine.read_data()
    fixed_count = good_df.count().values.sum()  # count excluding NaNs

    # use PcbaParser to read data (w/o proper NaN handling)
    raw_engine = PcbaParser(self.data_filename, self.map_filename,
                            delimiter='\t', primary_key='NSC',
                            id_prefix='NSC')
    raw_df = raw_engine.read_data()
    broken_count = raw_df.count().values.sum()
    assert fixed_count < broken_count
def test_get_targets_regression(self):
    """
    Test PcbaParser.get_targets with regression.
    """
    regression_columns = [7, 8, 12, 14, 15, 20, 22, 23, 24, 25, 26]
    self.engine = PcbaParser(self.data_filename, self.map_filename,
                             column_indices=regression_columns)
    smiles, targets = self.engine.get_targets()

    # complete record: targets should contain no NaNs
    row = np.where(smiles == self.map["CID2997889"])[0][0]
    assert not np.any(np.isnan(targets[row]))

    # incomplete record: targets will have NaNs
    row = np.where(smiles == self.map["CID645443"])[0][0]
    assert np.any(np.isnan(targets[row]))
def setUp(self):
    """
    Set up tests: pickle a CID->SMILES map into a temp directory and
    build a parser over the bundled PCBA data subset.
    """
    self.temp_dir = tempfile.mkdtemp()
    self.map = {
        'CID645443':
            'Cc1ccc(-n2c3c(cc(C(=O)Nc4cccc(C)n4)c2=O)C(=O)CCC3)cc1',
        'CID2997889': 'CC(C)(C)C(=O)Nc1ccc(-c2cn3ccsc3n2)cc1',
        'CID2244': 'CC(=O)Oc1ccccc1C(=O)O',
        'CID2662': 'Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1',
        'CID3672': 'CC(C)Cc1ccc(C(C)C(=O)O)cc1'}
    _, self.map_filename = tempfile.mkstemp(dir=self.temp_dir,
                                            suffix='.pkl')
    write_pickle(self.map, self.map_filename)

    # use a subset of AID588342
    # note that CID 654924 is duplicated
    base_dir = os.path.split(os.path.realpath(__file__))[0]
    self.data_filename = os.path.join(base_dir, 'data/test_pcba_data.csv')

    # set up parser
    self.engine = PcbaParser(self.data_filename, self.map_filename)
def setUp(self):
    """
    Set up tests: create a temporary CID->SMILES map pickle and point a
    parser at the bundled PCBA data subset.
    """
    self.temp_dir = tempfile.mkdtemp()
    self.map = {
        "CID645443":
            "Cc1ccc(-n2c3c(cc(C(=O)Nc4cccc(C)n4)c2=O)C(=O)CCC3)cc1",
        "CID2997889": "CC(C)(C)C(=O)Nc1ccc(-c2cn3ccsc3n2)cc1",
        "CID2244": "CC(=O)Oc1ccccc1C(=O)O",
        "CID2662": "Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1",
        "CID3672": "CC(C)Cc1ccc(C(C)C(=O)O)cc1",
    }
    _, self.map_filename = tempfile.mkstemp(dir=self.temp_dir,
                                            suffix=".pkl")
    write_pickle(self.map, self.map_filename)

    # use a subset of AID588342
    # note that CID 654924 is duplicated
    module_dir = os.path.split(os.path.realpath(__file__))[0]
    self.data_filename = os.path.join(module_dir,
                                      "data/test_pcba_data.csv")

    # set up parser
    self.engine = PcbaParser(self.data_filename, self.map_filename)