def main(input_filename, map_filename, output_filename, column_indices=None):
    """
    Get regression targets.

    Parameters
    ----------
    input_filename : str
        PCBA data filename.
    map_filename : str
        ID->SMILES map filename.
    output_filename : str
        Output filename.
    column_indices : list, optional
        Column indices to include. If None, compounds are classified by
        activity.
    """
    parser = PcbaParser(input_filename, map_filename,
                        column_indices=column_indices)
    if column_indices is not None:
        # single-argument print() calls behave identically on Python 2 and 3,
        # unlike the former Python-2-only print statements
        print("Extracting data from the following columns:")
        for col in parser.get_column_names():
            print('\t{}'.format(col))
    smiles, targets = parser.get_targets()

    # report the fraction of valid assay records found in the ID->SMILES map
    total = np.count_nonzero(~np.isnan(parser.read_data().PUBCHEM_CID))
    print('{}/{} records matched'.format(len(targets), total))

    # save SMILES and targets
    write_pickle({'smiles': smiles, 'targets': targets}, output_filename)
class TestPcbaParser(unittest.TestCase):
    """
    Tests for PcbaParser.

    NOTE(review): an identical class with this name is defined again later
    in the file, shadowing this one — the duplicate should be removed.
    """
    def setUp(self):
        """
        Set up tests: write an ID->SMILES map to a temporary pickle and
        construct a parser over the bundled PCBA test data.
        """
        self.temp_dir = tempfile.mkdtemp()
        self.map = {
            'CID645443':
            'Cc1ccc(-n2c3c(cc(C(=O)Nc4cccc(C)n4)c2=O)C(=O)CCC3)cc1',
            'CID2997889': 'CC(C)(C)C(=O)Nc1ccc(-c2cn3ccsc3n2)cc1',
            'CID2244': 'CC(=O)Oc1ccccc1C(=O)O',
            'CID2662': 'Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1',
            'CID3672': 'CC(C)Cc1ccc(C(C)C(=O)O)cc1'
        }
        # close the OS-level fd returned by mkstemp so it is not leaked;
        # write_pickle reopens the file by name
        fd, self.map_filename = tempfile.mkstemp(dir=self.temp_dir,
                                                 suffix='.pkl')
        os.close(fd)
        write_pickle(self.map, self.map_filename)

        # use a subset of AID588342
        # note that CID 654924 is duplicated
        this_dir = os.path.split(os.path.realpath(__file__))[0]
        self.data_filename = os.path.join(this_dir, 'data/test_pcba_data.csv')

        # set up parser
        self.engine = PcbaParser(self.data_filename, self.map_filename)

    def tearDown(self):
        """
        Clean up tests: remove the temporary directory and its contents.
        """
        shutil.rmtree(self.temp_dir)

    def test_get_targets_classification(self):
        """
        Test PcbaParser.get_targets with classification.
        """
        smiles, targets = self.engine.get_targets()
        assert len(smiles) == len(targets) == 2
        idx = np.where(smiles == self.map['CID2997889'])[0][0]
        assert targets[idx]  # marked Active
        idx = np.where(smiles == self.map['CID645443'])[0][0]
        assert not targets[idx]  # marked Inactive

    def test_get_targets_regression(self):
        """
        Test PcbaParser.get_targets with regression.
        """
        columns = [7, 8, 12, 14, 15, 20, 22, 23, 24, 25, 26]
        self.engine = PcbaParser(self.data_filename,
                                 self.map_filename,
                                 column_indices=columns)
        smiles, targets = self.engine.get_targets()
        idx = np.where(smiles == self.map['CID2997889'])[0][0]
        assert not np.any(np.isnan(targets[idx]))
        idx = np.where(smiles == self.map['CID645443'])[0][0]
        assert np.any(np.isnan(targets[idx]))  # will have NaNs
class TestPcbaParser(unittest.TestCase):
    """
    Tests for PcbaParser.

    NOTE(review): this is a duplicate definition of TestPcbaParser that
    shadows an earlier, essentially identical class — consider removing one.
    """

    def setUp(self):
        """
        Set up tests: write an ID->SMILES map to a temporary pickle and
        construct a parser over the bundled PCBA test data.
        """
        self.temp_dir = tempfile.mkdtemp()
        self.map = {
            "CID645443": "Cc1ccc(-n2c3c(cc(C(=O)Nc4cccc(C)n4)c2=O)C(=O)CCC3)cc1",
            "CID2997889": "CC(C)(C)C(=O)Nc1ccc(-c2cn3ccsc3n2)cc1",
            "CID2244": "CC(=O)Oc1ccccc1C(=O)O",
            "CID2662": "Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1",
            "CID3672": "CC(C)Cc1ccc(C(C)C(=O)O)cc1",
        }
        # close the OS-level fd returned by mkstemp so it is not leaked;
        # write_pickle reopens the file by name
        fd, self.map_filename = tempfile.mkstemp(dir=self.temp_dir, suffix=".pkl")
        os.close(fd)
        write_pickle(self.map, self.map_filename)

        # use a subset of AID588342
        # note that CID 654924 is duplicated
        this_dir = os.path.split(os.path.realpath(__file__))[0]
        self.data_filename = os.path.join(this_dir, "data/test_pcba_data.csv")

        # set up parser
        self.engine = PcbaParser(self.data_filename, self.map_filename)

    def tearDown(self):
        """
        Clean up tests: remove the temporary directory and its contents.
        """
        shutil.rmtree(self.temp_dir)

    def test_get_targets_classification(self):
        """
        Test PcbaParser.get_targets with classification.
        """
        smiles, targets = self.engine.get_targets()
        assert len(smiles) == len(targets) == 2
        idx = np.where(smiles == self.map["CID2997889"])[0][0]
        assert targets[idx]  # marked Active
        idx = np.where(smiles == self.map["CID645443"])[0][0]
        assert not targets[idx]  # marked Inactive

    def test_get_targets_regression(self):
        """
        Test PcbaParser.get_targets with regression.
        """
        columns = [7, 8, 12, 14, 15, 20, 22, 23, 24, 25, 26]
        self.engine = PcbaParser(self.data_filename, self.map_filename, column_indices=columns)
        smiles, targets = self.engine.get_targets()
        idx = np.where(smiles == self.map["CID2997889"])[0][0]
        assert not np.any(np.isnan(targets[idx]))
        idx = np.where(smiles == self.map["CID645443"])[0][0]
        assert np.any(np.isnan(targets[idx]))  # will have NaNs
 def test_read_data(self):
     """
     Test Nci60Parser.read_data.

     The NaN-aware parser should report fewer non-null cells than a plain
     PcbaParser, which does not recognize this file's NaN encoding.
     """
     # count of non-NaN cells with proper NaN handling
     fixed_count = self.engine.read_data().count().values.sum()
     # re-read the same file with PcbaParser (no proper NaN handling)
     naive_parser = PcbaParser(self.data_filename, self.map_filename,
                               delimiter="\t", primary_key="NSC",
                               id_prefix="NSC")
     broken_count = naive_parser.read_data().count().values.sum()
     assert fixed_count < broken_count
 def test_get_targets_regression(self):
     """
     Test PcbaParser.get_targets with regression.
     """
     cols = [7, 8, 12, 14, 15, 20, 22, 23, 24, 25, 26]
     self.engine = PcbaParser(self.data_filename, self.map_filename,
                              column_indices=cols)
     smiles, targets = self.engine.get_targets()
     # CID2997889 has a value in every selected column
     i = np.flatnonzero(smiles == self.map['CID2997889'])[0]
     assert not np.any(np.isnan(targets[i]))
     # CID645443 is missing values, so its targets contain NaNs
     i = np.flatnonzero(smiles == self.map['CID645443'])[0]
     assert np.any(np.isnan(targets[i]))
 def test_read_data(self):
     """
     Test Nci60Parser.read_data.

     Reads the data twice: once with the NaN-aware engine under test and
     once with a plain PcbaParser that lacks proper NaN handling, then
     checks that the naive read keeps strictly more non-null cells.
     """
     good_df = self.engine.read_data()
     fixed_count = good_df.count().values.sum()  # non-NaN cell count
     naive_engine = PcbaParser(self.data_filename,
                               self.map_filename,
                               delimiter='\t',
                               primary_key='NSC',
                               id_prefix='NSC')
     naive_df = naive_engine.read_data()
     broken_count = naive_df.count().values.sum()
     assert fixed_count < broken_count
 def test_get_targets_regression(self):
     """
     Test PcbaParser.get_targets with regression.
     """
     selected = [7, 8, 12, 14, 15, 20, 22, 23, 24, 25, 26]
     self.engine = PcbaParser(self.data_filename, self.map_filename,
                              column_indices=selected)
     smiles, targets = self.engine.get_targets()
     # fully-populated record: no NaNs expected
     row = np.flatnonzero(smiles == self.map["CID2997889"])[0]
     assert not np.any(np.isnan(targets[row]))
     # partially-populated record: NaNs expected
     row = np.flatnonzero(smiles == self.map["CID645443"])[0]
     assert np.any(np.isnan(targets[row]))
    def setUp(self):
        """
        Set up tests: write an ID->SMILES map to a temporary pickle and
        construct a parser over the bundled PCBA test data.
        """
        self.temp_dir = tempfile.mkdtemp()
        self.map = {
            'CID645443':
            'Cc1ccc(-n2c3c(cc(C(=O)Nc4cccc(C)n4)c2=O)C(=O)CCC3)cc1',
            'CID2997889': 'CC(C)(C)C(=O)Nc1ccc(-c2cn3ccsc3n2)cc1',
            'CID2244': 'CC(=O)Oc1ccccc1C(=O)O',
            'CID2662': 'Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1',
            'CID3672': 'CC(C)Cc1ccc(C(C)C(=O)O)cc1'
        }
        # close the OS-level fd returned by mkstemp so it is not leaked;
        # write_pickle reopens the file by name
        fd, self.map_filename = tempfile.mkstemp(dir=self.temp_dir,
                                                 suffix='.pkl')
        os.close(fd)
        write_pickle(self.map, self.map_filename)

        # use a subset of AID588342
        # note that CID 654924 is duplicated
        this_dir = os.path.split(os.path.realpath(__file__))[0]
        self.data_filename = os.path.join(this_dir, 'data/test_pcba_data.csv')

        # set up parser
        self.engine = PcbaParser(self.data_filename, self.map_filename)
    def setUp(self):
        """
        Set up tests: write an ID->SMILES map to a temporary pickle and
        construct a parser over the bundled PCBA test data.
        """
        self.temp_dir = tempfile.mkdtemp()
        self.map = {
            "CID645443": "Cc1ccc(-n2c3c(cc(C(=O)Nc4cccc(C)n4)c2=O)C(=O)CCC3)cc1",
            "CID2997889": "CC(C)(C)C(=O)Nc1ccc(-c2cn3ccsc3n2)cc1",
            "CID2244": "CC(=O)Oc1ccccc1C(=O)O",
            "CID2662": "Cc1ccc(-c2cc(C(F)(F)F)nn2-c2ccc(S(N)(=O)=O)cc2)cc1",
            "CID3672": "CC(C)Cc1ccc(C(C)C(=O)O)cc1",
        }
        # close the OS-level fd returned by mkstemp so it is not leaked;
        # write_pickle reopens the file by name
        fd, self.map_filename = tempfile.mkstemp(dir=self.temp_dir, suffix=".pkl")
        os.close(fd)
        write_pickle(self.map, self.map_filename)

        # use a subset of AID588342
        # note that CID 654924 is duplicated
        this_dir = os.path.split(os.path.realpath(__file__))[0]
        self.data_filename = os.path.join(this_dir, "data/test_pcba_data.csv")

        # set up parser
        self.engine = PcbaParser(self.data_filename, self.map_filename)