示例#1
0
  def test_get_data(self):
    """
    Test get_data.

    Exercises the optional SID->CID mapping and the `lower` flag on the
    AID 998 fixture.
    """
    sid_mapped = 11110959
    sid_unmapped = 11111313

    def first_row(frame, sid):
      # First record whose 'sid' equals the requested substance ID.
      return frame[frame['sid'] == sid].iloc[0]

    extractor = PcbaDataExtractor(
        self.aid998,
        {'target': '1296534', 'r2': 'Fit_R2', 'phenotype': 'in'},
        with_aid=True)

    # without a mapping there is no CID column at all
    assert 'cid' not in extractor.get_data().columns

    # with a mapping, a SID absent from it is expected to get a null CID
    mapped = extractor.get_data(sid_cid={sid_mapped: 1730})
    assert first_row(mapped, sid_mapped)['cid'] == 1730
    assert pd.isnull(first_row(mapped, sid_unmapped)['cid'])

    # `lower` only affects the phenotype's case; potency is the same
    expectations = ((True, 'inhibitor'), (False, 'Inhibitor'))
    for lower, expected_phenotype in expectations:
      record = first_row(extractor.get_data(lower=lower), sid_mapped)
      assert record['phenotype'] == expected_phenotype
      assert record['potency'] == 4.4668
示例#2
0
    def test_check_config(self):
        """
        Test _check_config.

        The same requested configuration is applied to two assay fixtures
        and the resulting `engine.config` is inspected for each.
        """
        requested = dict(target='1296534',
                         potency='Activity at 10 uM',
                         phenotype='in')

        # without common fields: phenotype is not kept in the config but is
        # still available as an attribute
        bare = PcbaDataExtractor(self.aid540325, requested, with_aid=True)
        assert bare.config['target'] == 'gi_1296534'
        assert 'phenotype' not in bare.config
        assert bare.phenotype == 'inhibitor'

        # with common fields: the config ends up pointing at the assay's
        # own 'Phenotype' / 'Potency' / 'Efficacy' columns
        common = PcbaDataExtractor(self.aid998, requested, with_aid=True)
        resolved = common.config
        assert resolved['target'] == 'gi_1296534'
        assert 'phenotype' in resolved
        for key, column in (('phenotype', 'Phenotype'),
                            ('potency', 'Potency'),
                            ('efficacy', 'Efficacy')):
            assert resolved[key] == column
示例#3
0
    def test_get_data(self):
        """
        Test get_data.

        Checks the SID->CID mapping and the effect of `lower` on the
        records returned for the AID 998 fixture.
        """
        known_sid, other_sid = 11110959, 11111313
        settings = {'target': '1296534', 'r2': 'Fit_R2', 'phenotype': 'in'}
        engine = PcbaDataExtractor(self.aid998, settings, with_aid=True)

        # check SID->CID mapping: no 'cid' column unless a map is supplied
        unmapped = engine.get_data()
        assert 'cid' not in unmapped.columns

        mapped = engine.get_data(sid_cid={known_sid: 1730})
        known_rows = mapped[mapped['sid'] == known_sid]
        other_rows = mapped[mapped['sid'] == other_sid]
        assert known_rows.iloc[0]['cid'] == 1730
        assert pd.isnull(other_rows.iloc[0]['cid'])

        # check lowercase: phenotype is lowercased, potency is unchanged
        lowered = engine.get_data(lower=True)
        hit = lowered[lowered['sid'] == known_sid].iloc[0]
        assert hit['phenotype'] == 'inhibitor'
        assert hit['potency'] == 4.4668

        # original case is preserved when lowercasing is disabled
        verbatim = engine.get_data(lower=False)
        hit = verbatim[verbatim['sid'] == known_sid].iloc[0]
        assert hit['phenotype'] == 'Inhibitor'
        assert hit['potency'] == 4.4668
示例#4
0
def main(dirs, config_filename, map_filename=None, summary_filename=None,
         with_aid=True, with_target=True, phenotype=False, id_prefix='CID',
         output_format='.pkl.gz'):
  """
  Extract data for the configured assays and write one table per assay.

  Every '*.json.gz' file found directly inside each directory of `dirs` is
  considered; the AID is taken from the part of the file name before the
  first '.'. Files whose AID is not listed in the configuration are ignored.

  Parameters
  ----------
  dirs : iterable of str
      Directories to search for '*.json.gz' assay files.
  config_filename : str
      CSV file read with pandas. Must contain an 'aid' column with unique
      values; the row for an AID is passed to PcbaDataExtractor.
  map_filename : str, optional
      If given, it is read with read_sid_cid_map and the result is passed to
      the extractor as `sid_cid`.
  summary_filename : str, optional
      If given, a summary table (aid, target, filename, size) is written
      there with write_dataframe.
  with_aid : bool, optional
      Forwarded to PcbaDataExtractor. When False the 'aid' field is removed
      from the per-assay configuration; when True an 'assay_id' column is
      added to the output.
  with_target : bool, optional
      When False the 'target' field is removed from the per-assay
      configuration.
  phenotype : bool, optional
      When True, assays whose extractor configuration has no 'phenotype'
      entry are skipped with a warning.
  id_prefix : str, optional
      'CID' or 'SID'. Selects the 'cid' or 'sid' column and is prepended to
      each integer ID to build the 'mol_id' column.
  output_format : str, optional
      Extension of the per-assay output files. A leading '.' is optional.

  Raises
  ------
  ValueError
      If the configuration file has no 'aid' column.
  AssertionError
      If the 'aid' column has duplicates, or a file's AID does not match the
      AID reported by its parser.
  NotImplementedError
      If `id_prefix` is not recognized (raised when the first matching
      assay is processed).
  """
  aids = set()
  targets = set()
  total = 0
  config = pd.read_csv(config_filename)
  summary = []
  sid_cid = None
  if map_filename is not None:
    sid_cid = read_sid_cid_map(map_filename)
  if 'aid' not in config.columns:
    raise ValueError('Configuration file must contain "aid" column.')
  assert len(config) == len(pd.unique(config['aid']))

  # built once: used for the per-file membership test and the final report
  configured_aids = set(config['aid'])

  # the default carries a leading dot, which would otherwise yield names
  # such as 'PCBA-998..pkl.gz'
  extension = output_format.lstrip('.')

  for this_dir in dirs:
    for filename in glob.glob(os.path.join(this_dir, '*.json.gz')):

      # get AID from filename so we only have to load relevant assays
      aid = int(os.path.basename(filename).split('.')[0])
      if aid not in configured_aids:
        continue

      # get configuration for this AID
      this_config = config[config['aid'] == aid].iloc[0]
      if not with_aid and 'aid' in this_config:
        del this_config['aid']
      if not with_target and 'target' in this_config:
        del this_config['target']

      # get data
      try:
        extractor = PcbaDataExtractor(filename, this_config, with_aid=with_aid)
      except NotImplementedError as e:
        # str(e) instead of the deprecated e.message attribute
        warnings.warn(str(e))
        continue
      if phenotype and 'phenotype' not in extractor.config:
        warnings.warn('{} has no phenotype'.format(aid))
        continue
      assert aid == extractor.parser.get_aid()  # sanity check for AID match
      aids.add(aid)
      target = extractor.config.get('target')
      targets.add(target)
      data = extractor.get_data(sid_cid=sid_cid)
      total += len(data)

      # add generic molecule ID column
      if id_prefix == 'CID':
        col = 'cid'
      elif id_prefix == 'SID':
        col = 'sid'
      else:
        raise NotImplementedError('Unrecognized ID prefix "{}"'.format(
            id_prefix))
      ids = []
      for i, mol_id in enumerate(data[col]):
        try:
          ids.append(id_prefix + str(int(mol_id)))
        except (TypeError, ValueError):
          # `i` is a position, not an index label, so use iloc
          warnings.warn('No ID for the following row:\n{}'.format(
              data.iloc[i]))
          ids.append(None)  # can be found with pd.isnull

      # skip this assay if there are no valid IDs
      if np.all(pd.isnull(ids)):
        warnings.warn('No valid IDs for AID {}. Skipping.'.format(aid))
        continue
      data.loc[:, 'mol_id'] = pd.Series(ids, index=data.index)

      # add generic assay ID column
      assay_id = 'PCBA-' + str(aid)
      if with_aid:
        data.loc[:, 'assay_id'] = assay_id

      # save dataframe
      output_filename = '{}.{}'.format(assay_id, extension)
      # single-argument call form prints identically on Python 2 and 3
      print('{}\t{}\t{}\t{}'.format(aid, target, output_filename, len(data)))
      write_dataframe(data, output_filename)
      summary.append({'aid': aid, 'target': target,
                      'filename': output_filename, 'size': len(data)})

  # make sure we found everything
  missing = configured_aids.difference(aids)
  if missing:
    warnings.warn('Missed AIDs {}'.format(missing))

  # save a summary
  summary = pd.DataFrame(summary)
  if summary_filename is not None:
    write_dataframe(summary, summary_filename)
  warnings.warn('Found {} assays for {} targets ({} total data points)'.format(
      len(aids), len(targets), total))
示例#5
0
def main(dirs,
         config_filename,
         map_filename=None,
         summary_filename=None,
         with_aid=True,
         with_target=True,
         phenotype=False,
         id_prefix='CID',
         output_format='.pkl.gz'):
    """
    Extract data for the configured assays and write one table per assay.

    Every '*.json.gz' file found directly inside each directory of `dirs`
    is considered; the AID is taken from the part of the file name before
    the first '.'. Files whose AID is not listed in the configuration are
    ignored.

    Parameters
    ----------
    dirs : iterable of str
        Directories to search for '*.json.gz' assay files.
    config_filename : str
        CSV file read with pandas. Must contain an 'aid' column with unique
        values; the row for an AID is passed to PcbaDataExtractor.
    map_filename : str, optional
        If given, it is read with read_sid_cid_map and the result is passed
        to the extractor as `sid_cid`.
    summary_filename : str, optional
        If given, a summary table (aid, target, filename, size) is written
        there with write_dataframe.
    with_aid : bool, optional
        Forwarded to PcbaDataExtractor. When False the 'aid' field is
        removed from the per-assay configuration; when True an 'assay_id'
        column is added to the output.
    with_target : bool, optional
        When False the 'target' field is removed from the per-assay
        configuration.
    phenotype : bool, optional
        When True, assays whose extractor configuration has no 'phenotype'
        entry are skipped with a warning.
    id_prefix : str, optional
        'CID' or 'SID'. Selects the 'cid' or 'sid' column and is prepended
        to each integer ID to build the 'mol_id' column.
    output_format : str, optional
        Extension of the per-assay output files. A leading '.' is optional.

    Raises
    ------
    ValueError
        If the configuration file has no 'aid' column.
    AssertionError
        If the 'aid' column has duplicates, or a file's AID does not match
        the AID reported by its parser.
    NotImplementedError
        If `id_prefix` is not recognized (raised when the first matching
        assay is processed).
    """
    aids = set()
    targets = set()
    total = 0
    config = pd.read_csv(config_filename)
    summary = []
    sid_cid = None
    if map_filename is not None:
        sid_cid = read_sid_cid_map(map_filename)
    if 'aid' not in config.columns:
        raise ValueError('Configuration file must contain "aid" column.')
    assert len(config) == len(pd.unique(config['aid']))

    # built once: used for the per-file membership test and the final report
    configured_aids = set(config['aid'])

    # the default carries a leading dot, which would otherwise yield names
    # such as 'PCBA-998..pkl.gz'
    extension = output_format.lstrip('.')

    for this_dir in dirs:
        for filename in glob.glob(os.path.join(this_dir, '*.json.gz')):

            # get AID from filename so we only have to load relevant assays
            aid = int(os.path.basename(filename).split('.')[0])
            if aid not in configured_aids:
                continue

            # get configuration for this AID
            this_config = config[config['aid'] == aid].iloc[0]
            if not with_aid and 'aid' in this_config:
                del this_config['aid']
            if not with_target and 'target' in this_config:
                del this_config['target']

            # get data
            try:
                extractor = PcbaDataExtractor(filename,
                                              this_config,
                                              with_aid=with_aid)
            except NotImplementedError as e:
                # str(e) instead of the deprecated e.message attribute
                warnings.warn(str(e))
                continue
            if phenotype and 'phenotype' not in extractor.config:
                warnings.warn('{} has no phenotype'.format(aid))
                continue
            # sanity check for AID match
            assert aid == extractor.parser.get_aid()
            aids.add(aid)
            target = extractor.config.get('target')
            targets.add(target)
            data = extractor.get_data(sid_cid=sid_cid)
            total += len(data)

            # add generic molecule ID column
            if id_prefix == 'CID':
                col = 'cid'
            elif id_prefix == 'SID':
                col = 'sid'
            else:
                raise NotImplementedError(
                    'Unrecognized ID prefix "{}"'.format(id_prefix))
            ids = []
            for i, mol_id in enumerate(data[col]):
                try:
                    ids.append(id_prefix + str(int(mol_id)))
                except (TypeError, ValueError):
                    # `i` is a position, not an index label, so use iloc
                    warnings.warn('No ID for the following row:\n{}'.format(
                        data.iloc[i]))
                    ids.append(None)  # can be found with pd.isnull

            # skip this assay if there are no valid IDs
            if np.all(pd.isnull(ids)):
                warnings.warn('No valid IDs for AID {}. Skipping.'.format(aid))
                continue
            data.loc[:, 'mol_id'] = pd.Series(ids, index=data.index)

            # add generic assay ID column
            assay_id = 'PCBA-' + str(aid)
            if with_aid:
                data.loc[:, 'assay_id'] = assay_id

            # save dataframe
            output_filename = '{}.{}'.format(assay_id, extension)
            # single-argument call form prints identically on Python 2 and 3
            print('{}\t{}\t{}\t{}'.format(aid, target, output_filename,
                                          len(data)))
            write_dataframe(data, output_filename)
            summary.append({
                'aid': aid,
                'target': target,
                'filename': output_filename,
                'size': len(data)
            })

    # make sure we found everything
    missing = configured_aids.difference(aids)
    if missing:
        warnings.warn('Missed AIDs {}'.format(missing))

    # save a summary
    summary = pd.DataFrame(summary)
    if summary_filename is not None:
        write_dataframe(summary, summary_filename)
    warnings.warn(
        'Found {} assays for {} targets ({} total data points)'.format(
            len(aids), len(targets), total))