def test_read_sid_cid_map():
    """
    Test read_sid_cid_map.

    Writes the same tab-separated SID/CID record to a plain text file and to
    a gzipped file, then checks that read_sid_cid_map returns the expected
    integer mapping for both.
    """
    # The files are opened in binary mode, so the payload must be bytes;
    # a bytes literal behaves the same on Python 2 and Python 3.
    record = b'123456\t7890\n'
    paths = []
    try:
        # Create the temporary files inside the try block so that anything
        # created before a failure is still removed in the finally clause.
        for suffix in ('.txt', '.txt.gz'):
            handle, path = tempfile.mkstemp(suffix=suffix)
            paths.append(path)
            os.close(handle)
        plain_path, gzip_path = paths
        with open(plain_path, 'wb') as f:
            f.write(record)
        with gzip.open(gzip_path, 'wb') as g:
            g.write(record)
        assert read_sid_cid_map(plain_path) == {123456: 7890}
        assert read_sid_cid_map(gzip_path) == {123456: 7890}
    finally:
        for path in paths:
            os.remove(path)
# NOTE(review): a second definition of `main` with the same signature follows
# this one in the file and rebinds the name; confirm which copy should stay.
def main(dirs, config_filename, map_filename=None, summary_filename=None,
         with_aid=True, with_target=True, phenotype=False, id_prefix='CID',
         output_format='.pkl.gz'):
    """
    Extract data for the configured assays and write one file per assay.

    Parameters
    ----------
    dirs : iterable of str
        Directories searched for '*.json.gz' assay files. The AID is taken
        from the part of each file name before the first '.'.
    config_filename : str
        CSV file read with pandas. Must contain an 'aid' column with no
        repeated values.
    map_filename : str, optional
        If given, passed to read_sid_cid_map; the result is forwarded to the
        extractor's get_data call as `sid_cid`.
    summary_filename : str, optional
        If given, a summary dataframe (aid, target, filename, size) is
        written there with write_dataframe.
    with_aid : bool
        If False, 'aid' is removed from the per-assay configuration. Also
        forwarded to PcbaDataExtractor and controls whether an 'assay_id'
        column is added to the output.
    with_target : bool
        If False, 'target' is removed from the per-assay configuration.
    phenotype : bool
        If True, assays whose extractor config has no 'phenotype' entry are
        skipped with a warning.
    id_prefix : str
        'CID' or 'SID'; selects the 'cid' or 'sid' column and is prepended
        to each integer ID to build the 'mol_id' column.
    output_format : str
        Extension for per-assay output files. A leading '.' is optional.

    Raises
    ------
    ValueError
        If the configuration lacks an 'aid' column or repeats an AID.
    NotImplementedError
        If id_prefix is not 'CID' or 'SID'.
    """
    # Resolve the ID column once; it does not depend on the assay.
    if id_prefix == 'CID':
        col = 'cid'
    elif id_prefix == 'SID':
        col = 'sid'
    else:
        raise NotImplementedError('Unrecognized ID prefix "{}"'.format(
            id_prefix))

    aids = set()
    targets = set()
    total = 0
    config = pd.read_csv(config_filename)
    summary = []
    sid_cid = None
    if map_filename is not None:
        sid_cid = read_sid_cid_map(map_filename)
    if 'aid' not in config.columns:
        raise ValueError('Configuration file must contain "aid" column.')
    # Explicit raise instead of assert so the check survives `python -O`.
    if len(config) != len(pd.unique(config['aid'])):
        raise ValueError('Configuration file contains duplicate AIDs.')
    # Build the lookup set once rather than scanning the column per file.
    config_aids = set(config['aid'])
    # Accept both 'pkl.gz' and '.pkl.gz' without producing a double dot.
    extension = output_format.lstrip('.')

    for this_dir in dirs:
        for filename in glob.glob(os.path.join(this_dir, '*.json.gz')):
            # get AID from filename so we only have to load relevant assays
            aid = int(os.path.basename(filename).split('.')[0])
            if aid not in config_aids:
                continue

            # get configuration for this AID
            this_config = config[config['aid'] == aid].iloc[0]
            if not with_aid and 'aid' in this_config:
                del this_config['aid']
            if not with_target and 'target' in this_config:
                del this_config['target']

            # get data
            try:
                extractor = PcbaDataExtractor(filename, this_config,
                                              with_aid=with_aid)
            except NotImplementedError as e:
                # str(e) works on Python 2 and 3; e.message is Python 2 only.
                warnings.warn(str(e))
                continue
            if phenotype and 'phenotype' not in extractor.config:
                warnings.warn('{} has no phenotype'.format(aid))
                continue
            assert aid == extractor.parser.get_aid()  # sanity check
            aids.add(aid)
            target = extractor.config.get('target')
            targets.add(target)
            data = extractor.get_data(sid_cid=sid_cid)
            total += len(data)

            # add generic molecule ID column
            ids = []
            for i, mol_id in enumerate(data[col]):
                try:
                    ids.append(id_prefix + str(int(mol_id)))
                except (TypeError, ValueError):
                    # `i` is a position, not an index label, so use iloc.
                    warnings.warn('No ID for the following row:\n{}'.format(
                        data.iloc[i]))
                    ids.append(None)  # can be found with pd.isnull

            # skip this assay if there are no valid IDs
            if np.all(pd.isnull(ids)):
                warnings.warn(
                    'No valid IDs for AID {}. Skipping.'.format(aid))
                continue
            data.loc[:, 'mol_id'] = pd.Series(ids, index=data.index)

            # add generic assay ID column
            assay_id = 'PCBA-' + str(aid)
            if with_aid:
                data.loc[:, 'assay_id'] = assay_id

            # save dataframe
            output_filename = '{}.{}'.format(assay_id, extension)
            # Single-argument print(...) is valid on Python 2 and 3.
            print('{}\t{}\t{}\t{}'.format(aid, target, output_filename,
                                          len(data)))
            write_dataframe(data, output_filename)
            summary.append({'aid': aid, 'target': target,
                            'filename': output_filename, 'size': len(data)})

    # make sure we found everything
    missing = config_aids.difference(aids)
    if len(missing):
        warnings.warn('Missed AIDs {}'.format(missing))

    # save a summary
    summary = pd.DataFrame(summary)
    if summary_filename is not None:
        write_dataframe(summary, summary_filename)
    warnings.warn('Found {} assays for {} targets ({} total data points)'.format(
        len(aids), len(targets), total))
def _make_mol_ids(data, col, id_prefix):
    """Return one prefixed ID string per row of data[col], or None where the
    value cannot be converted to an integer (a warning is issued for it)."""
    mol_ids = []
    for position, raw_id in enumerate(data[col]):
        try:
            mol_ids.append(id_prefix + str(int(raw_id)))
        except (TypeError, ValueError):
            # `position` counts rows, so look the row up positionally; a
            # label lookup would fail or hit the wrong row when the index
            # is not 0..n-1.
            warnings.warn('No ID for the following row:\n{}'.format(
                data.iloc[position]))
            mol_ids.append(None)  # can be found with pd.isnull
    return mol_ids


# NOTE(review): an earlier definition of `main` with the same signature
# precedes this one in the file; this later definition is the one that stays
# bound to the name. Confirm the earlier copy can be removed.
def main(dirs,
         config_filename,
         map_filename=None,
         summary_filename=None,
         with_aid=True,
         with_target=True,
         phenotype=False,
         id_prefix='CID',
         output_format='.pkl.gz'):
    """
    Extract data for the configured assays and write one file per assay.

    Parameters
    ----------
    dirs : iterable of str
        Directories searched for '*.json.gz' assay files; the AID is parsed
        from the file name up to the first '.'.
    config_filename : str
        CSV configuration read with pandas. It must have an 'aid' column
        whose values are unique.
    map_filename : str, optional
        When provided, read with read_sid_cid_map and handed to the
        extractor's get_data call as `sid_cid`.
    summary_filename : str, optional
        When provided, the per-assay summary (aid, target, filename, size)
        is written there with write_dataframe.
    with_aid : bool
        When False, 'aid' is dropped from the per-assay configuration. The
        flag is also passed to PcbaDataExtractor and decides whether an
        'assay_id' column is added.
    with_target : bool
        When False, 'target' is dropped from the per-assay configuration.
    phenotype : bool
        When True, assays whose extractor config lacks 'phenotype' are
        skipped with a warning.
    id_prefix : str
        'CID' or 'SID'. Chooses the 'cid' or 'sid' column and prefixes the
        generated 'mol_id' values.
    output_format : str
        Output file extension; a leading '.' is accepted but not required.

    Raises
    ------
    ValueError
        If the configuration has no 'aid' column or contains repeated AIDs.
    NotImplementedError
        If id_prefix is neither 'CID' nor 'SID'.
    """
    # The ID column depends only on id_prefix, so resolve (and validate) it
    # once instead of for every assay.
    id_columns = {'CID': 'cid', 'SID': 'sid'}
    if id_prefix not in id_columns:
        raise NotImplementedError(
            'Unrecognized ID prefix "{}"'.format(id_prefix))
    col = id_columns[id_prefix]

    config = pd.read_csv(config_filename)
    sid_cid = None
    if map_filename is not None:
        sid_cid = read_sid_cid_map(map_filename)
    if 'aid' not in config.columns:
        raise ValueError('Configuration file must contain "aid" column.')
    # Raise rather than assert: asserts are removed under `python -O`.
    if len(config) != len(pd.unique(config['aid'])):
        raise ValueError('Configuration file contains duplicate AIDs.')

    wanted_aids = set(config['aid'])  # O(1) membership test per file
    # Drop any leading dot so '.pkl.gz' does not yield 'PCBA-1..pkl.gz'.
    extension = output_format.lstrip('.')

    aids = set()
    targets = set()
    total = 0
    summary = []
    for this_dir in dirs:
        for filename in glob.glob(os.path.join(this_dir, '*.json.gz')):
            # get AID from filename so we only have to load relevant assays
            aid = int(os.path.basename(filename).split('.')[0])
            if aid not in wanted_aids:
                continue

            # get configuration for this AID
            this_config = config[config['aid'] == aid].iloc[0]
            if not with_aid and 'aid' in this_config:
                del this_config['aid']
            if not with_target and 'target' in this_config:
                del this_config['target']

            # get data
            try:
                extractor = PcbaDataExtractor(
                    filename, this_config, with_aid=with_aid)
            except NotImplementedError as e:
                # e.message only exists on Python 2; str(e) is portable.
                warnings.warn(str(e))
                continue
            if phenotype and 'phenotype' not in extractor.config:
                warnings.warn('{} has no phenotype'.format(aid))
                continue
            # sanity check for AID match
            assert aid == extractor.parser.get_aid()
            aids.add(aid)
            target = extractor.config.get('target')
            targets.add(target)
            data = extractor.get_data(sid_cid=sid_cid)
            total += len(data)

            # add generic molecule ID column
            ids = _make_mol_ids(data, col, id_prefix)

            # skip this assay if there are no valid IDs
            if np.all(pd.isnull(ids)):
                warnings.warn(
                    'No valid IDs for AID {}. Skipping.'.format(aid))
                continue
            data.loc[:, 'mol_id'] = pd.Series(ids, index=data.index)

            # add generic assay ID column
            assay_id = 'PCBA-' + str(aid)
            if with_aid:
                data.loc[:, 'assay_id'] = assay_id

            # save dataframe
            output_filename = '{}.{}'.format(assay_id, extension)
            # A single parenthesized argument prints the same on Python 2
            # and Python 3.
            print('{}\t{}\t{}\t{}'.format(aid, target, output_filename,
                                          len(data)))
            write_dataframe(data, output_filename)
            summary.append({
                'aid': aid,
                'target': target,
                'filename': output_filename,
                'size': len(data)
            })

    # make sure we found everything
    missing = wanted_aids.difference(aids)
    if len(missing):
        warnings.warn('Missed AIDs {}'.format(missing))

    # save a summary
    summary = pd.DataFrame(summary)
    if summary_filename is not None:
        write_dataframe(summary, summary_filename)
    warnings.warn(
        'Found {} assays for {} targets ({} total data points)'.format(
            len(aids), len(targets), total))