def main(active_filename, decoy_filename, assay_id, target,
         with_assay_id=True, with_target=True, phenotype=None,
         output_filename=None, mol_id_prefix=None, output_format='.pkl.gz'):
    """Build and save a labeled dataframe from active and decoy molecule files.

    Actives are labeled 'active' and decoys 'inactive'; decoys always get the
    phenotype 'inactive' when a phenotype is in use.

    Parameters
    ----------
    active_filename : str
        Input file containing active molecules.
    decoy_filename : str
        Input file containing decoy (inactive) molecules.
    assay_id : str
        Assay identifier, also used to build the default output filename.
    target : str
        Assay target.
    with_assay_id : bool, optional (default True)
        Whether to add an 'assay_id' column to the dataframe.
    with_target : bool, optional (default True)
        Whether to add a 'target' column to the dataframe.
    phenotype : str, optional
        Phenotype to assign to active molecules; decoys get 'inactive'.
    output_filename : str, optional
        Output filename. Defaults to '<assay_id><output_format>'.
    mol_id_prefix : str, optional
        Prefix prepended to molecule IDs.
    output_format : str, optional (default '.pkl.gz')
        Output file extension; a leading dot is handled either way.
    """
    rows = []
    for outcome, filename in zip(['active', 'inactive'],
                                 [active_filename, decoy_filename]):
        this_phenotype = phenotype
        # decoys are always phenotypically inactive
        if outcome == 'inactive' and phenotype is not None:
            this_phenotype = 'inactive'
        with serial.MolReader().open(filename) as reader:
            this_rows = get_rows(reader, outcome, this_phenotype,
                                 mol_id_prefix)
            rows.extend(this_rows)

    # create dataframe
    df = pd.DataFrame(rows)

    # sanity check for duplicate mol_ids
    assert len(np.unique(df['mol_id'])) == len(df)

    # add assay_id and target columns
    if with_assay_id:
        df.loc[:, 'assay_id'] = assay_id
    if with_target:
        df.loc[:, 'target'] = target

    if output_filename is None:
        # BUG FIX: the default output_format ('.pkl.gz') already starts with
        # a dot, so the old '{}.{}' template produced '<aid>..pkl.gz'.
        if output_format.startswith('.'):
            output_filename = '{}{}'.format(assay_id, output_format)
        else:
            output_filename = '{}.{}'.format(assay_id, output_format)
    # print(...) with a single argument works on both Python 2 and 3
    print('{}\t{}\t{}\t{}'.format(assay_id, target, output_filename, len(df)))
    write_dataframe(df, output_filename)
def write_output_file(data, filename, compression_level=3):
    """
    Serialize output data to a file, choosing the writer by extension.

    Filenames ending in '.joblib' are written with joblib.dump (with
    compression); everything else is delegated to write_dataframe.

    Parameters
    ----------
    data : object
        Object to pickle in output file.
    filename : str
        Output filename. Should end with .joblib, .pkl, or .pkl.gz.
    compression_level : int, optional (default 3)
        Compression level (0-9) to use with joblib.dump.
    """
    if not filename.endswith('.joblib'):
        write_dataframe(data, filename)
        return
    joblib.dump(data, filename, compress=compression_level)
def main(dirs, config_filename, map_filename=None, summary_filename=None,
         with_aid=True, with_target=True, phenotype=False, id_prefix='CID',
         output_format='.pkl.gz'):
    """Extract PCBA assay data from JSON files and save per-assay dataframes.

    Scans each directory for '*.json.gz' assay files, keeps only assays listed
    in the configuration file, extracts their data, adds generic molecule and
    assay ID columns, and writes one dataframe per assay plus an optional
    summary dataframe.

    Parameters
    ----------
    dirs : iterable of str
        Directories to scan for '*.json.gz' assay files.
    config_filename : str
        CSV configuration file; must contain a unique 'aid' column.
    map_filename : str, optional
        SID->CID map filename (read with read_sid_cid_map).
    summary_filename : str, optional
        If given, the run summary dataframe is written here.
    with_aid : bool, optional (default True)
        Whether to include assay ID columns.
    with_target : bool, optional (default True)
        Whether to include target information.
    phenotype : bool, optional (default False)
        If True, skip assays whose config has no 'phenotype' entry.
    id_prefix : str, optional (default 'CID')
        Molecule ID prefix; must be 'CID' or 'SID'.
    output_format : str, optional (default '.pkl.gz')
        Output file extension; a leading dot is handled either way.

    Raises
    ------
    ValueError
        If the configuration file has no 'aid' column.
    NotImplementedError
        If id_prefix is not 'CID' or 'SID'.
    """
    aids = set()
    targets = set()
    total = 0
    config = pd.read_csv(config_filename)
    summary = []
    sid_cid = None
    if map_filename is not None:
        sid_cid = read_sid_cid_map(map_filename)
    if 'aid' not in config.columns:
        raise ValueError('Configuration file must contain "aid" column.')
    assert len(config) == len(pd.unique(config['aid']))
    for this_dir in dirs:
        for filename in glob.glob(os.path.join(this_dir, '*.json.gz')):
            # get AID from filename so we only have to load relevant assays
            aid = int(os.path.basename(filename).split('.')[0])
            if aid not in config['aid'].values:
                continue

            # get configuration for this AID
            this_config = config[config['aid'] == aid].iloc[0]
            if not with_aid and 'aid' in this_config:
                del this_config['aid']
            if not with_target and 'target' in this_config:
                del this_config['target']

            # get data
            try:
                extractor = PcbaDataExtractor(filename, this_config,
                                              with_aid=with_aid)
            except NotImplementedError as e:
                # BUG FIX: e.message is Python-2-only; str(e) works everywhere
                warnings.warn(str(e))
                continue
            if phenotype and 'phenotype' not in extractor.config:
                warnings.warn('{} has no phenotype'.format(aid))
                continue
            assert aid == extractor.parser.get_aid()  # sanity check for AID match
            aids.add(aid)
            target = extractor.config.get('target')
            targets.add(target)
            data = extractor.get_data(sid_cid=sid_cid)
            total += len(data)

            # add generic molecule ID column
            if id_prefix == 'CID':
                col = 'cid'
            elif id_prefix == 'SID':
                col = 'sid'
            else:
                raise NotImplementedError(
                    'Unrecognized ID prefix "{}"'.format(id_prefix))
            ids = []
            for i, mol_id in enumerate(data[col]):
                try:
                    ids.append(id_prefix + str(int(mol_id)))
                except (TypeError, ValueError):
                    # BUG FIX: i is a position from enumerate, not an index
                    # label, so use iloc (loc misfires on non-default indexes)
                    warnings.warn('No ID for the following row:\n{}'.format(
                        data.iloc[i]))
                    ids.append(None)  # can be found with pd.isnull

            # skip this assay if there are no valid IDs
            if np.all(pd.isnull(ids)):
                warnings.warn('No valid IDs for AID {}. Skipping.'.format(aid))
                continue
            data.loc[:, 'mol_id'] = pd.Series(ids, index=data.index)

            # add generic assay ID column
            assay_id = 'PCBA-' + str(aid)
            if with_aid:
                data.loc[:, 'assay_id'] = assay_id

            # save dataframe
            # BUG FIX: the default output_format ('.pkl.gz') already starts
            # with a dot, so the old '{}.{}' template produced '..pkl.gz'.
            if output_format.startswith('.'):
                output_filename = '{}{}'.format(assay_id, output_format)
            else:
                output_filename = '{}.{}'.format(assay_id, output_format)
            print('{}\t{}\t{}\t{}'.format(aid, target, output_filename,
                                          len(data)))
            write_dataframe(data, output_filename)
            summary.append({'aid': aid, 'target': target,
                            'filename': output_filename, 'size': len(data)})

    # make sure we found everything
    missing = set(config['aid']).difference(aids)
    if len(missing):
        warnings.warn('Missed AIDs {}'.format(missing))

    # save a summary
    summary = pd.DataFrame(summary)
    if summary_filename is not None:
        write_dataframe(summary, summary_filename)
    warnings.warn('Found {} assays for {} targets ({} total data points)'.format(
        len(aids), len(targets), total))
# NOTE(review): this is a byte-for-byte duplicate of the main() defined
# earlier in this file (only formatting differs); at module level the second
# definition silently shadows the first. Confirm which copy is canonical and
# delete the other.
def main(dirs, config_filename, map_filename=None, summary_filename=None,
         with_aid=True, with_target=True, phenotype=False, id_prefix='CID',
         output_format='.pkl.gz'):
    """Extract PCBA assay data from JSON files and save per-assay dataframes.

    Scans each directory for '*.json.gz' assay files, keeps only assays listed
    in the configuration file, extracts their data, adds generic molecule and
    assay ID columns, and writes one dataframe per assay plus an optional
    summary dataframe.

    Parameters
    ----------
    dirs : iterable of str
        Directories to scan for '*.json.gz' assay files.
    config_filename : str
        CSV configuration file; must contain a unique 'aid' column.
    map_filename : str, optional
        SID->CID map filename (read with read_sid_cid_map).
    summary_filename : str, optional
        If given, the run summary dataframe is written here.
    with_aid : bool, optional (default True)
        Whether to include assay ID columns.
    with_target : bool, optional (default True)
        Whether to include target information.
    phenotype : bool, optional (default False)
        If True, skip assays whose config has no 'phenotype' entry.
    id_prefix : str, optional (default 'CID')
        Molecule ID prefix; must be 'CID' or 'SID'.
    output_format : str, optional (default '.pkl.gz')
        Output file extension; a leading dot is handled either way.

    Raises
    ------
    ValueError
        If the configuration file has no 'aid' column.
    NotImplementedError
        If id_prefix is not 'CID' or 'SID'.
    """
    aids = set()
    targets = set()
    total = 0
    config = pd.read_csv(config_filename)
    summary = []
    sid_cid = None
    if map_filename is not None:
        sid_cid = read_sid_cid_map(map_filename)
    if 'aid' not in config.columns:
        raise ValueError('Configuration file must contain "aid" column.')
    assert len(config) == len(pd.unique(config['aid']))
    for this_dir in dirs:
        for filename in glob.glob(os.path.join(this_dir, '*.json.gz')):
            # get AID from filename so we only have to load relevant assays
            aid = int(os.path.basename(filename).split('.')[0])
            if aid not in config['aid'].values:
                continue

            # get configuration for this AID
            this_config = config[config['aid'] == aid].iloc[0]
            if not with_aid and 'aid' in this_config:
                del this_config['aid']
            if not with_target and 'target' in this_config:
                del this_config['target']

            # get data
            try:
                extractor = PcbaDataExtractor(filename, this_config,
                                              with_aid=with_aid)
            except NotImplementedError as e:
                # BUG FIX: e.message is Python-2-only; str(e) works everywhere
                warnings.warn(str(e))
                continue
            if phenotype and 'phenotype' not in extractor.config:
                warnings.warn('{} has no phenotype'.format(aid))
                continue
            assert aid == extractor.parser.get_aid()  # sanity check for AID match
            aids.add(aid)
            target = extractor.config.get('target')
            targets.add(target)
            data = extractor.get_data(sid_cid=sid_cid)
            total += len(data)

            # add generic molecule ID column
            if id_prefix == 'CID':
                col = 'cid'
            elif id_prefix == 'SID':
                col = 'sid'
            else:
                raise NotImplementedError(
                    'Unrecognized ID prefix "{}"'.format(id_prefix))
            ids = []
            for i, mol_id in enumerate(data[col]):
                try:
                    ids.append(id_prefix + str(int(mol_id)))
                except (TypeError, ValueError):
                    # BUG FIX: i is a position from enumerate, not an index
                    # label, so use iloc (loc misfires on non-default indexes)
                    warnings.warn('No ID for the following row:\n{}'.format(
                        data.iloc[i]))
                    ids.append(None)  # can be found with pd.isnull

            # skip this assay if there are no valid IDs
            if np.all(pd.isnull(ids)):
                warnings.warn('No valid IDs for AID {}. Skipping.'.format(aid))
                continue
            data.loc[:, 'mol_id'] = pd.Series(ids, index=data.index)

            # add generic assay ID column
            assay_id = 'PCBA-' + str(aid)
            if with_aid:
                data.loc[:, 'assay_id'] = assay_id

            # save dataframe
            # BUG FIX: the default output_format ('.pkl.gz') already starts
            # with a dot, so the old '{}.{}' template produced '..pkl.gz'.
            if output_format.startswith('.'):
                output_filename = '{}{}'.format(assay_id, output_format)
            else:
                output_filename = '{}.{}'.format(assay_id, output_format)
            print('{}\t{}\t{}\t{}'.format(aid, target, output_filename,
                                          len(data)))
            write_dataframe(data, output_filename)
            summary.append({'aid': aid, 'target': target,
                            'filename': output_filename, 'size': len(data)})

    # make sure we found everything
    missing = set(config['aid']).difference(aids)
    if len(missing):
        warnings.warn('Missed AIDs {}'.format(missing))

    # save a summary
    summary = pd.DataFrame(summary)
    if summary_filename is not None:
        write_dataframe(summary, summary_filename)
    warnings.warn('Found {} assays for {} targets ({} total data points)'.format(
        len(aids), len(targets), total))