Example #1
# Python 2 code: izip/xrange/iterator usage below assumes a 2.7 interpreter.
import gzip
import os
import os.path as op
from array import array
from glob import glob
from itertools import izip

import h5py
import numpy as np
from scipy.sparse import coo_matrix

# Project-internal names used below (info, ensure_dir, _sort_by_start,
# _process_molecule_data, MalariaCatalog, MalariaFingerprintsManager,
# _MALARIA_ECFPS_DIR, _MALARIA_ECFPS_PARALLEL_RESULTS_DIR) are assumed to be
# imported from the surrounding package.


def munge_ecfps():

    #####---Step 1: merge the per-worker results into 3 files: lab, unl and scr.
    #####     - ECFPs and FCFPs for the same mol are together
    #####     - The order is the same as in the original file
    #####     - Optionally delete the worker files

    def parse_weirdfpformat_line(line):
        """Returns a tuple (molid, [cansmi, count, [(center, radius)]+]+)."""
        def _parse_weird_feature(feature):
            # Each feature is 'cansmi count [center radius]*', whitespace-separated
            vals = feature.split()
            cansmi = vals[0]
            count = int(vals[1])
            if len(vals) > 2:
                a = iter(vals[2:])
                # Pair up consecutive tokens as (center, radius)
                centers = [(center, radius) for center, radius in izip(a, a)]
                return cansmi, count, centers
            return cansmi, count, ()
        values = line.strip().split('\t')
        molid = values[0]
        if '*FAILED*' in values[1]:
            return molid, None
        return molid, map(_parse_weird_feature, values[1:])
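    # An illustrative (hypothetical) weirdfps line and its parsed form:
    #   'mol1\tC(C)N 2 0 1 3 1\tCCO 1\n'
    #   -> ('mol1', [('C(C)N', 2, [('0', '1'), ('3', '1')]), ('CCO', 1, ())])
    # Note that centers and radii are kept as strings. Failed molecules look
    # like 'mol1\t*FAILED*...' and parse to ('mol1', None).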

    def malaria_ecfp_parallel_results_iterator(prefix='', log=True):
        """Iterates over the files resulting from the computation of ecfps using the function ecfp."""
        weirdfps = glob(op.join(_MALARIA_ECFPS_PARALLEL_RESULTS_DIR, '%s*.weirdfps' % prefix))
        weirdfps = _sort_by_start(weirdfps)
        for fn in weirdfps:
            if log:
                info(fn)
            with gzip.open(fn) as reader:
                for line in reader:
                    yield line

    class Chihuahua(object):
        """A data processor that takes weirdfp lines, hunk them in disk and then merge them sorted in a big file.
        It can be setup to be easy on memory usage (at the cost of doubling disk space usage).
        """
        def __init__(self, molid2i, root, prefix, data2molid, chunksize=10000):
            super(Chihuahua, self).__init__()
            self.chunksize = chunksize
            self.molid2i = molid2i
            self.num_mols = len(self.molid2i)
            self.root = root
            self.prefix = prefix
            ensure_dir(self.root)  # the temp files live in root, so it must exist before we open them
            self.temp_fns = [op.join(root, '%s-%d' % (prefix, base)) for base in xrange(0, self.num_mols, chunksize)]
            self.temp_files = [open(fn, 'w') for fn in self.temp_fns]
            self.data2molid = data2molid

        def process(self, moldata):
            index = self.molid2i.get(self.data2molid(moldata), None)
            if index is None:
                return
            goes_to = index // self.chunksize  # bucket (temp file) for this molecule's row
            self.temp_files[goes_to].write(moldata)
            if not moldata.endswith('\n'):
                self.temp_files[goes_to].write('\n')

        def done(self):
            # Sort in memory each chunk
            for tmp in self.temp_files:
                tmp.close()
            with open(op.join(self.root, self.prefix), 'w') as writer:
                for fn in self.temp_fns:
                    with open(fn) as reader:
                        lines = sorted(reader.readlines(), key=lambda line: self.molid2i[self.data2molid(line)])
                        for line in lines:
                            writer.write(line)
            for fn in self.temp_fns:
                os.remove(fn)
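    # Chihuahua is essentially an external bucket sort: each line lands in the
    # temp file for its row range (index // chunksize), and done() only ever
    # sorts one chunksize-sized bucket in memory before appending it to the
    # final file. Memory use is thus bounded by the largest bucket, not by the
    # size of the dataset.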

    mc = MalariaCatalog()

    labproc = Chihuahua(molid2i={molid: i for i, molid in enumerate(mc.lab())},
                        root=_MALARIA_ECFPS_DIR,
                        prefix='lab',
                        data2molid=lambda line: line[0:line.find('\t')],
                        chunksize=100000)

    unlproc = Chihuahua(molid2i={molid: i for i, molid in enumerate(mc.unl())},
                        root=_MALARIA_ECFPS_DIR,
                        prefix='unl',
                        data2molid=lambda line: line[0:line.find('\t')],
                        chunksize=100000)

    scrproc = Chihuahua(molid2i={molid: i for i, molid in enumerate(mc.scr())},
                        root=_MALARIA_ECFPS_DIR,
                        prefix='scr',
                        data2molid=lambda line: line[0:line.find('\t')],
                        chunksize=100000)

    _process_molecule_data(malaria_ecfp_parallel_results_iterator(), (labproc, unlproc, scrproc))

    #####---Step 2: recode ECFPs and FCFPs from the file at step 1. After this:
    #####  - ECFP and FCFP duplicates get merged.
    #####  - A unique assignment of each substructure in the dataset to an int [0, ...] (column number).
    #####  - A unique assignment of a row number to each molid for which Morgan DID NOT FAIL.

    def ecfps_recode(dset='lab'):
        """Merges ECFPs and FCFPs into a single line and gets rid of the centers information if present."""
        with open(op.join(_MALARIA_ECFPS_DIR, dset)) as reader, \
                open(op.join(_MALARIA_ECFPS_DIR, dset + '.merged'), 'w') as writer:
            for ecfp in reader:
                fcfp = reader.next()  # ECFP and FCFP lines for the same mol are adjacent (see step 1)
                molide, subse = parse_weirdfpformat_line(ecfp)
                molidf, subsf = parse_weirdfpformat_line(fcfp)
                assert molide == molidf
                if subse is not None and subsf is not None:  # skip mols where Morgan failed
                    uniques = set((sub, count) for sub, count, _ in subse + subsf)
                    writer.write('%s\t%s' % (molide, '\t'.join(['%s %d' % (sub, count) for sub, count in uniques])))
                    writer.write('\n')
    ecfps_recode('lab')
    ecfps_recode('unl')
    ecfps_recode('scr')
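    # After recoding, a merged line is simply 'molid\tcansmi count\tcansmi count...',
    # e.g. (hypothetical): 'mol1\tC(C)N 2\tCCO 1\n'. Centers are dropped and
    # features shared by the ECFP and FCFP of a molecule appear only once.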

    def sub2i():
        """Generates a map {labelled_substructure -> column}
        This produces a unique assignment for all the features in the dataset, in three files:
          - lab: the indices for all features that appear in labelled
          - unl: the indices for features that do not appear in labelled but appear in unlabelld
          - scr: the indices for the features that appear in screening but not on labelled or unlabelled
        Of course, keep track of changes to the map as needed while creating models.

        Note that this keeps all the substructures in memory (which shoould be ok for any recent machine).
        """
        def all_subs(dset):
            info(dset)
            subs = set()
            with open(op.join(_MALARIA_ECFPS_DIR, dset + '.merged')) as reader:
                for line in reader:
                    subs.update(sub.split()[0] for sub in line.split('\t')[1:])  # TODO sort by frequency
            return subs
        lab_subs = all_subs('lab')
        unl_subs = all_subs('unl')
        scr_subs = all_subs('scr')
        with open(op.join(_MALARIA_ECFPS_DIR, 'lab.merged.s2i'), 'w') as writer:
            for i, sub in enumerate(sorted(lab_subs)):
                writer.write('%s %d\n' % (sub, i))
        num_written = len(lab_subs)
        with open(op.join(_MALARIA_ECFPS_DIR, 'unl.merged.s2i'), 'w') as writer:
            new_subs = unl_subs - lab_subs
            for i, sub in enumerate(sorted(new_subs)):
                writer.write('%s %d\n' % (sub, i + num_written))
            num_written += len(new_subs)
        with open(op.join(_MALARIA_ECFPS_DIR, 'scr.merged.s2i'), 'w') as writer:
            new_subs = scr_subs - (unl_subs | lab_subs)
            for i, sub in enumerate(sorted(new_subs)):
                writer.write('%s %d\n' % (sub, i + num_written))
        with open(op.join(_MALARIA_ECFPS_DIR, 'trans.merged.s2i'), 'w') as writer:
            # 'trans': labelled substructures that also transfer to unlabelled or screening
            for sub in sorted((lab_subs & unl_subs) | (lab_subs & scr_subs)):
                writer.write('%s\n' % sub)
    sub2i()

    def mol2i(dset='lab'):
        """Generates a map {molid -> row}.
        Molecules for which RDKIT could not generate the fingerprints are not in this map,
        nor in hte final sparse matrices.
        In any case we will need to keep track of changes on the map as we do, for example, cross-val.
        """
        with open(op.join(_MALARIA_ECFPS_DIR, dset + '.merged.m2i'), 'w') as writer:
            with open(op.join(_MALARIA_ECFPS_DIR, dset + '.merged')) as reader:
                for line in reader:
                    writer.write('%s\n' % line[0:line.find('\t')])
    mol2i('lab')
    mol2i('unl')
    mol2i('scr')

    #####---Step 3: write sparse matrices with the recoded information of step 2. After this:
    #####  - We get an h5 file for each dataset, with a sparse matrix in CSR format.
    #####  - Note that this is a memory-intensive procedure; it could be made lighter with two passes.

    def to_sparse_chihuahua(dset='lab', two_pass=False):
        """Generates sparse CSR matrices using as features only these in the labelled dataset,
        with the column index and the row index as computed previously.
        They get stored in a h5 file with the following datasets:
          - data
          - indices
          - indptr
          - shape
        """
        if two_pass:
            # First pass: shape and number of nonzeros
            # Second pass: h5 file with the proper sizes of indices, indptr and data, write on the fly
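            # A possible sketch: pass 1 walks dset + '.merged' counting nonzeros
            # per row (enough to build indptr and size the datasets); pass 2
            # creates h5 datasets of exactly those sizes and streams indices and
            # data into them, so nothing dataset-sized is ever held in memory.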
            raise NotImplementedError
        # mol2row, smiles2col
        m2i = {mol.strip(): i for i, mol in enumerate(open(op.join(_MALARIA_ECFPS_DIR, dset + '.merged.m2i')))}
        s2i = {}
        with open(op.join(_MALARIA_ECFPS_DIR, 'lab.merged.s2i')) as reader:
            for line in reader:
                sub, i = line.strip().split()
                i = int(i)
                s2i[sub] = i
        rows = array('I')
        cols = array('I')
        data = array('I')
        # gather data
        with open(op.join(_MALARIA_ECFPS_DIR, dset + '.merged')) as reader:
            for line in reader:
                values = line.split('\t')
                molid = values[0]
                row = m2i[molid]
                for fc in values[1:]:
                    sub, count = fc.split()
                    count = int(count)
                    col = s2i.get(sub, None)
                    if col is not None:
                        rows.append(row)
                        cols.append(col)
                        data.append(count)
        # save as CSR sparse matrix; pass the shape explicitly so empty trailing
        # rows/columns are not silently dropped
        M = coo_matrix((data, (rows, cols)), shape=(len(m2i), len(s2i)), dtype=np.int32).tocsr()
        with h5py.File(op.join(_MALARIA_ECFPS_DIR, dset + '.sparse.h5'), 'w') as h5:
            h5['indices'] = M.indices
            h5['indptr'] = M.indptr
            h5['data'] = M.data  # use the CSR-ordered data, not the raw COO array
            h5['shape'] = np.array(M.shape)
    to_sparse_chihuahua('lab')
    to_sparse_chihuahua('unl')
    to_sparse_chihuahua('scr')
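    # A minimal sketch for loading one of the saved matrices back; _load_sparse
    # is a hypothetical helper, not part of the original pipeline. csr_matrix
    # accepts exactly this (data, indices, indptr) triplet plus a shape.
    def _load_sparse(dset='lab'):
        from scipy.sparse import csr_matrix
        with h5py.File(op.join(_MALARIA_ECFPS_DIR, dset + '.sparse.h5'), 'r') as h5:
            return csr_matrix((h5['data'][:], h5['indices'][:], h5['indptr'][:]),
                              shape=tuple(h5['shape'][:]))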

    #####---Step 4: lame feature-duplicate detection, to partially tackle multicollinearity
    MalariaFingerprintsManager(zero_dupes='lab').X()
    MalariaFingerprintsManager(zero_dupes='all').X()
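    # X() presumably materialises (and caches) the fingerprint matrix with
    # duplicated feature columns zeroed, either within the labelled set only
    # ('lab') or across all three datasets ('all').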