Example #1
# Python 2 code: izip/xrange/iterator usage below assumes a 2.7 interpreter.
import gzip
import os
import os.path as op
from array import array
from glob import glob
from itertools import izip

import h5py
import numpy as np
from scipy.sparse import coo_matrix

# Project-internal names used below (info, ensure_dir, _sort_by_start,
# _process_molecule_data, MalariaCatalog, MalariaFingerprintsManager,
# _MALARIA_ECFPS_DIR, _MALARIA_ECFPS_PARALLEL_RESULTS_DIR) are assumed to be
# imported from the surrounding package.


def munge_ecfps():

    #####---Step 1: merge the per-worker results into 3 files: lab, unl and scr.
    #####     - ECFPs and FCFPs for the same mol are together
    #####     - The order is the same as in the original file
    #####     - Optionally delete the worker files

    def parse_weirdfpformat_line(line):
        """Returns a tuple (molid, [cansmi, count, [(center, radius)]+]+)."""
        def _parse_weird_feature(feature):
            # Each feature is 'cansmi count [center radius]*', whitespace-separated
            vals = feature.split()
            cansmi = vals[0]
            count = int(vals[1])
            if len(vals) > 2:
                a = iter(vals[2:])
                # Pair up consecutive tokens as (center, radius)
                centers = [(center, radius) for center, radius in izip(a, a)]
                return cansmi, count, centers
            return cansmi, count, ()
        values = line.strip().split('\t')
        molid = values[0]
        if '*FAILED*' in values[1]:
            return molid, None
        return molid, map(_parse_weird_feature, values[1:])
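    # An illustrative (hypothetical) weirdfps line and its parsed form:
    #   'mol1\tC(C)N 2 0 1 3 1\tCCO 1\n'
    #   -> ('mol1', [('C(C)N', 2, [('0', '1'), ('3', '1')]), ('CCO', 1, ())])
    # Note that centers and radii are kept as strings. Failed molecules look
    # like 'mol1\t*FAILED*...' and parse to ('mol1', None).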

    def malaria_ecfp_parallel_results_iterator(prefix='', log=True):
        """Iterates over the files resulting from the computation of ecfps using the function ecfp."""
        weirdfps = glob(op.join(_MALARIA_ECFPS_PARALLEL_RESULTS_DIR, '%s*.weirdfps' % prefix))
        weirdfps = _sort_by_start(weirdfps)
        for fn in weirdfps:
            if log:
                info(fn)
            with gzip.open(fn) as reader:
                for line in reader:
                    yield line

    class Chihuahua(object):
        """A data processor that takes weirdfp lines, hunk them in disk and then merge them sorted in a big file.
        It can be setup to be easy on memory usage (at the cost of doubling disk space usage).
        """
        def __init__(self, molid2i, root, prefix, data2molid, chunksize=10000):
            super(Chihuahua, self).__init__()
            self.chunksize = chunksize
            self.molid2i = molid2i
            self.num_mols = len(self.molid2i)
            self.root = root
            self.prefix = prefix
            ensure_dir(self.root)  # the temp files live in root, so it must exist before we open them
            self.temp_fns = [op.join(root, '%s-%d' % (prefix, base)) for base in xrange(0, self.num_mols, chunksize)]
            self.temp_files = [open(fn, 'w') for fn in self.temp_fns]
            self.data2molid = data2molid

        def process(self, moldata):
            index = self.molid2i.get(self.data2molid(moldata), None)
            if index is None:
                return
            goes_to = index // self.chunksize  # bucket (temp file) for this molecule's row
            self.temp_files[goes_to].write(moldata)
            if not moldata.endswith('\n'):
                self.temp_files[goes_to].write('\n')

        def done(self):
            # Sort in memory each chunk
            for tmp in self.temp_files:
                tmp.close()
            with open(op.join(self.root, self.prefix), 'w') as writer:
                for fn in self.temp_fns:
                    with open(fn) as reader:
                        lines = sorted(reader.readlines(), key=lambda line: self.molid2i[self.data2molid(line)])
                        for line in lines:
                            writer.write(line)
            for fn in self.temp_fns:
                os.remove(fn)
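    # Chihuahua is essentially an external bucket sort: each line lands in the
    # temp file for its row range (index // chunksize), and done() only ever
    # sorts one chunksize-sized bucket in memory before appending it to the
    # final file. Memory use is thus bounded by the largest bucket, not by the
    # size of the dataset.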

    mc = MalariaCatalog()

    labproc = Chihuahua(molid2i={molid: i for i, molid in enumerate(mc.lab())},
                        root=_MALARIA_ECFPS_DIR,
                        prefix='lab',
                        data2molid=lambda line: line[0:line.find('\t')],
                        chunksize=100000)

    unlproc = Chihuahua(molid2i={molid: i for i, molid in enumerate(mc.unl())},
                        root=_MALARIA_ECFPS_DIR,
                        prefix='unl',
                        data2molid=lambda line: line[0:line.find('\t')],
                        chunksize=100000)

    scrproc = Chihuahua(molid2i={molid: i for i, molid in enumerate(mc.scr())},
                        root=_MALARIA_ECFPS_DIR,
                        prefix='scr',
                        data2molid=lambda line: line[0:line.find('\t')],
                        chunksize=100000)

    _process_molecule_data(malaria_ecfp_parallel_results_iterator(), (labproc, unlproc, scrproc))

    #####---Step 2: recode ECFPs and FCFPs from the file at step 1. After this:
    #####  - ECFP and FCFP duplicates get merged.
    #####  - A unique assignment of each substructure in the dataset to an int [0, ...] (column number).
    #####  - A unique assignment of a row number to each molid for which Morgan DID NOT FAIL.

    def ecfps_recode(dset='lab'):
        """Merges ECFPs and FCFPs into a single line and gets rid of the centers information if present."""
        with open(op.join(_MALARIA_ECFPS_DIR, dset)) as reader, \
                open(op.join(_MALARIA_ECFPS_DIR, dset + '.merged'), 'w') as writer:
            for ecfp in reader:
                fcfp = reader.next()  # ECFP and FCFP lines for the same mol are adjacent (see step 1)
                molide, subse = parse_weirdfpformat_line(ecfp)
                molidf, subsf = parse_weirdfpformat_line(fcfp)
                assert molide == molidf
                if subse is not None and subsf is not None:  # skip mols where Morgan failed
                    uniques = set((sub, count) for sub, count, _ in subse + subsf)
                    writer.write('%s\t%s' % (molide, '\t'.join(['%s %d' % (sub, count) for sub, count in uniques])))
                    writer.write('\n')
    ecfps_recode('lab')
    ecfps_recode('unl')
    ecfps_recode('scr')
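    # After recoding, a merged line is simply 'molid\tcansmi count\tcansmi count...',
    # e.g. (hypothetical): 'mol1\tC(C)N 2\tCCO 1\n'. Centers are dropped and
    # features shared by the ECFP and FCFP of a molecule appear only once.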

    def sub2i():
        """Generates a map {labelled_substructure -> column}
        This produces a unique assignment for all the features in the dataset, in three files:
          - lab: the indices for all features that appear in labelled
          - unl: the indices for features that do not appear in labelled but appear in unlabelld
          - scr: the indices for the features that appear in screening but not on labelled or unlabelled
        Of course, keep track of changes to the map as needed while creating models.

        Note that this keeps all the substructures in memory (which shoould be ok for any recent machine).
        """
        def all_subs(dset):
            info(dset)
            subs = set()
            with open(op.join(_MALARIA_ECFPS_DIR, dset + '.merged')) as reader:
                for line in reader:
                    subs.update(sub.split()[0] for sub in line.split('\t')[1:])  # TODO sort by frequency
            return subs
        lab_subs = all_subs('lab')
        unl_subs = all_subs('unl')
        scr_subs = all_subs('scr')
        with open(op.join(_MALARIA_ECFPS_DIR, 'lab.merged.s2i'), 'w') as writer:
            for i, sub in enumerate(sorted(lab_subs)):
                writer.write('%s %d\n' % (sub, i))
        num_written = len(lab_subs)
        with open(op.join(_MALARIA_ECFPS_DIR, 'unl.merged.s2i'), 'w') as writer:
            new_subs = unl_subs - lab_subs
            for i, sub in enumerate(sorted(new_subs)):
                writer.write('%s %d\n' % (sub, i + num_written))
            num_written += len(new_subs)
        with open(op.join(_MALARIA_ECFPS_DIR, 'scr.merged.s2i'), 'w') as writer:
            new_subs = scr_subs - (unl_subs | lab_subs)
            for i, sub in enumerate(sorted(new_subs)):
                writer.write('%s %d\n' % (sub, i + num_written))
        with open(op.join(_MALARIA_ECFPS_DIR, 'trans.merged.s2i'), 'w') as writer:
            # 'trans': labelled substructures that also transfer to unlabelled or screening
            for sub in sorted((lab_subs & unl_subs) | (lab_subs & scr_subs)):
                writer.write('%s\n' % sub)
    sub2i()

    def mol2i(dset='lab'):
        """Generates a map {molid -> row}.
        Molecules for which RDKIT could not generate the fingerprints are not in this map,
        nor in hte final sparse matrices.
        In any case we will need to keep track of changes on the map as we do, for example, cross-val.
        """
        with open(op.join(_MALARIA_ECFPS_DIR, dset + '.merged.m2i'), 'w') as writer:
            with open(op.join(_MALARIA_ECFPS_DIR, dset + '.merged')) as reader:
                for line in reader:
                    writer.write('%s\n' % line[0:line.find('\t')])
    mol2i('lab')
    mol2i('unl')
    mol2i('scr')

    #####---Step 3: write sparse matrices with the recoded information of step 2. After this:
    #####  - We get an h5 file for each dataset, with a sparse matrix in CSR format.
    #####  - Note that this is a memory-intensive procedure; it could be made lighter with two passes.

    def to_sparse_chihuahua(dset='lab', two_pass=False):
        """Generates sparse CSR matrices using as features only these in the labelled dataset,
        with the column index and the row index as computed previously.
        They get stored in a h5 file with the following datasets:
          - data
          - indices
          - indptr
          - shape
        """
        if two_pass:
            # First pass: shape and number of nonzeros
            # Second pass: h5 file with the proper sizes of indices, indptr and data, write on the fly
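            # A possible sketch: pass 1 walks dset + '.merged' counting nonzeros
            # per row (enough to build indptr and size the datasets); pass 2
            # creates h5 datasets of exactly those sizes and streams indices and
            # data into them, so nothing dataset-sized is ever held in memory.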
            raise NotImplementedError
        # mol2row, smiles2col
        m2i = {mol.strip(): i for i, mol in enumerate(open(op.join(_MALARIA_ECFPS_DIR, dset + '.merged.m2i')))}
        s2i = {}
        with open(op.join(_MALARIA_ECFPS_DIR, 'lab.merged.s2i')) as reader:
            for line in reader:
                sub, i = line.strip().split()
                i = int(i)
                s2i[sub] = i
        rows = array('I')
        cols = array('I')
        data = array('I')
        # gather data
        with open(op.join(_MALARIA_ECFPS_DIR, dset + '.merged')) as reader:
            for line in reader:
                values = line.split('\t')
                molid = values[0]
                row = m2i[molid]
                for fc in values[1:]:
                    sub, count = fc.split()
                    count = int(count)
                    col = s2i.get(sub, None)
                    if col is not None:
                        rows.append(row)
                        cols.append(col)
                        data.append(count)
        # save as CSR sparse matrix; pass the shape explicitly so empty trailing
        # rows/columns are not silently dropped
        M = coo_matrix((data, (rows, cols)), shape=(len(m2i), len(s2i)), dtype=np.int32).tocsr()
        with h5py.File(op.join(_MALARIA_ECFPS_DIR, dset + '.sparse.h5'), 'w') as h5:
            h5['indices'] = M.indices
            h5['indptr'] = M.indptr
            h5['data'] = M.data  # use the CSR-ordered data, not the raw COO array
            h5['shape'] = np.array(M.shape)
    to_sparse_chihuahua('lab')
    to_sparse_chihuahua('unl')
    to_sparse_chihuahua('scr')
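    # A minimal sketch for loading one of the saved matrices back; _load_sparse
    # is a hypothetical helper, not part of the original pipeline. csr_matrix
    # accepts exactly this (data, indices, indptr) triplet plus a shape.
    def _load_sparse(dset='lab'):
        from scipy.sparse import csr_matrix
        with h5py.File(op.join(_MALARIA_ECFPS_DIR, dset + '.sparse.h5'), 'r') as h5:
            return csr_matrix((h5['data'][:], h5['indices'][:], h5['indptr'][:]),
                              shape=tuple(h5['shape'][:]))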

    #####---Step 4: lame feature-duplicate detection, to partially tackle multicollinearity
    MalariaFingerprintsManager(zero_dupes='lab').X()
    MalariaFingerprintsManager(zero_dupes='all').X()
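    # X() presumably materialises (and caches) the fingerprint matrix with
    # duplicated feature columns zeroed, either within the labelled set only
    # ('lab') or across all three datasets ('all').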