def RunMPI(self, distData):
    """Run one distributed evaluation of ``distData`` over ``self.comm``.

    Every rank first takes part in a broadcast rooted at rank 0; the time
    spent in that call is added to ``self.IdleTime``.  Rank 0 then hands
    control to ``self.ScatterDriver`` and returns its result.  Every other
    rank loops: it receives a slice from rank 0, applies ``self.DistFunc``
    to each item of the slice, sends ``signaldone`` and then the list of
    results back to rank 0, and leaves the loop when a message tagged
    ``QUIT`` arrives.

    Args:
        distData: the inputs to distribute; its length is broadcast and it
            is passed unchanged to ``self.ScatterDriver`` on rank 0.

    Returns:
        The value of ``self.ScatterDriver(distData)`` on rank 0; ``None``
        (implicitly) on every other rank.
    """
    starttime = time.time()
    # nData is whatever rank 0 broadcast, i.e. a one-element list.
    nData = self.comm.bcast([len(distData)], root=0)  #synchronize w/root
    self.IdleTime += time.time() - starttime
    if self.rank == 0:
        return self.ScatterDriver(distData)

    # Worker ranks only from here on.
    stats = MPI.Status()
    # NOTE(review): verbose is *called* here but tested as a plain attribute
    # inside the loop below -- confirm which of the two is intended.
    # NOTE(review): str(nData) renders the broadcast list, brackets included.
    if self.verbose():
        print 'Starting calculation (' + str(nData) + ' total inputs)'
    while True:
        output.StartTimer('WAITING')
        if self.verbose:
            print 'Waiting for data ...'
            sys.stdout.flush()
        # Any tag is accepted; the tag itself is inspected via `stats`.
        SLICE = self.comm.recv(source=0, tag=MPI.ANY_TAG, status=stats)
        output.EndTimer('WAITING')
        if stats.Get_tag() == QUIT:
            break
        if self.verbose:
            print 'Working on ' + str(len(SLICE)) + ' data slices'
            sys.stdout.flush()
        results = [self.DistFunc(item) for item in SLICE]
        # Tell rank 0 this worker is done before shipping the results; only
        # the signal send is counted as waiting time.
        output.StartTimer('WAITING')
        self.comm.Send(signaldone, dest=0)
        output.EndTimer('WAITING')
        self.comm.send(results, dest=0)
    # NOTE(review): every other flush in this method goes through sys.stdout;
    # confirm that the instance really has a `stdout` attribute.
    self.stdout.flush()
def ScatterCoords(mols): if not pl.mpi: output.StartTimer('COORDS') for mol in mols: if not mol.HasProp('coords'): SetCoords(mol) output.EndTimer('COORDS') return None needCalc = [mol for mol in mols if not mol.HasProp('coords')] if len(needCalc) == 0: return None # Check if values are already computed if mprms.UseMongo: import mongoserver as mongo needSMI = [Chem.MolToSmiles(mol) for mol in needCalc] myMongo = mongo.LookupDB(metric, needSMI) for v, m in zip(myMongo, needCalc): if v is not None: SetListProp(m, 'coords', v) oldn = len(needCalc) needCalc = [mol for mol in mols if not mol.HasProp('coords')] print 'Used %d memorized coordinate values' % (oldn - len(needCalc)) if len(needCalc) == 0: return None output.StartTimer('COORDS') print 'Scattering chemical space coordinate calculation ...', len(needCalc) pl.MyTask.SetFunction(MPICoordCalc) coords = pl.MyTask.RunMPI(needCalc) for coord, mol in zip(coords, needCalc): SetListProp(mol, 'coords', coord) # Update values in mongo database if mprms.UseMongo: needSMI = [Chem.MolToSmiles(mol) for mol in needCalc] mongo.UpdateDB( metric, {s: GetListProp(m, 'coords') for s, m in zip(needSMI, needCalc)}) print '%d new memorized coordinate values' % len(needCalc) output.EndTimer('COORDS')
def ScatterAssign(mols): toassign = [m for m in mols if not m.HasProp('gridcoord')] if len(toassign) == 0: return oput.StartTimer('GRID ASSIGN') if len(toassign) < ChunkSize or not pl.mpi: assignments = grid(toassign) for assign, mol in zip(assignments, toassign): print "type(assign):", type(assign) print "assign:", assign print[type(item) for item in assign] newassign = tuple(map(float, assign)) SetListProp(mol, 'gridcoord', newassign) else: print 'Scattering grid assignments ...' scatter = [(grid, [m.GetProp('coords') for m in toassign[i:i + ChunkSize]]) for i in xrange(0, len(toassign), ChunkSize)] if pl.verbose: print len(scatter) pl.MyTask.SetFunction(MPIGridAssign) assigns = pl.MyTask.RunMPI(scatter) for mol, gridcoord in zip(toassign, (a for x in assigns for a in x)): mol.SetProp('gridcoord', gridcoord) oput.EndTimer('GRID ASSIGN')
def WriteWatchFolder(mylib, wrotepool, itnum):
    """Dump the screened library to a file and move it into the watch folder.

    The molecules yielded by ``LMMScreener(mylib, wrotepool)`` are written
    with ``DumpMols`` to ``WatchPrefix + str(itnum) + '.oeb.gz'`` in the
    current directory, passed through ``StripData``, and the file is then
    moved to ``WatchFolder + <filename>``.  The destination path is recorded
    in ``NoReadFiles``.

    Args:
        mylib: library handed to ``LMMScreener``.
        wrotepool: second argument handed to ``LMMScreener``.
        itnum: iteration number embedded in the file name.

    Raises:
        Whatever ``shutil.move`` raises if the file cannot be moved; the
        destination is only registered in ``NoReadFiles`` after a
        successful move.
    """
    # Function-scope import keeps this block self-contained.
    import shutil

    oput.StartTimer('WRITE')
    filename = WatchPrefix + str(itnum) + '.oeb.gz'
    destination = WatchFolder + filename
    writemols = list(LMMScreener(mylib, wrotepool))
    DumpMols(writemols, filename)
    for mol in writemols:
        StripData(mol)
    # Move without a shell: paths containing spaces or shell metacharacters
    # are handled correctly, and a failed move is reported, not ignored.
    shutil.move(filename, destination)
    NoReadFiles.add(destination)
    oput.EndTimer('WRITE')
def ComputeObjectives(mols_tocalc, gen=0): if CINDES_interface: print 'calculating via CINDES program' output.StartTimer('CINDES') qc.calculate(mols_tocalc, gen=gen) output.EndTimer('CINDES') elif callable(fitnessfunction): for mol in mols_tocalc: value = fitnessfunction(mol) mol.SetDoubleProp('Objective', float(value)) else: #Serial raise NotImplementedError( 'only CINDES objectives are supported currently')
def ReadWatchFolder(mylib, wrotepool): global mymaxit oput.StartTimer('READ') if pl.mpi: answer = ScatterReadWatchFolder(mylib, wrotepool) oput.EndTimer('READ') return answer mysmi = set(m.GetProp('isosmi') for m in mylib) print "Reading additional molecules: ", for file in glob(WatchFolder + '*.oeb.gz'): if basename(file)[:len(WatchPrefix)] == WatchPrefix: mymaxit = extractnum(basename(file)) if basename(file, False) in NoReadFiles: continue nNew = 0 for nmol, newmol in enumerate(GetLowMemMols(file)): smi = newmol.GetProp('isosmi') if not smi in mysmi: nNew += 1 mysmi.add(smi) wrotepool.add(smi) StripData(newmol) mylib.append(newmol) NoReadFiles.add(basename(file, False)) print basename(file, True) + ' (' + str(nNew) + '/' + str(nmol + 1) + ')', print 'done.' oput.EndTimer('READ') return mymaxit
def select(self, pool): nSwap = 0 # 2. we assign molecular coordinates to the molecules in pool coords = self.GetCoords(pool) # 3. Do some initializations: scores = np.array([mol.GetDoubleProp('Objective') for mol in pool]) # 4 Calculate the diversity measure for the pure diversity subset # 4.1 First Select a pure diversity based sample subset picks = self.GetPureDiversityPicks(coords) templib = [pool[i] for i in picks] # 4.2 coords = self.NormCoords(coords, templib) # 4.3 calculate the average distance AveDistSqr = self.GetAveDistSqr(templib) # 4.4 calculate the average objective value aveobj = sum(m.GetDoubleProp('Objective') for m in templib) / (float(len(templib))) print 'Average objective value of pure diversity subset:', aveobj output.StartTimer("OBJECTIVE MXMN") print 'Optimizing library ...', distances = self.GetDistances(coords) # scale scores and diversity-values: from sklearn import preprocessing distances = preprocessing.scale(distances) scores = preprocessing.scale(scores) fitness = self.cdiv * distances + self.pdiv * scores * minsign fittests = np.argsort(fitness) newlib = [] for i in fittests[-self.subsetSize:]: newlib.append(pool[i]) newlib.sort(key=lambda x: x.GetDoubleProp('Objective'), reverse=not minimize) print 'Average objective value after optimization: ', sum( np.array([m.GetDoubleProp('Objective') for m in newlib]) / float(len(newlib))) output.EndTimer('OBJECTIVE MXMN') return newlib
def GridDiversity(oldmols, newmols, pcabasis=None, molgrid=None): global grid, nCellDims, nBins if DE: print "nCellDims:", nCellDims print "nBins:", nBins if molgrid is None: molgrid = dict() mols = oldmols + newmols else: mols = newmols if grid is None: print 'Creating new grid' grid = PCAGrid(mols, nCellDims, nBins, pcabasis=pcabasis, scaleBins=BinsByVariance) ScatterCoords([m for m in mols if not m.HasProp('gridcoord')]) ScatterAssign(mols) ScatterDecider(mols) nNew = 0 oput.StartTimer('GRID PICKS') for mol in mols: index = mol.GetProp('gridcoord') if molgrid.has_key(index): if molgrid[index].GetDoubleProp('decider') < mol.GetDoubleProp( 'decider'): molgrid[index] = mol nNew += 1 else: molgrid[index] = mol nNew += 1 print len(molgrid),'/',np.product(grid.nbins),'occupied cells (',\ nNew,'new)' oput.EndTimer('GRID PICKS') return len(molgrid), molgrid.values(), molgrid
def ScatterDecider(mols): toCompute = [m for m in mols if not m.HasProp('decider')] if len(toCompute) == 0: return else: pass #print "len(toCompute):", len(toCompute) #print 'C1=CC(=O)C(=O)C=C1CC=O' in [ oe.OECreateCanSmiString(m) for m in mols ] oput.StartTimer('BIAS FUNCTION') if not pl.mpi or fastDecider: for m in toCompute: print m.SetDoubleProp('decider', decider(m)) else: print 'Scattering bias function ...' sendmols = [pl.SendMol(m) for m in toCompute] pl.MyTask.SetFunction(MPIDecider) vals = pl.MyTask.RunMPI(sendmols) for v, m in zip(vals, toCompute): m.SetDoubleProp('decider', v) oput.EndTimer('BIAS FUNCTION')
def GetAveDistSqr(templib): output.StartTimer('NN DIST CALC') AveDistSqr = (1.0 - similarity.NNSimilarity(templib, average=True))**2 output.EndTimer('NN DIST CALC') print 'average diversity value of pure diversity subset:', AveDistSqr return AveDistSqr
def select(self, pool):
    """Pick a diverse subset of ``pool`` and swap members for better scorers.

    A pure-diversity subset is picked first.  Each pick is then examined in
    turn: unless ``self.selectfittest`` is set, picks that already meet
    ``TargetScore`` are kept as they are.  Otherwise candidates are limited
    by a mask combining ``targetmask`` with the result of
    ``self.GetNeighborPickMask``; if any candidate survives, the one with
    the smallest masked value replaces the pick (the masked values are
    distances, or negated scores when ``self.selectfittest`` is set).  If
    none survives, the best-scoring molecule under the neighbor/pick mask
    replaces the pick when it scores strictly better.

    Args:
        pool: sequence of molecules carrying a float 'Objective' property.

    Returns:
        list: the selected molecules, sorted by 'Objective' (descending when
        ``minimize`` is false).  The number of swaps is also stored in
        ``output.obstats['nSwap']``.
    """
    nSwap = 0
    # 2. we assign molecular coordinates to the molecules in pool
    coords = self.GetCoords(pool)
    # 3. Do some initializations:
    # Masked array: its mask is reassigned inside the main loop below.
    scores = np.ma.array([mol.GetDoubleProp('Objective') for mol in pool])
    # 4 Calculate the diversity measure for the pure diversity subset
    # 4.1 First Select a pure diversity based sample subset
    picks = self.GetPureDiversityPicks(coords)
    templib = [pool[i] for i in picks]
    # 4.2 normalize the coordinates against the diversity subset
    coords = self.NormCoords(coords, templib)
    # 4.3 calculate the average distance
    AveDistSqr = self.GetAveDistSqr(templib)
    # 4.4 calculate the average objective value
    aveobj = sum(
        m.GetDoubleProp('Objective') for m in templib) / (float(len(templib)))
    print 'Average objective value of pure diversity subset:', aveobj

    ############################ NEIGHBORHOOD MAXIMIN ##################
    #Discard the original subset; instead, pick the BEST SCORING COMPOUND
    #within the neighborhood of each compound

    # make a boolean mask so we don't pick already picked ones
    # NOTE(review): the np.bool alias was removed in NumPy 1.24; this line
    # only works with older NumPy releases.
    pickmask = np.zeros(len(pool), dtype=np.bool)
    for i in picks:
        pickmask[i] = True
    # mask every value larger than TargetScore (after applying minsign).
    targetmask = np.ma.getmask(
        np.ma.masked_greater(scores * minsign, TargetScore * minsign))
    # NOTE(review): the next three prints look like leftover debug output.
    print "targetmask:", targetmask
    print "scores*minsign:", scores * minsign
    print "TargetScore * minsign:", TargetScore * minsign

    output.StartTimer("OBJECTIVE MXMN")
    print 'Optimizing library ...',
    newlib = []
    ######### MAIN LOOP #########
    for ipick in picks:
        myscore = pool[ipick].GetDoubleProp('Objective')
        if not self.selectfittest:
            #Skip compounds already at target
            if myscore * minsign <= TargetScore * minsign:
                newlib.append(pool[ipick])
                continue

        #Mask compounds outside of current neighborhood
        #or that have already been picked
        distsqr = self.GetDistSqr(coords, ipick)
        neighbor_pick_mask = self.GetNeighborPickMask(distsqr, AveDistSqr,
                                                      pickmask)
        mask = _array_or(targetmask, neighbor_pick_mask)
        if self.selectfittest:
            # Rank candidates by score instead of by distance; the sign flip
            # lets the argmin below select on the negated score.
            # NOTE(review): `scores` may still carry the mask assigned near
            # the end of an earlier iteration -- confirm that is intended.
            distsqr = np.ma.masked_array(scores * -1.0 * minsign, mask=mask)
        else:
            #If any compounds in the neighborhood hit the target, pick
            #the closest one
            # get dist's of mols not already picked, in neighborhood
            # and objective value above cutoff value
            distsqr = np.ma.masked_array(distsqr, mask=mask)
        if distsqr.count() > 0:
            # NOTE(review): leftover debug output.
            print "I'm here!"
            # change pick for best pick:
            inewpick = np.argmin(distsqr)
            newlib.append(pool[inewpick])
            # adjust pickmask
            pickmask[inewpick] = True
            pickmask[ipick] = False
            nSwap += 1
            continue

        #If there is no compound hitting the target, pick the
        #best one in the neighborhood
        # get only scores in neighborhood not already picked
        # (this replaces the mask on the shared `scores` array, so the mask
        # stays in place for later iterations)
        scores.mask = neighbor_pick_mask
        # and get the best value even if not fulfilling cutoff
        inewpick = np.argmin(minsign * scores)
        # swap only if the candidate scores strictly better than ipick:
        if scores[inewpick] * minsign < myscore * minsign:
            newlib.append(pool[inewpick])
            pickmask[inewpick] = True
            pickmask[ipick] = False
            nSwap += 1
        else:
            newlib.append(pool[ipick])

    #####
    #Done with optimizing maximin
    newlib.sort(
        key=lambda x: x.GetDoubleProp('Objective'), reverse=not minimize)
    print 'swapped', nSwap, '/', len(newlib), 'compounds'
    print 'Average objective value after optimization: ', sum(
        np.array([m.GetDoubleProp('Objective') for m in newlib])/float(len(newlib)))
    output.EndTimer('OBJECTIVE MXMN')
    output.obstats['nSwap']=nSwap
    return newlib
def GetAveDistSqr(templib): output.StartTimer('NN DIST CALC') AveDistSqr = distance.AveNNDistance(templib) output.EndTimer('NN DIST CALC') print 'average diversity value of pure diversity subset:', AveDistSqr return AveDistSqr
def GridDiversity_JITFilter(oldmols, newmols, Filter=True, Geom=False, molgrid=None): if DE: print "oldmols:", oldmols if not (Filter or Geom): return GridDiversity(oldmols, newmols) if Filter: newmols = dr.DriveFilters(newmols, Filter, False) ScatterCoords([m for m in newmols if not m.HasProp('gridcoord')]) ScatterAssign(newmols) ScatterDecider(newmols) #Get old assignments if not passed oput.StartTimer('GRID PICKS') if not molgrid: molgrid = {} for mol in oldmols: index = mol.GetProp('gridcoord') if molgrid.has_key(index): if molgrid[index].GetDoubleProp('decider') < mol.GetDoubleProp( 'decider'): molgrid[index] = mol else: molgrid[index] = mol nOld = len(molgrid) #Screen new molecules for novelty toscreen = [] for mol in newmols: index = mol.GetProp('gridcoord') try: if (not molgrid.has_key(index)) or \ molgrid[index].GetDoubleProp('decider')<mol.GetDoubleProp('decider'): toscreen.append(mol) except ValueError: print "No Decider keyword for:", Chem.MolToSmiles(molgrid[index]) #print molgrid[index].GetDoubleProp('decider') pass oput.EndTimer('GRID PICKS') print 'Novel mutants:', len(toscreen), '/', len(newmols) #Run filters on novel molecules only if Geom: goodmols = dr.DriveFilters(toscreen, Filter, Geom) print 'Molecules passing filters:', len(goodmols) else: goodmols = toscreen #Check for some problems badgoodmols = [m for m in goodmols if not m.HasProp('gridcoord')] if len(badgoodmols) > 0: DumpLowMemMols(badgoodmols, 'failchange.pjar.gz', True) print 'Molecule was changed after filtering ...' 
ScatterCoords(badgoodmols) ScatterAssign(badgoodmols) ScatterDecider(badgoodmols) #Assign filtered novel molecules oput.StartTimer('GRID PICKS') nReplace = 0 for mol in goodmols: index = mol.GetProp('gridcoord') if molgrid.has_key(index): if molgrid[index].GetDoubleProp('decider') < mol.GetDoubleProp( 'decider'): molgrid[index] = mol nReplace += 1 else: molgrid[index] = mol oput.EndTimer('GRID PICKS') print len(molgrid),'/',np.product(grid.nbins),'occupied cells (',\ len(molgrid)-nOld,'new, '+str(nReplace)+' replaced.)' return len(molgrid), molgrid.values(), molgrid