def s(data, filename='tmp.dat'):
    """Dump ``data`` to a text file that gnuplot can read.

    ``data`` is transposed and converted to an array before it is
    written, with ', ' separating the values on each line.  Assuming the
    helpers are the NumPy functions of the same name, a sequence of
    equal-length 1-D arrays therefore ends up with one array per column.

    Example (creates or overwrites tmp.dat)::

        s(data, filename='tmp.dat')
    """
    columns = _array(_transpose(data))
    _savetxt(filename, columns, delimiter=', ')
def ReadMolecules(self, titleLine=False, smilesColumn=0, nameColumn=1):
    """Read the molecules stored in ``self.input_file``.

    The reader is chosen from ``self.file_extension``:

    * ``.smi`` / ``.smiles``: ``_SmilesMolSupplier``
    * ``.mol2``: the file is split with ``RetrieveMol2Block`` and each
      block is parsed with ``_MolFromMol2Block``
    * anything else: ``_SDMolSupplier``

    Args:
        titleLine: passed to the SMILES supplier (SMILES input only).
        smilesColumn: passed to the SMILES supplier (SMILES input only).
        nameColumn: passed to the SMILES supplier (SMILES input only).

    Side effects:
        * every successfully parsed molecule is appended to ``self.mols``
          and its identifier to ``self.mols_ids``; the identifier is the
          zero-based position in the input when ``self.name_field`` is
          None, otherwise the molecule property of that name;
        * the zero-based position of every entry the reader returned as
          None is appended to ``self.molserr``;
        * ``self.nb_mols`` and ``self.nbMols`` are both set to
          ``len(self.mols)``;
        * when ``self.verbose`` is true a summary is printed and, if any
          entry failed, the failed positions are written to
          ``incorrect_molecules.csv``.
    """
    # print(...) with one argument produces the same output as the
    # Python 2 print statement, so the messages are unchanged.
    if self.file_extension in ['.smi', '.smiles']:
        if self.verbose:
            print("Format of the structures file = SMILES")
        candidates = _SmilesMolSupplier(self.input_file,
                                        smilesColumn=smilesColumn,
                                        nameColumn=nameColumn,
                                        delimiter=self.delimiter,
                                        titleLine=titleLine)
    elif self.file_extension == '.mol2':
        # This message used to be printed even when verbose was off.
        if self.verbose:
            print("Format of the structures file = Mol2")
        with open(self.input_file) as fi:
            candidates = [_MolFromMol2Block(mol2_block)
                          for mol2_block in RetrieveMol2Block(fi)]
    else:
        if self.verbose:
            print("Format of the structures file = SDF")
        candidates = _SDMolSupplier(self.input_file)

    # One collection loop for every format.  The Mol2 branch used to
    # append the failed (None) entries to self.mols as well; they are
    # now only recorded in self.molserr, as for the other formats.
    for i, m in enumerate(candidates):
        if m is None:
            self.molserr.append(i)
            continue
        self.mols.append(m)
        if self.name_field is None:
            mol_id = i
        else:
            mol_id = m.GetProp(self.name_field)
        self.mols_ids.append(mol_id)

    # The count used to be stored under a different name per format (and
    # not at all for SMILES); both attribute spellings are kept so that
    # existing readers of either one keep working.
    self.nb_mols = self.nbMols = len(self.mols)

    if self.verbose:
        if self.molserr:
            print("%d molecules (starting at zero) could not be processed.\n"
                  % len(self.molserr))
            err_file = "incorrect_molecules.csv"
            # Write the file before announcing that it has been saved.
            _savetxt(err_file, self.molserr, fmt="%d")
            print("This information has been saved in the following file: %s\n"
                  % err_file)
            print("NOTE: the indexes of the molecules start at zero. Thus the first molecule is molecule 0.")
        else:
            print("All molecules in the input file were processed correctly")
# Script fragment: read the molecules, compute fingerprints and export them.
#
# NOTE(review): the line breaks and indentation of this fragment have been
# lost.  As stored, only the first call is live code; everything after the
# first '#' is parsed as a comment.  The nesting of the `if` statements (in
# particular which statements belong under `if image:`) cannot be recovered
# from the text alone -- restore it from the original script before running.
#
# Statements, in the order they appear:
#   1. molecules.read_molecules() is called.
#   2. A get_dataset_info() object is created and its
#      extract_substructure_information() is called with `radii` and
#      `molecules.mols`.
#   3. After `if fileMolsEXT is None:` a CalculateFPs object is built from
#      the same radii/molecules, hashed binary and hashed count fingerprints
#      are calculated with nBits=nbBits, and both are written with _savetxt
#      (comma-delimited, "%d") to outname + "_hashed_binary.csv" and
#      outname + "_hashed_counts.csv".
#   4. After `if unhashed:` / `if image:` calculate_unhashed_fps() is called
#      with draw_substructures=True, and fps_unhashed_binary /
#      fps_unhashed_counts are written to outname + '_unhashed_binary.csv'
#      and outname + '_unhashed_counts.csv'.
#   5. The items of substructures_smiles are sorted and written as
#      "key<TAB>value" lines to outname + '_substructure_smiles.csv'.
#
# NOTE(review): that last file is opened and closed by hand; a
# `with open(...)` block would also close it if a write fails.
molecules.read_molecules() # Get the information about the data set dataset = get_dataset_info() dataset.extract_substructure_information(radii=radii,mols=molecules.mols) # Calculate fps # if we do not consider external file if fileMolsEXT is None: calc_fps_no_reference_keys = CalculateFPs(radii=radii,mols=molecules.mols) calc_fps_no_reference_keys.calculate_hashed_fps_binary_quick(nBits=nbBits) calc_fps_no_reference_keys.calculate_hashed_fps_counts(nBits=nbBits) # Write fps to file fp_hash_b=outname+"_hashed_binary.csv" fp_hash_c=outname+"_hashed_counts.csv" _savetxt(fp_hash_b,calc_fps_no_reference_keys.fps_hashed_binary_quick,delimiter=',',fmt="%d") _savetxt(fp_hash_c,calc_fps_no_reference_keys.fps_hashed_counts,delimiter=',',fmt="%d") if unhashed: if image: calc_fps_no_reference_keys.calculate_unhashed_fps(draw_substructures=True) fpbinary=outname+'_unhashed_binary.csv' fpcounts=outname+'_unhashed_counts.csv' _savetxt(fpbinary,calc_fps_no_reference_keys.fps_unhashed_binary,delimiter=',',fmt="%d") _savetxt(fpcounts,calc_fps_no_reference_keys.fps_unhashed_counts,delimiter=',',fmt="%d") # save the smiles and the substructures fname=outname+'_substructure_smiles.csv' outfile = open(fname, 'w' ) for key, value in sorted(calc_fps_no_reference_keys.substructures_smiles.items() ): outfile.write( str(key) + '\t' + str(value) + '\n' ) outfile.close() # save the dictionary of substructures
def read_molecules(self, titleLine=False, smilesColumn=0, nameColumn=1):
    """Read the molecules stored in ``self.input_file``.

    The reader is chosen from ``self.file_extension``:

    * ``.smi`` / ``.smiles``: ``_SmilesMolSupplier``
    * ``.mol2``: the file is split with ``RetrieveMol2Block`` and each
      block is parsed with ``_MolFromMol2Block``
    * anything else: ``_SDMolSupplier``

    Args:
        titleLine: passed to the SMILES supplier (SMILES input only).
        smilesColumn: passed to the SMILES supplier (SMILES input only).
        nameColumn: passed to the SMILES supplier (SMILES input only).

    Side effects:
        * every successfully parsed molecule is appended to ``self.mols``
          and its identifier to ``self.mols_ids``; the identifier is the
          zero-based position in the input when ``self.name_field`` is
          None, otherwise the molecule property of that name;
        * the zero-based position of every entry the reader returned as
          None is appended to ``self.molserr``;
        * ``self.nb_mols`` and ``self.nbMols`` are both set to
          ``len(self.mols)``;
        * when ``self.verbose`` is true a summary is printed and, if any
          entry failed, the failed positions are listed and written to
          ``"incorrect_molecules_" + self.file_name + ".csv"``.
    """
    # print(...) with one argument produces the same output as the
    # Python 2 print statement, so the messages are unchanged.
    if self.file_extension in ['.smi', '.smiles']:
        if self.verbose:
            print("Format of the structures file = SMILES")
        candidates = _SmilesMolSupplier(self.input_file,
                                        smilesColumn=smilesColumn,
                                        nameColumn=nameColumn,
                                        delimiter=self.delimiter,
                                        titleLine=titleLine)
    elif self.file_extension == '.mol2':
        # This message used to be printed even when verbose was off.
        if self.verbose:
            print("Format of the structures file = Mol2")
        with open(self.input_file) as fi:
            candidates = [_MolFromMol2Block(mol2_block)
                          for mol2_block in RetrieveMol2Block(fi)]
    else:
        if self.verbose:
            print("Format of the structures file = SDF")
        candidates = _SDMolSupplier(self.input_file)

    # One collection loop for every format.  The Mol2 branch used to
    # append the failed (None) entries to self.mols as well; they are
    # now only recorded in self.molserr, as for the other formats.
    for i, m in enumerate(candidates):
        if m is None:
            self.molserr.append(i)
            continue
        self.mols.append(m)
        if self.name_field is None:
            mol_id = i
        else:
            mol_id = m.GetProp(self.name_field)
        self.mols_ids.append(mol_id)

    # The count used to be stored under a different name per format (and
    # not at all for SMILES); both attribute spellings are kept so that
    # existing readers of either one keep working.
    self.nb_mols = self.nbMols = len(self.mols)

    if self.verbose:
        if self.molserr:
            print("The following %d molecules (starting at zero) could not be processed:\n"
                  % len(self.molserr))
            for failed_index in self.molserr:
                print(failed_index)
            print("NOTE: the indexes of the molecules start at zero. Thus the first molecule is molecule 0.")
            err_file = "incorrect_molecules_" + self.file_name + ".csv"
            # Write the file before announcing that it has been saved.
            _savetxt(err_file, self.molserr, fmt="%d")
            print("This information has been saved in the following file: %s\n"
                  % err_file)
        else:
            print("All molecules in the input file were processed correctly")
# Script fragment: compute fingerprints for `molecules.mols` and export them.
#
# NOTE(review): the line breaks and indentation of this fragment have been
# lost.  Because it starts with '#', the whole line is parsed as a comment
# as stored.  The nesting of the `if` statements (in particular which
# statements belong under `if image:`) cannot be recovered from the text
# alone -- restore it from the original script before running.
#
# Statements, in the order they appear:
#   1. A get_dataset_info(name_field='_Name') object is created and its
#      extract_substructure_information() is called with `radii` and
#      `molecules.mols`.
#   2. After `if fileMolsEXT is None:` a CalculateFPs object is built from
#      the same radii/molecules, hashed binary and hashed count fingerprints
#      are calculated with nBits=nbBits, and both are written with _savetxt
#      (comma-delimited, "%d") to outname + "_hashed_binary.csv" and
#      outname + "_hashed_counts.csv".
#   3. After `if unhashed:` / `if image:` calculate_unhashed_fps() is called
#      with draw_substructures=True and fps_unhashed_binary is written to
#      outname + '_unhashed_binary.csv'.
#
# NOTE(review): `fpcounts` is assigned but not used in the visible text; the
# fragment appears to stop before the statement that would use it.
# Get the information about the data set dataset = get_dataset_info(name_field='_Name') dataset.extract_substructure_information(radii=radii, mols=molecules.mols) # Calculate fps # if we do not consider external file if fileMolsEXT is None: calc_fps_no_reference_keys = CalculateFPs(radii=radii, mols=molecules.mols) calc_fps_no_reference_keys.calculate_hashed_fps_binary_quick(nBits=nbBits) calc_fps_no_reference_keys.calculate_hashed_fps_counts(nBits=nbBits) # Write fps to file fp_hash_b = outname + "_hashed_binary.csv" fp_hash_c = outname + "_hashed_counts.csv" _savetxt(fp_hash_b, calc_fps_no_reference_keys.fps_hashed_binary_quick, delimiter=',', fmt="%d") _savetxt(fp_hash_c, calc_fps_no_reference_keys.fps_hashed_counts, delimiter=',', fmt="%d") if unhashed: if image: calc_fps_no_reference_keys.calculate_unhashed_fps( draw_substructures=True) fpbinary = outname + '_unhashed_binary.csv' fpcounts = outname + '_unhashed_counts.csv' _savetxt(fpbinary, calc_fps_no_reference_keys.fps_unhashed_binary, delimiter=',', fmt="%d")