Пример #1
0
def s(data, filename='tmp.dat'):
    """Write ``data`` to a text file laid out for easy reading in gnuplot.

    ``data`` is transposed before it is written, so for two-dimensional
    input each sequence in ``data`` becomes one column of the file.  Values
    on a row are separated by ``', '``.  ``filename`` is created, or
    overwritten if it already exists.

    Example (writes ``tmp.dat``)::

        s(data, filename='tmp.dat')
    """
    # Transpose first, then coerce to an array, exactly as the writer expects.
    columns = _array(_transpose(data))
    _savetxt(filename, columns, delimiter=', ')
Пример #2
0
    def ReadMolecules(self,titleLine=False,smilesColumn=0,nameColumn=1): #titleLine for smiles
        """Load the molecules from ``self.input_file`` into this object.

        The parser is chosen from ``self.file_extension``:

        * ``.smi`` / ``.smiles`` -> ``_SmilesMolSupplier``
        * ``.mol2`` -> blocks from ``RetrieveMol2Block`` parsed with
          ``_MolFromMol2Block``
        * anything else -> ``_SDMolSupplier``

        Every entry that parses is appended to ``self.mols`` and its
        identifier to ``self.mols_ids``.  The identifier is the zero-based
        position in the input when ``self.name_field`` is ``None``, otherwise
        the value of that molecule property.  The position of every entry
        that parses to ``None`` is appended to ``self.molserr``.

        Args:
            titleLine: forwarded to ``_SmilesMolSupplier`` (SMILES input only).
            smilesColumn: forwarded to ``_SmilesMolSupplier`` (SMILES input only).
            nameColumn: forwarded to ``_SmilesMolSupplier`` (SMILES input only).

        Side effects:
            Sets ``self.nb_mols`` (and the alias ``self.nbMols``) to
            ``len(self.mols)``.  When ``self.verbose`` is true, a summary is
            printed and, if any entry failed, the failed positions are
            written to ``incorrect_molecules.csv`` with ``_savetxt``.
        """
        # Step 1: pick the parser and obtain an iterable of parsed molecules.
        if self.file_extension in ('.smi', '.smiles'):
            if self.verbose:
                print("Format of the structures file = SMILES")
            parsed = _SmilesMolSupplier(self.input_file,
                                        smilesColumn=smilesColumn,
                                        nameColumn=nameColumn,
                                        delimiter=self.delimiter,
                                        titleLine=titleLine)
        elif self.file_extension == '.mol2':
            # Honour self.verbose here as the other two branches do.
            if self.verbose:
                print("Format of the structures file = Mol2")
            with open(self.input_file) as fi:
                parsed = [_MolFromMol2Block(block)
                          for block in RetrieveMol2Block(fi)]
        else:
            if self.verbose:
                print("Format of the structures file = SDF")
            parsed = _SDMolSupplier(self.input_file)

        # Step 2: register the molecules.  Failed entries are recorded in
        # self.molserr only, never in self.mols, so that self.mols and
        # self.mols_ids stay aligned whatever the input format.
        for index, mol in enumerate(parsed):
            if mol is None:
                self.molserr.append(index)
                continue
            self.mols.append(mol)
            if self.name_field is None:
                mol_id = index
            else:
                mol_id = mol.GetProp(self.name_field)
            self.mols_ids.append(mol_id)

        # Step 3: publish the count under one attribute for every format.
        # self.nbMols is kept as an alias because the SDF path used to set
        # that spelling.
        self.nb_mols = len(self.mols)
        self.nbMols = self.nb_mols

        # Step 4: optional report on the entries that could not be parsed.
        if self.verbose:
            if self.molserr:
                print("%d molecules (starting at zero) could not be processed.\n" % len(self.molserr))
                err_file = "incorrect_molecules.csv"
                print("This information has been saved in the following file: %s\n" % err_file)
                print("NOTE: the indexes of the molecules start at zero. Thus the first molecule is molecule 0.")
                # Save the information about which molecules could not be processed correctly.
                _savetxt(err_file, self.molserr, fmt="%d")
            else:
                print("All molecules in the input file were processed correctly")
Пример #3
0
# Load the input structures; `molecules.mols` is consumed by the steps below.
molecules.read_molecules()

# Get the information about the data set
dataset = get_dataset_info()
dataset.extract_substructure_information(radii=radii, mols=molecules.mols)

# Calculate fps
# if we do not consider external file
if fileMolsEXT is None:
    calc_fps_no_reference_keys = CalculateFPs(radii=radii, mols=molecules.mols)
    calc_fps_no_reference_keys.calculate_hashed_fps_binary_quick(nBits=nbBits)
    calc_fps_no_reference_keys.calculate_hashed_fps_counts(nBits=nbBits)

    # Write the hashed fingerprints to "<outname>_hashed_{binary,counts}.csv".
    fp_hash_b = outname + "_hashed_binary.csv"
    fp_hash_c = outname + "_hashed_counts.csv"
    _savetxt(fp_hash_b,
             calc_fps_no_reference_keys.fps_hashed_binary_quick,
             delimiter=',',
             fmt="%d")
    _savetxt(fp_hash_c,
             calc_fps_no_reference_keys.fps_hashed_counts,
             delimiter=',',
             fmt="%d")

    if unhashed:
        if image:
            calc_fps_no_reference_keys.calculate_unhashed_fps(
                draw_substructures=True)

            # Write the unhashed fingerprints next to the hashed ones.
            fpbinary = outname + '_unhashed_binary.csv'
            fpcounts = outname + '_unhashed_counts.csv'
            _savetxt(fpbinary,
                     calc_fps_no_reference_keys.fps_unhashed_binary,
                     delimiter=',',
                     fmt="%d")
            _savetxt(fpcounts,
                     calc_fps_no_reference_keys.fps_unhashed_counts,
                     delimiter=',',
                     fmt="%d")

            # save the smiles and the substructures, one "key<TAB>value" line
            # per entry, sorted by key.  The context manager guarantees the
            # file is closed even if a write raises.
            fname = outname + '_substructure_smiles.csv'
            with open(fname, 'w') as outfile:
                for key, value in sorted(calc_fps_no_reference_keys.substructures_smiles.items()):
                    outfile.write(str(key) + '\t' + str(value) + '\n')
            # NOTE(review): a "save the dictionary of substructures" step was
            # announced here, but no code for it follows in this excerpt.
    def read_molecules(self,
                       titleLine=False,
                       smilesColumn=0,
                       nameColumn=1):  #titleLine for smiles
        """Load the molecules from ``self.input_file`` into this object.

        The parser is chosen from ``self.file_extension``:

        * ``.smi`` / ``.smiles`` -> ``_SmilesMolSupplier``
        * ``.mol2`` -> blocks from ``RetrieveMol2Block`` parsed with
          ``_MolFromMol2Block``
        * anything else -> ``_SDMolSupplier``

        Every entry that parses is appended to ``self.mols`` and its
        identifier to ``self.mols_ids``.  The identifier is the zero-based
        position in the input when ``self.name_field`` is ``None``, otherwise
        the value of that molecule property.  The position of every entry
        that parses to ``None`` is appended to ``self.molserr``.

        Args:
            titleLine: forwarded to ``_SmilesMolSupplier`` (SMILES input only).
            smilesColumn: forwarded to ``_SmilesMolSupplier`` (SMILES input only).
            nameColumn: forwarded to ``_SmilesMolSupplier`` (SMILES input only).

        Side effects:
            Sets ``self.nb_mols`` (and the alias ``self.nbMols``) to
            ``len(self.mols)``.  When ``self.verbose`` is true, a summary is
            printed and, if any entry failed, the failed positions are
            listed and written with ``_savetxt`` to
            ``"incorrect_molecules_" + self.file_name + ".csv"``.
        """
        # Step 1: pick the parser and obtain an iterable of parsed molecules.
        if self.file_extension in ('.smi', '.smiles'):
            if self.verbose:
                print("Format of the structures file = SMILES")
            candidates = _SmilesMolSupplier(self.input_file,
                                            smilesColumn=smilesColumn,
                                            nameColumn=nameColumn,
                                            delimiter=self.delimiter,
                                            titleLine=titleLine)
        elif self.file_extension == '.mol2':
            # Honour self.verbose here as the other two branches do.
            if self.verbose:
                print("Format of the structures file = Mol2")
            with open(self.input_file) as fi:
                candidates = [_MolFromMol2Block(block)
                              for block in RetrieveMol2Block(fi)]
        else:
            if self.verbose:
                print("Format of the structures file = SDF")
            candidates = _SDMolSupplier(self.input_file)

        # Step 2: register the molecules.  Failed entries are recorded in
        # self.molserr only, never in self.mols, so that self.mols and
        # self.mols_ids stay aligned whatever the input format.
        for position, molecule in enumerate(candidates):
            if molecule is None:
                self.molserr.append(position)
                continue
            self.mols.append(molecule)
            if self.name_field is None:
                mol_id = position
            else:
                mol_id = molecule.GetProp(self.name_field)
            self.mols_ids.append(mol_id)

        # Step 3: publish the count under one attribute for every format.
        # self.nbMols is kept as an alias because the SDF path used to set
        # that spelling.
        self.nb_mols = len(self.mols)
        self.nbMols = self.nb_mols

        # Step 4: optional report on the entries that could not be parsed.
        if self.verbose:
            if self.molserr:
                print("The following %d molecules (starting at zero) could not be processed:\n" % len(self.molserr))
                for failed_position in self.molserr:
                    print(failed_position)
                print("NOTE: the indexes of the molecules start at zero. Thus the first molecule is molecule 0.")
                # NOTE(review): if self.file_name carries a directory part,
                # this concatenation will not yield a plain file name - confirm
                # how file_name is derived.
                err_file = "incorrect_molecules_" + self.file_name + ".csv"
                print("This information has been saved in the following file: %s\n" % err_file)
                # Save the information about which molecules could not be processed correctly.
                _savetxt(err_file, self.molserr, fmt="%d")
            else:
                print("All molecules in the input file were processed correctly")
# Get the information about the data set: build the dataset-info object
# (molecule names are looked up under the '_Name' field) and let it extract
# substructure information from the loaded molecules for the given radii.
dataset = get_dataset_info(name_field='_Name')
dataset.extract_substructure_information(radii=radii, mols=molecules.mols)

# Calculate fingerprints (fps).
# This branch runs only when no external molecule file was supplied
# (fileMolsEXT is None).
if fileMolsEXT is None:
    # Compute the hashed fingerprints, binary and counts, with nbBits bits.
    calc_fps_no_reference_keys = CalculateFPs(radii=radii, mols=molecules.mols)
    calc_fps_no_reference_keys.calculate_hashed_fps_binary_quick(nBits=nbBits)
    calc_fps_no_reference_keys.calculate_hashed_fps_counts(nBits=nbBits)
    # Write fps to file: "<outname>_hashed_binary.csv" and
    # "<outname>_hashed_counts.csv", comma-separated integers.
    fp_hash_b = outname + "_hashed_binary.csv"
    fp_hash_c = outname + "_hashed_counts.csv"
    _savetxt(fp_hash_b,
             calc_fps_no_reference_keys.fps_hashed_binary_quick,
             delimiter=',',
             fmt="%d")
    _savetxt(fp_hash_c,
             calc_fps_no_reference_keys.fps_hashed_counts,
             delimiter=',',
             fmt="%d")
    # Unhashed fingerprints are produced only on request.
    if unhashed:
        # With `image` set, the unhashed calculation is asked to draw the
        # substructures as well (draw_substructures=True).
        if image:
            calc_fps_no_reference_keys.calculate_unhashed_fps(
                draw_substructures=True)
            fpbinary = outname + '_unhashed_binary.csv'
            fpcounts = outname + '_unhashed_counts.csv'
            # Write the unhashed binary fingerprints as comma-separated integers.
            _savetxt(fpbinary,
                     calc_fps_no_reference_keys.fps_unhashed_binary,
                     delimiter=',',
                     fmt="%d")