def readData(propCod):
    """Loads the csv tables with data extracted from DBS.

    For every property code and every LARS code listed under it, the
    corresponding file name is obtained from getFileName(). Files that
    do not exist are skipped silently.

    :param propCod: (mapping) Property codes mapped to an iterable of
        LARS codes, i.e. {property code: [LARS codes]}.
    :return:
        tables: (dict) Tables of property and source (LARS code).
            {property code: {LARS code: pandas DataFrame}}
    :raises myExceptions.NoFile: if a file disappears between the
        existence check and the actual read.
    """
    tables = {}
    for prop in propCod:
        perSource = {}
        tables[prop] = perSource
        for larsCod in propCod[prop]:
            fileName = getFileName(prop, larsCod)
            if os.path.exists(fileName):
                try:
                    perSource[larsCod] = pd.read_csv(fileName)
                except FileNotFoundError:
                    raise myExceptions.NoFile(fileName)
    return tables
def readFieFile(fileName):
    """Reads family isomer enumeration (fie) file.

    The file is parsed as whitespace-separated columns without a header line.

    :param fileName: (str) File name.
    :return:
        df: (pandas DataFrame) Table with ENU code ('nam'), molecular
            formula ('frm') and SMILES string ('smiles').
    :raises myExceptions.NoFile: if fileName does not exist.
    """
    if not os.path.exists(fileName):
        raise myExceptions.NoFile(fileName)

    # No comment character is passed to read_csv: '#' can occur inside
    # SMILES strings, so it must not be treated as the start of a comment.
    # The separator is a raw string so that '\s' is not an invalid escape.
    df = pd.read_csv(fileName, sep=r'\s+', names=['nam', 'frm', 'smiles'])
    return df
def run():
    """Writes identifiers to txt file.

    Reads the command line from sys.argv:
        sys.argv[2]: name of the enumeration; only the part after the last
            '/' is used. Input is 'out/<name>.json', output is
            'out/00_<name>.lst'.
        sys.argv[3]: (optional) family isomer enumeration (fie) file; when
            given, the molecules are filtered with selectMolecules().

    Each output line holds formula, CAS, name, InChI and SMILES. A missing
    CAS or name is written as '%', and blanks in the name are replaced by '_'.

    :raises myExceptions.ArgError: if the number of arguments is not 3 or 4.
    :raises myExceptions.NoFile: if 'out/<name>.json' does not exist.
    """
    nArgs = len(sys.argv)
    if nArgs not in (3, 4):
        raise myExceptions.ArgError('3 or 4', nArgs)

    name = sys.argv[2].split('/')[-1]

    enuMolFile = 'out/{}.json'.format(name)
    if not os.path.exists(enuMolFile):
        raise myExceptions.NoFile(enuMolFile)

    with open(enuMolFile) as jsonFile:
        enuData = json.load(jsonFile, object_hook=moleculeDecoder)

    if nArgs == 4:
        fieFile = sys.argv[3]
        isomers = IO.readFieFile(fieFile)
        isomers = utils.canonicalizeSmiles(isomers)
        enuData = selectMolecules(isomers, enuData)

    outFileName = 'out/00_{}.lst'.format(name)
    # The context manager closes the file even if a record fails to format.
    with open(outFileName, 'w') as out:
        for key in enuData:
            mol = enuData[key]

            cas = mol.cas or '%'
            # Blanks would break the whitespace-separated column layout.
            molName = (mol.name_pcp or '%').replace(' ', '_')

            out.write('{:10} {:12} {:40} {:30} {}\n'.format(
                mol.form_pcp, cas, molName, mol.inchi_pcp, mol.smiles))
def getDbsEntries(fileName):
    """Loads the DBS entries stored in a JSON file.

    The JSON objects are converted through dbsEntryDecoder while loading.

    :param fileName: (str) File name.
    :return:
        dbsEntries: (dict) Dictionary of DBS entries.
    :raises myExceptions.NoFile: if fileName does not exist.
    """
    if os.path.exists(fileName):
        with open(fileName) as handle:
            return json.load(handle, object_hook=dbsEntryDecoder)

    raise myExceptions.NoFile(fileName)
def __init__(self, fileName):
    """Builds the dbsConfiguration object from a configuration file.

    The file is parsed with configparser: it is made of "[section]"
    headers followed by "name: value" entries (RFC 822 style, including
    continuation lines). See the 'configparser' documentation for details.
    The parsed result is stored in self.config.

    :param fileName: (str) File name from which configuration is read.
    :raises myExceptions.NoFile: if fileName does not exist.
    """
    fileFound = os.path.exists(fileName)
    if not fileFound:
        raise myExceptions.NoFile(fileName)

    parser = configparser.ConfigParser()
    parser.read(fileName)
    self.config = parser
def run(dbsConfig):
    """Get identifiers and all available data for a given molecule in DBS
    and save data to data/ directory.

    The molecule list file (sys.argv[2], e.g. 00_file.lst) is read as a
    whitespace-separated table without header, with the columns
    'frm', 'cas', 'nam', 'inchi' and 'smiles'.

    :param dbsConfig: (dbsConfiguration object) DBS configuration object.
    :raises myExceptions.ArgError: if the number of arguments is not 3.
    :raises myExceptions.NoFile: if the molecule list file does not exist.
    """
    nArgs = len(sys.argv)
    if nArgs != 3:
        raise myExceptions.ArgError(nArgs, 3)

    # I decided to keep both files (molListFile and enuMolFile)
    # because I can easily comment out entries in molListFile,
    # because of "#" in smiles strings I cannot use this symbol to start a comment.

    # 00_file.lst
    molListFile = sys.argv[2]
    if not os.path.exists(molListFile):
        raise myExceptions.NoFile(molListFile)

    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    molList = pd.read_csv(molListFile, sep=r'\s+', header=None,
                          names=['frm', 'cas', 'nam', 'inchi', 'smiles'])

    dbsFileName = dbsConfig.getDbsFileName()
    dbsEntries = dbs.getDbsEntries(dbsFileName)

    print('Getting Identifiers')
    path = dbsConfig.getPath()
    molList = getCids(dbsEntries, molList, path)
    # Lookup via synonyms (getCidsofSynonyms) is currently disabled.

    print('Getting Data')
    propCod = dbsConfig.getPropCod()
    tables = getData(dbsEntries, molList, path, propCod)

    print('Writing Data')
    writeData(tables)