def write_stereo_dct(spc_str, allstereo=False):
    """ Read the species file in a .csv format and build a new species
        dictionary that has stereo information.

        The species names are shuffled and split across worker processes
        that each run `_add_stereo_to_dct`; every worker is expected to put
        exactly one dictionary on the shared queue, and those dictionaries
        are merged into the result.

        :param spc_str: contents of the species .csv file
        :param allstereo: forwarded unchanged to `_add_stereo_to_dct`
        :return: (new_dct, new_headers, names_in_order) where
            `names_in_order` lists the species names in the order they
            were read from the input
    """

    # Read the headers
    headers = [
        header for header in read_csv_headers(spc_str)
        if header != 'name'
    ]
    if 'inchi' not in headers:
        headers.append('inchi')
    headers_noich = [
        header for header in headers
        if header not in ('inchi', 'inchikey')
    ]
    new_headers = ['inchi', 'inchikey'] + headers_noich

    # Read in the initial CSV information (deal with mult stereo)
    init_dct = csv_dct(spc_str, values=headers, key='name')

    names_in_order = list(init_dct.keys())
    randomized_names = list(names_in_order)
    random.shuffle(randomized_names)

    # os.sched_getaffinity only exists on some Unix platforms; fall back
    # to the total core count elsewhere instead of raising AttributeError
    if hasattr(os, 'sched_getaffinity'):
        ncores = len(os.sched_getaffinity(0))
    else:
        ncores = os.cpu_count() or 1
    nproc_avail = max(ncores - 1, 1)

    # Never start more workers than there are species, but always start
    # one so that exactly one result is waiting on the queue
    nproc = max(min(nproc_avail, len(randomized_names)), 1)

    queue = multiprocessing.Queue()
    procs = []
    for proc_n in range(nproc):
        # Strided slices keep the chunk sizes within one of each other
        names = randomized_names[proc_n::nproc]
        proc = multiprocessing.Process(
            target=_add_stereo_to_dct,
            args=(queue, names, init_dct, headers_noich, allstereo,))
        procs.append(proc)
        proc.start()

    # Drain the queue before joining so that no worker blocks while
    # flushing its result
    new_dct = {}
    for _ in procs:
        new_dct.update(queue.get())
    for proc in procs:
        proc.join()

    return new_dct, new_headers, names_in_order
def build_spc_dct(spc_str, spc_type):
    """ Build the dictionary of input species from the species string.

        Only the 'csv' species type is handled; it is parsed with
        `csv_dct`. Any other type raises NotImplementedError.

        :param spc_str: contents of the species file
        :param spc_type: format label of the species file
        :return: result of `csv_dct(spc_str)`
    """

    # Reject unsupported formats up front
    if spc_type != 'csv':
        raise NotImplementedError

    return csv_dct(spc_str)
def write_basis_csv(spc_str, outname='species_hof_basis.csv', path='.'):
    """ Read the species file in a .csv format and write a new one
        that has the hof basis species added to it.

        :param spc_str: contents of the species .csv file
        :param outname: name of the file to write
        :param path: directory the file is written into
    """

    # Column layout of the output: 'inchi' first, then everything else
    headers = [
        col for col in read_csv_headers(spc_str) if col != 'name'
    ]
    if 'inchi' not in headers:
        headers.append('inchi')
    headers_noich = [
        col for col in headers if col not in ('inchi', 'inchikey')
    ]
    new_headers = ['inchi'] + headers_noich

    # Read in the initial CSV information (deal with mult stereo)
    init_dct = csv_dct(spc_str, values=headers, key='name')

    # Every input species is queued with an empty second entry
    spc_queue = [[spc_name, ''] for spc_name in init_dct.keys()]

    # Collect the hof basis molecules of each scheme, one entry per
    # unique '<scheme>_<smiles>' label
    merged_dct = {}
    seen_labels = []
    for scheme in ['cbh0', 'cbh1', 'cbh2']:
        _, uniref_dct = basis.prepare_refs(scheme, init_dct, spc_queue)
        for ref_key in list(uniref_dct.keys()):
            ref_smiles = automol.inchi.smiles(uniref_dct[ref_key]['inchi'])
            label = scheme + '_' + ref_smiles
            if label in seen_labels:
                continue
            seen_labels.append(label)
            merged_dct[label] = uniref_dct[ref_key]
            merged_dct[label]['smiles'] = ref_smiles

    # References come first, the original species follow
    merged_dct.update(init_dct)
    init_dct = merged_dct

    # Build the output text row by row
    pieces = [','.join(['name'] + new_headers) + '\n']
    for spc_name in init_dct:
        cells = []
        for col in new_headers:
            cell = init_dct[spc_name][col]
            if isinstance(cell, str):
                cell = "'{}'".format(cell)
            cells.append(str(cell))
        pieces.append('{},'.format(spc_name) + ','.join(cells) + '\n')
    out_str = ''.join(pieces)

    # Write the file
    basis_file = os.path.join(path, outname)
    with open(basis_file, 'w') as file_obj:
        file_obj.write(out_str)
def write_basis_csv(spc_str, outname='species_hof_basis.csv',
                    path='.', parallel=False):
    """ Read the species file in a .csv format and write a new one
        that has the hof basis species added to it.

        Two files are written into `path`:
          * `<outname>_basis`: every reference returned by
            `basis.prepare_refs(..., repeats=True)` for each scheme,
            named '<scheme>_<formula>';
          * `<outname>`: the references returned with `repeats=False`
            (one per unique '<scheme>_<smiles>' label) followed by the
            original species.
        A formula that occurs more than once within a scheme gets a
        '(n)' suffix so the names stay distinct.

        :param spc_str: contents of the species .csv file
        :param outname: name of the main output file
        :param path: directory the files are written into
        :param parallel: forwarded unchanged to `basis.prepare_refs`
    """

    headers = [
        header for header in read_csv_headers(spc_str)
        if header != 'name'
    ]
    if 'inchi' not in headers:
        headers.append('inchi')
    headers_noich = [
        header for header in headers
        if header not in ('inchi', 'inchikey')
    ]
    new_headers = ['inchi', 'inchikey'] + headers_noich
    header_row = ','.join(['name'] + new_headers) + '\n'

    # Read in the initial CSV information (deal with mult stereo)
    init_dct = csv_dct(spc_str, values=headers, key='name')

    # Every input species is queued with an empty second entry
    spc_queue = [[name, ''] for name in init_dct]

    ref_schemes = ['cbh0', 'cbh1', 'cbh2']

    # Write all references (repeats included) to the '_basis' file
    basis_rows = [header_row]
    for ref_scheme in ref_schemes:
        # Formula counters restart for each scheme because the scheme
        # label is part of the written name
        formula_dct = {}
        _, uniref_dct = basis.prepare_refs(
            ref_scheme, init_dct, spc_queue,
            repeats=True, parallel=parallel)
        for name in uniref_dct:
            # 'smiles' is set before the row is built since it may be
            # one of the requested columns
            uniref_dct[name]['smiles'] = automol.inchi.smiles(
                uniref_dct[name]['inchi'])
            formula = automol.inchi.formula_string(
                uniref_dct[name]['inchi'])
            if formula in formula_dct:
                formula_dct[formula] += 1
                formula = formula + '({})'.format(formula_dct[formula])
            else:
                formula_dct[formula] = 0
            vals = []
            for header in new_headers:
                val = uniref_dct[name][header]
                if isinstance(val, str):
                    val = "'{}'".format(val)
                vals.append(str(val))
            basis_rows.append(
                ref_scheme + '_' + formula + ',' + ','.join(vals) + '\n')

    basis_file = os.path.join(path, outname + '_basis')
    with open(basis_file, 'w') as file_obj:
        file_obj.write(''.join(basis_rows))

    # Find only the unique references
    new_dct = {}
    new_names = []
    for ref_scheme in ref_schemes:
        _, uniref_dct = basis.prepare_refs(
            ref_scheme, init_dct, spc_queue,
            repeats=False, parallel=parallel)
        for newn in list(uniref_dct.keys()):
            tempn_smiles = automol.inchi.smiles(uniref_dct[newn]['inchi'])
            tempn = ref_scheme + '_' + tempn_smiles
            if tempn not in new_names:
                new_names.append(tempn)
                new_dct[tempn] = uniref_dct[newn]
                new_dct[tempn]['smiles'] = tempn_smiles

    # References come first, the original species follow
    new_dct.update(init_dct)
    init_dct = new_dct

    # Writer string
    # The counters must live outside the loop: resetting them for every
    # row would mean a repeated formula never receives its '(n)' suffix
    formula_dct = {}
    rows = [header_row]
    for name in init_dct:
        if 'cbh' in name:
            namelabel = name.split('_')[0]
            frmdct = formula_dct.setdefault(namelabel, {})
            formula = automol.inchi.formula_string(init_dct[name]['inchi'])
            if formula in frmdct:
                frmdct[formula] += 1
                formula = formula + '({})'.format(frmdct[formula])
            else:
                frmdct[formula] = 0
            label = namelabel + '_' + formula + ','
        else:
            label = '{},'.format(name)
        vals = []
        for header in new_headers:
            val = init_dct[name][header]
            if isinstance(val, str):
                val = "'{}'".format(val)
            vals.append(str(val))
        rows.append(label + ','.join(vals) + '\n')

    # Write the file
    basis_file = os.path.join(path, outname)
    with open(basis_file, 'w') as file_obj:
        file_obj.write(''.join(rows))
def write_stereo_csv(spc_str, outname='species_stereo.csv',
                     path='.', allstereo=False):
    """ Read the species file in a .csv format and write a new one
        that has stereo information.

        The species names are shuffled and split across worker processes
        that each run `_add_stereo_to_dct`; every worker is expected to put
        exactly one dictionary on the shared queue. The merged result is
        written in the order the species appeared in the input.

        :param spc_str: contents of the species .csv file
        :param outname: name of the file to write
        :param path: directory the file is written into
        :param allstereo: forwarded unchanged to `_add_stereo_to_dct`
    """

    # Read the headers
    headers = [
        header for header in read_csv_headers(spc_str)
        if header != 'name'
    ]
    if 'inchi' not in headers:
        headers.append('inchi')
    headers_noich = [
        header for header in headers
        if header not in ('inchi', 'inchikey')
    ]
    new_headers = ['inchi', 'inchikey'] + headers_noich

    # Read in the initial CSV information (deal with mult stereo)
    init_dct = csv_dct(spc_str, values=headers, key='name')

    names_in_order = list(init_dct.keys())
    randomized_names = list(names_in_order)
    random.shuffle(randomized_names)

    # os.sched_getaffinity only exists on some Unix platforms; fall back
    # to the total core count elsewhere instead of raising AttributeError
    if hasattr(os, 'sched_getaffinity'):
        ncores = len(os.sched_getaffinity(0))
    else:
        ncores = os.cpu_count() or 1
    nproc_avail = max(ncores - 1, 1)

    # Never start more workers than there are species, but always start
    # one so that exactly one result is waiting on the queue
    nproc = max(min(nproc_avail, len(randomized_names)), 1)

    queue = multiprocessing.Queue()
    procs = []
    for proc_n in range(nproc):
        # Strided slices keep the chunk sizes within one of each other
        names = randomized_names[proc_n::nproc]
        proc = multiprocessing.Process(
            target=_add_stereo_to_dct,
            args=(queue, names, init_dct, headers_noich, allstereo,))
        procs.append(proc)
        proc.start()

    # Drain the queue before joining so that no worker blocks while
    # flushing its result
    new_dct = {}
    for _ in procs:
        new_dct.update(queue.get())
    for proc in procs:
        proc.join()

    # Writer string
    # NOTE(review): only names present in the input are looked up, so any
    # extra keys the workers may add to new_dct are not written, and input
    # names missing from new_dct are skipped silently -- confirm intended
    rows = [','.join(['name'] + new_headers) + '\n']
    for name in names_in_order:
        if name not in new_dct:
            continue
        vals = []
        for header in new_headers:
            val = new_dct[name][header]
            if isinstance(val, str):
                val = "'{}'".format(val)
            vals.append(str(val))
        rows.append('{},'.format(name) + ','.join(vals) + '\n')

    # Write the file
    stereo_file = os.path.join(path, outname)
    with open(stereo_file, 'w') as file_obj:
        file_obj.write(''.join(rows))
def write_stereo_csv(spc_str, outname='species_stereo.csv',
                     path='.', allstereo=False):
    """ Read the species file in a .csv format and write a new one
        that has stereo information.

        Each input species is expanded into the InChIs returned by
        `_generate_stereo`. The first keeps the original name; each
        further one is stored under '<name>-<k>' with k starting at 2.

        :param spc_str: contents of the species .csv file
        :param outname: name of the file to write
        :param path: directory the file is written into
        :param allstereo: forwarded unchanged to `_generate_stereo`
    """

    # Column layout of the output: InChI columns first, then the rest
    headers = [
        col for col in read_csv_headers(spc_str) if col != 'name'
    ]
    if 'inchi' not in headers:
        headers.append('inchi')
    headers_noich = [
        col for col in headers if col not in ('inchi', 'inchikey')
    ]
    new_headers = ['inchi', 'inchikey'] + headers_noich

    # Read in the initial CSV information (deal with mult stereo)
    init_dct = csv_dct(spc_str, values=headers, key='name')

    # Expand every species into its stereo-resolved entries
    stereo_dct = {}
    for spc_name in init_dct:
        stereo_ichs = _generate_stereo(
            init_dct[spc_name]['inchi'], allstereo=allstereo)
        for count, stereo_ich in enumerate(stereo_ichs, start=1):
            # Only the second and later entries get a numeric suffix
            if count == 1:
                entry_name = spc_name
            else:
                entry_name = spc_name + '-{}'.format(str(count))

            stereo_dct[entry_name] = {}
            ich_key = automol.inchi.inchi_key(stereo_ich)
            stereo_dct[entry_name].update(
                {'inchi': stereo_ich, 'inchikey': ich_key})

            # Remaining columns are copied over from the parent species
            for col in headers_noich:
                stereo_dct[entry_name][col] = init_dct[spc_name][col]

    # Build the output text row by row
    pieces = [','.join(['name'] + new_headers) + '\n']
    for entry_name in stereo_dct:
        cells = []
        for col in new_headers:
            cell = stereo_dct[entry_name][col]
            if isinstance(cell, str):
                cell = "'{}'".format(cell)
            cells.append(str(cell))
        pieces.append('{},'.format(entry_name) + ','.join(cells) + '\n')
    out_str = ''.join(pieces)

    # Write the file
    stereo_file = os.path.join(path, outname)
    with open(stereo_file, 'w') as file_obj:
        file_obj.write(out_str)
def write_basis_csv(spc_str, outname='species_hof_basis.csv',
                    path='.', parallel=False):
    """ Read the species file in a .csv format and write a new one
        that has hof basis species added to it.

        Two files are written into `path`:
          * `<outname>_basis`: a header row plus one row per reference
            returned by `thermfit.prepare_refs(..., repeats=True)` for
            each scheme, each row prefixed with '<scheme>_';
          * `<outname>`: the species dictionary after the unique
            references (`repeats=False`) have been merged in by
            `_add_unique_references_to_dct`.

        :param spc_str: contents of the species .csv file
        :param outname: name of the main output file
        :param path: directory the files are written into
        :param parallel: forwarded unchanged to `thermfit.prepare_refs`
    """

    headers = [
        header for header in read_csv_headers(spc_str)
        if header != 'name'
    ]
    if 'inchi' not in headers:
        headers.append('inchi')
    headers_noich = [
        header for header in headers
        if header not in ('inchi', 'inchikey')
    ]
    new_headers = ['inchi', 'inchikey'] + headers_noich
    header_row = ','.join(['name'] + new_headers) + '\n'

    # Read in the initial CSV information (deal with mult stereo)
    init_dct = csv_dct(spc_str, values=headers, key='name')

    # Every input species is queued with an empty second entry
    spc_queue = [[name, ''] for name in init_dct]

    # Find all references
    ref_schemes = ['cbh0', 'cbh1', 'cbh2']
    csv_str = header_row
    for ref_scheme in ref_schemes:
        formula_dct = {}
        _, uniref_dct = thermfit.prepare_refs(
            ref_scheme, init_dct, spc_queue,
            repeats=True, parallel=parallel)
        for name in uniref_dct:
            row_str, formula_dct = _species_row_string(
                uniref_dct, formula_dct, name, new_headers)
            csv_str += ref_scheme + '_' + row_str

    # Write the accumulated table, not just the last row that was built
    basis_file = os.path.join(path, outname + '_basis')
    with open(basis_file, 'w') as file_obj:
        file_obj.write(csv_str)

    # Find only the unique references
    new_names = []
    for ref_scheme in ref_schemes:
        _, uniref_dct = thermfit.prepare_refs(
            ref_scheme, init_dct, spc_queue,
            repeats=False, parallel=parallel)
        new_names, init_dct, uniref_dct = _add_unique_references_to_dct(
            new_names, init_dct, uniref_dct, ref_scheme)

    # Writer string; formula counters are kept per scheme label
    rows = [header_row]
    formula_dct = {}
    for name in init_dct:
        if 'cbh' in name:
            namelabel = name.split('_')[0]
            if namelabel not in formula_dct:
                formula_dct[namelabel] = {}
            formula = _get_formula_from_dct(init_dct, name)
            formula, frmdct = _assign_unique_name(
                formula_dct[namelabel], formula)
            # Store the returned counters back so they persist even if
            # the helper hands back a new dict instead of mutating
            formula_dct[namelabel] = frmdct
            label = namelabel + '_' + formula + ','
        else:
            label = '{},'.format(name)
        vals = []
        for header in new_headers:
            val = init_dct[name][header]
            if isinstance(val, str):
                val = "'{}'".format(val)
            vals.append(str(val))
        rows.append(label + ','.join(vals) + '\n')

    # Write the file
    basis_file = os.path.join(path, outname)
    with open(basis_file, 'w') as file_obj:
        file_obj.write(''.join(rows))