def main(argv=None):
    """ Runs the main program.

    :param argv: The command line arguments.
    :return: The return code for the program's termination.
    """
    args, ret = parse_cmdline(argv)
    if ret != 0:
        return ret

    if args.src_file is not None:
        # single-file mode: process just the given colvar file
        proc_data = calc_for_wham(args.src_file)
        write_csv(proc_data, create_out_fname(args.src_file, prefix=OUT_PFX),
                  COLVAR_WHAM_KEY_SEQ)
    else:
        # directory mode: find all matching files under the base dir
        found_files = find_files_by_dir(args.base_dir, args.pattern)
        logger.debug("Found '%d' dirs with files to process", len(found_files))
        # .items() replaces the Python-2-only .iteritems() so this runs on Python 3
        for f_dir, files in found_files.items():
            if not files:
                # logger.warning replaces the deprecated logger.warn alias
                logger.warning("No files found for dir '%s'", f_dir)
                continue
            for colvar_path in [os.path.join(f_dir, tgt) for tgt in files]:
                proc_data = calc_for_wham(colvar_path)
                f_name = create_out_fname(colvar_path, prefix=OUT_PFX)
                if allow_write(f_name, overwrite=args.overwrite):
                    # currently only the 'r' column is written; full CSV output
                    # is intentionally disabled (see commented call below)
                    list_to_file([str(d['r']) for d in proc_data if 'r' in d],
                                 f_name)
                    # write_csv(proc_data, f_name, COLVAR_WHAM_KEY_SEQ, extrasaction="ignore")
    return 0  # success
def main(argv=None):
    """ Runs the main program.

    :param argv: The command line arguments.
    :return: The return code for the program's termination.
    """
    args, ret = parse_cmdline(argv)
    if ret != GOOD_RET or args is None:
        return ret

    kbt = calc_kbt(args.temp)

    if args.src_file is not None:
        # single-file mode
        proc_data = to_zero_point(calc_rad(args.src_file, kbt))
        write_csv(proc_data, create_out_fname(args.src_file, prefix=OUT_PFX),
                  RAD_KEY_SEQ)
    else:
        # directory mode: process every matching file found under the base dir
        found_files = find_files_by_dir(args.base_dir, args.pattern)
        # lazy %-style args avoid formatting the message unless DEBUG is enabled
        logger.debug("Found '%d' dirs with files to process", len(found_files))
        for f_dir, files in found_files.items():
            if not files:
                # logger.warning replaces the deprecated logger.warn alias
                logger.warning("No files found for dir '%s'", f_dir)
                continue
            for pmf_path in [os.path.join(f_dir, tgt) for tgt in files]:
                proc_data = to_zero_point(calc_rad(pmf_path, kbt))
                f_name = create_out_fname(pmf_path, prefix=OUT_PFX)
                if allow_write(f_name, overwrite=args.overwrite):
                    write_csv(proc_data, f_name, RAD_KEY_SEQ)
    return GOOD_RET  # success
def main(argv=None):
    """ Runs the main program.

    :param argv: The command line arguments.
    :return: The return code for the program's termination.
    """
    args, ret = parse_cmdline(argv)
    if ret != GOOD_RET or args is None:
        return ret

    kbt = calc_kbt(args.temp)

    if args.src_file is not None:
        # single-file mode
        proc_data = to_zero_point(calc_rad(args.src_file, kbt))
        write_csv(proc_data, create_out_fname(args.src_file, prefix=OUT_PFX),
                  RAD_KEY_SEQ)
    else:
        # directory mode: process every matching file found under the base dir
        found_files = find_files_by_dir(args.base_dir, args.pattern)
        # lazy %-style args avoid formatting the message unless DEBUG is enabled
        logger.debug("Found '%d' dirs with files to process", len(found_files))
        # .items() replaces the Python-2-only .iteritems() so this runs on Python 3
        for f_dir, files in found_files.items():
            if not files:
                # logger.warning replaces the deprecated logger.warn alias
                logger.warning("No files found for dir '%s'", f_dir)
                continue
            for pmf_path in [os.path.join(f_dir, tgt) for tgt in files]:
                proc_data = to_zero_point(calc_rad(pmf_path, kbt))
                f_name = create_out_fname(pmf_path, prefix=OUT_PFX)
                if allow_write(f_name, overwrite=args.overwrite):
                    write_csv(proc_data, f_name, RAD_KEY_SEQ)
    return GOOD_RET  # success
def print_per_frame(dump_file, cfg, data_to_print, out_fieldnames, write_mode):
    """Write the per-frame summary rows for a dump file to '<dump_file>_sum.csv'
    in the configured output directory, using the given write mode.
    """
    out_path = create_out_fname(dump_file, suffix='_sum', ext='.csv',
                                base_dir=cfg[OUT_BASE_DIR])
    write_csv(data_to_print, out_path, out_fieldnames,
              extrasaction="ignore", mode=write_mode,
              round_digits=ROUND_DIGITS, print_message=cfg[PRINT_PROGRESS])
def write_result(result, src_file, overwrite=False, basedir=None):
    """Writes the result to a file named for the given source file.

    :param result: The result to write.
    :param src_file: The original source file name.
    :param overwrite: Whether to overwrite an existing file name.
    :param basedir: The base directory to target (uses the source file's base
        directory if not specified)
    """
    out_name = create_out_fname(src_file, prefix=OUT_PFX, base_dir=basedir)
    # skip writing entirely unless the target is clear (or overwriting is allowed)
    if not allow_write(out_name, overwrite=overwrite):
        return
    write_csv(result, out_name, OUT_KEY_SEQ)
def main(argv=None):
    """ Runs the main program.

    :param argv: The command line arguments.
    :return: The return code for the program's termination.
    """
    args, ret = parse_cmdline(argv)
    if ret != GOOD_RET:
        return ret
    # read all values as floats, drop duplicate rows on the chosen column,
    # and write the result with the original header order preserved
    rows = read_csv(args.file, all_conv=float)
    deduped_rows = compress_dups(rows, args.column)
    out_name = create_out_fname(args.file, prefix=PREFIX)
    write_csv(deduped_rows, out_name, read_csv_header(args.file))
    return GOOD_RET  # success
def testWriteCsv(self):
    """Round-trip check: rows written by write_csv should read back equal."""
    tmp_dir = None
    data = csv_data()
    try:
        tmp_dir = tempfile.mkdtemp()
        tgt_fname = create_out_fname(SHORT_WHAM_PATH, prefix=OUT_PFX,
                                     base_dir=tmp_dir)
        write_csv(data, tgt_fname, RAD_KEY_SEQ)
        csv_result = read_csv(tgt_fname,
                              data_conv={FREE_KEY: str_to_bool,
                                         CORR_KEY: float,
                                         COORD_KEY: str, })
        self.assertEqual(len(data), len(csv_result))
        for i, csv_row in enumerate(csv_result):
            self.assertDictEqual(data[i], csv_row)
    finally:
        # only clean up if mkdtemp() succeeded; shutil.rmtree(None) would raise
        # TypeError and mask the original failure
        if tmp_dir is not None:
            shutil.rmtree(tmp_dir)
def process_log_files(source_name, log_file_list):
    """ Loops through all files and prints output

    @param source_name: the source name to use as the base for creating an
        outfile name
    @param log_file_list: list of file names to read and process
    """
    result_list = []
    out_fname = create_out_fname(source_name, suffix='_sum', ext=".csv")
    for log_file in log_file_list:
        result_list += process_log(log_file)
    # idiomatic truthiness test instead of len(...) == 0
    if not result_list:
        warning("Found no lammps log data to process from: {}".format(source_name))
    else:
        write_csv(result_list, out_fname, LOG_FIELDNAMES, extrasaction="ignore")
def find_rel_e(extracted_data, cfg, ref_e_dict):
    """ calculate relative energy, if data found

    @param extracted_data: dictionary of data found from chk file
    @param cfg: configuration for run
    @param ref_e_dict: reference energies, if available
    @return: None; writes a summary csv and prints the total residual if
        reference energies were supplied
    """
    tot_resid = 0
    num_resid = 0
    for data_dict in extracted_data:
        this_group = data_dict[REL_E_GROUP]
        # Bug fix: previously rel_ene_ref was only assigned when this_group was
        # truthy but the guard only checked "is None", so a falsy-but-not-None
        # group raised NameError (or silently reused a stale value from a prior
        # iteration). Now any falsy group maps to nan.
        rel_ene_ref = cfg[REL_E_SEC][this_group][REL_E_REF] if this_group else np.nan
        if np.isnan(rel_ene_ref):
            data_dict[REL_E] = np.nan
        else:
            rel_e = data_dict[ENV_ENE] - rel_ene_ref
            data_dict[REL_E] = rel_e
            file_name = data_dict[FILE_NAME]
            if file_name in ref_e_dict:
                # residual is the absolute difference from the reference energy
                ref_e = ref_e_dict[file_name]
                resid = np.round(np.sqrt((ref_e - rel_e) ** 2), 6)
                data_dict[REF_E] = ref_e
                data_dict[E_RESID] = resid
                tot_resid += resid
                num_resid += 1
    f_out = create_out_fname(cfg[CHK_FILE_LIST], suffix='_sum', ext='.csv',
                             base_dir=cfg[OUT_BASE_DIR])
    write_csv(extracted_data, f_out, ENE_FIELD_NAMES, extrasaction="ignore")
    if len(ref_e_dict) > 1:
        print("Calculated total energy residual from {} files: {}".format(
            num_resid, tot_resid))
def process_file(base_file, data_file):
    """Compute the per-row RMSD between each row of data_file and the first
    row of base_file, then write the rows (with index and RMSD columns
    prepended) to a new 'RMSD_'-prefixed csv.

    @param base_file: csv whose first row is the reference values
    @param data_file: csv whose every row is compared against the reference
    @raise InvalidDataError: if a base-file column is missing from data_file
    """
    # TODO: add in reading vectors
    base_dict = read_csv(base_file, quote_style=csv.QUOTE_NONNUMERIC)[0]
    data_dict_list = read_csv(data_file, quote_style=csv.QUOTE_NONNUMERIC)

    data_headers = [INDEX, RMSD] + read_csv_header(data_file)

    # len(dict) == number of entries; no need to materialize .values()
    num_vals = len(base_dict)
    for data_id, data_dict in enumerate(data_dict_list):
        rmsd = 0.0
        for key, val in base_dict.items():
            try:
                rmsd += (data_dict[key] - val) ** 2
            except KeyError:
                raise InvalidDataError("Could not find key '{}' from base "
                                       "file in compared data file.".format(key))
        data_dict[INDEX] = data_id
        data_dict[RMSD] = round((rmsd / num_vals) ** 0.5, 2)

    out_name = create_out_fname(data_file, prefix=RMSD + '_')
    write_csv(data_dict_list, out_name, data_headers)
def make_summary(cfg):
    """ If the option is specified, add the last best fit output file to the list of outputs and evaluate changes

    @param cfg: configuration for the run
    @return: None; writes/updates the summary file, a csv copy, and a percent-difference csv
    """
    best_file = cfg[MAIN_SEC][BEST_FILE]
    summary_file = cfg[MAIN_SEC][SUMMARY_FILE]
    low, high, headers = get_param_info(cfg)
    latest_output = np.loadtxt(best_file, dtype=np.float64)
    if os.path.isfile(summary_file):
        # summary exists: stack the new result under the previous rows and
        # compute row-to-row percent differences
        last_row = None
        percent_diffs = []
        previous_output = np.loadtxt(summary_file, dtype=np.float64)
        all_output = np.vstack((previous_output, latest_output))
        for row in all_output:
            if last_row is not None:
                diff = row - last_row
                percent_diff = {}
                # Check data for small values, hitting upper or lower bound, and calc % diff
                for index, val in enumerate(np.nditer(row)):
                    if abs(val) < TOL:
                        warning("Small value ({}) encountered for parameter {} (col {})"
                                "".format(val, headers[index], index))
                    if abs(diff[index]) > TOL:
                        if abs(last_row[index]) > TOL:
                            percent_diff[headers[index]] = round(diff[index] / last_row[index] * 100, 2)
                        else:
                            # NOTE(review): this inner condition is always true here
                            # (the outer branch already established it) — presumably
                            # meant to catch division by ~0; verify intent
                            if abs(diff[index]) > TOL:
                                percent_diff[headers[index]] = np.inf
                        if abs(val - low[index]) < TOL:
                            warning("Value ({}) near lower bound ({}) encountered for parameter {} (col {})."
                                    "".format(val, low[index], headers[index], index))
                        if abs(val - high[index]) < TOL:
                            warning("Value ({}) near upper bound ({}) encountered for parameter {} (col {})."
                                    "".format(val, high[index], headers[index], index))
                    else:
                        # change below tolerance: record "no meaningful change"
                        percent_diff[headers[index]] = np.nan
                percent_diffs.append(percent_diff)
            last_row = row
        if len(percent_diffs) > 0:
            # report the parameter with the largest (absolute) change on the newest row
            max_percent_diff = 0
            max_diff_param = None
            for param, val in percent_diffs[-1].items():
                if abs(val) > abs(max_percent_diff):
                    max_percent_diff = val
                    max_diff_param = param
            print("Maximum (absolute value) percent difference from last read line is {} % for parameter '{}'."
                  "".format(max_percent_diff, max_diff_param))
            if cfg[MAIN_SEC][RESID_IN_BEST]:
                print("Percent change in residual: {} %"
                      "".format(percent_diffs[-1][RESIDUAL + cfg[MAIN_SEC][SUM_HEAD_SUFFIX]]))
        # format for gnuplot and np.loadtxt
        f_out = create_out_fname(summary_file, suffix='_perc_diff', ext='.csv',
                                 base_dir=cfg[MAIN_SEC][OUT_BASE_DIR])
        write_csv(percent_diffs, f_out, headers, extrasaction="ignore")
        # csv copy of the full summary (header row + comma-delimited values)
        f_out = create_out_fname(summary_file, ext='.csv', base_dir=cfg[MAIN_SEC][OUT_BASE_DIR])
        with open(f_out, 'w') as s_file:
            s_file.write(','.join(headers) + '\n')
            np.savetxt(s_file, all_output, fmt='%8.6f', delimiter=',')
        print('Wrote file: {}'.format(f_out))
        # in addition to csv (above), print format for gnuplot and np.loadtxt
        with open(summary_file, 'w') as s_file:
            np.savetxt(s_file, all_output, fmt='%12.6f')
        print("Wrote file: {}".format(summary_file))
    else:
        # have this as sep statement, because now printing a 1D array, handled differently than 2D array (newline=' ')
        with open(summary_file, 'w') as s_file:
            np.savetxt(s_file, latest_output, fmt='%12.6f', newline=' ')
        print("Wrote results from {} to new summary file {}".format(best_file, summary_file))
def make_summary(output_file, summary_file, cfg):
    """Append the latest best-fit output (plus its residual) to the summary file,
    warning about small / near-bound values and reporting percent differences.

    @param output_file: file holding the latest best-fit parameter values
    @param summary_file: running summary file (created if absent)
    @param cfg: configuration for the run
    """
    low, high, headers = get_param_info(cfg)
    latest_output = np.loadtxt(output_file, dtype=np.float64)
    # append last best resid
    low = np.append(low, np.nan)
    high = np.append(high, np.nan)
    headers.append('resid')
    base_dir = os.path.dirname(output_file)
    latest_output = np.append(latest_output, get_resid(base_dir))
    if os.path.isfile(summary_file):
        # summary exists: stack the new result under the previous rows and
        # compute row-to-row percent differences (formatted as strings here)
        last_row = None
        percent_diffs = []
        previous_output = np.loadtxt(summary_file, dtype=np.float64)
        all_output = np.vstack((previous_output, latest_output))
        for row in all_output:
            if last_row is not None:
                diff = row - last_row
                percent_diff = {}
                # Check data for small values, hitting upper or lower bound, and calc % diff
                for index, val in enumerate(np.nditer(row)):
                    if abs(val) < TOL:
                        warning("Small value ({}) encountered for parameter {} (col {})"
                                "".format(val, headers[index], index))
                    if abs(diff[index]) > TOL:
                        if abs(last_row[index]) > TOL:
                            percent_diff[headers[index]] = "%8.2f" % (diff[index] / last_row[index] * 100)
                        else:
                            # previous value ~0: percent change undefined, leave blank
                            percent_diff[headers[index]] = ' '
                        if abs(val-low[index]) < TOL:
                            warning("Value ({}) near lower bound ({}) encountered for parameter {} (col {})."
                                    "".format(val, low[index], headers[index], index))
                        if abs(val-high[index]) < TOL:
                            warning("Value ({}) near upper bound ({}) encountered for parameter {} (col {})."
                                    "".format(val, high[index], headers[index], index))
                    else:
                        # change below tolerance: leave blank
                        percent_diff[headers[index]] = ' '
                percent_diffs.append(percent_diff)
            last_row = row
        # format for gnuplot and np.loadtxt
        f_out = create_out_fname(summary_file, suffix='_perc_diff', ext='.csv',
                                 base_dir=cfg[MAIN_SEC][OUT_BASE_DIR])
        write_csv(percent_diffs, f_out, headers, extrasaction="ignore")
        print('Wrote file: {}'.format(f_out))
        # csv copy of the full summary (header row + comma-delimited values)
        f_out = create_out_fname(summary_file, ext='.csv', base_dir=cfg[MAIN_SEC][OUT_BASE_DIR])
        with open(f_out, 'w') as s_file:
            s_file.write(','.join(headers)+'\n')
            np.savetxt(s_file, all_output, fmt='%8.6f', delimiter=',')
        print('Wrote file: {}'.format(f_out))
        # in addition to csv (above), print format for gnuplot and np.loadtxt
        with open(summary_file, 'w') as s_file:
            np.savetxt(s_file, all_output, fmt='%12.6f')
        print(summary_file)
        print("Wrote summary file {}".format(summary_file))
    else:
        # have this as sep statement, because now printing a 1D array, handled differently than 2D array (newline=' ')
        with open(summary_file, 'w') as s_file:
            np.savetxt(s_file, latest_output, fmt='%12.6f', newline=' ')
        print("Wrote results from {} to new summary file {}".format(output_file, summary_file))
def process_evb_files(cfg, selected_fieldnames):
    """ Want to grab the timestep and highest prot ci^2, highest wat ci^2, and print them

    @param selected_fieldnames: list of field names for output based on user-selected options
    @param cfg: configuration data read from ini file
    @return: list of all per-timestep data dicts gathered across the processed files
    @raise InvalidDataError: if no evb file names could be gathered
    """
    first_file_flag = True
    all_data = []
    # seed the work list with the single explicitly-named file, if any
    if cfg[EVB_FILE] is not None:
        evb_file_list = [cfg[EVB_FILE]]
    else:
        evb_file_list = []
    # Separate try-catch block here because want it to continue rather than exit;
    # exit below if there are no files to process
    try:
        evb_file_list += file_rows_to_list(cfg[EVB_LIST_FILE])
    except IOError as e:
        # a missing *default* list file is fine; a missing user-specified one is not
        if cfg[EVB_LIST_FILE] != DEF_EVB_LIST_FILE:
            raise IOError(e)
    if len(evb_file_list) == 0:
        raise InvalidDataError("Found no evb file names to read. Specify one file with the keyword '{}' or \n"
                               "a file containing a list of evb files with the keyword '{}'.".format(EVB_FILE,
                                                                                                     EVB_LIST_FILE))
    for evb_file in evb_file_list:
        data_to_print, subset_to_print, wat_mol_data_to_print = process_evb_file(evb_file, cfg)
        all_data += data_to_print
        if cfg[PRINT_PER_FILE] is True:
            # one output csv per input evb file
            if len(data_to_print) > 0:
                f_out = create_out_fname(evb_file, suffix='_evb_info', ext='.csv',
                                         base_dir=cfg[OUT_BASE_DIR])
                write_csv(data_to_print, f_out, selected_fieldnames, extrasaction="ignore",
                          print_message=cfg[PRINT_PROGRESS], round_digits=ROUND_DIGITS)
            if cfg[PRINT_CI_SUBSET]:
                if len(subset_to_print) > 0:
                    f_out = create_out_fname(evb_file, suffix='_ci_sq_ts', ext='.csv',
                                             base_dir=cfg[OUT_BASE_DIR])
                    write_csv(subset_to_print, f_out, CI_FIELDNAMES, extrasaction="ignore",
                              print_message=cfg[PRINT_PROGRESS], round_digits=ROUND_DIGITS)
                else:
                    warning("'{}' set to true, but found no data from: {} \n"
                            "No output will be printed for this file."
                            "".format(PRINT_CI_SUBSET, evb_file))
        if cfg[PRINT_PER_LIST]:
            # one combined output csv for the whole list: write on the first
            # file, append for the rest
            if first_file_flag:
                print_mode = 'w'
                first_file_flag = False
            else:
                print_mode = 'a'
            if cfg[PRINT_CI_SUBSET]:
                if len(subset_to_print) > 0:
                    f_out = create_out_fname(cfg[EVB_LIST_FILE], suffix='_ci_sq_ts', ext='.csv',
                                             base_dir=cfg[OUT_BASE_DIR])
                    write_csv(subset_to_print, f_out, [FILE_NAME] + CI_FIELDNAMES, extrasaction="ignore",
                              mode=print_mode, print_message=cfg[PRINT_PROGRESS], round_digits=ROUND_DIGITS)
                else:
                    warning("'{}' set to true, but found no data meeting criteria."
                            "".format(PRINT_CI_SUBSET))
            f_out = create_out_fname(cfg[EVB_LIST_FILE], suffix='_evb_info', ext='.csv',
                                     base_dir=cfg[OUT_BASE_DIR])
            write_csv(data_to_print, f_out, [FILE_NAME] + selected_fieldnames, extrasaction="ignore",
                      mode=print_mode, print_message=cfg[PRINT_PROGRESS], round_digits=ROUND_DIGITS)
    return all_data
def obj_fun(x0_trial, cfg, tpl_dict, tpl_str, fitting_sum, result_dict, result_headers, x0_full=None):
    """ Objective function to be minimized. Also used to save trial input and output.

    @param x0_trial: initial parameter values to minimize
    @param x0_full: all parameter values to minimize (may be larger than x0 trail
    @param cfg: configuration for the run
    @param tpl_dict: dictionary of values for filling in template strings
    @param tpl_str: template string (read from file)
    @param fitting_sum: list of dicts for saving all trial values (to be appended, if needed)
    @param result_dict: a dictionary of results already found, to keep the program from unnecessarily running
        the expensive function when we already have solved for that parameter set
    @param result_headers: list of headers for printing results
    @return: the result for the set of values being tested, obtained from the bash script specified in cfg
    """
    # when doing stepwise minimization, x0_trial is a prefix of x0_full; keep
    # x0_full in sync with the values being tried
    if x0_full is None:
        x0_full = x0_trial
    else:
        x0_full[:len(x0_trial)] = x0_trial
    resid_dict = {}
    penalty = 0
    for param_num, param_name in enumerate(cfg[OPT_PARAMS]):
        # Needed to add break for triangle/stepwise minimization
        if param_num >= len(x0_trial):
            break
        tpl_dict[param_name] = round(x0_trial[param_num], cfg[NUM_PARAM_DECIMALS])
        resid_dict[param_name] = tpl_dict[param_name]
        # quadratic penalty for straying below the left-side bound
        if param_name in cfg[LEFT_SIDE_POTENTIAL]:
            min_val = cfg[LEFT_SIDE_POTENTIAL][param_name][0]
            stiffness = cfg[LEFT_SIDE_POTENTIAL][param_name][1]
            if x0_trial[param_num] < min_val:
                penalty += stiffness * np.square(x0_trial[param_num] - min_val)
        # quadratic penalty for straying above the right-side bound
        if param_name in cfg[RIGHT_SIDE_PENALTY]:
            max_val = cfg[RIGHT_SIDE_PENALTY][param_name][0]
            stiffness = cfg[RIGHT_SIDE_PENALTY][param_name][1]
            if x0_trial[param_num] > max_val:
                penalty += stiffness * np.square(x0_trial[param_num] - max_val)
    eval_eqs(cfg, tpl_dict)
    fill_save_tpl(cfg, tpl_str, tpl_dict, cfg[PAR_TPL], cfg[PAR_FILE_NAME], print_info=cfg[PRINT_INFO])
    # Note: found that the minimizer calls the function with the same inputs multiple times!
    # only call this expensive function if we don't already have that answer, determined by checking for it in
    # the result dictionary
    # to make the input hashable for a dictionary
    x0_str = str(x0_full)
    if x0_str in result_dict:
        trial_result = result_dict[x0_str]
    else:
        # expensive step: run the external driver script and parse its output
        trial_result = float(check_output([cfg[BASH_DRIVER], tpl_dict[NEW_FNAME]]).strip())
        trial_result += penalty
        result_dict[x0_str] = trial_result
    tpl_dict[RESID] = round(trial_result, cfg[NUM_PARAM_DECIMALS])
    if cfg[PAR_COPY_NAME] is not None or cfg[RESULT_COPY] is not None:
        copy_par_result_file(cfg, tpl_dict, print_info=cfg[PRINT_INFO])
    if cfg[FITTING_SUM_FNAME] is not None:
        write_csv(fitting_sum, cfg[FITTING_SUM_FNAME], result_headers,
                  print_message=cfg[PRINT_INFO], round_digits=cfg[NUM_PARAM_DECIMALS])
    if cfg[BEST_PARAMS_FNAME] is not None:
        # keep track of the best (lowest) residual seen so far and save its params
        if trial_result < cfg[LOWEST_RESID]:
            cfg[LOWEST_RESID] = trial_result
            with open(cfg[BEST_PARAMS_FNAME], 'w') as w_file:
                for param_num, param_name in enumerate(cfg[OPT_PARAMS]):
                    w_file.write("{:} = {:f},{:f}\n".format(param_name, x0_full[param_num],
                                                            cfg[INITIAL_DIR][param_name]))
    if cfg[PRINT_INFO]:
        print("Resid: {:11f} for parameters: {}".format(trial_result,
                                                        ",".join(["{:11f}".format(x) for x in x0_trial])))
    if cfg[FITTING_SUM_FNAME] is not None:
        # record this trial for the fitting summary written on the next call
        resid_dict[RESID] = trial_result
        fitting_sum.append(resid_dict)
    return trial_result
def process_files(comp_f_list, col_name, base_out_name, delimiter, sep_out_flag, out_location):
    """ Want to grab the timestep, first and 2nd mole found, first and 2nd ci^2
    print the timestep, residue ci^2

    @param comp_f_list: a list of lists of file names to process (file read during input processing)
    @param col_name: name of column to use for alignment
    @param base_out_name: name of file to be created, or suffix if multiple files to be created
    @param delimiter: string, delimiter separating file names on lines of the comp_f_list
    @param sep_out_flag: a boolean to note if separate output files should be made based on each row of input
    @param out_location: user-specified location for the output files, if specified
    @return: None; writes one or more combined csv files
    @raise InvalidDataError: if a row of files shares no values in the alignment column
    """
    all_dicts = defaultdict(dict)
    # if need multiple output files, designate them by adding a prefix
    prefix = ''
    # if there will be multiple output files, make sure do not reuse a prefix, so keep copy of used names
    prefix_used = []
    # if one output file from multiple sets of file to combine, will change write_mode to append later
    write_mode = 'w'
    # we don't have to specify run names in the output if there one row set of files to combine,
    # or if there will be separate output files
    if len(comp_f_list) < 2 or sep_out_flag:
        add_run_name = False
        headers = []
    else:
        add_run_name = True
        headers = [RUN_NAME]
    for line_num, line in enumerate(comp_f_list):
        dict_keys = None
        if sep_out_flag:
            # each input row gets its own file, so reset the accumulated state
            headers = []
            all_dicts = defaultdict(dict)
        # separate on delimiter, strip any white space, and also get rid of empty entries
        comp_files = filter(None, [c_file.strip() for c_file in line.split(delimiter)])
        # get the common part of the name, if it exists; otherwise, give the name the line index
        for file_index, file_name in enumerate(comp_files):
            base_name = os.path.splitext(os.path.basename(file_name))[0]
            if file_index == 0:
                run_name = base_name
            else:
                run_name = longest_common_substring(run_name, base_name)
        if run_name == '':
            # because will use run_name as a string, need to convert it
            run_name = str(line_num) + "_"
        for c_file in comp_files:
            new_dict = read_csv_to_dict(c_file, col_name)
            # keep only alignment-column values present in every file of this row
            if dict_keys is None:
                dict_keys = new_dict.keys()
            else:
                dict_keys = set(dict_keys).intersection(new_dict.keys())
            new_dict_keys = six.next(six.itervalues(new_dict)).keys()
            # Get the keys for the inner dictionary; diff methods for python 2 and 3 so use six
            # expect to only get new headers when making a new file (write_mode == 'w')
            # for the next file, will not gather more headers. When printed, extra cols will be skipped, and
            # missing columns will have no data shown
            if write_mode == 'w':
                for key in new_dict_keys:
                    if key in headers:
                        # okay if already have header if the header is the column.
                        # If we are going to append, we also expect to already have the header name
                        if key != col_name:
                            warning("Non-unique column name {} found in {}. "
                                    "Values will be overwritten.".format(key, c_file))
                    else:
                        headers.append(key)
            # merge this file's rows into the accumulated per-key dicts
            for new_key in new_dict.items():
                all_dicts[new_key[0]].update(new_key[1])
        final_dict = []
        for key in sorted(dict_keys):
            final_dict.append(all_dicts[key])
            # final_dict.append(all_dicts[key].update({RUN_NAME: run_name}))
        if add_run_name:
            for each_dict in final_dict:
                each_dict.update({RUN_NAME: run_name})
        # Possible to have no overlap in align column
        if len(final_dict) > 0:
            # make sure col_name appears first by taking it out before sorting
            if sep_out_flag:
                prefix = run_name
                if prefix == '' or prefix in prefix_used:
                    prefix = str(line_num) + "_"
            # have a consistent output by sorting the headers, but keep the aligning column first
            # only needs to be done for printing the first time
            if write_mode == 'w':
                headers.remove(col_name)
                headers = [col_name] + sorted(headers)
                if add_run_name:
                    headers.remove(RUN_NAME)
                    headers = [RUN_NAME] + headers
            f_name = create_out_fname(base_out_name, prefix=prefix, base_dir=out_location)
            prefix_used.append(prefix)
            write_csv(final_dict, f_name, headers, mode=write_mode)
            # after the first write to a shared output file, switch to appending
            if not sep_out_flag and write_mode == 'w':
                write_mode = 'a'
        else:
            raise InvalidDataError("No common values found for column {} among files: {}"
                                   "".format(col_name, ", ".join(comp_files)))
def find_rel_e(extracted_data, cfg, out_field_names, ref_energy_dict):
    """ calculate relative energies from the gathered data

    @param extracted_data: gathered data (based on flags)
    @param cfg: configuration for file
    @param out_field_names: field names chosen based on user-defined options
    @param ref_energy_dict: a dictionary of time names and the reference energy for calculating an energy RMSD
    @return: prints out a new outfile unless an error is raised
    """
    # prepend the relative-energy columns; skip the first original field
    # (duplicated by FILE_NAME in the new list)
    out_field_names = [FILE_NAME, TIMESTEP, REL_E_GROUP, RESID_E, REF_E, REL_ENE, REL_PROT_E,
                       REL_HYD_E, REL_NEXT_HYD_E, ] + out_field_names[1:]
    tot_resid = 0
    num_resid = 0
    for data_dict in extracted_data:
        this_group = data_dict[REL_E_GROUP]
        if this_group:
            rel_ene_ref = cfg[REL_E_SEC][this_group][REL_E_REF]
            ref_diab_e = cfg[REL_E_SEC][this_group][MIN_DIAB_ENE]
        # NOTE(review): if this_group is falsy but not None, rel_ene_ref is
        # unbound (first iteration) or stale (later iterations) — confirm
        # whether a falsy-non-None group can occur here
        if this_group is None or np.isnan(rel_ene_ref):
            for key in [RESID_E, REF_E, REL_ENE, REL_PROT_E, REL_HYD_E, REL_NEXT_HYD_E]:
                data_dict[key] = np.nan
        else:
            rel_e = data_dict[ENE_TOTAL] - rel_ene_ref
            data_dict[REL_ENE] = rel_e
            data_dict[REL_PROT_E] = data_dict[MAX_PROT_E] - ref_diab_e
            data_dict[REL_HYD_E] = data_dict[MAX_HYD_E] - ref_diab_e
            data_dict[REL_NEXT_HYD_E] = data_dict[NEXT_MAX_HYD_E] - ref_diab_e
            file_name = data_dict[FILE_NAME]
            if file_name in ref_energy_dict:
                # residual is the absolute difference from the reference energy
                ref_e = ref_energy_dict[file_name]
                resid = np.sqrt((ref_e - rel_e)**2)
                data_dict[REF_E] = ref_e
                data_dict[RESID_E] = resid
                tot_resid += resid
                num_resid += 1
            else:
                data_dict[REF_E] = np.nan
                data_dict[RESID_E] = np.nan
    f_out = create_out_fname(cfg[EVB_LIST_FILE], suffix='_evb_info', ext='.csv',
                             base_dir=cfg[OUT_BASE_DIR])
    write_csv(extracted_data, f_out, out_field_names, extrasaction="ignore",
              print_message=cfg[PRINT_PROGRESS], round_digits=ROUND_DIGITS)
    if len(ref_energy_dict) > 1:
        print("Calculated total energy residual from {} files: {}".format(num_resid, round(tot_resid, 6)))
def process_evb_files(cfg):
    """ Want to grab the timestep and highest prot ci^2, highest wat ci^2, and print them

    @param cfg: configuration data read from ini file
    @return: None; writes the requested csv outputs
    @raise InvalidDataError: if no evb file names could be gathered
    """
    first_file_flag = True
    evb_file_list = []
    if cfg[EVB_FILE] is not None:
        evb_file_list.append(cfg[EVB_FILE])
    # Separate try-catch block here because want it to continue rather than exit; exit below if there are no files to
    # process
    try:
        with open(cfg[EVB_FILES]) as f:
            for evb_file in f:
                evb_file_list.append(evb_file.strip())
    except IOError as e:
        warning("Problems reading file:", e)
    if len(evb_file_list) == 0:
        raise InvalidDataError("Found no evb file names to read. Specify one file with the keyword '{}' or \n"
                               "a file containing a list of evb files with the keyword '{}'.".format(EVB_FILE,
                                                                                                     EVB_FILES))
    for evb_file in evb_file_list:
        data_to_print, subset_to_print, wat_mol_data_to_print = process_evb_file(evb_file, cfg)
        # collect the names of requested outputs that turned out to be empty,
        # so one combined warning can be printed per file
        no_print = []
        if cfg[PRINT_PER_FILE] is True:
            if cfg[PRINT_KEY_PROPS]:
                if len(data_to_print) > 0:
                    f_out = create_out_fname(evb_file, suffix='_evb_info', ext='.csv',
                                             base_dir=cfg[OUT_BASE_DIR])
                    write_csv(data_to_print, f_out, KEY_PROPS_FIELDNAMES, extrasaction="ignore")
                else:
                    no_print.append(PRINT_KEY_PROPS)
            if cfg[PRINT_CI_SUBSET]:
                if len(subset_to_print) > 0:
                    f_out = create_out_fname(evb_file, suffix='_ci_sq_ts', ext='.csv',
                                             base_dir=cfg[OUT_BASE_DIR])
                    write_csv(subset_to_print, f_out, CI_FIELDNAMES, extrasaction="ignore")
                else:
                    no_print.append(PRINT_CI_SUBSET)
            if cfg[PRINT_CI_SQ]:
                if len(data_to_print) > 0:
                    f_out = create_out_fname(evb_file, suffix='_ci_sq', ext='.csv',
                                             base_dir=cfg[OUT_BASE_DIR])
                    write_csv(data_to_print, f_out, CI_FIELDNAMES, extrasaction="ignore")
                else:
                    no_print.append(PRINT_CI_SQ)
            if cfg[PRINT_CEC]:
                if len(data_to_print) > 0:
                    f_out = create_out_fname(evb_file, suffix='_cec', ext='.csv',
                                             base_dir=cfg[OUT_BASE_DIR])
                    write_csv(data_to_print, f_out, CEC_COORD_FIELDNAMES, extrasaction="ignore")
                else:
                    no_print.append(PRINT_CEC)
            if cfg[PRINT_WAT_MOL]:
                if len(wat_mol_data_to_print) > 0:
                    f_out = create_out_fname(evb_file, suffix='_wat_mols', ext='.csv',
                                             base_dir=cfg[OUT_BASE_DIR])
                    write_csv(wat_mol_data_to_print, f_out, PROT_WAT_FIELDNAMES, extrasaction="ignore")
                else:
                    no_print.append(PRINT_WAT_MOL)
            if len(no_print) > 0:
                warning("{} set to true, but found no data from: {} \n"
                        "No output will be printed for this file.".format(",".join(map(single_quote, no_print)),
                                                                          evb_file))
        if cfg[PRINT_PER_LIST]:
            # one combined output csv per requested flag for the whole list:
            # write on the first file, append for the rest
            if first_file_flag:
                print_mode = 'w'
                first_file_flag = False
            else:
                print_mode = 'a'
            if cfg[PRINT_CI_SQ]:
                f_out = create_out_fname(cfg[EVB_FILES], suffix='_ci_sq', ext='.csv',
                                         base_dir=cfg[OUT_BASE_DIR])
                write_csv(data_to_print, f_out, CI_FIELDNAMES, extrasaction="ignore", mode=print_mode)
            if cfg[PRINT_CI_SUBSET]:
                f_out = create_out_fname(cfg[EVB_FILES], suffix='_ci_sq_ts', ext='.csv',
                                         base_dir=cfg[OUT_BASE_DIR])
                write_csv(subset_to_print, f_out, CI_FIELDNAMES, extrasaction="ignore", mode=print_mode)
            if cfg[PRINT_WAT_MOL]:
                f_out = create_out_fname(cfg[EVB_FILES], suffix='_wat_mols', ext='.csv',
                                         base_dir=cfg[OUT_BASE_DIR])
                write_csv(wat_mol_data_to_print, f_out, PROT_WAT_FIELDNAMES, extrasaction="ignore", mode=print_mode)
            if cfg[PRINT_CEC]:
                f_out = create_out_fname(cfg[EVB_FILES], suffix='_cec', ext='.csv',
                                         base_dir=cfg[OUT_BASE_DIR])
                write_csv(data_to_print, f_out, CEC_COORD_FIELDNAMES, extrasaction="ignore", mode=print_mode)
            if cfg[PRINT_KEY_PROPS]:
                f_out = create_out_fname(cfg[EVB_FILES], suffix='_evb_info', ext='.csv',
                                         base_dir=cfg[OUT_BASE_DIR])
                write_csv(data_to_print, f_out, KEY_PROPS_FIELDNAMES, extrasaction="ignore", mode=print_mode)
def print_per_frame(dump_file, cfg, data_to_print, out_fieldnames, write_mode):
    """Append or write the per-frame rows for a dump file to '<dump_file>_sum.csv'
    in the configured output directory.
    """
    target = create_out_fname(dump_file, suffix="_sum", ext=".csv",
                              base_dir=cfg[OUT_BASE_DIR])
    write_csv(data_to_print, target, out_fieldnames,
              extrasaction="ignore", mode=write_mode)
def min_params(cfg, tpl_dict, tpl_str):
    """Drive the parameter minimization (scipy minimize or basinhopping per cfg)
    and print the optimized parameters.

    @param cfg: configuration for the run (method, bounds, cycle counts, ...)
    @param tpl_dict: dictionary of values for filling in template strings
    @param tpl_str: template string (read from file)
    """
    num_opt_params = len(cfg[OPT_PARAMS])
    x0 = np.empty(num_opt_params)
    ini_direc = np.zeros((num_opt_params, num_opt_params))
    result_dict = {}
    fitting_sum = []
    result_sum_headers = [RESID]
    # setup minimization
    for param_num, param_name in enumerate(cfg[OPT_PARAMS]):
        x0[param_num] = cfg[TPL_VALS][param_name]
        # diagonal matrix of initial step directions for Powell's method
        ini_direc[param_num, param_num] = cfg[INITIAL_DIR][param_name]
        result_sum_headers.append(param_name)
    # arguments for objective function
    obj_fun_args = (cfg, tpl_dict, tpl_str, fitting_sum, result_dict, result_sum_headers)
    # options for minimizer
    opt_options = {'maxiter': cfg[MAX_ITER],
                   'disp': cfg[PRINT_INFO],
                   'return_all': cfg[PRINT_CONV_ALL], }
    if cfg[SCIPY_OPT_METHOD] == POWELL:
        opt_options['direc'] = ini_direc
    if cfg[SCIPY_OPT_METHOD] in [POWELL, NELDER_MEAD]:
        opt_options['xtol'] = cfg[CONV_CUTOFF]
        opt_options['ftol'] = cfg[CONV_CUTOFF]
        opt_options['maxfev'] = cfg[MAX_ITER]
    if cfg[BASIN_HOP]:
        # for tests
        if cfg[BASIN_SEED]:
            np.random.seed(1)
        step_spec = False
        x_min = np.empty(num_opt_params)
        x_max = np.empty(num_opt_params)
        step_size = np.empty(num_opt_params)
        if BASIN_HOPS in cfg:
            hop_dict = cfg[BASIN_HOPS]
            min_dict = cfg[BASIN_MINS]
            max_dict = cfg[BASIN_MAXS]
            if len(hop_dict) > 0:
                for param_num, param_name in enumerate(cfg[OPT_PARAMS]):
                    # per-parameter hop size, falling back to the default step
                    if param_name in hop_dict:
                        step_size[param_num] = hop_dict[param_name]
                        step_spec = True
                    else:
                        step_size[param_num] = cfg[BASIN_DEF_STEP]
                    # per-parameter bounds, defaulting to unbounded
                    if param_name in min_dict:
                        x_min[param_num] = min_dict[param_name]
                        x_max[param_num] = max_dict[param_name]
                    else:
                        x_min[param_num] = -np.inf
                        x_max[param_num] = np.inf
        if step_spec:
            take_step = RandomDisplacementBounds(x_min, x_max, step_size, cfg[PRINT_INFO])
        else:
            take_step = None
        minimizer_kwargs = dict(method=POWELL, args=obj_fun_args, options=opt_options)
        ret = basinhopping(obj_fun, x0, minimizer_kwargs=minimizer_kwargs, disp=cfg[PRINT_INFO],
                           niter=cfg[BASIN_NITER], niter_success=cfg[NITER_SUCCESS], take_step=take_step)
        return_message = ret.message[-1] + "."
    else:
        # Number of minimization cycles set by default or user input
        num_minis = 0
        return_message = "No minimization cycles completed"
        ret = None
        trial_param_num = len(x0)
        while num_minis < cfg[MINI_CYCLES]:
            # Set up "triangle" or step-wise minimization
            if trial_param_num < 3 or not cfg[TRIANGLE_MINI]:
                x0_trial = x0
                # needed for after the first round of minimization
                trial_param_num = len(x0)
            else:
                # start each cycle optimizing only the first two parameters,
                # then grow the trial set one parameter at a time
                trial_param_num = 2
                x0_trial = x0[:trial_param_num]
                obj_fun_args = (cfg, tpl_dict, tpl_str, fitting_sum, result_dict, result_sum_headers, x0)
                if 'direc' in opt_options:
                    opt_options['direc'] = ini_direc[:trial_param_num, :trial_param_num]
            while trial_param_num <= len(x0):
                ret = minimize(obj_fun, x0_trial, args=obj_fun_args, method=cfg[SCIPY_OPT_METHOD],
                               options=opt_options)
                x0_trial = ret.x
                return_message = ret.message
                # fold the optimized subset back into the full parameter vector
                x0[:trial_param_num] = x0_trial
                trial_param_num += 1
                if trial_param_num <= len(x0):
                    x0_trial = x0[:trial_param_num]
                    if 'direc' in opt_options:
                        opt_options['direc'] = ini_direc[:trial_param_num, :trial_param_num]
            num_minis += 1
            if cfg[MINI_CYCLES] - num_minis >= 0:
                print(return_message + " Completed {} of {} minimization cycles".format(num_minis,
                                                                                        cfg[MINI_CYCLES]))
        if cfg[PRINT_CONV_ALL]:
            print(return_message + " Number of function calls: {}".format(ret.nfev))
    # Same final printing either way
    x_final = ret.x
    if x_final.size > 1:
        if cfg[FITTING_SUM_FNAME] is not None:
            write_csv(fitting_sum, cfg[FITTING_SUM_FNAME], result_sum_headers,
                      print_message=cfg[PRINT_INFO], round_digits=cfg[NUM_PARAM_DECIMALS])
        print("Optimized parameters:")
        for param_num, param_name in enumerate(cfg[OPT_PARAMS]):
            print("{:>11} = {:11f}".format(param_name, x_final[param_num]))
    else:
        print("Optimized parameter:\n"
              "{:>11}: {:11f}".format(cfg[OPT_PARAMS][0], float(x_final)))
def process_files(comp_f_list, col_name, base_out_name, delimiter, sep_out_flag, out_location):
    """
    Aligns and combines rows from sets of CSV files, matching rows on the value
    in the column `col_name`, and writes the combined data to one or more CSVs.

    Want to grab the timestep, first and 2nd mole found, first and 2nd ci^2
    print the timestep, residue ci^2
    @param comp_f_list: a list of lists of file names to process (file read during input processing)
    @param col_name: name of column to use for alignment
    @param base_out_name: name of file to be created, or suffix if multiple files to be created
    @param delimiter: string, delimiter separating file names on lines of the comp_f_list
    @param sep_out_flag: a boolean to note if separate output files should be made based on each row of input
    @param out_location: user-specified location for the output files, if specified
    @return:
    @raise InvalidDataError: if a set of files shares no values in the align column
    """
    all_dicts = defaultdict(dict)
    # if need multiple output files, designate them by adding a prefix
    prefix = ''
    # if there will be multiple output files, make sure do not reuse a prefix, so keep copy of used names
    prefix_used = []
    # if one output file from multiple sets of file to combine, will change write_mode to append later
    write_mode = 'w'
    # we don't have to specify run names in the output if there one row set of files to combine,
    # or if there will be separate output files
    if len(comp_f_list) < 2 or sep_out_flag:
        add_run_name = False
        headers = []
    else:
        add_run_name = True
        headers = [RUN_NAME]
    for line_num, line in enumerate(comp_f_list):
        dict_keys = None
        if sep_out_flag:
            # each input row gets its own output file, so reset accumulated state
            headers = []
            all_dicts = defaultdict(dict)
        # separate on delimiter, strip any white space, and also get rid of empty entries.
        # Must be a list (not a py3 filter iterator): it is iterated up to three
        # times below (naming loop, reading loop, and the error message join)
        comp_files = [c_file for c_file in
                      (raw_name.strip() for raw_name in line.split(delimiter)) if c_file]
        # get the common part of the name, if it exists; otherwise, give the name the line index.
        # initialize so an empty file list cannot leave run_name undefined
        run_name = ''
        for file_index, file_name in enumerate(comp_files):
            base_name = os.path.splitext(os.path.basename(file_name))[0]
            if file_index == 0:
                run_name = base_name
            else:
                run_name = longest_common_substring(run_name, base_name)
        if run_name == '':
            # because will use run_name as a string, need to convert it
            run_name = str(line_num) + "_"
        for c_file in comp_files:
            new_dict = read_csv_to_dict(c_file, col_name)
            # keep only align-column values present in every file of this set
            if dict_keys is None:
                dict_keys = new_dict.keys()
            else:
                dict_keys = set(dict_keys).intersection(new_dict.keys())
            # Get the keys for the inner dictionary; diff methods for python 2 and 3 so use six
            new_dict_keys = six.next(six.itervalues(new_dict)).keys()
            # expect to only get new headers when making a new file (write_mode == 'w')
            # for the next file, will not gather more headers. When printed, extra cols will be skipped, and
            # missing columns will have no data shown
            if write_mode == 'w':
                for key in new_dict_keys:
                    if key in headers:
                        # okay if already have header if the header is the column.
                        # If we are going to append, we also expect to already have the header name
                        if key != col_name:
                            warning("Non-unique column name {} found in {}. "
                                    "Values will be overwritten.".format(key, c_file))
                    else:
                        headers.append(key)
            # merge this file's rows into the accumulated per-key dicts
            for align_val, row in new_dict.items():
                all_dicts[align_val].update(row)
        final_dict = []
        for key in sorted(dict_keys):
            final_dict.append(all_dicts[key])
        if add_run_name:
            for each_dict in final_dict:
                each_dict.update({RUN_NAME: run_name})
        # Possible to have no overlap in align column
        if len(final_dict) > 0:
            if sep_out_flag:
                prefix = run_name
                # fall back to the line index when the common name is empty or reused
                if prefix == '' or prefix in prefix_used:
                    prefix = str(line_num) + "_"
            # have a consistent output by sorting the headers, but keep the aligning column first
            # only needs to be done for printing the first time
            if write_mode == 'w':
                # make sure col_name appears first by taking it out before sorting
                headers.remove(col_name)
                headers = [col_name] + sorted(headers)
                if add_run_name:
                    headers.remove(RUN_NAME)
                    headers = [RUN_NAME] + headers
            f_name = create_out_fname(base_out_name, prefix=prefix, base_dir=out_location)
            prefix_used.append(prefix)
            write_csv(final_dict, f_name, headers, mode=write_mode)
            # single combined output: later sets append to the same file
            if not sep_out_flag and write_mode == 'w':
                write_mode = 'a'
        else:
            raise InvalidDataError("No common values found for column {} among files: {}"
                                   "".format(col_name, ", ".join(comp_files)))