def fill_save_tpl(cfg, tpl_str, tpl_vals_dict, tpl_name, filled_tpl_name, print_info=True):
    """
    Use the dictionary to make the file name and filled template, then save the file.
    @param cfg: configuration for run
    @param tpl_str: the string to be filled to make the filled tpl file
    @param tpl_vals_dict: dictionary of tpl keys and vals
    @param tpl_name: the cfg key for the template file name
    @param filled_tpl_name: the cfg key for the filled template file name
    @param print_info: print to standard out when a file is printed
    """
    try:
        filled_tpl_str = tpl_str.format(**tpl_vals_dict)
    except KeyError as e:
        raise KeyError("Key '{}' not found in the configuration but required for template file: {}"
                       "".format(e.args[0], tpl_name))

    try:
        filled_fname_str = filled_tpl_name.format(**tpl_vals_dict)
    except KeyError as e:
        raise KeyError("Key '{}' not found in the configuration but required for filled template file name: {}"
                       "".format(e.args[0], filled_tpl_name))

    tpl_vals_dict[NEW_FNAME] = create_out_fname(filled_fname_str, base_dir=cfg[OUT_DIR])
    str_to_file(filled_tpl_str, tpl_vals_dict[NEW_FNAME], print_info=print_info)
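# Usage sketch for fill_save_tpl (illustrative only; the template text, value
# dict, and output directory below are hypothetical, not from this module):
def _example_fill_save_tpl():
    tpl_str = "run_name = {run_name}\ntemp = {temp}\n"
    tpl_vals_dict = {"run_name": "test_run", "temp": 300.0}
    cfg = {OUT_DIR: "ex_out"}  # OUT_DIR is this module's output-directory cfg key
    # Writes the filled template to 'ex_out/test_run.inp' and stores the created
    # path back into tpl_vals_dict[NEW_FNAME].
    fill_save_tpl(cfg, tpl_str, tpl_vals_dict, "job.tpl", "{run_name}.inp")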
def process_file(f_name, b_str, e_str, new_f_name):
    if new_f_name is None:
        new_f_name = create_out_fname(f_name, suffix='_amend')
    # Open the old file first; then, if there is a problem with it, no new file will be created.
    with open(f_name) as f:
        with open(new_f_name, 'w') as w_file:
            for line in f:
                line = line.strip()
                w_file.write(b_str + line + e_str + "\n")
    print("Wrote file: {}".format(new_f_name))
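# Usage sketch (hypothetical file names): wrap every line of 'notes.txt' in
# double quotes, writing the result to 'notes_amend.txt' via the default
# '_amend' suffix:
#     process_file("notes.txt", '"', '"', None)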
def testWriteCsv(self):
    tmp_dir = None
    data = csv_data()
    try:
        tmp_dir = tempfile.mkdtemp()
        tgt_fname = create_out_fname(SHORT_WHAM_PATH, prefix=OUT_PFX, base_dir=tmp_dir)

        write_csv(data, tgt_fname, RAD_KEY_SEQ)
        csv_result = read_csv(tgt_fname,
                              data_conv={FREE_KEY: str_to_bool,
                                         CORR_KEY: float,
                                         COORD_KEY: str, })

        self.assertEqual(len(data), len(csv_result))
        for i, csv_row in enumerate(csv_result):
            self.assertDictEqual(data[i], csv_row)
    finally:
        # only clean up if the temp dir was actually created
        if tmp_dir is not None:
            shutil.rmtree(tmp_dir)
def create_hist_plot(hist_dict, header, out_dir, data_file):
    """
    See https://stanford.edu/~mwaskom/software/seaborn/examples/horizontal_barplot.html
    @param hist_dict: dict of label, count
    @param header: name of dictionary
    @param out_dir: str, name of directory where files are to be saved
    @param data_file: name of data file
    @return: a list of lists (label, count)
    """
    # remove spaces in name
    header = "".join(header.split())
    # convert dict to list for creating bar chart
    bar_data = [[key, val] for key, val in hist_dict.items()]
    bar_data.sort(key=itemgetter(0))
    bar_data.sort(key=itemgetter(1), reverse=True)

    # bar chart background style
    sns.set(style="whitegrid", font='Arial')
    # color options include pastel
    sns.set_color_codes("deep")
    # Initialize the matplotlib figure
    f, ax = plt.subplots(figsize=(6, 6))
    # Create pandas dataframe
    new_df = pd.DataFrame(bar_data, columns=["key", "count"])
    # Plot
    sns.barplot(x="count", y="key", data=new_df, label="Total", color="b")
    # other options: xlim=(0, 24)
    ax.set(xlabel="Count", ylabel="")
    ax.set_title(header)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        plt.tight_layout()
    f_name = create_out_fname(data_file, suffix=header, base_dir=out_dir, ext=".png")
    plt.savefig(f_name, dpi=300)
    print("Wrote file: {}".format(f_name))

    # quote strings for printing so the csv is properly read, and add a header
    count_to_print = [[header + "_key", header + "_count"]]
    for row in bar_data:
        count_to_print.append([row[0], row[1]])
    return count_to_print
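# The two sort passes in create_hist_plot rely on Python's stable sort: sorting
# by key first, then by count (descending), yields bars ordered by count with
# ties broken alphabetically. Standalone illustration (hypothetical data):
#     data = [["b", 2], ["a", 2], ["c", 5]]
#     data.sort(key=itemgetter(0))
#     data.sort(key=itemgetter(1), reverse=True)
#     # -> [['c', 5], ['a', 2], ['b', 2]]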
def create_hists(data_file, header_row, hist_data, out_dir):
    counts_to_print = []
    if len(hist_data) > 0:
        for col in hist_data:
            count_to_print = create_hist_plot(hist_data[col], header_row[col], out_dir, data_file)
            if len(counts_to_print) == 0:
                counts_to_print = count_to_print
            else:
                len1 = len(counts_to_print)
                len2 = len(count_to_print)
                width1 = len(counts_to_print[0])
                width2 = len(count_to_print[0])
                combined_list = []
                for row in range(min(len1, len2)):
                    combined_list.append(counts_to_print[row] + count_to_print[row])
                for row in range(len2, len1):
                    combined_list.append(counts_to_print[row] + [""] * width2)
                for row in range(len1, len2):
                    # noinspection PyTypeChecker
                    combined_list.append([""] * width1 + count_to_print[row])
                counts_to_print = copy.deepcopy(combined_list)
    f_name = create_out_fname(data_file, prefix='counts_', ext='.csv', base_dir=out_dir)
    list_to_csv(counts_to_print, f_name, delimiter=',')
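# Illustration of the row-stitching in create_hists (standalone, hypothetical
# data): two per-column count tables of unequal length are joined side by side,
# padding the shorter table with empty strings so every row has full width. For
# brevity this sketch assumes the first table is the longer one; create_hists
# above pads in both directions.
def _example_stitch():
    a = [["cat_key", "cat_count"], ["x", 3], ["y", 1]]
    b = [["dog_key", "dog_count"], ["z", 5]]
    width_b = len(b[0])
    stitched = [a_row + b_row for a_row, b_row in zip(a, b)]
    stitched += [a_row + [""] * width_b for a_row in a[len(b):]]
    # -> [['cat_key', 'cat_count', 'dog_key', 'dog_count'],
    #     ['x', 3, 'z', 5],
    #     ['y', 1, '', '']]
    return stitched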
def process_pdb(cfg, atom_num_dict, mol_num_dict, element_dict):
    pdb_loc = cfg[PDB_FILE]
    pdb_data = {HEAD_CONTENT: [], ATOMS_CONTENT: [], TAIL_CONTENT: []}
    # to allow each warning to be printed once and only once
    missing_types = []
    qmmm_elem_id_dict = {}
    ca_res_atom_id_dict = {}
    cb_res_atom_id_dict = {}
    atoms_for_vmd = []

    with open(pdb_loc) as f:
        wat_count = 0
        atom_count = 0
        mol_count = 1

        current_mol = None
        last_mol_num = None
        atoms_content = []

        for line in f:
            line = line.strip()
            if len(line) == 0:
                continue
            line_head = line[:cfg[PDB_LINE_TYPE_LAST_CHAR]]

            # head_content is to contain everything before the 'Atoms' section;
            # also capture the number of atoms
            if line_head == 'REMARK' or line_head == 'CRYST1':
                pdb_data[HEAD_CONTENT].append(line)

            # atoms_content is to contain everything but the xyz
            elif line_head == 'ATOM  ':
                # The template PDB prints ***** after atom_count 99999, so atoms are renumbered here.
                # Otherwise, use this:
                # atom_num = line[cfg[PDB_LINE_TYPE_LAST_CHAR]:cfg[PDB_ATOM_NUM_LAST_CHAR]]
                # For renumbering, make sure it prints in the correct format, including the number of characters:
                atom_count += 1

                # For reordering atoms
                if atom_count in atom_num_dict:
                    atom_id = atom_num_dict[atom_count]
                else:
                    atom_id = atom_count

                if atom_id > 99999:
                    atom_num = format(atom_id, 'x')
                    if len(atom_num) > 5:
                        warning("Hex representation of {} is {}, which is greater than 5 characters. This "
                                "will affect the PDB output formatting.".format(atom_id, atom_num))
                else:
                    atom_num = '{:5d}'.format(atom_id)

                atom_type = line[cfg[PDB_ATOM_NUM_LAST_CHAR]:cfg[PDB_ATOM_TYPE_LAST_CHAR]]
                atom_type_stripped = atom_type.strip()
                res_type = line[cfg[PDB_ATOM_TYPE_LAST_CHAR]:cfg[PDB_RES_TYPE_LAST_CHAR]]
                mol_num = int(line[cfg[PDB_RES_TYPE_LAST_CHAR]:cfg[PDB_MOL_NUM_LAST_CHAR]])
                pdb_x = float(line[cfg[PDB_MOL_NUM_LAST_CHAR]:cfg[PDB_X_LAST_CHAR]])
                pdb_y = float(line[cfg[PDB_X_LAST_CHAR]:cfg[PDB_Y_LAST_CHAR]])
                pdb_z = float(line[cfg[PDB_Y_LAST_CHAR]:cfg[PDB_Z_LAST_CHAR]])
                occ_t = line[cfg[PDB_Z_LAST_CHAR]:cfg[PDB_LAST_T_CHAR]]
                element = line[cfg[PDB_LAST_T_CHAR]:cfg[PDB_LAST_ELEM_CHAR]]
                last_cols = line[cfg[PDB_LAST_ELEM_CHAR]:]

                # For user-specified changing of molecule number
                if mol_num in mol_num_dict:
                    mol_num = mol_num_dict[mol_num]

                # If doing water molecule checking...
                if cfg[FIRST_WAT_ID] <= atom_count <= cfg[LAST_WAT_ID]:
                    if (wat_count % 3) == 0:
                        current_mol = mol_num
                        if atom_type != ' OH2 ':
                            warning('Expected an OH2 atom to be the first atom of a water molecule. '
                                    'Check line: {}'.format(line))
                        # last_cols = ' 0.00 0.00 S2 O'
                    else:
                        if current_mol != mol_num:
                            warning('Water not in order on line:', line)
                        if (wat_count % 3) == 1:
                            if atom_type != ' H1 ':
                                warning('Expected an H1 atom to be the second atom of a water molecule. '
                                        'Check line: {}'.format(line))
                        else:
                            if atom_type != ' H2 ':
                                warning('Expected an H2 atom to be the third atom of a water molecule. '
                                        'Check line: {}'.format(line))
                    wat_count += 1

                if mol_num in cfg[RESID_QMMM] and atom_type not in SKIP_ATOM_TYPES:
                    if atom_type == C_ALPHA:
                        ca_res_atom_id_dict[mol_num] = atom_id
                    else:
                        if atom_type == C_BETA:
                            cb_res_atom_id_dict[mol_num] = atom_id
                        if atom_type_stripped in element_dict:
                            element = element_dict[atom_type_stripped]
                        else:
                            raise InvalidDataError("Did not find atom type '{}' in the element dictionary. Please "
                                                   "provide a new atom type, element dictionary (using keyword {} "
                                                   "in the configuration file) that includes all atom types in the "
                                                   "residues identified with the '{}' key."
                                                   "".format(atom_type_stripped, ELEMENT_DICT_FILE, RESID_QMMM))
                        if element in qmmm_elem_id_dict:
                            qmmm_elem_id_dict[element].append(atom_id)
                        else:
                            qmmm_elem_id_dict[element] = [atom_id]
                        atoms_for_vmd.append(atom_id - 1)

                if cfg[ADD_ELEMENTS] and atom_count <= cfg[LAST_ADD_ELEM]:
                    if atom_type_stripped in element_dict:
                        element = element_dict[atom_type_stripped]
                    elif atom_type_stripped not in missing_types:
                        warning("Please add atom type '{}' to dictionary of elements. Will not write/overwrite "
                                "element type in the pdb output.".format(atom_type_stripped))
                        missing_types.append(atom_type_stripped)

                # For numbering molecules from 1 to end
                if cfg[RENUM_MOL]:
                    if last_mol_num is None:
                        last_mol_num = mol_num
                    if mol_num != last_mol_num:
                        last_mol_num = mol_num
                        mol_count += 1
                        if mol_count == 10000:
                            warning("Molecule numbers greater than 9999 will be printed in hex")
                    # Due to PDB format constraints, need to print in hex starting at 10000 molecules.
                    if mol_count > 9999:
                        mol_num = format(mol_count, 'x')
                        if len(mol_num) > 4:
                            warning("Hex representation of {} is {}, which is greater than 4 characters. This "
                                    "will affect the PDB output formatting.".format(mol_count, mol_num))
                    else:
                        mol_num = '{:4d}'.format(mol_count)

                line_struct = [line_head, atom_num, atom_type, res_type, mol_num, pdb_x, pdb_y, pdb_z, occ_t,
                               element, last_cols]
                atoms_content.append(line_struct)

            # tail_content is to contain everything after the 'Atoms' section
            else:
                pdb_data[TAIL_CONTENT].append(line)

    # Only sort if there is renumbering
    if len(atom_num_dict) > 0:
        pdb_data[ATOMS_CONTENT] = sorted(atoms_content, key=lambda entry: entry[1])
    else:
        pdb_data[ATOMS_CONTENT] = atoms_content

    if cfg[PDB_NEW_FILE] is None:
        f_name = create_out_fname(cfg[PDB_FILE], suffix="_new", base_dir=cfg[OUT_BASE_DIR])
    else:
        f_name = create_out_fname(cfg[PDB_NEW_FILE], base_dir=cfg[OUT_BASE_DIR])
    print_pdb(pdb_data[HEAD_CONTENT], pdb_data[ATOMS_CONTENT], pdb_data[TAIL_CONTENT],
              f_name, cfg[PDB_FORMAT])

    if len(cfg[RESID_QMMM]) > 0:
        f_name = create_out_fname('amino_id.dat', base_dir=cfg[OUT_BASE_DIR])
        print_mode = 'w'
        for elem in sorted(qmmm_elem_id_dict):
            print_qm_kind(qmmm_elem_id_dict[elem], elem, f_name, mode=print_mode)
            print_mode = 'a'
        print_qm_links(ca_res_atom_id_dict, cb_res_atom_id_dict, f_name, mode=print_mode)

        f_name = create_out_fname('vmd_protein_atoms.dat', base_dir=cfg[OUT_BASE_DIR])
        list_to_csv([atoms_for_vmd], f_name, delimiter=' ')
def testOutFname(self):
    """ Check for prefix addition. """
    self.assertTrue(create_out_fname(ORIG_WHAM_PATH, prefix=OUT_PFX).endswith(os.sep + OUT_PFX + ORIG_WHAM_FNAME))
def process_file(data_file, out_dir, len_buffer, delimiter, min_max_dict, header=False, make_hist=False):
    try:
        dim_vectors, header_row, hist_data = np_float_array_from_file(data_file, delimiter=delimiter,
                                                                      header=header, gather_hist=make_hist)
    except InvalidDataError as e:
        raise InvalidDataError("{}\n"
                               "Run program with '-h' to see options, such as specifying header row (-n) "
                               "and/or delimiter (-d)".format(e))

    if header:
        to_print = [[''] + header_row]
    else:
        to_print = []

    max_vector = dim_vectors.max(axis=0)
    min_vector = dim_vectors.min(axis=0)
    avg_vector = dim_vectors.mean(axis=0)
    med_vector = np.percentile(dim_vectors, 50, axis=0)

    # noinspection PyTypeChecker
    to_print += [['Min values:'] + min_vector.tolist(),
                 ['Max values:'] + max_vector.tolist(),
                 ['Avg values:'] + avg_vector.tolist(),
                 ['Std dev:'] + dim_vectors.std(axis=0, ddof=1).tolist(),
                 ['5% percentile:'] + np.percentile(dim_vectors, 4.55, axis=0).tolist(),
                 ['32% percentile:'] + np.percentile(dim_vectors, 31.73, axis=0).tolist(),
                 ['50% percentile:'] + med_vector.tolist(),
                 ['68% percentile:'] + np.percentile(dim_vectors, 68.27, axis=0).tolist(),
                 ['95% percentile:'] + np.percentile(dim_vectors, 95.45, axis=0).tolist(),
                 ]
    if len_buffer is not None:
        to_print.append(['Max plus {} buffer:'.format(len_buffer)] + (max_vector + len_buffer).tolist())

    if min_max_dict is not None:
        nan_list = [np.nan] * len(header_row)
        avg_ini_diff = ['Avg % Diff:'] + nan_list
        med_ini_diff = ['Med % Diff:'] + nan_list
        med_is_min = ['Median is Min:'] + nan_list
        med_is_max = ['Median is Max:'] + nan_list
        # note: 'header' here shadows the boolean parameter; to_print[0] is the header row
        for col_num, header in enumerate(to_print[0]):
            if header in min_max_dict[0]:
                ini_val = min_max_dict[0][header]
                low_val = min_max_dict[1][header]
                upp_val = min_max_dict[2][header]
                avg_val = avg_vector[col_num - 1]
                med_val = med_vector[col_num - 1]
                min_val = min_vector[col_num - 1]
                max_val = max_vector[col_num - 1]
                min_tol = max(TOL * max(abs(min_val), abs(low_val)), TOL)
                med_tol = max(TOL * abs(med_val), TOL)
                max_tol = max(TOL * max(abs(max_val), abs(upp_val)), TOL)
                if (low_val - min_val) > min_tol:
                    warning("Minimum value found for header '{}' ({}) is less than lower bound ({})"
                            "".format(header, min_val, low_val))
                if (max_val - upp_val) > max_tol:
                    warning("Maximum value found for header '{}' ({}) is greater than upper bound ({})"
                            "".format(header, max_val, upp_val))
                avg_ini_diff[col_num] = (avg_val - ini_val) / ini_val * 100
                med_ini_diff[col_num] = (med_val - ini_val) / ini_val * 100
                if abs(med_val - low_val) > med_tol:
                    med_is_min[col_num] = 0
                else:
                    med_is_min[col_num] = 1
                if abs(med_val - upp_val) > med_tol:
                    med_is_max[col_num] = 0
                else:
                    med_is_max[col_num] = 1
            # else:
            #     for min_max_list in [avg_ini_diff, med_ini_diff, med_is_min, med_is_max]:
            #         min_max_list.append(np.nan)
        for min_max_list in [avg_ini_diff, med_ini_diff, med_is_min, med_is_max]:
            to_print.append(min_max_list)

    # Printing to standard out: do not print quotes around strings because using csv writer
    # print("Number of dimensions ({}) based on first line of file: {}".format(len(dim_vectors[0]), data_file))
    if len(dim_vectors[0]) < 12:
        for index, row in enumerate(to_print):
            # formatting for header
            if index == 0 and header:
                print("{:>20s} {}".format(row[0], ' '.join(['{:>16s}'.format(x.strip()) for x in row[1:]])))
            # formatting for vals
            else:
                print("{:>20s} {}".format(row[0], ' '.join(['{:16.6f}'.format(x) for x in row[1:]])))

    f_name = create_out_fname(data_file, prefix='stats_', ext='.csv', base_dir=out_dir)
    list_to_csv(to_print, f_name)
    # list_to_file(to_print, f_name, delimiter=',')

    if make_hist:
        create_hists(data_file, header_row, hist_data, out_dir)
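# Note on the percentile constants above (an observation, not from the original
# source): 95.45 and 68.27 are the two-sided coverage percentages of +/-2 sigma
# and +/-1 sigma for a normal distribution, and 4.55 and 31.73 are their
# complements (100 - 95.45, 100 - 68.27), which appears to be where the
# otherwise unusual cutoffs come from. A quick standalone check:
#     >>> import scipy.stats as stats
#     >>> 100 * (2 * stats.norm.cdf(2) - 1)
#     95.44997...
#     >>> 100 * (2 * stats.norm.cdf(1) - 1)
#     68.26894...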