Example #1
def fill_save_tpl(cfg,
                  tpl_str,
                  tpl_vals_dict,
                  tpl_name,
                  filled_tpl_name,
                  print_info=True):
    """
    Use the dictionary to fill the template and build the output file name, then save the filled template.
    @param cfg: configuration for run
    @param tpl_str: the string to be filled to make the filled tpl file
    @param tpl_vals_dict: dictionary of tpl keys and vals
    @param tpl_name: the cfg key for the template file name
    @param filled_tpl_name: the cfg key for the filled template file name
    @param print_info: if True, print the name of the saved file to standard out
    """
    try:
        filled_tpl_str = tpl_str.format(**tpl_vals_dict)
    except KeyError as e:
        raise KeyError(
            "Key '{}' not found in the configuration but required for template file: {}"
            "".format(e.args[0], tpl_name))

    try:
        filled_fname_str = filled_tpl_name.format(**tpl_vals_dict)
    except KeyError as e:
        raise KeyError(
            "Key '{}' not found in the configuration but required for filled template file name: {}"
            "".format(e.args[0], filled_tpl_name))

    tpl_vals_dict[NEW_FNAME] = create_out_fname(filled_fname_str,
                                                base_dir=cfg[OUT_DIR])
    str_to_file(filled_tpl_str,
                tpl_vals_dict[NEW_FNAME],
                print_info=print_info)
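
A minimal call sketch for fill_save_tpl, assuming the module-level constants and helpers used above (OUT_DIR, NEW_FNAME, create_out_fname, str_to_file) are in scope; the config value, template text, and template values below are made up for illustration only:

cfg = {OUT_DIR: 'output'}                                # illustrative config
tpl_text = 'ensemble {ensemble}\ntemp {temp}\n'          # made-up template text
tpl_vals = {'ensemble': 'NVT', 'temp': 310}
fill_save_tpl(cfg, tpl_text, tpl_vals,
              tpl_name='run.tpl',                        # only used in error messages
              filled_tpl_name='run_{ensemble}.inp')      # becomes 'run_NVT.inp' in 'output/'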
Example #2
def process_file(f_name, b_str, e_str, new_f_name):

    if new_f_name is None:
        new_f_name = create_out_fname(f_name, suffix='_amend')

    # open the old file first; then, if there is a problem with it, no new file will be created
    with open(f_name) as f:
        with open(new_f_name, 'w') as w_file:
            for line in f:
                line = line.strip()
                w_file.write(b_str + line + e_str + "\n")
    print("Wrote file: {}".format(new_f_name))
Example #3
    def testWriteCsv(self):
        tmp_dir = None
        data = csv_data()
        try:
            tmp_dir = tempfile.mkdtemp()
            tgt_fname = create_out_fname(SHORT_WHAM_PATH, prefix=OUT_PFX, base_dir=tmp_dir)

            write_csv(data, tgt_fname, RAD_KEY_SEQ)
            csv_result = read_csv(tgt_fname,
                                  data_conv={FREE_KEY: str_to_bool,
                                             CORR_KEY: float,
                                             COORD_KEY: str, })
            self.assertEqual(len(data), len(csv_result))
            for i, csv_row in enumerate(csv_result):
                self.assertDictEqual(data[i], csv_row)
        finally:
            if tmp_dir is not None:
                shutil.rmtree(tmp_dir)
Example #4
def create_hist_plot(hist_dict, header, out_dir, data_file):
    """
    See https://stanford.edu/~mwaskom/software/seaborn/examples/horizontal_barplot.html
    @param hist_dict: dict of label, count
    @param header: name of dictionary
    @param out_dir: str, name of directory where files are to be saved
    @param data_file: name of data file
    @return: a list of lists (label, count)
    """
    # remove spaces in name
    header = "".join(header.split())

    # convert dict to list for creating the bar chart
    bar_data = [[key, val] for key, val in hist_dict.items()]
    bar_data.sort(key=itemgetter(0))
    bar_data.sort(key=itemgetter(1), reverse=True)

    # bar chart background style
    sns.set(style="whitegrid", font='Arial')
    # color options include pastel
    sns.set_color_codes("deep")
    # Initialize the matplotlib figure
    f, ax = plt.subplots(figsize=(6, 6))
    # Create pandas dataframe
    new_df = pd.DataFrame(bar_data, columns=["key", "count"])
    # Plot
    sns.barplot(x="count", y="key", data=new_df,
                label="Total", color="b")
    # other options: xlim=(0, 24)
    ax.set(xlabel="Count", ylabel="")
    ax.set_title(header)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        plt.tight_layout()

    f_name = create_out_fname(data_file, suffix=header, base_dir=out_dir, ext=".png")
    plt.savefig(f_name, dpi=300)
    print("Wrote file: {}".format(f_name))

    # quote strings for printing so the csv is read properly, and add a header row
    count_to_print = [[header + "_key", header + "_count"]]
    for row in bar_data:
        count_to_print.append([row[0], row[1]])

    return count_to_print
Example #5
def create_hists(data_file, header_row, hist_data, out_dir):
    counts_to_print = []
    if len(hist_data) > 0:
        for col in hist_data:
            count_to_print = create_hist_plot(hist_data[col], header_row[col], out_dir, data_file)

            if len(counts_to_print) == 0:
                counts_to_print = count_to_print
            else:
                len1 = len(counts_to_print)
                len2 = len(count_to_print)
                width1 = len(counts_to_print[0])
                width2 = len(count_to_print[0])
                combined_list = []
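                # join the two tables side by side, padding the shorter one with blank cells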
                for row in range(min(len1, len2)):
                    combined_list.append(counts_to_print[row] + count_to_print[row])
                for row in range(len2, len1):
                    combined_list.append(counts_to_print[row] + [""] * width2)
                for row in range(len1, len2):
                    # noinspection PyTypeChecker
                    combined_list.append([""] * width1 + count_to_print[row])
                counts_to_print = copy.deepcopy(combined_list)
    f_name = create_out_fname(data_file, prefix='counts_', ext='.csv', base_dir=out_dir)
    list_to_csv(counts_to_print, f_name, delimiter=',')
Example #6
def process_pdb(cfg, atom_num_dict, mol_num_dict, element_dict):
    pdb_loc = cfg[PDB_FILE]
    pdb_data = {HEAD_CONTENT: [], ATOMS_CONTENT: [], TAIL_CONTENT: []}
    # to allow each warning to be printed once and only once
    missing_types = []
    qmmm_elem_id_dict = {}
    ca_res_atom_id_dict = {}
    cb_res_atom_id_dict = {}
    atoms_for_vmd = []

    with open(pdb_loc) as f:
        wat_count = 0
        atom_count = 0
        mol_count = 1

        current_mol = None
        last_mol_num = None
        atoms_content = []

        for line in f:
            line = line.strip()
            if len(line) == 0:
                continue
            line_head = line[:cfg[PDB_LINE_TYPE_LAST_CHAR]]
            # head_content to contain everything before the 'Atoms' section
            # also capture the number of atoms
            if line_head == 'REMARK' or line_head == 'CRYST1':
                pdb_data[HEAD_CONTENT].append(line)

            # atoms_content to contain everything but the xyz
            elif line_head == 'ATOM  ':

                # My template PDB has ***** after atom_count 99999, so I'm renumbering. Otherwise, use this:
                # atom_num = line[cfg[PDB_LINE_TYPE_LAST_CHAR]:cfg[PDB_ATOM_NUM_LAST_CHAR]]
                # For renumbering, making sure prints in the correct format, including num of characters:
                atom_count += 1

                # For reordering atoms
                if atom_count in atom_num_dict:
                    atom_id = atom_num_dict[atom_count]
                else:
                    atom_id = atom_count

                if atom_id > 99999:
                    atom_num = format(atom_id, 'x')
                    if len(atom_num) > 5:
                        warning(
                            "Hex representation of {} is {}, which is greater than 5 characters. This"
                            "will affect the PDB output formatting.".format(
                                atom_id, atom_num))
                else:
                    atom_num = '{:5d}'.format(atom_id)

                atom_type = line[
                    cfg[PDB_ATOM_NUM_LAST_CHAR]:cfg[PDB_ATOM_TYPE_LAST_CHAR]]
                atom_type_stripped = atom_type.strip()
                res_type = line[
                    cfg[PDB_ATOM_TYPE_LAST_CHAR]:cfg[PDB_RES_TYPE_LAST_CHAR]]
                mol_num = int(line[
                    cfg[PDB_RES_TYPE_LAST_CHAR]:cfg[PDB_MOL_NUM_LAST_CHAR]])
                pdb_x = float(
                    line[cfg[PDB_MOL_NUM_LAST_CHAR]:cfg[PDB_X_LAST_CHAR]])
                pdb_y = float(line[cfg[PDB_X_LAST_CHAR]:cfg[PDB_Y_LAST_CHAR]])
                pdb_z = float(line[cfg[PDB_Y_LAST_CHAR]:cfg[PDB_Z_LAST_CHAR]])
                occ_t = line[cfg[PDB_Z_LAST_CHAR]:cfg[PDB_LAST_T_CHAR]]
                element = line[cfg[PDB_LAST_T_CHAR]:cfg[PDB_LAST_ELEM_CHAR]]
                last_cols = line[cfg[PDB_LAST_ELEM_CHAR]:]

                # For user-specified changing of molecule number
                if mol_num in mol_num_dict:
                    mol_num = mol_num_dict[mol_num]

                # If doing water molecule checking...
                if cfg[FIRST_WAT_ID] <= atom_count <= cfg[LAST_WAT_ID]:
                    if (wat_count % 3) == 0:
                        current_mol = mol_num
                        if atom_type != '  OH2 ':
                            warning(
                                'Expected an OH2 atom to be the first atom of a water molecule. '
                                'Check line: {}'.format(line))
                        # last_cols = '  0.00  0.00      S2   O'
                    else:
                        if current_mol != mol_num:
                            warning('Water not in order on line:', line)
                        if (wat_count % 3) == 1:
                            if atom_type != '  H1  ':
                                warning(
                                    'Expected an H1 atom to be the second atom of a water molecule. '
                                    'Check line: {}'.format(line))
                        else:
                            if atom_type != '  H2  ':
                                warning(
                                    'Expected an H2 atom to be the third atom of a water molecule. '
                                    'Check line: {}'.format(line))
                    wat_count += 1

                if mol_num in cfg[
                        RESID_QMMM] and atom_type not in SKIP_ATOM_TYPES:
                    if atom_type == C_ALPHA:
                        ca_res_atom_id_dict[mol_num] = atom_id
                    else:
                        if atom_type == C_BETA:
                            cb_res_atom_id_dict[mol_num] = atom_id
                        if atom_type_stripped in element_dict:
                            element = element_dict[atom_type_stripped]
                        else:
                            raise InvalidDataError(
                                "Did not find atom type '{}' in the element dictionary. Please "
                                "provide a new atom type, element dictionary (using keyword {} "
                                "in the configuration file) that includes all atom types in the "
                                "residues identified with the '{}' key."
                                "".format(atom_type_stripped,
                                          ELEMENT_DICT_FILE, RESID_QMMM))
                        if element in qmmm_elem_id_dict:
                            qmmm_elem_id_dict[element].append(atom_id)
                        else:
                            qmmm_elem_id_dict[element] = [atom_id]
                        atoms_for_vmd.append(atom_id - 1)

                if cfg[ADD_ELEMENTS] and atom_count <= cfg[LAST_ADD_ELEM]:
                    if atom_type_stripped in element_dict:
                        element = element_dict[atom_type_stripped]
                    else:
                        if atom_type_stripped not in missing_types:
                            warning(
                                "Please add atom type '{}' to dictionary of elements. Will not write/overwrite "
                                "element type in the pdb output.".format(
                                    atom_type_stripped))
                            missing_types.append(atom_type_stripped)

                # For numbering molecules from 1 to end
                if cfg[RENUM_MOL]:
                    if last_mol_num is None:
                        last_mol_num = mol_num

                    if mol_num != last_mol_num:
                        last_mol_num = mol_num
                        mol_count += 1
                        if mol_count == 10000:
                            warning(
                                "Molecule numbers greater than 9999 will be printed in hex"
                            )

                    # Due to PDB format constraints, molecule numbers above 9999 must be printed in hex.
                    if mol_count > 9999:
                        mol_num = format(mol_count, 'x')
                        if len(mol_num) > 4:
                            warning(
                                "Hex representation of {} is {}, which is greater than 4 characters. This"
                                "will affect the PDB output formatting.".
                                format(atom_id, atom_num))
                    else:
                        mol_num = '{:4d}'.format(mol_count)

                line_struct = [
                    line_head, atom_num, atom_type, res_type, mol_num, pdb_x,
                    pdb_y, pdb_z, occ_t, element, last_cols
                ]
                atoms_content.append(line_struct)

            # tail_content to contain everything after the 'Atoms' section
            else:
                pdb_data[TAIL_CONTENT].append(line)

    # Only sort if there is renumbering
    if len(atom_num_dict) > 0:
        pdb_data[ATOMS_CONTENT] = sorted(atoms_content,
                                         key=lambda entry: entry[1])
    else:
        pdb_data[ATOMS_CONTENT] = atoms_content

    if cfg[PDB_NEW_FILE] is None:
        f_name = create_out_fname(cfg[PDB_FILE],
                                  suffix="_new",
                                  base_dir=cfg[OUT_BASE_DIR])
    else:
        f_name = create_out_fname(cfg[PDB_NEW_FILE],
                                  base_dir=cfg[OUT_BASE_DIR])
    print_pdb(pdb_data[HEAD_CONTENT], pdb_data[ATOMS_CONTENT],
              pdb_data[TAIL_CONTENT], f_name, cfg[PDB_FORMAT])

    if len(cfg[RESID_QMMM]) > 0:
        f_name = create_out_fname('amino_id.dat', base_dir=cfg[OUT_BASE_DIR])
        print_mode = "w"
        for elem in sorted(qmmm_elem_id_dict):
            print_qm_kind(qmmm_elem_id_dict[elem],
                          elem,
                          f_name,
                          mode=print_mode)
            print_mode = 'a'
        print_qm_links(ca_res_atom_id_dict,
                       cb_res_atom_id_dict,
                       f_name,
                       mode=print_mode)
        f_name = create_out_fname('vmd_protein_atoms.dat',
                                  base_dir=cfg[OUT_BASE_DIR])
        list_to_csv([atoms_for_vmd], f_name, delimiter=' ')
Example #7
    def testOutFname(self):
        """
        Check for prefix addition.
        """
        self.assertTrue(create_out_fname(ORIG_WHAM_PATH, prefix=OUT_PFX).endswith(
            os.sep + OUT_PFX + ORIG_WHAM_FNAME))
Example #8
def process_file(data_file, out_dir, len_buffer, delimiter, min_max_dict, header=False, make_hist=False):
    try:
        dim_vectors, header_row, hist_data = np_float_array_from_file(data_file, delimiter=delimiter,
                                                                      header=header, gather_hist=make_hist)

    except InvalidDataError as e:
        raise InvalidDataError("{}\n"
                               "Run program with '-h' to see options, such as specifying header row (-n) "
                               "and/or delimiter (-d)".format(e))

    if header:
        to_print = [[''] + header_row]
    else:
        to_print = []

    max_vector = dim_vectors.max(axis=0)
    min_vector = dim_vectors.min(axis=0)
    avg_vector = dim_vectors.mean(axis=0)
    med_vector = np.percentile(dim_vectors, 50, axis=0)

    # noinspection PyTypeChecker
    to_print += [['Min values:'] + min_vector.tolist(),
                 ['Max values:'] + max_vector.tolist(),
                 ['Avg values:'] + avg_vector.tolist(),
                 ['Std dev:'] + dim_vectors.std(axis=0, ddof=1).tolist(),
                 ['5% percentile:'] + np.percentile(dim_vectors, 4.55, axis=0).tolist(),
                 ['32% percentile:'] + np.percentile(dim_vectors, 31.73, axis=0).tolist(),
                 ['50% percentile:'] + med_vector.tolist(),
                 ['68% percentile:'] + np.percentile(dim_vectors, 68.27, axis=0).tolist(),
                 ['95% percentile:'] + np.percentile(dim_vectors, 95.45, axis=0).tolist(),
                 ]
    if len_buffer is not None:
        to_print.append(['Max plus {} buffer:'.format(len_buffer)] + (max_vector + len_buffer).tolist())

    if min_max_dict is not None:
        nan_list = [np.nan] * len(header_row)
        avg_ini_diff = ['Avg % Diff:'] + nan_list
        med_ini_diff = ['Med % Diff:'] + nan_list
        med_is_min = ['Median is Min:'] + nan_list
        med_is_max = ['Median is Max:'] + nan_list
        # use a distinct loop variable so the boolean 'header' parameter is not shadowed
        for col_num, col_header in enumerate(to_print[0]):
            if col_header in min_max_dict[0]:
                ini_val = min_max_dict[0][col_header]
                low_val = min_max_dict[1][col_header]
                upp_val = min_max_dict[2][col_header]
                avg_val = avg_vector[col_num - 1]
                med_val = med_vector[col_num - 1]
                min_val = min_vector[col_num - 1]
                max_val = max_vector[col_num - 1]
                min_tol = max(TOL * max(abs(min_val), abs(low_val)), TOL)
                med_tol = max(TOL * abs(med_val), TOL)
                max_tol = max(TOL * max(abs(max_val), abs(upp_val)), TOL)
                if (low_val - min_val) > min_tol:
                    warning("Minimum value found for header '{}' ({}) is less than lower bound ({})"
                            "".format(col_header, min_val, low_val))
                if (max_val - upp_val) > max_tol:
                    warning("Maximum value found for header '{}' ({}) is greater than upper bound ({})"
                            "".format(col_header, max_val, upp_val))
                avg_ini_diff[col_num] = (avg_val - ini_val) / ini_val * 100
                med_ini_diff[col_num] = (med_val - ini_val) / ini_val * 100
                if abs(med_val - low_val) > med_tol:
                    med_is_min[col_num] = 0
                else:
                    med_is_min[col_num] = 1
                if abs(med_val - upp_val) > med_tol:
                    med_is_max[col_num] = 0
                else:
                    med_is_max[col_num] = 1
                    # else:
                    #     for min_max_list in [avg_ini_diff, med_ini_diff, med_is_min, med_is_max]:
                    #         min_max_list.append(np.nan)
        for min_max_list in [avg_ini_diff, med_ini_diff, med_is_min, med_is_max]:
            to_print.append(min_max_list)

    # Printing to standard out: no quotes around strings, since the csv writer handles the file output
    # print("Number of dimensions ({}) based on first line of file: {}".format(len(dim_vectors[0]), data_file))
    if len(dim_vectors[0]) < 12:
        for index, row in enumerate(to_print):
            # formatting for header
            if index == 0 and header:
                print("{:>20s} {}".format(row[0],
                                          ' '.join(['{:>16s}'.format(x.strip()) for x in row[1:]])))
            # formatting for vals
            else:
                print("{:>20s} {}".format(row[0], ' '.join(['{:16.6f}'.format(x) for x in row[1:]])))

    f_name = create_out_fname(data_file, prefix='stats_', ext='.csv', base_dir=out_dir)
    list_to_csv(to_print, f_name)
    # list_to_file(to_print, f_name, delimiter=',')

    if make_hist:
        create_hists(data_file, header_row, hist_data, out_dir)
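
All of these examples call create_out_fname from a shared utility module whose implementation is not shown. The sketch below is only a rough reconstruction of what the call sites imply (prefix and suffix wrapped around the base name, an optional base_dir, and an optional replacement extension); the real utility may differ.

import os


def create_out_fname(src_file, prefix='', suffix='', base_dir=None, ext=None):
    # Sketch only: signature and behavior inferred from the call sites above;
    # the actual library function may accept more options or handle paths differently.
    if base_dir is None:
        base_dir = os.path.dirname(src_file)
    base_name, old_ext = os.path.splitext(os.path.basename(src_file))
    if ext is None:
        ext = old_ext
    return os.path.abspath(os.path.join(base_dir, prefix + base_name + suffix + ext))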