Exemplo n.º 1
0
def main(argv=None):
    """
    Entry point: parse the command line, then run `process_file` on the requested data file.

    :param argv: The command line arguments, passed straight to `parse_cmdline`.
    :return: The return code from `parse_cmdline` when parsing did not succeed (or produced no
        arguments); IO_ERROR or INVALID_DATA when processing raised the matching exception;
        GOOD_RET otherwise.
    """
    args, ret = parse_cmdline(argv)
    if args is None or ret != GOOD_RET:
        return ret

    try:
        # The buffer arrives as text from the command line; convert it only when it was given.
        buffer_amount = None
        if args.buffer is not None:
            try:
                buffer_amount = float(args.buffer)
            except ValueError:
                raise InvalidDataError("Input for buffer ({}) could not be converted to a float.".format(args.buffer))

        # Without an explicit output folder, write next to the input file.
        if args.out_dir is None:
            args.out_dir = os.path.dirname(args.file)

        bounds = None
        if args.min_max_file is not None:
            bounds = read_csv(args.min_max_file, quote_style=csv.QUOTE_NONNUMERIC)

        process_file(args.file, args.out_dir, buffer_amount, args.delimiter, bounds,
                     header=args.names, make_hist=args.histogram)
    except IOError as io_err:
        warning("Problems reading file:", io_err)
        return IO_ERROR
    except InvalidDataError as data_err:
        warning("Problems reading data:", data_err)
        return INVALID_DATA

    return GOOD_RET  # success
Exemplo n.º 2
0
def parse_cmdline(argv):
    """
    Parse the command line for the tool that adds text to the start and/or end of each line.

    :param argv: A list of arguments, or `None` to use ``sys.argv[1:]``.
    :return: A tuple of (parsed arguments, return code). The code is INPUT_ERROR when an
        ArgumentParserError was caught, and GOOD_RET otherwise.
    """
    cli_args = sys.argv[1:] if argv is None else argv

    parser = ThrowingArgumentParser(description='Reads in a file and adds a beginning and/or end to each line. '
                                                'The first argument must be the name of the file to be read.')
    # The only positional argument, which argparse therefore treats as required.
    parser.add_argument("file", help="The location of the file to be amended (required).")
    # Optional string-valued options: (short flag, long flag, help text, default value).
    for short_opt, long_opt, opt_help, opt_default in (
            ("-b", "--begin", "String to add to the beginning of a line.", DEF_BEGIN_STR),
            ("-e", "--end", "String to add to the end of a line.", DEF_END_STR),
            ("-n", "--new_name", "Name of amended file.", DEF_NEW_FNAME),
    ):
        parser.add_argument(short_opt, long_opt, help=opt_help, default=opt_default)

    args = None
    try:
        args = parser.parse_args(cli_args)
        # With both strings left at their defaults the output would just duplicate the input.
        nothing_to_add = args.begin == DEF_BEGIN_STR and args.end == DEF_END_STR
        if nothing_to_add:
            warning("Return file will be the same as the input, as no begin or end strings were passed. "
                    "Use -h for help.")
    except ArgumentParserError as parse_err:
        warning("Argument Parser Error:", parse_err)
        parser.print_help()
        return args, INPUT_ERROR

    return args, GOOD_RET
Exemplo n.º 3
0
def main(argv=None):
    """
    Entry point: parse the command line and, when that succeeds, amend the requested file.

    :param argv: The command line arguments, passed straight to `parse_cmdline`.
    :return: The return code from `parse_cmdline` when it is not GOOD_RET; IO_ERROR when
        `process_file` raised an IOError; GOOD_RET otherwise.
    """
    args, status = parse_cmdline(argv)
    if status == GOOD_RET:
        try:
            process_file(args.file, args.begin, args.end, args.new_name)
        except IOError as io_err:
            warning("Problems reading file:", io_err)
            status = IO_ERROR

    return status
Exemplo n.º 4
0
def parse_cmdline(argv):
    """
    Returns the parsed argument list and return code.

    :param argv: A list of arguments, or `None` for ``sys.argv[1:]``.
    :return: A tuple of (parsed arguments, return code). When argparse exits (SystemExit), the
        arguments are `None`; the code is GOOD_RET for exit code 0 (e.g. after showing help)
        and INPUT_ERROR otherwise.
    """
    if argv is None:
        argv = sys.argv[1:]

    # initialize the parser object:
    parser = argparse.ArgumentParser(description='Reads in space-separated columns and returns the min, max, avg, and '
                                                 'std dev for each column. It can optionally prepare histograms of '
                                                 'non-numerical data.')
    parser.add_argument("-f", "--file", help="The location of the file with the dimensions with one line per vector, "
                                             "space-separated, containing at least two lines. The default file is {}, "
                                             "located in the current directory".format(DEF_ARRAY_FILE),
                        default=DEF_ARRAY_FILE)

    # Kept as a string here; the caller is responsible for converting it to a float.
    # Note the trailing space before the literal continues: without it the help text ran
    # the words together ("dimensionin").
    parser.add_argument("-b", "--buffer", help="If specified, the program will output only the max dimension "
                                               "in each column plus an additional buffer amount (float).",
                        default=None)

    parser.add_argument("-d", "--delimiter", help="Delimiter. Default is '{}'".format(DEF_DELIMITER),
                        default=DEF_DELIMITER)

    parser.add_argument("-m", "--min_max_file", help="CSV file with column names (first line), "
                                                     "initial values (second line), min values "
                                                     "(third line), and max values (fourth line), used to further "
                                                     "analyze the data file.",
                        default=None)

    parser.add_argument("-n", "--names", help="File contains column names (header) (default is false). "
                                              "Note: lines beginning with '#' are ignored.",
                        action='store_true')

    parser.add_argument("-o", "--out_dir", help="Output folder. Default is the directory of the file to be processed.",
                        default=None)

    parser.add_argument("-s", "--histogram", help="Create histograms of the non-numerical data (default is false).",
                        action='store_true')

    args = None
    try:
        args = parser.parse_args(argv)
    except SystemExit as e:
        # argparse exits with code 0 after printing help; that is not an input error.
        if hasattr(e, 'code') and e.code == 0:
            return args, GOOD_RET
        warning(e)
        parser.print_help()
        return args, INPUT_ERROR

    return args, GOOD_RET
Exemplo n.º 5
0
def parse_cmdline(argv):
    """
    Parse the command line for the tool that adds a word to a Hunspell-type dictionary file.

    :param argv: A list of arguments, or `None` to use ``sys.argv[1:]``.
    :return: A tuple of (parsed arguments, return code). When parsing raises one of the caught
        exceptions the code is GOOD_RET if the exception carries ``code == 0`` and INPUT_ERROR
        otherwise.
    """
    cli_args = sys.argv[1:] if argv is None else argv

    parser = argparse.ArgumentParser(description='Adds a word to a Hunspell-type dictionary file.')

    # Help texts are assembled first so that the add_argument calls stay short.
    sfx_help = ("Suffix to be added after word (and after a '/'. For example, 'SM' will "
                "allow the word to be made plural and possessive. See hunspell "
                "documentation for more documentation on codes.")
    dict_help = ("Location of the dictionary file to be modified. "
                 "The default is: '{}'").format(DEF_DICT)

    parser.add_argument("-s", "--sfx", help=sfx_help, default='')
    parser.add_argument("-d", "--dict_loc", help=dict_help, default=DEF_DICT)
    parser.add_argument("new_word", help="The word to add to the dictionary", type=str)

    args = None
    try:
        args = parser.parse_args(cli_args)
    except (InvalidDataError, IOError, DuplicateOptionError, SystemExit) as err:
        # An exit code of 0 (e.g. after printing help) is not an input problem.
        clean_exit = hasattr(err, 'code') and err.code == 0
        if not clean_exit:
            warning(err)
            parser.print_help()
            return args, INPUT_ERROR
        return args, GOOD_RET

    return args, GOOD_RET
Exemplo n.º 6
0
def main(argv=None):
    """
    Entry point: parse the command line, load the lookup tables named in the configuration,
    and run `process_pdb`.

    :param argv: The command line arguments, passed straight to `parse_cmdline`.
    :return: The return code from `parse_cmdline` when parsing did not succeed (or produced no
        arguments); IO_ERROR on IOError; INVALID_DATA on InvalidDataError or ValueError;
        GOOD_RET otherwise.
    """
    args, ret = parse_cmdline(argv)
    # TODO: did not show the expected behavior when I didn't have a required cfg in the ini file
    if args is None or ret != GOOD_RET:
        return ret

    config = args.config

    try:
        # Load the three lookup tables, in this order, before touching the pdb file.
        atom_reorder_map = read_csv_dict(config[ATOM_REORDER_FILE])
        mol_renum_map = read_csv_dict(config[MOL_RENUM_FILE], one_to_one=False)
        type_to_element = create_element_dict(config[ELEMENT_DICT_FILE])
        process_pdb(config, atom_reorder_map, mol_renum_map, type_to_element)
    except IOError as io_err:
        warning("Problems reading file:", io_err)
        return IO_ERROR
    except (InvalidDataError, ValueError) as data_err:
        warning("Problems with input:", data_err)
        return INVALID_DATA

    return GOOD_RET  # success
Exemplo n.º 7
0
def main(argv=None):
    """
    Entry point: parse the command line and fill the template via `make_tpl`.

    :param argv: The command line arguments.
    :return: The return code from `parse_cmdline` when parsing did not succeed (or produced no
        arguments); IO_ERROR when `make_tpl` raised any of the handled exceptions;
        GOOD_RET otherwise.
    """
    args, ret = parse_cmdline(argv)
    if args is None or ret != GOOD_RET:
        return ret

    config = args.config
    status = GOOD_RET

    try:
        make_tpl(config, config[TPL_FNAME], config[FILLED_TPL_FNAME])
    except (TemplateNotReadableError, IOError) as read_err:
        warning("Problems reading file: {}".format(read_err))
        status = IO_ERROR
    except (KeyError, InvalidDataError) as data_err:
        # Note: missing keys and invalid data are also reported with the IO_ERROR code.
        warning(data_err)
        status = IO_ERROR

    return status
Exemplo n.º 8
0
def parse_cmdline(argv):
    """
    Parse the command line for the pdb-editing tool; its only option is the configuration file.

    The ``--config`` value is converted with `read_cfg` (passed as the argparse ``type``), so
    errors from reading the configuration surface while parsing and are handled here.

    :param argv: A list of arguments, or `None` to use ``sys.argv[1:]``.
    :return: A tuple of (parsed arguments, return code): IO_ERROR after an IOError; for
        KeyError, InvalidDataError or SystemExit, GOOD_RET if the exception carries
        ``code == 0`` and INPUT_ERROR otherwise; GOOD_RET when parsing succeeds.
    """
    cli_args = sys.argv[1:] if argv is None else argv

    parser = argparse.ArgumentParser(
        description='Creates a new version of a pdb file. Atoms will be numbered '
                    'starting from one. Options include renumbering molecules.')

    config_help = ("The location of the configuration file in ini format. "
                   "The default file name is {}, located in the "
                   "base directory where the program as run.").format(DEF_CFG_FILE)
    parser.add_argument("-c", "--config", help=config_help, default=DEF_CFG_FILE, type=read_cfg)

    args = None
    try:
        args = parser.parse_args(cli_args)
    except IOError as io_err:
        warning(io_err)
        parser.print_help()
        return args, IO_ERROR
    except (KeyError, InvalidDataError, SystemExit) as err:
        # An exit code of 0 (e.g. after printing help) is not an input problem.
        clean_exit = hasattr(err, 'code') and err.code == 0
        if not clean_exit:
            warning("Input data missing:", err)
            parser.print_help()
            return args, INPUT_ERROR
        return args, GOOD_RET

    return args, GOOD_RET
Exemplo n.º 9
0
def process_pdb(cfg, atom_num_dict, mol_num_dict, element_dict):
    """
    Read the pdb file named by ``cfg[PDB_FILE]``, renumber/reorder its atoms as configured, and
    write the result with `print_pdb`.

    Lines are split by their leading record type: 'REMARK' and 'CRYST1' lines go to the head
    section, 'ATOM  ' lines are parsed by the fixed column boundaries stored in ``cfg``, and
    every other non-empty line goes to the tail section. Atoms are renumbered sequentially
    (ids above 99999 and molecule counts above 9999 are written in hex).

    When ``cfg[RESID_QMMM]`` is non-empty, two more files are written in ``cfg[OUT_BASE_DIR]``:
    'amino_id.dat' (via `print_qm_kind` and `print_qm_links`) and 'vmd_protein_atoms.dat'
    (via `list_to_csv`).

    :param cfg: configuration mapping, indexed by the module's configuration key constants.
    :param atom_num_dict: maps the sequential atom count to the atom id to use instead;
        when non-empty, atoms are written sorted by the resulting id.
    :param mol_num_dict: maps a molecule number read from the file to its replacement.
    :param element_dict: maps a stripped atom type to its element.
    :raises InvalidDataError: if an atom in a ``cfg[RESID_QMMM]`` residue has a type that is
        not in ``element_dict``.
    :raises ValueError: if a molecule number or coordinate field cannot be converted.
    """
    pdb_loc = cfg[PDB_FILE]
    pdb_data = {HEAD_CONTENT: [], ATOMS_CONTENT: [], TAIL_CONTENT: []}
    # to allow warning to be printed once and only once
    missing_types = []
    qmmm_elem_id_dict = {}
    ca_res_atom_id_dict = {}
    cb_res_atom_id_dict = {}
    atoms_for_vmd = []

    with open(pdb_loc) as f:
        wat_count = 0
        atom_count = 0
        mol_count = 1

        current_mol = None
        last_mol_num = None
        # (numeric atom id, line_struct) pairs; the numeric id is kept so that reordering can
        # sort on it rather than on the formatted (possibly hex) text
        atoms_content = []

        for line in f:
            line = line.strip()
            if len(line) == 0:
                continue
            line_head = line[:cfg[PDB_LINE_TYPE_LAST_CHAR]]
            # head_content to contain Everything before 'Atoms' section
            # also capture the number of atoms
            if line_head == 'REMARK' or line_head == 'CRYST1':
                pdb_data[HEAD_CONTENT].append(line)

            # atoms_content to contain everything but the xyz
            elif line_head == 'ATOM  ':

                # My template PDB has ***** after atom_count 99999. Thus, I'm renumbering. Otherwise, this this:
                # atom_num = line[cfg[PDB_LINE_TYPE_LAST_CHAR]:cfg[PDB_ATOM_NUM_LAST_CHAR]]
                # For renumbering, making sure prints in the correct format, including num of characters:
                atom_count += 1

                # For reordering atoms: use the mapped id when there is one
                atom_id = atom_num_dict.get(atom_count, atom_count)

                if atom_id > 99999:
                    atom_num = format(atom_id, 'x')
                    if len(atom_num) > 5:
                        warning(
                            "Hex representation of {} is {}, which is greater than 5 characters. This "
                            "will affect the PDB output formatting.".format(
                                atom_id, atom_num))
                else:
                    atom_num = '{:5d}'.format(atom_id)

                atom_type = line[
                    cfg[PDB_ATOM_NUM_LAST_CHAR]:cfg[PDB_ATOM_TYPE_LAST_CHAR]]
                atom_type_stripped = atom_type.strip()
                res_type = line[
                    cfg[PDB_ATOM_TYPE_LAST_CHAR]:cfg[PDB_RES_TYPE_LAST_CHAR]]
                mol_num = int(line[
                    cfg[PDB_RES_TYPE_LAST_CHAR]:cfg[PDB_MOL_NUM_LAST_CHAR]])
                pdb_x = float(
                    line[cfg[PDB_MOL_NUM_LAST_CHAR]:cfg[PDB_X_LAST_CHAR]])
                pdb_y = float(line[cfg[PDB_X_LAST_CHAR]:cfg[PDB_Y_LAST_CHAR]])
                pdb_z = float(line[cfg[PDB_Y_LAST_CHAR]:cfg[PDB_Z_LAST_CHAR]])
                occ_t = line[cfg[PDB_Z_LAST_CHAR]:cfg[PDB_LAST_T_CHAR]]
                element = line[cfg[PDB_LAST_T_CHAR]:cfg[PDB_LAST_ELEM_CHAR]]
                last_cols = line[cfg[PDB_LAST_ELEM_CHAR]:]

                # For user-specified changing of molecule number
                if mol_num in mol_num_dict:
                    mol_num = mol_num_dict[mol_num]

                # If doing water molecule checking...
                # Waters are expected as consecutive OH2, H1, H2 triples sharing one molecule number.
                if cfg[FIRST_WAT_ID] <= atom_count <= cfg[LAST_WAT_ID]:
                    if (wat_count % 3) == 0:
                        current_mol = mol_num
                        if atom_type != '  OH2 ':
                            warning(
                                'Expected an OH2 atom to be the first atom of a water molecule. '
                                'Check line: {}'.format(line))
                    else:
                        if current_mol != mol_num:
                            warning('Water not in order on line:', line)
                        if (wat_count % 3) == 1:
                            if atom_type != '  H1  ':
                                warning(
                                    'Expected an H1 atom to be the second atom of a water molecule. '
                                    'Check line: {}'.format(line))
                        else:
                            if atom_type != '  H2  ':
                                warning(
                                    'Expected an H2 atom to be the third atom of a water molecule. '
                                    'Check line: {}'.format(line))
                    wat_count += 1

                if mol_num in cfg[
                        RESID_QMMM] and atom_type not in SKIP_ATOM_TYPES:
                    if atom_type == C_ALPHA:
                        ca_res_atom_id_dict[mol_num] = atom_id
                    else:
                        if atom_type == C_BETA:
                            cb_res_atom_id_dict[mol_num] = atom_id
                        if atom_type_stripped in element_dict:
                            element = element_dict[atom_type_stripped]
                        else:
                            raise InvalidDataError(
                                "Did not find atom type '{}' in the element dictionary. Please "
                                "provide a new atom type, element dictionary (using keyword {} "
                                "in the configuration file) that includes all atom types in the "
                                "residues identified with the '{}' key."
                                "".format(atom_type_stripped,
                                          ELEMENT_DICT_FILE, RESID_QMMM))
                        # group the atom ids by element
                        qmmm_elem_id_dict.setdefault(element, []).append(atom_id)
                        # presumably shifted by one because the consumer indexes atoms from zero
                        atoms_for_vmd.append(atom_id - 1)

                if cfg[ADD_ELEMENTS] and atom_count <= cfg[LAST_ADD_ELEM]:
                    if atom_type_stripped in element_dict:
                        element = element_dict[atom_type_stripped]
                    else:
                        if atom_type_stripped not in missing_types:
                            warning(
                                "Please add atom type '{}' to dictionary of elements. Will not write/overwrite "
                                "element type in the pdb output.".format(
                                    atom_type_stripped))
                            missing_types.append(atom_type_stripped)

                # For numbering molecules from 1 to end
                if cfg[RENUM_MOL]:
                    if last_mol_num is None:
                        last_mol_num = mol_num

                    if mol_num != last_mol_num:
                        last_mol_num = mol_num
                        mol_count += 1
                        if mol_count == 10000:
                            warning(
                                "Molecule numbers greater than 9999 will be printed in hex"
                            )

                    # Due to PDB format constraints, need to print in hex above 9999 molecules.
                    if mol_count > 9999:
                        mol_num = format(mol_count, 'x')
                        if len(mol_num) > 4:
                            # report the molecule values (not the atom values) that overflowed
                            warning(
                                "Hex representation of {} is {}, which is greater than 4 characters. This "
                                "will affect the PDB output formatting.".
                                format(mol_count, mol_num))
                    else:
                        mol_num = '{:4d}'.format(mol_count)

                line_struct = [
                    line_head, atom_num, atom_type, res_type, mol_num, pdb_x,
                    pdb_y, pdb_z, occ_t, element, last_cols
                ]
                atoms_content.append((atom_id, line_struct))

            # tail_content to contain everything after the 'Atoms' section
            else:
                pdb_data[TAIL_CONTENT].append(line)

    # Only sort if there is renumbering. Sort on the numeric id: the formatted text mixes
    # decimal and hex, which does not order correctly as strings once ids exceed 99999.
    if len(atom_num_dict) > 0:
        atoms_content.sort(key=lambda entry: entry[0])
    pdb_data[ATOMS_CONTENT] = [line_struct for _, line_struct in atoms_content]

    if cfg[PDB_NEW_FILE] is None:
        f_name = create_out_fname(cfg[PDB_FILE],
                                  suffix="_new",
                                  base_dir=cfg[OUT_BASE_DIR])
    else:
        f_name = create_out_fname(cfg[PDB_NEW_FILE],
                                  base_dir=cfg[OUT_BASE_DIR])
    print_pdb(pdb_data[HEAD_CONTENT], pdb_data[ATOMS_CONTENT],
              pdb_data[TAIL_CONTENT], f_name, cfg[PDB_FORMAT])

    if len(cfg[RESID_QMMM]) > 0:
        f_name = create_out_fname('amino_id.dat', base_dir=cfg[OUT_BASE_DIR])
        # the first write creates/truncates the file; every later write appends to it
        print_mode = "w"
        for elem in sorted(qmmm_elem_id_dict):
            print_qm_kind(qmmm_elem_id_dict[elem],
                          elem,
                          f_name,
                          mode=print_mode)
            print_mode = 'a'
        print_qm_links(ca_res_atom_id_dict,
                       cb_res_atom_id_dict,
                       f_name,
                       mode=print_mode)
        f_name = create_out_fname('vmd_protein_atoms.dat',
                                  base_dir=cfg[OUT_BASE_DIR])
        list_to_csv([atoms_for_vmd], f_name, delimiter=' ')
Exemplo n.º 10
0
def parse_cmdline(argv=None):
    """
    Parse the command line for the template-filling tool and validate the resulting configuration.

    After parsing, the template file named by the configuration must exist, and a filled-template
    file name must be available; one given on the command line overrides the configuration value.

    :param argv: A list of arguments, or `None` for ``sys.argv[1:]``.
    :return: A tuple of (parsed arguments, return code). When one of the caught exceptions is
        raised, the code is GOOD_RET if the exception carries ``code == 0`` and INPUT_ERROR
        otherwise; it is GOOD_RET when parsing and validation succeed.
    """
    cli_args = sys.argv[1:] if argv is None else argv

    parser = argparse.ArgumentParser(description='Fills in a template file with parameter values.')

    config_help = (
        "The location of the configuration file in ini format. "
        "The default file name is {}, located in the "
        "base directory where the program as run. "
        "Note: 1) a [{}] section is required. 2) optional sections are [{}] and "
        "[{}], which allows key values to be calculated based on other tpl "
        "values. 3) Equations will be evaluated in the order provided, so if "
        "an equation depends on the value computed from another equation, list "
        "the dependent equation after its inputs. 4) Multiple values and "
        "equations may be listed for any keys. In that case, the program will "
        "create multiple output files. If a static '{}' is provided, the "
        "file will be overwritten, leaving only one filled file at the end. "
        "The '{}' can include keys (i.e. filled_tpl_name = {{key1}}.txt), so "
        "if multiple values are listed for key1 (i.e. key1 = A,B,C), multiple "
        "output files will be created (A.txt, B.txt, C.txt)."
        ""
    ).format(DEF_CFG_FILE, MAIN_SEC, TPL_VALS_SEC, TPL_EQS_SEC, FILLED_TPL_FNAME, FILLED_TPL_FNAME)
    parser.add_argument("-c", "--config", help=config_help, default=DEF_CFG_FILE, type=read_cfg)

    filled_name_help = ("File name for new file to be created by filling the template "
                        "file. It can also be specified in the configuration file. "
                        "If specified in both places, the command line option will "
                        "take precedence.")
    parser.add_argument("-f", "--filled_tpl_name", help=filled_name_help, default=None)

    args = None
    try:
        args = parser.parse_args(cli_args)
        config = args.config

        # The template must exist; the message depends on whether the default name was used.
        tpl_path = config[TPL_FNAME]
        if not os.path.isfile(tpl_path):
            if tpl_path == DEF_TPL:
                error_message = ("Check input for the configuration key '{}'; "
                                 "could not find the default template file: {}")
            else:
                error_message = ("Could not find the template file specified with "
                                 "the configuration key '{}': {}")
            raise IOError(error_message.format(TPL_FNAME, tpl_path))

        # A name given on the command line takes precedence over the configuration file.
        if args.filled_tpl_name is not None:
            config[FILLED_TPL_FNAME] = args.filled_tpl_name
        if config[FILLED_TPL_FNAME] is None:
            raise InvalidDataError(
                "Missing required key '{}', which can be specified in the "
                "required either in the command line for configuration file."
                "".format(FILLED_TPL_FNAME))
    except (KeyError, InvalidDataError, IOError, SystemExit) as err:
        # An exit code of 0 (e.g. after printing help) is not an input problem.
        clean_exit = hasattr(err, 'code') and err.code == 0
        if not clean_exit:
            warning(err)
            parser.print_help()
            return args, INPUT_ERROR
        return args, GOOD_RET

    return args, GOOD_RET
Exemplo n.º 11
0
def process_file(data_file, out_dir, len_buffer, delimiter, min_max_dict, header=False, make_hist=False):
    """
    Compute per-column statistics for a delimited data file, print them (for narrow files) and
    save them to a 'stats_'-prefixed csv file in ``out_dir``.

    The statistics are the min, max, mean, sample standard deviation (ddof=1) and the 4.55,
    31.73, 50, 68.27 and 95.45 percentiles of each column.

    :param data_file: location of the file to read with `np_float_array_from_file`.
    :param out_dir: base directory for the output file(s).
    :param len_buffer: if not None, an extra row with each column's max plus this amount is added.
    :param delimiter: column delimiter, passed to `np_float_array_from_file`.
    :param min_max_dict: if not None, an indexable whose items 0, 1 and 2 map column names to the
        initial value, lower bound and upper bound; used to add comparison rows and to warn
        about values outside the bounds.
    :param header: whether the data file's first row holds column names.
    :param make_hist: whether to gather histogram data and pass it to `create_hists`.
    :raises InvalidDataError: if the data file cannot be read as data (re-raised with a hint
        about the header and delimiter options).
    """
    try:
        dim_vectors, header_row, hist_data = np_float_array_from_file(data_file, delimiter=delimiter,
                                                                      header=header, gather_hist=make_hist)

    except InvalidDataError as e:
        raise InvalidDataError("{}\n"
                               "Run program with '-h' to see options, such as specifying header row (-n) "
                               "and/or delimiter (-d)".format(e))

    if header:
        to_print = [[''] + header_row]
    else:
        to_print = []

    max_vector = dim_vectors.max(axis=0)
    min_vector = dim_vectors.min(axis=0)
    avg_vector = dim_vectors.mean(axis=0)
    med_vector = np.percentile(dim_vectors, 50, axis=0)

    # noinspection PyTypeChecker
    to_print += [['Min values:'] + min_vector.tolist(),
                 ['Max values:'] + max_vector.tolist(),
                 ['Avg values:'] + avg_vector.tolist(),
                 ['Std dev:'] + dim_vectors.std(axis=0, ddof=1).tolist(),
                 ['5% percentile:'] + np.percentile(dim_vectors, 4.55, axis=0).tolist(),
                 ['32% percentile:'] + np.percentile(dim_vectors, 31.73, axis=0).tolist(),
                 ['50% percentile:'] + med_vector.tolist(),
                 ['68% percentile:'] + np.percentile(dim_vectors, 68.27, axis=0).tolist(),
                 ['95% percentile:'] + np.percentile(dim_vectors, 95.45, axis=0).tolist(),
                 ]
    if len_buffer is not None:
        to_print.append(['Max plus {} buffer:'.format(len_buffer)] + (max_vector + len_buffer).tolist())

    if min_max_dict is not None:
        # Columns without an entry in min_max_dict keep NaN in the comparison rows.
        nan_list = [np.nan] * len(header_row)
        avg_ini_diff = ['Avg % Diff:'] + nan_list
        med_ini_diff = ['Med % Diff:'] + nan_list
        med_is_min = ['Median is Min:'] + nan_list
        med_is_max = ['Median is Max:'] + nan_list
        # The loop variable must not be called `header`: that would overwrite the boolean
        # parameter, which is still needed when choosing the print format below.
        for col_num, col_name in enumerate(to_print[0]):
            if col_name in min_max_dict[0]:
                ini_val = min_max_dict[0][col_name]
                low_val = min_max_dict[1][col_name]
                upp_val = min_max_dict[2][col_name]
                # to_print rows start with a label, so data column indices are one lower
                avg_val = avg_vector[col_num - 1]
                med_val = med_vector[col_num - 1]
                min_val = min_vector[col_num - 1]
                max_val = max_vector[col_num - 1]
                # relative tolerances, never smaller than TOL itself
                min_tol = max(TOL * max(abs(min_val), abs(low_val)), TOL)
                med_tol = max(TOL * abs(med_val), TOL)
                max_tol = max(TOL * max(abs(max_val), abs(upp_val)), TOL)
                if (low_val - min_val) > min_tol:
                    warning("Minimum value found for header '{}' ({}) is less than lower bound ({})"
                            "".format(col_name, min_val, low_val))
                if (max_val - upp_val) > max_tol:
                    warning("Maximum value found for header '{}' ({}) is greater than upper bound ({})"
                            "".format(col_name, max_val, upp_val))
                avg_ini_diff[col_num] = (avg_val - ini_val) / ini_val * 100
                med_ini_diff[col_num] = (med_val - ini_val) / ini_val * 100
                # flag (1/0) whether the median sits on the lower / upper bound
                if abs(med_val - low_val) > med_tol:
                    med_is_min[col_num] = 0
                else:
                    med_is_min[col_num] = 1
                if abs(med_val - upp_val) > med_tol:
                    med_is_max[col_num] = 0
                else:
                    med_is_max[col_num] = 1
        for min_max_list in [avg_ini_diff, med_ini_diff, med_is_min, med_is_max]:
            to_print.append(min_max_list)

    # Printing to standard out: do not print quotes around strings because using csv writer
    # Only files with fewer than 12 columns are echoed to standard out.
    if len(dim_vectors[0]) < 12:
        for index, row in enumerate(to_print):
            # formatting for header
            if index == 0 and header:
                print("{:>20s} {}".format(row[0],
                                          ' '.join(['{:>16s}'.format(x.strip()) for x in row[1:]])))
            # formatting for vals
            else:
                print("{:>20s} {}".format(row[0], ' '.join(['{:16.6f}'.format(x) for x in row[1:]])))

    f_name = create_out_fname(data_file, prefix='stats_', ext='.csv', base_dir=out_dir)
    list_to_csv(to_print, f_name)

    if make_hist:
        create_hists(data_file, header_row, hist_data, out_dir)