Example #1
    def testBasicUse(self):
        silent_remove(SUB_SUB_DIR, dir_with_files=True)
        make_dir(SUB_SUB_DIR)
        fname_after_run = os.path.join(SUB_SUB_DIR, 'for_hartree',
                                       'pet_mono_637_tzvp.log')
        silent_remove(fname_after_run)

        file1_to_copy = os.path.join(SUB_DATA_DIR, 'pet_mono_637_tzvp.tpl')
        temp_fname1 = os.path.join(SUB_SUB_DIR, 'pet_mono_637_tzvp.log')
        file2_to_copy = os.path.join(SUB_DATA_DIR, 'me2propprpnt_7.log')
        temp_fname2 = os.path.join(SUB_SUB_DIR, 'me2propprpnt_7.log')
        file3_to_copy = os.path.join(SUB_DATA_DIR, 'pet_mono_671_tzvp.log')
        temp_fname3 = os.path.join(SUB_SUB_DIR, 'pet_mono_671_tzvp.log')

        good_output = "The following files completed normally:\n" \
                      "    tests/test_data/check_gauss/temp_dir/pet_mono_637_tzvp.log\n" \
                      "The following files may have failed:\n" \
                      "    tests/test_data/check_gauss/temp_dir/me2propprpnt_7.log\n" \
                      "The following files may still be running:\n" \
                      "    tests/test_data/check_gauss/temp_dir/pet_mono_671_tzvp.log\n"

        try:
            copyfile(file1_to_copy, temp_fname1)
            copyfile(file2_to_copy, temp_fname2)
            copyfile(file3_to_copy, temp_fname3)
            test_input = ["-d", SUB_SUB_DIR]
            # main(test_input)  # uncomment to run without capturing stdout when debugging
            with capture_stdout(main, test_input) as output:
                self.assertTrue(good_output in output)
            self.assertTrue(os.path.isfile(fname_after_run))
        finally:
            silent_remove(SUB_SUB_DIR,
                          dir_with_files=True,
                          disable=DISABLE_REMOVE)
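
A minimal sketch of the capture_stdout helper the test relies on (an assumption about the shared test utility, not necessarily its actual implementation): it runs a callable, captures everything it prints, and yields the captured text.

import sys
from contextlib import contextmanager
from io import StringIO

@contextmanager
def capture_stdout(command, *arguments):
    # swap sys.stdout for an in-memory buffer, run the command, then yield what it printed
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        command(*arguments)
        yield sys.stdout.getvalue()
    finally:
        sys.stdout = old_stdout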
Example #2
def main(argv=None):
    args, ret = parse_cmdline(argv)
    if ret != GOOD_RET or args is None:
        return ret

    try:
        # start with copies of global variable dicts; then only the copies will be altered
        if args.file_name:
            mw_formula_dict = MW_FORM_DICT.copy()
            mw_deprot_formula_dict = MW_DEPROT_FORM_DICT.copy()
            mw_prot_formula_dict = MW_PROT_FORM_DICT.copy()
            form_smi_dict = deepcopy(FORMULA_SMI_DICT)
            form_dbe_dict = FORMULA_DBE_DICT.copy()
            smi_name_dict = deepcopy(SMI_NAME_DICT)
            smi_source_dict = deepcopy(SMI_SOURCE_DICT)

            number_additions = process_input_file(
                args.file_name, mw_formula_dict, mw_deprot_formula_dict,
                mw_prot_formula_dict, form_smi_dict, form_dbe_dict,
                smi_name_dict, smi_source_dict)

            # Reading complete, now output
            if number_additions:
                pretty_print_dicts(mw_formula_dict, mw_deprot_formula_dict,
                                   mw_prot_formula_dict, form_smi_dict,
                                   form_dbe_dict, smi_name_dict,
                                   smi_source_dict)

        if args.image_library:
            if args.mw_list:
                mw_keys = [x.strip() for x in args.mw_list.split(",")]
            else:
                mw_keys = MW_FORM_DICT.keys()
            if args.out_dir:
                make_dir(args.out_dir)
            image_grid_mult_mw(mw_keys,
                               MW_FORM_DICT,
                               FORMULA_SMI_DICT,
                               out_dir=args.out_dir)

    except IOError as e:
        warning(e)
        return IO_ERROR
    except InvalidDataError as e:
        warning(e)
        return INVALID_DATA

    return GOOD_RET  # success
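
Because main returns a status code (GOOD_RET on success, IO_ERROR or INVALID_DATA on failure), a typical entry point simply forwards that code to the shell; a minimal sketch:

import sys

if __name__ == "__main__":
    status = main()  # argv defaults to None, so parse_cmdline falls back to sys.argv[1:]
    sys.exit(status)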
Example #3
def check_if_files_to_be_saved(cfg):
    """
    Evaluate input for requests to save output and check for valid specified locations
    :param cfg: dict of configuration values
    :return: if the cfg indicates that files should be created, an updated cfg dict; raises errors if invalid
              data is encountered
    """
    if cfg[OUT_FORMAT_LIST]:
        # remove any periods to aid comparison; might as well also change comma to space and then split on just space
        out_format_list = cfg[OUT_FORMAT_LIST].replace(".", " ").replace(",", " ")
        format_set = set(out_format_list.split())
    else:
        format_set = set()

    if cfg[BASENAME] and (cfg[BASENAME] != DEF_BASENAME):
        # If cfg[BASENAME] is more than just a base name, strip any directory or extension and save each in its proper spot
        out_path, base_name = os.path.split(cfg[BASENAME])
        if out_path and cfg[OUT_DIR]:
            cfg[OUT_DIR] = os.path.join(cfg[OUT_DIR], out_path)
        elif out_path:
            cfg[OUT_DIR] = out_path
        base, ext = os.path.splitext(base_name)
        cfg[BASENAME] = base
        format_set.add(ext.replace(".", ""))

    if len(format_set) > 0:
        for format_type in format_set:
            if format_type in OUT_TYPE_LIST:
                cfg[SAVE_FILES] = True
                cfg[format_type] = True
            else:
                raise InvalidDataError(f"Invalid extension provided: '{format_type}'. The currently supported types "
                                       f"are: '{OUT_TYPE_STR}'")
    if cfg[PLOT_BONDS]:
        cfg[SAVE_FILES] = True

    # create out_dir (if it does not already exist) only if we will actually need it
    if cfg[SAVE_FILES] and cfg[OUT_DIR]:
        make_dir(cfg[OUT_DIR])
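
A hedged usage sketch of the base-name splitting above (values are illustrative, and "png" is assumed to appear in OUT_TYPE_LIST): a BASENAME of "results/run1.png" moves "results" into OUT_DIR, keeps "run1" as the base name, and registers "png" as a requested format.

cfg = {OUT_FORMAT_LIST: None, BASENAME: "results/run1.png", OUT_DIR: None,
       PLOT_BONDS: False, SAVE_FILES: False}
check_if_files_to_be_saved(cfg)
# afterwards: cfg[OUT_DIR] == "results", cfg[BASENAME] == "run1",
#             cfg["png"] is True, cfg[SAVE_FILES] is True,
#             and the "results" directory has been created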
Example #4
def collect_check_process_file_list(args):
    # find files to process, allowing any combination of a file name, a list of file names, searching dir...

    # first look for mzML files
    process_file_list = check_for_files(args.file_name, args.list_file, search_pattern=MZML_EXT,
                                        search_dir=args.directory, search_sub_dir=args.sub_dir_flag,
                                        warn_if_no_matches=False)
    # if no mzml files, look for csv files
    if len(process_file_list) == 0:
        process_file_list = check_for_files(args.file_name, args.list_file, search_pattern=CSV_EXT,
                                            search_dir=args.directory, search_sub_dir=args.sub_dir_flag,
                                            warn_if_no_matches=False)

    # Now check that we didn't accidentally pick up program output or a wrong file type
    filtered_file_list = []
    for fname in process_file_list:
        fname_lower = os.path.basename(fname).lower()
        if not (fname_lower.endswith(MZML_EXT.lower()) or fname_lower.endswith(CSV_EXT.lower())):
            warning(f"The expected file extensions are '{MZML_EXT}' and '{CSV_EXT}'.\n"
                    f"    Encountered file: {os.path.relpath(fname)}.\n    Skipping file.")
            continue
        if not (fnmatch.fnmatch(fname_lower, "*matched.csv") or fnmatch.fnmatch(fname_lower, "*matched_ext.csv")):
            filtered_file_list.append(fname)
    if not filtered_file_list and args.blank_file_name is None:
        raise InvalidDataError("No files found to process. Exiting program.")
    process_file_list = filtered_file_list

    # Make sure the blank file is not processed twice by removing it from this list if present
    if args.blank_file_name:
        if args.blank_file_name in process_file_list:
            process_file_list.remove(args.blank_file_name)
        fname_lower = os.path.basename(args.blank_file_name).lower()
        if not (fname_lower.endswith(MZML_EXT.lower()) or fname_lower.endswith(CSV_EXT.lower())):
            raise InvalidDataError(f"The expected file extensions for MS output are '{MZML_EXT}' and '{CSV_EXT}'.\n"
                                   f"    Specified blank file: {os.path.relpath(args.blank_file_name)}\n"
                                   f"    Exiting program.")

    # Now check names for protonated/deprotonated flags
    for fname in process_file_list:
        fname_lower = os.path.basename(fname).lower()
        pos_match = fnmatch.fnmatch(fname_lower, "*+*")
        neg_match = fnmatch.fnmatch(fname_lower, "*-*")
        if pos_match and neg_match:
            raise InvalidDataError(f"Found both a '+' and a '-' in the file name: {os.path.relpath(fname)}\n"
                                   f"    Only one of these characters can appear in a file name, as this "
                                   f"program uses these characters to determine if matches should be attempted for "
                                   f"protonated ('+') or deprotonated ('-') ion MWs.")
        if not (pos_match or neg_match) and not args.quit_after_mzml_to_csv:
            if args.omit_mol_ion:
                raise InvalidDataError(f"The '-x'/'--omit_mol_ion' option was selection, although there is no '+' nor "
                                       f"'-' in the file name: {os.path.relpath(fname)}\n    Thus, no matching type "
                                       f"has been selected. Program exiting.")
            else:
                warning(f"Since neither '+' nor '-' appear in the file name: {os.path.relpath(fname)}\n    "
                        f"Only matches to the molecular ion will be reported.")

    # While setting up, also make the output directory if it does not yet exist, so there is a place to save a CSV file
    if args.out_dir:
        make_dir(args.out_dir)

    return process_file_list
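
A hedged usage sketch: the function expects an argparse-style namespace carrying exactly the attributes referenced above (the values here are illustrative).

from argparse import Namespace

args = Namespace(file_name=None, list_file=None, directory="ms_runs",
                 sub_dir_flag=True, blank_file_name=None, out_dir=None,
                 quit_after_mzml_to_csv=False, omit_mol_ion=False)
process_file_list = collect_check_process_file_list(args)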
Example #5
def parse_cmdline(argv):
    """
    Returns the parsed argument list and return code.
    `argv` is a list of arguments, or `None` for ``sys.argv[1:]``.
    """
    if argv is None:
        argv = sys.argv[1:]

    # initialize the parser object:
    parser = argparse.ArgumentParser(
        description=
        'Creates Gaussian input files from SMILES strings, given a template '
        'input file.')
    parser.add_argument(
        "-t",
        "--" + GAU_TPL_FILE,
        help=f"Required: the location of the Gaussian input template file. "
        f"This file must contain the string '{REQ_STR}' in the "
        f"location where the atom type and coordinates should be "
        f"added.",
        metavar='PATH',
        default=None)
    parser.add_argument(
        "-l",
        "--" + LIST_FNAME,
        help=f"Option to specify a file with a list of SMILES strings "
        f"(one file per line on the list).",
        metavar='PATH',
        default=None)
    parser.add_argument(
        "-m",
        "--" + MAX_CONFS,
        help=f"Option to specify the maximum number of conformations to be "
        f"created for each SMILES string. The default value is "
        f"{DEF_MAX_CONFS}.",
        metavar='INT',
        default=DEF_MAX_CONFS,
        type=int)
    parser.add_argument(
        "-o",
        "--" + OUT_DIR,
        help="Directory where created files should be saved. The default "
        "is the working directory. If a directory is specified and does not "
        "yet exist, it will be created.",
        metavar='PATH',
        default=None)
    parser.add_argument(
        "-s",
        "--" + SMI,
        help=
        "Option to specify a SMILES string. Multiple strings can be separated "
        "by ','; if '.' is present the two molecules/fragments will be "
        "considered together.",
        metavar='STR',
        default=None)
    args = None
    try:
        args = parser.parse_args(argv)
        if args.gau_tpl_file is None:
            raise InvalidDataError(
                f"A template Gaussian input file is required to be specified with the "
                f"'-t'/'--{GAU_TPL_FILE}' option.")
        if not os.path.isfile(args.gau_tpl_file):
            raise InvalidDataError(
                f"Could not locate the specified Gaussian input file.")

        if args.list_file is None:
            args.list_file = []
        else:
            args.list_file = file_rows_to_list(args.list_file)
        if args.smiles is not None:
            smi_list = [smi.strip() for smi in args.smiles.split(",")]
            args.list_file.extend(smi_list)
        if len(args.list_file) == 0:
            raise InvalidDataError(
                f"No SMILES input has been specified. Specify a single SMILES string with the "
                f"'-s'/'--{SMI}' option or a files with a list of SMILES strings (one per line) "
                f"with the '-l'/'--{LIST_FNAME}' option.")
        args.list_file = list(set(args.list_file))

        if args.out_dir is not None:
            make_dir(args.out_dir)
    except (IOError, KeyError, InvalidDataError, MissingSectionHeaderError,
            SystemExit) as e:
        if hasattr(e, 'code') and e.code == 0:
            return args, GOOD_RET
        warning(e)
        parser.print_help()
        return args, INPUT_ERROR

    return args, GOOD_RET
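
A hedged usage sketch of the (args, return code) contract (the template file name is illustrative and must exist for validation to pass):

args, ret = parse_cmdline(["-t", "gau.tpl", "-s", "CCO,CC(=O)O"])
if ret == GOOD_RET:
    print(args.list_file)  # de-duplicated list of SMILES strings to process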
def write_output(fname, ms_level, num_matches, short_output_list, long_output_list, matched_formulas,
                 combined_out_fname, omit_mol_ion_flag, deprot_flag, prot_flag, write_mode, out_dir):
    """
    Print output from matching M/Z to lignin molecule library
    :param fname: location of input file processed
    :param ms_level: int, the MS level of the output, used in the output file name so that multiple-channel input yields separate files
    :param num_matches: the number of matches made between input M/Z and MW in lignin library
    :param short_output_list: list of dicts of summary matching data (one list per match)
    :param long_output_list: list of dicts of extended matching data (sorted by MZ values)
    :param matched_formulas: set of formula names that were matched to M/Z values
    :param combined_out_fname: None or string if output from multiple files is to be written to one file
    :param omit_mol_ion_flag: boolean to indicate if molecular ion matches were not attempted (True) or sought (False)
    :param deprot_flag: boolean to indicate if matches were sought for deprotonated ion MWs
    :param prot_flag: boolean to indicate if matches were sought for protonated ion MWs
    :param write_mode: file write mode ('w' to create/overwrite, 'a' to append), used for the extended output file
    :param out_dir: location of output directory, or None if the current directory is the output directory
    :return: n/a; several output files created
    """
    # prepare string for txt output file
    if write_mode == 'a':
        short_txt_output_str = ''
    else:
        short_txt_output_str = MATCH_STR_HEADER
    for mz_dict in short_output_list:
        peak_str = MZ_STR_FMT.format(mz_dict[M_Z], mz_dict[INTENSITY], mz_dict[RET_TIME])
        short_txt_output_str += MATCH_STR_FMT.format(peak_str, mz_dict[REL_INTENSITY], mz_dict[CALC_MW],
                                                     mz_dict[PPM_ERR], mz_dict[PARENT_FORMULA], mz_dict[DBE],
                                                     mz_dict[MATCH_TYPE])

    ms_str = f"_ms{ms_level}"
    if ms_str in fname:
        suffix = DEF_SUFFIX
        ext_suffix = DEF_LONG_SUFFIX
    else:
        suffix = ms_str + DEF_SUFFIX
        ext_suffix = ms_str + DEF_LONG_SUFFIX
    f_out_txt = create_out_fname(fname, suffix=suffix, base_dir=out_dir, ext="txt")
    f_out_csv = create_out_fname(fname, suffix=suffix, base_dir=out_dir, ext="csv")
    if combined_out_fname:
        f_out_long = create_out_fname(combined_out_fname, suffix="_ext", base_dir=out_dir, ext="csv")
    else:
        f_out_long = create_out_fname(fname, suffix=ext_suffix, base_dir=out_dir, ext="csv")
    # Print quick summary; first note which types of matches were investigated
    if omit_mol_ion_flag:
        match_str_list = []
    else:
        match_str_list = ["molecular ion"]
    if deprot_flag:
        match_str_list.append("deprotonated ion")
    if prot_flag:
        match_str_list.append("protonated ion")
    print(f"    {num_matches} of these matched a MW in our dictionaries for a {' or a '.join(match_str_list)}")
    # save output to files
    short_write_mode = 'w'
    if num_matches == 0:
        warning(f"No MW to MZ matches (within specified ppm error) found for file: {os.path.basename(fname)}\n    "
                f"Summary output will not be printed.")
    else:
        str_to_file(short_txt_output_str, os.path.relpath(f_out_txt), print_info=True, mode=short_write_mode)
        write_csv(short_output_list, os.path.relpath(f_out_csv), SHORT_OUTPUT_HEADERS, extrasaction="ignore",
                  mode=short_write_mode)
    if out_dir:
        struct_dir = os.path.join(out_dir, STRUCT_DIR)
    else:
        struct_dir = STRUCT_DIR
    make_dir(struct_dir)
    for formula in matched_formulas:
        my_formula = formula.replace("*", "")
        make_image_grid(formula, list(FORMULA_SMI_DICT[my_formula]), out_dir=struct_dir, write_output=False)

    # print long output even if no matches
    write_csv(long_output_list, os.path.relpath(f_out_long), OUTPUT_HEADERS, extrasaction="ignore", mode=write_mode)
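
The suffix branching above keeps the "_ms<level>" tag from being doubled when the input name already carries it. A standalone sketch of that rule (the default suffix value here is an assumption, not the project's DEF_SUFFIX):

def output_suffix(fname, ms_level, def_suffix="_matched"):
    # append "_ms<level>" only when the file name does not already contain it
    ms_str = f"_ms{ms_level}"
    if ms_str in fname:
        return def_suffix
    return ms_str + def_suffix

print(output_suffix("sample_ms2.csv", 2))  # "_matched"
print(output_suffix("sample.csv", 2))      # "_ms2_matched"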
def process_ms_run_file(args, fname, blank_data_array_dict):
    """
    Read ms_data and remove low intensity peaks
    :param args: command-line input and default values for program options
    :param fname: the file name of the file with ms data
    :param blank_data_array_dict: dictionary of blank data (keys are ms level); empty dict if no blank data provided
    :return: dict of cleaned data arrays keyed by ms level, or None if no spectra were found
    """
    base_name = os.path.basename(fname)
    print(f"\nReading file: {os.path.basename(fname)}")
    if args.out_dir is None:
        # if out_dir was specified, the directory has already been created; otherwise, make a directory just for this
        #     file's output
        args.out_dir = get_fname_root(fname) + DEF_SUFFIX
        make_dir(args.out_dir)
    fname_lower = base_name.lower()

    # Reading file
    if fname_lower.endswith(MZML_EXT.lower()):
        data_array_dict = process_mzml_input(fname, args.num_decimals_ms_accuracy, args.ms_accuracy,
                                             args.ret_time_accuracy, args.direct_injection)
    else:
        # previously screened that only CSV_EXT or MZML_EXT get to this point
        data_array_dict = read_validate_csv_data(fname, fname_lower, args.ms_accuracy, args.direct_injection)
        # not all mz entries in data_array_dict will be rounded, but wait to round until needed
    if len(data_array_dict) == 0:
        warning(f"Found no spectra to analyze in file: {os.path.relpath(fname)}\n    Skipping file.")
        return None

    # Clean up noise and save, if not already clean
    for ms_level, data_array in data_array_dict.items():
        print(f"Read MS Level {ms_level}")
        # # todo: it does not look like rounding below is needed
        # if not np.isnan(data_array_dict[ms_level][0][2]):
        #     data_array_dict[ms_level][:, 2] = round_to_fraction(data_array_dict[ms_level][:, 2],
        #                                                         args.ret_time_accuracy)
        if args.direct_injection or "clean" not in fname_lower:
            comment = ""
            if ms_level in blank_data_array_dict:
                data_array, ret_type = compare_blank(data_array, blank_data_array_dict[ms_level], args.ms_accuracy,
                                                     args.ret_time_accuracy, args.num_decimals_ms_accuracy,
                                                     args.threshold)
                if ret_type == 1:
                    warning(f"No common retention times found for blank file: {os.path.relpath(args.blank_file_name)}\n"
                            f"    and ms run file: {os.path.relpath(fname)}\n")
                elif ret_type == 2:
                    warning(f"No common M/Z values found for blank file: {os.path.relpath(args.blank_file_name)}\n"
                            f"    and ms run file: {os.path.relpath(fname)}\n")
                else:
                    comment = f" Subtracted blank run data provided in file: {os.path.relpath(args.blank_file_name)}"
                    print("   " + comment)
                    comment = "#" + comment + "\n"

            # whether or not a blank was subtracted, we want to prune and print
            data_array, comment = prune_intensity(data_array, args.min_rel_intensity, comment)
            # removing blanks changes the sort order *if* there is retention time; restore it (for consistency
            #     with other output) after pruning rather than before, because it is faster to sort a shorter array
            if not np.isnan(data_array[-1][2]):
                # sorting needed; arrives sorted first by retention time
                data_array = data_array[np.lexsort((-data_array[:, 1], data_array[:, 0]))]
            print_clean_csv(fname, fname_lower, ms_level, data_array, comment, args.direct_injection,
                            args.unlabeled_csvs, args.numpy_save_fmt, args.out_dir)
            data_array_dict[ms_level] = data_array
    return data_array_dict
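
The np.lexsort call above orders rows by m/z (column 0) ascending, breaking ties by intensity (column 1) descending; a minimal demonstration:

import numpy as np

data = np.array([[100.05, 5.0, 1.2],
                 [50.02, 3.0, 0.8],
                 [100.05, 9.0, 1.5]])
# lexsort keys are listed least-significant first: -intensity, then m/z
sorted_rows = data[np.lexsort((-data[:, 1], data[:, 0]))]
# row order is now: 50.02 first, then the 100.05 row with intensity 9.0,
#     then the 100.05 row with intensity 5.0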