def testBasicUse(self):
    silent_remove(SUB_SUB_DIR, dir_with_files=True)
    make_dir(SUB_SUB_DIR)
    fname_after_run = os.path.join(SUB_SUB_DIR, 'for_hartree', 'pet_mono_637_tzvp.log')
    silent_remove(fname_after_run)
    file1_to_copy = os.path.join(SUB_DATA_DIR, 'pet_mono_637_tzvp.tpl')
    temp_fname1 = os.path.join(SUB_SUB_DIR, 'pet_mono_637_tzvp.log')
    file2_to_copy = os.path.join(SUB_DATA_DIR, 'me2propprpnt_7.log')
    temp_fname2 = os.path.join(SUB_SUB_DIR, 'me2propprpnt_7.log')
    file3_to_copy = os.path.join(SUB_DATA_DIR, 'pet_mono_671_tzvp.log')
    temp_fname3 = os.path.join(SUB_SUB_DIR, 'pet_mono_671_tzvp.log')
    good_output = "The following files completed normally:\n" \
                  " tests/test_data/check_gauss/temp_dir/pet_mono_637_tzvp.log\n" \
                  "The following files may have failed:\n" \
                  " tests/test_data/check_gauss/temp_dir/me2propprpnt_7.log\n" \
                  "The following files may still be running:\n" \
                  " tests/test_data/check_gauss/temp_dir/pet_mono_671_tzvp.log\n"
    try:
        copyfile(file1_to_copy, temp_fname1)
        copyfile(file2_to_copy, temp_fname2)
        copyfile(file3_to_copy, temp_fname3)
        test_input = ["-d", SUB_SUB_DIR]
        # main(test_input)
        with capture_stdout(main, test_input) as output:
            self.assertTrue(good_output in output)
        self.assertTrue(os.path.isfile(fname_after_run))
    finally:
        silent_remove(SUB_SUB_DIR, dir_with_files=True, disable=DISABLE_REMOVE)
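# A note on the helper used above: capture_stdout is assumed to be a shared test
# utility that runs a callable while capturing what it prints. A minimal sketch of
# such a helper (not necessarily the project's actual implementation):
#
#     from contextlib import contextmanager, redirect_stdout
#     from io import StringIO
#
#     @contextmanager
#     def capture_stdout(func, *args, **kwargs):
#         buffer = StringIO()
#         with redirect_stdout(buffer):
#             func(*args, **kwargs)
#         yield buffer.getvalue()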
def main(argv=None):
    args, ret = parse_cmdline(argv)
    if ret != GOOD_RET or args is None:
        return ret
    try:
        # start with copies of the global variable dicts; then only the copies will be altered
        if args.file_name:
            mw_formula_dict = MW_FORM_DICT.copy()
            mw_deprot_formula_dict = MW_DEPROT_FORM_DICT.copy()
            mw_prot_formula_dict = MW_PROT_FORM_DICT.copy()
            form_smi_dict = deepcopy(FORMULA_SMI_DICT)
            form_dbe_dict = FORMULA_DBE_DICT.copy()
            smi_name_dict = deepcopy(SMI_NAME_DICT)
            smi_source_dict = deepcopy(SMI_SOURCE_DICT)
            number_additions = process_input_file(args.file_name, mw_formula_dict, mw_deprot_formula_dict,
                                                  mw_prot_formula_dict, form_smi_dict, form_dbe_dict,
                                                  smi_name_dict, smi_source_dict)
            # Reading complete, now output
            if number_additions:
                pretty_print_dicts(mw_formula_dict, mw_deprot_formula_dict, mw_prot_formula_dict,
                                   form_smi_dict, form_dbe_dict, smi_name_dict, smi_source_dict)
        if args.image_library:
            if args.mw_list:
                mw_keys = [x.strip() for x in args.mw_list.split(",")]
            else:
                mw_keys = MW_FORM_DICT.keys()
            if args.out_dir:
                make_dir(args.out_dir)
            image_grid_mult_mw(mw_keys, MW_FORM_DICT, FORMULA_SMI_DICT, out_dir=args.out_dir)
    except IOError as e:
        warning(e)
        return IO_ERROR
    except InvalidDataError as e:
        warning(e)
        return INVALID_DATA
    return GOOD_RET  # success
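# Why deepcopy for some dicts above but dict.copy() for others: a shallow copy shares
# mutable values with the original, so mutating a value in the copy would also alter
# the global. A minimal sketch with hypothetical data (not the real library entries):
#
#     from copy import deepcopy
#     form_smi = {"C8H10O": {"OCCc1ccccc1"}}      # formula -> set of SMILES
#     shallow = form_smi.copy()
#     shallow["C8H10O"].add("CCc1ccccc1O")        # also mutates form_smi!
#     deep = deepcopy(form_smi)
#     deep["C8H10O"].add("CC(O)c1ccccc1")         # form_smi is unchanged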
def check_if_files_to_be_saved(cfg):
    """
    Evaluate input for requests to save output and check for valid specified locations
    :param cfg: dict of configuration values
    :return: if the cfg indicates that files should be created, returns an updated cfg dict; raises errors
        if invalid data is encountered
    """
    if cfg[OUT_FORMAT_LIST]:
        # remove any periods to aid comparison; might as well also change commas to spaces and then split on
        # whitespace
        out_format_list = cfg[OUT_FORMAT_LIST].replace(".", " ").replace(",", " ")
        format_set = set(out_format_list.split())
    else:
        format_set = set()

    if cfg[BASENAME] and (cfg[BASENAME] != DEF_BASENAME):
        # if cfg[BASENAME] is not just the base name, make it so, saving any dir or ext in their spots
        out_path, base_name = os.path.split(cfg[BASENAME])
        if out_path and cfg[OUT_DIR]:
            cfg[OUT_DIR] = os.path.join(cfg[OUT_DIR], out_path)
        elif out_path:
            cfg[OUT_DIR] = out_path
        base, ext = os.path.splitext(base_name)
        cfg[BASENAME] = base
        format_set.add(ext.replace(".", ""))

    if len(format_set) > 0:
        for format_type in format_set:
            if format_type in OUT_TYPE_LIST:
                cfg[SAVE_FILES] = True
                cfg[format_type] = True
            else:
                raise InvalidDataError(f"Invalid extension provided: '{format_type}'. The currently supported "
                                       f"types are: '{OUT_TYPE_STR}'")
    if cfg[PLOT_BONDS]:
        cfg[SAVE_FILES] = True

    # if out_dir does not already exist, create it, but only if it will actually be needed
    if cfg[SAVE_FILES] and cfg[OUT_DIR]:
        make_dir(cfg[OUT_DIR])
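# A minimal sketch of the extension parsing above, with hypothetical cfg values:
#
#     out_format_list = "png, svg.".replace(".", " ").replace(",", " ")
#     format_set = set(out_format_list.split())    # {"png", "svg"}
#     out_path, base_name = os.path.split("plots/results.csv")
#     base, ext = os.path.splitext(base_name)      # ("results", ".csv")
#     format_set.add(ext.replace(".", ""))         # {"png", "svg", "csv"}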
def collect_check_process_file_list(args):
    # find files to process, allowing any combination of a file name, a list of file names, and searching a dir
    # first look for mzML files
    process_file_list = check_for_files(args.file_name, args.list_file, search_pattern=MZML_EXT,
                                        search_dir=args.directory, search_sub_dir=args.sub_dir_flag,
                                        warn_if_no_matches=False)
    # if no mzML files, look for csv files
    if len(process_file_list) == 0:
        process_file_list = check_for_files(args.file_name, args.list_file, search_pattern=CSV_EXT,
                                            search_dir=args.directory, search_sub_dir=args.sub_dir_flag,
                                            warn_if_no_matches=False)
    # now check that we didn't accidentally pick up program output or a wrong file type
    filtered_file_list = []
    for fname in process_file_list:
        fname_lower = os.path.basename(fname).lower()
        if not (fname_lower.endswith(MZML_EXT.lower()) or fname_lower.endswith(CSV_EXT.lower())):
            warning(f"The expected file extensions are '{MZML_EXT}' and '{CSV_EXT}'.\n"
                    f"    Encountered file: {os.path.relpath(fname)}.\n    Skipping file.")
            continue
        if not (fnmatch.fnmatch(fname_lower, "*matched.csv") or fnmatch.fnmatch(fname_lower, "*matched_ext.csv")):
            filtered_file_list.append(fname)
    if not len(filtered_file_list) and args.blank_file_name is None:
        raise InvalidDataError("No files found to process. Exiting program.")
    process_file_list = filtered_file_list

    # make sure that blank files are not processed twice by removing them from this list if present
    if args.blank_file_name:
        if args.blank_file_name in process_file_list:
            process_file_list.remove(args.blank_file_name)
        fname_lower = os.path.basename(args.blank_file_name).lower()
        if not (fname_lower.endswith(MZML_EXT.lower()) or fname_lower.endswith(CSV_EXT.lower())):
            raise InvalidDataError(f"The expected file extensions for MS output are '{MZML_EXT}' and '{CSV_EXT}'.\n"
                                   f"    Specified blank file: {os.path.relpath(args.blank_file_name)}\n"
                                   f"    Exiting program.")

    # now check names for protonated/deprotonated flags
    for fname in process_file_list:
        fname_lower = os.path.basename(fname).lower()
        pos_match = fnmatch.fnmatch(fname_lower, "*+*")
        neg_match = fnmatch.fnmatch(fname_lower, "*-*")
        if pos_match and neg_match:
            raise InvalidDataError(f"Found both a '+' and a '-' in the file name: {os.path.relpath(fname)}\n"
                                   f"    Only one of these characters can appear in a file name, as this "
                                   f"program uses these characters to determine if matches should be attempted for "
                                   f"protonated ('+') or deprotonated ('-') ion MWs.")
        if not (pos_match or neg_match) and not args.quit_after_mzml_to_csv:
            if args.omit_mol_ion:
                raise InvalidDataError(f"The '-x'/'--omit_mol_ion' option was selected, although there is no '+' "
                                       f"nor '-' in the file name: {os.path.relpath(fname)}\n    Thus, no "
                                       f"matching type has been selected. Program exiting.")
            else:
                warning(f"Since neither '+' nor '-' appears in the file name: {os.path.relpath(fname)}\n    "
                        f"Only matches to the molecular ion will be reported.")

    # while setting up, also make the output directory if it does not yet exist, so there is a place to save a
    # CSV file
    if args.out_dir:
        make_dir(args.out_dir)
    return process_file_list
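# The '+'/'-' file-name convention above in action (hypothetical file names):
#
#     fnmatch.fnmatch("sample+_ms1.csv", "*+*")    # True: match protonated ion MWs
#     fnmatch.fnmatch("sample-_ms1.csv", "*-*")    # True: match deprotonated ion MWs
#     # "sample+-_ms1.csv" matches both patterns and raises InvalidDataError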
def parse_cmdline(argv):
    """
    Returns the parsed argument list and return code.
    `argv` is a list of arguments, or `None` for ``sys.argv[1:]``.
    """
    if argv is None:
        argv = sys.argv[1:]

    # initialize the parser object:
    parser = argparse.ArgumentParser(description='Creates Gaussian input files from SMILES strings, given a '
                                                 'template input file.')
    parser.add_argument("-t", "--" + GAU_TPL_FILE, help=f"Required: the location of the Gaussian input template "
                                                        f"file. This file must contain the string '{REQ_STR}' in "
                                                        f"the location where the atom type and coordinates "
                                                        f"should be added.", metavar='PATH', default=None)
    parser.add_argument("-l", "--" + LIST_FNAME, help="Option to specify a file with a list of SMILES strings "
                                                      "(one SMILES string per line).", metavar='PATH', default=None)
    parser.add_argument("-m", "--" + MAX_CONFS, help=f"Option to specify the maximum number of conformations to "
                                                     f"be created for each SMILES string. The default value is "
                                                     f"{DEF_MAX_CONFS}.", metavar='INT', default=DEF_MAX_CONFS,
                        type=int)
    parser.add_argument("-o", "--" + OUT_DIR, help="Directory where created files should be saved. The default "
                                                   "is the working directory. If a directory is specified and "
                                                   "does not yet exist, it will be created.", metavar='PATH',
                        default=None)
    parser.add_argument("-s", "--" + SMI, help="Option to specify a SMILES string. Multiple strings can be "
                                               "separated by ','; if '.' is present, the two molecules/fragments "
                                               "will be considered together.", metavar='STR', default=None)

    args = None
    try:
        args = parser.parse_args(argv)
        if args.gau_tpl_file is None:
            raise InvalidDataError(f"A template Gaussian input file must be specified with the "
                                   f"'-t'/'--{GAU_TPL_FILE}' option.")
        if not os.path.isfile(args.gau_tpl_file):
            raise InvalidDataError("Could not locate the specified Gaussian input template file.")
        if args.list_file is None:
            args.list_file = []
        else:
            args.list_file = file_rows_to_list(args.list_file)
        if args.smiles is not None:
            smi_list = [smi.strip() for smi in args.smiles.split(",")]
            args.list_file.extend(smi_list)
        if len(args.list_file) == 0:
            raise InvalidDataError(f"No SMILES input has been specified. Specify a single SMILES string with the "
                                   f"'-s'/'--{SMI}' option or a file with a list of SMILES strings (one per line) "
                                   f"with the '-l'/'--{LIST_FNAME}' option.")
        args.list_file = list(set(args.list_file))
        if args.out_dir is not None:
            make_dir(args.out_dir)
    except (IOError, KeyError, InvalidDataError, MissingSectionHeaderError, SystemExit) as e:
        if hasattr(e, 'code') and e.code == 0:
            return args, GOOD_RET
        warning(e)
        parser.print_help()
        return args, INPUT_ERROR

    return args, GOOD_RET
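# A hedged usage sketch for the parser above (hypothetical path and SMILES):
#
#     args, ret = parse_cmdline(["-t", "gau.tpl", "-s", "CCO,c1ccccc1"])
#     # on success, ret == GOOD_RET and args.list_file holds ["CCO", "c1ccccc1"]
#     # (deduplicated; order is not guaranteed because of the set() round-trip)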
def write_output(fname, ms_level, num_matches, short_output_list, long_output_list, matched_formulas,
                 combined_out_fname, omit_mol_ion_flag, deprot_flag, prot_flag, write_mode, out_dir):
    """
    Print output from matching M/Z to lignin molecule library
    :param fname: location of input file processed
    :param ms_level: int, type of MS output, for output name so there are separate files from multiple-channel input
    :param num_matches: the number of matches made between input M/Z and MW in lignin library
    :param short_output_list: list of dicts of summary matching data (one list per match)
    :param long_output_list: list of dicts of extended matching data (sorted by MZ values)
    :param matched_formulas: set of formula names that were matched to M/Z values
    :param combined_out_fname: None, or a string if output from multiple files is to be written to one file
    :param omit_mol_ion_flag: boolean to indicate if molecular ion matches were not attempted (True) or sought (False)
    :param deprot_flag: boolean to indicate if matches were attempted for deprotonated ion MWs
    :param prot_flag: boolean to indicate if matches were attempted for protonated ion MWs
    :param write_mode: 'w' to overwrite or 'a' to append to the extended output file
    :param out_dir: location of output directory, or None if the current directory is the output directory
    :return: n/a; several output files created
    """
    # prepare string for txt output file
    if write_mode == 'a':
        short_txt_output_str = ''
    else:
        short_txt_output_str = MATCH_STR_HEADER
    for mz_dict in short_output_list:
        peak_str = MZ_STR_FMT.format(mz_dict[M_Z], mz_dict[INTENSITY], mz_dict[RET_TIME])
        short_txt_output_str += MATCH_STR_FMT.format(peak_str, mz_dict[REL_INTENSITY], mz_dict[CALC_MW],
                                                     mz_dict[PPM_ERR], mz_dict[PARENT_FORMULA], mz_dict[DBE],
                                                     mz_dict[MATCH_TYPE])
    ms_str = f"_ms{ms_level}"
    if ms_str in fname:
        suffix = DEF_SUFFIX
        ext_suffix = DEF_LONG_SUFFIX
    else:
        suffix = ms_str + DEF_SUFFIX
        ext_suffix = ms_str + DEF_LONG_SUFFIX
    f_out_txt = create_out_fname(fname, suffix=suffix, base_dir=out_dir, ext="txt")
    f_out_csv = create_out_fname(fname, suffix=suffix, base_dir=out_dir, ext="csv")
    if combined_out_fname:
        f_out_long = create_out_fname(combined_out_fname, suffix="_ext", base_dir=out_dir, ext="csv")
    else:
        f_out_long = create_out_fname(fname, suffix=ext_suffix, base_dir=out_dir, ext="csv")

    # print quick summary; first note which types of matches were investigated
    if omit_mol_ion_flag:
        match_str_list = []
    else:
        match_str_list = ["molecular ion"]
    if deprot_flag:
        match_str_list.append("deprotonated ion")
    if prot_flag:
        match_str_list.append("protonated ion")
    print(f"    {num_matches} of these matched a MW in our dictionaries for a {' or a '.join(match_str_list)}")

    # save output to files
    short_write_mode = 'w'
    if num_matches == 0:
        warning(f"No MW to MZ matches (within specified ppm error) found for file: {os.path.basename(fname)}\n"
                f"    Summary output will not be printed.")
    else:
        str_to_file(short_txt_output_str, os.path.relpath(f_out_txt), print_info=True, mode=short_write_mode)
        write_csv(short_output_list, os.path.relpath(f_out_csv), SHORT_OUTPUT_HEADERS, extrasaction="ignore",
                  mode=short_write_mode)
        if out_dir:
            struct_dir = os.path.join(out_dir, STRUCT_DIR)
        else:
            struct_dir = STRUCT_DIR
        make_dir(struct_dir)
        for formula in matched_formulas:
            my_formula = formula.replace("*", "")
            make_image_grid(formula, list(FORMULA_SMI_DICT[my_formula]), out_dir=struct_dir, write_output=False)
    # print long output even if no matches
    write_csv(long_output_list, os.path.relpath(f_out_long), OUTPUT_HEADERS, extrasaction="ignore", mode=write_mode)
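# Output-naming sketch for the suffix logic above (hypothetical names): for
# fname "run_ms1.csv" with ms_level 1, "_ms1" is already in the name, so the
# short output is named with DEF_SUFFIX alone; for "run.csv", the suffix becomes
# "_ms1" + DEF_SUFFIX, keeping output from different MS levels in separate files.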
def process_ms_run_file(args, fname, blank_data_array_dict):
    """
    Read ms_data and remove low intensity peaks
    :param args: command-line input and default values for program options
    :param fname: the file name of the file with ms data
    :param blank_data_array_dict: dictionary of blank data (keys are ms level); empty dict if no blank data provided
    :return: dict of cleaned data arrays (keys are ms level), or None if no spectra were found
    """
    base_name = os.path.basename(fname)
    print(f"\nReading file: {base_name}")
    if args.out_dir is None:
        # if out_dir was specified, the directory has already been created; otherwise, make a directory just for
        # this file's output
        args.out_dir = get_fname_root(fname) + DEF_SUFFIX
        make_dir(args.out_dir)
    fname_lower = base_name.lower()

    # read the file
    if fname_lower.endswith(MZML_EXT.lower()):
        data_array_dict = process_mzml_input(fname, args.num_decimals_ms_accuracy, args.ms_accuracy,
                                             args.ret_time_accuracy, args.direct_injection)
    else:
        # previously screened so that only CSV_EXT or MZML_EXT get to this point
        data_array_dict = read_validate_csv_data(fname, fname_lower, args.ms_accuracy, args.direct_injection)
    # not all mz entries in data_array_dict will be rounded, but wait to round until needed
    if len(data_array_dict) == 0:
        warning(f"Found no spectra to analyze in file: {os.path.relpath(fname)}\n    Skipping file.")
        return None

    # clean up noise and save, if not already clean
    for ms_level, data_array in data_array_dict.items():
        print(f"Read MS Level {ms_level}")
        # # todo: it does not look like rounding below is needed
        # if not np.isnan(data_array_dict[ms_level][0][2]):
        #     data_array_dict[ms_level][:, 2] = round_to_fraction(data_array_dict[ms_level][:, 2],
        #                                                         args.ret_time_accuracy)
        if args.direct_injection or not ("clean" in fname_lower):
            comment = ""
            if ms_level in blank_data_array_dict:
                data_array, ret_type = compare_blank(data_array, blank_data_array_dict[ms_level], args.ms_accuracy,
                                                     args.ret_time_accuracy, args.num_decimals_ms_accuracy,
                                                     args.threshold)
                if ret_type == 1:
                    warning(f"No common retention times found for blank file: "
                            f"{os.path.relpath(args.blank_file_name)}\n"
                            f"    and ms run file: {os.path.relpath(fname)}\n")
                elif ret_type == 2:
                    warning(f"No common M/Z values found for blank file: "
                            f"{os.path.relpath(args.blank_file_name)}\n"
                            f"    and ms run file: {os.path.relpath(fname)}\n")
                else:
                    comment = f" Subtracted blank run data provided in file: " \
                              f"{os.path.relpath(args.blank_file_name)}"
                    print("   " + comment)
                    comment = "#" + comment + "\n"
            # whether or not blank data was removed, we want to prune and print the data
            data_array, comment = prune_intensity(data_array, args.min_rel_intensity, comment)
            # removing blanks changes sorting *if* there is retention time; change back (to be consistent with
            # other output) now that the array is pruned (rather than before), because it is faster to sort a
            # shorter array
            if not np.isnan(data_array[-1][2]):
                # sorting needed; arrives sorted first by retention time
                data_array = data_array[np.lexsort((-data_array[:, 1], data_array[:, 0]))]
            print_clean_csv(fname, fname_lower, ms_level, data_array, comment, args.direct_injection,
                            args.unlabeled_csvs, args.numpy_save_fmt, args.out_dir)
            data_array_dict[ms_level] = data_array
    return data_array_dict
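# On the np.lexsort call above: lexsort treats its *last* key as the primary sort
# key, so rows are ordered by column 0 (m/z) ascending, with ties broken by
# column 1 (intensity) descending. A minimal sketch:
#
#     import numpy as np
#     a = np.array([[100.0, 5.0, 1.2],
#                   [100.0, 9.0, 0.8],
#                   [90.0, 1.0, 2.0]])
#     a = a[np.lexsort((-a[:, 1], a[:, 0]))]
#     # row order is now: m/z 90.0, then m/z 100.0 with intensity 9.0, then 5.0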