def main():
    """Command-line entry point of the IMPORT tool.

    Parses the command line, expands the import file patterns, reads every
    spectrum from those files with ``readspectrum`` and stores the spectra in
    the GCMStoolbox JSON data file: a new data structure, or (with --append)
    the existing 'spectra' mode file named by --jsonout.

    The function never returns normally: every path ends in ``exit()``.
    """
    print("\n*******************************************************************************")
    print("* GCMStoolbox - a set of tools for GC-MS data analysis *")
    print("* Version: {} ({}) *".format(gcmstoolbox.version, gcmstoolbox.date))
    print("* Author: Wim Fremout, Royal Institute for Cultural Heritage *")
    print("* Licence: GNU GPL version 3 *")
    print("* *")
    print("* IMPORT: *")
    print("* import one or more AMDIS (.elu, .msl, .csl, .isl) and NIST MS SEARCH *")
    print("* (.msp) files and store the mass spectra in GCMStoolbox JSON format *")
    print("* *")
    print("*******************************************************************************\n")

    ### OPTIONPARSER

    usage = "usage: %prog [options] IMPORTFILE1 [IMPORTFILE2 [...]]"
    parser = OptionParser(
        usage,
        version="GCMStoolbox version " + gcmstoolbox.version + " (" + gcmstoolbox.date + ")\n")
    parser.add_option(
        "-v", "--verbose",
        help="Be very verbose [not default]",
        action="store_true", dest="verbose", default=False)
    parser.add_option(
        "-o", "--jsonout",
        help="JSON output file name [default: gcmstoolbox.json]",
        action="store", dest="jsonout", type="string", default="gcmstoolbox.json")
    parser.add_option(
        "-a", "--append",
        help="Append to existing json file [not default]",
        action="store_true", dest="append", default=False)

    group = OptionGroup(parser, "IMPORT OPTIONS",
                        "Special formatting options for the ELinC project")
    group.add_option(
        "-s", "--specno",
        help="Override spectrum numbering, start with I [default: 1]; the append option may override this",
        action="store", dest="i", default=1, type="int")
    group.add_option(
        "-n", "--norm",
        help="Normalise to a given maximum, 0 to skip normalisation [default=999])",
        action="store", dest="n", default=999, type="int")
    group.add_option(
        "--allmodels",
        help="For AMDIS .ELU files: import all models [not default]",
        action="store_true", dest="allmodels", default=False)
    parser.add_option_group(group)

    group = OptionGroup(parser, "ELinC",
                        "Special formatting options for the ELinC project")
    group.add_option(
        "-e", "--elinc",
        help="Retrieve parameters from the structured file names [not default]",
        action="store_true", dest="elinc", default=False)
    parser.add_option_group(group)

    (options, args) = parser.parse_args()

    ### ARGUMENTS AND OPTIONS

    cmd = " ".join(sys.argv)

    if options.verbose:
        print("Processing import files and options")

    if len(args) == 0:
        print(" !! No import files?\n")
        exit()

    # Expand the glob patterns and drop duplicates and directories.
    # A fresh list is built instead of removing entries from the list that is
    # being iterated (which silently skips the entry following each removed
    # one), and the command line order is kept so that the spectrum numbering
    # is the same on every run.
    inFiles = []
    seen = set()
    for arg in args:
        for path in glob(arg):
            if path in seen or os.path.isdir(path):
                continue
            seen.add(path)
            inFiles.append(path)
            if options.verbose:
                print(" - import file: " + path)

    # number of inFiles; must not be 0
    numInFiles = len(inFiles)
    if numInFiles == 0:
        print(" !! No import files?\n")
        exit()
    elif options.verbose:
        print(" => " + str(numInFiles) + " import files")

    if options.verbose:
        print(" => JSON output file: " + options.jsonout
              + (" [append]" if options.append else ""))

    if options.append:
        data = gcmstoolbox.openJSON(options.jsonout)

        # check if it is a spectra file (cannot append to groups file)
        if data['info']['mode'] != "spectra":
            print(" !! Cannot append to a '" + data['info']['mode'] + "' mode data file.\n")
            exit()

        # add administration to the info section
        data['info']['cmds'].append(cmd)
        # The info section built below for a new file has no 'sources' entry,
        # so it cannot be assumed to exist in the file we append to.
        data['info'].setdefault('sources', []).extend(inFiles)

        # spectrum number counter: continue after the stored spectra unless
        # the requested start number is higher
        if len(data['spectra']) < options.i:
            i = options.i
        else:
            i = len(data['spectra']) + 1
    else:
        cmds = [cmd]
        data = OrderedDict()
        data['info'] = OrderedDict([('mode', 'spectra'), ('cmds', cmds)])
        data['spectra'] = OrderedDict()
        i = options.i  # spectrum number

    if options.elinc and options.verbose:
        print(" => ELinC special formatting is set")

    ### ITERATE THROUGH INFILES

    # init progress bar (one step per import file)
    if not options.verbose:
        print("\nProcessing files")
        j = 0
        k = len(inFiles)
        gcmstoolbox.printProgress(j, k)

    for inFile in inFiles:
        if options.verbose:
            print("\nProcessing file: " + inFile)

        with open(inFile, 'r') as fh:  # file handle closes itself
            # only the bare file name is handed to the readers
            inFile = os.path.basename(inFile)
            # key of the spectrum stored last from this file (False: none yet)
            lastSpectrum = False

            while True:
                # read spectra
                spectrum = readspectrum(fh, inFile, norm=options.n,
                                        elinc=options.elinc, verbose=options.verbose)

                # readspectrum signals the end of the file with "eof"
                if spectrum == "eof":
                    break

                # apply special ELinC formatting
                if options.elinc:
                    elincize(spectrum, inFile, verbose=options.verbose)

                # store only the Amdis model with the lowest OR (except if the
                # --allmodels command line option is active)
                if not options.allmodels and ('OR' in spectrum) and ('RI' in spectrum):
                    # check if the previous spectrum in the ELU file is another
                    # model for the same scan (same RI, other OR)
                    if lastSpectrum:
                        previous = data['spectra'][lastSpectrum]
                        if spectrum['RI'] == previous['RI']:
                            if spectrum['OR'] >= previous['OR']:
                                # the stored model is at least as likely: skip
                                if options.verbose:
                                    print(" - Skipping: a more likely model is already stored")
                                continue
                            else:
                                if options.verbose:
                                    print(" - Replacing an already stored less likely model")
                                # to overwrite a spectrum we need to
                                del data['spectra'][lastSpectrum]  # (1) remove the old
                                i -= 1                             # (2) reuse its number

                # write spectrum
                spectrum['DB#'] = str(i)
                key = spectrum.pop('Name')
                key = 'S{} {}'.format(i, key)
                key = key[:77]  # longer spectrum names cause problems in AMDIS
                data['spectra'][key] = spectrum

                # keep track of the previous spectrum in case of ELU models
                # for the same peak
                lastSpectrum = key

                # increase spectrum number
                i += 1

        # adjust progress bar
        if not options.verbose:
            j += 1
            gcmstoolbox.printProgress(j, k)

    ### WRITE SPECTRA JSON

    print("\nWriting data file")
    gcmstoolbox.saveJSON(data, options.jsonout)
    print(" => Finalised. Wrote " + options.jsonout + "\n")
    exit()
def main():
    """Command-line entry point of the GROUP tool.

    Reads the JSON data file and one MSPEPSEARCH result file, lets
    ``readlist`` interpret every "Unknown" hit list in that file, builds the
    groups from the module-level ``allocations`` with ``buildgroups``, prints
    statistics and writes the result back to the JSON output file.

    Uses the module globals ``data``, ``allocations``, ``doubles``, ``j`` and
    ``k``. The function never returns normally: every path ends in ``exit()``.
    """
    global data, allocations, doubles, j, k

    print("\n*******************************************************************************")
    print("* GCMStoolbox - a set of tools for GC-MS data analysis *")
    print("* Version: {} ({}) *".format(gcmstoolbox.version, gcmstoolbox.date))
    print("* Author: Wim Fremout, Royal Institute for Cultural Heritage *")
    print("* Licence: GNU GPL version 3 *")
    print("* *")
    print("* GROUP: *")
    print("* Search groups in a NIST search of a large dataset against itself *")
    print("* *")
    print("*******************************************************************************\n")

    ### OPTIONPARSER

    usage = "usage: %prog [options] MSPEPSEARCH_FILE"
    parser = OptionParser(
        usage,
        version="GCMStoolbox version " + gcmstoolbox.version + " (" + gcmstoolbox.date + ")\n")
    parser.add_option(
        "-v", "--verbose",
        help="Be very verbose",
        action="store_true", dest="verbose", default=False)
    parser.add_option(
        "-i", "--jsonin",
        help="JSON input file name [default: gcmstoolbox.json]",
        action="store", dest="jsonin", type="string", default="gcmstoolbox.json")
    parser.add_option(
        "-o", "--jsonout",
        help="JSON output file name [default: same as JSON input file]",
        action="store", dest="jsonout", type="string")

    group = OptionGroup(
        parser, "RETENTION INDEX GROUPING CRITERIUM",
        "Only select matching mass spectra that have a retention index matching an RI window around the RI of the unknown spectrum.\n"
        "[RIwindow] = [RIfixed] + [RIfactor] * RI\n"
        "Note: if both RIfixed and RIfactor are zero, no retention based grouping will be applied.")
    group.add_option(
        "-r", "--rifixed",
        help="Apply an RI window with fixed term. [default: 0]",
        action="store", dest="rifixed", type="float", default=0)
    group.add_option(
        "-R", "--rifactor",
        help="Apply an RI window with RI-dependent factor [default: 0]",
        action="store", dest="rifactor", type="float", default=0)
    group.add_option(
        "-D", "--discard",
        help="Discard hits without RI",
        action="store_true", dest="discard", default=False)
    parser.add_option_group(group)

    group = OptionGroup(
        parser, "NIST MS SEARCH GROUPING CRITERIUM",
        "(Reverse) match settings are set in and calculated by MSPEPSEARCH. However, the options below can be used to set a minimal MF and/or RMF for the grouping process.")
    group.add_option(
        "-m", "--match",
        help="Apply NIST MS match limit [default: 0]",
        action="store", dest="minmf", type="int", default=0)
    group.add_option(
        "-n", "--reverse",
        help="Apply NIST MS reverse match limit [default: 0]",
        action="store", dest="minrmf", type="int", default=0)
    parser.add_option_group(group)

    group = OptionGroup(
        parser, "AMBIGUOUS MATCHES",
        "Sometimes a spectrum is matched against a series of spectra that are allocated to two or more different groups. By default, these groups are not merged.")
    group.add_option(
        "-M", "--merge",
        help="Merge groups with ambiguous matches",
        action="store_true", dest="merge", default=False)
    parser.add_option_group(group)

    (options, args) = parser.parse_args()

    ### ARGUMENTS AND OPTIONS

    cmd = " ".join(sys.argv)

    if options.verbose:
        print("Processing arguments")

    # input file
    if len(args) == 0:
        print(" !! No MSPEPSEARCH file?\n")
        exit()
    elif len(args) >= 2:
        print(" !! Too many arguments. Only one MSPEPSEARCH file can be processed.")
        exit()
    elif os.path.isfile(args[0]):
        inFile = args[0]
    else:
        print(" !! MSPEPSEARCH file " + args[0] + " not found.")
        exit()

    # check and read JSON input file
    data = gcmstoolbox.openJSON(options.jsonin)

    # json output defaults to the input file
    if options.jsonout is None:
        options.jsonout = options.jsonin

    if options.verbose:
        print(" => JSON input file: " + options.jsonin)
        print(" => JSON output file: " + options.jsonout + "\n")

    ### GROUP

    # init progress bar
    print("\nProcessing file: " + inFile)
    k = len(data['spectra'])
    if not options.verbose:
        j = 0
        gcmstoolbox.printProgress(j, k)

    # open MSPEPSEARCH file, read and interpret it line by line
    i = 1
    with open(inFile, 'r') as fh:
        for line in fh:
            # readlist() is given the "Unknown" header line plus the file
            # handle and returns the line to continue with, so consecutive hit
            # lists are handled here without going back to the outer loop.
            # As before, at most k hit lists are handled per line read by the
            # outer loop; lines that are not an "Unknown" header now cost one
            # test instead of k.
            handled = 0
            while handled < k and line.casefold().startswith('unknown'):
                line, i = readlist(fh, line, i, options.rifixed, options.rifactor,
                                   options.discard, options.minmf, options.minrmf,
                                   options.merge, options.verbose)
                handled += 1

                # update progress bar
                if not options.verbose:
                    j += 1
                    gcmstoolbox.printProgress(j, k)

            if line == "eof":
                break

    ### BUILD GROUPS

    print("\nGrouping spectra ...")
    data['groups'] = OrderedDict()

    # init progress bar
    if not options.verbose:
        j = 0
        k = len(data['spectra'])
        gcmstoolbox.printProgress(j, k)

    for s, g in allocations.items():
        g = "G" + str(g)
        buildgroups(data['groups'], g, s)

        # adjust progress bar
        if not options.verbose:
            j += 1
            gcmstoolbox.printProgress(j, k)

    del allocations

    ### STATS

    stats = OrderedDict()
    stats["spectra"] = len(data['spectra'])
    stats["groups"] = len(data['groups'])
    # the same list is stored under a name that tells whether it was acted on
    if options.merge:
        stats["merged"] = [sorted(d) for d in doubles.values()]
    else:
        stats["ambiguous"] = [sorted(d) for d in doubles.values()]
    stats["stats"] = groupstats(data['groups'])

    print("\nSTATISTICS")
    print(" - Number of mass spectra: " + str(stats["spectra"]))
    print(" - Number of groups: " + str(stats["groups"]))
    if not options.merge:
        print(" - Groups that may be the same component:")
        for key in sorted(doubles.keys()):
            print(" - " + ", ".join(str(d) for d in sorted(doubles[key])))
    print(" - Number of hits per group:")
    if options.verbose:
        lines = groupstats(data['groups'], options.verbose)
    else:
        lines = stats["stats"]
    for statline in lines:
        print(" - " + statline)

    ### UPDATE JSON FILE

    if options.verbose:
        print("\nUpdate JSON output file: " + options.jsonout + "\n")

    data["info"]["mode"] = "group"
    data["info"]["grouping"] = stats
    data["info"]["cmds"].append(cmd)

    gcmstoolbox.saveJSON(data, options.jsonout)  # backup and save json

    print("\nFinalised. Wrote " + options.jsonout + "\n")
    exit()
def main():
    """Command-line entry point of the FILTER tool.

    Supports four commands, given as the first positional argument:

    * ``list``: print an overview of the filters stored in the JSON file;
    * ``on`` / ``off``: enable or disable the filters named by the remaining
      arguments (with or without the leading "F");
    * ``make``: define a new filter from up to three criteria (group numbers,
      minimal spectrum count, presence of chosen m/z values). The groups that
      satisfy all selected criteria are stored as the filter's 'out' list.

    The function never returns normally: every path ends in ``exit()``.
    """
    print("\n*******************************************************************************")
    print("* GCMStoolbox - a set of tools for GC-MS data analysis *")
    print("* Version: {} ({}) *".format(gcmstoolbox.version, gcmstoolbox.date))
    print("* Author: Wim Fremout, Royal Institute for Cultural Heritage *")
    print("* Licence: GNU GPL version 3 *")
    print("* *")
    print("* FILTER *")
    print("* Reduces the groups json file based on a number of filtering options *")
    print("* *")
    print("*******************************************************************************\n")

    ### OPTIONPARSER

    usage = "\n\nCommands:\n"
    usage += " list Overview of defined filters\n"
    usage += " --> usage: %prog list [options]\n"
    usage += " on Enable filter\n"
    usage += " --> usage: %prog on [options] FILTER_NUMBERS\n"
    usage += " off Disable filter\n"
    usage += " --> usage: %prog off [options] FILTER_NUMBERS\n"
    usage += " make Define a new filter\n"
    usage += " --> usage: %prog make [options]"

    parser = OptionParser(
        usage,
        version="GCMStoolbox version " + gcmstoolbox.version + " (" + gcmstoolbox.date + ")\n")
    parser.add_option(
        "-v", "--verbose",
        help="Be very verbose",
        action="store_true", dest="verbose", default=False)
    parser.add_option(
        "-i", "--jsonin",
        help="JSON input file name [default: gcmstoolbox.json]",
        action="store", dest="jsonin", type="string", default="gcmstoolbox.json")
    parser.add_option(
        "-o", "--jsonout",
        help="JSON output file name [default: same as JSON input file]",
        action="store", dest="jsonout", type="string")

    group = OptionGroup(parser, "MAKE: Filter out groups based on group number")
    group.add_option(
        "-g", "--group",
        help="Group number [default: 0], multiple possible",
        action="append", dest="group", type="string")
    parser.add_option_group(group)

    group = OptionGroup(parser, "MAKE: Filter out groups on the number of spectra in a group")
    group.add_option(
        "-c", "--count",
        help="Minimal number of spectra per group",
        action="store", dest="count", type="int")
    group.add_option(
        "-C",
        help="Don't count multiple spectra from the same source",
        action="store_true", dest="sourcecount", default=False)
    parser.add_option_group(group)

    group = OptionGroup(parser, "MAKE: Filter out groups based on the presence of a chosen m/z")
    group.add_option(
        "-m", "--mass",
        help="m/z value, multiple possible",
        action="append", dest="mass", type="int")
    group.add_option(
        "-M", "--percent",
        help="Minimal relative intensity of a m/z value [default: 90]",
        action="store", dest="percent", type="int", default=90)
    group.add_option(
        "-s", "--sum",
        help="Calculate sumspectra with the N spectra with highest signal, 0 for all [default: 0]",
        action="store", dest="n", type="int", default=0)
    parser.add_option_group(group)

    (options, args) = parser.parse_args()

    ### ARGUMENTS AND OPTIONS

    cmd = " ".join(sys.argv)

    if options.verbose:
        print("Processing arguments...")

    # check and read JSON input file
    data = gcmstoolbox.openJSON(options.jsonin)
    if data['info']['mode'] == 'spectra':
        print("\n!! Cannot filter on ungrouped spectra.")
        exit()

    # json output defaults to the input file
    if options.jsonout is None:
        options.jsonout = options.jsonin

    if options.verbose:
        print(" => JSON input file: " + options.jsonin)
        print(" => JSON output file: " + options.jsonout + "\n")

    # The 'filters' section only exists once a first filter has been made
    # (it is created further down), so list/on/off must not assume it.
    existing = data.get('filters', {})

    # command and arguments
    if len(args) == 0:
        print(" !! No command given\n")
        exit()

    elif args[0].lower().startswith("l"):
        if len(args) > 1:
            print(" !! The list command does not support arguments\n")
            exit()
        # LIST
        for fid, fdef in existing.items():
            print(fid + ": filters out " + str(len(fdef['out'])) + " groups ["
                  + ("Enabled" if fdef['active'] else "Disabled") + "]")
            if 'crit1' in fdef:
                print(" - remove groups: " + fdef['crit1'])
            if 'crit2' in fdef:
                print(" - remove on spectrum count: " + fdef['crit2'])
            if 'crit3' in fdef:
                print(" - remove on m/z values: " + fdef['crit3'])
        print('')
        exit()

    elif (args[0].lower() == 'on') or (args[0].lower() == 'off'):
        flist = [x.upper() for x in args]
        act = flist.pop(0) == 'ON'
        changed = False  # only save when at least one filter was found
        for fname in flist:
            if not fname.startswith("F"):
                fname = "F" + fname
            if fname in existing:
                existing[fname]['active'] = act
                print(('Enabled ' if act else 'Disabled ') + fname)
                changed = True
        if changed:
            data["info"]["cmds"].append(cmd)
            gcmstoolbox.saveJSON(data, options.jsonout)  # backup and save json
            print(" => Updated " + options.jsonout + "\n")
        else:
            print(" !! Invalid filter names\n")
        exit()

    elif args[0].lower().startswith("m"):
        if len(args) > 1:
            print(" !! The make command does not support arguments\n")
            exit()
        # else: proceed

    else:
        print(" !! Invalid command given\n")
        exit()

    # criterium flags
    c1 = options.group is not None  # CRITERIUM1: group numbers to be removed
    c2 = options.count is not None  # CRITERIUM2: minimal spectrum count per group
    c3 = options.mass is not None   # CRITERIUM3: minimal intensity of chosen m/z values

    if not (c1 or c2 or c3):
        print("\n!! No criteria selected. Nothing to do.")
        exit()

    ### INITIALISE

    # candidates for removal; each criterium will remove those groups that
    # should be kept.  Since the set gets smaller after each criterium, the
    # most time-consuming criteria are done last.
    candidates = set(data["groups"].keys())

    ### CRITERIUM 1: GROUP NUMBER

    if c1:
        removegroups = []
        for g in options.group:
            g = str(g).upper()
            if not g.startswith('G'):
                g = "G" + g
            removegroups.append(g)

        print("\nCRITERIUM 1: remove groups by group numbers: " + ", ".join(removegroups))
        if not options.verbose:
            i = 0
            j = len(candidates)
            gcmstoolbox.printProgress(i, j)

        # iterate over a copy of the set, so we can remove things from the
        # original while iterating
        for c in list(candidates):
            if c not in removegroups:
                candidates.discard(c)

            # progress bar
            if not options.verbose:
                i += 1
                gcmstoolbox.printProgress(i, j)

        if options.verbose:
            print("candidates for removal:")
            if len(candidates) == 0:
                print(" none")
            else:
                print(tabulate(candidates))

    ### CRITERIUM 2: SPECTRUM COUNT

    if c2:
        print("\nCRITERIUM 2: remove groups with less than " + str(options.count) + " spectra...")
        if not options.verbose:
            i = 0
            j = len(candidates)
            gcmstoolbox.printProgress(i, j)

        for c in list(candidates):  # copy: the set is modified while iterating
            if not options.sourcecount:
                # count number of spectra
                if data["groups"][c]["count"] >= options.count:
                    candidates.discard(c)  # remove from candidates = keep group
            else:
                # count number of sources
                spset = set()
                nosource = 0  # also count spectra without source
                for s in data["groups"][c]["spectra"]:
                    if "Source" in data["spectra"][s]:
                        spset.add(data["spectra"][s]["Source"])
                    else:
                        nosource += 1
                if (len(spset) + nosource) >= options.count:
                    candidates.discard(c)  # remove from candidates = keep group

            # progress bar
            if not options.verbose:
                i += 1
                gcmstoolbox.printProgress(i, j)

        if options.verbose:
            print("candidates for removal:")
            if len(candidates) == 0:
                print(" none")
            else:
                print(tabulate(candidates))

    ### CRITERIUM 3: RUBBISH PEAK SEARCH

    if c3:
        print("\nCRITERIUM 3: remove groups with m/z value "
              + ", ".join(str(m) for m in options.mass))
        if not options.verbose:
            i = 0
            j = len(candidates)
            gcmstoolbox.printProgress(i, j)

        for c in list(candidates):  # copy: the set is modified while iterating
            # read the spectra in this group
            splist = [data['spectra'][s] for s in data['groups'][c]['spectra']]

            # if more than one spectrum, make sumspectrum
            if len(splist) > 1:
                sumsp = gcmstoolbox.sumspectrum(*splist, highest=options.n)
            else:
                sumsp = splist[0]

            # check masses against a percentage of the highest signal
            remove = False
            maxval = max(sumsp['xydata'].values())
            threshold = maxval * 0.01 * options.percent
            for m in options.mass:
                if str(m) in sumsp['xydata']:
                    if int(sumsp['xydata'][str(m)]) > threshold:
                        # remove group
                        if options.verbose:
                            # c already is the full group name ("G..."), as it
                            # is compared with the "G"-prefixed names above
                            print(" --> " + c + " m/z=" + str(m)
                                  + " y-value=" + str(sumsp['xydata'][str(m)])
                                  + " threshold=" + str(threshold))
                        remove = True

            # final decision: a group tagged for removal stays in the
            # candidates set; a group that is not tagged is eliminated as a
            # candidate
            if not remove:
                candidates.discard(c)

            # progress bar
            if not options.verbose:
                i += 1
                gcmstoolbox.printProgress(i, j)

        if options.verbose:
            print("candidates for removal:")
            if len(candidates) == 0:
                print(" none")
            else:
                print(tabulate(candidates))

    ### UPDATE GROUPS AND WRITE IT AS JSON

    if 'filters' not in data:
        data['filters'] = OrderedDict()
        newfilter = "F1"
    else:
        newfilter = "F" + str(len(data['filters']) + 1)

    data['filters'][newfilter] = OrderedDict()
    if c1:
        data['filters'][newfilter]['crit1'] = ", ".join(removegroups)
    if c2:
        data['filters'][newfilter]['crit2'] = str(options.count)
    if c3:
        data['filters'][newfilter]['crit3'] = (
            "m/z " + ", ".join(str(m) for m in options.mass)
            + "; " + str(options.percent) + "%; " + str(options.n))
    data['filters'][newfilter]['active'] = True
    data['filters'][newfilter]['out'] = sorted(candidates)

    print("\nFilter " + newfilter)
    print(" - initial number of groups: " + str(len(data['groups'])))
    print(" - number of removed groups: " + str(len(candidates)))
    print(" - number of retained groups: " + str(len(data['groups']) - len(candidates)))

    # combined effect of all filters that are currently enabled
    af = []
    ac = set()
    for fid, fdef in data['filters'].items():
        if fdef['active']:
            af.append(fid)
            ac.update(fdef['out'])

    print("\nAll active filters (" + ", ".join(af) + ")")
    print(" - initial number of groups: " + str(len(data['groups'])))
    print(" - number of removed groups: " + str(len(ac)))
    print(" - number of retained groups: " + str(len(data['groups']) - len(ac)))

    data['info']['mode'] = "filter"
    data["info"]["cmds"].append(cmd)
    gcmstoolbox.saveJSON(data, options.jsonout)  # backup and save json

    print(" => Finalised. Wrote " + options.jsonout + "\n")
    exit()
def main():
    """Command-line entry point of the EXPORT tool.

    Writes spectra from the GCMStoolbox JSON data file to a NIST MS SEARCH
    (.msp) file with ``writespectrum``. Depending on the mode, the exported
    spectra are all spectra, all components, or the spectra (plus the matching
    component, if any) of the groups named with -g. Afterwards the command
    line is appended to the 'cmds' list of the JSON output file.

    The function never returns normally: every path ends in ``exit()``.
    """
    print("\n*******************************************************************************")
    print("* GCMStoolbox - a set of tools for GC-MS data analysis *")
    print("* Version: {} ({}) *".format(gcmstoolbox.version, gcmstoolbox.date))
    print("* Author: Wim Fremout, Royal Institute for Cultural Heritage *")
    print("* Licence: GNU GPL version 3 *")
    print("* *")
    print("* EXPORT: *")
    print("* export the GCMStoolbox data file (JSON) into NIST MS SEARCH format (.msp) *")
    print("* *")
    print("*******************************************************************************\n")

    ### OPTIONPARSER

    usage = "usage: %prog [options] MSP_FILE"
    parser = OptionParser(
        usage,
        version="GCMStoolbox version " + gcmstoolbox.version + " (" + gcmstoolbox.date + ")\n")
    parser.add_option(
        "-v", "--verbose",
        help="Be very verbose [not default]",
        action="store_true", dest="verbose", default=False)
    parser.add_option(
        "-i", "--jsonin",
        help="JSON input file name [default: gcmstoolbox.json]",
        action="store", dest="jsonin", type="string", default="gcmstoolbox.json")
    parser.add_option(
        "-o", "--jsonout",
        help="JSON output file name [default: same as JSON input file]",
        action="store", dest="jsonout", type="string")
    parser.add_option(
        "-m", "--mode",
        help="Mode: auto|spectra|group|components [default:auto]",
        action="store", dest="mode", type="string", default="auto")
    parser.add_option(
        "-g", "--group",
        help="Group numbers to export in group mode; multiple instances can be defined",
        action="append", dest="group", type="string")

    (options, args) = parser.parse_args()

    ### ARGUMENTS AND OPTIONS

    cmd = " ".join(sys.argv)

    if options.verbose:
        print("Processing import files and options")

    # check MSP output file
    if len(args) == 0:
        print(" !! No MSP file name given\n")
        exit()
    elif len(args) != 1:
        print(" !! Too many arguments. Only one MSP file name can be created.\n")
        exit()
    else:
        mspfile = args[0]

    # check and read JSON input file
    data = gcmstoolbox.openJSON(options.jsonin)

    # json output defaults to the input file
    if options.jsonout is None:
        options.jsonout = options.jsonin

    if options.verbose:
        print(" => JSON input file: " + options.jsonin)
        print(" => JSON output file: " + options.jsonout)
        print(" => Output msp file: " + mspfile + "\n")

    ### MODE

    requested = options.mode.lower()
    if requested.startswith('a'):
        # auto: follow the mode of the data file; a filtered file is exported
        # like a grouped one
        mode = data['info']['mode']
        if mode == 'filter':
            mode = 'group'
    elif requested.startswith('s'):
        mode = 'spectra'
    elif requested.startswith('g'):
        mode = 'group'
        if data['info']['mode'] == 'spectra':
            print(" !! No groups defined - run groups.py first\n")
            exit()
    elif requested.startswith('c'):
        mode = 'components'
        if data['info']['mode'] != 'components':
            print(" !! No components defined - run componentlib.py first\n")
            exit()
    else:
        print(" !! Unknown mode (possible modes are 'auto', 'spectra', 'group' and 'components'\n")
        exit()

    # options.group is None (not an empty list) when no -g was given, and
    # group mode can also be reached through auto mode, so test it here.
    if mode == 'group' and not options.group:
        print(" !! Group mode requires at least one group (-g)\n")
        exit()

    print("Mode: " + mode)

    ### WRITE FILE

    print("\nProcessing mass spectra")

    # make list of spectra to be added
    splist = OrderedDict()
    if (mode == "spectra") or (mode == "components"):
        splist = data[mode]
    elif mode == "group":
        for g in options.group:
            # accept both "12" and "G12", like the filter tool does
            gname = str(g).upper()
            if not gname.startswith('G'):
                gname = 'G' + gname

            if gname in data['groups']:
                # add original spectra to splist
                for s in data['groups'][gname]['spectra']:
                    splist[s] = data['spectra'][s]

                # if a component exists with a sumspectrum, add this.
                if 'components' in data:
                    for c in data['components']:
                        if data['components'][c]['Group'] == gname:
                            splist[c] = data['components'][c]
                            break
            else:
                print(" !! " + gname + " was not found.")

    with open(mspfile, "w") as fh:
        # init progress bar
        if not options.verbose:
            j = 0
            k = len(splist)
            gcmstoolbox.printProgress(j, k)

        for name, spectrum in splist.items():
            writespectrum(fh, mspfile, name, spectrum, options.verbose)

            # adjust progress bar
            if not options.verbose:
                j += 1
                gcmstoolbox.printProgress(j, k)

    print("\n => Wrote {}\n".format(mspfile))

    ### TRACE IN JSON FILE

    print("\nPut a trace in the JSON output file: " + options.jsonout + "\n")
    # reread the file to be sure we haven't accidentally messed up the data
    data = gcmstoolbox.openJSON(options.jsonin)
    data['info']['cmds'].append(cmd)  # put a trace in the data file
    gcmstoolbox.saveJSON(data, options.jsonout)  # backup and save json

    exit()
def main():
    """Command-line entry point of the REPORT tool.

    Generates a CSV report of a component library. For every component the
    mean 'IS' value of its spectra is reported per category, where the
    category of a spectrum is the value of the --groupby field ('unknown'
    when the spectrum lacks that field; an 'IS' of 1 is used when a spectrum
    has no 'IS'). Three header rows give, per category, the summed IS divided
    by the number of sources, the number of spectra and the number of sources.
    Afterwards the command line is appended to the 'cmds' list of the JSON
    output file.

    The function never returns normally: every path ends in ``exit()``.
    """
    print("\n*******************************************************************************")
    print("* GCMStoolbox - a set of tools for GC-MS data analysis *")
    print("* Version: {} ({}) *".format(gcmstoolbox.version, gcmstoolbox.date))
    print("* Author: Wim Fremout, Royal Institute for Cultural Heritage *")
    print("* Licence: GNU GPL version 3 *")
    print("* *")
    print("* REPORT *")
    print("* Generate CSV report of a component library *")
    print("* *")
    print("*******************************************************************************\n")

    ### OPTIONPARSER

    usage = "usage: %prog [options] REPORT_CSV"
    parser = OptionParser(
        usage,
        version="GCMStoolbox version " + gcmstoolbox.version + " (" + gcmstoolbox.date + ")\n")
    parser.add_option(
        "-v", "--verbose",
        help="Be very verbose",
        action="store_true", dest="verbose", default=False)
    parser.add_option(
        "-i", "--jsonin",
        help="JSON input file name [default: gcmstoolbox.json]",
        action="store", dest="jsonin", type="string", default="gcmstoolbox.json")
    parser.add_option(
        "-o", "--jsonout",
        help="JSON output file name [default: same as JSON input file]",
        action="store", dest="jsonout", type="string")
    parser.add_option(
        "-g", "--groupby",
        help="Group measurements by categories (eg. Source, Sample, AAdays, Resin...)",
        action="store", dest="groupby", type="string", default="Source")

    (options, args) = parser.parse_args()

    ### ARGUMENTS

    cmd = " ".join(sys.argv)

    if options.verbose:
        print("Processing arguments...")

    # output file
    if len(args) == 0:
        print("\n!! Needs a file name for the CSV report")
        exit()
    elif len(args) == 1:
        outfile = args[0]
    else:
        print("\n!! Too many arguments")
        exit()

    # check and read JSON input file
    data = gcmstoolbox.openJSON(options.jsonin)
    if data['info']['mode'] != "components":
        print("\n!! Reports can only be generated if the components have been built.")
        exit()

    # json output defaults to the input file
    if options.jsonout is None:
        options.jsonout = options.jsonin

    if options.verbose:
        print(" => JSON input file: " + options.jsonin)
        print(" => JSON output file: " + options.jsonout)
        # this tool writes a CSV report; its name is held in outfile
        print(" => Output CSV file: " + outfile + "\n")

    ### READ COMPONENTS

    print("\nRunning through components...")
    report = []

    if not options.verbose:
        i = 0
        j = len(data['components'])
        gcmstoolbox.printProgress(i, j)

    for c in data['components']:
        categories = OrderedDict()
        component = data['components'][c]

        # check all spectra of a component and search for the group-by categories
        for s in component['Spectra']:
            spectrum = data['spectra'][s]

            # lookup category in spectrum (or default to unknown)
            if options.groupby in spectrum:
                cat = spectrum[options.groupby]
            else:
                cat = 'unknown'

            # spectrumIS
            if 'IS' in spectrum:
                spectrumIS = int(spectrum['IS'])
            else:
                spectrumIS = 1

            # store IS and count in categories
            if cat not in categories:
                categories[cat] = OrderedDict([('sumIS', spectrumIS), ('count', 1)])
            else:
                categories[cat]['sumIS'] += spectrumIS
                categories[cat]['count'] += 1

        # divide sumIS by the number of spectra; the mean is what we need to
        # report, so sumIS and count can be overwritten
        for cat in categories:
            meanIS = categories[cat]['sumIS'] // categories[cat]['count']  # integer division!
            categories[cat] = meanIS

        # prepare report line for this component
        reportline = [
            "C" + component['DB#'],    # column A: component number
            len(component['Spectra']), # column B: number of spectra on which this component was calculated
            component['RI'],           # column C: component RI
            component['dRI'],          # column D: RI difference within the component
            categories                 # ordereddict with category -> mean intensities
        ]
        report.append(reportline)

        # update progress bar
        if options.verbose:
            print(" - " + c)
        else:
            i += 1
            gcmstoolbox.printProgress(i, j)

    ### CALCULATE SUM-IS
    # the sum-IS is the sum of all spectra of a given source file
    # in case a category is composed of multiple source files, the sum-IS is
    # the average (sum of the IS values of all spectra within this category,
    # divided by the number of sources)

    print("\nCalculate IS for each " + options.groupby + "...")
    if not options.verbose:
        i = 0
        j = len(data['spectra'])
        gcmstoolbox.printProgress(i, j)

    # compile a list of all group-by categories
    categories = set()
    for line in report:
        categories.update(line[4].keys())
    categories = sorted(categories)  # convert to sorted list

    # calculate sumIS and count for each category
    catIS = dict()
    catSpectra = dict()
    catSources = dict()
    for spectrum in data['spectra'].values():
        if options.groupby in spectrum:
            cat = spectrum[options.groupby]
        else:
            cat = 'unknown'

        # spectrumIS
        if 'IS' in spectrum:
            spectrumIS = int(spectrum['IS'])
        else:
            spectrumIS = 1

        # store IS and count in categories
        if cat not in catIS:
            catIS[cat] = spectrumIS
            catSpectra[cat] = 1
            catSources[cat] = set()
        else:
            catIS[cat] += spectrumIS
            catSpectra[cat] += 1
        # Spectra without a 'Source' field are counted together as one
        # 'unknown' source instead of aborting the report with a KeyError.
        catSources[cat].add(spectrum.get('Source', 'unknown'))

        # update progress bar
        if options.verbose:
            print(" - S{}: category {} (#{})--> added {} to summed IS".format(
                spectrum['DB#'], cat, catSpectra[cat], spectrumIS))
        else:
            i += 1
            gcmstoolbox.printProgress(i, j)

    # calculate mean IS
    for cat in categories:
        # count sources per category
        catSources[cat] = len(catSources[cat])
        # calculate average sum-IS
        catIS[cat] = catIS[cat] // catSources[cat]

    ### MAKE REPORT

    print("\nGenerating report...")
    if not options.verbose:
        i = 0
        j = len(report)
        gcmstoolbox.printProgress(i, j)

    # write report file
    with open(outfile, 'w', newline='') as fh:
        mkreport = csv.writer(fh, dialect='excel')

        # write header rows
        mkreport.writerow(["component", "number of spectra", "RI", "dRI"] + categories)
        mkreport.writerow(["(average sum-IS)", "", "", ""]
                          + [catIS[cat] for cat in categories])
        mkreport.writerow(["(number of spectra)", "", "", ""]
                          + [catSpectra[cat] for cat in categories])
        mkreport.writerow(["(number of sources)", "", "", ""]
                          + [catSources[cat] for cat in categories])

        # next rows: components
        for row in report:
            # the last item in a report row is a dict of categories and mean
            # IS; replace it with a complete and sorted list of mean IS'es
            # (empty cell for a category the component does not occur in)
            rowIS = row.pop()
            for cat in categories:
                if cat in rowIS:
                    row.append(rowIS[cat])
                else:
                    row.append("")

            # write row to report
            mkreport.writerow(row)

            if not options.verbose:
                i += 1
                gcmstoolbox.printProgress(i, j)

    print("\n => Wrote {}\n".format(outfile))

    ### TRACE IN JSON FILE

    print("\nPut a trace in the JSON output file: " + options.jsonout + "\n")
    # reread the file to be sure we haven't accidentally messed up the data
    data = gcmstoolbox.openJSON(options.jsonin)
    data['info']['cmds'].append(cmd)  # put a trace in the data file
    gcmstoolbox.saveJSON(data, options.jsonout)  # backup and save json

    exit()
def _order_groups_by_ri(groups, excluded):
    """Return the ids of the groups to build, ordered by ascending 'minRI'.

    groups   -- mapping of group id -> group dict (data['groups'])
    excluded -- collection of group ids that must be left out

    Groups that carry a 'minRI' value come first, sorted on float(minRI);
    groups with equal values keep the order in which they occur in `groups`.
    Groups without 'minRI' follow at the back, also in their original order.
    """
    with_ri = []
    without_ri = []
    for gid, group in groups.items():
        if gid in excluded:
            continue
        if 'minRI' in group:
            with_ri.append((float(group['minRI']), gid))
        else:
            without_ri.append(gid)

    # list.sort is stable, so sorting on the RI alone gives the same order as
    # inserting every group behind all entries with a lower or equal RI
    with_ri.sort(key=lambda pair: pair[0])
    return [gid for _, gid in with_ri] + without_ri


def _condense_aadays(values):
    """Condense AAdays values into a compact, comma separated string.

    values -- iterable of values that int() accepts

    Runs of consecutive days from the series 0, 2, 4, 8, 16, 32, 64 are
    written as "low-high" (0,2,4,8,32 becomes "0-8,32"). Values that are not
    part of the series are appended afterwards in ascending order.
    """
    series = (0, 2, 4, 8, 16, 32, 64)
    remaining = sorted(int(v) for v in values)
    parts = []
    for pos, low in enumerate(series):
        if low not in remaining:
            continue
        # lower limit of a sequence
        remaining.remove(low)
        high = None
        for candidate in series[pos + 1:]:
            if candidate not in remaining:
                break
            # higher limit of the sequence (so far)
            remaining.remove(candidate)
            high = candidate
        if high is None:
            parts.append(str(low))
        else:
            parts.append("{}-{}".format(low, high))

    # possible AAdays values other than 0,2,4,8...
    parts.extend(str(v) for v in remaining)
    return ",".join(parts)


def _build_component(gid, group, spectra, number, highest):
    """Build the component spectrum for one group; return (name, spectrum).

    gid     -- id of the group, stored in the component as 'Group'
    group   -- group dict; group['spectra'] lists the ids of its spectra
    spectra -- mapping of spectrum id -> spectrum dict (data['spectra'])
    number  -- component number, used in the name and stored as 'DB#'
    highest -- passed on as `highest` to gcmstoolbox.sumspectrum

    Raises IndexError when the group lists no spectra at all.
    """
    groupspectra = [spectra[s] for s in group['spectra']]

    # if more than one spectrum, make sumspectrum; a single spectrum is copied
    # so the metadata changes below do not touch the original
    if len(groupspectra) > 1:
        sp = gcmstoolbox.sumspectrum(*groupspectra, highest=highest)
    else:
        sp = deepcopy(groupspectra[0])

    # rebuild the spectra metadata
    name = "C{} RI{}".format(number, round(float(sp['RI'])))
    sp['DB#'] = str(number)
    sp['Group'] = gid
    sp['Spectra'] = group['spectra']

    for item in ("Source", "Sample", "Resin", "AAdays", "Color", "PyTemp"):
        values = {s[item] for s in groupspectra if item in s}
        if not values:
            continue

        # store as list in component
        sp[item] = sorted(values)

        # and add it to the component name (Source and Sample are stored only)
        if item == "AAdays":
            name += " " + _condense_aadays(values) + "d"
        elif item == "Color":
            name += " " + "/".join(sorted(values))
        elif item not in ("Source", "Sample"):
            name += " " + "-".join(sorted(values))

    # crop name (longer names cause problems in Amdis)
    return name[:77], sp


def main():
    """Command-line entry point of the BUILD tool: build the component spectra.

    Reads the JSON input file, collects the groups listed in the 'out' sets
    of all active filters, and builds one component for every remaining
    group (ordered on minRI). The components are stored in
    data['components'], the mode is set to "components", the command line is
    appended to data['info']['cmds'] and the result is saved to the JSON
    output file.

    The function never returns: every path ends in sys.exit() with the
    default exit status, including the paths that report a problem
    (unchanged from the previous behaviour).
    """
    print("\n*******************************************************************************")
    print("* GCMStoolbox - a set of tools for GC-MS data analysis *")
    print("* Version: {} ({}) *".format(gcmstoolbox.version, gcmstoolbox.date))
    print("* Author: Wim Fremout, Royal Institute for Cultural Heritage *")
    print("* Licence: GNU GPL version 3 *")
    print("* *")
    print("* BUILD *")
    print("* Builds the component spectra *")
    print("* *")
    print("*******************************************************************************\n")

    ### OPTIONPARSER

    usage = "usage: %prog [options]"

    parser = OptionParser(usage, version="GCMStoolbox version " + gcmstoolbox.version + " (" + gcmstoolbox.date + ")\n")
    parser.add_option("-v", "--verbose", help="Be very verbose", action="store_true", dest="verbose", default=False)
    parser.add_option("-i", "--jsonin", help="JSON input file name [default: gcmstoolbox.json]", action="store", dest="jsonin", type="string", default="gcmstoolbox.json")
    parser.add_option("-o", "--jsonout", help="JSON output file name [default: same as JSON input file]", action="store", dest="jsonout", type="string")
    parser.add_option("-c", "--cnumber", help="Start number for component numbers", action="store", dest="c", type="int", default=1)
    parser.add_option("-p", "--preserve", help="Preserve group numbers", action="store_true", dest="preserve", default=False)
    parser.add_option("-s", "--sum", help="Calculate sumspectra with the N spectra with highest signal, 0 for all [default: 0]", action="store", dest="n", type="int", default=0)

    (options, args) = parser.parse_args()

    ### ARGUMENTS

    cmd = " ".join(sys.argv)

    if options.verbose:
        print("Processing arguments...")

    # this tool takes options only, no positional arguments
    if len(args) != 0:
        print("\n!! Too many arguments")
        sys.exit()

    # check and read JSON input file
    data = gcmstoolbox.openJSON(options.jsonin)
    if data['info']['mode'] == 'spectra':
        print("\n!! Cannot build components using ungrouped spectra.")
        sys.exit()

    # json output: defaults to the input file
    if options.jsonout is None:
        options.jsonout = options.jsonin

    if options.verbose:
        print(" => JSON input file: " + options.jsonin)
        print(" => JSON output file: " + options.jsonout + "\n")

    # preserve and c number flags cannot be used together
    if options.preserve and (options.c != 1):
        print("\n!! The options -c (--cnumber) and -p (--preserve) cannot be used together.")
        sys.exit()

    ### APPLY ACTIVE FILTERS

    print("\nApply filters...")
    if not options.verbose:
        i = 0
        j = len(data['filters'])
        gcmstoolbox.printProgress(i, j)

    # ids of the groups that are filtered out by at least one active filter
    out = set()
    for fid, f in data['filters'].items():
        if f['active']:
            out.update(f['out'])
            if options.verbose:
                print(" - add " + fid)

        if not options.verbose:
            i += 1
            gcmstoolbox.printProgress(i, j)

    ### BUILD COMPONENTS

    print("\nBuild components...")
    # i is used both for the progress bar and for the component number
    # (i + options.c, if options.preserve is false)
    i = 0
    data['components'] = OrderedDict()

    # components are built in order of RI
    groups = _order_groups_by_ri(data['groups'], out)

    # init progress bar
    if not options.verbose:
        j = len(groups)
        gcmstoolbox.printProgress(i, j)

    # build components from the groups
    for g in groups:
        # group or component numbering
        if options.preserve:
            c = int(g.replace('G', ''))
        else:
            c = i + options.c

        name, sp = _build_component(g, data['groups'][g], data['spectra'], c, options.n)

        # add to data
        # (no link to the component is stored in the group itself: it would
        # remain present when components are built multiple times and thus
        # become ambiguous)
        data['components'][name] = sp

        i += 1

        # update progress bar
        if options.verbose:
            print(" - " + name)
        else:
            gcmstoolbox.printProgress(i, j)

    ### SAVE OUTPUT JSON

    print("\nSaving data...")
    data['info']['mode'] = "components"
    data["info"]["cmds"].append(cmd)
    gcmstoolbox.saveJSON(data, options.jsonout)  # back up and save the JSON file

    print(" => Wrote " + options.jsonout + "\n")
    sys.exit()