def read_expr_list(file_name):
	"""
	Reads the set of experiment files and returns a list of
	:class:`pyms.Experiment.Experiment` objects.

	:param file_name: The name of the file which lists experiment dump file names, one file per line
	:type file_name: str or pathlib.Path

	:return: A list of Experiment instances
	:rtype: list of pyms.Experiment.Experiment

	:author: Vladimir Likic
	"""

	if not is_path(file_name):
		raise TypeError("'file_name' must be a string or a PathLike object")

	file_name = prepare_filepath(file_name, mkdirs=False)

	# 'with' guarantees the list file is closed even if reading raises,
	# unlike the previous explicit open()/close() pair.
	with file_name.open() as fp:
		exprfiles = fp.readlines()

	# Load each experiment dump named in the list file (one name per line).
	return [load_expr(exprfile.strip()) for exprfile in exprfiles]
def load_peaks(file_name: Union[str, pathlib.Path]) -> List[Peak]:
	"""
	Loads the peak_list stored with 'store_peaks'

	:param file_name: File name of peak list
	:type file_name: str or os.PathLike

	:return: The list of Peak objects
	:rtype: :class:`list` of :class:`pyms.Peak.Class.Peak`

	:author: Andrew Isaac
	:author: Dominic Davis-Foster (pathlib support)
	"""

	if not is_path(file_name):
		raise TypeError("'file_name' must be a string or a PathLike object")

	file_name = prepare_filepath(file_name, mkdirs=False)

	# Context manager closes the file even if unpickling raises.
	# NOTE(review): return annotation corrected from ``Peak`` to
	# ``List[Peak]`` — the function returns the whole list, as the
	# docstring and the isinstance checks below show.
	with file_name.open('rb') as fp:
		peak_list = pickle.load(fp)

	if not is_sequence(peak_list):
		raise IOError("The selected file is not a List")
	if not len(peak_list) > 0 or not isinstance(peak_list[0], Peak):
		raise IOError("The selected file is not a list of Peak objects")

	return peak_list
def load_expr(file_name):
	"""
	Loads an experiment saved with :meth:`pyms.Experiment.store_expr`

	:param file_name: Experiment file name
	:type file_name: str or pathlib.Path

	:return: The loaded experiment
	:rtype: pyms.Experiment.Experiment

	:author: Vladimir Likic
	:author: Andrew Isaac
	:author: Dominic Davis-Foster (type assertions and pathlib support)
	"""

	if not is_path(file_name):
		raise TypeError("'file_name' must be a string or a PathLike object")

	file_name = prepare_filepath(file_name, mkdirs=False)

	# Context manager ensures the dump file is closed even if
	# pickle.load raises (the old open()/close() pair leaked on error).
	with file_name.open('rb') as fp:
		expr = pickle.load(fp)

	# IOError kept (not TypeError) for backward compatibility with callers.
	if not isinstance(expr, Experiment):
		raise IOError("The loaded file is not an experiment file")

	return expr
def file2matrix(file_name):
	"""
	Convert a .csv file to a numpy array

	:param file_name: Filename (.csv) to convert (area.csv, area_ci.csv)
	:type file_name: str or pathlib.Path

	:return: Data matrix
	:rtype: :class:`numpy.array`

	:author: Jairus Bowne
	:author: Sean O'Callaghan
	:author: Dominic Davis-Foster (pathlib support)
	"""

	if not is_path(file_name):
		raise TypeError("'file_name' must be a string or a PathLike object")

	file_name = prepare_filepath(file_name, mkdirs=False)

	matrix = []

	with file_name.open() as fp:
		reader = csv.reader(fp, delimiter=",", quotechar='"')

		for row in reader:
			newrow = []
			for each in row:
				try:
					each = float(each)
				except ValueError:
					# Non-numeric cells (e.g. header text) are kept as
					# strings.  The previous bare ``except:`` also swallowed
					# KeyboardInterrupt etc.; csv yields str cells, so
					# float() can only raise ValueError here.
					pass
				newrow.append(each)
			matrix.append(newrow)

	return numpy.array(matrix)
def store_peaks(peak_list: List[Peak], file_name: Union[str, pathlib.Path], protocol=1):
	"""
	Store the list of peak objects

	:param peak_list: A list of peak objects
	:type peak_list: list of :class:`pyms.Peaks.Class.Peak`
	:param file_name: File name to store peak list
	:type file_name: str or os.PathLike
	:param protocol: The pickle protocol to use. Default ``1``
	:type protocol: int, optional

	:author: Andrew Isaac
	:author: Dominic Davis-Foster (type assertions and pathlib support)
	"""

	if not is_peak_list(peak_list):
		raise TypeError("'peak_list' must be a list of Peak objects")

	if not is_path(file_name):
		raise TypeError("'file_name' must be a string or a PathLike object")

	file_name = prepare_filepath(file_name)

	# NOTE(review): annotation corrected from ``Peak`` to ``List[Peak]``,
	# matching the is_peak_list() check above.  'with' closes the file
	# even if pickling raises, so a partial dump never leaks the handle.
	with file_name.open('wb') as fp:
		pickle.dump(peak_list, fp, protocol)
def write(self, file_name: PathLike, minutes: bool = False, formatting: bool = True):
	"""
	Writes the ion chromatogram to the specified file.

	:param file_name: The name of the output file
	:param minutes: A boolean value indicating whether to write time in minutes
	:param formatting: Whether to format the numbers in the output.

	:authors: Lewis Lee, Vladimir Likic, Dominic Davis-Foster (pathlib support)
	"""

	if not is_path(file_name):
		raise TypeError("'file_name' must be a string or a PathLike object")

	file_name = prepare_filepath(file_name)

	# Work on a deep copy so the chromatogram's own time list is untouched.
	times = copy.deepcopy(self._time_list)
	if minutes:
		times = [t / 60.0 for t in times]

	with file_name.open('w', encoding="UTF-8") as fp:
		for idx, t in enumerate(times):
			intensity = self._intensity_array[idx]
			if formatting:
				fp.write(f"{t:8.4f} {intensity:#.6e}\n")
			else:
				fp.write(f"{t} {intensity}\n")
def write_intensities_stream(self, file_name: PathLike):
	"""
	Loop over all scans and, for each scan, write the intensities to the
	given file, one intensity per line.

	Intensities from different scans are joined without any delimiters.

	:param file_name: Output file name.

	:authors: Vladimir Likic, Dominic Davis-Foster (pathlib support)
	"""  # noqa: D400

	if not is_path(file_name):
		raise TypeError("'file_name' must be a string or a PathLike object")

	file_name = prepare_filepath(file_name)

	print(" -> Writing scans to a file")

	# 'with' replaces the explicit open()/close() pair so the handle is
	# released even if a write fails partway through.
	with file_name.open('w', encoding="UTF-8") as fp:
		for scan in self._scan_list:
			for intensity in scan.intensity_list:
				fp.write(f"{intensity:8.4f}\n")
def from_jcamp(cls, file_name):
	"""
	Create a MassSpectrum from a JCAMP-DX file

	:param file_name: Path of the file to read
	:type file_name: str or os.PathLike

	:return: MassSpectrum
	:rtype: :class:`pyms.Spectrum.MassSpectrum`

	:authors: Qiao Wang, Andrew Isaac, Vladimir Likic, David Kainer,
		Dominic Davis-Foster
	"""

	if not is_path(file_name):
		raise TypeError("'file_name' must be a string or a PathLike object")

	file_name = prepare_filepath(file_name, mkdirs=False)
	print(f" -> Reading JCAMP file '{file_name}'")

	xydata = []
	last_tag = None

	# Fix: the original opened the file and never closed it; the context
	# manager releases the handle as soon as parsing finishes.
	with file_name.open('r') as lines_list:
		for line in lines_list:
			if line.strip():
				if line.startswith("##"):
					# ``##NAME=value`` — a key word or information field
					fields = line.split('=', 1)
					current_tag = fields[0] = fields[0].lstrip("##").upper()
					last_tag = fields[0]

					if current_tag.upper().startswith("END"):
						break
				else:
					# Data line: only collect values that follow an
					# XY-data tag (comma- or space-separated floats).
					if last_tag in xydata_tags:
						line_sub = re.split(r",| ", line.strip())
						for item in line_sub:
							if not len(item.strip()) == 0:
								xydata.append(float(item.strip()))

	# By this point we should have all of the xydata
	if len(xydata) % 2 == 1:
		# TODO: This means the data is not in x, y pairs
		#  Make a better error message
		raise ValueError("data not in pair !")

	# De-interleave the flat [m/z, intensity, m/z, intensity, ...] stream.
	mass_list = []
	intensity_list = []
	for i in range(len(xydata) // 2):
		mass_list.append(xydata[i * 2])
		intensity_list.append(xydata[i * 2 + 1])

	return cls(mass_list, intensity_list)
def export_leco_csv(self, file_name):
	"""
	Exports data in LECO CSV format

	:param file_name: The name of the output file
	:type file_name: str or pathlib.Path

	:authors: Andrew Isaac, Vladimir Likic, Dominic Davis-Foster (pathlib support)
	"""

	if not is_path(file_name):
		raise TypeError("'file_name' must be a string or a PathLike object")

	file_name = prepare_filepath(file_name, mkdirs=False)

	if not file_name.parent.is_dir():
		file_name.parent.mkdir(parents=True)

	mass_list = self._mass_list
	time_list = self._time_list
	vals = self._intensity_array

	# Format is text header with:
	# "Scan","Time",...
	# and the rest is "TIC" or m/z as text, i.e. "50","51"...
	# The following lines are:
	# scan_number,time,value,value,...
	# scan_number is an int, rest seem to be fixed format floats.
	# The format is 0.000000e+000

	# 'with' closes the file even if a TypeError is raised mid-write.
	with file_name.open("w") as fp:
		# write header
		fp.write("\"Scan\",\"Time\"")
		for ii in mass_list:
			if isinstance(ii, Number):
				fp.write(f",\"{int(ii):d}\"")
			else:
				raise TypeError("mass list datum not a number")
		fp.write("\r\n")  # windows CR/LF

		# write lines
		for ii, time_ in enumerate(time_list):
			fp.write(f"{ii},{time_:#.6e}")
			for jj in range(len(vals[ii])):
				if isinstance(vals[ii][jj], Number):
					fp.write(f",{vals[ii][jj]:#.6e}")
				else:
					raise TypeError("datum not a number")
			fp.write("\r\n")
def write(self, file_root):
	"""
	Writes the entire raw data to two CSV files:

	- 'file_root'.I.csv, containing the intensities; and
	- 'file_root'.mz.csv, containing the corresponding m/z values.

	In general these are not two-dimensional matrices, because different
	scans may have different numbers of m/z values recorded.

	:param file_root: The root for the output file names
	:type file_root: str or pathlib.Path

	:author: Vladimir Likic
	:author: Dominic Davis-Foster (pathlib support)
	"""

	if not isinstance(file_root, (str, pathlib.Path)):
		raise TypeError("'file_root' must be a string or a pathlib.Path object")

	file_root = prepare_filepath(file_root)

	file_name1 = str(file_root) + ".I.csv"
	file_name2 = str(file_root) + ".mz.csv"

	print(f" -> Writing intensities to '{file_name1}'")
	print(f" -> Writing m/z values to '{file_name2}'")

	# Both handles are managed together so neither leaks if a write fails.
	with open(file_name1, "w") as fp1, open(file_name2, "w") as fp2:
		for scan in self._scan_list:
			# Comma-separate values within a scan; one scan per row.
			for index, intensity in enumerate(scan.intensity_list):
				if index == 0:
					fp1.write(f"{intensity:.4f}")
				else:
					fp1.write(f",{intensity:.4f}")
			fp1.write("\n")

			for index, mass in enumerate(scan.mass_list):
				if index == 0:
					fp2.write(f"{mass:.4f}")
				else:
					fp2.write(f",{mass:.4f}")
			fp2.write("\n")
def export_ascii(
		self,
		root_name: PathLike,
		fmt: AsciiFiletypes = AsciiFiletypes.ASCII_DAT,
		):
	"""
	Exports the intensity matrix, retention time vector, and m/z vector to the ascii format.

	By default, export_ascii("NAME") will create NAME.im.dat, NAME.rt.dat, and
	NAME.mz.dat where these are the intensity matrix, retention time vector, and
	m/z vector in tab delimited format.

	If ``format`` == ``<AsciiFiletypes.ASCII_CSV>``, the files will be in the CSV
	format, named NAME.im.csv, NAME.rt.csv, and NAME.mz.csv.

	:param root_name: Root name for the output files
	:param fmt: Format of the output file, either ``<AsciiFiletypes.ASCII_DAT>``
		or ``<AsciiFiletypes.ASCII_CSV>``

	:authors: Milica Ng, Andrew Isaac, Vladimir Likic, Dominic Davis-Foster (pathlib support)
	"""

	if not is_path(root_name):
		raise TypeError("'root_name' must be a string or a pathlib.Path object")

	root_name = prepare_filepath(root_name, mkdirs=True)
	fmt = AsciiFiletypes(fmt)

	if fmt is AsciiFiletypes.ASCII_DAT:
		separator, extension = ' ', "dat"
	elif fmt is AsciiFiletypes.ASCII_CSV:
		separator, extension = ',', "csv"

	# Write the three components in turn: the 2D intensity matrix, the
	# m/z vector (rows of the matrix), and the retention-time vector
	# (columns of the matrix).
	for suffix, payload in (
			("im", self._intensity_array),
			("mz", self._mass_list),
			("rt", self._time_list),
			):
		save_data(f"{root_name}.{suffix}.{extension}", payload, sep=separator)
def write_filled_csv(
		sample_list: List[Sample],
		area_file: PathLike,
		filled_area_file: PathLike,
		):
	r"""
	Creates a new ``area_ci.csv`` file, replacing NAs with values from the
	sample_list objects where possible.

	:param sample_list:
	:param area_file: The file ``'area_ci.csv'`` from PyMassSpec output.
	:param filled_area_file: the new output file which has ``'NA'``\s values replaced.

	:authors: Jairus Bowne, Sean O'Callaghan, Dominic Davis-Foster
	"""

	if not is_path(filled_area_file):
		raise TypeError("'filled_area_file' must be a string or a pathlib.Path object")

	filled_area_file = prepare_filepath(filled_area_file)

	df = file2dataframe(area_file)

	uid_list: List[str] = df["UID"]

	# Retention time is encoded as the last '-'-separated field of the UID.
	rt_list: List[float] = [float(uid.split('-')[-1]) for uid in uid_list]

	for sample_name in df.columns[3:]:
		for sample in sample_list:
			if sample_name in sample.name:
				rt_area_dict = sample.rt_areas
				break
		else:
			raise ValueError(f"Sample {sample_name!r} not found in sample_list.")

		for i, part in enumerate(df[sample_name]):
			if part == "NA":
				try:
					# Fix: use .at instead of chained indexing
					# (df[sample_name][i] = ...), which triggers pandas'
					# SettingWithCopyWarning and may silently write to a copy.
					df.at[i, sample_name] = rt_area_dict[rt_list[i]]
				except KeyError:
					# No recorded area for this retention time; leave as NA.
					pass

	df.to_csv(filled_area_file, index=False, na_rep="NA")
def store(self, file_name):
	"""
	Stores an experiment to a file.

	:param file_name: The name of the file
	:type file_name: str or os.PathLike

	:author: Vladimir Likic, Andrew Isaac, Dominic Davis-Foster (pathlib support)
	"""

	if not is_path(file_name):
		raise TypeError("'file_name' must be a string or a PathLike object")

	file_name = prepare_filepath(file_name)

	# 'with' closes the dump file even if pickling raises; protocol 1 is
	# kept for backward compatibility with existing stored experiments.
	with file_name.open('wb') as fp:
		pickle.dump(self, fp, 1)
def load_expr(file_name: PathLike) -> Experiment:
	"""
	Loads an experiment saved with :meth:`pyms.Experiment.Experiment.dump`.

	:param file_name: Experiment file name.

	:return: The loaded experiment.

	:author: Vladimir Likic, Andrew Isaac, Dominic Davis-Foster (type assertions and pathlib support)
	"""

	if not is_path(file_name):
		raise TypeError("'file_name' must be a string or a PathLike object")

	expr_path = prepare_filepath(file_name, mkdirs=False)
	loaded = _pickle_load_path(expr_path)

	# Anything other than an Experiment means the wrong file was supplied.
	if not isinstance(loaded, Experiment):
		raise TypeError("The loaded file is not an experiment file")

	return loaded
def dump(self, file_name: Union[str, pathlib.Path], protocol: int = 3):
	"""
	Dumps an object to a file through :func:`pickle.dump()`

	:param file_name: Filename to save the dump as
	:type file_name: str or os.PathLike
	:param protocol: The pickle protocol to use. Default ``3``
	:type protocol: int, optional

	:authors: Vladimir Likic, Dominic Davis-Foster (pathlib and pickle protocol support)
	"""

	if not is_path(file_name):
		raise TypeError("'file_name' must be a string or a PathLike object")

	file_name = prepare_filepath(file_name)

	# 'with' ensures the file is closed even if pickling fails partway.
	with file_name.open('wb') as fp:
		pickle.dump(self, fp, protocol=protocol)
def file2dataframe(file_name: PathLike) -> pandas.DataFrame:
	"""
	Convert a .csv file to a pandas DataFrame.

	:param file_name: CSV file to read.

	:authors: Jairus Bowne, Sean O'Callaghan, Dominic Davis-Foster (pathlib support)

	.. versionadded:: 2.3.0
	"""

	if not is_path(file_name):
		raise TypeError("'file_name' must be a string or a PathLike object")

	csv_path = prepare_filepath(file_name, mkdirs=False)

	# header=0 treats the first row as column names.
	return pandas.read_csv(csv_path, delimiter=',', quotechar='"', header=0)
def dump(self, file_name, protocol=3):
	"""
	Dumps an object to a file through pickle.dump()

	:param file_name: Name of the file for the object dump
	:type file_name: str or pathlib.Path
	:param protocol: The pickle protocol to use. Default 3
	:type protocol: int, optional

	:author: Vladimir Likic
	:author: Dominic Davis-Foster (pathlib and pickle protocol support)
	"""

	if not is_path(file_name):
		raise TypeError("'file_name' must be a string or a PathLike object")

	file_name = prepare_filepath(file_name)

	# 'with' ensures the file is closed even if pickling fails partway.
	with file_name.open('wb') as fp:
		pickle.dump(self, fp, protocol=protocol)
def import_leco_csv(file_name):
	"""
	Imports data in LECO CSV format

	:param file_name: Path of the file to read
	:type file_name: str or pathlib.Path

	:return: Data as an IntensityMatrix
	:rtype: pyms.IntensityMatrix.IntensityMatrix

	:authors: Andrew Isaac, Dominic Davis-Foster (pathlib support)
	"""

	if not is_path(file_name):
		raise TypeError("'file_name' must be a string or a PathLike object")

	file_name = prepare_filepath(file_name, mkdirs=False)

	data = []
	time_list = []
	mass_list = []

	# Format is text header with:
	# "Scan","Time",...
	# and the rest is "TIC" or m/z as text, i.e. "50","51"...
	# The following lines are:
	# scan_number,time,value,value,...
	# scan_number is an int, rest seem to be fixed format floats.
	# The format is 0.000000e+000

	num_mass = 0
	FIRST = True
	HEADER = True
	data_col = -1
	time_col = -1

	# Fix: the original opened the file and never closed it; the context
	# manager releases the handle once the parse loop completes.
	with file_name.open('r') as lines_list:
		# get each line
		for line in lines_list:
			cols = -1
			data_row = []
			if len(line.strip()) > 0:
				data_list = line.strip().split(',')

				# get each value in line
				for item in data_list:
					item = item.strip()
					item = item.strip('\'"')  # remove quotes (in header)

					# Get header
					if HEADER:
						cols += 1
						if len(item) > 0:
							if item.lower().find("time") > -1:
								time_col = cols
							try:
								value = float(item)
								# find 1st col with number as header
								if FIRST and value > 1:  # assume >1 mass
									data_col = cols
									# assume time col is previous col
									if time_col < 0:
										time_col = cols - 1
									FIRST = False
								mass_list.append(value)
								num_mass += 1
							except ValueError:
								pass
					# Get rest
					else:
						cols += 1
						if len(item) > 0:
							try:
								value = float(item)
								if cols == time_col:
									time_list.append(value)
								elif cols >= data_col:
									data_row.append(value)
							except ValueError:
								pass

				# check row length
				if not HEADER:
					if len(data_row) == num_mass:
						data.append(data_row)
					else:
						warn("ignoring row")

				HEADER = False

	# check col lengths
	if len(time_list) != len(data):
		warn("number of data rows and time list length differ")

	return IntensityMatrix(time_list, mass_list, data)
def write_filled_csv(sample_list: Sample, area_file: Union[str, pathlib.Path],
						filled_area_file: Union[str, pathlib.Path]):
	"""
	Creates a new area_ci.csv file, replacing NAs with values from the
	sample_list objects where possible.

	:param sample_list: A list of samples
	:type sample_list: :class:`list` of :class:`pyms.Gapfill.Class.Sample` objects
	:param area_file: the file 'area_ci.csv' from PyMassSpec output
	:type area_file: str or pathlib.Path
	:param filled_area_file: the new output file which has NA values replaced
	:type filled_area_file: str or pathlib.Path

	:author: Jairus Bowne
	:author: Sean O'Callaghan
	:author: Dominic Davis-Foster (pathlib support)
	"""

	if not is_path(filled_area_file):
		raise TypeError("'filled_area_file' must be a string or a pathlib.Path object")

	filled_area_file = prepare_filepath(filled_area_file)

	old_matrix = file2matrix(area_file)

	# Invert it to be a little more efficient.
	# Bug fix: in Python 3 zip() returns a lazy iterator which cannot be
	# subscripted; the original code crashed on ``invert_old_matrix[0]``.
	# Materialise it as a list first.
	invert_old_matrix = list(zip(*old_matrix))

	# UIDs (skipping the header cell); the retention time is the last
	# '-'-separated field of each UID.
	uid_list = invert_old_matrix[0][1:]
	rt_list = [uid.split('-')[-1] for uid in uid_list]

	# start setting up the output file: keep the first two columns as-is
	invert_new_matrix = []
	for line in invert_old_matrix[0:2]:
		invert_new_matrix.append(line)

	for line in invert_old_matrix[3:]:
		sample_name = line[0]
		new_line = [sample_name]

		# find the rt->area mapping for this sample
		for sample in sample_list:
			if sample_name in sample.get_name():
				rt_area_dict = sample.get_mp_rt_area_dict()

		for i, part in enumerate(line[1:]):
			if part == 'NA':
				try:
					area = rt_area_dict[str(rt_list[i])]
					new_line.append(area)
				except KeyError:
					# no replacement available; value is dropped as before
					pass
			else:
				new_line.append(part)

		invert_new_matrix.append(new_line)

	# 'with' closes the output file even if a write fails.
	with filled_area_file.open('w') as fp_new:
		new_matrix = transposed(invert_new_matrix)
		for line in new_matrix:
			for part in line:
				fp_new.write(f"{part},")
			fp_new.write("\n")
def write_ion_areas_csv(self, ms_file_name: Union[str, pathlib.Path], minutes: bool = True):
	"""
	Write Ion Areas to CSV File

	:param ms_file_name: The name of the file
	:type ms_file_name: str, PathLike
	:param minutes: Whether to write the average retention time in minutes
		(seconds otherwise)
	:type minutes: bool

	:author: David Kainer
	:author: Dominic Davis-Foster (pathlib support)
	"""

	if not is_path(ms_file_name):
		raise TypeError("'ms_file_name' must be a string or a PathLike object")

	ms_file_name = prepare_filepath(ms_file_name)

	with ms_file_name.open("w") as fp1:

		# create header: UID, average RT, then one column per experiment
		header = ['"UID"', '"RTavg"']
		for item in self.expr_code:
			header.append(f'"{item}"')

		# write headers; the file is pipe-delimited, not comma-delimited
		fp1.write("|".join(header) + "\n")

		# One output row per aligned peak position (rows of the alignment)
		for peak_idx in range(len(self.peakpos[0])):
			ias = []
			new_peak_list = []

			# One aligned peak per experiment (columns of the alignment)
			for align_idx in range(len(self.peakpos)):
				peak = self.peakpos[align_idx][peak_idx]

				if peak is not None:
					ia = peak.ion_areas
					# NOTE(review): this update() mutates the dict returned by
					# peak.ion_areas in place (floors each intensity) — if the
					# property exposes the peak's own dict, the peak object is
					# permanently modified by this write. Confirm intended.
					ia.update((mass, math.floor(intensity)) for mass, intensity in ia.items())

					# ions sorted by descending (floored) area
					sorted_ia = sorted(ia.items(), key=operator.itemgetter(1), reverse=True)
					ias.append(sorted_ia)
					new_peak_list.append(peak)

			# NOTE(review): composite_peak may return None when
			# new_peak_list is empty — compo_peak.UID would then raise.
			compo_peak = composite_peak(new_peak_list)

			# write to ms file
			fp1.write(compo_peak.UID)

			if minutes:
				fp1.write(f"|{compo_peak.rt/60:.3f}")
			else:
				fp1.write(f"|{compo_peak.rt:.3f}")

			# One ion-area listing per experiment; missing peaks write "NA"
			for ia in ias:
				if ia is None:
					fp1.write("|NA")
				else:
					fp1.write(f"|{ia}")

			fp1.write("\n")
def write_common_ion_csv(self, area_file_name: Union[str, pathlib.Path], top_ion_list: List, minutes: bool = True): """ Writes the alignment to CSV files This function writes two files: one containing the alignment of peak retention times and the other containing the alignment of peak areas. :param area_file_name: The name for the areas alignment file :type area_file_name: str or os.PathLike :param top_ion_list: A list of the highest intensity common ion along the aligned peaks :type top_ion_list: ~collections.abc.Sequence :param minutes: An optional indicator whether to save retention times in minutes. If False, retention time will be saved in seconds :type minutes: bool, optional :author: Woon Wai Keen :author: Andrew Isaac :author: Sean O'Callaghan :author: Vladimir Likic :author: Dominic Davis-Foster (pathlib support) """ # TODO: minutes currently does nothing if not is_path(area_file_name): raise TypeError( "'area_file_name' must be a string or a PathLike object") if not is_sequence_of(top_ion_list, Number): raise TypeError("'top_ion_list' must be a Sequence of Numbers") area_file_name = prepare_filepath(area_file_name) with area_file_name.open("w") as fp: # create header header = ['"UID"', '"RTavg"', '"Quant Ion"'] for item in self.expr_code: header.append(f'"{item}"') # write headers fp.write(",".join(header) + "\n") rtsums = [] rtcounts = [] # The following two arrays will become list of lists # such that: # areas = [ [align1_peak1, align2_peak1, .....,alignn_peak1] # [align1_peak2, ................................] # ............................................. 
# [align1_peakm,....................,alignn_peakm] ] areas: List[List] = [] new_peak_lists: List[List[Peak]] = [] for peak_list in self.peakpos: index = 0 for peak in peak_list: # one the first iteration, populate the lists if len(areas) < len(peak_list): areas.append([]) new_peak_lists.append([]) rtsums.append(0) rtcounts.append(0) if peak is not None: rt = peak.rt # get the area of the common ion for the peak # an area of 'na' shows that while the peak was # aligned, the common ion was not present area = peak.get_ion_area(top_ion_list[index]) areas[index].append(area) new_peak_lists[index].append(peak) # The following code to the else statement is # just for calculating the average rt rtsums[index] += rt rtcounts[index] += 1 else: areas[index].append(None) index += 1 out_strings = [] index = 0 # now write the strings for the file for area_list in areas: # write initial info: # peak unique id, peak average rt compo_peak = composite_peak(new_peak_lists[index]) peak_UID = compo_peak.UID peak_UID_string = f'"{peak_UID}"' rt_avg = rtsums[index] / rtcounts[index] out_strings.append( f"{peak_UID_string},{rt_avg / 60:.3f},{top_ion_list[index]:f}" ) for area in area_list: if area is not None: out_strings[index] += f",{area:.4f}" else: out_strings[index] += ",NA" index += 1 # now write the file # print("length of areas[0]", len(areas[0])) # print("length of areas", len(areas)) # print("length of out_strings", len(out_strings)) for row in out_strings: fp.write(row + "\n")
def write_csv(self, rt_file_name: Union[str, pathlib.Path], area_file_name: Union[str, pathlib.Path], minutes: bool = True):
	"""
	Writes the alignment to CSV files

	This function writes two files: one containing the alignment of peak
	retention times and the other containing the alignment of peak areas.

	:param rt_file_name: The name for the retention time alignment file
	:type rt_file_name: str or pathlib.Path
	:param area_file_name: The name for the areas alignment file
	:type area_file_name: str or pathlib.Path
	:param minutes: An optional indicator whether to save retention times in
		minutes. If False, retention time will be saved in seconds
	:type minutes: bool, optional

	:author: Woon Wai Keen
	:author: Andrew Isaac
	:author: Vladimir Likic
	:author: David Kainer
	:author: Dominic Davis-Foster (pathlib support)
	"""

	if not isinstance(rt_file_name, (str, pathlib.Path)):
		raise TypeError("'rt_file_name' must be a string or a pathlib.Path object")

	if not isinstance(area_file_name, (str, pathlib.Path)):
		raise TypeError("'area_file_name' must be a string or a pathlib.Path object")

	rt_file_name = prepare_filepath(rt_file_name)
	area_file_name = prepare_filepath(area_file_name)

	# Fix: both output handles are now context-managed, so neither leaks
	# if an error occurs mid-write (the originals were only closed at the end).
	with rt_file_name.open("w") as fp1, area_file_name.open("w") as fp2:

		# create header
		header = ['UID', 'RTavg']
		for item in self.expr_code:
			header.append(f'"{item}"')

		# write headers
		fp1.write(",".join(header) + "\n")
		fp2.write(",".join(header) + "\n")

		# for each alignment position write alignment's peak and area
		for peak_idx in range(len(self.peakpos[0])):  # loop through peak lists (rows)

			rts = []
			areas = []
			new_peak_list = []

			for align_idx in range(len(self.peakpos)):
				peak = self.peakpos[align_idx][peak_idx]

				if peak is not None:
					if minutes:
						rt = peak.rt / 60.0
					else:
						rt = peak.rt

					rts.append(rt)
					areas.append(peak.area)
					new_peak_list.append(peak)
				else:
					rts.append(None)
					areas.append(None)

			compo_peak = composite_peak(new_peak_list)

			# write to retention times file
			fp1.write(compo_peak.UID)

			if minutes:
				fp1.write(f",{float(compo_peak.rt / 60):.3f}")
			else:
				fp1.write(f",{compo_peak.rt:.3f}")

			for rt in rts:
				if rt is None or numpy.isnan(rt):
					fp1.write(",NA")
				else:
					fp1.write(f",{rt:.3f}")

			fp1.write("\n")

			# write to peak areas file
			fp2.write(compo_peak.UID)

			if minutes:
				fp2.write(f",{float(compo_peak.rt / 60):.3f}")
			else:
				fp2.write(f",{compo_peak.rt:.3f}")

			for area in areas:
				if area is None:
					fp2.write(",NA")
				else:
					fp2.write(f",{area:.0f}")

			fp2.write("\n")
def JCAMP_reader(file_name: Union[str, os.PathLike]) -> GCMS_data:
	"""
	Generic reader for JCAMP DX files

	:param file_name: Path of the file to read
	:type file_name: str or os.PathLike

	:return: GC-MS data object
	:rtype: :class:`pyms.GCMS.Class.GCMS_data`

	:authors: Qiao Wang, Andrew Isaac, Vladimir Likic, David Kainer,
		Dominic Davis-Foster (pathlib support)
	"""

	if not is_path(file_name):
		raise TypeError("'file_name' must be a string or a PathLike object")

	file_name = prepare_filepath(file_name, mkdirs=False)

	print(f" -> Reading JCAMP file '{file_name}'")
	# NOTE(review): this handle is never closed; consider a context manager.
	lines_list = file_name.open('r')
	# `data` accumulates the flat [m/z, intensity, m/z, intensity, ...]
	# stream for the scan currently being read.
	data = []
	page_idx = 0  # number of ##PAGE tags seen since the last scan flush
	xydata_idx = 0  # number of xy-data tags seen since the last scan flush
	time_list = []  # retention time for each scan
	scan_list = []  # completed Scan objects

	header_info = {}  # Dictionary containing header information

	for line in lines_list:

		if len(line.strip()) != 0:
			# prefix = line.find('#')
			# if prefix == 0:
			if line.startswith("##"):
				# key word or information
				fields = line.split('=', 1)
				fields[0] = fields[0].lstrip("##").upper()
				fields[1] = fields[1].strip()

				if "PAGE" in fields[0]:
					if "T=" in fields[1]:
						# PAGE contains retention time starting with T=
						# FileConverter Pro style
						time = float(fields[1].lstrip("T="))  # rt for the scan to be submitted
						time_list.append(time)
					page_idx = page_idx + 1
				elif "RETENTION_TIME" in fields[0]:
					# OpenChrom style
					time = float(fields[1])  # rt for the scan to be submitted

					# Check to make sure time is not already in the time list;
					# Can happen when both ##PAGE and ##RETENTION_TIME are specified
					if time_list[-1] != time:
						time_list.append(time)
				elif fields[0] in xydata_tags:
					xydata_idx = xydata_idx + 1
				elif fields[0] in header_info_fields:
					# Store header values as int, float or str as appropriate
					if fields[1].isdigit():
						header_info[fields[0]] = int(fields[1])
					elif is_float(fields[1]):
						header_info[fields[0]] = float(fields[1])
					else:
						header_info[fields[0]] = fields[1]

			# elif prefix == -1:
			else:
				# Line doesn't start with ##
				# data
				if page_idx > 1 or xydata_idx > 1:
					# A second PAGE/xy-data tag was seen: the previous scan is
					# complete, so flush `data` into a Scan before starting
					# to collect this new scan's values.
					if len(data) % 2 == 1:
						# TODO: This means the data is not in x, y pairs
						#  Make a better error message
						raise ValueError("data not in pair !")

					mass_list = []
					intensity_list = []
					for i in range(len(data) // 2):
						mass_list.append(data[i * 2])
						intensity_list.append(data[i * 2 + 1])

					if len(mass_list) != len(intensity_list):
						raise ValueError("len(mass_list) is not equal to len(intensity_list)")

					scan_list.append(Scan(mass_list, intensity_list))

					data = []
					data_sub = line.strip().split(',')
					for item in data_sub:
						if not len(item.strip()) == 0:
							data.append(float(item.strip()))

					if page_idx > 1:
						page_idx = 1
					if xydata_idx > 1:
						xydata_idx = 1
				else:
					# Still inside the current scan: append the values.
					data_sub = line.strip().split(',')
					for item in data_sub:
						if not len(item.strip()) == 0:
							data.append(float(item.strip()))

	if len(data) % 2 == 1:
		# TODO: This means the data is not in x, y pairs
		#  Make a better error message
		raise ValueError("data not in pair !")

	# get last scan
	mass = []
	intensity = []
	for i in range(len(data) // 2):
		mass.append(data[i * 2])
		intensity.append(data[i * 2 + 1])

	if len(mass) != len(intensity):
		raise ValueError("len(mass) is not equal to len(intensity)")

	scan_list.append(Scan(mass, intensity))

	# sanity check
	time_len = len(time_list)
	scan_len = len(scan_list)
	if time_len != scan_len:
		print(time_list)
		print(scan_list)
		raise ValueError(f"Number of time points ({time_len}) does not equal the number of scans ({scan_len})")

	data = GCMS_data(time_list, scan_list)

	return data
def write_excel(
		alignment: Alignment,
		file_name: PathLike,
		minutes: bool = True,
		):
	"""
	Writes the alignment to an excel file, with colouring showing possible mis-alignments.

	:param alignment: :class:`pyms.DPA.Alignment.Alignment` object to write to file.
	:param file_name: The name for the retention time alignment file.
	:param minutes: Whether to save retention times in minutes.
		If :py:obj:`False`, retention time will be saved in seconds.

	:author: David Kainer
	"""

	if not is_path(file_name):
		raise TypeError("'file_name' must be a string or a PathLike object")

	file_name = prepare_filepath(file_name)

	wb = Workbook()
	ws = wb.active
	ws.title = "Aligned RT"

	# create header row: UID, RTavg, then one column per experiment
	ws["A1"] = "UID"
	ws["B1"] = "RTavg"
	for i, item in enumerate(alignment.expr_code):
		currcell = ws.cell(row=1, column=i + 3, value=f"{item}")
		comment = Comment("sample " + str(i), "dave")
		currcell.comment = comment

	# for each alignment position write alignment's peak and area
	for peak_idx in range(len(alignment.peakpos[0])):  # loop through peak lists (rows)
		new_peak_list = []

		for align_idx in range(len(alignment.peakpos)):  # loops through samples (columns)
			peak = alignment.peakpos[align_idx][peak_idx]

			if peak is not None:

				if minutes:
					rt = peak.rt / 60.0
				else:
					rt = peak.rt

				area = peak.area
				new_peak_list.append(peak)

				# write the RT into the cell in the excel file
				currcell = ws.cell(row=2 + peak_idx, column=3 + align_idx, value=round(rt, 3))

				# get the mini-mass spec for this peak, and divide the ion intensities by 1000 to shorten them
				# NOTE(review): update() mutates the dict returned by
				# peak.ion_areas in place — if that property exposes the
				# peak's own dict, the peak is permanently modified here.
				ia = peak.ion_areas
				ia.update((mass, int(intensity / 1000)) for mass, intensity in ia.items())
				sorted_ia = sorted(ia.items(), key=operator.itemgetter(1), reverse=True)

				# write the peak area and mass spec into the comment for the cell
				comment = Comment(f"Area: {area:.0f} | MassSpec: {sorted_ia}", "dave")
				# currcell.number_format
				currcell.comment = comment

			else:
				# rt = 'NA'
				# area = 'NA'
				currcell = ws.cell(row=2 + peak_idx, column=3 + align_idx, value="NA")
				comment = Comment("Area: NA", "dave")
				# currcell.number_format
				currcell.comment = comment

		compo_peak = composite_peak(new_peak_list)
		if compo_peak is not None:
			peak_UID = compo_peak.UID
			peak_UID_string = f'"{peak_UID}"'

			ws.cell(row=2 + peak_idx, column=1, value=peak_UID_string)
			# RTavg column is always written in minutes, regardless of `minutes`
			ws.cell(row=2 + peak_idx, column=2, value=f"{float(compo_peak.rt / 60):.3f}")

	# colour the cells in each row based on their RT percentile for that row
	i = 0
	for row in ws.rows:
		i += 1
		cell_range = ("{0}" + str(i) + ":{1}" + str(i)).format(get_column_letter(3), get_column_letter(len(row)))
		ws.conditional_formatting.add(
				cell_range,
				ColorScaleRule(
						start_type="percentile",
						start_value=1,
						start_color="E5FFCC",
						mid_type="percentile",
						mid_value=50,
						mid_color="FFFFFF",
						end_type="percentile",
						end_value=99,
						end_color="FFE5CC"
						),
				)

	wb.save(file_name)
def write_transposed_output(
    alignment: Alignment,
    file_name: PathLike,
    minutes: bool = True,
):
    """
    Writes the alignment to an excel workbook, transposed relative to
    :func:`write_excel`: samples are rows and aligned peaks are columns.

    Two sheets are created — "Aligned RT" (retention times, with area and
    mini mass-spec in cell comments) and "Aligned Area" (peak areas).
    Cells whose peak is flagged as an outlier are highlighted.

    :param alignment: :class:`pyms.DPA.Alignment.Alignment` object to write to file
    :param file_name: The name of the file
    :param minutes: Whether to save retention times in minutes.
        If :py:obj:`False`, retention time will be saved in seconds.
    """

    if not is_path(file_name):
        raise TypeError("'file_name' must be a string or a PathLike object")

    file_name = prepare_filepath(file_name)

    wb = Workbook()
    ws1 = wb.create_sheet(title="Aligned RT")
    ws2 = wb.create_sheet(title="Aligned Area")

    ws1["A1"] = "Peak"
    ws1["A2"] = "RTavg"
    ws2["A1"] = "Peak"
    ws2["A2"] = "RTavg"

    # fill used to mark outlier peaks
    style_outlier = PatternFill(fill_type="solid", fgColor="FFAE19", bgColor="FFAE19")

    # write column with sample IDs
    for i, item in enumerate(alignment.expr_code):
        ws1.cell(column=1, row=i + 3, value=f"{item}")
        ws2.cell(column=1, row=i + 3, value=f"{item}")

    # for each alignment position write alignment's peak and area
    for peak_idx in range(len(alignment.peakpos[0])):  # loop through peak lists
        # tuples of (peak, col, row) for the non-NA peaks at this position
        new_peak_list = []

        for align_idx in range(len(alignment.peakpos)):  # loops through samples
            peak = alignment.peakpos[align_idx][peak_idx]
            cell_col = 2 + peak_idx
            cell_row = 3 + align_idx

            if peak is not None:
                if minutes:
                    rt = peak.rt / 60.0
                else:
                    rt = peak.rt

                area = peak.area

                # remember the col,row coords of the peak in the output matrix
                new_peak_list.append((peak, cell_col, cell_row))

                # write the RT into the cell in the excel file
                currcell1 = ws1.cell(column=cell_col, row=cell_row, value=round(rt, 3))
                ws2.cell(column=cell_col, row=cell_row, value=round(area, 3))  # type: ignore

                # get the mini-mass spec for this peak, and divide the ion
                # intensities by 1000 to shorten them.
                # NOTE(review): updates the mapping returned by peak.ion_areas
                # in place — assumes the property returns a copy; confirm.
                ia = peak.ion_areas
                ia.update((mass, int(intensity / 1000)) for mass, intensity in ia.items())
                sorted_ia = sorted(ia.items(), key=operator.itemgetter(1), reverse=True)

                # write the peak area and mass spec into the comment for the cell
                comment = Comment(f"Area: {area:.0f} | MassSpec: {sorted_ia}", "dave")
                currcell1.comment = comment
            else:
                currcell1 = ws1.cell(column=cell_col, row=cell_row, value="NA")
                ws2.cell(column=cell_col, row=cell_row, value="NA")
                comment = Comment("Area: NA", "dave")
                currcell1.comment = comment

        # this method will create the compo peak, and also mark outlier peaks
        # with a bool is_outlier
        compo_peak = composite_peak([p[0] for p in new_peak_list])
        if compo_peak is not None:
            ws1.cell(column=2 + peak_idx, row=1, value=f'"{compo_peak.UID}"')
            # NOTE(review): RTavg is always written in minutes regardless of
            # the ``minutes`` flag — confirm intended.
            ws1.cell(column=2 + peak_idx, row=2, value=f"{float(compo_peak.rt / 60):.3f}")
            ws2.cell(column=2 + peak_idx, row=1, value=f'"{compo_peak.UID}"')
            ws2.cell(column=2 + peak_idx, row=2, value=f"{float(compo_peak.rt / 60):.3f}")

            # highlight outlier cells in the current peak list
            for p in new_peak_list:
                if p[0].is_outlier:
                    ws1.cell(column=p[1], row=p[2]).fill = style_outlier
                    ws2.cell(column=p[1], row=p[2]).fill = style_outlier

    wb.save(file_name)
def write_mass_hunter_csv(
    alignment: Alignment,
    file_name: PathLike,
    top_ion_list: List[int],
):  # , peak_list_name):
    """
    Creates a csv file with UID, common and qualifying ions and their ratios
    for mass hunter interpretation.

    :param alignment: alignment object to write to file
    :param file_name: name of the output file.
    :param top_ion_list: a list of the common ions for each peak in the
        averaged peak list for the alignment.

    :raises ValueError: if ``top_ion_list`` is :py:obj:`None`.
    """  # noqa: D400

    if not is_path(file_name):
        raise TypeError("'file_name' must be a string or a PathLike object")

    # Validate before touching the filesystem: previously the output file was
    # opened (truncating any existing file) before this check, and the handle
    # leaked when the error was raised.
    if top_ion_list is None:
        raise ValueError("List of common ions must be supplied")

    file_name = prepare_filepath(file_name)

    rtsums: List[float] = []
    rtcounts = []

    # The following two arrays will become list of lists
    # such that:
    # areas = [ [align1_peak1, align2_peak1, .....,alignn_peak1]
    #           [align1_peak2, ................................]
    #           .............................................
    #           [align1_peakm,....................,alignn_peakm] ]
    areas = []  # type: ignore
    new_peak_lists = []  # type: ignore
    rtmax = []
    rtmin = []

    for peak_list in alignment.peakpos:
        for index, peak in enumerate(peak_list):
            # on the first iteration, populate the per-position lists
            if len(areas) < len(peak_list):
                areas.append([])
                new_peak_lists.append([])
                rtsums.append(0)
                rtcounts.append(0)
                rtmax.append(0.0)
                rtmin.append(0.0)

            if peak is not None:
                rt = peak.rt

                # get the area of the common ion for the peak
                # an area of 'na' shows that while the peak was
                # aligned, the common ion was not present
                area = peak.get_ion_area(top_ion_list[index])

                areas[index].append(area)
                new_peak_lists[index].append(peak)

                # The following code to the else statement is
                # just for calculating the average rt
                rtsums[index] += rt
                rtcounts[index] += 1

                # quick workaround for weird problem when
                # attempting to set rtmin to max time above
                if rtmin[index] == 0.0:
                    rtmin[index] = 5400.0

                if rt > rtmax[index]:
                    rtmax[index] = rt

                if rt < rtmin[index]:
                    rtmin[index] = rt
            else:
                areas[index].append(None)

    out_strings = []
    compo_peaks = []

    # now build the output rows.
    # enumerate keeps index in step with the position in ``areas`` even when a
    # row is skipped; previously ``continue`` on a None composite peak skipped
    # the manual ``index += 1``, desynchronising rtmin/rtmax/top_ion_list from
    # every subsequent row.
    for index, _area_list in enumerate(areas):
        # write initial info: peak unique id, peak average rt
        compo_peak = composite_peak(new_peak_lists[index])
        if compo_peak is None:
            continue

        compo_peaks.append(compo_peak)
        peak_UID = compo_peak.UID
        peak_UID_string = f'"{peak_UID}"'

        # calculate the time from the leftmost peak to the average
        l_window_delta = compo_peak.rt - rtmin[index]
        r_window_delta = rtmax[index] - compo_peak.rt

        common_ion = top_ion_list[index]
        qual_ion_1 = int(peak_UID_string.split('-')[0].strip('"'))
        qual_ion_2 = int(peak_UID_string.split('-')[1])

        # a qualifying ion must differ from the common ion
        if qual_ion_1 == common_ion:
            qual_ion_1 = compo_peak.get_third_highest_mz()
        elif qual_ion_2 == common_ion:
            qual_ion_2 = compo_peak.get_third_highest_mz()

        ci_intensity = compo_peak.get_int_of_ion(common_ion)
        q1_intensity = compo_peak.get_int_of_ion(qual_ion_1)
        q2_intensity = compo_peak.get_int_of_ion(qual_ion_2)

        try:
            q1_ci_ratio = float(q1_intensity) / float(ci_intensity)
        except TypeError:  # if no area available for that ion
            q1_ci_ratio = 0.0
        except ZeroDivisionError:  # shouldn't happen but does!!
            q1_ci_ratio = 0.01

        try:
            q2_ci_ratio = float(q2_intensity) / float(ci_intensity)
        except TypeError:
            q2_ci_ratio = 0.0
        except ZeroDivisionError:  # shouldn't happen, but does!!
            q2_ci_ratio = 0.01

        out_strings.append(
            ','.join([
                peak_UID,
                f"{common_ion}",
                f"{qual_ion_1}",
                f"{q1_ci_ratio * 100:.1f}",
                f"{qual_ion_2}",
                f"{q2_ci_ratio * 100:.1f}",
                f"{(l_window_delta + 1.5) / 60:.2f}",
                f"{(r_window_delta + 1.5) / 60:.2f}",
            ])
        )

    # write the file; the context manager guarantees the handle is closed
    # even if a write fails (the original left it open on any exception)
    with file_name.open('w', encoding="UTF-8") as fp:
        fp.write(
            '"UID","Common Ion","Qual Ion 1","ratio QI1/CI","Qual Ion 2",'
            '"ratio QI2/CI","l window delta","r window delta"\n'
        )
        for row in out_strings:
            fp.write(f"{row}\n")

    # dump_object(compo_peaks, peak_list_name)