Пример #1
0
def read_expr_list(file_name):
    """
    Reads the set of experiment files and returns a list of :class:`pyms.Experiment.Experiment` objects.

    The file is read as text. Every non-blank line is stripped of surrounding
    whitespace and passed to :func:`load_expr`; blank lines are skipped.

    :param file_name: The name of the file which lists experiment dump file names, one file per line
    :type file_name: str or pathlib.Path

    :return: A list of Experiment instances, in the order they are listed in the file
    :rtype: list of pyms.Experiment.Experiment

    :raises TypeError: if ``file_name`` is rejected by ``is_path``

    :author: Vladimir Likic
    """

    if not is_path(file_name):
        raise TypeError("'file_name' must be a string or a PathLike object")

    file_name = prepare_filepath(file_name, mkdirs=False)

    # The context manager closes the listing file even if reading fails.
    with file_name.open() as fp:
        exprfiles = [line.strip() for line in fp]

    # Blank lines (e.g. a trailing newline at the end of the listing) do not
    # name an experiment file, so they are ignored rather than loaded.
    return [load_expr(exprfile) for exprfile in exprfiles if exprfile]
Пример #2
0
def load_peaks(file_name: Union[str, pathlib.Path]) -> Peak:
    """
    Loads the peak_list stored with 'store_peaks'.

    :param file_name: File name of peak list
    :type file_name: str or os.PathLike

    :return: The list of Peak objects
    :rtype: :class:`list` of :class:`pyms.Peak.Class.Peak`

    :raises TypeError: if ``file_name`` is rejected by ``is_path``
    :raises IOError: if the unpickled object is not a sequence, is empty,
        or its first element is not a Peak

    :author: Andrew Isaac
    :author: Dominic Davis-Foster (pathlib support)
    """

    # NOTE(review): the return annotation says ``Peak`` but the function returns
    # the whole unpickled sequence of peaks; the annotation is left unchanged
    # here because it is not visible whether ``typing.List`` is imported.

    if not is_path(file_name):
        raise TypeError("'file_name' must be a string or a PathLike object")

    file_name = prepare_filepath(file_name, mkdirs=False)

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the file even if unpickling fails.
    with file_name.open('rb') as fp:
        peak_list = pickle.load(fp)

    if not is_sequence(peak_list):
        raise IOError("The selected file is not a List")
    # Only the first element is inspected.
    if len(peak_list) == 0 or not isinstance(peak_list[0], Peak):
        raise IOError("The selected file is not a list of Peak objects")

    return peak_list
Пример #3
0
def load_expr(file_name):
    """
    Loads an experiment saved with :meth:`pyms.Experiment.store_expr`.

    :param file_name: Experiment file name
    :type file_name: str or pathlib.Path

    :return: The loaded experiment
    :rtype: pyms.Experiment.Experiment

    :raises TypeError: if ``file_name`` is rejected by ``is_path``
    :raises IOError: if the unpickled object is not an ``Experiment``

    :author: Vladimir Likic
    :author: Andrew Isaac
    :author: Dominic Davis-Foster (type assertions and pathlib support)
    """

    if not is_path(file_name):
        raise TypeError("'file_name' must be a string or a PathLike object")

    file_name = prepare_filepath(file_name, mkdirs=False)

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the file even if unpickling fails.
    with file_name.open('rb') as fp:
        expr = pickle.load(fp)

    if not isinstance(expr, Experiment):
        raise IOError("The loaded file is not an experiment file")

    return expr
Пример #4
0
def file2matrix(file_name):
    """
    Convert a .csv file to a numpy array.

    Every cell that can be parsed by :class:`float` is converted; all other
    cells are kept as the strings read from the file.

    :param file_name: Filename (.csv) to convert (area.csv, area_ci.csv)
    :type file_name: str or pathlib.Path

    :return: Data matrix
    :rtype: :class:`numpy.array`

    :raises TypeError: if ``file_name`` is rejected by ``is_path``

    :author: Jairus Bowne
    :author: Sean O'Callaghan
    :author: Dominic Davis-Foster (pathlib support)
    """

    if not is_path(file_name):
        raise TypeError("'file_name' must be a string or a PathLike object")

    file_name = prepare_filepath(file_name, mkdirs=False)

    matrix = []

    # The csv module expects files opened with newline="" so that it can
    # handle line endings (including ones inside quoted fields) itself.
    with file_name.open(newline="") as fp:
        for row in csv.reader(fp, delimiter=",", quotechar='"'):
            newrow = []
            for each in row:
                try:
                    newrow.append(float(each))
                except ValueError:
                    # Not a number (header text, "NA", UID, ...): keep the string.
                    newrow.append(each)
            matrix.append(newrow)

    return numpy.array(matrix)
Пример #5
0
def store_peaks(peak_list: Peak,
                file_name: Union[str, pathlib.Path],
                protocol=1):
    """
    Store the list of peak objects by pickling it to a file.

    :param peak_list: A list of peak objects
    :type peak_list: list of :class:`pyms.Peaks.Class.Peak`
    :param file_name: File name to store peak list
    :type file_name: str or os.PathLike
    :param protocol: The pickle protocol passed to :func:`pickle.dump`. Default ``1``
    :type protocol: int, optional

    :raises TypeError: if ``peak_list`` is rejected by ``is_peak_list`` or
        ``file_name`` is rejected by ``is_path``

    :author: Andrew Isaac
    :author: Dominic Davis-Foster (type assertions and pathlib support)
    """

    # NOTE(review): ``peak_list`` is annotated as ``Peak`` but is validated as a
    # list of peaks; the annotation is left unchanged because it is not visible
    # whether ``typing.List`` is imported.

    if not is_peak_list(peak_list):
        raise TypeError("'peak_list' must be a list of Peak objects")

    if not is_path(file_name):
        raise TypeError("'file_name' must be a string or a PathLike object")

    file_name = prepare_filepath(file_name)

    # The context manager flushes and closes the file even if pickling fails.
    with file_name.open('wb') as fp:
        pickle.dump(peak_list, fp, protocol)
Пример #6
0
    def write(self,
              file_name: PathLike,
              minutes: bool = False,
              formatting: bool = True):
        """
        Write the ion chromatogram to a text file, one "time intensity" pair per line.

        :param file_name: The name of the output file
        :param minutes: If true, each time value is divided by 60 before being written
        :param formatting: If true, times are written as ``8.4f`` and intensities
            as ``#.6e``; otherwise both values are written with their default formatting.

        :raises TypeError: if ``file_name`` is rejected by ``is_path``

        :authors: Lewis Lee, Vladimir Likic, Dominic Davis-Foster (pathlib support)
        """

        if not is_path(file_name):
            raise TypeError(
                "'file_name' must be a string or a PathLike object")

        file_name = prepare_filepath(file_name)

        with file_name.open('w', encoding="UTF-8") as fp:

            # Work on a copy so the unit conversion never touches the stored times.
            times = copy.deepcopy(self._time_list)

            if minutes:
                for idx, seconds in enumerate(times):
                    times[idx] = seconds / 60.0

            for idx, time_value in enumerate(times):
                intensity = self._intensity_array[idx]
                if formatting:
                    line = f"{time_value:8.4f} {intensity:#.6e}\n"
                else:
                    line = f"{time_value} {intensity}\n"
                fp.write(line)
Пример #7
0
    def write_intensities_stream(self, file_name: PathLike):
        """
        Write the intensities of every scan to the given file, one intensity per line.

        Intensities from different scans are joined without any delimiters.
        Each value is written with the ``8.4f`` format.

        :param file_name: Output file name.

        :raises TypeError: if ``file_name`` is rejected by ``is_path``

        :authors: Vladimir Likic, Dominic Davis-Foster (pathlib support)
        """

        if not is_path(file_name):
            raise TypeError(
                "'file_name' must be a string or a PathLike object")

        file_name = prepare_filepath(file_name)

        print(" -> Writing scans to a file")

        # The context manager flushes and closes the file even if a write fails.
        with file_name.open('w', encoding="UTF-8") as fp:
            for scan in self._scan_list:
                fp.writelines(f"{i:8.4f}\n" for i in scan.intensity_list)
Пример #8
0
    def from_jcamp(cls, file_name):
        """
        Create a MassSpectrum from a JCAMP-DX file.

        Lines starting with ``##`` are treated as tags; the text before the first
        ``=`` (upper-cased, leading ``#`` removed) becomes the current tag. Reading
        stops at the first tag starting with ``END``. Every other non-blank line
        that follows a tag listed in ``xydata_tags`` is split on commas and
        spaces and parsed as alternating mass / intensity values.

        :param file_name: Path of the file to read
        :type file_name: str or os.PathLike

        :return: MassSpectrum
        :rtype: :class:`pyms.Spectrum.MassSpectrum`

        :raises TypeError: if ``file_name`` is rejected by ``is_path``
        :raises ValueError: if an odd number of values was read, or a value
            cannot be parsed as a float

        :authors: Qiao Wang, Andrew Isaac, Vladimir Likic, David Kainer, Dominic Davis-Foster
        """

        if not is_path(file_name):
            raise TypeError(
                "'file_name' must be a string or a PathLike object")

        file_name = prepare_filepath(file_name, mkdirs=False)

        print(f" -> Reading JCAMP file '{file_name}'")
        xydata = []
        last_tag = None

        # The context manager closes the input file on every exit path,
        # including the early ``break`` and parse errors.
        with file_name.open('r') as fp:
            for line in fp:
                if not line.strip():
                    continue

                if line.startswith("##"):
                    # key word or information
                    last_tag = line.split('=', 1)[0].lstrip("#").upper()

                    if last_tag.startswith("END"):
                        break

                elif last_tag in xydata_tags:
                    for item in re.split(r",| ", line.strip()):
                        item = item.strip()
                        if item:
                            xydata.append(float(item))

        # By this point we should have all of the xydata
        if len(xydata) % 2 == 1:
            # TODO: This means the data is not in x, y pairs
            #  Make a better error message
            raise ValueError("data not in pair !")

        # Even positions hold masses, odd positions the matching intensities.
        mass_list = xydata[0::2]
        intensity_list = xydata[1::2]

        return cls(mass_list, intensity_list)
Пример #9
0
    def export_leco_csv(self, file_name):
        """
        Exports data in LECO CSV format.

        The output is a text header ``"Scan","Time",...`` followed by the masses
        as quoted integers, then one line per scan of the form
        ``scan_number,time,value,value,...``. The scan number is an int and all
        other values are written in the ``#.6e`` format. Lines end with CR/LF.

        The parent directory of ``file_name`` is created if it does not exist.

        :param file_name: The name of the output file
        :type file_name: str or pathlib.Path

        :raises TypeError: if ``file_name`` is rejected by ``is_path``, or if a
            mass or intensity value is not a :class:`numbers.Number`

        :authors: Andrew Isaac, Vladimir Likic, Dominic Davis-Foster (pathlib support)
        """

        if not is_path(file_name):
            raise TypeError(
                "'file_name' must be a string or a PathLike object")

        file_name = prepare_filepath(file_name, mkdirs=False)

        if not file_name.parent.is_dir():
            file_name.parent.mkdir(parents=True)

        mass_list = self._mass_list
        time_list = self._time_list
        vals = self._intensity_array

        # newline="" disables newline translation, so the explicit "\r\n" is
        # written verbatim on every platform (text mode on Windows would
        # otherwise turn it into "\r\r\n"). The context manager closes the
        # file even when one of the TypeErrors below is raised.
        with file_name.open("w", newline="") as fp:

            # write header
            fp.write('"Scan","Time"')
            for mass in mass_list:
                if not isinstance(mass, Number):
                    raise TypeError("mass list datum not a number")
                fp.write(f',"{int(mass):d}"')
            fp.write("\r\n")  # windows CR/LF

            # write lines
            for scan_idx, time_ in enumerate(time_list):
                fp.write(f"{scan_idx},{time_:#.6e}")
                for value in vals[scan_idx]:
                    if not isinstance(value, Number):
                        raise TypeError("datum not a number")
                    fp.write(f",{value:#.6e}")
                fp.write("\r\n")
Пример #10
0
    def write(self, file_root):
        """
        Writes the entire raw data to two CSV files:

        - 'file_root'.I.csv, containing the intensities; and
        - 'file_root'.mz.csv, containing the corresponding m/z values.

        Each scan is written as one comma-separated line with values in the
        ``.4f`` format. In general these are not two-dimensional matrices,
        because different scans may have different numbers of m/z values recorded.

        :param file_root: The root for the output file names
        :type file_root: str or pathlib.Path

        :raises TypeError: if ``file_root`` is neither a string nor a ``pathlib.Path``

        :author: Vladimir Likic
        :author: Dominic Davis-Foster (pathlib support)
        """

        if not isinstance(file_root, (str, pathlib.Path)):
            raise TypeError(
                "'file_root' must be a string or a pathlib.Path object")

        file_root = prepare_filepath(file_root)

        file_name1 = str(file_root) + ".I.csv"
        file_name2 = str(file_root) + ".mz.csv"

        print(f" -> Writing intensities to '{file_name1}'")
        print(f" -> Writing m/z values to '{file_name2}'")

        # Both files are closed by the context manager even if a write fails.
        with open(file_name1, "w") as fp1, open(file_name2, "w") as fp2:
            for scan in self._scan_list:
                intensity_row = ",".join(
                    f"{intensity:.4f}" for intensity in scan.intensity_list)
                fp1.write(intensity_row + "\n")

                mass_row = ",".join(f"{mass:.4f}" for mass in scan.mass_list)
                fp2.write(mass_row + "\n")
Пример #11
0
	def export_ascii(
			self,
			root_name: PathLike,
			fmt: AsciiFiletypes = AsciiFiletypes.ASCII_DAT,
			):
		"""
		Exports the intensity matrix, retention time vector, and m/z vector to the ascii format.

		By default, export_ascii("NAME") will create NAME.im.dat, NAME.rt.dat,
		and NAME.mz.dat where these are the intensity matrix, retention time
		vector, and m/z vector in space-delimited format.

		If ``fmt`` == ``<AsciiFiletypes.ASCII_CSV>``, the files will be in the CSV format, named
		NAME.im.csv, NAME.rt.csv, and NAME.mz.csv.

		:param root_name: Root name for the output files
		:param fmt: Format of the output file, either ``<AsciiFiletypes.ASCII_DAT>`` or ``<AsciiFiletypes.ASCII_CSV>``

		:raises TypeError: if ``root_name`` is rejected by ``is_path``
		:raises ValueError: if ``fmt`` is a valid ``AsciiFiletypes`` member that is
			neither ``ASCII_DAT`` nor ``ASCII_CSV``

		:authors: Milica Ng, Andrew Isaac, Vladimir Likic, Dominic Davis-Foster (pathlib support)
		"""

		if not is_path(root_name):
			raise TypeError("'root_name' must be a string or a pathlib.Path object")

		root_name = prepare_filepath(root_name, mkdirs=True)
		fmt = AsciiFiletypes(fmt)

		if fmt is AsciiFiletypes.ASCII_DAT:
			separator = ' '
			extension = "dat"
		elif fmt is AsciiFiletypes.ASCII_CSV:
			separator = ','
			extension = "csv"
		else:
			# Without this branch any other enum member would leave
			# 'separator' and 'extension' unbound and fail later with an
			# UnboundLocalError.
			raise ValueError(f"Unsupported output format: {fmt!r}")

		# export 2D matrix of intensities
		save_data(f"{root_name}.im.{extension}", self._intensity_array, sep=separator)

		# export 1D vector of m/z's
		save_data(f"{root_name}.mz.{extension}", self._mass_list, sep=separator)

		# export 1D vector of retention times
		save_data(f"{root_name}.rt.{extension}", self._time_list, sep=separator)
Пример #12
0
def write_filled_csv(
    sample_list: List[Sample],
    area_file: PathLike,
    filled_area_file: PathLike,
):
    r"""
    Creates a new ``area_ci.csv`` file, replacing NAs with values from the sample_list objects where possible.

    The retention time of each row is taken from the text after the last ``-``
    in its ``UID``. For every column from the fourth onwards, the first sample
    whose ``name`` contains the column name supplies ``rt_areas``; each missing
    cell is replaced by the entry of that mapping keyed by the row's retention
    time, when such an entry exists.

    :param sample_list: The samples providing replacement areas.
    :param area_file: The file ``'area_ci.csv'`` from PyMassSpec output.
    :param filled_area_file: the new output file which has ``'NA'``\s values replaced.

    :raises TypeError: if ``filled_area_file`` is rejected by ``is_path``
    :raises ValueError: if no sample matches one of the sample columns

    :authors: Jairus Bowne, Sean O'Callaghan, Dominic Davis-Foster
    """

    import math  # local import: only needed for the NaN check below

    if not is_path(filled_area_file):
        raise TypeError(
            "'filled_area_file' must be a string or a pathlib.Path object")

    filled_area_file = prepare_filepath(filled_area_file)

    df = file2dataframe(area_file)

    rt_list: List[float] = [float(uid.split('-')[-1]) for uid in df["UID"]]

    for sample_name in df.columns[3:]:

        for sample in sample_list:
            if sample_name in sample.name:
                rt_area_dict = sample.rt_areas
                break
        else:
            raise ValueError(
                f"Sample {sample_name!r} not found in sample_list.")

        # Iterate over a snapshot because the frame is modified inside the loop.
        for i, part in enumerate(df[sample_name].tolist()):
            # pandas' default CSV parsing turns the text "NA" into NaN, so a
            # missing cell may arrive either as the literal string or as NaN.
            is_missing = part == "NA" or (
                isinstance(part, float) and math.isnan(part))
            if not is_missing:
                continue

            try:
                area = rt_area_dict[rt_list[i]]
            except KeyError:
                # No area recorded for this retention time: leave the cell as is.
                continue

            # Assign through the frame itself; chained assignment
            # (``df[col][i] = ...``) may write to a temporary copy instead.
            df.at[df.index[i], sample_name] = area

    df.to_csv(filled_area_file, index=False, na_rep="NA")
0
	def store(self, file_name):
		"""
		Stores an experiment to a file by pickling it with protocol 1.

		:param file_name: The name of the file
		:type file_name: str or os.PathLike

		:raises TypeError: if ``file_name`` is rejected by ``is_path``

		:author: Vladimir Likic, Andrew Isaac, Dominic Davis-Foster (pathlib support)
		"""

		if not is_path(file_name):
			raise TypeError("'file_name' must be a string or a PathLike object")

		file_name = prepare_filepath(file_name)

		# The context manager flushes and closes the file even if pickling fails.
		with file_name.open('wb') as fp:
			pickle.dump(self, fp, 1)
Пример #14
0
def load_expr(file_name: PathLike) -> Experiment:
    """
    Load an experiment previously saved with :meth:`pyms.Experiment.Experiment.dump`.

    :param file_name: Experiment file name.

    :return: The loaded experiment.

    :raises TypeError: if ``file_name`` is rejected by ``is_path``, or if the
        loaded object is not an ``Experiment``.

    :author: Vladimir Likic, Andrew Isaac, Dominic Davis-Foster (type assertions and pathlib support)
    """

    if is_path(file_name):
        loaded = _pickle_load_path(prepare_filepath(file_name, mkdirs=False))

        if isinstance(loaded, Experiment):
            return loaded

        raise TypeError("The loaded file is not an experiment file")

    raise TypeError("'file_name' must be a string or a PathLike object")
Пример #15
0
    def dump(self, file_name: Union[str, pathlib.Path], protocol: int = 3):
        """
        Dumps an object to a file through :func:`pickle.dump()`.

        :param file_name: Filename to save the dump as
        :type file_name: str or os.PathLike
        :param protocol: The pickle protocol to use. Default ``3``
        :type protocol: int, optional

        :raises TypeError: if ``file_name`` is rejected by ``is_path``

        :authors: Vladimir Likic, Dominic Davis-Foster (pathlib and pickle protocol support)
        """

        if not is_path(file_name):
            raise TypeError(
                "'file_name' must be a string or a PathLike object")

        file_name = prepare_filepath(file_name)

        # The context manager flushes and closes the file even if pickling fails.
        with file_name.open('wb') as fp:
            pickle.dump(self, fp, protocol=protocol)
Пример #16
0
def file2dataframe(file_name: PathLike) -> pandas.DataFrame:
    """
    Read a comma-separated file into a pandas DataFrame.

    The first line of the file is used as the column header and ``"`` is the
    quote character.

    :param file_name: CSV file to read.

    :raises TypeError: if ``file_name`` is rejected by ``is_path``.

    :authors: Jairus Bowne, Sean O'Callaghan, Dominic Davis-Foster (pathlib support)

    .. versionadded:: 2.3.0
    """

    if is_path(file_name):
        csv_path = prepare_filepath(file_name, mkdirs=False)
        frame = pandas.read_csv(csv_path, delimiter=',', quotechar='"', header=0)
        return frame

    raise TypeError("'file_name' must be a string or a PathLike object")
Пример #17
0
    def dump(self, file_name, protocol=3):
        """
        Dumps an object to a file through pickle.dump().

        :param file_name: Name of the file for the object dump
        :type file_name: str or pathlib.Path
        :param protocol: The pickle protocol to use. Default 3
        :type protocol: int, optional

        :raises TypeError: if ``file_name`` is rejected by ``is_path``

        :author: Vladimir Likic
        :author: Dominic Davis-Foster (pathlib and pickle protocol support)
        """

        if not is_path(file_name):
            raise TypeError(
                "'file_name' must be a string or a PathLike object")

        file_name = prepare_filepath(file_name)

        # The context manager flushes and closes the file even if pickling fails.
        with file_name.open('wb') as fp:
            pickle.dump(self, fp, protocol=protocol)
Пример #18
0
def import_leco_csv(file_name):
    """
    Imports data in LECO CSV format.

    The first non-blank line is treated as the header. Header cells that parse
    as numbers are collected as the mass list; the first such cell with a value
    greater than 1 marks the first data column. The time column is the last
    header cell containing "time" (case-insensitive), or, failing that, the
    column just before the first data column.

    Every following non-blank line is a data row. Rows whose number of parsed
    data values differs from the number of masses are skipped with a warning.

    :param file_name: Path of the file to read
    :type file_name: str or pathlib.Path

    :return: Data as an IntensityMatrix
    :rtype: pyms.IntensityMatrix.IntensityMatrix

    :raises TypeError: if ``file_name`` is rejected by ``is_path``

    :authors: Andrew Isaac, Dominic Davis-Foster (pathlib support)
    """

    if not is_path(file_name):
        raise TypeError("'file_name' must be a string or a PathLike object")

    file_name = prepare_filepath(file_name, mkdirs=False)

    data = []
    time_list = []
    mass_list = []

    # Format is text header with:
    # "Scan","Time",...
    # and the rest is "TIC" or m/z as text, i.e. "50","51"...
    # The following lines are:
    # scan_number,time,value,value,...
    # scan_number is an int, rest seem to be fixed format floats.
    # The format is 0.000000e+000

    num_mass = 0
    FIRST = True
    HEADER = True
    data_col = -1
    time_col = -1

    # The context manager closes the input file on every exit path.
    with file_name.open('r') as lines_list:
        # get each line
        for line in lines_list:
            data_row = []
            if len(line.strip()) > 0:
                data_list = line.strip().split(',')
                # get each value in line; 'cols' is the zero-based column index
                for cols, item in enumerate(data_list):
                    item = item.strip()
                    item = item.strip('\'"')  # remove quotes (in header)

                    # Get header
                    if HEADER:
                        if len(item) > 0:
                            if item.lower().find("time") > -1:
                                time_col = cols
                            try:
                                value = float(item)
                            except ValueError:
                                # Non-numeric header cell ("Scan", "Time", "TIC", ...)
                                continue
                            # find 1st col with number as header
                            if FIRST and value > 1:  # assume >1 mass
                                data_col = cols
                                # assume time col is previous col
                                if time_col < 0:
                                    time_col = cols - 1
                                FIRST = False
                            mass_list.append(value)
                            num_mass += 1
                    # Get rest
                    else:
                        if len(item) > 0:
                            try:
                                value = float(item)
                            except ValueError:
                                # Unparseable cells are silently skipped.
                                continue
                            if cols == time_col:
                                time_list.append(value)
                            elif cols >= data_col:
                                data_row.append(value)

                # check row length
                if not HEADER:
                    if len(data_row) == num_mass:
                        data.append(data_row)
                    else:
                        warn("ignoring row")

                # Only the first non-blank line is the header.
                HEADER = False

    # check col lengths
    if len(time_list) != len(data):
        warn("number of data rows and time list length differ")

    return IntensityMatrix(time_list, mass_list, data)
Пример #19
0
def write_filled_csv(sample_list: Sample, area_file: Union[str, pathlib.Path],
                     filled_area_file: Union[str, pathlib.Path]):
    """
    Creates a new area_ci.csv file, replacing NAs with values from the sample_list objects where possible.

    The input matrix is transposed so that each sample becomes one line. For
    every sample line, the last sample in ``sample_list`` whose name contains
    the column header supplies the retention-time -> area mapping. ``'NA'``
    cells are replaced by the mapped area when one exists, and are kept as
    ``'NA'`` otherwise.

    :param sample_list: A list of samples
    :type sample_list: :class:`list` of :class:`pyms.Gapfill.Class.Sample` objects
    :param area_file: the file 'area_ci.csv' from PyMassSpec output
    :type area_file: str or pathlib.Path
    :param filled_area_file: the new output file which has NA values replaced
    :type filled_area_file: str or pathlib.Path

    :raises TypeError: if ``filled_area_file`` is rejected by ``is_path``

    :author: Jairus Bowne
    :author: Sean O'Callaghan
    :author: Dominic Davis-Foster (pathlib support)
    """

    if not is_path(filled_area_file):
        raise TypeError(
            "'filled_area_file' must be a string or a pathlib.Path object")

    filled_area_file = prepare_filepath(filled_area_file)

    old_matrix = file2matrix(area_file)

    # Invert it to be a little more efficient.
    # zip() returns a one-shot iterator on Python 3, so it is materialised
    # to allow the indexing and slicing below.
    invert_old_matrix = list(zip(*old_matrix))

    # The first column holds the UIDs; the retention time is the text after
    # the last '-' of each UID. Element 0 is the column header.
    uid_list = invert_old_matrix[0][1:]
    rt_list = [uid.split('-')[-1] for uid in uid_list]

    # start setting up the output file
    # NOTE(review): columns 0 and 1 are copied and sample columns start at
    # index 3, so column 2 is not written to the output - confirm this is intended.
    invert_new_matrix = list(invert_old_matrix[0:2])

    for line in invert_old_matrix[3:]:
        sample_name = line[0]

        # Start from an empty mapping for each column so that a column with
        # no matching sample never reuses the previous sample's areas.
        rt_area_dict = {}
        for sample in sample_list:
            if sample_name in sample.get_name():
                rt_area_dict = sample.get_mp_rt_area_dict()

        new_line = [sample_name]
        for i, part in enumerate(line[1:]):
            if part == 'NA':
                try:
                    new_line.append(rt_area_dict[str(rt_list[i])])
                except KeyError:
                    # No replacement available: keep the placeholder so the
                    # column keeps the same length as the others.
                    new_line.append(part)
            else:
                new_line.append(part)

        invert_new_matrix.append(new_line)

    new_matrix = transposed(invert_new_matrix)

    # The context manager flushes and closes the file even if a write fails.
    with filled_area_file.open('w') as fp_new:
        for line in new_matrix:
            for part in line:
                fp_new.write(f"{part},")
            fp_new.write("\n")
Пример #20
0
    def write_ion_areas_csv(self,
                            ms_file_name: Union[str, pathlib.Path],
                            minutes: bool = True):
        """
        Write Ion Areas to CSV File.

        The file is ``|``-delimited. The header is ``"UID"|"RTavg"`` followed by
        one quoted column per entry of ``self.expr_code``. Each following line
        holds the composite peak's UID, its retention time, and, for every
        entry of ``self.peakpos``, either the peak's ``(mass, area)`` pairs
        (areas floored, sorted by area in descending order) or ``NA`` when the
        peak is missing at that position.

        :param ms_file_name: The name of the file
        :type ms_file_name: str, PathLike
        :param minutes: If true the retention time is divided by 60 before being written
        :type minutes: bool

        :raises TypeError: if ``ms_file_name`` is rejected by ``is_path``

        :author: David Kainer
        :author: Dominic Davis-Foster (pathlib support)
        """

        if not is_path(ms_file_name):
            raise TypeError(
                "'ms_file_name' must be a string or a PathLike object")

        ms_file_name = prepare_filepath(ms_file_name)

        with ms_file_name.open("w") as fp1:

            # create and write header
            header = ['"UID"', '"RTavg"']
            header.extend(f'"{item}"' for item in self.expr_code)
            fp1.write("|".join(header) + "\n")

            for peak_idx in range(len(self.peakpos[0])):

                ias = []
                new_peak_list = []

                for peak_list in self.peakpos:

                    peak = peak_list[peak_idx]

                    if peak is None:
                        # Keep a placeholder so every row has one cell per
                        # experiment and the "NA" branch below is reachable.
                        ias.append(None)
                        continue

                    # Build a floored copy rather than updating the mapping
                    # returned by ``peak.ion_areas`` in place, so writing the
                    # file does not alter the peak's data.
                    floored = {
                        mass: math.floor(intensity)
                        for mass, intensity in peak.ion_areas.items()
                    }
                    sorted_ia = sorted(floored.items(),
                                       key=operator.itemgetter(1),
                                       reverse=True)
                    ias.append(sorted_ia)
                    new_peak_list.append(peak)

                compo_peak = composite_peak(new_peak_list)

                # write to ms file
                fp1.write(compo_peak.UID)

                if minutes:
                    fp1.write(f"|{compo_peak.rt/60:.3f}")
                else:
                    fp1.write(f"|{compo_peak.rt:.3f}")

                for ia in ias:
                    if ia is None:
                        fp1.write("|NA")
                    else:
                        fp1.write(f"|{ia}")

                fp1.write("\n")
Пример #21
0
    def write_common_ion_csv(self,
                             area_file_name: Union[str, pathlib.Path],
                             top_ion_list: List,
                             minutes: bool = True):
        """
        Writes the alignment of common-ion peak areas to a CSV file.

        The header is ``"UID","RTavg","Quant Ion"`` followed by one quoted column
        per entry of ``self.expr_code``. Each following line holds the composite
        peak's UID, the average retention time of the aligned peaks, the
        quantitation ion from ``top_ion_list``, and one area per alignment
        (``NA`` where no area is available).

        :param area_file_name: The name for the areas alignment file
        :type area_file_name: str or os.PathLike
        :param top_ion_list: A list of the highest intensity common ion along the aligned peaks
        :type top_ion_list: ~collections.abc.Sequence
        :param minutes: An optional indicator whether to save retention times
            in minutes. If False, retention time will be saved in seconds
        :type minutes: bool, optional

        :raises TypeError: if ``area_file_name`` is rejected by ``is_path`` or
            ``top_ion_list`` is not a sequence of numbers

        :author: Woon Wai Keen
        :author: Andrew Isaac
        :author: Sean O'Callaghan
        :author: Vladimir Likic
        :author: Dominic Davis-Foster (pathlib support)
        """

        if not is_path(area_file_name):
            raise TypeError(
                "'area_file_name' must be a string or a PathLike object")

        if not is_sequence_of(top_ion_list, Number):
            raise TypeError("'top_ion_list' must be a Sequence of Numbers")

        area_file_name = prepare_filepath(area_file_name)

        with area_file_name.open("w") as fp:

            # create and write header
            header = ['"UID"', '"RTavg"', '"Quant Ion"']
            header.extend(f'"{item}"' for item in self.expr_code)
            fp.write(",".join(header) + "\n")

            rtsums = []
            rtcounts = []

            # The following two arrays will become list of lists
            # such that:
            # areas = [  [align1_peak1, align2_peak1, .....,alignn_peak1]
            #            [align1_peak2, ................................]
            #              .............................................
            #            [align1_peakm,....................,alignn_peakm]  ]
            areas: List[List] = []
            new_peak_lists: List[List[Peak]] = []

            for peak_list in self.peakpos:
                for index, peak in enumerate(peak_list):
                    # on the first iteration, populate the lists
                    if len(areas) < len(peak_list):
                        areas.append([])
                        new_peak_lists.append([])
                        rtsums.append(0)
                        rtcounts.append(0)

                    if peak is None:
                        areas[index].append(None)
                        continue

                    # get the area of the common ion for the peak
                    area = peak.get_ion_area(top_ion_list[index])

                    areas[index].append(area)
                    new_peak_lists[index].append(peak)

                    # accumulate for the average retention time
                    rtsums[index] += peak.rt
                    rtcounts[index] += 1

            # Build every row before writing so that an error while composing
            # a row leaves only the header in the file.
            out_strings = []
            for index, area_list in enumerate(areas):

                # peak unique id, peak average rt
                compo_peak = composite_peak(new_peak_lists[index])
                peak_UID_string = f'"{compo_peak.UID}"'

                rt_avg = rtsums[index] / rtcounts[index]
                if minutes:
                    # Honour the documented flag: convert to minutes only on request.
                    rt_avg = rt_avg / 60

                cells = [
                    f"{peak_UID_string},{rt_avg:.3f},{top_ion_list[index]:f}"
                ]
                for area in area_list:
                    cells.append("NA" if area is None else f"{area:.4f}")

                out_strings.append(",".join(cells))

            # now write the file
            for row in out_strings:
                fp.write(row + "\n")
0
    def write_csv(self,
                  rt_file_name: Union[str, pathlib.Path],
                  area_file_name: Union[str, pathlib.Path],
                  minutes: bool = True):
        """
        Writes the alignment to CSV files.

        This function writes two files: one containing the alignment of peak
        retention times and the other containing the alignment of peak areas.
        Both share the header ``UID,RTavg`` followed by one quoted column per
        entry of ``self.expr_code``, and each row starts with the composite
        peak's UID and retention time. Missing peaks are written as ``NA``.

        :param rt_file_name: The name for the retention time alignment file
        :type rt_file_name: str or pathlib.Path
        :param area_file_name: The name for the areas alignment file
        :type area_file_name: str or pathlib.Path
        :param minutes: An optional indicator whether to save retention times
            in minutes. If False, retention time will be saved in seconds
        :type minutes: bool, optional

        :raises TypeError: if either file name is neither a string nor a ``pathlib.Path``

        :author: Woon Wai Keen
        :author: Andrew Isaac
        :author: Vladimir Likic
        :author: David Kainer
        :author: Dominic Davis-Foster (pathlib support)
        """

        if not isinstance(rt_file_name, (str, pathlib.Path)):
            raise TypeError(
                "'rt_file_name' must be a string or a pathlib.Path object")

        if not isinstance(area_file_name, (str, pathlib.Path)):
            raise TypeError(
                "'area_file_name' must be a string or a pathlib.Path object")

        rt_file_name = prepare_filepath(rt_file_name)
        area_file_name = prepare_filepath(area_file_name)

        # Both files are closed by the context manager even if a write fails.
        with rt_file_name.open("w") as fp1, area_file_name.open("w") as fp2:

            # create header
            header = ['UID', 'RTavg']
            header.extend(f'"{item}"' for item in self.expr_code)

            # write headers
            fp1.write(",".join(header) + "\n")
            fp2.write(",".join(header) + "\n")

            # for each alignment position write alignment's peak and area
            for peak_idx in range(len(self.peakpos[0])):  # loop through peak lists (rows)

                rts = []
                areas = []
                new_peak_list = []

                for aligned_peaks in self.peakpos:
                    peak = aligned_peaks[peak_idx]

                    if peak is None:
                        rts.append(None)
                        areas.append(None)
                        continue

                    rts.append(peak.rt / 60.0 if minutes else peak.rt)
                    areas.append(peak.area)
                    new_peak_list.append(peak)

                compo_peak = composite_peak(new_peak_list)

                # The composite retention time is shared by both output rows.
                if minutes:
                    compo_rt = f",{float(compo_peak.rt / 60):.3f}"
                else:
                    compo_rt = f",{compo_peak.rt:.3f}"

                # write to retention times file
                fp1.write(compo_peak.UID)
                fp1.write(compo_rt)

                for rt in rts:
                    if rt is None or numpy.isnan(rt):
                        fp1.write(",NA")
                    else:
                        fp1.write(f",{rt:.3f}")
                fp1.write("\n")

                # write to peak areas file
                fp2.write(compo_peak.UID)
                fp2.write(compo_rt)

                for area in areas:
                    if area is None:
                        fp2.write(",NA")
                    else:
                        fp2.write(f",{area:.0f}")
                fp2.write("\n")
Пример #23
0
def JCAMP_reader(file_name: Union[str, os.PathLike]) -> GCMS_data:
    """
    Generic reader for JCAMP DX files.

    Lines starting with ``##`` are treated as labelled records:
    ``##PAGE`` records containing ``T=`` and ``##RETENTION_TIME`` records
    supply the scan times, and records whose label is in ``xydata_tags``
    mark the start of a new data table. Every other non-blank line is parsed
    as comma-separated numbers holding alternating mass and intensity values.

    :param file_name: Path of the file to read
    :type file_name: str or os.PathLike

    :return: GC-MS data object
    :rtype: :class:`pyms.GCMS.Class.GCMS_data`

    :raises TypeError: if ``file_name`` is not a string or a PathLike object
    :raises ValueError: if a scan contains an odd number of values, or if the
        number of time points differs from the number of scans

    :authors: Qiao Wang, Andrew Isaac, Vladimir Likic, David Kainer, Dominic Davis-Foster (pathlib support)
    """

    if not is_path(file_name):
        raise TypeError("'file_name' must be a string or a PathLike object")

    file_name = prepare_filepath(file_name, mkdirs=False)

    print(f" -> Reading JCAMP file '{file_name}'")

    def build_scan(values):
        """Turn a flat ``[mass, intensity, mass, intensity, ...]`` list into a Scan."""
        if len(values) % 2 == 1:
            # TODO: This means the data is not in x, y pairs
            #  Make a better error message
            raise ValueError("data not in pair !")
        # Even positions are masses, odd positions are intensities
        return Scan(values[0::2], values[1::2])

    data = []  # flat values of the scan currently being read
    page_idx = 0
    xydata_idx = 0
    time_list = []
    scan_list = []

    # Dictionary containing header information
    # (collected while parsing; not passed to the returned object)
    header_info = {}

    # The context manager guarantees the file is closed, even on a parse error
    with file_name.open('r') as lines_list:
        for line in lines_list:
            stripped = line.strip()

            if not stripped:
                continue

            if line.startswith("##"):
                # key word or information
                # partition() leaves an empty value (rather than failing)
                # when the record has no '='
                label, _, value = line.partition('=')
                label = label.lstrip('#').upper()
                value = value.strip()

                if "PAGE" in label:
                    if "T=" in value:
                        # PAGE contains retention time starting with T=
                        # FileConverter Pro style
                        # NOTE: lstrip removes any leading 'T' and '=' characters
                        time_list.append(float(value.lstrip("T=")))
                    page_idx += 1

                elif "RETENTION_TIME" in label:
                    # OpenChrom style
                    rt = float(value)  # rt for the scan to be submitted

                    # Check to make sure time is not already in the time list;
                    # Can happen when both ##PAGE and ##RETENTION_TIME are specified.
                    # The list may still be empty when no ##PAGE record carried a time.
                    if not time_list or time_list[-1] != rt:
                        time_list.append(rt)

                elif label in xydata_tags:
                    xydata_idx += 1

                elif label in header_info_fields:
                    if value.isdigit():
                        header_info[label] = int(value)
                    elif is_float(value):
                        header_info[label] = float(value)
                    else:
                        header_info[label] = value

                continue

            # Line doesn't start with ##, so it is data

            if page_idx > 1 or xydata_idx > 1:
                # A new page / data table has started since the last data line:
                # the values gathered so far form the previous, complete scan
                scan_list.append(build_scan(data))
                data = []
                page_idx = min(page_idx, 1)
                xydata_idx = min(xydata_idx, 1)

            data.extend(float(item.strip()) for item in stripped.split(',') if item.strip())

    # get last scan
    scan_list.append(build_scan(data))

    # sanity check
    time_len = len(time_list)
    scan_len = len(scan_list)
    if time_len != scan_len:
        print(time_list)
        print(scan_list)
        raise ValueError(
            f"Number of time points ({time_len}) does not equal the number of scans ({scan_len})"
        )

    return GCMS_data(time_list, scan_list)
Пример #24
0
def write_excel(
		alignment: Alignment,
		file_name: PathLike,
		minutes: bool = True,
		):
	"""
	Writes the alignment to an excel file, with colouring showing possible mis-alignments.

	Each row after the header holds one alignment position and each column from
	the third onwards holds one sample. The first two columns contain the UID and
	retention time of the composite peak for that position.

	:param alignment: :class:`pyms.DPA.Alignment.Alignment` object to write to file.
	:param file_name: The name for the retention time alignment file.
	:param minutes: Whether to save retention times in minutes.
		If :py:obj:`False`, retention time will be saved in seconds.

	:raises TypeError: if ``file_name`` is not a string or a PathLike object.

	:author: David Kainer
	"""

	if not is_path(file_name):
		raise TypeError("'file_name' must be a string or a PathLike object")

	file_name = prepare_filepath(file_name)

	wb = Workbook()
	ws = wb.active
	ws.title = "Aligned RT"

	# create header row
	ws["A1"] = "UID"
	ws["B1"] = "RTavg"
	for i, item in enumerate(alignment.expr_code):
		header_cell = ws.cell(row=1, column=i + 3, value=f"{item}")
		header_cell.comment = Comment("sample " + str(i), "dave")

	# for each alignment position write alignment's peak and area
	for peak_idx in range(len(alignment.peakpos[0])):  # loop through peak lists (rows)
		row_number = 2 + peak_idx
		new_peak_list = []

		for align_idx, sample_peaks in enumerate(alignment.peakpos):  # loops through samples (columns)
			peak = sample_peaks[peak_idx]
			column_number = 3 + align_idx

			if peak is None:
				currcell = ws.cell(row=row_number, column=column_number, value="NA")
				currcell.comment = Comment("Area: NA", "dave")
				continue

			rt = peak.rt / 60.0 if minutes else peak.rt
			new_peak_list.append(peak)

			# write the RT into the cell in the excel file
			currcell = ws.cell(row=row_number, column=column_number, value=round(rt, 3))

			# get the mini-mass spec for this peak, and divide the ion intensities by 1000 to shorten them.
			# A new dict is built so that the peak's own ion areas are left untouched.
			scaled_ia = {mass: int(intensity / 1000) for mass, intensity in peak.ion_areas.items()}
			sorted_ia = sorted(scaled_ia.items(), key=operator.itemgetter(1), reverse=True)

			# write the peak area and mass spec into the comment for the cell
			currcell.comment = Comment(f"Area: {peak.area:.0f} | MassSpec: {sorted_ia}", "dave")

		compo_peak = composite_peak(new_peak_list)

		if compo_peak is not None:
			# The average RT honours ``minutes`` so it uses the same unit as the sample columns
			avg_rt = compo_peak.rt / 60 if minutes else compo_peak.rt

			ws.cell(row=row_number, column=1, value=f'"{compo_peak.UID}"')
			ws.cell(row=row_number, column=2, value=f"{float(avg_rt):.3f}")

	# colour the cells in each row based on their RT percentile for that row
	first_sample_column = get_column_letter(3)
	for row_number, row in enumerate(ws.rows, start=1):
		cell_range = f"{first_sample_column}{row_number}:{get_column_letter(len(row))}{row_number}"
		ws.conditional_formatting.add(
				cell_range,
				ColorScaleRule(
						start_type="percentile",
						start_value=1,
						start_color="E5FFCC",
						mid_type="percentile",
						mid_value=50,
						mid_color="FFFFFF",
						end_type="percentile",
						end_value=99,
						end_color="FFE5CC"
						),
				)

	# Save once, after all rows have been formatted
	wb.save(file_name)
Пример #25
0
def write_transposed_output(
		alignment: Alignment,
		file_name: PathLike,
		minutes: bool = True,
		):
	"""
	Writes the alignment to an excel file with samples in rows and peaks in columns.

	Two sheets are created: "Aligned RT" holds the retention times and
	"Aligned Area" holds the peak areas. The first two rows of each sheet contain
	the UID and retention time of the composite peak for each alignment position.
	Cells belonging to peaks flagged as outliers are filled with a highlight colour.

	:param alignment: :class:`pyms.DPA.Alignment.Alignment` object to write to file
	:param file_name: The name of the file
	:param minutes: Whether to save retention times in minutes.
		If :py:obj:`False`, retention times are written as stored on the peaks.

	:raises TypeError: if ``file_name`` is not a string or a PathLike object.
	"""

	if not is_path(file_name):
		raise TypeError("'file_name' must be a string or a PathLike object")

	file_name = prepare_filepath(file_name)

	wb = Workbook()
	ws1 = wb.create_sheet(title="Aligned RT")
	ws2 = wb.create_sheet(title="Aligned Area")

	for sheet in (ws1, ws2):
		sheet["A1"] = "Peak"
		sheet["A2"] = "RTavg"

	style_outlier = PatternFill(fill_type="solid", fgColor="FFAE19", bgColor="FFAE19")

	# write column with sample IDs
	for i, item in enumerate(alignment.expr_code):
		ws1.cell(column=1, row=i + 3, value=f"{item}")
		ws2.cell(column=1, row=i + 3, value=f"{item}")

	# for each alignment position write alignment's peak and area
	for peak_idx in range(len(alignment.peakpos[0])):  # loop through peak lists
		cell_col = 2 + peak_idx

		# list of tuples of form (peak, col, row), but only for non-NA peaks
		new_peak_list = []

		for align_idx, sample_peaks in enumerate(alignment.peakpos):  # loops through samples
			peak = sample_peaks[peak_idx]
			cell_row = 3 + align_idx

			if peak is None:
				currcell1 = ws1.cell(column=cell_col, row=cell_row, value="NA")
				ws2.cell(column=cell_col, row=cell_row, value="NA")
				currcell1.comment = Comment("Area: NA", "dave")
				continue

			rt = peak.rt / 60.0 if minutes else peak.rt
			area = peak.area

			# these are the col,row coords of the peak in the output matrix
			new_peak_list.append((peak, cell_col, cell_row))

			# write the RT and the area into the matching cells of the two sheets
			currcell1 = ws1.cell(column=cell_col, row=cell_row, value=round(rt, 3))
			ws2.cell(column=cell_col, row=cell_row, value=round(area, 3))  # type: ignore

			# get the mini-mass spec for this peak, and divide the ion intensities by 1000 to shorten them.
			# A new dict is built so that the peak's own ion areas are left untouched.
			scaled_ia = {mass: int(intensity / 1000) for mass, intensity in peak.ion_areas.items()}
			sorted_ia = sorted(scaled_ia.items(), key=operator.itemgetter(1), reverse=True)

			# write the peak area and mass spec into the comment for the cell
			currcell1.comment = Comment(f"Area: {area:.0f} | MassSpec: {sorted_ia}", "dave")

		# this method will create the compo peak, and also mark outlier peaks with a bool is_outlier
		compo_peak = composite_peak([entry[0] for entry in new_peak_list])

		if compo_peak is None:
			continue

		# The average RT honours ``minutes`` so it uses the same unit as the sample cells
		avg_rt = compo_peak.rt / 60 if minutes else compo_peak.rt

		for sheet in (ws1, ws2):
			sheet.cell(column=cell_col, row=1, value=f'"{compo_peak.UID}"')
			sheet.cell(column=cell_col, row=2, value=f"{float(avg_rt):.3f}")

		# highlight outlier cells in the current peak list
		for peak, col, row in new_peak_list:
			if peak.is_outlier:
				ws1.cell(column=col, row=row).fill = style_outlier
				ws2.cell(column=col, row=row).fill = style_outlier

	wb.save(file_name)
Пример #26
0
def write_mass_hunter_csv(
		alignment: Alignment,
		file_name: PathLike,
		top_ion_list: List[int],
		):  # , peak_list_name):
	"""
	Creates a csv file with UID, common and qualifying ions and their
	ratios for mass hunter interpretation.

	One row is written for every alignment position that yields a composite peak.
	The two qualifying ions are parsed from the first two ``-`` separated fields of
	the composite peak's UID; if one of them equals the common ion it is replaced by
	the composite peak's third highest m/z.

	:param alignment: alignment object to write to file
	:param file_name: name of the output file.

	:param top_ion_list: a list of the common ions for each peak in the
		averaged peak list for the alignment.

	:raises TypeError: if ``file_name`` is not a string or a PathLike object.
	:raises ValueError: if ``top_ion_list`` is :py:obj:`None`.
	"""  # noqa: D400

	if not is_path(file_name):
		raise TypeError("'file_name' must be a string or a PathLike object")

	file_name = prepare_filepath(file_name)

	# Validate before the file is opened, so that an invalid call neither
	# truncates an existing file nor leaves an open handle behind.
	if top_ion_list is None:
		raise ValueError("List of common ions must be supplied")

	def intensity_ratio(qual_intensity, common_intensity) -> float:
		"""Ratio of a qualifying ion's intensity to the common ion's intensity."""
		try:
			return float(qual_intensity) / float(common_intensity)
		except TypeError:  # if no area available for that ion
			return 0.0
		except ZeroDivisionError:
			# shouldn't happen but does!!
			return 0.01

	# Regroup the aligned peaks by alignment position, such that:
	# peaks_by_position = [  [align1_peak1, align2_peak1, ....., alignn_peak1]
	#                        [align1_peak2, ................................]
	#                          .............................................
	#                        [align1_peakm, ...................., alignn_peakm]  ]
	# Positions where an experiment has no peak (None) are left out.
	peaks_by_position: List[list] = []

	for peak_list in alignment.peakpos:
		for index, peak in enumerate(peak_list):
			if index == len(peaks_by_position):
				peaks_by_position.append([])

			if peak is not None:
				peaks_by_position[index].append(peak)

	out_strings: List[str] = []

	# now write the strings for the file
	# enumerate() keeps ``index`` in step with the position even when a position is skipped
	for index, position_peaks in enumerate(peaks_by_position):

		compo_peak = composite_peak(position_peaks)
		if compo_peak is None:
			continue

		peak_UID = compo_peak.UID

		# calculate the time from the leftmost / rightmost peak to the average
		rts = [peak.rt for peak in position_peaks]
		l_window_delta = compo_peak.rt - min(rts, default=0.0)
		r_window_delta = max(rts, default=0.0) - compo_peak.rt

		common_ion = top_ion_list[index]
		uid_fields = peak_UID.split('-')
		qual_ion_1 = int(uid_fields[0])
		qual_ion_2 = int(uid_fields[1])

		if qual_ion_1 == common_ion:
			qual_ion_1 = compo_peak.get_third_highest_mz()
		elif qual_ion_2 == common_ion:
			qual_ion_2 = compo_peak.get_third_highest_mz()

		ci_intensity = compo_peak.get_int_of_ion(common_ion)
		q1_ci_ratio = intensity_ratio(compo_peak.get_int_of_ion(qual_ion_1), ci_intensity)
		q2_ci_ratio = intensity_ratio(compo_peak.get_int_of_ion(qual_ion_2), ci_intensity)

		out_strings.append(
				','.join([
						peak_UID,
						f"{common_ion}",
						f"{qual_ion_1}",
						f"{q1_ci_ratio * 100:.1f}",
						f"{qual_ion_2}",
						f"{q2_ci_ratio * 100:.1f}",
						# windows are padded by 1.5 and divided by 60
						# (presumably seconds -> minutes; confirm against the Peak class)
						f"{(l_window_delta + 1.5) / 60:.2f}",
						f"{(r_window_delta + 1.5) / 60:.2f}",
						])
				)

	# now write the file; the context manager closes it even if a write fails
	with file_name.open('w', encoding="UTF-8") as fp:
		# write headers
		fp.write(
				'"UID","Common Ion","Qual Ion 1","ratio QI1/CI","Qual Ion 2",'
				'"ratio QI2/CI","l window delta","r window delta"\n'
				)

		for row in out_strings:
			fp.write(f"{row}\n")

	# dump_object(compo_peaks, peak_list_name)