Пример #1
0
# The between-state alignment can be performed with the following commands:

# In[27]:

# Define the between-state alignment parameters
# (both are passed to PairwiseAlignment below).
Db = 10.0  # rt modulation
Gb = 0.30  # gap penalty

# Align the two existing alignments A1 and A2 against each other.
T9 = PairwiseAlignment([A1, A2], Db, Gb)
A9 = align_with_tree(T9)

# Write the resulting alignment to 'rt.csv' and 'area.csv'.
A9.write_csv(output_directory / "between_state_alignment" / 'rt.csv',
             output_directory / "between_state_alignment" / 'area.csv')

# Store the aligned peaks to disk.

# In[28]:

from pyms.Peak.List.IO import store_peaks

aligned_peaks = A9.aligned_peaks()
store_peaks(aligned_peaks,
            output_directory / "between_state_alignment" / 'peaks.bin')

# In this example the retention time tolerance for the between-state alignment
# is greater than the retention time tolerance for the within-state alignment,
# as we expect less fidelity in retention times between states. The same
# functions are used for the within-state and between-state alignment. The
# result of the alignment is saved to file as the area and retention time
# matrices (described above).
Пример #2
0
    def store(self, filename=None):
        """
        Save the Project to a file.

        The GC-MS data, intensity matrix, TIC, peak list and experiment are
        first written to a temporary directory and then packed into a tar
        archive at ``self.filename.value``, together with an ``info.json``
        metadata file and the method file. When ``self.identification_performed``
        is true, the identified peaks and the identification audit record are
        included as well.

        :param filename: Optional new location for the archive. When given
            (and truthy) it replaces ``self.filename.value`` before writing.
        :type filename: presumably a str or path-like object -- confirm with callers

        :return: ``self.filename`` (the attribute object, not its ``.value``).
        :rtype: same type as ``self.filename``

        :raises ValueError: if any of ``expr``, ``tic``, ``peak_list``,
            ``intensity_matrix`` or ``gcms_data`` is ``None``. The check runs
            before anything is modified, so a failed call leaves
            ``self.filename`` and ``self.date_modified`` untouched.
        """

        # Validate first: a call that cannot succeed must not change the
        # stored filename or the modification date.
        required = (self.expr, self.tic, self.peak_list,
                    self.intensity_matrix, self.gcms_data)
        if any(item is None for item in required):
            raise ValueError("Must call 'Experiment.run()' before 'store()'")

        if filename:
            self.filename.value = filename

        self.date_modified.value = time_now()

        # Write experiment, tic and peak list to temporary directory
        with tempfile.TemporaryDirectory() as tmp:

            def in_tmp(name):
                """Return the path of ``name`` inside the temporary directory."""
                return os.path.join(tmp, name)

            self.gcms_data.dump(in_tmp("gcms_data.dat"))
            self.intensity_matrix.dump(in_tmp("intensity_matrix.dat"))
            self.tic.write(in_tmp("tic.dat"), formatting=False)
            store_peaks(self.peak_list, in_tmp("peaks.dat"), 3)
            store_expr(in_tmp("experiment.expr"), self.expr)

            with tarfile.open(self.filename.value,
                              mode="w") as experiment_file:
                experiment_data = {
                    "name": str(self.name),
                    "user": str(self.user),
                    "device": str(self.device),
                    "date_created": float(self.date_created),
                    "date_modified": float(self.date_modified),
                    "description": str(self.description),
                    "version": "1.0.0",
                    "method": str(self.method),
                    "original_filename": str(self.original_filename),
                    "original_filetype": int(self.original_filetype),
                    "identification_performed": self.identification_performed,
                    "ident_audit_record": None,
                }

                if self.identification_performed:
                    experiment_data["ident_audit_record"] = dict(
                        self.ident_audit_record)
                    store_peaks(self.ident_peaks, in_tmp("ident_peaks.dat"), 3)
                    experiment_file.add(in_tmp("ident_peaks.dat"),
                                        arcname="ident_peaks.dat")

                # Add the info file to the archive directly from memory;
                # TarInfo.size must be set to the payload length.
                info_json = json.dumps(experiment_data,
                                       indent=4).encode("utf-8")
                tarinfo = tarfile.TarInfo('info.json')
                tarinfo.size = len(info_json)
                experiment_file.addfile(tarinfo=tarinfo,
                                        fileobj=BytesIO(info_json))

                # Add the method to the archive
                experiment_file.add(self.method.value,
                                    arcname=filename_only(self.method.value))

                # Add the experiment, tic, peak list, gcms_data and
                # intensity_matrix (same member order as before).
                for member in ("experiment.expr", "tic.dat", "peaks.dat",
                               "gcms_data.dat", "intensity_matrix.dat"):
                    experiment_file.add(in_tmp(member), arcname=member)

        return self.filename
Пример #3
0
def test_align_2_alignments(A1, pyms_datadir, tmp_pathplus):
    """
    Build a second alignment from the ``geco_codes`` JCAMP files, align it
    with the ``A1`` fixture and write the results below ``tmp_pathplus``.
    """
    experiments = []

    for code in geco_codes:
        data = JCAMP_reader(pyms_datadir / f"{code}.JDX")
        intensity_matrix = build_intensity_matrix_i(data)

        # Intensity matrix size is (scans, masses); only the mass count is used.
        _n_scan, n_mz = intensity_matrix.size

        # Noise filter and baseline correct every ion chromatogram.
        for mz_index in range(n_mz):
            raw_ic = intensity_matrix.get_ic_at_index(mz_index)
            corrected_ic = tophat(savitzky_golay(raw_ic), struct="1.5m")
            intensity_matrix.set_ic_at_index(mz_index, corrected_ic)

        detected = BillerBiemann(intensity_matrix, points=9, scans=2)
        filtered = num_ions_threshold(rel_threshold(detected, 2), 3, 3000)

        for peak in filtered:
            # Set the mass range and ignore the TMS ions.
            peak.crop_mass(50, 400)
            for tms_mass in (73, 147):
                peak.null_mass(tms_mass)

            # Attach the summed area, then the per-ion areas.
            peak.area = peak_sum_area(intensity_matrix, peak)
            peak.ion_areas = peak_top_ion_areas(intensity_matrix, peak)

        experiment = Experiment(code, filtered)

        # Use the same time range for all experiments.
        experiment.sele_rt_range(["6.5m", "21m"])
        experiments.append(experiment)

    within_tree = PairwiseAlignment(exprl2alignment(experiments), Dw, Gw)
    A2 = align_with_tree(within_tree, min_peaks=2)

    # Between replicates alignment parameters
    rt_modulation = 10.0
    gap_penalty = 0.30

    print("Aligning input {1,2}")
    between_tree = PairwiseAlignment([A1, A2], rt_modulation, gap_penalty)
    combined = align_with_tree(between_tree)

    combined.write_csv(tmp_pathplus / "rt.csv", tmp_pathplus / "area.csv")

    # Drop falsy entries before storing the aligned peaks.
    kept_peaks = [entry for entry in combined.aligned_peaks() if entry]
    store_peaks(kept_peaks, tmp_pathplus / "peaks.bin")

    common_ions = combined.common_ion()
    combined.write_common_ion_csv(tmp_pathplus / "area.csv", common_ions)
Пример #4
0
 def test_store_peak_list_errors(self, filtered_peak_list, obj):
     """
     ``store_peaks`` must raise ``TypeError`` when ``obj`` is passed as its
     second argument (the position used for the output file elsewhere).

     NOTE(review): ``obj`` is presumably supplied by a parametrize decorator
     that is not visible here -- confirm.
     """
     with pytest.raises(TypeError):
         store_peaks(filtered_peak_list, obj)
Пример #5
0
# Align the first set of experiments (F1, Dw and Gw are defined earlier in
# the script) and write the result as retention time and area CSV files.
T1 = PairwiseAlignment(F1, Dw, Gw)
A1 = align_with_tree(T1, min_peaks=2)

A1.write_csv('output/Art.csv', 'output/Aarea.csv')

print('Aligning expt B')
# Load every stored experiment listed in exprB_codes from expr_dir.
expr_list = []
expr_dir = "../61b/output/"
for expr_code in exprB_codes:
    file_name = os.path.join(expr_dir, expr_code + ".expr")
    expr = load_expr(file_name)
    expr_list.append(expr)
# Align the loaded experiments using the same Dw / Gw parameters as above.
F2 = exprl2alignment(expr_list)
T2 = PairwiseAlignment(F2, Dw, Gw)
A2 = align_with_tree(T2, min_peaks=2)

A2.write_csv('output/Brt.csv', 'output/Barea.csv')

# between replicates alignment parameters
Db = 10.0  # rt modulation
Gb = 0.30  # gap penalty

print('Aligning input {1,2}')
# Align the two alignments A1 and A2 with each other.
T9 = PairwiseAlignment([A1, A2], Db, Gb)
A9 = align_with_tree(T9)

A9.write_csv('output/rt.csv', 'output/area.csv')

# Store the aligned peaks to disk.
aligned_peaks = A9.aligned_peaks()
store_peaks(aligned_peaks, 'output/peaks.bin')
Пример #6
0
def peak_list_filename(im, filtered_peak_list, outputdir):
    """
    Store ``filtered_peak_list`` as ``filtered_peak_list.dat`` inside
    ``outputdir`` and return the path of the file written.

    ``im`` is accepted but not used by the body.
    """
    target = outputdir / "filtered_peak_list.dat"
    store_peaks(filtered_peak_list, target)
    return target
Пример #7
0
 def test_store_filename_errors(self, outputdir, obj):
     """
     ``store_peaks`` must raise ``TypeError`` when ``obj`` is passed as its
     first argument (the position used for the peak list elsewhere).

     NOTE(review): ``obj`` and ``test_string`` are defined outside this
     block -- presumably a parametrized value and a module constant; confirm.
     """
     with pytest.raises(TypeError):
         store_peaks(obj, outputdir / test_string)
Пример #8
0
 def test_store_filename_errors(self, tmp_pathplus, obj):
     """
     ``store_peaks`` must raise ``TypeError`` when ``obj`` is passed as its
     first argument (the position used for the peak list elsewhere).

     NOTE(review): ``obj`` and ``test_string`` are defined outside this
     block -- presumably a parametrized value and a module constant; confirm.
     """
     with pytest.raises(TypeError):
         store_peaks(obj, tmp_pathplus / test_string)
Пример #9
0
def peak_list_filename(im, filtered_peak_list, tmp_pathplus):
    """
    Store ``filtered_peak_list`` as ``filtered_peak_list.dat`` inside
    ``tmp_pathplus`` and return the path of the file written.

    ``im`` is accepted but not used by the body.
    """
    target = tmp_pathplus / "filtered_peak_list.dat"
    store_peaks(filtered_peak_list, target)
    return target
Пример #10
0
def import_processing(jcamp_file, spectrum_csv_file, report_csv_file, combined_csv_file, bb_points = 9, bb_scans = 2, noise_thresh = 2, target_range = (0,120), tophat_struct="1.5m", nistpath = "../MSSEARCH", base_peak_filter = ('73',), ExprDir = "."):
	"""
	Process one JCAMP-DX file: export the binned spectra, detect and filter
	peaks, write report files and store the TIC, peak list and experiment.

	Files written:

	* ``spectrum_csv_file`` -- one row of binned intensities per scan.
	* ``report_csv_file`` -- one row per reported peak followed by its five hits.
	* ``combined_csv_file`` -- per-peak summary with one line per hit.
	* ``<sample>_tic.dat``, ``<sample>_peaks.dat`` and ``<sample>.expr`` in ``ExprDir``,
	  where ``<sample>`` is the base name of ``jcamp_file`` without extension.

	:param jcamp_file: path of the JCAMP-DX file to read.
	:param spectrum_csv_file: output path for the binned spectra CSV.
	:param report_csv_file: output path for the peak report CSV.
	:param combined_csv_file: output path for the combined CSV.
	:param bb_points: ``points`` argument passed to ``BillerBiemann``.
	:param bb_scans: ``scans`` argument passed to ``BillerBiemann``.
	:param noise_thresh: second argument passed to ``num_ions_threshold``.
	:param target_range: ``(lower, upper)`` retention time window in minutes;
		peaks with ``lower < RT <= upper`` are reported.
	:param tophat_struct: ``struct`` argument passed to ``tophat``.
	:param nistpath: stored in the module-level global ``nist_path``
		(presumably read by the NIST search helpers -- confirm).
	:param base_peak_filter: iterable of m/z values (converted with ``int``);
		peaks whose base peak is at one of them are skipped.
	:param ExprDir: directory for the TIC, peak list and experiment files.

	:return: always ``0``.
	"""
	global nist_path
	nist_path = nistpath

	# Parameters
	base_peak_filter = {int(x) for x in base_peak_filter}
	target_range = tuple(target_range)
	sample_name = os.path.splitext(os.path.basename(jcamp_file))[0]
	number_of_peaks = 80

	data = JCAMP_reader(jcamp_file)

	# list of all retention times, in seconds
	times = data.get_time_list()
	# get Total Ion Chromatogram
	tic = data.get_tic()
	# RT Range, time step, no. scans, min, max, mean and median m/z
	data.info()

	# Mass Binning
	im = build_intensity_matrix_i(data) # convert to intensity matrix

	print(" Minimum m/z bin: {}".format(im.get_min_mass()))
	print(" Maximum m/z bin: {}".format(im.get_max_mass()))

	# Write Binned Mass Spectra to OpenChrom-like CSV file
	ms = im.get_ms_at_index(0) # first mass spectrum, used for the header
	with open(spectrum_csv_file, 'w') as spectrum_csv:
		spectrum_csv.write('RT(milliseconds);RT(minutes) - NOT USED BY IMPORT;RI;')
		spectrum_csv.write(';'.join(str(mz) for mz in ms.mass_list))
		spectrum_csv.write("\n")

		for scan, scan_time in enumerate(times):
			spectrum_csv.write("{};{};{};".format(int(scan_time*1000),rounders((scan_time/60),"0.0000000000"),0))
			ms = im.get_ms_at_index(scan)
			spectrum_csv.write(';'.join(str(intensity) for intensity in ms.mass_spec))
			spectrum_csv.write('\n')

	## Data filtering

	# Note that Turbomass does not use smoothing for qualitative method.
	# Top-hat baseline Correction seems to bring down noise,
	#  retaining shapes, but keeps points on actual peaks
	_, n_mz = im.get_size()
	for ii in range(n_mz):
		ic = im.get_ic_at_index(ii)
		ic_smooth = savitzky_golay(ic)
		ic_bc = tophat(ic_smooth, struct=tophat_struct)
		im.set_ic_at_index(ii, ic_bc)

	# Peak Detection based on Biller and Biemann, 1974, with a window
	#  of n points, and combining y scans if they apex next to each other
	peak_list = BillerBiemann(im, points=bb_points, scans=bb_scans)

	print(" Number of peaks identified before filtering: {}".format(len(peak_list)))

	# Filtering peak lists with automatic noise filtering
	noise_level = window_analyzer(tic)
	peak_list = num_ions_threshold(peak_list, noise_thresh, noise_level)
	print(" Number of peaks identified: {}".format(len(peak_list)))

	# Peak Areas
	peak_area_list = []
	filtered_peak_list = []

	for peak in peak_list:
		apex_spectrum = peak.get_mass_spectrum()
		apex_mass_list = apex_spectrum.mass_list
		apex_mass_spec = apex_spectrum.mass_spec
		base_peak_intensity = max(apex_mass_spec)
		# first position holding the maximum intensity
		base_peak_index = [index for index, intensity in enumerate(apex_mass_spec) if intensity == base_peak_intensity][0]
		base_peak_mass = apex_mass_list[base_peak_index]
		if base_peak_mass in base_peak_filter:
			continue # skip the peak if the base peak is at e.g. m/z 73, i.e. septum bleed

		area = peak_sum_area(im, peak)
		peak.set_area(area)
		peak_area_list.append(area)
		filtered_peak_list.append(peak)

	# Save the TIC and Peak List
	tic.write(os.path.join(ExprDir,"{}_tic.dat".format(sample_name)),formatting=False)
	store_peaks(filtered_peak_list,os.path.join(ExprDir,"{}_peaks.dat".format(sample_name)))

	# Indices into filtered_peak_list, ordered by ascending peak area
	top_peaks = sorted(range(len(peak_area_list)), key=lambda x: peak_area_list[x])

	# report_csv: turbomass-like CSV file
	# combine_csv: GunShotMatch Combine-like CSV file
	# Both are closed by the context manager even if a step below raises.
	with open(report_csv_file, "w") as report_csv, open(combined_csv_file, "w") as combine_csv:
		combine_csv.write(sample_name)
		combine_csv.write("\n")

		report_csv.write("#;RT;Scan;Height;Area\n")
		combine_csv.write("Retention Time;Peak Area;;Lib;Match;R Match;Name;CAS Number;Scan\n")

		report_buffer = []

		# peak_number is the 1-based position in the ascending-area ordering
		for peak_number, index in enumerate(top_peaks, start=1):
			peak = filtered_peak_list[index]
			peak_rt = peak.get_rt()

			# Retention time (minutes, 3dp)
			RT = rounders(peak_rt/60,"0.000")

			if not target_range[0] < RT <= target_range[1]:
				continue # skip the peak if it is outside the desired range

			# scan number (1-based)
			scan_index = data.get_index_at_time(peak_rt)
			Scan = scan_index+1
			# TIC intensity, as proxy for Peak height, which should be from baseline
			Height = '{:,}'.format(rounders(tic.get_intensity_at_index(scan_index),"0"))
			# Peak area, originally in "intensity seconds", so dividing by 60 to
			#  get "intensity minutes" like turbomass uses
			Area = '{:,}'.format(rounders(peak.get_area()/60,"0.0"))

			report_buffer.append([peak_number, RT, Scan, Height, Area])

		# Largest areas first, keep at most number_of_peaks, then order by scan
		filtered_report_buffer = report_buffer[::-1][:number_of_peaks]
		filtered_report_buffer.sort(key=operator.itemgetter(2))

		for page, row in enumerate(filtered_report_buffer, start=1):
			# NOTE(review): no ';' is written between the last row field and
			# the first hit below -- looks unintended, left as-is so the
			# file format does not change.
			report_csv.write(";".join(str(i) for i in row))

			ms = im.get_ms_at_index(row[2]-1)

			spectrum_name = "{}_{}".format(sample_name,row[1])
			create_msp(spectrum_name,ms.mass_list, ms.mass_spec)
			matches_dict = nist_ms_comparison(spectrum_name,ms.mass_list, ms.mass_spec)

			# number_of_peaks is 80, so this still reads "Page N of 80"
			combine_csv.write("{};{};Page {} of {};;;;;;{}\n".format(row[1],row[4],page,number_of_peaks,row[2]))

			for hit in range(1,6):
				hit_data = matches_dict["Hit{}".format(hit)]
				report_csv.write(str(hit_data))
				report_csv.write(";")
				combine_csv.write(";;{};{};{};{};{};{};\n".format(hit,
						hit_data["Lib"],
						hit_data["MF"],
						hit_data["RMF"],
						hit_data["Name"],
						hit_data["CAS"],
						))

			report_csv.write("\n")

			# Pause between searches (kept from the original; reason not
			# documented here).
			time.sleep(2)

	# Create an experiment
	expr = Experiment(sample_name, filtered_peak_list)
	expr.sele_rt_range(["{}m".format(target_range[0]),"{}m".format(target_range[1])])
	store_expr(os.path.join(ExprDir,"{}.expr".format(sample_name)), expr)

	return 0
Пример #11
0
# Detect peaks in the intensity matrix (im is defined earlier in the script).
peak_list = BillerBiemann(im, points=9, scans=2)

print("Number of peaks found: ", len(peak_list))

# Filter peaks
# Filter the peak list,
# first by removing all intensities in a peak less than a given relative
# threshold,
# then by removing all peaks that have fewer than a given number of ions above
# a given value

# Parameters
# percentage ratio of ion intensity to max ion intensity
r = 2

# minimum number of ions, n
n = 3
# greater than or equal to threshold, t
t = 10000

# trim by relative intensity
pl = rel_threshold(peak_list, r)

# trim by threshold
new_peak_list = num_ions_threshold(pl, n, t)

print("Number of filtered peaks: ", len(new_peak_list))

# store peak list
store_peaks(new_peak_list, 'output/peaks.bin')
Пример #12
0
	def quantitative_processing(self, jcamp_file, log_stdout=True):
		"""
		Import a JCAMP-DX file, detect and filter its peaks, and store the
		TIC, peak list and experiment in ``self.config.expr_dir``.

		:param jcamp_file: path of the JCAMP-DX file to process. Its base name
			without extension is used as the sample name for all output files.
		:type jcamp_file: presumably str or path-like -- confirm with callers
		:param log_stdout: when true, everything printed during processing is
			written to ``<sample name>.log`` in ``self.config.log_dir``. The log
			file is closed and ``sys.stdout`` is restored when the method
			finishes, whether it returns or raises.
		:type log_stdout: bool

		:return: nothing.
		:rtype: None
		"""

		# Local import: the file's import block is not visible from here.
		import contextlib

		# Determine the name of the sample from the filename
		sample_name = os.path.splitext(os.path.basename(jcamp_file))[0]

		with contextlib.ExitStack() as stack:
			# Log Stdout to File, for the duration of this call only
			if log_stdout:
				log_path = os.path.join(self.config.log_dir, sample_name + ".log")
				log_file = stack.enter_context(open(log_path, "w"))
				stack.enter_context(contextlib.redirect_stdout(log_file))

			# Load data using JCAMP_reader
			data = JCAMP_reader(jcamp_file)

			# get Total Ion Chromatogram
			tic = data.get_tic()
			# RT Range, time step, no. scans, min, max, mean and median m/z
			data.info()

			# Build "intensity matrix" by binning data with integer bins and a
			# 	window of -0.3 to +0.7, the same as NIST uses
			im = build_intensity_matrix_i(data)

			# Show the m/z of the maximum and minimum bins
			print(" Minimum m/z bin: {}".format(im.get_min_mass()))
			print(" Maximum m/z bin: {}".format(im.get_max_mass()))

			# Crop masses, clamping the configured range to what the
			# intensity matrix actually contains
			min_mass, max_mass, *_ = self.config.mass_range
			min_mass = max(min_mass, im.get_min_mass())
			max_mass = min(max_mass, im.get_max_mass())
			im.crop_mass(min_mass, max_mass)

			# Perform Data filtering
			_, n_mz = im.get_size()

			# Iterate over each IC in the intensity matrix
			for ii in range(n_mz):
				ic = im.get_ic_at_index(ii)

				# Perform Savitzky-Golay smoothing.
				# Note that Turbomass does not use smoothing for qualitative method.
				ic_smooth = savitzky_golay(ic)

				# Perform Tophat baseline correction
				# Top-hat baseline Correction seems to bring down noise,
				#  		retaining shapes, but keeps points on actual peaks
				ic_bc = tophat(ic_smooth, struct=self.config.tophat_struct)

				# Set the IC in the intensity matrix to the filtered one
				im.set_ic_at_index(ii, ic_bc)

			# Peak Detection based on Biller and Biemann (1974), with a window
			# 	of <points>, and combining <scans> if they apex next to each other
			peak_list = BillerBiemann(im, points=self.config.bb_points, scans=self.config.bb_scans)

			print(" Number of peaks identified before filtering: {}".format(len(peak_list)))

			# Filtering peak lists with automatic noise filtering
			noise_level = window_analyzer(tic)
			# should we also do rel_threshold() here?
			# https://pymassspec.readthedocs.io/en/master/pyms/BillerBiemann.html#pyms.BillerBiemann.rel_threshold
			peak_list = num_ions_threshold(peak_list, self.config.noise_thresh, noise_level)

			filtered_peak_list = []

			for peak in peak_list:
				# Get mass and intensity lists for the mass spectrum at the apex of the peak
				apex_mass_list = peak.mass_spectrum.mass_list
				apex_mass_spec = peak.mass_spectrum.mass_spec

				# Determine the intensity of the base peak in the mass spectrum
				base_peak_intensity = max(apex_mass_spec)

				# Determine the index of the base peak in the mass spectrum
				# (first position holding the maximum intensity)
				base_peak_index = [
					index for index, intensity in enumerate(apex_mass_spec)
					if intensity == base_peak_intensity][0]

				# Finally, determine the mass of the base peak
				base_peak_mass = apex_mass_list[base_peak_index]

				# skip the peak if the base peak is at e.g. m/z 73, i.e. septum bleed
				if base_peak_mass in self.config.base_peak_filter:
					continue

				area = peak_sum_area(im, peak)
				peak.set_area(area)
				filtered_peak_list.append(peak)

			print(" Number of peaks identified: {}".format(len(filtered_peak_list)))

			# Save the TIC and Peak List
			tic.write(os.path.join(self.config.expr_dir, "{}_tic.dat".format(sample_name)), formatting=False)
			store_peaks(filtered_peak_list, os.path.join(self.config.expr_dir, "{}_peaks.dat".format(sample_name)))

			# Create an experiment
			expr = Experiment(sample_name, filtered_peak_list)
			expr.sele_rt_range(["{}m".format(self.config.target_range[0]), "{}m".format(self.config.target_range[1])])
			store_expr(os.path.join(self.config.expr_dir, "{}.expr".format(sample_name)), expr)