def main():
    """Convert a Bruker TDF raw directory to an mzML or sqMass file.

    Usage: ``script.py <tdf_directory> <output.mzML|output.sqmass>``

    Reads the frame count from the TDF SQLite database, builds the
    appropriate pyopenms consumer from the output extension, then streams
    every frame through ``store_frame``.

    Raises:
        RuntimeError: if fewer than two arguments are supplied, or the
            output filename has an unrecognized extension.
    """
    if len(sys.argv) < 3:
        raise RuntimeError("need arguments: tdf_directory output.mzML")

    analysis_dir = sys.argv[1]
    output_fname = sys.argv[2]

    if sys.version_info.major == 2:
        # Bruker SDK expects a unicode path on Python 2.
        analysis_dir = unicode(analysis_dir)  # noqa: F821

    td = timsdata.TimsData(analysis_dir)
    conn = td.conn

    # Get total frame count:
    row = conn.execute("SELECT COUNT(*) FROM Frames").fetchone()
    N = row[0]
    print("Analysis has {0} frames.".format(N))

    # Build the output consumer from the file extension.
    consumer = None
    if output_fname.lower().endswith("mzml"):
        consumer = pyopenms.PlainMSDataWritingConsumer(output_fname)

        # Compress output (best effort: older pyopenms builds may lack
        # NumpressConfig, so fall back to uncompressed output with a
        # warning instead of silently swallowing the error).
        try:
            opt = consumer.getOptions()

            cfg = pyopenms.NumpressConfig()
            cfg.estimate_fixed_point = True
            cfg.numpressErrorTolerance = -1.0  # skip check, faster
            cfg.setCompression("linear")
            cfg.linear_fp_mass_acc = -1  # set the desired RT accuracy in seconds
            opt.setNumpressConfigurationMassTime(cfg)

            cfg = pyopenms.NumpressConfig()
            cfg.estimate_fixed_point = True
            cfg.numpressErrorTolerance = -1.0  # skip check, faster
            cfg.setCompression("slof")
            opt.setNumpressConfigurationIntensity(cfg)

            opt.setCompression(True)  # zlib compression
            consumer.setOptions(opt)
        except Exception as exc:
            print("Warning: could not enable numpress compression: "
                  "{0}".format(exc))
    elif output_fname.lower().endswith("sqmass"):
        consumer = pyopenms.MSDataSqlConsumer(output_fname)
    else:
        # Original code fell through with `consumer` undefined and crashed
        # with a NameError in the loop below; fail with a clear message.
        raise RuntimeError(
            "Unsupported output format: {0}".format(output_fname))

    for frame_id in range(N):
        store_frame(frame_id + 1, td, conn, consumer, compressFrame=True)
    # --- tail of a frame-writing helper; its `def` line is outside this view,
    # so the exact enclosing indentation is a reconstruction. NOTE(review):
    # `e` (MSExperiment), `s` (spectrum), `verbose`, `mz`, `intens`,
    # `ook0_axis`, `k` and `filename` are presumably bound earlier in the
    # enclosing function — confirm against the full source.
    e.addSpectrum(s)
    if verbose:
        print "scan ", k
    if len(mz) > 0 and verbose:
        # Debug dump of the spectrum contents (Python 2 print statements).
        print " len: ", len(mz), len(intens)
        print " at ook0", ook0_axis[k]
        print " len: ", mz, intens
        for p in s:
            print p.getMZ(), p.getIntensity()

    # Store file at designated position
    pyopenms.MzMLFile().store(filename, e)


# --- module-level driver: open the TDF dataset and write one mzML per frame.
td = timsdata.TimsData(analysis_dir)
conn = td.conn

# Get total frame count:
q = conn.execute("SELECT COUNT(*) FROM Frames")
row = q.fetchone()
N = row[0]
print("Analysis has {0} frames.".format(N))

# For testing
## store_frame(10, "/tmp/test2.mzML")

# NOTE(review): the third argument `q` (an exhausted cursor) looks odd for
# a call that elsewhere takes a connection/consumer — verify store_frame's
# signature in this version of the script.
for frame_id in range(N):
    store_frame(frame_id+1, "/tmp/test_%s.mzML" % (frame_id+1), q)
def write_mzml(args):
    """Write an mzML file from a Bruker TDF dataset.

    Reads the parsed command line arguments and uses the psims package to
    format and write the mzML output.

    Parameters
    ----------
    args : argparse.Namespace
        Namespace object from argparse.

    Returns
    -------
    None
    """
    mzml_data_struct = process_arg(args)

    ### Connect to TDF DB
    logging.info("transforming TDF to mzML file: {}".format(
        mzml_data_struct['input']))

    mzml_data_struct['td'] = timsdata.TimsData(mzml_data_struct['input'])

    ### DIA methods have no Precursors table, so probe sqlite_master first.
    if mzml_data_struct['td'].conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name='Precursors'"
    ).fetchone():
        precursor_table_exists = True
    else:
        precursor_table_exists = False

    mzml_data_struct['data_dict'] = get_spectrum_dict(mzml_data_struct,
                                                      precursor_table_exists)

    logging.info("{} Total Frames.".format(
        mzml_data_struct['data_dict']['frame_count']))
    logging.info("{} Total Spectra.".format(
        mzml_data_struct['data_dict']['total_spectra']))
    logging.info("{} MS1 Frames.".format(
        mzml_data_struct['data_dict']['ms1_spectra_count']))
    logging.info("{} MS2 Merged Scans.".format(
        mzml_data_struct['data_dict']['ms2_spectra_count']))
    # typo fixed: "writting" -> "writing"
    logging.info("writing to mzML file: {}".format(
        mzml_data_struct['output']))

    mzml_data_struct['writer'] = MzMLWriter(mzml_data_struct['output'])
    mzml_data_struct['writer'].begin()
    write_header(mzml_data_struct)

    # Get Spectra number in specified range
    total_spectra_count = get_num_spectra(mzml_data_struct,
                                          precursor_table_exists)
    logging.info("Processing {} Spectra.".format(total_spectra_count))
    # typo fixed: "Formating" -> "Formatting"
    logging.info("Reading, Merging and Formatting Frames for mzML")

    with mzml_data_struct['writer'].run(
            id=1,
            instrument_configuration='IC1',
            start_time=mzml_data_struct['data_dict']['acq_date_time']):
        with mzml_data_struct['writer'].spectrum_list(
                count=total_spectra_count):
            # Process Frames
            mzml_data_struct['scan_loop_time1'] = time.time()
            mzml_data_struct['scan_index'] = 1

            mzml_data_struct['precursor_frames'] = mzml_data_struct[
                'td'].conn.execute(
                    "SELECT * From Frames where MsMsType=0").fetchall()

            # Clamp the upper frame bound to the dataset size (-1 == "all").
            if mzml_data_struct['end_frame'] == -1 or mzml_data_struct[
                    'end_frame'] > mzml_data_struct['data_dict']['frame_count']:
                mzml_data_struct['end_frame'] = mzml_data_struct['data_dict'][
                    'frame_count']

            for precursor_frame in mzml_data_struct['precursor_frames']:
                # Get Precursor Frame ID and retention time (minutes).
                mzml_data_struct['current_precursor'] = {}
                mzml_data_struct['current_precursor']['id'] = \
                    precursor_frame[0]
                mzml_data_struct['current_precursor']['start_time'] = \
                    precursor_frame[1] / 60

                # Skip frames outside the requested frame range.
                if mzml_data_struct['current_precursor']['id'] < \
                        mzml_data_struct['start_frame'] or \
                        mzml_data_struct['current_precursor']['id'] > \
                        mzml_data_struct['end_frame']:
                    continue

                write_precursor_frame(mzml_data_struct)
                logging.debug(mzml_data_struct['scan_index'])
                scan_progress(mzml_data_struct)

                # PASEF MS2 spectra exist only when a Precursors table does.
                if precursor_table_exists:
                    for precursor_data in get_precursor_list(mzml_data_struct):
                        mzml_data_struct['current_precursor'][
                            'data'] = precursor_data
                        write_pasef_msms_spectrum(mzml_data_struct)
                        scan_progress(mzml_data_struct)

    logging.info("Writing final mzML")
    mzml_data_struct['writer'].end()

    return
def run_timstof_conversion(input, output=''):
    """Convert a timsTOF TDF dataset into .ms1 / .ms2 text files.

    NOTE(review): the source was whitespace-collapsed; block structure below
    (notably the extent of ``with conn:``) is a reconstruction — verify
    against the original file. The parameter name ``input`` shadows the
    builtin but cannot be renamed without breaking keyword callers.

    Parameters
    ----------
    input : str
        Path to the TDF analysis directory.
    output : str, optional
        Output .ms2 filename; the .ms1 name is derived from it. When empty,
        names are derived from the analysis directory basename ('_nopd').

    Relies on module globals: ``convert_ms1``, ``convert_ms2``, ``version``,
    ``start_time``, plus the declared ``place_high`` / ``precursor_counter``.
    """
    global place_high
    global precursor_counter
    analysis_dir = input
    td = timsdata.TimsData(analysis_dir)
    conn = td.conn
    # create a database connection
    #conn = create_connection(analysis_dir)
    precursor_map = {}
    with conn:
        # print("2. Query all tasks")
        msms_data = select_all_PasefFrameMsMsInfo(conn)
        # print msms_data[0:5]
        all_frame = select_all_Frames(conn)
        # print all_frame[0:5]
        precursor_list = select_all_Precursors(conn)
        # Group precursors by their parent (MS1) frame id — last column.
        for row in precursor_list:
            parent_id = int(row[-1])
            if parent_id not in precursor_map:
                precursor_map[parent_id] = []
            precursor_map[parent_id].append(row)
        # MsMsType column (index 4) '0' selects MS1 frames; note the value is
        # compared as a string — presumably rows come back as text here.
        all_ms1_frames = [a for a in all_frame if a[4] == '0']
        frame_id_ms1_scan_map, ms2_scan_map = build_frame_id_ms1_scan_map(
            precursor_map, all_ms1_frames)
        #offset_map = build_offset_map(precursor_map, all_ms1_frames)
        precursor_array = np.array(
            precursor_list
        )  # 'ID', 'LargestPeakMz', 'AverageMz', 'MonoisotopicMz', 'Charge', 'ScanNumber', 'Intensity', 'Parent'
        # frame_parent_dict = msms_frame_parent_dict(all_frame)
        # parent_ms2_scan_map = build_frame_id_last_ms2_scan_map(precursor_list)
        parent_frame_array = np.array(precursor_array[:, 7])
        # Record the index where each new parent-frame run begins.
        frame_index_list = []
        last_val = 0
        for idx, val in enumerate(parent_frame_array):
            if val != last_val:
                frame_index_list.append(idx)
                last_val = val
        # frame_index_list.append(idx + 1)
        # frame_start_end_dict = {}
        #for idx, val in enumerate(frame_index_list[:-1]):
        #    frame_start_end_dict[parent_frame_array[val]] = (frame_index_list[idx], frame_index_list[idx + 1])
        ms2_header = 'H\tExtractor\tTimsTOF_extractor\n' \
                     'H\tExtractorVersion\t{}\n' \
                     'H\tPublicationDate\t20-02-2020\n' \
                     'H\tComments\tTimsTOF_extractor written by Yu Gao, 2018\n' \
                     'H\tComments\tTimsTOF_extractor modified by Titus Jung, 2019\n' \
                     'H\tExtractorOptions\tMSn\n' \
                     'H\tAcquisitionMethod\tData-Dependent\n' \
                     'H\tInstrumentType\tTIMSTOF\n' \
                     'H\tDataType\tCentroid\n' \
                     'H\tScanType\tMS2\n' \
                     'H\tResolution\n' \
                     'H\tIsolationWindow\n' \
                     'H\tFirstScan\t1\n' \
                     'H\tLastScan\t{}\n' \
                     'H\tMonoIsotopic PrecMz\tTrue\n'.format(version, len(msms_data))
        ms1_header = 'H\tExtractor\tTimsTOF_extractor\n' \
                     'H\tExtractorVersion\t{}\n' \
                     'H\tPublicationDate\t20-02-2020\n' \
                     'H\tComments\tTimsTOF_extractor written by Yu Gao, 2018\n' \
                     'H\tComments\tTimsTOF_extractor modified by Titus Jung, 2019\n' \
                     'H\tExtractorOptions\tMSn\n' \
                     'H\tAcquisitionMethod\tData-Dependent\n' \
                     'H\tInstrumentType\tTIMSTOF\n' \
                     'H\tScanType\tMS1\n' .format(version)
        ms2_file_name = os.path.basename(analysis_dir).split('.')[0] + '_nopd.ms2'
        ms1_file_name = os.path.basename(analysis_dir).split('.')[0] + '_nopd.ms1'
        ms1_scan_set = set()
        if len(output) > 0:
            ms2_file_name = output
            ms1_file_name = output.replace('.ms2', '.ms1')
        #else:
        #    os.chdir(sys.argv[2])
        if convert_ms2:
            with open(ms2_file_name, 'w') as output_file:
                output_file.write(ms2_header)
                progress = 0
                for row in precursor_list:
                    prc_id, largest_preak_mz, average_mz, monoisotopic_mz, cs, scan_number, intensity, parent = row
                    prc_id_int = int(prc_id)
                    # Only precursors with a resolved monoisotopic m/z and
                    # charge yield a usable MS2 entry.
                    if monoisotopic_mz is not None and cs is not None:
                        prc_mass_mz = float(monoisotopic_mz)
                        # Neutral mass: m/z * z minus (z-1) proton masses.
                        prc_mass = (prc_mass_mz * cs) - (cs - 1) * 1.007276466
                        mz_int_arr = td.readPasefMsMs([prc_id_int])
                        parent_index = int(parent)
                        scan_id = ms2_scan_map[parent_index][prc_id_int]
                        rt_time = float(all_frame[parent_index][1])
                        k0 = td.scanNumToOneOverK0(parent_index, [scan_number])
                        mz_arr = mz_int_arr[prc_id_int][0]
                        if len(mz_arr) > 0:
                            output_file.write(
                                "S\t{0:06d}\t{1:06d}\t{2:.4f}\n".format(
                                    scan_id, scan_id, prc_mass_mz))
                            output_file.write(
                                "I\tTIMSTOF_Parent_ID\t{}\n".format(parent))
                            output_file.write(
                                "I\tTIMSTOF_Precursor_ID\t{}\n".format(prc_id))
                            output_file.write(
                                "I\tRetTime\t{0:.4f}\n".format(rt_time))
                            output_file.write("I\tIon Mobility\t{0:.4f}\n".format(
                                k0[0]))
                            output_file.write("Z\t{1}\t{0:.4f}\n".format(
                                prc_mass, cs))
                            int_arr = mz_int_arr[prc_id_int][1]
                            for j in range(0, len(mz_arr)):
                                output_file.write("%.4f %.1f \n" %
                                                  (mz_arr[j], int_arr[j]))
                    progress += 1
                    if progress % 5000 == 0:
                        print(
                            "progress ms2: %.1f%%" %
                            (float(progress) / len(precursor_list) * 100),
                            time.process_time() - start_time)
        if convert_ms1:
            with open(ms1_file_name, 'w') as output_file:
                output_file.write(ms1_header)
                progress = 0
                prev_id = 0
                #scan_set = set()
                prev_scan = 0
                precursor_counter = 0
                # Buffer output lines and flush in batches to limit memory
                # while avoiding per-line writes.
                lines = []
                for i, frame in enumerate(all_ms1_frames):
                    # NOTE(review): `id` shadows the builtin; kept as-is.
                    id = int(frame[0])
                    num_scans = int(frame[8])
                    index_intensity_arr = td.readScans(id, 0, num_scans)
                    index_intensity_carr = np.concatenate(index_intensity_arr,
                                                          axis=1)
                    # One mobility-scan index per peak, expanded to match the
                    # concatenated peak arrays. (The comprehension's `i`
                    # shadows the loop's `i` but does not leak in Python 3.)
                    mobility_index = [
                        i for i, row in enumerate(index_intensity_arr)
                        for j in range(len(row[0]))
                    ]
                    mass_array = td.indexToMz(id, index_intensity_carr[0])
                    one_over_k0 = td.scanNumToOneOverK0(id, mobility_index)
                    voltage = td.scanNumToVoltage(id, mobility_index)
                    # Columns: mass, intensity, 1/K0, voltage — rounded then
                    # sorted by mass.
                    temp = np.array(
                        list(
                            zip(mass_array, index_intensity_carr[1],
                                one_over_k0, voltage)))
                    mass_intensity = np.around(temp, decimals=4)
                    sorted_mass_intensity = mass_intensity[
                        mass_intensity[:, 0].argsort()]
                    scan_num = frame_id_ms1_scan_map[id]
                    if len(sorted_mass_intensity) > 0:
                        # NOTE(review): RT is taken from the *previous* MS1
                        # frame (0 for the first) — confirm this offset is
                        # intentional.
                        rt_time = 0 if i == 0 else all_ms1_frames[i - 1][1]
                        lines.append("S\t%06d\t%06d\n" % (scan_num, scan_num))
                        lines.append("I\tTIMSTOF_Frame_id\t{}\n".format(id))
                        lines.append("I\tRetTime\t%.2f\n" % float(rt_time))
                        # row[-2] is the 1/K0 column (third of four).
                        for row in sorted_mass_intensity:
                            x_str = "%.4f %.1f %.4f \n" % (row[0], row[1],
                                                           row[-2])
                            lines.append(x_str)
                        # output_file.write("S\t%06d\t%06d\n" % (scan_num, scan_num))
                        # output_file.write("I\tTIMSTOF_Frame_id\t{}\n".format(id))
                        # output_file.write("I\tRetTime\t%.2f\n" % float(rt_time))
                        # output_file.writelines("%.4f %.1f %.4f\n" % (row[0], row[1],
                        #                                              row[-1]) for row in sorted_mass_intensity)
                        if len(lines) > 1_000_000:
                            output_file.writelines(lines)
                            lines = []
                    progress += 1
                    if progress % 5000 == 0:
                        print(
                            "progress ms1 %.1f%%" %
                            (float(progress) / len(all_ms1_frames) * 100),
                            time.process_time() - start_time)
                # Flush whatever remains in the buffer.
                output_file.writelines(lines)
                lines = []
        conn.close()