def get_scans(self):
    """Yield one scan per spectrum read from the file at ``self.path``.

    The file is assumed to be mzML. Every spectrum produced by ``mzml_read``
    is converted with ``mzml_spectrum_to_scan``, which also receives this
    object and the zero-based position of the spectrum in the file.

    The file handle is opened lazily (on first iteration) and is closed as
    soon as the generator is exhausted, closed, or garbage-collected.
    """
    # For now just assume MZML.
    with open(self.path, 'r') as handle:
        for index, spectrum in enumerate(mzml_read(handle)):
            yield mzml_spectrum_to_scan(spectrum, self, index)
def _default_output_path(input_file, suffix):
    """Return ``<dir of input_file>/<file name without last extension><suffix>``."""
    dirname, filename = os.path.split(os.path.abspath(input_file))
    # splitext removes only the final extension; str.strip() would instead
    # strip a *set of characters* from both ends and mangle the stem.
    stem = os.path.splitext(filename)[0]
    return os.path.join(dirname, stem + suffix)


def encode_spectra(prj, input_file, reference_spectra, **kw):
    """
    Encode spectra

    The input format is chosen from the file extension (case-insensitive):
    ``.mgf`` and ``.mzML`` files are counted and encoded with the matching
    ``EncodeDataset.transform_*`` method; any other file is read as one JSON
    document per non-blank line and encoded with ``transform_json``.
    The ids/usi table is written as CSV to ``ids_usi_save_file``.

    :param prj: ProteomeXchange project/dataset accession
    :param input_file: path of the spectra file to encode
    :param reference_spectra: get a .mgf file contained 500 spectra as reference spectra from normalized dot
        product calculation
    :param kw: optional output paths. ``miss_record`` (file for spectra with a missing charge) and
        ``ids_usi_save_file`` default to ``<input stem>_missing_c_record.txt`` and ``<input stem>_ids_usi.txt``
        next to the input file. ``encoded_spectra_save_file`` is accepted but currently unused, because
        saving the encoded matrix is disabled.
    :return: ids_usi data, encoded_spectra data
    """
    miss_record = kw.get("miss_record", _default_output_path(input_file, "_missing_c_record.txt"))
    ids_usi_save_file = kw.get("ids_usi_save_file", _default_output_path(input_file, "_ids_usi.txt"))

    print("kw:", kw, kw.keys(), "encoded_spectra_save_file" in kw)
    print("start .npy...", input_file)

    lowered_name = str(input_file).lower()
    if lowered_name.endswith(".mgf"):
        spectra_num = more_itertools.ilen(mgf_read(input_file, convert_arrays=1))
        mgf_encoder = EncodeDataset(spectra_num)
        ids_usi_df, vstack_data = mgf_encoder.transform_mgf(prj, input_file, reference_spectra, miss_record)
        pd.DataFrame(ids_usi_df).to_csv(ids_usi_save_file, header=True, index=None)
        return ids_usi_df, vstack_data

    if lowered_name.endswith(".mzml"):
        spectra_num = more_itertools.ilen(mzml_read(input_file))
        mzml_encoder = EncodeDataset(spectra_num)
        ids_usi_df, vstack_data = mzml_encoder.transform_mzml(prj, input_file, reference_spectra, miss_record)
        pd.DataFrame(ids_usi_df).to_csv(ids_usi_save_file, header=True, index=None)
        return ids_usi_df, vstack_data

    # Anything else is treated as JSON lines. Lines read from a file keep
    # their newline, so blank lines must be detected after stripping.
    with open(input_file) as fh:
        spectra_json_file = [json.loads(line) for line in fh if line.strip()]
    spectra_num = len(spectra_json_file)
    json_encoder = EncodeDataset(spectra_num)
    print("start .npy...1")
    ids_usi_df, vstack_data = json_encoder.transform_json(spectra_json_file, reference_spectra, miss_record)
    print("start .npy...2")
    pd.DataFrame(ids_usi_df).to_csv(ids_usi_save_file, header=True, index=None)
    print("start .npy...3")
    print("start .npy...4")
    return ids_usi_df, vstack_data
def _read_test_mzml(self, name='test.mzML'):
    """Return what ``mzml_read`` produces for a test-data file.

    :param name: file name handed to ``self._test_data_path`` to obtain the
        location passed to ``mzml_read``; defaults to ``'test.mzML'``.
    """
    mzml_location = self._test_data_path(name)
    return mzml_read(mzml_location)
def transform_mzml(self, prj, input_spctra_file, ref_spectra, miss_save_name):
    """Encode every spectrum of an mzML file that carries a precursor charge.

    For each spectrum with a charge state the method builds a USI
    (``mzspec:<prj>:<file name>:scan:<scan>``), derives a numeric id from the
    CRC32 of that USI (incremented until unused within this run), bins the
    peaks with ``bin_spectrum`` and builds the precursor feature from
    ``self.gray_code(mass)`` and ``self.charge_to_one_hot(charge)``.
    Spectra are encoded in batches of ``encode_batch``; each encoded row is
    the column-wise concatenation of the normalized dot products against the
    reference spectra, the binned intensities and the precursor feature.

    Spectra whose charge state is missing are skipped and their titles are
    appended to ``miss_save_name``.

    Batches are flushed when full and once more after the last spectrum, so
    the trailing partial batch is always encoded regardless of where the
    charge-less spectra occur in the file.

    :param prj: project accession used inside each USI
    :param input_spctra_file: path of the mzML file to encode
    :param ref_spectra: path of the .mgf file holding the reference spectra
    :param miss_save_name: CSV file that receives the titles of skipped spectra (append mode)
    :return: ``(ids_usi_df, spectra_dataset)``; ``ids_usi_df`` is a DataFrame with columns ``ids`` and
        ``usi``, ``spectra_dataset`` is the stacked encoding, or ``None`` when no spectrum had a charge
    """
    encode_batch = 10000  # number of spectra encoded per batch

    self.spectra_dataset = None
    print('Start spectra encoding ...')

    # 500 reference spectra, binned once and reused for every batch
    reference_spectra = mgf_read(ref_spectra, convert_arrays=1)
    reference_intensity = np.array(
        [bin_spectrum(r.get('m/z array'), r.get('intensity array')) for r in reference_spectra])
    ndp_r_spec_list = caculate_r_spec(reference_intensity)

    self.ids_usi_dict, self.ids_list, self.usi_list = {}, [], []
    binned_peaks, spectrum_norms, precursor_features = [], [], []
    encoded_batches = []
    charge_none_list = []

    def flush_batch():
        """Encode the buffered spectra into one matrix and empty the buffers."""
        # calculate normalized dot product
        dot_products = caculate_nornalization_dp(
            reference_intensity, ndp_r_spec_list, np.array(binned_peaks), np.array(spectrum_norms))
        encoded_batches.append(np.concatenate(
            (dot_products, np.array(binned_peaks), np.array(precursor_features)), axis=1))
        binned_peaks.clear()
        spectrum_norms.clear()
        precursor_features.clear()

    # Loop-invariant: only the file name (not its directory) goes into the USI.
    spectra_file_name = str(input_spctra_file).split("/")[-1]

    self.mzml = mzml_read(input_spctra_file)
    for s1 in self.mzml:
        selected_ion = s1.get("precursorList").get("precursor")[0].get(
            "selectedIonList").get("selectedIon")[0]
        charge_text = str(selected_ion.get("charge state"))

        # missing charge: str(None) starts with "N"
        if charge_text[0] == "N":
            charge_none_list.append(s1.get("spectrum title"))
            continue

        scan = s1.get("spectrum title").split(",")[-1].split(":")[-1].strip("\"").split("=")[-1]
        usi = "mzspec:" + str(prj) + ":" + spectra_file_name + ":scan:" + str(scan)

        # CRC32 may collide; probe upwards until the id is free.
        ids = zlib.crc32(usi.encode('utf8'))
        while ids in self.ids_usi_dict:
            ids += 1
        self.ids_usi_dict[ids] = usi
        self.usi_list.append(usi)
        self.ids_list.append(ids)

        # Only the first character of the charge state is used.
        charge1 = int(charge_text[0])
        bin_s1 = bin_spectrum(s1.get('m/z array'), s1.get('intensity array'))
        binned_peaks.append(bin_s1)
        spectrum_norms.append(caculate_spec(bin_s1))

        mass1 = selected_ion.get("selected ion m/z")
        precursor_features.append(
            np.concatenate((self.gray_code(mass1), self.charge_to_one_hot(charge1))))

        if len(binned_peaks) == encode_batch:
            flush_batch()

    # Encode whatever is left over after the last full batch.
    if binned_peaks:
        flush_batch()

    if len(encoded_batches) == 1:
        self.spectra_dataset = encoded_batches[0]
    elif encoded_batches:
        self.spectra_dataset = np.vstack(encoded_batches)

    if charge_none_list:
        df_mr = pd.DataFrame(np.array(charge_none_list), index=None, columns=None)
        df_mr.to_csv(miss_save_name, mode="a+", header=None, index=None)
        print("Charge Missing Number:{}".format(len(charge_none_list)))

    self.ids_usi_df = pd.DataFrame({"ids": self.ids_list, "usi": self.usi_list}, columns=["ids", "usi"])

    # The DataFrame holds its own copy; release the per-run bookkeeping.
    self.usi_list.clear()
    self.ids_list.clear()
    self.ids_usi_dict.clear()
    return self.ids_usi_df, self.spectra_dataset