예제 #1
0
파일: __init__.py 프로젝트: jj-umn/psm-eval
 def get_scans(self):
     """Yield one scan for every spectrum read from the file at ``self.path``.

     Each spectrum produced by ``mzml_read`` is converted with
     ``mzml_spectrum_to_scan``, which also receives this object and the
     spectrum's zero-based position in the file.

     The file is opened inside a ``with`` block so the handle is released
     when the generator is exhausted or closed, instead of being left open.
     """
     # For now just assume MZML; other scan sources are not handled yet.
     with open(self.path, 'r') as mzml_file:
         for index, spectrum in enumerate(mzml_read(mzml_file)):
             yield mzml_spectrum_to_scan(spectrum, self, index)
예제 #2
0
def encode_spectra(prj, input_file, reference_spectra, **kw):
  """
    Encode the spectra of one input file and save the ids/USI table as CSV.

    The reader is chosen from the file name: ``.mgf`` and ``.mzML`` files use
    the matching ``EncodeDataset.transform_*`` method; anything else is read
    as JSON lines (one JSON document per non-blank line).

    :param prj: ProteomeXchange project/dataset accession
    :param input_file: path of the spectra file to encode
    :param reference_spectra: .mgf file with the reference spectra used for the
      normalized dot product calculation
    :param kw: optional output paths. ``miss_record`` and ``ids_usi_save_file``
      default to ``<input dir>/<input stem>_missing_c_record.txt`` and
      ``<input dir>/<input stem>_ids_usi.txt``. ``encoded_spectra_save_file``
      is accepted but currently unused because the encoded array is not
      written to disk here.
    :return: ids_usi data, encoded_spectra data
    """
  dirname, filename = os.path.split(os.path.abspath(input_file))
  # splitext drops only the final extension. The previous strip()-based code
  # treated the extension as a character set and could also remove matching
  # letters from the start or end of the stem (e.g. "mass.mzML" -> "ass").
  stem = os.path.splitext(filename)[0]

  miss_record = kw.get("miss_record", os.path.join(dirname, stem + "_missing_c_record.txt"))
  ids_usi_save_file = kw.get("ids_usi_save_file", os.path.join(dirname, stem + "_ids_usi.txt"))

  input_name = str(input_file)
  if input_name.endswith(".mgf"):
    # The encoder is constructed with the spectrum count, so count first.
    spectra_num = more_itertools.ilen(mgf_read(input_file, convert_arrays=1))
    encoder = EncodeDataset(spectra_num)
    ids_usi_df, vstack_data = encoder.transform_mgf(prj, input_file, reference_spectra, miss_record)
  elif input_name.endswith(".mzML"):
    spectra_num = more_itertools.ilen(mzml_read(input_file))
    encoder = EncodeDataset(spectra_num)
    ids_usi_df, vstack_data = encoder.transform_mzml(prj, input_file, reference_spectra, miss_record)
  else:
    with open(input_file) as fh:
      # Lines read from a file keep their newline, so a plain truthiness test
      # never filters anything; strip first so blank lines are skipped rather
      # than crashing json.loads.
      spectra_json_file = [json.loads(line) for line in fh if line.strip()]
    encoder = EncodeDataset(len(spectra_json_file))
    ids_usi_df, vstack_data = encoder.transform_json(spectra_json_file, reference_spectra, miss_record)

  pd.DataFrame(ids_usi_df).to_csv(ids_usi_save_file, header=True, index=False)

  return ids_usi_df, vstack_data
예제 #3
0
파일: util.py 프로젝트: jj-umn/psm-eval
 def _read_test_mzml(self, name='test.mzML'):
     """Open the named test-data file with ``mzml_read`` and return the result.

     ``name`` is resolved to a path through ``self._test_data_path``.
     """
     mzml_path = self._test_data_path(name)
     return mzml_read(mzml_path)
예제 #4
0
  def transform_mzml(self, prj, input_spctra_file, ref_spectra, miss_save_name):
    """Encode every spectrum of an mzML file that has a precursor charge.

    Each encoded row is the column-wise concatenation of
    (1) the output of ``caculate_nornalization_dp`` against the reference
    spectra, (2) the binned intensities from ``bin_spectrum`` and (3) the
    precursor features ``gray_code(m/z)`` + ``charge_to_one_hot(charge)``.

    Spectra are buffered and encoded in batches of ``encode_batch``; whatever
    is left in the buffer is encoded once the file has been read completely.
    Flushing at the end of the loop (rather than when a running count matches
    the expected file length) keeps the final partial batch even if the file
    ends with charge-less spectra, and works when the first batch never fills.

    :param prj: project accession placed in each USI
    :param input_spctra_file: path of the mzML file to encode
    :param ref_spectra: .mgf file with the reference spectra
    :param miss_save_name: file to which titles of spectra without a charge
      state are appended
    :return: ``(ids_usi_df, spectra_dataset)``; ``spectra_dataset`` is ``None``
      when no spectrum in the file has a charge state
    """
    self.spectra_dataset = None
    print('Start spectra encoding ...')

    # Bin the reference spectra once; every batch is scored against them.
    reference_spectra = mgf_read(ref_spectra, convert_arrays=1)
    reference_intensity = np.array(
      [bin_spectrum(r.get('m/z array'), r.get('intensity array')) for r in reference_spectra])
    ndp_r_spec_list = caculate_r_spec(reference_intensity)

    self.ids_usi_dict, self.ids_list, self.usi_list = {}, [], []
    binned_peaks, precursor_features, spec_norms = [], [], []
    charge_none_record, charge_none_list = 0, []
    encode_batch = 10000
    # Only the file name (not the directory) goes into the USI.
    spectra_file_name = str(input_spctra_file).split("/")[-1]

    def flush_batch():
      """Encode the buffered spectra, append them to the dataset, empty the buffers."""
      if not binned_peaks:
        return
      batch_precursors = np.array(precursor_features)
      batch_intensities = np.array(binned_peaks)

      # calculate normalized dot product
      batch_dot_products = caculate_nornalization_dp(reference_intensity, ndp_r_spec_list,
                                                     np.array(binned_peaks), np.array(spec_norms))
      encoded = np.concatenate((batch_dot_products, batch_intensities), axis=1)
      encoded = np.concatenate((encoded, batch_precursors), axis=1)

      if self.spectra_dataset is None:
        self.spectra_dataset = encoded
      else:
        self.spectra_dataset = np.vstack((self.spectra_dataset, encoded))

      binned_peaks.clear()
      precursor_features.clear()
      spec_norms.clear()

    self.mzml = mzml_read(input_spctra_file)
    for s1 in self.mzml:
      selected_ion = s1.get("precursorList").get("precursor")[0].get("selectedIonList").get("selectedIon")[0]
      charge_text = str(selected_ion.get("charge state"))

      # A missing charge state stringifies as "None": record the title and skip.
      if charge_text[0] == "N":
        charge_none_record += 1
        charge_none_list.append(s1.get("spectrum title"))
        continue

      # The scan number is the last "="-separated token of the spectrum title.
      scan = s1.get("spectrum title").split(",")[-1].split(":")[-1].strip("\"").split("=")[-1]
      usi = "mzspec:" + str(prj) + ":" + spectra_file_name + ":scan:" + str(scan)

      # CRC32 of the USI is the id; on a collision probe upwards to a free value.
      ids = zlib.crc32(usi.encode('utf8'))
      while ids in self.ids_usi_dict:
        ids += 1
      self.ids_usi_dict[ids] = usi
      self.usi_list.append(usi)
      self.ids_list.append(ids)

      # Only the first character of the charge is used, as before.
      charge1 = int(charge_text[0])

      bin_s1 = bin_spectrum(s1.get('m/z array'), s1.get('intensity array'))
      binned_peaks.append(bin_s1)
      spec_norms.append(caculate_spec(bin_s1))

      mass1 = selected_ion.get("selected ion m/z")
      precursor_features.append(
        np.concatenate((self.gray_code(mass1), self.charge_to_one_hot(charge1))))

      if len(binned_peaks) == encode_batch:
        flush_batch()

    # Encode the final partial batch (or the whole file when it is small).
    flush_batch()

    if len(charge_none_list) > 0:
      np_mr = np.array(charge_none_list)
      df_mr = pd.DataFrame(np_mr, index=None, columns=None)
      df_mr.to_csv(miss_save_name, mode="a+", header=None, index=None)
      print("Charge Missing Number:{}".format(charge_none_record))

    self.ids_usi_df = pd.DataFrame({"ids": self.ids_list, "usi": self.usi_list}, columns=["ids", "usi"])

    # The per-run lookup structures are not needed once the table is built.
    self.usi_list.clear()
    self.ids_list.clear()
    self.ids_usi_dict.clear()

    return self.ids_usi_df, self.spectra_dataset