Example #1
  def _score_spectrum(self,
                      precursor_mass,
                      spectrum_original,
                      state0_c,
                      state0_h,
                      candidate_list,
                      model,
                      model_output_logprob,
                      model_lstm_state,
                      session,
                      direction):
    """TODO(nh2tran): docstring."""

    #~ print("".join(["="] * 80)) # section-separating line
    #~ print("WorkerDB: _score()")

    # convert symbols into id
    candidate_list = [[deepnovo_config.vocab[x] for x in candidate] 
                      for candidate in candidate_list]

    # we shall group candidates into one minibatch:
    # a matrix of shape [minibatch_size, candidate_len],
    # one row per candidate, one column per sequence position
    minibatch_size = len(candidate_list) # number of candidates
    candidate_len = len(candidate_list[0]) # length of each candidate (all assumed equal)

    # candidates share the same state0, so repeat into [minibatch_size, 512]
    # the states will also be updated after every iteration
    state0_c = state0_c.reshape((1, -1)) # reshape to [1, 512]
    state0_h = state0_h.reshape((1, -1))
    minibatch_state_c = np.repeat(state0_c, minibatch_size, axis=0)
    minibatch_state_h = np.repeat(state0_h, minibatch_size, axis=0)

    # mass of each candidate, accumulated every time an AA is appended
    minibatch_prefix_mass = np.zeros(minibatch_size)

    # output is a list of candidate_len arrays of shape [minibatch_size, 26]
    # each row is log of probability distribution over 26 classes/symbols
    output_logprob_list = []

    # recurrent iterations
    for position in range(candidate_len):

      # gather minibatch data
      minibatch_AA_id = np.zeros(minibatch_size)
      for index, candidate in enumerate(candidate_list):
        AA = candidate[position]
        minibatch_AA_id[index] = AA
        minibatch_prefix_mass[index] += deepnovo_config.mass_ID[AA]
      # this is the most time-consuming step, ~70-75% of the running time
      minibatch_intensity = [get_candidate_intensity(spectrum_original,
                                                     precursor_mass,
                                                     prefix_mass,
                                                     direction)
                             for prefix_mass in np.nditer(minibatch_prefix_mass)]
      # final shape [minibatch_size, 26, 8, 10]
      minibatch_intensity = np.array(minibatch_intensity)

      # model feed
      input_feed = {}
      input_feed[model.input_dict["AAid"][1].name] = minibatch_AA_id
      input_feed[model.input_dict["intensity"].name] = minibatch_intensity
      input_feed[model.input_dict["lstm_state"][0].name] = minibatch_state_c
      input_feed[model.input_dict["lstm_state"][1].name] = minibatch_state_h
      # and run
      output_feed = [model_output_logprob, model_lstm_state]
      output_logprob, (minibatch_state_c, minibatch_state_h) = session.run(
          fetches=output_feed,
          feed_dict=input_feed)

      output_logprob_list.append(output_logprob)

    return output_logprob_list
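The output of _score_spectrum is a list of per-position log-probability arrays rather than one score per candidate. Below is a minimal sketch, not part of the original code, of how those arrays could be reduced to a total score per candidate; the helper name sum_candidate_scores is hypothetical, and the one-position shift assumes the model output at each position predicts the symbol at the next position, as in the feed loop above.

import numpy as np

def sum_candidate_scores(output_logprob_list, candidate_list):
    # output_logprob_list: candidate_len arrays of shape [minibatch_size, 26]
    # candidate_list: candidates as lists of vocabulary ids, all the same length
    scores = np.zeros(len(candidate_list))
    for position in range(len(output_logprob_list) - 1):
        logprob = output_logprob_list[position]
        for index, candidate in enumerate(candidate_list):
            # output at `position` is assumed to score the AA at `position + 1`
            scores[index] += logprob[index][candidate[position + 1]]
    return scores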
Example #2
    def _extend_peak(self, direction, session, model, spectrum_batch,
                     peak_batch):
        """TODO(nh2tran): docstring.
       Inputs:
         spectrum_batch: a list of spectrum, each is a dictionary
           spectrum["scan"]
           spectrum["precursor_mass"]
           spectrum["spectrum_holder"]
           spectrum["spectrum_original_forward"]
           spectrum["spectrum_original_backward"]
         peak_batch: one peak for each spectrum, each peak is a dictionary
           peak["prefix_mass"] for extension in the forward direction
           peak["sufffix_mass"] for extension in the backward direction
           peak["mass_tolerance"]
       Outputs:
         top_path_batch: for every input spectrum, the output is a list of paths,
           each path is a dictionary
             path["AAid_list"]
             path["score_list"]
             path["score_sum"]
    """

        print(
            "WorkerDenovo: _extend_peak(), direction={0:s}".format(direction))

        # test running time and tensorflow time
        test_time_decode = 0.0
        test_time_tf = 0.0
        test_time = 0.0
        start_time_decode = time.time()

        # for every input spectrum, the output is a list of paths,
        #   each path is a dictionary
        #   path["AAid_list"]
        #   path["score_list"]
        #   path["score_sum"]
        spectrum_batch_size = len(spectrum_batch)
        top_path_batch = [[] for x in range(spectrum_batch_size)]

        # forward/backward direction setting
        #   the direction determines the model, the spectrum and the peak mass
        if direction == "forward":
            model_lstm_state0 = model.output_forward["lstm_state0"]
            model_output_log_prob = model.output_forward["logprob"]
            model_lstm_state = model.output_forward["lstm_state"]
            spectrum_original_name = "spectrum_original_forward"
            peak_mass_name = "prefix_mass"
            FIRST_LABEL = self.GO_ID
            LAST_LABEL = self.EOS_ID
        elif direction == "backward":
            model_lstm_state0 = model.output_backward["lstm_state0"]
            model_output_log_prob = model.output_backward["logprob"]
            model_lstm_state = model.output_backward["lstm_state"]
            spectrum_original_name = "spectrum_original_backward"
            peak_mass_name = "suffix_mass"
            FIRST_LABEL = self.EOS_ID
            LAST_LABEL = self.GO_ID
        else:
            # guard against a silent NameError further down
            raise ValueError("direction must be 'forward' or 'backward'")

        # PEAK EXTENSION includes 4 steps:
        #   STEP 1: initialize the lstm and the active_search_list.
        #   STEP 2, 3, 4 are repeated until the active_search_list is empty.
        #     STEP 2: gather data from active search entries and group into blocks.
        #     STEP 3: run tensorflow model on data blocks to predict next AA.
        #     STEP 4: retrieve data from blocks to update the active_search_list
        #       with knapsack dynamic programming and beam search.

        start_time_tf = time.time()
        # STEP 1: initialize lstm
        spectrum_holder_array = np.array(
            [x["spectrum_holder"] for x in spectrum_batch])
        input_feed = {}
        input_feed[model.input_dict["spectrum"].name] = spectrum_holder_array
        output_feed = model_lstm_state0
        c_state0_array, h_state0_array = session.run(fetches=output_feed,
                                                     feed_dict=input_feed)
        test_time_tf += time.time() - start_time_tf

        # STEP 1: initialize the active_search_list
        # active_search_list holds the info of search entries under processing
        #   each search entry is a dictionary
        #     search_entry["spectrum_id"]
        #     search_entry["current_path_list"]
        #   each path is also a dictionary
        #     path["AAid_list"]
        #     path["prefix_mass"]
        #     path["score_list"]
        #     path["score_sum"]
        #     path["c_state"]
        #     path["h_state"]
        active_search_list = []
        for spectrum_id in range(spectrum_batch_size):
            search_entry = {}
            search_entry["spectrum_id"] = spectrum_id
            path = {}
            path["AAid_list"] = [FIRST_LABEL]
            path["prefix_mass"] = peak_batch[spectrum_id][peak_mass_name]
            path["score_list"] = [0.0]
            path["score_sum"] = 0.0
            path["c_state"] = c_state0_array[spectrum_id]
            path["h_state"] = h_state0_array[spectrum_id]
            search_entry["current_path_list"] = [path]
            active_search_list.append(search_entry)

        # repeat STEP 2, 3, 4 until the active_search_list is empty.
        while True:

            # STEP 2: gather data from active search entries and group into blocks.

            # data blocks for the input feed of tensorflow model
            block_AAid_1 = []  # nobi
            block_AAid_2 = []  # nobi
            block_c_state = []
            block_h_state = []
            block_candidate_intensity = []
            # data blocks to record the current status of search entries
            block_AAid_list = []
            block_prefix_mass = []
            block_score_list = []
            block_score_sum = []
            block_knapsack_candidate = []

            # store the number of paths of each search entry in the big blocks
            #   to retrieve the info of each search entry later in STEP 4.
            search_entry_size = [0] * len(active_search_list)

            # gather data into blocks through 2 nested loops over active_search_list
            #   and over current_path_list of each search_entry
            for entry_index, search_entry in enumerate(active_search_list):

                spectrum_id = search_entry["spectrum_id"]
                current_path_list = search_entry["current_path_list"]
                precursor_mass = spectrum_batch[spectrum_id]["precursor_mass"]
                spectrum_original = spectrum_batch[spectrum_id][
                    spectrum_original_name]
                peak_mass_tolerance = peak_batch[spectrum_id]["mass_tolerance"]

                for path in current_path_list:

                    # keep track of the AA predicted in the previous iteration
                    #   for nobi (short k-mer) model, we will need 2 previous AA
                    AAid_list = path["AAid_list"]
                    AAid_2 = AAid_list[-1]
                    if len(AAid_list) > 1:
                        AAid_1 = AAid_list[-2]
                    else:
                        AAid_1 = AAid_2  # nobi

                    # the current status of this path
                    prefix_mass = path["prefix_mass"]
                    score_list = path["score_list"]
                    score_sum = path["score_sum"]
                    c_state = path["c_state"]
                    h_state = path["h_state"]

                    # when we reach LAST_LABEL, check if the mass of predicted sequence
                    #   is close to the given precursor_mass:
                    #   if yes, send the current path to output
                    #   if not, skip the current path
                    if AAid_2 == LAST_LABEL:  # nobi
                        if (abs(prefix_mass - precursor_mass) <=
                                peak_mass_tolerance):
                            top_path_batch[spectrum_id].append(
                                {"AAid_list": AAid_list,
                                 "score_list": score_list,
                                 "score_sum": score_sum})
                        continue

                    start_time = time.time()
                    # get CANDIDATE INTENSITY to predict next AA
                    # TODO(nh2tran): change direction from 0/1 to "forward"/"backward"
                    direction_id = 0 if direction == "forward" else 1
                    candidate_intensity = get_candidate_intensity(
                        spectrum_original, precursor_mass, prefix_mass,
                        direction_id)
                    test_time += time.time() - start_time

                    # use knapsack and SUFFIX MASS to filter next AA candidate
                    suffix_mass = precursor_mass - prefix_mass - self.mass_ID[
                        LAST_LABEL]
                    knapsack_tolerance = int(
                        round(peak_mass_tolerance *
                              self.KNAPSACK_AA_RESOLUTION))
                    knapsack_candidate = self._search_knapsack(
                        suffix_mass, knapsack_tolerance)
                    # if not possible to extend, add LAST_LABEL to end the sequence
                    if not knapsack_candidate:
                        knapsack_candidate.append(LAST_LABEL)

                    # gather data blocks
                    block_AAid_1.append(AAid_1)  # nobi
                    block_AAid_2.append(AAid_2)  # nobi
                    block_c_state.append(c_state)
                    block_h_state.append(h_state)
                    block_candidate_intensity.append(candidate_intensity)

                    block_AAid_list.append(AAid_list)
                    block_prefix_mass.append(prefix_mass)
                    block_score_list.append(score_list)
                    block_score_sum.append(score_sum)
                    block_knapsack_candidate.append(knapsack_candidate)

                    # record the size of each search entry in the blocks
                    search_entry_size[entry_index] += 1

            # STEP 3: run tensorflow model on data blocks to predict next AA.
            #   output is stored in current_log_prob, current_c_state, current_h_state
            if block_AAid_1:

                start_time_tf = time.time()

                block_AAid_1 = np.array(block_AAid_1)  # nobi
                block_AAid_2 = np.array(block_AAid_2)  # nobi
                block_c_state = np.array(block_c_state)
                block_h_state = np.array(block_h_state)
                block_candidate_intensity = np.array(block_candidate_intensity)

                input_feed = {
                    model.input_dict["AAid"][0].name: block_AAid_1,  # nobi
                    model.input_dict["AAid"][1].name: block_AAid_2,  # nobi
                    model.input_dict["lstm_state"][0].name: block_c_state,
                    model.input_dict["lstm_state"][1].name: block_h_state,
                    model.input_dict["intensity"].name: block_candidate_intensity,
                }

                output_feed = [model_output_log_prob,
                               model_lstm_state]  # lstm.len_full
                #~ output_feed = model_output_log_prob # nobi

                current_log_prob, (current_c_state,
                                   current_h_state) = session.run(
                                       output_feed,
                                       input_feed)  # lstm.len_full
                #~ current_log_prob = session.run(output_feed,input_feed) # nobi

                test_time_tf += time.time() - start_time_tf

            # STEP 4: retrieve data from blocks to update the active_search_list
            #   with knapsack dynamic programming and beam search.
            block_index = 0
            for entry_index, search_entry in enumerate(active_search_list):

                # find all possible new paths within knapsack filter
                new_path_list = []
                for index in range(
                        block_index,
                        block_index + search_entry_size[entry_index]):
                    for AAid in block_knapsack_candidate[index]:
                        new_path = {}
                        new_path["AAid_list"] = block_AAid_list[index] + [AAid]
                        new_path["prefix_mass"] = block_prefix_mass[
                            index] + self.mass_ID[AAid]
                        if AAid > 2:  # do NOT add score of GO, EOS, PAD
                            new_path["score_list"] = (
                                block_score_list[index] +
                                [current_log_prob[index][AAid]])
                            new_path["score_sum"] = (
                                block_score_sum[index] +
                                current_log_prob[index][AAid])
                        else:
                            new_path["score_list"] = block_score_list[index]
                            new_path["score_sum"] = block_score_sum[index]
                        new_path["c_state"] = current_c_state[
                            index]  # lstm.len_full
                        new_path["h_state"] = current_h_state[
                            index]  # lstm.len_full
                        #~ new_path["c_state"] = block_c_state[index] # nobi
                        #~ new_path["h_state"] = block_h_state[index] # nobi
                        new_path_list.append(new_path)

                # beam search to select top candidates
                if len(new_path_list) > self.beam_size:
                    new_path_score = np.array(
                        [x["score_sum"] for x in new_path_list])
                    top_k_index = np.argpartition(-new_path_score, self.beam_size)[:self.beam_size]  # pylint: disable=line-too-long
                    search_entry["current_path_list"] = [
                        new_path_list[top_k_index[x]]
                        for x in range(self.beam_size)
                    ]
                else:
                    search_entry["current_path_list"] = new_path_list

                # update the accumulated block_index
                block_index += search_entry_size[entry_index]

            # update active_search_list by removing empty entries
            active_search_list = [
                x for x in active_search_list if x["current_path_list"]
            ]
            # STOP the extension loop if active_search_list is empty
            if not active_search_list:
                break

        test_time_decode += time.time() - start_time_decode
        print("  test_time_tf = %.2f" % (test_time_tf))
        print("  test_time_decode = %.2f" % (test_time_decode))
        print("  test_time = %.2f" % (test_time))

        return top_path_batch
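The beam pruning in STEP 4 relies on np.argpartition, which keeps the beam_size highest-scoring paths in linear time without fully sorting new_path_list. Below is a self-contained sketch of that selection step; the function name prune_to_beam is hypothetical.

import numpy as np

def prune_to_beam(new_path_list, beam_size):
    # keep only the beam_size paths with the largest accumulated score_sum
    if len(new_path_list) <= beam_size:
        return new_path_list
    scores = np.array([path["score_sum"] for path in new_path_list])
    # indices of the beam_size largest scores; their relative order is arbitrary
    top_k_index = np.argpartition(-scores, beam_size)[:beam_size]
    return [new_path_list[i] for i in top_k_index]

# usage, mirroring the loop above:
#   search_entry["current_path_list"] = prune_to_beam(new_path_list, self.beam_size)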