def _add_candidate_tokens(self, example: WikiHopExample,
                          begin_sentence_id: int) -> int:
  """Adds candidate tokens and returns the end_sentence_id.

  Every candidate is treated as a separate sentence.

  Args:
    example: The `WikiHopExample` to add the candidate tokens to.
    begin_sentence_id: Begin sentence id to assign to candidates.

  Returns:
    end_sentence_id = begin_sentence_id + num_candidates
  """
  sentence_id = begin_sentence_id
  candidates = example.candidate_answers
  for (i, candidate) in enumerate(candidates):
    self.global_paragraph_breakpoints.append(1)
    self.global_token_ids.append(CANDIDATE_GLOBAL_TOKEN_ID)
    self.global_token_type_ids.append(CANDIDATE_GLOBAL_TOKEN_TYPE_ID)
    candidate = tokenization.convert_to_unicode(candidate)
    candidate = self.tokenizer.tokenize(candidate)
    candidate_token_ids = self.tokenizer.convert_tokens_to_ids(candidate)
    if i not in self.cand_to_span_positions:
      self.cand_to_span_positions[i] = []
    # Trivial span addition. Every candidate is present by default in the
    # long input.
    self.cand_to_span_positions[i].append(
        (len(self.long_token_ids),
         len(self.long_token_ids) + len(candidate_token_ids) - 1))
    for token_id in candidate_token_ids:
      self.long_token_ids.append(token_id)
      self.long_token_type_ids.append(CANDIDATE_TOKEN_TYPE_ID)
      self.long_sentence_ids.append(sentence_id)
      self.long_paragraph_ids.append(-1)
      self.long_paragraph_breakpoints.append(0)
    self.long_paragraph_breakpoints[-1] = 1
    sentence_id += 1
  return sentence_id
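# Worked example of the span bookkeeping above (hypothetical numbers, not from
# the original module): if `long_token_ids` already holds 10 tokens and a
# candidate tokenizes into 3 WordPieces, the span recorded in
# `cand_to_span_positions` is (10, 12), an inclusive [begin, end] pair of
# offsets into the long input.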
def _add_query_tokens(self, example: WikiHopExample,
                      begin_sentence_id: int) -> int:
  """Adds query tokens to long / global input.

  We mirror query tokens in global as well, i.e., we will have one global
  token per query WordPiece. Every WordPiece of the query is treated as a
  separate sentence.

  Args:
    example: The `WikiHopExample` to add the query tokens to.
    begin_sentence_id: The begin sentence id to be used to start assigning
      sentence ids to query tokens.

  Returns:
    end_sentence_id = begin_sentence_id + num_query_word_pieces
  """
  sentence_id = begin_sentence_id
  query = example.query
  query = tokenization.convert_to_unicode(query)
  query_tokens = self.tokenizer.tokenize(query)
  query_token_ids = self.tokenizer.convert_tokens_to_ids(query_tokens)
  for token_id in query_token_ids:
    self.long_token_ids.append(token_id)
    self.global_token_ids.append(QUESTION_GLOBAL_TOKEN_ID)
    self.long_token_type_ids.append(QUESTION_TOKEN_TYPE_ID)
    self.global_token_type_ids.append(QUESTION_GLOBAL_TOKEN_TYPE_ID)
    self.long_sentence_ids.append(sentence_id)
    self.long_paragraph_ids.append(-1)
    self.long_paragraph_breakpoints.append(0)
    self.global_paragraph_breakpoints.append(0)
    sentence_id += 1
  self.long_paragraph_breakpoints[-1] = 1
  self.global_paragraph_breakpoints[-1] = 1
  return sentence_id
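# A minimal usage sketch (an assumption, not part of the original module):
# both helpers return the next unused sentence id, so a caller is expected to
# chain them when building the full input. `converter` and `example` are
# hypothetical stand-ins for an instance of this class and a `WikiHopExample`.
#
#   sentence_id = converter._add_query_tokens(example, begin_sentence_id=0)
#   sentence_id = converter._add_candidate_tokens(example, sentence_id)
#   # sentence_id is now num_query_word_pieces + num_candidates.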