def read_emr_entry(entry, is_training): """ reads input json to return list of examples where each example has context paragraph, question and answer span(if is_training is true)""" def is_whitespace(c): return c in " \t\r\n" or ord(c) == 0x202F examples = [] note_ids = entry['note_id'] # context_id for NQ dataset contexts = entry['context'] doc_tokens = [] char_to_word_offset = [] prev_is_whitespace = True for c in contexts: if(is_whitespace(c)): prev_is_whitespace = True else: if prev_is_whitespace: doc_tokens.append(c) else: doc_tokens[-1] += c prev_is_whitespace = False char_to_word_offset.append(len(doc_tokens)-1) questions = [] for i, qas in enumerate(entry["qas"]): # qas is equivalent to questions in NQ dataset qas_id = qas['qas_id'] question_text = qas['question'] start_position = None end_position = None answer = None if is_training: answer= qas['answers'] orig_answer_text = answer["text"] answer_offset = answer["offset"] answer_length = len(orig_answer_text) start_position = char_to_word_offset[answer_offset] end_position = char_to_word_offset[answer_offset + answer_length-1] # if answer is None or answer.offset if(contexts[answer_offset:answer_offset+answer_length].lower() != " ".join(doc_tokens[start_position:(end_position + 1)]).lower() or " ".join(doc_tokens[start_position:(end_position + 1)]).lower() != answer["text"].lower()): continue actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = " ".join(tokenization.whitespace_tokenize(answer["text"])) questions.append(question_text) example = EmrExample( context_id=int(note_ids), qas_id=qas_id, question_text=question_text, doc_tokens=doc_tokens, answer=answer, start_position=start_position, end_position=end_position) examples.append(example) return examples
def read_emr_entry(entry, is_training): """ creates multiple training examples from given paragraph and question answer pairs from that paragraph """ def is_whitespace(c): return c in " \t\r\n" or ord(c) == 0x202F examples = [] note_ids = entry['note_id'] contexts = entry['context'] doc_tokens = [] char_to_word_offset = [] prev_is_whitespace = True for c in contexts: if (is_whitespace(c)): prev_is_whitespace = True else: if prev_is_whitespace: doc_tokens.append(c) else: doc_tokens[-1] += c prev_is_whitespace = False char_to_word_offset.append(len(doc_tokens) - 1) questions = [] for i, qas in enumerate(entry["qas"]): qas_id = qas['qas_id'] question_text = qas['question'] start_position = None end_position = None answer = None if is_training: answer = qas['answers'] orig_answer_text = answer["text"] answer_offset = answer["offset"] answer_length = len(orig_answer_text) start_position = char_to_word_offset[answer_offset] end_position = char_to_word_offset[answer_offset + answer_length] # if answer is None or answer.offset if(contexts[answer_offset:answer_offset+answer_length].lower() != " ".join(doc_tokens[start_position:(end_position + 1)]).lower() or " ".join(doc_tokens[start_position:(end_position + 1)]).lower() != answer["text"].lower()): continue actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = " ".join(tokenization.whitespace_tokenize(answer["text"])) questions.append(question_text) example = EmrExample( context_id=int(note_ids), qas_id=qas_id, question_text=question_text, doc_tokens=doc_tokens, answer=answer, start_position=start_position, end_position=end_position) examples.append(example) return examples
def tokenize(self, text, evidences):
    text = convert_to_unicode(text)
    text = self._clean_text(text)
    orig_tokens = whitespace_tokenize(text)
    split_tokens = []
    split_rations = []
    for token_idx, token in enumerate(orig_tokens):
        if self.do_lower_case:
            token = token.lower()
            token = self._run_strip_accents(token)
        sub_tokens = self._run_split_on_punc(token)
        sub_tokens = ' '.join(sub_tokens).strip().split()
        if len(sub_tokens) > 0:
            split_tokens.extend(sub_tokens)
            ration = self._is_token_rational(token_idx, evidences)
            split_rations.extend([ration] * len(sub_tokens))
    return zip(split_tokens, split_rations)

def tokenize(self, text):
    text = convert_to_unicode(text)
    text = self._clean_text(text)
    orig_tokens = whitespace_tokenize(text)
    orig_tokens, orig_rations = self._parse_rations(orig_tokens)
    split_tokens = []
    split_rations = []
    for token, ration in zip(orig_tokens, orig_rations):
        if self.do_lower_case:
            token = token.lower()
            token = self._run_strip_accents(token)
        sub_tokens = self._run_split_on_punc(token)
        sub_tokens = ' '.join(sub_tokens).strip().split()
        if len(sub_tokens) > 0:
            split_tokens.extend(sub_tokens)
            split_rations.extend([ration] * len(sub_tokens))
    return zip(split_tokens, split_rations)

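# Hedged sketch of the rationale propagation in the two tokenize() variants
# above: when punctuation splitting turns one whitespace token into several
# sub-tokens, each sub-token inherits its source token's rationale flag
# (split_rations.extend([ration] * len(sub_tokens))). split_on_punc below is a
# crude, hypothetical stand-in for _run_split_on_punc.
import re


def split_on_punc(token):
    # Split on each non-word, non-space character, keeping the separators.
    return [t for t in re.split(r"([^\w\s])", token) if t]


_tagged = [("pain,", True), ("today", False)]
_pairs = []
for _token, _ration in _tagged:
    _pairs.extend((sub, _ration) for sub in split_on_punc(_token))
assert _pairs == [("pain", True), (",", True), ("today", False)]
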
def read_squad_examples(input_file, is_training, max_examples=None,
                        writing_dev=False):
    """Read a SQuAD json file into a list of SquadExample."""
    with tf.gfile.Open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        return c in " \t\r\n" or ord(c) == 0x202F

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False
                if is_training:
                    is_impossible = qa["is_impossible"]
                    if (len(qa["answers"]) != 1 and not is_impossible
                            and not writing_dev):
                        raise ValueError(
                            "For training, each question should have exactly 1 answer.")
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset +
                                                           answer_length - 1]
                        # Only add answers where the text can be exactly
                        # recovered from the document. If this CAN'T happen
                        # it's likely due to weird Unicode stuff so we will
                        # just skip the example.
                        #
                        # Note that this means for training mode, every
                        # example is NOT guaranteed to be preserved.
                        actual_text = " ".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(
                            tokenization.whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            tf.logging.warning(
                                "Could not find answer: '%s' vs. '%s'",
                                actual_text, cleaned_answer_text)
                            continue
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""

                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=is_impossible)
                examples.append(example)
                if max_examples is not None and len(examples) == max_examples:
                    return examples
    return examples

def read_nq_entry(entry, is_training):
    """Converts a NQ entry into a list of NqExamples."""

    def is_whitespace(c):
        return c in " \t\r\n" or ord(c) == 0x202F

    examples = []
    contexts_id = entry["id"]
    contexts = entry["contexts"]
    doc_tokens = []
    char_to_word_offset = []
    prev_is_whitespace = True
    for c in contexts:
        if is_whitespace(c):
            prev_is_whitespace = True
        else:
            if prev_is_whitespace:
                doc_tokens.append(c)
            else:
                doc_tokens[-1] += c
            prev_is_whitespace = False
        char_to_word_offset.append(len(doc_tokens) - 1)

    questions = []
    for i, question in enumerate(entry["questions"]):
        qas_id = "{}".format(contexts_id)
        question_text = question["input_text"]
        start_position = None
        end_position = None
        answer = None
        if is_training:
            answer_dict = entry["answers"][i]
            answer = make_nq_answer(contexts, answer_dict)

            # For now, only handle extractive, yes, and no.
            if answer is None or answer.offset is None:
                continue
            start_position = char_to_word_offset[answer.offset]
            end_position = char_to_word_offset[answer.offset +
                                               len(answer.text) - 1]

            # Only add answers where the text can be exactly recovered from the
            # document. If this CAN'T happen it's likely due to weird Unicode
            # stuff so we will just skip the example.
            #
            # Note that this means for training mode, every example is NOT
            # guaranteed to be preserved.
            actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
            cleaned_answer_text = " ".join(
                tokenization.whitespace_tokenize(answer.text))
            if actual_text.find(cleaned_answer_text) == -1:
                tf.logging.warning("Could not find answer: '%s' vs. '%s'",
                                   actual_text, cleaned_answer_text)
                continue

        questions.append(question_text)
        example = NqExample(
            example_id=int(contexts_id),
            qas_id=qas_id,
            questions=questions[:],
            doc_tokens=doc_tokens,
            doc_tokens_map=entry.get("contexts_map", None),
            answer=answer,
            start_position=start_position,
            end_position=end_position)
        examples.append(example)
    return examples

def read_contract_examples(input_file, is_training):
    """Read a Contract json file into a list of ContractExample."""
    with tf.gfile.Open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            # Character-level "tokens": every character is its own token, so
            # char_to_word_offset is simply the identity mapping.
            doc_tokens = []
            char_to_word_offset = []
            for c in paragraph_text:
                doc_tokens.append(c.lower())
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                is_impossible = False
                if is_training:
                    if FLAGS.version_2_with_negative:
                        is_impossible = qa["is_impossible"]
                    if len(qa["answers"]) != 1 and not is_impossible:
                        raise ValueError(
                            "For training, each question should have exactly 1 answer.")
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"].lower()
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset +
                                                           answer_length - 1]
                        # Only add answers where the text can be exactly
                        # recovered from the document. If this CAN'T happen
                        # it's likely due to weird Unicode stuff so we will
                        # just skip the example.
                        #
                        # Note that this means for training mode, every
                        # example is NOT guaranteed to be preserved.
                        actual_text = "".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = "".join(
                            tokenization.whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            tf.logging.warning(
                                "Could not find answer: '%s' vs. '%s'",
                                actual_text, cleaned_answer_text)
                            continue
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""

                example = ContractExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position,
                    is_impossible=is_impossible)
                examples.append(example)
    return examples

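# Tiny runnable sketch (illustration only) of the character-level variant used
# by read_contract_examples: char_to_word_offset is the identity mapping, and
# answers are recovered with "".join rather than " ".join.
_text = "Fee: $5 due"
_doc_tokens = [c.lower() for c in _text]
_char_to_word_offset = list(range(len(_doc_tokens)))
_answer, _off = "$5", _text.index("$5")
_start = _char_to_word_offset[_off]
_end = _char_to_word_offset[_off + len(_answer) - 1]
assert "".join(_doc_tokens[_start:_end + 1]) == _answer.lower()
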
def read_examples(input_file):
    """Read a SQuAD-like json file into a list of Examples."""

    def is_whitespace(c):
        return c in " \t\r\n" or ord(c) == 0x202F

    # Gzipped inputs must be opened as a binary stream before wrapping.
    reader = tf.gfile.Open(input_file,
                           "rb" if input_file.endswith(".gz") else "r")
    if input_file.endswith(".gz"):
        reader = gzip.GzipFile(fileobj=reader)
    next(reader)  # the first line is a header record, not an example

    examples = []
    for line in tqdm(reader):
        item = json.loads(line.strip())
        paragraph_text = item["context"]
        doc_tokens = []
        char_to_word_offset = []
        prev_is_whitespace = True
        for c in paragraph_text:
            if is_whitespace(c):
                prev_is_whitespace = True
            else:
                if prev_is_whitespace:
                    doc_tokens.append(c)
                else:
                    doc_tokens[-1] += c
                prev_is_whitespace = False
            char_to_word_offset.append(len(doc_tokens) - 1)

        for qa in item["qas"]:
            qas_id = qa["qid"]
            question_text = qa["question"]
            is_impossible = False
            start_position = -1
            end_position = -1
            orig_answer_text = ""
            if FLAGS.version_2_with_negative:
                is_impossible = qa["is_impossible"]
            if not is_impossible:
                answer_offset = qa["detected_answers"][0]["char_spans"][0][0]
                answer_end = qa["detected_answers"][0]["char_spans"][0][1]
                answer_length = answer_end - answer_offset + 1
                orig_answer_text = item["context"][answer_offset:answer_end + 1]
                start_position = char_to_word_offset[answer_offset]
                end_position = char_to_word_offset[answer_offset +
                                                   answer_length - 1]
                # Only add answers where the text can be exactly recovered from
                # the document. If this CAN'T happen it's likely due to weird
                # Unicode stuff so we will just skip the example.
                #
                # Note that this means for training mode, every example is NOT
                # guaranteed to be preserved.
                actual_text = " ".join(
                    doc_tokens[start_position:(end_position + 1)])
                cleaned_answer_text = " ".join(
                    tokenization.whitespace_tokenize(orig_answer_text))
                if actual_text.find(cleaned_answer_text) == -1:
                    tf.logging.warning("Example %d", len(examples))
                    tf.logging.warning(json.dumps(item, indent=2))
                    tf.logging.warning("Could not find answer: '%s' vs. '%s'",
                                       actual_text, cleaned_answer_text)
                    continue

            example = Example(
                qas_id=qas_id,
                question_text=question_text,
                doc_tokens=doc_tokens,
                orig_answer_text=orig_answer_text,
                start_position=start_position,
                end_position=end_position,
                is_impossible=is_impossible)
            examples.append(example)
    reader.close()
    return examples

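# Hedged sketch (standard library only) of the reading pattern above:
# MRQA-style jsonlines files carry a leading header record, which is why
# read_examples calls next(reader) before iterating. iter_entries is a
# hypothetical helper, not part of the original code.
def iter_entries(path):
    opener = gzip.open if path.endswith(".gz") else open
    with opener(path, "rt") as reader:
        next(reader)  # skip the header record
        for line in reader:
            line = line.strip()
            if line:
                yield json.loads(line)
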
def _read_json(self, input_file: str, is_training: bool, do_lower_case: bool):
    with tf.gfile.Open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    examples = []
    for entry in tqdm(input_data, desc=input_file):
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            raw_doc_tokens = customize_tokenizer(paragraph_text)
            doc_tokens = []
            char_to_word_offset = []
            k = 0
            temp_word = ""
            # Align raw characters to the pre-tokenized tokens: rebuild each
            # token character by character so that every character receives
            # the index of the token containing it.
            for c in paragraph_text:
                if tokenization._is_whitespace(c):
                    char_to_word_offset.append(k - 1)
                    continue
                else:
                    temp_word += c
                    char_to_word_offset.append(k)
                if do_lower_case:
                    temp_word = temp_word.lower()
                if temp_word == raw_doc_tokens[k]:
                    doc_tokens.append(temp_word)
                    temp_word = ""
                    k += 1
            assert k == len(raw_doc_tokens)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                if is_training:
                    answer = qa["answers"][0]
                    orig_answer_text = answer["text"]
                    if orig_answer_text not in paragraph_text:
                        tf.logging.warning("Could not find answer")
                    else:
                        answer_offset = paragraph_text.index(orig_answer_text)
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset +
                                                           answer_length - 1]
                        # Skip examples whose answer cannot be recovered from
                        # the document tokens.
                        actual_text = "".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = "".join(
                            tokenization.whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            tf.logging.warning(
                                "Could not find answer: '%s' vs. '%s'",
                                actual_text, cleaned_answer_text)
                            continue

                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    orig_answer_text=orig_answer_text,
                    start_position=start_position,
                    end_position=end_position)
                examples.append(example)
    return examples

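# Hedged, self-contained sketch of the alignment idiom in _read_json: rebuild
# each pre-tokenized token character by character so that every raw character
# (whitespace included) maps to a token index, even when the tokenizer is not
# whitespace-based. align() is a hypothetical helper for illustration.
def align(text, raw_tokens):
    char_to_word_offset, k, temp = [], 0, ""
    for c in text:
        if c in " \t\r\n":
            char_to_word_offset.append(k - 1)  # whitespace maps to previous token
            continue
        temp += c
        char_to_word_offset.append(k)
        if temp == raw_tokens[k]:  # finished rebuilding token k
            temp, k = "", k + 1
    assert k == len(raw_tokens)
    return char_to_word_offset


assert align("北京 大学", ["北京", "大学"]) == [0, 0, 0, 1, 1]
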
def read_squad_examples(input_file, is_training):
    """Read a SQuAD json file into a list of SquadMembershipExample."""
    with tf.gfile.Open(input_file, "r") as reader:
        input_data = json.load(reader)["data"]

    def is_whitespace(c):
        return c in " \t\r\n" or ord(c) == 0x202F

    examples = []
    for entry in input_data:
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                is_impossible = None
                label = qa["class"]
                label_id = LABEL_LIST.index(label)
                # Even though answers are not used for membership
                # classification, we keep this to ensure consistency with the
                # training procedure.
                if is_training:
                    if FLAGS.version_2_with_negative:
                        is_impossible = qa["is_impossible"]
                    if not is_impossible:
                        answer = qa["answers"][0]
                        orig_answer_text = answer["text"]
                        answer_offset = answer["answer_start"]
                        answer_length = len(orig_answer_text)
                        start_position = char_to_word_offset[answer_offset]
                        end_position = char_to_word_offset[answer_offset +
                                                           answer_length - 1]
                        # Only add answers where the text can be exactly
                        # recovered from the document. If this CAN'T happen
                        # it's likely due to weird Unicode stuff so we will
                        # just skip the example.
                        #
                        # Note that this means for training mode, every
                        # example is NOT guaranteed to be preserved.
                        actual_text = " ".join(
                            doc_tokens[start_position:(end_position + 1)])
                        cleaned_answer_text = " ".join(
                            tokenization.whitespace_tokenize(orig_answer_text))
                        if actual_text.find(cleaned_answer_text) == -1:
                            tf.logging.warning(
                                "Could not find answer: '%s' vs. '%s'",
                                actual_text, cleaned_answer_text)
                            continue
                    else:
                        start_position = -1
                        end_position = -1
                        orig_answer_text = ""

                example = SquadMembershipExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    doc_tokens=doc_tokens,
                    label_id=label_id)
                examples.append(example)
    random.shuffle(examples)
    return examples