Example #1
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for i, line in enumerate(lines):
         guid = f'{set_type}-{i}'
         text_a = tokenization.convert_to_unicode(
             line[REQUIRED_COLUMNS.index('text')])
         if set_type == 'test':
             label = '0'
         else:
             label = tokenization.convert_to_unicode(
                 line[REQUIRED_COLUMNS.index('label')])
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=None,
                          label=label))
     return examples
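The processor above is typically driven with rows produced by a TSV reader. Below is a minimal, self-contained sketch of the same pattern; the namedtuple stand-in for InputExample, the REQUIRED_COLUMNS value, and the sample rows are illustrative assumptions, and the tokenization.convert_to_unicode calls are omitted for brevity.

import collections

# Stand-in for the real InputExample class from the BERT codebase.
InputExample = collections.namedtuple('InputExample',
                                      ['guid', 'text_a', 'text_b', 'label'])
REQUIRED_COLUMNS = ['label', 'text']  # assumed column order

def create_examples(lines, set_type):
    examples = []
    for i, line in enumerate(lines):
        guid = f'{set_type}-{i}'
        text_a = line[REQUIRED_COLUMNS.index('text')]
        label = '0' if set_type == 'test' else line[REQUIRED_COLUMNS.index('label')]
        examples.append(InputExample(guid=guid, text_a=text_a,
                                     text_b=None, label=label))
    return examples

rows = [['1', 'a great movie'], ['0', 'not worth watching']]
print(create_examples(rows, 'train')[0])
# InputExample(guid='train-0', text_a='a great movie', text_b=None, label='1')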
Example #2
 def _create_examples(self, lines, set_type):
   """Creates examples for the training/dev/test sets."""
   examples = []
   for i, line in enumerate(lines):
     if i == 0:
       continue
     guid = "%s-%s" % (set_type, i)
     if set_type == "test":
       text_a = tokenization.convert_to_unicode(line[1])
       text_b = tokenization.convert_to_unicode(line[2])
       label = "entailment"
     else:
       text_a = tokenization.convert_to_unicode(line[1])
       text_b = tokenization.convert_to_unicode(line[2])
       label = tokenization.convert_to_unicode(line[-1])
     examples.append(
         InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
   return examples
Example #3
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         if i == 0:
             continue
         guid = "%s-%s" % (set_type, i)
         text_a = tokenization.convert_to_unicode(line[3])
         text_b = tokenization.convert_to_unicode(line[4])
         if set_type == "test":
             label = "0"
         else:
             label = tokenization.convert_to_unicode(line[0])
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=text_b,
                          label=label))
     return examples
Example #4
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         # Only the test set has a header
         if set_type == "test" and i == 0:
             continue
         guid = "%s-%s" % (set_type, i)
         if set_type == "test":
             text_a = tokenization.convert_to_unicode(line[1])
             label = "0"
         else:
             text_a = tokenization.convert_to_unicode(line[3])
             label = tokenization.convert_to_unicode(line[1])
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=None,
                          label=label))
     return examples
Example #5
 def get_dev_examples(self, data_dir):
     """See base class."""
     lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv"))
     examples = []
     for (i, line) in enumerate(lines):
         if i == 0:
             continue
         guid = "dev-%d" % (i)
         language = tokenization.convert_to_unicode(line[0])
         if language != tokenization.convert_to_unicode(self.language):
             continue
         text_a = tokenization.convert_to_unicode(line[6])
         text_b = tokenization.convert_to_unicode(line[7])
         label = tokenization.convert_to_unicode(line[1])
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=text_b,
                          label=label))
     return examples
Example #6
 def _create_examples(self, lines, set_type):
     """Creates examples for the training/dev/test sets."""
     examples = []
     for i, line in enumerate(lines):
         if i == 0:
             continue
         guid = "%s-%s" % (set_type, i)
         text_a = tokenization.convert_to_unicode(line[7])
         text_b = tokenization.convert_to_unicode(line[8])
         if set_type == "test":
             label = 0.0
         else:
             label = self.label_type(
                 tokenization.convert_to_unicode(line[9]))
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=text_b,
                          label=label))
     return examples
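Unlike the classification processors, the example above produces a numeric label: for non-test rows it applies self.label_type (presumably float, for a similarity-score regression task) to column 9. A tiny illustration of that label handling; the row contents and label_type value are made-up assumptions.

# Illustrative only: the column layout follows the code above
# (columns 7 and 8 hold the sentence pair, column 9 the score).
label_type = float
row = [''] * 7 + ['A man plays a guitar.', 'A person plays an instrument.', '3.8']
set_type = 'train'
label = 0.0 if set_type == 'test' else label_type(row[9])
print(row[7], '|', row[8], '|', label)   # ... | ... | 3.8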
Example #7
 def get_train_examples(self, data_dir):
     """See base class."""
     lines = self._read_tsv(
         os.path.join(data_dir, "multinli",
                      "multinli.train.%s.tsv" % self.language))
     examples = []
     for (i, line) in enumerate(lines):
         if i == 0:
             continue
         guid = "train-%d" % (i)
         text_a = tokenization.convert_to_unicode(line[0])
         text_b = tokenization.convert_to_unicode(line[1])
         label = tokenization.convert_to_unicode(line[2])
         if label == tokenization.convert_to_unicode("contradictory"):
             label = tokenization.convert_to_unicode("contradiction")
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=text_b,
                          label=label))
     return examples
Example #8
def create_training_instances(input_files,
                              tokenizer,
                              max_seq_length,
                              dupe_factor,
                              short_seq_prob,
                              masked_lm_prob,
                              max_predictions_per_seq,
                              rng,
                              do_whole_word_mask=False):
    """Create `TrainingInstance`s from raw text."""
    all_documents = [[]]

    # Input file format:
    # (1) One sentence per line. These should ideally be actual sentences, not
    # entire paragraphs or arbitrary spans of text. (Because we use the
    # sentence boundaries for the "next sentence prediction" task).
    # (2) Blank lines between documents. Document boundaries are needed so
    # that the "next sentence prediction" task doesn't span between documents.
    for input_file in input_files:
        with tf.io.gfile.GFile(input_file, "rb") as reader:
            while True:
                line = tokenization.convert_to_unicode(reader.readline())
                if not line:
                    break
                line = line.strip()

                # Empty lines are used as document delimiters
                if not line:
                    all_documents.append([])
                tokens = tokenizer.tokenize(line)
                if tokens:
                    all_documents[-1].append(tokens)

    # Remove empty documents
    all_documents = [x for x in all_documents if x]
    rng.shuffle(all_documents)

    vocab_words = list(tokenizer.vocab.keys())
    instances = []
    for _ in range(dupe_factor):
        for document_index in range(len(all_documents)):
            instances.extend(
                create_instances_from_document(all_documents, document_index,
                                               max_seq_length, short_seq_prob,
                                               masked_lm_prob,
                                               max_predictions_per_seq,
                                               vocab_words, rng,
                                               do_whole_word_mask))

    rng.shuffle(instances)
    return instances
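The comment block above describes the expected corpus layout: one sentence per line, with blank lines separating documents. The sketch below reproduces only the document-splitting step on an in-memory corpus, using a whitespace split as a stand-in for the word-piece tokenizer so it runs without a vocab file; the commented-out call shows roughly how the real function would then be invoked, with assumed hyperparameter values.

import io

corpus = io.StringIO(
    "The first sentence of document one.\n"
    "The second sentence of document one.\n"
    "\n"                                     # blank line = document boundary
    "Document two has a single sentence.\n")

all_documents = [[]]
for raw in corpus:
    line = raw.strip()
    if not line:
        all_documents.append([])             # start a new document
    tokens = line.split()                    # stand-in for tokenizer.tokenize(line)
    if tokens:
        all_documents[-1].append(tokens)
all_documents = [d for d in all_documents if d]
print(len(all_documents))                    # 2

# With a real tokenization.FullTokenizer, the call would look roughly like:
# instances = create_training_instances(
#     ["corpus.txt"], tokenizer, max_seq_length=128, dupe_factor=10,
#     short_seq_prob=0.1, masked_lm_prob=0.15, max_predictions_per_seq=20,
#     rng=random.Random(12345))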
Example #9
    def _tokenize(self, utterance):
        """
        Tokenize the utterance using word-piece tokenization used by BERT.

        Args:
          utterance: A string containing the utterance to be tokenized.

        Returns:
          bert_tokens: A list of tokens obtained by word-piece tokenization of the
            utterance.
          alignments: A dict mapping indices of characters corresponding to start
            and end positions of words (not subwords) to corresponding indices in
            bert_tokens list.
          inverse_alignments: A list of size equal to bert_tokens. Each element is a
            tuple containing the index of the starting and inclusive ending
            character of the word corresponding to the subword. This list is used
            during inference to map word-piece indices to spans in the original
            utterance.
        """
        utterance = tokenization.convert_to_unicode(utterance)
        # After _naive_tokenize, spaces and punctuation marks are all retained, i.e.
        # direct concatenation of all the tokens in the sequence will be the
        # original string.
        tokens = _naive_tokenize(utterance)
        # Filter out empty tokens and obtain aligned character index for each token.
        alignments = {}
        char_index = 0
        bert_tokens = []
        # These lists store inverse alignments to be used during inference.
        bert_tokens_start_chars = []
        bert_tokens_end_chars = []
        for token in tokens:
            if token.strip():
                subwords = self._tokenizer.tokenize(token)
                # Store the alignment for the index of starting character and the
                # inclusive ending character of the token.
                alignments[char_index] = len(bert_tokens)
                bert_tokens_start_chars.extend([char_index] * len(subwords))
                bert_tokens.extend(subwords)
                # The inclusive ending character index corresponding to the word.
                inclusive_char_end = char_index + len(token) - 1
                alignments[inclusive_char_end] = len(bert_tokens) - 1
                bert_tokens_end_chars.extend([inclusive_char_end] *
                                             len(subwords))
            char_index += len(token)
        inverse_alignments = list(
            zip(bert_tokens_start_chars, bert_tokens_end_chars))
        return bert_tokens, alignments, inverse_alignments
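To make the returned structures concrete, here is a small self-contained illustration of how inverse_alignments maps a span of word-piece indices back to a character span of the utterance; the token values are hard-coded stand-ins for real word-piece output.

utterance = "book flights"
# Hypothetical word-piece output for the two words of the utterance.
bert_tokens = ["book", "flight", "##s"]
# alignments maps word start/end character indices to word-piece indices.
alignments = {0: 0, 3: 0, 5: 1, 11: 2}
# inverse_alignments[i] = (start_char, inclusive_end_char) of the word that
# produced subword i.
inverse_alignments = [(0, 3), (5, 11), (5, 11)]

# Recover the original text for a predicted span covering subwords 1..2.
start_char = inverse_alignments[1][0]
end_char = inverse_alignments[2][1]
print(utterance[start_char:end_char + 1])   # "flights"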
Example #10
def create_training_instances(input_dir,
                              tokenizer,
                              max_seq_length,
                              max_gapped_tokens,
                              dupe_factor,
                              rng):
    """
    Create training instances from multiple documents.
    The data format is: (1) each document is in its own file;
    (2) each sentence is on its own line.
    """
    all_documents = []

    for file in os.listdir(input_dir):
        with open(os.path.join(input_dir, file), "r+") as reader:
            all_documents.append([])
            for line in reader.readlines():
                line = tokenization.convert_to_unicode(line)
                line = line.strip()

                if not line:
                    continue

                tokens = tokenizer.tokenize(line)
                all_documents[-1].append(tokens)
    all_documents = [x for x in all_documents if x]
    rng.shuffle(all_documents)

    instances = []
    for _ in range(dupe_factor):
        for doc in all_documents:
            instances.extend(
                create_instances_from_document(
                    doc, max_seq_length,
                    max_gapped_tokens, rng))

    rng.shuffle(instances)
    return instances
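The docstring above assumes a directory with one file per document and one sentence per line. A brief sketch of preparing such a layout; the file names, contents, and the hyperparameters in the commented call are illustrative assumptions.

import os
import tempfile

input_dir = tempfile.mkdtemp()
with open(os.path.join(input_dir, "doc1.txt"), "w") as f:
    f.write("First sentence of document one.\nSecond sentence of document one.\n")
with open(os.path.join(input_dir, "doc2.txt"), "w") as f:
    f.write("Document two is a single sentence.\n")

# With a real word-piece tokenizer, the call would look roughly like:
# instances = create_training_instances(
#     input_dir, tokenizer, max_seq_length=128, max_gapped_tokens=3,
#     dupe_factor=5, rng=random.Random(12345))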
Example #11
def preprocess_and_tokenize_input_files(
    input_files: Iterable[str],
    tokenizer: tokenization.FullSentencePieceTokenizer,
    use_eod: bool = True,
    do_lower_case: bool = False,
    log_example_freq: int = 100000) -> List[Tuple[np.ndarray, np.ndarray]]:
  """Preprocesses and encodes raw text from input files.

  This function preprocesses raw text and encodes them into tokens using a
  `SentencePieceModel` tokenization method. This also provides the sentence
  indicator for each token.

  Args:
    input_files: The list of input file names.
    tokenizer: The SentencePiece tokenizer that has the attribute `sp_model`.
    use_eod: Whether or not to use an EOD indicator. If `False`, then EOD is
      not included.
    do_lower_case: Whether or not to apply lower casing during raw text
      preprocessing.
    log_example_freq: The optional field for how many lines to process before
      emitting an info log.

  Returns:
    The preprocessed list. Each entry in the list is a tuple consisting of
    the token IDs and the sentence IDs.

  """
  all_data = []
  eod_symbol = special_symbols["<eod>"]

  total_number_of_lines = 0

  # Input file format:
  # (1) One sentence per line. These should ideally be actual sentences, not
  # entire paragraphs or arbitrary spans of text. (Because we use the
  # sentence boundaries for the "next sentence prediction" task).
  # (2) Blank lines between documents. Document boundaries are needed so
  # that the "next sentence prediction" task doesn't span between documents.
  for input_file in input_files:
    line_count = 0
    logging.info("Preprocessing %s", input_file)

    all_tokens = []
    all_sentence_ids = []

    sentence_id = True

    with tf.io.gfile.GFile(input_file, "rb") as reader:
      while True:
        line = tokenization.convert_to_unicode(reader.readline())
        if not line:
          break

        line_count += 1
        if line_count % log_example_freq == 0:
          logging.info("Loading line %d", line_count)

        line = line.strip()

        if not line:
          if use_eod:
            token_ids = [eod_symbol]
            sentence_id = not sentence_id
          else:
            continue
        else:
          preprocessed_line = _preprocess_line(
              line=line, do_lower_case=do_lower_case)
          token_ids = tokenization.encode_ids(
              sp_model=tokenizer.sp_model, text=preprocessed_line)

        all_tokens.extend(token_ids)
        all_sentence_ids.extend([sentence_id] * len(token_ids))
        sentence_id = not sentence_id
      logging.info("Finished processing %s. Number of lines: %d",
                   input_file, line_count)
      if line_count == 0:
        continue
      total_number_of_lines += line_count
      all_tokens = np.array(all_tokens, dtype=np.int64)
      all_sentence_ids = np.array(all_sentence_ids, dtype=np.bool_)
      all_data.append((all_tokens, all_sentence_ids))

  logging.info("Completed text preprocessing. Total number of lines: %d",
               total_number_of_lines)
  return all_data
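A hedged usage sketch of the function above; the SentencePiece model path and corpus file names are placeholders, and it assumes tokenization.FullSentencePieceTokenizer is available in the surrounding module, as the type hint suggests.

# Placeholder paths; a trained SentencePiece model and raw text files are required.
tokenizer = tokenization.FullSentencePieceTokenizer("spiece.model")
data = preprocess_and_tokenize_input_files(
    input_files=["corpus-part-00.txt", "corpus-part-01.txt"],
    tokenizer=tokenizer,
    use_eod=True,
    do_lower_case=False)
for token_ids, sentence_ids in data:
    # One (token IDs, sentence IDs) pair per non-empty input file.
    print(token_ids.shape, sentence_ids.shape, token_ids.dtype)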
Example #12
def process(input_file, tokenizer, rng, args):
    # logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s [%(levelname)-5.5s] [%(name)-12.12s]: %(message)s')
    logger = logging.getLogger(__name__)
    # read & tokenize docs
    all_documents = [[]]
    logger.info('Tokenizing documents...')
    num_logged_examples = 0
    with tf.io.gfile.GFile(input_file, 'rb') as reader:
        num_lines = sum(1 for _ in reader)
    pbar = tqdm(total=num_lines, desc='Tokenization')
    with tf.io.gfile.GFile(input_file, 'rb') as reader:
        while True:
            line = tokenization.convert_to_unicode(reader.readline())
            if not line:
                break
            line = line.strip()
            # Empty lines are used as document delimiters
            if not line:
                all_documents.append([])
            tokens = tokenizer.tokenize(line)
            if tokens:
                all_documents[-1].append(tokens)
                if num_logged_examples < args.num_logged_samples:
                    print('**** Tokenization example ****')
                    print(line)
                    print(tokens)
                    print('****')
                    num_logged_examples += 1
            pbar.update(1)
    pbar.close()
    # shuffle
    logger.info('Shuffling documents...')
    all_documents = [x for x in all_documents if x]
    rng.shuffle(all_documents)
    num_documents = len(all_documents)
    logger.info(f'Tokenized a total of {num_documents:,} documents')
    # create instances
    logger.info('Creating instances...')
    vocab_words = list(tokenizer.vocab.keys())
    instances = []
    do_whole_word_masking = PRETRAINED_MODELS[
        args.model_class]['do_whole_word_masking']
    for _ in range(args.dupe_factor):
        for document_index in trange(len(all_documents),
                                     desc='Generating training instances'):
            instances.extend(
                create_instances_from_document(
                    all_documents, document_index, args.max_seq_length,
                    args.short_seq_prob, args.masked_lm_prob,
                    args.max_predictions_per_seq, vocab_words, rng,
                    do_whole_word_masking))
    all_documents = None  # free memory
    num_instances = len(instances)
    logger.info(f'Collected a total of {num_instances:,} training instances')
    logger.info('Shuffling training instances...')
    rng.shuffle(instances)
    # write tf records file
    _type = os.path.basename(os.path.dirname(input_file))
    if _type in ['train', 'dev', 'test']:
        output_folder = os.path.join(DATA_DIR, 'pretrain', args.run_name,
                                     'tfrecords', _type)
    else:
        _type = 'default'
        output_folder = os.path.join(DATA_DIR, 'pretrain', args.run_name,
                                     'tfrecords')
    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    input_file_name = os.path.basename(input_file)
    output_file = os.path.join(output_folder, f'{input_file_name}.tfrecords')
    logger.info(f'Writing to {output_file}...')
    write_instance_to_example_files(instances, tokenizer, args.max_seq_length,
                                    args.max_predictions_per_seq,
                                    [output_file], args.gzipped)
    return num_documents, num_instances, _type
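process reads a number of fields from args; the namespace below lists the attributes it appears to require. Every value is an illustrative assumption rather than a recommended setting, and model_class must be a key of the surrounding module's PRETRAINED_MODELS.

import argparse

args = argparse.Namespace(
    model_class='bert-base-uncased',     # assumed key of PRETRAINED_MODELS
    run_name='my-pretrain-run',
    max_seq_length=128,
    short_seq_prob=0.1,
    masked_lm_prob=0.15,
    max_predictions_per_seq=20,
    dupe_factor=10,
    num_logged_samples=5,
    gzipped=False)

# num_docs, num_instances, split = process(
#     'data/train/part-000.txt', tokenizer, random.Random(12345), args)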