Example #1
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         guid = "%s-%s" % (set_type, i)
         text_a = tokenization.convert_to_unicode(line[0])
         label = tokenization.convert_to_unicode(line[1])
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=None,
                          label=label))
     return examples
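Example #1 assumes each element of `lines` is already split into columns, with the text in column 0 and the label in column 1. A minimal, self-contained sketch of that mapping (the rows below are invented, not from the repo):

# Hypothetical two-column rows, as a _read_tsv-style reader would produce them.
lines = [
    ["this movie was great", "1"],
    ["worst film of the year", "0"],
]
for i, line in enumerate(lines):
    guid = "%s-%s" % ("train", i)
    print(guid, line[0], line[1])   # e.g. train-0 this movie was great 1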
Example #2
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         if i == 0:
             idx_text = line.index('text')
             idx_label = line.index('label')
         else:
             guid = "%s-%s" % (set_type, i)
             text_a = tokenization.convert_to_unicode(line[idx_text])
             label = tokenization.convert_to_unicode(line[idx_label])
             examples.append(
                 InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
     return examples
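Example #2 differs from #1 in that row 0 is treated as a header and the column positions are looked up by name, so the column order of the TSV no longer matters. A small standalone sketch of that lookup (the rows are hypothetical):

rows = [
    ["id", "text", "label"],               # header row, consumed at i == 0
    ["17", "this movie was great", "1"],
]
idx_text = rows[0].index("text")
idx_label = rows[0].index("label")
print(rows[1][idx_text], rows[1][idx_label])   # this movie was great 1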
Example #3
  def _create_examples(self, lines, set_type):
    examples = []
    for i, line in enumerate(lines):
      if i == 0:
        idx_text = line.index('text')
        idx_label = line.index('label')
      else:
        guid = f'{set_type}-{i}'
        text_a = tokenization.convert_to_unicode(line[idx_text])
        label = tokenization.convert_to_unicode(line[idx_label])
        examples.append(
          InputExample(guid, text_a, label=label)
        )

    return examples
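Note that Example #3 passes guid and text_a positionally and omits text_b entirely; in the BERT InputExample constructor text_b defaults to None, so this builds the same single-sentence examples as #1 and #2, just more tersely.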
Example #4
def read_examples(input_file):
    """Read a list of `InputExample`s from an input file."""
    examples = []
    unique_id = 0
    with tf.gfile.GFile(input_file, "r") as reader:
        while True:

            line = tokenization.convert_to_unicode(reader.readline())
            if not line:
                break

            line = line.strip()
            text_a = None
            text_b = None
            m = re.match(r"^(.*) \|\|\| (.*)$", line)

            if m is None:
                text_a = line
            else:
                text_a = m.group(1)
                text_b = m.group(2)

            examples.append(
                InputExample(unique_id=unique_id, text_a=text_a,
                             text_b=text_b))
            unique_id += 1

    return examples
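read_examples (the reader from BERT's feature-extraction script) accepts either a single sentence per line or a sentence pair separated by " ||| ". The regex split can be checked on its own; the sample lines here are made up:

import re

for line in ["a single sentence", "first segment ||| second segment"]:
    m = re.match(r"^(.*) \|\|\| (.*)$", line)
    if m is None:
        print("text_a only:", line)
    else:
        print("pair:", m.group(1), "/", m.group(2))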
Example #5
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         if i == 0:
             continue
         # tsv format
         # id, prompt, text, label
         # prompt
         prompt = tokenization.convert_to_unicode(line[1])
         # text
         text = tokenization.convert_to_unicode(line[2])
         if set_type == "test":
             label = 1
         else:
             label = float(line[-1])
         examples.append(InputExample(prompt=prompt, text=text,
                                      label=label))
     return examples
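Example #5 reads a four-column TSV (id, prompt, text, label) and parses the label as a float, i.e. a regression-style target; the unlabeled test split just gets a dummy label. A standalone sketch of that row handling (the rows are invented):

rows = [
    ["id", "prompt", "text", "label"],      # header row, skipped at i == 0
    ["0", "Describe the plot.", "A young wizard leaves home ...", "3.5"],
]
for i, row in enumerate(rows):
    if i == 0:
        continue
    prompt, text, label = row[1], row[2], float(row[-1])
    print(prompt, "|", text, "|", label)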
def create_training_instances(input_files,
							  tokenizer,
							  max_seq_length,
							  dupe_factor,
							  short_seq_prob,
							  masked_lm_prob,
							  max_predictions_per_seq,
							  rng):

	""" input_files format
	(1) One sentence per line since those sentences are also used for 
		"next sentence prediction" task.
	(2) Blank lines between docs since it does not want 
		"next sentence prediction" task to predict unrelated
		sentences."""
	all_documents = [[]]

	for input_file in input_files:
		with tf.gfile.GFile(input_file, "r") as reader:
			while True:
				line = tokenization.convert_to_unicode(reader.readline())
				if not line:
					break
				line = line.strip()

				# Empty lines are used as document delimiters
				# (an empty string is falsy, so `not line` detects them)
				if not line:
					all_documents.append([])
				tokens = tokenizer.tokenize(line)
				if tokens:
					all_documents[-1].append(tokens)

	# Remove empty documents
	all_documents = [x for x in all_documents if x]
	rng.shuffle(all_documents)

	vocab_words = list(tokenizer.vocab.keys())
	instances = []
	for _ in range(dupe_factor):
		for document_index in range(len(all_documents)):
			instances.extend(
				create_instances_from_document(
					all_documents, document_index, max_seq_length, short_seq_prob,
					masked_lm_prob, max_predictions_per_seq, vocab_words, rng))

	rng.shuffle(instances)
	return instances
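A quick way to see how the docstring's file format turns into documents: a blank line opens a new document, and every non-blank line becomes one sentence (a token list) of the current document. The sketch below mimics the loop without TensorFlow or a real tokenizer; the corpus is made up:

sample_lines = [
    "The first document has two sentences.",
    "This is its second sentence.",
    "",                                       # blank line = document boundary
    "The second document starts here.",
]
all_documents = [[]]
for line in sample_lines:
    if not line:
        all_documents.append([])
    tokens = line.split()                     # stand-in for tokenizer.tokenize
    if tokens:
        all_documents[-1].append(tokens)
all_documents = [d for d in all_documents if d]
print(len(all_documents))                     # 2 documents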
def create_training_instances(input_files, tokenizer, max_seq_length,
                              dupe_factor, short_seq_prob, masked_lm_prob,
                              max_predictions_per_seq, disable_nsp, rng):
    """Create `TrainingInstance`s from raw text."""
    all_documents = [[]]

    # Input file format:
    # (1) One sentence per line. These should ideally be actual sentences, not
    # entire paragraphs or arbitrary spans of text. (Because we use the
    # sentence boundaries for the "next sentence prediction" task).
    # (2) Blank lines between documents. Document boundaries are needed so
    # that the "next sentence prediction" task doesn't span between documents.
    for input_file in input_files:
        with tf.gfile.GFile(input_file, "r") as reader:
            while True:
                line = tokenization.convert_to_unicode(reader.readline())
                if not line:
                    break
                line = line.strip()

                # If NSP is disabled, each line is a "document"
                if disable_nsp and line:
                    all_documents.append([])
                # Empty lines are used as document delimiters
                if not disable_nsp and not line:
                    all_documents.append([])
                tokens = tokenizer.tokenize(line)
                if tokens:
                    all_documents[-1].append(tokens)

    # Remove empty documents
    all_documents = [x for x in all_documents if x]
    rng.shuffle(all_documents)

    vocab_words = list(tokenizer.vocab.keys())
    instances = []
    for _ in range(dupe_factor):
        for document_index in range(len(all_documents)):
            instances.extend(
                create_instances_from_document(all_documents, document_index,
                                               max_seq_length, short_seq_prob,
                                               masked_lm_prob,
                                               max_predictions_per_seq,
                                               vocab_words, disable_nsp, rng))

    rng.shuffle(instances)
    return instances
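This second version adds a disable_nsp flag: when it is set, every non-blank line opens its own "document", so next sentence prediction never pairs two lines from the corpus; with the flag off, the original blank-line-delimited grouping applies. A small standalone sketch of the disabled branch, with a hypothetical corpus and `line.split()` standing in for the tokenizer:

sample_lines = ["First sentence.", "Second sentence.", "", "Third sentence."]
disable_nsp = True
all_documents = [[]]
for line in sample_lines:
    if disable_nsp and line:                  # each non-blank line is its own document
        all_documents.append([])
    if not disable_nsp and not line:          # original behavior: blank line delimits
        all_documents.append([])
    tokens = line.split()                     # stand-in for tokenizer.tokenize
    if tokens:
        all_documents[-1].append(tokens)
all_documents = [d for d in all_documents if d]
print(len(all_documents))                     # 3 documents, one per sentence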