Example #1
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         guid = "%s-%s" % (set_type, i)
         text_a = tokenization.convert_to_unicode(line[0])
         label = tokenization.convert_to_unicode(line[1])
         examples.append(
             InputExample(guid=guid,
                          text_a=text_a,
                          text_b=None,
                          label=label))
     return examples
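Example #1 assumes each element of `lines` is already split into columns, with the text in column 0 and the label in column 1. A minimal, self-contained sketch of that mapping (the rows below are invented, not from the repo):

# Hypothetical two-column rows, as a _read_tsv-style reader would produce them.
lines = [
    ["this movie was great", "1"],
    ["worst film of the year", "0"],
]
for i, line in enumerate(lines):
    guid = "%s-%s" % ("train", i)
    print(guid, line[0], line[1])   # e.g. train-0 this movie was great 1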
Example #2
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         if i == 0:
             idx_text = line.index('text')
             idx_label = line.index('label')
         else:
             guid = "%s-%s" % (set_type, i)
             text_a = tokenization.convert_to_unicode(line[idx_text])
             label = tokenization.convert_to_unicode(line[idx_label])
             examples.append(
                 InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
     return examples
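Example #2 differs from #1 in that row 0 is treated as a header and the column positions are looked up by name, so the column order of the TSV no longer matters. A small standalone sketch of that lookup (the rows are hypothetical):

rows = [
    ["id", "text", "label"],               # header row, consumed at i == 0
    ["17", "this movie was great", "1"],
]
idx_text = rows[0].index("text")
idx_label = rows[0].index("label")
print(rows[1][idx_text], rows[1][idx_label])   # this movie was great 1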
Example #3
  def _create_examples(self, lines, set_type):
    examples = []
    for i, line in enumerate(lines):
      if i == 0:
        idx_text = line.index('text')
        idx_label = line.index('label')
      else:
        guid = f'{set_type}-{i}'
        text_a = tokenization.convert_to_unicode(line[idx_text])
        label = tokenization.convert_to_unicode(line[idx_label])
        examples.append(
          InputExample(guid, text_a, label=label)
        )

    return examples
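Note that Example #3 passes guid and text_a positionally and omits text_b entirely; in the BERT InputExample constructor text_b defaults to None, so this builds the same single-sentence examples as #1 and #2, just more tersely.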
Example #4
def read_examples(input_file):
    """Read a list of `InputExample`s from an input file."""
    examples = []
    unique_id = 0
    with tf.gfile.GFile(input_file, "r") as reader:
        while True:

            line = tokenization.convert_to_unicode(reader.readline())
            if not line:
                break

            line = line.strip()
            text_a = None
            text_b = None
            m = re.match(r"^(.*) \|\|\| (.*)$", line)

            if m is None:
                text_a = line
            else:
                text_a = m.group(1)
                text_b = m.group(2)

            examples.append(
                InputExample(unique_id=unique_id, text_a=text_a,
                             text_b=text_b))
            unique_id += 1

    return examples
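read_examples (the reader from BERT's feature-extraction script) accepts either a single sentence per line or a sentence pair separated by " ||| ". The regex split can be checked on its own; the sample lines here are made up:

import re

for line in ["a single sentence", "first segment ||| second segment"]:
    m = re.match(r"^(.*) \|\|\| (.*)$", line)
    if m is None:
        print("text_a only:", line)
    else:
        print("pair:", m.group(1), "/", m.group(2))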
Example #5
 def _create_examples(self, lines, set_type):
     """Creates examples for the training and dev sets."""
     examples = []
     for (i, line) in enumerate(lines):
         if i == 0:
             continue
         # tsv format
         # id, prompt, text, label
         # prompt
         prompt = tokenization.convert_to_unicode(line[1])
         # text
         text = tokenization.convert_to_unicode(line[2])
         if set_type == "test":
             label = 1
         else:
             label = float(line[-1])
         examples.append(InputExample(prompt=prompt, text=text,
                                      label=label))
     return examples
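Example #5 reads a four-column TSV (id, prompt, text, label) and parses the label as a float, i.e. a regression-style target; the unlabeled test split just gets a dummy label. A standalone sketch of that row handling (the rows are invented):

rows = [
    ["id", "prompt", "text", "label"],      # header row, skipped at i == 0
    ["0", "Describe the plot.", "A young wizard leaves home ...", "3.5"],
]
for i, row in enumerate(rows):
    if i == 0:
        continue
    prompt, text, label = row[1], row[2], float(row[-1])
    print(prompt, "|", text, "|", label)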
def create_training_instances(input_files,
							  tokenizer,
							  max_seq_length,
							  dupe_factor,
							  short_seq_prob,
							  masked_lm_prob,
							  max_predictions_per_seq,
							  rng):

	""" input_files format
	(1) One sentence per line since those sentences are also used for 
		"next sentence prediction" task.
	(2) Blank lines between docs since it does not want 
		"next sentence prediction" task to predict unrelated
		sentences."""
	all_documents = [[]]

	for input_file in input_files:
		with tf.gfile.GFile(input_file, "r") as reader:
			while True:
				line = tokenization.convert_to_unicode(reader.readline())
				if not line:
					break
				line = line.strip()

				# Empty lines are used as document delimiters
				# (an empty string is falsy, so `not line` detects them)
				if not line:
					all_documents.append([])
				tokens = tokenizer.tokenize(line)
				if tokens:
					all_documents[-1].append(tokens)

	# Remove empty documents
	all_documents = [x for x in all_documents if x]
	rng.shuffle(all_documents)

	vocab_words = list(tokenizer.vocab.keys())
	instances = []
	for _ in range(dupe_factor):
		for document_index in range(len(all_documents)):
			instances.extend(
				create_instances_from_document(
					all_documents, document_index, max_seq_length, short_seq_prob,
					masked_lm_prob, max_predictions_per_seq, vocab_words, rng))

	rng.shuffle(instances)
	return instances
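A quick way to see how the docstring's file format turns into documents: a blank line opens a new document, and every non-blank line becomes one sentence (a token list) of the current document. The sketch below mimics the loop without TensorFlow or a real tokenizer; the corpus is made up:

sample_lines = [
    "The first document has two sentences.",
    "This is its second sentence.",
    "",                                       # blank line = document boundary
    "The second document starts here.",
]
all_documents = [[]]
for line in sample_lines:
    if not line:
        all_documents.append([])
    tokens = line.split()                     # stand-in for tokenizer.tokenize
    if tokens:
        all_documents[-1].append(tokens)
all_documents = [d for d in all_documents if d]
print(len(all_documents))                     # 2 documents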
def create_training_instances(input_files, tokenizer, max_seq_length,
                              dupe_factor, short_seq_prob, masked_lm_prob,
                              max_predictions_per_seq, disable_nsp, rng):
    """Create `TrainingInstance`s from raw text."""
    all_documents = [[]]

    # Input file format:
    # (1) One sentence per line. These should ideally be actual sentences, not
    # entire paragraphs or arbitrary spans of text. (Because we use the
    # sentence boundaries for the "next sentence prediction" task).
    # (2) Blank lines between documents. Document boundaries are needed so
    # that the "next sentence prediction" task doesn't span between documents.
    for input_file in input_files:
        with tf.gfile.GFile(input_file, "r") as reader:
            while True:
                line = tokenization.convert_to_unicode(reader.readline())
                if not line:
                    break
                line = line.strip()

                # If NSP is disabled, each line is a "document"
                if disable_nsp and line:
                    all_documents.append([])
                # Empty lines are used as document delimiters
                if not disable_nsp and not line:
                    all_documents.append([])
                tokens = tokenizer.tokenize(line)
                if tokens:
                    all_documents[-1].append(tokens)

    # Remove empty documents
    all_documents = [x for x in all_documents if x]
    rng.shuffle(all_documents)

    vocab_words = list(tokenizer.vocab.keys())
    instances = []
    for _ in range(dupe_factor):
        for document_index in range(len(all_documents)):
            instances.extend(
                create_instances_from_document(all_documents, document_index,
                                               max_seq_length, short_seq_prob,
                                               masked_lm_prob,
                                               max_predictions_per_seq,
                                               vocab_words, disable_nsp, rng))

    rng.shuffle(instances)
    return instances
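This second version adds a disable_nsp flag: when it is set, every non-blank line opens its own "document", so next sentence prediction never pairs two lines from the corpus; with the flag off, the original blank-line-delimited grouping applies. A small standalone sketch of the disabled branch, with a hypothetical corpus and `line.split()` standing in for the tokenizer:

sample_lines = ["First sentence.", "Second sentence.", "", "Third sentence."]
disable_nsp = True
all_documents = [[]]
for line in sample_lines:
    if disable_nsp and line:                  # each non-blank line is its own document
        all_documents.append([])
    if not disable_nsp and not line:          # original behavior: blank line delimits
        all_documents.append([])
    tokens = line.split()                     # stand-in for tokenizer.tokenize
    if tokens:
        all_documents[-1].append(tokens)
all_documents = [d for d in all_documents if d]
print(len(all_documents))                     # 3 documents, one per sentence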