def get_sentence_examples(self, questions):
    """Yields an InputExample for each (text_a, text_b) question pair."""
    for index, data in enumerate(questions):
        guid = 'test-%d' % index
        text_a = tokenization.convert_to_unicode(str(data[0]))
        text_b = tokenization.convert_to_unicode(str(data[1]))
        label = str(0)  # dummy label; real labels are unknown at test time
        yield InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)

def _load_glue(self, lines, split, text_a_loc, text_b_loc, label_loc,
               skip_first_line=False, eid_offset=0, swap=False):
    examples = []
    for (i, line) in enumerate(lines):
        try:
            if i == 0 and skip_first_line:
                continue
            eid = i - (1 if skip_first_line else 0) + eid_offset
            text_a = tokenization.convert_to_unicode(line[text_a_loc])
            if text_b_loc is None:
                text_b = None
            else:
                text_b = tokenization.convert_to_unicode(line[text_b_loc])
            # Test and diagnostic splits carry no gold labels, so use a dummy.
            if "test" in split or "diagnostic" in split:
                label = self._get_dummy_label()
            else:
                label = tokenization.convert_to_unicode(line[label_loc])
            if swap:
                text_a, text_b = text_b, text_a
            examples.append(
                InputExample(eid=eid, task_name=self.name, text_a=text_a,
                             text_b=text_b, label=label))
        except Exception as ex:
            utils.log("Error constructing example from line", i,
                      "for task", self.name + ":", ex)
            utils.log("Input causing the error:", line)
    return examples

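# `_get_dummy_label` is called above for unlabeled test/diagnostic splits but
# is not defined in this snippet. A sketch of a typical implementation,
# assuming the task exposes its label set via get_labels() (a regression task
# would return 0.0 instead):
def _get_dummy_label(self):
    return self.get_labels()[0]
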
def get_test_examples(self, data_dir):
    file_path = os.path.join(data_dir, 'test.csv')
    test_df = pd.read_csv(file_path, encoding='utf-8')
    test_data = []
    for index, test in enumerate(test_df.values):
        guid = 'test-%d' % index
        text_a = tokenization.convert_to_unicode(str(test[0]))
        text_b = tokenization.convert_to_unicode(str(test[1]))
        label = str(test[2])
        test_data.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return test_data

def _convert_example_to_record(self, example, max_seq_length, tokenizer):
    """Converts a single `Example` into a single `Record`."""
    text_a = tokenization.convert_to_unicode(example.text_a)
    tokens_a = tokenizer.tokenize(text_a)
    tokens_b = None
    if "text_b" in example._fields:
        text_b = tokenization.convert_to_unicode(example.text_b)
        tokens_b = tokenizer.tokenize(text_b)

    if tokens_b:
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        self._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]

    tokens = []
    text_type_ids = []
    tokens.append("[CLS]")
    text_type_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        text_type_ids.append(0)
    tokens.append("[SEP]")
    text_type_ids.append(0)

    if tokens_b:
        for token in tokens_b:
            tokens.append(token)
            text_type_ids.append(1)
        tokens.append("[SEP]")
        text_type_ids.append(1)

    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    position_ids = list(range(2, len(token_ids) + 2))
    label_id = example.label

    Record = namedtuple(
        'Record',
        ['token_ids', 'text_type_ids', 'position_ids', 'label_id', 'qid'])
    qid = None
    if "qid" in example._fields:
        qid = example.qid
    record = Record(
        token_ids=token_ids,
        text_type_ids=text_type_ids,
        position_ids=position_ids,
        label_id=label_id,
        qid=qid)
    return record

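# `_truncate_seq_pair` is called above but not defined in this snippet. A
# sketch of the canonical BERT helper, assuming the same longest-first policy
# is intended here:
def _truncate_seq_pair(self, tokens_a, tokens_b, max_length):
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        # Truncate the longer sequence one token at a time so that both
        # sides keep a roughly equal share of the length budget.
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()
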
def _create_examples(self, lines, pred_type=False):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        guid = "0"
        text_a = tokenization.convert_to_unicode(line[0])
        text_b = tokenization.convert_to_unicode(line[1])
        # The original only assigned a label when pred_type was True, leaving
        # `label` undefined otherwise. Use a dummy 0.0 for prediction and
        # read the gold score from the third column (assumed layout).
        if pred_type:
            label = 0.0
        else:
            label = float(line[2])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples

import tensorflow as tf

def create_training_instances(input_files, tokenizer, max_seq_length,
                              dupe_factor, short_seq_prob, masked_lm_prob,
                              max_predictions_per_seq, rng):
    """Create `TrainingInstance`s from raw text."""
    all_documents = [[]]

    # Input file format:
    # (1) One sentence per line. These should ideally be actual sentences, not
    # entire paragraphs or arbitrary spans of text. (Because we use the
    # sentence boundaries for the "next sentence prediction" task).
    # (2) Blank lines between documents. Document boundaries are needed so
    # that the "next sentence prediction" task doesn't span between documents.
    for input_file in input_files:
        with tf.gfile.GFile(input_file, 'r') as reader:
            while True:
                line = tokenization.convert_to_unicode(reader.readline())
                if not line:
                    break
                line = line.strip()
                # Empty lines are used as document delimiters
                if not line:
                    all_documents.append([])
                tokens = tokenizer.tokenize(line)
                if tokens:
                    all_documents[-1].append(tokens)

    # Remove empty documents
    all_documents = [x for x in all_documents if x]
    rng.shuffle(all_documents)

    vocab_words = list(tokenizer.vocab.keys())
    instances = []
    for _ in range(dupe_factor):
        for document_index in range(len(all_documents)):
            instances.extend(
                create_instances_from_document(
                    all_documents, document_index, max_seq_length,
                    short_seq_prob, masked_lm_prob, max_predictions_per_seq,
                    vocab_words, rng))

    rng.shuffle(instances)
    return instances

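# A minimal illustration of the input layout described in the comments above
# (hypothetical file contents; one sentence per line, blank line between docs):
#
#   The first document starts here.
#   It has two sentences.
#
#   The second document is a single sentence.
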
import csv
from tqdm import tqdm

def create_pairexamples_from_tsv_file(file_name):
    """Builds unlabeled sentence-pair examples from a two-column TSV file."""
    with open(file_name, "r", encoding="utf-8") as fp:
        reader = csv.reader(fp, delimiter="\t")
        lines = [line for line in reader]
    examples = []
    for (i, line) in enumerate(tqdm(lines)):
        guid = "0"
        text_a = tokenization.convert_to_unicode(line[0])
        text_b = tokenization.convert_to_unicode(line[1])
        label = 0.0  # dummy label; these examples are for inference only
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples

def _create_encode_examples(self, lines):
    """Creates single-sentence examples (text_a only) for encoding."""
    examples = []
    for (i, line) in enumerate(lines):
        guid = "0"
        text_a = tokenization.convert_to_unicode(line[0])
        label = 0.0  # dummy label; encoding does not use labels
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples

import csv

def create_examples_from_tsv_file(file_name):
    """Builds single-sentence examples from the first column of a TSV file."""
    with open(file_name, "r", encoding="utf-8") as fp:
        reader = csv.reader(fp, delimiter="\t")
        lines = [line for line in reader]
    examples = []
    for (i, line) in enumerate(lines):
        guid = "0"
        text_a = tokenization.convert_to_unicode(line[0])
        text_b = None
        label = "1"  # dummy label
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples

from itertools import chain

def tokenize_and_align(tokenizer, words):
    """Splits up words into subword-level tokens."""
    basic_tokenizer = tokenizer.basic_tokenizer
    tokenized_words = []
    for word in words:
        word = tokenization.convert_to_unicode(word)
        word = basic_tokenizer._clean_text(word)
        # Special tokens stay whole; everything else is split on punctuation
        # and then run through the wordpiece tokenizer.
        if word == "[CLS]" or word == "[SEP]":
            word_toks = [word]
        else:
            word_toks = basic_tokenizer._run_split_on_punc(word)
        tokenized_word = []
        for word_tok in word_toks:
            tokenized_word += tokenizer.wordpiece_tokenizer.tokenize(word_tok)
        tokenized_words.append(tokenized_word)
    assert len(tokenized_words) == len(words)
    return list(chain.from_iterable(tokenized_words))

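# Example usage (hypothetical vocab path; assumes BERT's FullTokenizer API and
# the wordpiece example from the BERT README):
#
#   tokenizer = tokenization.FullTokenizer("vocab.txt", do_lower_case=True)
#   tokenize_and_align(tokenizer, ["[CLS]", "unaffable", "[SEP]"])
#   # -> ["[CLS]", "un", "##aff", "##able", "[SEP]"]
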
def _create_examples(self, items, split):
    examples = []
    for eid, item in enumerate(items):
        # text_a is a list of feature strings, one per configured feature.
        text_a = [
            tokenization.convert_to_unicode(item[feature])
            for feature in self.config.features
        ]
        # Abstracts get extra preprocessing; other features pass through.
        text_a = [
            preprocess_abstract(text) if feature == "abstract" else text
            for text, feature in zip(text_a, self.config.features)
        ]
        label = item['journal'].lower()
        examples.append(
            InputExample(eid=eid, task_name=self.name, text_a=text_a,
                         text_b=None, label=label))
    return examples

import json
import re
from tqdm import tqdm

def create_examples_from_json_file(file_name):
    """Builds examples from a JSON-lines file with `title` and `content` fields."""
    with open(file_name, "r", encoding="utf-8") as fp:
        lines = fp.readlines()
    examples = []
    for (i, line) in enumerate(tqdm(lines)):
        guid = "0"
        json_data = json.loads(line.strip())
        # Flatten embedded newlines so each example is a single line of text.
        title = re.sub(r"[\r\n]", " ", json_data["title"])
        content = re.sub(r"[\r\n]", " ", json_data["content"])
        text_a = tokenization.convert_to_unicode(title + " " + content)
        text_b = None
        label = "1"  # dummy label
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples

def tokenize_and_align(tokenizer, words, cased=False):
    """Splits words into subword tokens, returning one token list per word."""
    words = ["[CLS]"] + list(words) + ["[SEP]"]
    basic_tokenizer = tokenizer.basic_tokenizer
    tokenized_words = []
    for word in words:
        word = tokenization.convert_to_unicode(word)
        word = basic_tokenizer._clean_text(word)
        if word == "[CLS]" or word == "[SEP]":
            word_toks = [word]
        else:
            if not cased:
                word = word.lower()
                word = basic_tokenizer._run_strip_accents(word)
            word_toks = basic_tokenizer._run_split_on_punc(word)
        tokenized_word = []
        for word_tok in word_toks:
            tokenized_word += tokenizer.wordpiece_tokenizer.tokenize(word_tok)
        tokenized_words.append(tokenized_word)
    # Unlike the flat variant above, the nested list preserves the alignment
    # between the original words and their subword tokens.
    assert len(tokenized_words) == len(words)
    return tokenized_words

import re

def read_examples(input_texts):
    """Reads a list of `InputExample`s from a list of input texts."""
    examples = []
    unique_id = 0
    for text in input_texts:
        line = tokenization.convert_to_unicode(text)
        if not line:
            continue  # skip empty texts
        line = line.strip()
        text_a = None
        text_b = None
        # " ||| " separates the two sentences of a pair on a single line.
        m = re.match(r"^(.*) \|\|\| (.*)$", line)
        if m is None:
            text_a = line
        else:
            text_a = m.group(1)
            text_b = m.group(2)
        examples.append(
            InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
        unique_id += 1
    return examples

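# Example: a sentence pair is written on one line separated by " ||| ", while
# a line without the separator becomes a single-sentence example:
#
#   read_examples(["Who wrote Hamlet? ||| Shakespeare wrote Hamlet.",
#                  "A single sentence with no pair."])
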
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        if i == 0:
            continue  # skip the header row
        guid = "0"
        text_a = tokenization.convert_to_unicode(line[0])
        text_b = None
        # The original only assigned a label for the test split, which left
        # `label` undefined for train/dev; assume the gold label sits in the
        # second column (assumed layout).
        if set_type == "test":
            label = "1"
        else:
            label = tokenization.convert_to_unicode(line[1])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples

def _text_to_ids(self, text, tokenizer=None, max_len=None,
                 trunc_type="right", keep_sep=True):
    """Convert text to vocab ids."""
    max_len = max_len or self.max_src_len - 1
    tokenizer = tokenizer or self.tokenizer
    text = tokenization.convert_to_unicode(text)
    if self.tokenized_input:
        tokens = text.split(" ")
    else:
        tokens = tokenizer.tokenize(text)
    token_ids = tokenizer.convert_tokens_to_ids(tokens) + [self.sep_id]
    token_ids = self._trunc_token_ids(token_ids, max_len, trunc_type, keep_sep)
    # Position ids are offset: they start at 3 here.
    pos_ids = range(3, len(token_ids) + 3)
    return token_ids, pos_ids

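# `_trunc_token_ids` is referenced above but not defined in this snippet. A
# minimal sketch under assumed semantics: drop ids from the chosen side and,
# when keep_sep is True, ensure the trailing separator id survives truncation.
def _trunc_token_ids(self, token_ids, max_len, trunc_type="right", keep_sep=True):
    if len(token_ids) <= max_len:
        return token_ids
    if trunc_type == "right":
        token_ids = token_ids[:max_len]
        if keep_sep:
            token_ids[-1] = self.sep_id  # restore the separator we cut off
    else:  # "left": keep the tail, which already ends with the sep id
        token_ids = token_ids[-max_len:]
    return token_ids
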
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    if set_type not in ("train", "dev"):
        return examples
    for (i, line) in enumerate(lines):
        guid = "%s-%s" % (set_type, i)
        query = tokenization.convert_to_unicode(line[0])
        cand1 = tokenization.convert_to_unicode(line[1])
        cand2 = tokenization.convert_to_unicode(line[2])
        cand3 = tokenization.convert_to_unicode(line[3])
        # Only the training split carries a gold label; dev uses a dummy 0.
        label = int(line[-1]) if set_type == "train" else 0
        examples.append(
            InputExample(guid=guid, query=query, cand1=cand1, cand2=cand2,
                         cand3=cand3, label=label))
    return examples