Exemplo n.º 1
0
    def __init__(
        self,
        vocab_file: Optional[str] = None,
        add_special_tokens: bool = True,
        unk_token: str = "[UNK]",
        sep_token: str = "[SEP]",
        cls_token: str = "[CLS]",
        clean_text: bool = True,
        handle_chinese_chars: bool = True,
        strip_accents: bool = True,
        lowercase: bool = True,
        wordpieces_prefix: str = "##",
    ):
        """Assemble a WordPiece tokenizer with BERT normalization and pre-tokenization.

        Args:
            vocab_file: Path handed to ``WordPiece.from_files``. When ``None``
                an empty WordPiece model is used instead.
            add_special_tokens: When true (and a vocab file is given) a
                ``BertProcessing`` post-processor is installed using
                ``sep_token`` and ``cls_token``.
            unk_token: Unknown token passed to the WordPiece model and
                registered as a special token.
            sep_token: Separator token, registered as a special token.
            cls_token: Classifier token, registered as a special token.
            clean_text: Forwarded to ``BertNormalizer``.
            handle_chinese_chars: Forwarded to ``BertNormalizer``.
            strip_accents: Forwarded to ``BertNormalizer``.
            lowercase: Forwarded to ``BertNormalizer``.
            wordpieces_prefix: Prefix forwarded to the WordPiece decoder.

        Raises:
            TypeError: If the post-processor is requested and ``sep_token`` or
                ``cls_token`` has no id in the tokenizer.
        """
        if vocab_file is not None:
            tokenizer = Tokenizer(
                WordPiece.from_files(vocab_file, unk_token=unk_token))
        else:
            tokenizer = Tokenizer(WordPiece.empty())

        tokenizer.add_special_tokens([unk_token, sep_token, cls_token])
        tokenizer.normalizer = BertNormalizer(
            clean_text=clean_text,
            handle_chinese_chars=handle_chinese_chars,
            strip_accents=strip_accents,
            lowercase=lowercase,
        )
        tokenizer.pre_tokenizer = BertPreTokenizer()

        if add_special_tokens and vocab_file is not None:
            # The post-processor needs the numeric ids, so both tokens must be
            # resolvable before it can be built.
            sep_token_id = tokenizer.token_to_id(sep_token)
            if sep_token_id is None:
                raise TypeError("sep_token not found in the vocabulary")
            cls_token_id = tokenizer.token_to_id(cls_token)
            if cls_token_id is None:
                raise TypeError("cls_token not found in the vocabulary")

            tokenizer.post_processor = BertProcessing(
                (sep_token, sep_token_id), (cls_token, cls_token_id))

        # The tokenizer attribute is `decoder` (singular); assigning to
        # `decoders` never installed the WordPiece decoder.
        tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

        parameters = {
            "model": "BertWordPiece",
            "add_special_tokens": add_special_tokens,
            "unk_token": unk_token,
            "sep_token": sep_token,
            "cls_token": cls_token,
            "clean_text": clean_text,
            "handle_chinese_chars": handle_chinese_chars,
            "strip_accents": strip_accents,
            "lowercase": lowercase,
            "wordpieces_prefix": wordpieces_prefix,
        }

        super().__init__(tokenizer, parameters)
Exemplo n.º 2
0
    print("Running GPT-2 tokenizer")
    tok_p = GPT2Tokenizer.from_pretrained('gpt2')

    # Create a Tokenizer using BPE
    tok_r = Tokenizer(BPE.from_files(args.vocab, args.merges))
    # Use ByteLevel PreTokenizer
    tok_r.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    # Use ByteLevel Decoder
    tok_r.decoder = decoders.ByteLevel()
elif args.type == "bert":
    print("Running Bert tokenizer")
    tok_p = BertTokenizer.from_pretrained(args.vocab)

    tok_r = Tokenizer(
        WordPiece.from_files(args.vocab,
                             unk_token="[UNK]",
                             max_input_chars_per_word=100))
    tok_r.normalizer = BertNormalizer(
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=True,
        lowercase=True,
    )
    # tok_r.pre_tokenizer = pre_tokenizers.Whitespace()
    tok_r.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
    tok_r.decoder = decoders.WordPiece()
    tok_r.post_processor = BertProcessing(
        ("[SEP]", tok_r.token_to_id("[SEP]")),
        ("[CLS]", tok_r.token_to_id("[CLS]")),
    )
else:
Exemplo n.º 3
0
def _write_encoded_batch(out_f, tokenizer, mlb, examples, label_sets):
    """Tokenize one batch and append it to ``out_f`` as JSON lines.

    Each written line is ``[token_ids, label_indices]``, where
    ``label_indices`` are the non-zero column indices of the matching row
    returned by ``mlb.transform``. An empty batch writes nothing.
    """
    if not examples:
        return

    encodings = tokenizer.encode_batch(examples)
    label_rows = mlb.transform(label_sets)

    for encoding, label_row in zip(encodings, label_rows):
        # Convert sparse arrays to python lists for json dumping.
        label_ids = label_row.nonzero()[1].tolist()
        out_f.write(json.dumps([encoding.ids, label_ids]) + '\n')


def preprocess_data(args):
    """Tokenize every input file and write examples with binarized labels.

    The function makes two passes over ``args.input_files``:

    1. Count examples per file and label frequencies, fit the label
       binarizer(s) and write the label list to ``args.labels_out`` (plus
       ``args.label_mapping`` and ``args.full_labels`` when
       ``args.top_n_labels > 0``).
    2. Tokenize the examples in batches and write one JSON line per example
       to the file named by ``generate_out_filename``. The first line of each
       output file is ``[n_examples, n_labels]``.

    Args:
        args: Namespace providing ``input_files``, ``task``, ``top_n_labels``,
            ``label_mapping``, ``full_labels``, ``labels_out``, ``processes``,
            ``vocab``, ``do_lower_case`` and ``seq_len``.
    """
    label_counter = Counter()
    examples_per_file = Counter()

    print("Reading all files for labels.")
    for input_file in args.input_files:
        with xopen(input_file, "rt") as f:
            for _example, labels in input_readers[args.task](f):
                examples_per_file[input_file] += 1
                label_counter.update(labels)

    if args.top_n_labels > 0:
        mlb_full = MultiLabelBinarizer(sparse_output=True)
        # fit() takes an iterable of label *collections*. Passing the keys
        # directly made every label a sample and its elements the classes,
        # so wrap all labels in a single sample (same as for `mlb` below).
        mlb_full = mlb_full.fit([list(label_counter)])
        label_counter = dict(label_counter.most_common(args.top_n_labels))

    mlb = MultiLabelBinarizer(sparse_output=True)
    # Passing a list in a list because that's what the function wants.
    mlb = mlb.fit([list(label_counter)])

    # Save list of partial -> full mapping if doing top N labels.
    if args.top_n_labels > 0:

        label_mapping = np.where(np.in1d(mlb_full.classes_,
                                         mlb.classes_))[0].tolist()

        with xopen(args.label_mapping, "wt") as f:
            f.write(json.dumps(label_mapping))

        # Also save the full labels.
        with xopen(args.full_labels, "wt") as f:
            f.write(json.dumps(list(mlb_full.classes_)))

    # Save list of labels.
    with xopen(args.labels_out, "wt") as f:
        f.write(json.dumps(list(mlb.classes_)))

    # Set parallel tokenization thread count.
    # NOTE(review): the tokenizers imports are kept below this assignment,
    # presumably so the library sees the variable when loaded - confirm.
    os.environ["RAYON_NUM_THREADS"] = str(args.processes)

    from tokenizers import Tokenizer, decoders, trainers
    from tokenizers.models import WordPiece
    from tokenizers.normalizers import BertNormalizer
    from tokenizers.pre_tokenizers import BertPreTokenizer
    from tokenizers.processors import BertProcessing

    if args.task == 'cafa':
        # Define our custom tokenizer.
        # It is exactly the same as the default BERT tokenizer, except for max_input_chars_per_word
        # being 20000 instead of 100. This tokenizer is very slow on the long protein sequences.
        tokenizer = WordPiece.from_files(args.vocab,
                                         unk_token="[UNK]",
                                         max_input_chars_per_word=20000)
        tokenizer = Tokenizer(tokenizer)
        tokenizer.add_special_tokens(["[UNK]", "[SEP]", "[CLS]"])
        tokenizer.normalizer = BertNormalizer(lowercase=args.do_lower_case)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.post_processor = BertProcessing(
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")))
        tokenizer.decoder = decoders.WordPiece(prefix='##')
    else:
        tokenizer = BertWordPieceTokenizer(args.vocab,
                                           lowercase=args.do_lower_case)

    tokenizer.enable_padding(max_length=args.seq_len)
    tokenizer.enable_truncation(max_length=args.seq_len)

    for input_file in args.input_files:
        with xopen(input_file, 'rt') as in_f:

            file_name = generate_out_filename(input_file, args)

            with xopen(file_name, "wt") as out_f:
                print("Processing to: ", file_name)

                # Write the shape as the first row, useful for the finetuning.
                out_f.write(
                    json.dumps((examples_per_file[input_file],
                                len(label_counter))) + '\n')

                batch_size = min(examples_per_file[input_file],
                                 args.processes * 100)
                example_batch = []
                labels_batch = []

                with ParallelGenerator(input_readers[args.task](in_f),
                                       max_lookahead=batch_size) as g:
                    for example, labels in g:

                        example_batch.append(example)
                        labels_batch.append(labels)

                        if len(example_batch) == batch_size:
                            _write_encoded_batch(out_f, tokenizer, mlb,
                                                 example_batch, labels_batch)
                            example_batch = []
                            labels_batch = []

                    # Write out whatever is left in the last smaller batch.
                    _write_encoded_batch(out_f, tokenizer, mlb,
                                         example_batch, labels_batch)
Exemplo n.º 4
0
    def __init__(
        self,
        vocab_file: Optional[str] = None,
        unk_token: Union[str, AddedToken] = "[UNK]",
        sep_token: Union[str, AddedToken] = "[SEP]",
        cls_token: Union[str, AddedToken] = "[CLS]",
        pad_token: Union[str, AddedToken] = "[PAD]",
        mask_token: Union[str, AddedToken] = "[MASK]",
        clean_text: bool = True,
        handle_chinese_chars: bool = True,
        strip_accents: bool = True,
        lowercase: bool = True,
        wordpieces_prefix: str = "##",
    ):
        """Assemble a WordPiece tokenizer with BERT normalization and pre-tokenization.

        Args:
            vocab_file: Path handed to ``WordPiece.from_files``; an empty
                WordPiece model is used when it is ``None``.
            unk_token: Unknown token (its ``str()`` form is given to the model).
            sep_token: Separator token used by the post-processor.
            cls_token: Classifier token used by the post-processor.
            pad_token: Padding token.
            mask_token: Mask token.
            clean_text: Forwarded to ``BertNormalizer``.
            handle_chinese_chars: Forwarded to ``BertNormalizer``.
            strip_accents: Forwarded to ``BertNormalizer``.
            lowercase: Forwarded to ``BertNormalizer``.
            wordpieces_prefix: Prefix forwarded to the WordPiece decoder.

        Raises:
            TypeError: If a vocab file is given and ``sep_token`` or
                ``cls_token`` has no id in it.
        """
        if vocab_file is None:
            model = WordPiece.empty()
        else:
            model = WordPiece.from_files(vocab_file, unk_token=str(unk_token))
        tokenizer = Tokenizer(model)

        # Register as special only those tokens that already have an id.
        for candidate in (unk_token, sep_token, cls_token, pad_token,
                          mask_token):
            token_text = str(candidate)
            if tokenizer.token_to_id(token_text) is not None:
                tokenizer.add_special_tokens([token_text])

        tokenizer.normalizer = BertNormalizer(
            clean_text=clean_text,
            handle_chinese_chars=handle_chinese_chars,
            strip_accents=strip_accents,
            lowercase=lowercase,
        )
        tokenizer.pre_tokenizer = BertPreTokenizer()

        if vocab_file is not None:
            # The post-processor needs numeric ids for both tokens.
            sep_text = str(sep_token)
            sep_token_id = tokenizer.token_to_id(sep_text)
            if sep_token_id is None:
                raise TypeError("sep_token not found in the vocabulary")

            cls_text = str(cls_token)
            cls_token_id = tokenizer.token_to_id(cls_text)
            if cls_token_id is None:
                raise TypeError("cls_token not found in the vocabulary")

            sep_pair = (sep_text, sep_token_id)
            cls_pair = (cls_text, cls_token_id)
            tokenizer.post_processor = BertProcessing(sep_pair, cls_pair)

        tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

        # Settings handed to the base-class constructor alongside the tokenizer.
        parameters = dict(
            model="BertWordPiece",
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            clean_text=clean_text,
            handle_chinese_chars=handle_chinese_chars,
            strip_accents=strip_accents,
            lowercase=lowercase,
            wordpieces_prefix=wordpieces_prefix,
        )

        super().__init__(tokenizer, parameters)
Exemplo n.º 5
0
 def test_instantiate(self, bert_files):
     """Check that both WordPiece constructors return a ``Model`` instance."""
     # Built without any vocabulary.
     empty_model = WordPiece.empty()
     assert isinstance(empty_model, Model)

     # Built from the vocabulary path supplied by the `bert_files` fixture.
     vocab_path = bert_files["vocab"]
     loaded_model = WordPiece.from_files(vocab_path)
     assert isinstance(loaded_model, Model)
Exemplo n.º 6
0
def _write_block_batch(out_f, tokenizer, mlb, args, start_pos, examples,
                       label_sets, doc_indices):
    """Tokenize one batch and append it to ``out_f`` as JSON lines.

    In block mode (``args.all_blocks`` or ``args.n_blocks > 0``) every block
    of an encoding (the main ids followed by its overflowing parts) is
    written as ``[block_ids, label_indices, doc_idx]``, limited to the first
    ``args.n_blocks`` blocks when that value is positive. Otherwise a single
    ``[window, label_indices]`` line is written, with the window obtained
    from ``get_window(encoding, start_pos)``. An empty batch writes nothing.
    """
    if not examples:
        return

    encodings = tokenizer.encode_batch(examples)
    label_rows = mlb.transform(label_sets)
    block_mode = args.all_blocks or args.n_blocks > 0

    for encoding, label_row, doc_idx in zip(encodings, label_rows,
                                            doc_indices):
        # Convert sparse arrays to python lists for json dumping.
        label_ids = label_row.nonzero()[1].tolist()

        if block_mode:
            blocks = [encoding.ids] + [
                blk.ids for blk in encoding.overflowing
            ]
            if args.n_blocks > 0:
                # Keep exactly n_blocks blocks; the previous counter started
                # at 2 and therefore dropped one block too many.
                blocks = blocks[:args.n_blocks]
            for block in blocks:
                out_f.write(json.dumps([block, label_ids, doc_idx]) + '\n')
        else:
            window = get_window(encoding, start_pos)
            # Sanity checks on the helper's output.
            # NOTE(review): 512 is hard-coded rather than args.seq_len - confirm.
            assert len(window) == 512
            assert all(type(y) is int for y in window)
            out_f.write(json.dumps([window, label_ids]) + '\n')


def preprocess_data(args):
    """Tokenize every input file and write examples with binarized labels.

    The function makes two passes over ``args.input_files``:

    1. Count examples per file and label frequencies, fit the label
       binarizer(s) and write the label list to ``args.labels_out`` (plus
       ``args.label_mapping`` and ``args.full_labels`` when
       ``args.top_n_labels > 0``). When ``args.labels_in`` is set, the
       binarizer is fitted on the labels loaded from that JSON file instead
       of the counted ones.
    2. Tokenize the examples in batches and write them as JSON lines to the
       file named by ``generate_out_filename``. The first line of each
       output file is ``[n_examples, n_labels]``.

    Args:
        args: Namespace providing ``input_files``, ``task``, ``top_n_labels``,
            ``labels_in``, ``label_mapping``, ``full_labels``, ``labels_out``,
            ``processes``, ``vocab``, ``do_lower_case``, ``seq_len``,
            ``window_start``, ``all_blocks`` and ``n_blocks``.
    """
    label_counter = Counter()
    examples_per_file = Counter()

    print("Reading all files for labels.")
    for input_file in args.input_files:
        with xopen(input_file, "rt") as f:
            for _example, labels in input_readers[args.task](f):
                examples_per_file[input_file] += 1
                label_counter.update(labels)

    if args.top_n_labels > 0:
        mlb_full = MultiLabelBinarizer(sparse_output=True)
        # fit() takes an iterable of label *collections*. Passing the keys
        # directly made every label a sample and its elements the classes,
        # so wrap all labels in a single sample (same as for `mlb` below).
        mlb_full = mlb_full.fit([list(label_counter)])
        label_counter = dict(label_counter.most_common(args.top_n_labels))

    # Load the predefined label list once, closing the file afterwards.
    use_given_labels = bool(args.labels_in)
    given_labels = None
    if use_given_labels:
        with open(args.labels_in) as labels_file:
            given_labels = json.load(labels_file)

    mlb = MultiLabelBinarizer(sparse_output=True)
    # Passing a list in a list because that's what the function wants.
    if use_given_labels:
        mlb = mlb.fit([given_labels])
        n_labels = len(given_labels)
    else:
        mlb = mlb.fit([list(label_counter)])
        n_labels = len(label_counter)

    # Save list of partial -> full mapping if doing top N labels.
    if args.top_n_labels > 0:

        label_mapping = np.where(np.in1d(mlb_full.classes_,
                                         mlb.classes_))[0].tolist()

        with xopen(args.label_mapping, "wt") as f:
            f.write(json.dumps(label_mapping))

        # Also save the full labels.
        with xopen(args.full_labels, "wt") as f:
            f.write(json.dumps(list(mlb_full.classes_)))

    # Save list of labels.
    with xopen(args.labels_out, "wt") as f:
        f.write(json.dumps(list(mlb.classes_)))

    # Set parallel tokenization thread count.
    # NOTE(review): the tokenizers imports are kept below this assignment,
    # presumably so the library sees the variable when loaded - confirm.
    os.environ["RAYON_NUM_THREADS"] = str(args.processes)

    from tokenizers import Tokenizer, decoders, trainers
    from tokenizers.models import WordPiece
    from tokenizers.normalizers import BertNormalizer
    from tokenizers.pre_tokenizers import BertPreTokenizer
    from tokenizers.processors import BertProcessing

    if args.task == 'cafa':
        # Define our custom tokenizer.
        # It is exactly the same as the default BERT tokenizer, except for max_input_chars_per_word
        # being 20000 instead of 100. This tokenizer is very slow on the long protein sequences.
        tokenizer = WordPiece.from_files(args.vocab,
                                         unk_token="[UNK]",
                                         max_input_chars_per_word=20000)
        tokenizer = Tokenizer(tokenizer)
        tokenizer.add_special_tokens(["[UNK]", "[SEP]", "[CLS]"])
        tokenizer.normalizer = BertNormalizer(lowercase=args.do_lower_case)
        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.post_processor = BertProcessing(
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
            ("[CLS]", tokenizer.token_to_id("[CLS]")))
        tokenizer.decoder = decoders.WordPiece(prefix='##')
    else:
        tokenizer = BertWordPieceTokenizer(args.vocab,
                                           lowercase=args.do_lower_case)

    tokenizer.enable_padding(max_length=args.seq_len)
    tokenizer.enable_truncation(max_length=args.seq_len)

    for input_file in args.input_files:
        with xopen(input_file, 'rt') as in_f:

            file_name = generate_out_filename(input_file, args)

            with xopen(file_name, "wt") as out_f:
                print("Processing to: ", file_name)

                # Write the shape as the first row, useful for the finetuning.
                out_f.write(
                    json.dumps((examples_per_file[input_file], n_labels)) +
                    '\n')

                batch_size = min(examples_per_file[input_file],
                                 args.processes * 100)
                example_batch = []
                labels_batch = []
                doc_idx_batch = []

                with ParallelGenerator(input_readers[args.task](in_f),
                                       max_lookahead=batch_size) as g:
                    # window_start is divided by 100 before being passed to
                    # get_window.
                    start_pos = int(args.window_start) / 100
                    for doc_idx, (example, labels) in enumerate(g):
                        example_batch.append(example)
                        labels_batch.append(labels)
                        doc_idx_batch.append(doc_idx)

                        if len(example_batch) == batch_size:
                            _write_block_batch(out_f, tokenizer, mlb, args,
                                               start_pos, example_batch,
                                               labels_batch, doc_idx_batch)
                            example_batch = []
                            labels_batch = []
                            # Must be cleared with the other two lists,
                            # otherwise later batches are paired with the
                            # first batch's document indices.
                            doc_idx_batch = []

                    # Write out whatever is left in the last smaller batch.
                    _write_block_batch(out_f, tokenizer, mlb, args, start_pos,
                                       example_batch, labels_batch,
                                       doc_idx_batch)