Example #1
class BPETokenization(Target):
    def __init__(self, tokenizer_dir: str,
                 max_line_length: Union[int, None] = 50,
                 padding_id: int = 0):
        super().__init__()
        assert exists(join(tokenizer_dir, "vocab.json")), f"vocab.json file missing in '{tokenizer_dir}'"
        assert exists(join(tokenizer_dir, "merges.txt")), f"merges.txt file missing in '{tokenizer_dir}'"

        self.tokenizer = ByteLevelBPETokenizer(vocab_file=join(tokenizer_dir, "vocab.json"),
                                               merges_file=join(tokenizer_dir, "merges.txt"))

        self.max_line_length = max_line_length
        self.padding_id = padding_id
        self.char_re = re.compile(rf"[^{string.printable}]")

    def __call__(self, document):
        assert isinstance(document, dict), f"wrong input of type {type(document)} to tokenizer"

        processed_text = self.char_re.sub("", document["text"])
        lines = [line for line in processed_text.split("\n") if len(line) > 0]
        encoded_lines = []
        for line in self.tokenizer.encode_batch(lines):
            if self.max_line_length is not None:
                encoded_lines.append(line.ids[:self.max_line_length])
            else:
                encoded_lines.append(line.ids)

        max_line_length = max(len(line) for line in encoded_lines)
        for j in range(len(encoded_lines)):
            amount_padding = max_line_length - len(encoded_lines[j])
            encoded_lines[j] = [self.padding_id] * amount_padding + encoded_lines[j]

        document["input_ids"] = encoded_lines

        return document
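A minimal usage sketch for the target above; the tokenizer directory and the sample text are hypothetical, and the directory is assumed to already contain vocab.json and merges.txt from an earlier training run.
tokenization = BPETokenization("models/bpe_tokenizer", max_line_length=50)
document = {"text": "First line of the paper\nA second, slightly longer line"}
document = tokenization(document)
# every inner list has the same length, left-padded with padding_id
print(document["input_ids"])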
Example #2
File: data.py  Project: ShenDezhou/CAIL2021
    def __init__(self, evaluate: bool = False):
        tokenizer = ByteLevelBPETokenizer(
            "./model/bbpe/vocab.json",
            "./model/bbpe/merges.txt",
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)
        # or use the RobertaTokenizer from `transformers` directly.

        self.examples = []

        src_files = Path("./data/").glob("*_eval.csv") if evaluate else Path(
            "./data/").glob("*_train.csv")
        for src_file in src_files:
            print("🔥", src_file)
            with open(src_file, 'r', encoding='utf-8') as f:
                for index, line in enumerate(f):
                    self.examples += [
                        x.ids for x in tokenizer.encode_batch([line])
                    ]
                    if index % 10000 == 0:
                        print(src_file, index // 10000)
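The in-code comment points at the `transformers` alternative; a rough equivalent built on RobertaTokenizerFast over the same vocab/merges files might look like the sketch below (the sample strings are placeholders, and the default <s>/</s> handling is assumed to stand in for the BertProcessing setup above).
from transformers import RobertaTokenizerFast

hf_tokenizer = RobertaTokenizerFast(
    vocab_file="./model/bbpe/vocab.json",
    merges_file="./model/bbpe/merges.txt",
)
# RobertaTokenizerFast adds <s>/</s> by itself, comparable to the BertProcessing post-processor
batch = hf_tokenizer(["some text", "another line"], truncation=True, max_length=512)
print(batch["input_ids"])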
Example #3
    def __init__(self, evaluate: bool = False):
        tokenizer = ByteLevelBPETokenizer(
            "models/faberto/vocab.json",
            "models/faberto/merges.txt",
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)
        # or use the RobertaTokenizer from `transformers` directly.

        self.examples = []

        src_files = Path("./data/").glob("*-eval.txt") if evaluate else Path(
            "./data/").glob("*-train.txt")
        for src_file in src_files:
            print("🔥", src_file)
            lines = src_file.open(encoding="utf-8").read().splitlines()
            self.examples += [x.ids for x in tokenizer.encode_batch(lines)]
def inference():

    from tokenizers import ByteLevelBPETokenizer
    from tokenizers.processors import BertProcessing
    '''
    initialize tokenizer with saved model files
    '''
    tokenizer = ByteLevelBPETokenizer(
        "./tok_checkpoints/tokenizer_model-vocab.json",
        "./tok_checkpoints/tokenizer_model-merges.txt",
    )
    '''
    optional step : preprocess the strings
    Ex: add <s> and </s> as BOS and EOS tokens to the string
        pad string to some max length and truncate string to some max length
    '''
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_padding(pad_token='<pad>',
                             pad_id=tokenizer.get_vocab()['<pad>'],
                             length=20)
    tokenizer.enable_truncation(max_length=20)
    '''
    tokenize/encode strings
    '''
    input_ids = tokenizer.encode("Hello World, Whats up!!!").ids
    print("input ids", input_ids)
    tokens = tokenizer.encode("Hello World, Whats up!!!").tokens
    print("tokens", tokens)
    '''
    tokenize/encode batch of string
    '''
    batch_tokenized = tokenizer.encode_batch(
        ["Hello World, Whats up!!!", "Whata whata wa wada wada"])
    input_ids = [i.ids for i in batch_tokenized]
    print("input ids", input_ids)
    tokens = [i.tokens for i in batch_tokenized]
    print("tokens", tokens)
Example #5
    def __init__(self, evaluate: bool = False):
        tokenizer = ByteLevelBPETokenizer(
            vocab_file,
            merges_file
        )

        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )

        tokenizer.enable_truncation(max_length=512)

        self.examples = []

        src_files = Path(data_folder).glob("**/*.txt")

        for src_file in src_files:
            print("🇩🇰", src_file)

            lines = src_file.read_text(encoding="utf-8").splitlines()
            self.examples += [x.ids for x in tokenizer.encode_batch(lines)]
Example #6
class WikiText2DataModule(pl.LightningDataModule):
    def __init__(self,
                 data_dir: str = 'data/wikitext-2',
                 train_batch_size: int = 64,
                 val_batch_size: int = 64,
                 dataloader_num_workers: int = 4,
                 seq_length: int = 64,
                 vocab_size=30000):
        super().__init__()
        self.train_batch_size = train_batch_size
        self.val_batch_size = val_batch_size
        self.dataloader_num_workers = dataloader_num_workers
        self.seq_length = seq_length
        self.vocab_size = vocab_size

        self.tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)

    def prepare_data(self, *args, **kwargs):
        dataset = load_dataset("wikitext",
                               "wikitext-103-raw-v1",
                               split="train+test+validation")
        column_names = dataset.column_names

        def batch_iterator(batch_size=1000):
            for i in range(0, len(dataset), batch_size):
                yield dataset[i:i + batch_size]["text"]

        if (not os.path.exists("data/wiki-vocab.json")) or (
                not os.path.exists("data/wiki-merges.txt")):
            print('TRAIN TOKENIZER')
            self.tokenizer.train_from_iterator(batch_iterator(),
                                               vocab_size=self.vocab_size)
            self.tokenizer.save_model("data/", "wiki")
        else:
            self.tokenizer = ByteLevelBPETokenizer("data/wiki-vocab.json",
                                                   "data/wiki-merges.txt",
                                                   add_prefix_space=True)

        dataset = load_dataset("wikitext", "wikitext-103-raw-v1")

        def tokenize_function(examples):
            return {
                'input_ids':
                list(
                    map(lambda x: x.ids,
                        self.tokenizer.encode_batch(examples['text'])))
            }

        self.tokenized_dataset = dataset.map(tokenize_function,
                                             batched=True,
                                             remove_columns=column_names,
                                             num_proc=4)

    def setup(self, stage: Optional[str] = None):
        # datasets = load_dataset('text',
        #                         data_dir=self.data_dir,
        #                         data_files={'train': 'wiki.train.small.raw',
        #                                     'valid': 'wiki.valid.small.raw'})

        def group_text(examples):
            # Concatenate all texts.
            concatenated_examples = {
                k: sum(examples[k], [])
                for k in examples.keys()
            }
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
            # customize this part to your needs.
            total_length = (total_length // self.seq_length) * self.seq_length
            # Split by chunks of max_len.
            result = {
                k: [
                    t[i:i + self.seq_length]
                    for i in range(0, total_length, self.seq_length)
                ]
                for k, t in concatenated_examples.items()
            }
            result["labels"] = result["input_ids"].copy()
            return result

        lm_dataset = self.tokenized_dataset.map(group_text,
                                                batched=True,
                                                num_proc=4)

        train_dataset = lm_dataset['train']
        eval_dataset = lm_dataset['validation']
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.test_dataset = lm_dataset['test']

    def collate_fn(self, features):
        batch = {}
        # key names match the 'input_ids' / 'labels' columns built in prepare_data/setup
        batch['input_ids'] = torch.tensor([f['input_ids'] for f in features],
                                          dtype=torch.long)
        batch['labels'] = batch['input_ids'].clone()
        return batch

    def train_dataloader(self) -> DataLoader:
        return DataLoader(self.train_dataset,
                          batch_size=self.train_batch_size,
                          collate_fn=self.collate_fn,
                          num_workers=self.dataloader_num_workers)

    def val_dataloader(self) -> DataLoader:
        return DataLoader(self.eval_dataset,
                          batch_size=self.val_batch_size,
                          collate_fn=self.collate_fn,
                          num_workers=self.dataloader_num_workers)

    def test_dataloader(self) -> DataLoader:
        return DataLoader(self.test_dataset,
                          batch_size=self.val_batch_size,
                          collate_fn=self.collate_fn,
                          num_workers=self.dataloader_num_workers)
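A hedged sketch of plugging the data module into a Lightning run; MyLanguageModel is a placeholder for whatever LightningModule consumes the input_ids/labels batches produced by collate_fn.
import pytorch_lightning as pl

dm = WikiText2DataModule(train_batch_size=64, seq_length=64)
model = MyLanguageModel(vocab_size=dm.vocab_size)  # hypothetical LightningModule
trainer = pl.Trainer(max_epochs=1)
trainer.fit(model, datamodule=dm)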
Example #7
tokenizer.train(files=paths,
                vocab_size=52_000,
                min_frequency=2,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])
end = time.time()
print("L'entraînement a pris :", round((end - start) / 60, 2), ' minutes.')
# Save files to disk
tokenizer.save_model(chemin_modele, "Token_BERT_MTE")
# %%
print(tokenizer.encode("environnement").tokens, tokenizer.encode("environnement").ids)
# %%
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)
#%%
examples = []
for src_file in Path(chemin_donnees).glob("**/*.txt"):
    print("🔥", src_file)
    lines = src_file.read_text(encoding="utf-8").splitlines()
    examples += [x.ids for x in tokenizer.encode_batch(lines)]

# %%
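save_model(chemin_modele, "Token_BERT_MTE") writes Token_BERT_MTE-vocab.json and Token_BERT_MTE-merges.txt into chemin_modele, so a later session can presumably rebuild the tokenizer along these lines:
from os.path import join
from tokenizers import ByteLevelBPETokenizer

tokenizer = ByteLevelBPETokenizer(
    join(chemin_modele, "Token_BERT_MTE-vocab.json"),
    join(chemin_modele, "Token_BERT_MTE-merges.txt"),
)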
Example #8
class ReferenceParser(Target):
    def __init__(self, model_dir, device: str = "cpu"):
        super().__init__()
        self.model_dir = abspath(model_dir)

        assert exists(self.model_dir
                      ), f"model directory '{self.model_dir}' does not exist"
        assert exists(
            join(self.model_dir, "config.json"
                 )), f"configuration file does not exist in {self.model_dir}"
        assert exists(join(
            self.model_dir,
            "merges.txt")), f"merges file does not exist in {self.model_dir}"
        assert exists(join(
            self.model_dir,
            "weights.pt")), f"model file does not exist in {self.model_dir}"
        assert exists(join(
            self.model_dir,
            "vocab.json")), f"vocab file does not exist in {self.model_dir}"

        with open(join(self.model_dir, "config.json"), "r") as config_file:
            self.model_config = json.load(config_file)
        if not torch.cuda.is_available():
            device = "cpu"
        self.device = torch.device(device)
        self.model = LSTMTagger(
            vocab_size=self.model_config["vocab_size"],
            embedding_dim=self.model_config["embedding_dim"],
            lstm_dim=self.model_config["lstm_dim"],
            device=device).to(self.device)
        weights = torch.load(join(self.model_dir, "weights.pt"),
                             map_location=device)
        self.model.load_state_dict(weights)
        self.model = self.model.eval()
        self.tokenizer = ByteLevelBPETokenizer(
            vocab_file=join(self.model_dir, "vocab.json"),
            merges_file=join(self.model_dir, "merges.txt"),
            lowercase=self.model_config["lowercase"])

        # self.space_re = re.compile(r"\s+")
        self.refer_re = re.compile(
            r"([(\[](\w[^\d()\[\]]+\s[(\[]?[12][0-9]{3}[A-Za-z]?[(\[]?[;,\s]*)+[)\]])"
        )
        self.et_al_re = re.compile(
            r"[^\d\s]+\set\.?\s*al\.?,?\s+([(\[][12][0-9]{3}[)\]])?")
        self.and_re = re.compile(
            r"([^\d\s]+,?\s+((and)|&)\s+)+[^\d\s]+[,\s]*([(\[][12][0-9]{3}[A-Za-z]?[)\]]|[12][0-9]{3}[A-Za-z]?)"
        )
        self.name_re = re.compile(r"[A-Z][^A-Z\d\s()\[\],;\.]+")
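        # Illustrative (unverified) strings each pattern is meant to catch:
        #   refer_re  -> parenthesised citations such as "(Smith 2019)"
        #   et_al_re  -> mentions like "Smith et al. (2019)"
        #   and_re    -> author lists like "Smith and Jones, 2019"
        #   name_re   -> capitalised surnames inside a matched reference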

    def __call__(self, document):
        assert isinstance(
            document, dict
        ), f"wrong input of type {type(document)} to reference extractor"

        try:
            self.find_reference_blocks(document)
        except RuntimeError:
            logging.error(
                f"could not parse reference blocks of {document['name']}")

        text_references = self.find_text_references(document)

        # remove references from text, sort to match longest first
        sorted_ref = sorted(text_references,
                            key=lambda ref: len(ref[0]),
                            reverse=True)
        for reference in sorted_ref:
            self.clean_text(document, reference[0])

        return document

    def find_reference_blocks(self, document):
        lines = document["text_cleaned"].split("\n")
        labels = []

        # chunk text and prepare for model
        for i in range(0, len(lines), self.model_config["chunk_size"]):
            chunk = [
                x.ids for x in self.tokenizer.encode_batch(
                    lines[i:i + self.model_config["chunk_size"]])
            ]
            # padding
            max_line = max(len(line) for line in chunk)
            for j in range(len(chunk)):
                chunk[j] = [0] * (max_line - len(chunk[j])) + chunk[j]
            chunk = torch.Tensor([chunk]).long().to(self.device)

            predictions = self.model.forward(chunk, i, len(lines))
            labels.extend(torch.argmax(predictions[0], -1))

        keep_lines = []
        last_label, last_block = 0, []
        for line, label in zip(lines, labels):
            if last_label == 1 and label == 0 and len(last_block) > 0:
                document["entities"][Entity.REFERENCE].add(
                    "\n".join(last_block))
                last_block = []
            if label == 0:
                keep_lines.append(line)
            else:
                last_block.append(line)
            last_label = label
        if len(last_block) > 0:
            document["entities"][Entity.REFERENCE].add("\n".join(last_block))

        document["text_cleaned"] = "\n".join(keep_lines)

    def find_text_references(self, document):
        # find references based on different regular expressions
        references = []
        for x in self.refer_re.finditer(document["text_cleaned"]):
            start, end = x.span()
            references.append(
                (document["text_cleaned"][start:end], start, end))
        for x in self.et_al_re.finditer(document["text_cleaned"]):
            start, end = x.span()
            references.append(
                (document["text_cleaned"][start:end], start, end))
        for x in self.and_re.finditer(document["text_cleaned"]):
            start, end = x.span()
            references.append(
                (document["text_cleaned"][start:end], start, end))

        # remove overlapping references
        for refer_a, refer_b in combinations(references, 2):
            if refer_a[1] <= refer_b[1] < refer_a[2] and refer_b[2] <= refer_a[2]:
                references.remove(refer_a)
                references.remove(refer_b)
                start = min(refer_a[1], refer_b[1])
                end = max(refer_a[2], refer_b[2])
                references.append(
                    (document["text_cleaned"][start:end], start, end))

        # extract names of authors in references and add reference entity
        document["references_authors"] = set()
        for reference in references:
            for x in self.name_re.findall(reference[0]):
                document["references_authors"].add(x)
            document["entities"][Entity.REFERENCE].add(reference[0])

        return references
class PreAbstractParser(Target):
    def __init__(self, model_dir, device="cpu"):
        super().__init__()
        self.model_dir = abspath(model_dir)

        assert exists(self.model_dir
                      ), f"model directory '{self.model_dir}' does not exist"
        assert exists(join(self.model_dir, "classes.json")
                      ), f"classes file does not exist in {self.model_dir}"
        assert exists(
            join(self.model_dir, "config.json"
                 )), f"configuration file does not exist in {self.model_dir}"
        assert exists(join(
            self.model_dir,
            "merges.txt")), f"merges file does not exist in {self.model_dir}"
        assert exists(join(
            self.model_dir,
            "weights.pt")), f"weights file does not exist in {self.model_dir}"
        assert exists(join(
            self.model_dir,
            "vocab.json")), f"vocab file does not exist in {self.model_dir}"

        with open(join(self.model_dir, "classes.json"), "r") as classes_file:
            self.class_to_index = json.load(classes_file)
            self.index_to_class = {
                v: k
                for k, v in self.class_to_index.items()
            }
        with open(join(self.model_dir, "config.json"), "r") as config_file:
            self.model_config = json.load(config_file)
        if not torch.cuda.is_available():
            device = "cpu"
        self.device = torch.device(device)
        self.model = LSTMTagger(
            vocab_size=self.model_config["vocab_size"],
            embedding_dim=self.model_config["embedding_dim"],
            lstm_dim=self.model_config["lstm_dim"],
            n_classes=len(self.class_to_index)).to(self.device)
        weights = torch.load(join(self.model_dir, "weights.pt"),
                             map_location=device)
        self.model.load_state_dict(weights)
        self.model = self.model.eval()
        self.tokenizer = ByteLevelBPETokenizer(
            vocab_file=join(self.model_dir, "vocab.json"),
            merges_file=join(self.model_dir, "merges.txt"),
            lowercase=self.model_config["lowercase"])

        self.noise_re = re.compile(r"[^A-Za-z ]")

        self.department_re = re.compile(r"(?:,\s*)?[^,]*Department[^,]*(?:,)",
                                        re.IGNORECASE)

    def __call__(self, document):
        assert isinstance(
            document,
            dict), f"wrong input of type {type(document)} to author parser"

        try:
            lines, labels = self.annotate_lines(
                document["text"][:document["abstract_start"]])
        except RuntimeError:
            logging.error(
                f"could not parse pre abstract of {document['name']}")
            return document

        keep_lines = []
        for line, label in zip(lines, labels):
            if "meta" in document and self.noise_re.sub(
                    "", line) == self.noise_re.sub("",
                                                   document["meta"]["title"]):
                keep_lines.append(line)
            elif label == "other":
                keep_lines.append(line)
            else:
                self.create_annotation(document, line, label)

        if "meta" in document:
            keep_lines = self.post_process_lines(document, keep_lines)

        document["text_cleaned"] = "\n".join(
            keep_lines) + document["text"][document["abstract_start"]:]

        return document

    def annotate_lines(self, text):
        lines = text.split("\n")

        tokenized = [x.ids for x in self.tokenizer.encode_batch(lines)]

        # padding
        max_tokens = max(len(sentence) for sentence in tokenized)
        for sentence in range(len(tokenized)):
            for _ in range(max_tokens - len(tokenized[sentence])):
                tokenized[sentence].insert(0, 0)

        tensor = torch.tensor([tokenized]).to(self.device)
        predictions = self.model.forward(tensor)
        predictions = torch.argmax(predictions[0], -1)
        predictions = [
            self.index_to_class[prediction.item()]
            for prediction in predictions
        ]

        return lines, predictions

    def create_annotation(self, document, line, label):
        if label == "private":
            document["entities"][Entity.PERSONAL_DATA].add(line)
        elif label == "author":
            document["entities"][Entity.AUTHOR].add(line)
        elif label == "email":
            document["entities"][Entity.EMAIL].add(line)
        elif label == "organization":
            for department_mention in self.department_re.findall(line):
                document["entities"][Entity.PERSONAL_DATA].add(
                    department_mention)
            line = self.department_re.sub("", line)
            document["entities"][Entity.INSTITUTION_COMPANY].add(line)
        else:
            logging.error(f"label '{label}' not recognized in {type(self)}")
            raise ValueError(f"label '{label}' not recognized")

    def post_process_lines(self, document, lines):
        keep_lines = []
        for line in lines:
            mention = False

            try:
                for author in document["meta"]["authors"]:
                    if re.search(
                            "[\s\-]*".join(
                                re.escape(name) for name in author.split()),
                            line, re.IGNORECASE):
                        mention = True
                        document["entities"][Entity.AUTHOR].add(line)

                for organization in document["meta"]["orgs"]:
                    if re.search(
                            "[\s\-]*".join(
                                re.escape(name)
                                for name in organization["name"].split()),
                            line, re.IGNORECASE):
                        mention = True
                        document["entities"][Entity.INSTITUTION_COMPANY].add(
                            line)
            except KeyError:
                logging.error(
                    f"conferences meta file misses key for {document['name']}")

            if not mention:
                keep_lines.append(line)

        return keep_lines
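A minimal, heavily assumed driver for ReferenceParser; the document schema (the "name", "text_cleaned" and "entities" keys and the Entity enum) comes from the surrounding project and is only sketched here.
from collections import defaultdict

parser = ReferenceParser(model_dir="models/reference_parser", device="cuda")
document = {
    "name": "paper-001",
    "text_cleaned": open("paper-001.txt", encoding="utf-8").read(),
    "entities": defaultdict(set),  # keyed by Entity members in the real pipeline
}
document = parser(document)
print(document["entities"][Entity.REFERENCE])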