Example #1
def get_datasets(max_sequence_length=10000, min_frequency=2):
    train = fetch_20newsgroups(subset='train',
                               shuffle=True,
                               random_state=57,
                               remove=('headers', 'footers', 'quotes'))
    test = fetch_20newsgroups(subset='test',
                              shuffle=True,
                              random_state=57,
                              remove=('headers', 'footers', 'quotes'))

    tokenizer = MosesTokenizer()
    train_target = []
    train_text = []
    for target, text in zip(train.target, train.data):
        for label in target_labels:
            if train.target_names[target].startswith(label):
                target = np.zeros(len(target_labels), dtype=np.float32)
                target[target_labels.index(label)] = 1.0
                train_target.append(target)
                train_text.append(
                    [token.lower() for token in tokenizer.tokenize(text)])
                break
    train_target = np.array(train_target, dtype=np.float32)

    test_target = []
    test_text = []
    for target, text in zip(test.target, test.data):
        for label in target_labels:
            if test.target_names[target].startswith(label):
                target = np.zeros(len(target_labels), dtype=np.float32)
                target[target_labels.index(label)] = 1.0
                test_target.append(target)
                test_text.append(
                    [token.lower() for token in tokenizer.tokenize(text)])
                break
    test_target = np.array(test_target, dtype=np.float32)

    vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(
        max_sequence_length, min_frequency=min_frequency)
    corpus = train_text + test_text
    corpus = [' '.join(tokens) for tokens in corpus]
    vocab_processor.fit(corpus)

    train_text = np.array(list(
        vocab_processor.transform([' '.join(tokens)
                                   for tokens in train_text])),
                          dtype=np.int32)
    test_text = np.array(list(
        vocab_processor.transform([' '.join(tokens) for tokens in test_text])),
                         dtype=np.int32)

    train = tf.data.Dataset.from_tensor_slices({
        "target": train_target,
        "text": train_text
    })
    test = tf.data.Dataset.from_tensor_slices({
        "target": test_target,
        "text": test_text
    })

    return (train, test, len(target_labels), len(vocab_processor.vocabulary_))
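A minimal usage sketch for the function above, assuming the imports the original module provides (sklearn, numpy, tensorflow, mosestokenizer). Note that `target_labels` is a module-level global the snippet relies on; the value below is only an assumption for illustration.

# Hypothetical usage; `target_labels` must exist in the enclosing module.
target_labels = ['comp', 'rec', 'sci', 'talk']

train_ds, test_ds, num_classes, vocab_size = get_datasets(max_sequence_length=2000)
print(num_classes, vocab_size)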
Example #2
def get_tokenizer(tokenizer, language='en'):
    r"""
    Generate tokenizer function for a string sentence.

    Arguments:
        tokenizer: the name of tokenizer function. If None, it returns split()
            function, which splits the string sentence by space.
            If basic_english, it returns _basic_english_normalize() function,
            which normalize the string first and split by space. If a callable
            function, it will return the function. If a tokenizer library
            (e.g. spacy, moses, toktok, revtok, subword), it returns the
            corresponding library.
        language: the language of the tokenizer. Default: en

    Examples:
        >>> import torchtext
        >>> from torchtext.data import get_tokenizer
        >>> tokenizer = get_tokenizer("basic_english")
        >>> tokens = tokenizer("You can now install TorchText using pip!")
        >>> tokens
        ['you', 'can', 'now', 'install', 'torchtext', 'using', 'pip', '!']

    """

    # default tokenizer is string.split(), added as a module function for serialization
    if tokenizer is None:
        return _split_tokenizer

    if tokenizer == "basic_english":
        if language != 'en':
            raise ValueError("Basic normalization is only available for Enlish(en)")
        return _basic_english_normalize

    # simply return if a function is passed
    if callable(tokenizer):
        return tokenizer

    if tokenizer == "spacy":
        try:
            import spacy
            spacy = spacy.load(language)
            return partial(_spacy_tokenize, spacy=spacy)
        except ImportError:
            print("Please install SpaCy. "
                  "See the docs at https://spacy.io for more information.")
            raise
        except AttributeError:
            print("Please install SpaCy and the SpaCy {} tokenizer. "
                  "See the docs at https://spacy.io for more "
                  "information.".format(language))
            raise
    elif tokenizer == "moses":
        try:
            from sacremoses import MosesTokenizer
            moses_tokenizer = MosesTokenizer()
            return moses_tokenizer.tokenize
        except ImportError:
            print("Please install SacreMoses. "
                  "See the docs at https://github.com/alvations/sacremoses "
                  "for more information.")
            raise
    elif tokenizer == "toktok":
        try:
            from nltk.tokenize.toktok import ToktokTokenizer
            toktok = ToktokTokenizer()
            return toktok.tokenize
        except ImportError:
            print("Please install NLTK. "
                  "See the docs at https://nltk.org  for more information.")
            raise
    elif tokenizer == 'revtok':
        try:
            import revtok
            return revtok.tokenize
        except ImportError:
            print("Please install revtok.")
            raise
    elif tokenizer == 'subword':
        try:
            import revtok
            return partial(revtok.tokenize, decap=True)
        except ImportError:
            print("Please install revtok.")
            raise
    raise ValueError("Requested tokenizer {}, valid choices are a "
                     "callable that takes a single string as input, "
                     "\"revtok\" for the revtok reversible tokenizer, "
                     "\"subword\" for the revtok caps-aware tokenizer, "
                     "\"spacy\" for the SpaCy English tokenizer, or "
                     "\"moses\" for the NLTK port of the Moses tokenization "
                     "script.".format(tokenizer))
Example #3
# Initialize Indic NLP resources (assumes `from indicnlp import loader` earlier in the original script)
loader.load()
from sacremoses import MosesPunctNormalizer
from sacremoses import MosesTokenizer
from sacremoses import MosesDetokenizer
from collections import defaultdict

from tqdm import tqdm
from joblib import Parallel, delayed

from indicnlp.tokenize import indic_tokenize
from indicnlp.tokenize import indic_detokenize
from indicnlp.normalize import indic_normalize
from indicnlp.transliterate import unicode_transliterate

en_tok = MosesTokenizer(lang="en")
en_normalizer = MosesPunctNormalizer()


def preprocess_line(line, normalizer, lang, transliterate=False):
    if lang == "en":
        return " ".join(
            en_tok.tokenize(en_normalizer.normalize(line.strip()),
                            escape=False))
    elif transliterate:
        # line = indic_detokenize.trivial_detokenize(line.strip(), lang)
        return unicode_transliterate.UnicodeIndicTransliterator.transliterate(
            " ".join(
                indic_tokenize.trivial_tokenize(
                    normalizer.normalize(line.strip()), lang)),
            lang,
Example #4
def get_sentencepiece(cache_dir: PathOrStr,
                      load_text,
                      pre_rules: ListRules = None,
                      post_rules: ListRules = None,
                      vocab_size: int = 30000,
                      model_type: str = 'unigram',
                      input_sentence_size: int = 1E7,
                      use_moses=False,
                      lang='en'):
    try:
        import sentencepiece as spm
    except ImportError:
        raise Exception(
            'sentencepiece module is missing: run `pip install sentencepiece`')

    cache_dir = pathlib.Path(cache_dir)
    pre_rules = pre_rules if pre_rules is not None else defaults.text_pre_rules
    post_rules = post_rules if post_rules is not None else defaults.text_post_rules

    special_cases = defaults.text_spec_tok

    if not os.path.isfile(cache_dir / 'spm.model') or not os.path.isfile(
            cache_dir / f'itos.pkl'):
        # load the text from the train tokens file
        text = load_text()
        text = filter(lambda x: len(x.rstrip(" ")), text)
        text = (reduce(lambda t, rule: rule(t), pre_rules, line)
                for line in text)
        if use_moses:
            mt = MosesTokenizer(lang)
            splitter = lambda t: mt.tokenize(t, return_str=False, escape=False)
        else:
            splitter = lambda t: t.split()

        def cleanup_n_postprocess(t):
            t = splitter(t)
            for r in post_rules:
                t = r(t)
            return ' '.join(t)

        text = map(cleanup_n_postprocess, text)
        raw_text_path = cache_dir / 'all_text.txt'
        with open(raw_text_path, 'w') as f:
            f.write("\n".join(text))

        sp_params = [
            f"--input={raw_text_path}", f"--character_coverage=1.0",
            f"--unk_id={len(defaults.text_spec_tok)}", f"--pad_id=-1",
            f"--bos_id=-1", f"--eos_id=-1", f"--max_sentence_length=20480",
            f"--input_sentence_size={int(input_sentence_size)}",
            f"--user_defined_symbols={','.join(special_cases)}",
            f"--model_prefix={cache_dir/'spm'}",
            f"--vocab_size={vocab_size} --model_type={model_type}"
        ]
        spm.SentencePieceTrainer.Train(" ".join(sp_params))

        with open(cache_dir / 'spm.vocab', 'r') as f:
            vocab = [line.split('\t')[0] for line in f.readlines()]

        pickle.dump(vocab, open(cache_dir / f'itos.pkl', 'wb'))
    # todo add post rules
    vocab = Vocab(pickle.load(open(cache_dir / f'itos.pkl', 'rb')))
    # We cannot use lambdas or local methods here, since `tok_func` needs to be
    # pickle-able in order to be called in subprocesses when multithread tokenizing
    tokenizer = SentencePieceTokenizer(cache_dir / 'spm.model',
                                       use_moses=use_moses,
                                       lang=lang,
                                       pre_rules=pre_rules,
                                       post_rules=post_rules)
    return {'tokenizer': tokenizer, 'vocab': vocab}
Example #5
import logging
import os
from typing import List

import numpy as np
from sacremoses import MosesTokenizer

from video_dialogue_model.data.utils import (
    sent_num_file,
    offsets_file,
    src_file
)


os.environ['CUDA_VISIBLE_DEVICES'] = "0,"
TOKENIZER = MosesTokenizer(lang='en')


def load_origin_texts(data_dir, split="train") -> List[List[str]]:
    """load origin text data"""
    output = []
    ori_sen = []
    input_path = os.path.join(data_dir, f'{split}.origin.txt')
    logging.info(f"Loading origin data from {input_path}")
    with open(input_path, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            line.replace("\u2013", "-")
            ori_sen.append(line)
Example #6
 def __init__(self):
     from sacremoses import MosesTokenizer
     self._tokenizer = MosesTokenizer()
Example #7
    def __init__(self):
        super(RunHP, self).__init__()

        #   GENERAL  #
        self.seed = 42
        self.cuda_device_id = 6
        self.device = 'cpu'  # 'cuda' or 'cpu'
        self.training_logging_step = 50  # how often to print internal metrics
        self.epochs = 10  # if set to 0, the run jumps straight to evaluation
        self.learning_rate = 0.0005
        self.grads_clip = 0.25

        # GENERAL DATA RELATED #
        self.dataset = 'yelp'
        self.train_max_groups_per_batch = 6
        self.val_max_groups_per_batch = 13
        self.eval_max_groups_per_batch = 20
        self.max_rev_per_group = 8

        #   DATA SOURCES  #
        # `early_term` limits the number of chunks per epoch
        self.train_early_term = None
        self.val_early_term = None
        self.gener_early_term = 2

        #  GENERAL PATHS   #
        self.root_path = 'copycat'
        self.experiments_folder = 'first_run'
        self.output_dir = f'{self.root_path}/runs/{self.dataset}/{self.experiments_folder}'
        self.checkpoint_full_fn = 'checkpoint.tar'
        epc = ExperimentsPathController()
        self.output_path = epc(self.output_dir)
        self.checkpoint_path = f'{self.root_path}/artifacts/{self.dataset}/checkpoint.tar'
        self.tcaser_model_path = f'{self.root_path}/artifacts/{self.dataset}/data/tcaser.model'

        #   DATA PATHS  #
        self.base_data_path = f'data/{self.dataset}/'
        self.train_fp = comb_paths(self.base_data_path, "split/train/")
        self.val_fp = comb_paths(self.base_data_path, 'split/val/')
        self.words_vocab_fp = f'{self.root_path}/artifacts/{self.dataset}/data/words.txt'
        self.eval_dev_fp = comb_paths(self.base_data_path, 'gold', 'val.csv')
        self.eval_test_fp = comb_paths(self.base_data_path, 'gold', 'test.csv')

        #   ANNEALING   #
        self.c_m = 8.
        self.c_r = 0.8
        self.c_kl_ann_max_val = 1.
        self.c_kl_ann_batches = self.epochs * self.train_early_term if self.train_early_term else self.epochs * 10000
        self.z_m = 8.
        self.z_c = 0.8
        self.z_kl_ann_max_val = 1.
        self.z_kl_ann_batches = self.epochs * self.train_early_term if self.train_early_term else self.epochs * 10000

        #   DECODING/GENERATION  #
        self.beam_size = 5
        self.beam_len_norm = True
        self.beam_excl_words = []
        self.block_ngram_repeat = 3  # or None
        self.ngram_mirror_window = 3  # or None
        self.mirror_conjs = ["and", 'or', ',', 'but']  # or None
        self.block_consecutive = True
        self.min_gen_seq_len = 20

        #   POST-PROCESSING AND ANALYTICS #
        mt = MosesTokenizer()
        self.tok_func = partial(mt.tokenize, escape=False)
        self.sent_split_func = nltk.sent_tokenize
        dt = MosesDetokenizer()
        self.detok_func = partial(dt.detokenize, unescape=False)
        true_caser = MosesTruecaser(load_from=self.tcaser_model_path,
                                    is_asr=True)
        self.true_case_func = partial(true_caser.truecase,
                                      return_str=True,
                                      use_known=True)
        self.analytics_func = partial(ngram_seq_analysis,
                                      tokenizer=self.tok_func,
                                      sent_splitter=self.sent_split_func,
                                      n_grams_to_comp=(2, 3, 4))
Example #8
DESCAPE = MOSES_BDIR + 'deescape-special-chars.perl'
REM_NON_PRINT_CHAR = MOSES_BDIR + 'remove-non-printing-char.perl'

# Romanization (Greek only)
ROMAN_LC = 'python3 ' + LASER + '/source/lib/romanize_lc.py -l '

# Mecab tokenizer for Japanese
MECAB = LASER + '/tools-external/mecab'

# Dictionaries to keep all sacremoses objects for different languages
sacremoses_norm_punct = {
    'en': MosesPunctNormalizer(lang='en'),
    'sv': MosesPunctNormalizer(lang='sv')
}
sacremoses_tokenizers = {
    'en': MosesTokenizer(lang='en'),
    'sv': MosesTokenizer(lang='sv')
}

###############################################################################
#
# Tokenize a line of text
#
###############################################################################


def TokenLine(line, lang='en', lower_case=True, romanize=False):
    assert lower_case, 'lower case is needed by all the models'
    '''
    roman = lang if romanize else 'none'
    tok = check_output(
Example #9
import requests
import subprocess
import json
import os
import tempfile
from sacremoses import MosesTokenizer, MosesDetokenizer
from collections import defaultdict
from nltk import sent_tokenize

SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))

# PROCESSING TEXT
tokenizer_en = MosesTokenizer(lang='en')
detokenizer_en = MosesDetokenizer(lang='en')
tokenizer_es = MosesTokenizer(lang='es')
detokenizer_es = MosesDetokenizer(lang='es')

MAX_NUM_TOKENS = 10
SPLIT_DELIMITER = ';'
LANGUAGE_ISO_MAP = {'en': 'english', 'es': 'spanish'}


def tokenize(text, lang, return_str=True):
    if lang == 'en':
        text_tok = tokenizer_en.tokenize(text,
                                         return_str=return_str,
                                         escape=False)
        return text_tok
    elif lang == 'es':
        text_tok = tokenizer_es.tokenize(text,
                                         return_str=return_str,
Example #10
def prepare(source_lang, target_lang, source_file, target_file):

    df_source = pd.read_csv(source_file,
                            names=['Source'],
                            sep="\n",
                            quoting=csv.QUOTE_NONE,
                            error_bad_lines=False)
    df_target = pd.read_csv(target_file,
                            names=['Target'],
                            sep="\n",
                            quoting=csv.QUOTE_NONE,
                            error_bad_lines=False)
    df = pd.concat([df_source, df_target],
                   axis=1)  # Join the two dataframes along columns
    print("Dataframe shape (rows, columns):", df.shape)

    # Delete nan
    df = df.dropna()

    print("--- Rows with Empty Cells Deleted\t--> Rows:", df.shape[0])

    # Tokenize and lower-case text, and remove HTML.
    # Use str() to avoid (TypeError: expected string or bytes-like object)
    # Note: removing tags should be before removing empty cells because some cells might have only tags and become empty.

    html = re.compile('<.*?>|&lt;.*?&gt;')  # Maybe also &quot;

    mtoken_source = MosesTokenizer(lang=source_lang)

    token_source = lambda row: mtoken_source.tokenize(
        re.sub(html, '', str(row)), return_str=True).strip().lower()

    df["Source"] = df["Source"].apply(token_source)

    print("--- Tokenizing the Source Complete\t--> Rows:", df.shape[0])

    mtoken_target = MosesTokenizer(lang=target_lang)

    token_target = lambda row: mtoken_target.tokenize(
        re.sub(html, '', str(row)), return_str=True).strip().lower()

    df["Target"] = df["Target"].apply(token_target)

    print("--- Tokenizing the Target Complete\t--> Rows:", df.shape[0])

    # Drop duplicates
    df = df.drop_duplicates()

    print("--- Duplicates Deleted\t\t\t--> Rows:", df.shape[0])

    # Drop copy-source rows
    df["Source-Copied"] = df['Source'] == df['Target']
    #display(df.loc[df['Source-Copied'] == True]) # display only copy-sourced rows
    df = df.set_index(['Source-Copied'])

    try:  # To avoid (KeyError: '[True] not found in axis') if there are no source-copied cells
        df = df.drop([True])  # Boolean, not string, do not add quotes
    except KeyError:
        pass

    print("--- Source-Copied Rows Deleted\t\t--> Rows:", df.shape[0])

    # Drop too-long rows (source or target)
    df["Too-Long"] = (df['Source'].str.len() > df['Target'].str.len() * 2) | (
        df['Target'].str.len() > df['Source'].str.len() * 2)
    #display(df.loc[df['Too long'] == True]) # display only too long rows
    df = df.set_index(['Too-Long'])

    try:  # To avoid (KeyError: '[True] not found in axis') if there are no too-long cells
        df = df.drop([True])  # Boolean, not string, do not add quotes
    except KeyError:
        pass

    print("--- Too-Long Source/Target Deleted\t--> Rows:", df.shape[0])

    # Replace empty cells with NaN
    df = df.replace(r'^\s*$', np.nan, regex=True)

    # Delete nan (already there, or generated from the previous steps)
    df = df.dropna()

    print("--- Rows with Empty Cells Deleted\t--> Rows:", df.shape[0])

    # Optional: Reset the index and drop the boolean columns
    # df = df.reset_index()
    # df = df.set_index(['index'])
    # df = df.drop(['Source-Copied', 'Too-Long'], axis = 1)
    # display(df)

    # Write the dataframe to two Source and Target files
    source_file = source_file + '-tokenized.' + source_lang
    target_file = target_file + '-tokenized.' + target_lang

    df_dic = df.to_dict(orient='list')

    with open(source_file, "w") as sf:
        sf.write("\n".join(line for line in df_dic['Source']))

    with open(target_file, "w") as tf:
        tf.write("\n".join(line for line in df_dic['Target']))

    print("--- Wrote Files")
    print("Done!")
    print("Output files:\n• ", source_file, "\n• ", target_file)
Example #11
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import json
import numpy as np
import re
import six

from collections import OrderedDict
from os import walk
from sacremoses import MosesTokenizer

tokenizer = MosesTokenizer()


def get_word_index(word_index_path):
    with open(word_index_path) as f:
        return json.load(f)


def build_data_index(input_dir, word_index):
    train_x = []
    train_y = []
    for root, dirs, files in walk(input_dir):
        for filename in files:
            if re.match(".*\d+_\d+.txt", filename):
                filepath = root + '/' + filename
                print(filepath)
Example #12
    def __init__(self,
                 input,
                 tokenize=None,
                 vocab=None,
                 vocab_size=None,
                 subword_path=None,
                 seq_len=0,
                 sos=False,
                 oovs=0,
                 lang="en",
                 subsample=0,
                 **kwargs):
        """
        Base Dataset for Language Modeling.

        Args:
            tokenize (callable): tokenization callable, which takes as input
                a string and returns a list of tokens
            input (str, list): the path to the data file, or a list of samples.
            vocab (Vocab): a vocab instance. If None, then a new one is built
                from the Dataset's data.
            vocab_size(int): if given, then trim the vocab to the given number.
        """
        self.input = input
        self.seq_len = seq_len
        self.subword_path = subword_path
        self.sos = sos
        self.oovs = oovs
        self.subsample = subsample

        # > define tokenization to be used -------------------------------
        if tokenize is not None:
            self.tokenize = tokenize
        else:
            self.tokenize = self.space_tok

        # NOTE: when no subword model is given, the `else` branch below falls
        # back to MosesTokenizer and overrides any `tokenize` callable set above.
        if self.subword_path is not None:
            subword = spm.SentencePieceProcessor()
            subword_path = fix_paths(subword_path, "datasets")
            subword.Load(subword_path + ".model")
            self.tokenize = lambda x: subword.EncodeAsPieces(x.rstrip())
        else:
            self.tokenize = MosesTokenizer(lang=lang).tokenize

        # > Build Vocabulary --------------------------------------------
        self.vocab, is_vocab_built = self.init_vocab(vocab, subword_path, oovs)

        # > Cache text file ---------------------------------------------
        self.lengths = []
        _is_cached = False

        def _line_callback(x):
            _tokens = self.tokenize(x)
            self.lengths.append(len(self.add_special_tokens(_tokens)))

            if is_vocab_built is False:
                self.vocab.read_sequence(_tokens)

        # -------------------------------------------------------------
        # If there is a (vocab, lengths) tuple associated with the given input
        # file, then load them from cache and skip the recalculation
        # -------------------------------------------------------------
        _ckey = self._get_cache_key(input, vocab, self.tokenize,
                                    subword_path, vocab_size, self.subsample)
        _cfile = os.path.join(os.path.dirname(input), f".cache_{_ckey}")
        if os.path.isfile(_cfile):
            print("Loading data from cache...", end=" ")
            with open(_cfile, "rb") as f:
                _vocab, self.lengths = pickle.load(f)
                self.vocab = Vocab().from_vocab_instance(_vocab)
            print("done!")
            _is_cached = True

        # > Preprocessing ---------------------------------------------
        print("Preprocessing...")
        self.data = DatasetCache(input,
                                 callback=_line_callback,
                                 subsample=subsample)

        # if the text file has already been cached,
        # but lengths and vocab are not cached (i.e., new for this input file)
        if _is_cached is False and len(self.lengths) == 0:
            for i in range(len(self.data)):
                _line_callback(self.data[i])

        # trim down the size of a newly created vocab
        if subword_path is None and vocab_size is not None:
            self.vocab.build_lookup(vocab_size)

        # -------------------------------------------------------------
        # save to cache if not already saved
        # -------------------------------------------------------------
        if _is_cached is False:
            print("Writing data to cache...")
            with open(_cfile, "wb") as f:
                pickle.dump((self.vocab, self.lengths), f)

        self.lengths = numpy.array(self.lengths)
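A hedged usage sketch; the class whose `__init__` is shown above is not named in the snippet, so `LMDataset` below is only a placeholder name.

# Hypothetical usage; `LMDataset` is a stand-in for the class defined above.
dataset = LMDataset("data/wiki.train.txt", seq_len=128,
                    vocab_size=30000, lang="en")
print(len(dataset.lengths))  # number of cached lines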
Example #13
    def morph(self, source, reference, constrain_pos=True):
        # Return Format (raw, translation, is attack success, query number, modif_rate)

        orig_tokenized = MosesTokenizer(lang='en').tokenize(source)
        # skip questions that are too long or too short
        if len(orig_tokenized) < 10 or len(orig_tokenized) > 100:
            return source, reference, None, None, None

        # generate candidates
        pos_tagged = [
            (tagged[0], '.') if '&' in tagged[0] else tagged
            for tagged in nltk.pos_tag(orig_tokenized, tagset='universal')
        ]

        token_inflections = self.get_inflections(orig_tokenized, pos_tagged,
                                                 constrain_pos)

        # get original bleu
        original_bleu, orig_predicted = self.get_bleu(source, reference)

        # skip examples that already have BLEU == 0
        if original_bleu == 0:
            return source, reference, None, None, None

        forward_perturbed, forward_bleu, forward_predicted, num_queries_forward = self.search_nmt(
            token_inflections, orig_tokenized, source, original_bleu,
            reference)

        if forward_bleu == original_bleu:
            forward_predicted = orig_predicted

        # attack success
        if forward_bleu == 0:
            modif_rate = self.get_modif_rate(orig_tokenized, forward_perturbed)
            attack_text = MosesDetokenizer(
                lang='en').detokenize(forward_perturbed)
            return attack_text, forward_predicted, True, num_queries_forward + 1, modif_rate

        backward_perturbed, backward_bleu, backward_predicted, num_queries_backward = self.search_nmt(
            token_inflections,
            orig_tokenized,
            source,
            original_bleu,
            reference,
            backward=True)

        if backward_bleu == original_bleu:
            backward_predicted = orig_predicted
        num_queries = 1 + num_queries_forward + num_queries_backward
        if forward_bleu < backward_bleu:
            is_attack_success = False
            if forward_bleu == 0:
                is_attack_success = True
            modif_rate = self.get_modif_rate(orig_tokenized, forward_perturbed)
            attack_text = MosesDetokenizer(
                lang='en').detokenize(forward_perturbed)
            return attack_text, forward_predicted, is_attack_success, num_queries, modif_rate
        else:
            is_attack_success = False
            if backward_bleu == 0:
                is_attack_success = True
            modif_rate = self.get_modif_rate(orig_tokenized,
                                             backward_perturbed)
            attack_text = MosesDetokenizer(
                lang='en').detokenize(backward_perturbed)
            return attack_text, backward_predicted, is_attack_success, num_queries, modif_rate
Example #14
import glob
import json
import logging as log
import os
import re

import pandas as pd
from sacremoses import MosesTokenizer

TOKENIZER = MosesTokenizer()

# NOTE: some of the DPR preparation code is taken from
# https://github.com/nyu-mll/jiant


def prepare_dpr(raw_data_dir, processed_data_dir):
    # create the directory for intermediate dpr .jsons
    dpr_jsons_dir = os.path.join(raw_data_dir, 'dpr_jsons')
    if not os.path.exists(dpr_jsons_dir):
        os.makedirs(dpr_jsons_dir)

    src_file = os.path.join(raw_data_dir, "dpr_data.txt")

    # load everything in memory
    # this preprocessing is required to be compatible with the jiant code
    text2examples = {}
    curr = {}
    with open(src_file) as fd:
        for line in fd:
            line = line.strip()
            if not line:
Example #15
def get_tokenizer(tokenizer, language='en'):
    # default tokenizer is string.split(), added as a module function for serialization
    if tokenizer is None:
        return _split_tokenizer

    # simply return if a function is passed
    if callable(tokenizer):
        return tokenizer

    if tokenizer == "spacy":
        try:
            import spacy
            spacy = spacy.load(language)
            return partial(_spacy_tokenize, spacy=spacy)
        except ImportError:
            print("Please install SpaCy. "
                  "See the docs at https://spacy.io for more information.")
            raise
        except AttributeError:
            print("Please install SpaCy and the SpaCy {} tokenizer. "
                  "See the docs at https://spacy.io for more "
                  "information.".format(language))
            raise
    elif tokenizer == "moses":
        try:
            from sacremoses import MosesTokenizer
            moses_tokenizer = MosesTokenizer()
            return moses_tokenizer.tokenize
        except ImportError:
            print("Please install SacreMoses. "
                  "See the docs at https://github.com/alvations/sacremoses "
                  "for more information.")
            raise
    elif tokenizer == "toktok":
        try:
            from nltk.tokenize.toktok import ToktokTokenizer
            toktok = ToktokTokenizer()
            return toktok.tokenize
        except ImportError:
            print("Please install NLTK. "
                  "See the docs at https://nltk.org  for more information.")
            raise
    elif tokenizer == 'revtok':
        try:
            import revtok
            return revtok.tokenize
        except ImportError:
            print("Please install revtok.")
            raise
    elif tokenizer == 'subword':
        try:
            import revtok
            return partial(revtok.tokenize, decap=True)
        except ImportError:
            print("Please install revtok.")
            raise
    raise ValueError("Requested tokenizer {}, valid choices are a "
                     "callable that takes a single string as input, "
                     "\"revtok\" for the revtok reversible tokenizer, "
                     "\"subword\" for the revtok caps-aware tokenizer, "
                     "\"spacy\" for the SpaCy English tokenizer, or "
                     "\"moses\" for the NLTK port of the Moses tokenization "
                     "script.".format(tokenizer))
Example #16
  def load_model(self, src_language, trg_language, domain, bpe_src_code=None, tokenize=None):
    """ Load model for given trg language. """
    # model_dir = "{}-{}".format(self._model_dir_prefix, trg_language)
    model_dir = f"{self._model_dir_prefix}{src_language}-{trg_language}-{domain}"

    # Load the checkpoint.
    ckpt_path = os.path.join(model_dir, 'model.ckpt')
        
    # Load the vocabularies.
    src_vocab_path = os.path.join(model_dir, 'src_vocab.txt')

    trg_vocab_path = os.path.join(model_dir, 'trg_vocab.txt')
    
    # Load the config.
    config_path = os.path.join(model_dir, 'config_orig.yaml')

    # Adjust config.
    config = load_config(config_path)
    new_config_file = os.path.join(model_dir, 'config.yaml')
    config = self._update_config(config, src_vocab_path, trg_vocab_path,
                                 model_dir, ckpt_path)
    with open(new_config_file, 'w') as cfile:
      yaml.dump(config, cfile)

    # print('Loaded model for {}-{}.'.format(self._src_language, trg_language))
    print('Loaded model for {}-{}.'.format(src_language, trg_language))

    conf = {}

    logger = logging.getLogger(__name__)
    conf["logger"] = logger

    # load the Joey configuration
    cfg = load_config(new_config_file)

    # load the checkpoint
    if "load_model" in cfg['training'].keys():
        ckpt = cfg['training']["load_model"]
    else:
        ckpt = get_latest_checkpoint(model_dir)
        if ckpt is None:
            raise FileNotFoundError("No checkpoint found in directory {}."
                                    .format(model_dir))

    # prediction parameters from config
    conf["use_cuda"] = cfg["training"].get("use_cuda", False) if torch.cuda.is_available() else False

    conf["level"] = cfg["data"]["level"]
    conf["max_output_length"] = cfg["training"].get("max_output_length", None)
    conf["lowercase"] = cfg["data"].get("lowercase", False)

    # load the vocabularies
    src_vocab_file = cfg["training"]["model_dir"] + "/src_vocab.txt"
    trg_vocab_file = cfg["training"]["model_dir"] + "/trg_vocab.txt"
    
    conf["src_vocab"] = build_vocab(field="src", vocab_file=src_vocab_file,
                            dataset=None, max_size=-1, min_freq=0)
    conf["trg_vocab"] = build_vocab(field="trg", vocab_file=trg_vocab_file,
                            dataset=None, max_size=-1, min_freq=0)

    # whether to use beam search for decoding, 0: greedy decoding
    if "testing" in cfg.keys():
        conf["beam_size"] = cfg["testing"].get("beam_size", 0)
        conf["beam_alpha"] = cfg["testing"].get("alpha", -1)
    else:
        conf["beam_size"] = 1
        conf["beam_alpha"] = -1

    # pre-processing
    if tokenize is not None:
        src_tokenizer = MosesTokenizer(lang=cfg["data"]["src"])
        trg_tokenizer = MosesDetokenizer(lang=cfg["data"]["trg"])
        # tokenize input
        tokenizer = lambda x: src_tokenizer.tokenize(x, return_str=True)
        detokenizer = lambda x: trg_tokenizer.detokenize(
            x.split(), return_str=True)
    else:
        tokenizer = lambda x: x
        detokenizer = lambda x: x

    if bpe_src_code is not None and level == "bpe":
        # load bpe merge file
        merge_file = open(bpe_src_code, "r")
        bpe = apply_bpe.BPE(codes=merge_file)
        segmenter = lambda x: bpe.process_line(x.strip())
    elif conf["level"] == "char":
        # split to chars
        segmenter = lambda x: list(x.strip())
    else:
        segmenter = lambda x: x.strip()

    conf["preprocess"] = [tokenizer, segmenter]
    conf["postprocess"] = [detokenizer]
    # build model and load parameters into it
    model_checkpoint = load_checkpoint(ckpt, conf["use_cuda"])
    model = build_model(cfg["model"], src_vocab=conf["src_vocab"], trg_vocab=conf["trg_vocab"])
    model.load_state_dict(model_checkpoint["model_state"])
    # ipdb.set_trace()
    if conf["use_cuda"]:
        model.cuda()
    conf["model"] = model
    print("Joey NMT model loaded successfully.")
    return conf
Example #17
    def __init__(self, task, models, args, src_bpe=None, bpe_symbol='@@ '):
        self.task = task
        self.models = models
        self.src_dict = task.source_dictionary
        self.tgt_dict = task.target_dictionary
        self.src_bpe = src_bpe
        self.use_cuda = torch.cuda.is_available() and not args.cpu
        self.args = args

        # optimize model for generation
        for model in self.models:
            model.make_generation_fast_(
                beamable_mm_beam_size=None if self.args.no_beamable_mm else self.args.beam,
                need_attn=args.print_alignment,
            )
            if args.fp16:
                model.half()
            if self.use_cuda:
                model.cuda()

        self.generator = self.task.build_generator(args)

        # Load alignment dictionary for unknown word replacement
        # (None if no unknown word replacement, empty if no path to align dictionary)
        self.align_dict = utils.load_align_dict(args.replace_unk)

        self.max_positions = utils.resolve_max_positions(
            self.task.max_positions(),
            *[model.max_positions() for model in models]
        )

        self.in_transforms = []
        self.out_transforms = []

        if getattr(args, 'moses', False):
            tokenizer = MosesTokenizer(lang=args.source_lang or 'en')
            detokenizer = MosesDetokenizer(lang=args.target_lang or 'en')
            self.in_transforms.append(lambda s: tokenizer.tokenize(s, return_str=True))
            self.out_transforms.append(lambda s: detokenizer.detokenize(s.split()))
        elif getattr(args, 'nltk', False):
            from nltk.tokenize import word_tokenize
            self.in_transforms.append(lambda s: ' '.join(word_tokenize(s)))

        if getattr(args, 'gpt2_bpe', False):
            from fairseq.gpt2_bpe.gpt2_encoding import get_encoder
            encoder_json = os.path.join(os.path.dirname(src_bpe), 'encoder.json')
            vocab_bpe = src_bpe
            encoder = get_encoder(encoder_json, vocab_bpe)
            self.in_transforms.append(lambda s: ' '.join(map(str, encoder.encode(s))))
            self.out_transforms.append(lambda s: ' '.join(t for t in s.split() if t != '<unk>'))
            self.out_transforms.append(lambda s: encoder.decode(map(int, s.strip().split())))
        elif getattr(args, 'sentencepiece', False):
            import sentencepiece as spm
            sp = spm.SentencePieceProcessor()
            sp.Load(src_bpe)
            self.in_transforms.append(lambda s: ' '.join(sp.EncodeAsPieces(s)))
            self.out_transforms.append(lambda s: data_utils.process_bpe_symbol(s, 'sentencepiece'))
        elif src_bpe is not None:
            bpe_parser = apply_bpe.create_parser()
            bpe_args = bpe_parser.parse_args(['--codes', self.src_bpe])
            bpe = apply_bpe.BPE(bpe_args.codes, bpe_args.merges, bpe_args.separator, None, bpe_args.glossaries)
            self.in_transforms.append(lambda s: bpe.process_line(s))
            self.out_transforms.append(lambda s: data_utils.process_bpe_symbol(s, bpe_symbol))
Example #18
"""Do moses tok detok."""
# pylint: disable=invalid-name, unused-import

import sys

try:
    import sacremoses  # noqa: F401
except ModuleNotFoundError:
    import subprocess as sp
    import shlex
    proc = sp.Popen(shlex.split('pip install sacremoses'),
                    stdout=-1,
                    stderr=-1)
    out, err = proc.communicate()
    if err:
        sys.stderr.write('error: %s' % err.decode())
    sys.stdout.write('%s' % out.decode())

from sacremoses import MosesTokenizer, MosesDetokenizer

MTOK = MosesTokenizer().tokenize
MDETOK = MosesDetokenizer().detokenize
mtok = MTOK
mdetok = MDETOK
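A small round-trip sketch using the module-level helpers defined above (only sacremoses is required):

# Tokenize/detokenize round trip with the helpers above.
tokens = mtok("Hello, world! It's a test.", escape=False)
print(tokens)          # e.g. ['Hello', ',', 'world', '!', 'It', "'s", 'a', 'test', '.']
print(mdetok(tokens))  # "Hello, world! It's a test."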
Example #19
 def __init__(self):
     from sacremoses import MosesTokenizer  # pylint: disable=import-outside-toplevel
     self._tokenizer = MosesTokenizer()
Example #20
def get_tokenizer(tokenizer):
    """
    Returns a tokenizer according to the parameters given.

    Parameters
    ----------
    tokenizer : str | callable
        If a callable object is given, it will just be returned.
        Otherwise, a string can be given to create one of the premade
        tokenizers. The string must be of format 'tokenizer' or `tokenizer-args`

        The available premade tokenizers are:
            - 'split' - default str.split(). Custom separator can be provided as
              `split-sep` where `sep` is the separator string.

            - 'spacy' - the spaCy tokenizer, using the 'en' language
              model by default. A different language model can be provided as
              'spacy-lang', where `lang` is the language model name (e.g. `spacy-en`).
              If a spaCy model is used for the first time, an attempt to install it
              will be made. If that fails, the user should download it with a command
              similar to `python -m spacy download en`.
              More details can be found in the spaCy documentation: https://spacy.io/usage/models.

            - toktok - NLTK's toktok tokenizer. For more details
              see https://www.nltk.org/_modules/nltk/tokenize/toktok.html.

            - moses - Sacremoses's moses tokenizer. For more details
              see https://github.com/alvations/sacremoses.

    Returns
    -------
        The created (or given) tokenizer.

    Raises
    ------
    ImportError
        If the required package for the specified tokenizer is not installed.
    ValueError
        If the given tokenizer is not a callable or a string, or is a
        string that doesn't correspond to any of the supported tokenizers.
    """

    if callable(tokenizer):
        return tokenizer

    if not isinstance(tokenizer, str):
        raise ValueError(
            f"Wrong type passed to `get_tokenizer`. Allowed types are callables "
            f"and strings. The provided type is {type(tokenizer)}"
        )

    tokenizer, *language_or_sep = tokenizer.split("-", 1)
    language_or_sep = language_or_sep[0] if language_or_sep else None

    if tokenizer == "spacy":
        language = language_or_sep if language_or_sep is not None else "en_core_web_sm"
        spacy = load_spacy_model_or_raise(language, disable=["parser", "ner"])

        # closures instead of lambdas because they are serializable
        def spacy_tokenize(string):
            # need to wrap in a function to access .text
            return [token.text for token in spacy.tokenizer(string)]

        return spacy_tokenize

    elif tokenizer == "split":
        sep = language_or_sep

        def _split(string):
            return string.split(sep)

        return _split

    elif tokenizer == "toktok":
        from nltk.tokenize.toktok import ToktokTokenizer

        toktok = ToktokTokenizer()
        return toktok.tokenize

    elif tokenizer == "moses":
        try:
            from sacremoses import MosesTokenizer

            moses_tokenizer = MosesTokenizer()
            return moses_tokenizer.tokenize
        except ImportError:
            print(
                "Please install SacreMoses. "
                "See the docs at https://github.com/alvations/sacremoses "
                "for more information."
            )
            raise
    else:
        raise ValueError(f"Wrong value given for the tokenizer: {tokenizer}")
Example #21
 def __init__(self, lang: str):
     super().__init__(lang=lang)
     self.tok = MosesTokenizer(lang)
Example #22
from sacremoses import MosesTokenizer
from tqdm import tqdm

logging.basicConfig(filename='../logs/hand_annotated2conll.log',
                    filemode='w',
                    format="%(asctime)s:%(levelname)s:\t%(message)s",
                    level=logging.INFO)

TAGS_DICT = {
    "Nom": "PER_NOM",
    "Prenom": "PER_PRENOM",
    "Adresse": "LOC",
    "O": "O"
}

mt = MosesTokenizer(lang="fr")


def moses_tokenize(phrase):
    tokens = mt.penn_tokenize(phrase)
    return tokens


def tokenize(phrase):
    # TODO: Tokenize with proper tokenizer
    tokens = re.split("[\s,.]+", phrase)
    tokens = [t for t in tokens if t]
    return tokens


def tags_to_bio(all_tags):
Example #23
 def __init__(self, do_lower_case: bool = False, escape: bool = False):
     self._tokenizer = MosesTokenizer()
     self._do_lower_case = do_lower_case
     self._escape = escape
Example #24
###############################################################################
# Language Modeling on Wikitext-2
#
# This file generates new sentences sampled from the language model
#
###############################################################################

import argparse

import torch

import data

from sacremoses import MosesTokenizer
mt = MosesTokenizer(lang='en')

parser = argparse.ArgumentParser(
    description='PyTorch Wikitext-2 Language Model')

# Model parameters.
parser.add_argument('--input',
                    type=str,
                    required=False,
                    help='additional words')
parser.add_argument('--data',
                    type=str,
                    default='./data/wikitext-2',
                    help='location of the data corpus')
parser.add_argument('--checkpoint',
                    type=str,
                    default='./model.pt',
Example #25
def evaluate(embedder, args):

    sp = spm.SentencePieceProcessor()
    sp.Load(args.sentencepiece)

    entok = MosesTokenizer(lang='en')

    from argparse import Namespace

    args = Namespace(batch_size=32,
                     entok=entok,
                     sp=sp,
                     embedder=embedder,
                     encoder=args.eval_encoder,
                     tokenize=args.tokenize)

    s = STS12Eval('STS/STS12-en-test')
    s.do_prepare()
    results = s.run(args, batcher)
    s = STS13Eval('STS/STS13-en-test')
    s.do_prepare()
    results.update(s.run(args, batcher))
    s = STS14Eval('STS/STS14-en-test')
    s.do_prepare()
    results.update(s.run(args, batcher))
    s = STS15Eval('STS/STS15-en-test')
    s.do_prepare()
    results.update(s.run(args, batcher))
    s = STS16Eval('STS/STS16-en-test')
    s.do_prepare()
    results.update(s.run(args, batcher))
    s = SemEval17('STS/STS17-test')
    s.do_prepare()
    results.update(s.run(args, batcher))
    s = STSBenchmarkEval('STS/STSBenchmark')
    s.do_prepare()
    results.update(s.run(args, batcher))
    s = STSHard('STS/STSHard')
    s.do_prepare()
    results.update(s.run(args, batcher))

    for i in results:
        print(i, results[i])

    total = []
    all = []
    cross = []
    foreign = []
    for i in results:
        if "STS" in i and "all" not in i and "SemEval17" not in i:
            total.append(results[i]["pearson"][0])
        if "STS" in i and "all" in i:
            all.append(results[i]["pearson"]["mean"])
        if i == "SemEval17.STS.input.track2.ar-en.txt" or i == "SemEval17.STS.input.track4a.es-en.txt" \
                or i == "SemEval17.STS.input.track6.tr-en.txt":
            cross.append(results[i]["pearson"][0])
        if i == "SemEval17.STS.input.track1.ar-ar.txt" or i == "SemEval17.STS.input.track3.es-es.txt":
            foreign.append(results[i]["pearson"][0])

    print("Average (cross): {0}".format(np.mean(cross)))
    print("Average (foreign): {0}".format(np.mean(foreign)))
    print("Average (datasets): {0}".format(np.mean(total)))
    print("Average (comps): {0}".format(np.mean(all)), flush=True)
    return np.mean(all)
Example #26
def load_model(model_dir, bpe_src_code=None, tokenize=None):
    """
    Start the bot. This means loading the model according to the config file.

    :param model_dir: Model directory of trained Joey NMT model.
    :param bpe_src_code: BPE codes for source side processing (optional).
    :param tokenize: If True, tokenize inputs with Moses tokenizer.
    :return:
    """
    conf = {}
    cfg_file = model_dir + "/config.yaml"

    logger = logging.getLogger(__name__)
    conf["logger"] = logger
    # load the Joey configuration
    cfg = load_config(cfg_file)

    # load the checkpoint
    if "load_model" in cfg['training'].keys():
        ckpt = cfg['training']["load_model"]
    else:
        ckpt = get_latest_checkpoint(model_dir)
        if ckpt is None:
            raise FileNotFoundError("No checkpoint found in directory {}."
                                    .format(model_dir))

    # prediction parameters from config
    conf["use_cuda"] = cfg["training"].get("use_cuda", False)
    conf["level"] = cfg["data"]["level"]
    conf["max_output_length"] = cfg["training"].get("max_output_length", None)
    conf["lowercase"] = cfg["data"].get("lowercase", False)

    # load the vocabularies
    src_vocab_file = cfg["training"]["model_dir"] + "/src_vocab.txt"
    trg_vocab_file = cfg["training"]["model_dir"] + "/trg_vocab.txt"
    conf["src_vocab"] = build_vocab(field="src", vocab_file=src_vocab_file,
                                    dataset=None, max_size=-1, min_freq=0)
    conf["trg_vocab"] = build_vocab(field="trg", vocab_file=trg_vocab_file,
                                    dataset=None, max_size=-1, min_freq=0)

    # whether to use beam search for decoding, 0: greedy decoding
    if "testing" in cfg.keys():
        conf["beam_size"] = cfg["testing"].get("beam_size", 0)
        conf["beam_alpha"] = cfg["testing"].get("alpha", -1)
    else:
        conf["beam_size"] = 1
        conf["beam_alpha"] = -1

    # pre-processing
    if tokenize is not None:
        src_tokenizer = MosesTokenizer(lang=cfg["data"]["src"])
        trg_tokenizer = MosesDetokenizer(lang=cfg["data"]["trg"])
        # tokenize input
        def tokenizer(x): return src_tokenizer.tokenize(x, return_str=True)
        def detokenizer(x): return trg_tokenizer.detokenize(
            x.split(), return_str=True)
    else:
        def tokenizer(x): return x
        def detokenizer(x): return x

    if bpe_src_code is not None and level == "bpe":
        # load bpe merge file
        merge_file = open(bpe_src_code, "r")
        bpe = apply_bpe.BPE(codes=merge_file)
        def segmenter(x): return bpe.process_line(x.strip())
    elif conf["level"] == "char":
        # split to chars
        def segmenter(x): return list(x.strip())
    else:
        def segmenter(x): return x.strip()

    conf["preprocess"] = [tokenizer, segmenter]
    conf["postprocess"] = [detokenizer]
    # build model and load parameters into it
    model_checkpoint = load_checkpoint(ckpt, conf["use_cuda"])
    model = build_model(
        cfg["model"],
        src_vocab=conf["src_vocab"],
        trg_vocab=conf["trg_vocab"])
    model.load_state_dict(model_checkpoint["model_state"])

    if conf["use_cuda"]:
        model.cuda()
    conf["model"] = model
    print("Joey NMT model loaded successfully.")
    return conf
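A hedged invocation sketch; the model directory and BPE codes path are placeholders for a trained Joey NMT model.

# Hypothetical call with placeholder paths.
conf = load_model(model_dir="models/en-de",
                  bpe_src_code="models/en-de/bpe.codes",
                  tokenize=True)
model = conf["model"]
tokenizer, segmenter = conf["preprocess"]
detokenize = conf["postprocess"][0]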
Example #27
import io
import numpy as np
from sacremoses import MosesTokenizer
from glob import glob
from scipy import stats
from wer import wer

def get_wer(s1, s2):
    s1 = s1.split()                                                                                                                                                                                                        
    s2 = s2.split()
    return 0.5 * wer(s1,s2) + 0.5 * wer(s2,s1)

entok = MosesTokenizer(lang='en')

textfiles = glob("../STS/STS*-en-test/*input*txt")

def make_dataset(f, gs):
    sent1, sent2 = zip(*[l.split("\t") for l in
                               io.open(f,
                    encoding='utf8').read().splitlines()])
    raw_scores = np.array([x for x in
                           io.open(gs,
                                           encoding='utf8')
                        .read().splitlines()])
    not_empty_idx = raw_scores != ''

    def process(s):
        tok = entok.tokenize(s, escape=False)
        return " ".join(tok).lower()
    gs_scores = [float(x) for x in raw_scores[not_empty_idx]]
    sent1 = np.array([s for s in sent1])[not_empty_idx]
Example #28
    def load(self, device: str):
        """
        Load user-selected task-specific model

        Args:
            device (str): device information

        Returns:
            object: User-selected task-specific model

        """
        if "sent" in self.config.n_model:
            import nltk

            try:
                nltk.data.find("tokenizers/punkt")
            except LookupError:
                nltk.download("punkt")

            from nltk.tokenize import sent_tokenize

            return PororoSentTokenizer(sent_tokenize, self.config)

        if self.config.n_model == "mecab_ko":
            try:
                import mecab
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install python-mecab-ko with: `pip install python-mecab-ko`"
                )
            model = mecab.MeCab()
            return PororoMecabKoTokenizer(model, self.config)

        if self.config.n_model == "char":
            return PororoCharTokenizer(self.config)

        if self.config.n_model == "jamo":
            return PororoJamoTokenizer(self.config)

        if self.config.n_model == "word":
            return PororoWordTokenizer(self.config)

        if self.config.n_model == "roberta":
            from fairseq.data.encoders.gpt2_bpe import get_encoder

            encoder = download_or_load("misc/encoder.json", self.config.lang)
            vocab = download_or_load("misc/vocab.bpe", self.config.lang)
            model = get_encoder(encoder, vocab)

            with open(encoder, "r") as f_vocab:
                vocab = json.load(f_vocab)
                inv_dict = {v: k for k, v in vocab.items()}

            return PororoRoBERTaTokenizer(model, vocab, inv_dict, self.config)

        if self.config.n_model == "moses":
            try:
                from sacremoses import MosesDetokenizer, MosesTokenizer
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install sacremoses with: `pip install sacremoses`")
            model = MosesTokenizer(lang="en")
            detok = MosesDetokenizer(lang="en")
            return PororoMosesTokenizer(model, detok, self.config)

        if self.config.n_model == "jieba":
            try:
                import jieba
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install jieba with: `pip install jieba`")
            model = jieba.cut
            return PororoJiebaTokenizer(model, self.config)

        if self.config.n_model == "mecab":
            try:
                import fugashi
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install fugashi with: `pip install fugashi`")

            try:
                import ipadic
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install ipadic with: `pip install ipadic`")

            dic_dir = ipadic.DICDIR
            mecabrc = os.path.join(dic_dir, "mecabrc")
            mecab_option = "-d {} -r {} ".format(
                dic_dir,
                mecabrc,
            )
            model = fugashi.GenericTagger(mecab_option)
            return PororoMecabTokenizer(model, self.config)
        else:
            from pororo.tasks.utils.tokenizer import CustomTokenizer

            path = download_or_load(
                f"tokenizers/{self.config.n_model}.zip",
                self.config.lang,
            )

            ext = "json" if "unigram" not in self.config.n_model else "txt"
            merges_filename = (f"{path}/merges.txt" if "unigram"
                               not in self.config.n_model else None)

            model = CustomTokenizer.from_file(
                vocab_filename=f"{path}/vocab.{ext}",
                merges_filename=merges_filename,
                normalize=True if "jpe" not in self.config.n_model else False,
            )
            if "jpe" in self.config.n_model:
                return PororoJamoPairTokenizer(model, self.config)
            if "mecab.bpe" in self.config.n_model:
                return PororoMecabSPTokenizer(model, self.config)
            return PororoSPTokenizer(model, self.config)
Example #29
#!/usr/bin/env python3

from sacremoses import MosesPunctNormalizer
from sacremoses import MosesTokenizer

FILE_NAME = "/media/zhake/data/Projects/kaz-parallel-corpora/crawl/strategy2050_kz/xmls/30-35/texts/kaz_all_text_clean_split.txt"

mpn = MosesPunctNormalizer()
mt = MosesTokenizer()

with open(file=FILE_NAME, mode="r") as f_in:
    norm_text = [mpn.normalize(text=line) for line in f_in]

tok_text = [
    mt.tokenize(text=line, return_str=True, escape=False) for line in norm_text
]

with open(file=f"{FILE_NAME}_tok", mode="w") as f_out:
    for line in tok_text:
        print(line.strip(), file=f_out)
Example #30
 def __init__(self, lang: str):
     self.mt = MosesTokenizer(lang)