Example No. 1
 def moses_tokenize(self, text, lang):
     if lang not in self.cache_moses_tokenizer:
         moses_tokenizer = sm.MosesTokenizer(lang=lang)
         self.cache_moses_tokenizer[lang] = moses_tokenizer
     return self.cache_moses_tokenizer[lang].tokenize(
         text, aggressive_dash_splits=True, return_str=False, escape=True
     )
Example No. 2
 def moses_tokenize(self, text, lang):
     if lang not in self.cache_moses_tokenizer:
         moses_tokenizer = sm.MosesTokenizer(lang=lang)
         self.cache_moses_tokenizer[lang] = moses_tokenizer
     else:
         moses_tokenizer = self.cache_moses_tokenizer[lang]
     return moses_tokenizer.tokenize(text, return_str=False, escape=False)
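
Examples 1 and 2 differ mainly in the `escape` flag. A minimal standalone sketch (not taken from either snippet; the input sentence and the expected outputs are only illustrative) of how that flag changes the result:

import sacremoses

tok = sacremoses.MosesTokenizer(lang="en")
# With escape=True (the default), reserved characters come back as HTML entities.
print(tok.tokenize('"It\'s done."', escape=True))
# roughly: ['&quot;', 'It', '&apos;s', 'done', '.', '&quot;']
# With escape=False, the raw characters are kept.
print(tok.tokenize('"It\'s done."', escape=False))
# roughly: ['"', 'It', "'s", 'done', '.', '"']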
Example No. 3
def tokenize_raw(text, lang='en'):
    mt = sacremoses.MosesTokenizer(lang)
    text = mt.tokenize(text, return_str=True)
    text = re.sub(r'&quot;', '"', text)
    text = re.sub(r'&apos;', "'", text)
    text = re.sub(r'(\d)\.(\d)', r'\1 @.@ \2', text)
    text = re.sub(r'(\d),(\d)', r'\1 @,@ \2', text)
    text = re.sub(r'(\w)-(\w)', r'\1 @-@ \2', text)
    return text
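
A hedged usage sketch for tokenize_raw above; the input and the expected output are only illustrative and assume the module's re/sacremoses imports:

print(tokenize_raw('The 3.5-inch, $1,000 drive'))
# roughly: 'The 3 @.@ 5 @-@ inch , $ 1 @,@ 000 drive'
# i.e. digits around '.' and ',' and word-internal hyphens get WikiText-style @.@ / @,@ / @-@ placeholders.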
Example No. 4
def moses_tokenize(sents: List[str], lang: str) -> List[List[str]]:
    unsupported_langs = ['zh', 'ja', 'th']
    if lang.split('-')[0] in unsupported_langs:
        utils.Logging.warn(f"Moses does not support \"{lang}\" because it is not space-delimited. "
                           f"It will only split according to punctuation.")
    import sacremoses
    tok = sacremoses.MosesTokenizer(lang=lang)
    tok_sents = [tok.tokenize(sent.strip(), escape=False) for sent in sents]
    return tok_sents
Example No. 5
    def __init__(self, max_length=None):
        super().__init__(max_length)

        try:
            import sacremoses
            self.tokenize_fn = sacremoses.MosesTokenizer().tokenize
        except ImportError as e:
            import sys
            sys.stderr.write('ERROR: Please install sacremoses to use this tokenizer.\n')
            raise e
Example No. 6
def main(args):
    """Tokenizes, preserving tabs"""
    mt = sacremoses.MosesTokenizer(lang=args.lang)

    def tok(s):
        return mt.tokenize(s, return_str=True)

    for line in sys.stdin:
        parts = list(map(tok, line.split("\t")))
        print(*parts, sep="\t", flush=True)
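
The excerpt above does not show how `args` is built; a hypothetical entry point (the --lang flag name is an assumption) could look like this:

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Tokenize stdin with Moses, preserving tabs")
    parser.add_argument("--lang", default="en", help="language code passed to MosesTokenizer")
    main(parser.parse_args())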
Example No. 7
 def __init__(self, embeddings):
     self.embeddings = embeddings
     self.word_list = list(embeddings)
     self.special_list = self.SPECIAL_LIST
     for special_token in self.special_list:
         assert special_token not in self.embeddings
     self.full_token_list = self.special_list + self.word_list
     self.id_token_map = self.full_token_list
     self.token_id_map = {
         token: i
         for i, token in enumerate(self.id_token_map)
     }
     self.tokenizer = sacremoses.MosesTokenizer(lang='en')
Example No. 8
def normalize(sentence: str,
              lowercase: bool = True,
              tokenizer: str = "13a",
              return_str: bool = True):
    if lowercase:
        sentence = sentence.lower()

    if tokenizer in ["13a", "intl", "none"]:
        tokenizer_obj = _get_tokenizer(name=tokenizer)()
        normalized_sent = tokenizer_obj(sentence)
    elif tokenizer == "moses":
        normalized_sent = sacremoses.MosesTokenizer().tokenize(sentence,
                                                               return_str=True,
                                                               escape=False)
    elif tokenizer == "penn":
        normalized_sent = sacremoses.MosesTokenizer().penn_tokenize(
            sentence, return_str=True)
    else:
        # Fall back to the raw sentence so normalized_sent is always defined
        normalized_sent = sentence

    if not return_str:
        normalized_sent = normalized_sent.split()

    return normalized_sent
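
A hedged usage sketch for normalize() above, exercising only the sacremoses-backed "moses" branch (the "13a"/"intl"/"none" branches depend on the unshown _get_tokenizer helper):

print(normalize('Hello, world!', tokenizer="moses"))
# roughly: 'hello , world !'
print(normalize('Hello, world!', tokenizer="moses", return_str=False))
# roughly: ['hello', ',', 'world', '!']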
Example No. 9
def preprocessing_en_file(input_file_name, output_file_name):
    tokenizer = sacremoses.MosesTokenizer()
    output_file = open(output_file_name, 'w', encoding='utf-8')
    with open(input_file_name, 'r', encoding='utf-8') as file:
        for line in file:
            # print('line\n', line)
            token_list = tokenizer.tokenize(line)
            # print('token_list\n', token_list)
            output_line = (' ').join(token_list)
            # print('output_line\n', output_line)
            output_file.write(output_line + '\n')  # keep one tokenized sentence per output line
    output_file.close()
    return 0
Example No. 10
def normalize(sentence,
              lowercase: bool = True,
              tokenizer: str = '13a',
              return_str: bool = True):
    if lowercase:
        sentence = sentence.lower()

    if tokenizer in ['13a', 'intl']:
        normalized_sent = sacrebleu.TOKENIZERS[tokenizer]()(sentence)
    elif tokenizer == 'moses':
        normalized_sent = sacremoses.MosesTokenizer().tokenize(sentence,
                                                               return_str=True,
                                                               escape=False)
    elif tokenizer == 'penn':
        normalized_sent = sacremoses.MosesTokenizer().penn_tokenize(
            sentence, return_str=True)
    else:
        normalized_sent = sentence

    if not return_str:
        normalized_sent = normalized_sent.split()

    return normalized_sent
Example No. 11
def init_word_tokenizers(main, lang, word_tokenizer = 'default'):
    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization']['word_tokenizers'][lang]

    # NLTK
    if word_tokenizer.startswith('nltk_'):
        if word_tokenizer == 'nltk_nist':
            if 'nltk_nist_tokenizer' not in main.__dict__:
                main.nltk_nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()
        elif word_tokenizer == 'nltk_nltk':
            if 'nltk_nltk_tokenizer' not in main.__dict__:
                main.nltk_nltk_tokenizer = nltk.NLTKWordTokenizer()
        elif word_tokenizer == 'nltk_penn_treebank':
            if 'nltk_treebank_tokenizer' not in main.__dict__:
                main.nltk_treebank_tokenizer = nltk.TreebankWordTokenizer()
        elif word_tokenizer == 'nltk_tok_tok':
            if 'nltk_toktok_tokenizer' not in main.__dict__:
                main.nltk_toktok_tokenizer = nltk.ToktokTokenizer()
        elif word_tokenizer == 'nltk_twitter':
            if 'nltk_tweet_tokenizer' not in main.__dict__:
                main.nltk_tweet_tokenizer = nltk.TweetTokenizer()
    # Sacremoses
    elif word_tokenizer == 'sacremoses_moses':
        lang_sacremoses = wl_conversion.remove_lang_code_suffixes(main, wl_conversion.to_iso_639_1(main, lang))
        lang = wl_conversion.remove_lang_code_suffixes(main, lang)

        if f'sacremoses_moses_tokenizer_{lang}' not in main.__dict__:
            main.__dict__[f'sacremoses_moses_tokenizer_{lang}'] = sacremoses.MosesTokenizer(lang = lang_sacremoses)
    # spaCy
    elif word_tokenizer.startswith('spacy_'):
        init_spacy_models(main, lang)
    # Chinese
    elif word_tokenizer == 'pkuseg_zho':
        if 'pkuseg_word_tokenizer' not in main.__dict__:
            main.pkuseg_word_tokenizer = pkuseg.pkuseg()
    # Chinese & Japanese
    elif word_tokenizer.startswith('wordless_'):
        init_spacy_models(main, 'eng_us')
        init_spacy_models(main, 'other')
    # Japanese
    elif word_tokenizer.startswith('sudachipy_jpn'):
        if 'sudachipy_word_tokenizer' not in main.__dict__:
            main.sudachipy_word_tokenizer = sudachipy.Dictionary().create()
    # Tibetan
    elif word_tokenizer == 'botok_bod':
        if 'botok_word_tokenizer' not in main.__dict__:
            main.botok_word_tokenizer = botok.WordTokenizer()
Example No. 12
    def __init__(self, lang: str = 'en', vocab: Optional[Vocab] = None):
        self._lang = lang
        self._vocab = vocab
        if lang == 'zh':
            warnings.warn(
                'You may not use MosesTokenizer for Chinese sentences because it is '
                'not accurate. Try to use JiebaTokenizer. You may also tokenize the '
                'chinese sentence to characters and learn a BPE.')
        self._tokenizer = sacremoses.MosesTokenizer(lang=lang)
        self._detokenizer = sacremoses.MosesDetokenizer(lang=lang)

        # Here, we need to warm up the tokenizer to compile the regexes.
        # This boosts performance on macOS.
        # For benchmarking results, see
        # https://gist.github.com/sxjscience/f59d2b88262fefd4fb08565c9dec6099
        self._warmup()
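
    # The excerpt does not include _warmup(); a minimal sketch consistent with the
    # comment above (one tokenize/detokenize pass so sacremoses compiles its
    # regexes before real input arrives) might be:
    def _warmup(self):
        self._detokenizer.detokenize(self._tokenizer.tokenize('hello , world !'))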
Example No. 13
def loadAndTokenizeFile(lang, inputPath, outputPath, pattern, append=False):
    tok = sacremoses.MosesTokenizer(lang=lang)

    inputFile = open(inputPath, 'r')
    fileRights = 'a' if append else 'w'
    outputFile = open(outputPath, fileRights)
    p = re.compile(pattern)

    for line in inputFile:
        match = p.match(line)
        if match:
            outputFile.write(
                html.unescape(tok.tokenize(match.group(1), return_str=True)) +
                '\n')

    inputFile.close()
    outputFile.close()
Example No. 14
def tokenize_data(data, token_type):
    # input: list of strings
    # return: list of list of tokens
    if token_type == "gru":
        tokenizer = sacremoses.MosesTokenizer()
        preprocessed_data = []
        print("Processing data into tokens......")
        for sent in tqdm(data):
            tokenized_sent = tokenizer.tokenize(sent.lower())
            preprocessed_data.append(tokenized_sent)

    elif token_type == "bert":
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
        preprocessed_data = []
        print("Processing data into tokens......")
        for sent in tqdm(data):
            tokenized_sent = ["[CLS]"] + tokenizer.tokenize(sent)
            # BERT only accepts sequences of max length 512
            preprocessed_data.append(tokenized_sent[:512])

    return preprocessed_data, tokenizer
Example No. 15
def normalize(sentence,
              lowercase: bool = True,
              tokenizer: str = '13a',
              return_str: bool = True):
    if lowercase:
        sentence = sentence.lower()

    if tokenizer == "13a":
        normalized_sent = sacrebleu.tokenize_13a(sentence)
    elif tokenizer == "intl":
        normalized_sent = sacrebleu.tokenize_v14_international(sentence)
    elif tokenizer == "moses":
        normalized_sent = sacremoses.MosesTokenizer().tokenize(sentence,
                                                               return_str=True)
    else:
        normalized_sent = sentence

    if not return_str:
        normalized_sent = normalized_sent.split()

    return normalized_sent
Example No. 16
def split_en():
    # tokenize english text
    import sacremoses
    tokenizer = sacremoses.MosesTokenizer(lang="en")
    lines = 0
    contents = []
    filename = "/home/user_data55/wangdq/data/ccmt/zh-en/parallel/train.en"
    with open(filename, 'r') as f:
        with open(filename + '2', 'w') as f2:
            for line in f:
                tokens = tokenizer.tokenize(line,
                                            aggressive_dash_splits=True,
                                            return_str=True,
                                            escape=False)
                contents.append(tokens + '\n')
                lines += 1
                if lines == 500:
                    f2.writelines(contents)
                    contents = []
                    lines = 0
            if len(contents) > 0:
                f2.writelines(contents)
Example No. 17
def wordless_word_tokenize(main,
                           text,
                           lang,
                           word_tokenizer='default',
                           keep_sentences=False):
    tokens_sentences = []

    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization'][
            'word_tokenizers'][lang]

    wordless_text_utils.check_word_tokenizers(main,
                                              lang=lang,
                                              word_tokenizer=word_tokenizer)

    if 'NLTK' in word_tokenizer:
        sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('NLTK - Penn Treebank Tokenizer'):
            treebank_tokenizer = nltk.TreebankWordTokenizer()

            for sentence in sentences:
                tokens_sentences.append(treebank_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Twitter Tokenizer'):
            tweet_tokenizer = nltk.TweetTokenizer()

            for sentence in sentences:
                tokens_sentences.append(tweet_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - NIST Tokenizer'):
            nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()

            for sentence in sentences:
                tokens_sentences.append(nist_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'):
            toktok_tokenizer = nltk.ToktokTokenizer()

            for sentence in sentences:
                tokens_sentences.append(toktok_tokenizer.tokenize(sentence))

        if not keep_sentences:
            tokens_sentences = [
                itertools.chain.from_iterable(tokens_sentences)
            ]
    elif 'Sacremoses' in word_tokenizer:
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang)
        else:
            sentences = [text]

        if word_tokenizer == main.tr('Sacremoses - Moses Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(
                lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_sentences.append(
                    moses_tokenizer.tokenize(sentence, escape=False))
        elif word_tokenizer == main.tr('Sacremoses - Penn Treebank Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(
                lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_sentences.append(
                    moses_tokenizer.penn_tokenize(sentence))
    elif 'spaCy' in word_tokenizer:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        if keep_sentences:
            for sentence in doc.sents:
                tokens_sentences.append(
                    [token.text for token in sentence.as_doc()])
        else:
            tokens_sentences.append([token.text for token in doc])

    # Chinese & Japanese
    elif ('jieba' in word_tokenizer or 'nagisa' in word_tokenizer
          or 'Wordless' in word_tokenizer):
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang=lang)
        else:
            sentences = [text]

        # Chinese
        if word_tokenizer == main.tr('jieba - Chinese Word Tokenizer'):
            for sentence in sentences:
                tokens_sentences.append(jieba.cut(sentence))
        elif word_tokenizer == main.tr(
                'Wordless - Chinese Character Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # English
                            if wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_eng(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or wordless_checking_unicode.is_han(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_sentences.extend(tokens)
        # Japanese
        elif word_tokenizer == main.tr('nagisa - Japanese Word Tokenizer'):
            import nagisa

            for sentence in sentences:
                tokens_sentences.append(nagisa.tagging(str(sentence)).words)
        elif word_tokenizer == main.tr('Wordless - Japanese Kanji Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # Japanese Kana
                            if wordless_checking_unicode.is_kana(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_kana(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='jpn'))

                                        non_han_start = i + j + 1

                                        break
                            # English
                            elif wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_eng(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or wordless_checking_unicode.is_han(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_sentences.extend(tokens)
    # Thai
    elif 'PyThaiNLP' in word_tokenizer:
        sentences = wordless_sentence_tokenize(
            main,
            text,
            lang='tha',
            sentence_tokenizer='PyThaiNLP - Thai Sentence Tokenizer')

        if word_tokenizer == main.tr(
                'PyThaiNLP - Maximum Matching Algorithm + TCC'):
            for sentence in sentences:
                tokens_sentences.append(
                    pythainlp.tokenize.word_tokenize(sentence, engine='newmm'))
        elif word_tokenizer == main.tr(
                'PyThaiNLP - Maximum Matching Algorithm'):
            for sentence in sentences:
                tokens_sentences.append(
                    pythainlp.tokenize.word_tokenize(sentence, engine='mm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Longest Matching'):
            for sentence in sentences:
                tokens_sentences.append(
                    pythainlp.tokenize.word_tokenize(
                        sentence, engine='longest-matching'))
    # Tibetan
    elif 'pybo' in word_tokenizer:
        if keep_sentences:
            sentences = wordless_sentence_tokenize(main, text, lang='bod')
        else:
            sentences = [text]

        if word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (GMD)'):
            for sentence in sentences:
                tokens_sentences.append([
                    token.text
                    for token in main.pybo_tokenizer_gmd.tokenize(sentence)
                ])
        elif word_tokenizer == main.tr('pybo - Tibetan Word Tokenizer (POS)'):
            for sentence in sentences:
                tokens_sentences.append([
                    token.text
                    for token in main.pybo_tokenizer_pos.tokenize(sentence)
                ])
        elif word_tokenizer == main.tr(
                'pybo - Tibetan Word Tokenizer (tsikchen)'):
            for sentence in sentences:
                tokens_sentences.append([
                    token.text for token in
                    main.pybo_tokenizer_tsikchen.tokenize(sentence)
                ])
    # Vietnamese
    elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'):
        if keep_sentences:
            sentences = wordless_sentence_tokenize(
                main,
                text,
                lang='vie',
                sentence_tokenizer='Underthesea - Vietnamese Sentence Tokenizer'
            )
        else:
            sentences = [text]

        for sentence in sentences:
            tokens_sentences.append(underthesea.word_tokenize(str(sentence)))

    # Remove empty tokens and strip whitespace
    for i, tokens in enumerate(tokens_sentences):
        tokens_sentences[i] = [
            token.strip() for token in tokens if token.strip()
        ]

    # Record token boundaries
    if lang in ['zho_cn', 'zho_tw', 'jpn']:
        for tokens in tokens_sentences:
            if tokens:
                tokens[-1] = wordless_text.Wordless_Token(tokens[-1],
                                                          boundary='',
                                                          sentence_ending=True)
    else:
        for tokens in tokens_sentences:
            if tokens:
                tokens[-1] = wordless_text.Wordless_Token(tokens[-1],
                                                          boundary=' ',
                                                          sentence_ending=True)

    return tokens_sentences
Example No. 18
 def __init__(self, lang: str):
     super().__init__()
     self.lang = lang
     self.moses = sacremoses.MosesTokenizer(lang)
     self.rm_accent = lang in self.LANG_WITHOUT_ACCENT
     self.ready = True
Example No. 19
def processLanguagePair(lgpair, keyfile_prefix, rawtranslations_glob,
                        lemtranslations_glob):
    # load sense keys from file
    sense_keys = []
    k = open(keyfile_prefix + ".key.txt", 'r', encoding='utf-8')
    for line in k:
        elements = line.strip().split("\t")
        t = (elements[0], elements[1], elements[2],
             tuple(elements[3].split(" ")), tuple(elements[4].split(" ")))
        sense_keys.append(t)
    k.close()

    # load domain keys from file
    indomain_keys = set()
    outdomain_keys = set()
    d = open(keyfile_prefix + ".domain.txt", 'r', encoding='utf-8')
    for line in d:
        elements = line.strip().split("\t")
        if elements[2] == "in":
            indomain_keys.add((elements[0], elements[1]))
        else:
            outdomain_keys.add((elements[0], elements[1]))
    d.close()

    # initialize tokenizer
    tokenizer = sacremoses.MosesTokenizer(lang=lgpair[-2:])

    # load and process submissions
    results = {}
    toksubmissions = sorted(glob.glob(rawtranslations_glob))
    lemsubmissions = sorted(glob.glob(lemtranslations_glob))
    for toksubmission, lemsubmission in zip(toksubmissions, lemsubmissions):

        if toksubmission.split("/")[-1] != lemsubmission.split(
                "/")[-1].replace(".parsed.toklemma", ""):
            print("Mismatch in filenames")
            print(toksubmission)
            print(lemsubmission)
            return

        counts = {
            "pos_in": 0,
            "pos_out": 0,
            "neg_in": 0,
            "neg_out": 0,
            "unk_in": 0,
            "unk_out": 0
        }
        tokf = open(toksubmission, 'r', encoding='utf-8')
        lemf = open(lemsubmission, 'r', encoding='utf-8')

        for tokline, lemline, key in zip(tokf, lemf, sense_keys):
            if (key[2], " ".join(key[3])) in indomain_keys:
                suffix = "_in"
            elif (key[2], " ".join(key[3])) in outdomain_keys:
                suffix = "_out"
            else:
                print("Domain not found:", (key[2], " ".join(key[3])))

            # first look in tokenized data
            tokwords = [
                x.lower()
                for x in tokenizer.tokenize(tokline.strip(), escape=False)
            ]
            posfound = any([posword in tokwords for posword in key[3]])
            negfound = any([negword in tokwords for negword in key[4]])

            # if not found, look in lemmatized data
            if (not posfound) and (not negfound):
                lemwords = lemline.strip().lower().split(" ")
                posfound = any([posword in lemwords for posword in key[3]])
                negfound = any([negword in lemwords for negword in key[4]])

            if posfound and not negfound:
                counts["pos" + suffix] += 1
            elif negfound:
                counts["neg" + suffix] += 1
            else:
                counts["unk" + suffix] += 1

        tokf.close()
        lemf.close()

        counts["cov_in"] = (counts["pos_in"] + counts["neg_in"]) / (
            counts["pos_in"] + counts["neg_in"] + counts["unk_in"])
        counts["cov_out"] = (counts["pos_out"] + counts["neg_out"]) / (
            counts["pos_out"] + counts["neg_out"] + counts["unk_out"])
        counts["cov_all"] = (
            counts["pos_in"] + counts["neg_in"] + counts["pos_out"] +
            counts["neg_out"]) / (counts["pos_in"] + counts["neg_in"] +
                                  counts["unk_in"] + counts["pos_out"] +
                                  counts["neg_out"] + counts["unk_out"])

        # Precision = pos / (pos+neg)
        counts["prec_in"] = 0 if counts["pos_in"] == 0 else counts[
            "pos_in"] / (counts["pos_in"] + counts["neg_in"])
        counts["prec_out"] = 0 if counts["pos_out"] == 0 else counts[
            "pos_out"] / (counts["pos_out"] + counts["neg_out"])
        counts["prec_all"] = 0 if (
            counts["pos_in"] + counts["pos_out"]) == 0 else (
                counts["pos_in"] +
                counts["pos_out"]) / (counts["pos_in"] + counts["neg_in"] +
                                      counts["pos_out"] + counts["neg_out"])

        # Recall = pos / (pos+unk)
        counts["rec_in"] = 0 if counts["pos_in"] == 0 else counts["pos_in"] / (
            counts["pos_in"] + counts["unk_in"])
        counts["rec_out"] = 0 if counts["pos_out"] == 0 else counts[
            "pos_out"] / (counts["pos_out"] + counts["unk_out"])
        counts["rec_all"] = 0 if (
            counts["pos_in"] + counts["pos_out"]) == 0 else (
                counts["pos_in"] +
                counts["pos_out"]) / (counts["pos_in"] + counts["unk_in"] +
                                      counts["pos_out"] + counts["unk_out"])

        counts["f1_in"] = 0 if (
            counts["prec_in"] + counts["rec_in"]
        ) == 0 else 2 * counts["prec_in"] * counts["rec_in"] / (
            counts["prec_in"] + counts["rec_in"])
        counts["f1_out"] = 0 if (
            counts["prec_out"] + counts["rec_out"]
        ) == 0 else 2 * counts["prec_out"] * counts["rec_out"] / (
            counts["prec_out"] + counts["rec_out"])
        counts["f1_all"] = 0 if (
            counts["prec_all"] + counts["rec_all"]
        ) == 0 else 2 * counts["prec_all"] * counts["rec_all"] / (
            counts["prec_all"] + counts["rec_all"])

        submissionName = toksubmission.split("/")[-1]
        results[submissionName] = counts

    print(lgpair.upper())
    print()
    print(
        "Submission\t\tInPos\tInNeg\tInUnk\tInCoverage\tInPrecision\tInRecall\tInFscore\t\tOutPos\tOutNeg\tOutUnk\tOutCoverage\tOutPrecision\tOutRecall\tOutFscore\t\tAllPos\tAllNeg\tAllUnk\tAllCoverage\tAllPrecision\tAllRecall\tAllFscore"
    )
    for submission, result in sorted(results.items(),
                                     key=lambda x: x[1]["f1_all"],
                                     reverse=True):
        s = submission
        s += "\t\t{}\t{}\t{}\t{:.2f}%\t{:.2f}%\t{:.2f}%\t{:.2f}%".format(
            result["pos_in"], result["neg_in"], result["unk_in"],
            100 * result["cov_in"], 100 * result["prec_in"],
            100 * result["rec_in"], 100 * result["f1_in"])
        s += "\t\t{}\t{}\t{}\t{:.2f}%\t{:.2f}%\t{:.2f}%\t{:.2f}%".format(
            result["pos_out"], result["neg_out"], result["unk_out"],
            100 * result["cov_out"], 100 * result["prec_out"],
            100 * result["rec_out"], 100 * result["f1_out"])
        s += "\t\t{}\t{}\t{}\t{:.2f}%\t{:.2f}%\t{:.2f}%\t{:.2f}%".format(
            result["pos_in"] + result["pos_out"],
            result["neg_in"] + result["neg_out"],
            result["unk_in"] + result["unk_out"], 100 * result["cov_all"],
            100 * result["prec_all"], 100 * result["rec_all"],
            100 * result["f1_all"])
        print(s)
    print()
Example No. 20
def wordless_word_tokenize(main,
                           text,
                           lang,
                           word_tokenizer='default',
                           flat_tokens=True):
    tokens_hierarchical = []

    if lang not in main.settings_global['word_tokenizers']:
        lang = 'other'

    if word_tokenizer == 'default':
        word_tokenizer = main.settings_custom['word_tokenization'][
            'word_tokenizers'][lang]

    # Check initialization status of word (and sentence) tokenizers
    if flat_tokens:
        wordless_text_utils.check_word_tokenizers(
            main, lang=lang, word_tokenizer=word_tokenizer)
    else:
        wordless_text_utils.check_tokenizers(main,
                                             lang=lang,
                                             word_tokenizer=word_tokenizer)

    # NLTK
    if 'NLTK' in word_tokenizer:
        sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('NLTK - Penn Treebank Tokenizer'):
            treebank_tokenizer = nltk.TreebankWordTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(
                    treebank_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Twitter Tokenizer'):
            tweet_tokenizer = nltk.TweetTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(tweet_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - NIST Tokenizer'):
            nist_tokenizer = nltk.tokenize.nist.NISTTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(nist_tokenizer.tokenize(sentence))
        elif word_tokenizer == main.tr('NLTK - Tok-tok Tokenizer'):
            toktok_tokenizer = nltk.ToktokTokenizer()

            for sentence in sentences:
                tokens_hierarchical.append(toktok_tokenizer.tokenize(sentence))
    # Sacremoses
    elif 'Sacremoses' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang)

        if word_tokenizer == main.tr('Sacremoses - Moses Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(
                lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_hierarchical.append(
                    moses_tokenizer.tokenize(sentence, escape=False))
        elif word_tokenizer == main.tr('Sacremoses - Penn Treebank Tokenizer'):
            moses_tokenizer = sacremoses.MosesTokenizer(
                lang=wordless_conversion.to_iso_639_1(main, lang))

            for sentence in sentences:
                tokens_hierarchical.append(
                    moses_tokenizer.penn_tokenize(sentence))
    # spaCy
    elif 'spaCy' in word_tokenizer:
        nlp = main.__dict__[f'spacy_nlp_{lang}']
        doc = nlp(text)
        # See Issue #3479: https://github.com/explosion/spaCy/issues/3479
        doc.is_parsed = True

        if flat_tokens:
            tokens_hierarchical.append([token.text for token in doc])
        else:
            for sentence in doc.sents:
                tokens_hierarchical.append(
                    [token.text for token in sentence.as_doc()])
    # syntok
    elif word_tokenizer == 'syntok - Word Tokenizer':
        syntok_tokenizer = syntok.tokenizer.Tokenizer()

        if flat_tokens:
            tokens_hierarchical.append(
                [token.value for token in syntok_tokenizer.tokenize(text)])
        else:
            for para in syntok.segmenter.analyze(text):
                for sentence in para:
                    tokens_hierarchical.append(
                        [token.value for token in sentence])
    # Chinese & Japanese
    elif ('jieba' in word_tokenizer or 'nagisa' in word_tokenizer
          or 'Wordless' in word_tokenizer):
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang=lang)

        # Chinese
        if word_tokenizer == main.tr('jieba - Chinese Word Tokenizer'):
            for sentence in sentences:
                tokens_hierarchical.append(jieba.cut(sentence))
        elif word_tokenizer == main.tr(
                'Wordless - Chinese Character Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # English
                            if wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_eng(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or wordless_checking_unicode.is_han(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_hierarchical.append(tokens)
        # Japanese
        elif word_tokenizer == main.tr('nagisa - Japanese Word Tokenizer'):
            import nagisa

            for sentence in sentences:
                tokens_hierarchical.append(nagisa.tagging(str(sentence)).words)
        elif word_tokenizer == main.tr('Wordless - Japanese Kanji Tokenizer'):
            for sentence in sentences:
                tokens = []
                non_han_start = 0

                for i, char in enumerate(sentence):
                    if i >= non_han_start:
                        if wordless_checking_unicode.is_han(char):
                            tokens.append(char)

                            non_han_start += 1
                        else:
                            # Japanese Kana
                            if wordless_checking_unicode.is_kana(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_kana(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='jpn'))

                                        non_han_start = i + j + 1

                                        break
                            # English
                            elif wordless_checking_unicode.is_eng(char):
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or not wordless_checking_unicode.is_eng(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='eng'))

                                        non_han_start = i + j + 1

                                        break
                            # Other Languages
                            else:
                                for j, char in enumerate(sentence[i:]):
                                    if i + j + 1 == len(
                                            sentence
                                    ) or wordless_checking_unicode.is_han(
                                            sentence[i + j + 1]):
                                        tokens.extend(
                                            wordless_word_tokenize(
                                                main,
                                                sentence[non_han_start:i + j +
                                                         1],
                                                lang='other'))

                                        non_han_start = i + j + 1

                                        break

                tokens_hierarchical.append(tokens)
    # Russian
    elif word_tokenizer == 'razdel - Russian Word Tokenizer':
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang='rus')

        for sentence in sentences:
            tokens_hierarchical.append(
                [token.text for token in razdel.tokenize(sentence)])
    # Thai
    elif 'PyThaiNLP' in word_tokenizer:
        # Preserve sentence boundaries
        sentences = wordless_sentence_tokenize(
            main,
            text,
            lang='tha',
            sentence_tokenizer='PyThaiNLP - Thai Sentence Tokenizer')

        if word_tokenizer == main.tr(
                'PyThaiNLP - Maximum Matching Algorithm + TCC'):
            for sentence in sentences:
                tokens_hierarchical.append(
                    pythainlp.tokenize.word_tokenize(sentence, engine='newmm'))
        elif word_tokenizer == main.tr(
                'PyThaiNLP - Maximum Matching Algorithm'):
            for sentence in sentences:
                tokens_hierarchical.append(
                    pythainlp.tokenize.word_tokenize(sentence, engine='mm'))
        elif word_tokenizer == main.tr('PyThaiNLP - Longest Matching'):
            for sentence in sentences:
                tokens_hierarchical.append(
                    pythainlp.tokenize.word_tokenize(
                        sentence, engine='longest-matching'))
    # Tibetan
    elif 'botok' in word_tokenizer:
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(main, text, lang='bod')

        botok_tokenizer = wordless_text_utils.check_botok_tokenizers(
            main, word_tokenizer)

        for sentence in sentences:
            tokens_hierarchical.append(
                [token.text for token in botok_tokenizer.tokenize(sentence)])
    # Vietnamese
    elif word_tokenizer == main.tr('Underthesea - Vietnamese Word Tokenizer'):
        if flat_tokens:
            sentences = [text]
        else:
            sentences = wordless_sentence_tokenize(
                main,
                text,
                lang='vie',
                sentence_tokenizer='Underthesea - Vietnamese Sentence Tokenizer'
            )

        for sentence in sentences:
            tokens_hierarchical.append(underthesea.word_tokenize(
                str(sentence)))

    # Remove empty tokens and strip whitespace
    for i, sentence in enumerate(tokens_hierarchical):
        tokens_hierarchical[i] = [
            token.strip() for token in sentence if token.strip()
        ]

    # Record token boundaries
    if lang in ['zho_cn', 'zho_tw', 'jpn']:
        for sentence in tokens_hierarchical:
            if sentence:
                sentence[-1] = wordless_text.Wordless_Token(
                    sentence[-1], boundary='', sentence_ending=True)
    else:
        for sentence in tokens_hierarchical:
            if sentence:
                sentence[-1] = wordless_text.Wordless_Token(
                    sentence[-1], boundary=' ', sentence_ending=True)

    # Clause tokenization
    if not flat_tokens:
        for i, sentence in enumerate(tokens_hierarchical):
            tokens_hierarchical[i] = wordless_clause_tokenize(
                main, sentence, lang)

    # Flatten tokens
    tokens_flat = list(wordless_misc.flatten_list(tokens_hierarchical))

    if flat_tokens:
        return tokens_flat
    else:
        return tokens_hierarchical
Example No. 21
            RegexpTokenizer(pattern=token_regexp).tokenize,
        ),
        (
            "UnicodeSegmentTokenizer(word_bounds=False)",
            UnicodeSegmentTokenizer(word_bounds=False).tokenize,
        ),
        (
            "UnicodeSegmentTokenizer(word_bounds=True)",
            UnicodeSegmentTokenizer(word_bounds=True).tokenize,
        ),
        ("VTextTokenizer('en')", VTextTokenizer("en").tokenize),
        ("CharacterTokenizer(4)", CharacterTokenizer(4).tokenize),
    ]

    if sacremoses is not None:
        db.append(("MosesTokenizer()", sacremoses.MosesTokenizer().tokenize))
    if spacy is not None:
        from spacy.lang.en import English

        db.append(("Spacy en", English().tokenizer))

    if blingfire is not None:
        db.append(
            ("BlingFire en", lambda x: blingfire.text_to_words(x).split(" ")))

    for label, func in db:
        t0 = time()

        out = []

        for idx, doc in enumerate(data):
Example No. 22
    def __init__(self,
                 special=None,
                 min_freq=0,
                 max_size=None,
                 lower_case=False,
                 delimiter=None,
                 vocab_file=None,
                 pretrained_vocab_file=None,
                 never_split=None,
                 unk_token="<unk>",
                 eos_token="<eos>",
                 additional_special_tokens=["<formula>"],
                 language="en",
                 **kwargs):
        super().__init__(unk_token=unk_token,
                         eos_token=eos_token,
                         additional_special_tokens=additional_special_tokens,
                         **kwargs)

        if never_split is None:
            never_split = self.all_special_tokens
        if special is None:
            special = []
        self.counter = Counter()
        self.special = special
        self.min_freq = min_freq
        self.max_size = max_size
        self.lower_case = lower_case
        self.delimiter = delimiter
        self.vocab_file = vocab_file
        self.never_split = never_split
        self.punctuation_symbols = '!"#$%&()*+,-./\\:;<=>?@[\\]^_`{|}~'
        self.punction_without_space_before_pattern = re.compile(
            r"[^\s][{}]".format(self.punctuation_symbols))
        self.punctuation_with_space_around_pattern = self._compile_space_around_punctuation_pattern(
        )
        self.language = language
        self.moses_punct_normalizer = sm.MosesPunctNormalizer(language)
        self.moses_tokenizer = sm.MosesTokenizer(language)
        self.moses_detokenizer = sm.MosesDetokenizer(language)

        try:
            if pretrained_vocab_file is not None:
                # Hack because, honestly this tokenizer was not made to be used
                # in a library like ours, at all.
                vocab_dict = torch.load(pretrained_vocab_file)
                for key, value in vocab_dict.items():
                    if key not in self.__dict__:
                        self.__dict__[key] = value

            if vocab_file is not None:
                self.build_vocab()
        except Exception:
            raise ValueError(
                "Unable to parse file {}. Unknown format. "
                "If you tried to load a model saved through TransfoXLTokenizerFast,"
                "please note they are not compatible.".format(
                    pretrained_vocab_file))

        if vocab_file is not None:
            self.build_vocab()
Example No. 23
 def init_moses(self, lang):
     self.moses_tokenizer = sacremoses.MosesTokenizer(lang['src'])
     self.moses_detokenizer = sacremoses.MosesDetokenizer(lang['tgt'])
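
A hedged round-trip sketch of the tokenizer/detokenizer pair set up above (the 'en' language code and the sample sentence are illustrative):

import sacremoses

tok = sacremoses.MosesTokenizer(lang='en')
detok = sacremoses.MosesDetokenizer(lang='en')
tokens = tok.tokenize("Hello, world!", escape=False)
print(tokens)                    # roughly: ['Hello', ',', 'world', '!']
print(detok.detokenize(tokens))  # roughly: 'Hello, world!'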
Example No. 24
 def __init__(self):
     self.question_generator = xlingqg.QuestionGenerator()
     self.translator = xlingqg.Translator()
     self.tokenizer = sacremoses.MosesTokenizer()
     self.detokenizer = sacremoses.MosesDetokenizer()
     self.answer_encoder = AnswerEncoder()
Example No. 25
 def __init__(self, lang):
     self.tokenizer = sacremoses.MosesTokenizer(lang)
Example No. 26
def whitespace_split(x):
    return x.split(" ")


tok_db = [
    # ("whitespace", lambda lang: whitespace_split),
    ("regexp", lambda lang: re.compile(r"\b\w\w+\b").findall),
    (
        "unicode-segmentation",
        lambda lang: UnicodeSegmentTokenizer(word_bounds=True).tokenize,
    ),
    ("vtext", lambda lang: VTextTokenizer(lang).tokenize),
]

if sacremoses is not None:
    tok_db.append(("MosesTokenizer", lambda lang: sacremoses.MosesTokenizer().tokenize))

if spacy is not None:

    def spacy_tokenizer(lang):
        if lang == "en":
            from spacy.lang.en import English as Nlp
        elif lang == "de":
            from spacy.lang.de import German as Nlp
        elif lang == "fr":
            from spacy.lang.fr import French as Nlp
        else:
            raise ValueError
        return Nlp().tokenizer

    tok_db.append(("spacy", spacy_tokenizer))
Example No. 27
    def __init__(self,
                 special=None,
                 min_freq=0,
                 max_size=None,
                 lower_case=False,
                 delimiter=None,
                 vocab_file=None,
                 pretrained_vocab_file: str = None,
                 never_split=None,
                 unk_token="<unk>",
                 eos_token="<eos>",
                 additional_special_tokens=["<formula>"],
                 language="en",
                 **kwargs):
        super().__init__(unk_token=unk_token,
                         eos_token=eos_token,
                         additional_special_tokens=additional_special_tokens,
                         **kwargs)

        if never_split is None:
            never_split = self.all_special_tokens
        if special is None:
            special = []
        self.counter = Counter()
        self.special = special
        self.min_freq = min_freq
        self.max_size = max_size
        self.lower_case = lower_case
        self.delimiter = delimiter
        self.vocab_file = vocab_file
        self.never_split = never_split
        self.punctuation_symbols = '!"#$%&()*+,-./\\:;<=>?@[\\]^_`{|}~'
        self.punction_without_space_before_pattern = re.compile(
            r"[^\s][{}]".format(self.punctuation_symbols))
        self.punctuation_with_space_around_pattern = self._compile_space_around_punctuation_pattern(
        )
        self.language = language
        self.moses_punct_normalizer = sm.MosesPunctNormalizer(language)
        self.moses_tokenizer = sm.MosesTokenizer(language)
        self.moses_detokenizer = sm.MosesDetokenizer(language)

        # This try... catch... is not beautiful but honestly this tokenizer was not made to be used
        # in a library like ours, at all.
        try:
            vocab_dict = None
            if pretrained_vocab_file is not None:
                # Priority on pickle files (support PyTorch and TF)
                with open(pretrained_vocab_file, "rb") as f:
                    vocab_dict = pickle.load(f)

                # Loading a torch-saved transfo-xl vocab dict with pickle results in an integer
                # Entering this if statement means that we tried to load a torch-saved file with pickle, and we failed.
                # We therefore load it with torch, if it's available.
                if type(vocab_dict) == int:
                    if not is_torch_available():
                        raise ImportError(
                            "Not trying to load dict with PyTorch as you need to install pytorch to load "
                            "from a PyTorch pretrained vocabulary, "
                            "or activate it with environment variables USE_TORCH=1 and USE_TF=0."
                        )
                    vocab_dict = torch.load(pretrained_vocab_file)

            if vocab_dict is not None:
                for key, value in vocab_dict.items():
                    if key not in self.__dict__:
                        self.__dict__[key] = value
            elif vocab_file is not None:
                self.build_vocab()

        except Exception as e:
            raise ValueError(
                "Unable to parse file {}. Unknown format. "
                "If you tried to load a model saved through TransfoXLTokenizerFast,"
                "please note they are not compatible.".format(
                    pretrained_vocab_file)) from e

        if vocab_file is not None:
            self.build_vocab()
Example No. 28
 def __init__(
     self,
     special=None,
     min_freq=0,
     max_size=None,
     lower_case=False,
     delimiter=None,
     vocab_file=None,
     pretrained_vocab_file=None,
     never_split=None,
     unk="<unk>",
     eos="<eos>",
     additional_special_tokens=["<formula>"],
     language="en",
     **kw,
 ):
     super().__init__(
         special=special,
         min_freq=min_freq,
         max_size=max_size,
         lower_case=lower_case,
         delimiter=delimiter,
         vocab_file=vocab_file,
         pretrained_vocab_file=pretrained_vocab_file,
         never_split=never_split,
         unk=unk,
         eos=eos,
         additional_special_tokens=additional_special_tokens,
         language=language,
         **kw,
     )
     if never_split is None:
         never_split = self.all_special_tokens
     if special is None:
         special = []
     self.counter = Counter()
     self.special = special
     self.min_freq = min_freq
     self.max_size = max_size
     self.lower_case = lower_case
     self.delimiter = delimiter
     self.vocab_file = vocab_file
     self.never_split = never_split
     self.punctuation_symbols = '!"#$%&()*+,-./\\:;<=>?@[\\]^_`{|}~'
     self.punction_without_space_before_pattern = re.compile(
         rf"[^\s][{self.punctuation_symbols}]"
     )
     self.punctuation_with_space_around_pattern = (
         self._compile_space_around_punctuation_pattern()
     )
     self.language = language
     self.moses_punct_normalizer = sm.MosesPunctNormalizer(language)
     self.moses_tokenizer = sm.MosesTokenizer(language)
     self.moses_detokenizer = sm.MosesDetokenizer(language)
     try:
         vocab_dict = None
         if pretrained_vocab_file is not None:
             with open(pretrained_vocab_file, "rb") as f:
                 vocab_dict = pickle.load(f)
             if type(vocab_dict) == int:
                 if not is_torch_available():
                     raise ImportError(
                         "Not trying to load dict with PyTorch as you need to install pytorch to load "
                         "from a PyTorch pretrained vocabulary, "
                         "or activate it with environment variables USE_TORCH=1 and USE_TF=0."
                     )
                 vocab_dict = torch.load(pretrained_vocab_file)
         if vocab_dict is not None:
             for key, value in vocab_dict.items():
                 if key not in self.__dict__:
                     self.__dict__[key] = value
         elif vocab_file is not None:
             self.build_vocab()
     except Exception as e:
         raise ValueError(
             f"Unable to parse file {pretrained_vocab_file}. Unknown format. "
             "If you tried to load a model saved through TokenizerFast, "
             "please note they are not compatible."
         ) from e
     if vocab_file is not None:
         self.build_vocab()
Example No. 29
else:
    ## Load training data
    spam_train_name = ''  # prefix for the train/dev CSV filenames
    train_df = pd.read_csv('/scratch/xl3119/'+spam_train_name+'train.csv')
    val_df = pd.read_csv('/scratch/xl3119/'+spam_train_name+'dev.csv')

    train_texts, train_labels, train_rating = list(train_df.review), list(train_df.label), list(train_df.rating)
    val_texts, val_labels, val_rating     = list(val_df.review), list(val_df.label), list(val_df.rating)

    print(
        f"Train size: {len(train_labels)}\n"
        f"Val size: {len(val_labels)}\n"
    )

    ## Tokenize data
    tokenizer = sacremoses.MosesTokenizer()
    train_data_indices, train_labels = featurize(train_texts, train_labels, tokenizer, vocab)
    val_data_indices, val_labels = featurize(val_texts, val_labels, tokenizer, vocab)

    pickle_fake_news = {'train_indices': train_data_indices,
                        'train_labels': train_labels,
                        'train_rating': train_rating,
                        'val_indices': val_data_indices,
                        'val_labels': val_labels,
                        'val_rating': val_rating,}

    pickle.dump(pickle_fake_news,open(tokens_save_dir, "wb"))
    print('Data has been saved')

## Build data loader
train_dataset = Fake_News_Dataset(train_data_indices, train_labels, max_sent_length)
Example No. 30
    return x.split(" ")


tok_db = [
    # ("whitespace", lambda lang: whitespace_split),
    ("regexp", lambda lang: re.compile(r"\b\w\w+\b").findall),
    (
        "unicode-segmentation",
        lambda lang: UnicodeWordTokenizer(word_bounds=True).tokenize,
    ),
    ("vtext", lambda lang: VTextTokenizer(lang).tokenize),
]

if sacremoses is not None:
    tok_db.append(
        ("MosesTokenizer", lambda lang: sacremoses.MosesTokenizer().tokenize))

if spacy is not None:

    def spacy_tokenizer(lang):
        if lang == "en":
            from spacy.lang.en import English as Nlp
        elif lang == "de":
            from spacy.lang.de import German as Nlp
        elif lang == "fr":
            from spacy.lang.fr import French as Nlp
        else:
            raise ValueError
        return Nlp().tokenizer

    tok_db.append(("spacy", spacy_tokenizer))