Example #1
def transform(self, docs):
    # Build one row per document by summing weighted vectors for its words.
    docvecs = np.zeros((len(docs), self.gram_length))
    print("making vectors")
    for index, doc in enumerate(tqdm_notebook(docs)):
        for word, count in Counter(text_to_words(doc)).items():
            # scale each word's vector by its count and damp it by the
            # document-frequency count stored in self.idf by fit()
            v = (self[word] * count) / (1 + self.idf[word])
            docvecs[index] += v
    return docvecs
Example #2
def fix_author_text(s):
    """Author text gets special treatment.
    No de-dashing, no tokenization, and 
    replace periods by white space.
    """
    if pd.isnull(s):
        return ''
    s = unidecode(s)
    # fix cases when quotes are repeated
    s = re.sub('"+', '"', s)
    # no periods as those make author first letter matching hard
    s = re.sub(r'\.', ' ', s)
    s = replace_special_whitespace_chars(s)
    s = standardize_whitespace_length(s)
    return text_to_words(s).lower().strip()
Example #3
def encode_lines(self, lines):
    """
    Encode a set of lines. All lines will be encoded together.
    """
    enc_lines = []
    for line in lines:
        line = line.strip()
        if len(line) == 0 and not self.args.keep_empty:
            return ["EMPTY", None]
        if self.args.tokenizer == 'bpe':
            tokens = self.encode(line)
            enc_lines.append(" ".join(tokens))
        else:
            enc_lines.append(text_to_words(line))
    return ["PASS", enc_lines]
Example #4
def fix_text(s):
    """General purpose text fixing using nlpre package
    and then tokenizing with blingfire
    """
    if pd.isnull(s):
        return ''
    s = unidecode(s)
    # fix cases when quotes are repeated
    s = re.sub('"+', '"', s)
    # dashes make quote matching difficult
    s = re.sub('-', ' ', s)
    s = replace_special_whitespace_chars(s)
    # tokenize
    s = text_to_words(s).lower().strip()
    # note: removing single non-alphanumerics
    # means that we will match ngrams that are
    # usually separated by e.g. commas in the text;
    # this will increase the number of matches but
    # also surface more false positives
    return remove_single_non_alphanumerics(s)
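Example #2 and Example #4 look like two variants of the same cleanup pipeline: fix_author_text skips the de-dashing and the removal of single non-alphanumerics that fix_text performs, and instead turns periods into whitespace so that author initials are easier to match.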
Example #5
            UnicodeSegmentTokenizer(word_bounds=True).tokenize,
        ),
        ("VTextTokenizer('en')", VTextTokenizer("en").tokenize),
        ("CharacterTokenizer(4)", CharacterTokenizer(4).tokenize),
    ]

    if sacremoses is not None:
        db.append(("MosesTokenizer()", sacremoses.MosesTokenizer().tokenize))
    if spacy is not None:
        from spacy.lang.en import English

        db.append(("Spacy en", English().tokenizer))

    if blingfire is not None:
        db.append(
            ("BlingFire en", lambda x: blingfire.text_to_words(x).split(" ")))

    for label, func in db:
        t0 = time()

        out = []

        for idx, doc in enumerate(data):
            out.append(func(doc))

        dt = time() - t0

        n_tokens = sum(len(tok) for tok in out)

        print("{:>45}: {:.2f}s [{:.1f} MB/s, {:.0f} kWPS]".format(
            label, dt, dataset_size / dt, n_tokens * 1e-3 / dt))
Example #6
def fit(self, docs):
    # document frequency: for each word, count how many documents contain it
    self.idf = defaultdict(int)
    for doc in docs:
        for word in set(text_to_words(doc)):
            self.idf[word] += 1
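Examples #1 and #6 appear to be the transform and fit halves of one vectorizer class: fit() fills self.idf with per-word document counts, and transform() then sums each document's word vectors, scaling every vector by the word's in-document count and dividing by one plus its document frequency, yielding a matrix with one row per document.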
Example #7
def blingf_tokenizer(s: str):
    return text_to_words(s)
Example #8
def bling_tokenizer(lang):
    # lang is accepted for interface compatibility but not used here
    return lambda x: blingfire.text_to_words(x).split(" ")
Example #9
import sys
from blingfire import text_to_words

for line in sys.stdin:
    if line.strip():
        print(text_to_words(line.strip()))
    else:
        print('')
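Example #9 is a stand-alone filter script: it reads raw text from stdin and prints one space-delimited token string per input line, preserving empty lines. Assuming it were saved as tokenize_stdin.py (an illustrative name), it could be run as:

python tokenize_stdin.py < input.txt > output.tok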
Example #10
def word_tokenize(sent):
    return text_to_words(sent).split(' ')
Example #11
def word_tokenize(string):
    """Tokenize space delimited string with blingfire."""
    return text_to_words(string).split(' ')
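Most of the wrappers above rely on the same behaviour: blingfire's text_to_words returns one space-delimited string of tokens, which the caller then splits back into a list. A minimal self-contained sketch of that pattern (the sample sentence and variable names are illustrative):

from blingfire import text_to_words

sentence = "Tokenizers split punctuation, e.g. commas, into separate tokens."
token_string = text_to_words(sentence)  # single string, tokens separated by spaces
tokens = token_string.split(" ")        # back to a list of tokens
print(tokens)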