Example #1
def transform(self, docs):
    # Build one row per document by summing weighted vectors for its words.
    docvecs = np.zeros((len(docs), self.gram_length))
    print("making vectors")
    for index, doc in enumerate(tqdm_notebook(docs)):
        for word, count in Counter(text_to_words(doc)).items():
            # scale each word's vector by its count and damp it by the
            # document-frequency count stored in self.idf by fit()
            v = (self[word] * count) / (1 + self.idf[word])
            docvecs[index] += v
    return docvecs
Example #2
def fix_author_text(s):
    """Author text gets special treatment.
    No de-dashing, no tokenization, and 
    replace periods by white space.
    """
    if pd.isnull(s):
        return ''
    s = unidecode(s)
    # fix cases when quotes are repeated
    s = re.sub('"+', '"', s)
    # no periods as those make author first letter matching hard
    s = re.sub(r'\.', ' ', s)
    s = replace_special_whitespace_chars(s)
    s = standardize_whitespace_length(s)
    return text_to_words(s).lower().strip()
Example #3
def encode_lines(self, lines):
    """
    Encode a set of lines. All lines will be encoded together.
    """
    enc_lines = []
    for line in lines:
        line = line.strip()
        if len(line) == 0 and not self.args.keep_empty:
            return ["EMPTY", None]
        if self.args.tokenizer == 'bpe':
            tokens = self.encode(line)
            enc_lines.append(" ".join(tokens))
        else:
            enc_lines.append(text_to_words(line))
    return ["PASS", enc_lines]
Example #4
def fix_text(s):
    """General purpose text fixing using nlpre package
    and then tokenizing with blingfire
    """
    if pd.isnull(s):
        return ''
    s = unidecode(s)
    # fix cases when quotes are repeated
    s = re.sub('"+', '"', s)
    # dashes make quote matching difficult
    s = re.sub('-', ' ', s)
    s = replace_special_whitespace_chars(s)
    # tokenize
    s = text_to_words(s).lower().strip()
    # note: removing single non-alphanumerics
    # means that we will match ngrams that are
    # usually separated by e.g. commas in the text;
    # this will increase the number of matches but
    # also surface more false positives
    return remove_single_non_alphanumerics(s)
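Example #2 and Example #4 look like two variants of the same cleanup pipeline: fix_author_text skips the de-dashing and the removal of single non-alphanumerics that fix_text performs, and instead turns periods into whitespace so that author initials are easier to match.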
Example #5
            UnicodeSegmentTokenizer(word_bounds=True).tokenize,
        ),
        ("VTextTokenizer('en')", VTextTokenizer("en").tokenize),
        ("CharacterTokenizer(4)", CharacterTokenizer(4).tokenize),
    ]

    if sacremoses is not None:
        db.append(("MosesTokenizer()", sacremoses.MosesTokenizer().tokenize))
    if spacy is not None:
        from spacy.lang.en import English

        db.append(("Spacy en", English().tokenizer))

    if blingfire is not None:
        db.append(
            ("BlingFire en", lambda x: blingfire.text_to_words(x).split(" ")))

    for label, func in db:
        t0 = time()

        out = []

        for idx, doc in enumerate(data):
            out.append(func(doc))

        dt = time() - t0

        n_tokens = sum(len(tok) for tok in out)

        print("{:>45}: {:.2f}s [{:.1f} MB/s, {:.0f} kWPS]".format(
            label, dt, dataset_size / dt, n_tokens * 1e-3 / dt))
Example #6
def fit(self, docs):
    # document frequency: for each word, count how many documents contain it
    self.idf = defaultdict(int)
    for doc in docs:
        for word in set(text_to_words(doc)):
            self.idf[word] += 1
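Examples #1 and #6 appear to be the transform and fit halves of one vectorizer class: fit() fills self.idf with per-word document counts, and transform() then sums each document's word vectors, scaling every vector by the word's in-document count and dividing by one plus its document frequency, yielding a matrix with one row per document.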
Example #7
def blingf_tokenizer(s: str):
    return text_to_words(s)
Example #8
def bling_tokenizer(lang):
    # lang is accepted for interface compatibility but not used here
    return lambda x: blingfire.text_to_words(x).split(" ")
Example #9
import sys
from blingfire import text_to_words

for line in sys.stdin:
    if line.strip():
        print(text_to_words(line.strip()))
    else:
        print('')
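Example #9 is a stand-alone filter script: it reads raw text from stdin and prints one space-delimited token string per input line, preserving empty lines. Assuming it were saved as tokenize_stdin.py (an illustrative name), it could be run as:

python tokenize_stdin.py < input.txt > output.tok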
Example #10
def word_tokenize(sent):
    return text_to_words(sent).split(' ')
Example #11
def word_tokenize(string):
    """Tokenize space delimited string with blingfire."""
    return text_to_words(string).split(' ')
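Most of the wrappers above rely on the same behaviour: blingfire's text_to_words returns one space-delimited string of tokens, which the caller then splits back into a list. A minimal self-contained sketch of that pattern (the sample sentence and variable names are illustrative):

from blingfire import text_to_words

sentence = "Tokenizers split punctuation, e.g. commas, into separate tokens."
token_string = text_to_words(sentence)  # single string, tokens separated by spaces
tokens = token_string.split(" ")        # back to a list of tokens
print(tokens)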