def test_quoted_expressions_with_ascii(self):
    """Unicode letters and double-dashes survive unless ascii-normalized."""
    expression = [
        "Julius ", u"Cæsar ", "declared ", "-- ", "professed ", "- ",
        "his ", "passion ", "for ", "wine ", "A", ".",
    ]
    text = "".join(expression)
    # Without normalization, tokens round-trip exactly.
    self.assertEqual(tokenize(text, normalize_ascii=False), expression)
    # With normalization, "æ" becomes "ae" and "--" collapses to "-".
    expected = [tok.replace(u"æ", "ae").replace("--", "-") for tok in expression]
    self.assertEqual(tokenize(text, normalize_ascii=True), expected)
def test_pre_post_quote(self):
    """Paired '' quote markers around a title tokenize as their own tokens."""
    expression = [
        "On ", "January ", "28", ", ", "2011 ", ", ", "''",
        "Hollywood ", "Reporter", "'' ", "announced ", "that ",
        "Paramount ", "Pictures ", "had ", "given ", "the ",
        "green ", "light", ".",
    ]
    self.assertEqual(tokenize("".join(expression)), expression)
def featurize_example(question, context, vocab):
    """Convert a (question, context) pair into index-based model features.

    Returns ((question_idxs, context_idxs, same_as_question, repeated_words,
    repeated_intensity, sent_lengths), context_sents).
    """
    # Convert question words to vocabulary indices.
    question_idxs = [
        vocab.word_to_idx(normalize(token))
        for token in ciseau.tokenize(question, normalize_ascii=False)
    ]

    context_sents = ciseau.sent_tokenize(
        context, keep_whitespace=True, normalize_ascii=False)
    # + 1 per sentence for its end-of-sentence marker.
    sent_lengths = [len(sentence) + 1 for sentence in context_sents]

    context_idxs = []
    for sentence in context_sents:
        context_idxs.extend(
            vocab.word_to_idx(normalize(token)) for token in sentence)
        context_idxs.append(vocab.eos)

    same_as_question = same_as_question_feature(
        question_idxs, context_idxs, vocab)
    repeated_words, repeated_intensity = repeated_word_features(
        context_idxs, vocab)

    features = (question_idxs, context_idxs, same_as_question,
                repeated_words, repeated_intensity, sent_lengths)
    return features, context_sents
def tokenize_example(question, context, answers, strip_labels=True):
    """Tokenize a QA example and locate the answer span by token offsets.

    The answer substring is swapped for a placeholder before sentence
    tokenization so its token position can be recovered afterwards.
    Returns (token_question, token_context, sentence_label, answer_start,
    answer_end) where answer_start/answer_end index tokens inside the
    sentence at sentence_label.
    """
    # Q: How should we choose the right answer
    answer = answers[0]["text"]
    answer_start = answers[0]["answer_start"]
    if strip_labels:
        answer_tokens = ciseau.tokenize(answer, normalize_ascii=False)
        start_offset, end_offset = normalize_answer_tokens(answer_tokens)
        answer = "".join(answer_tokens[start_offset:end_offset])
        # add back the piece that was stripped off:
        answer_start = answer_start + len("".join(
            answer_tokens[:start_offset]))

    # replace answer string with placeholder
    placeholder = "XXXX"
    new_context = context[:answer_start] + placeholder + context[answer_start + len(answer):]

    token_context = ciseau.sent_tokenize(new_context, keep_whitespace=True)
    token_question = ciseau.tokenize(question)

    sentence_label = None
    for sent_idx, sent in enumerate(token_context):
        # answer_start is reused here as a token index within the sentence.
        answer_start = None
        for idx, word in enumerate(sent):
            if placeholder in word:
                answer_start = idx
                break
        if answer_start is None:
            continue
        sentence_label = sent_idx
        # deal with cases where the answer is in the middle
        # of the word
        answer = word.replace(placeholder, answer)
        token_answer = ciseau.tokenize(answer)
        answer_end = answer_start + len(token_answer) - 1
        # Splice the re-tokenized answer back in place of the placeholder word.
        answer_sent = sent[:answer_start] + token_answer + sent[answer_start + 1:]
        break
    # NOTE(review): if the placeholder is never found in any sentence,
    # sentence_label stays None and answer_sent/answer_end are unbound,
    # so the next line raises — presumably upstream guarantees the
    # placeholder survives tokenization intact; confirm.
    token_context[sentence_label] = answer_sent

    return token_question, token_context, sentence_label, answer_start, answer_end
def test_abbreviations(self):
    """Abbreviations (Mr., DR., sgt., single initials) keep their periods."""
    expression = [
        "Mr. ", "hooligan ", "and ", "his ", "brother ", "DR. ",
        "strange ", "know ", "each ", "other ", "well ", "said ",
        "d. ", "A. ", "Joe ", "the ", "sgt. ", "in ", "charge ",
        "of ", "all ", "this ", "bs", ".",
    ]
    self.assertEqual(tokenize("".join(expression)), expression)
def test_numerical_commas_periods_expressions(self):
    """Digits keep internal commas/periods; clause punctuation splits off."""
    expression = [
        "In ", "the ", "year ", "2000", ", ", "there ", "was ",
        "evidence ", "that ", "100,000 ", "martians ", "came ",
        "to ", "see ", "us", ", ", "but ", "I ", "did", "n't ",
        "even ", "hear ", "98.2", ",", "98.3 ", "or ", "98.4 ",
        "speak ", "about ", "it", ",",
    ]
    self.assertEqual(tokenize("".join(expression)), expression)
def test_weird_hybrid_expressions(self):
    """Possessives, decades, and '<3'-style glyph mixes tokenize cleanly."""
    expression = [
        u"Beyoncé", u"'s ", u"1840", u"'s ", u"song ",
        u"<", u"3lovely", u".",
    ]
    self.assertEqual(tokenize("".join(expression)), expression)
def test_quoted_expressions_with_ascii(self):
    """Check both normalize_ascii modes on text with 'æ' and '--'."""
    expression = [
        "Julius ", u"Cæsar ", "declared ", "-- ", "professed ",
        "- ", "his ", "passion ", "for ", "wine ", "A", ".",
    ]
    joined = "".join(expression)

    # Raw mode: tokens come back untouched.
    self.assertEqual(
        tokenize(joined, normalize_ascii=False),
        expression,
    )

    # Normalized mode: ligature and em-dash-like '--' are simplified.
    normalized = [
        token.replace(u"æ", "ae").replace("--", "-")
        for token in expression
    ]
    self.assertEqual(
        tokenize(joined, normalize_ascii=True),
        normalized,
    )
def test_abbreviations(self):
    """Trailing dots of abbreviations stay fused; final period splits."""
    expression = [
        "Mr. ", "hooligan ", "and ", "his ", "brother ",
        "DR. ", "strange ", "know ", "each ", "other ",
        "well ", "said ", "d. ", "A. ", "Joe ", "the ",
        "sgt. ", "in ", "charge ", "of ", "all ", "this ",
        "bs", ".",
    ]
    text = "".join(expression)
    self.assertEqual(tokenize(text), expression)
def test_em_dash(self):
    """Unicode em dashes split into standalone tokens without normalization."""
    expression = [
        u"The ", u"earthquake ", u"was ", u"also ", u"felt ", u"in ",
        u"nearby ", u"countries ", u"and ", u"as ", u"far ", u"away ",
        u"as ", u"both ", u"Beijing ", u"and ", u"Shanghai ", u"—",
        u"1,500 ", u"km ", u"(", u"930 ", u"mi ", u") ", u"and ",
        u"1,700 ", u"km ", u"(", u"1,060 ", u"mi ", u") ", u"away",
        u"—", u"where ", u"office ", u"buildings ", u"swayed ",
        u"with ", u"the ", u"tremor", u".",
    ]
    self.assertEqual(
        tokenize("".join(expression), normalize_ascii=False),
        expression)
def build_vocabulary(datadir, outdir, glove_path):
    """Construct the vocabulary object used throughout.

    Counts every normalized token in both questions and contexts across
    all splits, writes the vocab (special symbols first, then words by
    descending frequency) to ``outdir/vocab.txt``, and returns a Vocab.
    """
    # We're not going to backprop through the word vectors;
    # both train and dev words end up in the vocab.
    counter = Counter()
    for split in splits:
        datapath = os.path.join(datadir, split + ".json")
        for question, context, _, _ in data_stream(datapath):
            # Questions and contexts contribute identically to the counts.
            for text in (question, context):
                for word in ciseau.tokenize(text, normalize_ascii=False):
                    counter[normalize(word)] += 1

    common_words = [UNK, SOS, EOS, PAD]
    common_words.extend(word for word, _ in counter.most_common())

    vocab_path = os.path.join(outdir, "vocab.txt")
    with io.open(vocab_path, "w", encoding="utf8") as handle:
        handle.write("\n".join(common_words))

    return Vocab(outdir)
def test_quoted_expressions(self):
    """Parentheses and contractions ('ll) are separated from their hosts."""
    expression = [
        "(", "in ", "2008", ") ", "the ", "Martians ", "arrived ",
        "and ", "you", "'ll ", "see ", "what ", "I ", "mean", ".",
    ]
    self.assertEqual(tokenize("".join(expression)), expression)
def test_pre_post_quote(self):
    """Opening and closing '' markers become standalone tokens."""
    expression = [
        "On ", "January ", "28", ", ", "2011 ", ", ",
        "''", "Hollywood ", "Reporter", "'' ",
        "announced ", "that ", "Paramount ", "Pictures ",
        "had ", "given ", "the ", "green ", "light", ".",
    ]
    text = "".join(expression)
    self.assertEqual(tokenize(text), expression)
def test_numerical_commas_periods_expressions(self):
    """Numbers like 100,000 and 98.2 keep their internal punctuation."""
    expression = [
        "In ", "the ", "year ", "2000", ", ", "there ",
        "was ", "evidence ", "that ", "100,000 ",
        "martians ", "came ", "to ", "see ", "us", ", ",
        "but ", "I ", "did", "n't ", "even ", "hear ",
        "98.2", ",", "98.3 ", "or ", "98.4 ", "speak ",
        "about ", "it", ",",
    ]
    text = "".join(expression)
    self.assertEqual(tokenize(text), expression)
def test_em_dash(self):
    """An em dash with no surrounding spaces still separates its neighbors."""
    expression = [
        u"The ", u"earthquake ", u"was ", u"also ", u"felt ",
        u"in ", u"nearby ", u"countries ", u"and ", u"as ",
        u"far ", u"away ", u"as ", u"both ", u"Beijing ",
        u"and ", u"Shanghai ", u"—", u"1,500 ", u"km ",
        u"(", u"930 ", u"mi ", u") ", u"and ", u"1,700 ",
        u"km ", u"(", u"1,060 ", u"mi ", u") ", u"away",
        u"—", u"where ", u"office ", u"buildings ",
        u"swayed ", u"with ", u"the ", u"tremor", u".",
    ]
    text = "".join(expression)
    self.assertEqual(
        tokenize(text, normalize_ascii=False),
        expression,
    )
def retokenize_example(x, y):
    """Re-tokenize pre-split tokens *x* with ciseau and realign labels *y*.

    x: sequence of token strings; y: labels aligned one-to-one with x.
    Returns (new_tokens, new_labels): the ciseau tokens (right-stripped)
    and, for each, the label of the original token it overlaps, matched
    by cumulative character offset.
    """
    tokens = ciseau.tokenize(" ".join(w for w in x), normalize_ascii=False)
    out_y = []
    # Walk both token streams by cumulative character length. The +1 below
    # accounts for the space the join inserts between original tokens; the
    # final token has no trailing space, hence the -1 adjustments.
    regular_cursor = 0
    tokens_length_total = 0
    regular_length_total = len(x[regular_cursor]) + 1 if len(x) > 0 else 0
    if regular_cursor + 1 == len(x):
        regular_length_total -= 1
    for i in range(len(tokens)):
        tokens_length_total = tokens_length_total + len(tokens[i])
        # Advance the original-token cursor until its cumulative span
        # covers the end of the current ciseau token.
        while regular_length_total < tokens_length_total:
            regular_cursor += 1
            regular_length_total = regular_length_total + len(x[regular_cursor]) + 1
            if regular_cursor + 1 == len(x):
                regular_length_total -= 1
        out_y.append(y[regular_cursor])
    # Both streams must finish on the last original token, or the
    # alignment drifted somewhere above.
    assert(regular_cursor + 1 == len(x)), "error with %r" % (x,)
    return ([tok.rstrip() for tok in tokens], out_y)
def ciseauTokenizer(sentence):
    """Tokenize *sentence* using ciseau's default settings."""
    tokens = ciseau.tokenize(sentence)
    return tokens
def test_weird_hybrid_expressions(self):
    """Mixed unicode, possessives, and symbol-digit runs stay well-formed."""
    expression = [
        u"Beyoncé", u"'s ",
        u"1840", u"'s ",
        u"song ",
        u"<", u"3lovely", u".",
    ]
    text = "".join(expression)
    self.assertEqual(tokenize(text), expression)
def test_quoted_expressions(self):
    """Parenthesized spans and the 'll contraction split correctly."""
    expression = [
        "(", "in ", "2008", ") ",
        "the ", "Martians ", "arrived ", "and ",
        "you", "'ll ", "see ", "what ", "I ", "mean", ".",
    ]
    text = "".join(expression)
    self.assertEqual(tokenize(text), expression)
def split(sentence):
    """Return the ciseau tokenization of *sentence*."""
    result = ciseau.tokenize(sentence)
    return result
def strsplit(t):
    """Tokenize *t*, strip spaces inside tokens, and drop bare punctuation."""
    punctuation = ['.', ',', '-', ';', '(', ')']
    despaced = (token.replace(" ", "") for token in ciseau.tokenize(t))
    return [token for token in despaced if token not in punctuation]
import ciseau

# Stream the XML file line by line, echoing each raw line and then
# its ciseau tokenization.
with open('test_documents.xml', 'r') as f:
    # read the XML file
    for line in f:
        print(line)
        print(ciseau.tokenize(line))