def test_quoted_expressions_with_ascii(self):
    """Unicode letters and double-dashes survive unless ascii-normalized."""
    expression = [
        "Julius ", u"Cæsar ", "declared ", "-- ", "professed ", "- ",
        "his ", "passion ", "for ", "wine ", "A", ".",
    ]
    text = "".join(expression)
    # Without normalization, tokens round-trip exactly.
    self.assertEqual(tokenize(text, normalize_ascii=False), expression)
    # With normalization, "æ" becomes "ae" and "--" collapses to "-".
    expected = [tok.replace(u"æ", "ae").replace("--", "-") for tok in expression]
    self.assertEqual(tokenize(text, normalize_ascii=True), expected)
def test_pre_post_quote(self):
    """Paired '' quote markers around a title tokenize as their own tokens."""
    expression = [
        "On ", "January ", "28", ", ", "2011 ", ", ", "''",
        "Hollywood ", "Reporter", "'' ", "announced ", "that ",
        "Paramount ", "Pictures ", "had ", "given ", "the ",
        "green ", "light", ".",
    ]
    self.assertEqual(tokenize("".join(expression)), expression)
def featurize_example(question, context, vocab):
    """Convert a (question, context) pair into index-based model features.

    Returns ((question_idxs, context_idxs, same_as_question, repeated_words,
    repeated_intensity, sent_lengths), context_sents).
    """
    # Convert question words to vocabulary indices.
    question_idxs = [
        vocab.word_to_idx(normalize(token))
        for token in ciseau.tokenize(question, normalize_ascii=False)
    ]

    context_sents = ciseau.sent_tokenize(
        context, keep_whitespace=True, normalize_ascii=False)
    # + 1 per sentence for its end-of-sentence marker.
    sent_lengths = [len(sentence) + 1 for sentence in context_sents]

    context_idxs = []
    for sentence in context_sents:
        context_idxs.extend(
            vocab.word_to_idx(normalize(token)) for token in sentence)
        context_idxs.append(vocab.eos)

    same_as_question = same_as_question_feature(
        question_idxs, context_idxs, vocab)
    repeated_words, repeated_intensity = repeated_word_features(
        context_idxs, vocab)

    features = (question_idxs, context_idxs, same_as_question,
                repeated_words, repeated_intensity, sent_lengths)
    return features, context_sents
def tokenize_example(question, context, answers, strip_labels=True):
    """Tokenize a QA example and locate the answer span by token offsets.

    The answer substring is swapped for a placeholder before sentence
    tokenization so its token position can be recovered afterwards.
    Returns (token_question, token_context, sentence_label, answer_start,
    answer_end) where answer_start/answer_end index tokens inside the
    sentence at sentence_label.
    """
    # Q: How should we choose the right answer
    answer = answers[0]["text"]
    answer_start = answers[0]["answer_start"]
    if strip_labels:
        answer_tokens = ciseau.tokenize(answer, normalize_ascii=False)
        start_offset, end_offset = normalize_answer_tokens(answer_tokens)
        answer = "".join(answer_tokens[start_offset:end_offset])
        # add back the piece that was stripped off:
        answer_start = answer_start + len("".join(
            answer_tokens[:start_offset]))

    # replace answer string with placeholder
    placeholder = "XXXX"
    new_context = context[:answer_start] + placeholder + context[answer_start + len(answer):]

    token_context = ciseau.sent_tokenize(new_context, keep_whitespace=True)
    token_question = ciseau.tokenize(question)

    sentence_label = None
    for sent_idx, sent in enumerate(token_context):
        # answer_start is reused here as a token index within the sentence.
        answer_start = None
        for idx, word in enumerate(sent):
            if placeholder in word:
                answer_start = idx
                break
        if answer_start is None:
            continue
        sentence_label = sent_idx
        # deal with cases where the answer is in the middle
        # of the word
        answer = word.replace(placeholder, answer)
        token_answer = ciseau.tokenize(answer)
        answer_end = answer_start + len(token_answer) - 1
        # Splice the re-tokenized answer back in place of the placeholder word.
        answer_sent = sent[:answer_start] + token_answer + sent[answer_start + 1:]
        break
    # NOTE(review): if the placeholder is never found in any sentence,
    # sentence_label stays None and answer_sent/answer_end are unbound,
    # so the next line raises — presumably upstream guarantees the
    # placeholder survives tokenization intact; confirm.
    token_context[sentence_label] = answer_sent

    return token_question, token_context, sentence_label, answer_start, answer_end
def test_abbreviations(self):
    """Abbreviations (Mr., DR., sgt., single initials) keep their periods."""
    expression = [
        "Mr. ", "hooligan ", "and ", "his ", "brother ", "DR. ",
        "strange ", "know ", "each ", "other ", "well ", "said ",
        "d. ", "A. ", "Joe ", "the ", "sgt. ", "in ", "charge ",
        "of ", "all ", "this ", "bs", ".",
    ]
    self.assertEqual(tokenize("".join(expression)), expression)
def test_numerical_commas_periods_expressions(self):
    """Digits keep internal commas/periods; clause punctuation splits off."""
    expression = [
        "In ", "the ", "year ", "2000", ", ", "there ", "was ",
        "evidence ", "that ", "100,000 ", "martians ", "came ",
        "to ", "see ", "us", ", ", "but ", "I ", "did", "n't ",
        "even ", "hear ", "98.2", ",", "98.3 ", "or ", "98.4 ",
        "speak ", "about ", "it", ",",
    ]
    self.assertEqual(tokenize("".join(expression)), expression)
def test_weird_hybrid_expressions(self):
    """Possessives, decades, and '<3'-style glyph mixes tokenize cleanly."""
    expression = [
        u"Beyoncé", u"'s ", u"1840", u"'s ", u"song ",
        u"<", u"3lovely", u".",
    ]
    self.assertEqual(tokenize("".join(expression)), expression)
def test_quoted_expressions_with_ascii(self):
    """Check both normalize_ascii modes on text with 'æ' and '--'."""
    expression = [
        "Julius ", u"Cæsar ", "declared ", "-- ", "professed ",
        "- ", "his ", "passion ", "for ", "wine ", "A", ".",
    ]
    joined = "".join(expression)

    # Raw mode: tokens come back untouched.
    self.assertEqual(
        tokenize(joined, normalize_ascii=False),
        expression,
    )

    # Normalized mode: ligature and em-dash-like '--' are simplified.
    normalized = [
        token.replace(u"æ", "ae").replace("--", "-")
        for token in expression
    ]
    self.assertEqual(
        tokenize(joined, normalize_ascii=True),
        normalized,
    )
def test_abbreviations(self):
    """Trailing dots of abbreviations stay fused; final period splits."""
    expression = [
        "Mr. ", "hooligan ", "and ", "his ", "brother ",
        "DR. ", "strange ", "know ", "each ", "other ",
        "well ", "said ", "d. ", "A. ", "Joe ", "the ",
        "sgt. ", "in ", "charge ", "of ", "all ", "this ",
        "bs", ".",
    ]
    text = "".join(expression)
    self.assertEqual(tokenize(text), expression)
def test_em_dash(self):
    """Unicode em dashes split into standalone tokens without normalization."""
    expression = [
        u"The ", u"earthquake ", u"was ", u"also ", u"felt ", u"in ",
        u"nearby ", u"countries ", u"and ", u"as ", u"far ", u"away ",
        u"as ", u"both ", u"Beijing ", u"and ", u"Shanghai ", u"—",
        u"1,500 ", u"km ", u"(", u"930 ", u"mi ", u") ", u"and ",
        u"1,700 ", u"km ", u"(", u"1,060 ", u"mi ", u") ", u"away",
        u"—", u"where ", u"office ", u"buildings ", u"swayed ",
        u"with ", u"the ", u"tremor", u".",
    ]
    self.assertEqual(
        tokenize("".join(expression), normalize_ascii=False),
        expression)
def build_vocabulary(datadir, outdir, glove_path):
    """Construct the vocabulary object used throughout.

    Counts every normalized token in both questions and contexts across
    all splits, writes the vocab (special symbols first, then words by
    descending frequency) to ``outdir/vocab.txt``, and returns a Vocab.
    """
    # We're not going to backprop through the word vectors;
    # both train and dev words end up in the vocab.
    counter = Counter()
    for split in splits:
        datapath = os.path.join(datadir, split + ".json")
        for question, context, _, _ in data_stream(datapath):
            # Questions and contexts contribute identically to the counts.
            for text in (question, context):
                for word in ciseau.tokenize(text, normalize_ascii=False):
                    counter[normalize(word)] += 1

    common_words = [UNK, SOS, EOS, PAD]
    common_words.extend(word for word, _ in counter.most_common())

    vocab_path = os.path.join(outdir, "vocab.txt")
    with io.open(vocab_path, "w", encoding="utf8") as handle:
        handle.write("\n".join(common_words))

    return Vocab(outdir)
def test_quoted_expressions(self):
    """Parentheses and contractions ('ll) are separated from their hosts."""
    expression = [
        "(", "in ", "2008", ") ", "the ", "Martians ", "arrived ",
        "and ", "you", "'ll ", "see ", "what ", "I ", "mean", ".",
    ]
    self.assertEqual(tokenize("".join(expression)), expression)
def test_pre_post_quote(self):
    """Opening and closing '' markers become standalone tokens."""
    expression = [
        "On ", "January ", "28", ", ", "2011 ", ", ",
        "''", "Hollywood ", "Reporter", "'' ",
        "announced ", "that ", "Paramount ", "Pictures ",
        "had ", "given ", "the ", "green ", "light", ".",
    ]
    text = "".join(expression)
    self.assertEqual(tokenize(text), expression)
def test_numerical_commas_periods_expressions(self):
    """Numbers like 100,000 and 98.2 keep their internal punctuation."""
    expression = [
        "In ", "the ", "year ", "2000", ", ", "there ",
        "was ", "evidence ", "that ", "100,000 ",
        "martians ", "came ", "to ", "see ", "us", ", ",
        "but ", "I ", "did", "n't ", "even ", "hear ",
        "98.2", ",", "98.3 ", "or ", "98.4 ", "speak ",
        "about ", "it", ",",
    ]
    text = "".join(expression)
    self.assertEqual(tokenize(text), expression)
def test_em_dash(self):
    """An em dash with no surrounding spaces still separates its neighbors."""
    expression = [
        u"The ", u"earthquake ", u"was ", u"also ", u"felt ",
        u"in ", u"nearby ", u"countries ", u"and ", u"as ",
        u"far ", u"away ", u"as ", u"both ", u"Beijing ",
        u"and ", u"Shanghai ", u"—", u"1,500 ", u"km ",
        u"(", u"930 ", u"mi ", u") ", u"and ", u"1,700 ",
        u"km ", u"(", u"1,060 ", u"mi ", u") ", u"away",
        u"—", u"where ", u"office ", u"buildings ",
        u"swayed ", u"with ", u"the ", u"tremor", u".",
    ]
    text = "".join(expression)
    self.assertEqual(
        tokenize(text, normalize_ascii=False),
        expression,
    )
def retokenize_example(x, y):
    """Re-tokenize pre-split tokens *x* with ciseau and realign labels *y*.

    x: sequence of token strings; y: labels aligned one-to-one with x.
    Returns (new_tokens, new_labels): the ciseau tokens (right-stripped)
    and, for each, the label of the original token it overlaps, matched
    by cumulative character offset.
    """
    tokens = ciseau.tokenize(" ".join(w for w in x), normalize_ascii=False)
    out_y = []
    # Walk both token streams by cumulative character length. The +1 below
    # accounts for the space the join inserts between original tokens; the
    # final token has no trailing space, hence the -1 adjustments.
    regular_cursor = 0
    tokens_length_total = 0
    regular_length_total = len(x[regular_cursor]) + 1 if len(x) > 0 else 0
    if regular_cursor + 1 == len(x):
        regular_length_total -= 1
    for i in range(len(tokens)):
        tokens_length_total = tokens_length_total + len(tokens[i])
        # Advance the original-token cursor until its cumulative span
        # covers the end of the current ciseau token.
        while regular_length_total < tokens_length_total:
            regular_cursor += 1
            regular_length_total = regular_length_total + len(x[regular_cursor]) + 1
            if regular_cursor + 1 == len(x):
                regular_length_total -= 1
        out_y.append(y[regular_cursor])
    # Both streams must finish on the last original token, or the
    # alignment drifted somewhere above.
    assert(regular_cursor + 1 == len(x)), "error with %r" % (x,)
    return ([tok.rstrip() for tok in tokens], out_y)
def ciseauTokenizer(sentence):
    """Tokenize *sentence* using ciseau's default settings."""
    tokens = ciseau.tokenize(sentence)
    return tokens
def test_weird_hybrid_expressions(self):
    """Mixed unicode, possessives, and symbol-digit runs stay well-formed."""
    expression = [
        u"Beyoncé", u"'s ",
        u"1840", u"'s ",
        u"song ",
        u"<", u"3lovely", u".",
    ]
    text = "".join(expression)
    self.assertEqual(tokenize(text), expression)
def test_quoted_expressions(self):
    """Parenthesized spans and the 'll contraction split correctly."""
    expression = [
        "(", "in ", "2008", ") ",
        "the ", "Martians ", "arrived ", "and ",
        "you", "'ll ", "see ", "what ", "I ", "mean", ".",
    ]
    text = "".join(expression)
    self.assertEqual(tokenize(text), expression)
def split(sentence):
    """Return the ciseau tokenization of *sentence*."""
    result = ciseau.tokenize(sentence)
    return result
def strsplit(t):
    """Tokenize *t*, strip spaces inside tokens, and drop bare punctuation."""
    punctuation = ['.', ',', '-', ';', '(', ')']
    despaced = (token.replace(" ", "") for token in ciseau.tokenize(t))
    return [token for token in despaced if token not in punctuation]
import ciseau

# Stream the XML file line by line, echoing each raw line and then
# its ciseau tokenization.
with open('test_documents.xml', 'r') as f:
    # read the XML file
    for line in f:
        print(line)
        print(ciseau.tokenize(line))