import argparse
import os
from itertools import zip_longest

import ujson
from tqdm import tqdm
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter


def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks (standard itertools recipe)."""
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)


def main():
    """
    Given a parallel corpus <corpus_name>.jsonl, tokenizes the English and
    French utterance of every example and writes the result to
    <corpus_name>_tokenized.jsonl in the save directory.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    project_root = os.path.abspath(os.path.realpath(os.path.join(
        os.path.dirname(  # Escape out into project directory.
            os.path.dirname(  # Escape out into scripts directory.
                os.path.realpath(__file__))))))
    parser.add_argument("--corpus-path", type=str,
                        help="Path to the parallel JSON lines corpus.")
    parser.add_argument("--save-dir", type=str, default=project_root,
                        help="Directory to store the tokenized corpus.")
    args = parser.parse_args()

    corpus_name = os.path.basename(args.corpus_path).split('.')[0]
    out_file_path = os.path.join(args.save_dir, corpus_name + "_tokenized.jsonl")
    out_file = open(out_file_path, 'w')

    # Language-specific tokenizers.
    en_tokenizer = SpacyWordSplitter(language='en_core_web_sm')
    fr_tokenizer = SpacyWordSplitter(language='fr_core_news_sm')

    print("Tokenizing utterances for {}...".format(corpus_name))
    with open(args.corpus_path) as f:
        for lines in tqdm(grouper(f, 100, '')):
            # When the grouper collects a group smaller than the batch,
            # padding is done via empty strings; filter them out explicitly
            # before continuing.
            examples = [ujson.loads(line.strip())
                        for line in filter(lambda l: l, lines)]
            en_utterances = [ex['en'] for ex in examples]
            fr_utterances = [ex['fr'] for ex in examples]
            en_utterances_tokenized = en_tokenizer.batch_split_words(en_utterances)
            fr_utterances_tokenized = fr_tokenizer.batch_split_words(fr_utterances)
            for i, ex in enumerate(examples):
                ex_tokenized = {
                    'id': ex['id'],
                    'en': ' '.join([token.text for token in en_utterances_tokenized[i]]),
                    'fr': ' '.join([token.text for token in fr_utterances_tokenized[i]])
                }
                ujson.dump(ex_tokenized, out_file, ensure_ascii=False)
                out_file.write('\n')
    out_file.close()
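# A minimal, self-contained sketch of the per-example transformation the
# script above performs, with the corpus I/O and batching omitted. The
# example utterances are made up; only the 'id'/'en'/'fr' keys and the
# whitespace-joined output format come from the script.
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter

en_tokenizer = SpacyWordSplitter(language='en_core_web_sm')
fr_tokenizer = SpacyWordSplitter(language='fr_core_news_sm')

example = {'id': 0, 'en': "I can't go.", 'fr': "Je ne peux pas y aller."}
example_tokenized = {
    'id': example['id'],
    'en': ' '.join(t.text for t in en_tokenizer.split_words(example['en'])),
    'fr': ' '.join(t.text for t in fr_tokenizer.split_words(example['fr'])),
}
print(example_tokenized)
# e.g. {'id': 0, 'en': "I ca n't go .", 'fr': 'Je ne peux pas y aller .'}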
import spacy

from allennlp.common.testing import AllenNlpTestCase
from allennlp.data.tokenizers.token import Token
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter


class TestSpacyWordSplitter(AllenNlpTestCase):
    def setUp(self):
        super(TestSpacyWordSplitter, self).setUp()
        self.word_splitter = SpacyWordSplitter()

    def test_tokenize_handles_complex_punctuation(self):
        sentence = "this (sentence) has 'crazy' \"punctuation\"."
        expected_tokens = ["this", "(", "sentence", ")", "has", "'", "crazy", "'", '"',
                           "punctuation", '"', "."]
        tokens = self.word_splitter.split_words(sentence)
        token_text = [t.text for t in tokens]
        assert token_text == expected_tokens
        for token in tokens:
            start = token.idx
            end = start + len(token.text)
            assert sentence[start:end] == token.text

    def test_tokenize_handles_contraction(self):
        # note that "would've" is kept together, while "ain't" is not.
        sentence = "it ain't joe's problem; would been yesterday"
        expected_tokens = ["it", "ai", "n't", "joe", "'s", "problem", ";", "would", "been",
                           "yesterday"]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_multiple_contraction(self):
        sentence = "wouldn't've"
        expected_tokens = ["would", "n't", "'ve"]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_final_apostrophe(self):
        sentence = "the jones' house"
        expected_tokens = ["the", "jones", "'", "house"]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_removes_whitespace_tokens(self):
        sentence = "the\n jones' house \x0b 55"
        expected_tokens = ["the", "jones", "'", "house", "55"]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_tokenize_handles_special_cases(self):
        # note that the etc. doesn't quite work --- we can special case this if we want.
        sentence = "Mr. and Mrs. Jones, etc., went to, e.g., the store"
        expected_tokens = ["Mr.", "and", "Mrs.", "Jones", ",", "etc", ".", ",", "went", "to",
                           ",", "e.g.", ",", "the", "store"]
        tokens = [t.text for t in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_batch_tokenization(self):
        sentences = ["This is a sentence",
                     "This isn't a sentence.",
                     "This is the 3rd sentence.",
                     "Here's the 'fourth' sentence."]
        batch_split = self.word_splitter.batch_split_words(sentences)
        separately_split = [self.word_splitter.split_words(sentence) for sentence in sentences]
        assert len(batch_split) == len(separately_split)
        for batch_sentence, separate_sentence in zip(batch_split, separately_split):
            assert len(batch_sentence) == len(separate_sentence)
            for batch_word, separate_word in zip(batch_sentence, separate_sentence):
                assert batch_word.text == separate_word.text

    def test_keep_spacy_tokens(self):
        word_splitter = SpacyWordSplitter()
        sentence = "This should be an allennlp Token"
        tokens = word_splitter.split_words(sentence)
        assert tokens
        assert all(isinstance(token, Token) for token in tokens)

        word_splitter = SpacyWordSplitter(keep_spacy_tokens=True)
        sentence = "This should be a spacy Token"
        tokens = word_splitter.split_words(sentence)
        assert tokens
        assert all(isinstance(token, spacy.tokens.Token) for token in tokens)
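# In practice the word splitter tested above is usually wrapped in a tokenizer.
# A minimal sketch, assuming the AllenNLP 0.x WordTokenizer API
# (WordTokenizer(word_splitter=...), .tokenize(), .batch_tokenize()); the
# example sentences are arbitrary.
from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter

tokenizer = WordTokenizer(word_splitter=SpacyWordSplitter())

tokens = tokenizer.tokenize("This isn't a sentence.")
print([t.text for t in tokens])  # ['This', 'is', "n't", 'a', 'sentence', '.']

batches = tokenizer.batch_tokenize(["First sentence.", "Second sentence."])
print([[t.text for t in batch] for batch in batches])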
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter

spw = SpacyWordSplitter(pos_tags=True, ner=True)

sentence = 'I am going to the store in France. George Washington is awesome.'
sentences = ['I am going to the store in France.',
             'George Washington is awesome',
             'I like ice cream.',
             'The vikings are awesome people from Normandy']

# Single-sentence splitting.
out = spw.split_words(sentence)
for o in out:
    print(o.pos_)
for o in out:
    print(o.ent_type_)

# Batch splitting.
sent = spw.batch_split_words(sentences)
for out in sent:
    for o in out:
        print(o.pos_)
for out in sent:
    for o in out:
        print(o.ent_type_)
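# By default SpacyWordSplitter converts spaCy tokens into allennlp Tokens;
# the keep_spacy_tokens flag (exercised in the tests above) returns the raw
# spaCy tokens instead, keeping the full spaCy token API available.
# A minimal sketch of the difference; the sentences are arbitrary.
import spacy.tokens
from allennlp.data.tokenizers import Token
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter

default_splitter = SpacyWordSplitter()
allennlp_tokens = default_splitter.split_words("This should be an allennlp Token")
print(all(isinstance(t, Token) for t in allennlp_tokens))  # True

spacy_splitter = SpacyWordSplitter(keep_spacy_tokens=True)
spacy_tokens = spacy_splitter.split_words("This should be a spacy Token")
print(all(isinstance(t, spacy.tokens.Token) for t in spacy_tokens))  # True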