def split_sentences_spacy(text, language_model='en'):
    r""" Split text into sentences using a spaCy language model.

    You must first download a spaCy language model, e.g. with `python -m spacy download en`.

    The default English language model for spaCy tends to be a lot more aggressive than NLTK's punkt:

    >>> split_sentences_nltk("Hi Ms. Lovelace.\nI'm a wanna-\nbe human @ I.B.M. ;) --Watson 2.0")
    ['Hi Ms. Lovelace.', "I'm a wanna-\nbe human @ I.B.M.", ';) --Watson 2.0']
    >>> split_sentences_spacy("Hi Ms. Lovelace.\nI'm a wanna-\nbe human @ I.B.M. ;) --Watson 2.0")
    ['Hi Ms. Lovelace.', "I'm a wanna-", 'be human @', 'I.B.M. ;) --Watson 2.0']
    >>> split_sentences_spacy("Hi Ms. Lovelace. I'm at I.B.M. --Watson 2.0")
    ['Hi Ms. Lovelace.', "I'm at I.B.M. --Watson 2.0"]
    >>> split_sentences_nltk("Hi Ms. Lovelace. I'm at I.B.M. --Watson 2.0")
    ['Hi Ms. Lovelace.', "I'm at I.B.M.", '--Watson 2.0']
    """
    text = '\n'.join(iter_lines(text))
    try:
        nlp = spacy.load(language_model)
    except (OSError, IOError):
        try:
            # Model isn't installed yet: download it, then load it.
            spacy.cli.download(language_model)
            nlp = spacy.load(language_model)
        except URLError:
            logger.warning("Unable to download spaCy language model '{}'. "
                           "Using offline NLTK punkt sentence splitter instead.".format(language_model))
            return split_sentences_nltk(text)
    parsed_text = nlp(text)
    sentences = []
    for span in parsed_text.sents:
        sent = span.text.strip()
        if len(sent):
            sentences.append(sent)
    return sentences
def segment_sentences(path=os.path.join(DATA_PATH, 'book'), splitter=split_sentences_nltk, **find_files_kwargs):
    """ Return a list of all sentences and empty lines.

    TODO:
        1. process each line with an aggressive sentence segmenter, like DetectorMorse
        2. process our manuscript to create a complete-sentence and heading training set
           normalized/simplified syntax net tree is the input feature set
           common words and N-grams inserted with their label as additional features
        3. process a training set with a grammar checker and syntax net to bootstrap a "complete sentence" labeler.
        4. process each 1-3 line window (breaking on empty lines) with syntax net to label them
        5. label each 1-3-line window of lines as "complete sentence", "partial sentence/phrase", or "multi-sentence"

    >>> len(segment_sentences(path=os.path.join(DATA_PATH, 'book')))
    ...
    4
    >>> len(segment_sentences(path=os.path.join(DATA_PATH, 'psychology-scripts.txt'), splitter=split_sentences_nltk))
    ...
    23
    """
    sentences = []
    if os.path.isdir(path):
        for filemeta in find_files(path, **find_files_kwargs):
            with open(filemeta['path']) as fin:
                i, batch = 0, []
                try:
                    for i, line in enumerate(fin):
                        if not line.strip():
                            # A blank line ends the current batch; hand it to the sentence splitter.
                            sentences.extend(splitter('\n'.join(batch)))
                            batch = [line]  # may contain all whitespace
                        else:
                            batch.append(line)
                except (UnicodeDecodeError, IOError):
                    logger.error('UnicodeDecodeError or IOError on line {} in file {} from stat: {}'.format(
                        i + 1, fin.name, filemeta))
                    raise
                if len(batch):
                    # TODO: tag sentences with the line number + filename where they started
                    sentences.extend(splitter('\n'.join(batch)))
    else:
        batch = []
        for i, line in enumerate(iter_lines(path)):
            # TODO: filter out code and meta lines using an asciidoc or markdown parser
            # split into batches based on empty lines
            if not line.strip():
                sentences.extend(splitter('\n'.join(batch)))
                # first line of the new batch may contain all whitespace
                batch = [line]
            else:
                batch.append(line)
        if len(batch):
            # TODO: tag sentences with the line number + filename where they started
            sentences.extend(splitter('\n'.join(batch)))
    return sentences
def split_sentences_regex(text):
    """ Use a dead-simple regex to split text into sentences. Very poor accuracy.

    >>> split_sentences_regex("Hello World. I'm I.B.M.'s Watson. --Watson")
    ['Hello World.', "I'm I.B.M.'s Watson.", '--Watson']
    """
    text = '\n'.join(iter_lines(text))
    parts = regex.split(r'([a-zA-Z0-9][.?!])[\s$]', text)
    # Splitting on a capturing group interleaves the text between matches with the captured
    # sentence terminators, so pair them back up and rejoin each pair into one sentence.
    sentences = [''.join(s) for s in zip(parts[0::2], parts[1::2])]
    # An odd number of parts means the text ended without a matched terminator; keep the tail.
    return sentences + [parts[-1]] if len(parts) % 2 else sentences
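# A worked example of the interleaving used above (hypothetical helper, illustration only):
# for the doctest input, regex.split() returns the text between matches alternating with the
# captured terminators, and pairing even- with odd-indexed parts stitches each sentence back together.
def _demo_split_sentences_regex_parts():
    parts = regex.split(r'([a-zA-Z0-9][.?!])[\s$]', "Hello World. I'm I.B.M.'s Watson. --Watson")
    # parts == ['Hello Worl', 'd.', "I'm I.B.M.'s Watso", 'n.', '--Watson']
    pairs = list(zip(parts[0::2], parts[1::2]))
    # pairs == [('Hello Worl', 'd.'), ("I'm I.B.M.'s Watso", 'n.')]
    # ''.join() on each pair gives 'Hello World.' and "I'm I.B.M.'s Watson."; len(parts) is odd,
    # so the unterminated tail '--Watson' is appended as its own "sentence".
    return pairs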
def split_sentences_nltk(text, language_model='tokenizers/punkt/english.pickle'):
    """ Split text into sentences with NLTK's punkt tokenizer, downloading the punkt data if necessary.

    Falls back to `split_sentences_regex` if the punkt data cannot be downloaded.
    """
    text = '\n'.join(iter_lines(text))
    try:
        sentence_detector = nltk.data.load(language_model)
    except LookupError:
        try:
            nltk.download('punkt', raise_on_error=True)
            sentence_detector = nltk.data.load(language_model)
        except ValueError:
            return split_sentences_regex(text)
    return list(sentence_detector.tokenize(text.strip()))
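# Punkt models for other languages ship in the same NLTK 'punkt' package, so swapping the pickle
# path is enough to segment non-English text (illustrative call, not part of the original module):
# split_sentences_nltk("Hallo Welt. Wie geht es dir?", language_model='tokenizers/punkt/german.pickle')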
def segment_sentences(path=os.path.join(DATA_PATH, 'book'), ext='asc', splitter=split_sentences_nltk):
    """ Return a list of all sentences and empty lines.

    TODO:
        1. process each line with an aggressive sentence segmenter, like DetectorMorse
        2. process our manuscript to create a complete-sentence and heading training set
           normalized/simplified syntax net tree is the input feature set
           common words and N-grams inserted with their label as additional features
        3. process a training set with a grammar checker and syntax net to bootstrap a "complete sentence" labeler.
        4. process each 1-3 line window (breaking on empty lines) with syntax net to label them
        5. label each 1-3-line window of lines as "complete sentence", "partial sentence/phrase", or "multi-sentence"

    >>> len(segment_sentences(path=os.path.join(DATA_PATH, 'book')))
    8324
    >>> len(segment_sentences(path=os.path.join(DATA_PATH, 'book',
    ...     'Chapter 00 -- Preface.asc'), splitter=split_sentences_nltk))
    139
    >>> len(segment_sentences(path=os.path.join(DATA_PATH, 'book',
    ...     'Chapter 01 -- Packets of Thought (Basic NLP).asc'), splitter=split_sentences_nltk))
    585
    """
    sentences = []
    if os.path.isdir(path):
        for filemeta in find_files(path, ext=ext):
            with open(filemeta['path'], 'rt') as fin:
                batch = []
                for i, line in enumerate(fin):
                    if not line.strip():
                        # A blank line ends the current batch; hand it to the sentence splitter.
                        sentences.extend(splitter('\n'.join(batch)))
                        batch = [line]  # may contain all whitespace
                    else:
                        batch.append(line)
                if len(batch):
                    # TODO: tag sentences with the line number + filename where they started
                    sentences.extend(splitter('\n'.join(batch)))
    else:
        batch = []
        for i, line in enumerate(iter_lines(path)):
            # TODO: filter out code and meta lines using an asciidoc or markdown parser
            # split into batches based on empty lines
            if not line.strip():
                sentences.extend(splitter('\n'.join(batch)))
                # first line of the new batch may contain all whitespace
                batch = [line]
            else:
                batch.append(line)
        if len(batch):
            # TODO: tag sentences with the line number + filename where they started
            sentences.extend(splitter('\n'.join(batch)))
    return sentences
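# segment_sentences joins the lines between blank lines and hands each such batch to `splitter`,
# so a paragraph is segmented as a unit rather than line by line. A hypothetical in-memory check
# of that behavior (it assumes, as the splitter doctests above do, that iter_lines also accepts
# raw text rather than only a file path):
def _demo_segment_sentences_batching():
    text = "One sentence. Two sentences.\n\nA new paragraph here."
    # Expect roughly ['One sentence.', 'Two sentences.', 'A new paragraph here.']
    return segment_sentences(path=text, splitter=split_sentences_nltk)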
def tag_code_lines(text, markup=None):
    """ Label each line of a manuscript with the markup tag returned by `tag_code`.

    Returns a list of (line, tag) 2-tuples.
    """
    if (markup is None or markup.lower() in ('asc', '.asc', 'adoc', '.adoc', '.asciidoc') or (
            os.path.isfile(text) and text.lower().split('.')[-1] in ('asc', 'adoc', 'asciidoc'))):
        markup = 'asciidoc'
    lines = []
    within_codeblock = False
    for i, line in enumerate(iter_lines(text)):
        # TODO: filter out code and meta lines using an asciidoc or markdown parser
        tag = tag_code(line, markup=markup)
        if within_codeblock or tag.startswith('code.'):
            # Track whether we are inside a delimited code block based on the start/end tags.
            if tag.endswith('.end'):
                within_codeblock = False
            elif tag == 'code.start':
                within_codeblock = True
        lines.append((line, tag))
    return lines
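# A typical consumer of tag_code_lines drops code before sentence segmentation. Hypothetical
# sketch only: it assumes `tag_code` (defined elsewhere in this repo) gives every line that
# belongs to a code block a tag starting with 'code'.
def _strip_code_lines(text, markup=None):
    return [line for line, tag in tag_code_lines(text, markup=markup) if not tag.startswith('code')]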
def split_text_blocks(text):
    r""" Split an asciidoc or markdown file into a list of lists (each block is a list of lines of text)

    A block is a run of lines ending with a blank (whitespace-only) line.

    >>> split_text_blocks("# Title \nHello world! \nI'm here today to\nstrip\nyour text.\n \t \r\nNext block\n\nwas short.")
    [['# Title \n', 'Hello world! \n', "I'm here today to\n", 'strip\n', 'your text.\n', ' \t \r\n'], ['Next block\n', '\n'], ['was short.']]
    """
    blocks = []
    block = []
    for line in iter_lines(text):
        block.append(line)
        if not line.strip():
            # A whitespace-only line ends the current block (and is kept as its last line).
            blocks.append(block)
            block = []
    if block:
        blocks.append(block)
    return blocks
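# split_text_blocks pairs naturally with the sentence splitters above: each block (a paragraph
# plus its trailing blank line) can be rejoined and segmented on its own (illustrative only):
# [split_sentences_nltk(''.join(block)) for block in split_text_blocks(text)]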