Example #1
def split_sentences_spacy(text, language_model='en'):
    r""" You must download a spacy language model with python -m download 'en'

    The default English language model for spaCy tends to be a lot more aggressive than NLTK's punkt:

    >>> split_sentences_nltk("Hi Ms. Lovelace.\nI'm a wanna-\nbe human @ I.B.M. ;) --Watson 2.0")
    ['Hi Ms. Lovelace.', "I'm a wanna-\nbe human @ I.B.M.", ';) --Watson 2.0']
    >>> split_sentences_spacy("Hi Ms. Lovelace.\nI'm a wanna-\nbe human @ I.B.M. ;) --Watson 2.0")
    ['Hi Ms. Lovelace.', "I'm a wanna-", 'be human @', 'I.B.M. ;) --Watson 2.0']

    >>> split_sentences_spacy("Hi Ms. Lovelace. I'm at I.B.M. --Watson 2.0")
    ['Hi Ms. Lovelace.', "I'm at I.B.M. --Watson 2.0"]
    >>> split_sentences_nltk("Hi Ms. Lovelace. I'm at I.B.M. --Watson 2.0")
    ['Hi Ms. Lovelace.', "I'm at I.B.M.", '--Watson 2.0']
    """
    text = '\n'.join(iter_lines(text))
    try:
        nlp = spacy.load(language_model)
    except (OSError, IOError):
        try:
            spacy.cli.download(language_model)
            nlp = spacy.load(language_model)
        except (URLError, OSError, IOError):
            logger.warning("Unable to download spaCy language model '{}'. "
                           "Using offline NLTK punkt sentence splitter instead.".format(language_model))
            return split_sentences_nltk(text)
    parsed_text = nlp(text)
    sentences = []
    for span in parsed_text.sents:
        sent = ''.join(parsed_text[i].string for i in range(span.start, span.end)).strip()
        if len(sent):
            sentences.append(sent)
    return sentences
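
These snippets share a handful of module-level imports and helpers that the excerpt does not show. A plausible header, inferred only from the names used in the examples (the exact module layout is an assumption):

import logging
import os
from urllib.error import URLError  # assumes Python 3; Python 2 would use urllib2

import nltk
import regex
import spacy
import spacy.cli  # ensures spacy.cli.download() is available

logger = logging.getLogger(__name__)
# DATA_PATH, iter_lines(), find_files() and tag_code() are project helpers
# that the examples reference but this excerpt does not define.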
Example #2
def segment_sentences(path=os.path.join(DATA_PATH, 'book'),
                      splitter=split_sentences_nltk,
                      **find_files_kwargs):
    """ Return a list of all sentences and empty lines.

    TODO:
        1. process each line with an aggressive sentence segmenter, like DetectorMorse
        2. process our manuscript to create a complete-sentence and heading training set; the normalized/simplified
           SyntaxNet tree is the input feature set, with common words and N-grams inserted with their label as an additional feature
        3. process a training set with a grammar checker and SyntaxNet to bootstrap a "complete sentence" labeler.
        4. process each 1-3 line window (breaking on empty lines) with syntax net to label them
        5. label each 1-3-line window of lines as "complete sentence, partial sentence/phrase, or multi-sentence"

    >>> len(segment_sentences(path=os.path.join(DATA_PATH, 'book')))
    ...
    4
    >>> len(segment_sentences(path=os.path.join(DATA_PATH, 'psychology-scripts.txt'), splitter=split_sentences_nltk))
    ...
    23
    """
    sentences = []
    if os.path.isdir(path):
        for filemeta in find_files(path, **find_files_kwargs):
            with open(filemeta['path']) as fin:
                i, batch = 0, []
                try:
                    for i, line in enumerate(fin):
                        if not line.strip():
                            sentences.extend(splitter('\n'.join(batch)))
                            batch = [line]  # may contain all whitespace
                        else:
                            batch.append(line)
                except (UnicodeDecodeError, IOError):
                    logger.error(
                        'UnicodeDecodeError or IOError on line {} in file {} from stat: {}'
                        .format(i + 1, fin.name, filemeta))
                    raise

                if len(batch):
                    # TODO: tag sentences with line + filename where they started
                    sentences.extend(splitter('\n'.join(batch)))
    else:
        batch = []
        for i, line in enumerate(iter_lines(path)):
            # TODO: filter out code and meta lines using asciidoc or markdown parser
            # split into batches based on empty lines
            if not line.strip():
                sentences.extend(splitter('\n'.join(batch)))
                # first line may contain all whitespace
                batch = [line]
            else:
                batch.append(line)
        if len(batch):
            # TODO: tag sentences with line + filename where they started
            sentences.extend(splitter('\n'.join(batch)))

    return sentences
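
find_files() is one of those undefined helpers. The loop above only requires that it yield dict records containing at least a 'path' key (the error log also dumps the whole record, which suggests it carries stat metadata as well). A minimal stand-in under that assumption:

def find_files(path, ext='', **kwargs):
    """ Hypothetical stand-in: yield {'path': ...} records for files under `path` whose names end with `ext`. """
    for root, dirs, filenames in os.walk(path):
        for filename in filenames:
            if filename.lower().endswith(ext.lower()):
                yield {'path': os.path.join(root, filename)}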
Example #3
def split_sentences_regex(text):
    """ Use dead-simple regex to split text into sentences. Very poor accuracy.

    >>> split_sentences_regex("Hello World. I'm I.B.M.'s Watson. --Watson")
    ['Hello World.', "I'm I.B.M.'s Watson.", '--Watson']
    """
    text = '\n'.join(iter_lines(text))
    parts = regex.split(r'([a-zA-Z0-9][.?!])[\s$]', text)
    sentences = [''.join(s) for s in zip(parts[0::2], parts[1::2])]
    return sentences + [parts[-1]] if len(parts) % 2 else sentences
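
The pairing trick is the only subtle part: because the pattern contains one capturing group, regex.split() returns the text between matches interleaved with the captured sentence-final characters. (Note that inside the character class the `$` is a literal dollar sign, not an end-of-string anchor, which is part of why accuracy is poor.) A worked trace with the stdlib re module, which behaves the same way here, applied directly to the docstring example:

import re
parts = re.split(r'([a-zA-Z0-9][.?!])[\s$]', "Hello World. I'm I.B.M.'s Watson. --Watson")
# parts == ['Hello Worl', 'd.', "I'm I.B.M.'s Watso", 'n.', '--Watson']
# zip(parts[0::2], parts[1::2]) re-attaches each captured 'd.' / 'n.' to the text
# preceding it, and because len(parts) is odd the trailing '--Watson' is appended
# as a final sentence on its own.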
Example #4
def split_sentences_nltk(text, language_model='tokenizers/punkt/english.pickle'):
    """ Split text into sentences with NLTK's punkt model, downloading it on first use.

    Falls back to split_sentences_regex() if the punkt model cannot be downloaded.

    >>> split_sentences_nltk("Hi Ms. Lovelace. I'm at I.B.M. --Watson 2.0")
    ['Hi Ms. Lovelace.', "I'm at I.B.M.", '--Watson 2.0']
    """
    text = '\n'.join(iter_lines(text))
    try:
        sentence_detector = nltk.data.load(language_model)
    except LookupError:
        try:
            nltk.download('punkt', raise_on_error=True)
            sentence_detector = nltk.data.load(language_model)
        except ValueError:
            return split_sentences_regex(text)

    return list(sentence_detector.tokenize(text.strip()))
Example #5
def segment_sentences(path=os.path.join(DATA_PATH, 'book'), ext='asc', splitter=split_sentences_nltk):
    """ Return a list of all sentences and empty lines.

    TODO:
        1. process each line with an aggressive sentence segmenter, like DetectorMorse
        2. process our manuscript to create a complete-sentence and heading training set; the normalized/simplified
           SyntaxNet tree is the input feature set, with common words and N-grams inserted with their label as an additional feature
        3. process a training set with a grammar checker and SyntaxNet to bootstrap a "complete sentence" labeler.
        4. process each 1-3 line window (breaking on empty lines) with syntax net to label them
        5. label each 1-3-line window of lines as "complete sentence, partial sentence/phrase, or multi-sentence"

    >>> len(segment_sentences(path=os.path.join(DATA_PATH, 'book')))
    8324
    >>> len(segment_sentences(path=os.path.join(DATA_PATH, 'book',
    ...     'Chapter 00 -- Preface.asc'), splitter=split_sentences_nltk))
    139
    >>> len(segment_sentences(path=os.path.join(DATA_PATH, 'book',
    ...     'Chapter 01 -- Packets of Thought (Basic NLP).asc'), splitter=split_sentences_nltk))
    585
    """
    sentences = []
    if os.path.isdir(path):
        for filemeta in find_files(path, ext=ext):
            with open(filemeta['path'], 'rt') as fin:
                batch = []
                for i, line in enumerate(fin):
                    if not line.strip():
                        sentences.extend(splitter('\n'.join(batch)))
                        batch = [line]  # may contain all whitespace
                    else:
                        batch.append(line)
                if len(batch):
                    # TODO: tag sentences with line + filename where they started
                    sentences.extend(splitter('\n'.join(batch)))
    else:
        batch = []
        for i, line in enumerate(iter_lines(path)):
            # TODO: filter out code and meta lines using asciidoc or markdown parser
            # split into batches based on empty lines
            if not line.strip():
                sentences.extend(splitter('\n'.join(batch)))
                # first line may contain all whitespace
                batch = [line]
            else:
                batch.append(line)
        if len(batch):
            # TODO: tag sentences with line + filename where they started
            sentences.extend(splitter('\n'.join(batch)))

    return sentences
Example #6
def tag_code_lines(text, markup=None):
    """ Tag each line of `text` with the markup tag returned by tag_code(), returning a list of (line, tag) pairs. """
    if (markup is None or markup.lower() in ('asc', '.asc', 'adoc', '.adoc', '.asciidoc') or (
            os.path.isfile(text) and text.lower().split('.')[-1] in ('asc', 'adoc', 'asciidoc'))):
        markup = 'asciidoc'
    lines = []
    within_codeblock = False
    for i, line in enumerate(iter_lines(text)):
        # TODO: filter out code and meta lines using asciidoc or markdown parser
        # split into batches based on empty lines
        tag = tag_code(line, markup=markup)
        if within_codeblock or tag.startswith('code.'):
            if tag.endswith('.end'):
                within_codeblock = False
            elif tag == 'code.start':
                within_codeblock = True
        lines.append((line, tag))
    return lines
Example #7
def split_text_blocks(text):
    r""" Splits asciidoc and markdown files into a list of lists (blocks text in a list of lines of text)

    >>> split_text_blocks("# Title \nHello world! \nI'm here today to\nstrip\nyour text.\n \t  \r\nNext block\n\nwas short.")
    [['# Title \n',
      'Hello world! \n',
      "I'm here today to\n",
      'strip\n',
      'your text.\n',
      ' \t  \r\n'],
     ['Next block\n', '\n'],
     ['was short.']]
    """
    blocks = []
    block = []
    for line in iter_lines(text):
        block.append(line)
        if not line.strip():
            blocks.append(block)
            block = []
    if block:
        blocks.append(block)
    return blocks
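
iter_lines() is the last undefined helper. The doctest above shows lines with their endings preserved (including the '\r\n'), which suggests behaviour roughly like str.splitlines(keepends=True) for raw text, or plain file iteration when given a path. A hedged sketch under that assumption:

def iter_lines(text_or_path):
    """ Hypothetical stand-in: yield lines, endings preserved, from raw text or a file path. """
    if os.path.isfile(text_or_path):
        with open(text_or_path) as fin:
            for line in fin:
                yield line
    else:
        for line in text_or_path.splitlines(keepends=True):
            yield line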