Example #1
File: cleanup.py  Project: ralogon/NegBio
def main(argv):
    argv = docopt.docopt(__doc__, argv=argv)
    print(argv)
    scan.scan_document(source=argv['SOURCE'],
                       directory=argv['--out'],
                       suffix='.negbio.xml',
                       fn=clean_sentences,
                       non_sequences=[])
Example #2
def main(argv):
    argv = docopt.docopt(__doc__, argv=argv)
    print(argv)

    ptb2dep = Ptb2DepConverter(universal=True)
    lemmatizer = Lemmatizer()
    scan.scan_document(source=argv['SOURCE'], directory=argv['--out'], suffix='.ud.xml',
                       fn=convert, non_sequences=[ptb2dep, lemmatizer])
Example #3
def main(argv):
    argv = docopt.docopt(__doc__, argv=argv)
    print(argv)
    parser = Bllip(model_dir=os.path.expanduser(argv['--model']))
    scan.scan_document(source=argv['SOURCE'],
                       directory=argv['--out'],
                       suffix='.bllip.xml',
                       fn=parse,
                       non_sequences=[parser])
Example #4
def main(argv):
    argv = docopt.docopt(__doc__, argv=argv)
    print(argv)
    neg_detector = Detector(argv['--neg-patterns'],
                            argv['--uncertainty-patterns'])
    scan.scan_document(source=argv['SOURCE'],
                       directory=argv['--out'],
                       suffix='.neg.xml',
                       fn=detect,
                       non_sequences=[neg_detector])
Example #5
File: ssplit.py  Project: ralogon/NegBio
def main(argv):
    argv = docopt.docopt(__doc__, argv=argv)
    print(argv)
    splitter = NltkSSplitter(newline=argv['--newline_is_sentence_break'])

    scan.scan_document(source=argv['SOURCE'],
                       directory=argv['--out'],
                       suffix='.ss.xml',
                       fn=ssplit,
                       non_sequences=[splitter])
Example #6
def main(argv):
    argv = docopt.docopt(__doc__, argv=argv)
    print(argv)
    splitter = ssplit.NltkSSplitter(newline=True)
    parser = parse.Bllip(model_dir=argv['--model'])
    ptb2dep = ptb2ud.Ptb2DepConverter(universal=True)
    lemmatizer = ptb2ud.Lemmatizer()
    neg_detector = negdetect.Detector(argv['--neg-patterns'],
                                      argv['--uncertainty-patterns'])

    scan.scan_document(
        source=argv['SOURCE'],
        directory=argv['--out'],
        suffix='.neg.xml',
        fn=pipeline,
        non_sequences=[splitter, parser, ptb2dep, lemmatizer, neg_detector])
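Example #6 chains every stage through a single scan_document call with fn=pipeline, but the pipeline callback itself is not shown on this page. The sketch below illustrates what such a callback could look like, assuming scan_document invokes fn(document, *non_sequences) on each BioC document and that each stage helper returns the updated document; the helper names mirror the fn/non_sequences pairs in Examples #2-#5 and are illustrative, not a verbatim copy of NegBio's code.

def pipeline(document, splitter, parser, ptb2dep, lemmatizer, neg_detector):
    # Hypothetical sketch: run the stages from Examples #2-#5 in order on one
    # BioC document, assuming each helper returns the processed document.
    document = ssplit.ssplit(document, splitter)              # sentence splitting (Example #5)
    document = parse.parse(document, parser)                  # Bllip constituency parsing (Example #3)
    document = ptb2ud.convert(document, ptb2dep, lemmatizer)  # PTB trees -> UD graphs + lemmas (Example #2)
    document = negdetect.detect(document, neg_detector)       # negation/uncertainty detection (Example #4)
    return document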
Example #7
"""
Clean up sentences

Usage:
    negbio_pipeline cleanup [options] --output=<directory> <file> ...

Options:
    --suffix=<suffix>               Append an additional SUFFIX to file names. [default: .negbio.xml]
    --verbose                       Print more information about progress.
    --output=<directory>            Specify the output directory.
"""

from negbio.cli_utils import parse_args
from negbio.pipeline.cleanup import clean_sentences
from negbio.pipeline.scan import scan_document

if __name__ == '__main__':
    argv = parse_args(__doc__)
    scan_document(source=argv['<file>'],
                  directory=argv['--output'],
                  suffix=argv['--suffix'],
                  fn=clean_sentences)
Example #8
"""
Usage:
    negbio_pipeline normalize [options] --output=<directory> <file> ...

Options:
    --output=<directory>    Specify the output directory.
    --suffix=<suffix>       Append an additional SUFFIX to file names. [default: .normalized.xml]
    --verbose               Print more information about progress.
"""

from negbio.cli_utils import parse_args
from negbio.ext.normalize_mimiccxr import normalize
from negbio.pipeline.scan import scan_document

if __name__ == '__main__':
    argv = parse_args(__doc__)
    scan_document(source=argv['<file>'],
                  verbose=argv['--verbose'],
                  suffix=argv['--suffix'],
                  directory=argv['--output'],
                  fn=normalize)
Example #9
"""
Split text into sentences

Usage:
    negbio_pipeline ssplit [options] --output=<directory> <file> ...

Options:
    --newline_is_sentence_break     Whether to treat newlines as sentence breaks. True means that a newline is always a
                                    sentence break. False means to ignore newlines for the purpose of sentence
                                    splitting. This is appropriate for continuous text, when just the non-whitespace
                                    characters should be used to determine sentence breaks. [default=False]
    --suffix=<suffix>               Append an additional SUFFIX to file names. [default: .ssplit.xml]
    --output=<directory>            Specify the output directory.
    --verbose                       Print more information about progress.
"""
from negbio.pipeline.scan import scan_document
from negbio.pipeline.ssplit import NegBioSSplitter
from negbio.cli_utils import parse_args

if __name__ == '__main__':
    argv = parse_args(__doc__)
    splitter = NegBioSSplitter(newline=argv['--newline_is_sentence_break'])
    scan_document(source=argv['<file>'],
                  directory=argv['--output'],
                  suffix=argv['--suffix'],
                  fn=splitter.split_doc,
                  non_sequences=[])
Example #10
"""
Usage:
    negbio_pipeline neg [options] --output=<directory> <file> ...

Options:
    --neg-patterns=<file>           Specify negation rules [default: negbio/patterns/neg_patterns.txt]
    --uncertainty-patterns=<file>   Specify uncertainty rules [default: negbio/patterns/uncertainty_patterns.txt]
    --suffix=<suffix>               Append an additional SUFFIX to file names. [default: .neg.xml]
    --verbose                       Print more information about progress.
    --output=<directory>            Specify the output directory.
"""
import os

from negbio.cli_utils import parse_args, get_absolute_path
from negbio.neg.neg_detector import Detector
from negbio.pipeline.negdetect import detect
from negbio.pipeline.scan import scan_document

if __name__ == '__main__':
    argv = parse_args(__doc__)

    argv = get_absolute_path(argv,
                             '--neg-patterns',
                             'negbio/patterns/neg_patterns.txt')
    argv = get_absolute_path(argv,
                             '--uncertainty-patterns',
                             'negbio/patterns/uncertainty_patterns.txt')

    neg_detector = Detector(os.path.realpath(argv['--neg-patterns']),
                            os.path.realpath(argv['--uncertainty-patterns']))
    scan_document(source=argv['<file>'], directory=argv['--output'], suffix=argv['--suffix'],
                  fn=detect, non_sequences=[neg_detector])
Example #11
"""
Usage:
    negbio_pipeline section_split [options] --output=<directory> <file> ...

Options:
    --suffix=<suffix>       Append an additional SUFFIX to file names. [default: .secsplit.xml]
    --output=<directory>    Specify the output directory.
    --verbose               Print more information about progress.
    --pattern=<file>        Specify section title list for matching.
"""
import re

from negbio.cli_utils import parse_args
from negbio.pipeline.scan import scan_document
from negbio.pipeline.section_split import split_document


def read_section_titles(pathname):
    with open(pathname) as fp:
        return re.compile('|'.join(fp.readlines()), re.MULTILINE)


if __name__ == '__main__':
    argv = parse_args(__doc__)

    if argv['--pattern'] is None:
        patterns = None
    else:
        patterns = read_section_titles(argv['--pattern'])

    scan_document(source=argv['<file>'], verbose=argv['--verbose'], suffix=argv['--suffix'],
                  directory=argv['--output'], fn=split_document, non_sequences=[patterns])
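All of the examples above share one calling convention: scan_document reads each input BioC XML file, applies fn to the document (forwarding the items in non_sequences as extra arguments), and writes the result to the output directory with the given suffix appended to the file name. The snippet below is a minimal, self-contained sketch of plugging a custom callback into that pattern; the callback mark_processed and its behaviour are hypothetical, and the argument-forwarding convention is inferred from the examples rather than quoted from NegBio's documentation.

from negbio.pipeline.scan import scan_document


def mark_processed(document, note):
    # Hypothetical callback: annotate every passage of the BioC document,
    # assuming fn receives the document followed by each non_sequences item.
    for passage in document.passages:
        passage.infons['note'] = note
    return document


if __name__ == '__main__':
    scan_document(source=['example.xml'],          # input BioC XML file(s)
                  directory='output',              # output directory
                  suffix='.marked.xml',            # appended to each output file name
                  fn=mark_processed,
                  non_sequences=['seen by custom step'])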