Example 1
def test_scan_collection_skip():
    filenames = create_collections()
    output_dir = tempfile.mkdtemp()
    # remove one file
    os.remove(filenames[0])

    p = NegBioPipeline([('fake', FakePipe())])
    p.scan(source=filenames, directory=output_dir, suffix='.xml')

    assert not os.path.exists(os.path.join(output_dir, filenames[0]))
Example 2
def test_scan_document_error():
    class FakePipeError(Pipe):
        def __call__(self, doc: bioc.BioCDocument, *args, **kwargs):
            raise KeyError

    filenames = create_collections()
    output_dir = tempfile.mkdtemp()

    p = NegBioPipeline([('fake_error', FakePipeError())])
    p.scan(source=filenames, directory=output_dir, suffix='.xml')

    for filename in filenames:
        assert filecmp.cmp(filename, os.path.join(output_dir, os.path.basename(filename)))
Example 3
def test_scan_collection():
    filenames = create_collections()
    output_dir = tempfile.mkdtemp()
    os.rmdir(output_dir)

    p = NegBioPipeline([('fake', FakePipe())])
    p.scan(source=filenames, directory=output_dir, suffix='.xml')
    for filename in filenames:
        filename = os.path.join(output_dir, os.path.basename(filename))
        with open(filename) as fp:
            c = bioc.load(fp)
            for doc in c.documents:
                assert doc.infons['fake']
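
The three tests above come from the same module and omit their setup: they rely on the standard-library imports (os, tempfile, filecmp), on bioc, and on two helpers defined elsewhere in the test suite, create_collections (which writes a few BioC XML files to a temporary directory and returns their paths) and FakePipe. Below is a minimal sketch of what FakePipe might look like, inferred from the assertion doc.infons['fake'] in Example 3; the class body and the assumption that Pipe lives next to NegBioPipeline in negbio.pipeline2.pipeline are guesses, not the project's actual test code.

import bioc
from negbio.pipeline2.pipeline import Pipe


class FakePipe(Pipe):
    """Hypothetical stand-in for the real test helper: tags every document it sees."""

    def __call__(self, doc: bioc.BioCDocument, *args, **kwargs):
        # Example 3 asserts that doc.infons['fake'] is truthy after scanning.
        doc.infons['fake'] = 'True'
        return doc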
Example 4
from negbio.cli_utils import parse_args
from negbio.pipeline2.dner_mm import MetaMapExtractor
from negbio.pipeline2.pipeline import NegBioPipeline
from pymetamap import MetaMap


def read_cuis(pathname):
    cuis = set()
    with open(pathname) as fp:
        for line in fp:
            line = line.strip()
            if line:
                cuis.add(line)
    return cuis


if __name__ == '__main__':
    argv = parse_args(__doc__)
    mm = MetaMap.get_instance(argv['--metamap'])

    if argv['--cuis'] is None:
        cuis = None
    else:
        cuis = read_cuis(argv['--cuis'])

    extractor = MetaMapExtractor(mm, cuis)
    pipeline = NegBioPipeline(pipeline=[('MetaMapExtractor', extractor)])
    pipeline.scan(source=argv['<file>'],
                  directory=argv['--output'],
                  suffix=argv['--suffix'],
                  overwrite=argv['--overwrite'])
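
read_cuis expects a plain-text file with one CUI per line; blank lines are skipped. A quick sketch of how it could be exercised (the temporary file and the CUI strings are illustrative only):

import tempfile

# Write a throwaway CUI list: one identifier per line, with a blank line to show it is ignored.
with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as fp:
    fp.write('C0032285\n\nC0013404\n')

print(read_cuis(fp.name))  # e.g. {'C0032285', 'C0013404'}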
Example 5
import logging
import re

from negbio.cli_utils import parse_args, calls_asynchronously
from negbio.pipeline2.pipeline import NegBioPipeline
from negbio.pipeline2.section_split import SectionSplitter


def read_section_titles(pathname):
    with open(pathname) as fp:
        titles = [line.strip() for line in fp]
        p = '|'.join(titles)
        logging.debug('Section patterns: %s', p)
        return re.compile(p, re.IGNORECASE | re.MULTILINE)


if __name__ == '__main__':
    argv = parse_args(__doc__)
    workers = int(argv['--workers'])
    if workers == 1:
        if argv['--pattern'] is None:
            pattern = None
        else:
            pattern = read_section_titles(argv['--pattern'])

        splitter = SectionSplitter(pattern)
        pipeline = NegBioPipeline(pipeline=[('SectionSplitter', splitter)])
        pipeline.scan(source=argv['<file>'],
                      suffix=argv['--suffix'],
                      directory=argv['--output'],
                      overwrite=argv['--overwrite'])
    else:
        calls_asynchronously(argv, 'python -m negbio.negbio_section_split')
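
read_section_titles joins the heading lines with '|' into a single case-insensitive, multi-line regular expression, so a file with one section title per line becomes one alternation pattern. The equivalent construction, with made-up headings for illustration:

import re

titles = ['FINDINGS', 'IMPRESSION', 'COMPARISON']   # illustrative section headings
pattern = re.compile('|'.join(titles), re.IGNORECASE | re.MULTILINE)

print(bool(pattern.search('Impression: no acute cardiopulmonary process')))  # True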
Example 6
"""
Parse sentences

Usage:
    negbio_parse [options] --output=<directory> <file> ...

Options:
    --model=<directory>     Bllip parser model directory.
    --output=<directory>    Specify the output directory.
    --suffix=<suffix>       Append an additional SUFFIX to file names. [default: .bllip.xml]
    --verbose               Print more information about progress.
    --workers=<n>           Number of threads [default: 1]
    --files_per_worker=<n>  Number of input files per worker [default: 8]
    --overwrite             Overwrite the output file.
"""
from negbio.cli_utils import parse_args, calls_asynchronously
from negbio.pipeline2.parse import NegBioParser
from negbio.pipeline2.pipeline import NegBioPipeline

if __name__ == '__main__':
    argv = parse_args(__doc__)
    workers = int(argv['--workers'])
    if workers == 1:
        parser = NegBioParser(model_dir=argv['--model'])
        pipeline = NegBioPipeline(pipeline=[('NegBioParser', parser)])
        pipeline.scan(source=argv['<file>'],
                      directory=argv['--output'],
                      suffix=argv['--suffix'],
                      overwrite=argv['--overwrite'])
    else:
        calls_asynchronously(argv, 'python -m negbio.negbio_parse')
Example 7
"""
Determines the lemma

Usage:
    negbio_lemmatize [options] --output=<directory> <file> ...

Options:
    --output=<directory>    Specify the output directory.
    --suffix=<suffix>       Append an additional SUFFIX to file names. [default: .ud.xml]
    --verbose               Print more information about progress.
    --overwrite             Overwrite the output file.
"""
from negbio.cli_utils import parse_args
from negbio.pipeline2.lemmatize import Lemmatizer
from negbio.pipeline2.pipeline import NegBioPipeline

if __name__ == '__main__':
    argv = parse_args(__doc__)
    lemmatizer = Lemmatizer()
    pipeline = NegBioPipeline(pipeline=[('Lemmatizer', lemmatizer)])
    pipeline.scan(source=argv['<file>'],
                  directory=argv['--output'],
                  suffix=argv['--suffix'],
                  overwrite=argv['--overwrite'])
Example 8
"""
Usage:
    negbio_ssplit [options] --output=<directory> <file> ...

Options:
    --newline_is_sentence_break     Whether to treat newlines as sentence breaks. True means that
                                    a newline is always a sentence break. False means to ignore
                                    newlines for the purpose of sentence splitting. This is
                                    appropriate for continuous text, when just the non-whitespace
                                    characters should be used to determine sentence breaks.
                                    [default=False]
    --suffix=<suffix>               Append an additional SUFFIX to file names.
                                    [default: .ssplit.xml]
    --output=<directory>            Specify the output directory.
    --verbose                       Print more information about progress.
    --overwrite                     Overwrite the output file.
    --workers=<n>                   Number of threads [default: 1]
    --files_per_worker=<n>          Number of input files per worker [default: 8]
"""
from negbio.pipeline2.pipeline import NegBioPipeline
from negbio.pipeline2.ssplit import NegBioSSplitter
from negbio.cli_utils import parse_args, calls_asynchronously

if __name__ == '__main__':
    argv = parse_args(__doc__)
    workers = int(argv['--workers'])
    if workers == 1:
        splitter = NegBioSSplitter(newline=argv['--newline_is_sentence_break'])
        pipeline = NegBioPipeline(pipeline=[('NegBioSSplitter', splitter)])
        pipeline.scan(source=argv['<file>'],
                      directory=argv['--output'],
                      suffix=argv['--suffix'],
                      overwrite=argv['--overwrite'])
    else:
        calls_asynchronously(argv, 'python -m negbio.negbio_ssplit')
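
Because --newline_is_sentence_break is a bare docopt flag, argv['--newline_is_sentence_break'] is already a boolean and is passed straight through as the newline argument. A minimal construction sketch, using only the constructor shown above:

from negbio.pipeline2.ssplit import NegBioSSplitter

# newline=True treats every newline as a sentence boundary (line-oriented reports);
# newline=False ignores newlines and relies on punctuation in continuous text.
splitter = NegBioSSplitter(newline=True)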
Example 9
"""
Usage:
    negbio_normalize [options] --output=<directory> <file> ...

Options:
    --output=<directory>    Specify the output directory.
    --suffix=<suffix>       Append an additional SUFFIX to file names. [default: .normalized.xml]
    --verbose               Print more information about progress.
    --overwrite             Overwrite the output file.
"""

from negbio.cli_utils import parse_args
from negbio.pipeline2.pipeline import NegBioPipeline
from negbio.pipeline2.normalize_mimiccxr import MIMICCXRNormalizer

if __name__ == '__main__':
    argv = parse_args(__doc__)
    normalizer = MIMICCXRNormalizer()
    pipeline = NegBioPipeline(pipeline=[('MIMICCXRNormalizer', normalizer)])
    pipeline.scan(source=argv['<file>'],
                  directory=argv['--output'],
                  suffix=argv['--suffix'],
                  overwrite=argv['--overwrite'])
Example 10
"""
Convert from parse tree to universal dependencies

Usage:
    negbio_ptb2ud [options] --output=<directory> <file> ...

Options:
    --output=<directory>    Specify the output directory.
    --suffix=<suffix>       Append an additional SUFFIX to file names. [default: .ud.xml]
    --verbose               Print more information about progress.
    --workers=<n>           Number of threads [default: 1]
    --files_per_worker=<n>  Number of input files per worker [default: 8]
    --overwrite             Overwrite the output file.
"""
from negbio.cli_utils import parse_args, calls_asynchronously
from negbio.pipeline2.ptb2ud import NegBioPtb2DepConverter
from negbio.pipeline2.pipeline import NegBioPipeline

if __name__ == '__main__':
    argv = parse_args(__doc__)
    workers = int(argv['--workers'])
    if workers == 1:
        converter = NegBioPtb2DepConverter(universal=True)
        pipeline = NegBioPipeline(pipeline=[('NegBioPtb2DepConverter',
                                             converter)])
        pipeline.scan(source=argv['<file>'],
                      directory=argv['--output'],
                      suffix=argv['--suffix'],
                      overwrite=argv['--overwrite'])
    else:
        calls_asynchronously(argv, 'python -m negbio.negbio_ptb2ud')
Example 11
"""
Usage:
    negbio_dner_regex [options] --output=<directory> <file> ...

Options:
    --suffix=<suffix>       Append an additional SUFFIX to file names. [default: .regex.xml]
    --output=<directory>    Specify the output directory.
    --verbose               Print more information about progress.
    --phrases=<file>        File containing phrases for each observation. [default: patterns/cxr14_phrases_v2.yml]
    --overwrite             Overwrite the output file.
    --workers=<n>           Number of threads [default: 1]
    --files_per_worker=<n>  Number of input files per worker [default: 32]
"""
from pathlib import Path

from negbio.pipeline2.dner_regex import RegExExtractor
from negbio.cli_utils import parse_args, calls_asynchronously
from negbio.pipeline2.pipeline import NegBioPipeline

if __name__ == '__main__':
    argv = parse_args(__doc__)
    workers = int(argv['--workers'])
    if workers == 1:
        phrases_file = Path(argv['--phrases'])
        extractor = RegExExtractor(phrases_file, phrases_file.stem)
        pipeline = NegBioPipeline(pipeline=[('RegEx', extractor)])
        pipeline.scan(source=argv['<file>'],
                      directory=argv['--output'],
                      suffix=argv['--suffix'],
                      overwrite=argv['--overwrite'])
    else:
        calls_asynchronously(argv, 'python -m negbio.negbio_dner_regex')
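
RegExExtractor receives both the phrases file and that file's stem, which presumably serves as the name recorded for the extracted annotations. pathlib's stem is simply the file name without its final suffix:

from pathlib import Path

print(Path('patterns/cxr14_phrases_v2.yml').stem)  # cxr14_phrases_v2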
Example 12
    --uncertainty-regex-patterns=FILE           Regex uncertainty rules [default: patterns/uncertainty_regex_patterns.yml]
    --suffix=<suffix>               Append an additional SUFFIX to file names. [default: .neg2.xml]
    --verbose                       Print more information about progress.
    --output=<directory>            Specify the output directory.
    --workers=<n>                   Number of threads [default: 1]
    --files_per_worker=<n>          Number of input files per worker [default: 32]
    --overwrite                     Overwrite the output file.
"""
from negbio.cli_utils import parse_args, calls_asynchronously
from negbio.pipeline2.negdetect2 import NegBioNegDetector2, Detector2
from negbio.pipeline2.pipeline import NegBioPipeline

if __name__ == '__main__':
    argv = parse_args(__doc__)
    workers = int(argv['--workers'])
    if workers == 1:
        neg_detector = NegBioNegDetector2(
            Detector2(argv['--pre-negation-uncertainty-patterns'],
                      argv['--neg-patterns'],
                      argv['--post-negation-uncertainty-patterns'],
                      argv['--neg-regex-patterns'],
                      argv['--uncertainty-regex-patterns']))
        pipeline = NegBioPipeline(pipeline=[('NegBioNegDetector',
                                             neg_detector)])
        pipeline.scan(source=argv['<file>'],
                      directory=argv['--output'],
                      suffix=argv['--suffix'],
                      overwrite=argv['--overwrite'])
    else:
        calls_asynchronously(argv, 'python -m negbio.negbio_neg2')
Example 13
"""
Clean up sentences

Usage:
    negbio_cleanup [options] --output=<directory> <file> ...

Options:
    --suffix=<suffix>       Append an additional SUFFIX to file names. [default: .negbio.xml]
    --verbose               Print more information about progress.
    --output=<directory>    Specify the output directory.
    --overwrite             Overwrite the output file.
    --workers=<n>           Number of threads [default: 1]
    --files_per_worker=<n>  Number of input files per worker [default: 8]
"""

from negbio.cli_utils import parse_args, calls_asynchronously
from negbio.pipeline2.cleanup import CleanUp
from negbio.pipeline2.pipeline import NegBioPipeline

if __name__ == '__main__':
    argv = parse_args(__doc__)
    workers = int(argv['--workers'])
    if workers == 1:
        cleanup = CleanUp()
        pipeline = NegBioPipeline(pipeline=[('CleanUp', cleanup)])
        pipeline.scan(source=argv['<file>'],
                      directory=argv['--output'],
                      suffix=argv['--suffix'],
                      overwrite=argv['--overwrite'])
    else:
        calls_asynchronously(argv, 'python -m negbio.negbio_clean')
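
Each script above wraps a single pipe, but NegBioPipeline accepts any list of (name, pipe) tuples, so several stages can be combined into one scan over the input files instead of writing intermediate collections to disk. A sketch under that assumption, reusing only constructors shown in the examples above; the input files, output directory, and model_dir value are placeholders:

from negbio.pipeline2.cleanup import CleanUp
from negbio.pipeline2.parse import NegBioParser
from negbio.pipeline2.pipeline import NegBioPipeline
from negbio.pipeline2.ptb2ud import NegBioPtb2DepConverter
from negbio.pipeline2.ssplit import NegBioSSplitter

pipeline = NegBioPipeline(pipeline=[
    ('NegBioSSplitter', NegBioSSplitter(newline=False)),
    ('NegBioParser', NegBioParser(model_dir=None)),        # placeholder: pass the Bllip model directory here
    ('NegBioPtb2DepConverter', NegBioPtb2DepConverter(universal=True)),
    ('CleanUp', CleanUp()),
])
pipeline.scan(source=['report1.xml', 'report2.xml'],       # placeholder input files
              directory='output',                          # placeholder output directory
              suffix='.negbio.xml',
              overwrite=True)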