Example #1
def BioC_Converter(Inputfile, Outputfile, Originalfile):

    # Map document index -> passage index -> original passage text.
    tiabs = {}
    with open(Originalfile, 'r', encoding='utf8') as file_Originalfile:
        collection = bioc.load(file_Originalfile)
        document_count = 0
        for document in collection.documents:
            passage_count = 0
            for passage in document.passages:
                if document_count not in tiabs:
                    tiabs[document_count] = {}
                tiabs[document_count][passage_count] = passage.text
                passage_count = passage_count + 1
            document_count = document_count + 1

    # Copy the original passage texts back into the annotated collection and
    # refresh each annotation's text from its stored offsets.
    with open(Outputfile, 'w', encoding='utf8') as file_Outputfile:
        with open(Inputfile, 'r', encoding='utf8') as file_Inputfile:
            collection = bioc.load(file_Inputfile)
            document_count = 0
            for document in collection.documents:
                passage_count = 0
                for passage in document.passages:
                    passage.text = tiabs[document_count][passage_count]
                    for annotation in passage.annotations:
                        start = annotation.locations[0].offset
                        last = start + annotation.locations[0].length
                        annotation.text = tiabs[document_count][passage_count][
                            start:last]
                    passage_count = passage_count + 1
                document_count = document_count + 1
            bioc.dump(collection, file_Outputfile, pretty_print=False)
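A minimal driver for this converter might look like the following; the file names are hypothetical and the `bioc` package must be installed:

import bioc  # required by BioC_Converter above

# Hypothetical paths: restore the passage texts of 'original.xml' into the
# annotated 'annotated.xml', writing the merged collection to 'merged.xml'.
BioC_Converter('annotated.xml', 'merged.xml', 'original.xml')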
Example #2
def test_dump():
    with open(file, encoding='utf8') as fp:
        collection = bioc.load(fp, BioCFileType.BIOC_JSON)
    tmp = tempfile.mktemp()
    with open(tmp, 'w', encoding='utf8') as fp:
        bioc.dump(collection, fp, BioCFileType.BIOC_JSON)
    with open(tmp, encoding='utf8') as fp:
        collection = bioc.load(fp, BioCFileType.BIOC_JSON)
    assert_everything(collection)
Example #3
 def test_dump(self):
     with open(self.src) as fp:
         collection = bioc.load(fp)
     tmp = tempfile.NamedTemporaryFile()
     with open(tmp.name, 'w') as fp:
         bioc.dump(collection, fp)
     with open(tmp.name) as fp:
         collection = bioc.load(fp)
     self.__test_collection(collection)
Example #4
def create_prediction(source, dest, phrases_file, verbose=True):
    """

    Args:
        source: a list of source pathnames
        dest: output file name
        phrases_file: phrase pathname
    """
    with open(phrases_file) as fp:
        phrases = yaml.load(fp, yaml.FullLoader)
    total_findings = list(phrases.keys())

    rows = []
    cnt = collections.Counter()
    for pathname in tqdm.tqdm(source, total=len(source), disable=not verbose, unit='col'):
        with open(pathname, encoding='utf8') as fp:
            collection = bioc.load(fp)

        for doc in collection.documents:
            label_dict = aggregate(doc)
            # if doc.id == 's50351835':
            #     print(label_dict)
            label_vec = dict_to_vec(label_dict, total_findings)
            findings = collections.OrderedDict()
            findings['id'] = str(doc.id)
            for i, f in enumerate(total_findings):
                findings[f] = label_vec[i]
            rows.append(findings)

    rows = sorted(rows, key=lambda x: x['id'])
    row_df = pd.DataFrame(rows)
    row_df.to_csv(dest, index=None, float_format='%1.0f')
    logging.debug(cnt)
Example #5
def split(source,
          *,
          prefix: str,
          num_doc: int,
          additional_suffix: str = '.xml',
          suffix_length: int = 2):
    path_format = prefix + '{:0' + str(
        suffix_length) + 'x}' + additional_suffix

    with open(source, encoding='utf8') as fp:
        collection = bioc.load(fp)

    newc = bioc.BioCCollection()
    newc.infons = collection.infons
    newc.source = collection.source
    newc.version = collection.version
    newc.standalone = collection.standalone

    i = 0
    for doc in tqdm.tqdm(collection.documents):
        newc.add_document(doc)
        if len(newc.documents) == num_doc:
            dst = path_format.format(i)
            with open(dst, 'w', encoding='utf8') as fp:
                bioc.dump(newc, fp)
            del newc.documents[:]
            i += 1
    if newc.documents:
        dst = path_format.format(i)
        with open(dst, 'w', encoding='utf8') as fp:
            bioc.dump(newc, fp)
Example #6
    def xml_decoder(self):
        """Decode an XML file and return a bioc.collection object in a variable called output."""
        with open("foodbase/FoodBase_curated.xml", "r") as xml_file:
            collection = bioc.load(xml_file)
            output = bioc.dumps(collection, BioCFileType.BIOC_JSON, indent=2)

        return output
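The same XML-to-JSON conversion can be sketched without the hardcoded path; `BioCFileType` comes from the `bioc` package, and both file names here are hypothetical:

import bioc
from bioc import BioCFileType

with open('collection.xml', encoding='utf8') as xml_file:  # hypothetical input
    collection = bioc.load(xml_file)
with open('collection.json', 'w', encoding='utf8') as json_file:
    bioc.dump(collection, json_file, BioCFileType.BIOC_JSON, indent=2)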
Example #7
def BioC_Converter(infile, outfile, biotag_dic, nn_model, para_set):

    with open(infile, 'r', encoding='utf-8') as fin:
        with open(outfile, 'w', encoding='utf8') as fout:
            collection = bioc.load(fin)
            for document in collection.documents:
                for passage in document.passages:
                    tag_result = bioTag(passage.text,
                                        biotag_dic,
                                        nn_model,
                                        onlyLongest=para_set['onlyLongest'],
                                        abbrRecog=para_set['abbrRecog'],
                                        Threshold=para_set['ML_Threshold'])
                    mention_num = 0
                    for ele in tag_result:
                        bioc_note = bioc.BioCAnnotation()
                        bioc_note.id = str(mention_num)
                        mention_num += 1
                        bioc_note.infons['identifier'] = ele[2]
                        bioc_note.infons['type'] = "Phenotype"
                        bioc_note.infons['score'] = ele[3]
                        start = int(ele[0])
                        last = int(ele[1])
                        loc = bioc.BioCLocation(offset=str(start),
                                                length=str(last - start))
                        bioc_note.locations.append(loc)
                        bioc_note.text = passage.text[start:last]
                        passage.annotations.append(bioc_note)
            bioc.dump(collection, fout, pretty_print=True)
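`bioTag`, `biotag_dic`, `nn_model`, and `para_set` come from the surrounding tagging project and are not shown. Judging from how the results are consumed above, each element is a (start, end, identifier, score) record; a hypothetical stub for exercising the converter might be:

def bioTag(text, biotag_dic, nn_model, onlyLongest=True, abbrRecog=True, Threshold=0.95):
    # Hypothetical stub: return one mention covering the first nine
    # characters, in the [start, end, identifier, score] shape used above.
    return [['0', '9', 'HP:0000001', '0.99']]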
Example #8
def read_word_based_annotations(infile, textfile, prediction_file=False):
    with open(infile, 'r') as fin:
        try:
            collection = bioc.load(fin)
        except:
            logging.error('BioC file {0} not well formed'.format(infile))
            raise
    with open(textfile, 'r') as fin:
        text = fin.read()
    assert len(collection.documents) != 0, "Each document should be encoded in its own collection"

    annotations, seen_anns = {}, set()
    for passage in collection.documents[0].passages:
        for annotation in passage.annotations:
            assert annotation.id not in seen_anns, 'Duplicate annotation id found. Please verify {0}'.format(
                annotation.id)
            seen_anns.add(annotation.id)
            if annotation.infons['type'] in ANNOTATION_TYPE:
                word_annotations = split_annotations(
                    annotation.id, annotation.infons['type'],
                    annotation.locations[0].offset,
                    annotation.locations[0].length, text)
                for word_id, word_annotation in word_annotations:
                    annotations[word_id] = word_annotation

    annotation_map = {ann: key for key, ann in annotations.items()}
    return annotations, annotation_map
Example #9
def test_chexpert_extractor():
    extractor = RegExExtractor(
        __tests_dir / 'data/patterns/chexpert_phrases.yml', 'CheXpert labeler')

    dir = get_example_dir()
    with open(dir / '1.chexpert.xml') as fp:
        c = bioc.load(fp)

    actual_documents = c.documents
    expected_documents = []
    for doc in actual_documents:
        doc = copy.deepcopy(doc)
        for p in doc.passages:
            del p.annotations[:]
        expected_documents.append(doc)

    for expected_doc, actual_doc in zip(expected_documents, actual_documents):
        extractor(expected_doc)
        expected_anns = sorted(list(
            bioc.annotations(expected_doc, bioc.PASSAGE)),
                               key=lambda a: a.total_span.offset)
        actual_anns = sorted(list(bioc.annotations(actual_doc, bioc.PASSAGE)),
                             key=lambda a: a.total_span.offset)

        assert len(expected_anns) == len(actual_anns), \
            '{} vs {}'.format(len(expected_anns), len(actual_anns))
        for expected_ann, actual_ann in zip(expected_anns, actual_anns):
            assert expected_ann.total_span == actual_ann.total_span
            for k in ['observation', 'annotator']:
                assert expected_ann.infons[k] == actual_ann.infons[k]
Example #10
def main():
    argv = parse_args(__doc__, version='version 2')
    print(argv)

    lemmatizer = Lemmatizer()
    ptb2dep = NegBioPtb2DepConverter(lemmatizer, universal=True)
    splitter = NegBioSSplitter(newline=argv['--newline_is_sentence_break'])
    parser = NegBioParser(model_dir=argv['--bllip-model'])

    argv = get_absolute_path(argv, '--neg-patterns',
                             'negbio/patterns/neg_patterns.txt')
    argv = get_absolute_path(argv, '--uncertainty-patterns',
                             'negbio/patterns/uncertainty_patterns.txt')

    mm = pymetamap.MetaMap.get_instance(argv['--metamap'])
    neg_detector = negdetect.Detector(argv['--neg-patterns'],
                                      argv['--uncertainty-patterns'])

    if argv['--cuis'] == 'None':
        cuis = None
    else:
        cuis = read_cuis(argv['--cuis'])

    if argv['text']:
        collection = text2bioc.text2collection(argv['SOURCES'])
    elif argv['bioc']:
        with open(argv['SOURCE']) as fp:
            collection = bioc.load(fp)
    else:
        raise KeyError

    pipeline(collection, mm, splitter, parser, ptb2dep, neg_detector, cuis)

    with open(os.path.expanduser(argv['--output']), 'w') as fp:
        bioc.dump(collection, fp)
Example #11
def test_toJSON():
    with open(file, encoding='utf8') as fp:
        collection = bioc.load(fp, BioCFileType.BIOC_JSON)
    obj = toJSON(collection)
    assert obj['documents'][0]['id'] == '1'

    with pytest.raises(TypeError):
        toJSON({})
Example #12
def test_dump():
    collection = _get_collection()
    tmp = tempfile.mktemp()
    with open(tmp, 'w', encoding='utf8') as fp:
        bioc.dump(collection, fp)
    with open(tmp, encoding='utf8') as fp:
        collection = bioc.load(fp)
    assert_everything(collection)
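`_get_collection` and `assert_everything` are test helpers that are not shown here; a minimal stand-in that builds a one-passage collection could look like this (the id and text are made up):

import bioc

def _get_collection():
    collection = bioc.BioCCollection()
    document = bioc.BioCDocument()
    document.id = '1'
    passage = bioc.BioCPassage()
    passage.offset = 0
    passage.text = 'No pneumothorax.'
    document.add_passage(passage)
    collection.add_document(document)
    return collection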
Example #13
def add_text(objs: List[OneFigure], bioc_dir):
    for obj in objs:
        pmcid = obj.pmcid
        with open(bioc_dir / f'{pmcid}.xml', encoding='utf8') as fp:
            collection = bioc.load(fp)
            for doc in collection.documents:
                get_figure_caption(obj, doc)
    return objs
Example #14
def loadDataFromBioC(filename, ignoreEntities=[]):
    with open(filename, 'r') as fp:
        collection = bioc.load(fp)
    assert isinstance(collection, bioc.BioCCollection)

    parsed = []
    for document in collection.documents:
        parsed += convertBiocDocToKindredDocs(document)
    return parsed
Example #15
def add_text(objs: List[OneArticle], bioc_dir):
    for obj in objs:
        pmcid = obj.pmcid
        with open(bioc_dir / f'{pmcid}.xml', encoding='utf8') as fp:
            collection = bioc.load(fp)
            for doc in collection.documents:
                # split sentences
                # doc = split_sentences(doc)
                for figure in obj.figures.values():
                    get_figure_caption(figure, doc)
    return objs
Example #16
    def parse(self, filename, output_dir=None):
        # A tempfile.mkdtemp() default argument would be evaluated once at
        # definition time; create the temporary directory lazily instead.
        if output_dir is None:
            output_dir = tempfile.mkdtemp()
        # Read file and do preliminary pre processing to form rows of records
        data_rows = []
        with open(filename, 'r') as fp:
            collection = bioc.load(fp)
            for doc in collection.documents:
                rows_x = self.convert_bioc_document_to_rows(doc)
                data_rows.extend(rows_x)

        # subset
        # data_rows =  data_rows[1:100]
        return data_rows
Example #17
def test_text_to_collection_file():
    text = 'No pneumothorax.'

    input = tempfile.mktemp()
    with open(input, 'w') as fp:
        fp.write(text)

    output = tempfile.mktemp()
    text_to_collection_file(output, input)
    with open(output) as fp:
        c = bioc.load(fp)
    assert c.documents[0].passages[0].text == text
Example #18
def main():
    argv = parse_args(__doc__, version='version 2')
    print(argv)

    lemmatizer = Lemmatizer()
    ptb2dep = NegBioPtb2DepConverter(lemmatizer, universal=True)
    ssplitter = NegBioSSplitter(newline=argv['--newline_is_sentence_break'])
    parser = NegBioParser(model_dir=argv['--bllip-model'])

    argv = get_absolute_path(argv, '--mention_phrases_dir',
                             'negbio/chexpert/phrases/mention')
    argv = get_absolute_path(argv, '--unmention_phrases_dir',
                             'negbio/chexpert/phrases/unmention')
    argv = get_absolute_path(
        argv, '--pre-negation-uncertainty-patterns',
        'negbio/chexpert/patterns/pre_negation_uncertainty.txt')
    argv = get_absolute_path(
        argv, '--post-negation-uncertainty-patterns',
        'negbio/chexpert/patterns/post_negation_uncertainty.txt')
    argv = get_absolute_path(argv, '--neg-patterns',
                             'negbio/chexpert/patterns/negation.txt')

    # chexpert
    loader = NegBioLoader()
    extractor = NegBioExtractor(Path(argv['--mention_phrases_dir']),
                                Path(argv['--unmention_phrases_dir']),
                                verbose=argv['--verbose'])
    neg_detector = ModifiedDetector(
        argv['--pre-negation-uncertainty-patterns'], argv['--neg-patterns'],
        argv['--post-negation-uncertainty-patterns'])
    aggregator = NegBioAggregator(CATEGORIES, verbose=argv['--verbose'])

    if argv['text']:
        collection = text2bioc.text2collection(argv['SOURCES'])
    elif argv['bioc']:
        with open(argv['SOURCE']) as fp:
            collection = bioc.load(fp)
    else:
        raise KeyError

    pipeline(collection,
             loader,
             ssplitter,
             extractor,
             parser,
             ptb2dep,
             neg_detector,
             aggregator,
             verbose=argv['--verbose'])

    with open(os.path.expanduser(argv['--output']), 'w') as fp:
        bioc.dump(collection, fp)
Example #19
def test_scan_collection():
    filenames = create_collections()
    output_dir = tempfile.mkdtemp()
    os.rmdir(output_dir)

    p = NegBioPipeline([('fake', FakePipe())])
    p.scan(source=filenames, directory=output_dir, suffix='.xml')
    for filename in filenames:
        filename = os.path.join(output_dir, os.path.basename(filename))
        with open(filename) as fp:
            c = bioc.load(fp)
            for doc in c.documents:
                assert doc.infons['fake']
Example #20
def test_BioCXMLDocumentWriter_file():
    collection = _get_collection()

    tmp = tempfile.mktemp()
    with bioc.BioCXMLDocumentWriter(tmp) as writer:
        writer.write_collection_info(collection)
        for document in collection.documents:
            writer.write_document(document)

    with open(tmp, encoding='utf8') as fp:
        collection = bioc.load(fp)

    assert_everything(collection)
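The reading counterpart is a streaming document reader; a sketch, assuming the same `bioc` version used above, which exposes `BioCXMLDocumentReader` at the top level:

reader = bioc.BioCXMLDocumentReader(tmp)
collection = reader.get_collection_info()  # collection-level metadata only
for document in reader:
    collection.add_document(document)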
Example #21
def test_split_file(tmp_path):
    total_doc = 230
    n = 7
    c = get_collection(total_doc)

    source = tmp_path / 'foo.xml'
    with open(source, 'w') as fp:
        bioc.dump(c, fp)

    split.split_file(source, prefix=str(tmp_path), num_doc=n)
    for i in range(int(total_doc / n)):
        source = str(tmp_path) + '{:02x}.xml'.format(i)
        with open(source) as fp:
            subc = bioc.load(fp)
            assert len(subc.documents) == n

    last_n = int(math.ceil(total_doc / n))
    if last_n > int(total_doc / n):
        source = str(tmp_path) + '{:02x}.xml'.format(last_n - 1)
        with open(source) as fp:
            subc = bioc.load(fp)
            assert len(subc.documents) == total_doc % n
Example #22
File: utils.py Project: leonweber/pedl
def _process_pubtator_files(files: List[Path], q: mp.Queue, pickle_path: Path):
    for file in files:
        partial_index = {}
        with file.open() as f:
            collection = bioc.load(f)
            for i, document in enumerate(collection.documents):
                pmid, is_fulltext = get_pmid(document)
                partial_index[pmid] = (file.name, i, is_fulltext)
        q.put(partial_index)

        if pickle_path is not None:
            with open(pickle_path / file.with_suffix(".pkl").name, "wb") as f:
                pickle.dump(collection, f)
Example #23
def get_figure_link(biocfile) -> List[str]:
    try:
        with open(biocfile, 'r', encoding='utf8') as fp:
            c = bioc.load(fp)
    except Exception:
        return []
    figures = []
    for doc in c.documents:
        for p in doc.passages:
            if len(p.text) == 0:
                continue
            p.text = p.text.replace('\n', ' ')
            if 'file' in p.infons and 'type' in p.infons and p.infons[
                    'type'] in FIG_PASSAGE:
                figures.append(p.infons["file"])
    return figures
Example #24
    def __load_collection_xml(bioc_xml: str, is_file: bool = True):
        """load a xml bioc collection.
        It will return a bioc collection object.

        :param bioc_xml: a str path to a bioc file or a bioc input xml string
        :param is_file: if True bioc_input is a path else it is a string
        :returns:  a bioc collection object
        """
        if is_file:
            with open(bioc_xml, 'r') as fp:
                collection = bioc.load(fp)
            return collection
        else:
            return bioc.loads(bioc_xml)
Example #25
File: split.py Project: bionlplab/bioc
def split_file(source,
               *,
               prefix: str,
               num_doc: int,
               additional_suffix: str = '.xml',
               suffix_length: int = 2):
    path_format = prefix + '{:0' + str(
        suffix_length) + 'x}' + additional_suffix

    with open(source, encoding='utf8') as fp:
        collection = bioc.load(fp)

    for i, subc in tqdm.tqdm(enumerate(itersplit(collection, num_doc))):
        dst = path_format.format(i)
        with open(dst, 'w', encoding='utf8') as fp:
            bioc.dump(subc, fp)
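A hypothetical invocation: splitting 'corpus.xml' into chunks of 100 documents writes 'corpus00.xml', 'corpus01.xml', and so on (the suffix counts in hex, per the '{:02x}' format):

split_file('corpus.xml', prefix='corpus', num_doc=100)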
Example #26
def get_figure_link(pmc, bioc_file):
    with open(bioc_file, encoding='utf8') as fp:
        c = bioc.load(fp)

    figures = []
    for doc in c.documents:
        for p in doc.passages:
            if len(p.text) == 0:
                continue
            p.text = p.text.replace('\n', ' ')
            if 'file' in p.infons and 'type' in p.infons and p.infons[
                    'type'] in FIG_PASSAGE:
                url = f'https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{p.infons["file"]}'
                caption = p.text  # newlines were already replaced above
                f = Figure(pmc, url, caption)
                figures.append(f)
    return figures
Example #27
def test_level():
    with pytest.raises(ValueError):
        BioCJsonIterWriter(io.StringIO(), level=-1)

    with open(file, encoding='utf8') as fp:
        collection = bioc.load(fp, BioCFileType.BIOC_JSON)

    with pytest.raises(ValueError):
        writer = BioCJsonIterWriter(io.StringIO(), level=bioc.SENTENCE)
        writer.write(collection.documents[0])

    with pytest.raises(ValueError):
        writer = BioCJsonIterWriter(io.StringIO(), level=bioc.PASSAGE)
        writer.write(collection.documents[0])

    with pytest.raises(ValueError):
        writer = BioCJsonIterWriter(io.StringIO(), level=bioc.DOCUMENT)
        writer.write(collection.documents[0].passages[0])
Example #28
def get_figure_text(src1, src2, dest, history_file, bioc_dir):
    df1 = pd.read_csv(src1, dtype=str)
    df2 = pd.read_csv(src2, dtype=str)
    df = pd.concat([df1, df2], axis=0)
    figures = create_figures(df, history_file=history_file)

    docs = {}  # type: Dict[str, bioc.BioCDocument]
    for figure in figures:
        pmcid = figure.pmcid
        if pmcid not in docs:
            src = bioc_dir / generate_path(pmcid) / f'{pmcid}.xml'
            with open(src, encoding='utf8') as fp:
                collection = bioc.load(fp)
            docs[pmcid] = collection.documents[0]
        add_text(figure, docs[figure.pmcid])

    with open(dest, 'w', encoding='utf8') as fp:
        objs = [f.to_dict() for f in figures]
        json.dump(objs, fp, indent=2)
Example #29
def scan_document(*_, **kwargs):
    """
    Scan each document in a list of BioC source files, apply fn, and print to directory.

    Args:
        kwargs:
            source(list): a list of source pathnames
            directory(str): output directory
            fn:
                fn should expect the following arguments in this given order:
                    sequence1
                    sequence2
                    ...
                    non_sequence1
                    non_sequence2
                    ...
            verbose(boolean):
    """
    source = kwargs.pop('source')
    verbose = kwargs.pop('verbose', True)
    directory = os.path.expanduser(kwargs.pop('directory'))
    suffix = kwargs.pop('suffix')
    fn = kwargs.pop('fn')
    non_sequences = kwargs.pop('non_sequences', [])

    if not os.path.exists(directory):
        os.makedirs(directory)

    def catch(document, non_sequences):
        try:
            return fn(document, *non_sequences)
        except Exception:
            logging.exception('Cannot process %s', document.id)
            return None

    for pathname in tqdm.tqdm(source, total=len(source), disable=not verbose):
        basename = os.path.splitext(os.path.basename(pathname))[0]
        dstname = os.path.join(directory, '{}{}'.format(basename, suffix))
        with io.open(pathname, encoding='utf8') as fp:
            collection = bioc.load(fp)
        # Drop documents that failed to process instead of keeping None entries.
        collection.documents = [
            doc for doc in (catch(d, non_sequences) for d in collection.documents)
            if doc is not None
        ]
        with io.open(dstname, 'w', encoding='utf8') as fp:
            bioc.dump(collection, fp)
Example #30
def read_prediction(infile):
    with open(infile, 'r') as fin:
        try:
            collection = bioc.load(fin)
        except:
            logging.error('BioC file {0} not well formed'.format(infile))
            raise
    assert len(collection.documents) != 0, "Each document should be encoded in its own collection"
    annotations, relations = {}, {}
    for passage in collection.documents[0].passages:
        for annotation in passage.annotations:
            assert annotation.id not in annotations, 'Duplicate annotation id found. Please verify {0}'.format(
                annotation.id)
            if annotation.infons['type'] in ANNOTATION_TYPE:
                annotations[annotation.id] = (annotation.infons['type'],
                                              annotation.locations[0].offset,
                                              annotation.locations[0].length)

    for passage in collection.documents[0].passages:
        for relation in passage.relations:
            assert relation.id not in relations, 'Duplicate relation id found. Please verify {0}'.format(
                relation.id)
            if relation.infons['type'] in RELATION_TYPE:
                if relation.nodes[0].refid in annotations and relation.nodes[
                        1].refid in annotations:
                    # Disregarding relations that have illegal annotation ids.
                    annotation1 = annotations[relation.nodes[0].refid]
                    annotation2 = annotations[relation.nodes[1].refid]
                    relations[relation.id] = (relation.infons['type'],
                                              annotation1, annotation2)
                else:
                    logging.debug(
                        'Disregarding relation id {0} from file {1} because annotation entries are not valid'
                        .format(relation.id, infile))

    return annotations, relations
Example #31
def scan_collection(*_, **kwargs):
    """
        Scan each document in a list of BioC source files, apply fn, and print to directory.

        Args:
            kwargs:
                source(list): a list of source pathnames
                directory(str): output directory
                fn:
                    fn should expect the following arguments in this given order:
                        sequence1
                        sequence2
                        ...
                        non_sequence1
                        non_sequence2
                        ...
                verbose(boolean):
        """
    source = kwargs.pop('source')
    verbose = kwargs.pop('verbose', True)
    directory = os.path.expanduser(kwargs.pop('directory'))
    suffix = kwargs.pop('suffix')
    fn = kwargs.pop('fn')
    non_sequences = kwargs.pop('non_sequences', [])

    for pathname in tqdm.tqdm(source, total=len(source), disable=not verbose):
        basename = os.path.splitext(os.path.basename(pathname))[0]
        dstname = os.path.join(directory, '{}{}'.format(basename, suffix))
        with open(pathname) as fp:
            collection = bioc.load(fp)
            try:
                args = [collection] + non_sequences
                fn(*args)
            except Exception:
                logging.exception('Cannot process %s', collection.source)
        with open(dstname, 'w') as fp:
            bioc.dump(collection, fp)
Example #32
 def test_validate(self):
     with open(self.src) as fp:
         collection = bioc.load(fp)
     bioc.validate(collection)
Example #33
 def test_dumps(self):
     with open(self.src) as fp:
         collection = bioc.load(fp)
     s = bioc.dumps(collection)
     collection = bioc.loads(s)
     self.__test_collection(collection)
Example #34
 def test_load(self):
     with open(self.src) as fp:
         collection = bioc.load(fp)
     self.__test_collection(collection)