def write_sentence_documents(sentences: List[str],
                             labels: List[str],
                             path: Path,
                             labeled=True):
    typesystem = TypeSystem()
    cas = Cas(typesystem=typesystem)

    SentenceType = typesystem.create_type(
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence")
    SentimentType = typesystem.create_type("webanno.custom.Sentiment")
    typesystem.add_feature(type_=SentimentType,
                           name="value",
                           rangeTypeName="uima.cas.String")

    cas.sofa_string = " ".join(sentences)

    begin = 0
    for sentence, label in zip(sentences, labels):
        end = begin + len(sentence)
        cas_sentence = SentenceType(begin=begin, end=end)
        sentiment_annotation = SentimentType(begin=begin, end=end, value=label)
        begin = end + 1

        cas.add_annotation(cas_sentence)

        if labeled:
            cas.add_annotation(sentiment_annotation)

    cas.to_xmi(path, pretty_print=True)

    for sentence in cas.select(SENTENCE_TYPE):
        print(cas.get_covered_text(sentence))
def convert_single_file(input_paragraph_list: List[str],
                        output_xmi_file: str) -> None:
    document_text = '\n'.join(input_paragraph_list)

    cas = Cas(typesystem=cassis.load_dkpro_core_typesystem())
    cas.sofa_string = document_text

    print("----")
    print(document_text)
    print("----")

    token_type: Type = cas.typesystem.get_type(
        'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token')
    paragraph_type: Type = cas.typesystem.get_type(
        'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph')
    sentence_type: Type = cas.typesystem.get_type(
        'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence')

    total_doc_offset: int = 0
    for paragraph_str in input_paragraph_list:
        this_paragraph_total_offset = total_doc_offset

        doc: Doc = nlp(paragraph_str)

        for token in doc:
            assert isinstance(token, Token)
            # print(token.text, token.idx, len(token), token.idx + len(token), token.is_space)
            begin: int = total_doc_offset + token.idx
            end: int = total_doc_offset + token.idx + len(token)
            # annotate token -- only if it is not a space!
            if not token.is_space:
                cas.add_annotation(token_type.__call__(begin=begin, end=end))

        total_doc_offset += len(paragraph_str)

        # annotate paragraph
        this_paragraph_annotation = paragraph_type.__call__(
            begin=this_paragraph_total_offset, end=total_doc_offset)
        cas.add_annotation(this_paragraph_annotation)
        # and for paragraph too; but how about the '\n' char? maybe +1?
        total_doc_offset += 1

        # add sentences aligned exactly to paragraphs
        cas.add_annotation(
            sentence_type.__call__(begin=this_paragraph_annotation.begin,
                                   end=this_paragraph_annotation.end))

    print([x.get_covered_text() for x in cas.select(paragraph_type.name)])
    print([x.get_covered_text() for x in cas.select(sentence_type.name)])
    print([x.get_covered_text() for x in cas.select(token_type.name)])

    # create parent folder if not exists
    Path(output_xmi_file).parent.mkdir(parents=True, exist_ok=True)

    cas.to_xmi(output_xmi_file)
示例#3
0
def rebuilt2xmi(ci,
                output_dir,
                typesystem_path,
                iiif_mappings,
                pct_coordinates=False) -> str:
    """
    Converts a rebuilt ContentItem into Apache UIMA/XMI format.

    The resulting file will be named after the content item's ID, adding
    the `.xmi` extension.

    :param ci: the content item to be converted
    :type ci: `impresso_commons.classes.ContentItem`
    :param output_dir: the path to the output directory
    :type output_dir: str
    :param typesystem_path: TypeSystem file containing defitions of annotation
    layers.
    :type typesystem_path: str
    """

    with open(typesystem_path, "rb") as f:
        typesystem = load_typesystem(f)

    cas = Cas(typesystem=typesystem)
    cas.sofa_string = ci.fulltext
    cas.sofa_mime = 'text/plain'

    sentType = 'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence'
    imgLinkType = 'webanno.custom.ImpressoImages'
    Sentence = typesystem.get_type(sentType)
    ImageLink = typesystem.get_type(imgLinkType)

    # create sentence-level annotations
    start_offset = 0
    for break_offset in ci.lines:
        start = start_offset
        end = break_offset
        start_offset = break_offset
        cas.add_annotation(Sentence(begin=start, end=end))

    iiif_links = compute_image_links(ci,
                                     iiif_links=iiif_mappings,
                                     pct=pct_coordinates)

    # inject the IIIF links into
    for iiif_link, start, end in iiif_links:
        cas.add_annotation(ImageLink(begin=start, end=end, link=iiif_link))

    outfile_path = os.path.join(output_dir, f'{ci.id}.xmi')
    cas.to_xmi(outfile_path, pretty_print=True)
    return outfile_path