def write_sentence_documents(sentences: List[str],
                             labels: List[str],
                             path: Path,
                             labeled=True):
    """Serialize sentences (and optionally their labels) to an XMI file.

    Builds a fresh CAS whose sofa text is the space-joined sentences, adds
    one Sentence annotation per input sentence and -- when ``labeled`` is
    true -- a parallel ``webanno.custom.Sentiment`` annotation carrying the
    corresponding label, then writes the CAS to *path*.
    """
    typesystem = TypeSystem()
    cas = Cas(typesystem=typesystem)

    SentenceType = typesystem.create_type(
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence")
    SentimentType = typesystem.create_type("webanno.custom.Sentiment")
    typesystem.add_feature(type_=SentimentType,
                           name="value",
                           rangeTypeName="uima.cas.String")

    cas.sofa_string = " ".join(sentences)

    offset = 0
    for text, label in zip(sentences, labels):
        span_end = offset + len(text)
        cas.add_annotation(SentenceType(begin=offset, end=span_end))
        if labeled:
            cas.add_annotation(
                SentimentType(begin=offset, end=span_end, value=label))
        offset = span_end + 1  # skip the joining space

    cas.to_xmi(path, pretty_print=True)

    # Echo the annotated sentences back as a sanity check.
    for annotation in cas.select(SENTENCE_TYPE):
        print(cas.get_covered_text(annotation))
def convert_single_file(input_paragraph_list: List[str],
                        output_xmi_file: str) -> None:
    """Convert a list of paragraph strings to a DKPro-typed XMI file.

    Paragraphs are joined with a single newline into the sofa text. Each
    paragraph gets one Paragraph and one co-extensive Sentence annotation;
    Token annotations come from spaCy tokenization (the global ``nlp``),
    skipping whitespace tokens. Parent directories of *output_xmi_file*
    are created as needed.
    """
    document_text = '\n'.join(input_paragraph_list)

    cas = Cas(typesystem=cassis.load_dkpro_core_typesystem())
    cas.sofa_string = document_text

    print("----")
    print(document_text)
    print("----")

    token_type: Type = cas.typesystem.get_type(
        'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token')
    paragraph_type: Type = cas.typesystem.get_type(
        'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph')
    sentence_type: Type = cas.typesystem.get_type(
        'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence')

    offset = 0
    for paragraph_str in input_paragraph_list:
        paragraph_start = offset

        for tok in nlp(paragraph_str):
            assert isinstance(tok, Token)
            if tok.is_space:
                continue  # whitespace "tokens" get no annotation
            cas.add_annotation(
                token_type(begin=offset + tok.idx,
                           end=offset + tok.idx + len(tok)))

        offset += len(paragraph_str)

        paragraph_annotation = paragraph_type(begin=paragraph_start,
                                              end=offset)
        cas.add_annotation(paragraph_annotation)

        # One sentence aligned exactly with the paragraph span.
        cas.add_annotation(sentence_type(begin=paragraph_annotation.begin,
                                         end=paragraph_annotation.end))

        offset += 1  # account for the '\n' separator between paragraphs

    print([x.get_covered_text() for x in cas.select(paragraph_type.name)])
    print([x.get_covered_text() for x in cas.select(sentence_type.name)])
    print([x.get_covered_text() for x in cas.select(token_type.name)])

    # Make sure the target directory exists before serializing.
    Path(output_xmi_file).parent.mkdir(parents=True, exist_ok=True)

    cas.to_xmi(output_xmi_file)
Example #3
0
def rebuilt2xmi(ci,
                output_dir,
                typesystem_path,
                iiif_mappings,
                pct_coordinates=False) -> str:
    """
    Convert a rebuilt ContentItem into Apache UIMA/XMI format.

    The resulting file is named after the content item's ID with the
    `.xmi` extension appended.

    :param ci: the content item to be converted
    :type ci: `impresso_commons.classes.ContentItem`
    :param output_dir: the path to the output directory
    :type output_dir: str
    :param typesystem_path: TypeSystem file containing definitions of
    annotation layers.
    :type typesystem_path: str
    :param iiif_mappings: mappings used by `compute_image_links` to resolve
    IIIF image links.
    :param pct_coordinates: whether image coordinates are expressed as
    percentages.
    :type pct_coordinates: bool
    :return: the path of the written XMI file.
    :rtype: str
    """
    with open(typesystem_path, "rb") as f:
        typesystem = load_typesystem(f)

    cas = Cas(typesystem=typesystem)
    cas.sofa_string = ci.fulltext
    cas.sofa_mime = 'text/plain'

    Sentence = typesystem.get_type(
        'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence')
    ImageLink = typesystem.get_type('webanno.custom.ImpressoImages')

    # One Sentence annotation per line of the content item; consecutive
    # line-break offsets delimit each span.
    previous_break = 0
    for break_offset in ci.lines:
        cas.add_annotation(Sentence(begin=previous_break, end=break_offset))
        previous_break = break_offset

    iiif_links = compute_image_links(ci,
                                     iiif_links=iiif_mappings,
                                     pct=pct_coordinates)

    # Attach the IIIF links as custom image annotations.
    for link, begin, end in iiif_links:
        cas.add_annotation(ImageLink(begin=begin, end=end, link=link))

    outfile_path = os.path.join(output_dir, f'{ci.id}.xmi')
    cas.to_xmi(outfile_path, pretty_print=True)
    return outfile_path
Example #4
0
    def generate_cas(self, typesystem: TypeSystem) -> Cas:
        """Build a CAS containing `self.size` rounds of random annotations.

        The DocumentAnnotation type is excluded; the remaining types are
        shuffled once with `self.rnd`, then each round creates one feature
        structure per type with random begin/end offsets (end is at least
        `self.minimum_width`).
        """
        cas = Cas(typesystem)
        cas.sofa_string = "x" * 130

        candidate_types = list(typesystem.get_types())
        candidate_types.remove(
            cas.typesystem.get_type(TYPE_NAME_DOCUMENT_ANNOTATION))
        self.rnd.shuffle(candidate_types)

        for _ in range(self.size):
            for annotation_type in candidate_types:
                # Same RNG call order as before: begin first, then end.
                start = self.rnd.randint(0, 100)
                stop = self.rnd.randint(0, 30) + self.minimum_width
                cas.add(annotation_type(begin=start, end=stop))

        return cas
def export_annotated_texts_to_xmi(annotated_texts: List[AnnotatedText],
                                  type_system,
                                  file: str,
                                  xmi_file=None):
    """Export annotated texts into a single zipped XMI file.

    Concatenates the texts (each forced to end with a newline) into one
    sofa string, then adds token, sentence and named-entity annotations
    whose per-text offsets are shifted by each text's start position in
    the sofa.

    :param annotated_texts: texts whose token/annotation offsets are local
        to each text
    :param type_system: cassis TypeSystem providing the TOKEN_NS,
        SENTENCE_NS and NAMED_ENTITY_NS types
    :param file: path of the zip file to write
    :param xmi_file: optional name of the XMI entry inside the zip,
        forwarded to `dump_cas_to_zip_file`
    """
    cas = Cas(typesystem=type_system)

    # Hoist the type lookups out of the loops -- they are loop-invariant.
    token_type = cas.typesystem.get_type(TOKEN_NS)
    sentence_type = cas.typesystem.get_type(SENTENCE_NS)
    named_entity_type = cas.typesystem.get_type(NAMED_ENTITY_NS)

    # Create sofa string via join (avoids quadratic += concatenation)
    # while recording where each text starts.
    current_start = 0
    starts = []
    parts = []
    for annotated_text in annotated_texts:
        starts.append(current_start)
        text = annotated_text.text
        if not text.endswith('\n'):
            text += '\n'
        parts.append(text)
        current_start += len(text)

    cas.sofa_string = ''.join(parts)

    # Tokens
    for annotated_text, start in zip(annotated_texts, starts):
        for token in annotated_text.tokens:
            cas.add_annotation(token_type(begin=start + token.start,
                                          end=start + token.stop))

    # Sentences (one per text; excludes any newline appended above)
    for annotated_text, start in zip(annotated_texts, starts):
        cas.add_annotation(sentence_type(
            begin=start, end=start + len(annotated_text.text)))

    # Named-entity annotations
    for annotated_text, start in zip(annotated_texts, starts):
        for annotation in annotated_text.annotations:
            cas.add_annotation(named_entity_type(
                value=annotation.label,
                begin=start + annotation.start,
                end=start + annotation.stop))

    # write the CAS into the zip archive
    with open(file, 'wb') as f:
        dump_cas_to_zip_file(cas, f, xmi_file=xmi_file)
def load_newsgroup_test_data() -> List[Cas]:
    """Fetch five 20-newsgroups test posts, each wrapped in a one-sentence CAS."""
    twenty_test = fetch_20newsgroups(subset="test", categories=NEWSGROUP_CATEGORIES, shuffle=True, random_state=42)

    typesystem = build_typesystem()
    SentenceType = typesystem.get_type(SENTENCE_TYPE)

    casses = []
    for text in twenty_test.data[:5]:
        cas = Cas(typesystem=typesystem)
        cas.sofa_string = text

        # A single sentence annotation spanning the whole document.
        cas.add_annotation(SentenceType(begin=0, end=len(text)))

        casses.append(cas)

    return casses
def load_newsgroup_training_data() -> List[TrainingDocument]:
    """Fetch 20-newsgroups training posts as TrainingDocuments.

    Each post becomes a CAS with one whole-document Sentence annotation and
    one Predicted annotation carrying the gold category name.
    """
    twenty_train = fetch_20newsgroups(subset="train", categories=NEWSGROUP_CATEGORIES, shuffle=True, random_state=42)
    target_names = twenty_train.target_names

    typesystem = build_typesystem()
    SentenceType = typesystem.get_type(SENTENCE_TYPE)
    PredictedType = typesystem.get_type(PREDICTED_TYPE)

    documents = []
    for index, (text, target) in enumerate(zip(twenty_train.data, twenty_train.target)):
        cas = Cas(typesystem=typesystem)
        cas.sofa_string = text

        # Both annotations span the whole document.
        cas.add_annotation(SentenceType(begin=0, end=len(text)))
        cas.add_annotation(PredictedType(begin=0, end=len(text), value=target_names[target]))

        documents.append(TrainingDocument(cas, f"doc_{index}", USER))

    return documents