def write_sentence_documents(sentences: List[str], labels: List[str], path: Path, labeled=True):
    """Write sentences, optionally with sentiment labels, as one XMI document.

    The document text is the sentences joined by single spaces. Each
    (sentence, label) pair produces a Sentence annotation spanning that
    sentence; when ``labeled`` is true a ``webanno.custom.Sentiment``
    annotation over the same span is added too, carrying the label in its
    ``value`` feature. The CAS is then serialized to ``path`` and the covered
    text of every annotation selected via ``SENTENCE_TYPE`` is printed.

    Sentences and labels are paired with ``zip``, so surplus entries of the
    longer list are ignored.
    """
    type_system = TypeSystem()
    document = Cas(typesystem=type_system)

    sentence_cls = type_system.create_type(
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence"
    )
    sentiment_cls = type_system.create_type("webanno.custom.Sentiment")
    type_system.add_feature(type_=sentiment_cls, name="value", rangeTypeName="uima.cas.String")

    document.sofa_string = " ".join(sentences)

    offset = 0
    for text, tag in zip(sentences, labels):
        stop = offset + len(text)
        sentence_span = sentence_cls(begin=offset, end=stop)
        sentiment_span = sentiment_cls(begin=offset, end=stop, value=tag)

        document.add_annotation(sentence_span)
        if labeled:
            document.add_annotation(sentiment_span)

        # Step over the single space that separates consecutive sentences.
        offset = stop + 1

    document.to_xmi(path, pretty_print=True)

    for annotation in document.select(SENTENCE_TYPE):
        print(document.get_covered_text(annotation))
def convert_single_file(input_paragraph_list: List[str], output_xmi_file: str) -> None:
    """Tokenize paragraphs and store them as an annotated XMI file.

    The document text is the paragraphs joined by newlines. Each paragraph is
    run through the module-level ``nlp`` callable (presumably a spaCy
    pipeline -- verify against the module setup); every non-whitespace token
    becomes a Token annotation. Each paragraph additionally receives a
    Paragraph annotation and a Sentence annotation covering exactly the same
    span. The document text and the covered texts of all three annotation
    layers are printed, and the CAS is written to ``output_xmi_file`` after
    creating its parent directory if needed.
    """
    full_text = '\n'.join(input_paragraph_list)

    document = Cas(typesystem=cassis.load_dkpro_core_typesystem())
    document.sofa_string = full_text

    print("----")
    print(full_text)
    print("----")

    type_system = document.typesystem
    token_type: Type = type_system.get_type(
        'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token')
    paragraph_type: Type = type_system.get_type(
        'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph')
    sentence_type: Type = type_system.get_type(
        'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence')

    paragraph_start: int = 0
    for paragraph in input_paragraph_list:
        parsed: Doc = nlp(paragraph)

        for tok in parsed:
            assert isinstance(tok, Token)
            # Whitespace-only tokens get no annotation.
            if tok.is_space:
                continue
            # Token offsets are paragraph-relative; shift them into
            # document coordinates.
            tok_begin: int = paragraph_start + tok.idx
            document.add_annotation(token_type(begin=tok_begin, end=tok_begin + len(tok)))

        paragraph_end: int = paragraph_start + len(paragraph)

        paragraph_annotation = paragraph_type(begin=paragraph_start, end=paragraph_end)
        document.add_annotation(paragraph_annotation)

        # Sentences are aligned exactly to paragraphs.
        document.add_annotation(
            sentence_type(begin=paragraph_annotation.begin, end=paragraph_annotation.end))

        # The next paragraph starts after the '\n' that joins the paragraphs.
        paragraph_start = paragraph_end + 1

    for layer in (paragraph_type, sentence_type, token_type):
        print([annotation.get_covered_text() for annotation in document.select(layer.name)])

    # Make sure the destination directory exists before writing.
    Path(output_xmi_file).parent.mkdir(parents=True, exist_ok=True)
    document.to_xmi(output_xmi_file)
def rebuilt2xmi(ci, output_dir, typesystem_path, iiif_mappings, pct_coordinates=False) -> str:
    """
    Converts a rebuilt ContentItem into Apache UIMA/XMI format.

    The resulting file is named after the content item's ID, with the
    `.xmi` extension appended. Each entry of `ci.lines` is used as the end
    offset of one Sentence annotation, which begins where the previous one
    ended (the first begins at 0). One `webanno.custom.ImpressoImages`
    annotation is added per image link returned by `compute_image_links`.

    :param ci: the content item to be converted
    :type ci: `impresso_commons.classes.ContentItem`
    :param output_dir: the path to the output directory
    :type output_dir: str
    :param typesystem_path: TypeSystem file containing definitions of
        annotation layers.
    :type typesystem_path: str
    :param iiif_mappings: passed to `compute_image_links` as `iiif_links`
    :param pct_coordinates: passed to `compute_image_links` as `pct`
    :return: the path of the XMI file that was written
    :rtype: str
    """
    with open(typesystem_path, "rb") as typesystem_file:
        typesystem = load_typesystem(typesystem_file)

    cas = Cas(typesystem=typesystem)
    cas.sofa_string = ci.fulltext
    cas.sofa_mime = 'text/plain'

    Sentence = typesystem.get_type(
        'de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence')
    ImageLink = typesystem.get_type('webanno.custom.ImpressoImages')

    # Create sentence-level annotations: consecutive spans delimited by the
    # break offsets in ci.lines.
    previous_break = 0
    for line_break in ci.lines:
        cas.add_annotation(Sentence(begin=previous_break, end=line_break))
        previous_break = line_break

    # Inject the IIIF links into the CAS as image-link annotations.
    image_links = compute_image_links(ci, iiif_links=iiif_mappings, pct=pct_coordinates)
    for link, link_begin, link_end in image_links:
        cas.add_annotation(ImageLink(begin=link_begin, end=link_end, link=link))

    outfile_path = os.path.join(output_dir, f'{ci.id}.xmi')
    cas.to_xmi(outfile_path, pretty_print=True)
    return outfile_path