def encode_scrambled(corpus: Corpus = Corpus(),
                     annotations: ExportAnnotations = ExportAnnotations("cwb.annotations", is_input=False),
                     source_annotations: SourceAnnotations = SourceAnnotations("cwb.source_annotations"),
                     docs: AllDocuments = AllDocuments(),
                     words: AnnotationAllDocs = AnnotationAllDocs("[export.word]"),
                     vrtfiles: ExportInput = ExportInput("vrt_scrambled/{doc}.vrt", all_docs=True),
                     out: Export = Export("[cwb.corpus_registry]/[metadata.id]", absolute_path=True),
                     out_marker: Export = Export("[cwb.cwb_datadir]/[metadata.id]/.scrambled_marker",
                                                 absolute_path=True),
                     token: AnnotationAllDocs = AnnotationAllDocs("<token>"),
                     bin_path: Config = Config("cwb.bin_path"),
                     encoding: str = Config("cwb.encoding"),
                     datadir: str = Config("cwb.cwb_datadir"),
                     registry: str = Config("cwb.corpus_registry"),
                     remove_namespaces: bool = Config("export.remove_module_namespaces", False),
                     sparv_namespace: str = Config("export.sparv_namespace"),
                     source_namespace: str = Config("export.source_namespace"),
                     skip_compression: Optional[bool] = Config("cwb.skip_compression"),
                     skip_validation: Optional[bool] = Config("cwb.skip_validation")):
    """Do CWB encoding with VRT files in scrambled order."""
    cwb_encode(corpus, annotations, source_annotations, docs, words, vrtfiles, out, out_marker,
               token.name, bin_path, encoding, datadir, registry, remove_namespaces, sparv_namespace,
               source_namespace, skip_compression, skip_validation)
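
# For orientation (not part of this module's logic): judging by the "out" and
# "out_marker" targets above, a successfully encoded corpus is expected to
# leave roughly this layout on disk, with paths resolved from the cwb.* and
# metadata.id settings:
#
#   <cwb.corpus_registry>/<metadata.id>                  # CWB registry entry
#   <cwb.cwb_datadir>/<metadata.id>/.scrambled_marker    # marker: data dir was
#                                                        # built from scrambled VRT
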
def pretty(doc: Document = Document(),
           docid: AnnotationData = AnnotationData("<docid>"),
           out: Export = Export("xml_pretty/[xml_export.filename]"),
           token: Annotation = Annotation("<token>"),
           word: Annotation = Annotation("[export.word]"),
           annotations: ExportAnnotations = ExportAnnotations("xml_export.annotations"),
           source_annotations: SourceAnnotations = SourceAnnotations("xml_export.source_annotations"),
           header_annotations: SourceAnnotations = SourceAnnotations("xml_export.header_annotations"),
           remove_namespaces: bool = Config("export.remove_module_namespaces", False),
           sparv_namespace: str = Config("export.sparv_namespace"),
           source_namespace: str = Config("export.source_namespace"),
           include_empty_attributes: bool = Config("xml_export.include_empty_attributes")):
    """Export annotations to pretty XML in export_dir.

    Args:
        doc: Name of the original document.
        docid: Annotation with document IDs.
        out: Path and filename pattern for resulting file.
        token: Annotation containing the tokens.
        word: Annotation containing the token strings.
        annotations: List of elements:attributes (annotations) to include.
        source_annotations: List of elements:attributes from the original document
            to be kept. If not specified, everything will be kept.
        header_annotations: List of header elements from the original document to
            include in the export. If not specified, all headers will be kept.
        remove_namespaces: Whether to remove module "namespaces" from element and
            attribute names. Disabled by default.
        sparv_namespace: The namespace to be added to all Sparv annotations.
        source_namespace: The namespace to be added to all annotations present in
            the source.
        include_empty_attributes: Whether to include attributes even when they
            are empty. Disabled by default.
    """
    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    token_name = token.name

    # Read words and document ID
    word_annotation = list(word.read())
    docid_annotation = docid.read()

    # Get annotation spans, annotations list etc.
    annotation_list, _, export_names = util.get_annotation_names(
        annotations, source_annotations, doc=doc, token_name=token_name,
        remove_namespaces=remove_namespaces, sparv_namespace=sparv_namespace,
        source_namespace=source_namespace)
    h_annotations, h_export_names = util.get_header_names(header_annotations, doc=doc)
    export_names.update(h_export_names)
    span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, h_annotations,
                                                              doc=doc, split_overlaps=True)
    xmlstr = xml_utils.make_pretty_xml(span_positions, annotation_dict, export_names, token_name,
                                       word_annotation, docid_annotation, include_empty_attributes,
                                       sparv_namespace)

    # Write XML to file
    with open(out, mode="w") as outfile:
        outfile.write(xmlstr)
    log.info("Exported: %s", out)
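
# For orientation, the pretty export wraps annotations as indented XML
# elements with their attributes. An illustrative fragment for a two-token
# sentence (element and attribute names depend entirely on the corpus
# configuration; this sample is hypothetical):
#
#   <?xml version="1.0" encoding="UTF-8"?>
#   <text id="doc1">
#     <sentence>
#       <token pos="UH">Hello</token>
#       <token pos="NN">world</token>
#     </sentence>
#   </text>
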
def vrt_scrambled(doc: Document = Document(),
                  out: Export = Export("vrt_scrambled/{doc}.vrt"),
                  chunk: Annotation = Annotation("[cwb.scramble_on]"),
                  chunk_order: Annotation = Annotation("[cwb.scramble_on]:misc.number_random"),
                  token: Annotation = Annotation("<token>"),
                  word: Annotation = Annotation("[export.word]"),
                  annotations: ExportAnnotations = ExportAnnotations("cwb.annotations"),
                  source_annotations: SourceAnnotations = SourceAnnotations("cwb.source_annotations"),
                  remove_namespaces: bool = Config("export.remove_module_namespaces", False),
                  sparv_namespace: str = Config("export.sparv_namespace"),
                  source_namespace: str = Config("export.source_namespace")):
    """Export annotations to VRT in scrambled order."""
    # Get annotation spans, annotations list etc.
    annotation_list, token_attributes, export_names = util.get_annotation_names(
        annotations, source_annotations, doc=doc, token_name=token.name,
        remove_namespaces=remove_namespaces, sparv_namespace=sparv_namespace,
        source_namespace=source_namespace)
    if chunk not in annotation_list:
        raise util.SparvErrorMessage(
            "The annotation used for scrambling ({}) needs to be included in the output.".format(chunk))
    span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, doc=doc,
                                                              split_overlaps=True)

    # Read words and chunk order
    word_annotation = list(word.read())
    chunk_order_data = list(chunk_order.read())

    # Reorder chunks and open/close tags in correct order
    new_span_positions = util.scramble_spans(span_positions, chunk.name, chunk_order_data)

    # Make VRT format
    vrt_data = create_vrt(new_span_positions, token.name, word_annotation, token_attributes,
                          annotation_dict, export_names)

    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    # Write result to file
    with open(out, "w") as f:
        f.write(vrt_data)
    log.info("Exported: %s", out)
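
# A minimal sketch of the reordering idea behind util.scramble_spans: each
# chunk has been given a random number (misc.number_random above), and chunks
# are emitted in the order of those numbers. This is illustrative only; the
# real implementation also moves the open/close tags nested inside each chunk.
def _scramble_sketch(chunks, random_keys):
    """Return chunks reordered by their pre-assigned random keys."""
    return [chunk for _, chunk in sorted(zip(random_keys, chunks), key=lambda pair: pair[0])]

# Example: _scramble_sketch(["c1", "c2", "c3"], ["2", "3", "1"]) == ["c3", "c1", "c2"]
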
def vrt(doc: Document = Document(),
        out: Export = Export("vrt/{doc}.vrt"),
        token: Annotation = Annotation("<token>"),
        word: Annotation = Annotation("[export.word]"),
        annotations: ExportAnnotations = ExportAnnotations("cwb.annotations"),
        source_annotations: SourceAnnotations = SourceAnnotations("cwb.source_annotations"),
        remove_namespaces: bool = Config("export.remove_module_namespaces", False),
        sparv_namespace: str = Config("export.sparv_namespace"),
        source_namespace: str = Config("export.source_namespace")):
    """Export annotations to VRT.

    - annotations: list of elements:attributes (annotations) to include.
    - source_annotations: list of elements:attributes from the original document
      to be kept. If not specified, everything will be kept.
    """
    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    # Read words
    word_annotation = list(word.read())

    # Get annotation spans, annotations list etc.
    annotation_list, token_attributes, export_names = util.get_annotation_names(
        annotations, source_annotations, doc=doc, token_name=token.name,
        remove_namespaces=remove_namespaces, sparv_namespace=sparv_namespace,
        source_namespace=source_namespace)
    span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, doc=doc)
    vrt_data = create_vrt(span_positions, token.name, word_annotation, token_attributes,
                          annotation_dict, export_names)

    # Write result to file
    with open(out, "w") as f:
        f.write(vrt_data)
    log.info("Exported: %s", out)
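
# For reference, VRT ("VeRticalized Text") is the input format expected by
# the CWB encoding step: one token per line with tab-separated positional
# attributes, and structural annotations as XML-style tags on lines of their
# own. A small sample (columns are tab-separated; the actual tags and
# attribute columns depend on the corpus configuration):
#
#   <text id="doc1">
#   <sentence>
#   Hello   UH
#   world   NN
#   </sentence>
#   </text>
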
def preserved_format(doc: Document = Document(),
                     text: Text = Text(),
                     docid: AnnotationData = AnnotationData("<docid>"),
                     out: Export = Export("xml_preserved_format/[xml_export.filename_formatted]"),
                     annotations: ExportAnnotations = ExportAnnotations("xml_export.annotations"),
                     source_annotations: SourceAnnotations = SourceAnnotations("xml_export.source_annotations"),
                     header_annotations: SourceAnnotations = SourceAnnotations("xml_export.header_annotations"),
                     remove_namespaces: bool = Config("export.remove_module_namespaces", False),
                     sparv_namespace: str = Config("export.sparv_namespace"),
                     source_namespace: str = Config("export.source_namespace"),
                     include_empty_attributes: bool = Config("xml_export.include_empty_attributes")):
    """Export annotations to XML in export_dir and keep whitespaces and indentation from original file.

    Args:
        doc: Name of the original document.
        text: The corpus text.
        docid: Annotation with document IDs.
        out: Path and filename pattern for resulting file.
        annotations: List of elements:attributes (annotations) to include.
        source_annotations: List of elements:attributes from the original document
            to be kept. If not specified, everything will be kept.
        header_annotations: List of header elements from the original document to
            include in the export. If not specified, all headers will be kept.
        remove_namespaces: Whether to remove module "namespaces" from element and
            attribute names. Disabled by default.
        sparv_namespace: The namespace to be added to all Sparv annotations.
        source_namespace: The namespace to be added to all annotations present in
            the source.
        include_empty_attributes: Whether to include attributes even when they
            are empty. Disabled by default.
    """
    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    # Read corpus text and document ID (rebinding the parameter to its value)
    corpus_text = text.read()
    docid = docid.read()

    # Get annotation spans, annotations list etc.
    annotation_list, _, export_names = util.get_annotation_names(
        annotations, source_annotations, doc=doc, remove_namespaces=remove_namespaces,
        sparv_namespace=sparv_namespace, source_namespace=source_namespace)
    h_annotations, h_export_names = util.get_header_names(header_annotations, doc=doc)
    export_names.update(h_export_names)
    span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, h_annotations,
                                                              doc=doc, flatten=False, split_overlaps=True)
    sorted_positions = [(pos, span[0], span[1]) for pos, spans in sorted(span_positions.items())
                        for span in spans]

    # Root tag sanity check
    if not xml_utils.valid_root(sorted_positions[0], sorted_positions[-1]):
        raise util.SparvErrorMessage("Root tag is missing! If you have manually specified which elements "
                                     "to include, make sure to include an element that encloses all other "
                                     "included elements and text content.")

    # Create root node
    root_span = sorted_positions[0][2]
    root_span.set_node()
    node_stack = []
    last_pos = 0  # Keeps track of the position of the processed text

    for x, (_pos, instruction, span) in enumerate(sorted_positions):
        # Open node: Create child node under the top stack node
        if instruction == "open":
            # Set tail for previous node if necessary
            if last_pos < span.start:
                # Get last closing node in this position
                _, tail_span = [i for i in span_positions[last_pos] if i[0] == "close"][-1]
                tail_span.node.tail = corpus_text[last_pos:span.start]
                last_pos = span.start

            # Handle headers
            if span.is_header:
                header = annotation_dict[span.name][util.HEADER_CONTENTS][span.index]
                header_xml = etree.fromstring(header)
                header_xml.tag = span.export  # Rename element if needed
                span.node = header_xml
                node_stack[-1].node.append(header_xml)
            else:
                if node_stack:  # Don't create root node, it already exists
                    span.set_node(parent_node=node_stack[-1].node)

                xml_utils.add_attrs(span.node, span.name, annotation_dict, export_names, span.index,
                                    include_empty_attributes)
                if span.overlap_id:
                    if sparv_namespace:
                        span.node.set(f"{sparv_namespace}.{util.OVERLAP_ATTR}", f"{docid}-{span.overlap_id}")
                    else:
                        span.node.set(f"{util.SPARV_DEFAULT_NAMESPACE}.{util.OVERLAP_ATTR}",
                                      f"{docid}-{span.overlap_id}")
                node_stack.append(span)

                # Set text if there should be any between this node and the next one
                next_item = sorted_positions[x + 1]
                if next_item[1] == "open" and next_item[2].start > span.start:
                    span.node.text = corpus_text[last_pos:next_item[2].start]
                    last_pos = next_item[2].start

        # Close node
        else:
            if span.is_header:
                continue
            if last_pos < span.end:
                # Set node text if necessary
                if span.start == last_pos:
                    span.node.text = corpus_text[last_pos:span.end]
                # Set tail for previous node if necessary
                else:
                    # Get last closing node in this position
                    _, tail_span = [i for i in span_positions[last_pos] if i[0] == "close"][-1]
                    tail_span.node.tail = corpus_text[last_pos:span.end]
                last_pos = span.end

            # Make sure closing node == top stack node
            assert span == node_stack[-1], "Overlapping elements found: {}".format(node_stack[-2:])
            # Pop stack and move on to next span
            node_stack.pop()

    # Write XML to file
    etree.ElementTree(root_span.node).write(out, encoding="unicode", method="xml", xml_declaration=True)
    log.info("Exported: %s", out)
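
# The open/close walk above leans on ElementTree's text/tail model: character
# data inside an element before its first child goes in .text, while character
# data after an element's closing tag goes in the .tail of that element. A
# minimal stdlib-only illustration of the distinction:
from xml.etree import ElementTree

_demo = ElementTree.Element("s")
_tok = ElementTree.SubElement(_demo, "w")
_tok.text = "Hello"   # text inside <w>...</w>
_tok.tail = " world"  # text between </w> and </s>
# ElementTree.tostring(_demo, encoding="unicode") == "<s><w>Hello</w> world</s>"
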
def csv(doc: Document = Document(),
        out: Export = Export("csv/{doc}.csv"),
        token: Annotation = Annotation("<token>"),
        word: Annotation = Annotation("[export.word]"),
        sentence: Annotation = Annotation("<sentence>"),
        annotations: ExportAnnotations = ExportAnnotations("csv_export.annotations"),
        source_annotations: SourceAnnotations = SourceAnnotations("csv_export.source_annotations"),
        remove_namespaces: bool = Config("export.remove_module_namespaces", False),
        sparv_namespace: str = Config("export.sparv_namespace"),
        source_namespace: str = Config("export.source_namespace"),
        delimiter: str = Config("csv_export.delimiter")):
    """Export annotations to CSV format."""
    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    token_name = token.name

    # Read words
    word_annotation = list(word.read())

    # Get annotation spans, annotations list etc.
    annotation_list, token_attributes, export_names = util.get_annotation_names(
        annotations, source_annotations, doc=doc, token_name=token_name,
        remove_namespaces=remove_namespaces, sparv_namespace=sparv_namespace,
        source_namespace=source_namespace)
    span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, doc=doc)

    # Make CSV header
    csv_data = [_make_header(token_name, token_attributes, export_names, delimiter)]

    # Go through spans_dict and add to csv, line by line
    for _pos, instruction, span in span_positions:
        if instruction == "open":
            # Create token line
            if span.name == token_name:
                csv_data.append(_make_token_line(word_annotation[span.index], token_name, token_attributes,
                                                 annotation_dict, span.index, delimiter))
            # Create line with structural annotation
            else:
                attrs = _make_attrs(span.name, annotation_dict, export_names, span.index)
                for attr in attrs:
                    csv_data.append(f"# {attr}")
                if not attrs:
                    csv_data.append(f"# {span.export}")
        # Insert blank line after each closing sentence
        elif span.name == sentence.name and instruction == "close":
            csv_data.append("")

    # Write result to file
    with open(out, "w") as f:
        f.write("\n".join(csv_data))
    logger.info("Exported: %s", out)
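
# The resulting file interleaves "# ..." comment lines for structural
# annotations with delimiter-separated token lines, and leaves a blank line
# after each sentence. An illustrative fragment, assuming a tab delimiter and
# a token pos attribute (the exact header and columns come from _make_header
# and the configured annotations):
#
#   token   pos
#   # text
#   # sentence
#   Hello   UH
#   world   NN
#   <blank line>
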
def metashare(out: Export = Export("sbx_metadata/[metadata.id].xml"),
              template: Model = Model("sbx_metadata/sbx-metashare-template.xml"),
              corpus_id: Corpus = Corpus(),
              lang: Language = Language(),
              metadata: dict = Config("metadata"),
              sentences: AnnotationCommonData = AnnotationCommonData("misc.<sentence>_count"),
              tokens: AnnotationCommonData = AnnotationCommonData("misc.<token>_count"),
              annotations: ExportAnnotations = ExportAnnotations("xml_export.annotations", is_input=False),
              korp_protected: bool = Config("korp.protected"),
              korp_mode: bool = Config("korp.mode"),
              # md_linguality: str = Config("sbx_metadata.linguality"),
              md_script: str = Config("sbx_metadata.script"),
              md_xml_export: str = Config("sbx_metadata.xml_export"),
              md_stats_export: bool = Config("sbx_metadata.stats_export"),
              md_korp: bool = Config("sbx_metadata.korp"),
              md_downloads: list = Config("sbx_metadata.downloads"),
              md_interface: list = Config("sbx_metadata.interface"),
              md_contact: dict = Config("sbx_metadata.contact_info")):
    """Export corpus metadata to META-SHARE format."""
    # Parse template and handle META-SHARE namespace
    xml = etree.parse(template.path).getroot()
    etree.register_namespace("", META_SHARE_URL)
    ns = META_SHARE_NAMESPACE

    # Set identification info
    identificationInfo = xml.find(ns + "identificationInfo")
    for i in identificationInfo.findall(ns + "resourceShortName"):
        i.text = corpus_id
    identificationInfo.find(ns + "identifier").text = corpus_id
    _set_texts(identificationInfo.findall(ns + "resourceName"), metadata.get("name", {}))
    _set_texts(identificationInfo.findall(ns + "description"), metadata.get("description", {}))

    # Set metadata creation date in metadataInfo
    xml.find(".//" + ns + "metadataCreationDate").text = str(time.strftime("%Y-%m-%d"))

    # Set availability
    if korp_protected:
        xml.find(".//" + ns + "availability").text = "available-restrictedUse"
    else:
        xml.find(".//" + ns + "availability").text = "available-unrestrictedUse"

    # Set licenceInfos
    distInfo = xml.find(".//" + ns + "distributionInfo")
    _set_licence_info([metadata_utils.make_standard_xml_export(md_xml_export, corpus_id)], distInfo)
    _set_licence_info([metadata_utils.make_standard_stats_export(md_stats_export, corpus_id)], distInfo)
    _set_licence_info([metadata_utils.make_korp(md_korp, corpus_id, korp_mode)], distInfo, download=False)
    _set_licence_info([metadata_utils.make_metashare(corpus_id)], distInfo)
    # Add non-standard licenceInfos
    _set_licence_info(md_downloads, distInfo)
    _set_licence_info(md_interface, distInfo, download=False)

    # Set contactPerson
    _set_contact_info(md_contact, xml.find(".//" + ns + "contactPerson"))

    # Set samplesLocation
    xml.find(".//" + ns + "samplesLocation").text = f"{SBX_SAMPLES_LOCATION}{corpus_id}"

    # Set lingualityType
    xml.find(".//" + ns + "lingualityType").text = "monolingual"

    # Set languageInfo (languageId, languageName, languageScript)
    xml.find(".//" + ns + "languageId").text = lang
    xml.find(".//" + ns + "languageName").text = languages.get(part3=lang).name \
        if lang in languages.part3 else lang
    xml.find(".//" + ns + "languageScript").text = md_script

    # Set sizeInfo
    sizeInfos = xml.findall(".//" + ns + "sizeInfo")
    sizeInfos[0].find(ns + "size").text = tokens.read()
    sizeInfos[1].find(ns + "size").text = sentences.read()

    # Set annotationInfo
    corpusTextInfo = xml.find(".//" + ns + "corpusTextInfo")
    _set_annotation_info(annotations, corpusTextInfo)

    # Write XML to file
    os.makedirs(os.path.dirname(out), exist_ok=True)
    etree.ElementTree(xml).write(out, encoding="unicode", method="xml", xml_declaration=True)
    logger.info("Exported: %s", out)
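
# _set_texts is referenced above but not shown in this excerpt. A hypothetical
# sketch of what such a helper could look like, assuming each template element
# carries a language attribute and the metadata values are keyed by language
# code (the real helper may differ):
def _set_texts_sketch(elems, texts_by_lang):
    """Fill in element texts from a {lang: text} mapping (illustrative only)."""
    for elem in elems:
        lang = elem.get("lang")
        if lang in texts_by_lang:
            elem.text = texts_by_lang[lang]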