Example #1
def encode_scrambled(
        corpus: Corpus = Corpus(),
        annotations: ExportAnnotations = ExportAnnotations("cwb.annotations",
                                                           is_input=False),
        source_annotations: SourceAnnotations = SourceAnnotations(
            "cwb.source_annotations"),
        docs: AllDocuments = AllDocuments(),
        words: AnnotationAllDocs = AnnotationAllDocs("[export.word]"),
        vrtfiles: ExportInput = ExportInput("vrt_scrambled/{doc}.vrt",
                                            all_docs=True),
        out: Export = Export("[cwb.corpus_registry]/[metadata.id]",
                             absolute_path=True),
        out_marker: Export = Export(
            "[cwb.cwb_datadir]/[metadata.id]/.scrambled_marker",
            absolute_path=True),
        token: AnnotationAllDocs = AnnotationAllDocs("<token>"),
        bin_path: Config = Config("cwb.bin_path"),
        encoding: str = Config("cwb.encoding"),
        datadir: str = Config("cwb.cwb_datadir"),
        registry: str = Config("cwb.corpus_registry"),
        remove_namespaces: bool = Config("export.remove_module_namespaces",
                                         False),
        sparv_namespace: str = Config("export.sparv_namespace"),
        source_namespace: str = Config("export.source_namespace"),
        skip_compression: Optional[bool] = Config("cwb.skip_compression"),
        skip_validation: Optional[bool] = Config("cwb.skip_validation")):
    """Do cwb encoding with vrt files in scrambled order."""
    cwb_encode(corpus, annotations, source_annotations, docs, words, vrtfiles,
               out, out_marker, token.name, bin_path, encoding, datadir,
               registry, remove_namespaces, sparv_namespace, source_namespace,
               skip_compression, skip_validation)
Example #2
def info(out: Export = Export("[cwb.cwb_datadir]/[metadata.id]/.info",
                              absolute_path=True),
         sentences: AnnotationCommonData = AnnotationCommonData(
             "cwb.sentencecount"),
         firstdate: AnnotationCommonData = AnnotationCommonData(
             "cwb.datefirst"),
         lastdate: AnnotationCommonData = AnnotationCommonData("cwb.datelast"),
         resolution: AnnotationCommonData = AnnotationCommonData(
             "dateformat.resolution"),
         protected: bool = Config("korp.protected")):
    """Save information to the file specified by 'out'."""
    content = []
    protected_str = str(protected).lower()

    for key, value_obj in [("Sentences", sentences), ("FirstDate", firstdate),
                           ("LastDate", lastdate),
                           ("DateResolution", resolution),
                           ("Updated", time.strftime("%Y-%m-%d")),
                           ("Protected", protected_str)]:
        if isinstance(value_obj, AnnotationCommonData):
            value = value_obj.read()
        else:
            value = value_obj

        content.append("%s: %s\n" % (key, value))

    # Write .info file
    with open(out, "w") as o:
        o.writelines(content)

    log.info("Exported: %s", out)
Example #3
def timespan_sql_no_dateinfo(
        corpus: Corpus = Corpus(),
        out: Export = Export("korp_timespan/timespan.sql"),
        docs: AllDocuments = AllDocuments(),
        token: AnnotationAllDocs = AnnotationAllDocs("<token>")):
    """Create timespan SQL data for use in Korp."""
    corpus_name = corpus.upper()
    token_count = 0

    for doc in docs:
        tokens = token.read_spans(doc)
        token_count += len(list(tokens))

    rows_date = [{
        "corpus": corpus_name,
        "datefrom": "0" * 8,
        "dateto": "0" * 8,
        "tokens": token_count
    }]
    rows_datetime = [{
        "corpus": corpus_name,
        "datefrom": "0" * 14,
        "dateto": "0" * 14,
        "tokens": token_count
    }]

    create_sql(corpus_name, out, rows_date, rows_datetime)
Example #4
def lemgram_sql(corpus: Corpus = Corpus(),
                docs: AllDocuments = AllDocuments(),
                out: Export = Export("korp_lemgram_index/lemgram_index.sql"),
                lemgram: AnnotationAllDocs = AnnotationAllDocs(
                    "<token>:saldo.lemgram")):
    """Create lemgram index SQL file."""

    corpus = corpus.upper()
    result = defaultdict(int)

    for doc in docs:
        for lg in lemgram.read(doc):
            for value in lg.split("|"):
                if value and ":" not in value:
                    result[value] += 1

    mysql = MySQL(output=out)
    mysql.create_table(MYSQL_TABLE, drop=False, **MYSQL_INDEX)
    mysql.delete_rows(MYSQL_TABLE, {"corpus": corpus})
    mysql.set_names()

    rows = []
    for lg, freq in result.items():  # avoid shadowing the 'lemgram' annotation parameter
        rows.append({"lemgram": lg, "corpus": corpus, "freq": freq})

    log.info("Creating SQL")
    mysql.add_row(MYSQL_TABLE, rows)
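The loop above splits each token's lemgram annotation (a |-delimited set) and skips empty segments as well as segments containing ":". A minimal runnable sketch of that filtering, using a hypothetical annotation value:

# Hypothetical cwb-style set value for one token's lemgram annotation
value = "|gå..vb.1|gå..vb.1:1.0|"
kept = [v for v in value.split("|") if v and ":" not in v]
assert kept == ["gå..vb.1"]  # only plain lemgram entries are counted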
Example #5
def pretty(doc: Document = Document(),
           docid: AnnotationData = AnnotationData("<docid>"),
           out: Export = Export("xml_pretty/[xml_export.filename]"),
           token: Annotation = Annotation("<token>"),
           word: Annotation = Annotation("[export.word]"),
           annotations: ExportAnnotations = ExportAnnotations("xml_export.annotations"),
           source_annotations: SourceAnnotations = SourceAnnotations("xml_export.source_annotations"),
           header_annotations: SourceAnnotations = SourceAnnotations("xml_export.header_annotations"),
           remove_namespaces: bool = Config("export.remove_module_namespaces", False),
           sparv_namespace: str = Config("export.sparv_namespace"),
           source_namespace: str = Config("export.source_namespace"),
           include_empty_attributes: bool = Config("xml_export.include_empty_attributes")):
    """Export annotations to pretty XML in export_dir.

    Args:
        doc: Name of the original document.
        docid: Annotation with document IDs.
        out: Path and filename pattern for resulting file.
        token: Annotation containing the tokens.
        word: Annotation containing the token strings.
        annotations: List of elements:attributes (annotations) to include.
        source_annotations: List of elements:attributes from the original document
            to be kept. If not specified, everything will be kept.
        header_annotations: List of header elements from the original document to include
            in the export. If not specified, all headers will be kept.
        remove_namespaces: Whether to remove module "namespaces" from element and attribute names.
            Disabled by default.
        sparv_namespace: The namespace to be added to all Sparv annotations.
        source_namespace: The namespace to be added to all annotations present in the source.
        include_empty_attributes: Whether to include attributes even when they are empty. Disabled by default.
    """
    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    token_name = token.name

    # Read words and document ID
    word_annotation = list(word.read())
    docid_annotation = docid.read()

    # Get annotation spans, annotations list etc.
    annotation_list, _, export_names = util.get_annotation_names(annotations, source_annotations, doc=doc,
                                                                 token_name=token_name,
                                                                 remove_namespaces=remove_namespaces,
                                                                 sparv_namespace=sparv_namespace,
                                                                 source_namespace=source_namespace)
    h_annotations, h_export_names = util.get_header_names(header_annotations, doc=doc)
    export_names.update(h_export_names)
    span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, h_annotations,
                                                              doc=doc, split_overlaps=True)
    xmlstr = xml_utils.make_pretty_xml(span_positions, annotation_dict, export_names, token_name, word_annotation,
                                       docid_annotation, include_empty_attributes, sparv_namespace)

    # Write XML to file
    with open(out, mode="w") as outfile:
        outfile.write(xmlstr)
    log.info("Exported: %s", out)
Example #6
def timespan_sql_with_dateinfo(
        corpus: Corpus = Corpus(),
        out: Export = Export("korp_timespan/timespan.sql"),
        docs: AllDocuments = AllDocuments(),
        token: AnnotationAllDocs = AnnotationAllDocs("<token>"),
        datefrom: AnnotationAllDocs = AnnotationAllDocs(
            "<text>:dateformat.datefrom"),
        dateto: AnnotationAllDocs = AnnotationAllDocs(
            "<text>:dateformat.dateto"),
        timefrom: AnnotationAllDocs = AnnotationAllDocs(
            "<text>:dateformat.timefrom"),
        timeto: AnnotationAllDocs = AnnotationAllDocs(
            "<text>:dateformat.timeto")):
    """Create timespan SQL data for use in Korp."""
    corpus_name = corpus.upper()
    datespans = defaultdict(int)
    datetimespans = defaultdict(int)

    for doc in docs:
        text_tokens, orphans = Annotation(datefrom.name,
                                          doc=doc).get_children(token)
        if orphans:
            datespans[("0" * 8, "0" * 8)] += len(orphans)
            datetimespans[("0" * 14, "0" * 14)] += len(orphans)
        dateinfo = datefrom.read_attributes(
            doc, (datefrom, dateto, timefrom, timeto))
        for text in text_tokens:
            d = next(dateinfo)
            datespans[(d[0].zfill(8), d[1].zfill(8))] += len(text)
            datetimespans[(d[0].zfill(8) + d[2].zfill(6),
                           d[1].zfill(8) + d[3].zfill(6))] += len(text)

    rows_date = []
    rows_datetime = []

    for span in datespans:
        rows_date.append({
            "corpus": corpus_name,
            "datefrom": span[0],
            "dateto": span[1],
            "tokens": datespans[span]
        })

    for span in datetimespans:
        rows_datetime.append({
            "corpus": corpus_name,
            "datefrom": span[0],
            "dateto": span[1],
            "tokens": datetimespans[span]
        })

    create_sql(corpus_name, out, rows_date, rows_datetime)
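The keys used above are fixed-width strings: eight digits for dates and fourteen for date-times, produced by zero-padding and concatenating the attributes read from the text spans. A small runnable sketch with hypothetical attribute values:

# Hypothetical attribute values for one <text> span
datefrom, dateto, timefrom, timeto = "20110101", "20111231", "0", "235959"
date_key = (datefrom.zfill(8), dateto.zfill(8))
datetime_key = (datefrom.zfill(8) + timefrom.zfill(6), dateto.zfill(8) + timeto.zfill(6))
assert datetime_key == ("20110101000000", "20111231235959")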
Example #7
def vrt_scrambled(
        doc: Document = Document(),
        out: Export = Export("vrt_scrambled/{doc}.vrt"),
        chunk: Annotation = Annotation("[cwb.scramble_on]"),
        chunk_order: Annotation = Annotation(
            "[cwb.scramble_on]:misc.number_random"),
        token: Annotation = Annotation("<token>"),
        word: Annotation = Annotation("[export.word]"),
        annotations: ExportAnnotations = ExportAnnotations("cwb.annotations"),
        source_annotations: SourceAnnotations = SourceAnnotations(
            "cwb.source_annotations"),
        remove_namespaces: bool = Config("export.remove_module_namespaces",
                                         False),
        sparv_namespace: str = Config("export.sparv_namespace"),
        source_namespace: str = Config("export.source_namespace")):
    """Export annotations to vrt in scrambled order."""
    # Get annotation spans, annotations list etc.
    annotation_list, token_attributes, export_names = util.get_annotation_names(
        annotations,
        source_annotations,
        doc=doc,
        token_name=token.name,
        remove_namespaces=remove_namespaces,
        sparv_namespace=sparv_namespace,
        source_namespace=source_namespace)
    if chunk not in annotation_list:
        raise util.SparvErrorMessage(
            "The annotation used for scrambling ({}) needs to be included in the output."
            .format(chunk))
    span_positions, annotation_dict = util.gather_annotations(
        annotation_list, export_names, doc=doc, split_overlaps=True)

    # Read words and document ID
    word_annotation = list(word.read())
    chunk_order_data = list(chunk_order.read())

    # Reorder chunks and open/close tags in correct order
    new_span_positions = util.scramble_spans(span_positions, chunk.name,
                                             chunk_order_data)

    # Make vrt format
    vrt_data = create_vrt(new_span_positions, token.name, word_annotation,
                          token_attributes, annotation_dict, export_names)

    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    # Write result to file
    with open(out, "w") as f:
        f.write(vrt_data)
    log.info("Exported: %s", out)
Example #8
def vrt(doc: Document = Document(),
        out: Export = Export("vrt/{doc}.vrt"),
        token: Annotation = Annotation("<token>"),
        word: Annotation = Annotation("[export.word]"),
        annotations: ExportAnnotations = ExportAnnotations("cwb.annotations"),
        source_annotations: SourceAnnotations = SourceAnnotations(
            "cwb.source_annotations"),
        remove_namespaces: bool = Config("export.remove_module_namespaces",
                                         False),
        sparv_namespace: str = Config("export.sparv_namespace"),
        source_namespace: str = Config("export.source_namespace")):
    """Export annotations to vrt.

    - annotations: list of elements:attributes (annotations) to include.
    - source_annotations: list of elements:attributes from the original document
      to be kept. If not specified, everything will be kept.
    """
    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    # Read words
    word_annotation = list(word.read())

    # Get annotation spans, annotations list etc.
    annotation_list, token_attributes, export_names = util.get_annotation_names(
        annotations,
        source_annotations,
        doc=doc,
        token_name=token.name,
        remove_namespaces=remove_namespaces,
        sparv_namespace=sparv_namespace,
        source_namespace=source_namespace)
    span_positions, annotation_dict = util.gather_annotations(annotation_list,
                                                              export_names,
                                                              doc=doc)
    vrt_data = create_vrt(span_positions, token.name, word_annotation,
                          token_attributes, annotation_dict, export_names)

    # Write result to file
    with open(out, "w") as f:
        f.write(vrt_data)
    log.info("Exported: %s", out)
Example #9
def freq_list_simple(corpus: Corpus = Corpus(),
                     docs: AllDocuments = AllDocuments(),
                     word: AnnotationAllDocs = AnnotationAllDocs("<token:word>"),
                     pos: AnnotationAllDocs = AnnotationAllDocs("<token:pos>"),
                     baseform: AnnotationAllDocs = AnnotationAllDocs("<token:baseform>"),
                     out: Export = Export("frequency_list/stats_[metadata.id].csv"),
                     delimiter: str = Config("stats_export.delimiter"),
                     cutoff: int = Config("stats_export.cutoff")):
    """Create a word frequency list for a corpus without sense, lemgram and complemgram annotations."""
    freq_dict = defaultdict(int)

    for doc in docs:
        simple_tokens = word.read_attributes(doc, [word, pos, baseform])

        # Add empty annotations for sense, lemgram and complemgram
        tokens = []
        for w, p, b in simple_tokens:
            tokens.append((w, p, b, "|", "|", "|"))
        update_freqs(tokens, freq_dict)

    write_csv(out, freq_dict, delimiter, cutoff)
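The "|" placeholders added above stand in for the missing sense, lemgram and complemgram columns, so each row has the same six-attribute shape that update_freqs expects for the full frequency list. A one-line sketch with a hypothetical token:

w, p, b = "huset", "NN", "hus"   # hypothetical word, POS and baseform
row = (w, p, b, "|", "|", "|")   # padded to the full six-column shape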
Example #10
def freq_list(corpus: Corpus = Corpus(),
              docs: AllDocuments = AllDocuments(),
              word: AnnotationAllDocs = AnnotationAllDocs("<token:word>"),
              msd: AnnotationAllDocs = AnnotationAllDocs("<token:msd>"),
              baseform: AnnotationAllDocs = AnnotationAllDocs("<token:baseform>"),
              sense: AnnotationAllDocs = AnnotationAllDocs("<token:sense>"),
              lemgram: AnnotationAllDocs = AnnotationAllDocs("<token>:saldo.lemgram"),
              complemgram: AnnotationAllDocs = AnnotationAllDocs("<token>:saldo.complemgram"),
              out: Export = Export("frequency_list/stats_[metadata.id].csv"),
              delimiter: str = Config("stats_export.delimiter"),
              cutoff: int = Config("stats_export.cutoff"),
              include_all_compounds: bool = Config("stats_export.include_all_compounds")):
    """Create a word frequency list for the entire corpus.

    Args:
        corpus (str, optional): The corpus ID. Defaults to Corpus.
        docs (list, optional): The documents belonging to this corpus. Defaults to AllDocuments.
        word (str, optional): Word annotations. Defaults to AnnotationAllDocs("<token:word>").
        msd (str, optional): MSD annotations. Defaults to AnnotationAllDocs("<token:msd>").
        baseform (str, optional): Baseform annotations. Defaults to AnnotationAllDocs("<token:baseform>").
        sense (str, optional): Sense annotations. Defaults to AnnotationAllDocs("<token:sense>").
        lemgram (str, optional): Lemgram annotations. Defaults to AnnotationAllDocs("<token>:saldo.lemgram").
        complemgram (str, optional): Compound lemgram annotations.
            Defaults to AnnotationAllDocs("<token>:saldo.complemgram").
        out (str, optional): The output word frequency file.
            Defaults to Export("frequency_list/stats_[metadata.id].csv").
        delimiter (str, optional): Column delimiter to use in the csv. Defaults to Config("stats_export.delimiter").
        cutoff (int, optional): The minimum frequency a word must have in order to be included in the result.
            Defaults to Config("stats_export.cutoff").
        include_all_compounds (bool, optional): Whether to include compound analyses for every word
            or just for the words that are lacking a sense annotation.
            Defaults to Config("stats_export.include_all_compounds").
    """
    freq_dict = defaultdict(int)

    for doc in docs:
        tokens = word.read_attributes(doc, [word, msd, baseform, sense, lemgram, complemgram])
        update_freqs(tokens, freq_dict, include_all_compounds)

    write_csv(out, freq_dict, delimiter, cutoff)
Example #11
def relations_sql(corpus: Corpus = Corpus(),
                  out: Export = Export("korp_wordpicture/relations.sql"),
                  relations: AnnotationDataAllDocs = AnnotationDataAllDocs(
                      "korp.relations"),
                  docs: Optional[AllDocuments] = AllDocuments(),
                  doclist: str = "",
                  split: bool = False):
    """Calculate statistics of the dependencies and saves to SQL files.

    - corpus is the corpus name.
    - out is the name for the SQL file which will contain the resulting SQL statements.
    - relations is the name of the relations annotation.
    - docs is a list of documents.
    - doclist can be used instead of docs, and should be a file containing the name of docs, one per row.
    - split set to true leads to SQL commands being split into several parts, requiring less memory during creation,
     but installing the data will take much longer.
    """
    db_table = MYSQL_TABLE + "_" + corpus.upper()

    # Relations that will be grouped together
    rel_grouping = {
        "OO": "OBJ",
        "IO": "OBJ",
        "RA": "ADV",
        "TA": "ADV",
        "OA": "ADV"
    }

    MAX_SENTENCES = 5000

    index = 0
    string_index = -1
    strings = {}  # ID -> string table
    freq_index = {}
    sentence_count = defaultdict(int)
    doc_count = 0

    assert (docs or doclist), "Missing source"

    if doclist:
        with open(doclist) as insource:
            docs = [line.strip() for line in insource]

    if len(docs) == 1:
        split = False

    for doc in docs:
        doc_count += 1
        sentences = {}
        if doc_count == 1 or split:
            freq = {}  # Frequency of (head, rel, dep)
            rel_count = defaultdict(int)  # Frequency of (rel)
            head_rel_count = defaultdict(int)  # Frequency of (head, rel)
            dep_rel_count = defaultdict(int)  # Frequency of (rel, dep)

        relations_data = relations.read(doc)

        for triple in relations_data.splitlines():
            (head, headpos, rel, dep, deppos, extra, sid, refh, refd,
             bfhead, bfdep, wfhead, wfdep) = triple.split("\t")
            bfhead, bfdep, wfhead, wfdep = int(bfhead), int(bfdep), int(wfhead), int(wfdep)

            if not (head, headpos) in strings:
                string_index += 1
            head = strings.setdefault((head, headpos), string_index)

            if not (dep, deppos, extra) in strings:
                string_index += 1
            dep = strings.setdefault((dep, deppos, extra), string_index)

            rel = rel_grouping.get(rel, rel)

            if (head, rel, dep) in freq_index:
                this_index = freq_index[(head, rel, dep)]
            else:
                this_index = index
                freq_index[(head, rel, dep)] = this_index
                index += 1
            # freq[head][rel][dep] = [string table index, frequency, [bfhead, bfdep, wfhead, wfdep]]
            freq.setdefault(head, {}).setdefault(rel, {}).setdefault(
                dep, [this_index, 0, [0, 0, 0, 0]])
            freq[head][rel][dep][1] += 1  # Frequency

            if sentence_count[this_index] < MAX_SENTENCES:
                # Save sentence ID and "ref" for both head and dep
                sentences.setdefault(this_index, set())
                sentences[this_index].add((sid, refh, refd))
                sentence_count[this_index] += 1

            freq[head][rel][dep][2][0] = freq[head][rel][dep][2][0] or bfhead
            freq[head][rel][dep][2][1] = freq[head][rel][dep][2][1] or bfdep
            freq[head][rel][dep][2][2] = freq[head][rel][dep][2][2] or wfhead
            freq[head][rel][dep][2][3] = freq[head][rel][dep][2][3] or wfdep

            if bfhead and bfdep:
                rel_count[rel] += 1
            if (bfhead and bfdep) or wfhead:
                head_rel_count[(head, rel)] += 1
            if (bfhead and bfdep) or wfdep:
                dep_rel_count[(dep, rel)] += 1

        # If not the last file
        if not doc_count == len(docs):
            if split:
                # Don't print string table until the last file
                _write_sql({},
                           sentences,
                           freq,
                           rel_count,
                           head_rel_count,
                           dep_rel_count,
                           out,
                           db_table,
                           split,
                           first=(doc_count == 1))
            else:
                # Only save sentences data, save the rest for the last file
                _write_sql({},
                           sentences, {}, {}, {}, {},
                           out,
                           db_table,
                           split,
                           first=(doc_count == 1))

    # Create the final file, including the string table
    _write_sql(strings,
               sentences,
               freq,
               rel_count,
               head_rel_count,
               dep_rel_count,
               out,
               db_table,
               split,
               first=(doc_count == 1),
               last=True)

    log.info("Done creating SQL files")
Example #12
def combined(corpus: Corpus = Corpus(),
             out: Export = Export("[xml_export.filename_combined]"),
             docs: AllDocuments = AllDocuments(),
             xml_input: ExportInput = ExportInput("xml_pretty/[xml_export.filename]", all_docs=True)):
    """Combine XML export files into a single XML file."""
    xml_utils.combine(corpus, out, docs, xml_input)
Example #13
def compressed(out: Export = Export("[xml_export.filename_compressed]"),
               xmlfile: ExportInput = ExportInput("[xml_export.filename_combined]")):
    """Compress combined XML export."""
    xml_utils.compress(xmlfile, out)
Example #14
def json_export(out: Export = Export("sbx_metadata/[metadata.id].json"),
                corpus_id: Corpus = Corpus(),
                lang: Language = Language(),
                metadata: dict = Config("metadata"),
                sentences: AnnotationCommonData = AnnotationCommonData(
                    "misc.<sentence>_count"),
                tokens: AnnotationCommonData = AnnotationCommonData(
                    "misc.<token>_count"),
                korp_protected: bool = Config("korp.protected"),
                korp_mode: bool = Config("korp.mode"),
                md_trainingdata: bool = Config("sbx_metadata.trainingdata"),
                md_xml_export: str = Config("sbx_metadata.xml_export"),
                md_stats_export: bool = Config("sbx_metadata.stats_export"),
                md_korp: bool = Config("sbx_metadata.korp"),
                md_downloads: list = Config("sbx_metadata.downloads"),
                md_interface: list = Config("sbx_metadata.interface"),
                md_contact: dict = Config("sbx_metadata.contact_info")):
    """Export corpus metadata to JSON format."""
    md_obj = {}
    md_obj["id"] = corpus_id
    md_obj["type"] = "corpus"
    md_obj["trainingdata"] = md_trainingdata

    # Set language info
    md_obj["lang"] = [{
        "code":
        lang,
        "name_en":
        languages.get(part3=lang).name if lang in languages.part3 else lang,
        "name_sv":
        Language.get(lang).display_name("swe"),
    }]

    # Set name and description
    md_obj["name_en"] = metadata.get("name", {}).get("eng")
    md_obj["name_sv"] = metadata.get("name", {}).get("swe")
    md_obj["description_en"] = metadata.get("description", {}).get("eng")
    md_obj["description_sv"] = metadata.get("description", {}).get("swe")

    # Set downloads
    downloads = []
    downloads.append(
        metadata_utils.make_standard_xml_export(md_xml_export, corpus_id))
    downloads.append(
        metadata_utils.make_standard_stats_export(md_stats_export, corpus_id))
    downloads.append(metadata_utils.make_metashare(corpus_id))
    downloads.extend(md_downloads)
    md_obj["downloads"] = [d for d in downloads if d]

    # Set interface
    interface = []
    interface.append(metadata_utils.make_korp(md_korp, corpus_id, korp_mode))
    interface.extend(md_interface)
    md_obj["interface"] = [d for d in interface if d]

    # Set contact info
    if md_contact == "sbx-default":
        md_obj["contact_info"] = metadata_utils.SBX_DEFAULT_CONTACT
    else:
        md_obj["contact_info"] = md_contact

    # Set size
    md_obj["size"] = {"tokens": tokens.read(), "sentences": sentences.read()}

    # Write JSON to file
    os.makedirs(os.path.dirname(out), exist_ok=True)
    json_str = json.dumps(md_obj, ensure_ascii=False, indent=4)
    with open(out, "w") as f:
        f.write(json_str)
    logger.info("Exported: %s", out)
Example #15
def preserved_format(
        doc: Document = Document(),
        text: Text = Text(),
        docid: AnnotationData = AnnotationData("<docid>"),
        out: Export = Export(
            "xml_preserved_format/[xml_export.filename_formatted]"),
        annotations: ExportAnnotations = ExportAnnotations(
            "xml_export.annotations"),
        source_annotations: SourceAnnotations = SourceAnnotations(
            "xml_export.source_annotations"),
        header_annotations: SourceAnnotations = SourceAnnotations(
            "xml_export.header_annotations"),
        remove_namespaces: bool = Config("export.remove_module_namespaces",
                                         False),
        sparv_namespace: str = Config("export.sparv_namespace"),
        source_namespace: str = Config("export.source_namespace"),
        include_empty_attributes: bool = Config(
            "xml_export.include_empty_attributes")):
    """Export annotations to XML in export_dir and keep whitespaces and indentation from original file.

    Args:
        doc: Name of the original document.
        text: The corpus text.
        docid: Annotation with document IDs.
        out: Path and filename pattern for resulting file.
        annotations: List of elements:attributes (annotations) to include.
        source_annotations: List of elements:attributes from the original document
            to be kept. If not specified, everything will be kept.
        header_annotations: List of header elements from the original document to include
            in the export. If not specified, all headers will be kept.
        remove_namespaces: Whether to remove module "namespaces" from element and attribute names.
            Disabled by default.
        sparv_namespace: The namespace to be added to all Sparv annotations.
        source_namespace: The namespace to be added to all annotations present in the source.
        include_empty_attributes: Whether to include attributes even when they are empty. Disabled by default.
    """
    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    # Read corpus text and document ID
    corpus_text = text.read()
    docid = docid.read()

    # Get annotation spans, annotations list etc.
    annotation_list, _, export_names = util.get_annotation_names(
        annotations,
        source_annotations,
        doc=doc,
        remove_namespaces=remove_namespaces,
        sparv_namespace=sparv_namespace,
        source_namespace=source_namespace)
    h_annotations, h_export_names = util.get_header_names(header_annotations,
                                                          doc=doc)
    export_names.update(h_export_names)
    span_positions, annotation_dict = util.gather_annotations(
        annotation_list,
        export_names,
        h_annotations,
        doc=doc,
        flatten=False,
        split_overlaps=True)
    sorted_positions = [(pos, span[0], span[1])
                        for pos, spans in sorted(span_positions.items())
                        for span in spans]

    # Root tag sanity check
    if not xml_utils.valid_root(sorted_positions[0], sorted_positions[-1]):
        raise util.SparvErrorMessage(
            "Root tag is missing! If you have manually specified which elements to include, "
            "make sure to include an element that encloses all other included elements and "
            "text content.")

    # Create root node
    root_span = sorted_positions[0][2]
    root_span.set_node()
    node_stack = []
    last_pos = 0  # Keeps track of the position of the processed text

    for x, (_pos, instruction, span) in enumerate(sorted_positions):
        # Open node: Create child node under the top stack node
        if instruction == "open":
            # Set tail for previous node if necessary
            if last_pos < span.start:
                # Get last closing node in this position
                _, tail_span = [
                    i for i in span_positions[last_pos] if i[0] == "close"
                ][-1]
                tail_span.node.tail = corpus_text[last_pos:span.start]
                last_pos = span.start

            # Handle headers
            if span.is_header:
                header = annotation_dict[span.name][util.HEADER_CONTENTS][
                    span.index]
                header_xml = etree.fromstring(header)
                header_xml.tag = span.export  # Rename element if needed
                span.node = header_xml
                node_stack[-1].node.append(header_xml)
            else:
                if node_stack:  # Don't create root node, it already exists
                    span.set_node(parent_node=node_stack[-1].node)

                xml_utils.add_attrs(span.node, span.name, annotation_dict,
                                    export_names, span.index,
                                    include_empty_attributes)
                if span.overlap_id:
                    if sparv_namespace:
                        span.node.set(f"{sparv_namespace}.{util.OVERLAP_ATTR}",
                                      f"{docid}-{span.overlap_id}")
                    else:
                        span.node.set(
                            f"{util.SPARV_DEFAULT_NAMESPACE}.{util.OVERLAP_ATTR}",
                            f"{docid}-{span.overlap_id}")
                node_stack.append(span)

                # Set text if there should be any between this node and the next one
                next_item = sorted_positions[x + 1]
                if next_item[1] == "open" and next_item[2].start > span.start:
                    span.node.text = corpus_text[last_pos:next_item[2].start]
                    last_pos = next_item[2].start

        # Close node
        else:
            if span.is_header:
                continue
            if last_pos < span.end:
                # Set node text if necessary
                if span.start == last_pos:
                    span.node.text = corpus_text[last_pos:span.end]
                # Set tail for previous node if necessary
                else:
                    # Get last closing node in this position
                    _, tail_span = [
                        i for i in span_positions[last_pos] if i[0] == "close"
                    ][-1]
                    tail_span.node.tail = corpus_text[last_pos:span.end]
                last_pos = span.end

            # Make sure closing node == top stack node
            assert span == node_stack[-1], \
                "Overlapping elements found: {}".format(node_stack[-2:])
            # Pop stack and move on to next span
            node_stack.pop()

    # Write xml to file
    etree.ElementTree(root_span.node).write(out,
                                            encoding="unicode",
                                            method="xml",
                                            xml_declaration=True)
    log.info("Exported: %s", out)
Example #16
def csv(doc: Document = Document(),
        out: Export = Export("csv/{doc}.csv"),
        token: Annotation = Annotation("<token>"),
        word: Annotation = Annotation("[export.word]"),
        sentence: Annotation = Annotation("<sentence>"),
        annotations: ExportAnnotations = ExportAnnotations(
            "csv_export.annotations"),
        source_annotations: SourceAnnotations = SourceAnnotations(
            "csv_export.source_annotations"),
        remove_namespaces: bool = Config("export.remove_module_namespaces",
                                         False),
        sparv_namespace: str = Config("export.sparv_namespace"),
        source_namespace: str = Config("export.source_namespace"),
        delimiter: str = Config("csv_export.delimiter")):
    """Export annotations to CSV format."""
    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    token_name = token.name

    # Read words
    word_annotation = list(word.read())

    # Get annotation spans, annotations list etc.
    annotation_list, token_attributes, export_names = util.get_annotation_names(
        annotations,
        source_annotations,
        doc=doc,
        token_name=token_name,
        remove_namespaces=remove_namespaces,
        sparv_namespace=sparv_namespace,
        source_namespace=source_namespace)
    span_positions, annotation_dict = util.gather_annotations(annotation_list,
                                                              export_names,
                                                              doc=doc)

    # Make csv header
    csv_data = [
        _make_header(token_name, token_attributes, export_names, delimiter)
    ]

    # Go through spans_dict and add to csv, line by line
    for _pos, instruction, span in span_positions:
        if instruction == "open":
            # Create token line
            if span.name == token_name:
                csv_data.append(
                    _make_token_line(word_annotation[span.index], token_name,
                                     token_attributes, annotation_dict,
                                     span.index, delimiter))

            # Create line with structural annotation
            else:
                attrs = _make_attrs(span.name, annotation_dict, export_names,
                                    span.index)
                for attr in attrs:
                    csv_data.append(f"# {attr}")
                if not attrs:
                    csv_data.append(f"# {span.export}")

        # Insert blank line after each closing sentence
        elif span.name == sentence.name and instruction == "close":
            csv_data.append("")

    # Write result to file
    with open(out, "w") as f:
        f.write("\n".join(csv_data))
    logger.info("Exported: %s", out)
Example #17
def conllu(doc: Document = Document(),
           out: Export = Export("conll/{doc}.conllu"),
           token: Annotation = Annotation("<token>"),
           sentence: Annotation = Annotation("<sentence>"),
           sentence_id: Annotation = Annotation("[conll_export.conll_fields.sentid]"),
           source_annotations: SourceAnnotations = SourceAnnotations("conll_export.source_annotations"),
           id_ref: Optional[Annotation] = Annotation("[conll_export.conll_fields.id]"),
           form: Optional[Annotation] = Annotation("[export.word]"),
           lemma: Optional[Annotation] = Annotation("[conll_export.conll_fields.lemma]"),
           upos: Optional[Annotation] = Annotation("[conll_export.conll_fields.upos]"),
           xpos: Optional[Annotation] = Annotation("[conll_export.conll_fields.xpos]"),
           feats: Optional[Annotation] = Annotation("[conll_export.conll_fields.feats]"),
           head: Optional[Annotation] = Annotation("[conll_export.conll_fields.head]"),
           deprel: Optional[Annotation] = Annotation("[conll_export.conll_fields.deprel]"),
           deps: Optional[Annotation] = Annotation("[conll_export.conll_fields.deps]"),
           misc: Optional[Annotation] = Annotation("[conll_export.conll_fields.misc]")):
    """Export annotations to CoNLL-U format."""
    # CoNLLU specification: https://universaldependencies.org/format.html
    # ID: Word index, integer starting at 1 for each new sentence; may be a range for multiword tokens; may be a decimal number for empty nodes (decimal numbers can be lower than 1 but must be greater than 0).
    # FORM: Word form or punctuation symbol.
    # LEMMA: Lemma or stem of word form.
    # UPOS: Universal part-of-speech tag.
    # XPOS: Language-specific part-of-speech tag; underscore if not available.
    # FEATS: List of morphological features from the universal feature inventory or from a defined language-specific extension; underscore if not available.
    # HEAD: Head of the current word, which is either a value of ID or zero (0).
    # DEPREL: Universal dependency relation to the HEAD (root iff HEAD = 0) or a defined language-specific subtype of one.
    # DEPS: Enhanced dependency graph in the form of a list of head-deprel pairs.
    # MISC: Any other annotation.
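    # A hypothetical token line (columns are tab-separated; shown with extra spaces for readability),
    # followed by the blank line that terminates each sentence:
    #   1   Detta   detta   PRON   PN|NEU|SIN|DEF|SUB   _   2   nsubj   _   _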
    conll_fields = [id_ref, form, lemma, upos, xpos, feats, head, deprel, deps, misc]
    conll_fields = [f if isinstance(f, Annotation) else Annotation() for f in conll_fields]

    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    token_name = token.name

    # Get annotation spans, annotations list etc.
    # TODO: Add structural annotations from 'annotations'? This is a bit annoying though because then we'd have to
    # take annotations as a requirement which results in Sparv having to run all annotations, even the ones we don't
    # want to use here.
    annotations = [sentence, sentence_id, token] + conll_fields
    annotations = [(annot, None) for annot in annotations]
    annotation_list, _, export_names = util.get_annotation_names(annotations, source_annotations,
                                                                 remove_namespaces=True,
                                                                 doc=doc, token_name=token_name)
    span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, doc=doc)

    csv_data = ["# global.columns = ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"]
    # Go through spans_dict and add to csv, line by line
    for _pos, instruction, span in span_positions:
        if instruction == "open":
            # Create token line
            if span.name == token_name:
                csv_data.append(_make_conll_token_line(conll_fields, token_name, annotation_dict, span.index))

            # Create line with structural annotation
            else:
                attrs = _make_attrs(span.name, annotation_dict, export_names, span.index)
                for attr in attrs:
                    csv_data.append(f"# {attr}")
                if not attrs:
                    csv_data.append(f"# {span.export}")

        # Insert blank line after each closing sentence
        elif span.name == sentence.name and instruction == "close":
            csv_data.append("")

    # Insert extra blank line to make CoNLL-U validator happy
    csv_data.append("")

    # Write result to file
    with open(out, "w") as f:
        f.write("\n".join(csv_data))
    logger.info("Exported: %s", out)
Example #18
def metashare(
        out: Export = Export("sbx_metadata/[metadata.id].xml"),
        template: Model = Model("sbx_metadata/sbx-metashare-template.xml"),
        corpus_id: Corpus = Corpus(),
        lang: Language = Language(),
        metadata: dict = Config("metadata"),
        sentences: AnnotationCommonData = AnnotationCommonData(
            "misc.<sentence>_count"),
        tokens: AnnotationCommonData = AnnotationCommonData(
            "misc.<token>_count"),
        annotations: ExportAnnotations = ExportAnnotations(
            "xml_export.annotations", is_input=False),
        korp_protected: bool = Config("korp.protected"),
        korp_mode: bool = Config("korp.mode"),
        # md_linguality: str = Config("sbx_metadata.linguality"),
        md_script: str = Config("sbx_metadata.script"),
        md_xml_export: str = Config("sbx_metadata.xml_export"),
        md_stats_export: bool = Config("sbx_metadata.stats_export"),
        md_korp: bool = Config("sbx_metadata.korp"),
        md_downloads: list = Config("sbx_metadata.downloads"),
        md_interface: list = Config("sbx_metadata.interface"),
        md_contact: dict = Config("sbx_metadata.contact_info")):
    """Export corpus metadata to META-SHARE format."""
    # Parse template and handle META SHARE namespace
    xml = etree.parse(template.path).getroot()
    etree.register_namespace("", META_SHARE_URL)
    ns = META_SHARE_NAMESPACE

    # Set identification info
    identificationInfo = xml.find(ns + "identificationInfo")
    for i in identificationInfo.findall(ns + "resourceShortName"):
        i.text = corpus_id
    identificationInfo.find(ns + "identifier").text = corpus_id
    _set_texts(identificationInfo.findall(ns + "resourceName"),
               metadata.get("name", {}))
    _set_texts(identificationInfo.findall(ns + "description"),
               metadata.get("description", {}))

    # Set metadata creation date in metadataInfo
    xml.find(".//" + ns + "metadataCreationDate").text = str(
        time.strftime("%Y-%m-%d"))

    # Set availability
    if korp_protected:
        xml.find(".//" + ns + "availability").text = "available-restrictedUse"
    else:
        xml.find(".//" + ns +
                 "availability").text = "available-unrestrictedUse"

    # Set licenceInfos
    distInfo = xml.find(".//" + ns + "distributionInfo")
    _set_licence_info(
        [metadata_utils.make_standard_xml_export(md_xml_export, corpus_id)],
        distInfo)
    _set_licence_info([
        metadata_utils.make_standard_stats_export(md_stats_export, corpus_id)
    ], distInfo)
    _set_licence_info(
        [metadata_utils.make_korp(md_korp, corpus_id, korp_mode)],
        distInfo,
        download=False)
    _set_licence_info([metadata_utils.make_metashare(corpus_id)], distInfo)
    # Add non-standard licenseInfos
    _set_licence_info(md_downloads, distInfo)
    _set_licence_info(md_interface, distInfo, download=False)

    # Set contactPerson
    _set_contact_info(md_contact, xml.find(".//" + ns + "contactPerson"))

    # Set samplesLocation
    xml.find(".//" + ns +
             "samplesLocation").text = f"{SBX_SAMPLES_LOCATION}{corpus_id}"

    # Set lingualityType
    xml.find(".//" + ns + "lingualityType").text = "monolingual"

    # Set languageInfo (languageId, languageName, languageScript)
    xml.find(".//" + ns + "languageId").text = lang
    xml.find(".//" + ns + "languageName").text = languages.get(
        part3=lang).name if lang in languages.part3 else lang
    xml.find(".//" + ns + "languageScript").text = md_script

    # Set sizeInfo
    sizeInfos = xml.findall(".//" + ns + "sizeInfo")
    sizeInfos[0].find(ns + "size").text = tokens.read()
    sizeInfos[1].find(ns + "size").text = sentences.read()

    # Set annotationInfo
    corpusTextInfo = xml.find(".//" + ns + "corpusTextInfo")
    _set_annotation_info(annotations, corpusTextInfo)

    # Write XML to file
    os.makedirs(os.path.dirname(out), exist_ok=True)
    etree.ElementTree(xml).write(out,
                                 encoding="unicode",
                                 method="xml",
                                 xml_declaration=True)
    logger.info("Exported: %s", out)