Пример #1
0
def lemgram_sql(corpus: Corpus = Corpus(),
                docs: AllDocuments = AllDocuments(),
                out: Export = Export("korp_lemgram_index/lemgram_index.sql"),
                lemgram: AnnotationAllDocs = AnnotationAllDocs(
                    "<token>:saldo.lemgram")):
    """Write an SQL file with lemgram frequencies for the corpus.

    Each value read from the lemgram annotation is split on "|". Every
    non-empty piece that does not contain ":" is counted across all
    documents, and one row per distinct piece is added to MYSQL_TABLE
    together with the upper-cased corpus name.
    """
    corpus = corpus.upper()

    # Tally every accepted lemgram over all documents.
    counts = defaultdict(int)
    for doc in docs:
        for annotation_value in lemgram.read(doc):
            for candidate in annotation_value.split("|"):
                # Skip empty pieces and pieces carrying a ":".
                if not candidate or ":" in candidate:
                    continue
                counts[candidate] += 1

    # Table setup: create it if needed, then clear old rows for this corpus.
    sql = MySQL(output=out)
    sql.create_table(MYSQL_TABLE, drop=False, **MYSQL_INDEX)
    sql.delete_rows(MYSQL_TABLE, {"corpus": corpus})
    sql.set_names()

    rows = [
        {"lemgram": entry, "corpus": corpus, "freq": count}
        for entry, count in counts.items()
    ]

    log.info("Creating SQL")
    sql.add_row(MYSQL_TABLE, rows)
Пример #2
0
def encode_scrambled(
        corpus: Corpus = Corpus(),
        annotations: ExportAnnotations = ExportAnnotations("cwb.annotations",
                                                           is_input=False),
        source_annotations: SourceAnnotations = SourceAnnotations(
            "cwb.source_annotations"),
        docs: AllDocuments = AllDocuments(),
        words: AnnotationAllDocs = AnnotationAllDocs("[export.word]"),
        vrtfiles: ExportInput = ExportInput("vrt_scrambled/{doc}.vrt",
                                            all_docs=True),
        out: Export = Export("[cwb.corpus_registry]/[metadata.id]",
                             absolute_path=True),
        out_marker: Export = Export(
            "[cwb.cwb_datadir]/[metadata.id]/.scrambled_marker",
            absolute_path=True),
        token: AnnotationAllDocs = AnnotationAllDocs("<token>"),
        bin_path: Config = Config("cwb.bin_path"),
        encoding: str = Config("cwb.encoding"),
        datadir: str = Config("cwb.cwb_datadir"),
        registry: str = Config("cwb.corpus_registry"),
        remove_namespaces: bool = Config("export.remove_module_namespaces",
                                         False),
        sparv_namespace: str = Config("export.sparv_namespace"),
        source_namespace: str = Config("export.source_namespace"),
        skip_compression: Optional[bool] = Config("cwb.skip_compression"),
        skip_validation: Optional[bool] = Config("cwb.skip_validation")):
    """Run the CWB encoding on the scrambled VRT files.

    This is a thin wrapper: every argument is forwarded positionally to
    ``cwb_encode`` in the order given below. The only transformation is that
    the name of ``token`` is passed instead of the annotation object.
    """
    # Positional order matters here; it mirrors the call to cwb_encode.
    encode_args = (
        corpus,
        annotations,
        source_annotations,
        docs,
        words,
        vrtfiles,
        out,
        out_marker,
        token.name,
        bin_path,
        encoding,
        datadir,
        registry,
        remove_namespaces,
        sparv_namespace,
        source_namespace,
        skip_compression,
        skip_validation,
    )
    cwb_encode(*encode_args)
Пример #3
0
def timespan_sql_no_dateinfo(
        corpus: Corpus = Corpus(),
        out: Export = Export("korp_timespan/timespan.sql"),
        docs: AllDocuments = AllDocuments(),
        token: AnnotationAllDocs = AnnotationAllDocs("<token>")):
    """Create timespan SQL data for a corpus without date information.

    All tokens of all documents are counted and reported in a single row
    whose from/to values consist of zeros only: 8 zeros for the date rows
    and 14 zeros for the datetime rows.
    """
    corpus_name = corpus.upper()

    # Total number of token spans over every document.
    token_count = sum(
        len(list(token.read_spans(doc)))
        for doc in docs
    )

    def zero_row(width):
        """Build the single all-zero timespan row of the given digit width."""
        placeholder = "0" * width
        return {
            "corpus": corpus_name,
            "datefrom": placeholder,
            "dateto": placeholder,
            "tokens": token_count
        }

    create_sql(corpus_name, out, [zero_row(8)], [zero_row(14)])
Пример #4
0
def info_sentences(
        out: OutputCommonData = OutputCommonData("cwb.sentencecount"),
        sentence: AnnotationAllDocs = AnnotationAllDocs("<sentence>"),
        docs: AllDocuments = AllDocuments()):
    """Count the sentences in the corpus and write the total to ``out``.

    Documents for which reading the sentence spans raises FileNotFoundError
    are skipped and contribute nothing to the total. A total of zero is
    logged but still written.
    """
    total = 0
    for doc in docs:
        try:
            spans = list(sentence.read_spans(doc))
        except FileNotFoundError:
            # Best effort: a document without sentence data is simply ignored.
            continue
        total += len(spans)

    if not total:
        log.info("No sentence information found in corpus")

    # The count is stored as its decimal string representation.
    out.write(str(total))
Пример #5
0
def freq_list_simple(corpus: Corpus = Corpus(),
                     docs: AllDocuments = AllDocuments(),
                     word: AnnotationAllDocs = AnnotationAllDocs("<token:word>"),
                     pos: AnnotationAllDocs = AnnotationAllDocs("<token:pos>"),
                     baseform: AnnotationAllDocs = AnnotationAllDocs("<token:baseform>"),
                     out: Export = Export("frequency_list/stats_[metadata.id].csv"),
                     delimiter: str = Config("stats_export.delimiter"),
                     cutoff: int = Config("stats_export.cutoff")):
    """Create a word frequency list from word, pos and baseform only.

    The sense, lemgram and complemgram columns are not read; each token gets
    the placeholder "|" in those three positions before the frequencies are
    updated and written as CSV.
    """
    frequencies = defaultdict(int)

    for doc in docs:
        # Pad every (word, pos, baseform) triple with three "|" placeholders
        # standing in for sense, lemgram and complemgram.
        padded_tokens = [
            (token_word, token_pos, token_baseform, "|", "|", "|")
            for token_word, token_pos, token_baseform
            in word.read_attributes(doc, [word, pos, baseform])
        ]
        update_freqs(padded_tokens, frequencies)

    write_csv(out, frequencies, delimiter, cutoff)
Пример #6
0
def timespan_sql_with_dateinfo(
        corpus: Corpus = Corpus(),
        out: Export = Export("korp_timespan/timespan.sql"),
        docs: AllDocuments = AllDocuments(),
        token: AnnotationAllDocs = AnnotationAllDocs("<token>"),
        datefrom: AnnotationAllDocs = AnnotationAllDocs(
            "<text>:dateformat.datefrom"),
        dateto: AnnotationAllDocs = AnnotationAllDocs(
            "<text>:dateformat.dateto"),
        timefrom: AnnotationAllDocs = AnnotationAllDocs(
            "<text>:dateformat.timefrom"),
        timeto: AnnotationAllDocs = AnnotationAllDocs(
            "<text>:dateformat.timeto")):
    """Create timespan SQL data from the date annotations of the corpus.

    For every document the tokens are grouped under their parent of the
    ``datefrom`` annotation. The token count of each group is added to a
    (from, to) date span and a (from, to) datetime span, with dates
    zero-padded to 8 digits and times to 6. Tokens without a parent are
    counted under an all-zero span.
    """
    corpus_name = corpus.upper()
    unknown_date = "0" * 8
    unknown_datetime = "0" * 14
    date_counts = defaultdict(int)
    datetime_counts = defaultdict(int)

    for doc in docs:
        parent = Annotation(datefrom.name, doc=doc)
        text_tokens, orphans = parent.get_children(token)

        # Tokens outside any dated parent go into the all-zero spans.
        if orphans:
            orphan_count = len(orphans)
            date_counts[(unknown_date, unknown_date)] += orphan_count
            datetime_counts[(unknown_datetime, unknown_datetime)] += orphan_count

        dateinfo = datefrom.read_attributes(
            doc, (datefrom, dateto, timefrom, timeto))
        for text in text_tokens:
            # One date record is consumed per group of tokens.
            info = next(dateinfo)
            start_date = info[0].zfill(8)
            end_date = info[1].zfill(8)
            size = len(text)
            date_counts[(start_date, end_date)] += size
            start_stamp = start_date + info[2].zfill(6)
            end_stamp = end_date + info[3].zfill(6)
            datetime_counts[(start_stamp, end_stamp)] += size

    rows_date = [
        {
            "corpus": corpus_name,
            "datefrom": start,
            "dateto": end,
            "tokens": count
        }
        for (start, end), count in date_counts.items()
    ]
    rows_datetime = [
        {
            "corpus": corpus_name,
            "datefrom": start,
            "dateto": end,
            "tokens": count
        }
        for (start, end), count in datetime_counts.items()
    ]

    create_sql(corpus_name, out, rows_date, rows_datetime)
Пример #7
0
def freq_list(corpus: Corpus = Corpus(),
              docs: AllDocuments = AllDocuments(),
              word: AnnotationAllDocs = AnnotationAllDocs("<token:word>"),
              msd: AnnotationAllDocs = AnnotationAllDocs("<token:msd>"),
              baseform: AnnotationAllDocs = AnnotationAllDocs("<token:baseform>"),
              sense: AnnotationAllDocs = AnnotationAllDocs("<token:sense>"),
              lemgram: AnnotationAllDocs = AnnotationAllDocs("<token>:saldo.lemgram"),
              complemgram: AnnotationAllDocs = AnnotationAllDocs("<token>:saldo.complemgram"),
              out: Export = Export("frequency_list/stats_[metadata.id].csv"),
              delimiter: str = Config("stats_export.delimiter"),
              cutoff: int = Config("stats_export.cutoff"),
              include_all_compounds: bool = Config("stats_export.include_all_compounds")):
    """Create a word frequency list for the entire corpus.

    Args:
        corpus: The corpus. Defaults to Corpus(). Not referenced in the
            function body.
        docs: The documents to iterate over. Defaults to AllDocuments().
        word: Word annotation. Defaults to AnnotationAllDocs("<token:word>").
        msd: MSD annotation. Defaults to AnnotationAllDocs("<token:msd>").
        baseform: Baseform annotation.
            Defaults to AnnotationAllDocs("<token:baseform>").
        sense: Sense annotation. Defaults to AnnotationAllDocs("<token:sense>").
        lemgram: Lemgram annotation.
            Defaults to AnnotationAllDocs("<token>:saldo.lemgram").
        complemgram: Compound lemgram annotation.
            Defaults to AnnotationAllDocs("<token>:saldo.complemgram").
        out: The output frequency file.
            Defaults to Export("frequency_list/stats_[metadata.id].csv").
        delimiter: Passed on to write_csv as the column delimiter.
            Defaults to Config("stats_export.delimiter").
        cutoff: Passed on to write_csv as the frequency cutoff.
            Defaults to Config("stats_export.cutoff").
        include_all_compounds: Passed on to update_freqs.
            Defaults to Config("stats_export.include_all_compounds").
    """
    frequencies = defaultdict(int)

    for doc in docs:
        # All six token attributes are read together, with the word
        # annotation doing the reading.
        attributes = [word, msd, baseform, sense, lemgram, complemgram]
        doc_tokens = word.read_attributes(doc, attributes)
        update_freqs(doc_tokens, frequencies, include_all_compounds)

    write_csv(out, frequencies, delimiter, cutoff)
Пример #8
0
def info_date(
        corpus: Corpus = Corpus(),
        out_datefirst: OutputCommonData = OutputCommonData("cwb.datefirst"),
        out_datelast: OutputCommonData = OutputCommonData("cwb.datelast"),
        corpus_data_file: ExportInput = ExportInput(
            "[cwb.corpus_registry]/[metadata.id]"),
        datefrom: AnnotationAllDocs = AnnotationAllDocs(
            "[dateformat.out_annotation]:dateformat.datefrom"),
        dateto: AnnotationAllDocs = AnnotationAllDocs(
            "[dateformat.out_annotation]:dateformat.dateto"),
        timefrom: AnnotationAllDocs = AnnotationAllDocs(
            "[dateformat.out_annotation]:dateformat.timefrom"),
        timeto: AnnotationAllDocs = AnnotationAllDocs(
            "[dateformat.out_annotation]:dateformat.timeto"),
        remove_namespaces: bool = Config("export.remove_module_namespaces",
                                         False),
        cwb_bin_path: Config = Config("cwb.bin_path", ""),
        registry: str = Config("cwb.corpus_registry")):
    """Write the earliest and latest timestamp of the corpus.

    ``cwb-scan-corpus`` is run once on the from-date/from-time attributes and
    once on the to-date/to-time attributes. The smallest parsed timestamp of
    the first run goes to ``out_datefirst`` and the largest of the second
    run to ``out_datelast``, both formatted as "%Y-%m-%d %H:%M:%S".
    """
    def fix_name(name: str):
        """Turn an annotation name into the attribute name given to CWB.

        With ``remove_namespaces`` set, only the part after the last "." of
        the attribute (the text following the first ":") is kept. Any ":"
        is then replaced by "_".
        """
        if remove_namespaces:
            prefix, part, suffix = name.partition(":")
            name = prefix + part + suffix.rsplit(".", 1)[-1]
        return name.replace(":", "_")

    def _parse_cwb_output(output):
        """Return the sorted, human-readable timestamps found in ``output``."""
        # Keep lines whose last tab-separated field is non-empty and join
        # their second and third fields (date and time).
        pairs = []
        for line in output.decode("utf8").split("\n"):
            fields = line.split("\t")
            if fields[-1]:
                pairs.append(f"{fields[1]} {fields[2]}")

        # Left-pad the date part with zeros to 8 digits, needed by strptime.
        padded = []
        for pair in pairs:
            parts = pair.split()
            padded.append(f"{parts[0].zfill(8)} {parts[1]}")

        # Parse, sort chronologically, then format for output.
        moments = [datetime.strptime(stamp, "%Y%m%d %H%M%S") for stamp in padded]
        moments.sort()
        return [moment.strftime("%Y-%m-%d %H:%M:%S") for moment in moments]

    scan_binary = os.path.join(cwb_bin_path, "cwb-scan-corpus")

    def scan(date_attribute, time_attribute):
        """Run cwb-scan-corpus on two attributes and parse what it prints."""
        raw_output, _ = util.system.call_binary(
            scan_binary,
            ["-r", registry, "-q", corpus, date_attribute, time_attribute])
        return _parse_cwb_output(raw_output)

    # Get date and time annotation names
    datefrom_name = fix_name(datefrom.name)
    timefrom_name = fix_name(timefrom.name)
    dateto_name = fix_name(dateto.name)
    timeto_name = fix_name(timeto.name)

    # Earliest start timestamp
    earliest = scan(datefrom_name, timefrom_name)[0]
    out_datefirst.write(str(earliest))

    # Latest end timestamp
    latest = scan(dateto_name, timeto_name)[-1]
    out_datelast.write(str(latest))