def lemgram_sql(corpus: Corpus = Corpus(),
                docs: AllDocuments = AllDocuments(),
                out: Export = Export("korp_lemgram_index/lemgram_index.sql"),
                lemgram: AnnotationAllDocs = AnnotationAllDocs("<token>:saldo.lemgram")):
    """Create lemgram index SQL file."""
    corpus = corpus.upper()

    # Count the frequency of every lemgram in the corpus. Annotation values are
    # pipe-separated; empty values and values containing ":" are skipped.
    result = defaultdict(int)
    for doc in docs:
        for lg in lemgram.read(doc):
            for value in lg.split("|"):
                if value and ":" not in value:
                    result[value] += 1

    mysql = MySQL(output=out)
    mysql.create_table(MYSQL_TABLE, drop=False, **MYSQL_INDEX)
    mysql.delete_rows(MYSQL_TABLE, {"corpus": corpus})
    mysql.set_names()

    # Use 'lem' as the loop variable to avoid shadowing the 'lemgram' parameter
    rows = [{"lemgram": lem, "corpus": corpus, "freq": freq}
            for lem, freq in result.items()]

    log.info("Creating SQL")
    mysql.add_row(MYSQL_TABLE, rows)

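# A minimal, self-contained sketch of the counting loop above. The annotation
# strings below are invented examples; in the real function they come from
# lemgram.read(doc). Values containing ":" are skipped, presumably entries
# carrying an attached score.
def _demo_lemgram_counting():
    """Illustrate how pipe-separated lemgram annotations are counted (sketch)."""
    from collections import defaultdict

    annotation_values = [
        "|ge..vb.1|",           # a single lemgram
        "|bo..vb.1|bo..nn.1|",  # an ambiguous token with two lemgrams
        "|",                    # an empty annotation
    ]
    result = defaultdict(int)
    for lg in annotation_values:
        for value in lg.split("|"):
            if value and ":" not in value:
                result[value] += 1
    assert dict(result) == {"ge..vb.1": 1, "bo..vb.1": 1, "bo..nn.1": 1}
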
def encode_scrambled(
        corpus: Corpus = Corpus(),
        annotations: ExportAnnotations = ExportAnnotations("cwb.annotations", is_input=False),
        source_annotations: SourceAnnotations = SourceAnnotations("cwb.source_annotations"),
        docs: AllDocuments = AllDocuments(),
        words: AnnotationAllDocs = AnnotationAllDocs("[export.word]"),
        vrtfiles: ExportInput = ExportInput("vrt_scrambled/{doc}.vrt", all_docs=True),
        out: Export = Export("[cwb.corpus_registry]/[metadata.id]", absolute_path=True),
        out_marker: Export = Export("[cwb.cwb_datadir]/[metadata.id]/.scrambled_marker",
                                    absolute_path=True),
        token: AnnotationAllDocs = AnnotationAllDocs("<token>"),
        bin_path: Config = Config("cwb.bin_path"),
        encoding: str = Config("cwb.encoding"),
        datadir: str = Config("cwb.cwb_datadir"),
        registry: str = Config("cwb.corpus_registry"),
        remove_namespaces: bool = Config("export.remove_module_namespaces", False),
        sparv_namespace: str = Config("export.sparv_namespace"),
        source_namespace: str = Config("export.source_namespace"),
        skip_compression: Optional[bool] = Config("cwb.skip_compression"),
        skip_validation: Optional[bool] = Config("cwb.skip_validation")):
    """Do CWB encoding with VRT files in scrambled order."""
    cwb_encode(corpus, annotations, source_annotations, docs, words, vrtfiles, out, out_marker,
               token.name, bin_path, encoding, datadir, registry, remove_namespaces,
               sparv_namespace, source_namespace, skip_compression, skip_validation)

def timespan_sql_no_dateinfo(
        corpus: Corpus = Corpus(),
        out: Export = Export("korp_timespan/timespan.sql"),
        docs: AllDocuments = AllDocuments(),
        token: AnnotationAllDocs = AnnotationAllDocs("<token>")):
    """Create timespan SQL data for use in Korp."""
    corpus_name = corpus.upper()
    token_count = 0

    for doc in docs:
        tokens = token.read_spans(doc)
        token_count += len(list(tokens))

    # Use zero-filled placeholder dates, since this corpus has no date information
    rows_date = [{
        "corpus": corpus_name,
        "datefrom": "0" * 8,
        "dateto": "0" * 8,
        "tokens": token_count
    }]
    rows_datetime = [{
        "corpus": corpus_name,
        "datefrom": "0" * 14,
        "dateto": "0" * 14,
        "tokens": token_count
    }]

    create_sql(corpus_name, out, rows_date, rows_datetime)

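# A hedged sketch of the row shapes handed to create_sql() above: for an
# undated corpus there is a single row per table, with zero-filled placeholder
# values. The corpus name and token count below are invented.
def _demo_timespan_placeholder_rows():
    """Show the placeholder rows produced when no date info is available (sketch)."""
    token_count = 12345  # hypothetical corpus-wide token count
    rows_date = [{"corpus": "MYCORPUS", "datefrom": "0" * 8, "dateto": "0" * 8,
                  "tokens": token_count}]
    rows_datetime = [{"corpus": "MYCORPUS", "datefrom": "0" * 14, "dateto": "0" * 14,
                      "tokens": token_count}]
    assert rows_date[0]["datefrom"] == "00000000"          # 8-digit date placeholder
    assert rows_datetime[0]["dateto"] == "00000000000000"  # 14-digit datetime placeholder
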
def info_sentences(
        out: OutputCommonData = OutputCommonData("cwb.sentencecount"),
        sentence: AnnotationAllDocs = AnnotationAllDocs("<sentence>"),
        docs: AllDocuments = AllDocuments()):
    """Determine how many sentences there are in the corpus."""
    # Read the sentence annotation and count the sentences in every document
    sentence_count = 0
    for doc in docs:
        try:
            sentence_count += len(list(sentence.read_spans(doc)))
        except FileNotFoundError:
            # Documents without a sentence annotation contribute zero sentences
            pass

    if sentence_count == 0:
        log.info("No sentence information found in corpus")

    # Write sentencecount data
    out.write(str(sentence_count))

def freq_list_simple(corpus: Corpus = Corpus(),
                     docs: AllDocuments = AllDocuments(),
                     word: AnnotationAllDocs = AnnotationAllDocs("<token:word>"),
                     pos: AnnotationAllDocs = AnnotationAllDocs("<token:pos>"),
                     baseform: AnnotationAllDocs = AnnotationAllDocs("<token:baseform>"),
                     out: Export = Export("frequency_list/stats_[metadata.id].csv"),
                     delimiter: str = Config("stats_export.delimiter"),
                     cutoff: int = Config("stats_export.cutoff")):
    """Create a word frequency list for a corpus without sense, lemgram and complemgram annotations."""
    freq_dict = defaultdict(int)

    for doc in docs:
        simple_tokens = word.read_attributes(doc, [word, pos, baseform])

        # Add empty annotations for sense, lemgram and complemgram
        tokens = [(w, p, b, "|", "|", "|") for w, p, b in simple_tokens]
        update_freqs(tokens, freq_dict)

    write_csv(out, freq_dict, delimiter, cutoff)

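# A hedged sketch of the padding step above: (word, pos, baseform) triples are
# extended with "|" placeholders so they match the six-field token shape that
# the full freq_list() variant passes to update_freqs(). The triples below are
# invented sample data.
def _demo_simple_token_padding():
    """Show how simple tokens are padded to the full six-field shape (sketch)."""
    simple_tokens = [
        ("hunden", "NN", "hund"),
        ("springer", "VB", "springa"),
    ]
    tokens = [(w, p, b, "|", "|", "|") for w, p, b in simple_tokens]
    assert tokens[0] == ("hunden", "NN", "hund", "|", "|", "|")
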
def timespan_sql_with_dateinfo(
        corpus: Corpus = Corpus(),
        out: Export = Export("korp_timespan/timespan.sql"),
        docs: AllDocuments = AllDocuments(),
        token: AnnotationAllDocs = AnnotationAllDocs("<token>"),
        datefrom: AnnotationAllDocs = AnnotationAllDocs("<text>:dateformat.datefrom"),
        dateto: AnnotationAllDocs = AnnotationAllDocs("<text>:dateformat.dateto"),
        timefrom: AnnotationAllDocs = AnnotationAllDocs("<text>:dateformat.timefrom"),
        timeto: AnnotationAllDocs = AnnotationAllDocs("<text>:dateformat.timeto")):
    """Create timespan SQL data for use in Korp."""
    corpus_name = corpus.upper()
    datespans = defaultdict(int)
    datetimespans = defaultdict(int)

    for doc in docs:
        text_tokens, orphans = Annotation(datefrom.name, doc=doc).get_children(token)
        if orphans:
            # Tokens outside any dated text get zero-filled placeholder spans
            datespans[("0" * 8, "0" * 8)] += len(orphans)
            datetimespans[("0" * 14, "0" * 14)] += len(orphans)
        dateinfo = datefrom.read_attributes(doc, (datefrom, dateto, timefrom, timeto))
        for text in text_tokens:
            d = next(dateinfo)
            # Zero-pad dates (8 digits) and times (6 digits) before using them as keys
            datespans[(d[0].zfill(8), d[1].zfill(8))] += len(text)
            datetimespans[(d[0].zfill(8) + d[2].zfill(6),
                           d[1].zfill(8) + d[3].zfill(6))] += len(text)

    rows_date = []
    rows_datetime = []
    for span, count in datespans.items():
        rows_date.append({
            "corpus": corpus_name,
            "datefrom": span[0],
            "dateto": span[1],
            "tokens": count
        })
    for span, count in datetimespans.items():
        rows_datetime.append({
            "corpus": corpus_name,
            "datefrom": span[0],
            "dateto": span[1],
            "tokens": count
        })

    create_sql(corpus_name, out, rows_date, rows_datetime)

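# A runnable sketch of the date normalization above, using invented attribute
# values. Dates are zero-padded to 8 digits and times to 6, so that e.g. a
# year-800 date sorts correctly as "00000800...".
def _demo_datespan_keys():
    """Show how zero-padded date/datetime span keys are built (sketch)."""
    from collections import defaultdict

    datespans = defaultdict(int)
    datetimespans = defaultdict(int)

    # (datefrom, dateto, timefrom, timeto) for one <text>, plus its token count
    d = ("8000101", "8001231", "0", "235959")
    tokens_in_text = 42

    datespans[(d[0].zfill(8), d[1].zfill(8))] += tokens_in_text
    datetimespans[(d[0].zfill(8) + d[2].zfill(6),
                   d[1].zfill(8) + d[3].zfill(6))] += tokens_in_text

    assert list(datespans) == [("08000101", "08001231")]
    assert list(datetimespans) == [("08000101000000", "08001231235959")]
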
def freq_list(corpus: Corpus = Corpus(),
              docs: AllDocuments = AllDocuments(),
              word: AnnotationAllDocs = AnnotationAllDocs("<token:word>"),
              msd: AnnotationAllDocs = AnnotationAllDocs("<token:msd>"),
              baseform: AnnotationAllDocs = AnnotationAllDocs("<token:baseform>"),
              sense: AnnotationAllDocs = AnnotationAllDocs("<token:sense>"),
              lemgram: AnnotationAllDocs = AnnotationAllDocs("<token>:saldo.lemgram"),
              complemgram: AnnotationAllDocs = AnnotationAllDocs("<token>:saldo.complemgram"),
              out: Export = Export("frequency_list/stats_[metadata.id].csv"),
              delimiter: str = Config("stats_export.delimiter"),
              cutoff: int = Config("stats_export.cutoff"),
              include_all_compounds: bool = Config("stats_export.include_all_compounds")):
    """Create a word frequency list for the entire corpus.

    Args:
        corpus (str, optional): The corpus ID. Defaults to Corpus().
        docs (list, optional): The documents belonging to this corpus. Defaults to AllDocuments().
        word (str, optional): Word annotations. Defaults to AnnotationAllDocs("<token:word>").
        msd (str, optional): MSD annotations. Defaults to AnnotationAllDocs("<token:msd>").
        baseform (str, optional): Baseform annotations. Defaults to AnnotationAllDocs("<token:baseform>").
        sense (str, optional): Sense annotations. Defaults to AnnotationAllDocs("<token:sense>").
        lemgram (str, optional): Lemgram annotations. Defaults to AnnotationAllDocs("<token>:saldo.lemgram").
        complemgram (str, optional): Compound lemgram annotations.
            Defaults to AnnotationAllDocs("<token>:saldo.complemgram").
        out (str, optional): The output word frequency file.
            Defaults to Export("frequency_list/stats_[metadata.id].csv").
        delimiter (str, optional): Column delimiter to use in the CSV.
            Defaults to Config("stats_export.delimiter").
        cutoff (int, optional): The minimum frequency a word must have in order to be included
            in the result. Defaults to Config("stats_export.cutoff").
        include_all_compounds (bool, optional): Whether to include compound analyses for every word
            or only for words lacking a sense annotation.
            Defaults to Config("stats_export.include_all_compounds").
    """
    freq_dict = defaultdict(int)

    for doc in docs:
        tokens = word.read_attributes(doc, [word, msd, baseform, sense, lemgram, complemgram])
        update_freqs(tokens, freq_dict, include_all_compounds)

    write_csv(out, freq_dict, delimiter, cutoff)

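# A hedged illustration of the cutoff parameter described in the docstring:
# entries whose frequency falls below the threshold are excluded from the
# output. The filtering below is a hypothetical stand-in; the real filtering
# happens inside write_csv().
def _demo_cutoff_filtering():
    """Show the intended effect of the cutoff threshold (sketch)."""
    from collections import defaultdict

    freq_dict = defaultdict(int)
    for token in ["hund", "katt", "hund", "hund", "katt", "mus"]:
        freq_dict[token] += 1

    cutoff = 2
    kept = {k: v for k, v in freq_dict.items() if v >= cutoff}
    assert kept == {"hund": 3, "katt": 2}  # "mus" (freq 1) is dropped
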
def info_date(
        corpus: Corpus = Corpus(),
        out_datefirst: OutputCommonData = OutputCommonData("cwb.datefirst"),
        out_datelast: OutputCommonData = OutputCommonData("cwb.datelast"),
        corpus_data_file: ExportInput = ExportInput("[cwb.corpus_registry]/[metadata.id]"),
        datefrom: AnnotationAllDocs = AnnotationAllDocs(
            "[dateformat.out_annotation]:dateformat.datefrom"),
        dateto: AnnotationAllDocs = AnnotationAllDocs(
            "[dateformat.out_annotation]:dateformat.dateto"),
        timefrom: AnnotationAllDocs = AnnotationAllDocs(
            "[dateformat.out_annotation]:dateformat.timefrom"),
        timeto: AnnotationAllDocs = AnnotationAllDocs(
            "[dateformat.out_annotation]:dateformat.timeto"),
        remove_namespaces: bool = Config("export.remove_module_namespaces", False),
        cwb_bin_path: Config = Config("cwb.bin_path", ""),
        registry: str = Config("cwb.corpus_registry")):
    """Create datefirst and datelast files (needed for the .info file)."""
    def fix_name(name: str):
        """Remove invalid characters from annotation names and optionally remove namespaces."""
        if remove_namespaces:
            prefix, part, suffix = name.partition(":")
            suffix = suffix.split(".")[-1]
            name = prefix + part + suffix
        return name.replace(":", "_")

    def _parse_cwb_output(output):
        lines = output.decode("utf8").split("\n")
        values = ["%s %s" % (line.split("\t")[1], line.split("\t")[2])
                  for line in lines if line.split("\t")[-1]]
        # Zero-pad dates with less than 8 digits (e.g. 800 --> 00000800), needed by strptime
        values = ["%s %s" % (v.split()[0].zfill(8), v.split()[1]) for v in values]
        # Convert to dates and sort, then convert to human-readable format
        values = sorted(datetime.strptime(v, "%Y%m%d %H%M%S") for v in values)
        return [v.strftime("%Y-%m-%d %H:%M:%S") for v in values]

    # Get date and time annotation names
    datefrom_name = fix_name(datefrom.name)
    timefrom_name = fix_name(timefrom.name)
    dateto_name = fix_name(dateto.name)
    timeto_name = fix_name(timeto.name)

    # Get datefirst and write to file
    datefirst_args = ["-r", registry, "-q", corpus, datefrom_name, timefrom_name]
    datefirst_out, _ = util.system.call_binary(os.path.join(cwb_bin_path, "cwb-scan-corpus"),
                                               datefirst_args)
    datefirst = _parse_cwb_output(datefirst_out)[0]
    out_datefirst.write(str(datefirst))

    # Get datelast and write to file
    datelast_args = ["-r", registry, "-q", corpus, dateto_name, timeto_name]
    datelast_out, _ = util.system.call_binary(os.path.join(cwb_bin_path, "cwb-scan-corpus"),
                                              datelast_args)
    datelast = _parse_cwb_output(datelast_out)[-1]
    out_datelast.write(str(datelast))

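# A runnable sketch of _parse_cwb_output() on invented scanner output. The
# cwb-scan-corpus output is assumed here to be tab-separated
# "<freq>\t<date>\t<time>" lines; the values below are made up.
def _demo_parse_cwb_output():
    """Parse sample cwb-scan-corpus-style output into sorted dates (sketch)."""
    from datetime import datetime

    output = b"117\t19050101\t000000\n42\t20010315\t134500\n"

    lines = output.decode("utf8").split("\n")
    values = ["%s %s" % (line.split("\t")[1], line.split("\t")[2])
              for line in lines if line.split("\t")[-1]]
    # Zero-pad short dates so strptime always sees 8 digits
    values = ["%s %s" % (v.split()[0].zfill(8), v.split()[1]) for v in values]
    values = sorted(datetime.strptime(v, "%Y%m%d %H%M%S") for v in values)
    result = [v.strftime("%Y-%m-%d %H:%M:%S") for v in values]
    assert result == ["1905-01-01 00:00:00", "2001-03-15 13:45:00"]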