def encode_scrambled(corpus: Corpus = Corpus(),
                     annotations: ExportAnnotations = ExportAnnotations("cwb.annotations", is_input=False),
                     source_annotations: SourceAnnotations = SourceAnnotations("cwb.source_annotations"),
                     docs: AllDocuments = AllDocuments(),
                     words: AnnotationAllDocs = AnnotationAllDocs("[export.word]"),
                     vrtfiles: ExportInput = ExportInput("vrt_scrambled/{doc}.vrt", all_docs=True),
                     out: Export = Export("[cwb.corpus_registry]/[metadata.id]", absolute_path=True),
                     out_marker: Export = Export("[cwb.cwb_datadir]/[metadata.id]/.scrambled_marker",
                                                 absolute_path=True),
                     token: AnnotationAllDocs = AnnotationAllDocs("<token>"),
                     bin_path: Config = Config("cwb.bin_path"),
                     encoding: str = Config("cwb.encoding"),
                     datadir: str = Config("cwb.cwb_datadir"),
                     registry: str = Config("cwb.corpus_registry"),
                     remove_namespaces: bool = Config("export.remove_module_namespaces", False),
                     sparv_namespace: str = Config("export.sparv_namespace"),
                     source_namespace: str = Config("export.source_namespace"),
                     skip_compression: Optional[bool] = Config("cwb.skip_compression"),
                     skip_validation: Optional[bool] = Config("cwb.skip_validation")):
    """Do cwb encoding with vrt files in scrambled order."""
    cwb_encode(corpus, annotations, source_annotations, docs, words, vrtfiles, out, out_marker, token.name,
               bin_path, encoding, datadir, registry, remove_namespaces, sparv_namespace, source_namespace,
               skip_compression, skip_validation)

def info(out: Export = Export("[cwb.cwb_datadir]/[metadata.id]/.info", absolute_path=True),
         sentences: AnnotationCommonData = AnnotationCommonData("cwb.sentencecount"),
         firstdate: AnnotationCommonData = AnnotationCommonData("cwb.datefirst"),
         lastdate: AnnotationCommonData = AnnotationCommonData("cwb.datelast"),
         resolution: AnnotationCommonData = AnnotationCommonData("dateformat.resolution"),
         protected: bool = Config("korp.protected")):
    """Save information to the file specified by 'out'."""
    content = []
    protected_str = str(protected).lower()

    for key, value_obj in [("Sentences", sentences),
                           ("FirstDate", firstdate),
                           ("LastDate", lastdate),
                           ("DateResolution", resolution),
                           ("Updated", time.strftime("%Y-%m-%d")),
                           ("Protected", protected_str)]:
        if isinstance(value_obj, AnnotationCommonData):
            value = value_obj.read()
        else:
            value = value_obj
        content.append("%s: %s\n" % (key, value))

    # Write .info file
    with open(out, "w") as o:
        o.writelines(content)

    log.info("Exported: %s", out)

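# For reference, the resulting .info file consists of plain "Key: value" lines, one per key written above.
# The values below are purely illustrative (the actual date formats depend on the cwb.datefirst/datelast
# annotations):
#
#     Sentences: 12345
#     FirstDate: 2020-01-01 00:00:00
#     LastDate: 2020-12-31 23:59:59
#     DateResolution: YMDhms
#     Updated: 2021-06-01
#     Protected: false
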
def timespan_sql_no_dateinfo(corpus: Corpus = Corpus(),
                             out: Export = Export("korp_timespan/timespan.sql"),
                             docs: AllDocuments = AllDocuments(),
                             token: AnnotationAllDocs = AnnotationAllDocs("<token>")):
    """Create timespan SQL data for use in Korp."""
    corpus_name = corpus.upper()
    token_count = 0

    for doc in docs:
        tokens = token.read_spans(doc)
        token_count += len(list(tokens))

    rows_date = [{
        "corpus": corpus_name,
        "datefrom": "0" * 8,
        "dateto": "0" * 8,
        "tokens": token_count
    }]
    rows_datetime = [{
        "corpus": corpus_name,
        "datefrom": "0" * 14,
        "dateto": "0" * 14,
        "tokens": token_count
    }]

    create_sql(corpus_name, out, rows_date, rows_datetime)

def lemgram_sql(corpus: Corpus = Corpus(),
                docs: AllDocuments = AllDocuments(),
                out: Export = Export("korp_lemgram_index/lemgram_index.sql"),
                lemgram: AnnotationAllDocs = AnnotationAllDocs("<token>:saldo.lemgram")):
    """Create lemgram index SQL file."""
    corpus = corpus.upper()
    result = defaultdict(int)

    for doc in docs:
        for lg in lemgram.read(doc):
            for value in lg.split("|"):
                if value and ":" not in value:
                    result[value] += 1

    mysql = MySQL(output=out)
    mysql.create_table(MYSQL_TABLE, drop=False, **MYSQL_INDEX)
    mysql.delete_rows(MYSQL_TABLE, {"corpus": corpus})
    mysql.set_names()

    rows = []
    for lemgram, freq in list(result.items()):
        rows.append({"lemgram": lemgram, "corpus": corpus, "freq": freq})

    log.info("Creating SQL")
    mysql.add_row(MYSQL_TABLE, rows)

def pretty(doc: Document = Document(),
           docid: AnnotationData = AnnotationData("<docid>"),
           out: Export = Export("xml_pretty/[xml_export.filename]"),
           token: Annotation = Annotation("<token>"),
           word: Annotation = Annotation("[export.word]"),
           annotations: ExportAnnotations = ExportAnnotations("xml_export.annotations"),
           source_annotations: SourceAnnotations = SourceAnnotations("xml_export.source_annotations"),
           header_annotations: SourceAnnotations = SourceAnnotations("xml_export.header_annotations"),
           remove_namespaces: bool = Config("export.remove_module_namespaces", False),
           sparv_namespace: str = Config("export.sparv_namespace"),
           source_namespace: str = Config("export.source_namespace"),
           include_empty_attributes: bool = Config("xml_export.include_empty_attributes")):
    """Export annotations to pretty XML in export_dir.

    Args:
        doc: Name of the original document.
        docid: Annotation with document IDs.
        out: Path and filename pattern for resulting file.
        token: Annotation containing the tokens.
        word: Annotation containing the token strings.
        annotations: List of elements:attributes (annotations) to include.
        source_annotations: List of elements:attributes from the original document to be kept.
            If not specified, everything will be kept.
        header_annotations: List of header elements from the original document to include in the export.
            If not specified, all headers will be kept.
        remove_namespaces: Whether to remove module "namespaces" from element and attribute names.
            Disabled by default.
        sparv_namespace: The namespace to be added to all Sparv annotations.
        source_namespace: The namespace to be added to all annotations present in the source.
        include_empty_attributes: Whether to include attributes even when they are empty. Disabled by default.
    """
    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    token_name = token.name

    # Read words and document ID
    word_annotation = list(word.read())
    docid_annotation = docid.read()

    # Get annotation spans, annotations list etc.
    annotation_list, _, export_names = util.get_annotation_names(annotations, source_annotations, doc=doc,
                                                                 token_name=token_name,
                                                                 remove_namespaces=remove_namespaces,
                                                                 sparv_namespace=sparv_namespace,
                                                                 source_namespace=source_namespace)
    h_annotations, h_export_names = util.get_header_names(header_annotations, doc=doc)
    export_names.update(h_export_names)
    span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, h_annotations,
                                                              doc=doc, split_overlaps=True)
    xmlstr = xml_utils.make_pretty_xml(span_positions, annotation_dict, export_names, token_name,
                                       word_annotation, docid_annotation, include_empty_attributes,
                                       sparv_namespace)

    # Write XML to file
    with open(out, mode="w") as outfile:
        outfile.write(xmlstr)
    log.info("Exported: %s", out)

def timespan_sql_with_dateinfo(corpus: Corpus = Corpus(),
                               out: Export = Export("korp_timespan/timespan.sql"),
                               docs: AllDocuments = AllDocuments(),
                               token: AnnotationAllDocs = AnnotationAllDocs("<token>"),
                               datefrom: AnnotationAllDocs = AnnotationAllDocs("<text>:dateformat.datefrom"),
                               dateto: AnnotationAllDocs = AnnotationAllDocs("<text>:dateformat.dateto"),
                               timefrom: AnnotationAllDocs = AnnotationAllDocs("<text>:dateformat.timefrom"),
                               timeto: AnnotationAllDocs = AnnotationAllDocs("<text>:dateformat.timeto")):
    """Create timespan SQL data for use in Korp."""
    corpus_name = corpus.upper()
    datespans = defaultdict(int)
    datetimespans = defaultdict(int)

    for doc in docs:
        text_tokens, orphans = Annotation(datefrom.name, doc=doc).get_children(token)
        if orphans:
            datespans[("0" * 8, "0" * 8)] += len(orphans)
            datetimespans[("0" * 14, "0" * 14)] += len(orphans)

        dateinfo = datefrom.read_attributes(doc, (datefrom, dateto, timefrom, timeto))
        for text in text_tokens:
            d = next(dateinfo)
            datespans[(d[0].zfill(8), d[1].zfill(8))] += len(text)
            datetimespans[(d[0].zfill(8) + d[2].zfill(6), d[1].zfill(8) + d[3].zfill(6))] += len(text)

    rows_date = []
    rows_datetime = []

    for span in datespans:
        rows_date.append({
            "corpus": corpus_name,
            "datefrom": span[0],
            "dateto": span[1],
            "tokens": datespans[span]
        })

    for span in datetimespans:
        rows_datetime.append({
            "corpus": corpus_name,
            "datefrom": span[0],
            "dateto": span[1],
            "tokens": datetimespans[span]
        })

    create_sql(corpus_name, out, rows_date, rows_datetime)

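# For reference, each row passed to create_sql() above is a dict of this shape (values illustrative,
# assuming the dateformat module produces zero-padded YYYYMMDD dates and hhmmss times, as the zfill(8)/zfill(6)
# calls suggest):
#
#     {"corpus": "MYCORPUS", "datefrom": "20200101", "dateto": "20201231", "tokens": 54321}
#
# In the datetime table the date fields are 14 digits (YYYYMMDDhhmmss). Texts without date information are
# counted under all-zero spans ("00000000" / "00000000000000").
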
def vrt_scrambled(doc: Document = Document(),
                  out: Export = Export("vrt_scrambled/{doc}.vrt"),
                  chunk: Annotation = Annotation("[cwb.scramble_on]"),
                  chunk_order: Annotation = Annotation("[cwb.scramble_on]:misc.number_random"),
                  token: Annotation = Annotation("<token>"),
                  word: Annotation = Annotation("[export.word]"),
                  annotations: ExportAnnotations = ExportAnnotations("cwb.annotations"),
                  source_annotations: SourceAnnotations = SourceAnnotations("cwb.source_annotations"),
                  remove_namespaces: bool = Config("export.remove_module_namespaces", False),
                  sparv_namespace: str = Config("export.sparv_namespace"),
                  source_namespace: str = Config("export.source_namespace")):
    """Export annotations to vrt in scrambled order."""
    # Get annotation spans, annotations list etc.
    annotation_list, token_attributes, export_names = util.get_annotation_names(
        annotations, source_annotations, doc=doc, token_name=token.name, remove_namespaces=remove_namespaces,
        sparv_namespace=sparv_namespace, source_namespace=source_namespace)
    if chunk not in annotation_list:
        raise util.SparvErrorMessage(
            "The annotation used for scrambling ({}) needs to be included in the output.".format(chunk))
    span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, doc=doc,
                                                              split_overlaps=True)

    # Read words and chunk order
    word_annotation = list(word.read())
    chunk_order_data = list(chunk_order.read())

    # Reorder chunks and open/close tags in correct order
    new_span_positions = util.scramble_spans(span_positions, chunk.name, chunk_order_data)

    # Make vrt format
    vrt_data = create_vrt(new_span_positions, token.name, word_annotation, token_attributes, annotation_dict,
                          export_names)

    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    # Write result to file
    with open(out, "w") as f:
        f.write(vrt_data)
    log.info("Exported: %s", out)

def vrt(doc: Document = Document(),
        out: Export = Export("vrt/{doc}.vrt"),
        token: Annotation = Annotation("<token>"),
        word: Annotation = Annotation("[export.word]"),
        annotations: ExportAnnotations = ExportAnnotations("cwb.annotations"),
        source_annotations: SourceAnnotations = SourceAnnotations("cwb.source_annotations"),
        remove_namespaces: bool = Config("export.remove_module_namespaces", False),
        sparv_namespace: str = Config("export.sparv_namespace"),
        source_namespace: str = Config("export.source_namespace")):
    """Export annotations to vrt.

    - annotations: list of elements:attributes (annotations) to include.
    - source_annotations: list of elements:attributes from the original document to be kept.
      If not specified, everything will be kept.
    """
    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    # Read words
    word_annotation = list(word.read())

    # Get annotation spans, annotations list etc.
    annotation_list, token_attributes, export_names = util.get_annotation_names(
        annotations, source_annotations, doc=doc, token_name=token.name, remove_namespaces=remove_namespaces,
        sparv_namespace=sparv_namespace, source_namespace=source_namespace)
    span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, doc=doc)
    vrt_data = create_vrt(span_positions, token.name, word_annotation, token_attributes, annotation_dict,
                          export_names)

    # Write result to file
    with open(out, "w") as f:
        f.write(vrt_data)
    log.info("Exported: %s", out)

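# For reference, VRT (verticalized text, the CWB input format) places one token per line with tab-separated
# positional attributes, and structural annotations as XML-style tags. The exact columns depend on the
# configured cwb.annotations; the snippet below is purely illustrative (tabs shown as spaces):
#
#     <text id="1">
#     <sentence id="a1b2">
#     Hej     IN      hej
#     !       MAD     !
#     </sentence>
#     </text>
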
def freq_list_simple(corpus: Corpus = Corpus(),
                     docs: AllDocuments = AllDocuments(),
                     word: AnnotationAllDocs = AnnotationAllDocs("<token:word>"),
                     pos: AnnotationAllDocs = AnnotationAllDocs("<token:pos>"),
                     baseform: AnnotationAllDocs = AnnotationAllDocs("<token:baseform>"),
                     out: Export = Export("frequency_list/stats_[metadata.id].csv"),
                     delimiter: str = Config("stats_export.delimiter"),
                     cutoff: int = Config("stats_export.cutoff")):
    """Create a word frequency list for a corpus without sense, lemgram and complemgram annotations."""
    freq_dict = defaultdict(int)

    for doc in docs:
        simple_tokens = word.read_attributes(doc, [word, pos, baseform])

        # Add empty annotations for sense, lemgram and complemgram
        tokens = []
        for w, p, b in simple_tokens:
            tokens.append((w, p, b, "|", "|", "|"))
        update_freqs(tokens, freq_dict)

    write_csv(out, freq_dict, delimiter, cutoff)

def freq_list(corpus: Corpus = Corpus(),
              docs: AllDocuments = AllDocuments(),
              word: AnnotationAllDocs = AnnotationAllDocs("<token:word>"),
              msd: AnnotationAllDocs = AnnotationAllDocs("<token:msd>"),
              baseform: AnnotationAllDocs = AnnotationAllDocs("<token:baseform>"),
              sense: AnnotationAllDocs = AnnotationAllDocs("<token:sense>"),
              lemgram: AnnotationAllDocs = AnnotationAllDocs("<token>:saldo.lemgram"),
              complemgram: AnnotationAllDocs = AnnotationAllDocs("<token>:saldo.complemgram"),
              out: Export = Export("frequency_list/stats_[metadata.id].csv"),
              delimiter: str = Config("stats_export.delimiter"),
              cutoff: int = Config("stats_export.cutoff"),
              include_all_compounds: bool = Config("stats_export.include_all_compounds")):
    """Create a word frequency list for the entire corpus.

    Args:
        corpus (str, optional): The corpus ID. Defaults to Corpus.
        docs (list, optional): The documents belonging to this corpus. Defaults to AllDocuments.
        word (str, optional): Word annotations. Defaults to AnnotationAllDocs("<token:word>").
        msd (str, optional): MSD annotations. Defaults to AnnotationAllDocs("<token:msd>").
        baseform (str, optional): Baseform annotations. Defaults to AnnotationAllDocs("<token:baseform>").
        sense (str, optional): Sense annotations. Defaults to AnnotationAllDocs("<token:sense>").
        lemgram (str, optional): Lemgram annotations. Defaults to AnnotationAllDocs("<token>:saldo.lemgram").
        complemgram (str, optional): Compound lemgram annotations.
            Defaults to AnnotationAllDocs("<token>:saldo.complemgram").
        out (str, optional): The output word frequency file.
            Defaults to Export("frequency_list/stats_[metadata.id].csv").
        delimiter (str, optional): Column delimiter to use in the csv.
            Defaults to Config("stats_export.delimiter").
        cutoff (int, optional): The minimum frequency a word must have in order to be included in the result.
            Defaults to Config("stats_export.cutoff").
        include_all_compounds (bool, optional): Whether to include compound analyses for every word or just for
            the words that are lacking a sense annotation. Defaults to Config("stats_export.include_all_compounds").
    """
    freq_dict = defaultdict(int)

    for doc in docs:
        tokens = word.read_attributes(doc, [word, msd, baseform, sense, lemgram, complemgram])
        update_freqs(tokens, freq_dict, include_all_compounds)

    write_csv(out, freq_dict, delimiter, cutoff)

def relations_sql(corpus: Corpus = Corpus(),
                  out: Export = Export("korp_wordpicture/relations.sql"),
                  relations: AnnotationDataAllDocs = AnnotationDataAllDocs("korp.relations"),
                  docs: Optional[AllDocuments] = AllDocuments(),
                  doclist: str = "",
                  split: bool = False):
    """Calculate statistics of the dependencies and save them to SQL files.

    - corpus is the corpus name.
    - out is the name for the SQL file which will contain the resulting SQL statements.
    - relations is the name of the relations annotation.
    - docs is a list of documents.
    - doclist can be used instead of docs, and should be a file containing the names of docs, one per row.
    - split set to true leads to SQL commands being split into several parts, requiring less memory during
      creation, but installing the data will take much longer.
    """
    db_table = MYSQL_TABLE + "_" + corpus.upper()

    # Relations that will be grouped together
    rel_grouping = {
        "OO": "OBJ",
        "IO": "OBJ",
        "RA": "ADV",
        "TA": "ADV",
        "OA": "ADV"
    }

    MAX_SENTENCES = 5000

    index = 0
    string_index = -1
    strings = {}  # ID -> string table
    freq_index = {}
    sentence_count = defaultdict(int)
    doc_count = 0

    assert (docs or doclist), "Missing source"

    if doclist:
        with open(doclist) as insource:
            docs = [line.strip() for line in insource]

    if len(docs) == 1:
        split = False

    for doc in docs:
        doc_count += 1
        sentences = {}

        if doc_count == 1 or split:
            freq = {}  # Frequency of (head, rel, dep)
            rel_count = defaultdict(int)  # Frequency of (rel)
            head_rel_count = defaultdict(int)  # Frequency of (head, rel)
            dep_rel_count = defaultdict(int)  # Frequency of (rel, dep)

        relations_data = relations.read(doc)

        for triple in relations_data.splitlines():
            head, headpos, rel, dep, deppos, extra, sid, refh, refd, bfhead, bfdep, wfhead, wfdep = \
                triple.split("\t")
            bfhead, bfdep, wfhead, wfdep = int(bfhead), int(bfdep), int(wfhead), int(wfdep)

            if not (head, headpos) in strings:
                string_index += 1
            head = strings.setdefault((head, headpos), string_index)

            if not (dep, deppos, extra) in strings:
                string_index += 1
            dep = strings.setdefault((dep, deppos, extra), string_index)

            rel = rel_grouping.get(rel, rel)

            if (head, rel, dep) in freq_index:
                this_index = freq_index[(head, rel, dep)]
            else:
                this_index = index
                freq_index[(head, rel, dep)] = this_index
                index += 1

            # freq bf/wf
            freq.setdefault(head, {}).setdefault(rel, {}).setdefault(dep, [this_index, 0, [0, 0, 0, 0]])
            freq[head][rel][dep][1] += 1  # Frequency

            if sentence_count[this_index] < MAX_SENTENCES:
                sentences.setdefault(this_index, set())
                sentences[this_index].add((sid, refh, refd))  # Sentence ID and "ref" for both head and dep
                sentence_count[this_index] += 1

            freq[head][rel][dep][2][0] = freq[head][rel][dep][2][0] or bfhead
            freq[head][rel][dep][2][1] = freq[head][rel][dep][2][1] or bfdep
            freq[head][rel][dep][2][2] = freq[head][rel][dep][2][2] or wfhead
            freq[head][rel][dep][2][3] = freq[head][rel][dep][2][3] or wfdep

            if bfhead and bfdep:
                rel_count[rel] += 1
            if (bfhead and bfdep) or wfhead:
                head_rel_count[(head, rel)] += 1
            if (bfhead and bfdep) or wfdep:
                dep_rel_count[(dep, rel)] += 1

        # If not the last file
        if not doc_count == len(docs):
            if split:
                # Don't print string table until the last file
                _write_sql({}, sentences, freq, rel_count, head_rel_count, dep_rel_count, out, db_table, split,
                           first=(doc_count == 1))
            else:
                # Only save sentences data, save the rest for the last file
                _write_sql({}, sentences, {}, {}, {}, {}, out, db_table, split, first=(doc_count == 1))

    # Create the final file, including the string table
    _write_sql(strings, sentences, freq, rel_count, head_rel_count, dep_rel_count, out, db_table, split,
               first=(doc_count == 1), last=True)

    log.info("Done creating SQL files")

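# For reference, each line of the korp.relations data read above is expected to be a tab-separated record with
# thirteen fields, in the order unpacked by the split("\t") call:
#
#     head  headpos  rel  dep  deppos  extra  sid  refh  refd  bfhead  bfdep  wfhead  wfdep
#
# Judging from the int() conversions and the or-accumulation above, bfhead/bfdep/wfhead/wfdep are presumably
# 0/1 flags marking whether the head/dependent should be searchable as a base form and/or word form.
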
def combined(corpus: Corpus = Corpus(),
             out: Export = Export("[xml_export.filename_combined]"),
             docs: AllDocuments = AllDocuments(),
             xml_input: ExportInput = ExportInput("xml_pretty/[xml_export.filename]", all_docs=True)):
    """Combine XML export files into a single XML file."""
    xml_utils.combine(corpus, out, docs, xml_input)

def compressed(out: Export = Export("[xml_export.filename_compressed]"),
               xmlfile: ExportInput = ExportInput("[xml_export.filename_combined]")):
    """Compress combined XML export."""
    xml_utils.compress(xmlfile, out)

def json_export(out: Export = Export("sbx_metadata/[metadata.id].json"),
                corpus_id: Corpus = Corpus(),
                lang: Language = Language(),
                metadata: dict = Config("metadata"),
                sentences: AnnotationCommonData = AnnotationCommonData("misc.<sentence>_count"),
                tokens: AnnotationCommonData = AnnotationCommonData("misc.<token>_count"),
                korp_protected: bool = Config("korp.protected"),
                korp_mode: bool = Config("korp.mode"),
                md_trainingdata: bool = Config("sbx_metadata.trainingdata"),
                md_xml_export: str = Config("sbx_metadata.xml_export"),
                md_stats_export: bool = Config("sbx_metadata.stats_export"),
                md_korp: bool = Config("sbx_metadata.korp"),
                md_downloads: list = Config("sbx_metadata.downloads"),
                md_interface: list = Config("sbx_metadata.interface"),
                md_contact: dict = Config("sbx_metadata.contact_info")):
    """Export corpus metadata to JSON format."""
    md_obj = {}
    md_obj["id"] = corpus_id
    md_obj["type"] = "corpus"
    md_obj["trainingdata"] = md_trainingdata

    # Set language info
    md_obj["lang"] = [{
        "code": lang,
        "name_en": languages.get(part3=lang).name if lang in languages.part3 else lang,
        "name_sv": Language.get(lang).display_name("swe"),
    }]

    # Set name and description
    md_obj["name_en"] = metadata.get("name", {}).get("eng")
    md_obj["name_sv"] = metadata.get("name", {}).get("swe")
    md_obj["description_en"] = metadata.get("description", {}).get("eng")
    md_obj["description_sv"] = metadata.get("description", {}).get("swe")

    # Set downloads
    downloads = []
    downloads.append(metadata_utils.make_standard_xml_export(md_xml_export, corpus_id))
    downloads.append(metadata_utils.make_standard_stats_export(md_stats_export, corpus_id))
    downloads.append(metadata_utils.make_metashare(corpus_id))
    downloads.extend(md_downloads)
    md_obj["downloads"] = [d for d in downloads if d]

    # Set interface
    interface = []
    interface.append(metadata_utils.make_korp(md_korp, corpus_id, korp_mode))
    interface.extend(md_interface)
    md_obj["interface"] = [d for d in interface if d]

    # Set contact info
    if md_contact == "sbx-default":
        md_obj["contact_info"] = metadata_utils.SBX_DEFAULT_CONTACT
    else:
        md_obj["contact_info"] = md_contact

    # Set size
    md_obj["size"] = {"tokens": tokens.read(), "sentences": sentences.read()}

    # Write JSON to file
    os.makedirs(os.path.dirname(out), exist_ok=True)
    json_str = json.dumps(md_obj, ensure_ascii=False, indent=4)
    with open(out, "w") as f:
        f.write(json_str)
    logger.info("Exported: %s", out)

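# For reference, the resulting JSON file has roughly this shape; keys follow the assignments above, while all
# values below are illustrative:
#
#     {
#         "id": "mycorpus",
#         "type": "corpus",
#         "trainingdata": false,
#         "lang": [{"code": "swe", "name_en": "Swedish", "name_sv": "svenska"}],
#         "name_en": "...", "name_sv": "...",
#         "description_en": "...", "description_sv": "...",
#         "downloads": [...],
#         "interface": [...],
#         "contact_info": {...},
#         "size": {"tokens": "12345", "sentences": "678"}
#     }
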
def preserved_format(doc: Document = Document(),
                     text: Text = Text(),
                     docid: AnnotationData = AnnotationData("<docid>"),
                     out: Export = Export("xml_preserved_format/[xml_export.filename_formatted]"),
                     annotations: ExportAnnotations = ExportAnnotations("xml_export.annotations"),
                     source_annotations: SourceAnnotations = SourceAnnotations("xml_export.source_annotations"),
                     header_annotations: SourceAnnotations = SourceAnnotations("xml_export.header_annotations"),
                     remove_namespaces: bool = Config("export.remove_module_namespaces", False),
                     sparv_namespace: str = Config("export.sparv_namespace"),
                     source_namespace: str = Config("export.source_namespace"),
                     include_empty_attributes: bool = Config("xml_export.include_empty_attributes")):
    """Export annotations to XML in export_dir and keep whitespace and indentation from the original file.

    Args:
        doc: Name of the original document.
        text: The corpus text.
        docid: Annotation with document IDs.
        out: Path and filename pattern for resulting file.
        annotations: List of elements:attributes (annotations) to include.
        source_annotations: List of elements:attributes from the original document to be kept.
            If not specified, everything will be kept.
        header_annotations: List of header elements from the original document to include in the export.
            If not specified, all headers will be kept.
        remove_namespaces: Whether to remove module "namespaces" from element and attribute names.
            Disabled by default.
        sparv_namespace: The namespace to be added to all Sparv annotations.
        source_namespace: The namespace to be added to all annotations present in the source.
        include_empty_attributes: Whether to include attributes even when they are empty. Disabled by default.
    """
    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    # Read corpus text and document ID
    corpus_text = text.read()
    docid = docid.read()

    # Get annotation spans, annotations list etc.
    annotation_list, _, export_names = util.get_annotation_names(annotations, source_annotations, doc=doc,
                                                                 remove_namespaces=remove_namespaces,
                                                                 sparv_namespace=sparv_namespace,
                                                                 source_namespace=source_namespace)
    h_annotations, h_export_names = util.get_header_names(header_annotations, doc=doc)
    export_names.update(h_export_names)
    span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, h_annotations,
                                                              doc=doc, flatten=False, split_overlaps=True)
    sorted_positions = [(pos, span[0], span[1]) for pos, spans in sorted(span_positions.items())
                        for span in spans]

    # Root tag sanity check
    if not xml_utils.valid_root(sorted_positions[0], sorted_positions[-1]):
        raise util.SparvErrorMessage("Root tag is missing! If you have manually specified which elements to "
                                     "include, make sure to include an element that encloses all other included "
                                     "elements and text content.")

    # Create root node
    root_span = sorted_positions[0][2]
    root_span.set_node()
    node_stack = []
    last_pos = 0  # Keeps track of the position of the processed text

    for x, (_pos, instruction, span) in enumerate(sorted_positions):
        # Open node: Create child node under the top stack node
        if instruction == "open":
            # Set tail for previous node if necessary
            if last_pos < span.start:
                # Get last closing node in this position
                _, tail_span = [i for i in span_positions[last_pos] if i[0] == "close"][-1]
                tail_span.node.tail = corpus_text[last_pos:span.start]
                last_pos = span.start

            # Handle headers
            if span.is_header:
                header = annotation_dict[span.name][util.HEADER_CONTENTS][span.index]
                header_xml = etree.fromstring(header)
                header_xml.tag = span.export  # Rename element if needed
                span.node = header_xml
                node_stack[-1].node.append(header_xml)
            else:
                if node_stack:  # Don't create root node, it already exists
                    span.set_node(parent_node=node_stack[-1].node)

                xml_utils.add_attrs(span.node, span.name, annotation_dict, export_names, span.index,
                                    include_empty_attributes)
                if span.overlap_id:
                    if sparv_namespace:
                        span.node.set(f"{sparv_namespace}.{util.OVERLAP_ATTR}", f"{docid}-{span.overlap_id}")
                    else:
                        span.node.set(f"{util.SPARV_DEFAULT_NAMESPACE}.{util.OVERLAP_ATTR}",
                                      f"{docid}-{span.overlap_id}")
                node_stack.append(span)

                # Set text if there should be any between this node and the next one
                next_item = sorted_positions[x + 1]
                if next_item[1] == "open" and next_item[2].start > span.start:
                    span.node.text = corpus_text[last_pos:next_item[2].start]
                    last_pos = next_item[2].start

        # Close node
        else:
            if span.is_header:
                continue
            if last_pos < span.end:
                # Set node text if necessary
                if span.start == last_pos:
                    span.node.text = corpus_text[last_pos:span.end]
                # Set tail for previous node if necessary
                else:
                    # Get last closing node in this position
                    _, tail_span = [i for i in span_positions[last_pos] if i[0] == "close"][-1]
                    tail_span.node.tail = corpus_text[last_pos:span.end]
                last_pos = span.end

            # Make sure closing node == top stack node
            assert span == node_stack[-1], "Overlapping elements found: {}".format(node_stack[-2:])
            # Pop stack and move on to next span
            node_stack.pop()

    # Write xml to file
    etree.ElementTree(root_span.node).write(out, encoding="unicode", method="xml", xml_declaration=True)
    log.info("Exported: %s", out)

def csv(doc: Document = Document(),
        out: Export = Export("csv/{doc}.csv"),
        token: Annotation = Annotation("<token>"),
        word: Annotation = Annotation("[export.word]"),
        sentence: Annotation = Annotation("<sentence>"),
        annotations: ExportAnnotations = ExportAnnotations("csv_export.annotations"),
        source_annotations: SourceAnnotations = SourceAnnotations("csv_export.source_annotations"),
        remove_namespaces: bool = Config("export.remove_module_namespaces", False),
        sparv_namespace: str = Config("export.sparv_namespace"),
        source_namespace: str = Config("export.source_namespace"),
        delimiter: str = Config("csv_export.delimiter")):
    """Export annotations to CSV format."""
    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    token_name = token.name

    # Read words
    word_annotation = list(word.read())

    # Get annotation spans, annotations list etc.
    annotation_list, token_attributes, export_names = util.get_annotation_names(
        annotations, source_annotations, doc=doc, token_name=token_name, remove_namespaces=remove_namespaces,
        sparv_namespace=sparv_namespace, source_namespace=source_namespace)
    span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, doc=doc)

    # Make csv header
    csv_data = [_make_header(token_name, token_attributes, export_names, delimiter)]

    # Go through spans_dict and add to csv, line by line
    for _pos, instruction, span in span_positions:
        if instruction == "open":
            # Create token line
            if span.name == token_name:
                csv_data.append(_make_token_line(word_annotation[span.index], token_name, token_attributes,
                                                 annotation_dict, span.index, delimiter))
            # Create line with structural annotation
            else:
                attrs = _make_attrs(span.name, annotation_dict, export_names, span.index)
                for attr in attrs:
                    csv_data.append(f"# {attr}")
                if not attrs:
                    csv_data.append(f"# {span.export}")

        # Insert blank line after each closing sentence
        elif span.name == sentence.name and instruction == "close":
            csv_data.append("")

    # Write result to file
    with open(out, "w") as f:
        f.write("\n".join(csv_data))
    logger.info("Exported: %s", out)

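# For reference, the output is one token per line with a header row, structural annotations as "#"-prefixed
# comment lines, and a blank line after every closing sentence. The exact columns depend on _make_header,
# _make_token_line and the configured delimiter; the sketch below is illustrative only, assuming word and pos
# columns (the delimiter is shown as spaces):
#
#     token   pos
#     # text
#     # sentence
#     Hej     IN
#     !       MAD
#
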
def conllu(doc: Document = Document(),
           out: Export = Export("conll/{doc}.conllu"),
           token: Annotation = Annotation("<token>"),
           sentence: Annotation = Annotation("<sentence>"),
           sentence_id: Annotation = Annotation("[conll_export.conll_fields.sentid]"),
           source_annotations: SourceAnnotations = SourceAnnotations("conll_export.source_annotations"),
           id_ref: Optional[Annotation] = Annotation("[conll_export.conll_fields.id]"),
           form: Optional[Annotation] = Annotation("[export.word]"),
           lemma: Optional[Annotation] = Annotation("[conll_export.conll_fields.lemma]"),
           upos: Optional[Annotation] = Annotation("[conll_export.conll_fields.upos]"),
           xpos: Optional[Annotation] = Annotation("[conll_export.conll_fields.xpos]"),
           feats: Optional[Annotation] = Annotation("[conll_export.conll_fields.feats]"),
           head: Optional[Annotation] = Annotation("[conll_export.conll_fields.head]"),
           deprel: Optional[Annotation] = Annotation("[conll_export.conll_fields.deprel]"),
           deps: Optional[Annotation] = Annotation("[conll_export.conll_fields.deps]"),
           misc: Optional[Annotation] = Annotation("[conll_export.conll_fields.misc]")):
    """Export annotations to CoNLL-U format."""
    # CoNLL-U specification: https://universaldependencies.org/format.html
    # ID: Word index, integer starting at 1 for each new sentence; may be a range for multiword tokens;
    #     may be a decimal number for empty nodes (decimal numbers can be lower than 1 but must be
    #     greater than 0).
    # FORM: Word form or punctuation symbol.
    # LEMMA: Lemma or stem of word form.
    # UPOS: Universal part-of-speech tag.
    # XPOS: Language-specific part-of-speech tag; underscore if not available.
    # FEATS: List of morphological features from the universal feature inventory or from a defined
    #        language-specific extension; underscore if not available.
    # HEAD: Head of the current word, which is either a value of ID or zero (0).
    # DEPREL: Universal dependency relation to the HEAD (root iff HEAD = 0) or a defined language-specific
    #         subtype of one.
    # DEPS: Enhanced dependency graph in the form of a list of head-deprel pairs.
    # MISC: Any other annotation.

    conll_fields = [id_ref, form, lemma, upos, xpos, feats, head, deprel, deps, misc]
    conll_fields = [f if isinstance(f, Annotation) else Annotation() for f in conll_fields]

    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    token_name = token.name

    # Get annotation spans, annotations list etc.
    # TODO: Add structural annotations from 'annotations'? This is a bit annoying though because then we'd have
    # to take annotations as a requirement which results in Sparv having to run all annotations, even the ones
    # we don't want to use here.
    annotations = [sentence, sentence_id, token] + conll_fields
    annotations = [(annot, None) for annot in annotations]

    annotation_list, _, export_names = util.get_annotation_names(annotations, source_annotations,
                                                                 remove_namespaces=True, doc=doc,
                                                                 token_name=token_name)
    span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, doc=doc)

    csv_data = ["# global.columns = ID FORM LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC"]

    # Go through spans_dict and add to csv, line by line
    for _pos, instruction, span in span_positions:
        if instruction == "open":
            # Create token line
            if span.name == token_name:
                csv_data.append(_make_conll_token_line(conll_fields, token_name, annotation_dict, span.index))
            # Create line with structural annotation
            else:
                attrs = _make_attrs(span.name, annotation_dict, export_names, span.index)
                for attr in attrs:
                    csv_data.append(f"# {attr}")
                if not attrs:
                    csv_data.append(f"# {span.export}")

        # Insert blank line after each closing sentence
        elif span.name == sentence.name and instruction == "close":
            csv_data.append("")

    # Insert extra blank line to make CoNLL-U validator happy
    csv_data.append("")

    # Write result to file
    with open(out, "w") as f:
        f.write("\n".join(csv_data))
    logger.info("Exported: %s", out)

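# For reference, a sentence in the resulting CoNLL-U file consists of "#"-prefixed comment lines for the
# structural annotations (e.g. the sentence id), followed by one 10-column token line per token and a blank
# line. The token values below are purely illustrative, and the columns are tab-separated (shown here as
# spaces):
#
#     # <comment lines produced from structural annotations>
#     1   Hon    hon    PRON   PN   _   2   nsubj   _   _
#     2   sover  sova   VERB   VB   _   0   root    _   _
#
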
def metashare(out: Export = Export("sbx_metadata/[metadata.id].xml"),
              template: Model = Model("sbx_metadata/sbx-metashare-template.xml"),
              corpus_id: Corpus = Corpus(),
              lang: Language = Language(),
              metadata: dict = Config("metadata"),
              sentences: AnnotationCommonData = AnnotationCommonData("misc.<sentence>_count"),
              tokens: AnnotationCommonData = AnnotationCommonData("misc.<token>_count"),
              annotations: ExportAnnotations = ExportAnnotations("xml_export.annotations", is_input=False),
              korp_protected: bool = Config("korp.protected"),
              korp_mode: bool = Config("korp.mode"),
              # md_linguality: str = Config("sbx_metadata.linguality"),
              md_script: str = Config("sbx_metadata.script"),
              md_xml_export: str = Config("sbx_metadata.xml_export"),
              md_stats_export: bool = Config("sbx_metadata.stats_export"),
              md_korp: bool = Config("sbx_metadata.korp"),
              md_downloads: list = Config("sbx_metadata.downloads"),
              md_interface: list = Config("sbx_metadata.interface"),
              md_contact: dict = Config("sbx_metadata.contact_info")):
    """Export corpus metadata to META-SHARE format."""
    # Parse template and handle META-SHARE namespace
    xml = etree.parse(template.path).getroot()
    etree.register_namespace("", META_SHARE_URL)
    ns = META_SHARE_NAMESPACE

    # Set identification info
    identificationInfo = xml.find(ns + "identificationInfo")
    for i in identificationInfo.findall(ns + "resourceShortName"):
        i.text = corpus_id
    identificationInfo.find(ns + "identifier").text = corpus_id
    _set_texts(identificationInfo.findall(ns + "resourceName"), metadata.get("name", {}))
    _set_texts(identificationInfo.findall(ns + "description"), metadata.get("description", {}))

    # Set metadata creation date in metadataInfo
    xml.find(".//" + ns + "metadataCreationDate").text = str(time.strftime("%Y-%m-%d"))

    # Set availability
    if korp_protected:
        xml.find(".//" + ns + "availability").text = "available-restrictedUse"
    else:
        xml.find(".//" + ns + "availability").text = "available-unrestrictedUse"

    # Set licenceInfos
    distInfo = xml.find(".//" + ns + "distributionInfo")
    _set_licence_info([metadata_utils.make_standard_xml_export(md_xml_export, corpus_id)], distInfo)
    _set_licence_info([metadata_utils.make_standard_stats_export(md_stats_export, corpus_id)], distInfo)
    _set_licence_info([metadata_utils.make_korp(md_korp, corpus_id, korp_mode)], distInfo, download=False)
    _set_licence_info([metadata_utils.make_metashare(corpus_id)], distInfo)
    # Add non-standard licenseInfos
    _set_licence_info(md_downloads, distInfo)
    _set_licence_info(md_interface, distInfo, download=False)

    # Set contactPerson
    _set_contact_info(md_contact, xml.find(".//" + ns + "contactPerson"))

    # Set samplesLocation
    xml.find(".//" + ns + "samplesLocation").text = f"{SBX_SAMPLES_LOCATION}{corpus_id}"

    # Set lingualityType
    xml.find(".//" + ns + "lingualityType").text = "monolingual"

    # Set languageInfo (languageId, languageName, languageScript)
    xml.find(".//" + ns + "languageId").text = lang
    xml.find(".//" + ns + "languageName").text = languages.get(
        part3=lang).name if lang in languages.part3 else lang
    xml.find(".//" + ns + "languageScript").text = md_script

    # Set sizeInfo
    sizeInfos = xml.findall(".//" + ns + "sizeInfo")
    sizeInfos[0].find(ns + "size").text = tokens.read()
    sizeInfos[1].find(ns + "size").text = sentences.read()

    # Set annotationInfo
    corpusTextInfo = xml.find(".//" + ns + "corpusTextInfo")
    _set_annotation_info(annotations, corpusTextInfo)

    # Write XML to file
    os.makedirs(os.path.dirname(out), exist_ok=True)
    etree.ElementTree(xml).write(out, encoding="unicode", method="xml", xml_declaration=True)
    logger.info("Exported: %s", out)