def blingbring_words(out: Output = Output("<token>:lexical_classes.blingbring",
                                           description="Lexical classes for tokens from Blingbring"),
                     model: Model = Model("[lexical_classes.bb_word_model]"),
                     saldoids: Annotation = Annotation("<token:sense>"),
                     pos: Annotation = Annotation("<token:pos>"),
                     pos_limit: List[str] = ["NN", "VB", "JJ", "AB"],
                     class_set: str = "bring",
                     disambiguate: bool = True,
                     connect_ids: bool = False,
                     delimiter: str = util.DELIM,
                     affix: str = util.AFFIX,
                     scoresep: str = util.SCORESEP,
                     lexicon=None):
    """Blingbring specific wrapper for annotate_words. See annotate_words for more info."""
    # pos_limit="NN VB JJ AB" | None
    if class_set not in ["bring", "roget_head", "roget_subsection", "roget_section", "roget_class"]:
        log.warning("Class '%s' not available. Fallback to 'bring'.", class_set)
        class_set = "bring"

    # Blingbring annotation function
    def annotate_bring(saldo_ids, lexicon, connect_IDs=False, scoresep=util.SCORESEP):
        rogetid = set()
        if saldo_ids:
            for sid in saldo_ids:
                if connect_IDs:
                    rogetid = rogetid.union(set(i + scoresep + sid for i in lexicon.lookup(sid, default=set())))
                else:
                    rogetid = rogetid.union(lexicon.lookup(sid, default=dict()).get(class_set, set()))
        return sorted(rogetid)

    annotate_words(out, model, saldoids, pos, annotate_bring, pos_limit=pos_limit, disambiguate=disambiguate,
                   class_set=class_set, connect_ids=connect_ids, delimiter=delimiter, affix=affix,
                   scoresep=scoresep, lexicon=lexicon)

def swefn_words(out: Output = Output("<token>:lexical_classes.swefn",
                                     description="Lexical classes for tokens from SweFN"),
                model: Model = Model("[lexical_classes.swefn_word_model]"),
                saldoids: Annotation = Annotation("<token:sense>"),
                pos: Annotation = Annotation("<token:pos>"),
                pos_limit: List[str] = ["NN", "VB", "JJ", "AB"],
                disambiguate: bool = True,
                connect_ids: bool = False,
                delimiter: str = util.DELIM,
                affix: str = util.AFFIX,
                scoresep: str = util.SCORESEP,
                lexicon=None):
    """Swefn specific wrapper for annotate_words. See annotate_words for more info."""
    # SweFN annotation function
    def annotate_swefn(saldo_ids, lexicon, connect_IDs=False, scoresep=util.SCORESEP):
        swefnid = set()
        if saldo_ids:
            for sid in saldo_ids:
                if connect_IDs:
                    swefnid = swefnid.union(set(i + scoresep + sid for i in lexicon.lookup(sid, default=set())))
                else:
                    swefnid = swefnid.union(lexicon.lookup(sid, default=set()))
        return sorted(swefnid)

    annotate_words(out, model, saldoids, pos, annotate_swefn, pos_limit=pos_limit, disambiguate=disambiguate,
                   connect_ids=connect_ids, delimiter=delimiter, affix=affix, scoresep=scoresep, lexicon=lexicon)

def lix(text: Annotation = Annotation("<text>"),
        sentence: Annotation = Annotation("<sentence>"),
        word: Annotation = Annotation("<token:word>"),
        pos: Annotation = Annotation("<token:pos>"),
        out: Output = Output("<text>:readability.lix", description="LIX values for text chunks"),
        skip_pos: List[str] = ["MAD", "MID", "PAD"],
        fmt: str = "%.2f"):
    """Create LIX annotation for text."""
    # Read annotation files and get parent_children relations
    text_children, _orphans = text.get_children(sentence)
    word_pos = list(word.read_attributes((word, pos)))
    sentence_children, _orphans = sentence.get_children(word)
    sentence_children = list(sentence_children)

    # Calculate LIX for every text element
    lix_annotation = []
    for text in text_children:
        in_sentences = []
        for sentence_index in text:
            s = sentence_children[sentence_index]
            in_sentences.append(list(actual_words([word_pos[token_index] for token_index in s], skip_pos)))
        lix_annotation.append(fmt % lix_calc(in_sentences))

    out.write(lix_annotation)

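# The lix_calc helper is not shown here; a minimal sketch of the standard LIX formula it is
# assumed to implement (mean sentence length plus the percentage of long words):
def lix_calc_sketch(sentences):
    """Compute LIX from a list of sentences, each a list of word strings (assumed helper)."""
    n_words = sum(len(s) for s in sentences)
    n_sentences = len(sentences)
    n_long = sum(1 for s in sentences for w in s if len(w) > 6)  # "long" words have more than 6 characters
    if n_words == 0 or n_sentences == 0:
        return 0.0
    return n_words / n_sentences + 100 * n_long / n_words
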
def pretty(doc: Document = Document(),
           docid: AnnotationData = AnnotationData("<docid>"),
           out: Export = Export("xml_pretty/[xml_export.filename]"),
           token: Annotation = Annotation("<token>"),
           word: Annotation = Annotation("[export.word]"),
           annotations: ExportAnnotations = ExportAnnotations("xml_export.annotations"),
           source_annotations: SourceAnnotations = SourceAnnotations("xml_export.source_annotations"),
           header_annotations: SourceAnnotations = SourceAnnotations("xml_export.header_annotations"),
           remove_namespaces: bool = Config("export.remove_module_namespaces", False),
           sparv_namespace: str = Config("export.sparv_namespace"),
           source_namespace: str = Config("export.source_namespace"),
           include_empty_attributes: bool = Config("xml_export.include_empty_attributes")):
    """Export annotations to pretty XML in export_dir.

    Args:
        doc: Name of the original document.
        docid: Annotation with document IDs.
        out: Path and filename pattern for resulting file.
        token: Annotation containing the token segments.
        word: Annotation containing the token strings.
        annotations: List of elements:attributes (annotations) to include.
        source_annotations: List of elements:attributes from the original document
            to be kept. If not specified, everything will be kept.
        header_annotations: List of header elements from the original document to include
            in the export. If not specified, all headers will be kept.
        remove_namespaces: Whether to remove module "namespaces" from element and attribute names.
            Disabled by default.
        sparv_namespace: The namespace to be added to all Sparv annotations.
        source_namespace: The namespace to be added to all annotations present in the source.
        include_empty_attributes: Whether to include attributes even when they are empty. Disabled by default.
    """
    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    token_name = token.name

    # Read words and document ID
    word_annotation = list(word.read())
    docid_annotation = docid.read()

    # Get annotation spans, annotations list etc.
    annotation_list, _, export_names = util.get_annotation_names(annotations, source_annotations, doc=doc,
                                                                 token_name=token_name,
                                                                 remove_namespaces=remove_namespaces,
                                                                 sparv_namespace=sparv_namespace,
                                                                 source_namespace=source_namespace)
    h_annotations, h_export_names = util.get_header_names(header_annotations, doc=doc)
    export_names.update(h_export_names)
    span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, h_annotations,
                                                              doc=doc, split_overlaps=True)
    xmlstr = xml_utils.make_pretty_xml(span_positions, annotation_dict, export_names, token_name,
                                       word_annotation, docid_annotation, include_empty_attributes,
                                       sparv_namespace)

    # Write XML to file
    with open(out, mode="w") as outfile:
        outfile.write(xmlstr)
    log.info("Exported: %s", out)

def msdtag(out: Output = Output("<token>:hunpos.msd", cls="token:msd",
                                description="Part-of-speeches with morphological descriptions"),
           word: Annotation = Annotation("<token:word>"),
           sentence: Annotation = Annotation("<sentence>"),
           binary: Binary = Binary("[hunpos.binary]"),
           model: Model = Model("[hunpos.model]"),
           morphtable: Optional[Model] = Model("[hunpos.morphtable]"),
           patterns: Optional[Model] = Model("[hunpos.patterns]"),
           tag_mapping=None,
           encoding: str = util.UTF8):
    """POS/MSD tag using the Hunpos tagger."""
    if isinstance(tag_mapping, str) and tag_mapping:
        tag_mapping = util.tagsets.mappings[tag_mapping]
    elif tag_mapping is None or tag_mapping == "":
        tag_mapping = {}

    pattern_list = []
    if patterns:
        with open(patterns.path, encoding="utf-8") as pat:
            for line in pat:
                if line.strip() and not line.startswith("#"):
                    name, pattern, tags = line.strip().split("\t", 2)
                    pattern_list.append((name, re.compile("^%s$" % pattern), tags))

    def replace_word(w):
        """Replace word with alias if word matches a regex pattern."""
        for p in pattern_list:
            if re.match(p[1], w):
                return "[[%s]]" % p[0]
        return w

    sentences, _orphans = sentence.get_children(word)
    token_word = list(word.read())
    stdin = SENT_SEP.join(TOK_SEP.join(replace_word(token_word[token_index]) for token_index in sent)
                          for sent in sentences)
    args = [model.path]
    if morphtable:
        args.extend(["-m", morphtable.path])

    stdout, _ = util.system.call_binary(binary, args, stdin, encoding=encoding)

    out_annotation = word.create_empty_attribute()
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_index, tagged_token in zip(sent, tagged_sent.strip().split(TOK_SEP)):
            tag = tagged_token.strip().split(TAG_SEP)[TAG_COLUMN]
            tag = tag_mapping.get(tag, tag)
            out_annotation[token_index] = tag

    out.write(out_annotation)

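# The hunpos patterns file parsed in msdtag above is assumed to be tab-separated with three
# columns (alias name, regular expression, tags to suggest); lines starting with "#" are
# comments. A hypothetical entry and how the parser splits it:
example_pattern_line = "ordinal\t[0-9]+:e\tRO.NOM RO.GEN"  # hypothetical line, not from a real model
example_name, example_pattern, example_tags = example_pattern_line.split("\t", 2)
# Tokens matching the pattern are sent to Hunpos as the alias "[[ordinal]]" instead of the raw word form.
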
def struct_to_token(attr: Annotation = Annotation("{struct}:{attr}"),
                    token: Annotation = Annotation("<token>"),
                    out: Output = Output("<token>:misc.from_struct_{struct}_{attr}")):
    """Convert an attribute on a structural annotation into a token attribute."""
    token_parents = token.get_parents(attr)
    attr_values = list(attr.read())
    out_values = [attr_values[p] if p is not None else "" for p in token_parents]
    out.write(out_values)

def vrt_scrambled(doc: Document = Document(),
                  out: Export = Export("vrt_scrambled/{doc}.vrt"),
                  chunk: Annotation = Annotation("[cwb.scramble_on]"),
                  chunk_order: Annotation = Annotation("[cwb.scramble_on]:misc.number_random"),
                  token: Annotation = Annotation("<token>"),
                  word: Annotation = Annotation("[export.word]"),
                  annotations: ExportAnnotations = ExportAnnotations("cwb.annotations"),
                  source_annotations: SourceAnnotations = SourceAnnotations("cwb.source_annotations"),
                  remove_namespaces: bool = Config("export.remove_module_namespaces", False),
                  sparv_namespace: str = Config("export.sparv_namespace"),
                  source_namespace: str = Config("export.source_namespace")):
    """Export annotations to vrt in scrambled order."""
    # Get annotation spans, annotations list etc.
    annotation_list, token_attributes, export_names = util.get_annotation_names(
        annotations, source_annotations, doc=doc, token_name=token.name,
        remove_namespaces=remove_namespaces, sparv_namespace=sparv_namespace, source_namespace=source_namespace)
    if chunk not in annotation_list:
        raise util.SparvErrorMessage(
            "The annotation used for scrambling ({}) needs to be included in the output.".format(chunk))
    span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, doc=doc,
                                                              split_overlaps=True)

    # Read words and scramble order
    word_annotation = list(word.read())
    chunk_order_data = list(chunk_order.read())

    # Reorder chunks and open/close tags in correct order
    new_span_positions = util.scramble_spans(span_positions, chunk.name, chunk_order_data)

    # Make vrt format
    vrt_data = create_vrt(new_span_positions, token.name, word_annotation, token_attributes, annotation_dict,
                          export_names)

    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    # Write result to file
    with open(out, "w") as f:
        f.write(vrt_data)
    log.info("Exported: %s", out)

def annotate(lang: Language = Language(),
             model: Model = Model("[treetagger.model]"),
             tt_binary: Binary = Binary("[treetagger.binary]"),
             out_upos: Output = Output("<token>:treetagger.upos", cls="token:upos",
                                       description="Part-of-speeches in UD"),
             out_pos: Output = Output("<token>:treetagger.pos", cls="token:pos",
                                      description="Part-of-speeches from TreeTagger"),
             out_baseform: Output = Output("<token>:treetagger.baseform", description="Baseforms from TreeTagger"),
             word: Annotation = Annotation("<token:word>"),
             sentence: Annotation = Annotation("<sentence>"),
             encoding: str = util.UTF8):
    """POS/MSD tag and lemmatize using TreeTagger."""
    sentences, _orphans = sentence.get_children(word)
    word_annotation = list(word.read())
    stdin = SENT_SEP.join(TOK_SEP.join(word_annotation[token_index] for token_index in sent)
                          for sent in sentences)
    args = ["-token", "-lemma", "-no-unknown", "-eos-tag", "<eos>", model.path]

    stdout, stderr = util.system.call_binary(tt_binary, args, stdin, encoding=encoding)
    log.debug("Message from TreeTagger:\n%s", stderr)

    # Write pos and upos annotations.
    out_upos_annotation = word.create_empty_attribute()
    out_pos_annotation = word.create_empty_attribute()
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_id, tagged_token in zip(sent, tagged_sent.strip().split(TOK_SEP)):
            tag = tagged_token.strip().split(TAG_SEP)[TAG_COLUMN]
            out_pos_annotation[token_id] = tag
            out_upos_annotation[token_id] = util.tagsets.pos_to_upos(tag, lang, TAG_SETS.get(lang))
    out_pos.write(out_pos_annotation)
    out_upos.write(out_upos_annotation)

    # Write lemma annotations.
    out_lemma_annotation = word.create_empty_attribute()
    for sent, tagged_sent in zip(sentences, stdout.strip().split(SENT_SEP)):
        for token_id, tagged_token in zip(sent, tagged_sent.strip().split(TOK_SEP)):
            lem = tagged_token.strip().split(TAG_SEP)[LEM_COLUMN]
            out_lemma_annotation[token_id] = lem
    out_baseform.write(out_lemma_annotation)

def contextual(out: Output = Output("{chunk}:geo.geo_context", description="Geographical places with coordinates"),
               chunk: Annotation = Annotation("{chunk}"),
               context: Annotation = Annotation("[geo.context_chunk]"),
               ne_type: Annotation = Annotation("swener.ne:swener.type"),
               ne_subtype: Annotation = Annotation("swener.ne:swener.subtype"),
               ne_name: Annotation = Annotation("swener.ne:swener.name"),
               model: Model = Model("[geo.model]"),
               method: str = "populous",
               language: list = []):
    """Annotate chunks with location data, based on locations contained within the text.

    context = text chunk to use for disambiguating places (when applicable).
    chunk = text chunk to which the annotation will be added.
    """
    model = load_model(model, language=language)

    ne_type_annotation = list(ne_type.read())
    ne_subtype_annotation = list(ne_subtype.read())
    ne_name_annotation = list(ne_name.read())

    children_context_chunk, _orphans = context.get_children(chunk)
    children_chunk_ne, _orphans = chunk.get_children(ne_type)

    out_annotation = chunk.create_empty_attribute()

    for chunks in children_context_chunk:
        all_locations = []  # TODO: Maybe not needed for anything?
        context_locations = []
        chunk_locations = defaultdict(list)

        for ch in chunks:
            for n in children_chunk_ne[ch]:
                if ne_type_annotation[n] == "LOC" and "PPL" in ne_subtype_annotation[n]:
                    location_text = ne_name_annotation[n].replace("\n", " ").replace("  ", " ")
                    location_data = model.get(location_text.lower())
                    if location_data:
                        all_locations.append((location_text, list(location_data)))
                        context_locations.append((location_text, list(location_data)))
                        chunk_locations[ch].append((location_text, list(location_data)))
                    else:
                        pass
                        # log.info("No location found for %s" % ne_name_annotation[n].replace("%", "%%"))

        chunk_locations = most_populous(chunk_locations)

        for c in chunks:
            out_annotation[c] = _format_location(chunk_locations.get(c, ()))

    out.write(out_annotation)

def number_relative(out: Output = Output("{annotation}:misc.number_rel_{parent}"),
                    parent: Annotation = Annotation("{parent}"),
                    child: Annotation = Annotation("{annotation}"),
                    prefix: str = "",
                    zfill: bool = False,
                    start: int = START_DEFAULT):
    """Number chunks by their relative position within a parent."""
    parent_children, _orphans = parent.get_children(child)

    out.write(("{prefix}{nr:0{length}d}".format(prefix=prefix,
                                                length=len(str(len(parent) - 1 + start)) if zfill else 0,
                                                nr=cnr)
               for parent in parent_children
               for cnr, _index in enumerate(parent, start)))

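# A quick illustration of the numbering format used above (not part of the module): with
# prefix="s", zfill=True and start=1, a parent with 12 children gives a zero-padding width of
# len(str(12 - 1 + 1)) == 2, so its children are numbered "s01", "s02", ..., "s12".
assert "{prefix}{nr:0{length}d}".format(prefix="s", length=2, nr=3) == "s03"
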
def ufeatstag(out: Output = Output("<token>:misc.ufeats", cls="token:ufeats",
                                   description="Universal morphological features"),
              pos: Annotation = Annotation("<token:pos>"),
              msd: Annotation = Annotation("<token:msd>")):
    """Convert SUC MSD tags to universal features."""
    pos_tags = pos.read()
    msd_tags = msd.read()
    out_annotation = []

    for pos_tag, msd_tag in zip(pos_tags, msd_tags):
        feats = util.tagsets.suc_to_feats(pos_tag, msd_tag)
        out_annotation.append(util.cwbset(feats))

    out.write(out_annotation)

def diapivot_annotate(out: Output = Output("<token>:hist.diapivot",
                                           description="SALDO IDs corresponding to lemgrams"),
                      lemgram: Annotation = Annotation("<token>:saldo.lemgram"),
                      model: Model = Model("hist/diapivot.pickle")):
    """Annotate each lemgram with its corresponding saldo_id according to model.

    Args:
        out (str, optional): Resulting annotation file.
            Defaults to Output("<token>:hist.diapivot", description="SALDO IDs corresponding to lemgrams").
        lemgram (str, optional): Existing lemgram annotation. Defaults to Annotation("<token>:saldo.lemgram").
        model (str, optional): Crosslink model. Defaults to Model("hist/diapivot.pickle").
    """
    lexicon = PivotLexicon(model)
    lemgram_annotation = list(lemgram.read())

    out_annotation = []

    for lemgrams in lemgram_annotation:
        saldo_ids = []
        for lemgram in lemgrams.split(util.DELIM):
            s_i = lexicon.get_exactMatch(lemgram)
            if s_i:
                saldo_ids += [s_i]
        out_annotation.append(util.AFFIX + util.DELIM.join(set(saldo_ids)) + util.AFFIX if saldo_ids else util.AFFIX)

    out.write(out_annotation)

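# Sketch of the output string built above, assuming the usual CWB set notation where
# util.DELIM and util.AFFIX are both "|": two matched SALDO IDs come out as "|id_1|id_2|",
# while a token without matches gets the bare affix "|". The IDs below are hypothetical.
EXAMPLE_DELIM, EXAMPLE_AFFIX = "|", "|"  # assumed values of util.DELIM / util.AFFIX
example_set = EXAMPLE_AFFIX + EXAMPLE_DELIM.join(["kasta..1", "kasta..2"]) + EXAMPLE_AFFIX
assert example_set == "|kasta..1|kasta..2|"
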
def annotate(corpus_text: Text = Text(),
             lang: Language = Language(),
             conf_file: Model = Model("[freeling.conf]"),
             fl_binary: Binary = Binary("[freeling.binary]"),
             sentence_chunk: Optional[Annotation] = Annotation("[freeling.sentence_chunk]"),
             out_token: Output = Output("freeling.token", cls="token", description="Token segments"),
             out_word: Output = Output("<token>:freeling.word", cls="token:word", description="Token strings"),
             out_baseform: Output = Output("<token>:freeling.baseform", description="Baseforms from FreeLing"),
             out_upos: Output = Output("<token>:freeling.upos", cls="token:upos",
                                       description="Part-of-speeches in UD"),
             out_pos: Output = Output("<token>:freeling.pos", cls="token:pos",
                                      description="Part-of-speeches from FreeLing"),
             out_sentence: Optional[Output] = Output("freeling.sentence", cls="sentence",
                                                     description="Sentence segments"),
             sentence_annotation: Optional[Annotation] = Annotation("[freeling.sentence_annotation]")):
    """Run FreeLing and output sentences, tokens, baseforms, upos and pos."""
    main(corpus_text, lang, conf_file, fl_binary, sentence_chunk, out_token, out_word, out_baseform, out_upos,
         out_pos, out_sentence, sentence_annotation)

def swefn_text(out: Output = Output("<text>:lexical_classes.swefn",
                                    description="Lexical classes for text chunks from SweFN"),
               lexical_classes_token: Annotation = Annotation("<token>:lexical_classes.swefn"),
               text: Annotation = Annotation("<text>"),
               token: Annotation = Annotation("<token>"),
               saldoids: Optional[Annotation] = Annotation("<token:sense>"),
               cutoff: int = 3,
               types: bool = False,
               delimiter: str = util.DELIM,
               affix: str = util.AFFIX,
               freq_model: Model = Model("[lexical_classes.swefn_freq_model]"),
               decimals: int = 3):
    """Annotate text chunks with SweFN classes."""
    annotate_text(out=out, lexical_classes_token=lexical_classes_token, text=text, token=token,
                  saldoids=saldoids, cutoff=cutoff, types=types, delimiter=delimiter, affix=affix,
                  freq_model=freq_model, decimals=decimals)

def uppercase(word: Annotation = Annotation("<token:word>"),
              out: Output = Output("<token>:uppercase.upper"),
              # some_config_variable: str = Config("uppercase.some_setting")
              ):
    """Convert to uppercase."""
    out.write([val.upper() for val in word.read()])

def postag(out: Output = Output("<token>:hunpos.pos", cls="token:pos", description="Part-of-speech tags"),
           msd: Annotation = Annotation("<token>:hunpos.msd")):
    """Extract POS from MSD."""
    from sparv.modules.misc import misc
    misc.select(out, msd, index=0, separator=".")

def word_weights(doc: str = Document(),
                 model: str = Model("[vw_topic_modelling.model]"),
                 word: str = Annotation("<token:word>"),
                 pos: str = Annotation("<token:pos>"),
                 out: str = Output("<token>:vw_topic_modelling:label_weights",
                                   description="Label weights per word")):
    """
    Report the weight for each label for each word.

    Both model and model.json must exist. See --train and --predict.
    """
    m_json = json.load(open(model + ".json"))
    index_to_label = m_json["index_to_label"]
    min_word_length = int(m_json["min_word_length"] or "0")
    banned_pos = (m_json["banned_pos"] or "").split()
    words = list(util.read_annotation(doc, word))
    poss = util.read_annotation(doc, pos) if pos else []
    data = (Example(None, vw_normalize(word))
            for n, word in enumerate(words)
            if len(word) >= min_word_length
            if not pos or poss[n] not in banned_pos)
    weights = defaultdict(list)
    with tempfile.NamedTemporaryFile() as tmp:
        args = ["--initial_regressor", model, "--invert_hash", tmp.name]
        for _ in vw_predict(args, data):
            pass
        for line in open(tmp.name, "r").readlines():
            # allmänna[1]:14342849:0.0139527
            colons = line.split(":")
            if len(colons) == 3:
                word, _hash, weight = colons
                if word[-1] == "]":
                    bracesplit = word.rsplit("[", 1)
                else:
                    bracesplit = []
                if len(bracesplit) == 2:
                    word, index = bracesplit
                    n = int(index[:-1]) + 1
                else:
                    n = 1
                weights[word].append(index_to_label[str(n)] + ":" + weight)
    ws = (util.cwbset(weights[vw_normalize(word)])
          for word in words
          if vw_normalize(word) in weights)
    util.write_annotation(doc, out, ws)

def number_by_parent(out: Output = Output("{annotation}:misc.number_by_parent_{parent_annotation}__{parent_attribute}"),
                     chunk: Annotation = Annotation("{annotation}"),
                     parent_order: Annotation = Annotation("{parent_annotation}:{parent_attribute}"),
                     prefix: str = "",
                     zfill: bool = False,
                     start: int = START_DEFAULT):
    """Number chunks by (parent_order, chunk order)."""
    parent_children, _orphans = parent_order.get_children(chunk)

    child_order = {child_index: (parent_nr, child_index)
                   for parent_index, parent_nr in enumerate(parent_order.read())
                   for child_index in parent_children[parent_index]}

    def _order(index, _value):
        return child_order.get(index)

    _read_chunks_and_write_new_ordering(out, chunk, _order, prefix, zfill, start)

def number_by_attribute(out: Output = Output("{annotation}:misc.number_by_{attribute}"),
                        chunk: Annotation = Annotation("{annotation}:{attribute}"),
                        prefix: str = "",
                        zfill: bool = False,
                        start: int = START_DEFAULT):
    """Number chunks, with the order determined by an attribute."""
    def _order(_index, value):
        return _natural_sorting(value)

    _read_chunks_and_write_new_ordering(out, chunk, _order, prefix, zfill, start)

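# The _natural_sorting helper is not shown here; a minimal sketch of what such a key function
# typically does, splitting the value into digit and non-digit runs so that "page10" sorts
# after "page2" (the exact implementation in this module may differ):
import re

def _natural_sorting_sketch(value):
    """Return a sort key where numeric runs compare as integers (assumed helper)."""
    return tuple((0, int(part)) if part.isdigit() else (1, part.lower())
                 for part in re.split(r"(\d+)", value) if part)
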
def nominal_ratio(text: Annotation = Annotation("<text>"),
                  pos: Annotation = Annotation("<token:pos>"),
                  out: Output = Output("<text>:readability.nk", description="Nominal ratios for text chunks"),
                  noun_pos: List[str] = ["NN", "PP", "PC"],
                  verb_pos: List[str] = ["PN", "AB", "VB"],
                  fmt: str = "%.2f"):
    """Create nominal ratio annotation for text."""
    text_children, _orphans = text.get_children(pos)
    pos_annotation = list(pos.read())

    # Calculate nominal ratio for every text element
    nk_annotation = []
    for text in text_children:
        in_pos = [pos_annotation[token_index] for token_index in text]
        nk_annotation.append(fmt % nominal_ratio_calc(in_pos, noun_pos, verb_pos))
    out.write(nk_annotation)

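# nominal_ratio_calc is not shown here; a minimal sketch of the standard nominal ratio, i.e.
# the count of nominal tags (nouns, prepositions, participles) divided by the count of verbal
# tags (pronouns, adverbs, verbs):
def nominal_ratio_calc_sketch(pos_tags, noun_pos, verb_pos):
    """Compute the nominal ratio from a list of POS tags (assumed helper)."""
    nouns = sum(1 for tag in pos_tags if tag in noun_pos)
    verbs = sum(1 for tag in pos_tags if tag in verb_pos)
    return nouns / verbs if verbs else 0.0
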
def vrt(doc: Document = Document(),
        out: Export = Export("vrt/{doc}.vrt"),
        token: Annotation = Annotation("<token>"),
        word: Annotation = Annotation("[export.word]"),
        annotations: ExportAnnotations = ExportAnnotations("cwb.annotations"),
        source_annotations: SourceAnnotations = SourceAnnotations("cwb.source_annotations"),
        remove_namespaces: bool = Config("export.remove_module_namespaces", False),
        sparv_namespace: str = Config("export.sparv_namespace"),
        source_namespace: str = Config("export.source_namespace")):
    """Export annotations to vrt.

    - annotations: list of elements:attributes (annotations) to include.
    - source_annotations: list of elements:attributes from the original document
      to be kept. If not specified, everything will be kept.
    """
    # Create export dir
    os.makedirs(os.path.dirname(out), exist_ok=True)

    # Read words
    word_annotation = list(word.read())

    # Get annotation spans, annotations list etc.
    annotation_list, token_attributes, export_names = util.get_annotation_names(
        annotations, source_annotations, doc=doc, token_name=token.name,
        remove_namespaces=remove_namespaces, sparv_namespace=sparv_namespace, source_namespace=source_namespace)
    span_positions, annotation_dict = util.gather_annotations(annotation_list, export_names, doc=doc)
    vrt_data = create_vrt(span_positions, token.name, word_annotation, token_attributes, annotation_dict,
                          export_names)

    # Write result to file
    with open(out, "w") as f:
        f.write(vrt_data)
    log.info("Exported: %s", out)

def annotate_full(corpus_text: Text = Text(),
                  lang: Language = Language(),
                  conf_file: Model = Model("[freeling.conf]"),
                  fl_binary: Binary = Binary("[freeling.binary]"),
                  sentence_chunk: Annotation = Annotation("[freeling.sentence_chunk]"),
                  out_token: Output = Output("freeling.token", cls="token", description="Token segments"),
                  out_word: Output = Output("<token>:freeling.word", cls="token:word", description="Token strings"),
                  out_baseform: Output = Output("<token>:freeling.baseform", description="Baseforms from FreeLing"),
                  out_upos: Output = Output("<token>:freeling.upos", cls="token:upos",
                                            description="Part-of-speeches in UD"),
                  out_pos: Output = Output("<token>:freeling.pos", cls="token:pos",
                                           description="Part-of-speeches from FreeLing"),
                  out_ne_type: Output = Output("<token>:freeling.ne_type", cls="token:named_entity_type",
                                               description="Named entity types from FreeLing"),
                  out_sentence: Optional[Output] = Output("freeling.sentence", cls="sentence",
                                                          description="Sentence segments"),
                  sentence_annotation: Optional[Annotation] = Annotation("[freeling.sentence_annotation]")):
    """Run FreeLing and output the usual annotations plus named entity types."""
    main(corpus_text, lang, conf_file, fl_binary, sentence_chunk, out_token, out_word, out_baseform, out_upos,
         out_pos, out_sentence, sentence_annotation, out_ne_type)

def ovix(text: Annotation = Annotation("<text>"),
         word: Annotation = Annotation("<token:word>"),
         pos: Annotation = Annotation("<token:pos>"),
         out: Output = Output("<text>:readability.ovix", description="OVIX values for text chunks"),
         skip_pos: List[str] = ["MAD", "MID", "PAD"],
         fmt: str = "%.2f"):
    """Create OVIX annotation for text."""
    text_children, _orphans = text.get_children(word)
    word_pos = list(word.read_attributes((word, pos)))

    # Calculate OVIX for every text element
    ovix_annotation = []
    for text in text_children:
        in_words = list(actual_words([word_pos[token_index] for token_index in text], skip_pos))
        ovix_annotation.append(fmt % ovix_calc(in_words))

    out.write(ovix_annotation)

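# ovix_calc is not shown here; a minimal sketch of the standard OVIX word-variation index,
# log(tokens) / log(2 - log(types) / log(tokens)), computed over the filtered word list
# (the guard clauses are an assumption to avoid division by zero):
import math

def ovix_calc_sketch(words):
    """Compute OVIX from a list of word strings (assumed helper)."""
    tokens = len(words)
    types = len(set(words))
    if tokens <= 1 or types == tokens:  # log(1) == 0 would make the formula undefined
        return 0.0
    return math.log(tokens) / math.log(2 - math.log(types) / math.log(tokens))
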
def metadata(out: Output = Output("{chunk}:geo.geo_metadata", description="Geographical places with coordinates"),
             chunk: Annotation = Annotation("{chunk}"),
             source: Annotation = Annotation("[geo.metadata_source]"),
             model: Model = Model("[geo.model]"),
             method: str = "populous",
             language: list = []):
    """Get location data based on metadata containing location names."""
    geomodel = load_model(model, language=language)

    same_target_source = chunk.split()[0] == source.split()[0]
    chunk_annotation = list(chunk.read())
    source_annotation = list(source.read())

    # If location source and target chunk are not the same, we need
    # to find the parent/child relations between them.
    if not same_target_source:
        target_source_parents = list(source.get_parents(chunk))

    chunk_locations = {}

    for i, _ in enumerate(chunk_annotation):
        if same_target_source:
            location_source = source_annotation[i]
        else:
            location_source = source_annotation[target_source_parents[i]] \
                if target_source_parents[i] is not None else None

        if location_source:
            location_data = geomodel.get(location_source.strip().lower())
            if location_data:
                chunk_locations[i] = [(location_source, list(location_data))]
        else:
            chunk_locations[i] = []

    chunk_locations = most_populous(chunk_locations)

    out_annotation = chunk.create_empty_attribute()
    for c in chunk_locations:
        out_annotation[c] = _format_location(chunk_locations.get(c, ()))

    out.write(out_annotation)

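# most_populous is not shown here; a minimal sketch of what the "populous" disambiguation
# method is assumed to do: for every chunk, keep only the candidate location with the largest
# population. The position of the population field inside the location data is an assumption.
def most_populous_sketch(chunk_locations):
    """Map each chunk to its single most populous (name, data) candidate (assumed helper)."""
    result = {}
    for chunk_index, candidates in chunk_locations.items():
        if candidates:
            # candidates are (name, data) pairs; population is assumed to be the last data field
            result[chunk_index] = max(candidates, key=lambda cand: cand[1][-1])
    return result
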
def upostag(out: Output = Output("<token>:misc.upos", cls="token:upos", description="Part-of-speeches in UD"),
            pos: Annotation = Annotation("<token:pos>")):
    """Convert SUC POS tags to UPOS."""
    pos_tags = pos.read()
    out_annotation = []

    for tag in pos_tags:
        out_annotation.append(util.tagsets.pos_to_upos(tag, "swe", "SUC"))

    out.write(out_annotation)

def number_by_position(out: Output = Output("{annotation}:misc.number_position"),
                       chunk: Annotation = Annotation("{annotation}"),
                       prefix: str = "",
                       zfill: bool = False,
                       start: int = START_DEFAULT):
    """Number chunks by their position."""
    spans = list(chunk.read_spans())

    def _order(index, _value):
        return spans[index]

    _read_chunks_and_write_new_ordering(out, chunk, _order, prefix, zfill, start)

def timespan_sql_with_dateinfo(corpus: Corpus = Corpus(),
                               out: Export = Export("korp_timespan/timespan.sql"),
                               docs: AllDocuments = AllDocuments(),
                               token: AnnotationAllDocs = AnnotationAllDocs("<token>"),
                               datefrom: AnnotationAllDocs = AnnotationAllDocs("<text>:dateformat.datefrom"),
                               dateto: AnnotationAllDocs = AnnotationAllDocs("<text>:dateformat.dateto"),
                               timefrom: AnnotationAllDocs = AnnotationAllDocs("<text>:dateformat.timefrom"),
                               timeto: AnnotationAllDocs = AnnotationAllDocs("<text>:dateformat.timeto")):
    """Create timespan SQL data for use in Korp."""
    corpus_name = corpus.upper()
    datespans = defaultdict(int)
    datetimespans = defaultdict(int)

    for doc in docs:
        text_tokens, orphans = Annotation(datefrom.name, doc=doc).get_children(token)
        if orphans:
            datespans[("0" * 8, "0" * 8)] += len(orphans)
            datetimespans[("0" * 14, "0" * 14)] += len(orphans)
        dateinfo = datefrom.read_attributes(doc, (datefrom, dateto, timefrom, timeto))
        for text in text_tokens:
            d = next(dateinfo)
            datespans[(d[0].zfill(8), d[1].zfill(8))] += len(text)
            datetimespans[(d[0].zfill(8) + d[2].zfill(6), d[1].zfill(8) + d[3].zfill(6))] += len(text)

    rows_date = []
    rows_datetime = []

    for span in datespans:
        rows_date.append({
            "corpus": corpus_name,
            "datefrom": span[0],
            "dateto": span[1],
            "tokens": datespans[span]
        })

    for span in datetimespans:
        rows_datetime.append({
            "corpus": corpus_name,
            "datefrom": span[0],
            "dateto": span[1],
            "tokens": datetimespans[span]
        })

    create_sql(corpus_name, out, rows_date, rows_datetime)

def predict(doc: str = Document(),
            model: str = Model("[vw_topic_modelling.model]"),
            modeljson: str = Model("[vw_topic_modelling.modeljson]"),
            order=None,   # passed through to texts(); no Sparv default
            struct=None,  # passed through to texts(); no Sparv default
            parent: str = Annotation("{chunk}"),
            word: str = Annotation("<token:word>"),
            out: str = Output("{chunk}:vw_topic_modelling.prediction", description="Predicted attributes"),
            pos: str = Annotation("<token:pos>"),
            raw: bool = False):
    """Predict a structural attribute."""
    raw = raw == "true"

    m_json = json.load(open(modeljson))

    data = (
        Example(None, text.words, text.span)
        for text in texts([(order, struct, parent, word, pos)],
                          map_label=lambda _: "?",
                          min_word_length=m_json["min_word_length"],
                          banned_pos=m_json["banned_pos"])
    )

    index_to_label = m_json["index_to_label"]

    args = ["--initial_regressor", model]

    if raw:
        predictions = (
            util.cwbset(index_to_label[str(s)] + ":" + str(v) for s, v in ss)
            for ss, _span in vw_predict(args, data, raw=True)
        )
    else:
        predictions = (
            index_to_label[str(s)]
            for s, _span in vw_predict(args, data)
        )

    util.write_annotation(doc, out, predictions)

def dateformat(in_from: Annotation = Annotation("[dateformat.datetime_from]"),
               in_to: Optional[Annotation] = Annotation("[dateformat.datetime_to]"),
               out_from: Output = Output("[dateformat.out_annotation]:dateformat.datefrom",
                                         description="From-dates"),
               out_to: Optional[Output] = Output("[dateformat.out_annotation]:dateformat.dateto",
                                                 description="To-dates"),
               informat: str = Config("dateformat.datetime_informat"),
               outformat: str = Config("dateformat.date_outformat"),
               splitter: Optional[str] = Config("dateformat.splitter", None),
               regex: Optional[str] = Config("dateformat.regex", None)):
    """Convert existing dates/times to specified date output format.

    http://docs.python.org/library/datetime.html#strftime-and-strptime-behavior

    Args:
        in_from (str, optional): Annotation containing from-dates (and times).
            Defaults to Annotation("[dateformat.datetime_from]").
        in_to (Optional[str], optional): Annotation containing to-dates.
            Defaults to Annotation("[dateformat.datetime_to]").
        out_from (str, optional): Annotation with from-times to be written.
            Defaults to Output("[dateformat.out_annotation]:dateformat.datefrom", description="From-dates").
        out_to (Optional[str], optional): Annotation with to-times to be written.
            Defaults to Output("[dateformat.out_annotation]:dateformat.dateto", description="To-dates").
        informat (str, optional): Format of the in_from and in_to dates/times.
            Several formats can be specified separated by |. They will be tried in order.
            Defaults to Config("dateformat.datetime_informat").
        outformat (str, optional): Desired format of the out_from and out_to dates.
            Several formats can be specified separated by |. They will be tied to their respective in-format.
            Defaults to Config("dateformat.date_outformat", "%Y%m%d").
        splitter (str, optional): One or more characters separating two dates in 'in_from',
            treating them as from-date and to-date. Defaults to Config("dateformat.splitter", None).
        regex (str, optional): Regular expression with a catching group whose content will be used
            in the parsing instead of the whole string. Defaults to Config("dateformat.regex", None).
    """
    _formatter(in_from, in_to, out_from, out_to, informat, outformat, splitter, regex)

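# _formatter is not shown here; a minimal sketch, assuming it tries the "|"-separated
# in-formats in order with datetime.strptime and renders the first one that parses using the
# paired out-format (the docstring above says out-formats are tied to their in-formats):
import datetime

def _parse_with_fallback_sketch(value, informat, outformat):
    """Try each in-format until one parses, then format with the paired out-format (assumed helper)."""
    informats = informat.split("|")
    outformats = outformat.split("|")
    for i, fmt in enumerate(informats):
        try:
            parsed = datetime.datetime.strptime(value, fmt)
        except ValueError:
            continue
        return parsed.strftime(outformats[i] if len(outformats) > 1 else outformats[0])
    return ""
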
def renumber_by_shuffle(out: Output = Output("{annotation}:misc.renumber_by_shuffle_{attribute}"),
                        chunk: Annotation = Annotation("{annotation}:{attribute}"),
                        prefix: str = "",
                        zfill: bool = False,
                        start: int = START_DEFAULT):
    """Renumber already numbered chunks, in new random order.

    Retains the connection between parallelly numbered chunks by using the values as random seed.
    """
    def _order(_index, value):
        random.seed(int(hexlify(value.encode()), 16))
        return random.random(), _natural_sorting(value)

    _read_chunks_and_write_new_ordering(out, chunk, _order, prefix, zfill, start)