def _add_annotation(annotations, text_value, annotation_type, text, offset):
    """Append an annotation over *text* to *annotations* and write the text
    (followed by a blank line) to *text_value*.

    For 'Title' and 'Abstract' annotations an upper-cased header line is
    written and annotated first. Returns the character offset after the
    written text; if *text* is None nothing is written and *offset* is
    returned unchanged.
    """
    if text is None:
        return offset
    # Title and Abstract sections get an explicit header line of their own.
    if annotation_type in ('Title', 'Abstract'):
        header = annotation_type.upper()
        annotations.append(Annotation({
            "id": IdentifierFactory.next_id('Header'),
            "@type": vocab('Header'),
            "start": offset,
            "end": offset + len(header)}))
        text_value.write(header + u"\n\n")
        offset += len(header) + 2
    annotations.append(Annotation({
        "id": IdentifierFactory.next_id(annotation_type),
        "@type": vocab(annotation_type),
        "start": offset,
        "end": offset + len(text)}))
    text_value.write(text + u"\n\n")
    # Account for the text plus the two newline characters written after it.
    return offset + len(text) + 2
def _add_technologies(self):
    """Take the technology ontology and try to add each element to the
    technologies index of this document. A term is added only if it occurs
    in the text. This is done rather inefficiently by searching the entire
    text for each technology, but on a 30K LIF document this takes less
    than 0.01 seconds for 100 technology terms, so we can live with it."""
    technologies = self.annotations.technologies
    if technologies:
        # Continue numbering after the highest existing identifier;
        # identifiers look like "t<NUM>", so strip the leading letter.
        next_id = max([int(a.id[1:]) for a in technologies.annotations]) + 1
        for term in self.ontology.technologies:
            # re.escape protects terms containing regex metacharacters
            # (e.g. "C++") from being misread as pattern syntax.
            searchterm = r'\b%s\b' % re.escape(term)
            matches = list(
                re.finditer(searchterm, self.annotations.text, flags=re.I))
            for match in matches:
                json_obj = {
                    "id": "t%d" % next_id,
                    "@type": 'http://vocab.lappsgrid.org/Technology',
                    "start": match.start(),
                    "end": match.end()}
                next_id += 1
                anno = Annotation(json_obj)
                anno.text = term
                technologies.add(anno)
def markable_annotation(lif_obj):
    """Return a Markable annotation spanning the full text of *lif_obj*."""
    span_end = len(lif_obj.text.value)
    properties = {
        "id": "m1",
        "@type": 'http://vocab.lappsgrid.org/Markable',
        "start": 0,
        "end": span_end}
    return Annotation(properties)
def topic_annotation(topic, topic_id, lemmas):
    """Return a SemanticTag annotation for a gensim topic.

    *topic* is a (topic_number, score) pair; the annotation targets the
    document-wide markable "m1".
    """
    features = {
        "type": "gensim-topic",
        "topic_id": topic[0],
        "topic_score": "{:.04f}".format(topic[1]),
        "topic_name": lemmas}
    return Annotation({
        "id": "t{:d}".format(topic_id),
        "@type": 'http://vocab.lappsgrid.org/SemanticTag',
        "target": "m1",
        "features": features})
def _add_docelement_anno(self, docelement_type, p1, p2):
    """Add a Section annotation for the span p1:p2 to self.view, recording
    the document element type in the section_type feature."""
    properties = {
        'id': Identifiers.new_id('de'),
        '@type': 'Section',
        'start': p1,
        'end': p2,
        'features': {'section_type': docelement_type}}
    self.view.add(Annotation(properties))
def _create_annotation(lif, tokens, w, term, i, length, ttype):
    """Build a Technology annotation for a term match that starts at token
    *i* and spans *length* tokens.

    *w* is the string as matched in the text and *term* its normalized
    form; when the two differ, the normalized form is recorded as an
    extra feature.
    """
    p1, p2, w_in_text = _get_match_information(lif, tokens, i, length)
    if DEBUG:
        OUT.write("%s\t%s\t%s\n" % (p1, p2, w))
    next_id = TECHNOLOGIES.get_next_id()
    features = {"term": w, "type": ttype}
    if w != term:
        features['term_normalized'] = term
    return Annotation({
        "id": "t%d" % next_id,
        "@type": 'http://vocab.lappsgrid.org/Technology',
        "start": p1,
        "end": p2,
        "features": features})
def as_annotation(self):
    """Return this page as a Page annotation over self.start:self.end.

    Header and footer are added as features only when they are present.
    """
    properties = {
        "id": "p%s" % self.number,
        "@type": vocab('Page'),
        "start": self.start,
        "end": self.end,
        "features": {}}
    features = properties['features']
    if self.header is not None:
        features['header'] = self.header
    if self.footer is not None:
        features['footer'] = self.footer
    return Annotation(properties)
def generate_lif(txt, vnc):
    """Generate a LIF container from a plain text file and clearwsd output.

    * txt is a plain text file with only the original text value.
    * vnc (verbnetclass) is output from clearwsd (mostly in CoNLL format).

    The disambiguation annotations encoded in the vnc file become
    SemanticTag annotations in a "verbnettag" view, with the contents of
    txt as the top-level `text` field. The resulting container is written
    out via Container.write().
    """
    # Context managers guarantee the files are closed even if reading fails
    # (the original closed them by hand and leaked on error).
    with open(txt, encoding="utf-8") as t:
        raw_text = t.read()
    with open(vnc, encoding="utf-8") as v:
        # Only the comment lines carry the disambiguation annotations.
        annotation_lines = [line for line in v if line.startswith('#')]

    lif_obj = LIF()
    cont_obj = Container()
    cont_obj.discriminator = "http://vocab.lappsgrid.org/ns/media/jsonld#lif"
    cont_obj.payload = lif_obj
    lif_obj.text.value = raw_text

    vnc_view = View()
    lif_obj.views.append(vnc_view)
    vnc_view.id = "verbnettag"
    vnc_view.metadata['contains'] = {vocab('SemanticTag'): {}}

    for annotation in annotation_lines:
        splitted = annotation.split('\t')[0].split()
        oid = splitted[1]
        osent = splitted[2]  # sentence field, currently unused downstream
        otoken = splitted[3]
        olemma = " ".join(splitted[4:-1])  # some lemmas have a space inside
        olabel = splitted[-1]
        # The token field must look like "12[34,56]" to carry offsets;
        # skip lines that are not properly annotated.
        properly_annotated = re.match(r'\d+\[(\d+),(\d+)\]', otoken)
        if properly_annotated is None:
            continue
        s, e = map(int, properly_annotated.groups())
        ann = {
            "id": "vnc_" + oid,
            "start": s,
            "end": e,
            "@type": vocab("SemanticTag"),
            "features": {
                "tags": [olabel],
                "type": "VerbNetClass",
                "lemma": olemma,
                "text": raw_text[s:e]}}
        vnc_view.annotations.append(Annotation(ann))
    cont_obj.write()
def fix_view(identifier, view):
    """Give *view* its proper identifier and replace the raw annotation
    dictionaries (stashed on view.id) with Annotation objects."""
    raw_annotations = view.id['annotations']
    view.id = identifier
    view.annotations = [Annotation(raw) for raw in raw_annotations]