def json_data(data):
    document = Document(data.get(u"name", u"_DOCUMENT_"), content=data.get(u"content", u""))
    for key, value in data.get(u"metadatas", {}).items():
        document.add_metadata(key, value)
    for segmentation_name in data.get(u"segmentations", {}):
        d = data[u"segmentations"][segmentation_name]
        spans = [
            Span(lb=span[u"s"], ub=0, length=span[u"l"])
            for span in d[u"spans"]
        ]
        segmentation = Segmentation(segmentation_name,
                                    spans=spans,
                                    reference=d.get(u"reference", None))
        document.add_segmentation(segmentation)
    # resolve segmentation references now that every segmentation exists
    for segmentation in document.segmentations:
        if segmentation.reference is not None:
            segmentation.reference = document.segmentation(segmentation.reference)
    for annotation_name in data.get(u"annotations", {}):
        d = data[u"annotations"][annotation_name]
        annotations = [
            Tag(lb=annotation[u"s"], ub=0, length=annotation[u"l"], value=annotation[u"v"])
            for annotation in d[u"annotations"]
        ]
        annotation = Annotation(annotation_name,
                                reference=document.segmentation(d[u"reference"]),
                                annotations=annotations)
        document.add_annotation(annotation)
    return document
def brat_file(filename, encoding="utf-8", tagset_name=None):
    no_ext, ext = os.path.splitext(filename)
    txt_file = no_ext + ".txt"
    ann_file = no_ext + ".ann"
    if not (os.path.exists(txt_file) and os.path.exists(ann_file)):
        raise ValueError("missing either .ann or .txt file")

    document = Document(os.path.basename(txt_file), encoding=encoding, mime_type="text/plain")
    document.content = codecs.open(txt_file, "rU", encoding).read().replace(u"\r", u"")
    annotations = Annotation(tagset_name or "NER")
    for line in codecs.open(ann_file, "rU", encoding):
        line = line.strip()
        if line != u"" and line.startswith(u"T"):
            parts = line.split(u"\t")
            value, bounds = parts[1].split(" ", 1)
            # discontinuous annotations are given as ";"-separated bounds
            for bound in bounds.split(";"):
                lb, ub = bound.split()
                annotations.append(Tag(lb=int(lb), ub=int(ub), value=value))
    annotations.sort()
    document.add_annotation(annotations)
    return document
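# Usage sketch (not part of the original code; the file path is hypothetical).
# brat_file() expects a .txt/.ann pair sharing the same basename; either file
# can be passed since only the extension is swapped.
doc = brat_file("corpus/doc01.ann", encoding="utf-8")
for tag in doc.annotation("NER").get_reference_annotations():
    # a Tag exposes value, lb and ub; the offsets index into doc.content
    print(tag.value, doc.content[tag.lb:tag.ub])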
def main(indirnames, outfilename, default_shift=0, top_level=False):
    dirs = []
    for indirname in indirnames:
        dirs.extend([
            os.path.join(indirname, name)
            for name in sorted(os.listdir(indirname))
            if os.path.isdir(os.path.join(indirname, name))
        ])

    contents = []
    annotations = []
    shift = 0
    for dirname in dirs:
        cur_contents, cur_annotations, cur_shift = make_data(
            dirname, default_shift=shift, top_level=top_level)
        contents.extend(cur_contents)
        annotations.extend(cur_annotations)
        shift = cur_shift

    document = Document("_doc_", content=(u"\n" * NUM_NEWLINES).join(contents))
    document.add_annotation(Annotation("NER", annotations=annotations))
    exporter = BratExporter()
    with codecs.open(outfilename + ".ann", "w", "utf-8") as O:
        O.write(exporter.document_to_unicode(document, {"ner": "NER"}))
    with codecs.open(outfilename + ".txt", "w", "utf-8") as O:
        O.write(document.content)
def test_enrich(self):
    document = Document("document", "Ceci est un test.")
    corpus = Corpus([u"word"],
                    sentences=[[{u"word": u"Ceci"},
                                {u"word": u"est"},
                                {u"word": u"un"},
                                {u"word": u"test"},
                                {u"word": u"."}]])
    document._corpus = corpus

    features = []
    cwg = DictGetterFeature(entry="word", x=0)
    features.append(BOSFeature(name="BOS", entry="word", getter=cwg))
    features.append(EOSFeature(name="EOS", entry="word", getter=cwg))
    informations = Informations(bentries=[Entry(u"word")], features=features)
    enrich = EnrichModule(informations)

    self.assertEqual(document._corpus.fields, [u"word"])
    enrich.process_document(document)
    self.assertEqual(document._corpus.fields, [u"word", u"BOS", u"EOS"])
def test_clean(self):
    document = Document("document", "Ceci est un test.")
    corpus = Corpus([u"word", u"remove"],
                    sentences=[[{u"word": u"Ceci", u"remove": u"Ceci"},
                                {u"word": u"est", u"remove": u"est"},
                                {u"word": u"un", u"remove": u"un"},
                                {u"word": u"test", u"remove": u"test"},
                                {u"word": u".", u"remove": u"."}]])
    document._corpus = corpus

    self.assertEqual(document._corpus.fields, [u"word", u"remove"])
    clean = CleanModule(to_keep=[u"word"])
    clean.process_document(document)
    self.assertEqual(document._corpus.fields, [u"word"])
def test_wapiti_label(self):
    document = Document("document", "Ceci est un test.")
    corpus = Corpus([u"word"],
                    sentences=[[{u"word": u"Ceci"},
                                {u"word": u"est"},
                                {u"word": u"un"},
                                {u"word": u"test"},
                                {u"word": u"."}]])
    document._corpus = corpus

    self.assertEqual(document._corpus.fields, [u"word"])
    wapiti_label = WapitiLabelModule(
        os.path.join(SEM_DATA_DIR, "non-regression", "models", "model"),
        u"the_new_field")
    wapiti_label.process_document(document)
    self.assertEqual(document._corpus.fields, [u"word", u"the_new_field"])

    sentence = document._corpus.sentences[0]
    self.assertEqual(sentence[0]["the_new_field"], u"A")
    self.assertEqual(sentence[1]["the_new_field"], u"B")
    self.assertEqual(sentence[2]["the_new_field"], u"B")
    self.assertEqual(sentence[3]["the_new_field"], u"A")
    self.assertEqual(sentence[4]["the_new_field"], u"O")
def conll_file(filename, fields, word_field, encoding="utf-8"):
    document = Document(os.path.basename(filename), encoding=encoding)
    document._corpus = Corpus.from_conll(filename, fields, encoding=encoding)

    character_index = 0
    sentence_index = 0
    contents = []
    word_spans = []
    sentence_spans = []
    for sentence in document._corpus.sentences:
        contents.append([])
        for token in sentence:
            word = token[word_field]
            contents[-1].append(word)
            word_spans.append(Span(character_index, character_index + len(word)))
            character_index += len(word) + 1
        sentence_spans.append(Span(sentence_index, sentence_index + len(sentence)))
        sentence_index += len(sentence)

    document._content = u"\n".join([u" ".join(content) for content in contents])
    document.add_segmentation(Segmentation("tokens", spans=word_spans))
    document.add_segmentation(
        Segmentation("sentences",
                     reference=document.segmentation("tokens"),
                     spans=sentence_spans))
    return document
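# Usage sketch (not part of the original code; file and column names are hypothetical).
# "fields" lists the CoNLL columns and "word_field" names the column holding the
# surface form used to rebuild the document content.
doc = conll_file("train.conll", ["word", "pos", "chunk"], "word", encoding="utf-8")
tokens = doc.segmentation("tokens")        # one Span per token
sentences = doc.segmentation("sentences")  # sentence spans defined over the token segmentation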
def main(indirname, outfilename, default_shift=0, top_level=False):
    contents, annotations, shift = make_data(indirname,
                                             default_shift=default_shift,
                                             top_level=top_level)
    document = Document("_doc_", content=(u"\n" * NUM_NEWLINES).join(contents))
    document.add_annotation(Annotation("NER", annotations=annotations))
    exporter = BratExporter()
    with codecs.open(outfilename + ".ann", "w", "utf-8") as O:
        O.write(exporter.document_to_unicode(document, {"ner": "NER"}))
    with codecs.open(outfilename + ".txt", "w", "utf-8") as O:
        O.write(document.content)
def load(filename,
         encoding="utf-8",
         fields=None,
         word_field=None,
         wikinews_format=False,
         logger=None,
         strip_html=False,
         tagset_name=None,
         *args,
         **kwargs):
    if type(filename) in (Document, SEMCorpus):
        if logger is not None:
            logger.info(u"detected format: SEM XML")
        return filename

    try:
        filename = filename.decode(sys.getfilesystemencoding())
    except UnicodeDecodeError:
        pass
    except AttributeError:  # raised in python3, where filename is already str
        pass

    if filename.startswith("http"):
        if logger is not None:
            logger.info(u"detected format: HTML")
        return from_url(filename, strip_html=strip_html, wikinews_format=wikinews_format)

    if filename.endswith(".xml"):
        xml = ET.parse(filename)
        root_tag = xml.getroot().tag
        if root_tag == "sem":
            if logger is not None:
                logger.info(u"detected format: SEM XML")
            return SEMCorpus.from_xml(xml)
        elif root_tag == "document":
            if logger is not None:
                logger.info(u"detected format: SEM XML")
            return Document.from_xml(xml)
        elif root_tag == "GateDocument":
            if logger is not None:
                logger.info(u"detected format: GATE XML")
            return gate_data(xml, os.path.basename(filename))

    no_ext, ext = os.path.splitext(filename)
    if (ext == ".ann") or (ext == ".txt" and os.path.exists(no_ext + ".ann")):
        if logger is not None:
            logger.info(u"detected format: BRAT")
        return brat_file(filename, encoding=encoding, tagset_name=tagset_name)

    if fields is not None and word_field is not None:
        if logger is not None:
            logger.info(u"No specific format found, defaulting to CoNLL format")
        return conll_file(filename, fields, word_field, encoding=encoding)

    # this should come last: if everything fails, just load as a text document
    return text_file(filename, encoding=encoding)
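# Usage sketch (not part of the original code; file names and the URL are hypothetical).
# load() guesses the format from the argument itself: URLs, SEM/GATE XML, BRAT
# .txt/.ann pairs, CoNLL (when fields and word_field are given), plain text last.
doc_from_web = load("http://example.org/article", strip_html=True)
doc_from_brat = load("corpus/doc01.ann", encoding="utf-8")
doc_from_conll = load("train.conll", fields=["word", "pos"], word_field="word")
doc_from_text = load("notes.txt", encoding="utf-8")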
def gate_data(data, name=None):
    document = Document(name or "__DOCUMENT__", mime_type="text/plain")

    textwithnodes = data.findall("TextWithNodes")[0]
    annotation_sets = data.findall("AnnotationSet")

    # node offsets are character positions in the text accumulated so far
    text_parts = [textwithnodes.text or u""]
    nodes = {}
    for node in list(textwithnodes):
        nodes[int(node.attrib["id"])] = sum([len(part) for part in text_parts])
        text_parts.append(node.tail or u"")
    document.content = u"".join(text_parts)

    for annotation_set in annotation_sets:
        annotation_name = annotation_set.attrib["Name"]
        sem_annotation = Annotation(annotation_name)
        for annotation in annotation_set:
            lb = nodes[int(annotation.attrib["StartNode"])]
            ub = nodes[int(annotation.attrib["EndNode"])]
            sem_annotation.append(Tag(lb, ub, annotation.attrib["Type"]))
        document.add_annotation(sem_annotation)

    return document
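# Usage sketch (not part of the original code; the file name is hypothetical).
# gate_data() takes a parsed GATE XML document whose root element is
# "GateDocument", as produced by xml.etree.ElementTree.parse.
import xml.etree.ElementTree as ET

tree = ET.parse("document.gate.xml")
doc = gate_data(tree, name="document.gate.xml")
print(doc.content[:80])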
def from_url(url, strip_html=False, wikinews_format=False):
    url = url.strip()
    if url == u"":
        return None
    try:
        url = url.decode(sys.getfilesystemencoding())
    except Exception:
        pass

    strip_html |= wikinews_format  # wikinews format is always stripped

    charset = re.compile('charset="(.+?)"')
    escaped_url = u"".join([(urllib.quote(c) if ord(c) > 127 else c)
                            for c in url.encode("utf-8")])
    escaped_url = escaped_url.replace(u"%2525", u"%25")

    content = u""
    f = urllib.urlopen(escaped_url)
    content = f.read()
    f.close()
    encoding = charset.search(content)
    if encoding is not None:
        encoding = encoding.group(1) or "utf-8"
    else:
        encoding = "utf-8"
    content = content.decode(encoding)

    regex = re.compile('^.+?[^/]/(?=[^/])', re.M)
    parts = regex.findall(escaped_url)
    base_url = (escaped_url[:] + u"/" if parts == [] else parts[0]).decode("iso-8859-1")

    # make relative links absolute so the page still renders outside its site
    content = content.replace(u'="//', u'="http://')
    content = content.replace(u'="/', u'="%s' % base_url)
    content = content.replace(u'=\\"//', u'=\\"http://')
    content = content.replace(u'=\\"/', u'=\\"%s' % base_url)
    content = content.replace(u'\r', u'')
    content = content.replace(u'</p>', u'</p>\n\n')

    if strip_html:
        new_content = sem.misc.strip_html(content, keep_offsets=True)
    else:
        new_content = content

    if wikinews_format:
        cleaned_content = new_content[:content.index("<h2>")].strip()
    else:
        cleaned_content = new_content

    if strip_html:
        h = HTMLParser()
        empty_line = re.compile("\n[ \t]+")
        spaces = re.compile("[ \t]+")
        newlines = re.compile("\n{2,}")
        cleaned_content = h.unescape(cleaned_content)
        cleaned_content = empty_line.sub(u"\n", cleaned_content)
        cleaned_content = spaces.sub(u" ", cleaned_content)
        cleaned_content = newlines.sub("\n\n", cleaned_content)
        spaces_begin = re.compile("^[ \t]+", re.M)
        spaces_end = re.compile("[ \t]+$", re.M)
        cleaned_content = spaces_begin.sub("", cleaned_content)
        cleaned_content = spaces_end.sub("", cleaned_content)

    mime_type = ("text/plain" if strip_html else "text/html")
    return Document(name=url,
                    content=cleaned_content,
                    original_content=content,
                    mime_type=mime_type)
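# Usage sketch (not part of the original code; the URL is hypothetical).
# from_url() downloads the page, rewrites relative links and, when strip_html
# or wikinews_format is set, returns a plain-text Document.
doc = from_url("http://en.wikinews.org/wiki/Some_article", wikinews_format=True)
if doc is not None:  # from_url returns None for an empty URL
    print(doc.name, len(doc.content))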
def text_file(filename, encoding="utf-8"):
    return Document(os.path.basename(filename),
                    content=codecs.open(filename, "rU", encoding).read().replace("\r", ""),
                    encoding=encoding)
def test_label_consistency(self):
    corpus = Corpus([u"word", u"tag"], sentences=[
        [{u"word": u"Ceci", u"tag": u"B-tag"},
         {u"word": u"est", u"tag": u"O"},
         {u"word": u"un", u"tag": u"O"},
         {u"word": u"test", u"tag": u"O"},
         {u"word": u".", u"tag": u"O"}],
        [{u"word": u"Ceci", u"tag": u"O"},
         {u"word": u"est", u"tag": u"O"},
         {u"word": u"un", u"tag": u"O"},
         {u"word": u"test", u"tag": u"O"},
         {u"word": u".", u"tag": u"O"}],
        [{u"word": u"ceci", u"tag": u"O"},
         {u"word": u"est", u"tag": u"O"},
         {u"word": u"un", u"tag": u"O"},
         {u"word": u"test", u"tag": u"O"},
         {u"word": u".", u"tag": u"O"}],
    ])
    document = Document.from_corpus("document", corpus, u"word")

    tags = []
    for sentence in document._corpus.sentences:
        for token in sentence:
            tags.append(token[u"tag"])
    self.assertEqual(tags.count(u"O"), 14)
    self.assertEqual(tags.count(u"B-tag"), 1)

    label_consistency = LabelConsistencyModule(u"tag", token_field=u"word")
    label_consistency.process_document(document)

    # "Ceci" was tagged B-tag once, so its other occurrence with the same
    # surface form gets the same label; the lowercase "ceci" is left untouched.
    self.assertEqual(document._corpus.sentences[0][0][u"tag"], u"B-tag")
    self.assertEqual(document._corpus.sentences[1][0][u"tag"], u"B-tag")
    self.assertEqual(document._corpus.sentences[2][0][u"tag"], u"O")

    tags = []
    for sentence in document._corpus.sentences:
        for token in sentence:
            tags.append(token[u"tag"])
    self.assertEqual(tags.count(u"O"), 13)
    self.assertEqual(tags.count(u"B-tag"), 2)
def main(args):
    infile = args.infile
    reference_column = args.reference_column
    tagging_column = args.tagging_column
    ienc = args.ienc or args.enc
    oenc = args.oenc or args.enc
    verbose = args.verbose
    input_format = args.input_format
    reference_file = args.reference_file
    annotation_name = args.annotation_name
    dump = args.dump
    context_size = args.context_size

    counts = {}
    prf = {}
    if input_format == "conll":
        if reference_file:
            print(u"reference_file not handled for CoNLL files")
        L = []
        R = []
        keys = None
        nth = -1
        for n_line, p in Reader(infile, ienc).line_iter():
            nth += 1
            keys = keys or range(len(p[0]))
            L.extend(annotation_from_sentence(p, column=reference_column, shift=n_line - nth))
            R.extend(annotation_from_sentence(p, column=tagging_column, shift=n_line - nth))
        document = sem.importers.conll_file(infile, keys, keys[0], encoding=ienc)
        L = Annotation("", annotations=L,
                       reference=document.segmentation("tokens")).get_reference_annotations()
        R = Annotation("", annotations=R,
                       reference=document.segmentation("tokens")).get_reference_annotations()
    elif input_format == "brat":
        document = sem.importers.brat_file(reference_file)
        L = document.annotation("NER").get_reference_annotations()
        R = sem.importers.brat_file(infile).annotation("NER").get_reference_annotations()
    elif input_format in ("sem", "SEM"):
        document = Document.from_xml(reference_file)
        system = Document.from_xml(infile)
        common_annotations = set(document.annotations.keys()) & set(system.annotations.keys())
        if len(common_annotations) == 1 and annotation_name is None:
            annotation_name = list(common_annotations)[0]
        if annotation_name is None:
            raise RuntimeError(
                "Could not find an annotation set to evaluate: please provide one")
        L = document.annotation(annotation_name).get_reference_annotations()
        R = system.annotation(annotation_name).get_reference_annotations()
    else:
        raise RuntimeError("format not handled: {0}".format(input_format))

    len_ref = len(L)
    len_tag = len(R)
    d = {
        CORRECT: [],
        TYPE_ERROR: [],
        BOUNDARY_ERROR: [],
        TYPE_AND_BOUNDARY_ERROR: [],
        SILENCE_ERROR: [],
        NOISE_ERROR: []
    }

    # first pass, removing correct
    i = 0
    while i < len(L):
        LR = L[i]
        j = 0
        while j < len(R):
            RR = R[j]
            if LR == RR:
                del L[i]
                del R[j]
                i -= 1
                d[CORRECT].append([LR, RR])
                break
            j += 1
        i += 1

    # second pass, typing errors
    i = 0
    while i < len(L):
        LR = L[i]
        j = 0
        while j < len(R):
            RR = R[j]
            if LR.value != RR.value and LR.lb == RR.lb and LR.ub == RR.ub:
                del L[i]
                del R[j]
                i -= 1
                d[TYPE_ERROR].append([LR, RR])
                break
            j += 1
        i += 1

    # third pass, boundary errors
    i = 0
    while i < len(L):
        LR = L[i]
        j = 0
        while j < len(R):
            RR = R[j]
            if LR.value == RR.value and ((LR.lb != RR.lb and LR.ub == RR.ub)
                                         or (LR.lb == RR.lb and LR.ub != RR.ub)):
                del L[i]
                del R[j]
                i -= 1
                d[BOUNDARY_ERROR].append([LR, RR])
                break
            j += 1
        i += 1

    # fourth pass, both type and boundary errors
    i = 0
    while i < len(L):
        LR = L[i]
        j = 0
        while j < len(R):
            RR = R[j]
            if LR.value != RR.value and ((LR.lb != RR.lb and LR.ub == RR.ub)
                                         or (LR.lb == RR.lb and LR.ub != RR.ub)):
                del L[i]
                del R[j]
                i -= 1
                d[TYPE_AND_BOUNDARY_ERROR].append([LR, RR])
                break
            j += 1
        i += 1

    # whatever remains is either missed (silence) or spurious (noise)
    d[SILENCE_ERROR] = L[:]
    d[NOISE_ERROR] = R[:]

    entities = set()
    for error_list in d.values():
        for e in error_list:
            try:
                gold, guess = e
                entities.add(gold.value)
                entities.add(guess.value)
            except (TypeError, ValueError):
                entities.add(e.value)

    with codecs.open(dump, "w", "utf-8") as O:
        O.write(u"error kind\treference entity\toutput entity\tdiff\n")
        for error_kind in (TYPE_ERROR, BOUNDARY_ERROR, TYPE_AND_BOUNDARY_ERROR,
                           NOISE_ERROR, SILENCE_ERROR):
            for ex in d[error_kind]:
                if error_kind == NOISE_ERROR:
                    gold = None
                    guess = ex
                elif error_kind == SILENCE_ERROR:
                    gold = ex
                    guess = None
                else:
                    gold = ex[0]
                    guess = ex[1]
                gold_str = (u"{0}:{1}".format(gold.value, document.content[gold.lb:gold.ub])
                            if gold else "").replace("\r", "").replace("\n", " ")
                guess_str = (u"{0}:{1}".format(guess.value, document.content[guess.lb:guess.ub])
                             if guess else "").replace("\r", "").replace("\n", " ")
                diff = get_diff(document.content, gold, guess, error_kind,
                                context_size=context_size)
                O.write(u"{0}\t{1}\t{2}\t{3}\n".format(error_kind, gold_str, guess_str, diff))

    counts = {}
    for entity in entities:
        sub_d = {}
        sub_d[CORRECT] = [m for m in d[CORRECT] if m[0].value == entity]
        sub_d[TYPE_ERROR] = [
            m for m in d[TYPE_ERROR]
            if m[0].value == entity or m[1].value == entity
        ]
        sub_d[BOUNDARY_ERROR] = [
            m for m in d[BOUNDARY_ERROR]
            if m[0].value == entity or m[1].value == entity
        ]
        sub_d[TYPE_AND_BOUNDARY_ERROR] = [
            m for m in d[TYPE_AND_BOUNDARY_ERROR]
            if m[0].value == entity or m[1].value == entity
        ]
        sub_d[NOISE_ERROR] = [m for m in d[NOISE_ERROR] if m.value == entity]
        sub_d[SILENCE_ERROR] = [m for m in d[SILENCE_ERROR] if m.value == entity]
        counts[entity] = sub_d

    # basic counts
    print(u"entity\tmeasure\tvalue")
    for entity in sorted(entities):
        for kind in OUTPUT_KINDS:
            print(u"{0}\t{1}\t{2}".format(entity, kind, len(counts[entity][kind])))
    print(u"global\treference\t{0}".format(len_ref))
    print(u"global\ttagging\t{0}".format(len_tag))
    for kind in OUTPUT_KINDS:
        print(u"global\t{0}\t{1}".format(kind, len(d[kind])))

    # P R F
    precisions = []
    recalls = []
    print()
    print(u"entity\tmeasure\tvalue")
    for entity in sorted(entities):
        precisions.append(precision(counts[entity]))
        recalls.append(recall(counts[entity]))
        print(u"{0}\tprecision\t{1:.4f}".format(entity, precisions[-1]))
        print(u"{0}\trecall\t{1:.4f}".format(entity, recalls[-1]))
        print(u"{0}\tfscore\t{1:.4f}".format(
            entity, fscore(precision(counts[entity]), recall(counts[entity]))))
    print(u"global\tprecision\t{0:.4f}".format(precision(d)))
    print(u"global\trecall\t{0:.4f}".format(recall(d)))
    print(u"global\tfscore\t{0:.4f}".format(fscore(precision(d), recall(d))))
    print(u"global\tmacro-precision\t{0:.4f}".format(mean(precisions)))
    print(u"global\tmacro-recall\t{0:.4f}".format(mean(recalls)))
    print(u"global\tmacro-fscore\t{0:.4f}".format(fscore(mean(precisions), mean(recalls))))

    # over/under generation, substitution
    print()
    print(u"entity\tmeasure\tvalue")
    for entity in sorted(entities):
        print(u"{0}\tundergeneration\t{1:.4f}".format(entity, undergeneration(counts[entity])))
        print(u"{0}\tovergeneration\t{1:.4f}".format(entity, overgeneration(counts[entity])))
        print(u"{0}\tsubstitution\t{1:.4f}".format(entity, substitution(counts[entity])))
    print(u"global\tundergeneration\t{0:.4f}".format(undergeneration(d)))
    print(u"global\tovergeneration\t{0:.4f}".format(overgeneration(d)))
    print(u"global\tsubstitution\t{0:.4f}".format(substitution(d)))
def main(args):
    """
    Return a document after it has passed through a pipeline.

    Parameters
    ----------
    masterfile : str
        the file containing the pipeline and global options
    infile : str
        the input for the upcoming pipe. By default it is the file to process;
        it can be either a plain text or a CoNLL-formatted file.
    directory : str
        the directory where every output file will be written.
    """
    start = time.time()

    infile = args.infile
    try:
        output_directory = args.output_directory
    except AttributeError:
        output_directory = u"."
    try:
        force_format = args.force_format
    except AttributeError:
        force_format = "default"
    try:
        pipeline = args.pipeline
        options = args.options
        exporter = args.exporter
        couples = args.couples
    except AttributeError:
        pipeline, options, exporter, couples = load_master(args.master, force_format)

    if get_option(options, "log", "log_file") is not None:
        sem_tagger_logger.addHandler(file_handler(get_option(options, "log", "log_file")))
    sem_tagger_logger.setLevel(get_option(options, "log", "log_level", "WARNING"))

    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    exports = {}  # keeping track of already done exports
    nth = 1
    ienc = get_option(options, "encoding", "input_encoding", "utf-8")
    oenc = get_option(options, "encoding", "output_encoding", "utf-8")

    # the fields at the current state (depends on enrichments and info cleaning).
    # They will be used for wapiti
    current_fields = None

    if isinstance(infile, Document):
        sem_tagger_logger.info("Reading %s" % (infile.name))
        document = infile
    else:
        sem_tagger_logger.info("Reading %s" % (infile))
        file_shortname, _ = os.path.splitext(os.path.basename(infile))
        export_name = os.path.join(output_directory, file_shortname)
        file_format = get_option(options, "file", "format", "guess")
        opts = get_section(options, "file")
        opts.update(get_section(options, "encoding"))
        if file_format == "text":
            document = Document(os.path.basename(infile),
                                content=codecs.open(infile, "rU", ienc).read().replace(u"\r", u""),
                                **opts)
        elif file_format == "conll":
            opts["fields"] = opts["fields"].split(u",")
            opts["taggings"] = [tagging for tagging in opts.get("taggings", u"").split(u",")
                                if tagging]
            opts["chunkings"] = [chunking for chunking in opts.get("chunkings", u"").split(u",")
                                 if chunking]
            document = Document.from_conll(infile, **opts)
        elif file_format == "guess":
            document = sem.importers.load(infile, logger=sem_tagger_logger, **opts)
        else:
            raise ValueError(u"unknown format: %s" % file_format)

    pipeline.process_document(document)

    if exporter is not None:
        name = document.escaped_name()
        if "html" in exporter.extension():
            shutil.copy(os.path.join(sem.SEM_RESOURCE_DIR, "css", "tabs.css"),
                        output_directory)
            shutil.copy(
                os.path.join(sem.SEM_RESOURCE_DIR, "css", exporter._lang,
                             get_option(options, "export", "lang_style", "default.css")),
                output_directory)
        if exporter.extension() == "ann":
            out_path = os.path.join(
                output_directory,
                "%s.%s" % (os.path.splitext(name)[0], exporter.extension()))
            filename = name
            if not filename.endswith(".txt"):
                filename += ".txt"
            with codecs.open(os.path.join(output_directory, filename), "w", oenc) as O:
                O.write(document.content)
        else:
            out_path = os.path.join(output_directory, "%s.%s" % (name, exporter.extension()))
        exporter.document_to_file(document, couples, out_path, encoding=oenc)

    laps = time.time() - start
    sem_tagger_logger.info('done in %s' % (timedelta(seconds=laps)))

    return document
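# Usage sketch (not part of the original code; paths are hypothetical).
# main() only needs an object carrying the expected attributes, so an
# argparse.Namespace is enough when calling it programmatically; the master
# file is the pipeline/options description read by load_master().
import argparse

args = argparse.Namespace(infile="input.txt",
                          master="resources/master.xml",
                          output_directory="out")
document = main(args)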