def process(self, cas):
    """Collect the filtered tokens of a German document and write them out.

    The first "Language" annotation decides whether the document is
    handled: a value other than ``"de"`` means the document is skipped,
    and so is a document that carries no "Language" annotation at all.

    For a handled document, every annotation returned by
    ``CasUtil.get_all_annotations`` is passed to
    ``self.add_to_filtered_token`` together with the ``filtered_token``
    list (presumably that helper appends the tokens it keeps -- confirm
    against its definition). Each collected token is then recorded
    lower-cased in ``self.unique_token`` with 1 when
    ``CasUtil.has_annotation(cas, token, 'NER')`` is truthy and 0
    otherwise, and finally ``self.write_output_files`` is called.

    Args:
        cas: The document container to process.
    """
    # next() with a default: a document without any "Language" annotation
    # is skipped instead of aborting with StopIteration.
    lang = next(CasUtil.get_annotations(cas, "Language"), None)
    if lang is None or lang.value != "de":
        return

    filtered_token = []
    for annot in CasUtil.get_all_annotations(cas):
        self.add_to_filtered_token(annot, filtered_token)

    for token in filtered_token:
        # Keyed by the lower-cased token, so a later occurrence of the same
        # token overwrites the flag stored by an earlier one.
        self.unique_token[token.lower()] = (
            1 if CasUtil.has_annotation(cas, token, 'NER') else 0
        )

    self.write_output_files(cas, filtered_token)
def process(self, cas):
    """Flag every token whose normalized form differs from its surface text.

    Each "Token" annotation's covered text is run through
    ``self.normalize_word_token``. When the result is not equal to the
    original text, an ``Annotation`` of type "Error" spanning the same
    begin/end offsets and carrying the normalized form is added to *cas*.

    Args:
        cas: The document container whose tokens are inspected and which
            receives the new annotations.
    """
    for tok in CasUtil.get_annotations(cas, "Token"):
        surface = tok.get_covered_text()
        corrected = self.normalize_word_token(surface)
        if corrected == surface:
            # Nothing changed during normalization: no annotation needed.
            continue
        cas.add_fs_annotation(
            Annotation(cas.get_view(), tok.begin, tok.end, "Error", corrected)
        )
def process(self, cas):
    """Append one ``<document>`` record describing *cas* to ``self.f``.

    The record holds the document id, the document text (``cas.artifact``)
    and one ``<annotation />`` element per annotation returned by
    ``CasUtil.get_all_annotations``. Every annotation gets ``begin`` and
    ``end`` attributes; ``type`` and ``value`` are written only when they
    are truthy.

    Attribute values are quoted and the document text is escaped, so that
    characters such as ``<``, ``&`` or ``"`` in the data cannot break the
    markup of the record.

    Args:
        cas: The document container to serialize.
    """
    # Function-scope import keeps this block self-contained.
    from xml.sax.saxutils import escape, quoteattr

    write = self.f.write
    write('<document id=' + quoteattr(str(cas.document_id)) + '>\n')
    write('\t<text>' + escape(cas.artifact) + '</text>\n')
    write('\t<annotations>\n')
    for annot in CasUtil.get_all_annotations(cas):
        attrs = [
            ' begin=' + quoteattr(str(annot.begin)),
            ' end=' + quoteattr(str(annot.end)),
        ]
        # Optional attributes: omitted entirely when empty/None.
        if annot.type:
            attrs.append(' type=' + quoteattr(str(annot.type)))
        if annot.value:
            attrs.append(' value=' + quoteattr(str(annot.value)))
        write('\t\t<annotation' + ''.join(attrs) + ' />\n')
    write('\t</annotations>\n')
    write('</document>\n\n')
def process(self, cas):
    """Annotate every token accepted by ``self.is_token_to_tag``.

    For each "Token" annotation whose covered text passes
    ``self.is_token_to_tag``, a new ``Annotation`` with the same begin/end
    offsets and the type returned by ``self.get_token_type()`` is added to
    *cas*.

    Args:
        cas: The document container whose tokens are inspected and which
            receives the new annotations.
    """
    for tok in CasUtil.get_annotations(cas, "Token"):
        if not self.is_token_to_tag(tok.get_covered_text()):
            continue
        tag = Annotation(
            cas.get_view(),
            tok.begin,
            tok.end,
            self.get_token_type(),
        )
        cas.add_fs_annotation(tag)
def write_output_files(self, cas, filtered_token):
    """Write one row per output writer for the given document.

    Three rows are emitted, in this order:

    * ``self.sent_writer``: document id, date and the full document text.
    * ``self.token_writer``: document id, date and the filtered tokens
      joined by single spaces.
    * ``self.raw_token_writer``: the covered text of every "Token"
      annotation, joined by single spaces, as the only column.

    Args:
        cas: The document container supplying id, date, text and tokens.
        filtered_token: Iterable of token strings to join for the
            token row.
    """
    sentence_row = [cas.document_id, cas.date, cas.artifact]
    self.sent_writer.writerow(sentence_row)

    token_row = [cas.document_id, cas.date, " ".join(filtered_token)]
    self.token_writer.writerow(token_row)

    covered_texts = []
    for token_annot in CasUtil.get_annotations(cas, "Token"):
        covered_texts.append(token_annot.get_covered_text())
    self.raw_token_writer.writerow([" ".join(covered_texts)])
def process(self, cas):
    """Print the document text followed by every annotation on it.

    The first line shows ``cas.artifact``; each following line shows one
    annotation returned by ``CasUtil.get_all_annotations`` together with
    the text it covers.

    Args:
        cas: The document container to dump to standard output.
    """
    print("Artifact:", cas.artifact)
    for entry in CasUtil.get_all_annotations(cas):
        covered = entry.get_covered_text()
        print(entry, covered)