def generate_metadata(data_dir, fname):
    subdir = os.path.split(fname)[0]
    lif_file = os.path.join(data_dir, 'lif', subdir, "tesseract-300dpi-20p.lif")
    ner_file = os.path.join(data_dir, 'ner', subdir, "%s.ner.lif" % subdir)
    mta_file = os.path.join(data_dir, 'mta', subdir, "%s.mta.lif" % subdir)
    ensure_directory(mta_file)
    lif = Container(lif_file).payload
    lif_ner = Container(ner_file).payload
    lif_mta = LIF(json_object=lif.as_json())
    # the metadata file does not carry the text or the views, only a pointer
    # to the file that holds the text
    lif_mta.text.value = None
    lif_mta.text.fname = lif_file
    lif_mta.views = []
    # the metadata has to go on the copy that is written out; setting it on
    # the source lif object would be lost since lif_mta was built from a
    # serialized snapshot
    lif_mta.metadata["authors"] = []
    lif_mta.metadata["year"] = None
    page_view = lif.get_view("pages")
    ner_view = lif_ner.get_view('v2')
    window = _get_window(page_view)
    lif_mta.metadata["authors"] = _get_authors(lif, ner_view, window)
    lif_mta.metadata["year"] = _get_year(ner_view, window)
    lif_mta.write(fname=mta_file, pretty=True)

def collect(path, n, tag, restriction=None):
    print("# SCRIPT = %s" % 'scripts/dtriac-19d/collect_annotations.py')
    print("# PATH = %s" % path)
    print("# FILES = %s" % n)
    print("# TAG = %s" % tag)
    feat, val = None, None
    if restriction is not None:
        feat, val = restriction.split('=')
        print("# FEAT = %s=%s" % (feat, val))
    full_tag = "http://vocab.lappsgrid.org/%s" % tag
    processing_step = os.path.split(path)[1]
    subdirs = os.listdir(path)[:int(n)]
    locations = []
    for subdir in subdirs:
        fname = os.path.join(path, subdir, "%s.%s.lif" % (subdir, processing_step))
        lif = Container(fname).payload
        for view in lif.views:
            for annotation in view.annotations:
                if annotation_matches(annotation, full_tag, feat, val):
                    locations.append(annotation.features.get('word'))
    locs = Counter(locations)
    total_count = sum(locs.values())
    print("# HITS = %d" % total_count)
    for loc, count in locs.most_common():
        print("%d\t%s" % (count, loc))

def test_lif_file(lif_file):
    """Just print the text of all headers, should give an indication of
    whether all the offsets are correct."""
    lif = Container(json_file=lif_file).payload
    text = lif.text.value
    view = lif.views[0]
    for anno in view.annotations:
        if anno.type.endswith('Header'):
            print("[{}]".format(text[anno.start:anno.end]))
    print('')

def test_lif_file(lif_file):
    """Print the id and the first characters of each annotation, should give
    an indication of whether all the offsets are correct."""
    lif = Container(json_file=lif_file).payload
    text = lif.text.value
    view = lif.views[0]
    for anno in view.annotations:
        page = text[anno.start:anno.end]
        print("<{}> {}".format(anno.id, ' '.join(page[:80].split())))
    print('')

def generate_sentence_types(data_dir, fname):
    subdir = os.path.split(fname)[0]
    lif_file = os.path.join(data_dir, 'lif', subdir, "tesseract-300dpi-20p.lif")
    spl_file = os.path.join(data_dir, 'spl', subdir, "%s.spl.lif" % subdir)
    sen_file = os.path.join(data_dir, 'sen', subdir, "%s.sen.lif" % subdir)
    ensure_directory(sen_file)
    if DEBUG:
        SENTS.write(">>> %s\n>>> %s\n>>> %s\n\n" % ('-' * 100, fname, '-' * 100))
    lif = Container(lif_file).payload
    lif_spl = Container(spl_file).payload
    lif_sen = LIF(json_object=lif.as_json())
    spl_sentences_view = lif_spl.get_view('v2')
    new_sentences_view = _create_view()
    lif_sen.views = [new_sentences_view]
    good_sentences = 0
    bad_sentences = 0
    for anno in spl_sentences_view.annotations:
        if anno.type.endswith('Sentence'):
            sc = SentenceClassifier(lif, anno, WORDS)
            if sc.is_crap():
                if DEBUG:
                    SENTS.write("---- %f\n%s\n\n" % (sc.ratio, repr(sc.text)))
                anno.features['type'] = 'crap'
                bad_sentences += 1
            else:
                if DEBUG:
                    SENTS.write("++++ %f\n%s\n\n" % (sc.ratio, repr(sc.text)))
                anno.features['type'] = 'normal'
                good_sentences += 1
            new_sentences_view.annotations.append(anno)
    if DEBUG:
        SENTS.write("\nTOTAL GOOD = {:d}\nTOTAL BAD = {:d}\n\n\n".format(
            good_sentences, bad_sentences))
    lif_sen.write(fname=sen_file, pretty=True)

def _add_view(self, identifier, fname, view_rank):
    """Load fname as either a LIF object or a Container object and select the
    specified view, indicated by an index in the view list. Add the identifier
    to this view and add it to the list of views. Note that some files contain
    LIF objects and others contain Containers with LIF embedded. The view we
    are looking for is the first or second, depending on how the processor for
    those data was set up."""
    try:
        view = Container(fname).payload.views[view_rank]
    except KeyError:
        view = LIF(fname).views[view_rank]
    view.id = identifier
    self.lif.views.append(view)

def run_tarsqi_for_file(data_dir, fname):
    lif_file = os.path.join(data_dir, 'lif', fname[:-4] + '.lif')
    ttk_file = os.path.join(data_dir, 'ttk', fname[:-4] + '.lif')
    ensure_directory(ttk_file)
    lif = Container(lif_file).payload
    text = lif.text.value
    doc = parse_text(text)
    if COMPRESS:
        with gzip.open(ttk_file + '.gz', 'wb') as fh:
            doc.print_all_lif(fh)
    else:
        with open(ttk_file, 'w') as out:
            doc.print_all_lif(out)

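# A hedged variant of the compressed branch above: gzip.open(..., 'wb') yields
# a binary handle, so if print_all_lif writes str rather than bytes, wrapping
# the handle in io.TextIOWrapper avoids a TypeError. This is a sketch under
# that assumption; doc stands in for the object returned by parse_text, and
# write_compressed_lif is a hypothetical helper, not part of the pipeline.
import gzip
import io

def write_compressed_lif(doc, ttk_file):
    with gzip.open(ttk_file + '.gz', 'wb') as fh:
        with io.TextIOWrapper(fh, encoding='utf8') as text_fh:
            doc.print_all_lif(text_fh)
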
def lookup_technologies(data_dir, fname):
    subdir = os.path.split(fname)[0]
    pos_file = os.path.join(data_dir, 'pos', subdir, "%s.pos.lif" % subdir)
    tex_file = os.path.join(data_dir, 'tex', subdir, "%s.lup.lif" % subdir)
    ensure_directory(tex_file)
    lif = Container(pos_file).payload
    lif_tex = LIF(json_object=lif.as_json())
    pos_view = lif.get_view('v2')
    tex_view = create_view('tex', 'Technology', 'dtriac-pipeline:lookup.py')
    lif_tex.views = [tex_view]
    tokens = [a for a in pos_view.annotations if a.type.endswith('Token')]
    _lookup_technologies_in_tokens(lif, tokens, tex_view)
    lif_tex.write(fname=tex_file, pretty=True)

def generate_lif(txt, vnc):
    """
    * txt is a plain text file with only the original text value.
    * vnc (verbnetclass) is an output file from clearwsd (mostly in conll format)

    This function will generate a LIF json file using the disambiguation
    annotations encoded in the vnc file, using txt as the top-level `text`
    field.
    """
    t = open(txt, encoding="utf-8")
    v = open(vnc, encoding="utf-8")
    lif_obj = LIF()
    cont_obj = Container()
    cont_obj.discriminator = "http://vocab.lappsgrid.org/ns/media/jsonld#lif"
    cont_obj.payload = lif_obj
    raw_text = t.read()
    t.close()
    lif_obj.text.value = raw_text
    vnc_view = View()
    lif_obj.views.append(vnc_view)
    vnc_view.id = "verbnettag"
    vnc_view.metadata['contains'] = {vocab('SemanticTag'): {}}
    annotations = [line for line in v if line.startswith('#')]
    v.close()
    for annotation in annotations:
        splitted = annotation.split('\t')[0].split()
        oid = splitted[1]
        osent = splitted[2]
        otoken = splitted[3]
        olemma = " ".join(splitted[4:-1])  # some lemmas have a space inside
        olabel = splitted[-1]
        properly_annotated = re.match(r'\d+\[(\d+),(\d+)\]', otoken)
        if properly_annotated is None:
            continue
        s, e = map(int, properly_annotated.groups())
        ann = {"id": "vnc_" + oid,
               "start": s,
               "end": e,
               "@type": vocab("SemanticTag"),
               "features": {"tags": [olabel],
                            "type": "VerbNetClass",
                            "lemma": olemma,
                            "text": raw_text[s:e]}}
        ann_obj = Annotation(ann)
        vnc_view.annotations.append(ann_obj)
    cont_obj.write()

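# A minimal sketch of how the token field in the vnc file is parsed by the
# regular expression in generate_lif; the sample value below is made up,
# following the token-index[start,end] pattern that the clearwsd output
# appears to use here.
import re

otoken = "7[1542,1549]"
match = re.match(r'\d+\[(\d+),(\d+)\]', otoken)
if match is not None:
    s, e = map(int, match.groups())
    print(s, e)  # prints: 1542 1549
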
def _add_view(self, identifier, fname, view_id):
    """Load fname as either a LIF object or a Container object and select the
    specified view, indicated by an index in the view list. Add the identifier
    to this view and add it to the list of views."""
    # Note that some files contain LIF objects and others contain Containers
    # with LIF embedded. The view we are looking for is the first or second,
    # depending on how the processor for those data was set up.
    try:
        view = Container(fname).payload.views[view_id]
    except KeyError:
        # this happens when we try to get a discriminator attribute from a
        # LIF object
        view = LIF(fname).views[view_id]
    view.id = identifier
    self.lif.views.append(view)

def _collect_data(data_dir, filelist, start, end):
    all_data = []
    # especially the first two occur in most abstracts so let's ignore them
    words_to_ignore = {'title', 'abstract', 'result', 'study'}
    for n, fname in elements(filelist, start, end):
        print("%07d %s" % (n, fname))
        fpath = os.path.join(data_dir, 'lif', fname[:-4] + '.lif')
        lif = Container(fpath).payload
        text_data = prepare_text_for_lda(lif.text.value)
        text_data = [w for w in text_data if w not in words_to_ignore]
        all_data.append(text_data)
    token_count = sum([len(d) for d in all_data])
    print('\nToken count = %d' % token_count)
    return all_data

def _collect_data(lif_dir):
    all_data = []
    for fname in os.listdir(lif_dir):
        print("  {}".format(os.path.basename(fname)))
        fpath = os.path.join(lif_dir, fname)
        lif = Container(fpath).payload
        text_data = prepare_text_for_lda(lif.text.value)
        all_data.append(text_data)
        # stop early, for debugging
        if fname.startswith('888'):
            break
    print('')
    # note that these counts are for the last file processed only
    print(len(text_data), 'sentences')
    print(sum([len(s) for s in text_data]), 'tokens')
    return all_data

def create_lif_file(json_file, lif_file, txt_file, test=False):
    print("Creating {}".format(lif_file))
    with codecs.open(json_file, encoding='utf8') as fh_in, \
         codecs.open(lif_file, 'w', encoding='utf8') as fh_out_lif, \
         codecs.open(txt_file, 'w', encoding='utf8') as fh_out_txt:
        json_obj = json.loads(fh_in.read())
        lif_obj = LIF()
        _add_metadata(lif_obj, json_obj)
        _add_view(lif_obj, json_obj)
        _add_rest(lif_obj, json_obj)
        container = Container()
        container.discriminator = "http://vocab.lappsgrid.org/ns/media/jsonld#lif"
        container.payload = lif_obj
        fh_out_lif.write(json.dumps(container.as_json(), indent=4))
        fh_out_txt.write(container.payload.text.value)
    if test:
        test_lif_file(lif_file)

def __init__(self, fname, data_dir, lif_file, mta_file, top_file, ner_file,
             sen_file, tex_file, wik_file):
    """Build a single LIF object with all relevant annotations. The
    annotations themselves are stored in the Annotations object in
    self.annotations."""
    self.id = int(os.path.split(fname)[0])
    self.fname = fname
    self.data_dir = data_dir
    self.lif = Container(lif_file).payload
    self.meta = LIF(mta_file)
    self.wikis = LIF(wik_file).metadata['wikified_es']
    self._add_views(ner_file, sen_file, tex_file, top_file)
    self.lif.metadata["filename"] = self.fname
    self.lif.metadata["year"] = self._get_year()
    self.annotations = Annotations(self.id, fname, doc=self,
                                   text=self.lif.text.value)
    self.annotations.text = self.lif.text.value
    self._collect_allowed_offsets()
    self._collect_annotations()

def show_file(sourcepath, datapath, subdir):
    sourcefile = os.path.join(sourcepath, subdir, 'tesseract-300dpi-20p.txt')
    datafile = os.path.join(datapath, subdir, 'tesseract-300dpi-20p.lif')
    print("\n%s%s/%s%s" % (BLUE, subdir, os.path.basename(datafile), END))
    lif = Container(json_file=datafile).payload
    annotations = lif.views[0].annotations
    pages = get_pages(sourcefile)
    if len(pages) != len(annotations):
        print("WARNING: unequal number of pages and page annotations (%d != %d)"
              % (len(pages), len(annotations)))
    for page, annotation in zip(pages, annotations):
        if (HEADERS_FOOTERS_ONLY
                and annotation.features.get('header') is None
                and annotation.features.get('footer') is None):
            continue
        print_annotation(annotation)
        print_page(page)
        print_page(lif.text.value[annotation.start:annotation.end])
        input()

def __init__(self, fname, lif_file, ner_file, tex_file, ttk_file, sen_file,
             rel_file, vnc_file, top_file, ontology):
    """Build a single LIF object with all relevant annotations. The
    annotations themselves are stored in the Annotations object in
    self.annotations."""
    self.id = Document.new_id()
    self.fname = fname
    self.ontology = ontology
    self.lif = Container(lif_file).payload
    self._add_views(ner_file, tex_file, ttk_file, sen_file, rel_file,
                    vnc_file, top_file)
    self.lif.metadata["filename"] = self.fname
    self.lif.metadata["title"] = self._get_title()
    self.lif.metadata["year"] = self._get_year()
    self.lif.metadata["abstract"] = self._get_abstract()
    self.annotations = Annotations(fname, doc=self, docid=self.id,
                                   text=self.lif.text.value)
    self.annotations.text = self.lif.text.value
    self._collect_allowed_offsets()
    self._collect_annotations()

def generate_topics_for_file(data_dir, fname, lda, topic_idx, dictionary):
    topic_id = 0
    fname_in = os.path.join(data_dir, 'lif', fname[:-4] + '.lif')
    fname_out = os.path.join(data_dir, 'top', fname[:-4] + '.lif')
    ensure_directory(fname_out)
    lif_in = Container(fname_in).payload
    lif_out = LIF(json_object=lif_in.as_json())
    # just to save some space, we get them from the lif file anyway
    lif_out.metadata = {}
    topics_view = _create_view()
    lif_out.views = [topics_view]
    topics_view.annotations.append(markable_annotation(lif_in))
    doc = prepare_text_for_lda(lif_in.text.value)
    bow = dictionary.doc2bow(doc)
    for topic in lda.get_document_topics(bow):
        # these are tuples of topic_id and score
        topic_id += 1
        lemmas = get_lemmas_from_topic_name(topic_idx.get(topic[0]))
        # print(' %3d %.04f %s' % (topic[0], topic[1], lemmas))
        topics_view.annotations.append(topic_annotation(topic, topic_id, lemmas))
    lif_out.write(fname=fname_out, pretty=True)

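# A minimal sketch of the gensim calls that generate_topics_for_file relies
# on, using a made-up two-document corpus. get_document_topics returns
# (topic_id, score) tuples, which is what the loop above iterates over.
from gensim.corpora import Dictionary
from gensim.models import LdaModel

docs = [["radar", "antenna", "signal"], ["missile", "guidance", "signal"]]
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(d) for d in docs]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2)
bow = dictionary.doc2bow(["radar", "signal"])
for topic_id, score in lda.get_document_topics(bow):
    print(topic_id, score)
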
def show_file(tag, path, subdir):
    data_dir = os.path.join(path, subdir)
    files = [f for f in os.listdir(data_dir) if f[0].isdigit()]
    if len(files) != 1:
        print('Unexpected directory contents')
        return
    data_file = os.path.join(data_dir, files[0])
    print("\n%s%s/%s%s" % (BLUE, subdir, os.path.basename(data_file), END))
    lif = Container(json_file=data_file).payload
    count = 0
    print()
    for view in lif.views:
        for anno in view.annotations:
            if anno.type == tag:
                p1, p2 = anno.start, anno.end
                text = lif.text.value[p1:p2]
                if tag.endswith('Sentence'):
                    print("%s" % ('>' * WIDTH))
                    print(text)
                    input()
                else:
                    category = anno.features.get('category')
                    if category in ('number', 'ordinal', 'percent', 'money', 'misc'):
                        continue
                    if category is not None:
                        category = '%-20s' % (category + ':')
                    else:
                        category = ''
                    left = "%-25s" % lif.text.value[p1 - 25:p1]
                    right = lif.text.value[p2:p2 + 25]
                    context = "%s%s%s %s %s%s" % (category, left, BLUE, text, END, right)
                    context = context.replace('\n', ' ')
                    print(context)
                    count += 1
                    if count % 25 == 0:
                        input()

def generate_topics(lif, top):
    lda = load_model()
    topic_idx = {topic_id: topic
                 for topic_id, topic in lda.print_topics(num_topics=NUM_TOPICS)}
    dictionary = load_dictionary()
    for fname in os.listdir(lif):
        if not fname.endswith('.lif'):
            continue
        # if not fname.startswith('z'): continue
        topic_id = 0
        print("{}".format(os.path.basename(fname)))
        fname_in = os.path.join(lif, fname)
        fname_out = os.path.join(top, fname)
        lif_in = Container(fname_in).payload
        lif_out = LIF(json_object=lif_in.as_json())
        # just to save some space, we get them from the lif file anyway
        lif_out.metadata = {}
        topics_view = _create_view()
        lif_out.views = [topics_view]
        topics_view.annotations.append(markable_annotation(lif_in))
        doc = prepare_text_for_lda(lif_in.text.value)
        bow = dictionary.doc2bow(doc)
        for topic in lda.get_document_topics(bow):
            # these are tuples of topic_id and score
            topic_id += 1
            lemmas = get_lemmas_from_topic_name(topic_idx.get(topic[0]))
            # print(' %3d %.04f %s' % (topic[0], topic[1], lemmas))
            topics_view.annotations.append(topic_annotation(topic, topic_id, lemmas))
        lif_out.write(fname=fname_out, pretty=True)

def wikify_lif(in_f, wikifier):
    in_lif = Container(in_f).payload
    out_lif = LIF(json_object=in_lif.as_json())
    out_lif.views = []
    out_lif.metadata["wikified_es"] = wikifier.wikify(out_lif.text.value)
    return out_lif

def get_lif(fpath):
    """Return the LIF object in fpath, whether the file holds a bare LIF
    object or a Container with a LIF payload."""
    try:
        lif = Container(fpath).payload
    except KeyError:
        # raised when the file holds a bare LIF object without a discriminator
        lif = LIF(fpath)
    return lif

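# Hypothetical usage of get_lif: the same call works whether the file holds a
# full Container or a bare LIF object; the path below is made up.
lif = get_lif('data/lif/0001/tesseract-300dpi-20p.lif')
print(len(lif.views), 'views')
print(lif.text.value[:80])
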
def create_container(lif_object):
    container = Container()
    container.discriminator = "http://vocab.lappsgrid.org/ns/media/jsonld#lif"
    container.payload = lif_object
    return container

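# Hypothetical round trip: wrap a fresh LIF object in a container and
# serialize it the same way create_lif_file does above.
lif_obj = LIF()
lif_obj.text.value = "Example text."
container = create_container(lif_obj)
print(json.dumps(container.as_json(), indent=4))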