def generate_metadata(data_dir, fname):

    subdir = os.path.split(fname)[0]
    lif_file = os.path.join(data_dir, 'lif', subdir, "tesseract-300dpi-20p.lif")
    ner_file = os.path.join(data_dir, 'ner', subdir, "%s.ner.lif" % subdir)
    mta_file = os.path.join(data_dir, 'mta', subdir, "%s.mta.lif" % subdir)
    ensure_directory(mta_file)

    lif = Container(lif_file).payload
    lif_ner = Container(ner_file).payload
    lif_mta = LIF(json_object=lif.as_json())
    lif_mta.text.value = None
    lif_mta.text.fname = lif_file
    lif_mta.views = []
    lif.metadata["authors"] = []
    lif.metadata["year"] = None

    page_view = lif.get_view("pages")
    ner_view = lif_ner.get_view('v2')

    window = _get_window(page_view)
    lif.metadata["authors"] = _get_authors(lif, ner_view, window)
    lif.metadata["year"] = _get_year(ner_view, window)

    lif_mta.write(fname=mta_file, pretty=True)
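
# All of the snippets in this collection call an `ensure_directory` helper
# that is never shown. A minimal sketch (an assumption, not the project's
# actual code), creating the parent directory of the target file:

import os

def ensure_directory(fname):
    """Make sure the directory that fname will be written to exists."""
    directory = os.path.dirname(fname)
    if directory:
        os.makedirs(directory, exist_ok=True)
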
def collect(path, n, tag, restriction=None):
    print("# SCRIPT  =  %s" % 'scripts/dtriac-19d/collect_annotations.py')
    print("# PATH    =  %s" % path)
    print("# FILES   =  %s" % n)
    print("# TAG     =  %s" % tag)
    feat, val = None, None
    if restriction is not None:
        feat, val = restriction.split('=')
        print("# FEAT    =  %s=%s" % (feat, val))
    full_tag = "http://vocab.lappsgrid.org/%s" % tag
    processing_step = os.path.split(path)[1]
    subdirs = os.listdir(path)[:int(n)]
    locations = []
    for subdir in subdirs:
        fname = os.path.join(path, subdir,
                             "%s.%s.lif" % (subdir, processing_step))
        lif = Container(fname).payload
        for view in lif.views:
            for annotation in view.annotations:
                if annotation_matches(annotation, full_tag, feat, val):
                    locations.append(annotation.features.get('word'))
    locs = Counter(locations)
    total_count = sum(locs.values())
    print("# HITS    =  %d" % total_count)
    for loc, count in locs.most_common():
        print("%d \t%s" % (count, loc))
def test_lif_file(lif_file):
    """Just print the text of all headers, should give an indication of whether all
    the offsets are correct."""
    lif = Container(json_file=lif_file).payload
    text = lif.text.value
    view = lif.views[0]
    for anno in view.annotations:
        if anno.type.endswith('Header'):
            print("[{}]".format(text[anno.start:anno.end]))
    print('')
Example #4
def test_lif_file(lif_file):
    """Just print the text of all headers, should give an indication of whether all
    the offsets are correct."""
    lif = Container(json_file=lif_file).payload
    text = lif.text.value
    view = lif.views[0]
    for anno in view.annotations:
        page = text[anno.start:anno.end]
        print("<{}> {}".format(anno.id, ' '.join(page[:80].split())))
    print('')
def generate_sentence_types(data_dir, fname):

    subdir = os.path.split(fname)[0]
    lif_file = os.path.join(data_dir, 'lif', subdir, "tesseract-300dpi-20p.lif")
    spl_file = os.path.join(data_dir, 'spl', subdir, "%s.spl.lif" % subdir)
    sen_file = os.path.join(data_dir, 'sen', subdir, "%s.sen.lif" % subdir)
    ensure_directory(sen_file)

    if DEBUG:
        SENTS.write(">>> %s\n>>> %s\n>>> %s\n\n" % ('-' * 100, fname, '-' * 100))

    lif = Container(lif_file).payload
    lif_spl = Container(spl_file).payload
    lif_sen = LIF(json_object=lif.as_json())

    spl_sentences_view = lif_spl.get_view('v2')
    new_sentences_view = _create_view()
    lif_sen.views = [new_sentences_view]

    good_sentences = 0
    bad_sentences = 0

    for anno in spl_sentences_view.annotations:
        if anno.type.endswith('Sentence'):
            sc = SentenceClassifier(lif, anno, WORDS)
            if sc.is_crap():
                if DEBUG:
                    SENTS.write("---- %f\n%s\n\n" % (sc.ratio, repr(sc.text)))
                anno.features['type'] = 'crap'
                bad_sentences += 1
            else:
                if DEBUG:
                    SENTS.write("++++ %f\n%s\n\n" % (sc.ratio, repr(sc.text)))
                anno.features['type'] = 'normal'
                good_sentences += 1
            new_sentences_view.annotations.append(anno)
    if DEBUG:
        SENTS.write("\nTOTAL GOOD = {:d}\nTOTAL BAD  = {:d}\n\n\n".format(good_sentences, bad_sentences))

    lif_sen.write(fname=sen_file, pretty=True)
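
# The SentenceClassifier used above is project code that is not shown. A
# guessed minimal version, assuming WORDS is a set of known words and that
# `ratio` is the fraction of sentence tokens found in that set:

class SentenceClassifier(object):

    def __init__(self, lif, anno, words):
        self.text = lif.text.value[anno.start:anno.end]
        tokens = self.text.split()
        known = len([t for t in tokens if t.lower() in words])
        self.ratio = known / len(tokens) if tokens else 0.0

    def is_crap(self, threshold=0.5):
        # sentences dominated by unknown tokens are treated as OCR junk;
        # the threshold value is an assumption
        return self.ratio < threshold
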
    def _add_view(self, identifier, fname, view_rank):
        """Load fname as either a LIF object or a Container object and select the
        specified view, indicated by an index in the view list. Add the
        identifier to this view and add it to the list of views. Note that some
        files contain LIF objects and others contain Containers with LIF
        embedded. The view we are looking for is the first or second, depending
        on how the processor for those data was set up."""
        try:
            view = Container(fname).payload.views[view_rank]
        except KeyError:
            view = LIF(fname).views[view_rank]
        view.id = identifier
        self.lif.views.append(view)
Example #7
def run_tarsqi_for_file(data_dir, fname):
    lif_file = os.path.join(data_dir, 'lif', fname[:-4] + '.lif')
    ttk_file = os.path.join(data_dir, 'ttk', fname[:-4] + '.lif')
    ensure_directory(ttk_file)
    lif = Container(lif_file).payload
    text = lif.text.value
    doc = parse_text(text)
    if COMPRESS:
        with gzip.open(ttk_file + '.gz', 'wb') as fh:
            doc.print_all_lif(fh)
    else:
        with open(ttk_file, 'w') as out:
            doc.print_all_lif(out)
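
# A possible driver for the function above; the directory layout and the
# COMPRESS flag are assumptions inferred from how the function uses them:

import os

COMPRESS = False

data_dir = '/data/dtriac'                     # hypothetical location
for fname in os.listdir(os.path.join(data_dir, 'lif')):
    if fname.endswith('.lif'):
        # run_tarsqi_for_file() expects the name with a .txt extension
        run_tarsqi_for_file(data_dir, fname[:-4] + '.txt')
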
Example #8
def lookup_technologies(data_dir, fname):
    subdir = os.path.split(fname)[0]
    pos_file = os.path.join(data_dir, 'pos', subdir, "%s.pos.lif" % subdir)
    tex_file = os.path.join(data_dir, 'tex', subdir, "%s.lup.lif" % subdir)
    ensure_directory(tex_file)
    lif = Container(pos_file).payload
    lif_tex = LIF(json_object=lif.as_json())
    pos_view = lif.get_view('v2')
    tex_view = create_view('tex', 'Technology', 'dtriac-pipeline:lookup.py')
    lif_tex.views = [tex_view]
    tokens = [a for a in pos_view.annotations if a.type.endswith('Token')]
    _lookup_technologies_in_tokens(lif, tokens, tex_view)
    lif_tex.write(fname=tex_file, pretty=True)
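
# `_lookup_technologies_in_tokens` is not included. A hedged sketch, assuming
# a flat set of lower-cased technology terms and single-token matches only;
# the TECHNOLOGIES entries and the id scheme are illustrative:

TECHNOLOGIES = {'radar', 'lidar'}

def _lookup_technologies_in_tokens(lif, tokens, tex_view):
    text = lif.text.value
    for n, token in enumerate(tokens):
        word = text[token.start:token.end]
        if word.lower() in TECHNOLOGIES:
            tex_view.annotations.append(
                Annotation({"id": "t%d" % n,
                            "@type": "http://vocab.lappsgrid.org/Technology",
                            "start": token.start,
                            "end": token.end,
                            "features": {"word": word}}))
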
Example #9
def generate_lif(txt, vnc):
    """
    * txt is a plain text file only with the original text value. 
    * vnc (verbnetclass) is a output from clearwsd file (mostly in conll format)
    This function will generate a LIF json file using disambiguation annotation 
    encoded in the vnc file, using txt as top-level `text` field. 
    """
    t = open(txt, encoding="utf-8")
    v = open(vnc, encoding="utf-8")
    lif_obj = LIF()
    cont_obj = Container()
    cont_obj.discriminator = "http://vocab.lappsgrid.org/ns/media/jsonld#lif"
    cont_obj.payload = lif_obj

    raw_text = t.read()
    t.close()
    lif_obj.text.value = raw_text

    vnc_view = View()
    lif_obj.views.append(vnc_view)
    vnc_view.id = "verbnettag"
    vnc_view.metadata['contains'] = {vocab('SemanticTag'): {}}

    annotations = [line for line in v if line.startswith('#')]
    v.close()
    for annotation in annotations:
        splitted = annotation.split('\t')[0].split()

        oid = splitted[1]
        osent = splitted[2]
        otoken = splitted[3]
        olemma = " ".join(splitted[4:-1])  # some lemmas have space inside
        olabel = splitted[-1]
        properly_annotated = re.match(r'\d+\[(\d+),(\d+)\]', otoken)
        if properly_annotated is None:
            continue
        s, e = map(int, properly_annotated.groups())
        ann = {}
        ann["id"] = "vnc_" + oid
        ann["start"] = s
        ann["end"] = e
        ann["@type"] = vocab("SemanticTag")
        ann["features"] = {
            "tags": [olabel],
            "type": "VerbNetClass",
            "lemma": olemma,
            "text": raw_text[s:e]
        }
        ann_obj = Annotation(ann)
        vnc_view.annotations.append(ann_obj)
    cont_obj.write()
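
# The parsing above implies clearwsd comment lines shaped roughly like the
# following (field order inferred from the code, not from documentation):
#
#   # 12 3 5[120,127] look_up look_up-35.5
#
# which would yield oid='12', osent='3', otoken='5[120,127]' (token index
# plus character offsets), olemma='look_up' and olabel='look_up-35.5'.
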
Example #10
    def _add_view(self, identifier, fname, view_id):
        """Load fname as either a LIF object or a Container object and select
        the specified view, indicated by an index in the view list. Add the
        identifier to this view and add it to the list of views."""
        # Note that some files contain LIF objects and others contain Containers
        # with LIF embedded. The view we are looking for is the first or second,
        # depending on how the processor for those data was set up.
        try:
            view = Container(fname).payload.views[view_id]
        except KeyError:
            # this happens when we try to get a discriminator attribute from a LIF object
            view = LIF(fname).views[view_id]
        view.id = identifier
        self.lif.views.append(view)
def _collect_data(data_dir, filelist, start, end):
    all_data = []
    # the first two especially occur in most abstracts, so ignore them
    words_to_ignore = {'title', 'abstract', 'result', 'study'}
    for n, fname in elements(filelist, start, end):
        print("%07d  %s" % (n, fname))
        fpath = os.path.join(data_dir, 'lif', fname[:-4] + '.lif')
        lif = Container(fpath).payload
        text_data = prepare_text_for_lda(lif.text.value)
        text_data = [w for w in text_data if w not in words_to_ignore]
        all_data.append(text_data)
    token_count = sum([len(d) for d in all_data])
    print('\nToken count = %d' % token_count)
    return all_data
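
# `prepare_text_for_lda` is shared project code that is not shown. A rough
# sketch of what such a helper typically does for gensim LDA (lower-cased
# tokenization plus stopword removal); the real version may also lemmatize:

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

def prepare_text_for_lda(text):
    return [token for token in simple_preprocess(text)
            if token not in STOPWORDS]
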
Example #12
def _collect_data(lif_dir):
    all_data = []
    for fname in os.listdir(lif_dir):
        print("  {}".format(os.path.basename(fname)))
        fpath = os.path.join(lif_dir, fname)
        lif = Container(fpath).payload
        text_data = prepare_text_for_lda(lif.text.value)
        all_data.append(text_data)
        if fname.startswith('888'):
            break
    print('')
    print(len(all_data), 'documents')
    print(sum([len(d) for d in all_data]), 'tokens')
    return all_data
Example #13
def create_lif_file(json_file, lif_file, txt_file, test=False):
    print("Creating {}".format(lif_file))
    with codecs.open(json_file, encoding='utf8') as fh_in, \
         codecs.open(lif_file, 'w', encoding='utf8') as fh_out_lif, \
         codecs.open(txt_file, 'w', encoding='utf8') as fh_out_txt:
        json_obj = json.loads(fh_in.read())
        lif_obj = LIF()
        _add_metadata(lif_obj, json_obj)
        _add_view(lif_obj, json_obj)
        _add_rest(lif_obj, json_obj)
        container = Container()
        container.discriminator = "http://vocab.lappsgrid.org/ns/media/jsonld#lif"
        container.payload = lif_obj
        fh_out_lif.write(json.dumps(container.as_json(), indent=4))
        fh_out_txt.write(container.payload.text.value)
    if test:
        test_lif_file(lif_file)
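
# Possible usage; the file names are illustrative:
#
#   create_lif_file('doc.json', 'doc.lif', 'doc.txt', test=True)
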
    def __init__(self, fname, data_dir, lif_file, mta_file,
                 top_file, ner_file, sen_file, tex_file, wik_file):

        """Build a single LIF object with all relevant annotations. The annotations
        themselves are stored in the Annotations object in self.annotations."""
        self.id = int(os.path.split(fname)[0])
        self.fname = fname
        self.data_dir = data_dir
        self.lif = Container(lif_file).payload
        self.meta = LIF(mta_file)
        self.wikis = LIF(wik_file).metadata['wikified_es']
        self._add_views(ner_file, sen_file, tex_file, top_file)
        self.lif.metadata["filename"] = self.fname
        self.lif.metadata["year"] = self._get_year()
        self.annotations = Annotations(self.id, fname, doc=self,
                                       text=self.lif.text.value)
        self.annotations.text = self.lif.text.value
        self._collect_allowed_offsets()
        self._collect_annotations()
Example #15
def show_file(sourcepath, datapath, subdir):
    sourcefile = os.path.join(sourcepath, subdir, 'tesseract-300dpi-20p.txt')
    datafile = os.path.join(datapath, subdir, 'tesseract-300dpi-20p.lif')
    print("\n%s%s/%s%s" % (BLUE, subdir, os.path.basename(datafile), END))
    lif = Container(json_file=datafile).payload
    annotations = lif.views[0].annotations
    pages = get_pages(sourcefile)
    if len(pages) != len(annotations):
        print(
            "WARNING: unequal number of pages and page annotations (%d != %d)"
            % (len(pages), len(annotations)))
    for page, annotation in zip(pages, annotations):
        if (HEADERS_FOOTERS_ONLY and annotation.features.get('header') is None
                and annotation.features.get('footer') is None):
            continue
        print_annotation(annotation)
        print_page(page)
        print_page(lif.text.value[annotation.start:annotation.end])
        input()
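
# `get_pages` is not shown. Tesseract's plain-text output separates pages
# with form-feed characters, so a plausible minimal version is:

def get_pages(sourcefile):
    with open(sourcefile, encoding='utf8') as fh:
        return fh.read().split('\f')
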
Example #16
    def __init__(self, fname, lif_file, ner_file, tex_file, ttk_file, sen_file,
                 rel_file, vnc_file, top_file, ontology):
        """Build a single LIF object with all relevant annotations. The annotations
        themselves are stored in the Annotations object in self.annotations."""
        self.id = Document.new_id()
        self.fname = fname
        self.ontology = ontology
        self.lif = Container(lif_file).payload
        self._add_views(ner_file, tex_file, ttk_file, sen_file, rel_file,
                        vnc_file, top_file)
        self.lif.metadata["filename"] = self.fname
        self.lif.metadata["title"] = self._get_title()
        self.lif.metadata["year"] = self._get_year()
        self.lif.metadata["abstract"] = self._get_abstract()
        self.annotations = Annotations(fname,
                                       doc=self,
                                       docid=self.id,
                                       text=self.lif.text.value)
        self.annotations.text = self.lif.text.value
        self._collect_allowed_offsets()
        self._collect_annotations()
def generate_topics_for_file(data_dir, fname, lda, topic_idx, dictionary):
    topic_id = 0
    fname_in = os.path.join(data_dir, 'lif', fname[:-4] + '.lif')
    fname_out = os.path.join(data_dir, 'top', fname[:-4] + '.lif')
    ensure_directory(fname_out)
    lif_in = Container(fname_in).payload
    lif_out = LIF(json_object=lif_in.as_json())
    # emptied just to save some space; the metadata can be read from the lif file anyway
    lif_out.metadata = {}
    topics_view = _create_view()
    lif_out.views = [topics_view]
    topics_view.annotations.append(markable_annotation(lif_in))
    doc = prepare_text_for_lda(lif_in.text.value)
    bow = dictionary.doc2bow(doc)
    for topic in lda.get_document_topics(bow):
        topic_id += 1
        # these are tuples of topic_id and score
        lemmas = get_lemmas_from_topic_name(topic_idx.get(topic[0]))
        # print('   %3d  %.04f  %s' % (topic[0], topic[1], lemmas))
        topics_view.annotations.append(
            topic_annotation(topic, topic_id, lemmas))
    lif_out.write(fname=fname_out, pretty=True)
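
# `markable_annotation` and `topic_annotation` are project helpers that are
# not included. Guessed sketches, built like the Annotation dictionaries in
# the generate_lif example above; ids, types and feature names are assumptions:

def markable_annotation(lif):
    # one markable spanning the entire document text
    return Annotation({"id": "m1",
                       "@type": "http://vocab.lappsgrid.org/Markable",
                       "start": 0,
                       "end": len(lif.text.value)})

def topic_annotation(topic, topic_id, lemmas):
    # topic is a (topic_number, score) pair from lda.get_document_topics()
    return Annotation({"id": "t%d" % topic_id,
                       "@type": "http://vocab.lappsgrid.org/SemanticTag",
                       "features": {"type": "gensim-topic",
                                    "topic_id": topic[0],
                                    "score": float(topic[1]),
                                    "lemmas": lemmas}})
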
def show_file(tag, path, subdir):
    data_dir = os.path.join(path, subdir)
    files = [f for f in os.listdir(data_dir) if f[0].isdigit()]
    if len(files) != 1:
        print('Unexpected directory contents')
        return
    data_file = os.path.join(data_dir, files[0])
    print("\n%s%s/%s%s" % (BLUE, subdir, os.path.basename(data_file), END))
    lif = Container(json_file=data_file).payload
    count = 0
    print()
    for view in lif.views:
        for anno in view.annotations:
            if anno.type == tag:
                p1, p2 = anno.start, anno.end
                text = lif.text.value[p1:p2]
                if tag.endswith('Sentence'):
                    print("%s" % ('>' * WIDTH))
                    print(lif.text.value[p1:p2])
                    input()
                else:
                    category = anno.features.get('category')
                    if category in ('number', 'ordinal', 'percent', 'money',
                                    'misc'):
                        continue
                    if category is not None:
                        category = '%-20s' % (category + ':')
                    else:
                        category = ''
                    left = "%-25s" % lif.text.value[p1 - 25:p1]
                    right = lif.text.value[p2:p2 + 25]
                    context = "%s%s%s %s %s%s" % (category, left, BLUE, text,
                                                  END, right)
                    context = context.replace('\n', ' ')
                    print(context, end='\n')
                    count += 1
                    if count % 25 == 0:
                        input()
Example #19
def generate_topics(lif, top):

    lda = load_model()
    topic_idx = {
        topic_id: topic
        for topic_id, topic in lda.print_topics(num_topics=NUM_TOPICS)
    }
    dictionary = load_dictionary()

    for fname in os.listdir(lif):

        if not fname.endswith('.lif'):
            continue
        # if not fname.startswith('z'): continue

        topic_id = 0
        print("{}".format(os.path.basename(fname)))
        fname_in = os.path.join(lif, fname)
        fname_out = os.path.join(top, fname)
        lif_in = Container(fname_in).payload
        lif_out = LIF(json_object=lif_in.as_json())
        # emptied just to save some space; the metadata can be read from the lif file anyway
        lif_out.metadata = {}
        topics_view = _create_view()
        lif_out.views = [topics_view]

        topics_view.annotations.append(markable_annotation(lif_in))
        doc = prepare_text_for_lda(lif_in.text.value)
        bow = dictionary.doc2bow(doc)
        for topic in lda.get_document_topics(bow):
            topic_id += 1
            # these are tuples of topic_id and score
            lemmas = get_lemmas_from_topic_name(topic_idx.get(topic[0]))
            # print('   %3d  %.04f  %s' % (topic[0], topic[1], lemmas))
            topics_view.annotations.append(
                topic_annotation(topic, topic_id, lemmas))
        lif_out.write(fname=fname_out, pretty=True)
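
# `load_model` and `load_dictionary` are assumed to load pre-trained gensim
# artifacts; the paths are placeholders:

from gensim.models import LdaModel
from gensim.corpora import Dictionary

def load_model():
    return LdaModel.load('topics.lda')

def load_dictionary():
    return Dictionary.load('topics.dict')
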
def wikify_lif(in_f, wikifier):
    in_lif = Container(in_f).payload
    out_lif = LIF(json_object=in_lif.as_json())
    out_lif.views = []
    out_lif.metadata["wikified_es"] = wikifier.wikify(out_lif.text.value)
    return out_lif
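
# Possible usage, assuming a wikifier object that exposes the wikify()
# method used above (stubbed here for illustration):

class StubWikifier(object):
    def wikify(self, text):
        return []   # a real wikifier would return a list of entity links

out_lif = wikify_lif('doc.lif', StubWikifier())
out_lif.write(fname='doc.wik.lif', pretty=True)
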
Example #21
def get_lif(fpath):
    # try reading the file as a Container first, fall back to a bare LIF file
    try:
        lif = Container(fpath).payload
    except Exception:
        lif = LIF(fpath)
    return lif
Example #22
def create_container(lif_object):
    container = Container()
    container.discriminator = "http://vocab.lappsgrid.org/ns/media/jsonld#lif"
    container.payload = lif_object
    return container
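
# Typical usage, mirroring the bare cont_obj.write() call in the
# generate_lif example above; the text is illustrative:

lif_obj = LIF()
lif_obj.text.value = "Some document text."
create_container(lif_obj).write()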