def generate_metadata(data_dir, fname):

    subdir = os.path.split(fname)[0]
    lif_file = os.path.join(data_dir, 'lif', subdir, "tesseract-300dpi-20p.lif")
    ner_file = os.path.join(data_dir, 'ner', subdir, "%s.ner.lif" % subdir)
    mta_file = os.path.join(data_dir, 'mta', subdir, "%s.mta.lif" % subdir)
    ensure_directory(mta_file)

    lif = Container(lif_file).payload
    lif_ner = Container(ner_file).payload
    lif_mta = LIF(json_object=lif.as_json())
    lif_mta.text.value = None
    lif_mta.text.fname = lif_file
    lif_mta.views = []
    lif.metadata["authors"] = []
    lif.metadata["year"] = None

    page_view = lif.get_view("pages")
    ner_view = lif_ner.get_view('v2')

    window = _get_window(page_view)
    lif.metadata["authors"] = _get_authors(lif, ner_view, window)
    lif.metadata["year"] = _get_year(ner_view, window)

    lif_mta.write(fname=mta_file, pretty=True)
Exemplo n.º 2
0
def generate_topics_for_file(data_dir, fname, lda, topic_idx, dictionary):
    topic_id = 0
    #fname_in = os.path.join(data_dir, 'lif', fname[:-5] + '.lif')
    fname_in = os.path.join(data_dir, 'lif', fname)
    fname_out = os.path.join(data_dir, 'top', fname[:-5] + '.lif')
    ensure_directory(fname_out)
    # lif_in = Container(fname_in).payload
    try:
        lif_in = LIF(fname_in)
    except FileNotFoundError:
        print("Warning: file '%s' does not exist" % fname_in)
        return
    lif_out = LIF(json_object=lif_in.as_json())
    # the following three are just to save some space, we get them from the lif
    # file anyway
    lif_out.text.value = None
    lif_out.text.source = fname_in
    lif_out.metadata = {}
    topics_view = _create_view()
    lif_out.views = [topics_view]
    topics_view.annotations.append(markable_annotation(lif_in))
    doc = prepare_text_for_lda(lif_in.text.value)
    bow = dictionary.doc2bow(doc)
    for topic in lda.get_document_topics(bow):
        topic_id += 1
        # these are tuples of topic_id and score
        lemmas = get_lemmas_from_topic_name(topic_idx.get(topic[0]))
        # print('   %3d  %.04f  %s' % (topic[0], topic[1], lemmas))
        topics_view.annotations.append(
            topic_annotation(topic, topic_id, lemmas))
    lif_out.write(fname=fname_out, pretty=True)
Exemplo n.º 3
0
def lookup_technologies(data_dir, fname):
    subdir = os.path.split(fname)[0]
    pos_file = os.path.join(data_dir, 'pos', subdir, "%s.pos.lif" % subdir)
    tex_file = os.path.join(data_dir, 'tex', subdir, "%s.lup.lif" % subdir)
    ensure_directory(tex_file)
    lif = Container(pos_file).payload
    lif_tex = LIF(json_object=lif.as_json())
    pos_view = lif.get_view('v2')
    tex_view = create_view('tex', 'Technology', 'dtriac-pipeline:lookup.py')
    lif_tex.views = [tex_view]
    tokens = [a for a in pos_view.annotations if a.type.endswith('Token')]
    _lookup_technologies_in_tokens(lif, tokens, tex_view)
    lif_tex.write(fname=tex_file, pretty=True)
def generate_sentence_types(data_dir, fname):

    subdir = os.path.split(fname)[0]
    lif_file = os.path.join(data_dir, 'lif', subdir, "tesseract-300dpi-20p.lif")
    spl_file = os.path.join(data_dir, 'spl', subdir, "%s.spl.lif" % subdir)
    sen_file = os.path.join(data_dir, 'sen', subdir, "%s.sen.lif" % subdir)
    ensure_directory(sen_file)

    if DEBUG:
        SENTS.write(">>> %s\n>>> %s\n>>> %s\n\n" % ('-' * 100, fname, '-' * 100))

    lif = Container(lif_file).payload
    lif_spl = Container(spl_file).payload
    lif_sen = LIF(json_object=lif.as_json())

    spl_sentences_view = lif_spl.get_view('v2')
    new_sentences_view = _create_view()
    lif_sen.views = [new_sentences_view]

    good_sentences = 0
    bad_sentences = 0

    for anno in spl_sentences_view.annotations:
        if anno.type.endswith('Sentence'):
            sc = SentenceClassifier(lif, anno, WORDS)
            if sc.is_crap():
                if DEBUG:
                    SENTS.write("---- %f\n%s\n\n" % (sc.ratio, repr(sc.text)))
                anno.features['type'] = 'crap'
                bad_sentences += 1
            else:
                if DEBUG:
                    SENTS.write("++++ %f\n%s\n\n" % (sc.ratio, repr(sc.text)))
                anno.features['type'] = 'normal'
                good_sentences += 1
            new_sentences_view.annotations.append(anno)
    if DEBUG:
        SENTS.write("\nTOTAL GOOD = {:d}\nTOTAL BAD  = {:d}\n\n\n".format(good_sentences, bad_sentences))

    lif_sen.write(fname=sen_file, pretty=True)
def generate_topics_for_file(data_dir, fname, lda, topic_idx, dictionary):
    topic_id = 0
    fname_in = os.path.join(data_dir, 'lif', fname[:-4] + '.lif')
    fname_out = os.path.join(data_dir, 'top', fname[:-4] + '.lif')
    ensure_directory(fname_out)
    lif_in = Container(fname_in).payload
    lif_out = LIF(json_object=lif_in.as_json())
    # just to save some space, we get them from the lif file anyway
    lif_out.metadata = {}
    topics_view = _create_view()
    lif_out.views = [topics_view]
    topics_view.annotations.append(markable_annotation(lif_in))
    doc = prepare_text_for_lda(lif_in.text.value)
    bow = dictionary.doc2bow(doc)
    for topic in lda.get_document_topics(bow):
        topic_id += 1
        # these are tuples of topic_id and score
        lemmas = get_lemmas_from_topic_name(topic_idx.get(topic[0]))
        # print('   %3d  %.04f  %s' % (topic[0], topic[1], lemmas))
        topics_view.annotations.append(
            topic_annotation(topic, topic_id, lemmas))
    lif_out.write(fname=fname_out, pretty=True)
Exemplo n.º 6
0
def generate_sentence_types(ttk, sen, words):
    for fname in os.listdir(ttk):
        if not fname.endswith('.lif'):
            continue
        print("{} ... ".format(os.path.basename(fname)))
        if DEBUG:
            GOOD.write(">>> %s\n>>> %s\n>>> %s\n\n" %
                       ('-' * 100, fname, '-' * 100))
            BAD.write(">>> %s\n>>> %s\n>>> %s\n\n" %
                      ('-' * 100, fname, '-' * 100))
        fname_in = os.path.join(ttk, fname)
        fname_out = os.path.join(sen, fname)
        lif_in = LIF(fname_in)
        lif_out = LIF(json_object=lif_in.as_json())
        sentences_view = _create_view()
        lif_out.views = [sentences_view]
        good_sentences = 0
        bad_sentences = 0
        view = lif_in.get_view('v1')
        for anno in view.annotations:
            if anno.type.endswith('Sentence'):
                sc = SentenceClassifier(lif_in, anno, words)
                if sc.is_crap():
                    if DEBUG:
                        BAD.write(">>> %f\n%s\n\n" % (sc.ratio, sc.text))
                    anno.features['type'] = 'crap'
                    bad_sentences += 1
                else:
                    if DEBUG:
                        GOOD.write(">>> %f\n%s\n\n" % (sc.ratio, sc.text))
                    anno.features['type'] = 'normal'
                    good_sentences += 1
                sentences_view.annotations.append(anno)
        if DEBUG:
            print(" (good={:d} bad={:d})".format(good_sentences,
                                                 bad_sentences))
        lif_out.write(fname=fname_out, pretty=True)
        #break
    print
Exemplo n.º 7
0
def generate_topics(lif, top):

    lda = load_model()
    topic_idx = {
        topic_id: topic
        for topic_id, topic in lda.print_topics(num_topics=NUM_TOPICS)
    }
    dictionary = load_dictionary()

    for fname in os.listdir(lif):

        if not fname.endswith('.lif'):
            continue
        # if not fname.startswith('z'): continue

        topic_id = 0
        print("{}".format(os.path.basename(fname)))
        fname_in = os.path.join(lif, fname)
        fname_out = os.path.join(top, fname)
        lif_in = Container(fname_in).payload
        lif_out = LIF(json_object=lif_in.as_json())
        # just to save some space, we get them from the lif file anyway
        lif_out.metadata = {}
        topics_view = _create_view()
        lif_out.views = [topics_view]

        topics_view.annotations.append(markable_annotation(lif_in))
        doc = prepare_text_for_lda(lif_in.text.value)
        bow = dictionary.doc2bow(doc)
        for topic in lda.get_document_topics(bow):
            topic_id += 1
            # these are tuples of topic_id and score
            lemmas = get_lemmas_from_topic_name(topic_idx.get(topic[0]))
            # print('   %3d  %.04f  %s' % (topic[0], topic[1], lemmas))
            topics_view.annotations.append(
                topic_annotation(topic, topic_id, lemmas))
        lif_out.write(fname=fname_out, pretty=True)
def wikify_lif(in_f, wikifier):
    in_lif = Container(in_f).payload
    out_lif = LIF(json_object=in_lif.as_json())
    out_lif.views = []
    out_lif.metadata["wikified_es"] = wikifier.wikify(out_lif.text.value)
    return out_lif