Example #1
def add_new_meta(meta: Metadata):
    """
    Appends a new metadata record to the meta file. Columns (separated by '#'):
    Назив прописа  # ELI#Напомена издавача#Додатне информације#Врста прописа#Доносилац#Област#Група#Датум усвајања#Гласило и датум објављивања#Датум ступања на снагу основног текста#Датум примене#Правни претходник#Издавач#filename#Верзија на снази од#Почетак примене верзије#Број акта
    :param meta: Metadata object whose fields are written as one '#'-separated line
    :return: None, appends the line to data/meta/allmeta.csv
    """
    deli = "#"
    fields = [meta.act_name, meta.eli, meta.napomena_izdavaca, meta.dodatne_informacije,
              meta.vrsta_propisa, meta.donosilac, meta.oblast, meta.grupa,
              meta.datum_usvajanja, meta.glasilo_i_datum, meta.datum_stupanja,
              meta.pravni_prethodnik, meta.izdavac, meta.filename]
    new_line = deli.join(fields) + "\n"
    # Append as UTF-8 so Cyrillic metadata is written consistently across platforms.
    with open(utilities.get_root_dir() + "/data/meta/allmeta.csv",
              mode="a", encoding="utf-8") as file_meta:
        file_meta.write(new_line)
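
A minimal usage sketch, assuming a Metadata instance (called meta_obj here) has already been populated with the attributes listed above; how Metadata is constructed is not shown in this example:

# Sketch only: meta_obj is assumed to be a Metadata instance with act_name,
# eli and the other fields through filename already set.
add_new_meta(meta_obj)
# One '#'-separated record is appended to data/meta/allmeta.csv.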
Example #2
def generate_owl(folder_path, filenames=None):
    result = get_tf_idf_values_document(folder_path, filenames=filenames, return_just_words=False)
    stat_file = open("stats.txt", "w", encoding="utf-8")
    for el in result:
        s_file = el[0]
        with open(folder_path + "\\" + s_file, "r", encoding="utf-8") as f:
            info = f.read()
        # for q in el[1]:
        #     print(q)

        stat_len = len(re.findall(r'\w+', info))

        clans_data = util.from_content_to_act_list(info)
        clan_info = util.gather_clans(info)
        # Open the file, find the structures, generate the articles (članovi) and add them
        # print(clans_data)
        meta = utilities.get_meta(check_meta(s_file), utilities.get_root_dir() + "\\data\\meta\\allmeta.csv")
        if meta is None:
            print("Warn - " + el[0] + " missing meta")
            continue
        latin_name = to_latin(meta.act_name).replace(' ', '_')
        # latin_name = meta.act_name.replace(" ", '_')
        dis = {}
        curr_zakon = owl.add_legal_resource(latin_name)
        add_meta_to_act(curr_zakon, meta)

        i = 0
        for info in clan_info:

            curr_sub = owl.add_legal_sub(latin_name.split(":")[0] + '_' + info.replace(' ', '_'))
            is_about = inside_important(el[1], clans_data, i)
            for new_concept in is_about:
                if new_concept not in dis:
                    dis[new_concept] = owl.add_concept(new_concept)

            if len(is_about) != 0:
                curr_sub.is_about = [dis[s] for s in is_about]
            curr_sub.is_part_of = [curr_zakon]
            i = i + 1
        curr_zakon.is_about = [dis[s] for s in dis]
        write_stat_info(curr_zakon, dis, stat_file, stat_len)
    stat_file.close()
    owl.save()
Example #3
def apply_akn_tags(text: str,
                   meta_name: str,
                   skip_tfidf_ner=False,
                   ner="crf",
                   meta_data=None):
    """
    Applies Akoma Ntoso 3.0 tags for Republic of Serbia regulations to the given text.
    :param text: HTML or plain text
    :param meta_name: name under which the metadata was added to the file (the 15th tag in meta);
    use MetadataBuilder.add_new_meta or add the entry manually in Akoma/data/meta/allmeta.csv
    :param skip_tfidf_ner: if True, skip adding references (TLCConcept for the document and TLC entries for NER); speeds up execution significantly
    :param ner: model to use, one of 'crf', 'spacy', 'spacy_default', 'reldi'; 'crf' is the best so far but the slowest
    :param meta_data: Metadata instance (see form_akoma/Metadata.py); if passed, meta_name is ignored, because allmeta.csv is not searched and the passed data is used instead
    :return: labeled XML string
    """
    global ner_list
    akoma_root = init_akoma.init_xml("act")
    repaired = False
    if text.find("<") == -1:
        repaired = True
    else:
        text = regex_patterns.strip_html_tags_exept(text)
    if not repaired:
        try:
            html_root = ET.fromstring("<article>" + text + "</article>")
        except Exception as e:
            got = BeautifulSoup(text, "lxml")
            text = got.prettify().replace("<html>", "").replace(
                "</html>", "").replace("<body>", "").replace("</body>", "")
            html_root = ET.fromstring("<article>" + text + "</article>")
    metabuilder = MetadataBuilder("data/meta/allmeta.csv")
    if meta_data is None:
        metabuilder.build(meta_name, akoma_root, skip_tfidf_ner)
    else:
        metabuilder.build(meta_name, akoma_root, skip_tfidf=skip_tfidf_ner, passed_meta=meta_data)
    # print(ETree.prettify(akoma_root))
    builder = AkomaBuilder(akoma_root)
    if not repaired:
        reasoner = BasicReasoner(HTMLTokenizer(html_root), builder)
    else:
        reasoner = BasicReasoner(BasicTokenizer(text), builder)
    reasoner.start(metabuilder)

    if reasoner.current_hierarchy[4] == 0:
        akoma_root = init_akoma.init_xml("act")
        metabuilder = MetadataBuilder("data/meta/allmeta.csv")
        if meta_data is None:
            metabuilder.build(meta_name, akoma_root, skip_tfidf=skip_tfidf_ner)
        else:
            metabuilder.build(meta_name,
                              akoma_root,
                              skip_tfidf=skip_tfidf_ner,
                              passed_meta=meta_data)

        builder = AkomaBuilder(akoma_root)
        if not repaired:
            reasoner = OdlukaReasoner(HTMLTokenizer(html_root), builder)
        else:
            reasoner = OdlukaReasoner(BasicTokenizer(text), builder)
        reasoner.start(metabuilder)

    # Temporarily protect already escaped entities so they survive the prettify/unescape
    # step below (~manje; = '<', ~vece; = '>', ~navod; = '"').
    result_str = builder.result_str().replace("&lt;", "~manje;").replace(
        "&gt;", "~vece;").replace("&quot;", "~navod;")
    if not skip_tfidf_ner:
        send_to_NER(akoma_root)
        if ner == "crf":
            map_ret = do_ner_on_sentences(ner_list)
        elif ner == "spacy":
            map_ret = do_spacy_ner(ner_list, custom=True)
        elif ner == "spacy_default":
            map_ret = do_spacy_ner(ner_list, custom=False)
        elif ner == "reldi":
            map_ret = {}
            print("Waiting for access to reldi NER from devs, TODO for future")
            exit(-1)
        if ner == "crf" or ner == "spacy" or ner == "spacy_default" or ner == "reldi":
            fix_dates(map_ret)
            events = utilities.regex_events(
                regex_patterns.strip_html_tags(text))
            utilities.entities_add_date(map_ret, events)  # Regex adding dates
            add_ner_tags(map_ret, akoma_root, metabuilder)
        ner_list.clear()

    try:
        result_stablo = add_refs(akoma_root, result_str,
                                 metabuilder.uri_expression)
    except Exception as e:
        file_ref_exeption = open(utilities.get_root_dir() + "/data/" +
                                 "za_ninu.txt",
                                 mode="a+")
        file_ref_exeption.write(meta_name + ":" + str(e) + "\n")
        file_ref_exeption.close()
        return result_str
    result_str = ETree.prettify(result_stablo).replace("&lt;", "<") \
        .replace("&gt;", ">").replace("&quot;", "\"").replace('<references source="#somebody"/>', "")

    result_str = result_str.replace("~vece;", "&gt;").replace(
        "~manje;", "&lt;").replace("~navod;", "&quot;")
    return result_str
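
A minimal usage sketch, assuming an act stored under data/acts with a matching row in allmeta.csv; the file name "1.html" is only illustrative:

# Sketch only: "1.html" is illustrative and must have a matching entry in
# Akoma/data/meta/allmeta.csv (or pass meta_data= with a Metadata instance instead).
with open(utilities.get_root_dir() + "/data/acts/1.html", "r", encoding="utf-8") as act_file:
    html_text = act_file.read()
labeled_xml = apply_akn_tags(html_text, "1.html", skip_tfidf_ner=True, ner="crf")
print(labeled_xml[:500])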
Example #4
if __name__ == "__main__":

    nastavi = "1.html"
    only_annotated = True  # process only annotated files
    idemo = False
    stani = [
        "1005.html",
        "980.html",
        "986.html",
        "981.html",
        "210.html",
        "1033.html",  # problematic, TO CHECK 176
        "180.html"
    ]  # Large files
    location_source = utilities.get_root_dir() + "/data/acts"
    annotated_source = utilities.get_root_dir() + "/data/annotated"
    fajls = utilities.sort_file_names(os.listdir(location_source))
    if only_annotated is True:
        fajls = utilities.sort_file_names(os.listdir(annotated_source))
        fajls = [el.replace(".xml", ".html") for el in fajls]
        idemo = True

    for fajl in fajls:
        if fajl == nastavi:
            idemo = True
        if not idemo:
            continue
        if fajl in stani:
            continue
        # if fajl != "2.html":
Example #5
        except FileNotFoundError:
            print(">Error tf-idf FileNotFoundError:" + check)
            continue
        all_lines = "".join(file.readlines())

        list_words = get_tf_idf_values_from_text(all_lines, return_just_words=return_just_words, threshold=threshold,
                                                 max_elements=max_elements,
                                                 latin=latin, debug=debug)
        if with_file_names:
            results.append([filename, list_words])
        else:
            results.append(list_words)
        if debug:
            print(results[-1])
    return results


if __name__ == '__main__':
    # filenames , folderPath = get_file_names("data", "aktovi_raw_lat")
    filenames = ["1.html", "2.html"]
    path_folder = utilities.get_root_dir().replace("\\", "/") + "/data/acts"
    tf_idf_values = get_tf_idf_values_document(path_folder, filenames=filenames, return_just_words=False, with_file_names=True, latin=False)
    with open(path_folder + "/" + filenames[0], mode="r", encoding="utf-8") as got_file:
        text = got_file.read()
    #tf_idf_val2 = get_tf_idf_values_from_text(text, return_just_words=True, latin=False)
    #print(tf_idf_val2)
    print(tf_idf_values)
    for el in tf_idf_values:
        print([item[0] for item in el])  # file names, when file names are also returned
        print([item[1] for item in el])  # words, when file names are also returned
Example #6
from os import path
from owlready2 import get_ontology
try:
    from Akoma.utilities.utilities import get_root_dir
except ModuleNotFoundError:
    try:
        from utilities.utilities import get_root_dir
    except ModuleNotFoundError:
        print("Error in modules")
        exit(-1)
cls_legal_resource = "LegalResource"
cls_legal_resource_sub = "LegalResourceSubdivision"
p_is_about = "is_about"


pather = path.dirname(__file__)
onto_path = get_root_dir() + "\\semanticki\\"


onto = get_ontology(onto_path + "eli.rdf")
onto.load()
skos = onto.get_namespace("http://www.w3.org/2004/02/skos/core")
concept_class = [s for s in onto.Language.ancestors() if s.name == "Concept"][0]


def save():
    onto.save("output.rdf")


def add_instance(class_name, instance_name):
    # Look up the ontology class by name and create a named individual of it.
    return getattr(onto, class_name)(instance_name)
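
A minimal usage sketch of the helpers above; the instance names are illustrative:

# Sketch only: instance names are illustrative.
resource = add_instance(cls_legal_resource, "Zakon_o_radu")
subdivision = add_instance(cls_legal_resource_sub, "Zakon_o_radu_clan_1")
subdivision.is_part_of = [resource]  # is_part_of and is_about are assigned the same way elsewhere in the project
save()  # writes output.rdf to the current working directory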
Example #7
        add_meta_to_act(curr_zakon, meta)

        i = 0
        for info in clan_info:

            curr_sub = owl.add_legal_sub(latin_name.split(":")[0] + '_' + info.replace(' ', '_'))
            is_about = inside_important(el[1], clans_data, i)
            for new_concept in is_about:
                if new_concept not in dis:
                    dis[new_concept] = owl.add_concept(new_concept)

            if len(is_about) != 0:
                curr_sub.is_about = [dis[s] for s in is_about]
            curr_sub.is_part_of = [curr_zakon]
            i = i + 1
        curr_zakon.is_about = [dis[s] for s in dis]
        write_stat_info(curr_zakon, dis, stat_file, stat_len)
    stat_file.close()
    owl.save()


if __name__ == '__main__':
    from os import listdir
    from os.path import isfile, join

    base_path = utilities.get_root_dir()
    folder_path = base_path + "/data/lat_acts"
    only_files = [f for f in listdir(folder_path) if isfile(join(folder_path, f))]
    ordered = [str(el) + ".txt" for el in range(1, 200)]
    generate_owl(folder_path, filenames=ordered)  # alternatives: only_files[:10] or filenames=["86.txt", "200.txt"]