示例#1
0
def create_noun(html_full_noun):
    """Build and run the Cypher ``CREATE`` statement for a scraped noun.

    The page is parsed with ``scrap_noun``. The first cleaned line is split
    on ``" - "``: the part before it is the term, and the last word of the
    part after it is the gender. The last word of the last cleaned line is
    used as the plural. The syllable division has ``"·"`` replaced by ``"-"``.

    Args:
        html_full_noun: Raw HTML of the noun page, forwarded to ``scrap_noun``.

    Returns:
        The Cypher query string that was passed to ``db_utils.execute_query``.

    Raises:
        IndexError: If ``scrap_noun`` yields no cleaned lines, or the first
            cleaned line does not contain ``" - "``.
    """

    def _quoted(value):
        # Escape backslashes first, then double quotes, so scraped text can
        # never terminate the surrounding Cypher string literal early.
        escaped = str(value).replace("\\", "\\\\").replace("\"", "\\\"")
        return "\"" + escaped + "\""

    data = scrap_noun(html_full_noun)
    clean_lines = data[0]
    # Split the header line once instead of once per extracted field.
    header_parts = clean_lines[0].split(" - ")
    main_term = scrapper.ascii_2_portuguese(header_parts[0])
    gender = scrapper.ascii_2_portuguese(header_parts[1]).split(" ")[-1]
    divisao_silabica = scrapper.ascii_2_portuguese(data[1].replace("·", "-"))
    plural = clean_lines[-1].split(" ")[-1]
    query = (
        "CREATE (:Substantivo {nid:%s, divisao_silabica:%s, "
        "genero:%s, plural:%s});"
    ) % (
        _quoted(main_term),
        _quoted(divisao_silabica),
        _quoted(gender),
        _quoted(plural),
    )
    db_utils.execute_query(query=query)
    return query
示例#2
0
def create_verb(html_full_verb):
    """Build and run the Cypher ``CREATE`` statement for a scraped verb.

    The lemma is the text of the first ``<h1>`` element, cut at the first
    ``" -"`` and stripped of spaces. Conjugation lines come from
    ``scrap_verb``; every line index whose string form is a key of
    ``verb_structure`` becomes a list property named by the mapped value.
    For index 31 the value is taken from the past participle returned by
    ``scrap_verb`` instead of from the line itself.

    All properties are collected and joined with ``", "``, so the statement
    stays well formed regardless of which indices are mapped or how many
    lines were scraped.

    Args:
        html_full_verb: Raw HTML of the verb page.

    Returns:
        The Cypher query string that was passed to ``db_utils.execute_query``.

    Raises:
        IndexError: If the page contains no ``<h1>...</h1>`` element.
    """

    def _escape(text):
        # Backslashes first, then double quotes, so scraped text cannot
        # close the surrounding Cypher string literal.
        return text.replace("\\", "\\\\").replace("\"", "\\\"")

    # Index whose value comes from the past participle (verb_data[1]).
    participle_index = 31

    main_verb_token = regex.findall(r"<h1>.+?</h1>", html_full_verb)
    main_verb_token = regex.sub(r"<.+?>", "", main_verb_token[0])
    main_term = str(main_verb_token).split(" -")[0].strip(" ")
    verb_data = scrap_verb(html_full_verb)
    divisao_silabica = verb_data[2].replace("·", "-")

    properties = [
        "nid:\"" + _escape(main_term) + "\"",
        "divisao_silabica:\"" + _escape(divisao_silabica) + "\"",
    ]
    for index, data in enumerate(verb_data[0]):
        key = str(index)
        if key not in verb_structure:
            continue
        if index == participle_index:
            data = scrapper.ascii_2_portuguese(
                verb_data[1].replace(":::", ","))
        # Escape before turning ":::" into list separators, so the inserted
        # quotes remain real delimiters.
        values = _escape(data).replace("Pessoal:::", "").replace(
            ":::", "\",\"").replace(" ", "")
        properties.append(verb_structure[key] + ":[\"" + values + "\"]")

    query = "CREATE (:Verbo {" + ", ".join(properties) + "});"
    db_utils.execute_query(query=query)
    return query
示例#3
0
def scrap_noun(html_full_noun_scrap):
    """Extract the cleaned text lines and syllable division of a noun page.

    Args:
        html_full_noun_scrap: Raw HTML of the noun page, forwarded to
            ``scrap_general``.

    Returns:
        A ``(lines, syllables)`` tuple. ``lines`` is the first element of the
        ``scrap_general`` result with ``":::"`` and ``" : "`` replaced by a
        space, surrounding spaces stripped and empty entries dropped.
        ``syllables`` is the second element converted with
        ``scrapper.ascii_2_portuguese`` and with anything matching
        ``<.+?>`` removed.
    """
    general = scrap_general(html_full_noun_scrap)
    normalized = (
        raw.replace(":::", " ").replace(" : ", " ").strip(" ")
        for raw in general[0]
    )
    # After stripping, an entry is either empty or has visible content.
    kept_lines = [text for text in normalized if text]
    syllables = regex.sub(
        r"<.+?>", "", scrapper.ascii_2_portuguese(general[1]))
    return kept_lines, syllables
示例#4
0
def scrap_verb(html_full_verb_scrap):
    """Extract conjugation lines, past participle and syllables of a verb page.

    Only the fragment after the ``name=maintext`` marker and before the next
    ``</table>`` is considered.

    Args:
        html_full_verb_scrap: Raw HTML of the verb page.

    Returns:
        A ``(lines, participle, syllables)`` tuple. ``lines`` holds the
        non-empty, space-stripped text lines of the fragment with tags
        removed and ``<br>`` turned into ``":::"``. ``participle`` is the
        text following ``"Particípio passado"`` with tags removed and the
        remaining spaces turned into ``":::"``. ``syllables`` is the first
        "Divisão silábica" paragraph converted with
        ``scrapper.ascii_2_portuguese`` and with tags removed.

    Raises:
        IndexError: If the ``name=maintext`` marker, the
            ``"Particípio passado"`` label or the "Divisão silábica"
            paragraph is missing.
    """

    def _collapse(text, doubled, single):
        # Replace repeatedly until no doubled occurrence is left.
        while doubled in text:
            text = text.replace(doubled, single)
        return text

    syllable_pattern = r"<p.+?Divisão silábica.+?</p>"

    page = html_full_verb_scrap.split("name=maintext")[1].split("</table>")[0]
    page = filter_funcs.replace_html_entities(page)
    syllable_matches = regex.findall(syllable_pattern, page)
    page = page.replace("\t", "").replace("\r", "")
    page = page[1:].strip(" ")
    page = regex.sub(syllable_pattern, "", page)

    # Past participle: everything after its label, flattened to ":::"-joined
    # tokens.
    participle = page.split("Particípio passado")[1]
    participle = regex.sub(r"</?t[dr]>", " ", participle)
    participle = regex.sub(r"<.+?>", " ", participle)
    participle = participle.replace("  /  ", "/").replace(" / ", "/")
    participle = _collapse(participle.strip(" "), "  ", " ")
    participle = _collapse(participle, "\n\n", "")
    participle = participle.strip(" ").replace(" ", ":::")

    # Conjugation table: normalise whitespace, mark line breaks, drop tags.
    page = _collapse(page, "  ", " ")
    page = _collapse(page, "\n\n", "\n")
    page = regex.sub(r"<br>", ":::", page)
    # Double spaces are already collapsed, so one pass over " / " is enough;
    # no "  /  " or further " / " can remain afterwards.
    page = regex.sub(r" / ", "/", page)
    text = regex.sub(r"<.+?>", "", page)

    stripped_lines = (row.strip(" ") for row in text.split("\n"))
    kept_lines = [row for row in stripped_lines if row]

    syllables = regex.sub(
        r"<.+?>", "", scrapper.ascii_2_portuguese(syllable_matches[0]))
    return kept_lines, participle, syllables
示例#5
0
def scrap_adverb(html_full_adverb_scrap):
    """Extract lemma, extra information and syllables of an adverb page.

    Args:
        html_full_adverb_scrap: Raw HTML of the adverb page, forwarded to
            ``scrap_general``.

    Returns:
        A dict that may contain ``"lemma"`` (first word of the line holding
        ``" - advérbio"``) and ``"mais_informacoes"`` (the last other line
        seen before a line containing ``"Destaques"``), and always contains
        ``"divisao_silabica"``.
    """
    general = scrap_general(html_full_adverb_scrap)
    result = {}
    for raw in general[0]:
        text = raw.replace(":::", " ").replace(" : ", " ").strip(" ")
        if not text:
            continue
        # Collapse runs of spaces; the text is already stripped, so dropping
        # the empty pieces and re-joining leaves single spaces only.
        text = " ".join(piece for piece in text.split(" ") if piece)
        if " - advérbio" in text:
            result["lemma"] = text.partition(" ")[0]
            continue
        if "Destaques" in text:
            # Nothing after this line is read.
            break
        result["mais_informacoes"] = text
    result["divisao_silabica"] = regex.sub(
        r"<.+?>", "", scrapper.ascii_2_portuguese(general[1]))
    return result
示例#6
0
def scrap_adjective(html_full_adjective_scrap):
    """Extract lemma, inflected forms and syllables of an adjective page.

    Each line has ``":::"`` and ``" : "`` replaced by a space and runs of
    spaces collapsed (as ``scrap_adverb`` does) before it is inspected, so
    the positional word lookups are not shifted by empty tokens.

    Args:
        html_full_adjective_scrap: Raw HTML of the adjective page, forwarded
            to ``scrap_general``.

    Returns:
        A dict that may contain ``"lemma"``, ``"sing_masc"``, ``"sing_femi"``,
        ``"plur_masc"``, ``"plur_femi"`` and ``"more_info"`` (the last line
        that matched none of the other cases), and always contains
        ``"divisao_silabica"``.

    Raises:
        IndexError: If a line containing ``"Singular"`` or ``"Plural"`` has
            fewer than three space-separated words.
    """
    temp_data = scrap_general(html_full_adjective_scrap)
    clean_data = dict()
    for line in temp_data[0]:
        line = line.replace(":::", " ").replace(" : ", " ").strip(" ")
        # The replacements above can leave doubled spaces, which would make
        # split(" ") return empty words and hide the table header.
        while "  " in line:
            line = line.replace("  ", " ")
        if not line or "Masculino Feminino" in line:
            continue
        words = line.split(" ")
        if " - adjetivo" in line:
            clean_data["lemma"] = words[0]
        elif "Singular" in line:
            clean_data["sing_masc"] = words[1]
            clean_data["sing_femi"] = words[2]
        elif "Plural" in line:
            clean_data["plur_masc"] = words[1]
            clean_data["plur_femi"] = words[2]
        else:
            clean_data["more_info"] = line
    divisao_silabica = temp_data[1]
    clean_data["divisao_silabica"] = regex.sub(
        r"<.+?>", "", scrapper.ascii_2_portuguese(divisao_silabica))
    return clean_data