示例#1
0
def parse_html(html_str, court_name):
    """Parse a judgment listing page and store one row per linked PDF.

    Anchors inside <div id="CPHBody_PanelList"> carry the case number plus
    a trailing 10-character judgment date in their text.

    Args:
        html_str: raw HTML of the listing page.
        court_name: court identifier; doubles as the destination table name
            and the Tracker row key.

    Returns:
        True on success, False if any exception occurred (the Tracker error
        counter is incremented in that case).
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        div = soup.find_all('div', {'id': 'CPHBody_PanelList'})[0]
        a_list_soup = BeautifulSoup(str(div), "html.parser")
        a_list = a_list_soup.find_all('a')

        # Deduplicate anchors; note set() does not preserve document order.
        a_list_unique = list(set(a_list))
        for a in a_list_unique:
            # Per-court kill switch checked before each entry.
            # NOTE(review): SQL throughout this function is built by string
            # concatenation; if court_name or scraped text can contain
            # quotes this is injectable — prefer parameterized queries in
            # the query helpers.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break

            # Anchor text is "<case number><dd-mm-yyyy>"; the trailing 10
            # characters are the judgment date, the rest is the case number
            # (dashes stripped).
            case_no = escape_string(str(str(a.text)[:-10]).replace("-", ""))
            judgment_date = escape_string(str(a.text)[-10:])

            a_link = a.get('href')
            pdf_data = escape_string(
                request_pdf(base_url + a_link, case_no, court_name))
            pdf_file = escape_string(base_url + a_link)

            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, judgment_date, pdf_file, pdf_filename) " \
                                                               "VALUE ('" + case_no + "', '" + judgment_date + "', '" \
                            + pdf_file + "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)

                update_query("UPDATE " + court_name + " SET pdf_data = '" +
                             str(pdf_data) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '"
                    + str(court_name) + "'")

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
            str(court_name) + "'")
        return False
def parse_html(html_str, court_name):
    """Parse a judgment list (one <a> per PDF inside the first <ul>) and
    insert each case into the court's table.

    The case number is the PDF file name without its extension; its last
    10 characters (parentheses stripped) are taken as the judgment date.

    Args:
        html_str: raw HTML of the listing page.
        court_name: destination table name / Tracker row key.

    Returns:
        True on success, False if any exception occurred (the Tracker
        error counter is incremented in that case).
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        ul = soup.find_all('ul')[0]
        ul_soup = BeautifulSoup(str(ul), "html.parser")
        li_list = ul_soup.find_all('li')

        for li in li_list:
            # Per-court kill switch checked before each entry.
            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break

            a = BeautifulSoup(str(li), "html.parser").a
            a_link = a.get('href')

            # Case number = last path component minus ".pdf".
            case_no = str(a_link[a_link.rfind("/")+1:]).replace('.pdf', '')
            judgment_date = escape_string(case_no[-10:].replace('(', '').replace(')', ''))
            pdf_data = escape_string(request_pdf(base_url + a_link, case_no, court_name))
            pdf_file = escape_string(base_url + a_link)

            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, judgment_date, pdf_file, pdf_filename) " \
                                                               "VALUE ('" + case_no + "', '" + judgment_date + "', '" \
                            + pdf_file + "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)

                # Note: this parser stores the PDF contents in text_data.
                update_query("UPDATE " + court_name + " SET text_data = '" + str(pdf_data) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
示例#3
0
def parse_html(html_str, court_name, appeal_type):
    """Parse a bordered results table and store one row per appeal.

    The first <table class="table table-bordered"> is read row by row;
    <td> cells are positional (2 = case no, 3 = date of order, 4 = parties
    separated by "V/s", 5 = PDF link).  Row 1 is the header and is skipped.

    Args:
        html_str: raw HTML of the listing page.
        court_name: destination table name / Tracker row key.
        appeal_type: stored verbatim in the appeal_type column.

    Returns:
        True on success, False on any exception (the Tracker error counter
        is incremented in that case).
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        table_soup = BeautifulSoup(
            str(soup.find_all('table', {'class': 'table table-bordered'})[0]),
            'html.parser')
        tr_list = table_soup.find_all('tr')

        tr_count = 0
        for tr in tr_list:

            # Per-court kill switch: abort this page when set.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break

            tr_count += 1
            if tr_count == 1:
                continue  # header row

            # "NULL" string literals double as not-yet-parsed sentinels.
            case_no = "NULL"
            date_of_order = "NULL"
            appellant = "NULL"
            respondent = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False

            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')

            i = 0
            for td in td_list:
                i += 1
                if i == 1:
                    continue  # serial-number column

                if i == 2:
                    case_no = escape_string(
                        str(td.text).strip().replace("\n", ""))

                if i == 3:
                    date_of_order = escape_string(
                        str(td.text).strip().replace("\n", ""))

                # if select_count_query(str(court_name), str(case_no), 'date_of_order', date_of_order):
                #     insert_check = True

                if i == 4:
                    # Cell holds "appellant V/s respondent"; an absent
                    # separator raises IndexError, caught by the handler
                    # below (aborting the whole page).
                    party = str(td.decode_contents()).split("V/s")
                    appellant = escape_string(str(party[0]))
                    respondent = escape_string(str(party[1]))

                if i == 5:
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    # Normalize Windows-style path separators in the URL.
                    pdf_url = str(base_url + a_tag.get('href')).replace(
                        '\\', '/')
                    pdf_file = escape_string(pdf_url)
                    pdf_data = escape_string(
                        request_pdf(pdf_url, case_no, court_name))

            # if case_no != "NULL" and insert_check:
            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, date_of_order, appellant, respondent, " \
                                                               "pdf_file, appeal_type, pdf_filename) VALUE ('" + \
                            case_no + "', '" + date_of_order + "', '" + appellant + "', '" + respondent + "', '" + \
                            pdf_file + "', '" + appeal_type + "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)

                update_query("UPDATE " + court_name + " SET pdf_data = '" +
                             str(pdf_data) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '"
                    + str(court_name) + "'")

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
            str(court_name) + "'")
        return False
def parse_html(html_str, court_name, bench_code):
    """Parse per-case tables and insert case metadata plus judgment text.

    Each <table> describes one case; <td> cells are positional
    (1 = case no, 2 = petitioner, 4 = respondent, 6 = judgment date,
    7 = judge, 8 = text link, 9 = PDF link).

    Args:
        html_str: raw HTML of the listing page.
        court_name: destination table name / Tracker row key.
        bench_code: bench identifier stored (unquoted) with each row.

    Returns:
        True on success, False on any exception (the Tracker error counter
        is incremented in that case).
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        table_list = soup.find_all('table')

        for table in table_list:
            # Per-court kill switch checked before each table.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break

            # "NULL" string literals double as not-yet-parsed sentinels.
            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            judge_name = "NULL"
            text = "NULL"
            text_file = "NULL"
            pdf_file = "NULL"

            table_soup = BeautifulSoup(str(table), "html.parser")
            td_list = table_soup.find_all('td')

            i = 0
            for td in td_list:
                i += 1
                if i == 1:
                    case_no = escape_string(str(td.decode_contents()))
                if i == 2:
                    petitioner = escape_string(str(td.decode_contents()))
                if i == 4:
                    respondent = escape_string(str(td.decode_contents()))
                if i == 6:
                    judgment_date = escape_string(str(td.decode_contents()))
                if i == 7:
                    judge_name = escape_string(str(td.decode_contents()))
                if i == 8:
                    a_link = BeautifulSoup(str(td),
                                           "html.parser").a.get('href')
                    text_dir = request_text(base_url + a_link, case_no,
                                            court_name)
                    text = escape_string(text_dir['data'])
                    text_file = escape_string(base_url + a_link)
                if i == 9:
                    a_link = BeautifulSoup(str(td),
                                           "html.parser").a.get('href')
                    pdf_file = escape_string(base_url + a_link)
                    # Called for its side effect (fetching/saving the PDF);
                    # the return value was previously bound to an unused
                    # pdf_data variable and is not stored by this parser.
                    request_pdf(base_url + a_link, case_no, court_name)

            # The site's header block repeats "Judgment Information System"
            # in the petitioner position; skip it.
            if case_no != "NULL" and petitioner != 'Judgment Information System':
                sql_query = "INSERT INTO " + str(court_name) + \
                            " (case_no, petitioner, respondent, judgment_date, judge_name, text_data, text_file, " \
                            "pdf_file, bench_code, pdf_filename) VALUE ('" + case_no + "', '" + petitioner + "', '" + \
                            respondent + "', '" + judgment_date + "', '" + judge_name + "', '" + text + "', '" + \
                            text_file + "', '" + pdf_file + "', " + str(bench_code) + ", '" + court_name + "_" + \
                            slugify(case_no) + ".txt')"
                insert_query(sql_query)

                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '"
                    + str(court_name) + "'")

        return True

    except Exception as e:
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
            str(court_name) + "'")
        return False
示例#5
0
def parse_html(html_str, court_name):
    """Parse a page where each case spans nine consecutive <tr> rows.

    Rows are consumed as a small state machine keyed on tr_count:
    1 = diary number, 2 = case no + judgment date + PDF link,
    3 = petitioner, 4 = respondent, 5/6 = advocates, 7 = bench,
    8 = judge name (and the accumulated record is flushed to the DB),
    9 = separator (counters and fields reset for the next case).

    Args:
        html_str: raw HTML of the listing page.
        court_name: destination table name / Tracker row key.

    Returns:
        True on success, False on any exception (the Tracker error counter
        is incremented in that case).
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        tr_list = soup.find_all('tr')

        # "NULL" string literals double as not-yet-parsed sentinels; the
        # fields accumulate across rows until the tr_count == 8 flush.
        case_no = "NULL"
        diary_number = "NULL"
        petitioner = "NULL"
        respondent = "NULL"
        petitioner_advocate = "NULL"
        respondent_advocate = "NULL"
        judgment_date = "NULL"
        judge_name = "NULL"
        bench = "NULL"
        pdf_data = "NULL"
        pdf_file = "NULL"

        tr_count = 0
        for tr in tr_list:
            # Per-court kill switch: abort this page when set.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break

            tr_count += 1
            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')

            # Row 1: diary number in the 3rd cell.
            if tr_count == 1:
                td_count = 0
                for td in td_list:
                    td_count += 1
                    if td_count == 3:
                        diary_number = escape_string(str(td.decode_contents()))

            # Row 2: case number, judgment date (anchor text) and PDF link.
            if tr_count == 2:
                td_count = 0
                for td in td_list:
                    td_count += 1
                    if td_count == 2:
                        case_no = escape_string(str(td.decode_contents()))
                    if td_count == 3:
                        judgment_date = escape_string(str(td.a.string))
                        a_link = BeautifulSoup(str(td),
                                               "html.parser").a.get('href')
                        pdf_data = escape_string(
                            request_pdf(base_url + a_link, case_no,
                                        court_name))
                        pdf_file = escape_string(base_url + a_link)

            # Row 3: petitioner.
            if tr_count == 3:
                td_count = 0
                for td in td_list:
                    td_count += 1
                    if td_count == 2:
                        petitioner = escape_string(str(td.decode_contents()))

            # Row 4: respondent.
            if tr_count == 4:
                td_count = 0
                for td in td_list:
                    td_count += 1
                    if td_count == 2:
                        respondent = escape_string(str(td.decode_contents()))

            # Row 5: petitioner's advocate.
            if tr_count == 5:
                td_count = 0
                for td in td_list:
                    td_count += 1
                    if td_count == 2:
                        petitioner_advocate = escape_string(
                            str(td.decode_contents()))

            # Row 6: respondent's advocate.
            if tr_count == 6:
                td_count = 0
                for td in td_list:
                    td_count += 1
                    if td_count == 2:
                        respondent_advocate = escape_string(
                            str(td.decode_contents()))

            # Row 7: bench composition.
            if tr_count == 7:
                td_count = 0
                for td in td_list:
                    td_count += 1
                    if td_count == 2:
                        bench = escape_string(str(td.decode_contents()))

            # Row 8: judge name, then flush the accumulated record.
            if tr_count == 8:
                td_count = 0
                for td in td_list:
                    td_count += 1
                    if td_count == 2:
                        judge_name = escape_string(str(td.decode_contents()))

                # if case_no != "NULL" and select_count_query(str(court_name), str(case_no), 'judgment_date',
                #                                             judgment_date):
                if case_no != "NULL":
                    sql_query = "INSERT INTO " + str(court_name) + \
                                " (diary_number, case_no, petitioner, respondent, petitioner_advocate, " \
                                "respondent_advocate, judgment_date, bench, judge_name, pdf_file, pdf_filename) VALUE "\
                                "('" + diary_number + "', '" + case_no + "', '" + petitioner + "', '" + respondent + \
                                "', '" + petitioner_advocate + "', '" + respondent_advocate + "', '" + judgment_date + \
                                "', '" + bench + "', '" + judge_name + "', '" + pdf_file + "', '" + court_name + "_" \
                                + slugify(case_no) + ".pdf')"
                    insert_query(sql_query)

                    update_query("UPDATE " + court_name + " SET pdf_data = '" +
                                 str(pdf_data) + "' WHERE case_no = '" +
                                 str(case_no) + "'")
                    update_query(
                        "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '"
                        + str(court_name) + "'")

            # Row 9: separator — reset the state machine for the next case.
            if tr_count == 9:
                tr_count = 0
                case_no = "NULL"
                diary_number = "NULL"
                petitioner = "NULL"
                respondent = "NULL"
                petitioner_advocate = "NULL"
                respondent_advocate = "NULL"
                judgment_date = "NULL"
                judge_name = "NULL"
                bench = "NULL"
                pdf_data = "NULL"
                pdf_file = "NULL"

        return True

    except Exception as e:
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
            str(court_name) + "'")
        return False
def parse_html(html_str):
    """Parse a width-100% results table and store one row per case.

    NOTE(review): this variant takes no court_name parameter — it reads
    `court_name` (and `base_url`) from module scope; presumably they are
    set globally before this is called — verify against the caller.

    Cells are positional within each row (2 = case no, 3 = respondent,
    4 = petitioner, 5 = judgment date), but only when the <td> has no
    `align` attribute; align="left" carries the judge name and
    align="center" the PDF link.  The first two rows are headers.

    Args:
        html_str: raw HTML of the listing page.

    Returns:
        True on success, False on any exception (the Tracker error counter
        is incremented in that case).
    """
    try:
        soup = BeautifulSoup(str(html_str), "html.parser")

        table_soup = BeautifulSoup(
            str(soup.find_all('table', {"width": "100%"})[0]), "html.parser")
        tr_list = table_soup.select('tr')

        tr_count = 0
        for tr in tr_list:

            # Per-court kill switch: abort this page when set.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break

            tr_count += 1
            if tr_count <= 2:
                continue  # two header rows

            # "NULL" string literals double as not-yet-parsed sentinels.
            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            judge_name = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False

            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.select('td')

            i = 0
            for td in td_list:
                i += 1

                if i == 1:
                    continue  # serial-number column

                if i == 2 and td.get('align') is None:
                    font_tag = BeautifulSoup(str(td), "html.parser").font
                    case_no = escape_string(str(font_tag.text))

                # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
                #     insert_check = True

                if i == 3 and td.get('align') is None:
                    font_tag = BeautifulSoup(str(td), "html.parser").font
                    respondent = escape_string(str(font_tag.text))

                if i == 4 and td.get('align') is None:
                    font_tag = BeautifulSoup(str(td), "html.parser").font
                    petitioner = escape_string(str(font_tag.text))

                if i == 5 and td.get('align') is None:
                    font_tag = BeautifulSoup(str(td), "html.parser").font
                    judgment_date = escape_string(str(font_tag.text))

                if td.get('align') == 'left':
                    td_soup1 = BeautifulSoup(str(td), "html.parser")
                    judge_name = escape_string(str(td_soup1.text))

                if td.get('align') == 'center':
                    font_tag = BeautifulSoup(str(td), "html.parser").font
                    a_tag = BeautifulSoup(str(font_tag), "html.parser").a
                    pdf_file = escape_string(base_url + "/" +
                                             a_tag.get('href'))
                    # Round-trip through bytes to drop any non-UTF-8
                    # residue before the text is stored.
                    pdf_data = escape_string(
                        bytes(
                            str(
                                request_pdf(base_url + "/" + a_tag.get('href'),
                                            case_no)),
                            'utf-8').decode("utf-8", 'ignore'))

            # if case_no != "NULL" and insert_check:
            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, petitioner, respondent, judgment_date, " \
                                                               "judge_name, pdf_file, pdf_filename) VALUE ('" + \
                            case_no + "', '" + petitioner + "', '" + respondent + "', '" + judgment_date + "', '" + \
                            judge_name + "', '" + pdf_file + "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)

                update_query("UPDATE " + court_name + " SET pdf_data = '" +
                             str(pdf_data) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '"
                    + str(court_name) + "'")

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
            str(court_name) + "'")
        return False
def parse_html(html_str, court_name):
    """Parse the <select id="txtlist"> option list; each option's
    onmouseover tooltip carries the case metadata.

    The tooltip (an overlib(...) call) yields, after stripping <font>
    tags and splitting on <br/>: petitioner, respondent, judge,
    judgment date, and a "reportable + case no" tail.  Non-reportable
    cases are skipped.

    Args:
        html_str: raw HTML of the listing page.
        court_name: destination table name / Tracker row key.

    Returns:
        True on success, False on any exception (the Tracker error counter
        is incremented in that case).
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        select_soup = BeautifulSoup(
            str(soup.find_all('select', {'id': 'txtlist'})[0]), "html.parser")
        tr_list = select_soup.find_all('option')

        for tr in tr_list:
            # Per-court kill switch checked before each entry.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break

            pdf_value = tr['value']

            # Strip the overlib('...') wrapper and parse the tooltip body.
            res = BeautifulSoup(
                str(tr['onmouseover']).replace("return overlib('",
                                               "").replace("')", ""),
                "html.parser")
            # Remove decorative <font> tags in place (plain loop instead of
            # a side-effect-only comprehension).
            for s in res('font'):
                s.extract()
            res = str(res).replace('\n', '').strip().split('<br/>')

            petitioner = escape_string(res[0])
            respondent = escape_string(res[1])
            judge = escape_string(res[2])
            judgment_date = escape_string(res[3])
            mix_data = str(res[4]).replace("', CAPTION, '", '')

            # mix_data is "<Yes|No> <case number>".
            reportable = mix_data[0:2]
            case_no = escape_string(mix_data[3:])

            if reportable == 'No':
                continue

            if case_no != "NULL":
                pdf_data = escape_string(
                    request_pdf(case_no, court_name, pdf_value))

                sql_query = "INSERT INTO " + str(court_name) + " (case_no, petitioner, respondent, judgment_date, " \
                                                               "judge, pdf_file, pdf_filename, reportable) VALUE ('" + \
                            case_no + "', '" + petitioner + "', '" + respondent + "', '" + judgment_date + "', '" + \
                            judge + "', '" + pdf_value + "', '" + court_name + "_" + slugify(case_no) + ".pdf', '" + \
                            reportable + "')"
                insert_query(sql_query)

                update_query("UPDATE " + court_name + " SET pdf_data = '" +
                             str(pdf_data) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '"
                    + str(court_name) + "'")

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
            str(court_name) + "'")
        return False
def parse_html(html_str, court_name, court_id):
    """Parse per-case tables and insert one row per case.

    <td> cells are positional within each <table> (1 = case no,
    3 = judgment date, 4 = PDF link).

    Args:
        html_str: raw HTML of the listing page.
        court_name: destination table name / Tracker row key.
        court_id: stored unquoted in the court_id column (assumed to be a
            numeric string).

    Returns:
        True on success, False on any exception (the Tracker error counter
        is incremented in that case).
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        table_list = soup.find_all('table')

        for table in table_list:
            # Per-court kill switch checked before each table.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break

            # "NULL" string literals double as not-yet-parsed sentinels.
            case_no = "NULL"
            judgment_date = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"

            table_soup = BeautifulSoup(str(table), "html.parser")
            td_list = table_soup.find_all('td')

            i = 0
            for td in td_list:
                i += 1
                if i == 1:
                    case_no = escape_string(str(td.decode_contents()))
                if i == 3:
                    judgment_date = escape_string(str(td.decode_contents()))
                if i == 4:
                    pdf_url = base_url + BeautifulSoup(
                        str(td), "html.parser").a.get('href')
                    pdf_data = escape_string(
                        request_pdf(pdf_url, case_no, court_name))
                    # Fix: escape the URL before it is spliced into the
                    # INSERT below (it previously went in unescaped, unlike
                    # the sibling parsers); request_pdf still receives the
                    # raw URL.
                    pdf_file = escape_string(pdf_url)

            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, court_id, judgment_date, pdf_file, " \
                                                               "pdf_filename) VALUE ('" + case_no + "', " + court_id + \
                            ", '" + judgment_date + "', '" + pdf_file + "', '" + court_name + "_" + \
                            slugify(case_no) + ".pdf')"
                insert_query(sql_query)

                update_query("UPDATE " + court_name + " SET pdf_data = '" +
                             str(pdf_data) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '"
                    + str(court_name) + "'")

        return True

    except Exception as e:
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
            str(court_name) + "'")
        return False
示例#9
0
def parse_html(html_str, court_name, m_sideflg):
    """Parse the first form's width-100% table and store one row per case.

    The first four rows and every even-numbered row are layout filler and
    are skipped.  <td> cells are positional (2 = coram, 3 = parties split
    on a <b> tag, 4 = judgment date before the first <br/>, 5 = PDF link
    whose anchor text is the case number).

    Args:
        html_str: raw HTML of the listing page.
        court_name: destination table name / Tracker row key.
        m_sideflg: side-flag value stored verbatim with each row.

    Returns:
        True on success, False on any exception (the Tracker error counter
        is incremented in that case).
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        table_soup = BeautifulSoup(str(soup.find_all('form')[0]),
                                   "html.parser")
        table_soup = BeautifulSoup(
            str(table_soup.find_all('table', {"width": "100%"})[0]),
            "html.parser")
        tr_list = table_soup.find_all('tr')

        tr_count = 0
        for tr in tr_list:
            # Per-court kill switch: abort this page when set.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break

            tr_count += 1
            # Rows 1-4 are headers; even rows are spacer rows.
            if tr_count <= 4 or tr_count % 2 == 0:
                continue

            # "NULL" string literals double as not-yet-parsed sentinels.
            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            coram = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False

            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')

            i = 0
            for td in td_list:
                i += 1

                # Skip the serial column, the last column, and the page's
                # "LBR" legend cell (matched against its exact markup).
                if i == 1 or i == 6 or str(td.decode_contents()).replace("\n", "").strip() == \
                        '<font color="blue">LBR  : Larger Benches Referred Matter</font>':
                    continue

                if i == 2:
                    coram = escape_string(str(td.decode_contents()))

                if i == 3:
                    # Cell markup is "petitioner<b>Vs</b>respondent".
                    data1 = escape_string(str(td.decode_contents()))
                    data1_list = data1.split("<b>")
                    petitioner = data1_list[0]
                    respondent = str(data1_list[1]).split("</b>")[1]

                if i == 4:
                    # Date is the text before the first <br/>.
                    data2 = escape_string(str(td.decode_contents()))
                    data2_list = data2.split("<br/>")
                    judgment_date = data2_list[0]

                if i == 5:
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    pdf_file = base_url + a_tag.get('href')
                    case_no = str(a_tag.text).replace("\n", "")
                    pdf_data = escape_string(
                        request_pdf(pdf_file, case_no, court_name))

                # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
                #     insert_check = True

            # if case_no != "NULL" and insert_check:
            if case_no != "NULL":
                sql_query = "INSERT INTO " + str(court_name) + " (m_sideflg, case_no, petitioner, respondent, " \
                                                               "judgment_date, coram, pdf_file, pdf_filename) VALUE " \
                                                               "('" + m_sideflg +\
                            "', '" + case_no + "', '" + petitioner + "', '" + respondent + "', '" + judgment_date + \
                            "', '" + coram + "', '" + pdf_file + "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)

                update_query("UPDATE " + court_name + " SET pdf_data = '" +
                             str(pdf_data) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '"
                    + str(court_name) + "'")

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
            str(court_name) + "'")
        return False
def parse_html(html_str, court_name):
    """Parse the results table and store one row per appeal.

    <td> cells are positional (1 = appeal no, 2 = filed by, 3 = appellant,
    4 = respondent, 5 = link to a details page; details_parse() on that
    link supplies the order date, PDF URL and order type).  Row 1 is the
    header.

    Args:
        html_str: raw HTML of the listing page.
        court_name: destination table name / Tracker row key.

    Returns:
        True on success, False on any exception (the Tracker error counter
        is incremented in that case).
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        table_list = soup.find_all('table', {'style': 'width:100%; margin-top: 10px; font-size: 12px;'})
        table_soup = BeautifulSoup(str(table_list), "html.parser")
        tr_list = table_soup.find_all('tr')

        tr_count = 0
        for tr in tr_list:

            # Per-court kill switch: abort this page when set.
            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break

            tr_count += 1
            if tr_count == 1:
                continue  # header row

            # "NULL" string literals double as not-yet-parsed sentinels.
            appeal_no = "NULL"
            appellant = "NULL"
            respondent = "NULL"
            date_of_order = "NULL"
            filed_by = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            order_type = "NULL"

            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')

            i = 0
            for td in td_list:
                i += 1
                if i == 1:
                    appeal_no = escape_string(str(td.text).strip().replace("\n", ""))

                if i == 2:
                    filed_by = escape_string(str(td.text).strip().replace('\n', ''))

                if i == 3:
                    appellant = escape_string(str(td.text).strip().replace('\n', ''))

                if i == 4:
                    respondent = escape_string(str(td.text).strip().replace('\n', ''))

                if i == 5:
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    details_url = a_tag.get('href')
                    date_of_order, pdf_file, order_type = details_parse(details_url, appeal_no, court_name)
                    pdf_data = escape_string(str(request_pdf(pdf_file, appeal_no, court_name)).replace("'", ""))

            if appeal_no != "NULL":
                # Fix: the column list previously named six columns
                # (including bench_code) for only five values, so every
                # INSERT failed with a column-count mismatch; bench_code is
                # never computed here, so it is dropped from the list.
                sql_query = "INSERT INTO " + str(court_name) + " (appeal_no, appellant, respondent, filed_by, " \
                                                               "pdf_filename) VALUE ('" + appeal_no + \
                            "', '" + appellant + "', '" + respondent + "', '" + filed_by + "', '" + court_name + \
                            "_" + slugify(appeal_no) + ".pdf')"
                insert_query(sql_query)

                update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) + "', date_of_order ='" +
                             date_of_order + "', pdf_file = '" + pdf_file + "', order_type = '" + order_type +
                             "' WHERE appeal_no = '" + str(appeal_no) + "'")
                update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")

        return True

    except Exception as e:
        logging.error("Failed to parse the html: %s", e)
        update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
def parse_html(html_str, court_name):
    """Scrape one listing page and store every judgment row for *court_name*.

    The page's second <table> holds the data; rows 3..17 each describe one
    case.  Positional columns: 1=case no, 2=coram, 3=judgment date,
    4=PDF link (downloaded immediately), 5=type, 6=status.

    :param html_str: raw HTML of the listing page.
    :param court_name: DB table name for this court (also the Tracker key).
    :return: True when the page was processed, False on any parsing error
        (the error is also counted in the Tracker table).
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        # The judgments grid is the second <table> on the page.
        table_tag = soup.find_all('table')[1]

        table_soup = BeautifulSoup(str(table_tag), "html.parser")
        tr_list = table_soup.find_all('tr')

        tr_count = 0
        for tr in tr_list:

            # Operator kill-switch: stop between rows when
            # Tracker.emergency_exit is set to 1 for this court.
            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break

            tr_count += 1
            # Only rows 3..17 carry case data; the rest is header/footer
            # chrome (presumably -- confirm against a live page).
            if tr_count <= 2 or tr_count > 17:
                continue

            # "NULL" sentinels: the row is only inserted if case_no was found.
            case_no = "NULL"
            judgment_date = "NULL"
            coram = "NULL"
            type_ = "NULL"
            status = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False

            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')

            i = 0
            for td in td_list:
                i += 1
                if i == 1:
                    case_no = escape_string(str(td.decode_contents()))

                # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
                #     insert_check = True

                if i == 2:
                    coram = escape_string(str(td.decode_contents()))

                if i == 3:
                    judgment_date = escape_string(str(td.decode_contents()))

                if i == 5:
                    type_ = escape_string(str(td.decode_contents()))

                if i == 6:
                    status = escape_string(str(td.decode_contents()))

                if i == 4:
                    # Download the judgment PDF right away; case_no was
                    # already captured from column 1.
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    pdf_file = escape_string(base_url + a_tag.get('href'))
                    pdf_data = escape_string(request_pdf(base_url + a_tag.get('href'), case_no, court_name))

            # Skip disclaimer rows that fall inside the data window.
            # if case_no != "NULL" and insert_check and case_no.find("DISCLAIMER") == -1:
            if case_no != "NULL" and case_no.find("DISCLAIMER") == -1:

                # NOTE(review): SQL is built by string concatenation; values
                # pass through escape_string() but parameterised queries
                # would be safer against injection from scraped content.
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, judgment_date, coram, type, status, " \
                                                               "pdf_file, pdf_filename) VALUE ('" + case_no + "', '" + \
                            judgment_date + "', '" + coram + "', '" + type_ + "', '" + status + "', '" + pdf_file + \
                            "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)

                # pdf_data can be large, so it is stored via a separate UPDATE.
                update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
def parse_html(html_str, court_name):
    """Parse the 'miscTable' judgments listing and store each case.

    Each data row links to a per-case detail page, which is fetched (using
    the module-level ``headers``/``proxy_dict``) to locate the actual PDF
    href inside the first ``td[headers="t1"]`` cell.  Positional columns:
    1=judgment date, 2=case link, 3=judge, 4=petitioner, 5=respondent,
    6=bench.

    :param html_str: raw HTML of the listing page.
    :param court_name: DB table name for this court (also the Tracker key).
    :return: True on success, False after counting the error in Tracker.
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        table_tag = soup.find_all('table', {'class': 'miscTable'})[0]

        table_soup = BeautifulSoup(str(table_tag), "html.parser")
        tr_list = table_soup.find_all('tr')

        tr_count = 0
        for tr in tr_list:

            # Operator kill-switch: abort between rows when set to 1.
            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break

            tr_count += 1
            # First row is the table header.
            if tr_count == 1:
                continue

            # "NULL" sentinels: the row is only inserted if case_no was found.
            case_no = "NULL"
            judgment_date = "NULL"
            judge_name = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            bench = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False

            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')

            i = 0
            for td in td_list:
                i += 1

                if i == 1:
                    judgment_date = escape_string(str(td.decode_contents()))

                if i == 2:
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    case_no = escape_string(str(a_tag.text).replace("\n", ""))

                    # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
                    #     insert_check = True

                    # Follow the case link to its detail page; the real PDF
                    # href lives in the first cell with headers="t1".
                    new_url = base_url + a_tag.get('href')
                    response = requests.request('GET', new_url, headers=headers, proxies=proxy_dict)

                    new_soup = BeautifulSoup(str(response.text), "html.parser")
                    new_td_tag = new_soup.find_all('td', {'headers': 't1'})[0]
                    new_a_href = BeautifulSoup(str(new_td_tag), "html.parser").a.get('href')

                    pdf_file = escape_string(base_url + new_a_href)
                    pdf_data = escape_string(request_pdf(base_url + new_a_href, case_no, court_name))

                if i == 3:
                    judge_name = escape_string(str(td.text))

                if i == 4:
                    petitioner = escape_string(str(td.text))

                if i == 5:
                    respondent = escape_string(str(td.text))

                if i == 6:
                    bench = escape_string(str(td.text))

            # if case_no != "NULL" and insert_check:
            if case_no != "NULL":
                # NOTE(review): SQL built via concatenation; values pass
                # through escape_string() but parameterised queries would be
                # safer against injection from scraped content.
                sql_query = "INSERT INTO " + str(court_name) + "(case_no, judgment_date, judge_name, petitioner, " \
                                                               "respondent, bench, pdf_file, pdf_filename) VALUE ('" +\
                            case_no + "', '" + judgment_date + "', '" + judge_name + "', '" + petitioner + "', '" + \
                            respondent + "', '" + bench + "', '" + pdf_file + "', '" + court_name + "_" + \
                            slugify(case_no) + ".pdf')"
                insert_query(sql_query)

                # Large pdf_data is stored via a separate UPDATE.
                update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")

        return True

    except Exception as e:
        logging.error("Failed to parse the html: %s", e)
        update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
示例#13
0
def parse_html(html_str, court_name, bench):
    """Parse a listing where each case spans a 9-row <tr> group.

    Row offsets within each group (data always in the 3rd cell):
    1=case no + judgment link/date, 2=petitioner, 3=respondent,
    4=petitioner advocate, 5=respondent advocate, 6=judge,
    7=disposal date (the record is inserted here), 9=group end
    (per-record state is reset).

    :param html_str: raw HTML of the listing page.
    :param court_name: DB table name for this court (also the Tracker key).
    :param bench: bench label stored in the inserted row.
    :return: True on success, False after counting the error in Tracker.
    """
    try:
        # Strip the malformed bold/br markup this court emits before parsing.
        soup = BeautifulSoup(
            html_str.replace("<b>", "").replace("</b>", "").replace(
                "<br>",
                "").replace("</br>",
                            "").replace("<b",
                                        "").replace("<br< p="
                                                    "></br<>", ""),
            "html.parser")
        tr_list = soup.find_all('tr')
        # The first seven rows are page chrome, not case data.
        del tr_list[0:7]

        # "NULL" sentinels, reset after every 9-row group.
        case_no = "NULL"
        petitioner = "NULL"
        respondent = "NULL"
        petitioner_advocate = "NULL"
        respondent_advocate = "NULL"
        judgment_date = "NULL"
        disposal_date = "NULL"
        judge_name = "NULL"
        pdf_data = "NULL"
        pdf_file = "NULL"

        tr_count = 0
        for tr in tr_list:
            tr_count += 1

            # Operator kill-switch: abort between rows when set to 1.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break

            # insert_check = False
            # Group row 1: case number (3rd cell) and judgment link (4th cell).
            if tr_count == 1:
                td_count = 0
                tr_soup = BeautifulSoup(str(tr), "html.parser")
                td_list = tr_soup.find_all('td')
                for td in td_list:
                    td_count += 1
                    if td_count == 3:
                        case_no = escape_string(str(td.text).strip())

                    if td_count == 4:
                        td_text = str(td.text)
                        # "NA" in the cell means no judgment document exists.
                        if td_text.find("NA") == -1:
                            a_tag = BeautifulSoup(str(td), "html.parser").a
                            if a_tag:
                                a_link = a_tag.get('href')
                                pdf_data = escape_string(
                                    request_pdf(base_url + a_link, case_no,
                                                court_name))
                                pdf_file = base_url + a_link

                        # The same cell text doubles as the judgment date
                        # once the label words are stripped.
                        # NOTE(review): .replace("r", "") removes EVERY 'r'
                        # from the date text (e.g. month names) -- looks
                        # suspicious; confirm intended.
                        judgment_date = escape_string(
                            td_text.replace("Judgement", "").replace(
                                "Orders", "").replace("r", "").replace(
                                    "(AFR)", "").replace("NA", "").strip())

            # if select_count_query_bench(str(court_name), str(case_no), bench, 'judgment_date', judgment_date):
            #     insert_check = True

            # Group row 2: petitioner.
            if tr_count == 2:
                td_count = 0
                tr_soup = BeautifulSoup(str(tr), "html.parser")
                td_list = tr_soup.find_all('td')
                for td in td_list:
                    td_count += 1
                    if td_count == 3:
                        petitioner = escape_string(str(td.text).strip())

            # Group row 3: respondent.
            if tr_count == 3:
                td_count = 0
                tr_soup = BeautifulSoup(str(tr), "html.parser")
                td_list = tr_soup.find_all('td')
                for td in td_list:
                    td_count += 1
                    if td_count == 3:
                        respondent = escape_string(str(td.text).strip())

            # Group row 4: petitioner's advocate.
            if tr_count == 4:
                td_count = 0
                tr_soup = BeautifulSoup(str(tr), "html.parser")
                td_list = tr_soup.find_all('td')
                for td in td_list:
                    td_count += 1
                    if td_count == 3:
                        petitioner_advocate = escape_string(
                            str(td.text).strip())

            # Group row 5: respondent's advocate.
            if tr_count == 5:
                td_count = 0
                tr_soup = BeautifulSoup(str(tr), "html.parser")
                td_list = tr_soup.find_all('td')
                for td in td_list:
                    td_count += 1
                    if td_count == 3:
                        respondent_advocate = escape_string(
                            str(td.text).strip())

            # Group row 6: judge name.
            if tr_count == 6:
                td_count = 0
                tr_soup = BeautifulSoup(str(tr), "html.parser")
                td_list = tr_soup.find_all('td')
                for td in td_list:
                    td_count += 1
                    if td_count == 3:
                        judge_name = escape_string(str(td.text).strip())

            # Group row 7: disposal date -- the record is complete here,
            # so this branch also performs the INSERT/UPDATEs.
            if tr_count == 7:
                td_count = 0
                tr_soup = BeautifulSoup(str(tr), "html.parser")
                td_list = tr_soup.find_all('td')
                for td in td_list:
                    td_count += 1
                    if td_count == 3:
                        disposal_date = escape_string(str(td.text).strip())

                # if case_no != "NULL" and insert_check:
                if case_no != "NULL":
                    # NOTE(review): SQL built via concatenation; values pass
                    # through escape_string() but parameterised queries
                    # would be safer.
                    sql_query = "INSERT INTO " + str(court_name) + \
                                " (case_no, petitioner, respondent, petitioner_advocate, respondent_advocate, " \
                                "judgment_date, disposal_date, bench, judge_name, pdf_file, pdf_filename)" \
                                " VALUE ('" + case_no + "', '" + petitioner + "', '" + respondent + "', '" + \
                                petitioner_advocate + "', '" + respondent_advocate + "', '" + judgment_date + "', '" + \
                                disposal_date + "', '" + bench + "', '" + judge_name + "', '" + pdf_file + "', '" + \
                                court_name + "_" + slugify(case_no) + ".pdf')"
                    insert_query(sql_query)

                    # Large pdf_data is stored via a separate UPDATE.
                    update_query("UPDATE " + court_name + " SET pdf_data = '" +
                                 str(pdf_data) + "' WHERE case_no = '" +
                                 str(case_no) + "'")
                    update_query(
                        "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '"
                        + str(court_name) + "'")

            # Group row 9: end of group -- reset per-record state.
            # NOTE(review): disposal_date is NOT reset here, so a stale
            # value could leak into the next record -- confirm intended.
            if tr_count == 9:
                tr_count = 0
                case_no = "NULL"
                petitioner = "NULL"
                respondent = "NULL"
                petitioner_advocate = "NULL"
                respondent_advocate = "NULL"
                judgment_date = "NULL"
                judge_name = "NULL"
                pdf_data = "NULL"
                pdf_file = "NULL"

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
            str(court_name) + "'")
        return False
def parse_html(html_str, court_name, headers):
    """Parse the 'tables11' judgments listing and store each case row.

    Positional columns: 2=case link, 3=parties (split on "Vs"),
    4=judgment date, 5=PDF link extracted from an onclick
    ``window.open('...')`` handler.  The PDF itself is NOT downloaded
    here (the request_pdf call is commented out), so pdf_data is stored
    as the literal string "NULL".

    :param html_str: raw HTML of the listing page.
    :param court_name: DB table name for this court (also the Tracker key).
    :param headers: HTTP headers (currently only used by the commented-out
        PDF download).
    :return: True on success, False after counting the error in Tracker.
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        table_list = soup.find_all('table', {'id': 'tables11'})
        table_soup = BeautifulSoup(str(table_list), "html.parser")
        tr_list = table_soup.find_all('tr')

        tr_count = 0
        for tr in tr_list:
            # Operator kill-switch: abort between rows when set to 1.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break

            tr_count += 1
            # The first three rows are header chrome.
            if tr_count <= 3:
                continue

            # "NULL" sentinels: the row is only inserted if case_no was found.
            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False

            table_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = table_soup.find_all('td')

            i = 0
            for td in td_list:
                i += 1
                if i == 1:
                    continue

                if i == 2:
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    case_no = escape_string(str(a_tag.text))

                if i == 3:
                    # Assumes the cell always contains "Vs" between the
                    # parties -- an IndexError here is caught below.
                    party = str(td.decode_contents()).split("Vs")
                    petitioner = escape_string(str(party[0]))
                    respondent = escape_string(str(party[1]))

                if i == 4:
                    judgment_date = escape_string(str(td.decode_contents()))

                # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
                #     insert_check = True

                if i == 5:
                    # The PDF URL is buried in an onclick="window.open('...')"
                    # attribute rather than a plain href.
                    a_link = BeautifulSoup(str(td),
                                           "html.parser").a.get('onclick')
                    a_formatted = str(
                        str(a_link).replace("window.open('",
                                            "")).replace("')", "")
                    pdf_file = escape_string(base_url + "/" + a_formatted)

                    # pdf_data = escape_string(request_pdf(
                    #     str(pdf_file).replace(base_url + "download_file.php?auth=", ""), case_no, court_name,
                    #     headers))

            # if case_no != "NULL" and insert_check:
            if case_no != "NULL":
                # NOTE(review): SQL built via concatenation; values pass
                # through escape_string() but parameterised queries would be
                # safer.
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, petitioner, respondent, judgment_date, " \
                                                               "pdf_file, pdf_filename) VALUE ('" + case_no + "', '" + \
                            petitioner + "', '" + respondent + "', '" + judgment_date + "', '" + pdf_file + "', '" + \
                            court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)

                # pdf_data is always "NULL" here (download disabled above).
                update_query("UPDATE " + court_name + " SET pdf_data = '" +
                             str(pdf_data) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '"
                    + str(court_name) + "'")

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
            str(court_name) + "'")
        return False
示例#15
0
def parse_html(html_str, court_name, bench, child_url):
    """Parse the orders table inside <div id="text"> and store each row.

    Positional columns: 1=case no, 2=date of order, 3=description + PDF
    link (the PDF is downloaded immediately), 4=section.  One INSERT and
    two UPDATEs are issued per accepted row.

    :param html_str: raw HTML of the listing page.
    :param court_name: DB table name for this court (also the Tracker key).
    :param bench: bench code stored with each row.
    :param child_url: path fragment joined between base_url and the PDF href.
    :return: True on success, False after counting the error in Tracker.
    """
    try:
        page = BeautifulSoup(html_str, "html.parser")
        text_div = page.find_all('div', {'id': 'text'})[0]
        rows = BeautifulSoup(str(text_div), 'html.parser').find_all('tr')

        for row_index, row in enumerate(rows):

            # Honour the operator kill-switch between rows.
            tracker = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if tracker is not None and tracker['emergency_exit'] == 1:
                break

            # The first row is the table header.
            if row_index == 0:
                continue

            # "NULL" sentinels: a row is inserted only if case_no was found.
            case_no = "NULL"
            date_of_order = "NULL"
            description = "NULL"
            section = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"

            cells = BeautifulSoup(str(row), "html.parser").find_all('td')

            for col, cell in enumerate(cells, start=1):
                if col == 1:
                    case_no = escape_string(
                        str(cell.text).strip().replace("\n", ""))
                elif col == 2:
                    date_of_order = escape_string(
                        str(cell.text).strip().replace("\n", ""))
                elif col == 3:
                    # Description cell also carries the PDF link
                    # (<font><a href=...>); fetch the PDF right away.
                    description = escape_string(str(cell.text).strip())
                    anchor = BeautifulSoup(str(cell), "html.parser").font.a
                    pdf_url = base_url + child_url + anchor.get('href')
                    pdf_file = escape_string(pdf_url)
                    pdf_data = escape_string(
                        request_pdf(pdf_url, case_no, court_name))
                elif col == 4:
                    section = str(cell.text)

            if case_no != "NULL":
                insert_query(
                    "INSERT INTO " + str(court_name) +
                    " (case_no, date_of_order, description, section, "
                    "pdf_file, bench_code, pdf_filename) VALUE ('" +
                    case_no + "', '" + date_of_order + "', '" + description +
                    "', '" + section + "', '" + pdf_file + "', '" +
                    str(bench) + "', '" + court_name + "_" +
                    slugify(case_no) + ".pdf')")

                # Large pdf_data is stored via a separate UPDATE.
                update_query("UPDATE " + court_name + " SET pdf_data = '" +
                             str(pdf_data) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '"
                    + str(court_name) + "'")

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
            str(court_name) + "'")
        return False
def parse_html(html_str, court_name, bench, start_date):
    """Parse the 'hoverTable' listing for one start date and store rows.

    Positional columns: 2=case no, 3=petitioner/respondent (split on
    "<br/>"), 4=judge, 5=judgment date, 7=PDF link.  The PDF download is
    commented out, so pdf_data is stored as the literal "NULL".

    :param html_str: raw HTML of the listing page.
    :param court_name: DB table name for this court (also the Tracker key).
    :param bench: bench code stored with each row.
    :param start_date: only used in the no-data log message.
    :return: True on success (including the empty-table case), False after
        counting the error in Tracker.
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        table_soup = BeautifulSoup(
            str(soup.find_all('table', {'class': 'hoverTable'})[0]),
            'html.parser')
        tr_list = table_soup.find_all('tr')

        # Empty table: record a no-data period for this court and succeed.
        if not tr_list:
            logging.error("NO data Found for start date: " + str(start_date))
            update_query(
                "UPDATE Tracker SET No_Year_NoData = No_Year_NoData + 1 WHERE Name = '"
                + str(court_name) + "'")
            return True

        tr_count = 0
        for tr in tr_list:

            # Operator kill-switch: abort between rows when set to 1.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break

            tr_count += 1
            # First row is the table header.
            if tr_count == 1:
                continue

            # "NULL" sentinels: the row is only inserted if case_no was found.
            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            judge_name = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False

            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')

            i = 0
            for td in td_list:
                i += 1
                if i == 1:
                    continue

                if i == 2:
                    case_no = escape_string(
                        str(td.text).strip().replace("\n", ""))

                if i == 3:
                    # Assumes the cell holds petitioner<br/>...<br/>respondent,
                    # i.e. at least three "<br/>"-separated segments.
                    party = str(td.decode_contents()).split("<br/>")
                    petitioner = escape_string(str(party[0]).strip())
                    respondent = escape_string(str(party[2]).strip())

                if i == 4:
                    judge_name = escape_string(str(td.text).strip())

                if i == 5:
                    judgment_date = escape_string(str(td.text).strip())

                # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
                #     insert_check = True

                if i == 7:
                    # NOTE(review): pdf_file is not passed through
                    # escape_string() here, unlike the other parsers --
                    # confirm intended.
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    pdf_file = base_url + a_tag.get('href')
                    # pdf_data = escape_string(request_pdf(pdf_file, case_no, court_name))

            # if case_no != "NULL" and insert_check:
            if case_no != "NULL":
                # NOTE(review): SQL built via concatenation; parameterised
                # queries would be safer.
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, petitioner, respondent, judgment_date, " \
                                                               "judge_name, pdf_file, bench_code, pdf_filename) VALUE"\
                                                               " ('" + case_no + "', '" + petitioner + "', '" + \
                            respondent + "', '" + judgment_date + "', '" + judge_name + "', '" + pdf_file + "', '" + \
                            str(bench) + "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)

                # pdf_data is always "NULL" here (download disabled above).
                update_query("UPDATE " + court_name + " SET pdf_data = '" +
                             str(pdf_data) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '"
                    + str(court_name) + "'")

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
            str(court_name) + "'")
        return False
示例#17
0
def parse_html(html_str, court_name):
    """Parse a judgments table (after blanking '&' in the markup) and
    store each row.

    Positional columns: 1=judgment date, 2=judge, 3=case no, 4=parties
    (split on "v/s"), 5=PDF link (downloaded immediately).

    :param html_str: raw HTML of the listing page.
    :param court_name: DB table name for this court (also the Tracker key).
    :return: True on success, False after counting the error in Tracker.
    """
    try:
        # This court's markup breaks the parser on raw '&', so blank it out.
        soup = BeautifulSoup(str(html_str).replace('&', ' '), "html.parser")

        for row_index, row in enumerate(soup.find_all('tr')):
            # Honour the operator kill-switch between rows.
            tracker = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if tracker is not None and tracker['emergency_exit'] == 1:
                break

            # The first row is the table header.
            if row_index == 0:
                continue

            # "NULL" sentinels: a row is inserted only if case_no was found.
            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            judge_name = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"

            cells = BeautifulSoup(str(row), "html.parser").find_all('td')

            for col, cell in enumerate(cells, start=1):
                if col == 1:
                    judgment_date = escape_string(str(cell.decode_contents()))
                elif col == 2:
                    judge_name = escape_string(str(cell.decode_contents()))
                elif col == 3:
                    case_no = escape_string(str(cell.text))
                elif col == 4:
                    sides = str(cell.decode_contents()).split("v/s")
                    petitioner = escape_string(str(sides[0]))
                    respondent = escape_string(str(sides[1]))
                elif col == 5:
                    anchor = BeautifulSoup(str(cell), "html.parser").a
                    pdf_file = escape_string(str(base_url + anchor.get('href')))
                    pdf_data = escape_string(
                        request_pdf(base_url + anchor.get('href'), case_no,
                                    court_name))

            if case_no != "NULL":
                insert_query(
                    "INSERT INTO " + str(court_name) +
                    " (case_no, petitioner, respondent, judgment_date, "
                    "judge_name, pdf_file, pdf_filename) VALUE ('" +
                    case_no + "', '" + petitioner + "', '" + respondent +
                    "', '" + judgment_date + "', '" + judge_name + "', '" +
                    pdf_file + "', '" + court_name + "_" +
                    slugify(case_no) + ".pdf')")

                # Large pdf_data is stored via a separate UPDATE.
                update_query("UPDATE " + court_name + " SET pdf_data = '" +
                             str(pdf_data) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '"
                    + str(court_name) + "'")

        return True

    except Exception as e:
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
            str(court_name) + "'")
        return False
示例#18
0
def parse_html(html_str, court_name, flag):
    """Parse one month's judgments page (table.DISCOVERY3) and store rows.

    The centred <h4> heading supplies the month/year, which is combined
    with the day number extracted from each row to build judgment_date.
    Row columns (after the optional leading cell selected by ``flag``):
    2=day, 3=case no + PDF link (downloaded immediately), 4=parties,
    5=subject, 6=judge.

    :param html_str: raw HTML of the monthly listing page.
    :param court_name: DB table name for this court (also the Tracker key).
    :param flag: truthy when the table rows carry an extra leading cell,
        shifting every column index by one.
    :return: True on success, False after counting the error in Tracker.
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        # Re-parse the prettified markup to normalise this court's
        # inconsistent tag nesting before querying it.
        soup = BeautifulSoup(str(soup.prettify()), "html.parser")

        date_h4 = soup.find_all('h4', {'align': 'center'})[0]
        month_year = str(date_h4.text).replace('JUDGMENTS FOR THE MONTH OF', '').strip()

        table_list = soup.find_all('table', {'class': 'DISCOVERY3'})[0]
        table_soup = BeautifulSoup(str(table_list), "html.parser")
        tr_list = table_soup.find_all('tr')

        tr_count = 0
        for tr in tr_list:
            # Operator kill-switch: abort between rows when set to 1.
            emergency_exit = select_one_query("SELECT emergency_exit FROM Tracker WHERE Name='" + court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break

            tr_count += 1
            # First row is the table header.
            if tr_count <= 1:
                continue

            # "NULL" sentinels: the row is only inserted if case_no was found.
            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            judge_name = "NULL"
            subject = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"
            # insert_check = False

            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')

            # Start the column counter one higher when the table carries an
            # extra leading cell, so the branches below line up either way.
            if flag:
                i = 1
            else:
                i = 0
            for td in td_list:
                i += 1
                if i == 2:
                    judgment_day = escape_string(str(td.decode_contents()))
                    # Raw string for the regex: '\d' in a plain string is an
                    # invalid escape (SyntaxWarning on modern Python).
                    judgment_date = str(re.findall(r'\d+', str(judgment_day))[0]) + ", " + month_year.replace(
                        'JUDGEMENTS FOR THE MONTH OF', '')

                if i == 3:
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    pdf_file = escape_string(str(base_url + a_tag.get('href')))
                    case_no = escape_string(str(a_tag.text).replace("\n", "").strip())

                    # if select_count_query(str(court_name), str(case_no), 'judgment_date', judgment_date):
                    #     insert_check = True
                    pdf_data = escape_string(request_pdf(str(base_url + a_tag.get('href')), case_no, court_name))

                if i == 4:
                    # The party names may sit in <font><span>, in a bare
                    # <span>, or directly in the cell.
                    font_tag = BeautifulSoup(str(td), "html.parser").font
                    if font_tag is not None:
                        span_tag = font_tag.span
                    else:
                        span_tag = BeautifulSoup(str(td), "html.parser").span
                        if span_tag is None:
                            span_tag = BeautifulSoup(str(td), "html.parser")

                    party = str(span_tag.decode_contents()).split("<br/>")
                    petitioner = escape_string(
                        str(party[0]).replace('<td align="center" bgcolor="#FFFFFF" valign="middle" width="30%">',
                                              '').strip())
                    # Strip literal "\xNN" escape artefacts left by the scrape.
                    petitioner = re.sub(r'(\\x(.){2})', '', petitioner)

                    respondent = escape_string(str(party[2]).replace('</td>', '').strip())
                    respondent = re.sub(r'(\\x(.){2})', '', respondent)

                if i == 5:
                    subject = escape_string(str(td.decode_contents()).strip())

                if i == 6:
                    judge_name = escape_string(str(td.text).replace(r'\x', '').replace('\\xC2\\x92BLE', '').strip())
                    judge_name = re.sub(r'(\\x(.){2})', '', judge_name)
                    # A former re.sub(r'', '', judge_name, re.U) call was
                    # removed here: the empty pattern/replacement made it a
                    # no-op, and re.U was mistakenly passed as the positional
                    # ``count`` argument rather than ``flags``.

            # if case_no != "NULL" and insert_check and td_list:
            if case_no != "NULL" and td_list:
                # NOTE(review): SQL built via string concatenation; values
                # pass through escape_string() but parameterised queries
                # would be safer against injection from scraped content.
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, petitioner, respondent, judgment_date, " \
                                                               "subject, pdf_file, pdf_filename) VALUE ('" + case_no + \
                            "', '" + petitioner + "', '" + respondent + "', '" + judgment_date + "', '" + subject + \
                            "', '" + pdf_file + "', '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)

                # judge_name and large pdf_data are stored via separate UPDATEs.
                update_query("UPDATE " + court_name + " SET judge_name = '" + str(judge_name) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query("UPDATE " + court_name + " SET pdf_data = '" + str(pdf_data) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query("UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '" + str(court_name) + "'")

        return True

    except Exception as e:
        traceback.print_exc()
        logging.error("Failed to parse the html: %s", e)
        update_query("UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" + str(court_name) + "'")
        return False
示例#19
0
def parse_html(html_str, court_name, dc):
    """Parse a court's judgment-listing HTML and persist each row.

    For every non-header ``<tr>`` in *html_str* this extracts the case
    number, PDF link, judgment date, parties and corrigendum, downloads the
    PDF text via ``request_pdf()``, and inserts one record into the table
    named after *court_name*.  ``Tracker.No_Cases`` is incremented per
    inserted row, ``Tracker.No_Error`` on failure.

    :param html_str: raw HTML of the listing page.
    :param court_name: court identifier; also the destination table name.
    :param dc: bench code stored with each inserted row.
    :return: True if the page was processed, False on any exception.
    """
    try:
        soup = BeautifulSoup(html_str, "html.parser")
        tr_list = soup.find_all('tr')

        tr_count = 0
        for tr in tr_list:

            # Re-check the operator-controlled kill switch on every row so a
            # long page can be aborted promptly.
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM Tracker WHERE Name='" +
                court_name + "'")
            if emergency_exit is not None:
                if emergency_exit['emergency_exit'] == 1:
                    break

            tr_count += 1
            if tr_count == 1:
                continue  # first row is the table header

            # "NULL" sentinels mirror the other parsers in this module.
            case_no = "NULL"
            petitioner = "NULL"
            respondent = "NULL"
            judgment_date = "NULL"
            corrigendum = "NULL"
            pdf_data = "NULL"
            pdf_file = "NULL"

            tr_soup = BeautifulSoup(str(tr), "html.parser")
            td_list = tr_soup.find_all('td')

            i = 0
            for td in td_list:
                i += 1
                if i == 1:
                    continue  # serial-number column

                if i == 2:
                    # Link cell: holds both the case number (text) and the
                    # judgment PDF (href).
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    pdf_url = base_url + a_tag.get('href')
                    # Escape values that are later spliced into SQL, matching
                    # the other parsers in this module (previously case_no and
                    # pdf_file were interpolated unescaped).  The unescaped
                    # URL is still used for the actual download.
                    case_no = escape_string(str(a_tag.text).replace("\n", ""))
                    pdf_file = escape_string(pdf_url)
                    pdf_data = escape_string(
                        request_pdf(pdf_url, case_no, court_name))

                if i == 3:
                    span_tag = BeautifulSoup(str(td), "html.parser").span
                    judgment_date = escape_string(
                        str(span_tag.decode_contents()))

                if i == 4:
                    # Parties cell: first <span> is the petitioner, third is
                    # the respondent (the separator sits in between).
                    td_soup = BeautifulSoup(str(td), "html.parser")
                    span_list = td_soup.find_all('span')

                    j = 0
                    for span in span_list:
                        j += 1
                        if j == 1:
                            petitioner = escape_string(
                                str(span.decode_contents()))
                        if j == 3:
                            respondent = escape_string(
                                str(span.decode_contents()))

                if i == 5:
                    span_tag = BeautifulSoup(str(td), "html.parser").span
                    corrigendum = escape_string(str(
                        span_tag.decode_contents()))

            if case_no != "NULL":
                # NOTE(review): SQL is still built by concatenation because
                # the query helpers only accept raw strings; every value
                # interpolated below has been passed through escape_string().
                sql_query = "INSERT INTO " + str(court_name) + " (case_no, petitioner, respondent, judgment_date, " \
                                                               "corrigendum, pdf_file, bench_code, pdf_filename) VALUE"\
                                                               " ('" + case_no + "', '" + petitioner + "', '" + \
                            respondent + "', '" + judgment_date + "', '" + corrigendum + "', '" + pdf_file + "', " + \
                            str(dc) + ", '" + court_name + "_" + slugify(case_no) + ".pdf')"
                insert_query(sql_query)

                update_query("UPDATE " + court_name + " SET pdf_data = '" +
                             str(pdf_data) + "' WHERE case_no = '" +
                             str(case_no) + "'")
                update_query(
                    "UPDATE Tracker SET No_Cases = No_Cases + 1 WHERE Name = '"
                    + str(court_name) + "'")

        return True

    except Exception as e:
        logging.error("Failed to parse the html: %s", e)
        update_query(
            "UPDATE Tracker SET No_Error = No_Error + 1 WHERE Name = '" +
            str(court_name) + "'")
        return False