예제 #1
0
def parser(court_name, page_no, response):
    """Parse an AAR rulings listing page and persist every ruling row.

    The fifth ``<table>`` found in ``response`` is used as the listing and its
    first row is skipped.  For every other row, the text of the ``<strong>``
    element in columns 0-5 supplies the serial number, case id, ruling date,
    applicant, country and ITR/Taxman/CTR reference; the quoted part of the
    link ``href`` in column 6 supplies the PDF name.

    When ``select_count_query`` returns a truthy value for the row
    (presumably "not stored yet" - confirm against the helper), the PDF is
    requested, converted to text, written to a text file, inserted into
    ``aar_rulings`` and both files are handed to ``transfer_to_bucket``.
    Counters in ``tracker`` and rows in ``alerts`` record progress/failures.

    Args:
        court_name: Value matched against ``tracker.court_name``; also used as
            a prefix of the generated text file name.
        page_no: Page identifier stored with alerts and passed to
            ``request_pdf``.
        response: HTML markup of the listing page.

    Returns:
        ``False`` if an exception was caught, otherwise ``None``.
    """
    # Columns 0-5 carry their value inside a <strong> element, in this order.
    text_columns = ("s_no", "case_id", "ruling_date", "applicant", "country",
                    "itr_taxman_ctr")
    try:
        soup = BeautifulSoup(response, "html.parser")
        tables = soup.find_all("table")
        # With fewer than five tables the string "None" is parsed below,
        # which simply yields no rows.
        table_data = tables[4] if len(tables) >= 5 else None
        table_data = BeautifulSoup(str(table_data), "html.parser")
        table_rows = table_data.find_all("tr")

        # Reset the counters once per page.  Doing this inside the row loop
        # wiped inserted_cases/no_text/... before every single row.
        if table_rows:
            update_query(
                "UPDATE tracker SET total_cases=%s, inserted_cases=0, no_pdf=0, no_text=0, transferred_pdf=0,"
                "transferred_text=0 WHERE court_name=%s",
                ((len(table_rows) - 1), court_name))

        # The first row is skipped (it is not a ruling).
        for table_row in table_rows[1:]:
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM tracker WHERE court_name=%s",
                court_name)
            if emergency_exit is not None and emergency_exit['emergency_exit'] == 1:
                break

            fields = dict.fromkeys(text_columns)
            pdf_url = None

            row = BeautifulSoup(str(table_row), "html.parser")
            for j, td in enumerate(row.find_all("td")):
                td_soup = BeautifulSoup(str(td), "html.parser")
                if j < len(text_columns):
                    strong_text = td_soup.find('strong')
                    if strong_text is not None:
                        fields[text_columns[j]] = escape_string(
                            str(strong_text.decode_contents()))
                elif j == 6 and td_soup.a is not None:
                    # The PDF name is the text between the first and the last
                    # single quote of the link's href.
                    href = str(td_soup.a['href'])
                    pdf_url = href[href.index("'") + 1:href.rindex("'")]

            s_no = fields["s_no"]
            case_id = fields["case_id"]
            ruling_date = fields["ruling_date"]
            applicant = fields["applicant"]
            country = fields["country"]
            itr_taxman_ctr = fields["itr_taxman_ctr"]

            # NOTE(review): case_id is escaped a second time here, exactly as
            # before, while the UPDATE statements below match on the singly
            # escaped value - confirm this is intended.
            escaped_case_id = str(escape_string(case_id))

            if select_count_query(str(court_name), escaped_case_id, 'date',
                                  ruling_date):
                pdf_filepath = None
                text_filename = None
                pdf_final_url = None
                pdf_filename = None
                if pdf_url is not None:
                    pdf_filename = slugify('aar-rulings' + escaped_case_id +
                                           str(ruling_date)) + '.pdf'
                    text_filename = slugify('aar-rulings-' + escaped_case_id +
                                            str(ruling_date)) + '.txt'
                    pdf_final_url = 'http://aarrulings.in/it-rulings/uploads/pdf/' + pdf_url
                    pdf_filepath = request_pdf(pdf_final_url, pdf_filename,
                                               court_name, case_id, page_no)

                if pdf_filepath is not None:
                    pdf_text_data = escape_string(
                        str(pdf_to_text_api(pdf_filepath)))
                    text_filepath = module_directory + "/../data_files/text_files/" + court_name + '_' + text_filename
                    # Close (and therefore flush) the file before it is
                    # uploaded and removed further down.
                    with open(text_filepath, "w") as fw:
                        fw.write(pdf_text_data)
                else:
                    text_filepath = None
                    pdf_text_data = None
                    pdf_filename = None
                    text_filename = None

                if insert_query(
                        "INSERT INTO aar_rulings (sl_no, case_id, date, country, "
                        "itr_taxman_ctr, pdf_url, pdf_filename, text_filename) "
                        "VALUES (%s,%s,%s,%s,%s,%s,%s,%s)",
                        (s_no, escaped_case_id, ruling_date, country,
                         itr_taxman_ctr, pdf_final_url, pdf_filename,
                         text_filename)):
                    update_query(
                        "UPDATE tracker SET inserted_cases=inserted_cases+1 WHERE court_name=%s",
                        court_name)
                else:
                    # The statement used to end in "and page_no" and received
                    # two parameters for a single placeholder; it now matches
                    # the other no_alerts updates of this function.
                    update_query(
                        "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s",
                        court_name)
                    insert_query(
                        "INSERT INTO alerts (court_name, case_id, page_no,error_message) VALUES "
                        "(%s, %s, %s, %s)",
                        (court_name, case_id, page_no,
                         'Failed to insert court data in table'))

                if update_query(
                        "UPDATE aar_rulings SET name_of_applicant=%s WHERE case_id=%s",
                        (applicant, case_id)):
                    # NOTE(review): the applicant update bumps no_text, same
                    # as the text update below - kept as in the original.
                    update_query(
                        "UPDATE tracker SET no_text=no_text+1 WHERE court_name=%s",
                        court_name)
                else:
                    insert_query(
                        "INSERT INTO alerts (court_name, case_id, page_no,error_message) VALUES (%s, %s, %s,"
                        " %s)",
                        (court_name, case_id, page_no,
                         'Failed to insert applicant name in table'))
                    update_query(
                        "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s",
                        court_name)

                if update_query(
                        "UPDATE aar_rulings SET text_data=%s WHERE case_id=%s",
                        (pdf_text_data, case_id)):
                    update_query(
                        "UPDATE tracker SET no_text=no_text+1 WHERE court_name=%s",
                        court_name)
                else:
                    insert_query(
                        "INSERT INTO alerts (court_name, case_id, page_no,error_message) VALUES (%s, %s, %s,"
                        " %s)", (court_name, case_id, page_no,
                                 'Failed to insert text data in table'))
                    update_query(
                        "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s",
                        court_name)

                if transfer_to_bucket('PDF_Files', pdf_filepath):
                    update_query(
                        "UPDATE tracker SET transferred_pdf=transferred_pdf+1 WHERE court_name=%s",
                        court_name)
                    os.remove(pdf_filepath)
                else:
                    insert_query(
                        "INSERT INTO alerts (court_name, case_id, error_message) VALUES (%s, %s, %s)",
                        (court_name, case_id,
                         'Failed to transfer pdf to bucket.'))
                    update_query(
                        "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s",
                        court_name)

                if transfer_to_bucket('Text_Files', text_filepath):
                    update_query(
                        "UPDATE tracker SET transferred_text=transferred_text+1 WHERE court_name=%s",
                        court_name)
                    os.remove(text_filepath)
                else:
                    insert_query(
                        "INSERT INTO alerts (court_name, case_id, error_message) VALUES (%s, %s, %s)",
                        (court_name, case_id,
                         'Failed to transfer text to bucket.'))
                    update_query(
                        "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s",
                        court_name)
            else:
                # Row rejected by select_count_query: every counter is bumped
                # so the totals still add up to total_cases.
                update_query(
                    "UPDATE tracker SET inserted_cases=inserted_cases+1, no_pdf=no_pdf+1, no_text=no_text+1, "
                    "transferred_pdf=transferred_pdf+1, transferred_text=transferred_text+1 WHERE "
                    "court_name=%s", court_name)

    except Exception as e:
        # Top-level boundary for this page: log, record an alert, report
        # failure to the caller.
        traceback.print_exc()
        logging.error("Failed to request: %s", e)
        insert_query(
            "INSERT INTO alerts (court_name, page_no, error_message) VALUES (%s, %s, %s)",
            (court_name, page_no, str(e)))
        update_query(
            "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s",
            court_name)
        return False
예제 #2
0
def parser(base_url, court_name, bench_id, response):
    """Store every judgment listed in ``response`` for one bench.

    ``response`` is iterated as a sequence of mappings with the keys
    ``CaseType``, ``CaseNo``, ``CaseYr``, ``Jud_Dt`` and ``Jud_Pdf_Name``.
    The tracker counters for ``court_name``/``bench_id`` are reset first.
    For each case for which ``select_count_query`` returns a truthy value,
    the PDF at ``base_url + 'viewpdf/' + Jud_Pdf_Name`` is requested,
    converted to text, inserted into the ``kolkata`` table and both files are
    passed to ``transfer_to_bucket``.  Failures are recorded in ``alerts``
    and in ``tracker.no_alerts``.

    Args:
        base_url: URL prefix to which ``'viewpdf/'`` and the PDF name are
            appended.
        court_name: Value matched against ``tracker.court_name``; also used as
            a prefix of the generated text file name.
        bench_id: Value matched against ``tracker.bench`` and stored with
            each inserted row and alert.
        response: Sized iterable of case mappings (see above).

    Returns:
        None.
    """
    pdf_base_path = base_url + 'viewpdf/'

    update_query("UPDATE tracker SET total_cases=%s, inserted_cases=0, no_pdf=0, no_text=0, transferred_pdf=0,"
                 "transferred_text=0 WHERE court_name=%s and bench=%s", (str(len(response)), court_name, bench_id))

    for case in response:
        # The tracker row doubles as a stop switch that is polled per case.
        emergency_exit = select_one_query("SELECT emergency_exit FROM tracker WHERE court_name=%s and bench=%s",
                                          (court_name, bench_id))
        if emergency_exit is not None and emergency_exit['emergency_exit'] == 1:
            break

        case_type = case['CaseType']
        case_no = case['CaseNo']
        case_yr = case['CaseYr']
        jud_dt = case['Jud_Dt']
        jud_pdf_name = case['Jud_Pdf_Name']

        case_id = case_type + ' ' + case_no + ' OF ' + case_yr

        if not select_count_query(str(court_name), str(case_id), 'judgment_date', jud_dt):
            # Case rejected by select_count_query: bump every counter so the
            # totals still add up to total_cases.
            update_query("UPDATE tracker SET inserted_cases=inserted_cases+1, no_pdf=no_pdf+1, no_text=no_text+1,"
                         "transferred_pdf=transferred_pdf+1, transferred_text=transferred_text+1 WHERE court_name=%s "
                         "and bench=%s", (court_name, bench_id))
            continue

        pdf_url = pdf_base_path + jud_pdf_name
        pdf_filename = str(jud_pdf_name).replace('.pdf', '')

        pdf_filepath = request_pdf(pdf_url, pdf_filename, court_name, bench_id, case_id)
        if pdf_filepath is not None:
            pdf_text_data = escape_string(str(pdf_to_text_api(pdf_filepath)))

            text_filepath = module_directory + "/../data_files/text_files/" + court_name + "_" + slugify(
                pdf_filename) + '.txt'
            # Close (and therefore flush) the file before it is uploaded and
            # removed further down; the handle used to be left open.
            with open(text_filepath, "w") as fw:
                fw.write(pdf_text_data)
            text_filename = jud_pdf_name.replace('.pdf', '.txt')
        else:
            text_filepath = None
            pdf_text_data = None
            text_filename = None
            jud_pdf_name = None

        if insert_query(
                "INSERT INTO kolkata (case_id, judgment_date, pdf_url, pdf_filename, text_filename, case_type, "
                "case_no, case_year, bench) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)",
                (case_id, jud_dt, pdf_url, jud_pdf_name, text_filename, case_type, case_no,
                 case_yr, bench_id)):
            update_query("UPDATE tracker SET inserted_cases=inserted_cases+1 WHERE court_name=%s and bench=%s",
                         (court_name, bench_id))
        else:
            update_query("UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s and bench=%s",
                         (court_name, bench_id))
            insert_query("INSERT INTO alerts (court_name, bench, case_id, error_message) VALUES (%s, %s, %s, %s)",
                         (court_name, bench_id, case_id, 'Failed to insert court data in table'))

        if update_query("UPDATE kolkata SET text_data=%s WHERE case_id=%s", (pdf_text_data, case_id)):
            update_query("UPDATE tracker SET no_text=no_text+1 WHERE court_name=%s and bench=%s",
                         (court_name, bench_id))
        else:
            insert_query("INSERT INTO alerts (court_name, bench, case_id, error_message) VALUES (%s, %s, %s, %s)",
                         (court_name, bench_id, case_id, 'Failed to insert text data.'))
            update_query("UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s and bench=%s",
                         (court_name, bench_id))

        if transfer_to_bucket('PDF_Files', pdf_filepath):
            update_query("UPDATE tracker SET transferred_pdf=transferred_pdf+1 WHERE court_name=%s and bench=%s",
                         (court_name, bench_id))
            os.remove(pdf_filepath)
        else:
            insert_query("INSERT INTO alerts (court_name, bench, case_id, error_message) VALUES (%s, %s, %s, %s)",
                         (court_name, bench_id, case_id, 'Failed to transfer PDF to bucket.'))
            update_query("UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s and bench=%s",
                         (court_name, bench_id))

        if transfer_to_bucket('Text_Files', text_filepath):
            update_query("UPDATE tracker SET transferred_text=transferred_text+1 WHERE court_name=%s and "
                         "bench=%s", (court_name, bench_id))
            os.remove(text_filepath)
        else:
            insert_query("INSERT INTO alerts (court_name, bench, case_id, error_message) VALUES (%s, %s, %s, %s)",
                         (court_name, bench_id, case_id, 'Failed to transfer text to bucket.'))
            update_query("UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s and bench=%s",
                         (court_name, bench_id))
def parser(court_name, bench_id, response):
    """Parse a judgment listing and store each row in the NCLAT table.

    The rows of the first ``<tbody>`` in ``response`` are processed one by
    one.  Cells 1-6 supply case id, judgment date, party, section, court name
    and "order passed by"; the link in cell 7 supplies the PDF URL.  The
    tracker counters for ``court_name``/``bench_id`` are reset first.  For
    each row for which ``select_count_query`` returns a truthy value, the PDF
    is requested, converted to text, inserted into
    ``national_company_law_appellate_tribunal`` and both files are passed to
    ``transfer_to_bucket``.  Failures are recorded in ``alerts`` and in
    ``tracker.no_alerts``.

    Args:
        court_name: Value matched against ``tracker.court_name``; also part of
            the generated file names.
        bench_id: Value matched against ``tracker.bench`` and stored with
            each inserted row and alert.
        response: HTML markup (converted with ``str``) containing the listing.

    Returns:
        None.

    Raises:
        IndexError: If ``response`` contains no ``<tbody>`` element.
    """
    tbody = BeautifulSoup(str(response), "html.parser").find_all('tbody')[0]
    tr_list = BeautifulSoup(str(tbody), "html.parser").find_all('tr')

    update_query(
        "UPDATE tracker SET total_cases=%s, inserted_cases=0, no_pdf=0, no_text=0, transferred_pdf=0,"
        "transferred_text=0 WHERE court_name=%s AND bench=%s",
        (str(len(tr_list)), court_name, bench_id))

    for tr in tr_list:
        # The tracker row doubles as a stop switch that is polled per row.
        emergency_exit = select_one_query(
            "SELECT emergency_exit FROM tracker WHERE court_name=%s AND bench=%s",
            (court_name, bench_id))
        if emergency_exit is not None and emergency_exit['emergency_exit'] == 1:
            break

        case_id = None
        judgment_date = None
        party = None
        section = None
        court_name_ = None
        order_passed_by = None
        pdf_url = None

        td_list = BeautifulSoup(str(tr), "html.parser").find_all('td')
        for position, td in enumerate(td_list, start=1):
            if position == 1:
                case_id = escape_string(str(td.decode_contents()))
            elif position == 2:
                judgment_date = escape_string(str(td.decode_contents()))
            elif position == 3:
                party = escape_string(str(td.decode_contents()))
            elif position == 4:
                section = escape_string(str(td.decode_contents()))
            elif position == 5:
                court_name_ = escape_string(str(td.decode_contents()))
            elif position == 6:
                order_passed_by = escape_string(str(td.decode_contents()))
            elif position == 7:
                a_tag = BeautifulSoup(str(td), "html.parser").a
                if a_tag:
                    pdf_url = escape_string(str(a_tag.get('href')))
                else:
                    pdf_url = None

        if not select_count_query(str(court_name), str(case_id),
                                  'judgment_date', judgment_date):
            # Row rejected by select_count_query: bump every counter so the
            # totals still add up to total_cases.
            update_query(
                "UPDATE tracker SET inserted_cases=inserted_cases+1, no_pdf=no_pdf+1, no_text=no_text+1,"
                "transferred_pdf=transferred_pdf+1, transferred_text=transferred_text+1 WHERE court_name=%s "
                "AND bench=%s", (court_name, bench_id))
            continue

        file_stem = slugify(court_name + '-' + case_id + '-' + judgment_date)
        pdf_filename = file_stem + '.pdf'
        text_filename = file_stem + '.txt'

        pdf_filepath = request_pdf(pdf_url, pdf_filename, court_name,
                                   bench_id, case_id)
        if pdf_filepath is not None:
            # Test the raw conversion result: after escape_string(str(...))
            # the value can never be None, which made the fallback branch
            # unreachable and stored the literal text "None".
            extracted_text = pdf_to_text_api(pdf_filepath)
            if extracted_text is not None:
                pdf_text_data = escape_string(str(extracted_text))
                text_filepath = module_directory + "/../data_files/text_files/" + court_name + '_' + text_filename
                # Close (and therefore flush) the file before it is uploaded
                # and removed further down.
                with open(text_filepath, "w") as fw:
                    fw.write(pdf_text_data)
            else:
                pdf_text_data = None
                text_filepath = None
                text_filename = None
        else:
            text_filepath = None
            pdf_text_data = None
            pdf_filename = None

        if insert_query(
                "INSERT INTO national_company_law_appellate_tribunal (case_id, judgment_date, party, section, "
                "court_name, order_passed_by, pdf_url, pdf_filename, text_filename, bench) VALUES (%s, %s, %s, %s, "
                "%s, %s, %s, %s, %s, %s)",
                (case_id, judgment_date, party, section, court_name_,
                 order_passed_by, pdf_url, pdf_filename, text_filename,
                 bench_id)):
            update_query(
                "UPDATE tracker SET inserted_cases=inserted_cases+1 WHERE court_name=%s AND bench=%s",
                (court_name, bench_id))
        else:
            update_query(
                "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s AND bench=%s",
                (court_name, bench_id))
            insert_query(
                "INSERT INTO alerts (court_name, bench, case_id, error_message) VALUES (%s, %s, %s, %s)",
                (court_name, bench_id, case_id,
                 'Failed to insert court data in table'))

        if update_query(
                "UPDATE national_company_law_appellate_tribunal SET text_data=%s WHERE case_id=%s",
                (pdf_text_data, case_id)):
            update_query(
                "UPDATE tracker SET no_text=no_text+1 WHERE court_name=%s AND bench=%s",
                (court_name, bench_id))
        else:
            insert_query(
                "INSERT INTO alerts (court_name, bench, case_id, error_message) VALUES (%s, %s, %s, %s)",
                (court_name, bench_id, case_id,
                 'Failed to insert text data.'))
            update_query(
                "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s AND bench=%s",
                (court_name, bench_id))

        if transfer_to_bucket('PDF_Files', pdf_filepath):
            update_query(
                "UPDATE tracker SET transferred_pdf=transferred_pdf+1 WHERE court_name=%s AND bench=%s",
                (court_name, bench_id))
            os.remove(pdf_filepath)
        else:
            insert_query(
                "INSERT INTO alerts (court_name, bench, case_id, error_message) VALUES (%s, %s, %s, %s)",
                (court_name, bench_id, case_id,
                 'Failed to transfer pdf to bucket.'))
            update_query(
                "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s AND bench=%s",
                (court_name, bench_id))

        if transfer_to_bucket('Text_Files', text_filepath):
            update_query(
                "UPDATE tracker SET transferred_text=transferred_text+1 WHERE court_name=%s AND bench=%s",
                (court_name, bench_id))
            os.remove(text_filepath)
        else:
            insert_query(
                "INSERT INTO alerts (court_name, bench, case_id, error_message) VALUES (%s, %s, %s, %s)",
                (court_name, bench_id, case_id,
                 'Failed to transfer text to bucket.'))
            update_query(
                "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s AND bench=%s",
                (court_name, bench_id))
def _split_order_and_date(order_text, markers=('dated', 'dt.', 'dtd.')):
    """Split an "<order no> dated <date>" string at the first known marker.

    The markers are tried in the given order; the text before the marker is
    the order number and the text between the first and second occurrence
    (or the end) is the date.  When no marker occurs, the whole text is
    returned as the order number together with an empty date, instead of
    indexing single characters out of the unsplit string.
    """
    for marker in markers:
        if marker in order_text:
            pieces = order_text.split(marker)
            return pieces[0], pieces[1]
    return order_text, ''


def parser(base_url, court_name, response):
    """Parse the GST appellate listing and store each order row.

    The rows of the first ``<tbody>`` inside the first table with class
    ``custum-tbl table table-bordered`` are processed one by one.  Cells 2-4
    supply state, appellant name and a brief of the order; cell 5 holds the
    appeal order number and date in one string; cell 6 links to the appeal
    PDF (relative to ``base_url``); cell 7 is either ``-`` or a link to the
    related AAR order whose text again holds order number and date.

    The tracker counters for ``court_name`` are reset first.  For each row
    for which ``select_count_query`` returns a truthy value, both PDFs are
    requested and converted to text, the row is inserted into
    ``gst_appellate`` and the files are passed to ``transfer_to_bucket``.
    Failures are recorded in ``alerts`` and in ``tracker.no_alerts``.

    Args:
        base_url: Prefix prepended to the appeal PDF link.
        court_name: Value matched against ``tracker.court_name``; also used as
            a prefix of the generated text file names.
        response: HTML markup of the listing page.

    Returns:
        None.

    Raises:
        IndexError: If the expected table or its ``<tbody>`` is missing.
    """
    table = BeautifulSoup(response, "html.parser").find_all(
        'table', {'class': 'custum-tbl table table-bordered'})[0]
    tbody = BeautifulSoup(str(table), "html.parser").find_all('tbody')[0]
    tr_list = BeautifulSoup(str(tbody), "html.parser").find_all('tr')

    update_query(
        "UPDATE tracker SET total_cases=%s, inserted_cases=0, no_pdf=0, no_text=0, transferred_pdf=0,"
        "transferred_text=0 WHERE court_name=%s",
        (str(len(tr_list)), court_name))

    text_dir = module_directory + "/../data_files/text_files/"

    for tr in tr_list:
        # The tracker row doubles as a stop switch that is polled per row.
        emergency_exit = select_one_query(
            "SELECT emergency_exit FROM tracker WHERE court_name=%s",
            court_name)
        if emergency_exit is not None and emergency_exit['emergency_exit'] == 1:
            break

        state = None
        name_of_appellant = None
        brief_of_order_in_appeal = None
        appeal_order_no = None
        appeal_order_date = None
        pdf_url = None
        aar_order_no = None
        aar_order_date = None
        aar_pdf_url = None

        td_list = BeautifulSoup(str(tr), "html.parser").find_all('td')
        for position, td in enumerate(td_list, start=1):
            if position == 2:
                state = escape_string(str(td.decode_contents()))
            elif position == 3:
                name_of_appellant = escape_string(str(td.decode_contents()))
            elif position == 4:
                brief_of_order_in_appeal = escape_string(
                    str(td.decode_contents()))
            elif position == 5:
                # Same marker set as the AAR column below ('dtd.' used to be
                # recognised only there).
                order_no, order_date = _split_order_and_date(
                    str(td.decode_contents()).lower())
                appeal_order_no = escape_string(order_no)
                appeal_order_date = escape_string(order_date)
            elif position == 6:
                a_tag = BeautifulSoup(str(td), "html.parser").a
                pdf_url = escape_string(str(base_url + a_tag.get('href')))
            elif position == 7:
                # A bare '-' means there is no related AAR order.
                if str(td.decode_contents()) != '-':
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    aar_pdf_url = escape_string(str(a_tag.get('href')))
                    order_no, order_date = _split_order_and_date(
                        str(a_tag.decode_contents()).lower())
                    aar_order_no = escape_string(order_no)
                    aar_order_date = escape_string(order_date)

        if not select_count_query(str(court_name), str(appeal_order_no),
                                  'appeal_order_date', appeal_order_date):
            # Row rejected by select_count_query: bump every counter so the
            # totals still add up to total_cases.
            update_query(
                "UPDATE tracker SET inserted_cases=inserted_cases+1, no_pdf=no_pdf+1, no_text=no_text+1,"
                "transferred_pdf=transferred_pdf+1, transferred_text=transferred_text+1 WHERE court_name=%s",
                court_name)
            continue

        appeal_stem = slugify('appeal-' + appeal_order_no + appeal_order_date)
        pdf_filename = appeal_stem + '.pdf'
        text_filename = appeal_stem + '.txt'

        pdf_filepath = request_pdf(pdf_url, pdf_filename, court_name,
                                   appeal_order_no)
        if pdf_filepath is not None:
            pdf_text_data = escape_string(
                str(pdf_to_text_api(pdf_filepath)))

            text_filepath = text_dir + court_name + '_' + text_filename
            # Close (and therefore flush) the file before it is uploaded and
            # removed further down.
            with open(text_filepath, "w") as fw:
                fw.write(pdf_text_data)
        else:
            text_filepath = None
            pdf_text_data = None
            pdf_filename = None
            text_filename = None

        if aar_order_no is not None:
            aar_stem = slugify('aar-' + aar_order_no + aar_order_date)
            aar_pdf_filename = aar_stem + '.pdf'
            aar_text_filename = aar_stem + '.txt'

            aar_pdf_filepath = request_pdf(aar_pdf_url, aar_pdf_filename,
                                           court_name, aar_order_no)
            if aar_pdf_filepath is not None:
                # str() mirrors the handling of the appeal text above.
                aar_text_data = escape_string(
                    str(pdf_to_text_api(aar_pdf_filepath)))

                aar_text_filepath = text_dir + court_name + '_' + aar_text_filename
                with open(aar_text_filepath, "w") as fw:
                    fw.write(aar_text_data)
            else:
                aar_text_filepath = None
                aar_text_data = None
        else:
            aar_pdf_filename = None
            aar_text_filename = None
            aar_text_data = None
            aar_pdf_filepath = None
            aar_text_filepath = None

        # NOTE(review): the column is spelled "arr_pdf_filename" in the
        # statement below; left untouched because the table schema is not
        # visible here.
        if insert_query(
                "INSERT INTO gst_appellate (case_id, appeal_order_no, appeal_order_date, name_of_appellant, "
                "brief_of_order_in_appeal, state, aar_order_no, aar_order_date, pdf_url, pdf_filename, "
                "text_filename, aar_pdf_url, arr_pdf_filename, aar_text_filename) VALUES "
                "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
                (appeal_order_no, appeal_order_no, appeal_order_date,
                 name_of_appellant, brief_of_order_in_appeal, state,
                 aar_order_no, aar_order_date, pdf_url, pdf_filename,
                 text_filename, aar_pdf_url, aar_pdf_filename,
                 aar_text_filename)):
            update_query(
                "UPDATE tracker SET inserted_cases=inserted_cases+1 WHERE court_name=%s",
                court_name)
        else:
            update_query(
                "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s",
                court_name)
            insert_query(
                "INSERT INTO alerts (court_name, case_id, error_message) VALUES (%s, %s, %s)",
                (court_name, appeal_order_no,
                 'Failed to insert court data in table'))

        if update_query(
                "UPDATE gst_appellate SET text_data=%s WHERE case_id=%s",
                (pdf_text_data, appeal_order_no)):
            update_query(
                "UPDATE tracker SET no_text=no_text+1 WHERE court_name=%s",
                court_name)
        else:
            insert_query(
                "INSERT INTO alerts (court_name, case_id, error_message) VALUES (%s, %s, %s)",
                (court_name, appeal_order_no,
                 'Failed to insert text data.'))
            update_query(
                "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s",
                court_name)

        if update_query(
                "UPDATE gst_appellate SET aar_text_data=%s WHERE case_id=%s",
                (aar_text_data, appeal_order_no)):
            update_query(
                "UPDATE tracker SET no_text=no_text+1 WHERE court_name=%s",
                court_name)
        else:
            insert_query(
                "INSERT INTO alerts (court_name, case_id, error_message) VALUES (%s, %s, %s)",
                (court_name, appeal_order_no,
                 'Failed to insert aar text data.'))
            update_query(
                "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s",
                court_name)

        if transfer_to_bucket('PDF_Files', pdf_filepath):
            update_query(
                "UPDATE tracker SET transferred_pdf=transferred_pdf+1 WHERE court_name=%s",
                court_name)
            os.remove(pdf_filepath)
        else:
            insert_query(
                "INSERT INTO alerts (court_name, case_id, error_message) VALUES (%s, %s, %s)",
                (court_name, appeal_order_no,
                 'Failed to transfer pdf to bucket.'))
            update_query(
                "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s",
                court_name)

        if transfer_to_bucket('Text_Files', text_filepath):
            update_query(
                "UPDATE tracker SET transferred_text=transferred_text+1 WHERE court_name=%s",
                court_name)
            os.remove(text_filepath)
        else:
            insert_query(
                "INSERT INTO alerts (court_name, case_id, error_message) VALUES (%s, %s, %s)",
                (court_name, appeal_order_no,
                 'Failed to transfer text to bucket.'))
            update_query(
                "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s",
                court_name)

        if aar_order_no is not None:
            if transfer_to_bucket('PDF_Files', aar_pdf_filepath):
                update_query(
                    "UPDATE tracker SET transferred_pdf=transferred_pdf+1 WHERE court_name=%s",
                    court_name)
                os.remove(aar_pdf_filepath)
            else:
                insert_query(
                    "INSERT INTO alerts (court_name, case_id, error_message) VALUES (%s, %s, %s)",
                    (court_name, appeal_order_no,
                     'Failed to transfer aar pdf to bucket.'))
                update_query(
                    "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s",
                    court_name)

            if transfer_to_bucket('Text_Files', aar_text_filepath):
                update_query(
                    "UPDATE tracker SET transferred_text=transferred_text+1 WHERE court_name=%s",
                    court_name)
                os.remove(aar_text_filepath)
            else:
                insert_query(
                    "INSERT INTO alerts (court_name, case_id, error_message) VALUES (%s, %s, %s)",
                    (court_name, appeal_order_no,
                     'Failed to transfer aar text to bucket.'))
                update_query(
                    "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s",
                    court_name)
def parser(html_str, court_name, headers):
    """Parse the judgment listing in ``html_str`` and store each case row.

    The table with id ``tables11`` is located, the first three ``<tr>``
    elements are skipped, and every remaining row is read cell by cell
    (case id, parties, judgment date, PDF link).  For rows accepted by
    ``select_count_query`` the PDF is requested, its text is written to a
    file, a row is inserted into ``punjab_haryana`` and both files are
    handed to ``transfer_to_bucket``.  Progress and failures are recorded in
    the ``tracker`` and ``alerts`` tables.

    :param html_str: HTML of the listing page.
    :param court_name: value used for the ``court_name`` column of
        ``tracker``/``alerts`` and as a prefix of the text file name.
    :param headers: passed through unchanged to ``request_pdf``.
    :return: None.
    """
    soup = BeautifulSoup(html_str, "html.parser")
    table_list = soup.find_all('table', {'id': 'tables11'})
    table_soup = BeautifulSoup(str(table_list), "html.parser")
    tr_list = table_soup.find_all('tr')

    # When more than 50 data rows are present, the last two rows are dropped.
    # NOTE(review): presumably these are pagination/footer rows - confirm.
    if int(len(tr_list) - 3) > 50:
        tr_list = tr_list[:-2]

    # Reset the per-run counters; the first three rows are not counted as cases.
    update_query("UPDATE tracker SET total_cases=%s, inserted_cases=0, no_pdf=0, no_text=0, transferred_pdf=0,"
                 "transferred_text=0 WHERE court_name=%s", (str(len(tr_list) - 3), court_name))

    for tr_count, tr in enumerate(tr_list, start=1):
        # The flag is re-read for every row so a run can be stopped midway.
        emergency_exit = select_one_query("SELECT emergency_exit FROM tracker WHERE court_name=%s", (court_name))
        if emergency_exit is not None:
            if emergency_exit['emergency_exit'] == 1:
                break

        # Skip the first three rows of the table.
        if tr_count <= 3:
            continue

        case_id = None
        petitioner = None
        respondent = None
        judgment_date = None
        pdf_url = None

        row_soup = BeautifulSoup(str(tr), "html.parser")
        td_list = row_soup.find_all('td')

        for i, td in enumerate(td_list, start=1):
            if i == 2:
                a_tag = BeautifulSoup(str(td), "html.parser").a
                # A cell without an anchor leaves case_id as None instead of
                # raising AttributeError and aborting the whole run.
                if a_tag is not None:
                    case_id = escape_string(str(a_tag.text))
            elif i == 3:
                party = str(td.decode_contents()).split("Vs")
                petitioner = escape_string(str(party[0]))
                # Without a "Vs" separator there is no second party; keep
                # respondent as None rather than failing with IndexError.
                if len(party) > 1:
                    respondent = escape_string(str(party[1]))
            elif i == 4:
                judgment_date = escape_string(str(td.decode_contents()))
            elif i == 5:
                if str(td.decode_contents()).lower() != 'file not available':
                    link_tag = BeautifulSoup(str(td), "html.parser").a
                    # Only build a URL when the cell really contains a link.
                    if link_tag is not None:
                        a_link = link_tag.get('onclick')
                        a_formatted = str(str(a_link).replace("window.open('", "")).replace("')", "")
                        pdf_url = escape_string(base_url + "/" + a_formatted)

        # case_id is checked first so the lookup is never issued with the
        # string 'None' for rows that carry no case id.
        if case_id is not None and select_count_query(str(court_name), str(case_id), 'judgment_date',
                                                      judgment_date):
            pdf_filename = escape_string(slugify(case_id + '-' + judgment_date)) + '.pdf'
            text_filename = escape_string(slugify(case_id + '-' + judgment_date)) + '.txt'
            pdf_filepath = request_pdf(pdf_url, headers, pdf_filename, court_name, case_id)

            if pdf_filepath is not None:
                pdf_text_data = escape_string(str(pdf_to_text_api(pdf_filepath)))
                text_filepath = module_directory + "/../data_files/text_files/" + court_name + "_" + text_filename
                # The file must be closed (and therefore flushed) before it is
                # uploaded and removed further down.
                with open(text_filepath, "w") as fw:
                    fw.write(pdf_text_data)
            else:
                text_filepath = None
                pdf_text_data = None
                pdf_filename = None
                text_filename = None

            if insert_query(
                    "INSERT INTO punjab_haryana (case_id, judgment_date, petitioner, respondent, pdf_url, "
                    "pdf_filename, text_filename) VALUES (%s, %s, %s, %s, %s, %s, %s)",
                    (case_id, judgment_date, petitioner, respondent, pdf_url, pdf_filename, text_filename)):
                update_query("UPDATE tracker SET inserted_cases=inserted_cases+1 WHERE court_name=%s", (court_name))
            else:
                update_query("UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s", (court_name))
                insert_query("INSERT INTO alerts (court_name, case_id, error_message) VALUES (%s, %s, %s)",
                             (court_name, case_id, 'Failed to insert court data in table'))

            if update_query("UPDATE punjab_haryana SET text_data=%s WHERE case_id=%s", (pdf_text_data, case_id)):
                update_query("UPDATE tracker SET no_text=no_text+1 WHERE court_name=%s", (court_name))
            else:
                insert_query("INSERT INTO alerts (court_name, case_id, error_message) VALUES (%s, %s, %s)",
                             (court_name, case_id, 'Failed to insert text data.'))
                update_query("UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s", (court_name))

            # Local copies are deleted only after a successful transfer.
            if transfer_to_bucket('PDF_Files', pdf_filepath):
                update_query("UPDATE tracker SET transferred_pdf=transferred_pdf+1 WHERE court_name=%s", (court_name))
                os.remove(pdf_filepath)
            else:
                insert_query("INSERT INTO alerts (court_name, case_id, error_message) VALUES (%s, %s, %s)",
                             (court_name, case_id, 'Failed to transfer PDF to bucket.'))
                update_query("UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s", (court_name))

            if transfer_to_bucket('Text_Files', text_filepath):
                update_query("UPDATE tracker SET transferred_text=transferred_text+1 WHERE court_name=%s", (court_name))
                os.remove(text_filepath)
            else:
                insert_query("INSERT INTO alerts (court_name, case_id, error_message) VALUES (%s, %s, %s)",
                             (court_name, case_id, 'Failed to transfer text to bucket.'))
                update_query("UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s", (court_name))

        else:
            # Rows that are not processed still advance every progress counter.
            # NOTE(review): presumably so the tracker totals add up to
            # total_cases - confirm.
            update_query("UPDATE tracker SET inserted_cases=inserted_cases+1, no_pdf=no_pdf+1, no_text=no_text+1,"
                         "transferred_pdf=transferred_pdf+1, transferred_text=transferred_text+1 WHERE court_name=%s",
                         (court_name))
def create_transfer_json(court_name):
    """Dump rows with ``is_json=0`` to JSON files and transfer them.

    Rows of the table named ``court_name`` are read in batches of 1000.
    Each non-empty batch is written to a ``new-<court_name>-<n>.json`` file
    and passed to ``transfer_to_bucket``.  On a successful transfer the
    batch's rows are marked ``is_json=1``, the tracker counters are
    incremented and the local file is removed; otherwise an alert is
    recorded.

    :param court_name: name of the court table; also the ``court_name`` key
        used in the ``tracker`` and ``alerts`` tables.  It is concatenated
        into the SQL text as a table name, so it must come from a trusted
        source.
    :return: True when the export loop ran to completion, False when any
        exception was raised (an alert is recorded in that case).
    """
    db = db_connect()
    try:
        cursor = db.cursor()
        cursor.execute("select count(id) as num_rows from " + str(court_name) +
                       " WHERE is_json=0")
        result = cursor.fetchall()
        cursor.close()
        no_rows = result[0]['num_rows']

        no_of_data_per_iteration = 1000
        no_of_iteration = floor(int(no_rows) / no_of_data_per_iteration) + 1

        # Number of JSON files produced so far; used to continue the file
        # numbering of this run.
        j_count = select_one_query(
            "SELECT no_json FROM tracker WHERE court_name=%s",
            (court_name))['no_json']

        for i in range(0, no_of_iteration):
            # NOTE(review): OFFSET pagination over "is_json=0" while the loop
            # flips rows to is_json=1 can skip rows unless this connection
            # keeps reading from a stable snapshot - confirm the isolation
            # level / engine in use.
            cursor = db.cursor()
            cursor.execute("SELECT * FROM " + str(court_name) +
                           " WHERE is_json=0 LIMIT " +
                           str(no_of_data_per_iteration) + " OFFSET " +
                           str(i * no_of_data_per_iteration))
            result = cursor.fetchall()
            cursor.close()

            if result:
                file_path = module_directory + "/../data_files/json_files/new-" + str(
                    court_name) + "-" + str(i + 1 + j_count) + ".json"
                # Serialize first so a failure does not leave an empty file,
                # then close the file before uploading so that all buffered
                # data is on disk when transfer_to_bucket reads it.
                payload = json.dumps(result)
                with open(file_path, "w") as fw:
                    fw.write(payload)

                if transfer_to_bucket('JSON_Files', file_path):
                    for record in result:
                        update_query("UPDATE " + court_name +
                                     " SET is_json=1 WHERE id='" +
                                     str(record['id']) + "'")

                    update_query(
                        "UPDATE tracker SET no_json=no_json+1, transferred_json=transferred_json+1 "
                        "WHERE court_name=%s", (court_name))
                    # The local copy is deleted only after a successful transfer.
                    os.remove(file_path)
                else:
                    insert_query(
                        "INSERT INTO alerts (court_name, error_message) VALUES (%s, %s)",
                        (court_name, 'JSON Failed to transfer to bucket.'))
                    update_query(
                        "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s",
                        (court_name))

        db.close()
        return True

    except Exception as e:
        # Any failure is recorded as an alert and reported as False to the
        # caller rather than propagated.
        insert_query(
            "INSERT INTO alerts (court_name, error_message) VALUES (%s, %s)",
            (court_name, 'JSON Error'))
        update_query(
            "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s",
            (court_name))
        traceback.print_exc()
        logging.error("Failed select query: %s", e)
        db.close()
        return False