def parser(court_name, page_no, response):
    """Parse one AAR rulings listing page and record every ruling row.

    The fifth ``<table>`` of ``response`` is treated as the listing and its
    first ``<tr>`` as the header.  For each remaining row the ``<strong>``
    text of the first six cells and the quoted part of the link ``href`` in
    the seventh cell are extracted.  When ``select_count_query`` is truthy
    the PDF is requested, its text is written to a text file, the row is
    inserted into ``aar_rulings`` and both files are handed to
    ``transfer_to_bucket``; otherwise every tracker counter is bumped
    without touching the row (presumably the ruling is already stored --
    confirm against ``select_count_query``).

    Args:
        court_name: Tracker key; also passed to the helpers and used as the
            prefix of the text-file name.
        page_no: Page number, stored with alerts and passed to
            ``request_pdf``.
        response: HTML of the listing page.

    Returns:
        ``False`` when an exception aborted the page, otherwise ``None``.
    """
    try:
        soup = BeautifulSoup(response, "html.parser")
        tables = soup.find_all("table")
        table_data = tables[4] if len(tables) >= 5 else None
        # With fewer than five tables this parses the text "None" and
        # therefore yields no rows.
        table_data = BeautifulSoup(str(table_data), "html.parser")
        table_rows = table_data.find_all("tr")

        if table_rows:
            # Reset the counters once per page.  Doing it inside the row
            # loop wiped the counts accumulated for the previous rows.
            update_query(
                "UPDATE tracker SET total_cases=%s, inserted_cases=0, no_pdf=0, no_text=0, transferred_pdf=0,"
                "transferred_text=0 WHERE court_name=%s", ((len(table_rows) - 1), court_name))

        # The first row is the header.
        for table_row in table_rows[1:]:
            emergency_exit = select_one_query(
                "SELECT emergency_exit FROM tracker WHERE court_name=%s", court_name)
            if emergency_exit is not None and emergency_exit['emergency_exit'] == 1:
                break

            s_no = None
            country = None
            applicant = None
            case_id = None
            pdf_url = None
            ruling_date = None
            itr_taxman_ctr = None

            row = BeautifulSoup(str(table_row), "html.parser")
            for j, td in enumerate(row.find_all("td")):
                td_soup = BeautifulSoup(str(td), "html.parser")
                if j <= 5:
                    strong_text = td_soup.find('strong')
                    if strong_text is None:
                        continue
                    value = escape_string(str(strong_text.decode_contents()))
                    if j == 0:
                        s_no = value
                    elif j == 1:
                        case_id = value
                    elif j == 2:
                        ruling_date = value
                    elif j == 3:
                        applicant = value
                    elif j == 4:
                        country = value
                    else:
                        itr_taxman_ctr = value
                elif j == 6 and td_soup.a is not None:
                    # The file name is the part of the href between the
                    # first and the last single quote.
                    href = str(td_soup.a['href'])
                    pdf_url = href[href.index("'") + 1:href.rindex("'")]

            if not select_count_query(str(court_name), str(escape_string(case_id)), 'date', ruling_date):
                update_query(
                    "UPDATE tracker SET inserted_cases=inserted_cases+1, no_pdf=no_pdf+1, no_text=no_text+1, "
                    "transferred_pdf=transferred_pdf+1, transferred_text=transferred_text+1 WHERE "
                    "court_name=%s", court_name)
                continue

            pdf_filepath = None
            text_filename = None
            pdf_final_url = None
            pdf_filename = None
            if pdf_url is not None:
                pdf_filename = slugify('aar-rulings' + str(escape_string(case_id)) + str(ruling_date)) + '.pdf'
                text_filename = slugify('aar-rulings-' + str(escape_string(case_id)) + str(ruling_date)) + '.txt'
                pdf_final_url = 'http://aarrulings.in/it-rulings/uploads/pdf/' + pdf_url
                pdf_filepath = request_pdf(pdf_final_url, pdf_filename, court_name, case_id, page_no)

            if pdf_filepath is not None:
                pdf_text_data = escape_string(str(pdf_to_text_api(pdf_filepath)))
                text_filepath = module_directory + "/../data_files/text_files/" + court_name + '_' + text_filename
                # Close (and thereby flush) the file before it is uploaded
                # further down.
                with open(text_filepath, "w") as fw:
                    fw.write(pdf_text_data)
            else:
                text_filepath = None
                pdf_text_data = None
                pdf_filename = None
                text_filename = None

            if insert_query(
                    "INSERT INTO aar_rulings (sl_no, case_id, date, country, "
                    "itr_taxman_ctr, pdf_url, pdf_filename, text_filename) "
                    "VALUES (%s,%s,%s,%s,%s,%s,%s,%s)",
                    (s_no, str(escape_string(case_id)), ruling_date, country, itr_taxman_ctr,
                     pdf_final_url, pdf_filename, text_filename)):
                update_query(
                    "UPDATE tracker SET inserted_cases=inserted_cases+1 WHERE court_name=%s", court_name)
            else:
                # The query used to end in "and page_no" with two bound
                # values for one placeholder; it now matches the other
                # no_alerts updates in this function.
                update_query(
                    "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s", court_name)
                insert_query(
                    "INSERT INTO alerts (court_name, case_id, page_no,error_message) VALUES "
                    "(%s, %s, %s, %s)",
                    (court_name, case_id, page_no, 'Failed to insert court data in table'))

            # NOTE(review): a successful applicant update bumps no_text,
            # just like the text update below -- confirm the double count
            # is intended.
            if update_query(
                    "UPDATE aar_rulings SET name_of_applicant=%s WHERE case_id=%s", (applicant, case_id)):
                update_query(
                    "UPDATE tracker SET no_text=no_text+1 WHERE court_name=%s", court_name)
            else:
                insert_query(
                    "INSERT INTO alerts (court_name, case_id, page_no,error_message) VALUES (%s, %s, %s,"
                    " %s)", (court_name, case_id, page_no, 'Failed to insert applicant name in table'))
                update_query(
                    "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s", court_name)

            if update_query(
                    "UPDATE aar_rulings SET text_data=%s WHERE case_id=%s", (pdf_text_data, case_id)):
                update_query(
                    "UPDATE tracker SET no_text=no_text+1 WHERE court_name=%s", court_name)
            else:
                insert_query(
                    "INSERT INTO alerts (court_name, case_id, page_no,error_message) VALUES (%s, %s, %s,"
                    " %s)", (court_name, case_id, page_no, 'Failed to insert text data in table'))
                update_query(
                    "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s", court_name)

            if transfer_to_bucket('PDF_Files', pdf_filepath):
                update_query(
                    "UPDATE tracker SET transferred_pdf=transferred_pdf+1 WHERE court_name=%s", court_name)
                os.remove(pdf_filepath)
            else:
                insert_query(
                    "INSERT INTO alerts (court_name, case_id, error_message) VALUES (%s, %s, %s)",
                    (court_name, case_id, 'Failed to transfer pdf to bucket.'))
                update_query(
                    "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s", court_name)

            if transfer_to_bucket('Text_Files', text_filepath):
                update_query(
                    "UPDATE tracker SET transferred_text=transferred_text+1 WHERE court_name=%s", court_name)
                os.remove(text_filepath)
            else:
                insert_query(
                    "INSERT INTO alerts (court_name, case_id, error_message) VALUES (%s, %s, %s)",
                    (court_name, case_id, 'Failed to transfer text to bucket.'))
                update_query(
                    "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s", court_name)

    except Exception as e:
        # Any failure aborts the page; it is logged and recorded as an alert.
        traceback.print_exc()
        logging.error("Failed to request: %s", e)
        insert_query(
            "INSERT INTO alerts (court_name, page_no, error_message) VALUES (%s, %s, %s)",
            (court_name, page_no, str(e)))
        update_query(
            "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s", court_name)
        return False
def parser(base_url, court_name, bench_id, response):
    """Store every case of a Kolkata result list and ship its files.

    ``response`` is iterated as a sequence of mappings with the keys
    ``CaseType``, ``CaseNo``, ``CaseYr``, ``Jud_Dt`` and ``Jud_Pdf_Name``.
    When ``select_count_query`` is truthy for a case, its PDF is requested
    from ``base_url + 'viewpdf/'``, the extracted text is written to a text
    file, the case is inserted into ``kolkata`` and both files are handed
    to ``transfer_to_bucket``.  Otherwise every tracker counter is bumped
    without touching the case (presumably it is already stored -- confirm
    against ``select_count_query``).

    Args:
        base_url: Site root; ``'viewpdf/'`` is appended to build PDF links.
        court_name: Tracker key and text-file name prefix.
        bench_id: Bench value used in tracker, alert and insert queries.
        response: Iterable of case mappings (see above).
    """
    pdf_base_path = base_url + 'viewpdf/'
    update_query("UPDATE tracker SET total_cases=%s, inserted_cases=0, no_pdf=0, no_text=0, transferred_pdf=0,"
                 "transferred_text=0 WHERE court_name=%s and bench=%s",
                 (str(len(response)), court_name, bench_id))

    for case in response:
        emergency_exit = select_one_query(
            "SELECT emergency_exit FROM tracker WHERE court_name=%s and bench=%s", (court_name, bench_id))
        if emergency_exit is not None and emergency_exit['emergency_exit'] == 1:
            break

        case_type = case['CaseType']
        case_no = case['CaseNo']
        case_yr = case['CaseYr']
        jud_dt = case['Jud_Dt']
        jud_pdf_name = case['Jud_Pdf_Name']
        case_id = case_type + ' ' + case_no + ' OF ' + case_yr

        if not select_count_query(str(court_name), str(case_id), 'judgment_date', jud_dt):
            update_query("UPDATE tracker SET inserted_cases=inserted_cases+1, no_pdf=no_pdf+1, no_text=no_text+1,"
                         "transferred_pdf=transferred_pdf+1, transferred_text=transferred_text+1 WHERE court_name=%s "
                         "and bench=%s", (court_name, bench_id))
            continue

        pdf_url = pdf_base_path + jud_pdf_name
        pdf_filename = str(jud_pdf_name).replace('.pdf', '')
        pdf_filepath = request_pdf(pdf_url, pdf_filename, court_name, bench_id, case_id)

        if pdf_filepath is not None:
            pdf_text_data = escape_string(str(pdf_to_text_api(pdf_filepath)))
            text_filepath = module_directory + "/../data_files/text_files/" + court_name + "_" + slugify(
                pdf_filename) + '.txt'
            # Close (and thereby flush) the file before it is uploaded
            # further down.
            with open(text_filepath, "w") as fw:
                fw.write(pdf_text_data)
            text_filename = jud_pdf_name.replace('.pdf', '.txt')
        else:
            # No PDF: store the row without any file names.
            text_filepath = None
            pdf_text_data = None
            text_filename = None
            jud_pdf_name = None

        if insert_query(
                "INSERT INTO kolkata (case_id, judgment_date, pdf_url, pdf_filename, text_filename, case_type, "
                "case_no, case_year, bench) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)",
                (case_id, jud_dt, pdf_url, jud_pdf_name, text_filename, case_type, case_no, case_yr, bench_id)):
            update_query("UPDATE tracker SET inserted_cases=inserted_cases+1 WHERE court_name=%s and bench=%s",
                         (court_name, bench_id))
        else:
            update_query("UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s and bench=%s",
                         (court_name, bench_id))
            insert_query("INSERT INTO alerts (court_name, bench, case_id, error_message) VALUES (%s, %s, %s, %s)",
                         (court_name, bench_id, case_id, 'Failed to insert court data in table'))

        if update_query("UPDATE kolkata SET text_data=%s WHERE case_id=%s", (pdf_text_data, case_id)):
            update_query("UPDATE tracker SET no_text=no_text+1 WHERE court_name=%s and bench=%s",
                         (court_name, bench_id))
        else:
            insert_query("INSERT INTO alerts (court_name, bench, case_id, error_message) VALUES (%s, %s, %s, %s)",
                         (court_name, bench_id, case_id, 'Failed to insert text data.'))
            update_query("UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s and bench=%s",
                         (court_name, bench_id))

        if transfer_to_bucket('PDF_Files', pdf_filepath):
            update_query("UPDATE tracker SET transferred_pdf=transferred_pdf+1 WHERE court_name=%s and bench=%s",
                         (court_name, bench_id))
            os.remove(pdf_filepath)
        else:
            insert_query("INSERT INTO alerts (court_name, bench, case_id, error_message) VALUES (%s, %s, %s, %s)",
                         (court_name, bench_id, case_id, 'Failed to transfer PDF to bucket.'))
            update_query("UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s and bench=%s",
                         (court_name, bench_id))

        if transfer_to_bucket('Text_Files', text_filepath):
            update_query("UPDATE tracker SET transferred_text=transferred_text+1 WHERE court_name=%s and "
                         "bench=%s", (court_name, bench_id))
            os.remove(text_filepath)
        else:
            insert_query("INSERT INTO alerts (court_name, bench, case_id, error_message) VALUES (%s, %s, %s, %s)",
                         (court_name, bench_id, case_id, 'Failed to transfer text to bucket.'))
            update_query("UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s and bench=%s",
                         (court_name, bench_id))
def parser(court_name, bench_id, response):
    """Store every row of an NCLAT result table and ship its files.

    The rows of the first ``<tbody>`` in ``response`` are read cell by
    cell: case id, judgment date, party, section, court name, order passed
    by, and the ``href`` of the link in the seventh cell.  When
    ``select_count_query`` is truthy for a row, its PDF is requested, the
    extracted text is written to a text file, the row is inserted into
    ``national_company_law_appellate_tribunal`` and both files are handed
    to ``transfer_to_bucket``.  Otherwise every tracker counter is bumped
    without touching the row (presumably it is already stored -- confirm
    against ``select_count_query``).

    Args:
        court_name: Tracker key; also part of the generated file names.
        bench_id: Bench value used in tracker, alert and insert queries.
        response: HTML containing the result table.
    """
    tbody = BeautifulSoup(str(response), "html.parser").find_all('tbody')[0]
    tr_list = BeautifulSoup(str(tbody), "html.parser").find_all('tr')
    update_query(
        "UPDATE tracker SET total_cases=%s, inserted_cases=0, no_pdf=0, no_text=0, transferred_pdf=0,"
        "transferred_text=0 WHERE court_name=%s AND bench=%s", (str(len(tr_list)), court_name, bench_id))

    for tr in tr_list:
        emergency_exit = select_one_query(
            "SELECT emergency_exit FROM tracker WHERE court_name=%s AND bench=%s", (court_name, bench_id))
        if emergency_exit is not None and emergency_exit['emergency_exit'] == 1:
            break

        case_id = None
        judgment_date = None
        party = None
        section = None
        court_name_ = None
        order_passed_by = None
        pdf_url = None

        td_list = BeautifulSoup(str(tr), "html.parser").find_all('td')
        for i, td in enumerate(td_list, start=1):
            if i == 1:
                case_id = escape_string(str(td.decode_contents()))
            elif i == 2:
                judgment_date = escape_string(str(td.decode_contents()))
            elif i == 3:
                party = escape_string(str(td.decode_contents()))
            elif i == 4:
                section = escape_string(str(td.decode_contents()))
            elif i == 5:
                court_name_ = escape_string(str(td.decode_contents()))
            elif i == 6:
                order_passed_by = escape_string(str(td.decode_contents()))
            elif i == 7:
                a_tag = BeautifulSoup(str(td), "html.parser").a
                pdf_url = escape_string(str(a_tag.get('href'))) if a_tag else None

        if not select_count_query(str(court_name), str(case_id), 'judgment_date', judgment_date):
            update_query(
                "UPDATE tracker SET inserted_cases=inserted_cases+1, no_pdf=no_pdf+1, no_text=no_text+1,"
                "transferred_pdf=transferred_pdf+1, transferred_text=transferred_text+1 WHERE court_name=%s "
                "AND bench=%s", (court_name, bench_id))
            continue

        file_stem = slugify(court_name + '-' + case_id + '-' + judgment_date)
        pdf_filename = file_stem + '.pdf'
        text_filename = file_stem + '.txt'
        pdf_filepath = request_pdf(pdf_url, pdf_filename, court_name, bench_id, case_id)

        if pdf_filepath is not None:
            pdf_text_data = escape_string(str(pdf_to_text_api(pdf_filepath)))
            if pdf_text_data is not None:
                text_filepath = module_directory + "/../data_files/text_files/" + court_name + '_' + text_filename
                # Close (and thereby flush) the file before it is uploaded
                # further down.
                with open(text_filepath, "w") as fw:
                    fw.write(pdf_text_data)
            else:
                text_filepath = None
                text_filename = None
        else:
            text_filepath = None
            pdf_text_data = None
            pdf_filename = None
            # No PDF means no text file either; previously the generated
            # name was still inserted for a file that was never written.
            text_filename = None

        if insert_query(
                "INSERT INTO national_company_law_appellate_tribunal (case_id, judgment_date, party, section, "
                "court_name, order_passed_by, pdf_url, pdf_filename, text_filename, bench) VALUES (%s, %s, %s, %s, "
                "%s, %s, %s, %s, %s, %s)",
                (case_id, judgment_date, party, section, court_name_, order_passed_by, pdf_url,
                 pdf_filename, text_filename, bench_id)):
            update_query(
                "UPDATE tracker SET inserted_cases=inserted_cases+1 WHERE court_name=%s AND bench=%s",
                (court_name, bench_id))
        else:
            update_query(
                "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s AND bench=%s",
                (court_name, bench_id))
            insert_query(
                "INSERT INTO alerts (court_name, bench, case_id, error_message) VALUES (%s, %s, %s, %s)",
                (court_name, bench_id, case_id, 'Failed to insert court data in table'))

        if update_query(
                "UPDATE national_company_law_appellate_tribunal SET text_data=%s WHERE case_id=%s",
                (pdf_text_data, case_id)):
            update_query(
                "UPDATE tracker SET no_text=no_text+1 WHERE court_name=%s AND bench=%s",
                (court_name, bench_id))
        else:
            insert_query(
                "INSERT INTO alerts (court_name, bench, case_id, error_message) VALUES (%s, %s, %s, %s)",
                (court_name, bench_id, case_id, 'Failed to insert text data.'))
            update_query(
                "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s AND bench=%s",
                (court_name, bench_id))

        if transfer_to_bucket('PDF_Files', pdf_filepath):
            update_query(
                "UPDATE tracker SET transferred_pdf=transferred_pdf+1 WHERE court_name=%s AND bench=%s",
                (court_name, bench_id))
            os.remove(pdf_filepath)
        else:
            insert_query(
                "INSERT INTO alerts (court_name, bench, case_id, error_message) VALUES (%s, %s, %s, %s)",
                (court_name, bench_id, case_id, 'Failed to transfer pdf to bucket.'))
            update_query(
                "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s AND bench=%s",
                (court_name, bench_id))

        if transfer_to_bucket('Text_Files', text_filepath):
            update_query(
                "UPDATE tracker SET transferred_text=transferred_text+1 WHERE court_name=%s AND bench=%s",
                (court_name, bench_id))
            os.remove(text_filepath)
        else:
            insert_query(
                "INSERT INTO alerts (court_name, bench, case_id, error_message) VALUES (%s, %s, %s, %s)",
                (court_name, bench_id, case_id, 'Failed to transfer text to bucket.'))
            update_query(
                "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s AND bench=%s",
                (court_name, bench_id))
def _split_order_reference(text, separators):
    """Split an order label into (number, date) on the first separator it contains."""
    for separator in separators:
        if separator in text:
            parts = text.split(separator)
            return parts[0], parts[1]
    # No known separator: keep the whole label as the number.  Indexing the
    # unsplit string used to yield its first two characters instead.
    return text, ''


def parser(base_url, court_name, response):
    """Store every row of the GST appellate table and ship its files.

    The rows of the first ``<tbody>`` inside the first table with class
    ``custum-tbl table table-bordered`` are read cell by cell: state,
    appellant, brief, the appeal order label (split into number and date),
    the appeal PDF link and, unless the seventh cell is ``-``, the AAR
    order link and label.  When ``select_count_query`` is truthy for a row
    the PDF(s) are requested, their text is written to text files, the row
    is inserted into ``gst_appellate`` and the files are handed to
    ``transfer_to_bucket``.  Otherwise every tracker counter is bumped
    without touching the row (presumably it is already stored -- confirm
    against ``select_count_query``).

    Args:
        base_url: Prefix put in front of the appeal PDF ``href``.
        court_name: Tracker key and text-file name prefix.
        response: HTML of the listing page.
    """
    table = BeautifulSoup(response, "html.parser").find_all(
        'table', {'class': 'custum-tbl table table-bordered'})[0]
    tbody = BeautifulSoup(str(table), "html.parser").find_all('tbody')[0]
    tr_list = BeautifulSoup(str(tbody), "html.parser").find_all('tr')
    update_query(
        "UPDATE tracker SET total_cases=%s, inserted_cases=0, no_pdf=0, no_text=0, transferred_pdf=0,"
        "transferred_text=0 WHERE court_name=%s", (str(len(tr_list)), court_name))

    for tr in tr_list:
        emergency_exit = select_one_query(
            "SELECT emergency_exit FROM tracker WHERE court_name=%s", court_name)
        if emergency_exit is not None and emergency_exit['emergency_exit'] == 1:
            break

        state = None
        name_of_appellant = None
        brief_of_order_in_appeal = None
        appeal_order_no = None
        appeal_order_date = None
        pdf_url = None
        aar_order_no = None
        aar_order_date = None
        aar_pdf_url = None

        td_list = BeautifulSoup(str(tr), "html.parser").find_all('td')
        for i, td in enumerate(td_list, start=1):
            if i == 2:
                state = escape_string(str(td.decode_contents()))
            elif i == 3:
                name_of_appellant = escape_string(str(td.decode_contents()))
            elif i == 4:
                brief_of_order_in_appeal = escape_string(str(td.decode_contents()))
            elif i == 5:
                order_no, order_date = _split_order_reference(
                    str(td.decode_contents()).lower(), ('dated', 'dt.'))
                appeal_order_no = escape_string(order_no)
                appeal_order_date = escape_string(order_date)
            elif i == 6:
                a_tag = BeautifulSoup(str(td), "html.parser").a
                pdf_url = escape_string(str(base_url + a_tag.get('href')))
            elif i == 7:
                # A bare "-" marks a row without an AAR order.
                if str(td.decode_contents()) != '-':
                    a_tag = BeautifulSoup(str(td), "html.parser").a
                    aar_pdf_url = escape_string(str(a_tag.get('href')))
                    order_no, order_date = _split_order_reference(
                        str(a_tag.decode_contents()).lower(), ('dated', 'dt.', 'dtd.'))
                    aar_order_no = escape_string(order_no)
                    aar_order_date = escape_string(order_date)

        if not select_count_query(str(court_name), str(appeal_order_no), 'appeal_order_date', appeal_order_date):
            update_query(
                "UPDATE tracker SET inserted_cases=inserted_cases+1, no_pdf=no_pdf+1, no_text=no_text+1,"
                "transferred_pdf=transferred_pdf+1, transferred_text=transferred_text+1 WHERE court_name=%s",
                court_name)
            continue

        pdf_filename = slugify('appeal-' + appeal_order_no + appeal_order_date) + '.pdf'
        text_filename = slugify('appeal-' + appeal_order_no + appeal_order_date) + '.txt'
        pdf_filepath = request_pdf(pdf_url, pdf_filename, court_name, appeal_order_no)

        if pdf_filepath is not None:
            pdf_text_data = escape_string(str(pdf_to_text_api(pdf_filepath)))
            text_filepath = module_directory + "/../data_files/text_files/" + court_name + '_' + text_filename
            # Close (and thereby flush) the file before it is uploaded
            # further down.
            with open(text_filepath, "w") as fw:
                fw.write(pdf_text_data)
        else:
            text_filepath = None
            pdf_text_data = None
            pdf_filename = None
            text_filename = None

        if aar_order_no is not None:
            aar_pdf_filename = slugify('aar-' + aar_order_no + aar_order_date) + '.pdf'
            aar_text_filename = slugify('aar-' + aar_order_no + aar_order_date) + '.txt'
            aar_pdf_filepath = request_pdf(aar_pdf_url, aar_pdf_filename, court_name, aar_order_no)
            if aar_pdf_filepath is not None:
                aar_text_data = escape_string(pdf_to_text_api(aar_pdf_filepath))
                aar_text_filepath = module_directory + "/../data_files/text_files/" + court_name + '_' + \
                    aar_text_filename
                with open(aar_text_filepath, "w") as fw:
                    fw.write(aar_text_data)
            else:
                aar_text_filepath = None
                aar_text_data = None
        else:
            aar_pdf_filename = None
            aar_text_filename = None
            aar_text_data = None
            aar_pdf_filepath = None
            aar_text_filepath = None

        # NOTE(review): the column list spells "arr_pdf_filename"; left as
        # is because it may match the actual schema -- confirm.
        if insert_query(
                "INSERT INTO gst_appellate (case_id, appeal_order_no, appeal_order_date, name_of_appellant, "
                "brief_of_order_in_appeal, state, aar_order_no, aar_order_date, pdf_url, pdf_filename, "
                "text_filename, aar_pdf_url, arr_pdf_filename, aar_text_filename) VALUES "
                "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
                (appeal_order_no, appeal_order_no, appeal_order_date, name_of_appellant,
                 brief_of_order_in_appeal, state, aar_order_no, aar_order_date, pdf_url, pdf_filename,
                 text_filename, aar_pdf_url, aar_pdf_filename, aar_text_filename)):
            update_query(
                "UPDATE tracker SET inserted_cases=inserted_cases+1 WHERE court_name=%s", court_name)
        else:
            update_query(
                "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s", court_name)
            insert_query(
                "INSERT INTO alerts (court_name, case_id, error_message) VALUES (%s, %s, %s)",
                (court_name, appeal_order_no, 'Failed to insert court data in table'))

        if update_query(
                "UPDATE gst_appellate SET text_data=%s WHERE case_id=%s", (pdf_text_data, appeal_order_no)):
            update_query(
                "UPDATE tracker SET no_text=no_text+1 WHERE court_name=%s", court_name)
        else:
            insert_query(
                "INSERT INTO alerts (court_name, case_id, error_message) VALUES (%s, %s, %s)",
                (court_name, appeal_order_no, 'Failed to insert text data.'))
            update_query(
                "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s", court_name)

        if update_query(
                "UPDATE gst_appellate SET aar_text_data=%s WHERE case_id=%s", (aar_text_data, appeal_order_no)):
            update_query(
                "UPDATE tracker SET no_text=no_text+1 WHERE court_name=%s", court_name)
        else:
            insert_query(
                "INSERT INTO alerts (court_name, case_id, error_message) VALUES (%s, %s, %s)",
                (court_name, appeal_order_no, 'Failed to insert aar text data.'))
            update_query(
                "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s", court_name)

        if transfer_to_bucket('PDF_Files', pdf_filepath):
            update_query(
                "UPDATE tracker SET transferred_pdf=transferred_pdf+1 WHERE court_name=%s", court_name)
            os.remove(pdf_filepath)
        else:
            insert_query(
                "INSERT INTO alerts (court_name, case_id, error_message) VALUES (%s, %s, %s)",
                (court_name, appeal_order_no, 'Failed to transfer pdf to bucket.'))
            update_query(
                "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s", court_name)

        if transfer_to_bucket('Text_Files', text_filepath):
            update_query(
                "UPDATE tracker SET transferred_text=transferred_text+1 WHERE court_name=%s", court_name)
            os.remove(text_filepath)
        else:
            insert_query(
                "INSERT INTO alerts (court_name, case_id, error_message) VALUES (%s, %s, %s)",
                (court_name, appeal_order_no, 'Failed to transfer text to bucket.'))
            update_query(
                "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s", court_name)

        if aar_order_no is not None:
            if transfer_to_bucket('PDF_Files', aar_pdf_filepath):
                update_query(
                    "UPDATE tracker SET transferred_pdf=transferred_pdf+1 WHERE court_name=%s", court_name)
                os.remove(aar_pdf_filepath)
            else:
                insert_query(
                    "INSERT INTO alerts (court_name, case_id, error_message) VALUES (%s, %s, %s)",
                    (court_name, appeal_order_no, 'Failed to transfer aar pdf to bucket.'))
                update_query(
                    "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s", court_name)

            if transfer_to_bucket('Text_Files', aar_text_filepath):
                update_query(
                    "UPDATE tracker SET transferred_text=transferred_text+1 WHERE court_name=%s", court_name)
                os.remove(aar_text_filepath)
            else:
                insert_query(
                    "INSERT INTO alerts (court_name, case_id, error_message) VALUES (%s, %s, %s)",
                    (court_name, appeal_order_no, 'Failed to transfer aar text to bucket.'))
                update_query(
                    "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s", court_name)
def parser(html_str, court_name, headers):
    """Store every row of the Punjab & Haryana result table and ship its files.

    All ``<tr>`` elements of the table(s) with id ``tables11`` are
    collected; when more than 50 rows remain after discounting three, the
    last two are dropped.  The first three rows are skipped.  From each
    remaining row the case id (link text of cell 2), the parties (cell 3,
    split on ``"Vs"``), the judgment date (cell 4) and the PDF link (the
    ``onclick`` target of cell 5) are read.  When ``select_count_query`` is
    truthy and a case id was found, the PDF is requested, its text is
    written to a text file, the row is inserted into ``punjab_haryana`` and
    both files are handed to ``transfer_to_bucket``.  Otherwise every
    tracker counter is bumped without touching the row.

    Args:
        html_str: HTML of the result page.
        court_name: Tracker key and text-file name prefix.
        headers: Passed through to ``request_pdf``.
    """
    soup = BeautifulSoup(html_str, "html.parser")
    table_list = soup.find_all('table', {'id': 'tables11'})
    table_soup = BeautifulSoup(str(table_list), "html.parser")
    tr_list = table_soup.find_all('tr')
    if int(len(tr_list) - 3) > 50:
        tr_list = tr_list[:-2]

    update_query("UPDATE tracker SET total_cases=%s, inserted_cases=0, no_pdf=0, no_text=0, transferred_pdf=0,"
                 "transferred_text=0 WHERE court_name=%s", (str(len(tr_list) - 3), court_name))

    for tr_count, tr in enumerate(tr_list, start=1):
        emergency_exit = select_one_query("SELECT emergency_exit FROM tracker WHERE court_name=%s", court_name)
        if emergency_exit is not None and emergency_exit['emergency_exit'] == 1:
            break

        # The first three rows are not case rows.
        if tr_count <= 3:
            continue

        case_id = None
        petitioner = None
        respondent = None
        judgment_date = None
        pdf_url = None

        td_list = BeautifulSoup(str(tr), "html.parser").find_all('td')
        for i, td in enumerate(td_list, start=1):
            if i == 2:
                a_tag = BeautifulSoup(str(td), "html.parser").a
                case_id = escape_string(str(a_tag.text))
            elif i == 3:
                party = str(td.decode_contents()).split("Vs")
                petitioner = escape_string(str(party[0]))
                # A cell without "Vs" has no respondent part; indexing it
                # used to raise IndexError and abort the whole page.
                if len(party) > 1:
                    respondent = escape_string(str(party[1]))
            elif i == 4:
                judgment_date = escape_string(str(td.decode_contents()))
            elif i == 5:
                if str(td.decode_contents()).lower() != 'file not available':
                    a_link = BeautifulSoup(str(td), "html.parser").a.get('onclick')
                    # Strip the window.open('...') wrapper around the path.
                    a_formatted = str(a_link).replace("window.open('", "").replace("')", "")
                    # base_url is not a parameter here; it has to come from
                    # the enclosing module.
                    pdf_url = escape_string(base_url + "/" + a_formatted)

        if not (select_count_query(str(court_name), str(case_id), 'judgment_date', judgment_date)
                and case_id is not None):
            update_query("UPDATE tracker SET inserted_cases=inserted_cases+1, no_pdf=no_pdf+1, no_text=no_text+1,"
                         "transferred_pdf=transferred_pdf+1, transferred_text=transferred_text+1 WHERE court_name=%s",
                         court_name)
            continue

        pdf_filename = escape_string(slugify(case_id + '-' + judgment_date)) + '.pdf'
        text_filename = escape_string(slugify(case_id + '-' + judgment_date)) + '.txt'
        pdf_filepath = request_pdf(pdf_url, headers, pdf_filename, court_name, case_id)

        if pdf_filepath is not None:
            pdf_text_data = escape_string(str(pdf_to_text_api(pdf_filepath)))
            text_filepath = module_directory + "/../data_files/text_files/" + court_name + "_" + text_filename
            # Close (and thereby flush) the file before it is uploaded
            # further down.
            with open(text_filepath, "w") as fw:
                fw.write(pdf_text_data)
        else:
            text_filepath = None
            pdf_text_data = None
            pdf_filename = None
            text_filename = None

        if insert_query(
                "INSERT INTO punjab_haryana (case_id, judgment_date, petitioner, respondent, pdf_url, "
                "pdf_filename, text_filename) VALUES (%s, %s, %s, %s, %s, %s, %s)",
                (case_id, judgment_date, petitioner, respondent, pdf_url, pdf_filename, text_filename)):
            update_query("UPDATE tracker SET inserted_cases=inserted_cases+1 WHERE court_name=%s", court_name)
        else:
            update_query("UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s", court_name)
            insert_query("INSERT INTO alerts (court_name, case_id, error_message) VALUES (%s, %s, %s)",
                         (court_name, case_id, 'Failed to insert court data in table'))

        if update_query("UPDATE punjab_haryana SET text_data=%s WHERE case_id=%s", (pdf_text_data, case_id)):
            update_query("UPDATE tracker SET no_text=no_text+1 WHERE court_name=%s", court_name)
        else:
            insert_query("INSERT INTO alerts (court_name, case_id, error_message) VALUES (%s, %s, %s)",
                         (court_name, case_id, 'Failed to insert text data.'))
            update_query("UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s", court_name)

        if transfer_to_bucket('PDF_Files', pdf_filepath):
            update_query("UPDATE tracker SET transferred_pdf=transferred_pdf+1 WHERE court_name=%s", court_name)
            os.remove(pdf_filepath)
        else:
            insert_query("INSERT INTO alerts (court_name, case_id, error_message) VALUES (%s, %s, %s)",
                         (court_name, case_id, 'Failed to transfer PDF to bucket.'))
            update_query("UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s", court_name)

        if transfer_to_bucket('Text_Files', text_filepath):
            update_query("UPDATE tracker SET transferred_text=transferred_text+1 WHERE court_name=%s", court_name)
            os.remove(text_filepath)
        else:
            insert_query("INSERT INTO alerts (court_name, case_id, error_message) VALUES (%s, %s, %s)",
                         (court_name, case_id, 'Failed to transfer text to bucket.'))
            update_query("UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s", court_name)
def create_transfer_json(court_name):
    """Export rows with ``is_json=0`` to JSON files and transfer them.

    Rows of the table named ``court_name`` are read in batches of 1000.
    Each non-empty batch is dumped to a ``new-<court_name>-<n>.json`` file
    and handed to ``transfer_to_bucket``.  On success the batch rows are
    marked ``is_json=1``, the tracker counters are bumped and the local
    file is removed; on failure an alert is recorded and the file is left
    in place.

    Args:
        court_name: Name of the table to export; also the tracker key.  It
            is concatenated into the SQL text, so it must be a trusted
            identifier.

    Returns:
        ``True`` when all batches were processed, ``False`` when an
        exception interrupted the export (an alert is recorded).
    """
    db = db_connect()
    try:
        cursor = db.cursor()
        cursor.execute("select count(id) as num_rows from " + str(court_name) + " WHERE is_json=0")
        result = cursor.fetchall()
        cursor.close()

        no_rows = result[0]['num_rows']
        no_of_data_per_iteration = 1000
        no_of_iteration = floor(int(no_rows) / no_of_data_per_iteration) + 1
        # File numbering continues after the files already counted in the
        # tracker.
        j_count = select_one_query(
            "SELECT no_json FROM tracker WHERE court_name=%s", court_name)['no_json']

        for i in range(0, no_of_iteration):
            # NOTE(review): paging by OFFSET over "is_json=0" while earlier
            # batches are flagged is_json=1 only works if this connection
            # does not see those updates -- confirm the isolation level.
            cursor = db.cursor()
            cursor.execute("SELECT * FROM " + str(court_name) + " WHERE is_json=0 LIMIT " +
                           str(no_of_data_per_iteration) + " OFFSET " + str(i * no_of_data_per_iteration))
            result = cursor.fetchall()
            cursor.close()

            if not result:
                continue

            file_path = module_directory + "/../data_files/json_files/new-" + str(
                court_name) + "-" + str(i + 1 + j_count) + ".json"
            # Close (and thereby flush) the file before it is uploaded;
            # otherwise the upload can see a truncated file.
            with open(file_path, "w") as fw:
                fw.write(json.dumps(result))

            if transfer_to_bucket('JSON_Files', file_path):
                for record in result:
                    # The id is bound as a parameter, like the other queries.
                    update_query("UPDATE " + court_name + " SET is_json=1 WHERE id=%s", (record['id'],))
                update_query(
                    "UPDATE tracker SET no_json=no_json+1, transferred_json=transferred_json+1 "
                    "WHERE court_name=%s", court_name)
                os.remove(file_path)
            else:
                insert_query(
                    "INSERT INTO alerts (court_name, error_message) VALUES (%s, %s)",
                    (court_name, 'JSON Failed to transfer to bucket.'))
                update_query(
                    "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s", court_name)

        return True

    except Exception as e:
        insert_query(
            "INSERT INTO alerts (court_name, error_message) VALUES (%s, %s)", (court_name, 'JSON Error'))
        update_query(
            "UPDATE tracker SET no_alerts=no_alerts+1 WHERE court_name=%s", court_name)
        traceback.print_exc()
        logging.error("Failed select query: %s", e)
        return False

    finally:
        # Runs on both paths, including when the error handler itself fails.
        db.close()