def get_case(self, thread_info):
    """Fetch a single verdict and build a CourtCase from it.

    Intended to run as a pool worker (see handle_result_page).

    Args:
        thread_info: dict describing the job. Only the "index" (0-based
            position of the job in the queue) and "row" (the search-result
            element for the verdict) entries are read here; any other keys
            are ignored.

    Returns:
        A CourtCase built from the row, the extended verdict info and the
        document text on success, or the case id (as returned by
        CourtCase.extract_case_id) when fetching failed.
    """
    queue_pos = thread_info["index"]
    row = thread_info["row"]
    case_id = CourtCase.extract_case_id(row)

    # Stagger the first round of workers so the initial requests are not all
    # sent at once: worker N (0-based) waits N * 100ms, so the first worker
    # is not delayed. Jobs beyond the first `self.threads` start immediately.
    if queue_pos < self.threads:
        time.sleep(queue_pos / 10.0)

    # Fetch the document text and the extended info from the server. Any
    # failure is logged and reported to the caller by returning the case id
    # instead of a CourtCase.
    try:
        doc_filename = CourtCase.extract_filename(row)
        doc_text = CourtCase.get_document_text(doc_filename.replace(".doc", ".txt"), self.timeout)
        # Reuse the values extracted above instead of parsing the row again.
        extended_info = self.get_verdict_extended_info(case_id, doc_filename)
    except Exception as e:
        self.log_message(LogLevel.ERROR, "Error fetching verdict for case " + case_id + ": " + str(e))
        return case_id

    # Only successful fetches are counted.
    # NOTE(review): this increment is not guarded by a lock; if the workers
    # are threads sharing `self`, concurrent updates could be lost - confirm
    # whether an exact count matters.
    self.pool_progress += 1
    return CourtCase(row, extended_info, doc_text)
def handle_result_page(self, soup, start_date, end_date, page_number, specific_verdicts=None) -> (list, FaultEntity):
    """Decode the page's VIEWSTATE hidden field and build CourtCase objects from it.

    The base64-encoded ``__VIEWSTATE`` field is decoded, the ``<Results>`` XML
    fragment is extracted from it, and every child element (one per search
    result) is handed to ``get_case`` through a worker pool.

    Args:
        soup: parsed result page; must support ``find_all(id=...)`` and
            contain an element with id ``__VIEWSTATE`` carrying a ``value``.
        start_date: start of the searched range; must support ``strftime``.
        end_date: end of the searched range; must support ``strftime``.
        page_number: number of the result page being handled.
        specific_verdicts: optional collection of case ids. When given, only
            results whose case id is in it are fetched.

    Returns:
        A ``(cases, fault)`` tuple. ``cases`` is the list of CourtCase objects
        fetched successfully. ``fault`` is a FaultEntity built from the date
        range, the page number and the failed case ids, or None when nothing
        failed.

    Raises:
        IndexError: if the page has no ``__VIEWSTATE`` element.
        AttributeError: if the decoded VIEWSTATE has no ``<Results>`` block.
        Any exception that escapes a worker is re-raised by ``task.get()``.
    """
    # Extract and decode the XML search results from the VIEWSTATE.
    # Undecodable bytes are dropped ("ignore") rather than raising.
    view_state = base64.b64decode(
        soup.find_all(id="__VIEWSTATE")[0]['value']
    ).decode("utf-8", "ignore")
    results_mask = re.compile(r'<Results>([\s\S]+?)</Results>')
    # group(0) keeps the surrounding <Results> tags so the fragment parses
    # as a single-rooted XML document.
    data_xml = results_mask.search(view_state).group(0)

    # Each child of the root element is one search result
    data_tree = ET.fromstring(data_xml)

    # If specific_verdicts was passed, keep only the requested verdicts
    if specific_verdicts is not None:
        data_tree = [d for d in data_tree if CourtCase.extract_case_id(d) in specific_verdicts]

    # Build one job description per result. The index is used by get_case to
    # space out the first round of requests instead of sending them all at
    # the same time. The formatted dates are identical for every job, so
    # they are computed once.
    start_date_text = start_date.strftime("%d/%m/%Y")
    end_date_text = end_date.strftime("%d/%m/%Y")
    data_tree_numbered = [
        {
            "index": i,
            "row": row,
            "start_date": start_date_text,
            "end_date": end_date_text,
            "page_number": page_number,
        }
        for i, row in enumerate(data_tree)
    ]

    # Run the jobs concurrently. The pool is terminated in a finally clause
    # so it is released even when task.get() re-raises a worker exception.
    thread_pool = Pool(self.threads)
    try:
        tasks = [
            thread_pool.apply_async(self.get_case, (x, ), callback=self.print_status_line)
            for x in data_tree_numbered
        ]
        tasks_results = [task.get() for task in tasks]
    finally:
        thread_pool.terminate()

    # get_case returns the case id (a string) on failure and a CourtCase on
    # success; anything else is ignored.
    failed_verdicts = [v for v in tasks_results if isinstance(v, str)]
    success_verdicts = [v for v in tasks_results if isinstance(v, CourtCase)]

    if failed_verdicts:
        return success_verdicts, FaultEntity((start_date, end_date), page_number, failed_verdicts)
    return success_verdicts, None