def get_and_merge_items(items, log):
    """Get the items returned from the RECAP server and merge them into CL.

    Items is a list of dicts like so, sorted by court, case number, document
    number and attachment number:

    [{'attachment_number': '0',
      'document_number': '1',
      'case_number': '186759',
      'court_id': 'almb',
      'is_available': '0'},
     ...
    ]

    Note that all values are strings. The idea is to iterate over all of
    these dicts, grabbing the docket, and adding any items that have
    is_available = 1.

    For every case, the docket download and the downloads of any available
    documents that are not yet on disk are bundled into a single chord. The
    chord's callback is a chain of parse_recap_docket, extract_recap_pdf and
    add_or_update_recap_document.

    :param items: The sorted list of item dicts described above.
    :param log: The log object passed to update_log_status, which is set to
    RECAPLog.GETTING_AND_MERGING_ITEMS before anything is queued.
    :return: None
    """
    update_log_status(log, RECAPLog.GETTING_AND_MERGING_ITEMS)

    def case_key(entry):
        # Docket URLs and filenames are built from court_id *and*
        # case_number, so a case boundary has to look at both. Comparing
        # only case_number would merge the last case of one court with the
        # first case of the next court when their numbers coincide.
        return entry['court_id'], entry['case_number']

    tasks = []
    for prev, item, nxt in previous_and_next(items):
        court_id = item['court_id']
        case_number = item['case_number']
        current_case = (court_id, case_number)

        if prev is None or case_key(prev) != current_case:
            # New case. Get the next docket before getting any PDFs.
            url = get_docketxml_url(court_id, case_number)
            logger.info("New docket found at: %s", url)
            docket_filename = get_docket_filename(court_id, case_number)
            tasks.append(
                download_recap_item.si(url, docket_filename, clobber=True)
            )

        # Get the document
        filename = get_document_filename(
            court_id,
            case_number,
            item['document_number'],
            item['attachment_number'],
        )
        location = os.path.join(settings.MEDIA_ROOT, 'recap', filename)
        if not os.path.isfile(location) and int(item['is_available']):
            # We don't have it yet, and it's available to get. Get it!
            url = get_pdf_url(court_id, case_number, filename)
            tasks.append(download_recap_item.si(url, filename))

        if nxt is None or case_key(nxt) != current_case:
            # Last item in the case. Send for processing.
            if tasks:
                logger.info("Sending %s tasks for processing.", len(tasks))
                docket_filename = get_docket_filename(court_id, case_number)
                chord(tasks)(chain(
                    parse_recap_docket.si(docket_filename, debug=False),
                    extract_recap_pdf.s().set(priority=5),
                    add_or_update_recap_document.s(coalesce_docket=True),
                ))
            # Start the next case with a clean slate.
            tasks = []

    logger.info("Finished queueing new cases.")
def calculate_recap_sequence_numbers(docket_entries):
    """Figure out the RECAP sequence number values for docket entries
    returned by a parser.

    Writ large, this is pretty simple, but for some items you need to perform
    disambiguation using neighboring docket entries. For example, if you get
    the following docket entries, you need to use the neighboring items to
    figure out which is first:

           Date     | No. |  Description
        2014-01-01  |     |  Some stuff
        2014-01-01  |     |  More stuff
        2014-01-02  |  1  |  Still more

    For those first two items, you have the date, but that's it. No numbers,
    no de_seqno, no nuthin'.

    The approach: first make sure the docket is in ascending order (it is
    reversed in place when get_order_of_docket reports 'desc'), then walk the
    entries, numbering each one relative to the entry before it. Entries
    sharing a filing date with their predecessor get the next index; any
    other entry restarts the index at 1.

    :param docket_entries: A list of docket entry dicts from juriscraper or
    another parser containing information about docket entries for a docket
    :return None, but sets the recap_sequence_number for all items.
    """
    # Normalize the list to ascending order before numbering anything.
    if get_order_of_docket(docket_entries) == 'desc':
        docket_entries.reverse()

    # Assign sequence numbers
    for earlier, entry, _ in previous_and_next(docket_entries):
        shares_date = (
            earlier is not None and
            entry['date_filed'] == earlier['date_filed']
        )
        if shares_date:
            # Same date as the previous item: continue its run.
            index = earlier['recap_sequence_index'] + 1
        else:
            # First item on the list, or the date changed: restart the run.
            index = 1
        entry['recap_sequence_index'] = index
        entry['recap_sequence_number'] = make_recap_sequence_number(entry)

    # Cleanup: the index is only scratch data for the pass above.
    for entry in docket_entries:
        entry.pop('recap_sequence_index', None)
def get_order_of_docket(docket_entries):
    """Determine whether the docket is ascending or descending or whether
    that is knowable.

    The order is decided by the first pair of neighboring entries whose
    document numbers can both be cast to int and are different from each
    other. Pairs where either number can't be cast, or where both numbers
    are equal, are skipped.

    :param docket_entries: An iterable of docket entry dicts, each with a
    'document_number' key.
    :return: 'asc' or 'desc', or None if no usable pair of neighbors exists
    (including for empty and single-item dockets).
    """
    # Document number of the entry just before the current one, or None when
    # there is no such entry or its number could not be cast to an int.
    previous_num = None
    for de in docket_entries:
        try:
            current_num = int(de['document_number'])
        except (TypeError, ValueError):
            # Can't be cast to an int. Forget the previous number too, since
            # we need two *consecutive* ints to compare.
            previous_num = None
            continue

        if previous_num is not None and current_num != previous_num:
            return 'asc' if previous_num < current_num else 'desc'

        # Either the first usable number, or equal to its neighbor (not sure
        # if that is possible; no known instances in the wild). Keep looking.
        previous_num = current_num
    return None