def save(self, do_extraction=True, index=True, *args, **kwargs):
    if self.document_type == self.ATTACHMENT:
        if self.attachment_number is None:
            raise ValidationError('attachment_number cannot be null for '
                                  'an attachment.')

    if self.pacer_doc_id == '':
        # Normally a char field would never have a null value, opting
        # instead for a blank value. However, blanks are not considered
        # unique while nulls are, so for this field, we reset it to null
        # whenever it would normally be blank.
        # http://stackoverflow.com/a/3124586/64911
        self.pacer_doc_id = None

    super(RECAPDocument, self).save(*args, **kwargs)

    tasks = []
    if do_extraction and self.needs_extraction:
        # Content extraction hasn't been done yet and is requested.
        from cl.scrapers.tasks import extract_recap_pdf
        tasks.append(extract_recap_pdf.si(self.pk))
    if index:
        from cl.search.tasks import add_or_update_recap_document
        tasks.append(add_or_update_recap_document.si([self.pk], False))
    if len(tasks) > 0:
        chain(*tasks)()
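
# save() above queues its follow-up work with Celery's chain() and
# immutable signatures (.si), so each task ignores the return value of
# the task before it. A minimal, self-contained sketch of that pattern;
# the add_one/log_result tasks are hypothetical, not part of this
# codebase, and eager mode is enabled so the sketch runs without a worker:
from celery import Celery, chain

app = Celery('sketch', broker='memory://')
app.conf.task_always_eager = True  # run tasks in-process for the demo

@app.task
def add_one(pk):
    return pk + 1

@app.task
def log_result(pk):
    print('done with', pk)

# .si() freezes the arguments and drops the parent's result, mirroring
# extract_recap_pdf.si(self.pk) followed by
# add_or_update_recap_document.si([self.pk], False) in save() above.
chain(add_one.si(1), log_result.si(1))()
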
def get_documents(options):
    """Download documents from PACER if we don't already have them."""
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q,
                              min_items=options['queue_length'])
    session = PacerSession(username=PACER_USERNAME,
                           password=PACER_PASSWORD)
    session.login()

    page_size = 10000
    main_query = build_main_query_from_query_string(
        Q_INVOICES,
        {'rows': page_size, 'fl': ['id', 'docket_id']},
        {'group': False, 'facet': False},
    )
    si = ExtraSolrInterface(settings.SOLR_RECAP_URL, mode='r')
    results = si.query().add_extra(**main_query).execute()
    logger.info("Got %s search results.", results.result.numFound)

    for i, result in enumerate(results):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()

        rd = RECAPDocument.objects.get(pk=result['id'])
        logger.info("Doing item %s w/rd: %s, d: %s", i, rd.pk,
                    result['docket_id'])

        if rd.is_available:
            logger.info("Already have pk %s; just tagging it.", rd.pk)
            add_tags(rd, TAG_PHASE_2)
            continue
        if not rd.pacer_doc_id:
            logger.info("Unable to find pacer_doc_id for: %s", rd.pk)
            continue

        chain(
            get_pacer_doc_by_rd.s(rd.pk, session.cookies,
                                  tag=TAG_PHASE_2).set(queue=q),
            extract_recap_pdf.si(rd.pk).set(queue=q),
            add_or_update_recap_document.si([rd.pk]).set(queue=q),
        ).apply_async()
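
# The offset/limit guard above leans on Python's chained comparisons:
# "i >= options['limit'] > 0" means
# "i >= options['limit'] and options['limit'] > 0", so a limit of 0 (or a
# negative value) disables the cap entirely. A quick standalone check of
# that semantics:
limit = 0
for i in range(5):
    assert not (i >= limit > 0)  # a limit of 0 never triggers the break
limit = 3
assert (3 >= limit > 0) and not (2 >= limit > 0)
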
def download_documents(options):
    """We've got good values in the new columns, so we just need to look
    those up and get the documents from PACER.
    """
    with open(options['input_file'], 'r') as f:
        dialect = csv.Sniffer().sniff(f.read(1024))
        f.seek(0)
        reader = csv.DictReader(f, dialect=dialect)

        q = options['queue']
        throttle = CeleryThrottle(queue_name=q,
                                  min_items=options['queue_length'])
        session = PacerSession(username=PACER_USERNAME,
                               password=PACER_PASSWORD)
        session.login()
        for i, row in enumerate(reader):
            if i < options['offset']:
                continue
            if i >= options['limit'] > 0:
                break
            throttle.maybe_wait()

            logger.info("Doing row %s: %s", i, row)
            docket_number = (row['cl_d_docket_number'] or
                             row['cl_d_docket_number (student)'] or
                             None)
            if not docket_number:
                logger.warning("No docket number found for row: %s", i)
                continue
            court = Court.objects.get(
                fjc_court_id=row['AO ID'].rjust(2, '0'),
                jurisdiction=Court.FEDERAL_DISTRICT)

            try:
                d = Docket.objects.get(docket_number=docket_number,
                                       court=court)
            except Docket.MultipleObjectsReturned:
                logger.warning("Multiple dockets returned for row: %s", i)
                continue
            except Docket.DoesNotExist:
                logger.warning("Could not find docket for row: %s", i)
                continue

            # Got the docket; now get the documents from it, then tag and
            # OCR them.
            document_date = datetime.strptime(row['Date'],
                                              '%m/%d/%Y').date()
            des = d.docket_entries.filter(date_filed=document_date)
            count = des.count()
            if count == 0:
                logger.warning("No docket entries found for row: %s", i)
                continue
            elif count == 1:
                good_des = [des[0]]
            else:
                # More than one entry. Apply filtering rules (see the
                # filter_des sketch below).
                good_des = filter_des(des)

            # We've got our docket entries; now download their documents.
            for de in good_des:
                rds = de.recap_documents.filter(
                    document_type=RECAPDocument.PACER_DOCUMENT)
                for rd in rds:
                    if not rd.pacer_doc_id:
                        logger.warning(
                            "Unable to get pacer_doc_id for item with "
                            "rd_pk: %s. Restricted document?", rd.pk)
                        continue
                    if options['task'] == 'add_extra_tags':
                        # Wherein I belatedly realize we need a tag
                        # specifically for this part of the project.
                        add_tags(rd, TAG_NAME_OPINIONS)
                    else:
                        # Otherwise, do the normal download thing.
                        chain(
                            get_pacer_doc_by_rd.s(
                                rd.pk, session.cookies,
                                tag=TAG_NAME).set(queue=q),
                            extract_recap_pdf.si(rd.pk).set(queue=q),
                            add_or_update_recap_document.si(
                                [rd.pk]).set(queue=q),
                        ).apply_async()
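
# filter_des() is called above but not defined in this section. A minimal
# sketch of what such a helper might look like, assuming its job is to
# narrow several same-day docket entries down to the ones that plausibly
# carry an opinion; the keyword list and the use of the description field
# are hypothetical illustrations, not the actual filtering rules:
def filter_des(des):
    """Return the docket entries whose descriptions look like opinions."""
    keywords = ('opinion', 'order', 'memorandum')  # hypothetical rules
    return [de for de in des
            if any(kw in (de.description or '').lower()
                   for kw in keywords)]
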