def handle(self, *args, **options):
    """Index opinions into Solr, selected by an id list, an id range, a
    date range, or --all.

    :raises CommandError: when a doc_id list is combined with range
        options, or when no selection option is given at all.
    """
    both_list_and_endpoints = (options.get('doc_id') is not None and
                               (options.get('start_id') is not None or
                                options.get('end_id') is not None or
                                options.get('filed_after') is not None))
    # Bug fix: the old check tested "is None" inside any(), which made
    # no_option True only when *every* option was supplied at once --
    # and running with no flags silently indexed everything. "No option"
    # means none of the selection flags were given.
    no_option = not any([
        options.get('doc_id'),
        options.get('start_id'),
        options.get('end_id'),
        options.get('filed_after'),
        options.get('all'),
    ])
    if both_list_and_endpoints or no_option:
        raise CommandError('Please specify either a list of documents, a '
                           'range of ids, a range of dates, or '
                           'everything.')

    self.index = options['index']
    self.si = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='rw')

    # Use query chaining to build the query
    query = Opinion.objects.all()
    if options.get('doc_id'):
        query = query.filter(pk__in=options.get('doc_id'))
    if options.get('end_id'):
        query = query.filter(pk__lte=options.get('end_id'))
    if options.get('start_id'):
        query = query.filter(pk__gte=options.get('start_id'))
    if options.get('filed_after'):
        query = query.filter(cluster__date_filed__gte=options['filed_after'])
    if options.get('all'):
        query = Opinion.objects.all()
    count = query.count()
    docs = queryset_generator(query, chunksize=10000)
    self.update_documents(docs, count)
def handle(self, *args, **options):
    """Reprocess the RECAP content of every RECAP-sourced docket.

    Dockets with pk below --start_pk (when > 0) are excluded.
    IntegrityError rows are skipped silently; XML/IO failures are
    collected and reported at the end.
    """
    super(Command, self).handle(*args, **options)
    ds = Docket.objects.filter(source__in=Docket.RECAP_SOURCES).only(
        'pk', 'case_name',
    )
    # Bug fix: apply the start point in the DB instead of skipping rows
    # in Python -- the old loop fetched every pre-start_pk row and
    # printed progress for it before discarding it, and the displayed
    # total included rows that were never processed.
    if options['start_pk'] > 0:
        ds = ds.filter(pk__gte=options['start_pk'])
    count = ds.count()
    xml_error_ids = []
    for i, d in enumerate(queryset_generator(ds, chunksize=50000)):
        sys.stdout.write('\rDoing docket: %s of %s, with pk: %s' %
                         (i, count, d.pk))
        sys.stdout.flush()
        try:
            d.reprocess_recap_content(do_original_xml=True)
        except IntegrityError:
            # Happens when there's wonkiness in the source data. Move on.
            continue
        except (XMLSyntaxError, IOError):
            # Happens when the local IA XML file is empty. Not sure why
            # these happen.
            xml_error_ids.append(d.pk)
            continue
    print("Encountered XMLSyntaxErrors/IOErrors for: %s" % xml_error_ids)
def test_queryset_generator(self):
    """Does the generator work properly with a variety of queries?"""
    cases = (
        (UrlHash.objects.filter(pk__in=["BAD ID"]), 0),
        (UrlHash.objects.filter(pk__in=["0"]), 1),
        (UrlHash.objects.filter(pk__in=["0", "1"]), 2),
    )
    for query, expected in cases:
        print(
            "Testing queryset_generator with %s expected results..."
            % expected,
            end="",
        )
        actual = sum(1 for _ in queryset_generator(query))
        self.assertEqual(actual, expected)
        print("✓")
def test_queryset_generator(self):
    """Does the generator work properly with a variety of queries?"""
    expected_by_query = [
        (0, UrlHash.objects.filter(pk__in=['BAD ID'])),
        (1, UrlHash.objects.filter(pk__in=['0'])),
        (2, UrlHash.objects.filter(pk__in=['0', '1'])),
    ]
    for expected, query in expected_by_query:
        print("Testing queryset_generator with %s expected results..." %
              expected, end='')
        seen = 0
        for _ in queryset_generator(query):
            seen += 1
        self.assertEqual(seen, expected)
        print('✓')
def handle(self, *args, **options):
    """Reprocess RECAP content for dockets that have HTML documents or an
    IA XML file, optionally resuming at --start_pk."""
    super(Command, self).handle(*args, **options)
    # Only do ones that have HTML files *or* that have an IA XML file.
    # The latter is defined by ones that *don't* have blank
    # filepath_local fields.
    has_content = Q(html_documents__isnull=False) | ~Q(filepath_local="")
    dockets = Docket.objects.filter(
        has_content,
        source__in=Docket.RECAP_SOURCES,
    ).distinct().only("pk", "case_name")
    start_pk = options["start_pk"]
    if start_pk:
        dockets = dockets.filter(pk__gte=start_pk)
    total = dockets.count()
    xml_error_ids = []
    for i, docket in enumerate(queryset_generator(dockets,
                                                  chunksize=50000)):
        sys.stdout.write("\rDoing docket: %s of %s, with pk: %s" %
                         (i, total, docket.pk))
        sys.stdout.flush()
        try:
            docket.reprocess_recap_content(do_original_xml=True)
        except IntegrityError:
            # Happens when there's wonkiness in the source data. Move on.
            continue
        except (XMLSyntaxError, IOError):
            # Happens when the local IA XML file is empty. Not sure why
            # these happen.
            xml_error_ids.append(docket.pk)
            continue
    print("Encountered XMLSyntaxErrors/IOErrors for: %s" % xml_error_ids)
def test_queryset_generator_values_query(self):
    """Do values queries work?"""
    print("Testing raising an error when we can't get a PK in a values "
          "query...", end='')
    # Bug fix: the old call invoked queryset_generator() eagerly and
    # handed the resulting generator object to assertRaises as the
    # "callable". Generators raise lazily, so the assertion only ever
    # saw the TypeError from calling a non-callable and passed
    # trivially. Force iteration inside the assertRaises context so the
    # generator's own error is what gets checked.
    with self.assertRaises(
            Exception,
            msg="Values query did not fail when pk was not provided."):
        list(queryset_generator(UrlHash.objects.values('sha1')))
    print('✓')

    print("Testing a good values query...", end='')
    self.assertEqual(
        sum(1 for _ in queryset_generator(UrlHash.objects.values())),
        2,
    )
    print('✓')
def add_or_update_by_datetime(self, dt):
    """
    Given a datetime, adds or updates all items newer than that time.
    """
    self.stdout.write("Adding or updating items(s) newer than %s\n" % dt)
    newer_items = self.type.objects.filter(date_created__gte=dt)
    generator = queryset_generator(newer_items, chunksize=5000)
    total = newer_items.count()
    self._chunk_queryset_into_tasks(generator, total)
def add_or_update_by_datetime(self, dt):
    """
    Given a datetime, adds or updates all items newer than that time.
    """
    self.stdout.write("Adding or updating items(s) newer than %s\n" % dt)
    recent = self.type.objects.filter(date_created__gte=dt)
    batched = queryset_generator(recent, chunksize=5000)
    self.process_queryset(batched, recent.count())
def populate_docket_number_core_field(apps, schema_editor):
    """Backfill docket_number_core for FD-jurisdiction dockets that are
    missing it, deriving the value from docket_number."""
    Docket = apps.get_model('search', 'Docket')
    missing_core = Docket.objects.filter(
        court__jurisdiction='FD',
        docket_number_core='',
    ).only('docket_number')
    for docket in queryset_generator(missing_core):
        docket.docket_number_core = make_docket_number_core(
            docket.docket_number)
        docket.save()
def test_queryset_generator_chunking(self):
    """Does chunking work properly without duplicates or omissions?"""
    print("Testing if queryset_iterator chunking returns the right "
          "number of results...", end='')
    generated = queryset_generator(UrlHash.objects.all(), chunksize=1)
    total = 0
    for _ in generated:
        total += 1
    self.assertEqual(2, total)
    print('✓')
def add_or_update_all(self):
    """
    Iterates over the entire corpus, adding it to the index. Can be run
    on an empty index or an existing one. If run on an existing index,
    existing items will be updated.
    """
    self.stdout.write("Adding or updating all items...\n")
    if self.type == Person:
        # People need heavy prefetching: many related rows feed each
        # search document, so fetch them up front in bulk.
        q = self.type.objects.filter(
            is_alias_of=None
        ).prefetch_related(
            'positions',
            'positions__predecessor',
            'positions__supervisor',
            'positions__appointer',
            'positions__court',
            'political_affiliations',
            'aba_ratings',
            'educations__school',
            'aliases',
            'race',
        )
        # Filter out non-judges -- they don't get searched.
        q = [item for item in q if item.is_judge]
        count = len(q)
    elif self.type == Docket:
        # Only RECAP-sourced dockets get indexed.
        q = self.type.objects.filter(source=Docket.RECAP)
        count = q.count()
        q = queryset_generator(
            q,
            chunksize=5000,
        )
    else:
        # Everything else: the whole table, streamed in chunks.
        q = self.type.objects.all()
        count = q.count()
        q = queryset_generator(
            q,
            chunksize=5000,
        )
    self._chunk_queryset_into_tasks(q, count)
def add_or_update_all(self):
    """
    Iterates over the entire corpus, adding it to the index. Can be run
    on an empty index or an existing one. If run on an existing index,
    existing items will be updated.
    """
    self.stdout.write("Adding or updating all items...\n")
    everything = self.type.objects.all()
    batched = queryset_generator(everything, chunksize=5000)
    self._chunk_queryset_into_tasks(batched, everything.count())
def add_or_update_all(self):
    """
    Iterates over the entire corpus, adding it to the index. Can be run
    on an empty index or an existing one. If run on an existing index,
    existing items will be updated.
    """
    self.stdout.write("Adding or updating all items...\n")
    if self.type == Person:
        # Prefetch every relation that feeds a Person's search document.
        q = self.type.objects.filter(is_alias_of=None).prefetch_related(
            'positions',
            'positions__predecessor',
            'positions__supervisor',
            'positions__appointer',
            'positions__court',
            'political_affiliations',
            'aba_ratings',
            'educations__school',
            'aliases',
            'race',
        )
        # Filter out non-judges -- they don't get searched.
        q = [item for item in q if item.is_judge]
        count = len(q)
    elif self.type == Docket:
        # Only RECAP-sourced dockets are searchable.
        q = self.type.objects.filter(source=Docket.RECAP)
        count = q.count()
        q = queryset_generator(
            q,
            chunksize=5000,
        )
    else:
        # Default: index the whole table, streamed in chunks.
        q = self.type.objects.all()
        count = q.count()
        q = queryset_generator(
            q,
            chunksize=5000,
        )
    self._chunk_queryset_into_tasks(q, count)
def process_pdf_queue(options):
    """Download all PDFs in queue

    Work through the queue of PDFs that need to be added to the
    database, download them and add them one by one.

    :return: None
    """
    queued_pks = queryset_generator(
        QueuedPDF.objects.all().order_by("-document_id").values_list(
            "pk", flat=True))
    queue_name = options["queue"]
    throttle = CeleryThrottle(queue_name=queue_name)
    for queued_pk in queued_pks:
        throttle.maybe_wait()
        tasks.download_pdf.apply_async(
            kwargs={"pdf_pk": queued_pk},
            queue=queue_name,
        )
def do_first_pass(options):
    """Match each 2017 civil IDB row against the Docket table, creating a
    new docket or merging into the single match via celery tasks."""
    idb_rows = FjcIntegratedDatabase.objects.filter(
        dataset_source=CV_2017,
    ).order_by("pk")
    queue = options["queue"]
    throttle = CeleryThrottle(queue_name=queue)
    for i, idb_row in enumerate(queryset_generator(idb_rows)):
        # Iterate over all items in the IDB and find them in the Docket
        # table. If they're not there, create a new item.
        if i < options["offset"]:
            continue
        if i >= options["limit"] > 0:
            break
        throttle.maybe_wait()

        # TODO: See conversation in #courtlistener channel from 2019-07-11,
        # In which it appears we matched a criminal case with a civil one.
        # The code below doesn't protect against that, but it should (and I
        # think it does in the `do_second_pass` code, below.
        matches = Docket.objects.filter(
            docket_number_core=idb_row.docket_number,
            court=idb_row.district,
        )
        match_count = matches.count()
        if match_count == 0:
            logger.info(
                "%s: Creating new docket for IDB row: %s", i, idb_row
            )
            create_new_docket_from_idb.apply_async(
                args=(idb_row.pk,),
                queue=queue,
            )
        elif match_count == 1:
            docket = matches[0]
            logger.info(
                "%s: Merging Docket %s with IDB row: %s", i, docket,
                idb_row
            )
            merge_docket_with_idb.apply_async(
                args=(docket.pk, idb_row.pk), queue=queue
            )
        else:
            logger.warning(
                "%s: Unable to merge. Got %s dockets for row: %s",
                i,
                match_count,
                idb_row,
            )
def test_queryset_generator(self):
    """Does the generator work properly with a variety of queries?"""
    scenarios = [
        (['BAD ID'], 0),
        (['0'], 1),
        (['0', '1'], 2),
    ]
    for pks, expected in scenarios:
        print("Testing queryset_generator with %s expected results..."
              % expected, end='')
        query = UrlHash.objects.filter(pk__in=pks)
        self.assertEqual(
            sum(1 for _ in queryset_generator(query)),
            expected,
        )
        print('✓')
def get_pdfs(options):
    """Get PDFs for the results of the Free Document Report queries.

    At this stage, we have rows in the PACERFreeDocumentRow table, each of
    which represents a PDF we need to download and merge into our normal
    tables: Docket, DocketEntry, and RECAPDocument. In this function, we
    iterate over the entire table of results, merge it into our normal
    tables, and then download and extract the PDF.

    :return: None
    """
    q = options["queue"]
    index = options["index"]
    cnt = CaseNameTweaker()
    rows = PACERFreeDocumentRow.objects.filter(error_msg="").only("pk")
    count = rows.count()
    task_name = "downloading"
    if index:
        task_name += " and indexing"
    logger.info("%s %s items from PACER." % (task_name, count))
    throttle = CeleryThrottle(queue_name=q)
    completed = 0
    for row in queryset_generator(rows):
        throttle.maybe_wait()
        if completed % 30000 == 0:
            # Start a fresh PACER session every 30k rows so the login
            # cookies don't go stale mid-run.
            pacer_session = PacerSession(
                username=PACER_USERNAME, password=PACER_PASSWORD
            )
            pacer_session.login()
        # Chain: merge the scraped row, fetch + process its PDF, then
        # delete the queue row; each task runs on the configured queue.
        c = chain(
            process_free_opinion_result.si(row.pk, cnt).set(queue=q),
            get_and_process_pdf.s(pacer_session.cookies, row.pk).set(queue=q),
            delete_pacer_row.s(row.pk).set(queue=q),
        )
        if index:
            # Optionally append a Solr indexing step to the chain.
            c |= add_items_to_solr.s("search.RECAPDocument").set(queue=q)
        c.apply_async()
        completed += 1
        if completed % 1000 == 0:
            logger.info(
                "Sent %s/%s tasks to celery for %s so "
                "far." % (completed, count, task_name)
            )
def handle(self, *args, **options):
    """Reprocess RECAP content for all RECAP-sourced dockets, resuming at
    --start_pk when it is > 0. IntegrityError rows are skipped."""
    super(Command, self).handle(*args, **options)
    ds = Docket.objects.filter(source__in=Docket.RECAP_SOURCES).only(
        'pk', 'case_name',
    )
    # Bug fix: filter in the DB rather than skipping in Python -- the
    # old loop still fetched every pre-start_pk row and printed progress
    # for it, and the displayed total counted rows never processed.
    if options['start_pk'] > 0:
        ds = ds.filter(pk__gte=options['start_pk'])
    count = ds.count()
    for i, d in enumerate(queryset_generator(ds, chunksize=50000)):
        sys.stdout.write('\rDoing docket: %s of %s, with pk: %s' %
                         (i, count, d.pk))
        sys.stdout.flush()
        try:
            d.reprocess_recap_content(do_original_xml=True)
        except IntegrityError:
            # Happens when there's wonkiness in the source data. Move on.
            continue
def get_pdfs(options):
    """Get PDFs for the results of the Free Document Report queries.

    At this stage, we have rows in the PACERFreeDocumentRow table, each
    of which represents a PDF we need to download and merge into our
    normal tables: Docket, DocketEntry, and RECAPDocument. In this
    function, we iterate over the entire table of results, merge it into
    our normal tables, and then download and extract the PDF.

    :return: None
    """
    queue = options['queue']
    index = options['index']
    cnt = CaseNameTweaker()
    rows = PACERFreeDocumentRow.objects.filter(error_msg="").only('pk')
    count = rows.count()
    task_name = "downloading and indexing" if index else "downloading"
    logger.info("%s %s items from PACER." % (task_name, count))
    throttle = CeleryThrottle(queue_name=queue)
    completed = 0
    for row in queryset_generator(rows):
        throttle.maybe_wait()
        if completed % 30000 == 0:
            # Refresh the PACER session periodically so cookies don't
            # go stale over a long run.
            pacer_session = PacerSession(username=PACER_USERNAME,
                                         password=PACER_PASSWORD)
            pacer_session.login()
        c = chain(
            process_free_opinion_result.si(row.pk, cnt).set(queue=queue),
            get_and_process_pdf.s(pacer_session.cookies,
                                  row.pk).set(queue=queue),
            delete_pacer_row.s(row.pk).set(queue=queue),
        )
        if index:
            c |= add_items_to_solr.s('search.RECAPDocument').set(queue=queue)
        c.apply_async()
        completed += 1
        if completed % 1000 == 0:
            logger.info("Sent %s/%s tasks to celery for %s so "
                        "far." % (completed, count, task_name))
def handle(self, *args, **options):
    """Walk every OpinionCluster (optionally from --start_at) and split
    its citation strings into structured Citation rows, skipping
    unparseable strings and duplicates."""
    super(Command, self).handle(*args, **options)
    clusters = OpinionCluster.objects.all()
    start_at = options['start_at']
    if start_at:
        clusters = clusters.filter(pk__gte=start_at)
    for i, cluster in enumerate(queryset_generator(clusters)):
        for field in cluster.citation_fields:
            citation_str = getattr(cluster, field)
            if not citation_str:
                continue
            # Split the citation and add it to the DB.
            try:
                citation_obj = get_citations(
                    citation_str,
                    html=False,
                    do_post_citation=False,
                    do_defendant=False,
                    disambiguate=False,
                )[0]
            except IndexError:
                msg = "Errored out on: %s in %s" % (citation_str,
                                                    cluster.pk)
                print(msg)
                logger.info(msg)
                continue
            try:
                Citation.objects.create(
                    cluster=cluster,
                    volume=citation_obj.volume,
                    reporter=citation_obj.reporter,
                    page=citation_obj.page,
                    type=map_model_field_to_citation_type(field)
                )
            except IntegrityError:
                # Violated unique_together constraint. Fine.
                pass
        if i % 1000 == 0:
            msg = "Completed %s items (last: %s)"
            print(msg % (i, cluster.pk))
            logger.info(msg, i, cluster.pk)
def handle(self, *args, **options):
    """Match 2017 civil IDB rows to dockets by docket number core (with
    and without leading zeros), creating or merging via celery tasks."""
    super(Command, self).handle(*args, **options)
    idb_rows = FjcIntegratedDatabase.objects.filter(
        dataset_source=CV_2017,
    ).order_by('pk')
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    for i, idb_row in enumerate(queryset_generator(idb_rows)):
        # Iterate over all items in the IDB and find them in the Docket
        # table. If they're not there, create a new item.
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()
        # Match with or without leading zeros on the docket number.
        docket_number_no_0s = remove_leading_zeros(idb_row.docket_number)
        ds = Docket.objects.filter(
            Q(docket_number_core=idb_row.docket_number) |
            Q(docket_number_core=docket_number_no_0s),
            court=idb_row.district,
        )
        count = ds.count()
        if count == 0:
            logger.info("%s: Creating new docket for IDB row: %s",
                        i, idb_row)
            create_new_docket_from_idb.apply_async(
                args=(idb_row.pk, ),
                queue=q,
            )
        elif count == 1:
            d = ds[0]
            logger.info("%s: Merging Docket %s with IDB row: %s",
                        i, d, idb_row)
            merge_docket_with_idb.apply_async(args=(d.pk, idb_row.pk),
                                              queue=q)
        elif count > 1:
            # Fix: logger.warn is a deprecated alias; use warning().
            logger.warning("%s: Unable to merge. Got %s dockets for "
                           "row: %s", i, count, idb_row)
def update_any_missing_pacer_case_ids(options):
    """The network requests were making things far too slow and had to be
    disabled during the first pass. With this method, we update any items
    that are missing their pacer case ID value.
    """
    missing = Docket.objects.filter(
        idb_data__isnull=False,
        pacer_case_id=None,
    )
    queue = options['queue']
    throttle = CeleryThrottle(queue_name=queue)
    session = PacerSession(username=PACER_USERNAME,
                           password=PACER_PASSWORD)
    session.login()
    for i, d in enumerate(queryset_generator(missing)):
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        if i % 5000 == 0:
            # Re-authenticate just in case the auto-login mechanism
            # isn't working.
            session = PacerSession(username=PACER_USERNAME,
                                   password=PACER_PASSWORD)
            session.login()
        throttle.maybe_wait()
        logger.info("Getting pacer_case_id for item %s", d)
        params = make_fjc_idb_lookup_params(d.idb_data)
        lookup = get_pacer_case_id_and_title.s(
            pass_through=d.pk,
            docket_number=d.idb_data.docket_number,
            court_id=d.idb_data.district_id,
            cookies=session.cookies,
            **params
        ).set(queue=queue)
        followup = update_docket_from_hidden_api.s().set(queue=queue)
        chain(lookup, followup).apply_async()
def handle(self, *args, **options):
    """
    For any item that has a citation count > 0, update the citation
    count based on the DB.
    """
    index_during_processing = options['index'] == 'concurrently'
    clusters = OpinionCluster.objects.filter(citation_count__gt=0)
    if options.get('doc_id'):
        clusters = clusters.filter(pk__in=options['doc_id'])
    for cluster in queryset_generator(clusters, chunksize=10000):
        total_cites = sum(
            sub_opinion.citing_opinions.all().count()
            for sub_opinion in cluster.sub_opinions.all()
        )
        cluster.citation_count = total_cites
        cluster.save(index=index_during_processing)
    self.do_solr(options)
def handle(self, *args, **options):
    """Index opinions into Solr, selected by an id list, an id range, a
    date range, or --all; tracks count and timing state on self.

    :raises CommandError: on conflicting or missing selection options.
    """
    super(Command, self).handle(*args, **options)
    both_list_and_endpoints = (options.get('doc_id') is not None and
                               (options.get('start_id') is not None or
                                options.get('end_id') is not None or
                                options.get('filed_after') is not None))
    # Bug fix: the old check tested "is None" inside any(), so no_option
    # only became True when *every* option was supplied -- and running
    # with no flags silently indexed everything. "No option" means none
    # of the selection flags were given.
    no_option = not any([
        options.get('doc_id'),
        options.get('start_id'),
        options.get('end_id'),
        options.get('filed_after'),
        options.get('all'),
    ])
    if both_list_and_endpoints or no_option:
        raise CommandError('Please specify either a list of documents, a '
                           'range of ids, a range of dates, or '
                           'everything.')

    self.index = options['index']
    self.si = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode='rw')

    # Use query chaining to build the query
    query = Opinion.objects.all()
    if options.get('doc_id'):
        query = query.filter(pk__in=options.get('doc_id'))
    if options.get('end_id'):
        query = query.filter(pk__lte=options.get('end_id'))
    if options.get('start_id'):
        query = query.filter(pk__gte=options.get('start_id'))
    if options.get('filed_after'):
        query = query.filter(
            cluster__date_filed__gte=options['filed_after'])
    if options.get('all'):
        query = Opinion.objects.all()
    self.count = query.count()
    self.average_per_s = 0
    self.timings = []
    docs = queryset_generator(query, chunksize=10000)
    self.update_documents(docs)
def handle(self, *args, **options):
    """Render each opinion that has any HTML body to a standalone
    simple_opinion.html file under output_directory/YYYY/M/D/<pk>.html."""
    super(Command, self).handle(*args, **options)
    ops = queryset_generator(
        Opinion.objects.exclude(
            Q(html="") | Q(html=None),
            Q(html_lawbox="") | Q(html_lawbox=None),
            Q(html_columbia="") | Q(html_columbia=None),
        ))
    for op in ops:
        content = render_to_string("simple_opinion.html", {"o": op})
        output_dir = os.path.join(
            options["output_directory"],
            str(op.cluster.date_filed.year),
            str(op.cluster.date_filed.month),
            str(op.cluster.date_filed.day),
        )
        mkdir_p(output_dir)
        output_path = os.path.join(output_dir, f"{op.pk}.html")
        # Bug fix: the file was opened in text mode ("w") but handed
        # content.encode() -- bytes -- which raises TypeError on the
        # first write under Python 3. Open in binary mode instead.
        with open(output_path, "wb") as f:
            f.write(content.encode("utf-8"))
def handle(self, *args, **options):
    """Index opinions into Solr, selected by an id list, an id range, a
    date range, or --all.

    :raises CommandError: on conflicting or missing selection options.
    """
    both_list_and_endpoints = options.get("doc_id") is not None and (
        options.get("start_id") is not None
        or options.get("end_id") is not None
        or options.get("filed_after") is not None
    )
    # Bug fix: the old check tested "is None" inside any(), making
    # no_option True only when every option was supplied at once -- and
    # running with no flags silently indexed everything. "No option"
    # means none of the selection flags were given.
    no_option = not any(
        [
            options.get("doc_id"),
            options.get("start_id"),
            options.get("end_id"),
            options.get("filed_after"),
            options.get("all"),
        ]
    )
    if both_list_and_endpoints or no_option:
        raise CommandError(
            "Please specify either a list of documents, a "
            "range of ids, a range of dates, or "
            "everything."
        )
    self.index = options["index"]
    self.si = sunburnt.SolrInterface(settings.SOLR_OPINION_URL, mode="rw")

    # Use query chaining to build the query
    query = Opinion.objects.all()
    if options.get("doc_id"):
        query = query.filter(pk__in=options.get("doc_id"))
    if options.get("end_id"):
        query = query.filter(pk__lte=options.get("end_id"))
    if options.get("start_id"):
        query = query.filter(pk__gte=options.get("start_id"))
    if options.get("filed_after"):
        query = query.filter(cluster__date_filed__gte=options["filed_after"])
    if options.get("all"):
        query = Opinion.objects.all()
    count = query.count()
    docs = queryset_generator(query, chunksize=10000)
    self.update_documents(docs, count)
def handle(self, *args, **options):
    """Dump every opinion that has any HTML body to
    output_directory/YYYY/M/D/<pk>.html."""
    super(Command, self).handle(*args, **options)
    has_html = Opinion.objects.exclude(
        Q(html='') | Q(html=None),
        Q(html_lawbox='') | Q(html_lawbox=None),
        Q(html_columbia='') | Q(html_columbia=None),
    )
    for op in queryset_generator(has_html):
        content = render_to_string('simple_opinion.html', {'o': op})
        date_filed = op.cluster.date_filed
        output_dir = os.path.join(
            options['output_directory'],
            str(date_filed.year),
            str(date_filed.month),
            str(date_filed.day),
        )
        mkdir_p(output_dir)
        output_path = os.path.join(output_dir, '%s.html' % op.pk)
        with open(output_path, 'w') as f:
            f.write(content.encode('utf-8'))
def do_first_pass(options):
    """Match each 2017 civil IDB row to a Docket by docket number core,
    creating a new docket or merging via celery tasks as appropriate."""
    idb_rows = FjcIntegratedDatabase.objects.filter(
        dataset_source=CV_2017,
    ).order_by('pk')
    q = options['queue']
    throttle = CeleryThrottle(queue_name=q)
    for i, idb_row in enumerate(queryset_generator(idb_rows)):
        # Iterate over all items in the IDB and find them in the Docket
        # table. If they're not there, create a new item.
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        throttle.maybe_wait()
        ds = Docket.objects.filter(
            docket_number_core=idb_row.docket_number,
            court=idb_row.district,
        )
        count = ds.count()
        if count == 0:
            logger.info("%s: Creating new docket for IDB row: %s",
                        i, idb_row)
            create_new_docket_from_idb.apply_async(
                args=(idb_row.pk,),
                queue=q,
            )
        elif count == 1:
            d = ds[0]
            logger.info("%s: Merging Docket %s with IDB row: %s",
                        i, d, idb_row)
            merge_docket_with_idb.apply_async(args=(d.pk, idb_row.pk),
                                              queue=q)
        elif count > 1:
            # Fix: logger.warn is a deprecated alias; use warning().
            logger.warning("%s: Unable to merge. Got %s dockets for "
                           "row: %s", i, count, idb_row)
def handle(self, *args, **options):
    """Write a rendered simple_opinion.html for every opinion that has at
    least one populated HTML field."""
    super(Command, self).handle(*args, **options)
    opinions = queryset_generator(
        Opinion.objects.exclude(
            Q(html='') | Q(html=None),
            Q(html_lawbox='') | Q(html_lawbox=None),
            Q(html_columbia='') | Q(html_columbia=None),
        ))
    for op in opinions:
        rendered = render_to_string('simple_opinion.html', {'o': op})
        filed = op.cluster.date_filed
        target_dir = os.path.join(options['output_directory'],
                                  str(filed.year),
                                  str(filed.month),
                                  str(filed.day))
        mkdir_p(target_dir)
        target_path = os.path.join(target_dir, '%s.html' % op.pk)
        with open(target_path, 'w') as f:
            f.write(rendered.encode('utf-8'))
def handle(self, *args, **options):
    """Backfill file_size on available RECAPDocuments that lack one,
    tolerating missing files and duplicate-save constraint violations."""
    super(Command, self).handle(*args, **options)
    needs_size = RECAPDocument.objects.filter(is_available=True,
                                              file_size=None)
    for i, doc in enumerate(queryset_generator(needs_size)):
        try:
            doc.file_size = doc.filepath_local.size
        except OSError as err:
            if err.errno != 2:
                # Problem other than No such file or directory.
                raise
            continue
        except ValueError:
            # The 'filepath_local' attribute has no file
            # associated with it.
            continue
        try:
            doc.save()
        except ValidationError:
            # [u'Duplicate values violate save constraint. An object with
            # this document_number and docket_entry already exists:
            # (8, 16188376)']
            continue
        if i % 1000 == 0:
            logger.info("Completed %s items", i)
def handle(self, *args, **options):
    """Identify parallel citations and save them as requested.

    This process proceeds in two phases. The first phase is to work
    through the entire corpus, identifying citations that occur very near
    to each other. These are considered parallel citations, and they are
    built into a graph data structure where citations are nodes and each
    parallel citation is an edge. The weight of each edge is determined
    by the number of times a parallel citation has been identified
    between two citations. This should solve problems like typos or other
    issues with our heuristic approach.

    The second phase of this process is to update the database with the
    high quality citations. This can only be done by matching the
    citations with actual items in the database and then updating them
    with parallel citations that are sufficiently likely to be good.
    """
    super(Command, self).handle(*args, **options)
    no_option = (not any([options.get('doc_id'), options.get('all')]))
    if no_option:
        raise CommandError("Please specify if you want all items or a "
                           "specific item.")
    if not options['update_database']:
        logger.info(
            "--update_database is not set. No changes will be made to the "
            "database."
        )

    # Update Citation object to consider similar objects equal.
    self.monkey_patch_citation()

    logger.info("## Entering phase one: Building a network object of "
                "all citations.\n")
    q = Opinion.objects.all()
    if options.get('doc_id'):
        q = q.filter(pk__in=options['doc_id'])
    count = q.count()
    opinions = queryset_generator(q, chunksize=10000)

    node_count = edge_count = completed = 0
    subtasks = []
    for o in opinions:
        subtasks.append(
            # This will call the second function with the results from the
            # first.
            get_document_citations.s(o) | identify_parallel_citations.s()
        )
        last_item = (count == completed + 1)
        if (completed % 50 == 0) or last_item:
            # Flush the accumulated chains as one celery group and fold
            # each result into the citation graph.
            job = group(subtasks)
            result = job.apply_async().join()
            [self.add_groups_to_network(citation_groups)
             for citation_groups in result]
            subtasks = []
        completed += 1
        if completed % 250 == 0 or last_item:
            # Only do this once in a while.
            node_count = len(self.g.nodes())
            edge_count = len(self.g.edges())
            sys.stdout.write("\r Completed %s of %s. (%s nodes, %s edges)" % (
                completed,
                count,
                node_count,
                edge_count,
            ))
            sys.stdout.flush()

    logger.info("\n\n## Entering phase two: Saving the best edges to "
                "the database.\n\n")
    for sub_graph in nx.connected_component_subgraphs(self.g):
        self.handle_subgraph(sub_graph, options)

    logger.info("\n\n## Done. Added %s new citations." % self.update_count)

    self.do_solr(options)
def add_or_update_all(self):
    """
    Iterates over the entire corpus, adding it to the index. Can be run
    on an empty index or an existing one. If run on an existing index,
    existing items will be updated.
    """
    self.stdout.write("Adding or updating all items...\n")
    if self.type == Person:
        # People need heavy prefetching: many related rows feed each
        # search document, so fetch them up front in bulk.
        q = self.type.objects.filter(is_alias_of=None).prefetch_related(
            'positions',
            'positions__predecessor',
            'positions__supervisor',
            'positions__appointer',
            'positions__court',
            'political_affiliations',
            'aba_ratings',
            'educations__school',
            'aliases',
            'race',
        )
        # Filter out non-judges -- they don't get searched.
        q = [item for item in q if item.is_judge]
        count = len(q)
    elif self.type == RECAPDocument:
        # NOTE(review): many entries below look like scalar field paths
        # (e.g. 'docket_entry__description'), but prefetch_related
        # normally takes relation names -- confirm this branch actually
        # runs without FieldError.
        q = self.type.objects.all().prefetch_related(
            # IDs
            'docket_entry__pk',
            'docket_entry__docket__pk',
            'docket_entry__docket__court__pk',
            'docket_entry__docket__assigned_to__pk',
            'docket_entry__docket__referred_to__pk',

            # Docket Entry
            'docket_entry__description',
            'docket_entry__entry_number',
            'docket_entry__date_filed',

            # Docket
            'docket_entry__docket__date_argued',
            'docket_entry__docket__date_filed',
            'docket_entry__docket__date_terminated',
            'docket_entry__docket__docket_number',
            'docket_entry__docket__case_name_short',
            'docket_entry__docket__case_name',
            'docket_entry__docket__case_name_full',
            'docket_entry__docket__nature_of_suit',
            'docket_entry__docket__cause',
            'docket_entry__docket__jury_demand',
            'docket_entry__docket__jurisdiction_type',
            'docket_entry__docket__slug',

            # Judges
            'docket_entry__docket__assigned_to__name_first',
            'docket_entry__docket__assigned_to__name_middle',
            'docket_entry__docket__assigned_to__name_last',
            'docket_entry__docket__assigned_to__name_suffix',
            'docket_entry__docket__assigned_to_str',
            'docket_entry__docket__referred_to__name_first',
            'docket_entry__docket__referred_to__name_middle',
            'docket_entry__docket__referred_to__name_last',
            'docket_entry__docket__referred_to__name_suffix',
            'docket_entry__docket__referred_to_str',

            # Court
            'docket_entry__docket__court__full_name',
            'docket_entry__docket__court__citation_string',
        )
        count = q.count()
        q = queryset_generator(
            q,
            chunksize=5000,
        )
    else:
        # Default: index the whole table, streamed in chunks.
        q = self.type.objects.all()
        count = q.count()
        q = queryset_generator(
            q,
            chunksize=5000,
        )
    self._chunk_queryset_into_tasks(q, count)
def do_second_pass(options):
    """In the first pass, we ignored the duplicates that we got,
    preferring to let them stack up for later analysis. In this pass, we
    attempt to merge those failed items into the DB by more aggressive
    filtering and algorithmic selection.
    """
    idb_rows = FjcIntegratedDatabase.objects.filter(
        dataset_source=CV_2017,
        docket__isnull=True,
    ).order_by('pk')
    for i, idb_row in enumerate(queryset_generator(idb_rows)):
        # Iterate over all items in the IDB and find them in the Docket
        # table. If they're not there, create a new item.
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break
        # Narrow by court, docket number core, and office prefix, and
        # exclude criminal / sealed / suppressed / search-warrant matter.
        ds = Docket.objects.filter(
            docket_number_core=idb_row.docket_number,
            court=idb_row.district,
            docket_number__startswith='%s:' % idb_row.office
        ).exclude(
            docket_number__icontains='cr'
        ).exclude(
            case_name__icontains="sealed"
        ).exclude(
            case_name__icontains='suppressed'
        ).exclude(
            case_name__icontains='search warrant'
        )
        count = ds.count()
        if count == 0:
            logger.info("%s: Creating new docket for IDB row: %s",
                        i, idb_row)
            create_new_docket_from_idb(idb_row.pk)
            continue
        elif count == 1:
            d = ds[0]
            logger.info("%s: Merging Docket %s with IDB row: %s",
                        i, d, idb_row)
            merge_docket_with_idb(d.pk, idb_row.pk)
            continue

        logger.info("%s: Still have %s results after office and civil "
                    "docket number filtering. Filtering further.",
                    i, count)
        # Fall back to fuzzy case-name matching, truncating each party
        # name to 30 chars so long boilerplate doesn't dominate.
        case_names = []
        for d in ds:
            case_name = harmonize(d.case_name)
            parts = case_name.lower().split(' v. ')
            if len(parts) == 1:
                case_names.append(case_name)
            elif len(parts) == 2:
                plaintiff, defendant = parts[0], parts[1]
                case_names.append(
                    '%s v. %s' % (plaintiff[0:30], defendant[0:30])
                )
            elif len(parts) > 2:
                case_names.append(case_name)
        idb_case_name = harmonize('%s v. %s' % (idb_row.plaintiff,
                                                idb_row.defendant))
        results = find_best_match(case_names, idb_case_name,
                                  case_sensitive=False)
        if results['ratio'] > 0.65:
            logger.info("%s Found good match by case name for %s: %s",
                        i, idb_case_name, results['match_str'])
            d = ds[results['match_index']]
            merge_docket_with_idb(d.pk, idb_row.pk)
        else:
            logger.info("%s No good match after office and case name "
                        "filtering. Creating new item: %s", i, idb_row)
            create_new_docket_from_idb(idb_row.pk)
def handle(self, *args, **options):
    """Identify parallel citations and save them as requested.

    This process proceeds in two phases. The first phase is to work through
    the entire corpus, identifying citations that occur very near to each
    other. These are considered parallel citations, and they are built into
    a graph data structure where citations are nodes and each parallel
    citation is an edge. The weight of each edge is determined by the
    number of times a parallel citation has been identified between two
    citations. This should solve problems like typos or other issues with
    our heuristic approach.

    The second phase of this process is to update the database with the
    high quality citations. This can only be done by matching the citations
    with actual items in the database and then updating them with parallel
    citations that are sufficiently likely to be good.
    """
    super(Command, self).handle(*args, **options)
    # Require either a list of specific opinions or the --all flag.
    no_option = not any([options.get("doc_id"), options.get("all")])
    if no_option:
        raise CommandError(
            "Please specify if you want all items or a specific item.")
    if not options["update_database"]:
        logger.info(
            "--update_database is not set. No changes will be made to the "
            "database.")

    logger.info("## Entering phase one: Building a network object of "
                "all citations.\n")
    q = Opinion.objects.all()
    if options.get("doc_id"):
        q = q.filter(pk__in=options["doc_id"])
    count = q.count()
    # Generator keeps memory flat while walking a potentially huge table.
    opinions = queryset_generator(q, chunksize=10000)

    node_count = edge_count = completed = 0
    subtasks = []
    for o in opinions:
        subtasks.append(
            # This will call the second function with the results from the
            # first.
            get_document_citations.s(o) | identify_parallel_citations.s())
        last_item = count == completed + 1
        # Dispatch the accumulated Celery chains in batches (of up to 50),
        # and always flush the final partial batch on the last item.
        if (completed % 50 == 0) or last_item:
            job = group(subtasks)
            # .join() blocks until the whole batch is done.
            result = job.apply_async().join()
            # Side-effecting comprehension: folds each batch's citation
            # groups into self.g, the shared citation network.
            [
                self.add_groups_to_network(citation_groups)
                for citation_groups in result
            ]
            subtasks = []
        completed += 1
        if completed % 250 == 0 or last_item:
            # Only do this once in a while. Counting nodes/edges on every
            # iteration would be needlessly slow on a big graph.
            node_count = len(self.g.nodes())
            edge_count = len(self.g.edges())
            sys.stdout.write("\r Completed %s of %s. (%s nodes, %s edges)" %
                             (completed, count, node_count, edge_count))
            sys.stdout.flush()

    logger.info("\n\n## Entering phase two: Saving the best edges to "
                "the database.\n\n")
    # Each connected component is one cluster of (possibly) parallel
    # citations; handle_subgraph decides which edges are good enough to
    # persist.
    for sub_graph in nx.connected_component_subgraphs(self.g):
        self.handle_subgraph(sub_graph, options)

    logger.info("\n\n## Done. Added %s new citations." % self.update_count)

    self.do_solr(options)
def write_json_to_disk(courts, obj_type_str, obj_class, court_attr,
                       serializer, bulk_dir):
    """Write all items to disk as json files inside directories named by
    jurisdiction.

    The main trick is that we identify if we are creating a bulk archive
    from scratch. If so, we iterate over everything. If not, we only
    iterate over items that have been modified since the last good date.

    We deal with two kinds of bulk data. The first is jurisdiction-centric,
    in which we want to make bulk data for that particular jurisdiction,
    such as opinions or PACER data, or whatever. The second is
    non-jurisdiction-specific, like people or schools. For
    jurisdiction-specific data, we make jurisdiction directories to put the
    data into. Otherwise, we do not.

    :param courts: Court objects that you expect to make data for.
    :param obj_type_str: A string to use for the directory name of a type
    of data. For example, for clusters, it's 'clusters'.
    :param obj_class: The actual class to make a bulk data for.
    :param court_attr: A string that can be used to find the court
    attribute on an object. For example, on clusters, this is currently
    docket.court_id.
    :param serializer: A DRF serializer to use to generate the data.
    :param bulk_dir: A directory to place the serialized JSON data into.

    :returns int: The number of items generated
    """
    # Are there already bulk files?
    history = BulkJsonHistory(obj_type_str, bulk_dir)
    last_good_date = history.get_last_good_date()
    history.add_current_attempt_and_save()

    if court_attr is not None:
        # Create a directory for every jurisdiction, if they don't already
        # exist. This does not clobber.
        # NOTE(review): court.pk is passed straight to join(); assumes Court
        # pks are strings — confirm against the Court model.
        for court in courts:
            mkdir_p(join(
                bulk_dir,
                obj_type_str,
                court.pk,
            ))
    else:
        # Make a directory for the object type.
        mkdir_p(join(bulk_dir, obj_type_str))

    if last_good_date is not None:
        # Incremental run: only items touched since the last success.
        print(" - Incremental data found. Assuming it's good and using it...")
        qs = obj_class.objects.filter(date_modified__gte=last_good_date)
    else:
        print(" - Incremental data not found. Working from scratch...")
        qs = obj_class.objects.all()

    if qs.count() == 0:
        print(" - No %s-type items in the DB or none that have changed. All "
              "done here." % obj_type_str)
        history.mark_success_and_save()
        return 0
    else:
        if type(qs[0].pk) == int:
            # Integer pks can be chunked efficiently by the generator.
            item_list = queryset_generator(qs)
        else:
            # Necessary for Court objects, which don't have ints for ids.
            item_list = qs

        i = 0
        renderer = JSONRenderer()
        # Build a fake HTTPS request so DRF's hyperlinked serializers emit
        # production-looking absolute URLs.
        r = RequestFactory().request()
        r.META['SERVER_NAME'] = 'www.courtlistener.com'  # Else, it's testserver
        r.META['SERVER_PORT'] = '443'  # Else, it's 80
        r.META['wsgi.url_scheme'] = 'https'  # Else, it's http.
        r.version = 'v3'
        r.versioning_scheme = URLPathVersioning()
        context = dict(request=r)
        for item in item_list:
            if i % 1000 == 0:
                print("Completed %s items so far." % i)
            json_str = renderer.render(
                serializer(item, context=context).data,
                accepted_media_type='application/json; indent=2',
            )

            if court_attr is not None:
                # Jurisdiction-centric: nest the file under its court dir.
                loc = join(bulk_dir, obj_type_str,
                           deepgetattr(item, court_attr),
                           '%s.json' % item.pk)
            else:
                # A non-jurisdiction-centric object.
                loc = join(bulk_dir, obj_type_str, '%s.json' % item.pk)

            # 'wb' because JSONRenderer.render() returns bytes.
            with open(loc, 'wb') as f:
                f.write(json_str)
            i += 1

        print (' - %s %s json files created.' % (i, obj_type_str))

        history.mark_success_and_save()

        return i
def write_json_to_disk(courts, obj_type_str, obj_class, court_attr,
                       serializer, bulk_dir):
    """Serialize every relevant item of ``obj_class`` to a JSON file on disk.

    Runs incrementally when a previous successful run is recorded (only items
    modified since the last good date are regenerated); otherwise it rebuilds
    everything. Jurisdiction-centric data (``court_attr`` set) is filed into
    per-court subdirectories; everything else goes directly under the type
    directory.

    :param courts: Court objects that you expect to make data for.
    :param obj_type_str: Directory name for this type of data (e.g.
        'clusters').
    :param obj_class: The model class to serialize.
    :param court_attr: Dotted attribute path to the court on each object
        (e.g. docket.court_id), or None for non-jurisdictional data.
    :param serializer: A DRF serializer class used to render each item.
    :param bulk_dir: Root directory to place the serialized JSON into.
    :returns int: The number of items written.
    """
    # Record this attempt and find out whether we can work incrementally.
    history = BulkJsonHistory(obj_type_str, bulk_dir)
    last_good_date = history.get_last_good_date()
    history.add_current_attempt_and_save()

    # Ensure the output directory tree exists (mkdir_p never clobbers).
    if court_attr is None:
        mkdir_p(join(bulk_dir, obj_type_str))
    else:
        for court in courts:
            mkdir_p(join(bulk_dir, obj_type_str, court.pk))

    # Choose the queryset: incremental when possible, full rebuild otherwise.
    if last_good_date is None:
        print(" - Incremental data not found. Working from scratch...")
        qs = obj_class.objects.all()
    else:
        print(
            " - Incremental data found. Assuming it's good and using it...")
        qs = obj_class.objects.filter(date_modified__gte=last_good_date)

    if qs.count() == 0:
        print(" - No %s-type items in the DB or none that have changed. All "
              "done here." % obj_type_str)
        history.mark_success_and_save()
        return 0

    if type(qs[0].pk) == int:
        # Integer pks let us stream the table in memory-friendly chunks.
        items = queryset_generator(qs)
    else:
        # Necessary for Court objects, which don't have ints for ids.
        items = qs

    # Fake enough of an HTTPS production request that DRF's hyperlinked
    # serializers emit www.courtlistener.com URLs instead of testserver ones.
    request = RequestFactory().request()
    request.META[
        "SERVER_NAME"] = "www.courtlistener.com"  # Else, it's testserver
    request.META["SERVER_PORT"] = "443"  # Else, it's 80
    request.META["wsgi.url_scheme"] = "https"  # Else, it's http.
    request.version = "v3"
    request.versioning_scheme = URLPathVersioning()
    ctx = {"request": request}
    json_renderer = JSONRenderer()

    num_written = 0
    for item in items:
        if num_written % 1000 == 0:
            print("Completed %s items so far." % num_written)

        payload = serializer(item, context=ctx).data
        rendered = json_renderer.render(
            payload,
            accepted_media_type="application/json; indent=2",
        )

        if court_attr is None:
            # A non-jurisdiction-centric object.
            out_path = join(bulk_dir, obj_type_str, "%s.json" % item.pk)
        else:
            out_path = join(
                bulk_dir,
                obj_type_str,
                deepgetattr(item, court_attr),
                "%s.json" % item.pk,
            )

        # Binary mode: JSONRenderer.render() yields bytes.
        with open(out_path, "wb") as f:
            f.write(rendered)
        num_written += 1

    print(" - %s %s json files created." % (num_written, obj_type_str))
    history.mark_success_and_save()
    return num_written
def write_json_to_disk(courts, obj_type_str, obj_type, court_attr,
                       serializer):
    """Write all items to disk as json files inside directories named by
    jurisdiction.

    The main trick is that we identify if we are creating a bulk archive
    from scratch. If so, we iterate over everything. If not, we only
    iterate over items that have been modified in the last 32 days because
    it's assumed that the bulk files are generated once per month.

    :param courts: Court objects to make per-jurisdiction directories for.
    :param obj_type_str: Directory name for this type of data.
    :param obj_type: The model class to serialize.
    :param court_attr: Dotted attribute path to the court on each object,
        or None for non-jurisdictional data.
    :param serializer: A DRF serializer class used to render each item.
    :returns: The number of items written (0 when nothing changed).
    """
    # Are there already bulk files?
    history = BulkJsonHistory(obj_type_str)
    last_good_date = history.get_last_good_date()
    history.add_current_attempt_and_save()

    if court_attr is not None:
        # Create a directory for every jurisdiction, if they don't already
        # exist. This does not clobber.
        for court in courts:
            mkdir_p(join(
                settings.BULK_DATA_DIR,
                'tmp',
                obj_type_str,
                court.pk,
            ))

    if last_good_date is not None:
        # Incremental run: only items touched since the last success.
        print " - Incremental data found. Assuming it's good and using it..."
        qs = obj_type.objects.filter(date_modified__gte=last_good_date)
    else:
        print " - Incremental data not found. Working from scratch..."
        qs = obj_type.objects.all()

    if qs.count() == 0:
        print " - No %s-type items in the DB or none that have changed. All done here." % obj_type_str
        history.mark_success_and_save()
        return 0
    else:
        if type(qs[0].pk) == int:
            # Integer pks can be chunked efficiently by the generator.
            item_list = queryset_generator(qs)
        else:
            # Necessary for Court objects, which don't have ints for ids.
            item_list = qs

        i = 0
        renderer = JSONRenderer()
        # Build a fake request so DRF's hyperlinked serializers emit
        # production-looking URLs.
        # NOTE(review): unlike the newer versions of this function, only
        # SERVER_NAME is set here, so generated links presumably come out as
        # http on port 80 — confirm whether that's intended.
        r = RequestFactory().request()
        r.META['SERVER_NAME'] = 'www.courtlistener.com'  # Else, it's testserver
        r.version = 'v3'
        r.versioning_scheme = URLPathVersioning()
        context = dict(request=r)
        for item in item_list:
            json_str = renderer.render(
                serializer(item, context=context).data,
                accepted_media_type='application/json; indent=2',
            )

            if court_attr is not None:
                # Jurisdiction-centric: nest the file under its court dir.
                loc = join(settings.BULK_DATA_DIR, 'tmp', obj_type_str,
                           deepgetattr(item, court_attr),
                           '%s.json' % item.pk)
            else:
                # A non-jurisdiction-centric object.
                loc = join(settings.BULK_DATA_DIR, 'tmp', obj_type_str,
                           '%s.json' % item.pk)

            # 'wb' because JSONRenderer.render() returns bytes.
            with open(loc, 'wb') as f:
                f.write(json_str)
            i += 1

        print ' - %s %s json files created.' % (i, obj_type_str)

        history.mark_success_and_save()

        return i
def write_json_to_disk(courts, obj_type_str, obj_type, court_attr,
                       serializer):
    """Write all items to disk as json files inside directories named by
    jurisdiction.

    The main trick is that we identify if we are creating a bulk archive
    from scratch. If so, we iterate over everything. If not, we only
    iterate over items that have been modified since the last good date.

    :param courts: Court objects to make per-jurisdiction directories for.
    :param obj_type_str: Directory name for this type of data.
    :param obj_type: The model class to serialize.
    :param court_attr: Dotted attribute path to the court on each object,
        or None for non-jurisdictional data.
    :param serializer: A DRF serializer class used to render each item.
    :returns: The number of items written (0 when nothing changed).
    """
    # Are there already bulk files?
    history = BulkJsonHistory(obj_type_str)
    last_good_date = history.get_last_good_date()
    history.add_current_attempt_and_save()

    if court_attr is not None:
        # Create a directory for every jurisdiction, if they don't already
        # exist. This does not clobber.
        for court in courts:
            mkdir_p(
                join(
                    settings.BULK_DATA_DIR,
                    'tmp',
                    obj_type_str,
                    court.pk,
                ))

    if last_good_date is not None:
        # Incremental run: only items touched since the last success.
        print " - Incremental data found. Assuming it's good and using it..."
        qs = obj_type.objects.filter(date_modified__gte=last_good_date)
    else:
        print " - Incremental data not found. Working from scratch..."
        qs = obj_type.objects.all()

    if qs.count() == 0:
        print " - No %s-type items in the DB or none that have changed. All done here." % obj_type_str
        history.mark_success_and_save()
        return 0
    else:
        if type(qs[0].pk) == int:
            # Integer pks can be chunked efficiently by the generator.
            item_list = queryset_generator(qs)
        else:
            # Necessary for Court objects, which don't have ints for ids.
            item_list = qs

        i = 0
        renderer = JSONRenderer()
        # Build a fake request so DRF's hyperlinked serializers emit
        # production-looking URLs.
        # NOTE(review): only SERVER_NAME is set here (no port/scheme), so
        # links presumably come out http/port-80 — confirm that's intended.
        r = RequestFactory().request()
        r.META[
            'SERVER_NAME'] = 'www.courtlistener.com'  # Else, it's testserver
        r.version = 'v3'
        r.versioning_scheme = URLPathVersioning()
        context = dict(request=r)
        for item in item_list:
            json_str = renderer.render(
                serializer(item, context=context).data,
                accepted_media_type='application/json; indent=2',
            )

            if court_attr is not None:
                # Jurisdiction-centric: nest the file under its court dir.
                loc = join(settings.BULK_DATA_DIR, 'tmp', obj_type_str,
                           deepgetattr(item, court_attr),
                           '%s.json' % item.pk)
            else:
                # A non-jurisdiction-centric object.
                loc = join(settings.BULK_DATA_DIR, 'tmp', obj_type_str,
                           '%s.json' % item.pk)

            # 'wb' because JSONRenderer.render() returns bytes.
            with open(loc, 'wb') as f:
                f.write(json_str)
            i += 1

        print ' - %s %s json files created.' % (i, obj_type_str)

        history.mark_success_and_save()

        return i
def migrate_opinions_oral_args_and_dockets(self):
    """Copy dockets, audio files, and opinion documents from the 'old'
    database into the new schema on 'default'.

    For each old docket, at most one audio file and one document are
    migrated. Documents become an OpinionCluster plus an Opinion in the new
    model. Saves use index=False where supported so search indexing is
    deferred.
    """
    self.stdout.write("Migrating dockets, audio files, and opinions to new "
                      "database...")
    q = DocketOld.objects.using('old').all()
    # Stream the old table to keep memory flat; count separately for the
    # progress display.
    old_dockets = queryset_generator(q)
    num_dockets = q.count()

    progress = 0
    self._print_progress(progress, num_dockets)
    for old_docket in old_dockets:
        # First do the docket, then create the cluster and opinion objects.
        # EAFP: grab the first related audio/document, tolerating none.
        try:
            old_audio = old_docket.audio_files.all()[0]
        except IndexError:
            old_audio = None
        try:
            old_document = old_docket.documents.all()[0]
        except IndexError:
            old_document = None
        if old_document is not None:
            old_citation = old_document.citation
            old_doc_case_name, old_doc_case_name_full, \
                old_doc_case_name_short = self._get_case_names(
                    old_citation.case_name)
        if old_audio is not None:
            old_audio_case_name, old_audio_case_name_full, \
                old_audio_case_name_short = self._get_case_names(
                    old_audio.case_name)

        # NOTE(review): old_doc_case_name* and old_citation are only
        # (re)assigned when old_document is not None, yet they are used
        # unconditionally for new_docket below. A docket with audio but no
        # document would hit a NameError on the first such iteration, or
        # silently reuse values from the previous loop iteration — confirm
        # every old docket is guaranteed a document.
        court = CourtNew.objects.get(
            pk=old_docket.court_id)  # Courts are in place thanks to initial data.

        new_docket = DocketNew(
            pk=old_docket.pk,
            date_modified=old_docket.date_modified,
            # The old schema has no creation date, so date_modified stands in.
            date_created=old_docket.date_modified,
            court=court,
            case_name=old_doc_case_name,
            case_name_full=old_doc_case_name_full,
            case_name_short=old_doc_case_name_short,
            slug=self._none_to_blank(old_docket.slug),
            docket_number=self._none_to_blank(old_citation.docket_number),
            date_blocked=old_docket.date_blocked,
            blocked=old_docket.blocked,
        )
        if old_audio is not None:
            new_docket.date_argued = old_audio.date_argued
        new_docket.save(using='default')

        if old_document is not None:
            # One cluster per old document; citation fields are flattened
            # onto the cluster, with None normalized to ''.
            new_opinion_cluster = OpinionClusterNew(
                pk=old_document.pk,
                docket=new_docket,
                judges=self._none_to_blank(old_document.judges),
                date_modified=old_document.date_modified,
                date_created=old_document.date_modified,
                date_filed=old_document.date_filed,
                slug=self._none_to_blank(old_citation.slug),
                citation_id=old_document.citation_id,
                case_name_short=old_doc_case_name_short,
                case_name=old_doc_case_name,
                case_name_full=old_doc_case_name_full,
                federal_cite_one=self._none_to_blank(
                    old_citation.federal_cite_one),
                federal_cite_two=self._none_to_blank(
                    old_citation.federal_cite_two),
                federal_cite_three=self._none_to_blank(
                    old_citation.federal_cite_three),
                state_cite_one=self._none_to_blank(
                    old_citation.state_cite_one),
                state_cite_two=self._none_to_blank(
                    old_citation.state_cite_two),
                state_cite_three=self._none_to_blank(
                    old_citation.state_cite_three),
                state_cite_regional=self._none_to_blank(
                    old_citation.state_cite_regional),
                specialty_cite_one=self._none_to_blank(
                    old_citation.specialty_cite_one),
                scotus_early_cite=self._none_to_blank(
                    old_citation.scotus_early_cite),
                lexis_cite=self._none_to_blank(old_citation.lexis_cite),
                westlaw_cite=self._none_to_blank(old_citation.westlaw_cite),
                neutral_cite=self._none_to_blank(old_citation.neutral_cite),
                scdb_id=self._none_to_blank(
                    old_document.supreme_court_db_id),
                source=old_document.source,
                nature_of_suit=old_document.nature_of_suit,
                citation_count=old_document.citation_count,
                precedential_status=old_document.precedential_status,
                date_blocked=old_document.date_blocked,
                blocked=old_document.blocked,
            )
            # index=False defers search-engine indexing during migration.
            new_opinion_cluster.save(
                using='default',
                index=False,
            )

            new_opinion = OpinionNew(
                pk=old_document.pk,
                cluster=new_opinion_cluster,
                date_modified=old_document.date_modified,
                date_created=old_document.time_retrieved,
                type='010combined',
                sha1=old_document.sha1,
                download_url=old_document.download_url,
                local_path=old_document.local_path,
                plain_text=old_document.plain_text,
                html=self._none_to_blank(old_document.html),
                html_lawbox=self._none_to_blank(old_document.html_lawbox),
                html_with_citations=old_document.html_with_citations,
                extracted_by_ocr=old_document.extracted_by_ocr,
            )
            new_opinion.save(
                using='default',
                index=False,
            )

        if old_audio is not None:
            new_audio_file = AudioNew(
                pk=old_audio.pk,
                docket=new_docket,
                source=old_audio.source,
                case_name=old_audio_case_name,
                case_name_short=old_audio_case_name_short,
                case_name_full=old_audio_case_name_full,
                judges=self._none_to_blank(old_audio.judges),
                date_created=old_audio.time_retrieved,
                date_modified=old_audio.date_modified,
                sha1=old_audio.sha1,
                download_url=old_audio.download_url,
                local_path_mp3=old_audio.local_path_mp3,
                local_path_original_file=old_audio.local_path_original_file,
                duration=old_audio.duration,
                processing_complete=old_audio.processing_complete,
                date_blocked=old_audio.date_blocked,
                blocked=old_audio.blocked,
            )
            new_audio_file.save(
                using='default',
                index=False,
            )

        progress += 1
        self._print_progress(progress, num_dockets)
    self.stdout.write(u'')  # Newline
def do_second_pass(options):
    """Merge the IDB rows left unmatched by the first pass.

    The first pass let duplicates stack up for later analysis; here we try
    harder, applying more aggressive filtering and algorithmic (fuzzy case
    name) selection to merge each leftover row into the DB, or to create a
    fresh docket when no good match exists.
    """
    # Only CV_2017 rows that never got a docket on the first pass.
    idb_rows = FjcIntegratedDatabase.objects.filter(
        dataset_source=CV_2017,
        docket__isnull=True,
    ).order_by('pk')
    for i, idb_row in enumerate(queryset_generator(idb_rows)):
        # Honor the requested offset; a limit of 0 means "no limit" thanks
        # to the chained comparison below.
        if i < options['offset']:
            continue
        if i >= options['limit'] > 0:
            break

        # Candidate dockets: same core number, district, and office prefix.
        # Criminal, sealed, suppressed, and warrant matters are excluded.
        candidates = Docket.objects.filter(
            docket_number_core=idb_row.docket_number,
            court=idb_row.district,
            docket_number__startswith='%s:' % idb_row.office,
        ).exclude(
            docket_number__icontains='cr',
        ).exclude(
            case_name__icontains="sealed",
        ).exclude(
            case_name__icontains='suppressed',
        ).exclude(
            case_name__icontains='search warrant',
        )
        num_candidates = candidates.count()

        # Zero candidates: nothing to merge with; create a fresh docket.
        if num_candidates == 0:
            logger.info("%s: Creating new docket for IDB row: %s", i,
                        idb_row)
            create_new_docket_from_idb(idb_row.pk)
            continue

        # Exactly one candidate: merge it directly.
        if num_candidates == 1:
            docket = candidates[0]
            logger.info("%s: Merging Docket %s with IDB row: %s", i,
                        docket, idb_row)
            merge_docket_with_idb(docket.pk, idb_row.pk)
            continue

        logger.info(
            "%s: Still have %s results after office and civil "
            "docket number filtering. Filtering further.", i, num_candidates)

        # Disambiguate multiple candidates by fuzzy case-name matching,
        # truncating each party name to 30 chars when a caption splits
        # cleanly into plaintiff and defendant.
        candidate_names = []
        for docket in candidates:
            harmonized = harmonize(docket.case_name)
            pieces = harmonized.lower().split(' v. ')
            if len(pieces) == 2:
                candidate_names.append(
                    '%s v. %s' % (pieces[0][:30], pieces[1][:30]))
            else:
                # Either no " v. " at all or several of them; use the full
                # harmonized caption unchanged.
                candidate_names.append(harmonized)

        idb_case_name = harmonize(
            '%s v. %s' % (idb_row.plaintiff, idb_row.defendant))
        match = find_best_match(candidate_names, idb_case_name,
                                case_sensitive=False)

        # 0.65 is the similarity threshold for treating two captions as the
        # same case.
        if match['ratio'] > 0.65:
            logger.info("%s Found good match by case name for %s: %s", i,
                        idb_case_name, match['match_str'])
            docket = candidates[match['match_index']]
            merge_docket_with_idb(docket.pk, idb_row.pk)
        else:
            logger.info(
                "%s No good match after office and case name "
                "filtering. Creating new item: %s", i, idb_row)
            create_new_docket_from_idb(idb_row.pk)