def setUp(self):
    # Set up some handy variables
    self.court = Court.objects.get(pk='test')
    self.client = Client()

    # Add a document to the index
    site = test_opinion_scraper.Site().parse()
    cite = Citation(
        docket_number=site.docket_numbers[0],
        neutral_cite=site.neutral_citations[0],
        federal_cite_one=site.west_citations[0],
    )
    cite.save(index=False)
    docket = Docket(
        court=self.court,
        case_name=site.case_names[0],
    )
    docket.save()
    self.doc = Document(
        date_filed=site.case_dates[0],
        citation=cite,
        docket=docket,
        precedential_status=site.precedential_statuses[0],
    )
    self.doc.save(index=False)
def test_solr_ingestion_and_deletion(self):
    """Do items get added to the Solr index when they are ingested?"""
    site = test_opinion_scraper.Site().parse()
    path = os.path.join(settings.INSTALL_ROOT, 'alert',
                        site.download_urls[0])  # a simple PDF
    with open(path) as f:
        content = f.read()
        cf = ContentFile(content)
        extension = get_extension(content)
    cite = Citation()
    cite.save(index=False)
    docket = Docket(
        court=self.court,
        case_name=site.case_names[0],
    )
    docket.save()
    doc = Document(
        date_filed=site.case_dates[0],
        docket=docket,
        citation=cite,
    )
    file_name = trunc(site.case_names[0].lower(), 75) + extension
    doc.local_path.save(file_name, cf, save=False)
    doc.save(index=False)
    extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
    response = self.si.raw_query(**{
        'q': 'supreme',
        'caller': 'scraper_test',
    }).execute()
    count = response.result.numFound
    self.assertEqual(
        count,
        1,
        "There were %s items found when there should have been 1" % count
    )
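    # A hedged sketch, not in the original: the method name promises a
    # deletion check as well, so presumably the test should also delete the
    # document and confirm the index is empty again. This assumes deleting a
    # Document also removes it from the Solr index.
    doc.delete()
    response = self.si.raw_query(**{
        'q': 'supreme',
        'caller': 'scraper_test',
    }).execute()
    count = response.result.numFound
    self.assertEqual(
        count,
        0,
        "There were %s items found when there should have been 0" % count
    )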
def test_abort_on_changed_court_website(self):
    """Similar to the above, but we create a url2Hash with a different
    hash before checking if it exists.
    """
    site = test_opinion_scraper.Site()
    site.hash = 'this is a dummy hash code string'
    for dup_checker in self.dup_checkers:
        urlToHash(pk=site.url, SHA1=site.hash).save()
        abort = dup_checker.abort_by_url_hash(
            site.url, "this is a *different* hash!")
        if dup_checker.full_crawl:
            self.assertFalse(
                abort,
                "DupChecker says to abort during a full crawl.")
        else:
            self.assertFalse(
                abort,
                "DupChecker says to abort on a court where the hash has "
                "changed."
            )
        dup_checker.url2Hash.delete()
def test_abort_on_unchanged_court_website(self):
    """Similar to the above, but we create a url2Hash object before
    checking if it exists.
    """
    site = test_opinion_scraper.Site()
    site.hash = 'this is a dummy hash code string'
    for dup_checker in self.dup_checkers:
        urlToHash(pk=site.url, SHA1=site.hash).save()
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if dup_checker.full_crawl:
            self.assertFalse(
                abort,
                "DupChecker says to abort during a full crawl.")
        else:
            self.assertTrue(
                abort,
                "DupChecker says not to abort on a court that's been "
                "crawled before with the same hash.")
        dup_checker.url2Hash.delete()
def test_abort_when_new_court_website(self):
    """Tests what happens when a new website is discovered."""
    site = test_opinion_scraper.Site()
    site.hash = 'this is a dummy hash code string'
    for dup_checker in self.dup_checkers:
        abort = dup_checker.abort_by_url_hash(site.url, site.hash)
        if dup_checker.full_crawl:
            self.assertFalse(
                abort,
                "DupChecker says to abort during a full crawl.")
        else:
            self.assertFalse(
                abort,
                "DupChecker says to abort on a court that's never been "
                "crawled before.")

        # The checking function creates urlToHash objects, which we must
        # delete as part of cleanup.
        dup_checker.url2Hash.delete()
def test_content_extraction(self):
    """Do all of the supported mimetypes get extracted to text
    successfully, including OCR?
    """
    site = test_opinion_scraper.Site().parse()
    test_strings = [
        'supreme',
        'intelligence',
        'indiana',
        'reagan',
        'indiana',
        'fidelity',
    ]
    for i in range(len(site.case_names)):
        path = os.path.join(settings.INSTALL_ROOT, 'alert',
                            site.download_urls[i])
        with open(path) as f:
            content = f.read()
            cf = ContentFile(content)
            extension = get_extension(content)
        cite = Citation()
        cite.save(index=False)
        docket = Docket(
            case_name=site.case_names[i],
            court=self.court,
        )
        docket.save()
        doc = Document(
            date_filed=site.case_dates[i],
            citation=cite,
            docket=docket,
        )
        file_name = trunc(site.case_names[i].lower(), 75) + extension
        doc.local_path.save(file_name, cf, save=False)
        doc.save(index=False)
        doc = extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
        if extension in ['.html', '.wpd']:
            self.assertIn(test_strings[i], doc.html.lower())
        else:
            self.assertIn(test_strings[i], doc.plain_text.lower())
        doc.delete()
def setUp(self):
    # Set up some handy variables
    self.court = Court.objects.get(pk='test')

    # Set up testing cores in Solr and swap them in
    self.core_name_opinion = '%s.opinion-test-%s' % \
        (self.__module__, time.time())
    self.core_name_audio = '%s.audio-test-%s' % \
        (self.__module__, time.time())
    create_solr_core(self.core_name_opinion)
    create_solr_core(
        self.core_name_audio,
        schema=os.path.join(settings.INSTALL_ROOT, 'Solr', 'conf',
                            'audio_schema.xml'),
        instance_dir='/usr/local/solr/example/solr/audio',
    )
    swap_solr_core('collection1', self.core_name_opinion)
    swap_solr_core('audio', self.core_name_audio)
    self.si_opinion = sunburnt.SolrInterface(settings.SOLR_OPINION_URL,
                                             mode='rw')
    self.si_audio = sunburnt.SolrInterface(settings.SOLR_AUDIO_URL,
                                           mode='rw')

    # Add three documents and three audio files to the index, but don't
    # extract their contents
    self.site_opinion = test_opinion_scraper.Site().parse()
    self.site_audio = test_oral_arg_scraper.Site().parse()
    cite_counts = (4, 6, 8)
    self.docs = {}
    for i in range(0, 3):
        cite = Citation(
            case_name=self.site_opinion.case_names[i],
            docket_number=self.site_opinion.docket_numbers[i],
            neutral_cite=self.site_opinion.neutral_citations[i],
            federal_cite_one=self.site_opinion.west_citations[i],
        )
        cite.save(index=False)
        docket = Docket(
            case_name=self.site_opinion.case_names[i],
            court=self.court,
        )
        docket.save()
        self.docs[i] = Document(
            date_filed=self.site_opinion.case_dates[i],
            citation=cite,
            docket=docket,
            precedential_status=self.site_opinion.precedential_statuses[i],
            citation_count=cite_counts[i],
            nature_of_suit=self.site_opinion.nature_of_suit[i],
            judges=self.site_opinion.judges[i],
        )
        self.docs[i].save()

    # Create citations between the documents
    # 0 ---cites--> 1, 2
    # 1 ---cites--> 2
    # 2 ---cites--> 0
    self.docs[0].cases_cited.add(self.docs[1].citation)
    self.docs[0].cases_cited.add(self.docs[2].citation)
    self.docs[1].cases_cited.add(self.docs[2].citation)
    self.docs[2].cases_cited.add(self.docs[0].citation)
    for doc in self.docs.itervalues():
        doc.save()

    # Scrape the audio "site" and add its contents
    site = test_oral_arg_scraper.Site().parse()
    Command().scrape_court(site, full_crawl=True)

    self.expected_num_results_opinion = 3
    self.expected_num_results_audio = 2

    self.si_opinion.commit()
    self.si_audio.commit()
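# A hedged sketch, not in the original: since setUp swaps the test cores in,
# a matching tearDown presumably swaps the live cores back and removes the
# test cores so runs don't leak state. The delete_solr_core helper is an
# assumption; the real core-admin module may name its cleanup differently.
def tearDown(self):
    # Restore the original cores under their live names
    swap_solr_core(self.core_name_opinion, 'collection1')
    swap_solr_core(self.core_name_audio, 'audio')
    # Drop the temporary test cores (assumed helper)
    delete_solr_core(self.core_name_opinion)
    delete_solr_core(self.core_name_audio)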
def test_parsing_xml_opinion_site_to_site_object(self):
    """Does a basic parse of a site reveal the right number of items?"""
    site = test_opinion_scraper.Site().parse()
    self.assertEqual(len(site.case_names), 6)
def test_ingest_opinions(self):
    """Can we successfully ingest opinions at a high level?"""
    site = test_opinion_scraper.Site()
    site.method = "LOCAL"
    parsed_site = site.parse()
    OpinionCommand().scrape_court(parsed_site, full_crawl=True)