Example #1
    def test_should_we_continue_break_or_carry_on_with_dup_found_and_older_date(self):
        content = "this is dummy content that we hash"
        # hashlib.sha1() is fed a str here because these examples predate
        # Python 3, where the content would first need encoding to bytes
        content_hash = hashlib.sha1(content).hexdigest()
        for dup_checker in self.dup_checkers:
            docket = Docket(court=self.court)
            docket.save()
            doc = Document(sha1=content_hash, docket=docket)
            doc.save(index=False)
            # Note that the next case occurs prior to the current one
            onwards = dup_checker.should_we_continue_break_or_carry_on(
                Document,
                now(),
                now() - timedelta(days=1),
                lookup_value=content_hash,
                lookup_by='sha1'
            )
            if dup_checker.full_crawl:
                self.assertEqual(
                    onwards,
                    'CONTINUE',
                    'DupChecker says to %s during a full crawl.' % onwards)
            else:
                self.assertEqual(
                    onwards,
                    'BREAK',
                    "DupChecker says to %s but there should be a duplicate in "
                    "the database. dup_count is %s, and dup_threshold is %s" %
                    (onwards, dup_checker.dup_count, dup_checker.dup_threshold)
                )
            doc.delete()
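
The assertions above pin down a three-way protocol: carry on for a new
document, 'CONTINUE' to skip a known duplicate, and 'BREAK' once enough
duplicates have been seen on an incremental crawl. The following minimal,
self-contained sketch mimics that contract with a hypothetical
FakeDupChecker; it illustrates the signal protocol implied by the tests
(the exact 'CARRY_ON' string is inferred from the method name), not the
project's actual DupChecker internals.

class FakeDupChecker(object):
    """Hypothetical stand-in that mimics only the signal contract."""
    def __init__(self, full_crawl=False, dup_threshold=2):
        self.full_crawl = full_crawl
        self.dup_threshold = dup_threshold
        self.dup_count = 0

    def check(self, known_hashes, sha1):
        if sha1 not in known_hashes:
            return 'CARRY_ON'  # new document: the caller should save it
        if self.full_crawl:
            return 'CONTINUE'  # full crawls skip dups but never stop early
        self.dup_count += 1
        if self.dup_count > self.dup_threshold:
            return 'BREAK'  # enough dups seen: assume the rest are old
        return 'CONTINUE'

checker = FakeDupChecker(full_crawl=False, dup_threshold=1)
known = {'aaa', 'bbb'}
for sha1 in ('aaa', 'bbb', 'ccc'):
    # prints: aaa -> CONTINUE, bbb -> BREAK, ccc -> CARRY_ON (a real
    # crawler would stop the loop as soon as it saw BREAK)
    print('%s -> %s' % (sha1, checker.check(known, sha1)))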
Example #2
class ViewDocumentTest(TestCase):
    fixtures = ['test_court.json']

    def setUp(self):
        # Set up some handy variables
        self.court = Court.objects.get(pk='test')
        self.client = Client()

        # Add a document to the index
        site = test_scraper.Site().parse()
        cite = Citation(case_name=site.case_names[0],
                        docket_number=site.docket_numbers[0],
                        neutral_cite=site.neutral_citations[0],
                        federal_cite_one=site.west_citations[0])
        cite.save(index=False)
        self.doc = Document(date_filed=site.case_dates[0],
                            court=self.court,
                            citation=cite,
                            precedential_status=site.precedential_statuses[0])
        self.doc.save(index=False)

    def tearDown(self):
        self.doc.delete()

    def test_simple_url_check_for_document(self):
        """Does the page load properly?"""
        response = self.client.get('/test/2/asdf/')
        self.assertEqual(response.status_code, 200)
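        # Under Python 2, which these examples target, response.content is
        # a str; on Python 3 it would be bytes and need decoding first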
        self.assertIn('Tarrant', response.content)
Example #3
    def test_should_we_continue_break_or_carry_on_with_dup_found_and_older_date(
            self):
        content = "this is dummy content that we hash"
        content_hash = hashlib.sha1(content).hexdigest()
        for dup_checker in self.dup_checkers:
            docket = Docket(court=self.court)
            docket.save()
            doc = Document(sha1=content_hash, docket=docket)
            doc.save(index=False)
            # Note that the next case occurs prior to the current one
            onwards = dup_checker.should_we_continue_break_or_carry_on(
                Document,
                now(),
                now() - timedelta(days=1),
                lookup_value=content_hash,
                lookup_by='sha1')
            if dup_checker.full_crawl:
                self.assertEqual(
                    onwards, 'CONTINUE',
                    'DupChecker says to %s during a full crawl.' % onwards)
            else:
                self.assertEqual(
                    onwards, 'BREAK',
                    "DupChecker says to %s but there should be a duplicate in "
                    "the database. dup_count is %s, and dup_threshold is %s" %
                    (onwards, dup_checker.dup_count,
                     dup_checker.dup_threshold))
            doc.delete()
Example #4
    def test_should_we_continue_break_or_carry_on_with_a_dup_found(self):
        # Set the dup_threshold to zero for this test
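        # (a threshold of zero means the very first duplicate should make
        # the non-full-crawl checker say BREAK)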
        self.dup_checkers = [
            DupChecker(self.court, full_crawl=True, dup_threshold=0),
            DupChecker(self.court, full_crawl=False, dup_threshold=0)
        ]
        content = "this is dummy content that we hash"
        content_hash = hashlib.sha1(content).hexdigest()
        for dup_checker in self.dup_checkers:
            # Create a document, then use the dup_checker to see if it exists.
            docket = Docket(court=self.court)
            docket.save()
            doc = Document(sha1=content_hash, docket=docket)
            doc.save(index=False)
            onwards = dup_checker.should_we_continue_break_or_carry_on(
                Document,
                now(),
                now(),
                lookup_value=content_hash,
                lookup_by='sha1')
            if dup_checker.full_crawl:
                self.assertEqual(
                    onwards, 'CONTINUE',
                    'DupChecker says to %s during a full crawl.' % onwards)
            else:
                self.assertEqual(
                    onwards, 'BREAK',
                    "DupChecker says to %s but there should be a duplicate in "
                    "the database. dup_count is %s, and dup_threshold is %s" %
                    (onwards, dup_checker.dup_count,
                     dup_checker.dup_threshold))

            doc.delete()
Example #5
    def test_content_extraction(self):
        """Do all of the supported mimetypes get extracted to text
        successfully, including OCR?"""
        site = test_scraper.Site().parse()

        test_strings = ['supreme',
                        'intelligence',
                        'indiana',
                        'reagan',
                        'indiana',
                        'fidelity']
        for i in range(0, len(site.case_names)):
            path = os.path.join(settings.INSTALL_ROOT, 'alert', site.download_urls[i])
            with open(path) as f:
                content = f.read()
                cf = ContentFile(content)
                extension = get_extension(content)
            cite = Citation(case_name=site.case_names[i])
            cite.save(index=False)
            doc = Document(date_filed=site.case_dates[i],
                           court=self.court,
                           citation=cite)
            file_name = trunc(site.case_names[i].lower(), 75) + extension
            doc.local_path.save(file_name, cf, save=False)
            doc.save(index=False)
            doc = extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
            if extension in ['.html', '.wpd']:
                self.assertIn(test_strings[i], doc.html.lower())
            else:
                self.assertIn(test_strings[i], doc.plain_text.lower())

            doc.delete()
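
Both extraction tests lean on a get_extension() helper that infers the file
type from raw bytes. Below is a minimal sketch of the same idea, assuming
the third-party python-magic package is available; the project's real
helper may be implemented differently.

import mimetypes

import magic  # third-party python-magic, an assumption of this sketch


def get_extension_sketch(content):
    """Guess an extension such as '.pdf' or '.html' from raw file bytes."""
    mime = magic.from_buffer(content, mime=True)
    return mimetypes.guess_extension(mime) or ''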
Example #6
    def test_should_we_continue_break_or_carry_on_with_a_dup_found(self):
        # Set the dup_threshold to zero for this test
        self.dup_checkers = [
            DupChecker(self.court, full_crawl=True, dup_threshold=0),
            DupChecker(self.court, full_crawl=False, dup_threshold=0),
        ]
        content = "this is dummy content that we hash"
        content_hash = hashlib.sha1(content).hexdigest()
        for dup_checker in self.dup_checkers:
            # Create a document, then use the dup_checker to see if it exists.
            docket = Docket(court=self.court)
            docket.save()
            doc = Document(sha1=content_hash, docket=docket)
            doc.save(index=False)
            onwards = dup_checker.should_we_continue_break_or_carry_on(
                Document, now(), now(), lookup_value=content_hash, lookup_by="sha1"
            )
            if dup_checker.full_crawl:
                self.assertEqual(
                    onwards,
                    "CONTINUE",
                    "DupChecker says to %s during a full crawl." % onwards,
                )
            else:
                self.assertEqual(
                    onwards,
                    "BREAK",
                    "DupChecker says to %s but there should be a duplicate in "
                    "the database. dup_count is %s, and dup_threshold is %s"
                    % (onwards, dup_checker.dup_count, dup_checker.dup_threshold),
                )

            doc.delete()
Example #7
class SolrTestCase(TestCase):
    """A generic class that contains the setUp and tearDown functions for inheriting children.
    """
    fixtures = ['test_court.json']

    def setUp(self):
        # Set up some handy variables
        self.court = Court.objects.get(pk='test')
        self.client = Client()

        # Set up a testing core in Solr and swap it in
        self.core_name = '%s.test-%s' % (self.__module__, time.time())
        create_solr_core(self.core_name)
        swap_solr_core('collection1', self.core_name)
        self.si = sunburnt.SolrInterface(settings.SOLR_URL, mode='rw')

        # Add two documents to the index, but don't extract their contents
        self.site = test_scraper.Site().parse()
        cite_counts = (4, 6)
        for i in range(0, 2):
            cite = Citation(case_name=self.site.case_names[i],
                            docket_number=self.site.docket_numbers[i],
                            neutral_cite=self.site.neutral_citations[i],
                            federal_cite_one=self.site.west_citations[i])
            cite.save(index=False)
            self.doc = Document(date_filed=self.site.case_dates[i],
                                court=self.court,
                                citation=cite,
                                precedential_status=self.site.precedential_statuses[i],
                                citation_count=cite_counts[i],
                                nature_of_suit=self.site.nature_of_suit[i],
                                judges=self.site.judges[i])
            self.doc.save()

        self.expected_num_results = 2

    def tearDown(self):
        self.doc.delete()
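        # Put the original core back in place, then drop the throwaway
        # test core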
        swap_solr_core(self.core_name, 'collection1')
        delete_solr_core(self.core_name)
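
create_solr_core, swap_solr_core and delete_solr_core presumably wrap
Solr's CoreAdmin API. Here is a hedged sketch of the swap step over plain
HTTP, with requests and the localhost URL as assumptions; the helpers'
real implementations may differ.

import requests  # assumption: talk to Solr's CoreAdmin API over HTTP


def swap_solr_core_sketch(core, other,
                          admin_url='http://localhost:8983/solr/admin/cores'):
    """Ask Solr to atomically swap two cores (CoreAdmin action=SWAP)."""
    r = requests.get(admin_url,
                     params={'action': 'SWAP', 'core': core, 'other': other})
    r.raise_for_status()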
Example #8
    def test_content_extraction(self):
        """Do all of the supported mimetypes get extracted to text
        successfully, including OCR?"""
        site = test_opinion_scraper.Site().parse()

        test_strings = [
            'supreme', 'intelligence', 'indiana', 'reagan', 'indiana',
            'fidelity'
        ]
        for i in range(0, len(site.case_names)):
            path = os.path.join(settings.INSTALL_ROOT, 'alert',
                                site.download_urls[i])
            with open(path) as f:
                content = f.read()
                cf = ContentFile(content)
                extension = get_extension(content)
            cite = Citation()
            cite.save(index=False)
            docket = Docket(
                case_name=site.case_names[i],
                court=self.court,
            )
            docket.save()
            doc = Document(
                date_filed=site.case_dates[i],
                citation=cite,
                docket=docket,
            )
            file_name = trunc(site.case_names[i].lower(), 75) + extension
            doc.local_path.save(file_name, cf, save=False)
            doc.save(index=False)
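            # extract_doc_content() hands back the refreshed Document with
            # its text fields populated; extract_by_ocr appears to be the
            # OCR fallback for image-only files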
            doc = extract_doc_content(doc.pk, callback=subtask(extract_by_ocr))
            if extension in ['.html', '.wpd']:
                self.assertIn(test_strings[i], doc.html.lower())
            else:
                self.assertIn(test_strings[i], doc.plain_text.lower())

            doc.delete()
Example #9
class ViewDocumentTest(TestCase):
    fixtures = ['test_court.json']

    def setUp(self):
        # Set up some handy variables
        self.court = Court.objects.get(pk='test')
        self.client = Client()

        # Add a document to the index
        site = test_opinion_scraper.Site().parse()
        cite = Citation(
            docket_number=site.docket_numbers[0],
            neutral_cite=site.neutral_citations[0],
            federal_cite_one=site.west_citations[0]
        )
        cite.save(index=False)
        docket = Docket(
            court=self.court,
            case_name=site.case_names[0],
        )
        docket.save()
        self.doc = Document(
            date_filed=site.case_dates[0],
            citation=cite,
            docket=docket,
            precedential_status=site.precedential_statuses[0],
        )
        self.doc.save(index=False)

    def tearDown(self):
        self.doc.delete()

    def test_simple_url_check_for_document(self):
        """Does the page load properly?"""
        response = self.client.get('/opinion/1/asdf/')
        self.assertEqual(response.status_code, 200)
        self.assertIn('Tarrant', response.content)