class PacerHtmlFiles(models.Model):
    """This is a simple object for holding original HTML content from PACER

    We use this object to make sure that for every item we receive from
    users, we can go back and re-parse it one day if we have to. This
    becomes essential as we do more and more data work where we're
    purchasing content. If we don't keep an original copy, a bug could be
    devastating.
    """
    # Timestamps maintained automatically by Django; indexed for range queries.
    date_created = models.DateTimeField(
        help_text="The time when this item was created",
        auto_now_add=True,
        db_index=True,
    )
    date_modified = models.DateTimeField(
        help_text="The last moment when the item was modified.",
        auto_now=True,
        db_index=True,
    )
    # Raw PACER HTML on disk; UUIDFileSystemStorage picks collision-free names.
    filepath = models.FileField(
        help_text="The path of the original data from PACER.",
        upload_to=make_recap_data_path,
        storage=UUIDFileSystemStorage(),
        max_length=150,
    )
    # Generic relation pointing at whatever object this HTML was parsed into.
    # on_delete made explicit: CASCADE was the implicit default on old Django
    # and is required on Django >= 2.0; matches the other FKs in this file.
    content_type = models.ForeignKey(ContentType, on_delete=models.CASCADE)
    object_id = models.PositiveIntegerField()
    content_object = GenericForeignKey()
class RssFeedData(models.Model):
    """Store all old RSS data to disk for future analysis."""
    # Timestamps maintained automatically by Django; indexed for range queries.
    date_created = models.DateTimeField(
        help_text="The time when this item was created",
        auto_now_add=True,
        db_index=True,
    )
    date_modified = models.DateTimeField(
        help_text="The last moment when the item was modified.",
        auto_now=True,
        db_index=True,
    )
    court = models.ForeignKey(
        Court,
        help_text="The court where the RSS feed was found",
        on_delete=models.CASCADE,
        related_name="rss_feed_data",
    )
    # On-disk location of the stored feed; UUIDFileSystemStorage picks
    # collision-free names.
    filepath = models.FileField(
        help_text="The path of the file in the local storage area.",
        upload_to=make_rss_feed_path,
        storage=UUIDFileSystemStorage(),
        max_length=150,
    )

    @property
    def file_contents(self):
        # Feeds are stored bz2-compressed on disk; decompress and decode to
        # text on every access (the result is not cached).
        with open(self.filepath.path, "rb") as f:
            return bz2.decompress(f.read()).decode("utf-8")

    def print_file_contents(self):
        # Debugging convenience: dump the decompressed feed to stdout.
        print(self.file_contents)
class UUIDFileSystemStorageTest(SimpleTestCase):
    # Borrows from https://github.com/django/django/blob/9cbf48693dcd8df6cb22c183dcc94e7ce62b2921/tests/file_storage/tests.py#L89
    def setUp(self):
        self.temp_dir = tempfile.mkdtemp()
        self.storage = UUIDFileSystemStorage(
            location=self.temp_dir,
            base_url='test_uuid_storage',
        )

    def test_file_save_with_path(self):
        """Does saving a pathname create directories and filenames correctly?"""
        stem = 'filename'
        suffix = 'ext'
        self.assertFalse(self.storage.exists('path/to'))
        saved_name = self.storage.save(
            'path/to/%s.%s' % (stem, suffix),
            ContentFile('file with path'),
        )
        # Saving must have created the intermediate directories.
        self.assertTrue(self.storage.exists('path/to'))
        saved_dir, saved_basename = os.path.split(saved_name)
        # Keep everything after the first dot as the extension.
        saved_stem, _, saved_suffix = saved_basename.partition('.')
        self.assertEqual(saved_suffix, suffix)
        # The storage replaces the filename root with a 32-char hex UUID.
        self.assertTrue(re.match('[a-f0-9]{32}', saved_stem))
class RssFeedData(models.Model):
    """Store all old RSS data to disk for future analysis."""
    # Timestamps maintained automatically by Django; indexed for range queries.
    date_created = models.DateTimeField(
        help_text="The time when this item was created",
        auto_now_add=True,
        db_index=True,
    )
    date_modified = models.DateTimeField(
        help_text="The last moment when the item was modified.",
        auto_now=True,
        db_index=True,
    )
    court = models.ForeignKey(
        Court,
        help_text="The court where the RSS feed was found",
        on_delete=models.CASCADE,
        related_name="rss_feed_data",
    )
    # On-disk location of the stored feed; UUIDFileSystemStorage picks
    # collision-free names.
    filepath = models.FileField(
        help_text="The path of the file in the local storage area.",
        upload_to=make_rss_feed_path,
        storage=UUIDFileSystemStorage(),
        max_length=150,
    )

    @property
    def file_contents(self):
        # Feeds are stored bz2-compressed on disk; decompress and decode to
        # text on every access (the result is not cached).
        with open(self.filepath.path, "rb") as f:
            return bz2.decompress(f.read()).decode("utf-8")

    def print_file_contents(self):
        # Debugging convenience: dump the decompressed feed to stdout.
        print(self.file_contents)

    def reprocess_item(self, metadata_only=False, index=True):
        """Reprocess the RSS feed

        :param metadata_only: If True, only do the metadata, not the docket
        entries.
        :param index: Whether to save to Solr (note that none will be sent
        when doing metadata only since no entries are modified).
        """
        # Function-scope imports — presumably to avoid circular imports at
        # module load time; TODO confirm.
        from cl.recap_rss.tasks import merge_rss_feed_contents
        from cl.search.tasks import add_items_to_solr

        # Re-parse the stored (decompressed) feed text for this court.
        rss_feed = PacerRssFeed(map_cl_to_pacer_id(self.court_id))
        rss_feed._parse_text(self.file_contents)
        response = merge_rss_feed_contents(
            rss_feed.data, self.court_id, metadata_only
        )
        if index:
            # Index any returned RECAP documents; empty when metadata_only.
            add_items_to_solr(
                response.get("rds_for_solr", []), "search.RECAPDocument"
            )
class UUIDFileSystemStorageTest(SimpleTestCase):
    # Borrows from https://github.com/django/django/blob/9cbf48693dcd8df6cb22c183dcc94e7ce62b2921/tests/file_storage/tests.py#L89
    allow_database_queries = True

    def setUp(self):
        self.temp_dir = tempfile.mkdtemp()
        self.storage = UUIDFileSystemStorage(
            location=self.temp_dir,
            base_url="test_uuid_storage",
        )

    def test_file_save_with_path(self):
        """Does saving a pathname create directories and filenames correctly?"""
        name_root = "filename"
        name_ext = "ext"
        self.assertFalse(self.storage.exists("path/to"))
        stored_name = self.storage.save(
            "path/to/%s.%s" % (name_root, name_ext),
            ContentFile("file with path"),
        )
        # Saving must have created the intermediate directories.
        self.assertTrue(self.storage.exists("path/to"))
        stored_dir, stored_basename = os.path.split(stored_name)
        # Keep everything after the first dot as the extension.
        stored_root, _, stored_ext = stored_basename.partition(".")
        self.assertEqual(stored_ext, name_ext)
        # The storage replaces the filename root with a 32-char hex UUID.
        self.assertTrue(re.match("[a-f0-9]{32}", stored_root))
class PacerHtmlFiles(AbstractFile):
    """This is a simple object for holding original HTML content from PACER

    We use this object to make sure that for every item we receive from
    users, we can go back and re-parse it one day if we have to. This
    becomes essential as we do more and more data work where we're
    purchasing content. If we don't keep an original copy, a bug could be
    devastating.
    """
    # Raw PACER HTML on disk; UUIDFileSystemStorage picks collision-free names.
    filepath = models.FileField(
        help_text="The path of the original data from PACER.",
        upload_to=make_recap_data_path,
        storage=UUIDFileSystemStorage(),
        max_length=150,
    )
    # One of the UPLOAD_TYPE.NAMES choices identifying what kind of PACER
    # artifact this HTML is.
    upload_type = models.SmallIntegerField(
        help_text="The type of object that is uploaded",
        choices=UPLOAD_TYPE.NAMES,
    )
def setUp(self):
    # Fresh scratch directory per test; the storage under test writes here.
    scratch_dir = tempfile.mkdtemp()
    self.temp_dir = scratch_dir
    self.storage = UUIDFileSystemStorage(
        location=scratch_dir,
        base_url='test_uuid_storage',
    )
class ProcessingQueue(models.Model):
    """An item uploaded to RECAP, queued for processing into the database.

    Each row records the upload itself (court, uploader, file) plus its
    processing lifecycle via ``status`` and ``error_message``.
    """
    # Lifecycle states for an upload.
    AWAITING_PROCESSING = 1
    PROCESSING_SUCCESSFUL = 2
    PROCESSING_FAILED = 3
    PROCESSING_IN_PROGRESS = 4
    QUEUED_FOR_RETRY = 5
    PROCESSING_STATUSES = (
        (AWAITING_PROCESSING, 'Awaiting processing in queue.'),
        (PROCESSING_SUCCESSFUL, 'Item processed successfully.'),
        (PROCESSING_FAILED, 'Item encountered an error while processing.'),
        (PROCESSING_IN_PROGRESS, 'Item is currently being processed.'),
        (QUEUED_FOR_RETRY, 'Item failed processing, but will be retried.'),
    )
    # Kinds of content that can be uploaded.
    DOCKET = 1
    ATTACHMENT_PAGE = 2
    PDF = 3
    UPLOAD_TYPES = (
        (DOCKET, 'HTML Docket'),
        (ATTACHMENT_PAGE, 'HTML attachment page'),
        (PDF, 'PDF'),
    )
    date_created = models.DateTimeField(
        help_text="The time when this item was created",
        auto_now_add=True,
        db_index=True,
    )
    date_modified = models.DateTimeField(
        help_text="The last moment when the item was modified.",
        auto_now=True,
        db_index=True,
    )
    # NOTE(review): these FKs rely on the pre-Django-2.0 implicit
    # on_delete=CASCADE; confirm before upgrading Django.
    court = models.ForeignKey(
        Court,
        help_text="The court where the upload was from",
        related_name='recap_processing_queue',
    )
    uploader = models.ForeignKey(
        User,
        help_text="The user that uploaded the item to RECAP.",
        related_name='recap_processing_queue',
    )
    pacer_case_id = models.CharField(
        help_text="The cased ID provided by PACER.",
        max_length=100,
    )
    # NOTE(review): unique=True combined with blank=True means at most one
    # row may store the empty string — confirm blank uploads cannot collide.
    pacer_doc_id = models.CharField(
        help_text="The ID of the document in PACER. This information is "
                  "provided by RECAP.",
        max_length=32,  # Same as in RECAP
        unique=True,
        blank=True,
    )
    document_number = models.CharField(
        help_text="If the file is a document, the number is the "
                  "document_number in RECAP docket.",
        max_length=32,
    )
    attachment_number = models.SmallIntegerField(
        help_text="If the file is an attachment, the number is the attachment "
                  "number in RECAP docket.",
        blank=True,
        null=True,
    )
    # The uploaded file itself; UUIDFileSystemStorage picks collision-free
    # names.
    filepath_local = models.FileField(
        help_text="The path of the uploaded file.",
        upload_to=make_recap_processing_queue_path,
        storage=UUIDFileSystemStorage(),
        max_length=1000,
    )
    status = models.SmallIntegerField(
        help_text="The current status of this upload.",
        choices=PROCESSING_STATUSES,
    )
    upload_type = models.SmallIntegerField(
        help_text="The type of object that is uploaded",
        choices=UPLOAD_TYPES,
    )
    error_message = models.TextField(
        help_text="Any errors that occurred while processing an item",
        blank=True,
    )

    def __unicode__(self):
        """Return a short human-readable label for this queue item."""
        if self.upload_type == self.DOCKET:
            return u'ProcessingQueue %s: %s case #%s (%s)' % (
                self.pk,
                self.court_id,
                self.pacer_case_id,
                self.get_upload_type_display(),
            )
        elif self.upload_type == self.PDF:
            return u'ProcessingQueue %s: %s.%s.%s.%s (%s)' % (
                self.pk,
                self.court_id,
                self.pacer_case_id or None,
                self.document_number or None,
                self.attachment_number or 0,
                self.get_upload_type_display(),
            )
        else:
            # Bug fix: ATTACHMENT_PAGE (and any future type) previously fell
            # off the end and returned None, making unicode()/str() raise a
            # TypeError. Return a generic label instead.
            return u'ProcessingQueue %s: %s (%s)' % (
                self.pk,
                self.court_id,
                self.get_upload_type_display(),
            )

    class Meta:
        permissions = (
            ("has_recap_upload_access", 'Can upload documents to RECAP.'),
        )
class ProcessingQueue(models.Model):
    """An item uploaded to RECAP, queued for processing into the database.

    Each row records the upload itself (court, uploader, file), its
    processing lifecycle via ``status``/``error_message``, and the objects
    that processing created or updated.
    """
    # Lifecycle states for an upload.
    AWAITING_PROCESSING = 1
    PROCESSING_SUCCESSFUL = 2
    PROCESSING_FAILED = 3
    PROCESSING_IN_PROGRESS = 4
    QUEUED_FOR_RETRY = 5
    INVALID_CONTENT = 6
    PROCESSING_STATUSES = (
        (AWAITING_PROCESSING, 'Awaiting processing in queue.'),
        (PROCESSING_SUCCESSFUL, 'Item processed successfully.'),
        (PROCESSING_FAILED, 'Item encountered an error while processing.'),
        (PROCESSING_IN_PROGRESS, 'Item is currently being processed.'),
        (QUEUED_FOR_RETRY, 'Item failed processing, but will be retried.'),
        (INVALID_CONTENT, 'Item failed validity tests.'),
    )
    # Kinds of content that can be uploaded.
    DOCKET = 1
    ATTACHMENT_PAGE = 2
    PDF = 3
    DOCKET_HISTORY_REPORT = 4
    APPELLATE_DOCKET = 5
    APPELLATE_ATTACHMENT_PAGE = 6
    UPLOAD_TYPES = (
        (DOCKET, 'HTML Docket'),
        (ATTACHMENT_PAGE, 'HTML attachment page'),
        (PDF, 'PDF'),
        (DOCKET_HISTORY_REPORT, 'Docket history report'),
        (APPELLATE_DOCKET, 'Appellate HTML docket'),
        (APPELLATE_ATTACHMENT_PAGE, 'Appellate HTML attachment page'),
    )
    date_created = models.DateTimeField(
        help_text="The time when this item was created",
        auto_now_add=True,
        db_index=True,
    )
    date_modified = models.DateTimeField(
        help_text="The last moment when the item was modified.",
        auto_now=True,
        db_index=True,
    )
    # NOTE(review): these FKs rely on the pre-Django-2.0 implicit
    # on_delete=CASCADE; confirm before upgrading Django.
    court = models.ForeignKey(
        Court,
        help_text="The court where the upload was from",
        related_name='recap_processing_queue',
    )
    uploader = models.ForeignKey(
        User,
        help_text="The user that uploaded the item to RECAP.",
        related_name='recap_processing_queue',
    )
    pacer_case_id = models.CharField(
        help_text="The cased ID provided by PACER.",
        max_length=100,
        db_index=True,
        blank=True,
    )
    pacer_doc_id = models.CharField(
        help_text="The ID of the document in PACER.",
        max_length=32,  # Same as in RECAP
        blank=True,
        db_index=True,
    )
    document_number = models.BigIntegerField(
        help_text="The docket entry number for the document.",
        blank=True,
        null=True,
    )
    attachment_number = models.SmallIntegerField(
        help_text="If the file is an attachment, the number is the attachment "
                  "number on the docket.",
        blank=True,
        null=True,
    )
    # The uploaded file itself; UUIDFileSystemStorage picks collision-free
    # names.
    filepath_local = models.FileField(
        help_text="The path of the uploaded file.",
        upload_to=make_recap_processing_queue_path,
        storage=UUIDFileSystemStorage(),
        max_length=1000,
    )
    # help_text enumerates the legal values so they show up in the API docs.
    status = models.SmallIntegerField(
        help_text="The current status of this upload. Possible values are: %s"
                  % ', '.join(['(%s): %s' % (t[0], t[1]) for t in
                               PROCESSING_STATUSES]),
        default=AWAITING_PROCESSING,
        choices=PROCESSING_STATUSES,
        db_index=True,
    )
    upload_type = models.SmallIntegerField(
        help_text="The type of object that is uploaded",
        choices=UPLOAD_TYPES,
    )
    error_message = models.TextField(
        help_text="Any errors that occurred while processing an item",
        blank=True,
    )
    debug = models.BooleanField(
        help_text="Are you debugging? Debugging uploads will be validated, but "
                  "not saved to the database.",
        default=False,
    )

    # Post process fields
    docket = models.ForeignKey(
        Docket,
        help_text="The docket that was created or updated by this request.",
        null=True,
    )
    docket_entry = models.ForeignKey(
        DocketEntry,
        help_text="The docket entry that was created or updated by this "
                  "request, if applicable. Only applies to PDFs uploads.",
        null=True,
    )
    recap_document = models.ForeignKey(
        RECAPDocument,
        help_text="The document that was created or updated by this request, "
                  "if applicable. Only applies to PDFs uploads.",
        null=True,
    )

    def __unicode__(self):
        """Return a short human-readable label for this queue item."""
        # Bug fix: DOCKET_HISTORY_REPORT and APPELLATE_DOCKET are valid
        # upload types but previously fell through to NotImplementedError.
        # Treat them like dockets, as they are case-level HTML.
        if self.upload_type in [self.DOCKET, self.DOCKET_HISTORY_REPORT,
                                self.APPELLATE_DOCKET]:
            return u'ProcessingQueue %s: %s case #%s (%s)' % (
                self.pk,
                self.court_id,
                self.pacer_case_id,
                self.get_upload_type_display(),
            )
        elif self.upload_type == self.PDF:
            return u'ProcessingQueue: %s: %s.%s.%s.%s (%s)' % (
                self.pk,
                self.court_id,
                self.pacer_case_id or None,
                self.document_number or None,
                self.attachment_number or 0,
                self.get_upload_type_display(),
            )
        elif self.upload_type == self.ATTACHMENT_PAGE:
            return u'ProcessingQueue: %s (%s)' % (
                self.pk,
                self.get_upload_type_display(),
            )
        else:
            raise NotImplementedError

    class Meta:
        permissions = (
            ("has_recap_upload_access", 'Can upload documents to RECAP.'),
        )
class ProcessingQueue(models.Model):
    """An item uploaded to RECAP, queued for processing into the database.

    Each row records the upload itself (court, uploader, file), its
    processing lifecycle via ``status``/``error_message``, and the objects
    that processing created or updated.
    """
    date_created = models.DateTimeField(
        help_text="The time when this item was created",
        auto_now_add=True,
        db_index=True,
    )
    date_modified = models.DateTimeField(
        help_text="The last moment when the item was modified.",
        auto_now=True,
        db_index=True,
    )
    court = models.ForeignKey(
        Court,
        help_text="The court where the upload was from",
        related_name='recap_processing_queue',
        on_delete=models.CASCADE,
    )
    uploader = models.ForeignKey(
        User,
        help_text="The user that uploaded the item to RECAP.",
        related_name='recap_processing_queue',
        on_delete=models.CASCADE,
    )
    pacer_case_id = models.CharField(
        help_text="The cased ID provided by PACER.",
        max_length=100,
        db_index=True,
        blank=True,
    )
    pacer_doc_id = models.CharField(
        help_text="The ID of the document in PACER.",
        max_length=32,  # Same as in RECAP
        blank=True,
        db_index=True,
    )
    document_number = models.BigIntegerField(
        help_text="The docket entry number for the document.",
        blank=True,
        null=True,
    )
    attachment_number = models.SmallIntegerField(
        help_text="If the file is an attachment, the number is the attachment "
                  "number on the docket.",
        blank=True,
        null=True,
    )
    # The uploaded file itself; UUIDFileSystemStorage picks collision-free
    # names.
    filepath_local = models.FileField(
        help_text="The path of the uploaded file.",
        upload_to=make_recap_processing_queue_path,
        storage=UUIDFileSystemStorage(),
        max_length=1000,
    )
    # help_text enumerates the legal values so they show up in the API docs.
    status = models.SmallIntegerField(
        help_text="The current status of this upload. Possible values "
                  "are: %s" % ', '.join(['(%s): %s' % (t[0], t[1]) for t in
                                         PROCESSING_STATUS.NAMES]),
        default=PROCESSING_STATUS.ENQUEUED,
        choices=PROCESSING_STATUS.NAMES,
        db_index=True,
    )
    upload_type = models.SmallIntegerField(
        help_text="The type of object that is uploaded",
        choices=UPLOAD_TYPE.NAMES,
    )
    error_message = models.TextField(
        help_text="Any errors that occurred while processing an item",
        blank=True,
    )
    debug = models.BooleanField(
        help_text="Are you debugging? Debugging uploads will be validated, "
                  "but not saved to the database.",
        default=False,
    )

    # Post process fields
    docket = models.ForeignKey(
        Docket,
        help_text="The docket that was created or updated by this request.",
        null=True,
        on_delete=models.CASCADE,
    )
    docket_entry = models.ForeignKey(
        DocketEntry,
        help_text="The docket entry that was created or updated by this "
                  "request, if applicable. Only applies to PDFs uploads.",
        null=True,
        on_delete=models.CASCADE,
    )
    recap_document = models.ForeignKey(
        RECAPDocument,
        help_text="The document that was created or updated by this request, "
                  "if applicable. Only applies to PDFs uploads.",
        null=True,
        on_delete=models.CASCADE,
    )

    def __unicode__(self):
        # Docket-like uploads (including history reports and appellate
        # dockets) share one case-level label format.
        if self.upload_type in [
                UPLOAD_TYPE.DOCKET,
                UPLOAD_TYPE.DOCKET_HISTORY_REPORT,
                UPLOAD_TYPE.APPELLATE_DOCKET]:
            return u'ProcessingQueue %s: %s case #%s (%s)' % (
                self.pk,
                self.court_id,
                self.pacer_case_id,
                self.get_upload_type_display(),
            )
        elif self.upload_type == UPLOAD_TYPE.PDF:
            return u'ProcessingQueue: %s: %s.%s.%s.%s (%s)' % (
                self.pk,
                self.court_id,
                self.pacer_case_id or None,
                self.document_number or None,
                self.attachment_number or 0,
                self.get_upload_type_display(),
            )
        elif self.upload_type == UPLOAD_TYPE.ATTACHMENT_PAGE:
            return u'ProcessingQueue: %s (%s)' % (
                self.pk,
                self.get_upload_type_display(),
            )
        else:
            # No label defined for the remaining types (e.g. appellate
            # attachment pages); fail loudly rather than return None.
            raise NotImplementedError(
                "No __unicode__ method on ProcessingQueue model for upload_"
                "type of %s" % self.upload_type
            )

    class Meta:
        permissions = (
            ("has_recap_upload_access", 'Can upload documents to RECAP.'),
        )

    @property
    def file_contents(self):
        # NOTE(review): str.decode is Python 2 only — this file's u'' literals
        # and __unicode__ suggest py2, but under Python 3 this line would
        # raise AttributeError; confirm before porting.
        with open(self.filepath_local.path, 'r') as f:
            return f.read().decode('utf-8')

    def print_file_contents(self):
        # Debugging convenience: dump the uploaded file's text to stdout.
        print(self.file_contents)