class StaticFilesTest(TestCase):
    """Check that serve_static_file returns each supported file type with a
    200 status, the right Content-Type, and an inline disposition.
    """
    good_mp3_path = 'mp3/2014/06/09/ander_v._leo.mp3'
    good_txt_path = 'txt/2015/12/28/opinion_text.txt'
    good_pdf_path = 'pdf/2013/06/12/' + \
        'in_re_motion_for_consent_to_disclosure_of_court_records.pdf'

    def setUp(self):
        # One docket carrying an audio file plus txt and pdf opinions, each
        # pointing at a known fixture path on disk.
        self.court = Court.objects.get(pk='test')
        self.docket = Docket(case_name=u'Docket', court=self.court,
                             source=Docket.DEFAULT)
        self.docket.save()
        self.audio = Audio(
            local_path_original_file=self.good_mp3_path,
            local_path_mp3=self.good_mp3_path,
            docket=self.docket,
            blocked=False,
            case_name_full='Ander v. Leo',
            date_created=datetime.date(2014, 6, 9),
        )
        # index=False keeps these saves out of the search index.
        self.audio.save(index=False)
        self.opinioncluster = OpinionCluster(
            case_name=u'Hotline Bling',
            docket=self.docket,
            date_filed=datetime.date(2015, 12, 14),
        )
        self.opinioncluster.save(index=False)
        self.txtopinion = Opinion(
            cluster=self.opinioncluster,
            type='Lead Opinion',
            local_path=self.good_txt_path,
        )
        self.txtopinion.save(index=False)
        self.pdfopinion = Opinion(
            cluster=self.opinioncluster,
            type='Lead Opinion',
            local_path=self.good_pdf_path,
        )
        self.pdfopinion.save(index=False)

    def test_serve_static_file_serves_mp3(self):
        request = HttpRequest()
        # Fix: dropped the unused `file_path = self.audio.local_path_mp3`
        # local; the view is called with the path directly.
        response = serve_static_file(request, file_path=self.good_mp3_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'audio/mpeg')
        self.assertIn('inline;', response['Content-Disposition'])

    def test_serve_static_file_serves_txt(self):
        request = HttpRequest()
        response = serve_static_file(request, file_path=self.good_txt_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'text/plain')
        self.assertIn('inline;', response['Content-Disposition'])
        self.assertIn('FOR THE DISTRICT OF COLUMBIA CIRCUIT',
                      response.content)

    def test_serve_static_file_serves_pdf(self):
        request = HttpRequest()
        response = serve_static_file(request, file_path=self.good_pdf_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'application/pdf')
        self.assertIn('inline;', response['Content-Disposition'])
def setUp(self):
    """Build the fixture graph: a court, a docket, one audio file, and an
    opinion cluster with a txt and a pdf opinion.
    """
    # The 'test' court is expected to exist already (loaded fixture data).
    self.court = Court.objects.get(pk='test')
    self.docket = Docket(case_name=u'Docket', court=self.court,
                         source=Docket.DEFAULT)
    # The docket must be saved before dependent objects reference it.
    self.docket.save()
    # Audio record pointing at a known mp3 fixture path.
    self.audio = Audio(
        local_path_original_file=self.good_mp3_path,
        local_path_mp3=self.good_mp3_path,
        docket=self.docket,
        blocked=False,
        case_name_full='Ander v. Leo',
        date_created=datetime.date(2014, 6, 9)
    )
    # index=False: skip search-index updates on save — presumably to keep
    # test setup fast; confirm against the model's save() signature.
    self.audio.save(index=False)
    self.opinioncluster = OpinionCluster(
        case_name=u'Hotline Bling',
        docket=self.docket,
        date_filed=datetime.date(2015, 12, 14),
    )
    self.opinioncluster.save(index=False)
    # Two opinions on the same cluster: one plain-text, one pdf.
    self.txtopinion = Opinion(
        cluster=self.opinioncluster,
        type='Lead Opinion',
        local_path=self.good_txt_path
    )
    self.txtopinion.save(index=False)
    self.pdfopinion = Opinion(
        cluster=self.opinioncluster,
        type='Lead Opinion',
        local_path=self.good_pdf_path
    )
    self.pdfopinion.save(index=False)
def make_objects(self, item, court, sha1_hash, content):
    """Build a Docket and an Audio object from one scraped item.

    Writes the downloaded binary to the Audio's original-file field without
    saving the model itself.

    Returns a (docket, audio_file, error) tuple; ``error`` is True when the
    binary could not be written to disk (the failure is logged and recorded
    in an ErrorLog row).
    """
    blocked = item["blocked_statuses"]
    # Stamp the block date only for items that are actually blocked.
    if blocked:
        date_blocked = date.today()
    else:
        date_blocked = None

    case_name_short = item.get(
        "case_name_shorts"
    ) or self.cnt.make_case_name_short(item["case_names"])
    docket = Docket(
        docket_number=item.get("docket_numbers", ""),
        case_name=item["case_names"],
        case_name_short=case_name_short,
        court=court,
        blocked=blocked,
        date_blocked=date_blocked,
        date_argued=item["case_dates"],
        source=Docket.SCRAPER,
    )

    audio_file = Audio(
        judges=item.get("judges", ""),
        source="C",
        case_name=item["case_names"],
        case_name_short=case_name_short,
        sha1=sha1_hash,
        download_url=item["download_urls"],
        blocked=blocked,
        date_blocked=date_blocked,
    )

    error = False
    try:
        cf = ContentFile(content)
        extension = get_extension(content)
        if extension not in [".mp3", ".wma"]:
            # Sniffing failed; fall back on the download URL's extension.
            extension = (
                "." + item["download_urls"].lower().rsplit(".", 1)[1]
            )
        # See bitbucket issue #215 for why this must be
        # lower-cased.
        file_name = trunc(item["case_names"].lower(), 75) + extension
        audio_file.file_with_date = docket.date_argued
        audio_file.local_path_original_file.save(file_name, cf, save=False)
    except Exception:
        # Fix: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt. Catch only genuine errors.
        msg = (
            "Unable to save binary to disk. Deleted audio file: %s.\n "
            "%s" % (item["case_names"], traceback.format_exc())
        )
        logger.critical(msg.encode("utf-8"))
        ErrorLog(log_level="CRITICAL", court=court, message=msg).save()
        error = True

    return docket, audio_file, error
def make_objects(self, item, court, sha1_hash, content):
    """Build a Docket and an Audio object from one scraped item.

    Writes the downloaded binary to the Audio's original-file field without
    saving the model itself.

    Returns a (docket, audio_file, error) tuple; ``error`` is True when the
    binary could not be written to disk (the failure is logged and recorded
    in an ErrorLog row).
    """
    blocked = item['blocked_statuses']
    # Stamp the block date only for items that are actually blocked.
    if blocked:
        date_blocked = date.today()
    else:
        date_blocked = None

    case_name_short = (item.get('case_name_shorts') or
                       self.cnt.make_case_name_short(item['case_names']))
    docket = Docket(
        docket_number=item.get('docket_numbers', ''),
        case_name=item['case_names'],
        case_name_short=case_name_short,
        court=court,
        blocked=blocked,
        date_blocked=date_blocked,
        date_argued=item['case_dates'],
        source=Docket.SCRAPER,
    )

    audio_file = Audio(
        judges=item.get('judges', ''),
        source='C',
        case_name=item['case_names'],
        case_name_short=case_name_short,
        sha1=sha1_hash,
        download_url=item['download_urls'],
        blocked=blocked,
        date_blocked=date_blocked,
    )

    error = False
    try:
        cf = ContentFile(content)
        extension = get_extension(content)
        if extension not in ['.mp3', '.wma']:
            # Sniffing failed; fall back on the download URL's extension.
            extension = '.' + item['download_urls'].lower().rsplit('.', 1)[1]
        # See bitbucket issue #215 for why this must be
        # lower-cased.
        file_name = trunc(item['case_names'].lower(), 75) + extension
        audio_file.file_with_date = docket.date_argued
        audio_file.local_path_original_file.save(file_name, cf, save=False)
    except Exception:
        # Fix: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt. Catch only genuine errors.
        msg = 'Unable to save binary to disk. Deleted audio file: %s.\n ' \
              '%s' % (item['case_names'], traceback.format_exc())
        logger.critical(msg.encode('utf-8'))
        ErrorLog(log_level='CRITICAL', court=court, message=msg).save()
        error = True

    return docket, audio_file, error
def make_objects(
    item: Dict[str, Any],
    court: Court,
    sha1_hash: str,
    content: str,
) -> Tuple[Docket, Audio]:
    """Build the Docket and Audio objects for one scraped item.

    The downloaded binary is attached to the Audio's original-file field
    (the model itself is not saved). Returns the (docket, audio_file) pair.
    """
    blocked = item["blocked_statuses"]
    date_blocked = date.today() if blocked else None

    # Prefer the scraped short name; derive one from the case name otherwise.
    short_name = item.get("case_name_shorts")
    if not short_name:
        short_name = cnt.make_case_name_short(item["case_names"])

    docket = Docket(
        docket_number=item.get("docket_numbers", ""),
        case_name=item["case_names"],
        case_name_short=short_name,
        court=court,
        blocked=blocked,
        date_blocked=date_blocked,
        date_argued=item["case_dates"],
        source=item.get("source") or Docket.SCRAPER,
    )

    audio_file = Audio(
        judges=item.get("judges", ""),
        source=item.get("cluster_source") or "C",
        case_name=item["case_names"],
        case_name_short=short_name,
        sha1=sha1_hash,
        download_url=item["download_urls"],
        blocked=blocked,
        date_blocked=date_blocked,
    )

    file_obj = ContentFile(content)
    ext = get_extension(content)
    if ext not in (".mp3", ".wma"):
        # Content sniffing failed; fall back on the (lower-cased) extension
        # taken from the download URL.
        ext = "." + item["download_urls"].lower().rsplit(".", 1)[1]
    file_name = trunc(item["case_names"].lower(), 75) + ext

    audio_file.file_with_date = docket.date_argued
    audio_file.local_path_original_file.save(file_name, file_obj, save=False)
    return docket, audio_file
def migrate_opinions_oral_args_and_dockets(self):
    """Copy every docket — along with its first audio file and first
    document (as a cluster + opinion pair) — from the 'old' database into
    the new schema on 'default', printing progress as it goes.
    """
    self.stdout.write("Migrating dockets, audio files, and opinions to new "
                      "database...")
    q = DocketOld.objects.using('old').all()
    # queryset_generator streams rows instead of loading the whole table.
    old_dockets = queryset_generator(q)
    num_dockets = q.count()
    progress = 0
    self._print_progress(progress, num_dockets)
    for old_docket in old_dockets:
        # First do the docket, then create the cluster and opinion objects.
        try:
            old_audio = old_docket.audio_files.all()[0]
        except IndexError:
            old_audio = None
        try:
            old_document = old_docket.documents.all()[0]
        except IndexError:
            old_document = None
        if old_document is not None:
            old_citation = old_document.citation
            old_doc_case_name, old_doc_case_name_full, old_doc_case_name_short = self._get_case_names(old_citation.case_name)
        if old_audio is not None:
            old_audio_case_name, old_audio_case_name_full, old_audio_case_name_short = self._get_case_names(old_audio.case_name)
        # Courts are in place thanks to initial data.
        court = CourtNew.objects.get(pk=old_docket.court_id)
        # NOTE(review): old_doc_case_name / old_citation are only bound when
        # old_document is not None. A docket with audio but no document would
        # hit a NameError (or reuse a previous iteration's values) below —
        # confirm that path cannot occur in the old data.
        new_docket = DocketNew(
            pk=old_docket.pk,
            date_modified=old_docket.date_modified,
            # The old model apparently has no creation timestamp, so
            # date_modified is reused — TODO confirm.
            date_created=old_docket.date_modified,
            court=court,
            case_name=old_doc_case_name,
            case_name_full=old_doc_case_name_full,
            case_name_short=old_doc_case_name_short,
            slug=self._none_to_blank(old_docket.slug),
            docket_number=self._none_to_blank(old_citation.docket_number),
            date_blocked=old_docket.date_blocked,
            blocked=old_docket.blocked,
        )
        if old_audio is not None:
            new_docket.date_argued = old_audio.date_argued
        new_docket.save(using='default')
        if old_document is not None:
            # The old Document splits into an OpinionCluster (metadata and
            # citations) plus an Opinion (the text itself), sharing the pk.
            new_opinion_cluster = OpinionClusterNew(
                pk=old_document.pk,
                docket=new_docket,
                judges=self._none_to_blank(old_document.judges),
                date_modified=old_document.date_modified,
                date_created=old_document.date_modified,
                date_filed=old_document.date_filed,
                slug=self._none_to_blank(old_citation.slug),
                citation_id=old_document.citation_id,
                case_name_short=old_doc_case_name_short,
                case_name=old_doc_case_name,
                case_name_full=old_doc_case_name_full,
                federal_cite_one=self._none_to_blank(
                    old_citation.federal_cite_one),
                federal_cite_two=self._none_to_blank(
                    old_citation.federal_cite_two),
                federal_cite_three=self._none_to_blank(
                    old_citation.federal_cite_three),
                state_cite_one=self._none_to_blank(
                    old_citation.state_cite_one),
                state_cite_two=self._none_to_blank(
                    old_citation.state_cite_two),
                state_cite_three=self._none_to_blank(
                    old_citation.state_cite_three),
                state_cite_regional=self._none_to_blank(
                    old_citation.state_cite_regional),
                specialty_cite_one=self._none_to_blank(
                    old_citation.specialty_cite_one),
                scotus_early_cite=self._none_to_blank(
                    old_citation.scotus_early_cite),
                lexis_cite=self._none_to_blank(old_citation.lexis_cite),
                westlaw_cite=self._none_to_blank(old_citation.westlaw_cite),
                neutral_cite=self._none_to_blank(old_citation.neutral_cite),
                scdb_id=self._none_to_blank(
                    old_document.supreme_court_db_id),
                source=old_document.source,
                nature_of_suit=old_document.nature_of_suit,
                citation_count=old_document.citation_count,
                precedential_status=old_document.precedential_status,
                date_blocked=old_document.date_blocked,
                blocked=old_document.blocked,
            )
            # index=False: skip search-index updates during the bulk copy.
            new_opinion_cluster.save(
                using='default',
                index=False,
            )
            new_opinion = OpinionNew(
                pk=old_document.pk,
                cluster=new_opinion_cluster,
                date_modified=old_document.date_modified,
                date_created=old_document.time_retrieved,
                type='010combined',
                sha1=old_document.sha1,
                download_url=old_document.download_url,
                local_path=old_document.local_path,
                plain_text=old_document.plain_text,
                html=self._none_to_blank(old_document.html),
                html_lawbox=self._none_to_blank(old_document.html_lawbox),
                html_with_citations=old_document.html_with_citations,
                extracted_by_ocr=old_document.extracted_by_ocr,
            )
            new_opinion.save(
                using='default',
                index=False,
            )
        if old_audio is not None:
            new_audio_file = AudioNew(
                pk=old_audio.pk,
                docket=new_docket,
                source=old_audio.source,
                case_name=old_audio_case_name,
                case_name_short=old_audio_case_name_short,
                case_name_full=old_audio_case_name_full,
                judges=self._none_to_blank(old_audio.judges),
                date_created=old_audio.time_retrieved,
                date_modified=old_audio.date_modified,
                sha1=old_audio.sha1,
                download_url=old_audio.download_url,
                local_path_mp3=old_audio.local_path_mp3,
                local_path_original_file=old_audio.local_path_original_file,
                duration=old_audio.duration,
                processing_complete=old_audio.processing_complete,
                date_blocked=old_audio.date_blocked,
                blocked=old_audio.blocked,
            )
            new_audio_file.save(
                using='default',
                index=False,
            )
        progress += 1
        self._print_progress(progress, num_dockets)
    self.stdout.write(u'')  # Newline
class StaticFilesTest(TestCase):
    """Check that serve_static_file returns each supported file type with a
    200 status, the right Content-Type, and an inline disposition.
    """
    good_mp3_path = "mp3/2014/06/09/ander_v._leo.mp3"
    good_txt_path = "txt/2015/12/28/opinion_text.txt"
    good_pdf_path = (
        "pdf/2013/06/12/" +
        "in_re_motion_for_consent_to_disclosure_of_court_records.pdf")

    def setUp(self):
        # One docket carrying an audio file plus txt and pdf opinions, each
        # pointing at a known fixture path on disk.
        self.court = Court.objects.get(pk="test")
        self.docket = Docket(case_name=u"Docket", court=self.court,
                             source=Docket.DEFAULT)
        self.docket.save()
        self.audio = Audio(
            local_path_original_file=self.good_mp3_path,
            local_path_mp3=self.good_mp3_path,
            docket=self.docket,
            blocked=False,
            case_name_full="Ander v. Leo",
            date_created=datetime.date(2014, 6, 9),
        )
        # index=False keeps these saves out of the search index.
        self.audio.save(index=False)
        self.opinioncluster = OpinionCluster(
            case_name=u"Hotline Bling",
            docket=self.docket,
            date_filed=datetime.date(2015, 12, 14),
        )
        self.opinioncluster.save(index=False)
        self.txtopinion = Opinion(
            cluster=self.opinioncluster,
            type="Lead Opinion",
            local_path=self.good_txt_path,
        )
        self.txtopinion.save(index=False)
        self.pdfopinion = Opinion(
            cluster=self.opinioncluster,
            type="Lead Opinion",
            local_path=self.good_pdf_path,
        )
        self.pdfopinion.save(index=False)

    def test_serve_static_file_serves_mp3(self):
        request = HttpRequest()
        # Fix: dropped the unused `file_path = self.audio.local_path_mp3`
        # local; the view is called with the path directly.
        response = serve_static_file(request, file_path=self.good_mp3_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response["Content-Type"], "audio/mpeg")
        self.assertIn("inline;", response["Content-Disposition"])

    def test_serve_static_file_serves_txt(self):
        request = HttpRequest()
        response = serve_static_file(request, file_path=self.good_txt_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response["Content-Type"], "text/plain")
        self.assertIn("inline;", response["Content-Disposition"])
        self.assertIn("FOR THE DISTRICT OF COLUMBIA CIRCUIT",
                      response.content)

    def test_serve_static_file_serves_pdf(self):
        request = HttpRequest()
        response = serve_static_file(request, file_path=self.good_pdf_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response["Content-Type"], "application/pdf")
        self.assertIn("inline;", response["Content-Disposition"])
def download_and_save():
    """This function is run in many threads simultaneously. Each thread
    runs so long as there are items in the queue. Once an item is found,
    it's downloaded and saved.

    The number of items that can be concurrently saved is determined by the
    number of threads that are running this function.
    """
    while True:
        item = queue.get()
        logger.info("%s: Attempting to add item at: %s" %
                    (threading.current_thread().name, item['url']))
        try:
            msg, r = get_binary_content(
                item['url'],
                {},
            )
        except Exception:
            # Fix: without this `continue`, execution fell through to the
            # `if msg` test below with `msg` unbound, raising NameError in
            # the worker thread. Also narrowed the bare `except:`.
            logger.info("%s: Unable to get item at: %s" %
                        (threading.current_thread().name, item['url']))
            queue.task_done()
            continue
        if msg:
            logger.warning(msg)
            queue.task_done()
            continue

        sha1_hash = hashlib.sha1(r.content).hexdigest()
        if Audio.objects.filter(sha1=sha1_hash).exists():
            # Simpsons did it! Try the next one.
            logger.info("%s: Item already exists, moving to next item." %
                        threading.current_thread().name)
            queue.task_done()
            continue

        # New item, onwards!
        logger.info('%s: Adding new document found at: %s' %
                    (threading.current_thread().name, item['url']))
        audio_file = Audio(
            source='H',
            sha1=sha1_hash,
            case_name=item['case_name'],
            download_url=item['url'],
            processing_complete=False,
        )
        if item['judges']:
            audio_file.judges = item['judges']
        court = Court.objects.get(pk=item['court_code'])
        docket = Docket(
            case_name=item['case_name'],
            court=court,
            date_argued=item['date_argued'],
        )
        if item['docket_number']:
            # Fix: this was `audio_file.docket.docket_number = ...`, but the
            # docket is not attached to the audio file until below, so that
            # dereferenced None. Set the number on the docket itself.
            docket.docket_number = item['docket_number']

        # Make and associate the file object
        try:
            cf = ContentFile(r.content)
            extension = get_extension(r.content)
            if extension not in ['.mp3', '.wma']:
                extension = '.' + item['url'].rsplit('.', 1)[1]
            # See bitbucket issue #215 for why this must be
            # lower-cased.
            file_name = trunc(item['case_name'].lower(), 75) + extension
            audio_file.local_path_original_file.save(file_name, cf,
                                                     save=False)
        except Exception:
            # Fix: repaired the string literal that was broken across lines,
            # and skip the item rather than saving a docket/audio pair whose
            # binary was never written.
            msg = 'Unable to save binary. Deleted document: %s.\n%s' % \
                  (item['case_name'], traceback.format_exc())
            logger.critical(msg)
            queue.task_done()
            continue

        docket.save()
        audio_file.docket = docket
        audio_file.save(index=False)
        # Stagger post-processing so the workers don't pile on at once.
        random_delay = random.randint(0, 3600)
        process_audio_file.apply_async(
            (audio_file.pk,),
            countdown=random_delay
        )
        logger.info("%s: Successfully added audio file %s: %s" %
                    (threading.current_thread().name, audio_file.pk,
                     audio_file.case_name))
        # Fix: mark the item done on the success path too; otherwise
        # queue.join() can never return.
        queue.task_done()
def migrate_opinions_oral_args_and_dockets(self):
    """Migrate the core objects across, diffing as you go.

    :param start_date: Items changed after this date will be processed.
    :return: None
    """
    self.stdout.write("Migrating dockets, audio files, and opinions...")

    # Find dockets modified after date or with sub-items modified after
    # date.
    q = Q(date_modified__gte=self.start)
    q |= Q(documents__date_modified__gte=self.start)
    q |= Q(audio_files__date_modified__gte=self.start)
    old_dockets = DocketOld.objects.using('old').filter(q)
    for old_docket in old_dockets:
        # Only the FIRST audio file and FIRST document of each docket are
        # migrated.
        try:
            old_audio = old_docket.audio_files.all()[0]
        except IndexError:
            old_audio = None
        try:
            old_document = old_docket.documents.all()[0]
        except IndexError:
            old_document = None
        if old_document is None and old_audio is None:
            continue

        # Normalize case names in place on the old objects.
        if old_document is not None:
            old_citation = old_document.citation
            old_docket.case_name, old_docket.case_name_full, old_docket.case_name_short = self._get_case_names(
                old_citation.case_name)
        else:
            # Fall back on the docket if needed. Assumes they docket and
            # document case_names are always the same.
            old_docket.case_name, old_docket.case_name_full, old_docket.case_name_short = self._get_case_names(
                old_docket.case_name)
        if old_audio is not None:
            old_audio.case_name, old_audio.case_name_full, old_audio.case_name_short = self._get_case_names(
                old_audio.case_name)

        # Courts are in place thanks to initial data. Get the court.
        court = CourtNew.objects.get(pk=old_docket.court_id)

        # Do Dockets
        try:
            existing_docket = (DocketNew.objects.using('default').get(
                pk=old_docket.pk))
        except DocketNew.DoesNotExist:
            existing_docket = None
        if existing_docket is not None:
            # Intersection. No need for complicated merge as all differences
            # have been resolved by hand.
            new_docket = existing_docket
        else:
            # New docket in old system. Create it in the new system.
            # NOTE(review): old_citation is only bound when old_document is
            # not None; an audio-only docket reaching this branch would hit
            # a NameError on docket_number — confirm that cannot occur.
            new_docket = DocketNew(
                pk=old_docket.pk,
                date_modified=old_docket.date_modified,
                date_created=old_docket.date_modified,
                court=court,
                case_name=old_docket.case_name,
                case_name_full=old_docket.case_name_full,
                case_name_short=old_docket.case_name_short,
                slug=self._none_to_blank(old_docket.slug),
                docket_number=self._none_to_blank(
                    old_citation.docket_number),
                date_blocked=old_docket.date_blocked,
                blocked=old_docket.blocked,
            )
            if old_audio is not None:
                new_docket.date_argued = old_audio.date_argued
            new_docket.save(using='default')

        # Do Documents/Clusters
        if old_document is not None:
            try:
                existing_oc = (
                    OpinionClusterNew.objects.using('default').get(
                        pk=old_document.pk))
            except OpinionClusterNew.DoesNotExist:
                existing_oc = None
            try:
                existing_o = (OpinionNew.objects.using('default').get(
                    pk=old_document.pk))
            except OpinionNew.DoesNotExist:
                existing_o = None
            if existing_oc is not None or existing_o is not None:
                # Run the conflict algo.
                if self.find_conflicts(old_document, old_citation,
                                       old_docket, existing_oc, existing_o):
                    self.stdout.write("Found conflict. Resolve that.")
                else:
                    # No conflicts. Update the existing item.
                    self.add_oc_and_o(old_document, old_citation, old_docket,
                                      new_docket)
            else:
                # New item. Just add it.
                self.add_oc_and_o(old_document, old_citation, old_docket,
                                  new_docket)

        # Finally we do Audio. No checks needed because we haven't changed
        # anything on the new server.
        if old_audio is not None:
            new_audio_file = AudioNew(
                pk=old_audio.pk,
                docket=new_docket,
                source=old_audio.source,
                case_name=old_audio.case_name,
                case_name_short=old_audio.case_name_short,
                case_name_full=old_audio.case_name_full,
                judges=self._none_to_blank(old_audio.judges),
                date_created=old_audio.time_retrieved,
                date_modified=old_audio.date_modified,
                sha1=old_audio.sha1,
                download_url=old_audio.download_url,
                local_path_mp3=old_audio.local_path_mp3,
                local_path_original_file=old_audio.local_path_original_file,
                duration=old_audio.duration,
                processing_complete=old_audio.processing_complete,
                date_blocked=old_audio.date_blocked,
                blocked=old_audio.blocked,
            )
            # index=False: skip search-index updates during migration.
            new_audio_file.save(
                using='default',
                index=False,
            )
class StaticFilesTest(TestCase):
    """Check that serve_static_file returns each supported file type with a
    200 status, the right Content-Type, and an inline disposition.
    """
    good_mp3_path = 'mp3/2014/06/09/ander_v._leo.mp3'
    good_txt_path = 'txt/2015/12/28/opinion_text.txt'
    good_pdf_path = 'pdf/2013/06/12/' + \
        'in_re_motion_for_consent_to_disclosure_of_court_records.pdf'

    def setUp(self):
        # One docket carrying an audio file plus txt and pdf opinions, each
        # pointing at a known fixture path on disk.
        self.court = Court.objects.get(pk='test')
        self.docket = Docket(case_name=u'Docket', court=self.court,
                             source=Docket.DEFAULT)
        self.docket.save()
        self.audio = Audio(
            local_path_original_file=self.good_mp3_path,
            local_path_mp3=self.good_mp3_path,
            docket=self.docket,
            blocked=False,
            case_name_full='Ander v. Leo',
            date_created=datetime.date(2014, 6, 9)
        )
        # index=False keeps these saves out of the search index.
        self.audio.save(index=False)
        self.opinioncluster = OpinionCluster(
            case_name=u'Hotline Bling',
            docket=self.docket,
            date_filed=datetime.date(2015, 12, 14),
        )
        self.opinioncluster.save(index=False)
        self.txtopinion = Opinion(
            cluster=self.opinioncluster,
            type='Lead Opinion',
            local_path=self.good_txt_path
        )
        self.txtopinion.save(index=False)
        self.pdfopinion = Opinion(
            cluster=self.opinioncluster,
            type='Lead Opinion',
            local_path=self.good_pdf_path
        )
        self.pdfopinion.save(index=False)

    def test_serve_static_file_serves_mp3(self):
        request = HttpRequest()
        # Fix: dropped the unused `file_path = self.audio.local_path_mp3`
        # local; the view is called with the path directly.
        response = serve_static_file(request, file_path=self.good_mp3_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'audio/mpeg')
        self.assertIn('inline;', response['Content-Disposition'])

    def test_serve_static_file_serves_txt(self):
        request = HttpRequest()
        response = serve_static_file(request, file_path=self.good_txt_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'text/plain')
        self.assertIn('inline;', response['Content-Disposition'])
        self.assertIn(
            'FOR THE DISTRICT OF COLUMBIA CIRCUIT',
            response.content
        )

    def test_serve_static_file_serves_pdf(self):
        request = HttpRequest()
        response = serve_static_file(request, file_path=self.good_pdf_path)
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response['Content-Type'], 'application/pdf')
        self.assertIn('inline;', response['Content-Disposition'])
def migrate_opinions_oral_args_and_dockets(self):
    """Migrate the core objects across, diffing as you go.

    :param start_date: Items changed after this date will be processed.
    :return: None
    """
    self.stdout.write("Migrating dockets, audio files, and opinions...")

    # Find dockets modified after date or with sub-items modified after
    # date.
    q = Q(date_modified__gte=self.start)
    q |= Q(documents__date_modified__gte=self.start)
    q |= Q(audio_files__date_modified__gte=self.start)
    old_dockets = DocketOld.objects.using('old').filter(q)
    for old_docket in old_dockets:
        # Only the FIRST audio file and FIRST document of each docket are
        # migrated.
        try:
            old_audio = old_docket.audio_files.all()[0]
        except IndexError:
            old_audio = None
        try:
            old_document = old_docket.documents.all()[0]
        except IndexError:
            old_document = None
        if old_document is None and old_audio is None:
            continue

        # Normalize case names in place on the old objects.
        if old_document is not None:
            old_citation = old_document.citation
            old_docket.case_name, old_docket.case_name_full, old_docket.case_name_short = self._get_case_names(
                old_citation.case_name)
        else:
            # Fall back on the docket if needed. Assumes they docket and
            # document case_names are always the same.
            old_docket.case_name, old_docket.case_name_full, old_docket.case_name_short = self._get_case_names(
                old_docket.case_name)
        if old_audio is not None:
            old_audio.case_name, old_audio.case_name_full, old_audio.case_name_short = self._get_case_names(
                old_audio.case_name)

        # Courts are in place thanks to initial data. Get the court.
        court = CourtNew.objects.get(pk=old_docket.court_id)

        # Do Dockets
        try:
            existing_docket = (DocketNew.objects
                               .using('default')
                               .get(pk=old_docket.pk))
        except DocketNew.DoesNotExist:
            existing_docket = None
        if existing_docket is not None:
            # Intersection. No need for complicated merge as all differences
            # have been resolved by hand.
            new_docket = existing_docket
        else:
            # New docket in old system. Create it in the new system.
            # NOTE(review): old_citation is only bound when old_document is
            # not None; an audio-only docket reaching this branch would hit
            # a NameError on docket_number — confirm that cannot occur.
            new_docket = DocketNew(
                pk=old_docket.pk,
                date_modified=old_docket.date_modified,
                date_created=old_docket.date_modified,
                court=court,
                case_name=old_docket.case_name,
                case_name_full=old_docket.case_name_full,
                case_name_short=old_docket.case_name_short,
                slug=self._none_to_blank(old_docket.slug),
                docket_number=self._none_to_blank(
                    old_citation.docket_number),
                date_blocked=old_docket.date_blocked,
                blocked=old_docket.blocked,
            )
            if old_audio is not None:
                new_docket.date_argued = old_audio.date_argued
            new_docket.save(using='default')

        # Do Documents/Clusters
        if old_document is not None:
            try:
                existing_oc = (OpinionClusterNew.objects
                               .using('default')
                               .get(pk=old_document.pk))
            except OpinionClusterNew.DoesNotExist:
                existing_oc = None
            try:
                existing_o = (OpinionNew.objects
                              .using('default')
                              .get(pk=old_document.pk))
            except OpinionNew.DoesNotExist:
                existing_o = None
            if existing_oc is not None or existing_o is not None:
                # Run the conflict algo.
                if self.find_conflicts(old_document, old_citation,
                                       old_docket, existing_oc, existing_o):
                    self.stdout.write("Found conflict. Resolve that.")
                else:
                    # No conflicts. Update the existing item.
                    self.add_oc_and_o(old_document, old_citation, old_docket,
                                      new_docket)
            else:
                # New item. Just add it.
                self.add_oc_and_o(old_document, old_citation, old_docket,
                                  new_docket)

        # Finally we do Audio. No checks needed because we haven't changed
        # anything on the new server.
        if old_audio is not None:
            new_audio_file = AudioNew(
                pk=old_audio.pk,
                docket=new_docket,
                source=old_audio.source,
                case_name=old_audio.case_name,
                case_name_short=old_audio.case_name_short,
                case_name_full=old_audio.case_name_full,
                judges=self._none_to_blank(old_audio.judges),
                date_created=old_audio.time_retrieved,
                date_modified=old_audio.date_modified,
                sha1=old_audio.sha1,
                download_url=old_audio.download_url,
                local_path_mp3=old_audio.local_path_mp3,
                local_path_original_file=old_audio.local_path_original_file,
                duration=old_audio.duration,
                processing_complete=old_audio.processing_complete,
                date_blocked=old_audio.date_blocked,
                blocked=old_audio.blocked,
            )
            # index=False: skip search-index updates during migration.
            new_audio_file.save(
                using='default',
                index=False,
            )
def download_and_save():
    """This function is run in many threads simultaneously. Each thread
    runs so long as there are items in the queue. Once an item is found,
    it's downloaded and saved.

    The number of items that can be concurrently saved is determined by the
    number of threads that are running this function.
    """
    while True:
        item = queue.get()
        logger.info("%s: Attempting to add item at: %s" %
                    (threading.current_thread().name, item['url']))
        try:
            msg, r = get_binary_content(
                item['url'],
                {},
            )
        except Exception:
            # Fix: without this `continue`, execution fell through to the
            # `if msg` test below with `msg` unbound, raising NameError in
            # the worker thread. Also narrowed the bare `except:`.
            logger.info("%s: Unable to get item at: %s" %
                        (threading.current_thread().name, item['url']))
            queue.task_done()
            continue
        if msg:
            logger.warning(msg)
            queue.task_done()
            continue

        sha1_hash = hashlib.sha1(r.content).hexdigest()
        if Audio.objects.filter(sha1=sha1_hash).exists():
            # Simpsons did it! Try the next one.
            logger.info("%s: Item already exists, moving to next item." %
                        threading.current_thread().name)
            queue.task_done()
            continue

        # New item, onwards!
        logger.info('%s: Adding new document found at: %s' %
                    (threading.current_thread().name, item['url']))
        audio_file = Audio(
            source='H',
            sha1=sha1_hash,
            case_name=item['case_name'],
            download_url=item['url'],
            processing_complete=False,
        )
        if item['judges']:
            audio_file.judges = item['judges']
        court = Court.objects.get(pk=item['court_code'])
        docket = Docket(
            case_name=item['case_name'],
            court=court,
            date_argued=item['date_argued'],
        )
        if item['docket_number']:
            # Fix: this was `audio_file.docket.docket_number = ...`, but the
            # docket is not attached to the audio file until below, so that
            # dereferenced None. Set the number on the docket itself.
            docket.docket_number = item['docket_number']

        # Make and associate the file object
        try:
            cf = ContentFile(r.content)
            extension = get_extension(r.content)
            if extension not in ['.mp3', '.wma']:
                extension = '.' + item['url'].rsplit('.', 1)[1]
            # See bitbucket issue #215 for why this must be
            # lower-cased.
            file_name = trunc(item['case_name'].lower(), 75) + extension
            audio_file.local_path_original_file.save(file_name, cf,
                                                     save=False)
        except Exception:
            # Fix: repaired the string literal that was broken across lines,
            # and skip the item rather than saving a docket/audio pair whose
            # binary was never written.
            msg = 'Unable to save binary. Deleted document: %s.\n%s' % \
                  (item['case_name'], traceback.format_exc())
            logger.critical(msg)
            queue.task_done()
            continue

        docket.save()
        audio_file.docket = docket
        audio_file.save(index=False)
        # Stagger post-processing so the workers don't pile on at once.
        random_delay = random.randint(0, 3600)
        process_audio_file.apply_async((audio_file.pk, ),
                                       countdown=random_delay)
        logger.info("%s: Successfully added audio file %s: %s" %
                    (threading.current_thread().name, audio_file.pk,
                     audio_file.case_name))
        # Fix: mark the item done on the success path too; otherwise
        # queue.join() can never return.
        queue.task_done()