def write_json_to_disk(obj_type_str, obj_type, court_attr, api_resource_obj, courts): """Write all items to disk as json files inside directories named by jurisdiction. The main trick is that we identify if we are creating a bulk archive from scratch. If so, we iterate over everything. If not, we only iterate over items that have been modified in the last 32 days because it's assumed that the bulk files are generated once per month. """ # Are there already bulk files? incremental = test_if_old_bulk_files_exist(obj_type_str) # Create a directory for every jurisdiction, if they don't already # exist. This does not clobber. for court in courts: mkdir_p(os.path.join( settings.BULK_DATA_DIR, 'tmp', obj_type_str, court.pk, )) if incremental: # Make the archives using updated data from the last 32 days. print " - Incremental data! We assume it's good, and use it..." thirty_two_days_ago = now() - datetime.timedelta(days=32) qs = obj_type.objects.filter(date_modified__gt=thirty_two_days_ago) else: print " - Incremental data not found. Working from scratch..." qs = obj_type.objects.all() item_resource = api_resource_obj() if type(qs[0].pk) == int: item_list = queryset_generator(qs) else: # Necessary for jurisdictions, which don't have ints for ids. item_list = qs i = 0 for item in item_list: json_str = item_resource.serialize( None, item_resource.full_dehydrate( item_resource.build_bundle(obj=item)), 'application/json', ).encode('utf-8') with open(os.path.join( settings.BULK_DATA_DIR, 'tmp', obj_type_str, deepgetattr(item, court_attr), '%s.json' % item.pk), 'wb') as f: f.write(json_str) i += 1 print ' - all %s %s json files created.' % (i, obj_type_str)
def swap_archives(obj_type_str):
    """Swap out new archives, clobbering the old, if present.

    Moves every ``*.tar*`` file from the tmp staging directory for
    ``obj_type_str`` into the live bulk-data directory, overwriting any
    previous archive of the same name.
    """
    # Fix: '%s' % obj_type_str was a no-op format of an already-string
    # value; use the value directly.
    mkdir_p(os.path.join(settings.BULK_DATA_DIR, obj_type_str))
    path_to_gz_files = os.path.join(settings.BULK_DATA_DIR, 'tmp',
                                    obj_type_str, '*.tar*')
    for f in glob.glob(path_to_gz_files):
        # shutil.move clobbers an existing destination file.
        shutil.move(
            f,
            os.path.join(settings.BULK_DATA_DIR, obj_type_str,
                         os.path.basename(f)))
def write_json_to_disk(obj_type_str, obj_type, court_attr, api_resource_obj, courts): """Write all items to disk as json files inside directories named by jurisdiction. The main trick is that we identify if we are creating a bulk archive from scratch. If so, we iterate over everything. If not, we only iterate over items that have been modified in the last 32 days because it's assumed that the bulk files are generated once per month. """ # Are there already bulk files? incremental = test_if_old_bulk_files_exist(obj_type_str) # Create a directory for every jurisdiction, if they don't already # exist. This does not clobber. for court in courts: mkdir_p( os.path.join( settings.BULK_DATA_DIR, 'tmp', obj_type_str, court.pk, )) if incremental: # Make the archives using updated data from the last 32 days. print " - Incremental data! We assume it's good, and use it..." thirty_two_days_ago = now() - datetime.timedelta(days=32) qs = obj_type.objects.filter(date_modified__gt=thirty_two_days_ago) else: print " - Incremental data not found. Working from scratch..." qs = obj_type.objects.all() item_resource = api_resource_obj() if type(qs[0].pk) == int: item_list = queryset_generator(qs) else: # Necessary for jurisdictions, which don't have ints for ids. item_list = qs i = 0 for item in item_list: json_str = item_resource.serialize( None, item_resource.full_dehydrate(item_resource.build_bundle(obj=item)), 'application/json', ).encode('utf-8') with open( os.path.join(settings.BULK_DATA_DIR, 'tmp', obj_type_str, deepgetattr(item, court_attr), '%s.json' % item.pk), 'wb') as f: f.write(json_str) i += 1 print ' - all %s %s json files created.' % (i, obj_type_str)
def swap_archives(obj_type_str):
    """Swap out new archives, clobbering the old, if present"""
    dest_dir = os.path.join(settings.BULK_DATA_DIR, '%s' % obj_type_str)
    mkdir_p(dest_dir)
    # Everything staged under tmp/<type>/ matching *.tar* gets promoted.
    pattern = os.path.join(settings.BULK_DATA_DIR, 'tmp', obj_type_str,
                           '*.tar*')
    for archive in glob.glob(pattern):
        shutil.move(archive,
                    os.path.join(dest_dir, os.path.basename(archive)))
def make_citation_data(): """Because citations are paginated and because as of this moment there are 11M citations in the database, we cannot provide users with a bulk data file containing the complete objects for every citation. Instead of doing that, we dump our citation table with a shell command, which provides people with compact and reasonable data they can import. """ mkdir_p('/tmp/bulk/citation') print ' - Copying the Document_cases_cited table to disk...' # This command calls the psql COPY command and requests that it dump # the document_id and citation_id columns from the Document_cases_cited # table to disk as a compressed CSV. os.system( '''psql -c "COPY \\"Document_cases_cited\\" (document_id, citation_id) to stdout DELIMITER ',' CSV HEADER" -d courtlistener --username django | gzip > /tmp/bulk/citation/all.csv.gz''' ) print ' - Table created successfully.'
def make_citation_data(): """Because citations are paginated and because as of this moment there are 11M citations in the database, we cannot provide users with a bulk data file containing the complete objects for every citation. Instead of doing that, we dump our citation table with a shell command, which provides people with compact and reasonable data they can import. """ mkdir_p("/tmp/bulk/citation") print " - Copying the Document_cases_cited table to disk..." # This command calls the psql COPY command and requests that it dump # the document_id and citation_id columns from the Document_cases_cited # table to disk as a compressed CSV. os.system( """psql -c "COPY \\"Document_cases_cited\\" (document_id, citation_id) to stdout DELIMITER ',' CSV HEADER" -d courtlistener --username django | gzip > /tmp/bulk/citation/all.csv.gz""" ) print " - Table created successfully."