def export_warc(self):
    """Build this link's WARC file from the files of its old-style asset.

    Returns without doing anything when the link has no asset, or when
    ``self.warc_storage_file()`` already exists in storage.  Otherwise:

    * writes one resource record for the PDF capture, or failing that for
      the image capture (only when the stored name contains 'cap' or
      'upload');
    * appends the existing ``archive.warc.gz`` verbatim, or writes one
      resource record per file found under ``source/`` for a wget capture;
    * closes the WARC via ``close_warc_after_writing`` and deletes this
      link's CDX lines.

    No metadata records are written; only resource records and raw data.
    """
    # by using select_for_update and checking for existence of this file,
    # we make sure that we won't accidentally try to create the file multiple
    # times in parallel.
    asset = self.assets.select_for_update().first()
    if not asset:
        return  # this is not an old-style Link
    if default_storage.exists(self.warc_storage_file()):
        return

    guid = self.guid
    out = self.open_warc_for_writing()

    def write_resource_record(file_path, url, content_type):
        # Close the storage handle once the record has been written so that
        # exporting a capture with many files does not leak open files.
        # NOTE(review): assumes write_warc_resource_record consumes the
        # handle before returning -- confirm against its implementation.
        source = default_storage.open(file_path)
        try:
            self.write_warc_resource_record(
                source,
                url.encode('utf8'),
                content_type,
                default_storage.created_time(file_path),
                out)
        finally:
            source.close()

    # write PDF capture
    if asset.pdf_capture and ('cap' in asset.pdf_capture or 'upload' in asset.pdf_capture):
        file_path = os.path.join(asset.base_storage_path, asset.pdf_capture)
        write_resource_record(
            file_path,
            "file:///%s/%s" % (guid, asset.pdf_capture),
            'application/pdf')

    # write image capture (if it's not a PDF thumbnail)
    elif asset.image_capture and ('cap' in asset.image_capture or 'upload' in asset.image_capture):
        file_path = os.path.join(asset.base_storage_path, asset.image_capture)
        mime_type = get_mime_type(asset.image_capture)
        write_resource_record(
            file_path,
            "file:///%s/%s" % (guid, asset.image_capture),
            mime_type)

    # write WARC capture
    if asset.warc_capture == 'archive.warc.gz':
        file_path = os.path.join(asset.base_storage_path, asset.warc_capture)
        # NOTE(review): same assumption as above -- write_warc_raw_data is
        # expected to have copied the data by the time it returns.
        warc_source = default_storage.open(file_path)
        try:
            self.write_warc_raw_data(warc_source, out)
        finally:
            warc_source.close()

    # write wget capture
    elif asset.warc_capture == 'source/index.html':
        mime = MimeTypes()
        source_dir = os.path.join(asset.base_storage_path, 'source')
        for root, _dirs, files in default_storage.walk(source_dir):
            # Path of this directory relative to the asset's storage root;
            # it becomes part of the record's file:/// URL.
            rel_path = root.split(asset.base_storage_path, 1)[-1]
            for file_name in files:
                mime_type = mime.guess_type(file_name)[0]
                write_resource_record(
                    os.path.join(root, file_name),
                    "file:///%s%s/%s" % (guid, rel_path, file_name),
                    mime_type)

    self.close_warc_after_writing(out)

    # regenerate CDX index
    self.cdx_lines.all().delete()
def migrate_assets(apps, schema_editor):
    """Repoint CDXLines at their Link and create Capture rows from Assets.

    ``apps`` supplies the models through ``apps.get_model``;
    ``schema_editor`` is accepted but not used here.

    Each CDXLine gets the ``link_id`` of its asset and is saved.  Each Asset
    then yields up to two Captures: one for its PDF capture (or, when there
    is no PDF capture value, its image capture) and one for its WARC capture.
    Captures are inserted with ``bulk_create`` in batches.
    """
    def capture_status(value, succeeded):
        # 'success' when the caller's success test passed, 'pending' when the
        # field holds the literal 'pending', anything else is 'failed'.
        if succeeded:
            return 'success'
        if value == 'pending':
            return 'pending'
        return 'failed'

    # Update CDXLines to point to Link instead of Asset
    cdx_line_model = apps.get_model("perma", "CDXLine")
    print("Migrating CDXLines.")
    for cdx_line in cdx_line_model.objects.all().select_related('asset'):
        cdx_line.link_id = cdx_line.asset.link_id
        cdx_line.save()

    # Create Captures
    asset_model = apps.get_model("perma", "Asset")
    capture_model = apps.get_model("perma", "Capture")
    print("Migrating Assets.")
    batch = []
    for index, asset in enumerate(asset_model.objects.select_related('link').all()):
        # progress marker every 1000 assets
        if index % 1000 == 0:
            print(".")

        if asset.pdf_capture:
            status = capture_status(
                asset.pdf_capture,
                asset.pdf_capture.endswith('.pdf'))
            succeeded = status == 'success'
            batch.append(capture_model(
                link_id=asset.link_id,
                role='primary',
                status=status,
                url="file:///%s/%s" % (asset.link_id, asset.pdf_capture) if succeeded else None,
                record_type="resource",
                content_type="application/pdf",
                user_upload="upload" in asset.pdf_capture,
            ))
        elif asset.image_capture:
            upload = "upload" in asset.image_capture
            status = capture_status(
                asset.image_capture,
                'cap' in asset.image_capture or 'upload' in asset.image_capture)
            succeeded = status == 'success'
            batch.append(capture_model(
                link_id=asset.link_id,
                # an uploaded image is the primary capture; otherwise it is a screenshot
                role='primary' if upload else 'screenshot',
                status=status,
                url="file:///%s/%s" % (asset.link_id, asset.image_capture) if succeeded else None,
                record_type="resource",
                content_type=get_mime_type(asset.image_capture) or '',
                user_upload=upload,
            ))

        if asset.warc_capture:
            is_warc = asset.warc_capture == 'archive.warc.gz'
            status = capture_status(
                asset.warc_capture,
                asset.warc_capture == 'archive.warc.gz' or asset.warc_capture == 'source/index.html')
            if status != 'success':
                url = None
            elif is_warc:
                url = asset.link.submitted_url
            else:
                url = "file:///%s/source/index.html" % asset.link_id
            batch.append(capture_model(
                link_id=asset.link_id,
                role='primary',
                status=status,
                url=url,
                record_type="response" if is_warc else "resource",
                content_type="text/html",
            ))

        # flush once more than 1000 captures have accumulated
        if len(batch) > 1000:
            capture_model.objects.bulk_create(batch)
            batch = []

    # insert whatever is left over from the last partial batch
    capture_model.objects.bulk_create(batch)
def migrate_assets(apps, schema_editor):
    """Data migration step: link CDXLines to Links and build Captures.

    Models are looked up through ``apps.get_model("perma", ...)``;
    ``schema_editor`` is not used by this function.

    First every CDXLine is given its asset's ``link_id`` and saved.  Then
    every Asset is converted: its PDF capture (or else its image capture)
    becomes one Capture and its WARC capture becomes another.  New Captures
    are collected in a list and written with ``bulk_create`` whenever the
    list grows past 1000 entries, plus once at the end.
    """
    # Update CDXLines to point to Link instead of Asset
    CDXLine = apps.get_model("perma", "CDXLine")
    print("Migrating CDXLines.")
    cdx_lines = CDXLine.objects.all().select_related('asset')
    for line in cdx_lines:
        line.link_id = line.asset.link_id
        line.save()

    # Create Captures
    Asset = apps.get_model("perma", "Asset")
    Capture = apps.get_model("perma", "Capture")
    print("Migrating Assets.")
    new_captures = []
    assets = Asset.objects.select_related('link').all()
    for position, asset in enumerate(assets):
        if not position % 1000:
            print(".")

        if asset.pdf_capture:
            pdf_name = asset.pdf_capture
            if pdf_name.endswith('.pdf'):
                status = 'success'
            elif pdf_name == 'pending':
                status = 'pending'
            else:
                status = 'failed'

            # only successful captures get a URL
            url = None
            if status == 'success':
                url = "file:///%s/%s" % (asset.link_id, pdf_name)

            new_captures.append(Capture(
                link_id=asset.link_id,
                role='primary',
                status=status,
                url=url,
                record_type="resource",
                content_type="application/pdf",
                user_upload="upload" in pdf_name,
            ))

        elif asset.image_capture:
            image_name = asset.image_capture
            upload = "upload" in image_name
            if 'cap' in image_name or 'upload' in image_name:
                status = 'success'
            elif image_name == 'pending':
                status = 'pending'
            else:
                status = 'failed'

            url = None
            if status == 'success':
                url = "file:///%s/%s" % (asset.link_id, image_name)

            if upload:
                role = 'primary'
            else:
                role = 'screenshot'

            new_captures.append(Capture(
                link_id=asset.link_id,
                role=role,
                status=status,
                url=url,
                record_type="resource",
                content_type=get_mime_type(image_name) or '',
                user_upload=upload,
            ))

        if asset.warc_capture:
            warc_name = asset.warc_capture
            is_warc = warc_name == 'archive.warc.gz'
            if warc_name == 'archive.warc.gz' or warc_name == 'source/index.html':
                status = 'success'
            elif warc_name == 'pending':
                status = 'pending'
            else:
                status = 'failed'

            url = None
            if status == 'success':
                if is_warc:
                    # a real WARC records the page the link was submitted for
                    url = asset.link.submitted_url
                else:
                    url = "file:///%s/source/index.html" % asset.link_id

            if is_warc:
                record_type = "response"
            else:
                record_type = "resource"

            new_captures.append(Capture(
                link_id=asset.link_id,
                role='primary',
                status=status,
                url=url,
                record_type=record_type,
                content_type="text/html",
            ))

        # write out the accumulated captures once there are more than 1000
        if len(new_captures) > 1000:
            Capture.objects.bulk_create(new_captures)
            new_captures = []

    # final write for the remaining captures
    Capture.objects.bulk_create(new_captures)