def handle(self, start=2000, stop=1000, dataset_aging=1.0, log_aging=1.0,
           sandbox_aging=1.0, synch=False, wait=timedelta(seconds=0),
           batch_size=100, **kwargs):
    # noinspection PyBroadException
    try:
        if synch:
            logger.debug('Starting purge synchronization.')
            self.synch_model(Container, 'file', wait, batch_size)
            self.synch_model(ContainerRun, 'sandbox_path', wait, batch_size)
            self.synch_model(ContainerLog, 'long_text', wait, batch_size)
            self.synch_model(Dataset, 'dataset_file', wait, batch_size)
            Dataset.external_file_check(batch_size=batch_size)
            logger.debug('Finished purge synchronization.')
        else:
            self.purge(start, stop, dataset_aging, log_aging, sandbox_aging,
                       batch_size)
    except Exception:
        logger.error('Purge failed.', exc_info=True)
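# --- Usage sketch (not from the source) ---
# A minimal example of driving the purge command above from code.  It assumes
# the command is registered as a Django management command named "purge" and
# that the corresponding options are declared in its add_arguments(); both are
# assumptions based only on handle()'s signature.
from datetime import timedelta

from django.core.management import call_command

# Synchronization pass only (no deletion):
call_command('purge', synch=True, wait=timedelta(seconds=10), batch_size=50)

# Regular purge down to the "stop" threshold:
call_command('purge', start=2000, stop=1000, sandbox_aging=2.0)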
def create(self, validated_data):
    """
    Create a Dataset object from deserialized and validated data.
    """
    # The default behaviour for keep_file depends on the mode of creation.
    keep_file = True

    file_path = validated_data.get("external_path", "")
    efd = validated_data.get("externalfiledirectory", None)
    # Both or neither are specified (this is enforced in serializer validation).
    if file_path:
        file_path = os.path.join(efd.path, file_path)
        keep_file = False  # don't retain a copy by default

    # Override the default if specified.
    keep_file = validated_data.get("save_in_db", keep_file)

    dataset = Dataset.create_dataset(
        is_uploaded=True,  # Assume serializer is only used for uploads.
        file_path=file_path,
        user=self.context["request"].user,
        users_allowed=validated_data["users_allowed"],
        groups_allowed=validated_data["groups_allowed"],
        keep_file=keep_file,
        name=validated_data["name"],
        description=validated_data.get("description"),
        file_source=None,
        check=True,
        file_handle=validated_data.get(
            "dataset_file", None),  # should be freshly opened so cursor is at start
        externalfiledirectory=efd)
    return dataset
def save_outputs(self, run):
    output_path = os.path.join(run.full_sandbox_path, 'output')
    upload_path = os.path.join(run.full_sandbox_path, 'upload')
    os.mkdir(upload_path)
    for argument in run.app.arguments.filter(type=ContainerArgument.OUTPUT):
        argument_path = os.path.join(output_path, argument.name)
        dataset_name = self.build_dataset_name(run, argument.name)
        new_argument_path = os.path.join(upload_path, dataset_name)
        try:
            os.rename(argument_path, new_argument_path)
            dataset = Dataset.create_dataset(new_argument_path,
                                             name=dataset_name,
                                             user=run.user)
            dataset.copy_permissions(run)
            run.datasets.create(dataset=dataset, argument=argument)
        except (OSError, IOError) as ex:
            if ex.errno != errno.ENOENT:
                raise

    logs_path = os.path.join(run.full_sandbox_path, 'logs')
    for file_name, log_type in (('stdout.txt', ContainerLog.STDOUT),
                                ('stderr.txt', ContainerLog.STDERR)):
        run.load_log(os.path.join(logs_path, file_name), log_type)

    run.set_md5()
    run.state = (ContainerRun.COMPLETE
                 if run.return_code == 0
                 else ContainerRun.FAILED)
    run.end_time = timezone.now()
def _save_output_directory_argument(cls,
                                    run: ContainerRun,
                                    argument: ContainerArgument,
                                    output_path: str,
                                    upload_path: str) -> None:
    output_path = pathlib.Path(output_path).absolute()
    dirarg_path = output_path / argument.name
    for dirpath, _, filenames in os.walk(dirarg_path):
        dirpath = pathlib.Path(dirpath)
        for filename in filenames:
            datafile_path: pathlib.Path = (dirpath / filename).absolute()
            dataset_filename = cls._build_directory_file_name(
                run.id, output_path, datafile_path)
            destination_path = os.path.join(upload_path, dataset_filename)
            dataset_name = cls._build_directory_dataset_name(
                run.id, output_path, datafile_path)
            try:
                os.rename(datafile_path, destination_path)
                dataset = Dataset.create_dataset(
                    destination_path,
                    name=dataset_name,
                    user=run.user,
                )
                dataset.copy_permissions(run)
                run.datasets.create(dataset=dataset, argument=argument)
            except (OSError, IOError) as ex:
                if ex.errno != errno.ENOENT:
                    raise
def create_datasets(self, user):
    """
    Creates the Datasets and the corresponding SymbolicDatasets in the same
    order as cleaned_data["dataset_files"].

    Will still save successful Datasets to the database even if some of the
    Datasets fail to create.

    :return: None and a list of the created Dataset objects in the same order
        as cleaned_data["dataset_files"].  If a particular Dataset failed to
        create, then the list element contains a dict that can be used to
        inform the user about the file.
    """
    results = []
    for file_size, uploaded_file in self.cleaned_data['dataset_file']:
        # Note that uploaded_file should be seek'd to the beginning.  It was
        # presumably just opened, so that should be OK, but if this ever
        # changes we will have to fix this.
        dataset = error_str = auto_name = None
        try:
            # TODO: use correct unique constraints
            name_prefix = ""
            if self.cleaned_data["name_prefix"]:
                name_prefix = self.cleaned_data["name_prefix"] + "_"
            auto_name = (name_prefix + uploaded_file.name + "_" +
                         datetime.now().strftime('%Y%m%d%H%M%S%f'))
            if self.cleaned_data["description"]:
                auto_description = self.cleaned_data["description"]
            else:
                auto_description = "Bulk Uploaded File " + uploaded_file.name

            dataset = Dataset.create_dataset(is_uploaded=True,
                                             file_path=None,
                                             user=user,
                                             keep_file=True,
                                             name=auto_name,
                                             description=auto_description,
                                             file_source=None,
                                             check=True,
                                             file_handle=uploaded_file)
            dataset.grant_from_json(self.cleaned_data["permissions"])
        except Exception as e:
            error_str = str(e)
            LOGGER.exception(
                "Error while creating Dataset for file with original file name=" +
                str(uploaded_file.name) +
                " and autogenerated Dataset name = " + str(auto_name))

        if dataset and error_str is None:
            results.append(dataset)
        elif error_str and dataset is None:
            results.append({
                "name": uploaded_file.name,
                "errstr": error_str,
                "size": file_size
            })
        else:
            raise ValueError(
                "Invalid situation. Must either have a dataset or error. "
                "Can not have both or none.")

    return None, results
def test_removal_skips_inputs(self):
    run = ContainerRun(id=42, state=ContainerRun.COMPLETE)
    dataset = Dataset(id=43)
    argument = ContainerArgument(type=ContainerArgument.INPUT)
    run.datasets.create(dataset=dataset, argument=argument)
    expected_plan = {'ContainerRuns': {run}}

    plan = run.build_removal_plan()

    self.assertEqual(expected_plan, strip_removal_plan(plan))
def test_create_next_month_upload_dir03(self):
    """Test the creation of a monthly dir, where the dir is already present."""
    dataset_dir = os.path.join(settings.MEDIA_ROOT, Dataset.UPLOAD_DIR)
    date_str = (date.today() + timedelta(days=30)).strftime('%Y_%m')
    next_dirname = os.path.join(dataset_dir, date_str)
    # make the directory iff it doesn't exist
    if not os.path.exists(next_dirname):
        os.makedirs(next_dirname)
    gg = Dataset.idle_create_next_month_upload_dir()
    self.man._add_idletask(gg)
    time_limit = time.time() + 1000.0
    self.man._do_idle_tasks(time_limit)
    self.assertTrue(os.path.exists(next_dirname), "directory was not made")
def test_create_next_month_upload_dir02(self):
    """Test the creation of a monthly directory where Dataset may be present."""
    dataset_dir = os.path.join(settings.MEDIA_ROOT, Dataset.UPLOAD_DIR)
    date_str = (date.today() + timedelta(days=30)).strftime('%Y_%m')
    next_dirname = os.path.join(dataset_dir, date_str)
    # delete the dir iff it exists.
    try:
        shutil.rmtree(next_dirname)
    except os.error as e:
        if e.errno != errno.ENOENT:
            raise
    gg = Dataset.idle_create_next_month_upload_dir()
    self.man._add_idletask(gg)
    time_limit = time.time() + 1000.0
    self.man._do_idle_tasks(time_limit)
    self.assertTrue(os.path.exists(next_dirname), "directory was not made")
def setUp(self):
    super(RawTests, self).setUp()
    self.addTypeEqualityFunc(str, self.assertMultiLineEqual)

    self.pipeline_raw = tools.make_first_pipeline(
        "raw noop",
        "a pipeline to do nothing to raw data",
        self.user_bob)
    tools.create_linear_pipeline(self.pipeline_raw,
                                 [self.method_noop_raw],
                                 "raw_in", "raw_out")
    self.pipeline_raw.create_outputs()

    self.dataset_raw = Dataset.create_dataset(
        "/usr/share/dict/words",
        user=self.user_bob,
        cdt=None,
        keep_file=True,
        name="raw",
        description="some raw data"
    )
def build(self):
    user = User.objects.first()
    assert user is not None
    input_path = os.path.abspath(
        os.path.join(
            __file__,
            '../../../../../samplecode/singularity/host_input/example_names.csv'))
    family = ContainerFamily.objects.create(name='fixture family', user=user)
    container_path = os.path.abspath(
        os.path.join(
            __file__,
            '../../../../../samplecode/singularity/python2-alpine-trimmed.simg'))
    with open(container_path, "rb") as f:
        container_md5 = compute_md5(f)
    container = family.containers.create(
        tag='vFixture',
        user=user,
        file='Containers/kive-default.simg',
        md5=container_md5)
    app = container.apps.create()
    arg1 = app.arguments.create(type=ContainerArgument.INPUT,
                                name='names_csv',
                                position=1)
    app.arguments.create(type=ContainerArgument.OUTPUT,
                         name='greetings_csv',
                         position=2)
    dataset = Dataset.create_dataset(input_path,
                                     name='names.csv',
                                     user=user)
    run = app.runs.create(name='fixture run', user=user)
    run.sandbox_path = ""  # blank this out as it won't be accessible in testing anyway
    run.slurm_job_id = None  # this also would cause tests to fail on a fresh system
    run.save(schedule=False)  # scheduling would overwrite sandbox_path
    run.datasets.create(argument=arg1, dataset=dataset)

    upload_path = os.path.join(settings.MEDIA_ROOT, Container.UPLOAD_DIR)
    readme_path = os.path.join(upload_path, 'README.md')
    os.makedirs(upload_path)
    with open(readme_path, 'w') as f:
        f.write('Just a placeholder to create the folder for containers.')
def _save_output_argument(
        self,
        run: ContainerRun,
        argument: ContainerArgument,
        output_path: str,
        upload_path: str,
):
    argument_path = os.path.join(output_path, argument.name)
    dataset_name = self.build_dataset_name(run, argument.name)
    new_argument_path = os.path.join(upload_path, dataset_name)
    try:
        os.rename(argument_path, new_argument_path)
        dataset = Dataset.create_dataset(new_argument_path,
                                         name=dataset_name,
                                         user=run.user)
        dataset.copy_permissions(run)
        run.datasets.create(dataset=dataset, argument=argument)
    except (OSError, IOError) as ex:
        if ex.errno != errno.ENOENT:
            raise
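# --- Hypothetical helper (not from the source) ---
# build_dataset_name() is called in several of these excerpts but never shown.
# A rough sketch of what such a helper could look like, purely for illustration;
# the real naming scheme in the codebase may differ.  In the excerpts it is a
# method (self.build_dataset_name), written here as a plain function.
def build_dataset_name(run, file_name):
    # Prefix the output file name with the run id so that outputs uploaded
    # from different runs cannot collide in the shared upload directory.
    return 'run{}_{}'.format(run.id, file_name)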
def setUp(self):
    self.ds_owner = User.objects.create_user("Noonian",
                                             "*****@*****.**",
                                             "feeeeeeelings")
    self.ds_owner.save()
    self.ds_owner.groups.add(everyone_group())

    self.lore = User.objects.create_user("Lore",
                                         "*****@*****.**",
                                         "Asimov's Three Laws")
    self.lore.save()
    self.lore.groups.add(everyone_group())

    self.developers_group = Group.objects.get(pk=groups.DEVELOPERS_PK)

    self.dataset = Dataset.create_empty(user=self.ds_owner)
    self.dataset.name = "Test"
    self.dataset.description = "Test dataset"
    self.dataset.save()

    self.users_to_intersect = User.objects.filter(
        pk__in=[self.ds_owner.pk, self.lore.pk])
    self.groups_to_intersect = Group.objects.filter(
        pk__in=[self.developers_group.pk, everyone_group().pk])
def create(self, validated_data):
    """
    Create a Dataset object from deserialized and validated data.
    """
    cdt = None
    if "structure" in validated_data:
        cdt = validated_data["structure"].get("compounddatatype", None)

    # The default behaviour for keep_file depends on the mode of creation.
    keep_file = True

    file_path = validated_data.get("external_path", "")
    efd = validated_data.get("externalfiledirectory", None)
    # Both or neither are specified (this is enforced in serializer validation).
    if file_path:
        file_path = os.path.join(efd.path, file_path)
        keep_file = False  # don't retain a copy by default

    # Override the default if specified.
    keep_file = validated_data.get("save_in_db", keep_file)

    dataset = Dataset.create_dataset(
        is_uploaded=True,  # Assume serializer is only used for uploads.
        file_path=file_path,
        user=self.context["request"].user,
        users_allowed=validated_data["users_allowed"],
        groups_allowed=validated_data["groups_allowed"],
        cdt=cdt,
        keep_file=keep_file,
        name=validated_data["name"],
        description=validated_data.get("description"),
        file_source=None,
        check=True,
        file_handle=validated_data.get("dataset_file", None),  # should be freshly opened so cursor is at start
        externalfiledirectory=efd
    )
    return dataset
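# --- Usage sketch (not from the source) ---
# One way the serializer create() methods above might be exercised from a DRF
# view or test.  "DatasetSerializer", the literal field values, and the
# `request` variable are assumptions; only the create() methods themselves
# come from the excerpts.
serializer = DatasetSerializer(
    data={
        "name": "example.csv",
        "description": "uploaded via the API",
        "users_allowed": [],
        "groups_allowed": [],
        "save_in_db": True,
        # "dataset_file" would carry the uploaded file for an internal upload,
        # or "external_path"/"externalfiledirectory" for an external file.
    },
    context={"request": request},  # request.user becomes the dataset owner
)
if serializer.is_valid():
    dataset = serializer.save()  # dispatches to create() above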
def dataset_view(request, dataset_id):
    """
    Display the file associated with the dataset in the browser,
    or update its name/description.
    """
    return_to_run = request.GET.get('run_id', None)
    is_view_results = "view_results" in request.GET
    is_view_run = "view_run" in request.GET
    return_url = reverse("datasets")
    if return_to_run is not None:
        if is_view_run:
            return_url = reverse('view_run', kwargs={'run_id': return_to_run})
        elif is_view_results:
            return_url = reverse('view_results',
                                 kwargs={'run_id': return_to_run})

    try:
        if admin_check(request.user):
            accessible_datasets = Dataset.objects
        else:
            accessible_datasets = Dataset.filter_by_user(request.user)
        dataset = accessible_datasets.prefetch_related(
            'structure',
            'structure__compounddatatype',
            'structure__compounddatatype__members',
            'structure__compounddatatype__members__datatype',
            'structure__compounddatatype__members__datatype__basic_constraints'
        ).get(pk=dataset_id)
    except ObjectDoesNotExist:
        raise Http404("ID {} cannot be accessed".format(dataset_id))

    # Figure out which users and groups could be given access to this Dataset.
    # If the Dataset is uploaded, it's anyone who doesn't already have access;
    # if it was generated, it's anyone who had access to the generating run.
    addable_users, addable_groups = dataset.other_users_groups()

    if dataset.file_source is None:
        generating_run = None
    else:
        generating_run = dataset.file_source.top_level_run

    container_dataset = dataset.containers.filter(
        argument__type='O').first()  # Output from which runs?
    if container_dataset is None:
        container_run = None
    else:
        container_run = container_dataset.run
    inputs_count = dataset.containers.filter(
        argument__type='I').values('run_id').distinct().count()

    if request.method == "POST":
        # We are going to try and update this Dataset.
        dataset_form = DatasetDetailsForm(
            request.POST,
            access_limits=dataset.get_access_limits(),
            instance=dataset
        )

        try:
            if dataset_form.is_valid():
                dataset.name = dataset_form.cleaned_data["name"]
                dataset.description = dataset_form.cleaned_data["description"]
                dataset.clean()
                dataset.save()
                with transaction.atomic():
                    dataset.grant_from_json(dataset_form.cleaned_data["permissions"])
                    dataset.validate_restrict_access(dataset.get_access_limits())
                return HttpResponseRedirect(return_url)
        except (AttributeError, ValidationError, ValueError) as e:
            LOGGER.exception(e.message)
            dataset_form.add_error(None, e)

    else:
        # A DatasetForm which we can use to make submission and editing easier.
        dataset_form = DatasetDetailsForm(
            access_limits=dataset.get_access_limits(),
            initial={"name": dataset.name, "description": dataset.description}
        )

    c = {
        "is_admin": admin_check(request.user),
        "is_owner": dataset.user == request.user,
        "dataset": dataset,
        "return": return_url,
        "dataset_form": dataset_form,
        "generating_run": generating_run,
        "inputs_count": inputs_count,
        "container_run": container_run
    }

    if not dataset.has_data():
        t = loader.get_template("librarian/missing_dataset_view.html")
        if dataset.external_path:
            c["missing_data_message"] = "This dataset's external file is missing or has " \
                                        "been modified (MD5 mismatch). " \
                                        "Please consult your system administrator if this is unexpected."
        elif dataset.is_redacted():
            c["missing_data_message"] = "Data has been redacted."
        else:
            c["missing_data_message"] = "Data was not retained or has been purged."
        rendered_response = t.render(c, request)
    elif dataset.is_raw():
        t = loader.get_template("librarian/raw_dataset_view.html")

        # Test whether this is a binary file or not.
        # Read 1000 characters.
        data_handle = dataset.get_open_file_handle('r')
        if data_handle is None:
            c["missing_data_message"] = "Data has been removed or renamed."
        else:
            with data_handle:
                sample_content = data_handle.read(1000)
            c.update({"sample_content": sample_content})
        c["is_binary"] = False
        try:
            rendered_response = t.render(c, request)
        except DjangoUnicodeDecodeError as e:
            c["is_binary"] = True
            del c["sample_content"]
            rendered_response = t.render(c, request)
    else:
        extra_errors = []
        # If we have a mismatched output, we do an alignment
        # over the columns.
        if dataset.content_matches_header:
            col_matching, processed_rows = None, dataset.rows(
                True,
                limit=settings.DATASET_DISPLAY_MAX,
                extra_errors=extra_errors)
        else:
            col_matching, insert = dataset.column_alignment()
            processed_rows = dataset.rows(data_check=True,
                                          insert_at=insert,
                                          limit=settings.DATASET_DISPLAY_MAX,
                                          extra_errors=extra_errors)
        t = loader.get_template("librarian/csv_dataset_view.html")
        processed_rows = list(processed_rows)
        c.update(
            {
                'column_matching': col_matching,
                'processed_rows': processed_rows,
                'extra_errors': extra_errors,
                "are_rows_truncated":
                    len(processed_rows) >= settings.DATASET_DISPLAY_MAX
            }
        )
        rendered_response = t.render(c, request)
    return HttpResponse(rendered_response)
def create_datasets(self, user):
    """
    Creates the Datasets and the corresponding SymbolicDatasets in the same
    order as cleaned_data["dataset_files"].

    Will still save successful Datasets to the database even if some of the
    Datasets fail to create.

    :return: CDT object and a list of the created Dataset objects in the same
        order as cleaned_data["dataset_files"].  If a particular Dataset
        failed to create, then the list element contains a dict that can be
        used to inform the user about the file.
    """
    compound_datatype_obj = None
    if self.cleaned_data['compound_datatype'] != CompoundDatatype.RAW_ID:
        compound_datatype_obj = CompoundDatatype.objects.get(
            pk=self.cleaned_data['compound_datatype'])

    results = []
    for file_size, uploaded_file in self.cleaned_data['dataset_file']:
        # Note that uploaded_file should be seek'd to the beginning.  It was
        # presumably just opened, so that should be OK, but if this ever
        # changes we will have to fix this.
        dataset = error_str = auto_name = None
        try:
            # TODO: use correct unique constraints
            name_prefix = ""
            if self.cleaned_data["name_prefix"]:
                name_prefix = self.cleaned_data["name_prefix"] + "_"
            auto_name = (name_prefix + uploaded_file.name + "_" +
                         datetime.now().strftime('%Y%m%d%H%M%S%f'))
            if self.cleaned_data["description"]:
                auto_description = self.cleaned_data["description"]
            else:
                auto_description = "Bulk Uploaded File " + uploaded_file.name

            dataset = Dataset.create_dataset(
                is_uploaded=True,
                file_path=None,
                user=user,
                cdt=compound_datatype_obj,
                keep_file=True,
                name=auto_name,
                description=auto_description,
                file_source=None,
                check=True,
                file_handle=uploaded_file
            )
            dataset.grant_from_json(self.cleaned_data["permissions"])
        except Exception as e:
            error_str = str(e)
            LOGGER.exception("Error while creating Dataset for file with original file name=" +
                             str(uploaded_file.name) +
                             " and autogenerated Dataset name = " + str(auto_name))

        if dataset and error_str is None:
            results.append(dataset)
        elif error_str and dataset is None:
            results.append({"name": uploaded_file.name,
                            "errstr": error_str,
                            "size": file_size})
        else:
            raise ValueError("Invalid situation. Must either have a dataset or error. "
                             "Can not have both or none.")

    return compound_datatype_obj, results
def purge(self, start, stop, dataset_aging, log_aging, sandbox_aging,
          batch_size):
    logger.debug('Starting purge.')
    container_total = self.set_file_sizes(Container,
                                          'file',
                                          'file_size',
                                          'created')
    sandbox_total = self.set_file_sizes(ContainerRun,
                                        'sandbox_path',
                                        'sandbox_size',
                                        'end_time')
    log_total = self.set_file_sizes(ContainerLog,
                                    'long_text',
                                    'log_size',
                                    'run__end_time')
    dataset_total = self.set_file_sizes(Dataset,
                                        'dataset_file',
                                        'dataset_size',
                                        'date_created')

    total_storage = remaining_storage = (container_total + sandbox_total +
                                         log_total + dataset_total)
    if total_storage <= start:
        storage_text = self.summarize_storage(container_total,
                                              dataset_total,
                                              sandbox_total,
                                              log_total)
        logger.debug(u"No purge needed for %s: %s.",
                     filesizeformat(total_storage),
                     storage_text)
        return

    sandbox_ages = ContainerRun.find_unneeded().annotate(
        entry_type=Value('r', models.CharField()),
        age=ExpressionWrapper(sandbox_aging * (Now() - F('end_time')),
                              output_field=DurationField())).values_list(
        'entry_type', 'id', 'age').order_by()
    log_ages = ContainerLog.find_unneeded().annotate(
        entry_type=Value('l', models.CharField()),
        age=ExpressionWrapper(log_aging * (Now() - F('run__end_time')),
                              output_field=DurationField())).values_list(
        'entry_type', 'id', 'age').order_by()
    dataset_ages = Dataset.find_unneeded().annotate(
        entry_type=Value('d', models.CharField()),
        age=ExpressionWrapper(dataset_aging * (Now() - F('date_created')),
                              output_field=FloatField())).values_list(
        'entry_type', 'id', 'age').order_by()

    purge_counts = Counter()
    max_purge_dates = {}
    min_purge_dates = {}
    purge_entries = sandbox_ages.union(log_ages,
                                       dataset_ages,
                                       all=True).order_by('-age')
    while remaining_storage > stop:
        entry_count = 0
        for entry_type, entry_id, age in purge_entries[:batch_size]:
            entry_count += 1
            if entry_type == 'r':
                run = ContainerRun.objects.get(id=entry_id)
                entry_size = run.sandbox_size
                entry_date = run.end_time
                logger.debug("Purged container run %d containing %s.",
                             run.pk,
                             filesizeformat(entry_size))
                try:
                    run.delete_sandbox()
                except OSError:
                    logger.error(
                        u"Failed to purge container run %d at %r.",
                        run.id,
                        run.sandbox_path,
                        exc_info=True)
                run.sandbox_path = ''
                run.save()
            elif entry_type == 'l':
                log = ContainerLog.objects.get(id=entry_id)
                entry_size = log.log_size
                entry_date = log.run.end_time
                logger.debug("Purged container log %d containing %s.",
                             log.id,
                             filesizeformat(entry_size))
                log.long_text.delete()
            else:
                assert entry_type == 'd'
                dataset = Dataset.objects.get(id=entry_id)
                entry_size = dataset.dataset_size
                dataset_total -= dataset.dataset_size
                entry_date = dataset.date_created
                logger.debug("Purged dataset %d containing %s.",
                             dataset.pk,
                             filesizeformat(entry_size))
                dataset.dataset_file.delete()
            purge_counts[entry_type] += 1
            purge_counts[entry_type + ' bytes'] += entry_size
            # PyCharm false positives...
            # noinspection PyUnresolvedReferences
            min_purge_dates[entry_type] = min(
                entry_date, min_purge_dates.get(entry_type, entry_date))
            # noinspection PyUnresolvedReferences
            max_purge_dates[entry_type] = max(
                entry_date, max_purge_dates.get(entry_type, entry_date))
            remaining_storage -= entry_size
            if remaining_storage <= stop:
                break
        if entry_count == 0:
            break

    for entry_type, entry_name in (('r', 'container run'),
                                   ('l', 'container log'),
                                   ('d', 'dataset')):
        purged_count = purge_counts[entry_type]
        if not purged_count:
            continue
        min_purge_date = min_purge_dates[entry_type]
        max_purge_date = max_purge_dates[entry_type]
        collective = entry_name + pluralize(purged_count)
        bytes_removed = purge_counts[entry_type + ' bytes']
        start_text = naturaltime(min_purge_date)
        end_text = naturaltime(max_purge_date)
        date_range = (start_text if start_text == end_text
                      else start_text + ' to ' + end_text)
        logger.info("Purged %d %s containing %s from %s.",
                    purged_count,
                    collective,
                    filesizeformat(bytes_removed),
                    date_range)
    if remaining_storage > stop:
        storage_text = self.summarize_storage(container_total, dataset_total)
        logger.error('Cannot reduce storage to %s: %s.',
                     filesizeformat(stop),
                     storage_text)
# A dummy Datatype with a prototype.
with tempfile.TemporaryFile() as f:
    f.write("""example,valid
True,True
true,False
y,False
n,False
False,False
false,false""")
    f.seek(0)
    proto_SD = Dataset.create_dataset(
        file_path=None,
        user=kive_user(),
        cdt=CompoundDatatype.objects.get(pk=CDTs.PROTOTYPE_PK),
        name="AlwaysTruePrototype",
        description="Prototype for dummy Datatype",
        file_handle=f
    )

always_true = Datatype(
    user=kive_user(),
    name="Python True",
    description="True in python",
    proto_SD=proto_SD
)
always_true.save()
always_true.restricts.add(Datatype.objects.get(pk=datatypes.BOOL_PK))
always_true.basic_constraints.create(
    ruletype=BasicConstraint.REGEXP,
    rule="True")
def datasets_add_archive(request):
    """
    Add datasets in bulk to db from an archive file (zip or tarfile).

    Redirect to /datasets_bulk view so user can examine upload status of each dataset.
    """
    c = {}
    # If we got posted to, try to create DB entries
    if request.method == 'POST':
        try:
            archive_add_dataset_form = ArchiveAddDatasetForm(
                data=request.POST,
                files=request.FILES)

            # Try to retrieve new datasets.  If this fails, we return to our current page.
            is_ok = archive_add_dataset_form.is_valid()
            if is_ok:
                CDT_obj, add_results = archive_add_dataset_form.create_datasets(
                    request.user)
                is_ok = len(add_results) > 0
            if not is_ok:
                # give up and let user try again
                t = loader.get_template('librarian/datasets_add_archive.html')
                c = {'archiveAddDatasetForm': archive_add_dataset_form}
                return HttpResponse(t.render(c, request))

            # have some files in the archive, lets display them
            # NOTE: at this point, we have a list of files in the archive.
            # some files might be legit, others not.
            # we have to cobble together information from add_results and the
            # form cleaned data for display.
            uploaded_files = archive_add_dataset_form.cleaned_data["dataset_file"]
            if len(uploaded_files) != len(add_results):
                raise RuntimeError("List length mismatch")
            t = loader.get_template('librarian/datasets_bulk.html')

            # Now have add_results, a list of elements e, where e is either
            # a dataset if the dataset was successfully created
            # or
            # a dict if a dataset was not successfully created

            # Generate a response
            archive_display_results = []
            # Fill in default values for the form fields
            for add_result, upload_info in zip(add_results, uploaded_files):
                display_result = {}
                if isinstance(add_result, dict):
                    # the dataset is invalid
                    display_result["name"] = add_result["name"]
                    display_result["description"] = ""
                    display_result["orig_filename"] = add_result["name"]
                    display_result["filesize"] = add_result["size"]
                    display_result["md5"] = ""
                    display_result["id"] = ""
                    display_result["is_valid"] = False
                else:
                    display_result["name"] = add_result.name
                    display_result["description"] = add_result.description
                    # This is the original filename as uploaded by the client,
                    # not the filename as stored on the file server.
                    display_result["orig_filename"] = upload_info[1].name
                    display_result["filesize"] = add_result.get_formatted_filesize()
                    display_result["md5"] = add_result.compute_md5()
                    display_result["id"] = add_result.id
                    display_result["is_valid"] = True
                archive_display_results.append(display_result)

            # now create forms from the display results.
            BulkDatasetUpdateFormSet = formset_factory(
                form=BulkDatasetUpdateForm,
                max_num=len(archive_display_results))
            bulk_dataset_update_formset = BulkDatasetUpdateFormSet(
                initial=archive_display_results)

            # Fill in the attributes that are not fields in the form.
            # These are not set by the BulkDatasetUpdateFormSet(initial=...) parameter,
            # so we have to tweak the forms after they have been created.
            for dataset_form, display_result, add_result in zip(
                    bulk_dataset_update_formset,
                    archive_display_results,
                    add_results):
                if display_result["is_valid"]:
                    dataset_form.dataset = add_result
                    dataset_form.status = BulkDatasetDisplay.STATUS_SUCCESS
                else:
                    dataset_form.dataset = Dataset()
                    dataset_form.non_field_errors = add_result["errstr"]
                    dataset_form.status = BulkDatasetDisplay.STATUS_FAIL

            # finally, add some other pertinent information which the template will display
            num_files_added = sum(
                [a["is_valid"] for a in archive_display_results])
            c["bulk_dataset_formset"] = bulk_dataset_update_formset
            c["num_files_selected"] = len(add_results)
            c["num_files_added"] = num_files_added
            c["cdt_typestr"] = "Unstructured" if CDT_obj is None else CDT_obj

        except ValidationError as e:
            LOGGER.exception(e.message)
            archive_add_dataset_form.add_error(None, e)
            t = loader.get_template('librarian/datasets_add_archive.html')
            c.update({'archiveAddDatasetForm': archive_add_dataset_form})
    else:
        # return an empty form for the user to fill in
        t = loader.get_template('librarian/datasets_add_archive.html')
        c['archiveAddDatasetForm'] = ArchiveAddDatasetForm()

    return HttpResponse(t.render(c, request))
from django.core.files import File
from django.contrib.auth.models import User

import metadata.models
from librarian.models import Dataset
import method.models
import kive.testing_utils as tools

# This comes from the initial_user fixture.
kive_user = User.objects.get(pk=1)

test_fasta = Dataset.create_dataset(
    file_path="../samplecode/step_0_raw.fasta",
    user=kive_user,
    cdt=None,
    keep_file=True,
    name="TestFASTA",
    description="Toy FASTA file for testing pipelines")

# Set up a test Pipeline.
resource = method.models.CodeResource(name="Fasta2CSV",
                                      description="FASTA converter script",
                                      filename="Fasta2CSV.py")
resource.clean()
resource.save()

with open("../samplecode/fasta2csv.py", "rb") as f:
    revision = method.models.CodeResourceRevision(
        coderesource=resource,
        revision_name="v1",
        revision_desc="First version",
        content_file=File(f))
def purge(self, start, stop, dataset_aging, log_aging, sandbox_aging,
          batch_size):
    logger.debug('Starting purge.')
    container_total = self.set_file_sizes(Container,
                                          'file',
                                          'file_size',
                                          'created')
    sandbox_total = self.set_file_sizes(ContainerRun,
                                        'sandbox_path',
                                        'sandbox_size',
                                        'end_time')
    log_total = self.set_file_sizes(ContainerLog,
                                    'long_text',
                                    'log_size',
                                    'run__end_time')
    dataset_total = self.set_file_sizes(Dataset,
                                        'dataset_file',
                                        'dataset_size',
                                        'date_created')

    total_storage = remaining_storage = (
        container_total + sandbox_total + log_total + dataset_total)
    if total_storage <= start:
        storage_text = self.summarize_storage(container_total,
                                              dataset_total,
                                              sandbox_total,
                                              log_total)
        logger.debug(u"No purge needed for %s: %s.",
                     filesizeformat(total_storage),
                     storage_text)
        return

    sandbox_ages = ContainerRun.find_unneeded().annotate(
        entry_type=Value('r', models.CharField()),
        age=sandbox_aging * (Now() - F('end_time'))).values_list(
        'entry_type', 'id', 'age').order_by()
    log_ages = ContainerLog.find_unneeded().annotate(
        entry_type=Value('l', models.CharField()),
        age=log_aging * (Now() - F('run__end_time'))).values_list(
        'entry_type', 'id', 'age').order_by()
    dataset_ages = Dataset.find_unneeded().annotate(
        entry_type=Value('d', models.CharField()),
        age=dataset_aging * (Now() - F('date_created'))).values_list(
        'entry_type', 'id', 'age').order_by()

    purge_counts = Counter()
    max_purge_dates = {}
    min_purge_dates = {}
    purge_entries = sandbox_ages.union(log_ages,
                                       dataset_ages,
                                       all=True).order_by('-age')
    while remaining_storage > stop:
        entry_count = 0
        for entry_type, entry_id, age in purge_entries[:batch_size]:
            entry_count += 1
            if entry_type == 'r':
                run = ContainerRun.objects.get(id=entry_id)
                entry_size = run.sandbox_size
                entry_date = run.end_time
                logger.debug("Purged container run %d containing %s.",
                             run.pk,
                             filesizeformat(entry_size))
                try:
                    run.delete_sandbox()
                except OSError:
                    logger.error(u"Failed to purge container run %d at %r.",
                                 run.id,
                                 run.sandbox_path,
                                 exc_info=True)
                run.sandbox_path = ''
                run.save()
            elif entry_type == 'l':
                log = ContainerLog.objects.get(id=entry_id)
                entry_size = log.log_size
                entry_date = log.run.end_time
                logger.debug("Purged container log %d containing %s.",
                             log.id,
                             filesizeformat(entry_size))
                log.long_text.delete()
            else:
                assert entry_type == 'd'
                dataset = Dataset.objects.get(id=entry_id)
                entry_size = dataset.dataset_size
                dataset_total -= dataset.dataset_size
                entry_date = dataset.date_created
                logger.debug("Purged dataset %d containing %s.",
                             dataset.pk,
                             filesizeformat(entry_size))
                dataset.dataset_file.delete()
            purge_counts[entry_type] += 1
            purge_counts[entry_type + ' bytes'] += entry_size
            # PyCharm false positives...
            # noinspection PyUnresolvedReferences
            min_purge_dates[entry_type] = min(
                entry_date, min_purge_dates.get(entry_type, entry_date))
            # noinspection PyUnresolvedReferences
            max_purge_dates[entry_type] = max(
                entry_date, max_purge_dates.get(entry_type, entry_date))
            remaining_storage -= entry_size
            if remaining_storage <= stop:
                break
        if entry_count == 0:
            break

    for entry_type, entry_name in (('r', 'container run'),
                                   ('l', 'container log'),
                                   ('d', 'dataset')):
        purged_count = purge_counts[entry_type]
        if not purged_count:
            continue
        min_purge_date = min_purge_dates[entry_type]
        max_purge_date = max_purge_dates[entry_type]
        collective = entry_name + pluralize(purged_count)
        bytes_removed = purge_counts[entry_type + ' bytes']
        start_text = naturaltime(min_purge_date)
        end_text = naturaltime(max_purge_date)
        date_range = (start_text if start_text == end_text
                      else start_text + ' to ' + end_text)
        logger.info("Purged %d %s containing %s from %s.",
                    purged_count,
                    collective,
                    filesizeformat(bytes_removed),
                    date_range)
    if remaining_storage > stop:
        storage_text = self.summarize_storage(container_total, dataset_total)
        logger.error('Cannot reduce storage to %s: %s.',
                     filesizeformat(stop),
                     storage_text)
prototype_CDT = CompoundDatatype.objects.get(pk=CDTs.PROTOTYPE_PK)

# A dummy Datatype with a prototype.
with tempfile.TemporaryFile() as f:
    f.write("""example,valid
True,True
true,False
y,False
n,False
False,False
false,false""")
    f.seek(0)
    proto_SD = Dataset.create_dataset(
        file_path=None,
        user=kive_user(),
        cdt=CompoundDatatype.objects.get(pk=CDTs.PROTOTYPE_PK),
        name="AlwaysTruePrototype",
        description="Prototype for dummy Datatype",
        file_handle=f)

always_true = Datatype(user=kive_user(),
                       name="Python True",
                       description="True in python",
                       proto_SD=proto_SD)
always_true.save()
always_true.restricts.add(Datatype.objects.get(pk=datatypes.BOOL_PK))
always_true.basic_constraints.create(ruletype=BasicConstraint.REGEXP,
                                     rule="True")
def datasets_add_bulk(request):
    """
    Add datasets in bulk to db.

    Redirect to /datasets_bulk view so user can examine upload status of each dataset.
    """
    # Redirect to page to allow user to view status of added datasets.
    c = {}
    if request.method == 'POST':
        try:
            # Add new datasets.
            bulk_add_dataset_form = BulkAddDatasetForm(data=request.POST,
                                                       files=request.FILES)
            isok = bulk_add_dataset_form.is_valid()
            if isok:
                CDT_obj, add_results = bulk_add_dataset_form.create_datasets(
                    request.user)
                isok = len(add_results) > 0
            if not isok:
                # give up and let user try again
                t = loader.get_template('librarian/datasets_add_bulk.html')
                c = {'bulkAddDatasetForm': bulk_add_dataset_form}
                return HttpResponse(t.render(c, request))

            # Generate response.
            uploaded_files = bulk_add_dataset_form.cleaned_data["dataset_files"]
            if len(uploaded_files) != len(add_results):
                raise RuntimeError("List length mismatch")
            t = loader.get_template('librarian/datasets_bulk.html')

            bulk_display_results = []
            # Fill in default values for the form fields
            for add_result, upload_info in zip(add_results, uploaded_files):
                display_result = {}
                if isinstance(add_result, dict):
                    # dataset is invalid
                    display_result["name"] = add_result["name"]
                    display_result["description"] = ""
                    display_result["orig_filename"] = add_result["name"]
                    display_result["filesize"] = add_result["size"]
                    display_result["md5"] = ""
                    display_result["id"] = ""
                    display_result["is_valid"] = False
                else:
                    display_result["name"] = add_result.name
                    display_result["description"] = add_result.description
                    # This is the original filename as uploaded by the client,
                    # not the filename as stored on the file server.
                    display_result["orig_filename"] = upload_info[1].name
                    display_result["filesize"] = add_result.get_formatted_filesize()
                    display_result["md5"] = add_result.compute_md5()
                    display_result["id"] = add_result.id
                    display_result["is_valid"] = True
                bulk_display_results.append(display_result)

            BulkDatasetUpdateFormSet = formset_factory(
                form=BulkDatasetUpdateForm,
                max_num=len(bulk_display_results))
            bulk_dataset_update_formset = BulkDatasetUpdateFormSet(
                initial=bulk_display_results)

            # Fill in the attributes that are not fields in the form.
            # These are not set by the BulkDatasetUpdateFormSet(initial=...) parameter.
            for dataset_form, display_result, add_result in zip(
                    bulk_dataset_update_formset,
                    bulk_display_results,
                    add_results):
                if display_result["is_valid"]:
                    dataset_form.dataset = add_result
                    dataset_form.status = BulkDatasetDisplay.STATUS_SUCCESS
                else:
                    dataset_form.dataset = Dataset()
                    dataset_form.non_field_errors = add_result["errstr"]
                    dataset_form.status = BulkDatasetDisplay.STATUS_FAIL

            # finally, add some other pertinent information which the template will display
            num_files_added = sum(
                [a["is_valid"] for a in bulk_display_results])
            c["bulk_dataset_formset"] = bulk_dataset_update_formset
            c["num_files_selected"] = len(add_results)
            c["num_files_added"] = num_files_added
            c["cdt_typestr"] = "Unstructured" if CDT_obj is None else CDT_obj

        except ValidationError as e:
            LOGGER.exception(e.message)
            bulk_add_dataset_form.add_error(None, e)
            c.update({'bulkAddDatasetForm': bulk_add_dataset_form})
    else:
        # return an empty form for the user to fill in
        t = loader.get_template('librarian/datasets_add_bulk.html')
        c.update({'bulkAddDatasetForm': BulkAddDatasetForm()})

    return HttpResponse(t.render(c, request))
from django.core.files import File
from django.contrib.auth.models import User

import metadata.models
from librarian.models import Dataset
import method.models
import kive.testing_utils as tools

# This comes from the initial_user fixture.
kive_user = User.objects.get(pk=1)

test_fasta = Dataset.create_dataset(
    file_path="../samplecode/step_0_raw.fasta",
    user=kive_user,
    cdt=None,
    keep_file=True,
    name="TestFASTA",
    description="Toy FASTA file for testing pipelines"
)

# Set up a test Pipeline.
resource = method.models.CodeResource(name="Fasta2CSV",
                                      description="FASTA converter script",
                                      filename="Fasta2CSV.py")
resource.clean()
resource.save()

with open("../samplecode/fasta2csv.py", "rb") as f:
    revision = method.models.CodeResourceRevision(
        coderesource=resource,
        revision_name="v1",
        revision_desc="First version",
        content_file=File(f))
    revision.clean()
def filter_granted(self, queryset):
    """ Filter a queryset to only include records explicitly granted. """
    return Dataset.filter_by_user(self.request.user)
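# --- Usage sketch (not from the source) ---
# filter_granted() above is typically part of a permission-filtering mixin.
# A hypothetical illustration of the same idea wired into a plain DRF viewset;
# "DatasetViewSet" is not from the excerpts, and a serializer_class would be
# needed in a real viewset.
from rest_framework import viewsets


class DatasetViewSet(viewsets.ReadOnlyModelViewSet):
    queryset = Dataset.objects.all()

    def get_queryset(self):
        # Non-admin users only see Datasets they have been granted access to.
        return Dataset.filter_by_user(self.request.user)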
def __init__(self, *args, **kwargs):
    super(BulkDatasetUpdateForm, self).__init__(*args, **kwargs)
    self.dataset = Dataset()
    self.status = 0
def dataset_view(request, dataset_id):
    """
    Display the file associated with the dataset in the browser,
    or update its name/description.
    """
    return_url = reverse("datasets")

    try:
        if admin_check(request.user):
            accessible_datasets = Dataset.objects
        else:
            accessible_datasets = Dataset.filter_by_user(request.user)
        dataset = accessible_datasets.get(pk=dataset_id)
    except ObjectDoesNotExist:
        raise Http404("ID {} cannot be accessed".format(dataset_id))

    # Figure out which users and groups could be given access to this Dataset.
    # If the Dataset is uploaded, it's anyone who doesn't already have access;
    # if it was generated, it's anyone who had access to the generating run.
    addable_users, addable_groups = dataset.other_users_groups()

    generating_run = None
    container_dataset = dataset.containers.filter(
        argument__type='O').first()  # Output from which runs?
    if container_dataset is None:
        container_run = None
    else:
        container_run = container_dataset.run
    inputs_count = dataset.containers.filter(
        argument__type='I').values('run_id').distinct().count()

    if request.method == "POST":
        # We are going to try and update this Dataset.
        dataset_form = DatasetDetailsForm(
            request.POST,
            access_limits=dataset.get_access_limits(),
            instance=dataset)

        try:
            if dataset_form.is_valid():
                dataset.name = dataset_form.cleaned_data["name"]
                dataset.description = dataset_form.cleaned_data["description"]
                dataset.clean()
                dataset.save()
                with transaction.atomic():
                    dataset.grant_from_json(
                        dataset_form.cleaned_data["permissions"])
                    dataset.validate_restrict_access(
                        dataset.get_access_limits())
                return HttpResponseRedirect(return_url)
        except (AttributeError, ValidationError, ValueError) as e:
            LOGGER.exception(e.message)
            dataset_form.add_error(None, e)

    else:
        # A DatasetForm which we can use to make submission and editing easier.
        dataset_form = DatasetDetailsForm(
            access_limits=dataset.get_access_limits(),
            initial={
                "name": dataset.name,
                "description": dataset.description
            })

    c = {
        "is_admin": admin_check(request.user),
        "is_owner": dataset.user == request.user,
        "dataset": dataset,
        "return": return_url,
        "dataset_form": dataset_form,
        "generating_run": generating_run,
        "inputs_count": inputs_count,
        "container_run": container_run
    }

    if not dataset.has_data():
        t = loader.get_template("librarian/missing_dataset_view.html")
        if dataset.external_path:
            c["missing_data_message"] = "This dataset's external file is missing or has " \
                                        "been modified (MD5 mismatch). " \
                                        "Please consult your system administrator if this is unexpected."
        elif dataset.is_redacted():
            c["missing_data_message"] = "Data has been redacted."
        else:
            c["missing_data_message"] = "Data was not retained or has been purged."
        rendered_response = t.render(c, request)
    else:
        t = loader.get_template("librarian/raw_dataset_view.html")

        # Test whether this is a binary file or not.
        # Read 1000 characters.
        data_handle = dataset.get_open_file_handle('r')
        if data_handle is None:
            c["missing_data_message"] = "Data has been removed or renamed."
        else:
            with data_handle:
                sample_content = data_handle.read(1000)
            c.update({"sample_content": sample_content})
        c["is_binary"] = False
        try:
            rendered_response = t.render(c, request)
        except DjangoUnicodeDecodeError:
            c["is_binary"] = True
            del c["sample_content"]
            rendered_response = t.render(c, request)
    return HttpResponse(rendered_response)